//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "CGBuiltin.h"
#include "CodeGenFunction.h"
#include "TargetInfo.h"
#include "clang/Basic/DiagnosticFrontend.h"
#include "clang/Basic/SyncScope.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/AtomicOrdering.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm;

namespace {

// Emit an intrinsic whose name is mangled with the types of both source
// operands, honoring constrained-FP mode when it is active.
static Value *
emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
                                       Intrinsic::ID IntrinsicID,
                                       Intrinsic::ID ConstrainedIntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));

  CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
  if (CGF.Builder.getIsFPConstrained()) {
    Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
                                       {Src0->getType(), Src1->getType()});
    return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
  }

  Function *F =
      CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
  return CGF.Builder.CreateCall(F, {Src0, Src1});
}

// If \p E is not a null pointer, insert an address space cast to match the
// return type of \p E if necessary.
Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
                             const CallExpr *E = nullptr) {
  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
  auto *Call = CGF.Builder.CreateCall(F);
  Call->addRetAttr(
      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
  if (!E)
    return Call;
  QualType BuiltinRetType = E->getType();
  auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
  if (RetTy == Call->getType())
    return Call;
  return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
}

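// Emit a call to llvm.amdgcn.implicitarg.ptr and annotate the returned
// pointer with its known guarantees (256 dereferenceable bytes, align 8).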
Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
  auto *Call = CGF.Builder.CreateCall(F);
  Call->addRetAttr(
      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
  return Call;
}

/// \p Index is 0, 1, and 2 for the x, y, and z dimensions, respectively.
/// Emit code based on the Code Object ABI version.
/// COV_4    : Emit code to use the dispatch ptr.
/// COV_5+   : Emit code to use the implicitarg ptr.
/// COV_NONE : Emit code to load the global variable "__oclc_ABI_version"
///            and pick the COV_4 or COV_5+ approach from its value. This is
///            used for compiling device libraries in an ABI-agnostic way.
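///
/// For COV_5+ and Index == 0, for example, this emits (a sketch; names
/// abbreviated):
///   %p   = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
///   %gep = getelementptr i8, ptr addrspace(4) %p, i32 12
///   %sz  = load i16, ptr addrspace(4) %gep, align 2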
Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
  llvm::LoadInst *LD;

  auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;

  if (Cov == CodeObjectVersionKind::COV_None) {
    StringRef Name = "__oclc_ABI_version";
    auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
    if (!ABIVersionC)
      ABIVersionC = new llvm::GlobalVariable(
          CGF.CGM.getModule(), CGF.Int32Ty, false,
          llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGF.CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));

    // This load will be eliminated by IPSCCP because it is constant
    // weak_odr without externally_initialized. Either changing it to weak or
    // adding externally_initialized will keep the load.
    Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
                                                      CGF.CGM.getIntAlign());

    Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
        ABIVersion,
        llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));

    // Indexing the implicit kernarg segment.
    Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
        CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);

    // Indexing the HSA kernel_dispatch_packet struct.
    Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
        CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);

    auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
    LD = CGF.Builder.CreateLoad(
        Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
  } else {
    Value *GEP = nullptr;
    if (Cov >= CodeObjectVersionKind::COV_5) {
      // Indexing the implicit kernarg segment.
      GEP = CGF.Builder.CreateConstGEP1_32(
          CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
    } else {
      // Indexing the HSA kernel_dispatch_packet struct.
      GEP = CGF.Builder.CreateConstGEP1_32(
          CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
    }
    LD = CGF.Builder.CreateLoad(
        Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
  }

  llvm::MDBuilder MDHelper(CGF.getLLVMContext());
  llvm::MDNode *RNode = MDHelper.createRange(
      APInt(16, 1), APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
  LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
  LD->setMetadata(llvm::LLVMContext::MD_noundef,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  return LD;
}

// \p Index is 0, 1, and 2 for the x, y, and z dimensions, respectively.
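// grid_size_{x,y,z} are u32 fields at byte offsets 12, 16, and 20 of
// hsa_kernel_dispatch_packet_t, hence XOffset = 12 below.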
Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
  const unsigned XOffset = 12;
  auto *DP = EmitAMDGPUDispatchPtr(CGF);
  // Indexing the HSA kernel_dispatch_packet struct.
  auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
  auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
  auto *LD = CGF.Builder.CreateLoad(
      Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));

  llvm::MDBuilder MDB(CGF.getLLVMContext());

  // Known non-zero.
  LD->setMetadata(llvm::LLVMContext::MD_range,
                  MDB.createRange(APInt(32, 1), APInt::getZero(32)));
  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  return LD;
}
} // namespace

// Generates the IR for __builtin_amdgcn_read_exec*.
// Lowers the builtin to the amdgcn_ballot intrinsic.
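// For __builtin_amdgcn_read_exec_hi, for example, this emits (a sketch):
//   %b  = call i64 @llvm.amdgcn.ballot.i64(i1 true)
//   %hi = lshr i64 %b, 32
//   %r  = trunc i64 %hi to i32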
static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
                                      llvm::Type *RegisterType,
                                      llvm::Type *ValueType, bool isExecHi) {
  CodeGen::CGBuilderTy &Builder = CGF.Builder;
  CodeGen::CodeGenModule &CGM = CGF.CGM;

  Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
  llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});

  if (isExecHi) {
    Value *Rt2 = Builder.CreateLShr(Call, 32);
    Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
    return Rt2;
  }

  return Call;
}

static llvm::Value *loadTextureDescPtrAsVec8I32(CodeGenFunction &CGF,
                                                llvm::Value *RsrcPtr) {
  auto &B = CGF.Builder;
  auto *VecTy = llvm::FixedVectorType::get(B.getInt32Ty(), 8);

  if (RsrcPtr->getType() == VecTy)
    return RsrcPtr;

  if (RsrcPtr->getType()->isIntegerTy(32)) {
    llvm::PointerType *VecPtrTy =
        llvm::PointerType::get(CGF.getLLVMContext(), 8);
    llvm::Value *Ptr = B.CreateIntToPtr(RsrcPtr, VecPtrTy, "tex.rsrc.from.int");
    return B.CreateAlignedLoad(VecTy, Ptr, llvm::Align(32), "tex.rsrc.val");
  }

  if (RsrcPtr->getType()->isPointerTy()) {
    auto *VecPtrTy = llvm::PointerType::get(
        CGF.getLLVMContext(), RsrcPtr->getType()->getPointerAddressSpace());
    llvm::Value *Typed = B.CreateBitCast(RsrcPtr, VecPtrTy, "tex.rsrc.typed");
    return B.CreateAlignedLoad(VecTy, Typed, llvm::Align(32), "tex.rsrc.val");
  }

  const auto &DL = CGF.CGM.getDataLayout();
  if (DL.getTypeSizeInBits(RsrcPtr->getType()) == 256)
    return B.CreateBitCast(RsrcPtr, VecTy, "tex.rsrc.val");

  llvm::report_fatal_error("Unexpected texture resource argument form");
}

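// Emit one of the image builtins by calling its overloaded LLVM intrinsic:
// the texture-descriptor argument is materialized as <8 x i32>, and image
// stores use a void return type instead of the builtin's result type.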
llvm::CallInst *
emitAMDGCNImageOverloadedReturnType(clang::CodeGen::CodeGenFunction &CGF,
                                    const clang::CallExpr *E,
                                    unsigned IntrinsicID, bool IsImageStore) {
  auto findTextureDescIndex = [&CGF](const CallExpr *E) -> unsigned {
    QualType TexQT = CGF.getContext().AMDGPUTextureTy;
    for (unsigned I = 0, N = E->getNumArgs(); I < N; ++I) {
      QualType ArgTy = E->getArg(I)->getType();
      if (ArgTy == TexQT) {
        return I;
      }

      if (ArgTy.getCanonicalType() == TexQT.getCanonicalType()) {
        return I;
      }
    }

    return ~0U;
  };

  clang::SmallVector<llvm::Value *, 10> Args;
  unsigned RsrcIndex = findTextureDescIndex(E);

  if (RsrcIndex == ~0U) {
    llvm::report_fatal_error("Invalid argument count for image builtin");
  }

  for (unsigned I = 0; I < E->getNumArgs(); ++I) {
    llvm::Value *V = CGF.EmitScalarExpr(E->getArg(I));
    if (I == RsrcIndex)
      V = loadTextureDescPtrAsVec8I32(CGF, V);
    Args.push_back(V);
  }

  llvm::Type *RetTy = IsImageStore ? CGF.VoidTy : CGF.ConvertType(E->getType());
  llvm::CallInst *Call = CGF.Builder.CreateIntrinsic(RetTy, IntrinsicID, Args);
  return Call;
}

// Emit an intrinsic that has 1 float or double operand, and 1 integer.
static Value *emitFPIntBuiltin(CodeGenFunction &CGF, const CallExpr *E,
                               unsigned IntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));

  Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
  return CGF.Builder.CreateCall(F, {Src0, Src1});
}

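// Map an AMDGCN sync-scope name to its closest SPIR-V equivalent when
// targeting SPIR-V; all other scope names are passed through unchanged.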
static inline StringRef mapScopeToSPIRV(StringRef AMDGCNScope) {
  if (AMDGCNScope == "agent")
    return "device";
  if (AMDGCNScope == "wavefront")
    return "subgroup";
  return AMDGCNScope;
}

static llvm::AtomicOrdering mapCABIAtomicOrdering(unsigned AO) {
  // Map C11/C++11 memory ordering to LLVM memory ordering.
  assert(llvm::isValidAtomicOrderingCABI(AO));
  switch (static_cast<llvm::AtomicOrderingCABI>(AO)) {
  case llvm::AtomicOrderingCABI::acquire:
  case llvm::AtomicOrderingCABI::consume:
    return llvm::AtomicOrdering::Acquire;
  case llvm::AtomicOrderingCABI::release:
    return llvm::AtomicOrdering::Release;
  case llvm::AtomicOrderingCABI::acq_rel:
    return llvm::AtomicOrdering::AcquireRelease;
  case llvm::AtomicOrderingCABI::seq_cst:
    return llvm::AtomicOrdering::SequentiallyConsistent;
  case llvm::AtomicOrderingCABI::relaxed:
    return llvm::AtomicOrdering::Monotonic;
  }
  llvm_unreachable("Unknown AtomicOrderingCABI enum");
}

// For processing the memory ordering and memory scope arguments of various
// amdgcn builtins.
// \p Order takes a C++11-compatible memory-ordering specifier, converts it
// into LLVM's memory ordering specifier using the atomic C ABI, and writes
// it to \p AO. \p Scope takes a const char * and converts it into an
// AMDGCN-specific SyncScopeID, which is written to \p SSID.
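// For example (a sketch): __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup")
// yields AO == SequentiallyConsistent and the "workgroup" sync scope ID.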
void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
                                              llvm::AtomicOrdering &AO,
                                              llvm::SyncScope::ID &SSID) {
  int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();

  // Map C11/C++11 memory ordering to LLVM memory ordering.
  AO = mapCABIAtomicOrdering(ord);

  // Some of the atomic builtins take the scope as a string name.
  StringRef scp;
  if (llvm::getConstantStringInfo(Scope, scp)) {
    if (getTarget().getTriple().isSPIRV())
      scp = mapScopeToSPIRV(scp);
    SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
    return;
  }

  // Older builtins had an enum argument for the memory scope.
  const char *SSN = nullptr;
  int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
  switch (scope) {
  case AtomicScopeGenericModel::System: // __MEMORY_SCOPE_SYSTEM
    SSID = llvm::SyncScope::System;
    break;
  case AtomicScopeGenericModel::Device: // __MEMORY_SCOPE_DEVICE
    SSN = getTarget().getTriple().isSPIRV() ? "device" : "agent";
    break;
  case AtomicScopeGenericModel::Workgroup: // __MEMORY_SCOPE_WRKGRP
    SSN = "workgroup";
    break;
  case AtomicScopeGenericModel::Cluster: // __MEMORY_SCOPE_CLUSTR
    SSN = getTarget().getTriple().isSPIRV() ? "workgroup" : "cluster";
    break;
  case AtomicScopeGenericModel::Wavefront: // __MEMORY_SCOPE_WVFRNT
    SSN = getTarget().getTriple().isSPIRV() ? "subgroup" : "wavefront";
    break;
  case AtomicScopeGenericModel::Single: // __MEMORY_SCOPE_SINGLE
    SSID = llvm::SyncScope::SingleThread;
    break;
  default:
    SSID = llvm::SyncScope::System;
    break;
  }
  if (SSN)
    SSID = getLLVMContext().getOrInsertSyncScopeID(SSN);
}

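// Attach "amdgpu-synchronize-as" MMRA metadata to a fence, naming the address
// spaces (string-literal arguments after the order and scope) that the fence
// must synchronize.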
void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
                                                     const CallExpr *E) {
  constexpr const char *Tag = "amdgpu-synchronize-as";

  LLVMContext &Ctx = Inst->getContext();
  SmallVector<MMRAMetadata::TagT, 3> MMRAs;
  for (unsigned K = 2; K < E->getNumArgs(); ++K) {
    llvm::Value *V = EmitScalarExpr(E->getArg(K));
    StringRef AS;
    if (llvm::getConstantStringInfo(V, AS)) {
      MMRAs.push_back({Tag, AS});
      // TODO: Delete the resulting unused constant?
      continue;
    }
    CGM.Error(E->getExprLoc(),
              "expected an address space name as a string literal");
  }

  llvm::sort(MMRAs);
  MMRAs.erase(llvm::unique(MMRAs), MMRAs.end());
  Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}

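// Map a __builtin_amdgcn_wave_reduce_* BuiltinID to its intrinsic. Each
// intrinsic is overloaded on the value type, so, for example, the _u32 and
// _u64 variants of wave_reduce_add both select
// Intrinsic::amdgcn_wave_reduce_add.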
static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
  switch (BuiltinID) {
  default:
    llvm_unreachable("Unknown BuiltinID for wave reduction");
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
    return Intrinsic::amdgcn_wave_reduce_add;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f64:
    return Intrinsic::amdgcn_wave_reduce_fadd;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
    return Intrinsic::amdgcn_wave_reduce_sub;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f64:
    return Intrinsic::amdgcn_wave_reduce_fsub;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
    return Intrinsic::amdgcn_wave_reduce_min;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f64:
    return Intrinsic::amdgcn_wave_reduce_fmin;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
    return Intrinsic::amdgcn_wave_reduce_umin;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
    return Intrinsic::amdgcn_wave_reduce_max;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f64:
    return Intrinsic::amdgcn_wave_reduce_fmax;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
    return Intrinsic::amdgcn_wave_reduce_umax;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
    return Intrinsic::amdgcn_wave_reduce_and;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
    return Intrinsic::amdgcn_wave_reduce_or;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64:
    return Intrinsic::amdgcn_wave_reduce_xor;
  }
}

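// Emit LLVM IR for a call to an AMDGPU builtin, dispatching on BuiltinID.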
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E) {
  llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
  llvm::SyncScope::ID SSID;
  switch (BuiltinID) {
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f64:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f64:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f64:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f64:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
  case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: {
    Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
    llvm::Value *Value = EmitScalarExpr(E->getArg(0));
    llvm::Value *Strategy = EmitScalarExpr(E->getArg(1));
    llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
    return Builder.CreateCall(F, {Value, Strategy});
  }
  case AMDGPU::BI__builtin_amdgcn_div_scale:
  case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsic's struct return to the builtin's out
    // argument.

    Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));

    llvm::Value *X = EmitScalarExpr(E->getArg(0));
    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
    llvm::Value *Z = EmitScalarExpr(E->getArg(2));

    llvm::Function *Callee =
        CGM.getIntrinsic(Intrinsic::amdgcn_div_scale, X->getType());

    llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});

    llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
    llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);

    llvm::Type *RealFlagType = FlagOutPtr.getElementType();

    llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
    Builder.CreateStore(FlagExt, FlagOutPtr);
    return Result;
  }
  case AMDGPU::BI__builtin_amdgcn_div_fmas:
  case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));

    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas, Src0->getType());
    llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
    return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
  }

  case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
    return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                               Intrinsic::amdgcn_ds_swizzle);
  case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
  case AMDGPU::BI__builtin_amdgcn_mov_dpp:
  case AMDGPU::BI__builtin_amdgcn_update_dpp: {
    llvm::SmallVector<llvm::Value *, 6> Args;
    // Find out if any arguments are required to be integer constant
    // expressions.
    unsigned ICEArguments = 0;
    ASTContext::GetBuiltinTypeError Error;
    getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
    assert(Error == ASTContext::GE_None && "Should not codegen an error");
    llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
    unsigned Size = DataTy->getPrimitiveSizeInBits();
    llvm::Type *IntTy =
        llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
    Function *F =
        CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
                             ? Intrinsic::amdgcn_mov_dpp8
                             : Intrinsic::amdgcn_update_dpp,
                         IntTy);
    assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
           E->getNumArgs() == 2);
    bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
    if (InsertOld)
      Args.push_back(llvm::PoisonValue::get(IntTy));
    for (unsigned I = 0; I != E->getNumArgs(); ++I) {
      llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
      if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
          Size < 32) {
        if (!DataTy->isIntegerTy())
          V = Builder.CreateBitCast(
              V, llvm::IntegerType::get(Builder.getContext(), Size));
        V = Builder.CreateZExtOrBitCast(V, IntTy);
      }
      llvm::Type *ExpTy =
          F->getFunctionType()->getFunctionParamType(I + InsertOld);
      Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
    }
    Value *V = Builder.CreateCall(F, Args);
    if (Size < 32 && !DataTy->isIntegerTy())
      V = Builder.CreateTrunc(
          V, llvm::IntegerType::get(Builder.getContext(), Size));
    return Builder.CreateTruncOrBitCast(V, DataTy);
  }
  case AMDGPU::BI__builtin_amdgcn_permlane16:
  case AMDGPU::BI__builtin_amdgcn_permlanex16:
    return emitBuiltinWithOneOverloadedType<6>(
        *this, E,
        BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
            ? Intrinsic::amdgcn_permlane16
            : Intrinsic::amdgcn_permlanex16);
  case AMDGPU::BI__builtin_amdgcn_permlane64:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_permlane64);
  case AMDGPU::BI__builtin_amdgcn_readlane:
    return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                               Intrinsic::amdgcn_readlane);
  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_readfirstlane);
  case AMDGPU::BI__builtin_amdgcn_div_fixup:
  case AMDGPU::BI__builtin_amdgcn_div_fixupf:
  case AMDGPU::BI__builtin_amdgcn_div_fixuph:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_div_fixup);
  case AMDGPU::BI__builtin_amdgcn_trig_preop:
  case AMDGPU::BI__builtin_amdgcn_trig_preopf:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
  case AMDGPU::BI__builtin_amdgcn_rcp:
  case AMDGPU::BI__builtin_amdgcn_rcpf:
  case AMDGPU::BI__builtin_amdgcn_rcph:
  case AMDGPU::BI__builtin_amdgcn_rcp_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp);
  case AMDGPU::BI__builtin_amdgcn_sqrt:
  case AMDGPU::BI__builtin_amdgcn_sqrtf:
  case AMDGPU::BI__builtin_amdgcn_sqrth:
  case AMDGPU::BI__builtin_amdgcn_sqrt_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_sqrt);
  case AMDGPU::BI__builtin_amdgcn_rsq:
  case AMDGPU::BI__builtin_amdgcn_rsqf:
  case AMDGPU::BI__builtin_amdgcn_rsqh:
  case AMDGPU::BI__builtin_amdgcn_rsq_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
  case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
  case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_rsq_clamp);
  case AMDGPU::BI__builtin_amdgcn_sinf:
  case AMDGPU::BI__builtin_amdgcn_sinh:
  case AMDGPU::BI__builtin_amdgcn_sin_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin);
  case AMDGPU::BI__builtin_amdgcn_cosf:
  case AMDGPU::BI__builtin_amdgcn_cosh:
  case AMDGPU::BI__builtin_amdgcn_cos_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos);
  case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
    return EmitAMDGPUDispatchPtr(*this, E);
  case AMDGPU::BI__builtin_amdgcn_logf:
  case AMDGPU::BI__builtin_amdgcn_log_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
  case AMDGPU::BI__builtin_amdgcn_exp2f:
  case AMDGPU::BI__builtin_amdgcn_exp2_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_exp2);
  case AMDGPU::BI__builtin_amdgcn_log_clampf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_log_clamp);
  case AMDGPU::BI__builtin_amdgcn_ldexp:
  case AMDGPU::BI__builtin_amdgcn_ldexpf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
    return Builder.CreateCall(F, {Src0, Src1});
  }
  case AMDGPU::BI__builtin_amdgcn_ldexph: {
    // The raw instruction has a different behavior for out-of-bounds exponent
    // values (implicit truncation instead of saturating to
    // short_min/short_max).
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
    return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
  }
  case AMDGPU::BI__builtin_amdgcn_frexp_mant:
  case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
  case AMDGPU::BI__builtin_amdgcn_frexp_manth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_frexp_mant);
  case AMDGPU::BI__builtin_amdgcn_frexp_exp:
  case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                   {Builder.getInt32Ty(), Src0->getType()});
    return Builder.CreateCall(F, Src0);
  }
  case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                   {Builder.getInt16Ty(), Src0->getType()});
    return Builder.CreateCall(F, Src0);
  }
  case AMDGPU::BI__builtin_amdgcn_fract:
  case AMDGPU::BI__builtin_amdgcn_fractf:
  case AMDGPU::BI__builtin_amdgcn_fracth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_fract);
  case AMDGPU::BI__builtin_amdgcn_lerp:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_lerp);
  case AMDGPU::BI__builtin_amdgcn_ubfe:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_ubfe);
  case AMDGPU::BI__builtin_amdgcn_sbfe:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_sbfe);
  case AMDGPU::BI__builtin_amdgcn_ballot_w32:
  case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
    llvm::Type *ResultType = ConvertType(E->getType());
    llvm::Value *Src = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {ResultType});
    return Builder.CreateCall(F, {Src});
  }
  case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
  case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
    llvm::Value *Src = EmitScalarExpr(E->getArg(0));
    Function *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, {Src->getType()});
    return Builder.CreateCall(F, {Src});
  }
  case AMDGPU::BI__builtin_amdgcn_tanhf:
  case AMDGPU::BI__builtin_amdgcn_tanhh:
  case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_tanh);
  case AMDGPU::BI__builtin_amdgcn_uicmp:
  case AMDGPU::BI__builtin_amdgcn_uicmpl:
  case AMDGPU::BI__builtin_amdgcn_sicmp:
  case AMDGPU::BI__builtin_amdgcn_sicmpl: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));

    // FIXME-GFX10: How should the 32-bit mask be handled?
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
                                   {Builder.getInt64Ty(), Src0->getType()});
    return Builder.CreateCall(F, {Src0, Src1, Src2});
  }
  case AMDGPU::BI__builtin_amdgcn_fcmp:
  case AMDGPU::BI__builtin_amdgcn_fcmpf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));

    // FIXME-GFX10: How should the 32-bit mask be handled?
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
                                   {Builder.getInt64Ty(), Src0->getType()});
    return Builder.CreateCall(F, {Src0, Src1, Src2});
  }
  case AMDGPU::BI__builtin_amdgcn_class:
  case AMDGPU::BI__builtin_amdgcn_classf:
  case AMDGPU::BI__builtin_amdgcn_classh:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
  case AMDGPU::BI__builtin_amdgcn_fmed3f:
  case AMDGPU::BI__builtin_amdgcn_fmed3h:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_fmed3);
  case AMDGPU::BI__builtin_amdgcn_ds_append:
  case AMDGPU::BI__builtin_amdgcn_ds_consume: {
    Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append
                               ? Intrinsic::amdgcn_ds_append
                               : Intrinsic::amdgcn_ds_consume;
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrin, {Src0->getType()});
    return Builder.CreateCall(F, {Src0, Builder.getFalse()});
  }
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_global_load_tr_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
      IID = Intrinsic::amdgcn_global_load_tr_b128;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_global_load_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_global_load_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_load_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_ds_load_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_load_tr8_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
      IID = Intrinsic::amdgcn_ds_load_tr16_b128;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_read_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_read_tr8_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_ds_read_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
      IID = Intrinsic::amdgcn_ds_read_tr16_b64;
      break;
    }
    llvm::Type *LoadTy = ConvertType(E->getType());
    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
    llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
    return Builder.CreateCall(F, {Addr});
  }
  case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
  case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
  case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
  case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
  case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
  case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
      IID = Intrinsic::amdgcn_global_load_monitor_b32;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
      IID = Intrinsic::amdgcn_global_load_monitor_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
      IID = Intrinsic::amdgcn_global_load_monitor_b128;
      break;
    case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
      IID = Intrinsic::amdgcn_flat_load_monitor_b32;
      break;
    case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
      IID = Intrinsic::amdgcn_flat_load_monitor_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128:
      IID = Intrinsic::amdgcn_flat_load_monitor_b128;
      break;
    }

    LLVMContext &Ctx = CGM.getLLVMContext();
    llvm::Type *LoadTy = ConvertType(E->getType());
    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));

    auto *AOExpr = cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(1)));
    auto *ScopeExpr = cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(2)));

    auto Scope = static_cast<SyncScope>(ScopeExpr->getZExtValue());
    llvm::AtomicOrdering AO = mapCABIAtomicOrdering(AOExpr->getZExtValue());

    StringRef ScopeStr = CGM.getTargetCodeGenInfo().getLLVMSyncScopeStr(
        CGM.getLangOpts(), Scope, AO);

    llvm::MDNode *MD =
        llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, ScopeStr)});
    llvm::Value *ScopeMD = llvm::MetadataAsValue::get(Ctx, MD);
    llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
    return Builder.CreateCall(F, {Addr, AOExpr, ScopeMD});
  }
  case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
  case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
  case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
      IID = Intrinsic::amdgcn_cluster_load_b32;
      break;
    case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
      IID = Intrinsic::amdgcn_cluster_load_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_cluster_load_b128:
      IID = Intrinsic::amdgcn_cluster_load_b128;
      break;
    }
    SmallVector<Value *, 3> Args;
    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));
    llvm::Function *F = CGM.getIntrinsic(IID, {ConvertType(E->getType())});
    return Builder.CreateCall(F, {Args});
  }
  case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
    // Should this have asan instrumentation?
    return emitBuiltinWithOneOverloadedType<5>(*this, E,
                                               Intrinsic::amdgcn_load_to_lds);
  }
  case AMDGPU::BI__builtin_amdgcn_load_async_to_lds: {
    // Should this have asan instrumentation?
    return emitBuiltinWithOneOverloadedType<5>(
        *this, E, Intrinsic::amdgcn_load_async_to_lds);
  }
  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
      IID = Intrinsic::amdgcn_cooperative_atomic_load_32x4B;
      break;
    case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
      IID = Intrinsic::amdgcn_cooperative_atomic_store_32x4B;
      break;
    case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
      IID = Intrinsic::amdgcn_cooperative_atomic_load_16x8B;
      break;
    case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
      IID = Intrinsic::amdgcn_cooperative_atomic_store_16x8B;
      break;
    case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
      IID = Intrinsic::amdgcn_cooperative_atomic_load_8x16B;
      break;
    case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B:
      IID = Intrinsic::amdgcn_cooperative_atomic_store_8x16B;
      break;
    }

    LLVMContext &Ctx = CGM.getLLVMContext();
    SmallVector<Value *, 5> Args;
    // The last argument is an MD string.
    const unsigned ScopeArg = E->getNumArgs() - 1;
    for (unsigned i = 0; i != ScopeArg; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));
    StringRef Arg = cast<StringLiteral>(E->getArg(ScopeArg)->IgnoreParenCasts())
                        ->getString();
    llvm::MDNode *MD = llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, Arg)});
    Args.push_back(llvm::MetadataAsValue::get(Ctx, MD));
    // The intrinsic is typed based on the pointer AS; the pointer is always
    // the first argument.
    llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()});
    return Builder.CreateCall(F, {Args});
  }
  case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
    Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
                                   {llvm::Type::getInt64Ty(getLLVMContext())});
    return Builder.CreateCall(F);
  }
  case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
    Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
                                   {llvm::Type::getInt64Ty(getLLVMContext())});
    llvm::Value *Env = EmitScalarExpr(E->getArg(0));
    return Builder.CreateCall(F, {Env});
  }
  case AMDGPU::BI__builtin_amdgcn_read_exec:
    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
  case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
    return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
  case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
    llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
    llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));

    // The builtins take these arguments as vec4 where the last element is
    // ignored. The intrinsic takes them as vec3.
    RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin, {0, 1, 2});
    RayDir = Builder.CreateShuffleVector(RayDir, RayDir, {0, 1, 2});
    RayInverseDir =
        Builder.CreateShuffleVector(RayInverseDir, RayInverseDir, {0, 1, 2});

    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
                                   {NodePtr->getType(), RayDir->getType()});
    return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
                                  RayInverseDir, TextureDescr});
  }
  case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
      IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
      break;
    case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
      IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
      break;
    }
    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
    llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
    llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
    llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));

    Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
    Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));

    llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);

    llvm::CallInst *CI = Builder.CreateCall(
        IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
                        Offset, TextureDescr});

    llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
    llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
    llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);

    Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
    Builder.CreateStore(RetRayDir, RetRayDirPtr);

    return RetVData;
  }

  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
      break;
    }

    SmallVector<Value *, 4> Args;
    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));

    Function *F = CGM.getIntrinsic(IID);
    Value *Call = Builder.CreateCall(F, Args);
    Value *Rtn = Builder.CreateExtractValue(Call, 0);
    Value *A = Builder.CreateExtractValue(Call, 1);
    llvm::Type *RetTy = ConvertType(E->getType());
    Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
                                            (uint64_t)0);
    // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
    // <2 x i64>; zext the second value.
    if (A->getType()->getPrimitiveSizeInBits() <
        RetTy->getScalarType()->getPrimitiveSizeInBits())
      A = Builder.CreateZExt(A, RetTy->getScalarType());

    return Builder.CreateInsertElement(I0, A, 1);
  }
  case AMDGPU::BI__builtin_amdgcn_image_load_1d_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_1d_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_load_1d, false);
  case AMDGPU::BI__builtin_amdgcn_image_load_1darray_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_1darray_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_load_1darray, false);
  case AMDGPU::BI__builtin_amdgcn_image_load_2d_f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_2d_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_2d_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_load_2d, false);
  case AMDGPU::BI__builtin_amdgcn_image_load_2darray_f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_2darray_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_2darray_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_load_2darray, false);
  case AMDGPU::BI__builtin_amdgcn_image_load_3d_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_3d_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_load_3d, false);
  case AMDGPU::BI__builtin_amdgcn_image_load_cube_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_cube_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_load_cube, false);
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_1d_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_1d_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_load_mip_1d, false);
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_1darray_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_1darray_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_load_mip_1darray, false);
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_2d_f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_2d_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_2d_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_load_mip_2d, false);
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_2darray_f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_2darray_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_2darray_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_load_mip_2darray, false);
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_3d_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_3d_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_load_mip_3d, false);
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_cube_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_load_mip_cube_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_load_mip_cube, false);
  case AMDGPU::BI__builtin_amdgcn_image_store_1d_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_1d_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_store_1d, true);
  case AMDGPU::BI__builtin_amdgcn_image_store_1darray_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_1darray_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_store_1darray, true);
  case AMDGPU::BI__builtin_amdgcn_image_store_2d_f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_2d_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_2d_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_store_2d, true);
  case AMDGPU::BI__builtin_amdgcn_image_store_2darray_f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_2darray_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_2darray_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_store_2darray, true);
  case AMDGPU::BI__builtin_amdgcn_image_store_3d_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_3d_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_store_3d, true);
  case AMDGPU::BI__builtin_amdgcn_image_store_cube_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_cube_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_store_cube, true);
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_1d_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_1d_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_store_mip_1d, true);
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_1darray_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_1darray_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_store_mip_1darray, true);
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_2d_f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_2d_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_2d_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_store_mip_2d, true);
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_2darray_f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_2darray_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_2darray_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_store_mip_2darray, true);
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_3d_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_3d_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_store_mip_3d, true);
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_cube_v4f32_i32:
  case AMDGPU::BI__builtin_amdgcn_image_store_mip_cube_v4f16_i32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_store_mip_cube, true);
  case AMDGPU::BI__builtin_amdgcn_image_sample_1d_v4f32_f32:
  case AMDGPU::BI__builtin_amdgcn_image_sample_1d_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_1d, false);
  case AMDGPU::BI__builtin_amdgcn_image_sample_1darray_v4f32_f32:
  case AMDGPU::BI__builtin_amdgcn_image_sample_1darray_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_1darray, false);
  case AMDGPU::BI__builtin_amdgcn_image_sample_2d_f32_f32:
  case AMDGPU::BI__builtin_amdgcn_image_sample_2d_v4f32_f32:
  case AMDGPU::BI__builtin_amdgcn_image_sample_2d_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_2d, false);
  case AMDGPU::BI__builtin_amdgcn_image_sample_2darray_f32_f32:
  case AMDGPU::BI__builtin_amdgcn_image_sample_2darray_v4f32_f32:
  case AMDGPU::BI__builtin_amdgcn_image_sample_2darray_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_2darray, false);
  case AMDGPU::BI__builtin_amdgcn_image_sample_3d_v4f32_f32:
  case AMDGPU::BI__builtin_amdgcn_image_sample_3d_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_3d, false);
  case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f32_f32:
  case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_cube, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_lz_1d, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_l_1d, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_d_1d, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f16_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_f32_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_lz_2d, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f16_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_f32_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_l_2d, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f16_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_f32_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_d_2d, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_lz_3d, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_l_3d, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_d_3d, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_lz_cube, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_l_cube, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32:
    return emitAMDGCNImageOverloadedReturnType(
        *this, E, Intrinsic::amdgcn_image_sample_lz_1darray, false);
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f32_f32:
  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f16_f32:
1219 return emitAMDGCNImageOverloadedReturnType(
1220 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_l_1darray, IsImageStore: false);
1221 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f32_f32:
1222 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f16_f32:
1223 return emitAMDGCNImageOverloadedReturnType(
1224 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_d_1darray, IsImageStore: false);
1225 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32:
1226 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32:
1227 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_f32_f32:
1228 return emitAMDGCNImageOverloadedReturnType(
1229 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_lz_2darray, IsImageStore: false);
1230 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f32_f32:
1231 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f16_f32:
1232 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_f32_f32:
1233 return emitAMDGCNImageOverloadedReturnType(
1234 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_l_2darray, IsImageStore: false);
1235 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f32_f32:
1236 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f16_f32:
1237 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_f32_f32:
1238 return emitAMDGCNImageOverloadedReturnType(
1239 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_d_2darray, IsImageStore: false);
1240 case clang::AMDGPU::BI__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32:
1241 return emitAMDGCNImageOverloadedReturnType(
1242 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_gather4_lz_2d, IsImageStore: false);
  case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
  case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
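    // Both builtins take their A and B matrices as 8 x i32 packed data; the
    // intrinsic is overloaded on those two operand types, so this lowers to
    // a call along the lines of (illustrative):
    //   llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(...)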
    llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8);
    Function *F = CGM.getIntrinsic(
        BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
            ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
            : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
        {VT, VT});

    SmallVector<Value *, 9> Args;
    for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
      Args.push_back(EmitScalarExpr(E->getArg(I)));
    return Builder.CreateCall(F, Args);
  }
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
  // GFX1250 WMMA builtins
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
  case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
  case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
  case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
  case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: {
    // These operations perform a matrix multiplication and accumulation of
    // the form:
    //     D = A * B + C
    // We need to specify one type for the A/B matrices and one for the C/D
    // matrices. Sparse matrix operations can have different types for A and
    // B, as well as an additional type for the sparsity index.
    // The destination type must be listed before the types used for the
    // source operands.
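    // For example (illustrative): for __builtin_amdgcn_swmmac_f32_16x16x32_f16
    // below, ArgsForMatchingMatrixTypes is {2, 0, 1, 3}, so the intrinsic is
    // overloaded on the types of argument 2 (C/D), argument 0 (A), argument 1
    // (B), and argument 3 (the sparsity index), in that order, producing a
    // mangled callee along the lines of
    //   llvm.amdgcn.swmmac.f32.16x16x32.f16.<CD>.<A>.<B>.<Idx>
    // where the concrete vector types depend on the wavefront size.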
    SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
    // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
    // There is no need for the variable opsel argument, so always set it to
    // "false".
    bool AppendFalseForOpselArg = false;
    unsigned BuiltinWMMAOp;
    // Need return type when D and C are of different types.
    bool NeedReturnType = false;

    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
      AppendFalseForOpselArg = true;
      [[fallthrough]];
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
      AppendFalseForOpselArg = true;
      [[fallthrough]];
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
      break;
    // GFX1250 WMMA builtins
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
      ArgsForMatchingMatrixTypes = {5, 1};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
      ArgsForMatchingMatrixTypes = {5, 1};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
      ArgsForMatchingMatrixTypes = {5, 1};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
      ArgsForMatchingMatrixTypes = {5, 1};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
      ArgsForMatchingMatrixTypes = {5, 1};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
      NeedReturnType = true;
      ArgsForMatchingMatrixTypes = {1, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
      ArgsForMatchingMatrixTypes = {3, 0};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
      ArgsForMatchingMatrixTypes = {4, 1};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
      ArgsForMatchingMatrixTypes = {5, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
      ArgsForMatchingMatrixTypes = {5, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
      ArgsForMatchingMatrixTypes = {5, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
      ArgsForMatchingMatrixTypes = {3, 0, 1};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
      ArgsForMatchingMatrixTypes = {3, 0, 1};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4;
      break;
    case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
      ArgsForMatchingMatrixTypes = {3, 0, 1};
      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8;
      break;
    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8:
      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8;
      break;
    }

    SmallVector<Value *, 6> Args;
    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));
    if (AppendFalseForOpselArg)
      Args.push_back(Builder.getFalse());

    // Handle the optional clamp argument of the following two builtins.
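    // Per the size checks below, if the trailing clamp operand was omitted at
    // the source level it defaults to false; in either case it is normalized
    // to the i1 the intrinsic expects.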
    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8) {
      if (Args.size() == 7)
        Args.push_back(Builder.getFalse());
      assert(Args.size() == 8 && "Expected 8 arguments");
      Args[7] = Builder.CreateZExtOrTrunc(Args[7], Builder.getInt1Ty());
    } else if (BuiltinID ==
               AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8) {
      if (Args.size() == 8)
        Args.push_back(Builder.getFalse());
      assert(Args.size() == 9 && "Expected 9 arguments");
      Args[8] = Builder.CreateZExtOrTrunc(Args[8], Builder.getInt1Ty());
    }

    SmallVector<llvm::Type *, 6> ArgTypes;
    if (NeedReturnType)
      ArgTypes.push_back(ConvertType(E->getType()));
    for (auto ArgIdx : ArgsForMatchingMatrixTypes)
      ArgTypes.push_back(Args[ArgIdx]->getType());

    Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
    return Builder.CreateCall(F, Args);
  }
  // amdgcn workgroup size
  case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
    return EmitAMDGPUWorkGroupSize(*this, 0);
  case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
    return EmitAMDGPUWorkGroupSize(*this, 1);
  case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
    return EmitAMDGPUWorkGroupSize(*this, 2);

  // amdgcn grid size
  case AMDGPU::BI__builtin_amdgcn_grid_size_x:
    return EmitAMDGPUGridSize(*this, 0);
  case AMDGPU::BI__builtin_amdgcn_grid_size_y:
    return EmitAMDGPUGridSize(*this, 1);
  case AMDGPU::BI__builtin_amdgcn_grid_size_z:
    return EmitAMDGPUGridSize(*this, 2);

  // r600 intrinsics
  case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
  case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::r600_recipsqrt_ieee);
  case AMDGPU::BI__builtin_amdgcn_alignbit: {
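    // alignbit selects 32 contiguous bits from the 64-bit concatenation
    // (Src0:Src1) starting at bit Src2, which is exactly a funnel shift
    // right, e.g. (illustrative):
    //   %r = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)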
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
    Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
    return Builder.CreateCall(F, {Src0, Src1, Src2});
  }
  case AMDGPU::BI__builtin_amdgcn_fence: {
    ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)),
                            EmitScalarExpr(E->getArg(1)), AO, SSID);
    FenceInst *Fence = Builder.CreateFence(AO, SSID);
    if (E->getNumArgs() > 2)
      AddAMDGPUFenceAddressSpaceMMRA(Fence, E);
    return Fence;
  }
  case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
  case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
  case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
  case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
  case AMDGPU::BI__builtin_amdgcn_ds_fminf:
  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
    llvm::AtomicRMWInst::BinOp BinOp;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
    case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
      BinOp = llvm::AtomicRMWInst::UIncWrap;
      break;
    case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
    case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
      BinOp = llvm::AtomicRMWInst::UDecWrap;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_faddf:
    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
      BinOp = llvm::AtomicRMWInst::FAdd;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_fminf:
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
      BinOp = llvm::AtomicRMWInst::FMin;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
    case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
      BinOp = llvm::AtomicRMWInst::FMax;
      break;
    }

    Address Ptr = CheckAtomicAlignment(*this, E);
    Value *Val = EmitScalarExpr(E->getArg(1));
    llvm::Type *OrigTy = Val->getType();
    QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();

    bool Volatile;

    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
      // __builtin_amdgcn_ds_faddf/fminf/fmaxf have an explicit volatile
      // argument.
      Volatile =
          cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
    } else {
      // Infer volatile from the passed type.
      Volatile =
          PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
    }

    if (E->getNumArgs() >= 4) {
      // Some of the builtins have explicit ordering and scope arguments.
      ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                              EmitScalarExpr(E->getArg(3)), AO, SSID);
    } else {
      // Most of the builtins do not have syncscope/order arguments. For DS
      // atomics the scope doesn't really matter, as they implicitly operate
      // at workgroup scope.
      //
      // The global/flat cases need to use agent scope to consistently produce
      // the native instruction instead of a cmpxchg expansion.
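      // Illustratively, the IR emitted for the global case then looks like:
      //   atomicrmw fadd ptr addrspace(1) %ptr, float %val
      //       syncscope("agent") monotonic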
      if (getTarget().getTriple().isSPIRV())
        SSID = getLLVMContext().getOrInsertSyncScopeID("device");
      else
        SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
      AO = AtomicOrdering::Monotonic;

      // The v2bf16 builtin uses i16 instead of a natural bfloat type.
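      // e.g. (illustrative, assuming the source-level operand is lowered as
      // <2 x i16>): %val = bitcast <2 x i16> %v to <2 x bfloat>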
      if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
          BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
          BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
        llvm::Type *V2BF16Ty = FixedVectorType::get(
            llvm::Type::getBFloatTy(Builder.getContext()), 2);
        Val = Builder.CreateBitCast(Val, V2BF16Ty);
      }
    }

    llvm::AtomicRMWInst *RMW =
        Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
    if (Volatile)
      RMW->setVolatile(true);

    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
      // Most targets require "amdgpu.no.fine.grained.memory" to emit the
      // native instruction for flat and global operations.
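      // The annotation is an empty metadata tuple attached to the atomicrmw;
      // illustratively:
      //   atomicrmw ... !amdgpu.no.fine.grained.memory !0   ; with !0 = !{}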
      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);

      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
      // instruction, but this only matters for float fadd.
      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
    }

    return Builder.CreateBitCast(RMW, OrigTy);
  }
  case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
  case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
    llvm::Type *ResultType = ConvertType(E->getType());
    // s_sendmsg_rtn is mangled using the return type only.
    Function *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
    return Builder.CreateCall(F, {Arg});
  }
  case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
  case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
    // Because builtin types are limited, and the intrinsic uses a struct/pair
    // output, marshal the pair-of-i32 to <2 x i32>.
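    // Illustratively, the lowering becomes:
    //   %pair = call { i32, i32 } @llvm.amdgcn.permlane16.swap(...)
    //   %e0 = extractvalue { i32, i32 } %pair, 0
    //   %e1 = extractvalue { i32, i32 } %pair, 1
    // followed by two insertelements into a poison <2 x i32>.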
    Value *VDstOld = EmitScalarExpr(E->getArg(0));
    Value *VSrcOld = EmitScalarExpr(E->getArg(1));
    Value *FI = EmitScalarExpr(E->getArg(2));
    Value *BoundCtrl = EmitScalarExpr(E->getArg(3));
    Function *F = CGM.getIntrinsic(
        BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
            ? Intrinsic::amdgcn_permlane16_swap
            : Intrinsic::amdgcn_permlane32_swap);
    llvm::CallInst *Call =
        Builder.CreateCall(F, {VDstOld, VSrcOld, FI, BoundCtrl});

    llvm::Value *Elt0 = Builder.CreateExtractValue(Call, 0);
    llvm::Value *Elt1 = Builder.CreateExtractValue(Call, 1);

    llvm::Type *ResultType = ConvertType(E->getType());

    llvm::Value *Insert0 = Builder.CreateInsertElement(
        llvm::PoisonValue::get(ResultType), Elt0, UINT64_C(0));
    llvm::Value *AsVector =
        Builder.CreateInsertElement(Insert0, Elt1, UINT64_C(1));
    return AsVector;
  }
  case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
  case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
    return emitBuiltinWithOneOverloadedType<4>(*this, E,
                                               Intrinsic::amdgcn_bitop3);
  case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
    // TODO: LLVM has this overloaded to allow for fat pointers, but since
    // those haven't been plumbed through to Clang yet, default to creating
    // the resource type.
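    // The result is a buffer resource (descriptor) pointer; illustratively
    // (the operand types shown are an assumption, not checked by this code):
    //   %rsrc = call ptr addrspace(8)
    //       @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %base, i16 %stride,
    //                                           i32 %numrecords, i32 %flags)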
    SmallVector<Value *, 4> Args;
    for (unsigned I = 0; I < 4; ++I)
      Args.push_back(EmitScalarExpr(E->getArg(I)));
    llvm::PointerType *RetTy = llvm::PointerType::get(
        Builder.getContext(), llvm::AMDGPUAS::BUFFER_RESOURCE);
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
                                   {RetTy, Args[0]->getType()});
    return Builder.CreateCall(F, Args);
  }
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
    return emitBuiltinWithOneOverloadedType<5>(
        *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store);
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
  case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
    llvm::Type *RetTy = nullptr;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
      RetTy = Int8Ty;
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
      RetTy = Int16Ty;
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
      RetTy = Int32Ty;
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/2);
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/3);
      break;
    case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
      RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/4);
      break;
    }
    Function *F =
        CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy);
    return Builder.CreateCall(
        F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
            EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
  }
  case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32:
    return emitBuiltinWithOneOverloadedType<5>(
        *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_add);
  case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32:
  case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16:
    return emitBuiltinWithOneOverloadedType<5>(
        *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd);
  case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f32:
  case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f64:
    return emitBuiltinWithOneOverloadedType<5>(
        *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin);
  case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f32:
  case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f64:
    return emitBuiltinWithOneOverloadedType<5>(
        *this, E, Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax);
  case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
    return emitBuiltinWithOneOverloadedType<2>(
        *this, E, Intrinsic::amdgcn_s_prefetch_data);
  case Builtin::BIlogbf:
  case Builtin::BI__builtin_logbf: {
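    // Expand logbf via frexp: frexp decomposes x into m * 2^exp with |m| in
    // [0.5, 1), so for finite nonzero x, logb(x) == exp - 1. The selects
    // below patch up the special cases: |x| for +/-infinity (and NaN, which
    // fails the ordered compare), and -infinity for x == 0.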
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *FrExpFunc = CGM.getIntrinsic(
        Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
    CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
    Value *Exp = Builder.CreateExtractValue(FrExp, 1);
    Value *Add = Builder.CreateAdd(
        Exp, ConstantInt::getSigned(Exp->getType(), -1), "",
        /*HasNUW=*/false, /*HasNSW=*/true);
    Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getFloatTy());
    Value *Fabs =
        emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
    Value *FCmpONE = Builder.CreateFCmpONE(
        Fabs, ConstantFP::getInfinity(Builder.getFloatTy()));
    Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
    Value *FCmpOEQ = Builder.CreateFCmpOEQ(
        Src0, ConstantFP::getZero(Builder.getFloatTy()));
    Value *Sel2 = Builder.CreateSelect(
        FCmpOEQ,
        ConstantFP::getInfinity(Builder.getFloatTy(), /*Negative=*/true),
        Sel1);
    return Sel2;
  }
  case Builtin::BIlogb:
  case Builtin::BI__builtin_logb: {
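    // Same frexp-based expansion as logbf above, in double precision.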
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *FrExpFunc = CGM.getIntrinsic(
        Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()});
    CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0);
    Value *Exp = Builder.CreateExtractValue(FrExp, 1);
    Value *Add = Builder.CreateAdd(
        Exp, ConstantInt::getSigned(Exp->getType(), -1), "",
        /*HasNUW=*/false, /*HasNSW=*/true);
    Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getDoubleTy());
    Value *Fabs =
        emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs);
    Value *FCmpONE = Builder.CreateFCmpONE(
        Fabs, ConstantFP::getInfinity(Builder.getDoubleTy()));
    Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs);
    Value *FCmpOEQ = Builder.CreateFCmpOEQ(
        Src0, ConstantFP::getZero(Builder.getDoubleTy()));
    Value *Sel2 = Builder.CreateSelect(
        FCmpOEQ,
        ConstantFP::getInfinity(Builder.getDoubleTy(), /*Negative=*/true),
        Sel1);
    return Sel2;
  }
  case Builtin::BIscalbnf:
  case Builtin::BI__builtin_scalbnf:
  case Builtin::BIscalbn:
  case Builtin::BI__builtin_scalbn:
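    // scalbn(x, n) is ldexp(x, n) whenever FLT_RADIX is 2, which holds for
    // these targets, so emit a (possibly constrained) ldexp directly.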
    return emitBinaryExpMaybeConstrainedFPBuiltin(
        *this, E, Intrinsic::ldexp,
        Intrinsic::experimental_constrained_ldexp);
  default:
    return nullptr;
  }
}