1//===------- AMDCPU.cpp - Emit LLVM Code for builtins ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This contains code to emit Builtin calls as LLVM code.
10//
11//===----------------------------------------------------------------------===//
12
13#include "CGBuiltin.h"
14#include "CodeGenFunction.h"
15#include "TargetInfo.h"
16#include "clang/Basic/DiagnosticFrontend.h"
17#include "clang/Basic/SyncScope.h"
18#include "clang/Basic/TargetBuiltins.h"
19#include "llvm/Analysis/ValueTracking.h"
20#include "llvm/CodeGen/MachineFunction.h"
21#include "llvm/IR/IntrinsicsAMDGPU.h"
22#include "llvm/IR/IntrinsicsR600.h"
23#include "llvm/IR/IntrinsicsSPIRV.h"
24#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
25#include "llvm/Support/AMDGPUAddrSpace.h"
26#include "llvm/Support/AtomicOrdering.h"
27
28using namespace clang;
29using namespace CodeGen;
30using namespace llvm;
31
32namespace {
33
34// Has second type mangled argument.
35static Value *
36emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
37 Intrinsic::ID IntrinsicID,
38 Intrinsic::ID ConstrainedIntrinsicID) {
39 llvm::Value *Src0 = CGF.EmitScalarExpr(E: E->getArg(Arg: 0));
40 llvm::Value *Src1 = CGF.EmitScalarExpr(E: E->getArg(Arg: 1));
41
42 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
43 if (CGF.Builder.getIsFPConstrained()) {
44 Function *F = CGF.CGM.getIntrinsic(IID: ConstrainedIntrinsicID,
45 Tys: {Src0->getType(), Src1->getType()});
46 return CGF.Builder.CreateConstrainedFPCall(Callee: F, Args: {Src0, Src1});
47 }
48
49 Function *F =
50 CGF.CGM.getIntrinsic(IID: IntrinsicID, Tys: {Src0->getType(), Src1->getType()});
51 return CGF.Builder.CreateCall(Callee: F, Args: {Src0, Src1});
52}
53
54// If \p E is not null pointer, insert address space cast to match return
55// type of \p E if necessary.
56Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
57 const CallExpr *E = nullptr) {
58 auto *F = CGF.CGM.getIntrinsic(IID: Intrinsic::amdgcn_dispatch_ptr);
59 auto *Call = CGF.Builder.CreateCall(Callee: F);
60 if (!E)
61 return Call;
62 QualType BuiltinRetType = E->getType();
63 auto *RetTy = cast<llvm::PointerType>(Val: CGF.ConvertType(T: BuiltinRetType));
64 if (RetTy == Call->getType())
65 return Call;
66 return CGF.Builder.CreateAddrSpaceCast(V: Call, DestTy: RetTy);
67}
68
69Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
70 auto *F = CGF.CGM.getIntrinsic(IID: Intrinsic::amdgcn_implicitarg_ptr);
71 auto *Call = CGF.Builder.CreateCall(Callee: F);
72 Call->addRetAttr(
73 Attr: Attribute::getWithDereferenceableBytes(Context&: Call->getContext(), Bytes: 256));
74 Call->addRetAttr(Attr: Attribute::getWithAlignment(Context&: Call->getContext(), Alignment: Align(8)));
75 return Call;
76}
77
78static llvm::Intrinsic::ID getAMDGPUWorkGroupID(CodeGenFunction &CGF,
79 unsigned Index) {
80 switch (Index) {
81 case 0:
82 return llvm::Intrinsic::amdgcn_workgroup_id_x;
83 case 1:
84 return llvm::Intrinsic::amdgcn_workgroup_id_y;
85 case 2:
86 return llvm::Intrinsic::amdgcn_workgroup_id_z;
87 default:
88 llvm_unreachable("unhandled index");
89 }
90}
91
92static void setNoundefInvariantLoad(llvm::LoadInst *Ld) {
93 Ld->setMetadata(KindID: llvm::LLVMContext::MD_noundef,
94 Node: llvm::MDNode::get(Context&: Ld->getContext(), MDs: {}));
95 Ld->setMetadata(KindID: llvm::LLVMContext::MD_invariant_load,
96 Node: llvm::MDNode::get(Context&: Ld->getContext(), MDs: {}));
97}
98
99static void addMaxWorkGroupSizeRangeMetadata(CodeGenFunction &CGF,
100 llvm::LoadInst *GroupSize) {
101 llvm::MDBuilder MDHelper(CGF.getLLVMContext());
102 llvm::MDNode *RNode = MDHelper.createRange(
103 Lo: APInt(16, 1), Hi: APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
104 GroupSize->setMetadata(KindID: llvm::LLVMContext::MD_range, Node: RNode);
105 setNoundefInvariantLoad(GroupSize);
106}
107
108static Value *emitAMDGPUWorkGroupSizeV5(CodeGenFunction &CGF, unsigned Index) {
109 llvm::Value *ImplicitArgPtr = EmitAMDGPUImplicitArgPtr(CGF);
110
111 // offsetof(amdhsa_implicit_kernarg_v5, block_count[Index])
112 unsigned BlockCountOffset = 0 + Index * 4;
113 // offsetof(amdhsa_implicit_kernarg_v5, group_size[Index])
114 unsigned GroupSizeOffset = 12 + Index * 2;
115 // offsetof(amdhsa_implicit_kernarg_v5, remainder[Index])
116 unsigned RemainderOffset = 18 + Index * 2;
117
118 if (CGF.CGM.getLangOpts().OffloadUniformBlock) {
119 // Indexing the implicit kernarg segment.
120 llvm::Value *GroupSizeGEP = CGF.Builder.CreateConstInBoundsGEP1_64(
121 Ty: CGF.Int8Ty, Ptr: ImplicitArgPtr, Idx0: GroupSizeOffset);
122 llvm::LoadInst *GroupSize = CGF.Builder.CreateLoad(
123 Addr: Address(GroupSizeGEP, CGF.Int16Ty, CharUnits::fromQuantity(Quantity: 2)));
124
125 addMaxWorkGroupSizeRangeMetadata(CGF, GroupSize);
126
127 return CGF.Builder.CreateZExt(V: GroupSize, DestTy: CGF.Int32Ty);
128 }
129
130 llvm::Value *BlockCountGEP = CGF.Builder.CreateConstGEP1_64(
131 Ty: CGF.Int8Ty, Ptr: ImplicitArgPtr, Idx0: BlockCountOffset);
132 llvm::LoadInst *BlockCount = CGF.Builder.CreateLoad(
133 Addr: Address(BlockCountGEP, CGF.Int32Ty, CharUnits::fromQuantity(Quantity: 4)));
134 setNoundefInvariantLoad(BlockCount);
135
136 llvm::Value *WorkgroupID =
137 CGF.Builder.CreateIntrinsic(ID: getAMDGPUWorkGroupID(CGF, Index), Args: {});
138 llvm::Value *IsFull = CGF.Builder.CreateICmpULT(LHS: WorkgroupID, RHS: BlockCount);
139
140 llvm::Value *StructOffset = CGF.Builder.CreateSelect(
141 C: IsFull, True: ConstantInt::get(Ty: CGF.Int32Ty, V: GroupSizeOffset),
142 False: ConstantInt::get(Ty: CGF.Int32Ty, V: RemainderOffset));
143
144 llvm::Value *SizeGEP =
145 CGF.Builder.CreateInBoundsGEP(Ty: CGF.Int8Ty, Ptr: ImplicitArgPtr, IdxList: StructOffset);
146 llvm::LoadInst *Size = CGF.Builder.CreateLoad(
147 Addr: Address(SizeGEP, CGF.Int16Ty, CharUnits::fromQuantity(Quantity: 2)));
148 addMaxWorkGroupSizeRangeMetadata(CGF, GroupSize: Size);
149 setNoundefInvariantLoad(Size);
150
151 return CGF.Builder.CreateZExt(V: Size, DestTy: CGF.Int32Ty);
152}
153
154static Value *emitAMDGPUWorkGroupSizeV4(CodeGenFunction &CGF, unsigned Index) {
155 llvm::Value *DispatchPtr = EmitAMDGPUDispatchPtr(CGF);
156
157 // Indexing the HSA kernel_dispatch_packet struct.
158 llvm::Value *GroupSizeGEP = CGF.Builder.CreateConstInBoundsGEP1_64(
159 Ty: CGF.Int8Ty, Ptr: DispatchPtr, Idx0: 4 + Index * 2);
160 llvm::LoadInst *GroupSizeLD = CGF.Builder.CreateLoad(
161 Addr: Address(GroupSizeGEP, CGF.Int16Ty, CharUnits::fromQuantity(Quantity: 2)));
162
163 addMaxWorkGroupSizeRangeMetadata(CGF, GroupSize: GroupSizeLD);
164
165 llvm::Value *GroupSize = CGF.Builder.CreateZExt(V: GroupSizeLD, DestTy: CGF.Int32Ty);
166
167 if (CGF.CGM.getLangOpts().OffloadUniformBlock)
168 return GroupSize;
169
170 llvm::Value *WorkgroupID =
171 CGF.Builder.CreateIntrinsic(ID: getAMDGPUWorkGroupID(CGF, Index), Args: {});
172
173 llvm::Value *GridSizeGEP = CGF.Builder.CreateConstInBoundsGEP1_64(
174 Ty: CGF.Int8Ty, Ptr: DispatchPtr, Idx0: 12 + Index * 4);
175 llvm::LoadInst *GridSize = CGF.Builder.CreateLoad(
176 Addr: Address(GridSizeGEP, CGF.Int32Ty, CharUnits::fromQuantity(Quantity: 4)));
177
178 llvm::MDBuilder MDB(CGF.getLLVMContext());
179
180 // Known non-zero.
181 GridSize->setMetadata(KindID: llvm::LLVMContext::MD_range,
182 Node: MDB.createRange(Lo: APInt(32, 1), Hi: APInt::getZero(numBits: 32)));
183 GridSize->setMetadata(KindID: llvm::LLVMContext::MD_invariant_load,
184 Node: llvm::MDNode::get(Context&: CGF.getLLVMContext(), MDs: {}));
185
186 llvm::Value *Mul = CGF.Builder.CreateMul(LHS: WorkgroupID, RHS: GroupSize);
187 llvm::Value *Remainder = CGF.Builder.CreateSub(LHS: GridSize, RHS: Mul);
188
189 llvm::Value *IsPartial = CGF.Builder.CreateICmpULT(LHS: Remainder, RHS: GroupSize);
190
191 return CGF.Builder.CreateSelect(C: IsPartial, True: Remainder, False: GroupSize);
192}
193
194// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
195/// Emit code based on Code Object ABI version.
196/// COV_4 : Emit code to use dispatch ptr
197/// COV_5+ : Emit code to use implicitarg ptr
198/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
199/// and use its value for COV_4 or COV_5+ approach. It is used for
200/// compiling device libraries in an ABI-agnostic way.
201Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
202 auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
203
204 // Do not emit __oclc_ABI_version references with non-empt environment.
205 if (Cov == CodeObjectVersionKind::COV_None &&
206 CGF.getTarget().getTriple().hasEnvironment())
207 Cov = CodeObjectVersionKind::COV_6;
208
209 if (Cov == CodeObjectVersionKind::COV_None) {
210 StringRef Name = "__oclc_ABI_version";
211 auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
212 if (!ABIVersionC)
213 ABIVersionC = new llvm::GlobalVariable(
214 CGF.CGM.getModule(), CGF.Int32Ty, false,
215 llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
216 llvm::GlobalVariable::NotThreadLocal,
217 CGF.CGM.getContext().getTargetAddressSpace(AS: LangAS::opencl_constant));
218
219 // This load will be eliminated by the IPSCCP because it is constant
220 // weak_odr without externally_initialized. Either changing it to weak or
221 // adding externally_initialized will keep the load.
222 Value *ABIVersion = CGF.Builder.CreateAlignedLoad(Ty: CGF.Int32Ty, Addr: ABIVersionC,
223 Align: CGF.CGM.getIntAlign());
224
225 Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
226 LHS: ABIVersion,
227 RHS: llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: CodeObjectVersionKind::COV_5));
228
229 llvm::Value *V5Impl = emitAMDGPUWorkGroupSizeV5(CGF, Index);
230 llvm::Value *V4Impl = emitAMDGPUWorkGroupSizeV4(CGF, Index);
231 return CGF.Builder.CreateSelect(C: IsCOV5, True: V5Impl, False: V4Impl);
232 }
233
234 return Cov >= CodeObjectVersionKind::COV_5
235 ? emitAMDGPUWorkGroupSizeV5(CGF, Index)
236 : emitAMDGPUWorkGroupSizeV4(CGF, Index);
237}
238
239// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
240Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
241 const unsigned XOffset = 12;
242 auto *DP = EmitAMDGPUDispatchPtr(CGF);
243 // Indexing the HSA kernel_dispatch_packet struct.
244 auto *Offset = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: XOffset + Index * 4);
245 auto *GEP = CGF.Builder.CreateGEP(Ty: CGF.Int8Ty, Ptr: DP, IdxList: Offset);
246 auto *LD = CGF.Builder.CreateLoad(
247 Addr: Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(Quantity: 4)));
248
249 llvm::MDBuilder MDB(CGF.getLLVMContext());
250
251 // Known non-zero.
252 LD->setMetadata(KindID: llvm::LLVMContext::MD_range,
253 Node: MDB.createRange(Lo: APInt(32, 1), Hi: APInt::getZero(numBits: 32)));
254 LD->setMetadata(KindID: llvm::LLVMContext::MD_invariant_load,
255 Node: llvm::MDNode::get(Context&: CGF.getLLVMContext(), MDs: {}));
256 return LD;
257}
258} // namespace
259
260// Generates the IR for __builtin_read_exec_*.
261// Lowers the builtin to amdgcn_ballot intrinsic.
262static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
263 llvm::Type *RegisterType,
264 llvm::Type *ValueType, bool isExecHi) {
265 CodeGen::CGBuilderTy &Builder = CGF.Builder;
266 CodeGen::CodeGenModule &CGM = CGF.CGM;
267
268 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_ballot, Tys: {RegisterType});
269 llvm::Value *Call = Builder.CreateCall(Callee: F, Args: {Builder.getInt1(V: true)});
270
271 if (isExecHi) {
272 Value *Rt2 = Builder.CreateLShr(LHS: Call, RHS: 32);
273 Rt2 = Builder.CreateTrunc(V: Rt2, DestTy: CGF.Int32Ty);
274 return Rt2;
275 }
276
277 return Call;
278}
279
280static llvm::Value *loadTextureDescPtorAsVec8I32(CodeGenFunction &CGF,
281 llvm::Value *RsrcPtr) {
282 auto &B = CGF.Builder;
283 auto *VecTy = llvm::FixedVectorType::get(ElementType: B.getInt32Ty(), NumElts: 8);
284
285 if (RsrcPtr->getType() == VecTy)
286 return RsrcPtr;
287
288 if (RsrcPtr->getType()->isIntegerTy(BitWidth: 32)) {
289 llvm::PointerType *VecPtrTy =
290 llvm::PointerType::get(C&: CGF.getLLVMContext(), AddressSpace: 8);
291 llvm::Value *Ptr = B.CreateIntToPtr(V: RsrcPtr, DestTy: VecPtrTy, Name: "tex.rsrc.from.int");
292 return B.CreateAlignedLoad(Ty: VecTy, Ptr, Align: llvm::Align(32), Name: "tex.rsrc.val");
293 }
294
295 if (RsrcPtr->getType()->isPointerTy()) {
296 auto *VecPtrTy = llvm::PointerType::get(
297 C&: CGF.getLLVMContext(), AddressSpace: RsrcPtr->getType()->getPointerAddressSpace());
298 llvm::Value *Typed = B.CreateBitCast(V: RsrcPtr, DestTy: VecPtrTy, Name: "tex.rsrc.typed");
299 return B.CreateAlignedLoad(Ty: VecTy, Ptr: Typed, Align: llvm::Align(32), Name: "tex.rsrc.val");
300 }
301
302 const auto &DL = CGF.CGM.getDataLayout();
303 if (DL.getTypeSizeInBits(Ty: RsrcPtr->getType()) == 256)
304 return B.CreateBitCast(V: RsrcPtr, DestTy: VecTy, Name: "tex.rsrc.val");
305
306 llvm::report_fatal_error(reason: "Unexpected texture resource argument form");
307}
308
309llvm::CallInst *
310emitAMDGCNImageOverloadedReturnType(clang::CodeGen::CodeGenFunction &CGF,
311 const clang::CallExpr *E,
312 unsigned IntrinsicID, bool IsImageStore) {
313 auto findTextureDescIndex = [&CGF](const CallExpr *E) -> unsigned {
314 QualType TexQT = CGF.getContext().AMDGPUTextureTy;
315 for (unsigned I = 0, N = E->getNumArgs(); I < N; ++I) {
316 QualType ArgTy = E->getArg(Arg: I)->getType();
317 if (ArgTy == TexQT) {
318 return I;
319 }
320
321 if (ArgTy.getCanonicalType() == TexQT.getCanonicalType()) {
322 return I;
323 }
324 }
325
326 return ~0U;
327 };
328
329 clang::SmallVector<llvm::Value *, 10> Args;
330 unsigned RsrcIndex = findTextureDescIndex(E);
331
332 if (RsrcIndex == ~0U) {
333 llvm::report_fatal_error(reason: "Invalid argument count for image builtin");
334 }
335
336 for (unsigned I = 0; I < E->getNumArgs(); ++I) {
337 llvm::Value *V = CGF.EmitScalarExpr(E: E->getArg(Arg: I));
338 if (I == RsrcIndex)
339 V = loadTextureDescPtorAsVec8I32(CGF, RsrcPtr: V);
340 Args.push_back(Elt: V);
341 }
342
343 llvm::Type *RetTy = IsImageStore ? CGF.VoidTy : CGF.ConvertType(T: E->getType());
344 llvm::CallInst *Call =
345 CGF.Builder.CreateIntrinsicWithoutFolding(RetTy, ID: IntrinsicID, Args);
346 return Call;
347}
348
349// Emit an intrinsic that has 1 float or double operand, and 1 integer.
350static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
351 const CallExpr *E,
352 unsigned IntrinsicID) {
353 llvm::Value *Src0 = CGF.EmitScalarExpr(E: E->getArg(Arg: 0));
354 llvm::Value *Src1 = CGF.EmitScalarExpr(E: E->getArg(Arg: 1));
355
356 Function *F = CGF.CGM.getIntrinsic(IID: IntrinsicID, Tys: Src0->getType());
357 return CGF.Builder.CreateCall(Callee: F, Args: {Src0, Src1});
358}
359
360static inline StringRef mapScopeToSPIRV(StringRef AMDGCNScope) {
361 if (AMDGCNScope == "agent")
362 return "device";
363 if (AMDGCNScope == "wavefront")
364 return "subgroup";
365 return AMDGCNScope;
366}
367
368static llvm::AtomicOrdering mapCABIAtomicOrdering(unsigned AO) {
369 // Map C11/C++11 memory ordering to LLVM memory ordering
370 assert(llvm::isValidAtomicOrderingCABI(AO));
371 switch (static_cast<llvm::AtomicOrderingCABI>(AO)) {
372 case llvm::AtomicOrderingCABI::acquire:
373 case llvm::AtomicOrderingCABI::consume:
374 return llvm::AtomicOrdering::Acquire;
375 case llvm::AtomicOrderingCABI::release:
376 return llvm::AtomicOrdering::Release;
377 case llvm::AtomicOrderingCABI::acq_rel:
378 return llvm::AtomicOrdering::AcquireRelease;
379 case llvm::AtomicOrderingCABI::seq_cst:
380 return llvm::AtomicOrdering::SequentiallyConsistent;
381 case llvm::AtomicOrderingCABI::relaxed:
382 return llvm::AtomicOrdering::Monotonic;
383 }
384 llvm_unreachable("Unknown AtomicOrderingCABI enum");
385}
386
387// Map a __MEMORY_SCOPE_* integer constant to the AMDGPU-specific syncscope.
388// Invalid scope values are mapped to system scope (empty string).
389static StringRef getAMDGPUSyncScopeStr(CodeGenModule &CGM, unsigned ScopeInt,
390 llvm::AtomicOrdering AO) {
391 AtomicScopeGenericModel ScopeModel;
392 if (!ScopeModel.isValid(S: ScopeInt))
393 return "";
394 clang::SyncScope Scope = ScopeModel.map(S: ScopeInt);
395 return CGM.getTargetCodeGenInfo().getLLVMSyncScopeStr(LangOpts: CGM.getLangOpts(),
396 Scope, Ordering: AO);
397}
398
399/// Convert a __MEMORY_SCOPE_* integer constant to a metadata node containing
400/// the target-specific sync scope string.
401static llvm::MetadataAsValue *emitScopeMD(
402 CodeGenFunction &CGF, unsigned ScopeInt,
403 llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent) {
404 StringRef ScopeStr = getAMDGPUSyncScopeStr(CGM&: CGF.CGM, ScopeInt, AO);
405 llvm::LLVMContext &Ctx = CGF.CGM.getLLVMContext();
406 llvm::MDNode *MD =
407 llvm::MDNode::get(Context&: Ctx, MDs: {llvm::MDString::get(Context&: Ctx, Str: ScopeStr)});
408 return llvm::MetadataAsValue::get(Context&: Ctx, MD);
409}
410
411// For processing memory ordering and memory scope arguments of various
412// amdgcn builtins.
413// \p Order takes a C++11 compatible memory-ordering specifier and converts
414// it into LLVM's memory ordering specifier using atomic C ABI, and writes
415// to \p AO. \p Scope takes a const char * and converts it into AMDGCN
416// specific SyncScopeID and writes it to \p SSID.
417void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
418 llvm::AtomicOrdering &AO,
419 llvm::SyncScope::ID &SSID) {
420 int ord = cast<llvm::ConstantInt>(Val: Order)->getZExtValue();
421
422 // Map C11/C++11 memory ordering to LLVM memory ordering
423 AO = mapCABIAtomicOrdering(AO: ord);
424
425 // Some of the atomic builtins take the scope as a string name.
426 StringRef scp;
427 if (llvm::getConstantStringInfo(V: Scope, Str&: scp)) {
428 if (getTarget().getTriple().isSPIRV())
429 scp = mapScopeToSPIRV(AMDGCNScope: scp);
430 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: scp);
431 return;
432 }
433
434 // Older builtins had an enum argument for the memory scope.
435 unsigned scope = cast<llvm::ConstantInt>(Val: Scope)->getZExtValue();
436 StringRef SSN = getAMDGPUSyncScopeStr(CGM, ScopeInt: scope, AO);
437 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN);
438}
439
440void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
441 const CallExpr *E) {
442 constexpr const char *Tag = "amdgpu-synchronize-as";
443
444 SmallVector<MMRAMetadata::TagT, 3> MMRAs;
445 for (unsigned K = 2; K < E->getNumArgs(); ++K) {
446 llvm::Value *V = EmitScalarExpr(E: E->getArg(Arg: K));
447 StringRef AS;
448 if (llvm::getConstantStringInfo(V, Str&: AS)) {
449 MMRAs.push_back(Elt: {Tag, AS});
450 // TODO: Delete the resulting unused constant?
451 continue;
452 }
453 CGM.Error(loc: E->getExprLoc(),
454 error: "expected an address space name as a string literal");
455 }
456
457 MMRAMetadata::appendTags(I&: *Inst, Tags: MMRAs);
458}
459
460static Value *GetAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
461 Constant *SpecId = ConstantInt::getAllOnesValue(Ty: CGF.Int32Ty);
462
463 LLVMContext &Ctx = CGF.getLLVMContext();
464 MDNode *Predicate = MDNode::get(Context&: Ctx, MDs: MDString::get(Context&: Ctx, Str: Name.str()));
465 std::vector<Value *> Args = {SpecId, ConstantInt::getFalse(Context&: Ctx),
466 MetadataAsValue::get(Context&: Ctx, MD: Predicate)};
467 Value *Call = CGF.Builder.CreateIntrinsic(
468 ID: Intrinsic::spv_named_boolean_spec_constant, Args);
469
470 return Call;
471}
472
473static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
474 switch (BuiltinID) {
475 default:
476 llvm_unreachable("Unknown BuiltinID for wave reduction");
477 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
478 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
479 return Intrinsic::amdgcn_wave_reduce_add;
480 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32:
481 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f64:
482 return Intrinsic::amdgcn_wave_reduce_fadd;
483 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
484 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
485 return Intrinsic::amdgcn_wave_reduce_sub;
486 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32:
487 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f64:
488 return Intrinsic::amdgcn_wave_reduce_fsub;
489 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
490 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
491 return Intrinsic::amdgcn_wave_reduce_min;
492 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32:
493 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f64:
494 return Intrinsic::amdgcn_wave_reduce_fmin;
495 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
496 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
497 return Intrinsic::amdgcn_wave_reduce_umin;
498 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
499 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
500 return Intrinsic::amdgcn_wave_reduce_max;
501 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32:
502 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f64:
503 return Intrinsic::amdgcn_wave_reduce_fmax;
504 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
505 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
506 return Intrinsic::amdgcn_wave_reduce_umax;
507 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
508 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
509 return Intrinsic::amdgcn_wave_reduce_and;
510 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
511 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
512 return Intrinsic::amdgcn_wave_reduce_or;
513 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
514 case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64:
515 return Intrinsic::amdgcn_wave_reduce_xor;
516 }
517}
518
519Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
520 const CallExpr *E) {
521 llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
522 llvm::SyncScope::ID SSID;
523 switch (BuiltinID) {
524 case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
525 case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32:
526 case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f64:
527 case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
528 case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32:
529 case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f64:
530 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
531 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
532 case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32:
533 case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f64:
534 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
535 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
536 case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32:
537 case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f64:
538 case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
539 case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
540 case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
541 case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
542 case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
543 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
544 case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
545 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
546 case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
547 case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
548 case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
549 case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: {
550 Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
551 llvm::Value *Value = EmitScalarExpr(E: E->getArg(Arg: 0));
552 llvm::Value *Strategy = EmitScalarExpr(E: E->getArg(Arg: 1));
553 llvm::Function *F = CGM.getIntrinsic(IID, Tys: {Value->getType()});
554 return Builder.CreateCall(Callee: F, Args: {Value, Strategy});
555 }
556 case AMDGPU::BI__builtin_amdgcn_div_scale:
557 case AMDGPU::BI__builtin_amdgcn_div_scalef: {
558 // Translate from the intrinsics's struct return to the builtin's out
559 // argument.
560
561 Address FlagOutPtr = EmitPointerWithAlignment(Addr: E->getArg(Arg: 3));
562
563 llvm::Value *X = EmitScalarExpr(E: E->getArg(Arg: 0));
564 llvm::Value *Y = EmitScalarExpr(E: E->getArg(Arg: 1));
565 llvm::Value *Z = EmitScalarExpr(E: E->getArg(Arg: 2));
566
567 llvm::Function *Callee = CGM.getIntrinsic(IID: Intrinsic::amdgcn_div_scale,
568 Tys: X->getType());
569
570 llvm::Value *Tmp = Builder.CreateCall(Callee, Args: {X, Y, Z});
571
572 llvm::Value *Result = Builder.CreateExtractValue(Agg: Tmp, Idxs: 0);
573 llvm::Value *Flag = Builder.CreateExtractValue(Agg: Tmp, Idxs: 1);
574
575 llvm::Type *RealFlagType = FlagOutPtr.getElementType();
576
577 llvm::Value *FlagExt = Builder.CreateZExt(V: Flag, DestTy: RealFlagType);
578 Builder.CreateStore(Val: FlagExt, Addr: FlagOutPtr);
579 return Result;
580 }
581 case AMDGPU::BI__builtin_amdgcn_div_fmas:
582 case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
583 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
584 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
585 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
586 llvm::Value *Src3 = EmitScalarExpr(E: E->getArg(Arg: 3));
587
588 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_div_fmas,
589 Tys: Src0->getType());
590 llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Arg: Src3);
591 return Builder.CreateCall(Callee: F, Args: {Src0, Src1, Src2, Src3ToBool});
592 }
593
594 case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
595 return emitBuiltinWithOneOverloadedType<2>(CGF&: *this, E,
596 IntrinsicID: Intrinsic::amdgcn_ds_swizzle);
597 case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
598 case AMDGPU::BI__builtin_amdgcn_mov_dpp:
599 case AMDGPU::BI__builtin_amdgcn_update_dpp: {
600 llvm::SmallVector<llvm::Value *, 6> Args;
601 // Find out if any arguments are required to be integer constant
602 // expressions.
603 unsigned ICEArguments = 0;
604 ASTContext::GetBuiltinTypeError Error;
605 getContext().GetBuiltinType(ID: BuiltinID, Error, IntegerConstantArgs: &ICEArguments);
606 assert(Error == ASTContext::GE_None && "Should not codegen an error");
607 llvm::Type *DataTy = ConvertType(T: E->getArg(Arg: 0)->getType());
608 unsigned Size = DataTy->getPrimitiveSizeInBits();
609 llvm::Type *IntTy =
610 llvm::IntegerType::get(C&: Builder.getContext(), NumBits: std::max(a: Size, b: 32u));
611 Function *F =
612 CGM.getIntrinsic(IID: BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
613 ? Intrinsic::amdgcn_mov_dpp8
614 : Intrinsic::amdgcn_update_dpp,
615 Tys: IntTy);
616 assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
617 E->getNumArgs() == 2);
618 bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
619 if (InsertOld)
620 Args.push_back(Elt: llvm::PoisonValue::get(T: IntTy));
621 for (unsigned I = 0; I != E->getNumArgs(); ++I) {
622 llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, Idx: I, E);
623 if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
624 Size < 32) {
625 if (!DataTy->isIntegerTy())
626 V = Builder.CreateBitCast(
627 V, DestTy: llvm::IntegerType::get(C&: Builder.getContext(), NumBits: Size));
628 V = Builder.CreateZExtOrBitCast(V, DestTy: IntTy);
629 }
630 llvm::Type *ExpTy =
631 F->getFunctionType()->getFunctionParamType(i: I + InsertOld);
632 Args.push_back(Elt: Builder.CreateTruncOrBitCast(V, DestTy: ExpTy));
633 }
634 Value *V = Builder.CreateCall(Callee: F, Args);
635 if (Size < 32 && !DataTy->isIntegerTy())
636 V = Builder.CreateTrunc(
637 V, DestTy: llvm::IntegerType::get(C&: Builder.getContext(), NumBits: Size));
638 return Builder.CreateTruncOrBitCast(V, DestTy: DataTy);
639 }
640 case AMDGPU::BI__builtin_amdgcn_permlane16:
641 case AMDGPU::BI__builtin_amdgcn_permlanex16:
642 return emitBuiltinWithOneOverloadedType<6>(
643 CGF&: *this, E,
644 IntrinsicID: BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
645 ? Intrinsic::amdgcn_permlane16
646 : Intrinsic::amdgcn_permlanex16);
647 case AMDGPU::BI__builtin_amdgcn_permlane64:
648 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
649 IntrinsicID: Intrinsic::amdgcn_permlane64);
650 case AMDGPU::BI__builtin_amdgcn_readlane:
651 return emitBuiltinWithOneOverloadedType<2>(CGF&: *this, E,
652 IntrinsicID: Intrinsic::amdgcn_readlane);
653 case AMDGPU::BI__builtin_amdgcn_wave_shuffle:
654 return emitBuiltinWithOneOverloadedType<2>(CGF&: *this, E,
655 IntrinsicID: Intrinsic::amdgcn_wave_shuffle);
656 case AMDGPU::BI__builtin_amdgcn_readfirstlane:
657 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
658 IntrinsicID: Intrinsic::amdgcn_readfirstlane);
659 case AMDGPU::BI__builtin_amdgcn_div_fixup:
660 case AMDGPU::BI__builtin_amdgcn_div_fixupf:
661 case AMDGPU::BI__builtin_amdgcn_div_fixuph:
662 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
663 IntrinsicID: Intrinsic::amdgcn_div_fixup);
664 case AMDGPU::BI__builtin_amdgcn_trig_preop:
665 case AMDGPU::BI__builtin_amdgcn_trig_preopf:
666 return emitFPIntBuiltin(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_trig_preop);
667 case AMDGPU::BI__builtin_amdgcn_rcp:
668 case AMDGPU::BI__builtin_amdgcn_rcpf:
669 case AMDGPU::BI__builtin_amdgcn_rcph:
670 case AMDGPU::BI__builtin_amdgcn_rcp_bf16:
671 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_rcp);
672 case AMDGPU::BI__builtin_amdgcn_sqrt:
673 case AMDGPU::BI__builtin_amdgcn_sqrtf:
674 case AMDGPU::BI__builtin_amdgcn_sqrth:
675 case AMDGPU::BI__builtin_amdgcn_sqrt_bf16:
676 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
677 IntrinsicID: Intrinsic::amdgcn_sqrt);
678 case AMDGPU::BI__builtin_amdgcn_rsq:
679 case AMDGPU::BI__builtin_amdgcn_rsqf:
680 case AMDGPU::BI__builtin_amdgcn_rsqh:
681 case AMDGPU::BI__builtin_amdgcn_rsq_bf16:
682 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_rsq);
683 case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
684 case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
685 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
686 IntrinsicID: Intrinsic::amdgcn_rsq_clamp);
687 case AMDGPU::BI__builtin_amdgcn_sinf:
688 case AMDGPU::BI__builtin_amdgcn_sinh:
689 case AMDGPU::BI__builtin_amdgcn_sin_bf16:
690 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_sin);
691 case AMDGPU::BI__builtin_amdgcn_cosf:
692 case AMDGPU::BI__builtin_amdgcn_cosh:
693 case AMDGPU::BI__builtin_amdgcn_cos_bf16:
694 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_cos);
695 case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
696 return EmitAMDGPUDispatchPtr(CGF&: *this, E);
697 case AMDGPU::BI__builtin_amdgcn_logf:
698 case AMDGPU::BI__builtin_amdgcn_log_bf16:
699 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_log);
700 case AMDGPU::BI__builtin_amdgcn_exp2f:
701 case AMDGPU::BI__builtin_amdgcn_exp2_bf16:
702 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
703 IntrinsicID: Intrinsic::amdgcn_exp2);
704 case AMDGPU::BI__builtin_amdgcn_log_clampf:
705 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
706 IntrinsicID: Intrinsic::amdgcn_log_clamp);
707 case AMDGPU::BI__builtin_amdgcn_ldexp:
708 case AMDGPU::BI__builtin_amdgcn_ldexpf: {
709 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
710 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
711 llvm::Function *F =
712 CGM.getIntrinsic(IID: Intrinsic::ldexp, Tys: {Src0->getType(), Src1->getType()});
713 return Builder.CreateCall(Callee: F, Args: {Src0, Src1});
714 }
715 case AMDGPU::BI__builtin_amdgcn_ldexph: {
716 // The raw instruction has a different behavior for out of bounds exponent
717 // values (implicit truncation instead of saturate to short_min/short_max).
718 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
719 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
720 llvm::Function *F =
721 CGM.getIntrinsic(IID: Intrinsic::ldexp, Tys: {Src0->getType(), Int16Ty});
722 return Builder.CreateCall(Callee: F, Args: {Src0, Builder.CreateTrunc(V: Src1, DestTy: Int16Ty)});
723 }
724 case AMDGPU::BI__builtin_amdgcn_frexp_mant:
725 case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
726 case AMDGPU::BI__builtin_amdgcn_frexp_manth:
727 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
728 IntrinsicID: Intrinsic::amdgcn_frexp_mant);
729 case AMDGPU::BI__builtin_amdgcn_frexp_exp:
730 case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
731 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
732 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_frexp_exp,
733 Tys: { Builder.getInt32Ty(), Src0->getType() });
734 return Builder.CreateCall(Callee: F, Args: Src0);
735 }
736 case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
737 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
738 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_frexp_exp,
739 Tys: { Builder.getInt16Ty(), Src0->getType() });
740 return Builder.CreateCall(Callee: F, Args: Src0);
741 }
742 case AMDGPU::BI__builtin_amdgcn_fract:
743 case AMDGPU::BI__builtin_amdgcn_fractf:
744 case AMDGPU::BI__builtin_amdgcn_fracth:
745 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
746 IntrinsicID: Intrinsic::amdgcn_fract);
747 case AMDGPU::BI__builtin_amdgcn_lerp:
748 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
749 IntrinsicID: Intrinsic::amdgcn_lerp);
750 case AMDGPU::BI__builtin_amdgcn_ubfe:
751 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
752 IntrinsicID: Intrinsic::amdgcn_ubfe);
753 case AMDGPU::BI__builtin_amdgcn_sbfe:
754 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
755 IntrinsicID: Intrinsic::amdgcn_sbfe);
756 case AMDGPU::BI__builtin_amdgcn_ballot_w32:
757 case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
758 llvm::Type *ResultType = ConvertType(T: E->getType());
759 llvm::Value *Src = EmitScalarExpr(E: E->getArg(Arg: 0));
760 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_ballot, Tys: {ResultType});
761 return Builder.CreateCall(Callee: F, Args: {Src});
762 }
763 case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
764 case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
765 llvm::Value *Src = EmitScalarExpr(E: E->getArg(Arg: 0));
766 Function *F =
767 CGM.getIntrinsic(IID: Intrinsic::amdgcn_inverse_ballot, Tys: {Src->getType()});
768 return Builder.CreateCall(Callee: F, Args: {Src});
769 }
770 case AMDGPU::BI__builtin_amdgcn_tanhf:
771 case AMDGPU::BI__builtin_amdgcn_tanhh:
772 case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
773 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
774 IntrinsicID: Intrinsic::amdgcn_tanh);
775 case AMDGPU::BI__builtin_amdgcn_uicmp:
776 case AMDGPU::BI__builtin_amdgcn_uicmpl:
777 case AMDGPU::BI__builtin_amdgcn_sicmp:
778 case AMDGPU::BI__builtin_amdgcn_sicmpl: {
779 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
780 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
781 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
782
783 // FIXME-GFX10: How should 32 bit mask be handled?
784 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_icmp,
785 Tys: { Builder.getInt64Ty(), Src0->getType() });
786 return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 });
787 }
788 case AMDGPU::BI__builtin_amdgcn_fcmp:
789 case AMDGPU::BI__builtin_amdgcn_fcmpf: {
790 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
791 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
792 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
793
794 // FIXME-GFX10: How should 32 bit mask be handled?
795 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_fcmp,
796 Tys: { Builder.getInt64Ty(), Src0->getType() });
797 return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 });
798 }
799 case AMDGPU::BI__builtin_amdgcn_class:
800 case AMDGPU::BI__builtin_amdgcn_classf:
801 case AMDGPU::BI__builtin_amdgcn_classh:
802 return emitFPIntBuiltin(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_class);
803 case AMDGPU::BI__builtin_amdgcn_fmed3f:
804 case AMDGPU::BI__builtin_amdgcn_fmed3h:
805 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
806 IntrinsicID: Intrinsic::amdgcn_fmed3);
807 case AMDGPU::BI__builtin_amdgcn_ds_append:
808 case AMDGPU::BI__builtin_amdgcn_ds_consume: {
809 Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
810 Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
811 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
812 Function *F = CGM.getIntrinsic(IID: Intrin, Tys: { Src0->getType() });
813 return Builder.CreateCall(Callee: F, Args: { Src0, Builder.getFalse() });
814 }
815 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
816 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
817 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
818 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
819 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
820 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
821 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
822 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
823 case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
824 case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
825 case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
826 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
827 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
828 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
829 case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
830 case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
831 case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
832 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
833 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
834 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
835 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
836 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
837 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
838 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
839 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
840 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
841 Intrinsic::ID IID;
842 switch (BuiltinID) {
843 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
844 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
845 case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
846 IID = Intrinsic::amdgcn_global_load_tr_b64;
847 break;
848 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
849 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
850 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
851 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
852 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
853 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
854 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
855 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
856 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
857 IID = Intrinsic::amdgcn_global_load_tr_b128;
858 break;
859 case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
860 IID = Intrinsic::amdgcn_global_load_tr4_b64;
861 break;
862 case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
863 IID = Intrinsic::amdgcn_global_load_tr6_b96;
864 break;
865 case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
866 IID = Intrinsic::amdgcn_ds_load_tr4_b64;
867 break;
868 case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
869 IID = Intrinsic::amdgcn_ds_load_tr6_b96;
870 break;
871 case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
872 IID = Intrinsic::amdgcn_ds_load_tr8_b64;
873 break;
874 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
875 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
876 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
877 IID = Intrinsic::amdgcn_ds_load_tr16_b128;
878 break;
879 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
880 IID = Intrinsic::amdgcn_ds_read_tr4_b64;
881 break;
882 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
883 IID = Intrinsic::amdgcn_ds_read_tr8_b64;
884 break;
885 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
886 IID = Intrinsic::amdgcn_ds_read_tr6_b96;
887 break;
888 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
889 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
890 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
891 IID = Intrinsic::amdgcn_ds_read_tr16_b64;
892 break;
893 }
894 llvm::Type *LoadTy = ConvertType(T: E->getType());
895 llvm::Value *Addr = EmitScalarExpr(E: E->getArg(Arg: 0));
896 llvm::Function *F = CGM.getIntrinsic(IID, Tys: {LoadTy});
897 return Builder.CreateCall(Callee: F, Args: {Addr});
898 }
899 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
900 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
901 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
902 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
903 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
904 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128: {
905
906 Intrinsic::ID IID;
907 switch (BuiltinID) {
908 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
909 IID = Intrinsic::amdgcn_global_load_monitor_b32;
910 break;
911 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
912 IID = Intrinsic::amdgcn_global_load_monitor_b64;
913 break;
914 case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
915 IID = Intrinsic::amdgcn_global_load_monitor_b128;
916 break;
917 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
918 IID = Intrinsic::amdgcn_flat_load_monitor_b32;
919 break;
920 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
921 IID = Intrinsic::amdgcn_flat_load_monitor_b64;
922 break;
923 case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128:
924 IID = Intrinsic::amdgcn_flat_load_monitor_b128;
925 break;
926 }
927
928 llvm::Type *LoadTy = ConvertType(T: E->getType());
929 llvm::Value *Addr = EmitScalarExpr(E: E->getArg(Arg: 0));
930
931 auto *AOExpr = cast<llvm::ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 1)));
932 auto *ScopeExpr = cast<llvm::ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 2)));
933 llvm::AtomicOrdering AO = mapCABIAtomicOrdering(AO: AOExpr->getZExtValue());
934
935 llvm::Value *ScopeMD = emitScopeMD(CGF&: *this, ScopeInt: ScopeExpr->getZExtValue(), AO);
936 llvm::Function *F = CGM.getIntrinsic(IID, Tys: {LoadTy});
937 return Builder.CreateCall(Callee: F, Args: {Addr, AOExpr, ScopeMD});
938 }
939 case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
940 case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
941 case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: {
942 Intrinsic::ID IID;
943 switch (BuiltinID) {
944 case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
945 IID = Intrinsic::amdgcn_cluster_load_b32;
946 break;
947 case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
948 IID = Intrinsic::amdgcn_cluster_load_b64;
949 break;
950 case AMDGPU::BI__builtin_amdgcn_cluster_load_b128:
951 IID = Intrinsic::amdgcn_cluster_load_b128;
952 break;
953 }
954 SmallVector<Value *, 3> Args;
955 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
956 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: i)));
957 llvm::Function *F = CGM.getIntrinsic(IID, Tys: {ConvertType(T: E->getType())});
958 return Builder.CreateCall(Callee: F, Args: {Args});
959 }
960 case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
961 // Should this have asan instrumentation?
962 return emitBuiltinWithOneOverloadedType<5>(CGF&: *this, E,
963 IntrinsicID: Intrinsic::amdgcn_load_to_lds);
964 }
965 case AMDGPU::BI__builtin_amdgcn_load_async_to_lds: {
966 // Should this have asan instrumentation?
967 return emitBuiltinWithOneOverloadedType<5>(
968 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_load_async_to_lds);
969 }
970 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
971 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
972 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
973 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
974 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
975 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B: {
976 Intrinsic::ID IID;
977 switch (BuiltinID) {
978 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
979 IID = Intrinsic::amdgcn_cooperative_atomic_load_32x4B;
980 break;
981 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
982 IID = Intrinsic::amdgcn_cooperative_atomic_store_32x4B;
983 break;
984 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
985 IID = Intrinsic::amdgcn_cooperative_atomic_load_16x8B;
986 break;
987 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
988 IID = Intrinsic::amdgcn_cooperative_atomic_store_16x8B;
989 break;
990 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
991 IID = Intrinsic::amdgcn_cooperative_atomic_load_8x16B;
992 break;
993 case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B:
994 IID = Intrinsic::amdgcn_cooperative_atomic_store_8x16B;
995 break;
996 }
997
998 LLVMContext &Ctx = CGM.getLLVMContext();
999 SmallVector<Value *, 5> Args;
1000 // last argument is a MD string
1001 const unsigned ScopeArg = E->getNumArgs() - 1;
1002 for (unsigned i = 0; i != ScopeArg; ++i)
1003 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: i)));
1004 StringRef Arg = cast<StringLiteral>(Val: E->getArg(Arg: ScopeArg)->IgnoreParenCasts())
1005 ->getString();
1006 llvm::MDNode *MD = llvm::MDNode::get(Context&: Ctx, MDs: {llvm::MDString::get(Context&: Ctx, Str: Arg)});
1007 Args.push_back(Elt: llvm::MetadataAsValue::get(Context&: Ctx, MD));
1008 // Intrinsic is typed based on the pointer AS. Pointer is always the first
1009 // argument.
1010 llvm::Function *F = CGM.getIntrinsic(IID, Tys: {Args[0]->getType()});
1011 return Builder.CreateCall(Callee: F, Args: {Args});
1012 }
1013 case AMDGPU::BI__builtin_amdgcn_av_load_b128:
1014 case AMDGPU::BI__builtin_amdgcn_av_store_b128: {
1015 const bool IsStore = BuiltinID == AMDGPU::BI__builtin_amdgcn_av_store_b128;
1016 SmallVector<Value *, 5> Args = {EmitScalarExpr(E: E->getArg(Arg: 0))}; // addr
1017 if (IsStore)
1018 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1))); // data
1019 const unsigned ScopeIdx = E->getNumArgs() - 1;
1020 auto *ScopeExpr =
1021 cast<llvm::ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: ScopeIdx)));
1022 Args.push_back(Elt: emitScopeMD(CGF&: *this, ScopeInt: ScopeExpr->getZExtValue()));
1023 llvm::Function *F =
1024 CGM.getIntrinsic(IID: IsStore ? Intrinsic::amdgcn_av_store_b128
1025 : Intrinsic::amdgcn_av_load_b128,
1026 Tys: {Args[0]->getType()});
1027 return Builder.CreateCall(Callee: F, Args);
1028 }
1029 case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
1030 Function *F = CGM.getIntrinsic(IID: Intrinsic::get_fpenv,
1031 Tys: {llvm::Type::getInt64Ty(C&: getLLVMContext())});
1032 return Builder.CreateCall(Callee: F);
1033 }
1034 case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
1035 Function *F = CGM.getIntrinsic(IID: Intrinsic::set_fpenv,
1036 Tys: {llvm::Type::getInt64Ty(C&: getLLVMContext())});
1037 llvm::Value *Env = EmitScalarExpr(E: E->getArg(Arg: 0));
1038 return Builder.CreateCall(Callee: F, Args: {Env});
1039 }
1040 case AMDGPU::BI__builtin_amdgcn_processor_is: {
1041 assert(CGM.getTriple().isSPIRV() &&
1042 "__builtin_amdgcn_processor_is should never reach CodeGen for "
1043 "concrete targets!");
1044 StringRef Proc = cast<clang::StringLiteral>(Val: E->getArg(Arg: 0))->getString();
1045 return GetAMDGPUPredicate(CGF&: *this, Name: "is." + Proc);
1046 }
1047 case AMDGPU::BI__builtin_amdgcn_is_invocable: {
1048 assert(CGM.getTriple().isSPIRV() &&
1049 "__builtin_amdgcn_is_invocable should never reach CodeGen for "
1050 "concrete targets!");
1051 auto *FD = cast<FunctionDecl>(
1052 Val: cast<DeclRefExpr>(Val: E->getArg(Arg: 0))->getReferencedDeclOfCallee());
1053 StringRef RF =
1054 getContext().BuiltinInfo.getRequiredFeatures(ID: FD->getBuiltinID());
1055 return GetAMDGPUPredicate(CGF&: *this, Name: "has." + RF);
1056 }
1057 case AMDGPU::BI__builtin_amdgcn_read_exec:
1058 return EmitAMDGCNBallotForExec(CGF&: *this, E, RegisterType: Int64Ty, ValueType: Int64Ty, isExecHi: false);
1059 case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
1060 return EmitAMDGCNBallotForExec(CGF&: *this, E, RegisterType: Int32Ty, ValueType: Int32Ty, isExecHi: false);
1061 case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
1062 return EmitAMDGCNBallotForExec(CGF&: *this, E, RegisterType: Int64Ty, ValueType: Int64Ty, isExecHi: true);
1063 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
1064 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
1065 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
1066 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
1067 llvm::Value *NodePtr = EmitScalarExpr(E: E->getArg(Arg: 0));
1068 llvm::Value *RayExtent = EmitScalarExpr(E: E->getArg(Arg: 1));
1069 llvm::Value *RayOrigin = EmitScalarExpr(E: E->getArg(Arg: 2));
1070 llvm::Value *RayDir = EmitScalarExpr(E: E->getArg(Arg: 3));
1071 llvm::Value *RayInverseDir = EmitScalarExpr(E: E->getArg(Arg: 4));
1072 llvm::Value *TextureDescr = EmitScalarExpr(E: E->getArg(Arg: 5));
1073
1074 // The builtins take these arguments as vec4 where the last element is
1075 // ignored. The intrinsic takes them as vec3.
1076 RayOrigin = Builder.CreateShuffleVector(V1: RayOrigin, V2: RayOrigin,
1077 Mask: {0, 1, 2});
1078 RayDir =
1079 Builder.CreateShuffleVector(V1: RayDir, V2: RayDir, Mask: {0, 1, 2});
1080 RayInverseDir = Builder.CreateShuffleVector(V1: RayInverseDir, V2: RayInverseDir,
1081 Mask: {0, 1, 2});
1082
1083 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_image_bvh_intersect_ray,
1084 Tys: {NodePtr->getType(), RayDir->getType()});
1085 return Builder.CreateCall(Callee: F, Args: {NodePtr, RayExtent, RayOrigin, RayDir,
1086 RayInverseDir, TextureDescr});
1087 }
1088 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
1089 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
1090 Intrinsic::ID IID;
1091 switch (BuiltinID) {
1092 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
1093 IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
1094 break;
1095 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
1096 IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
1097 break;
1098 }
1099 llvm::Value *NodePtr = EmitScalarExpr(E: E->getArg(Arg: 0));
1100 llvm::Value *RayExtent = EmitScalarExpr(E: E->getArg(Arg: 1));
1101 llvm::Value *InstanceMask = EmitScalarExpr(E: E->getArg(Arg: 2));
1102 llvm::Value *RayOrigin = EmitScalarExpr(E: E->getArg(Arg: 3));
1103 llvm::Value *RayDir = EmitScalarExpr(E: E->getArg(Arg: 4));
1104 llvm::Value *Offset = EmitScalarExpr(E: E->getArg(Arg: 5));
1105 llvm::Value *TextureDescr = EmitScalarExpr(E: E->getArg(Arg: 6));
1106
1107 Address RetRayOriginPtr = EmitPointerWithAlignment(Addr: E->getArg(Arg: 7));
1108 Address RetRayDirPtr = EmitPointerWithAlignment(Addr: E->getArg(Arg: 8));
1109
1110 llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);
1111
1112 llvm::CallInst *CI = Builder.CreateCall(
1113 Callee: IntrinsicFunc, Args: {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
1114 Offset, TextureDescr});
1115
1116 llvm::Value *RetVData = Builder.CreateExtractValue(Agg: CI, Idxs: 0);
1117 llvm::Value *RetRayOrigin = Builder.CreateExtractValue(Agg: CI, Idxs: 1);
1118 llvm::Value *RetRayDir = Builder.CreateExtractValue(Agg: CI, Idxs: 2);
1119
1120 Builder.CreateStore(Val: RetRayOrigin, Addr: RetRayOriginPtr);
1121 Builder.CreateStore(Val: RetRayDir, Addr: RetRayDirPtr);
1122
1123 return RetVData;
1124 }
1125
1126 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
1127 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
1128 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
1129 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1130 Intrinsic::ID IID;
1131 switch (BuiltinID) {
1132 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
1133 IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
1134 break;
1135 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
1136 IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
1137 break;
1138 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
1139 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
1140 break;
1141 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
1142 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
1143 break;
1144 }
1145
1146 SmallVector<Value *, 4> Args;
1147 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
1148 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: i)));
1149
1150 Function *F = CGM.getIntrinsic(IID);
1151 Value *Call = Builder.CreateCall(Callee: F, Args);
1152 Value *Rtn = Builder.CreateExtractValue(Agg: Call, Idxs: 0);
1153 Value *A = Builder.CreateExtractValue(Agg: Call, Idxs: 1);
1154 llvm::Type *RetTy = ConvertType(T: E->getType());
1155 Value *I0 = Builder.CreateInsertElement(Vec: PoisonValue::get(T: RetTy), NewElt: Rtn,
1156 Idx: (uint64_t)0);
1157 // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
1158 // <2 x i64>, zext the second value.
1159 if (A->getType()->getPrimitiveSizeInBits() <
1160 RetTy->getScalarType()->getPrimitiveSizeInBits())
1161 A = Builder.CreateZExt(V: A, DestTy: RetTy->getScalarType());
1162
1163 return Builder.CreateInsertElement(Vec: I0, NewElt: A, Idx: 1);
1164 }
1165 case AMDGPU::BI__builtin_amdgcn_image_load_1d_v4f32_i32:
1166 case AMDGPU::BI__builtin_amdgcn_image_load_1d_v4f16_i32:
1167 return emitAMDGCNImageOverloadedReturnType(
1168 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_load_1d, IsImageStore: false);
1169 case AMDGPU::BI__builtin_amdgcn_image_load_1darray_v4f32_i32:
1170 case AMDGPU::BI__builtin_amdgcn_image_load_1darray_v4f16_i32:
1171 return emitAMDGCNImageOverloadedReturnType(
1172 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_load_1darray, IsImageStore: false);
1173 case AMDGPU::BI__builtin_amdgcn_image_load_2d_f32_i32:
1174 case AMDGPU::BI__builtin_amdgcn_image_load_2d_v4f32_i32:
1175 case AMDGPU::BI__builtin_amdgcn_image_load_2d_v4f16_i32:
1176 return emitAMDGCNImageOverloadedReturnType(
1177 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_load_2d, IsImageStore: false);
1178 case AMDGPU::BI__builtin_amdgcn_image_load_2darray_f32_i32:
1179 case AMDGPU::BI__builtin_amdgcn_image_load_2darray_v4f32_i32:
1180 case AMDGPU::BI__builtin_amdgcn_image_load_2darray_v4f16_i32:
1181 return emitAMDGCNImageOverloadedReturnType(
1182 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_load_2darray, IsImageStore: false);
1183 case AMDGPU::BI__builtin_amdgcn_image_load_3d_v4f32_i32:
1184 case AMDGPU::BI__builtin_amdgcn_image_load_3d_v4f16_i32:
1185 return emitAMDGCNImageOverloadedReturnType(
1186 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_load_3d, IsImageStore: false);
1187 case AMDGPU::BI__builtin_amdgcn_image_load_cube_v4f32_i32:
1188 case AMDGPU::BI__builtin_amdgcn_image_load_cube_v4f16_i32:
1189 return emitAMDGCNImageOverloadedReturnType(
1190 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_load_cube, IsImageStore: false);
1191 case AMDGPU::BI__builtin_amdgcn_image_load_mip_1d_v4f32_i32:
1192 case AMDGPU::BI__builtin_amdgcn_image_load_mip_1d_v4f16_i32:
1193 return emitAMDGCNImageOverloadedReturnType(
1194 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_load_mip_1d, IsImageStore: false);
1195 case AMDGPU::BI__builtin_amdgcn_image_load_mip_1darray_v4f32_i32:
1196 case AMDGPU::BI__builtin_amdgcn_image_load_mip_1darray_v4f16_i32:
1197 return emitAMDGCNImageOverloadedReturnType(
1198 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_load_mip_1darray, IsImageStore: false);
1199 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2d_f32_i32:
1200 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2d_v4f32_i32:
1201 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2d_v4f16_i32:
1202 return emitAMDGCNImageOverloadedReturnType(
1203 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_load_mip_2d, IsImageStore: false);
1204 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2darray_f32_i32:
1205 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2darray_v4f32_i32:
1206 case AMDGPU::BI__builtin_amdgcn_image_load_mip_2darray_v4f16_i32:
1207 return emitAMDGCNImageOverloadedReturnType(
1208 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_load_mip_2darray, IsImageStore: false);
1209 case AMDGPU::BI__builtin_amdgcn_image_load_mip_3d_v4f32_i32:
1210 case AMDGPU::BI__builtin_amdgcn_image_load_mip_3d_v4f16_i32:
1211 return emitAMDGCNImageOverloadedReturnType(
1212 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_load_mip_3d, IsImageStore: false);
1213 case AMDGPU::BI__builtin_amdgcn_image_load_mip_cube_v4f32_i32:
1214 case AMDGPU::BI__builtin_amdgcn_image_load_mip_cube_v4f16_i32:
1215 return emitAMDGCNImageOverloadedReturnType(
1216 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_load_mip_cube, IsImageStore: false);
1217 case AMDGPU::BI__builtin_amdgcn_image_store_1d_v4f32_i32:
1218 case AMDGPU::BI__builtin_amdgcn_image_store_1d_v4f16_i32:
1219 return emitAMDGCNImageOverloadedReturnType(
1220 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_store_1d, IsImageStore: true);
1221 case AMDGPU::BI__builtin_amdgcn_image_store_1darray_v4f32_i32:
1222 case AMDGPU::BI__builtin_amdgcn_image_store_1darray_v4f16_i32:
1223 return emitAMDGCNImageOverloadedReturnType(
1224 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_store_1darray, IsImageStore: true);
1225 case AMDGPU::BI__builtin_amdgcn_image_store_2d_f32_i32:
1226 case AMDGPU::BI__builtin_amdgcn_image_store_2d_v4f32_i32:
1227 case AMDGPU::BI__builtin_amdgcn_image_store_2d_v4f16_i32:
1228 return emitAMDGCNImageOverloadedReturnType(
1229 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_store_2d, IsImageStore: true);
1230 case AMDGPU::BI__builtin_amdgcn_image_store_2darray_f32_i32:
1231 case AMDGPU::BI__builtin_amdgcn_image_store_2darray_v4f32_i32:
1232 case AMDGPU::BI__builtin_amdgcn_image_store_2darray_v4f16_i32:
1233 return emitAMDGCNImageOverloadedReturnType(
1234 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_store_2darray, IsImageStore: true);
1235 case AMDGPU::BI__builtin_amdgcn_image_store_3d_v4f32_i32:
1236 case AMDGPU::BI__builtin_amdgcn_image_store_3d_v4f16_i32:
1237 return emitAMDGCNImageOverloadedReturnType(
1238 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_store_3d, IsImageStore: true);
1239 case AMDGPU::BI__builtin_amdgcn_image_store_cube_v4f32_i32:
1240 case AMDGPU::BI__builtin_amdgcn_image_store_cube_v4f16_i32:
1241 return emitAMDGCNImageOverloadedReturnType(
1242 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_store_cube, IsImageStore: true);
1243 case AMDGPU::BI__builtin_amdgcn_image_store_mip_1d_v4f32_i32:
1244 case AMDGPU::BI__builtin_amdgcn_image_store_mip_1d_v4f16_i32:
1245 return emitAMDGCNImageOverloadedReturnType(
1246 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_store_mip_1d, IsImageStore: true);
1247 case AMDGPU::BI__builtin_amdgcn_image_store_mip_1darray_v4f32_i32:
1248 case AMDGPU::BI__builtin_amdgcn_image_store_mip_1darray_v4f16_i32:
1249 return emitAMDGCNImageOverloadedReturnType(
1250 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_store_mip_1darray, IsImageStore: true);
1251 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2d_f32_i32:
1252 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2d_v4f32_i32:
1253 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2d_v4f16_i32:
1254 return emitAMDGCNImageOverloadedReturnType(
1255 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_store_mip_2d, IsImageStore: true);
1256 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2darray_f32_i32:
1257 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2darray_v4f32_i32:
1258 case AMDGPU::BI__builtin_amdgcn_image_store_mip_2darray_v4f16_i32:
1259 return emitAMDGCNImageOverloadedReturnType(
1260 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_store_mip_2darray, IsImageStore: true);
1261 case AMDGPU::BI__builtin_amdgcn_image_store_mip_3d_v4f32_i32:
1262 case AMDGPU::BI__builtin_amdgcn_image_store_mip_3d_v4f16_i32:
1263 return emitAMDGCNImageOverloadedReturnType(
1264 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_store_mip_3d, IsImageStore: true);
1265 case AMDGPU::BI__builtin_amdgcn_image_store_mip_cube_v4f32_i32:
1266 case AMDGPU::BI__builtin_amdgcn_image_store_mip_cube_v4f16_i32:
1267 return emitAMDGCNImageOverloadedReturnType(
1268 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_store_mip_cube, IsImageStore: true);
1269 case AMDGPU::BI__builtin_amdgcn_image_sample_1d_v4f32_f32:
1270 case AMDGPU::BI__builtin_amdgcn_image_sample_1d_v4f16_f32:
1271 return emitAMDGCNImageOverloadedReturnType(
1272 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_1d, IsImageStore: false);
1273 case AMDGPU::BI__builtin_amdgcn_image_sample_1darray_v4f32_f32:
1274 case AMDGPU::BI__builtin_amdgcn_image_sample_1darray_v4f16_f32:
1275 return emitAMDGCNImageOverloadedReturnType(
1276 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_1darray, IsImageStore: false);
1277 case AMDGPU::BI__builtin_amdgcn_image_sample_2d_f32_f32:
1278 case AMDGPU::BI__builtin_amdgcn_image_sample_2d_v4f32_f32:
1279 case AMDGPU::BI__builtin_amdgcn_image_sample_2d_v4f16_f32:
1280 return emitAMDGCNImageOverloadedReturnType(
1281 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_2d, IsImageStore: false);
1282 case AMDGPU::BI__builtin_amdgcn_image_sample_2darray_f32_f32:
1283 case AMDGPU::BI__builtin_amdgcn_image_sample_2darray_v4f32_f32:
1284 case AMDGPU::BI__builtin_amdgcn_image_sample_2darray_v4f16_f32:
1285 return emitAMDGCNImageOverloadedReturnType(
1286 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_2darray, IsImageStore: false);
1287 case AMDGPU::BI__builtin_amdgcn_image_sample_3d_v4f32_f32:
1288 case AMDGPU::BI__builtin_amdgcn_image_sample_3d_v4f16_f32:
1289 return emitAMDGCNImageOverloadedReturnType(
1290 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_3d, IsImageStore: false);
1291 case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f32_f32:
1292 case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f16_f32:
1293 return emitAMDGCNImageOverloadedReturnType(
1294 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_cube, IsImageStore: false);
1295 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f32_f32:
1296 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f16_f32:
1297 return emitAMDGCNImageOverloadedReturnType(
1298 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_lz_1d, IsImageStore: false);
1299 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f32_f32:
1300 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f16_f32:
1301 return emitAMDGCNImageOverloadedReturnType(
1302 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_l_1d, IsImageStore: false);
1303 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f32_f32:
1304 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f16_f32:
1305 return emitAMDGCNImageOverloadedReturnType(
1306 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_d_1d, IsImageStore: false);
1307 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f32_f32:
1308 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f16_f32:
1309 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_f32_f32:
1310 return emitAMDGCNImageOverloadedReturnType(
1311 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_lz_2d, IsImageStore: false);
1312 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f32_f32:
1313 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f16_f32:
1314 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_f32_f32:
1315 return emitAMDGCNImageOverloadedReturnType(
1316 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_l_2d, IsImageStore: false);
1317 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f32_f32:
1318 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f16_f32:
1319 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_f32_f32:
1320 return emitAMDGCNImageOverloadedReturnType(
1321 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_d_2d, IsImageStore: false);
1322 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f32_f32:
1323 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f16_f32:
1324 return emitAMDGCNImageOverloadedReturnType(
1325 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_lz_3d, IsImageStore: false);
1326 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f32_f32:
1327 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f16_f32:
1328 return emitAMDGCNImageOverloadedReturnType(
1329 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_l_3d, IsImageStore: false);
1330 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f32_f32:
1331 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f16_f32:
1332 return emitAMDGCNImageOverloadedReturnType(
1333 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_d_3d, IsImageStore: false);
1334 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f32_f32:
1335 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f16_f32:
1336 return emitAMDGCNImageOverloadedReturnType(
1337 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_lz_cube, IsImageStore: false);
1338 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f32_f32:
1339 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f16_f32:
1340 return emitAMDGCNImageOverloadedReturnType(
1341 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_l_cube, IsImageStore: false);
1342 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32:
1343 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32:
1344 return emitAMDGCNImageOverloadedReturnType(
1345 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_lz_1darray, IsImageStore: false);
1346 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f32_f32:
1347 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f16_f32:
1348 return emitAMDGCNImageOverloadedReturnType(
1349 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_l_1darray, IsImageStore: false);
1350 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f32_f32:
1351 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f16_f32:
1352 return emitAMDGCNImageOverloadedReturnType(
1353 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_d_1darray, IsImageStore: false);
1354 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32:
1355 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32:
1356 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_f32_f32:
1357 return emitAMDGCNImageOverloadedReturnType(
1358 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_lz_2darray, IsImageStore: false);
1359 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f32_f32:
1360 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f16_f32:
1361 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_f32_f32:
1362 return emitAMDGCNImageOverloadedReturnType(
1363 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_l_2darray, IsImageStore: false);
1364 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f32_f32:
1365 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f16_f32:
1366 case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_f32_f32:
1367 return emitAMDGCNImageOverloadedReturnType(
1368 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_sample_d_2darray, IsImageStore: false);
1369 case clang::AMDGPU::BI__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32:
1370 return emitAMDGCNImageOverloadedReturnType(
1371 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_image_gather4_lz_2d, IsImageStore: false);
1372 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
1373 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
1374 llvm::FixedVectorType *VT = FixedVectorType::get(ElementType: Builder.getInt32Ty(), NumElts: 8);
1375 Function *F = CGM.getIntrinsic(
1376 IID: BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
1377 ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
1378 : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
1379 Tys: {VT, VT});
1380
1381 SmallVector<Value *, 9> Args;
1382 for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
1383 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: I)));
1384 return Builder.CreateCall(Callee: F, Args);
1385 }
1386 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
1387 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
1388 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
1389 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
1390 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
1391 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
1392 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
1393 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
1394 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
1395 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
1396 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
1397 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
1398 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
1399 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
1400 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
1401 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
1402 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
1403 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
1404 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
1405 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
1406 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
1407 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
1408 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
1409 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
1410 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
1411 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
1412 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
1413 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
1414 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
1415 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
1416 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
1417 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
1418 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
1419 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
1420 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
1421 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
1422 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
1423 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
1424 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
1425 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
1426 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
1427 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
1428 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
1429 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
1430 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
1431 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
1432 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
1433 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
1434 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
1435 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
1436 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
1437 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
1438 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
1439 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
1440 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
1441 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
1442 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
1443 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
1444 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
1445 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
1446 // GFX1250 WMMA builtins
1447 case AMDGPU::BI__builtin_amdgcn_wmma_f64_16x16x4_f64:
1448 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
1449 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
1450 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
1451 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
1452 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
1453 case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
1454 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
1455 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
1456 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
1457 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
1458 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
1459 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
1460 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
1461 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
1462 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
1463 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
1464 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
1465 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
1466 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
1467 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
1468 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
1469 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
1470 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
1471 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
1472 case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
1473 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1474 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
1475 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
1476 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
1477 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
1478 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
1479 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
1480 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
1481 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
1482 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
1483 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
1484 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
1485 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
1486 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
1487 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
1488 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
1489 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
1490 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: {
1491
1492 // These operations perform a matrix multiplication and accumulation of
1493 // the form:
1494 // D = A * B + C
1495 // We need to specify one type for matrices AB and one for matrices CD.
1496 // Sparse matrix operations can have different types for A and B as well as
1497 // an additional type for sparsity index.
1498 // Destination type should be put before types used for source operands.
1499 SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
1500 // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
1501 // There is no need for the variable opsel argument, so always set it to
1502 // "false".
1503 bool AppendFalseForOpselArg = false;
1504 unsigned BuiltinWMMAOp;
1505 // Need return type when D and C are of different types.
1506 bool NeedReturnType = false;
1507 // Need to remove unused neg modifiers.
1508 bool RemoveABNeg = false;
1509
1510 switch (BuiltinID) {
1511 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
1512 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
1513 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
1514 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
1515 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1516 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
1517 break;
1518 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
1519 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
1520 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
1521 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
1522 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1523 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
1524 break;
1525 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
1526 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
1527 AppendFalseForOpselArg = true;
1528 [[fallthrough]];
1529 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
1530 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
1531 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1532 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
1533 break;
1534 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
1535 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
1536 AppendFalseForOpselArg = true;
1537 [[fallthrough]];
1538 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
1539 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
1540 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1541 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
1542 break;
1543 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
1544 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
1545 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1546 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
1547 break;
1548 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
1549 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
1550 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1551 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
1552 break;
1553 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
1554 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
1555 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
1556 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
1557 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1558 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
1559 break;
1560 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
1561 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
1562 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
1563 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
1564 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1565 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
1566 break;
1567 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
1568 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
1569 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1570 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
1571 break;
1572 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
1573 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
1574 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1575 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
1576 break;
1577 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
1578 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
1579 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1580 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
1581 break;
1582 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
1583 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
1584 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
1585 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
1586 break;
1587 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
1588 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
1589 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
1590 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
1591 break;
1592 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
1593 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
1594 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1595 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
1596 break;
1597 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
1598 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
1599 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1600 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
1601 break;
1602 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
1603 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
1604 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1605 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
1606 break;
1607 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
1608 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
1609 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1610 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
1611 break;
1612 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
1613 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
1614 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1615 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
1616 break;
1617 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
1618 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
1619 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1620 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
1621 break;
1622 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
1623 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
1624 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
1625 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
1626 break;
1627 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
1628 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
1629 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1630 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
1631 break;
1632 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
1633 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
1634 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1635 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
1636 break;
1637 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
1638 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
1639 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1640 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
1641 break;
1642 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
1643 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
1644 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
1645 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
1646 break;
1647 // GFX1250 WMMA builtins
1648 case AMDGPU::BI__builtin_amdgcn_wmma_f64_16x16x4_f64:
1649 ArgsForMatchingMatrixTypes = {5, 1};
1650 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f64_16x16x4_f64;
1651 break;
1652 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
1653 ArgsForMatchingMatrixTypes = {3, 0};
1654 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
1655 RemoveABNeg = true;
1656 break;
1657 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
1658 ArgsForMatchingMatrixTypes = {3, 0};
1659 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16;
1660 RemoveABNeg = true;
1661 break;
1662 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
1663 ArgsForMatchingMatrixTypes = {3, 0};
1664 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16;
1665 RemoveABNeg = true;
1666 break;
1667 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
1668 ArgsForMatchingMatrixTypes = {3, 0};
1669 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16;
1670 RemoveABNeg = true;
1671 break;
1672 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
1673 ArgsForMatchingMatrixTypes = {3, 0};
1674 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16;
1675 RemoveABNeg = true;
1676 break;
1677 case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
1678 NeedReturnType = true;
1679 ArgsForMatchingMatrixTypes = {0, 3};
1680 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16;
1681 RemoveABNeg = true;
1682 break;
1683 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
1684 ArgsForMatchingMatrixTypes = {3, 0};
1685 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8;
1686 break;
1687 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
1688 ArgsForMatchingMatrixTypes = {3, 0};
1689 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8;
1690 break;
1691 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
1692 ArgsForMatchingMatrixTypes = {3, 0};
1693 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8;
1694 break;
1695 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
1696 ArgsForMatchingMatrixTypes = {3, 0};
1697 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8;
1698 break;
1699 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
1700 ArgsForMatchingMatrixTypes = {3, 0};
1701 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8;
1702 break;
1703 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
1704 ArgsForMatchingMatrixTypes = {3, 0};
1705 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8;
1706 break;
1707 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
1708 ArgsForMatchingMatrixTypes = {3, 0};
1709 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8;
1710 break;
1711 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
1712 ArgsForMatchingMatrixTypes = {3, 0};
1713 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8;
1714 break;
1715 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
1716 ArgsForMatchingMatrixTypes = {3, 0};
1717 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8;
1718 break;
1719 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
1720 ArgsForMatchingMatrixTypes = {3, 0};
1721 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8;
1722 break;
1723 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
1724 ArgsForMatchingMatrixTypes = {3, 0};
1725 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8;
1726 break;
1727 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
1728 ArgsForMatchingMatrixTypes = {3, 0};
1729 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8;
1730 break;
1731 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
1732 ArgsForMatchingMatrixTypes = {3, 0};
1733 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8;
1734 break;
1735 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
1736 ArgsForMatchingMatrixTypes = {3, 0};
1737 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8;
1738 break;
1739 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
1740 ArgsForMatchingMatrixTypes = {3, 0};
1741 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8;
1742 break;
1743 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
1744 ArgsForMatchingMatrixTypes = {3, 0};
1745 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8;
1746 break;
1747 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
1748 ArgsForMatchingMatrixTypes = {4, 1};
1749 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
1750 break;
1751 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4:
1752 ArgsForMatchingMatrixTypes = {5, 1, 3};
1753 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4;
1754 break;
1755 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1756 ArgsForMatchingMatrixTypes = {5, 1, 3};
1757 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4;
1758 break;
1759 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
1760 ArgsForMatchingMatrixTypes = {5, 1, 3};
1761 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4;
1762 break;
1763 case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
1764 ArgsForMatchingMatrixTypes = {3, 0, 1};
1765 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
1766 break;
1767 case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
1768 ArgsForMatchingMatrixTypes = {3, 0, 1};
1769 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4;
1770 break;
1771 case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
1772 ArgsForMatchingMatrixTypes = {3, 0, 1};
1773 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4;
1774 break;
1775 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
1776 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1777 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
1778 break;
1779 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
1780 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1781 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16;
1782 break;
1783 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
1784 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1785 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16;
1786 break;
1787 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
1788 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1789 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16;
1790 break;
1791 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
1792 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1793 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16;
1794 break;
1795 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
1796 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1797 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8;
1798 break;
1799 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
1800 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1801 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8;
1802 break;
1803 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
1804 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1805 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8;
1806 break;
1807 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
1808 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1809 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8;
1810 break;
1811 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
1812 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1813 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8;
1814 break;
1815 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
1816 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1817 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8;
1818 break;
1819 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
1820 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1821 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8;
1822 break;
1823 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
1824 ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1825 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8;
1826 break;
1827 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8:
1828 ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1829 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8;
1830 break;
1831 }
1832
1833 SmallVector<Value *, 6> Args;
1834 for (int i = 0, e = E->getNumArgs(); i != e; ++i) {
1835 // Remove unused neg modifiers.
1836 if (RemoveABNeg && (i == 0 || i == 2))
1837 continue;
1838 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: i)));
1839 }
1840 if (AppendFalseForOpselArg)
1841 Args.push_back(Elt: Builder.getFalse());
1842
1843 // Handle the optional clamp argument of the following two builtins.
1844 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8) {
1845 if (Args.size() == 7)
1846 Args.push_back(Elt: Builder.getFalse());
1847 assert(Args.size() == 8 && "Expected 8 arguments");
1848 Args[7] = Builder.CreateZExtOrTrunc(V: Args[7], DestTy: Builder.getInt1Ty());
1849 } else if (BuiltinID ==
1850 AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8) {
1851 if (Args.size() == 8)
1852 Args.push_back(Elt: Builder.getFalse());
1853 assert(Args.size() == 9 && "Expected 9 arguments");
1854 Args[8] = Builder.CreateZExtOrTrunc(V: Args[8], DestTy: Builder.getInt1Ty());
1855 }
1856
1857 SmallVector<llvm::Type *, 6> ArgTypes;
1858 if (NeedReturnType)
1859 ArgTypes.push_back(Elt: ConvertType(T: E->getType()));
1860 for (auto ArgIdx : ArgsForMatchingMatrixTypes)
1861 ArgTypes.push_back(Elt: Args[ArgIdx]->getType());
1862
1863 Function *F = CGM.getIntrinsic(IID: BuiltinWMMAOp, Tys: ArgTypes);
1864 return Builder.CreateCall(Callee: F, Args);
1865 }
1866 // amdgcn workgroup size
1867 case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
1868 return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 0);
1869 case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
1870 return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 1);
1871 case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
1872 return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 2);
1873
1874 // amdgcn grid size
1875 case AMDGPU::BI__builtin_amdgcn_grid_size_x:
1876 return EmitAMDGPUGridSize(CGF&: *this, Index: 0);
1877 case AMDGPU::BI__builtin_amdgcn_grid_size_y:
1878 return EmitAMDGPUGridSize(CGF&: *this, Index: 1);
1879 case AMDGPU::BI__builtin_amdgcn_grid_size_z:
1880 return EmitAMDGPUGridSize(CGF&: *this, Index: 2);
1881
1882 // r600 intrinsics
1883 case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
1884 case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
1885 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
1886 IntrinsicID: Intrinsic::r600_recipsqrt_ieee);
1887 case AMDGPU::BI__builtin_amdgcn_alignbit: {
1888 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
1889 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
1890 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
1891 Function *F = CGM.getIntrinsic(IID: Intrinsic::fshr, Tys: Src0->getType());
1892 return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 });
1893 }
1894 case AMDGPU::BI__builtin_amdgcn_fence: {
1895 ProcessOrderScopeAMDGCN(Order: EmitScalarExpr(E: E->getArg(Arg: 0)),
1896 Scope: EmitScalarExpr(E: E->getArg(Arg: 1)), AO, SSID);
1897 FenceInst *Fence = Builder.CreateFence(Ordering: AO, SSID);
1898 if (E->getNumArgs() > 2)
1899 AddAMDGPUFenceAddressSpaceMMRA(Inst: Fence, E);
1900 return Fence;
1901 }
1902 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1903 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1904 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1905 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1906 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1907 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1908 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1909 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1910 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1911 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1912 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1913 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1914 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1915 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1916 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1917 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1918 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1919 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1920 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1921 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1922 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1923 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1924 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
1925 llvm::AtomicRMWInst::BinOp BinOp;
1926 switch (BuiltinID) {
1927 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1928 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1929 BinOp = llvm::AtomicRMWInst::UIncWrap;
1930 break;
1931 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1932 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1933 BinOp = llvm::AtomicRMWInst::UDecWrap;
1934 break;
1935 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1936 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1937 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1938 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1939 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1940 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1941 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1942 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1943 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1944 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1945 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1946 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1947 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1948 BinOp = llvm::AtomicRMWInst::FAdd;
1949 break;
1950 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1951 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1952 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1953 BinOp = llvm::AtomicRMWInst::FMin;
1954 break;
1955 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1956 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
1957 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1958 BinOp = llvm::AtomicRMWInst::FMax;
1959 break;
1960 }
1961
1962 Address Ptr = CheckAtomicAlignment(CGF&: *this, E);
1963 Value *Val = EmitScalarExpr(E: E->getArg(Arg: 1));
1964 llvm::Type *OrigTy = Val->getType();
1965 QualType PtrTy = E->getArg(Arg: 0)->IgnoreImpCasts()->getType();
1966
1967 bool Volatile;
1968
1969 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
1970 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
1971 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
1972 // __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument
1973 Volatile =
1974 cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 4)))->getZExtValue();
1975 } else {
1976 // Infer volatile from the passed type.
1977 Volatile =
1978 PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
1979 }
1980
1981 if (E->getNumArgs() >= 4) {
1982 // Some of the builtins have explicit ordering and scope arguments.
1983 ProcessOrderScopeAMDGCN(Order: EmitScalarExpr(E: E->getArg(Arg: 2)),
1984 Scope: EmitScalarExpr(E: E->getArg(Arg: 3)), AO, SSID);
1985 } else {
1986 // Most of the builtins do not have syncscope/order arguments. For DS
1987 // atomics the scope doesn't really matter, as they implicitly operate at
1988 // workgroup scope.
1989 //
1990 // The global/flat cases need to use agent scope to consistently produce
1991 // the native instruction instead of a cmpxchg expansion.
1992 if (getTarget().getTriple().isSPIRV())
1993 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "device");
1994 else
1995 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "agent");
1996 AO = AtomicOrdering::Monotonic;
1997
1998 // The v2bf16 builtin uses i16 instead of a natural bfloat type.
1999 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
2000 BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
2001 BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
2002 llvm::Type *V2BF16Ty = FixedVectorType::get(
2003 ElementType: llvm::Type::getBFloatTy(C&: Builder.getContext()), NumElts: 2);
2004 Val = Builder.CreateBitCast(V: Val, DestTy: V2BF16Ty);
2005 }
2006 }
2007
2008 llvm::AtomicRMWInst *RMW =
2009 Builder.CreateAtomicRMW(Op: BinOp, Addr: Ptr, Val, Ordering: AO, SSID);
2010 if (Volatile)
2011 RMW->setVolatile(true);
2012
2013 unsigned AddrSpace = Ptr.getType()->getAddressSpace();
2014 if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
2015 // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
2016 // instruction for flat and global operations.
2017 llvm::MDTuple *EmptyMD = MDNode::get(Context&: getLLVMContext(), MDs: {});
2018 RMW->setMetadata(Kind: "amdgpu.no.fine.grained.memory", Node: EmptyMD);
2019
2020 // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
2021 // instruction, but this only matters for float fadd.
2022 if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
2023 RMW->setMetadata(Kind: "amdgpu.ignore.denormal.mode", Node: EmptyMD);
2024 }
2025
2026 return Builder.CreateBitCast(V: RMW, DestTy: OrigTy);
2027 }
2028 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
2029 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
2030 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
2031 llvm::Type *ResultType = ConvertType(T: E->getType());
2032 // s_sendmsg_rtn is mangled using return type only.
2033 Function *F =
2034 CGM.getIntrinsic(IID: Intrinsic::amdgcn_s_sendmsg_rtn, Tys: {ResultType});
2035 return Builder.CreateCall(Callee: F, Args: {Arg});
2036 }
2037 case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
2038 case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
2039 // Because builtin types are limited, and the intrinsic uses a struct/pair
2040 // output, marshal the pair-of-i32 to <2 x i32>.
2041 Value *VDstOld = EmitScalarExpr(E: E->getArg(Arg: 0));
2042 Value *VSrcOld = EmitScalarExpr(E: E->getArg(Arg: 1));
2043 Value *FI = EmitScalarExpr(E: E->getArg(Arg: 2));
2044 Value *BoundCtrl = EmitScalarExpr(E: E->getArg(Arg: 3));
2045 Function *F =
2046 CGM.getIntrinsic(IID: BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
2047 ? Intrinsic::amdgcn_permlane16_swap
2048 : Intrinsic::amdgcn_permlane32_swap);
2049 llvm::CallInst *Call =
2050 Builder.CreateCall(Callee: F, Args: {VDstOld, VSrcOld, FI, BoundCtrl});
2051
2052 llvm::Value *Elt0 = Builder.CreateExtractValue(Agg: Call, Idxs: 0);
2053 llvm::Value *Elt1 = Builder.CreateExtractValue(Agg: Call, Idxs: 1);
2054
2055 llvm::Type *ResultType = ConvertType(T: E->getType());
2056
2057 llvm::Value *Insert0 = Builder.CreateInsertElement(
2058 Vec: llvm::PoisonValue::get(T: ResultType), NewElt: Elt0, UINT64_C(0));
2059 llvm::Value *AsVector =
2060 Builder.CreateInsertElement(Vec: Insert0, NewElt: Elt1, UINT64_C(1));
2061 return AsVector;
2062 }
2063 case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
2064 case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
2065 return emitBuiltinWithOneOverloadedType<4>(CGF&: *this, E,
2066 IntrinsicID: Intrinsic::amdgcn_bitop3);
2067 case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
2068 // TODO: LLVM has this overloaded to allow for fat pointers, but since
2069 // those haven't been plumbed through to Clang yet, default to creating the
2070 // resource type.
2071 SmallVector<Value *, 4> Args;
2072 for (unsigned I = 0; I < 4; ++I)
2073 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: I)));
2074 llvm::PointerType *RetTy = llvm::PointerType::get(
2075 C&: Builder.getContext(), AddressSpace: llvm::AMDGPUAS::BUFFER_RESOURCE);
2076 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_make_buffer_rsrc,
2077 Tys: {RetTy, Args[0]->getType()});
2078 return Builder.CreateCall(Callee: F, Args);
2079 }
2080 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
2081 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
2082 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
2083 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
2084 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
2085 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
2086 return emitBuiltinWithOneOverloadedType<5>(
2087 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_raw_ptr_buffer_store);
2088 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_format_v4f32:
2089 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_format_v4f16:
2090 return emitBuiltinWithOneOverloadedType<5>(
2091 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_raw_ptr_buffer_store_format);
2092 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
2093 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
2094 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
2095 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
2096 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
2097 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
2098 llvm::Type *RetTy = nullptr;
2099 switch (BuiltinID) {
2100 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
2101 RetTy = Int8Ty;
2102 break;
2103 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
2104 RetTy = Int16Ty;
2105 break;
2106 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
2107 RetTy = Int32Ty;
2108 break;
2109 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
2110 RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 2);
2111 break;
2112 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
2113 RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 3);
2114 break;
2115 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
2116 RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 4);
2117 break;
2118 }
2119 Function *F =
2120 CGM.getIntrinsic(IID: Intrinsic::amdgcn_raw_ptr_buffer_load, Tys: RetTy);
2121 return Builder.CreateCall(
2122 Callee: F, Args: {EmitScalarExpr(E: E->getArg(Arg: 0)), EmitScalarExpr(E: E->getArg(Arg: 1)),
2123 EmitScalarExpr(E: E->getArg(Arg: 2)), EmitScalarExpr(E: E->getArg(Arg: 3))});
2124 }
2125 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_format_v4f32:
2126 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_format_v4f16: {
2127 llvm::Type *RetTy = ConvertType(T: E->getType());
2128 Function *F =
2129 CGM.getIntrinsic(IID: Intrinsic::amdgcn_raw_ptr_buffer_load_format, Tys: {RetTy});
2130
2131 return Builder.CreateCall(
2132 Callee: F, Args: {EmitScalarExpr(E: E->getArg(Arg: 0)), EmitScalarExpr(E: E->getArg(Arg: 1)),
2133 EmitScalarExpr(E: E->getArg(Arg: 2)), EmitScalarExpr(E: E->getArg(Arg: 3))});
2134 }
2135 case AMDGPU::BI__builtin_amdgcn_struct_buffer_store_format_v4f32:
2136 case AMDGPU::BI__builtin_amdgcn_struct_buffer_store_format_v4f16:
2137 return emitBuiltinWithOneOverloadedType<6>(
2138 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_struct_ptr_buffer_store_format);
2139 case AMDGPU::BI__builtin_amdgcn_struct_buffer_load_format_v4f32:
2140 case AMDGPU::BI__builtin_amdgcn_struct_buffer_load_format_v4f16: {
2141 llvm::Type *RetTy = ConvertType(T: E->getType());
2142 Function *F = CGM.getIntrinsic(
2143 IID: Intrinsic::amdgcn_struct_ptr_buffer_load_format, Tys: {RetTy});
2144
2145 return Builder.CreateCall(
2146 Callee: F, Args: {EmitScalarExpr(E: E->getArg(Arg: 0)), EmitScalarExpr(E: E->getArg(Arg: 1)),
2147 EmitScalarExpr(E: E->getArg(Arg: 2)), EmitScalarExpr(E: E->getArg(Arg: 3)),
2148 EmitScalarExpr(E: E->getArg(Arg: 4))});
2149 }
2150 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32:
2151 return emitBuiltinWithOneOverloadedType<5>(
2152 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_raw_ptr_buffer_atomic_add);
2153 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32:
2154 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16:
2155 return emitBuiltinWithOneOverloadedType<5>(
2156 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd);
2157 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f32:
2158 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f64:
2159 return emitBuiltinWithOneOverloadedType<5>(
2160 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin);
2161 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f32:
2162 case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f64:
2163 return emitBuiltinWithOneOverloadedType<5>(
2164 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax);
2165 case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
2166 return emitBuiltinWithOneOverloadedType<2>(
2167 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_s_prefetch_data);
2168 case Builtin::BIlogbf:
2169 case Builtin::BI__builtin_logbf: {
2170 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
2171 Function *FrExpFunc = CGM.getIntrinsic(
2172 IID: Intrinsic::frexp, Tys: {Src0->getType(), Builder.getInt32Ty()});
2173 CallInst *FrExp = Builder.CreateCall(Callee: FrExpFunc, Args: Src0);
2174 Value *Exp = Builder.CreateExtractValue(Agg: FrExp, Idxs: 1);
2175 Value *Add = Builder.CreateAdd(
2176 LHS: Exp, RHS: ConstantInt::getSigned(Ty: Exp->getType(), V: -1), Name: "", HasNUW: false, HasNSW: true);
2177 Value *SIToFP = Builder.CreateSIToFP(V: Add, DestTy: Builder.getFloatTy());
2178 Value *Fabs =
2179 emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::fabs);
2180 Value *FCmpONE = Builder.CreateFCmpONE(
2181 LHS: Fabs, RHS: ConstantFP::getInfinity(Ty: Builder.getFloatTy()));
2182 Value *Sel1 = Builder.CreateSelect(C: FCmpONE, True: SIToFP, False: Fabs);
2183 Value *FCmpOEQ =
2184 Builder.CreateFCmpOEQ(LHS: Src0, RHS: ConstantFP::getZero(Ty: Builder.getFloatTy()));
2185 Value *Sel2 = Builder.CreateSelect(
2186 C: FCmpOEQ,
2187 True: ConstantFP::getInfinity(Ty: Builder.getFloatTy(), /*Negative=*/true), False: Sel1);
2188 return Sel2;
2189 }
2190 case Builtin::BIlogb:
2191 case Builtin::BI__builtin_logb: {
2192 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
2193 Function *FrExpFunc = CGM.getIntrinsic(
2194 IID: Intrinsic::frexp, Tys: {Src0->getType(), Builder.getInt32Ty()});
2195 CallInst *FrExp = Builder.CreateCall(Callee: FrExpFunc, Args: Src0);
2196 Value *Exp = Builder.CreateExtractValue(Agg: FrExp, Idxs: 1);
2197 Value *Add = Builder.CreateAdd(
2198 LHS: Exp, RHS: ConstantInt::getSigned(Ty: Exp->getType(), V: -1), Name: "", HasNUW: false, HasNSW: true);
2199 Value *SIToFP = Builder.CreateSIToFP(V: Add, DestTy: Builder.getDoubleTy());
2200 Value *Fabs =
2201 emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::fabs);
2202 Value *FCmpONE = Builder.CreateFCmpONE(
2203 LHS: Fabs, RHS: ConstantFP::getInfinity(Ty: Builder.getDoubleTy()));
2204 Value *Sel1 = Builder.CreateSelect(C: FCmpONE, True: SIToFP, False: Fabs);
2205 Value *FCmpOEQ =
2206 Builder.CreateFCmpOEQ(LHS: Src0, RHS: ConstantFP::getZero(Ty: Builder.getDoubleTy()));
2207 Value *Sel2 = Builder.CreateSelect(
2208 C: FCmpOEQ,
2209 True: ConstantFP::getInfinity(Ty: Builder.getDoubleTy(), /*Negative=*/true),
2210 False: Sel1);
2211 return Sel2;
2212 }
2213 case Builtin::BIscalbnf:
2214 case Builtin::BI__builtin_scalbnf:
2215 case Builtin::BIscalbn:
2216 case Builtin::BI__builtin_scalbn:
2217 return emitBinaryExpMaybeConstrainedFPBuiltin(
2218 CGF&: *this, E, IntrinsicID: Intrinsic::ldexp, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_ldexp);
2219 case AMDGPU::BI__builtin_amdgcn_permlane_bcast:
2220 return emitBuiltinWithOneOverloadedType<3>(
2221 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_permlane_bcast);
2222 case AMDGPU::BI__builtin_amdgcn_permlane_up:
2223 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
2224 IntrinsicID: Intrinsic::amdgcn_permlane_up);
2225 case AMDGPU::BI__builtin_amdgcn_permlane_down:
2226 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
2227 IntrinsicID: Intrinsic::amdgcn_permlane_down);
2228 case AMDGPU::BI__builtin_amdgcn_permlane_xor:
2229 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
2230 IntrinsicID: Intrinsic::amdgcn_permlane_xor);
2231 default:
2232 return nullptr;
2233 }
2234}
2235