//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit AMDGPU Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//
12
13#include "CGBuiltin.h"
14#include "clang/Basic/TargetBuiltins.h"
15#include "llvm/Analysis/ValueTracking.h"
16#include "llvm/IR/IntrinsicsAMDGPU.h"
17#include "llvm/IR/IntrinsicsR600.h"
18#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
19#include "llvm/Support/AMDGPUAddrSpace.h"
20
21using namespace clang;
22using namespace CodeGen;
23using namespace llvm;
24
25namespace {
26
// The intrinsic is overloaded on the types of both operands (i.e. it has a
// second type-mangled argument).
28static Value *
29emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
30 Intrinsic::ID IntrinsicID,
31 Intrinsic::ID ConstrainedIntrinsicID) {
32 llvm::Value *Src0 = CGF.EmitScalarExpr(E: E->getArg(Arg: 0));
33 llvm::Value *Src1 = CGF.EmitScalarExpr(E: E->getArg(Arg: 1));
34
35 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
36 if (CGF.Builder.getIsFPConstrained()) {
37 Function *F = CGF.CGM.getIntrinsic(IID: ConstrainedIntrinsicID,
38 Tys: {Src0->getType(), Src1->getType()});
39 return CGF.Builder.CreateConstrainedFPCall(Callee: F, Args: {Src0, Src1});
40 }
41
42 Function *F =
43 CGF.CGM.getIntrinsic(IID: IntrinsicID, Tys: {Src0->getType(), Src1->getType()});
44 return CGF.Builder.CreateCall(Callee: F, Args: {Src0, Src1});
45}
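
// A minimal illustration of the helper above (assuming default FP options): a
// call such as __builtin_scalbnf(x, n) is expected to lower roughly to
//   %r = call float @llvm.ldexp.f32.i32(float %x, i32 %n)
// while under strict FP semantics the constrained variant
// @llvm.experimental.constrained.ldexp.f32.i32 is emitted instead. The exact
// type mangling depends on the operand types.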
46
// If \p E is not a null pointer, insert an address space cast to match the
// return type of \p E if necessary.
49Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
50 const CallExpr *E = nullptr) {
51 auto *F = CGF.CGM.getIntrinsic(IID: Intrinsic::amdgcn_dispatch_ptr);
52 auto *Call = CGF.Builder.CreateCall(Callee: F);
53 Call->addRetAttr(
54 Attr: Attribute::getWithDereferenceableBytes(Context&: Call->getContext(), Bytes: 64));
55 Call->addRetAttr(Attr: Attribute::getWithAlignment(Context&: Call->getContext(), Alignment: Align(4)));
56 if (!E)
57 return Call;
58 QualType BuiltinRetType = E->getType();
59 auto *RetTy = cast<llvm::PointerType>(Val: CGF.ConvertType(T: BuiltinRetType));
60 if (RetTy == Call->getType())
61 return Call;
62 return CGF.Builder.CreateAddrSpaceCast(V: Call, DestTy: RetTy);
63}
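
// For illustration only (assuming the usual amdgpu address-space mapping):
// the code above emits roughly
//   %dp = call align 4 dereferenceable(64) ptr addrspace(4)
//             @llvm.amdgcn.dispatch.ptr()
// followed by an addrspacecast when the builtin's return type uses a different
// address space than the intrinsic's constant address space.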
64
65Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
66 auto *F = CGF.CGM.getIntrinsic(IID: Intrinsic::amdgcn_implicitarg_ptr);
67 auto *Call = CGF.Builder.CreateCall(Callee: F);
68 Call->addRetAttr(
69 Attr: Attribute::getWithDereferenceableBytes(Context&: Call->getContext(), Bytes: 256));
70 Call->addRetAttr(Attr: Attribute::getWithAlignment(Context&: Call->getContext(), Alignment: Align(8)));
71 return Call;
72}
73
/// \p Index is 0, 1, and 2 for the x, y, and z dimensions, respectively.
/// Emit code based on the Code Object ABI version.
/// COV_4    : Emit code to use the dispatch ptr.
/// COV_5+   : Emit code to use the implicitarg ptr.
/// COV_NONE : Emit code to load the global variable "__oclc_ABI_version"
///            and use its value to select the COV_4 or COV_5+ approach. This
///            is used for compiling device libraries in an ABI-agnostic way.
81Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
82 llvm::LoadInst *LD;
83
84 auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
85
86 if (Cov == CodeObjectVersionKind::COV_None) {
87 StringRef Name = "__oclc_ABI_version";
88 auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
89 if (!ABIVersionC)
90 ABIVersionC = new llvm::GlobalVariable(
91 CGF.CGM.getModule(), CGF.Int32Ty, false,
92 llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
93 llvm::GlobalVariable::NotThreadLocal,
94 CGF.CGM.getContext().getTargetAddressSpace(AS: LangAS::opencl_constant));
95
    // This load will be eliminated by IPSCCP because the global is a constant
    // weak_odr without externally_initialized. Either changing it to weak or
    // adding externally_initialized will keep the load.
99 Value *ABIVersion = CGF.Builder.CreateAlignedLoad(Ty: CGF.Int32Ty, Addr: ABIVersionC,
100 Align: CGF.CGM.getIntAlign());
101
102 Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
103 LHS: ABIVersion,
104 RHS: llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: CodeObjectVersionKind::COV_5));
105
106 // Indexing the implicit kernarg segment.
107 Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
108 Ty: CGF.Int8Ty, Ptr: EmitAMDGPUImplicitArgPtr(CGF), Idx0: 12 + Index * 2);
109
110 // Indexing the HSA kernel_dispatch_packet struct.
111 Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
112 Ty: CGF.Int8Ty, Ptr: EmitAMDGPUDispatchPtr(CGF), Idx0: 4 + Index * 2);
113
114 auto Result = CGF.Builder.CreateSelect(C: IsCOV5, True: ImplicitGEP, False: DispatchGEP);
115 LD = CGF.Builder.CreateLoad(
116 Addr: Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(Quantity: 2)));
117 } else {
118 Value *GEP = nullptr;
119 if (Cov >= CodeObjectVersionKind::COV_5) {
120 // Indexing the implicit kernarg segment.
121 GEP = CGF.Builder.CreateConstGEP1_32(
122 Ty: CGF.Int8Ty, Ptr: EmitAMDGPUImplicitArgPtr(CGF), Idx0: 12 + Index * 2);
123 } else {
124 // Indexing the HSA kernel_dispatch_packet struct.
125 GEP = CGF.Builder.CreateConstGEP1_32(
126 Ty: CGF.Int8Ty, Ptr: EmitAMDGPUDispatchPtr(CGF), Idx0: 4 + Index * 2);
127 }
128 LD = CGF.Builder.CreateLoad(
129 Addr: Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(Quantity: 2)));
130 }
131
132 llvm::MDBuilder MDHelper(CGF.getLLVMContext());
133 llvm::MDNode *RNode = MDHelper.createRange(Lo: APInt(16, 1),
134 Hi: APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
135 LD->setMetadata(KindID: llvm::LLVMContext::MD_range, Node: RNode);
136 LD->setMetadata(KindID: llvm::LLVMContext::MD_noundef,
137 Node: llvm::MDNode::get(Context&: CGF.getLLVMContext(), MDs: {}));
138 LD->setMetadata(KindID: llvm::LLVMContext::MD_invariant_load,
139 Node: llvm::MDNode::get(Context&: CGF.getLLVMContext(), MDs: {}));
140 return LD;
141}
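
// Rough sketch of the COV_5+ path above for Index == 0 (the x dimension);
// the !range upper bound depends on the target's maximum workgroup size:
//   %iap  = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
//   %gep  = getelementptr i8, ptr addrspace(4) %iap, i32 12
//   %size = load i16, ptr addrspace(4) %gep, align 2,
//           !range ..., !noundef ..., !invariant.load ...
// On COV_4 the base is @llvm.amdgcn.dispatch.ptr() with offset 4 + Index * 2.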
142
143// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
144Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
145 const unsigned XOffset = 12;
146 auto *DP = EmitAMDGPUDispatchPtr(CGF);
147 // Indexing the HSA kernel_dispatch_packet struct.
148 auto *Offset = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: XOffset + Index * 4);
149 auto *GEP = CGF.Builder.CreateGEP(Ty: CGF.Int8Ty, Ptr: DP, IdxList: Offset);
150 auto *LD = CGF.Builder.CreateLoad(
151 Addr: Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(Quantity: 4)));
152
153 llvm::MDBuilder MDB(CGF.getLLVMContext());
154
155 // Known non-zero.
156 LD->setMetadata(KindID: llvm::LLVMContext::MD_range,
157 Node: MDB.createRange(Lo: APInt(32, 1), Hi: APInt::getZero(numBits: 32)));
158 LD->setMetadata(KindID: llvm::LLVMContext::MD_invariant_load,
159 Node: llvm::MDNode::get(Context&: CGF.getLLVMContext(), MDs: {}));
160 return LD;
161}
162} // namespace
163
// Generates the IR for the __builtin_amdgcn_read_exec* builtins by lowering
// them to the amdgcn.ballot intrinsic.
166static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
167 llvm::Type *RegisterType,
168 llvm::Type *ValueType, bool isExecHi) {
169 CodeGen::CGBuilderTy &Builder = CGF.Builder;
170 CodeGen::CodeGenModule &CGM = CGF.CGM;
171
172 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_ballot, Tys: {RegisterType});
173 llvm::Value *Call = Builder.CreateCall(Callee: F, Args: {Builder.getInt1(V: true)});
174
175 if (isExecHi) {
176 Value *Rt2 = Builder.CreateLShr(LHS: Call, RHS: 32);
177 Rt2 = Builder.CreateTrunc(V: Rt2, DestTy: CGF.Int32Ty);
178 return Rt2;
179 }
180
181 return Call;
182}
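
// Illustrative lowering (wave64): __builtin_amdgcn_read_exec_hi() becomes
// roughly
//   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
//   %hi     = lshr i64 %ballot, 32
//   %r      = trunc i64 %hi to i32
// while read_exec and read_exec_lo return the ballot result directly.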
183
// Emit an intrinsic that takes one float or double operand and one integer
// operand.
185static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
186 const CallExpr *E,
187 unsigned IntrinsicID) {
188 llvm::Value *Src0 = CGF.EmitScalarExpr(E: E->getArg(Arg: 0));
189 llvm::Value *Src1 = CGF.EmitScalarExpr(E: E->getArg(Arg: 1));
190
191 Function *F = CGF.CGM.getIntrinsic(IID: IntrinsicID, Tys: Src0->getType());
192 return CGF.Builder.CreateCall(Callee: F, Args: {Src0, Src1});
193}
194
// For processing the memory ordering and memory scope arguments of various
// amdgcn builtins.
// \p Order takes a C11/C++11-compatible memory-ordering specifier and converts
// it into LLVM's memory ordering specifier using the atomic C ABI, writing the
// result to \p AO. \p Scope takes a const char * and converts it into an
// AMDGCN-specific SyncScopeID, writing it to \p SSID.
201void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
202 llvm::AtomicOrdering &AO,
203 llvm::SyncScope::ID &SSID) {
204 int ord = cast<llvm::ConstantInt>(Val: Order)->getZExtValue();
205
206 // Map C11/C++11 memory ordering to LLVM memory ordering
207 assert(llvm::isValidAtomicOrderingCABI(ord));
208 switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
209 case llvm::AtomicOrderingCABI::acquire:
210 case llvm::AtomicOrderingCABI::consume:
211 AO = llvm::AtomicOrdering::Acquire;
212 break;
213 case llvm::AtomicOrderingCABI::release:
214 AO = llvm::AtomicOrdering::Release;
215 break;
216 case llvm::AtomicOrderingCABI::acq_rel:
217 AO = llvm::AtomicOrdering::AcquireRelease;
218 break;
219 case llvm::AtomicOrderingCABI::seq_cst:
220 AO = llvm::AtomicOrdering::SequentiallyConsistent;
221 break;
222 case llvm::AtomicOrderingCABI::relaxed:
223 AO = llvm::AtomicOrdering::Monotonic;
224 break;
225 }
226
227 // Some of the atomic builtins take the scope as a string name.
228 StringRef scp;
229 if (llvm::getConstantStringInfo(V: Scope, Str&: scp)) {
230 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: scp);
231 return;
232 }
233
234 // Older builtins had an enum argument for the memory scope.
235 int scope = cast<llvm::ConstantInt>(Val: Scope)->getZExtValue();
236 switch (scope) {
237 case 0: // __MEMORY_SCOPE_SYSTEM
238 SSID = llvm::SyncScope::System;
239 break;
240 case 1: // __MEMORY_SCOPE_DEVICE
241 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "agent");
242 break;
243 case 2: // __MEMORY_SCOPE_WRKGRP
244 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "workgroup");
245 break;
246 case 3: // __MEMORY_SCOPE_WVFRNT
247 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "wavefront");
248 break;
249 case 4: // __MEMORY_SCOPE_SINGLE
250 SSID = llvm::SyncScope::SingleThread;
251 break;
252 default:
253 SSID = llvm::SyncScope::System;
254 break;
255 }
256}
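
// Example of the mapping above (illustrative): a call such as
//   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent")
// yields AO == SequentiallyConsistent and SSID == the "agent" sync scope,
// while an enum scope of __MEMORY_SCOPE_WRKGRP maps to the "workgroup" scope.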
257
258llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
259 unsigned Idx,
260 const CallExpr *E) {
261 llvm::Value *Arg = nullptr;
262 if ((ICEArguments & (1 << Idx)) == 0) {
263 Arg = EmitScalarExpr(E: E->getArg(Arg: Idx));
264 } else {
265 // If this is required to be a constant, constant fold it so that we
266 // know that the generated intrinsic gets a ConstantInt.
267 std::optional<llvm::APSInt> Result =
268 E->getArg(Arg: Idx)->getIntegerConstantExpr(Ctx: getContext());
269 assert(Result && "Expected argument to be a constant");
270 Arg = llvm::ConstantInt::get(Context&: getLLVMContext(), V: *Result);
271 }
272 return Arg;
273}
274
275void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
276 const CallExpr *E) {
277 constexpr const char *Tag = "amdgpu-as";
278
279 LLVMContext &Ctx = Inst->getContext();
280 SmallVector<MMRAMetadata::TagT, 3> MMRAs;
281 for (unsigned K = 2; K < E->getNumArgs(); ++K) {
282 llvm::Value *V = EmitScalarExpr(E: E->getArg(Arg: K));
283 StringRef AS;
284 if (llvm::getConstantStringInfo(V, Str&: AS)) {
285 MMRAs.push_back(Elt: {Tag, AS});
286 // TODO: Delete the resulting unused constant?
287 continue;
288 }
289 CGM.Error(loc: E->getExprLoc(),
290 error: "expected an address space name as a string literal");
291 }
292
293 llvm::sort(C&: MMRAs);
294 MMRAs.erase(CS: llvm::unique(R&: MMRAs), CE: MMRAs.end());
295 Inst->setMetadata(KindID: LLVMContext::MD_mmra, Node: MMRAMetadata::getMD(Ctx, Tags: MMRAs));
296}
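
// Sketch of the expected result (the exact metadata encoding may differ):
//   __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup", "local")
// produces roughly
//   fence syncscope("workgroup") release, !mmra !{!"amdgpu-as", !"local"}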
297
298Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
299 const CallExpr *E) {
300 llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
301 llvm::SyncScope::ID SSID;
302 switch (BuiltinID) {
303 case AMDGPU::BI__builtin_amdgcn_div_scale:
304 case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsic's struct return to the builtin's out
    // argument.
307
308 Address FlagOutPtr = EmitPointerWithAlignment(Addr: E->getArg(Arg: 3));
309
310 llvm::Value *X = EmitScalarExpr(E: E->getArg(Arg: 0));
311 llvm::Value *Y = EmitScalarExpr(E: E->getArg(Arg: 1));
312 llvm::Value *Z = EmitScalarExpr(E: E->getArg(Arg: 2));
313
314 llvm::Function *Callee = CGM.getIntrinsic(IID: Intrinsic::amdgcn_div_scale,
315 Tys: X->getType());
316
317 llvm::Value *Tmp = Builder.CreateCall(Callee, Args: {X, Y, Z});
318
319 llvm::Value *Result = Builder.CreateExtractValue(Agg: Tmp, Idxs: 0);
320 llvm::Value *Flag = Builder.CreateExtractValue(Agg: Tmp, Idxs: 1);
321
322 llvm::Type *RealFlagType = FlagOutPtr.getElementType();
323
324 llvm::Value *FlagExt = Builder.CreateZExt(V: Flag, DestTy: RealFlagType);
325 Builder.CreateStore(Val: FlagExt, Addr: FlagOutPtr);
326 return Result;
327 }
328 case AMDGPU::BI__builtin_amdgcn_div_fmas:
329 case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
330 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
331 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
332 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
333 llvm::Value *Src3 = EmitScalarExpr(E: E->getArg(Arg: 3));
334
335 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_div_fmas,
336 Tys: Src0->getType());
337 llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Arg: Src3);
338 return Builder.CreateCall(Callee: F, Args: {Src0, Src1, Src2, Src3ToBool});
339 }
340
341 case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
342 return emitBuiltinWithOneOverloadedType<2>(CGF&: *this, E,
343 IntrinsicID: Intrinsic::amdgcn_ds_swizzle);
344 case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
345 case AMDGPU::BI__builtin_amdgcn_mov_dpp:
346 case AMDGPU::BI__builtin_amdgcn_update_dpp: {
347 llvm::SmallVector<llvm::Value *, 6> Args;
348 // Find out if any arguments are required to be integer constant
349 // expressions.
350 unsigned ICEArguments = 0;
351 ASTContext::GetBuiltinTypeError Error;
352 getContext().GetBuiltinType(ID: BuiltinID, Error, IntegerConstantArgs: &ICEArguments);
353 assert(Error == ASTContext::GE_None && "Should not codegen an error");
354 llvm::Type *DataTy = ConvertType(T: E->getArg(Arg: 0)->getType());
355 unsigned Size = DataTy->getPrimitiveSizeInBits();
356 llvm::Type *IntTy =
357 llvm::IntegerType::get(C&: Builder.getContext(), NumBits: std::max(a: Size, b: 32u));
358 Function *F =
359 CGM.getIntrinsic(IID: BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
360 ? Intrinsic::amdgcn_mov_dpp8
361 : Intrinsic::amdgcn_update_dpp,
362 Tys: IntTy);
363 assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
364 E->getNumArgs() == 2);
365 bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
366 if (InsertOld)
367 Args.push_back(Elt: llvm::PoisonValue::get(T: IntTy));
368 for (unsigned I = 0; I != E->getNumArgs(); ++I) {
369 llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, Idx: I, E);
370 if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
371 Size < 32) {
372 if (!DataTy->isIntegerTy())
373 V = Builder.CreateBitCast(
374 V, DestTy: llvm::IntegerType::get(C&: Builder.getContext(), NumBits: Size));
375 V = Builder.CreateZExtOrBitCast(V, DestTy: IntTy);
376 }
377 llvm::Type *ExpTy =
378 F->getFunctionType()->getFunctionParamType(i: I + InsertOld);
379 Args.push_back(Elt: Builder.CreateTruncOrBitCast(V, DestTy: ExpTy));
380 }
381 Value *V = Builder.CreateCall(Callee: F, Args);
382 if (Size < 32 && !DataTy->isIntegerTy())
383 V = Builder.CreateTrunc(
384 V, DestTy: llvm::IntegerType::get(C&: Builder.getContext(), NumBits: Size));
385 return Builder.CreateTruncOrBitCast(V, DestTy: DataTy);
386 }
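  // Rough example of the marshalling in the case above: update_dpp on a half
  // operand bitcasts to i16, zero-extends to i32, calls
  // @llvm.amdgcn.update.dpp.i32, and then truncates/bitcasts the result back
  // to half; the DPP control operands are passed through as constants.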
387 case AMDGPU::BI__builtin_amdgcn_permlane16:
388 case AMDGPU::BI__builtin_amdgcn_permlanex16:
389 return emitBuiltinWithOneOverloadedType<6>(
390 CGF&: *this, E,
391 IntrinsicID: BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
392 ? Intrinsic::amdgcn_permlane16
393 : Intrinsic::amdgcn_permlanex16);
394 case AMDGPU::BI__builtin_amdgcn_permlane64:
395 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
396 IntrinsicID: Intrinsic::amdgcn_permlane64);
397 case AMDGPU::BI__builtin_amdgcn_readlane:
398 return emitBuiltinWithOneOverloadedType<2>(CGF&: *this, E,
399 IntrinsicID: Intrinsic::amdgcn_readlane);
400 case AMDGPU::BI__builtin_amdgcn_readfirstlane:
401 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
402 IntrinsicID: Intrinsic::amdgcn_readfirstlane);
403 case AMDGPU::BI__builtin_amdgcn_div_fixup:
404 case AMDGPU::BI__builtin_amdgcn_div_fixupf:
405 case AMDGPU::BI__builtin_amdgcn_div_fixuph:
406 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
407 IntrinsicID: Intrinsic::amdgcn_div_fixup);
408 case AMDGPU::BI__builtin_amdgcn_trig_preop:
409 case AMDGPU::BI__builtin_amdgcn_trig_preopf:
410 return emitFPIntBuiltin(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_trig_preop);
411 case AMDGPU::BI__builtin_amdgcn_rcp:
412 case AMDGPU::BI__builtin_amdgcn_rcpf:
413 case AMDGPU::BI__builtin_amdgcn_rcph:
414 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_rcp);
415 case AMDGPU::BI__builtin_amdgcn_sqrt:
416 case AMDGPU::BI__builtin_amdgcn_sqrtf:
417 case AMDGPU::BI__builtin_amdgcn_sqrth:
418 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
419 IntrinsicID: Intrinsic::amdgcn_sqrt);
420 case AMDGPU::BI__builtin_amdgcn_rsq:
421 case AMDGPU::BI__builtin_amdgcn_rsqf:
422 case AMDGPU::BI__builtin_amdgcn_rsqh:
423 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_rsq);
424 case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
425 case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
426 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
427 IntrinsicID: Intrinsic::amdgcn_rsq_clamp);
428 case AMDGPU::BI__builtin_amdgcn_sinf:
429 case AMDGPU::BI__builtin_amdgcn_sinh:
430 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_sin);
431 case AMDGPU::BI__builtin_amdgcn_cosf:
432 case AMDGPU::BI__builtin_amdgcn_cosh:
433 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_cos);
434 case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
435 return EmitAMDGPUDispatchPtr(CGF&: *this, E);
436 case AMDGPU::BI__builtin_amdgcn_logf:
437 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_log);
438 case AMDGPU::BI__builtin_amdgcn_exp2f:
439 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
440 IntrinsicID: Intrinsic::amdgcn_exp2);
441 case AMDGPU::BI__builtin_amdgcn_log_clampf:
442 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
443 IntrinsicID: Intrinsic::amdgcn_log_clamp);
444 case AMDGPU::BI__builtin_amdgcn_ldexp:
445 case AMDGPU::BI__builtin_amdgcn_ldexpf: {
446 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
447 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
448 llvm::Function *F =
449 CGM.getIntrinsic(IID: Intrinsic::ldexp, Tys: {Src0->getType(), Src1->getType()});
450 return Builder.CreateCall(Callee: F, Args: {Src0, Src1});
451 }
452 case AMDGPU::BI__builtin_amdgcn_ldexph: {
    // The raw instruction behaves differently for out-of-bounds exponent
    // values (implicit truncation instead of saturating to short_min /
    // short_max).
455 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
456 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
457 llvm::Function *F =
458 CGM.getIntrinsic(IID: Intrinsic::ldexp, Tys: {Src0->getType(), Int16Ty});
459 return Builder.CreateCall(Callee: F, Args: {Src0, Builder.CreateTrunc(V: Src1, DestTy: Int16Ty)});
460 }
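  // Illustrative lowering of the ldexph case above:
  //   %et = trunc i32 %e to i16
  //   %r  = call half @llvm.ldexp.f16.i16(half %x, i16 %et)
  // unlike the f32/f64 variants, which pass the i32 exponent through directly.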
461 case AMDGPU::BI__builtin_amdgcn_frexp_mant:
462 case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
463 case AMDGPU::BI__builtin_amdgcn_frexp_manth:
464 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
465 IntrinsicID: Intrinsic::amdgcn_frexp_mant);
466 case AMDGPU::BI__builtin_amdgcn_frexp_exp:
467 case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
468 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
469 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_frexp_exp,
470 Tys: { Builder.getInt32Ty(), Src0->getType() });
471 return Builder.CreateCall(Callee: F, Args: Src0);
472 }
473 case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
474 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
475 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_frexp_exp,
476 Tys: { Builder.getInt16Ty(), Src0->getType() });
477 return Builder.CreateCall(Callee: F, Args: Src0);
478 }
479 case AMDGPU::BI__builtin_amdgcn_fract:
480 case AMDGPU::BI__builtin_amdgcn_fractf:
481 case AMDGPU::BI__builtin_amdgcn_fracth:
482 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
483 IntrinsicID: Intrinsic::amdgcn_fract);
484 case AMDGPU::BI__builtin_amdgcn_lerp:
485 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
486 IntrinsicID: Intrinsic::amdgcn_lerp);
487 case AMDGPU::BI__builtin_amdgcn_ubfe:
488 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
489 IntrinsicID: Intrinsic::amdgcn_ubfe);
490 case AMDGPU::BI__builtin_amdgcn_sbfe:
491 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
492 IntrinsicID: Intrinsic::amdgcn_sbfe);
493 case AMDGPU::BI__builtin_amdgcn_ballot_w32:
494 case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
495 llvm::Type *ResultType = ConvertType(T: E->getType());
496 llvm::Value *Src = EmitScalarExpr(E: E->getArg(Arg: 0));
497 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_ballot, Tys: { ResultType });
498 return Builder.CreateCall(Callee: F, Args: { Src });
499 }
500 case AMDGPU::BI__builtin_amdgcn_uicmp:
501 case AMDGPU::BI__builtin_amdgcn_uicmpl:
502 case AMDGPU::BI__builtin_amdgcn_sicmp:
503 case AMDGPU::BI__builtin_amdgcn_sicmpl: {
504 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
505 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
506 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
507
    // FIXME-GFX10: How should a 32-bit mask be handled?
509 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_icmp,
510 Tys: { Builder.getInt64Ty(), Src0->getType() });
511 return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 });
512 }
513 case AMDGPU::BI__builtin_amdgcn_fcmp:
514 case AMDGPU::BI__builtin_amdgcn_fcmpf: {
515 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
516 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
517 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
518
    // FIXME-GFX10: How should a 32-bit mask be handled?
520 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_fcmp,
521 Tys: { Builder.getInt64Ty(), Src0->getType() });
522 return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 });
523 }
524 case AMDGPU::BI__builtin_amdgcn_class:
525 case AMDGPU::BI__builtin_amdgcn_classf:
526 case AMDGPU::BI__builtin_amdgcn_classh:
527 return emitFPIntBuiltin(CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_class);
528 case AMDGPU::BI__builtin_amdgcn_fmed3f:
529 case AMDGPU::BI__builtin_amdgcn_fmed3h:
530 return emitBuiltinWithOneOverloadedType<3>(CGF&: *this, E,
531 IntrinsicID: Intrinsic::amdgcn_fmed3);
532 case AMDGPU::BI__builtin_amdgcn_ds_append:
533 case AMDGPU::BI__builtin_amdgcn_ds_consume: {
534 Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
535 Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
536 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
537 Function *F = CGM.getIntrinsic(IID: Intrin, Tys: { Src0->getType() });
538 return Builder.CreateCall(Callee: F, Args: { Src0, Builder.getFalse() });
539 }
540 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
541 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
542 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
543 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
544 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
545 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
546 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
547 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
548 case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
549 case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
550 case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
551 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
552 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
553 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
554 case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
555 case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
556 case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
557 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
558 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
559 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
560 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
561 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
562 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
563 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
564 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
565 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
566 Intrinsic::ID IID;
567 switch (BuiltinID) {
568 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
569 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
570 case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
571 IID = Intrinsic::amdgcn_global_load_tr_b64;
572 break;
573 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
574 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
575 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
576 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
577 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
578 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
579 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
580 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
581 case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
582 IID = Intrinsic::amdgcn_global_load_tr_b128;
583 break;
584 case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
585 IID = Intrinsic::amdgcn_global_load_tr4_b64;
586 break;
587 case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
588 IID = Intrinsic::amdgcn_global_load_tr6_b96;
589 break;
590 case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
591 IID = Intrinsic::amdgcn_ds_load_tr4_b64;
592 break;
593 case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
594 IID = Intrinsic::amdgcn_ds_load_tr6_b96;
595 break;
596 case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
597 IID = Intrinsic::amdgcn_ds_load_tr8_b64;
598 break;
599 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
600 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
601 case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
602 IID = Intrinsic::amdgcn_ds_load_tr16_b128;
603 break;
604 case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
605 IID = Intrinsic::amdgcn_ds_read_tr4_b64;
606 break;
607 case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
608 IID = Intrinsic::amdgcn_ds_read_tr8_b64;
609 break;
610 case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
611 IID = Intrinsic::amdgcn_ds_read_tr6_b96;
612 break;
613 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
614 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
615 case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
616 IID = Intrinsic::amdgcn_ds_read_tr16_b64;
617 break;
618 }
619 llvm::Type *LoadTy = ConvertType(T: E->getType());
620 llvm::Value *Addr = EmitScalarExpr(E: E->getArg(Arg: 0));
621 llvm::Function *F = CGM.getIntrinsic(IID, Tys: {LoadTy});
622 return Builder.CreateCall(Callee: F, Args: {Addr});
623 }
624 case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
    // TODO: Should this have ASan instrumentation?
626 return emitBuiltinWithOneOverloadedType<5>(CGF&: *this, E,
627 IntrinsicID: Intrinsic::amdgcn_load_to_lds);
628 }
629 case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
630 Function *F = CGM.getIntrinsic(IID: Intrinsic::get_fpenv,
631 Tys: {llvm::Type::getInt64Ty(C&: getLLVMContext())});
632 return Builder.CreateCall(Callee: F);
633 }
634 case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
635 Function *F = CGM.getIntrinsic(IID: Intrinsic::set_fpenv,
636 Tys: {llvm::Type::getInt64Ty(C&: getLLVMContext())});
637 llvm::Value *Env = EmitScalarExpr(E: E->getArg(Arg: 0));
638 return Builder.CreateCall(Callee: F, Args: {Env});
639 }
640 case AMDGPU::BI__builtin_amdgcn_read_exec:
641 return EmitAMDGCNBallotForExec(CGF&: *this, E, RegisterType: Int64Ty, ValueType: Int64Ty, isExecHi: false);
642 case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
643 return EmitAMDGCNBallotForExec(CGF&: *this, E, RegisterType: Int32Ty, ValueType: Int32Ty, isExecHi: false);
644 case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
645 return EmitAMDGCNBallotForExec(CGF&: *this, E, RegisterType: Int64Ty, ValueType: Int64Ty, isExecHi: true);
646 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
647 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
648 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
649 case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
650 llvm::Value *NodePtr = EmitScalarExpr(E: E->getArg(Arg: 0));
651 llvm::Value *RayExtent = EmitScalarExpr(E: E->getArg(Arg: 1));
652 llvm::Value *RayOrigin = EmitScalarExpr(E: E->getArg(Arg: 2));
653 llvm::Value *RayDir = EmitScalarExpr(E: E->getArg(Arg: 3));
654 llvm::Value *RayInverseDir = EmitScalarExpr(E: E->getArg(Arg: 4));
655 llvm::Value *TextureDescr = EmitScalarExpr(E: E->getArg(Arg: 5));
656
657 // The builtins take these arguments as vec4 where the last element is
658 // ignored. The intrinsic takes them as vec3.
659 RayOrigin = Builder.CreateShuffleVector(V1: RayOrigin, V2: RayOrigin,
660 Mask: {0, 1, 2});
661 RayDir =
662 Builder.CreateShuffleVector(V1: RayDir, V2: RayDir, Mask: {0, 1, 2});
663 RayInverseDir = Builder.CreateShuffleVector(V1: RayInverseDir, V2: RayInverseDir,
664 Mask: {0, 1, 2});
665
666 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_image_bvh_intersect_ray,
667 Tys: {NodePtr->getType(), RayDir->getType()});
668 return Builder.CreateCall(Callee: F, Args: {NodePtr, RayExtent, RayOrigin, RayDir,
669 RayInverseDir, TextureDescr});
670 }
671 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
672 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
673 Intrinsic::ID IID;
674 switch (BuiltinID) {
675 case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
676 IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
677 break;
678 case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
679 IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
680 break;
681 }
682 llvm::Value *NodePtr = EmitScalarExpr(E: E->getArg(Arg: 0));
683 llvm::Value *RayExtent = EmitScalarExpr(E: E->getArg(Arg: 1));
684 llvm::Value *InstanceMask = EmitScalarExpr(E: E->getArg(Arg: 2));
685 llvm::Value *RayOrigin = EmitScalarExpr(E: E->getArg(Arg: 3));
686 llvm::Value *RayDir = EmitScalarExpr(E: E->getArg(Arg: 4));
687 llvm::Value *Offset = EmitScalarExpr(E: E->getArg(Arg: 5));
688 llvm::Value *TextureDescr = EmitScalarExpr(E: E->getArg(Arg: 6));
689
690 Address RetRayOriginPtr = EmitPointerWithAlignment(Addr: E->getArg(Arg: 7));
691 Address RetRayDirPtr = EmitPointerWithAlignment(Addr: E->getArg(Arg: 8));
692
693 llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);
694
695 llvm::CallInst *CI = Builder.CreateCall(
696 Callee: IntrinsicFunc, Args: {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
697 Offset, TextureDescr});
698
699 llvm::Value *RetVData = Builder.CreateExtractValue(Agg: CI, Idxs: 0);
700 llvm::Value *RetRayOrigin = Builder.CreateExtractValue(Agg: CI, Idxs: 1);
701 llvm::Value *RetRayDir = Builder.CreateExtractValue(Agg: CI, Idxs: 2);
702
703 Builder.CreateStore(Val: RetRayOrigin, Addr: RetRayOriginPtr);
704 Builder.CreateStore(Val: RetRayDir, Addr: RetRayDirPtr);
705
706 return RetVData;
707 }
708
709 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
710 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
711 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
712 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
713 Intrinsic::ID IID;
714 switch (BuiltinID) {
715 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
716 IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
717 break;
718 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
719 IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
720 break;
721 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
722 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
723 break;
724 case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
725 IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
726 break;
727 }
728
729 SmallVector<Value *, 4> Args;
730 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
731 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: i)));
732
733 Function *F = CGM.getIntrinsic(IID);
734 Value *Call = Builder.CreateCall(Callee: F, Args);
735 Value *Rtn = Builder.CreateExtractValue(Agg: Call, Idxs: 0);
736 Value *A = Builder.CreateExtractValue(Agg: Call, Idxs: 1);
737 llvm::Type *RetTy = ConvertType(T: E->getType());
738 Value *I0 = Builder.CreateInsertElement(Vec: PoisonValue::get(T: RetTy), NewElt: Rtn,
739 Idx: (uint64_t)0);
    // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
    // <2 x i64>, so zero-extend the second value.
742 if (A->getType()->getPrimitiveSizeInBits() <
743 RetTy->getScalarType()->getPrimitiveSizeInBits())
744 A = Builder.CreateZExt(V: A, DestTy: RetTy->getScalarType());
745
746 return Builder.CreateInsertElement(Vec: I0, NewElt: A, Idx: 1);
747 }
748 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
749 case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
750 llvm::FixedVectorType *VT = FixedVectorType::get(ElementType: Builder.getInt32Ty(), NumElts: 8);
751 Function *F = CGM.getIntrinsic(
752 IID: BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
753 ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
754 : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
755 Tys: {VT, VT});
756
757 SmallVector<Value *, 9> Args;
758 for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
759 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: I)));
760 return Builder.CreateCall(Callee: F, Args);
761 }
762 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
763 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
764 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
765 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
766 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
767 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
768 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
769 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
770 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
771 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
772 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
773 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
774 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
775 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
776 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
777 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
778 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
779 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
780 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
781 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
782 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
783 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
784 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
785 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
786 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
787 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
788 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
789 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
790 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
791 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
792 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
793 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
794 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
795 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
796 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
797 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
798 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
799 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
800 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
801 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
802 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
803 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
804 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
805 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
806 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
807 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
808 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
809 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
810 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
811 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
812 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
813 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
814 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
815 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
816 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
817 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
818 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
819 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
820 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
821 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
822
    // These operations perform a matrix multiplication and accumulation of
    // the form:
    //             D = A * B + C
    // We need to specify one type for the AB matrices and one for the CD
    // matrices. Sparse matrix operations can have different types for A and B,
    // as well as an additional type for the sparsity index.
    // The destination type should be put before the types used for the source
    // operands.
830 SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
831 // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
832 // There is no need for the variable opsel argument, so always set it to
833 // "false".
834 bool AppendFalseForOpselArg = false;
835 unsigned BuiltinWMMAOp;
836
837 switch (BuiltinID) {
838 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
839 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
840 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
841 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
842 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
843 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
844 break;
845 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
846 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
847 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
848 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
849 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
850 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
851 break;
852 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
853 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
854 AppendFalseForOpselArg = true;
855 [[fallthrough]];
856 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
857 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
858 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
859 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
860 break;
861 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
862 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
863 AppendFalseForOpselArg = true;
864 [[fallthrough]];
865 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
866 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
867 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
868 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
869 break;
870 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
871 case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
872 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
873 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
874 break;
875 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
876 case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
877 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
878 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
879 break;
880 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
881 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
882 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
883 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
884 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
885 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
886 break;
887 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
888 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
889 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
890 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
891 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
892 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
893 break;
894 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
895 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
896 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
897 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
898 break;
899 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
900 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
901 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
902 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
903 break;
904 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
905 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
906 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
907 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
908 break;
909 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
910 case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
911 ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
912 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
913 break;
914 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
915 case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
916 ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
917 BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
918 break;
919 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
920 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
921 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
922 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
923 break;
924 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
925 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
926 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
927 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
928 break;
929 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
930 case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
931 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
932 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
933 break;
934 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
935 case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
936 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
937 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
938 break;
939 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
940 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
941 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
942 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
943 break;
944 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
945 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
946 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
947 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
948 break;
949 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
950 case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
951 ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
952 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
953 break;
954 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
955 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
956 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
957 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
958 break;
959 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
960 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
961 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
962 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
963 break;
964 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
965 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
966 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
967 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
968 break;
969 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
970 case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
971 ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
972 BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
973 break;
974 }
975
976 SmallVector<Value *, 6> Args;
977 for (int i = 0, e = E->getNumArgs(); i != e; ++i)
978 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: i)));
979 if (AppendFalseForOpselArg)
980 Args.push_back(Elt: Builder.getFalse());
981
982 SmallVector<llvm::Type *, 6> ArgTypes;
983 for (auto ArgIdx : ArgsForMatchingMatrixTypes)
984 ArgTypes.push_back(Elt: Args[ArgIdx]->getType());
985
986 Function *F = CGM.getIntrinsic(IID: BuiltinWMMAOp, Tys: ArgTypes);
987 return Builder.CreateCall(Callee: F, Args);
988 }
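  // Worked example for the handling above (illustrative mangling): for
  //   __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(A, B, C)
  // ArgsForMatchingMatrixTypes is {2, 0}, so the intrinsic is overloaded on
  // the types of C (the CD accumulator) and A (the AB operands), yielding
  // roughly @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16.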
989 // amdgcn workgroup size
990 case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
991 return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 0);
992 case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
993 return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 1);
994 case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
995 return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 2);
996
997 // amdgcn grid size
998 case AMDGPU::BI__builtin_amdgcn_grid_size_x:
999 return EmitAMDGPUGridSize(CGF&: *this, Index: 0);
1000 case AMDGPU::BI__builtin_amdgcn_grid_size_y:
1001 return EmitAMDGPUGridSize(CGF&: *this, Index: 1);
1002 case AMDGPU::BI__builtin_amdgcn_grid_size_z:
1003 return EmitAMDGPUGridSize(CGF&: *this, Index: 2);
1004
1005 // r600 intrinsics
1006 case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
1007 case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
1008 return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E,
1009 IntrinsicID: Intrinsic::r600_recipsqrt_ieee);
1010 case AMDGPU::BI__builtin_amdgcn_alignbit: {
1011 llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
1012 llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1));
1013 llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2));
1014 Function *F = CGM.getIntrinsic(IID: Intrinsic::fshr, Tys: Src0->getType());
1015 return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 });
1016 }
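  // Illustrative: __builtin_amdgcn_alignbit(a, b, c) maps onto the generic
  // funnel-shift-right intrinsic, roughly
  //   %r = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)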
1017 case AMDGPU::BI__builtin_amdgcn_fence: {
1018 ProcessOrderScopeAMDGCN(Order: EmitScalarExpr(E: E->getArg(Arg: 0)),
1019 Scope: EmitScalarExpr(E: E->getArg(Arg: 1)), AO, SSID);
1020 FenceInst *Fence = Builder.CreateFence(Ordering: AO, SSID);
1021 if (E->getNumArgs() > 2)
1022 AddAMDGPUFenceAddressSpaceMMRA(Inst: Fence, E);
1023 return Fence;
1024 }
1025 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1026 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1027 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1028 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1029 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1030 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1031 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1032 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1033 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1034 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1035 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1036 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1037 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1038 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1039 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1040 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1041 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1042 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1043 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1044 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1045 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1046 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1047 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
1048 llvm::AtomicRMWInst::BinOp BinOp;
1049 switch (BuiltinID) {
1050 case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1051 case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1052 BinOp = llvm::AtomicRMWInst::UIncWrap;
1053 break;
1054 case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
1055 case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1056 BinOp = llvm::AtomicRMWInst::UDecWrap;
1057 break;
1058 case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1059 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1060 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
1061 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
1062 case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
1063 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
1064 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
1065 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1066 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1067 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1068 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1069 case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
1070 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1071 BinOp = llvm::AtomicRMWInst::FAdd;
1072 break;
1073 case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1074 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
1075 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
1076 BinOp = llvm::AtomicRMWInst::FMin;
1077 break;
1078 case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1079 case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
1080 case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
1081 BinOp = llvm::AtomicRMWInst::FMax;
1082 break;
1083 }
1084
1085 Address Ptr = CheckAtomicAlignment(CGF&: *this, E);
1086 Value *Val = EmitScalarExpr(E: E->getArg(Arg: 1));
1087 llvm::Type *OrigTy = Val->getType();
1088 QualType PtrTy = E->getArg(Arg: 0)->IgnoreImpCasts()->getType();
1089
1090 bool Volatile;
1091
1092 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
1093 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
1094 BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
      // __builtin_amdgcn_ds_faddf/fminf/fmaxf have an explicit volatile
      // argument.
1096 Volatile =
1097 cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 4)))->getZExtValue();
1098 } else {
1099 // Infer volatile from the passed type.
1100 Volatile =
1101 PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
1102 }
1103
1104 if (E->getNumArgs() >= 4) {
1105 // Some of the builtins have explicit ordering and scope arguments.
1106 ProcessOrderScopeAMDGCN(Order: EmitScalarExpr(E: E->getArg(Arg: 2)),
1107 Scope: EmitScalarExpr(E: E->getArg(Arg: 3)), AO, SSID);
1108 } else {
1109 // Most of the builtins do not have syncscope/order arguments. For DS
1110 // atomics the scope doesn't really matter, as they implicitly operate at
1111 // workgroup scope.
1112 //
1113 // The global/flat cases need to use agent scope to consistently produce
1114 // the native instruction instead of a cmpxchg expansion.
1115 SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "agent");
1116 AO = AtomicOrdering::Monotonic;
1117
1118 // The v2bf16 builtin uses i16 instead of a natural bfloat type.
1119 if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
1120 BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
1121 BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
1122 llvm::Type *V2BF16Ty = FixedVectorType::get(
1123 ElementType: llvm::Type::getBFloatTy(C&: Builder.getContext()), NumElts: 2);
1124 Val = Builder.CreateBitCast(V: Val, DestTy: V2BF16Ty);
1125 }
1126 }
1127
1128 llvm::AtomicRMWInst *RMW =
1129 Builder.CreateAtomicRMW(Op: BinOp, Addr: Ptr, Val, Ordering: AO, SSID);
1130 if (Volatile)
1131 RMW->setVolatile(true);
1132
1133 unsigned AddrSpace = Ptr.getType()->getAddressSpace();
1134 if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
1135 // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
1136 // instruction for flat and global operations.
1137 llvm::MDTuple *EmptyMD = MDNode::get(Context&: getLLVMContext(), MDs: {});
1138 RMW->setMetadata(Kind: "amdgpu.no.fine.grained.memory", Node: EmptyMD);
1139
1140 // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
1141 // instruction, but this only matters for float fadd.
1142 if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
1143 RMW->setMetadata(Kind: "amdgpu.ignore.denormal.mode", Node: EmptyMD);
1144 }
1145
1146 return Builder.CreateBitCast(V: RMW, DestTy: OrigTy);
1147 }
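  // Sketch of a typical result for the case above (illustrative):
  // __builtin_amdgcn_global_atomic_fadd_f32 with no order/scope arguments
  // becomes roughly
  //   atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent")
  //       monotonic, align 4, !amdgpu.no.fine.grained.memory !{}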
1148 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
1149 case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
1150 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
1151 llvm::Type *ResultType = ConvertType(T: E->getType());
1152 // s_sendmsg_rtn is mangled using return type only.
1153 Function *F =
1154 CGM.getIntrinsic(IID: Intrinsic::amdgcn_s_sendmsg_rtn, Tys: {ResultType});
1155 return Builder.CreateCall(Callee: F, Args: {Arg});
1156 }
1157 case AMDGPU::BI__builtin_amdgcn_permlane16_swap:
1158 case AMDGPU::BI__builtin_amdgcn_permlane32_swap: {
    // Because builtin types are limited and the intrinsic uses a struct/pair
    // output, marshal the pair of i32 values into a <2 x i32>.
1161 Value *VDstOld = EmitScalarExpr(E: E->getArg(Arg: 0));
1162 Value *VSrcOld = EmitScalarExpr(E: E->getArg(Arg: 1));
1163 Value *FI = EmitScalarExpr(E: E->getArg(Arg: 2));
1164 Value *BoundCtrl = EmitScalarExpr(E: E->getArg(Arg: 3));
1165 Function *F =
1166 CGM.getIntrinsic(IID: BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap
1167 ? Intrinsic::amdgcn_permlane16_swap
1168 : Intrinsic::amdgcn_permlane32_swap);
1169 llvm::CallInst *Call =
1170 Builder.CreateCall(Callee: F, Args: {VDstOld, VSrcOld, FI, BoundCtrl});
1171
1172 llvm::Value *Elt0 = Builder.CreateExtractValue(Agg: Call, Idxs: 0);
1173 llvm::Value *Elt1 = Builder.CreateExtractValue(Agg: Call, Idxs: 1);
1174
1175 llvm::Type *ResultType = ConvertType(T: E->getType());
1176
1177 llvm::Value *Insert0 = Builder.CreateInsertElement(
1178 Vec: llvm::PoisonValue::get(T: ResultType), NewElt: Elt0, UINT64_C(0));
1179 llvm::Value *AsVector =
1180 Builder.CreateInsertElement(Vec: Insert0, NewElt: Elt1, UINT64_C(1));
1181 return AsVector;
1182 }
1183 case AMDGPU::BI__builtin_amdgcn_bitop3_b32:
1184 case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
1185 return emitBuiltinWithOneOverloadedType<4>(CGF&: *this, E,
1186 IntrinsicID: Intrinsic::amdgcn_bitop3);
1187 case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
1188 // TODO: LLVM has this overloaded to allow for fat pointers, but since
1189 // those haven't been plumbed through to Clang yet, default to creating the
1190 // resource type.
1191 SmallVector<Value *, 4> Args;
1192 for (unsigned I = 0; I < 4; ++I)
1193 Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: I)));
1194 llvm::PointerType *RetTy = llvm::PointerType::get(
1195 C&: Builder.getContext(), AddressSpace: llvm::AMDGPUAS::BUFFER_RESOURCE);
1196 Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_make_buffer_rsrc,
1197 Tys: {RetTy, Args[0]->getType()});
1198 return Builder.CreateCall(Callee: F, Args);
1199 }
1200 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
1201 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
1202 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
1203 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64:
1204 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96:
1205 case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128:
1206 return emitBuiltinWithOneOverloadedType<5>(
1207 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_raw_ptr_buffer_store);
1208 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1209 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1210 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1211 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1212 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1213 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: {
1214 llvm::Type *RetTy = nullptr;
1215 switch (BuiltinID) {
1216 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8:
1217 RetTy = Int8Ty;
1218 break;
1219 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16:
1220 RetTy = Int16Ty;
1221 break;
1222 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32:
1223 RetTy = Int32Ty;
1224 break;
1225 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64:
1226 RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 2);
1227 break;
1228 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96:
1229 RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 3);
1230 break;
1231 case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128:
1232 RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 4);
1233 break;
1234 }
1235 Function *F =
1236 CGM.getIntrinsic(IID: Intrinsic::amdgcn_raw_ptr_buffer_load, Tys: RetTy);
1237 return Builder.CreateCall(
1238 Callee: F, Args: {EmitScalarExpr(E: E->getArg(Arg: 0)), EmitScalarExpr(E: E->getArg(Arg: 1)),
1239 EmitScalarExpr(E: E->getArg(Arg: 2)), EmitScalarExpr(E: E->getArg(Arg: 3))});
1240 }
1241 case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
1242 return emitBuiltinWithOneOverloadedType<2>(
1243 CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_s_prefetch_data);
1244 case Builtin::BIlogbf:
1245 case Builtin::BI__builtin_logbf: {
1246 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
1247 Function *FrExpFunc = CGM.getIntrinsic(
1248 IID: Intrinsic::frexp, Tys: {Src0->getType(), Builder.getInt32Ty()});
1249 CallInst *FrExp = Builder.CreateCall(Callee: FrExpFunc, Args: Src0);
1250 Value *Exp = Builder.CreateExtractValue(Agg: FrExp, Idxs: 1);
1251 Value *Add = Builder.CreateAdd(
1252 LHS: Exp, RHS: ConstantInt::getSigned(Ty: Exp->getType(), V: -1), Name: "", HasNUW: false, HasNSW: true);
1253 Value *SIToFP = Builder.CreateSIToFP(V: Add, DestTy: Builder.getFloatTy());
1254 Value *Fabs =
1255 emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::fabs);
1256 Value *FCmpONE = Builder.CreateFCmpONE(
1257 LHS: Fabs, RHS: ConstantFP::getInfinity(Ty: Builder.getFloatTy()));
1258 Value *Sel1 = Builder.CreateSelect(C: FCmpONE, True: SIToFP, False: Fabs);
1259 Value *FCmpOEQ =
1260 Builder.CreateFCmpOEQ(LHS: Src0, RHS: ConstantFP::getZero(Ty: Builder.getFloatTy()));
1261 Value *Sel2 = Builder.CreateSelect(
1262 C: FCmpOEQ,
1263 True: ConstantFP::getInfinity(Ty: Builder.getFloatTy(), /*Negative=*/true), False: Sel1);
1264 return Sel2;
1265 }
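  // Worked example for the logbf case above (illustrative): logbf(8.0f).
  // frexp(8.0f) gives 0.5f * 2^4, so Exp == 4 and the result is
  // (float)(4 - 1) == 3.0f. Special cases: |x| is returned for +/-infinity and
  // NaN, and -infinity is returned when x == 0.0f.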
1266 case Builtin::BIlogb:
1267 case Builtin::BI__builtin_logb: {
1268 Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0));
1269 Function *FrExpFunc = CGM.getIntrinsic(
1270 IID: Intrinsic::frexp, Tys: {Src0->getType(), Builder.getInt32Ty()});
1271 CallInst *FrExp = Builder.CreateCall(Callee: FrExpFunc, Args: Src0);
1272 Value *Exp = Builder.CreateExtractValue(Agg: FrExp, Idxs: 1);
1273 Value *Add = Builder.CreateAdd(
1274 LHS: Exp, RHS: ConstantInt::getSigned(Ty: Exp->getType(), V: -1), Name: "", HasNUW: false, HasNSW: true);
1275 Value *SIToFP = Builder.CreateSIToFP(V: Add, DestTy: Builder.getDoubleTy());
1276 Value *Fabs =
1277 emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::fabs);
1278 Value *FCmpONE = Builder.CreateFCmpONE(
1279 LHS: Fabs, RHS: ConstantFP::getInfinity(Ty: Builder.getDoubleTy()));
1280 Value *Sel1 = Builder.CreateSelect(C: FCmpONE, True: SIToFP, False: Fabs);
1281 Value *FCmpOEQ =
1282 Builder.CreateFCmpOEQ(LHS: Src0, RHS: ConstantFP::getZero(Ty: Builder.getDoubleTy()));
1283 Value *Sel2 = Builder.CreateSelect(
1284 C: FCmpOEQ,
1285 True: ConstantFP::getInfinity(Ty: Builder.getDoubleTy(), /*Negative=*/true),
1286 False: Sel1);
1287 return Sel2;
1288 }
1289 case Builtin::BIscalbnf:
1290 case Builtin::BI__builtin_scalbnf:
1291 case Builtin::BIscalbn:
1292 case Builtin::BI__builtin_scalbn:
1293 return emitBinaryExpMaybeConstrainedFPBuiltin(
1294 CGF&: *this, E, IntrinsicID: Intrinsic::ldexp, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_ldexp);
1295 default:
1296 return nullptr;
1297 }
1298}
1299