//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "CGBuiltin.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm;

namespace {

// Emit an intrinsic that has a second type-mangled argument.
static Value *
emitBinaryExpMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E,
                                       Intrinsic::ID IntrinsicID,
                                       Intrinsic::ID ConstrainedIntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));

  CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
  if (CGF.Builder.getIsFPConstrained()) {
    Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
                                       {Src0->getType(), Src1->getType()});
    return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
  }

  Function *F =
      CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
  return CGF.Builder.CreateCall(F, {Src0, Src1});
}

// If \p E is not a null pointer, insert an address space cast to match the
// return type of \p E if necessary.
Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
                             const CallExpr *E = nullptr) {
  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
  auto *Call = CGF.Builder.CreateCall(F);
  Call->addRetAttr(
      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
  if (!E)
    return Call;
  QualType BuiltinRetType = E->getType();
  auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
  if (RetTy == Call->getType())
    return Call;
  return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
}

Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
  auto *Call = CGF.Builder.CreateCall(F);
  Call->addRetAttr(
      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
  return Call;
}

/// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
/// Emit code based on Code Object ABI version.
/// COV_4    : Emit code to use dispatch ptr
/// COV_5+   : Emit code to use implicitarg ptr
/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
///            and use its value for COV_4 or COV_5+ approach. It is used for
///            compiling device libraries in an ABI-agnostic way.
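///
/// For example, for \p Index == 1 (the y dimension) the emitted code loads an
/// i16 from implicitarg_ptr + 14 (12 + 1 * 2) on COV_5+, or from
/// dispatch_ptr + 6 (4 + 1 * 2), i.e. the workgroup_size_y field of the HSA
/// kernel_dispatch_packet struct, on COV_4.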
Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
  llvm::LoadInst *LD;

  auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;

  if (Cov == CodeObjectVersionKind::COV_None) {
    StringRef Name = "__oclc_ABI_version";
    auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
    if (!ABIVersionC)
      ABIVersionC = new llvm::GlobalVariable(
          CGF.CGM.getModule(), CGF.Int32Ty, false,
          llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGF.CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));

    // This load will be eliminated by IPSCCP because it is constant
    // weak_odr without externally_initialized. Either changing it to weak or
    // adding externally_initialized will keep the load.
    Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
                                                      CGF.CGM.getIntAlign());

    Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
        ABIVersion,
        llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));

    // Indexing the implicit kernarg segment.
    Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
        CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);

    // Indexing the HSA kernel_dispatch_packet struct.
    Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
        CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);

    auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
    LD = CGF.Builder.CreateLoad(
        Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
  } else {
    Value *GEP = nullptr;
    if (Cov >= CodeObjectVersionKind::COV_5) {
      // Indexing the implicit kernarg segment.
      GEP = CGF.Builder.CreateConstGEP1_32(
          CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
    } else {
      // Indexing the HSA kernel_dispatch_packet struct.
      GEP = CGF.Builder.CreateConstGEP1_32(
          CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
    }
    LD = CGF.Builder.CreateLoad(
        Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
  }

  llvm::MDBuilder MDHelper(CGF.getLLVMContext());
  llvm::MDNode *RNode = MDHelper.createRange(
      APInt(16, 1),
      APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
  LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
  LD->setMetadata(llvm::LLVMContext::MD_noundef,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  return LD;
}

// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
  const unsigned XOffset = 12;
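  // 12 is the byte offset of grid_size_x in the HSA kernel_dispatch_packet
  // struct; the grid_size_{x,y,z} fields are consecutive i32s, hence the
  // Index * 4 stride below.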
  auto *DP = EmitAMDGPUDispatchPtr(CGF);
  // Indexing the HSA kernel_dispatch_packet struct.
  auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
  auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
  auto *LD = CGF.Builder.CreateLoad(
      Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));

  llvm::MDBuilder MDB(CGF.getLLVMContext());

  // Known non-zero.
  LD->setMetadata(llvm::LLVMContext::MD_range,
                  MDB.createRange(APInt(32, 1), APInt::getZero(32)));
  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
  return LD;
}
} // namespace

// Generates the IR for __builtin_read_exec_*.
// Lowers the builtin to the amdgcn_ballot intrinsic.
static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
                                      llvm::Type *RegisterType,
                                      llvm::Type *ValueType, bool isExecHi) {
  CodeGen::CGBuilderTy &Builder = CGF.Builder;
  CodeGen::CodeGenModule &CGM = CGF.CGM;

  Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
  llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});

  if (isExecHi) {
    Value *Rt2 = Builder.CreateLShr(Call, 32);
    Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
    return Rt2;
  }

  return Call;
}

// Emit an intrinsic that has 1 float or double operand, and 1 integer.
static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
                               const CallExpr *E,
                               unsigned IntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));

  Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
  return CGF.Builder.CreateCall(F, {Src0, Src1});
}

// For processing memory ordering and memory scope arguments of various
// amdgcn builtins.
// \p Order takes a C++11 compatible memory-ordering specifier and converts
// it into LLVM's memory ordering specifier using the atomic C ABI, and writes
// it to \p AO. \p Scope takes a const char * and converts it into an AMDGCN
// specific SyncScopeID and writes it to \p SSID.
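// For example, __ATOMIC_RELAXED maps to Monotonic and __ATOMIC_SEQ_CST to
// SequentiallyConsistent, while a "workgroup" scope string becomes the
// target's workgroup sync scope ID.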
void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
                                              llvm::AtomicOrdering &AO,
                                              llvm::SyncScope::ID &SSID) {
  int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();

  // Map C11/C++11 memory ordering to LLVM memory ordering
  assert(llvm::isValidAtomicOrderingCABI(ord));
  switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
  case llvm::AtomicOrderingCABI::acquire:
  case llvm::AtomicOrderingCABI::consume:
    AO = llvm::AtomicOrdering::Acquire;
    break;
  case llvm::AtomicOrderingCABI::release:
    AO = llvm::AtomicOrdering::Release;
    break;
  case llvm::AtomicOrderingCABI::acq_rel:
    AO = llvm::AtomicOrdering::AcquireRelease;
    break;
  case llvm::AtomicOrderingCABI::seq_cst:
    AO = llvm::AtomicOrdering::SequentiallyConsistent;
    break;
  case llvm::AtomicOrderingCABI::relaxed:
    AO = llvm::AtomicOrdering::Monotonic;
    break;
  }

  // Some of the atomic builtins take the scope as a string name.
  StringRef scp;
  if (llvm::getConstantStringInfo(Scope, scp)) {
    SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
    return;
  }

  // Older builtins had an enum argument for the memory scope.
  int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
  switch (scope) {
  case 0: // __MEMORY_SCOPE_SYSTEM
    SSID = llvm::SyncScope::System;
    break;
  case 1: // __MEMORY_SCOPE_DEVICE
    SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
    break;
  case 2: // __MEMORY_SCOPE_WRKGRP
    SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
    break;
  case 3: // __MEMORY_SCOPE_WVFRNT
    SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
    break;
  case 4: // __MEMORY_SCOPE_SINGLE
    SSID = llvm::SyncScope::SingleThread;
    break;
  default:
    SSID = llvm::SyncScope::System;
    break;
  }
}

llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
                                                          unsigned Idx,
                                                          const CallExpr *E) {
  llvm::Value *Arg = nullptr;
  if ((ICEArguments & (1 << Idx)) == 0) {
    Arg = EmitScalarExpr(E->getArg(Idx));
  } else {
    // If this is required to be a constant, constant fold it so that we
    // know that the generated intrinsic gets a ConstantInt.
    std::optional<llvm::APSInt> Result =
        E->getArg(Idx)->getIntegerConstantExpr(getContext());
    assert(Result && "Expected argument to be a constant");
    Arg = llvm::ConstantInt::get(getLLVMContext(), *Result);
  }
  return Arg;
}

void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
                                                     const CallExpr *E) {
  constexpr const char *Tag = "amdgpu-as";
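  // Illustrative use (assumed source form): a call such as
  //   __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent", "local", "global")
  // reaches this with the address space names in args 2+, and each unique
  // name becomes one {"amdgpu-as", <name>} MMRA tag on the instruction.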

  LLVMContext &Ctx = Inst->getContext();
  SmallVector<MMRAMetadata::TagT, 3> MMRAs;
  for (unsigned K = 2; K < E->getNumArgs(); ++K) {
    llvm::Value *V = EmitScalarExpr(E->getArg(K));
    StringRef AS;
    if (llvm::getConstantStringInfo(V, AS)) {
      MMRAs.push_back({Tag, AS});
      // TODO: Delete the resulting unused constant?
      continue;
    }
    CGM.Error(E->getExprLoc(),
              "expected an address space name as a string literal");
  }

  llvm::sort(MMRAs);
  MMRAs.erase(llvm::unique(MMRAs), MMRAs.end());
  Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}

Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E) {
  llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
  llvm::SyncScope::ID SSID;
  switch (BuiltinID) {
  case AMDGPU::BI__builtin_amdgcn_div_scale:
  case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsic's struct return to the builtin's out
    // argument.
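    // Illustrative shape of the lowering (f64 variant):
    //   %pair = call { double, i1 } @llvm.amdgcn.div.scale.f64(...)
    // The first element is returned; the i1 flag is zero-extended and
    // stored through the pointer passed as the builtin's fourth argument.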

    Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));

    llvm::Value *X = EmitScalarExpr(E->getArg(0));
    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
    llvm::Value *Z = EmitScalarExpr(E->getArg(2));

    llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
                                              X->getType());

    llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});

    llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
    llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);

    llvm::Type *RealFlagType = FlagOutPtr.getElementType();

    llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
    Builder.CreateStore(FlagExt, FlagOutPtr);
    return Result;
  }
  case AMDGPU::BI__builtin_amdgcn_div_fmas:
  case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));

    llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
                                         Src0->getType());
    llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
    return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
  }

  case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
    return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                               Intrinsic::amdgcn_ds_swizzle);
  case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
  case AMDGPU::BI__builtin_amdgcn_mov_dpp:
  case AMDGPU::BI__builtin_amdgcn_update_dpp: {
    llvm::SmallVector<llvm::Value *, 6> Args;
    // Find out if any arguments are required to be integer constant
    // expressions.
    unsigned ICEArguments = 0;
    ASTContext::GetBuiltinTypeError Error;
    getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
    assert(Error == ASTContext::GE_None && "Should not codegen an error");
    llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
    unsigned Size = DataTy->getPrimitiveSizeInBits();
    llvm::Type *IntTy =
        llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
    Function *F =
        CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
                             ? Intrinsic::amdgcn_mov_dpp8
                             : Intrinsic::amdgcn_update_dpp,
                         IntTy);
    assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
           E->getNumArgs() == 2);
    bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
    if (InsertOld)
      Args.push_back(llvm::PoisonValue::get(IntTy));
    for (unsigned I = 0; I != E->getNumArgs(); ++I) {
      llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
      if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
          Size < 32) {
        if (!DataTy->isIntegerTy())
          V = Builder.CreateBitCast(
              V, llvm::IntegerType::get(Builder.getContext(), Size));
        V = Builder.CreateZExtOrBitCast(V, IntTy);
      }
      llvm::Type *ExpTy =
          F->getFunctionType()->getFunctionParamType(I + InsertOld);
      Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
    }
    Value *V = Builder.CreateCall(F, Args);
    if (Size < 32 && !DataTy->isIntegerTy())
      V = Builder.CreateTrunc(
          V, llvm::IntegerType::get(Builder.getContext(), Size));
    return Builder.CreateTruncOrBitCast(V, DataTy);
  }
  case AMDGPU::BI__builtin_amdgcn_permlane16:
  case AMDGPU::BI__builtin_amdgcn_permlanex16:
    return emitBuiltinWithOneOverloadedType<6>(
        *this, E,
        BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
            ? Intrinsic::amdgcn_permlane16
            : Intrinsic::amdgcn_permlanex16);
  case AMDGPU::BI__builtin_amdgcn_permlane64:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_permlane64);
  case AMDGPU::BI__builtin_amdgcn_readlane:
    return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                               Intrinsic::amdgcn_readlane);
  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_readfirstlane);
  case AMDGPU::BI__builtin_amdgcn_div_fixup:
  case AMDGPU::BI__builtin_amdgcn_div_fixupf:
  case AMDGPU::BI__builtin_amdgcn_div_fixuph:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_div_fixup);
  case AMDGPU::BI__builtin_amdgcn_trig_preop:
  case AMDGPU::BI__builtin_amdgcn_trig_preopf:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
  case AMDGPU::BI__builtin_amdgcn_rcp:
  case AMDGPU::BI__builtin_amdgcn_rcpf:
  case AMDGPU::BI__builtin_amdgcn_rcph:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp);
  case AMDGPU::BI__builtin_amdgcn_sqrt:
  case AMDGPU::BI__builtin_amdgcn_sqrtf:
  case AMDGPU::BI__builtin_amdgcn_sqrth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_sqrt);
  case AMDGPU::BI__builtin_amdgcn_rsq:
  case AMDGPU::BI__builtin_amdgcn_rsqf:
  case AMDGPU::BI__builtin_amdgcn_rsqh:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq);
  case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
  case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_rsq_clamp);
  case AMDGPU::BI__builtin_amdgcn_sinf:
  case AMDGPU::BI__builtin_amdgcn_sinh:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin);
  case AMDGPU::BI__builtin_amdgcn_cosf:
  case AMDGPU::BI__builtin_amdgcn_cosh:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos);
  case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
    return EmitAMDGPUDispatchPtr(*this, E);
  case AMDGPU::BI__builtin_amdgcn_logf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
  case AMDGPU::BI__builtin_amdgcn_exp2f:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_exp2);
  case AMDGPU::BI__builtin_amdgcn_log_clampf:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_log_clamp);
  case AMDGPU::BI__builtin_amdgcn_ldexp:
  case AMDGPU::BI__builtin_amdgcn_ldexpf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
    return Builder.CreateCall(F, {Src0, Src1});
  }
  case AMDGPU::BI__builtin_amdgcn_ldexph: {
    // The raw instruction has a different behavior for out of bounds exponent
    // values (implicit truncation instead of saturate to short_min/short_max).
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Function *F =
        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
    return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
  }
  case AMDGPU::BI__builtin_amdgcn_frexp_mant:
  case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
  case AMDGPU::BI__builtin_amdgcn_frexp_manth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_frexp_mant);
  case AMDGPU::BI__builtin_amdgcn_frexp_exp:
  case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                   {Builder.getInt32Ty(), Src0->getType()});
    return Builder.CreateCall(F, Src0);
  }
  case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                   {Builder.getInt16Ty(), Src0->getType()});
    return Builder.CreateCall(F, Src0);
  }
  case AMDGPU::BI__builtin_amdgcn_fract:
  case AMDGPU::BI__builtin_amdgcn_fractf:
  case AMDGPU::BI__builtin_amdgcn_fracth:
    return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                               Intrinsic::amdgcn_fract);
  case AMDGPU::BI__builtin_amdgcn_lerp:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_lerp);
  case AMDGPU::BI__builtin_amdgcn_ubfe:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_ubfe);
  case AMDGPU::BI__builtin_amdgcn_sbfe:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_sbfe);
  case AMDGPU::BI__builtin_amdgcn_ballot_w32:
  case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
    llvm::Type *ResultType = ConvertType(E->getType());
    llvm::Value *Src = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {ResultType});
    return Builder.CreateCall(F, {Src});
  }
  case AMDGPU::BI__builtin_amdgcn_uicmp:
  case AMDGPU::BI__builtin_amdgcn_uicmpl:
  case AMDGPU::BI__builtin_amdgcn_sicmp:
  case AMDGPU::BI__builtin_amdgcn_sicmpl: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));

    // FIXME-GFX10: How should 32 bit mask be handled?
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
                                   {Builder.getInt64Ty(), Src0->getType()});
    return Builder.CreateCall(F, {Src0, Src1, Src2});
  }
  case AMDGPU::BI__builtin_amdgcn_fcmp:
  case AMDGPU::BI__builtin_amdgcn_fcmpf: {
    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));

    // FIXME-GFX10: How should 32 bit mask be handled?
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
                                   {Builder.getInt64Ty(), Src0->getType()});
    return Builder.CreateCall(F, {Src0, Src1, Src2});
  }
  case AMDGPU::BI__builtin_amdgcn_class:
  case AMDGPU::BI__builtin_amdgcn_classf:
  case AMDGPU::BI__builtin_amdgcn_classh:
    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
  case AMDGPU::BI__builtin_amdgcn_fmed3f:
  case AMDGPU::BI__builtin_amdgcn_fmed3h:
    return emitBuiltinWithOneOverloadedType<3>(*this, E,
                                               Intrinsic::amdgcn_fmed3);
  case AMDGPU::BI__builtin_amdgcn_ds_append:
  case AMDGPU::BI__builtin_amdgcn_ds_consume: {
    Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
      Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
    Value *Src0 = EmitScalarExpr(E->getArg(0));
    Function *F = CGM.getIntrinsic(Intrin, {Src0->getType()});
    return Builder.CreateCall(F, {Src0, Builder.getFalse()});
  }
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
  case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_global_load_tr_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
      IID = Intrinsic::amdgcn_global_load_tr_b128;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_global_load_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_global_load_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_load_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_ds_load_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_load_tr8_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
      IID = Intrinsic::amdgcn_ds_load_tr16_b128;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_read_tr4_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
      IID = Intrinsic::amdgcn_ds_read_tr8_b64;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
      IID = Intrinsic::amdgcn_ds_read_tr6_b96;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
    case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
      IID = Intrinsic::amdgcn_ds_read_tr16_b64;
      break;
    }
    llvm::Type *LoadTy = ConvertType(E->getType());
    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
    llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
    return Builder.CreateCall(F, {Addr});
  }
  case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
    // Should this have asan instrumentation?
    return emitBuiltinWithOneOverloadedType<5>(*this, E,
                                               Intrinsic::amdgcn_load_to_lds);
  }
  case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
    Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
                                   {llvm::Type::getInt64Ty(getLLVMContext())});
    return Builder.CreateCall(F);
  }
  case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
    Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
                                   {llvm::Type::getInt64Ty(getLLVMContext())});
    llvm::Value *Env = EmitScalarExpr(E->getArg(0));
    return Builder.CreateCall(F, {Env});
  }
  case AMDGPU::BI__builtin_amdgcn_read_exec:
    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
  case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
    return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
  case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
    llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
    llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));

    // The builtins take these arguments as vec4 where the last element is
    // ignored. The intrinsic takes them as vec3.
    RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin, {0, 1, 2});
    RayDir = Builder.CreateShuffleVector(RayDir, RayDir, {0, 1, 2});
    RayInverseDir =
        Builder.CreateShuffleVector(RayInverseDir, RayInverseDir, {0, 1, 2});

    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
                                   {NodePtr->getType(), RayDir->getType()});
    return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
                                  RayInverseDir, TextureDescr});
  }
  case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
  case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
      IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
      break;
    case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
      IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
      break;
    }
    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
    llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
    llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
    llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));

    Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
    Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));

    llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);

    llvm::CallInst *CI = Builder.CreateCall(
        IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
                        Offset, TextureDescr});

    llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
    llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
    llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);

    Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
    Builder.CreateStore(RetRayDir, RetRayDirPtr);

    return RetVData;
  }

  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
      break;
    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
      break;
    }

    SmallVector<Value *, 4> Args;
    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
      Args.push_back(EmitScalarExpr(E->getArg(i)));

    Function *F = CGM.getIntrinsic(IID);
    Value *Call = Builder.CreateCall(F, Args);
    Value *Rtn = Builder.CreateExtractValue(Call, 0);
    Value *A = Builder.CreateExtractValue(Call, 1);
    llvm::Type *RetTy = ConvertType(E->getType());
    Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
                                            (uint64_t)0);
    // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
    // <2 x i64>, so zero-extend the second value.
    if (A->getType()->getPrimitiveSizeInBits() <
        RetTy->getScalarType()->getPrimitiveSizeInBits())
      A = Builder.CreateZExt(A, RetTy->getScalarType());

    return Builder.CreateInsertElement(I0, A, 1);
  }
  case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
  case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
    llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8);
    Function *F = CGM.getIntrinsic(
        BuiltinID == AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4
            ? Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4
            : Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4,
        {VT, VT});

    SmallVector<Value *, 9> Args;
    for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
      Args.push_back(EmitScalarExpr(E->getArg(I)));
    return Builder.CreateCall(F, Args);
  }
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {

    // These operations perform a matrix multiplication and accumulation of
    // the form:
    //             D = A * B + C
    // We need to specify one type for matrices AB and one for matrices CD.
    // Sparse matrix operations can have different types for A and B as well
    // as an additional type for the sparsity index.
    // The destination type should be put before the types used for the source
    // operands.
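    // As an illustrative example (assumed operand shapes): for
    // wmma_f32_16x16x16_f16 the accumulator C/D (arg 2) and the A/B inputs
    // (arg 0) supply the two overloaded types, yielding an intrinsic such as
    // llvm.amdgcn.wmma.f32.16x16x16.f16 mangled on those vector types.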
830 | SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes; |
831 | // On GFX12, the intrinsics with 16-bit accumulator use a packed layout. |
832 | // There is no need for the variable opsel argument, so always set it to |
833 | // "false". |
834 | bool AppendFalseForOpselArg = false; |
835 | unsigned BuiltinWMMAOp; |
836 | |
837 | switch (BuiltinID) { |
838 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32: |
839 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64: |
840 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: |
841 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: |
842 | ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB |
843 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16; |
844 | break; |
845 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32: |
846 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64: |
847 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: |
848 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: |
849 | ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB |
850 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16; |
851 | break; |
852 | case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: |
853 | case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: |
854 | AppendFalseForOpselArg = true; |
855 | [[fallthrough]]; |
856 | case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32: |
857 | case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64: |
858 | ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB |
859 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16; |
860 | break; |
861 | case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: |
862 | case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: |
863 | AppendFalseForOpselArg = true; |
864 | [[fallthrough]]; |
865 | case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32: |
866 | case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64: |
867 | ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB |
868 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16; |
869 | break; |
870 | case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32: |
871 | case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64: |
872 | ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB |
873 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied; |
874 | break; |
875 | case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32: |
876 | case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64: |
877 | ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB |
878 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied; |
879 | break; |
880 | case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: |
881 | case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: |
882 | case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: |
883 | case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: |
884 | ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB |
885 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8; |
886 | break; |
887 | case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: |
888 | case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: |
889 | case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: |
890 | case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: |
891 | ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB |
892 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4; |
893 | break; |
894 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: |
895 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: |
896 | ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB |
897 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8; |
898 | break; |
899 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: |
900 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: |
901 | ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB |
902 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8; |
903 | break; |
904 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: |
905 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: |
906 | ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB |
907 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8; |
908 | break; |
909 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: |
910 | case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: |
911 | ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB |
912 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8; |
913 | break; |
914 | case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: |
915 | case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: |
916 | ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB |
917 | BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4; |
918 | break; |
919 | case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: |
920 | case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: |
921 | ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index |
922 | BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16; |
923 | break; |
924 | case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: |
925 | case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: |
926 | ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index |
927 | BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16; |
928 | break; |
929 | case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: |
930 | case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: |
931 | ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index |
932 | BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16; |
933 | break; |
934 | case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: |
935 | case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: |
936 | ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index |
937 | BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16; |
938 | break; |
939 | case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: |
940 | case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: |
941 | ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index |
942 | BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8; |
943 | break; |
944 | case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: |
945 | case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: |
946 | ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index |
947 | BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4; |
948 | break; |
949 | case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: |
950 | case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: |
951 | ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index |
952 | BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4; |
953 | break; |
954 | case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: |
955 | case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: |
956 | ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index |
957 | BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8; |
958 | break; |
959 | case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: |
960 | case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: |
961 | ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index |
962 | BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8; |
963 | break; |
964 | case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: |
965 | case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: |
966 | ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index |
967 | BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8; |
968 | break; |
969 | case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: |
970 | case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: |
971 | ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index |
972 | BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8; |
973 | break; |
974 | } |
975 | |
976 | SmallVector<Value *, 6> Args; |
977 | for (int i = 0, e = E->getNumArgs(); i != e; ++i) |
978 | Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: i))); |
979 | if (AppendFalseForOpselArg) |
980 | Args.push_back(Elt: Builder.getFalse()); |
981 | |
982 | SmallVector<llvm::Type *, 6> ArgTypes; |
983 | for (auto ArgIdx : ArgsForMatchingMatrixTypes) |
984 | ArgTypes.push_back(Elt: Args[ArgIdx]->getType()); |
985 | |
986 | Function *F = CGM.getIntrinsic(IID: BuiltinWMMAOp, Tys: ArgTypes); |
987 | return Builder.CreateCall(Callee: F, Args); |
988 | } |
989 | // amdgcn workgroup size |
990 | case AMDGPU::BI__builtin_amdgcn_workgroup_size_x: |
991 | return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 0); |
992 | case AMDGPU::BI__builtin_amdgcn_workgroup_size_y: |
993 | return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 1); |
994 | case AMDGPU::BI__builtin_amdgcn_workgroup_size_z: |
995 | return EmitAMDGPUWorkGroupSize(CGF&: *this, Index: 2); |
996 | |
997 | // amdgcn grid size |
998 | case AMDGPU::BI__builtin_amdgcn_grid_size_x: |
999 | return EmitAMDGPUGridSize(CGF&: *this, Index: 0); |
1000 | case AMDGPU::BI__builtin_amdgcn_grid_size_y: |
1001 | return EmitAMDGPUGridSize(CGF&: *this, Index: 1); |
1002 | case AMDGPU::BI__builtin_amdgcn_grid_size_z: |
1003 | return EmitAMDGPUGridSize(CGF&: *this, Index: 2); |
1004 | |
1005 | // r600 intrinsics |
1006 | case AMDGPU::BI__builtin_r600_recipsqrt_ieee: |
1007 | case AMDGPU::BI__builtin_r600_recipsqrt_ieeef: |
1008 | return emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, |
1009 | IntrinsicID: Intrinsic::r600_recipsqrt_ieee); |
1010 | case AMDGPU::BI__builtin_amdgcn_alignbit: { |
1011 | llvm::Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0)); |
1012 | llvm::Value *Src1 = EmitScalarExpr(E: E->getArg(Arg: 1)); |
1013 | llvm::Value *Src2 = EmitScalarExpr(E: E->getArg(Arg: 2)); |
1014 | Function *F = CGM.getIntrinsic(IID: Intrinsic::fshr, Tys: Src0->getType()); |
1015 | return Builder.CreateCall(Callee: F, Args: { Src0, Src1, Src2 }); |
1016 | } |
1017 | case AMDGPU::BI__builtin_amdgcn_fence: { |
1018 | ProcessOrderScopeAMDGCN(Order: EmitScalarExpr(E: E->getArg(Arg: 0)), |
1019 | Scope: EmitScalarExpr(E: E->getArg(Arg: 1)), AO, SSID); |
1020 | FenceInst *Fence = Builder.CreateFence(Ordering: AO, SSID); |
1021 | if (E->getNumArgs() > 2) |
1022 | AddAMDGPUFenceAddressSpaceMMRA(Inst: Fence, E); |
1023 | return Fence; |
1024 | } |
1025 | case AMDGPU::BI__builtin_amdgcn_atomic_inc32: |
1026 | case AMDGPU::BI__builtin_amdgcn_atomic_inc64: |
1027 | case AMDGPU::BI__builtin_amdgcn_atomic_dec32: |
1028 | case AMDGPU::BI__builtin_amdgcn_atomic_dec64: |
1029 | case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64: |
1030 | case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: |
1031 | case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: |
1032 | case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: |
1033 | case AMDGPU::BI__builtin_amdgcn_ds_faddf: |
1034 | case AMDGPU::BI__builtin_amdgcn_ds_fminf: |
1035 | case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: |
1036 | case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32: |
1037 | case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: |
1038 | case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16: |
1039 | case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: |
1040 | case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: |
1041 | case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: |
1042 | case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16: |
1043 | case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: |
1044 | case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64: |
1045 | case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64: |
1046 | case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64: |
1047 | case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: { |
1048 | llvm::AtomicRMWInst::BinOp BinOp; |
1049 | switch (BuiltinID) { |
1050 | case AMDGPU::BI__builtin_amdgcn_atomic_inc32: |
1051 | case AMDGPU::BI__builtin_amdgcn_atomic_inc64: |
1052 | BinOp = llvm::AtomicRMWInst::UIncWrap; |
1053 | break; |
1054 | case AMDGPU::BI__builtin_amdgcn_atomic_dec32: |
1055 | case AMDGPU::BI__builtin_amdgcn_atomic_dec64: |
1056 | BinOp = llvm::AtomicRMWInst::UDecWrap; |
1057 | break; |
1058 | case AMDGPU::BI__builtin_amdgcn_ds_faddf: |
1059 | case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64: |
1060 | case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: |
1061 | case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: |
1062 | case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: |
1063 | case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32: |
1064 | case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: |
1065 | case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16: |
1066 | case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: |
1067 | case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: |
1068 | case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: |
1069 | case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16: |
1070 | case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: |
1071 | BinOp = llvm::AtomicRMWInst::FAdd; |
1072 | break; |
1073 | case AMDGPU::BI__builtin_amdgcn_ds_fminf: |
1074 | case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64: |
1075 | case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64: |
1076 | BinOp = llvm::AtomicRMWInst::FMin; |
1077 | break; |
1078 | case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64: |
1079 | case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: |
1080 | case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: |
1081 | BinOp = llvm::AtomicRMWInst::FMax; |
1082 | break; |
1083 | } |
1084 | |
1085 | Address Ptr = CheckAtomicAlignment(CGF&: *this, E); |
1086 | Value *Val = EmitScalarExpr(E: E->getArg(Arg: 1)); |
1087 | llvm::Type *OrigTy = Val->getType(); |
1088 | QualType PtrTy = E->getArg(Arg: 0)->IgnoreImpCasts()->getType(); |
1089 | |
1090 | bool Volatile; |
1091 | |
1092 | if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf || |
1093 | BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf || |
1094 | BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) { |
// __builtin_amdgcn_ds_faddf/fminf/fmaxf have an explicit volatile argument.
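// (Operand order: ptr, value, order, scope, volatile; the flag must be a
// compile-time constant, hence the cast<ConstantInt> below.)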
1096 | Volatile = |
1097 | cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 4)))->getZExtValue(); |
1098 | } else { |
1099 | // Infer volatile from the passed type. |
1100 | Volatile = |
1101 | PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified(); |
1102 | } |
1103 | |
1104 | if (E->getNumArgs() >= 4) { |
1105 | // Some of the builtins have explicit ordering and scope arguments. |
1106 | ProcessOrderScopeAMDGCN(Order: EmitScalarExpr(E: E->getArg(Arg: 2)), |
1107 | Scope: EmitScalarExpr(E: E->getArg(Arg: 3)), AO, SSID); |
1108 | } else { |
1109 | // Most of the builtins do not have syncscope/order arguments. For DS |
1110 | // atomics the scope doesn't really matter, as they implicitly operate at |
1111 | // workgroup scope. |
1112 | // |
1113 | // The global/flat cases need to use agent scope to consistently produce |
1114 | // the native instruction instead of a cmpxchg expansion. |
SSID = getLLVMContext().getOrInsertSyncScopeID(SSN: "agent");
1116 | AO = AtomicOrdering::Monotonic; |
1117 | |
// The v2bf16 builtins use <2 x i16> instead of the natural <2 x bfloat> type.
1119 | if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 || |
1120 | BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 || |
1121 | BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) { |
1122 | llvm::Type *V2BF16Ty = FixedVectorType::get( |
1123 | ElementType: llvm::Type::getBFloatTy(C&: Builder.getContext()), NumElts: 2); |
1124 | Val = Builder.CreateBitCast(V: Val, DestTy: V2BF16Ty); |
1125 | } |
1126 | } |
1127 | |
1128 | llvm::AtomicRMWInst *RMW = |
1129 | Builder.CreateAtomicRMW(Op: BinOp, Addr: Ptr, Val, Ordering: AO, SSID); |
1130 | if (Volatile) |
1131 | RMW->setVolatile(true); |
1132 | |
1133 | unsigned AddrSpace = Ptr.getType()->getAddressSpace(); |
1134 | if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) { |
1135 | // Most targets require "amdgpu.no.fine.grained.memory" to emit the native |
1136 | // instruction for flat and global operations. |
1137 | llvm::MDTuple *EmptyMD = MDNode::get(Context&: getLLVMContext(), MDs: {}); |
1138 | RMW->setMetadata(Kind: "amdgpu.no.fine.grained.memory" , Node: EmptyMD); |
1139 | |
1140 | // Most targets require "amdgpu.ignore.denormal.mode" to emit the native |
1141 | // instruction, but this only matters for float fadd. |
1142 | if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy()) |
1143 | RMW->setMetadata(Kind: "amdgpu.ignore.denormal.mode" , Node: EmptyMD); |
1144 | } |
1145 | |
1146 | return Builder.CreateBitCast(V: RMW, DestTy: OrigTy); |
1147 | } |
1148 | case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn: |
1149 | case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: { |
1150 | llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0)); |
1151 | llvm::Type *ResultType = ConvertType(T: E->getType()); |
1152 | // s_sendmsg_rtn is mangled using return type only. |
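// e.g. __builtin_amdgcn_s_sendmsg_rtnl (the 64-bit variant) becomes:
//   %r = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 %msg)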
1153 | Function *F = |
1154 | CGM.getIntrinsic(IID: Intrinsic::amdgcn_s_sendmsg_rtn, Tys: {ResultType}); |
1155 | return Builder.CreateCall(Callee: F, Args: {Arg}); |
1156 | } |
1157 | case AMDGPU::BI__builtin_amdgcn_permlane16_swap: |
1158 | case AMDGPU::BI__builtin_amdgcn_permlane32_swap: { |
// Because builtin types are limited and the intrinsic uses a struct/pair
// output, marshal the pair of i32 results into a <2 x i32> vector.
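// Illustrative shape of what this lowers to:
//   %pair = call { i32, i32 } @llvm.amdgcn.permlane16.swap(
//       i32 %vdst_old, i32 %vsrc_old, i1 %fi, i1 %bound_ctrl)
// followed by repacking the two struct elements into a <2 x i32>.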
1161 | Value *VDstOld = EmitScalarExpr(E: E->getArg(Arg: 0)); |
1162 | Value *VSrcOld = EmitScalarExpr(E: E->getArg(Arg: 1)); |
1163 | Value *FI = EmitScalarExpr(E: E->getArg(Arg: 2)); |
1164 | Value *BoundCtrl = EmitScalarExpr(E: E->getArg(Arg: 3)); |
1165 | Function *F = |
1166 | CGM.getIntrinsic(IID: BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16_swap |
1167 | ? Intrinsic::amdgcn_permlane16_swap |
1168 | : Intrinsic::amdgcn_permlane32_swap); |
1169 | llvm::CallInst *Call = |
1170 | Builder.CreateCall(Callee: F, Args: {VDstOld, VSrcOld, FI, BoundCtrl}); |
1171 | |
1172 | llvm::Value *Elt0 = Builder.CreateExtractValue(Agg: Call, Idxs: 0); |
1173 | llvm::Value *Elt1 = Builder.CreateExtractValue(Agg: Call, Idxs: 1); |
1174 | |
1175 | llvm::Type *ResultType = ConvertType(T: E->getType()); |
1176 | |
1177 | llvm::Value *Insert0 = Builder.CreateInsertElement( |
1178 | Vec: llvm::PoisonValue::get(T: ResultType), NewElt: Elt0, UINT64_C(0)); |
1179 | llvm::Value *AsVector = |
1180 | Builder.CreateInsertElement(Vec: Insert0, NewElt: Elt1, UINT64_C(1)); |
1181 | return AsVector; |
1182 | } |
1183 | case AMDGPU::BI__builtin_amdgcn_bitop3_b32: |
1184 | case AMDGPU::BI__builtin_amdgcn_bitop3_b16: |
1185 | return emitBuiltinWithOneOverloadedType<4>(CGF&: *this, E, |
1186 | IntrinsicID: Intrinsic::amdgcn_bitop3); |
1187 | case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: { |
1188 | // TODO: LLVM has this overloaded to allow for fat pointers, but since |
1189 | // those haven't been plumbed through to Clang yet, default to creating the |
1190 | // resource type. |
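// Illustrative result (operand meanings per the intrinsic definition):
//   %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(
//       ptr %base, i16 %stride, i32 %num_records, i32 %flags)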
1191 | SmallVector<Value *, 4> Args; |
1192 | for (unsigned I = 0; I < 4; ++I) |
1193 | Args.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: I))); |
1194 | llvm::PointerType *RetTy = llvm::PointerType::get( |
1195 | C&: Builder.getContext(), AddressSpace: llvm::AMDGPUAS::BUFFER_RESOURCE); |
1196 | Function *F = CGM.getIntrinsic(IID: Intrinsic::amdgcn_make_buffer_rsrc, |
1197 | Tys: {RetTy, Args[0]->getType()}); |
1198 | return Builder.CreateCall(Callee: F, Args); |
1199 | } |
1200 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8: |
1201 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16: |
1202 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32: |
1203 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b64: |
1204 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b96: |
1205 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128: |
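// All the store variants map onto the same pointer-based intrinsic; the type
// of the data operand (i8 up through <4 x i32>) selects the mangled overload,
// e.g. @llvm.amdgcn.raw.ptr.buffer.store.v4i32 for the b128 form.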
1206 | return emitBuiltinWithOneOverloadedType<5>( |
1207 | CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_raw_ptr_buffer_store); |
1208 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8: |
1209 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16: |
1210 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32: |
1211 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64: |
1212 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96: |
1213 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: { |
1214 | llvm::Type *RetTy = nullptr; |
1215 | switch (BuiltinID) { |
1216 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8: |
1217 | RetTy = Int8Ty; |
1218 | break; |
1219 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16: |
1220 | RetTy = Int16Ty; |
1221 | break; |
1222 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32: |
1223 | RetTy = Int32Ty; |
1224 | break; |
1225 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64: |
1226 | RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 2); |
1227 | break; |
1228 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96: |
1229 | RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 3); |
1230 | break; |
1231 | case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: |
1232 | RetTy = llvm::FixedVectorType::get(ElementType: Int32Ty, /*NumElements=*/NumElts: 4); |
1233 | break; |
1234 | } |
1235 | Function *F = |
1236 | CGM.getIntrinsic(IID: Intrinsic::amdgcn_raw_ptr_buffer_load, Tys: RetTy); |
1237 | return Builder.CreateCall( |
1238 | Callee: F, Args: {EmitScalarExpr(E: E->getArg(Arg: 0)), EmitScalarExpr(E: E->getArg(Arg: 1)), |
1239 | EmitScalarExpr(E: E->getArg(Arg: 2)), EmitScalarExpr(E: E->getArg(Arg: 3))}); |
1240 | } |
1241 | case AMDGPU::BI__builtin_amdgcn_s_prefetch_data: |
1242 | return emitBuiltinWithOneOverloadedType<2>( |
1243 | CGF&: *this, E, IntrinsicID: Intrinsic::amdgcn_s_prefetch_data); |
1244 | case Builtin::BIlogbf: |
1245 | case Builtin::BI__builtin_logbf: { |
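// Expand logbf with frexp: frexp(x) returns (m, e) with x == m * 2^e and
// |m| in [0.5, 1), so logb(x) == e - 1. For example, frexp(8.0f) yields
// (0.5f, 4), and logbf(8.0f) == 3.0f. The selects below patch up the special
// cases: logb(+/-0) == -inf and logb(+/-inf) == +inf.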
1246 | Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0)); |
1247 | Function *FrExpFunc = CGM.getIntrinsic( |
1248 | IID: Intrinsic::frexp, Tys: {Src0->getType(), Builder.getInt32Ty()}); |
1249 | CallInst *FrExp = Builder.CreateCall(Callee: FrExpFunc, Args: Src0); |
1250 | Value *Exp = Builder.CreateExtractValue(Agg: FrExp, Idxs: 1); |
1251 | Value *Add = Builder.CreateAdd( |
LHS: Exp, RHS: ConstantInt::getSigned(Ty: Exp->getType(), V: -1), Name: "", HasNUW: false, HasNSW: true);
1253 | Value *SIToFP = Builder.CreateSIToFP(V: Add, DestTy: Builder.getFloatTy()); |
1254 | Value *Fabs = |
1255 | emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::fabs); |
1256 | Value *FCmpONE = Builder.CreateFCmpONE( |
1257 | LHS: Fabs, RHS: ConstantFP::getInfinity(Ty: Builder.getFloatTy())); |
1258 | Value *Sel1 = Builder.CreateSelect(C: FCmpONE, True: SIToFP, False: Fabs); |
1259 | Value *FCmpOEQ = |
1260 | Builder.CreateFCmpOEQ(LHS: Src0, RHS: ConstantFP::getZero(Ty: Builder.getFloatTy())); |
1261 | Value *Sel2 = Builder.CreateSelect( |
1262 | C: FCmpOEQ, |
1263 | True: ConstantFP::getInfinity(Ty: Builder.getFloatTy(), /*Negative=*/true), False: Sel1); |
1264 | return Sel2; |
1265 | } |
1266 | case Builtin::BIlogb: |
1267 | case Builtin::BI__builtin_logb: { |
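// Same frexp-based expansion as logbf above, evaluated in double precision.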
1268 | Value *Src0 = EmitScalarExpr(E: E->getArg(Arg: 0)); |
1269 | Function *FrExpFunc = CGM.getIntrinsic( |
1270 | IID: Intrinsic::frexp, Tys: {Src0->getType(), Builder.getInt32Ty()}); |
1271 | CallInst *FrExp = Builder.CreateCall(Callee: FrExpFunc, Args: Src0); |
1272 | Value *Exp = Builder.CreateExtractValue(Agg: FrExp, Idxs: 1); |
1273 | Value *Add = Builder.CreateAdd( |
LHS: Exp, RHS: ConstantInt::getSigned(Ty: Exp->getType(), V: -1), Name: "", HasNUW: false, HasNSW: true);
1275 | Value *SIToFP = Builder.CreateSIToFP(V: Add, DestTy: Builder.getDoubleTy()); |
1276 | Value *Fabs = |
1277 | emitBuiltinWithOneOverloadedType<1>(CGF&: *this, E, IntrinsicID: Intrinsic::fabs); |
1278 | Value *FCmpONE = Builder.CreateFCmpONE( |
1279 | LHS: Fabs, RHS: ConstantFP::getInfinity(Ty: Builder.getDoubleTy())); |
1280 | Value *Sel1 = Builder.CreateSelect(C: FCmpONE, True: SIToFP, False: Fabs); |
1281 | Value *FCmpOEQ = |
1282 | Builder.CreateFCmpOEQ(LHS: Src0, RHS: ConstantFP::getZero(Ty: Builder.getDoubleTy())); |
1283 | Value *Sel2 = Builder.CreateSelect( |
1284 | C: FCmpOEQ, |
1285 | True: ConstantFP::getInfinity(Ty: Builder.getDoubleTy(), /*Negative=*/true), |
1286 | False: Sel1); |
1287 | return Sel2; |
1288 | } |
1289 | case Builtin::BIscalbnf: |
1290 | case Builtin::BI__builtin_scalbnf: |
1291 | case Builtin::BIscalbn: |
1292 | case Builtin::BI__builtin_scalbn: |
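// With FLT_RADIX == 2 (true for all AMDGPU float formats), scalbn(x, n) ==
// ldexp(x, n) == x * 2^n; e.g. scalbnf(1.5f, 3) == 12.0f.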
1293 | return emitBinaryExpMaybeConstrainedFPBuiltin( |
1294 | CGF&: *this, E, IntrinsicID: Intrinsic::ldexp, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_ldexp); |
1295 | default: |
1296 | return nullptr; |
1297 | } |
1298 | } |
1299 | |