//===- AMDGPU.cpp ---------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }
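
  // Illustrative sketch (assumed lowering, hypothetical kernel): under HIP, a
  // kernel parameter declared as `float *P` is initially converted to a
  // generic `ptr` (address space 0); coerceKernelArgumentType rewrites it to
  // `ptr addrspace(1)`, since LangAS::cuda_device maps to the global address
  // space on amdgcn.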

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}
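
// Illustrative sketch of the limit: a homogeneous aggregate of four doubles
// needs 4 members * 2 registers = 8 registers and is accepted; eight members
// of a 128-bit vector type (8 * 4 = 32 registers) would exceed the
// 16-register budget.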

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}
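
// Illustrative sketch of the estimate: a <3 x float> vector counts as 3
// registers even though its reported in-memory size is 128 bits; a <4 x half>
// vector is packed into 2 registers; and a struct { double D; int I; } counts
// as 2 + 1 = 3 registers.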

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned ArgumentIndex = 0;
  const unsigned numFixedArguments = FI.getNumRequiredArgs();

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      bool FixedArgument = ArgumentIndex++ < numFixedArguments;
      Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
    }
  }
}

RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                QualType Ty, AggValueSlot Slot) const {
  const bool IsIndirect = false;
  const bool AllowHigherAlign = false;
  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                          getContext().getTypeInfoInChars(Ty),
                          CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}
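
// Illustrative sketch (assumed lowering): a 2-byte aggregate return value is
// coerced to i16, a 4-byte one to i32, and an 8-byte one to [2 x i32]; larger
// aggregates that still fit the 16-register estimate are returned with their
// natural direct type, and anything bigger falls back to the default
// (indirect, sret-style) handling.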

/// For kernels, all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to the global address space when using byref. This would require
  // implementing a new kind of coercion of the in-memory type for indirect
  // arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}
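
// Illustrative sketch (assumed lowering, hypothetical kernel): for a HIP
// kernel taking `struct S { int V[16]; }` by value, the argument is lowered
// to an indirect-aliased pointer in the constant address space, which shows
// up in IR roughly as `ptr addrspace(4) byref(%struct.S) align 4 %s`; under
// OpenCL the same aggregate stays direct with CanBeFlattened = false.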

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // the function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}
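
// Illustrative sketch: while the 16-register estimate still has room, a
// 12-byte struct argument is passed directly and charged 3 registers; once
// the estimate runs out, the same struct is instead passed by reference in
// the private address space (address space 5 on amdgcn).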

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T,
                                 QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
} // namespace

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified.
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }
}
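
// Illustrative sketch of the resulting IR attributes (assumed defaults): an
// OpenCL kernel with no explicit attributes gets
// "amdgpu-flat-work-group-size"="1,256"; a HIP kernel uses the
// --gpu-max-threads-per-block value in place of 256; and an explicit
// __attribute__((amdgpu_num_vgpr(32))) additionally emits
// "amdgpu-num-vgpr"="32".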

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "__oclc_ABI_version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV &&
      !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
    return;

  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
      llvm::CodeObjectVersionKind::COV_None)
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}
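
// Illustrative sketch of the emitted control constant (assuming code object
// v5, i.e. a CodeObjectVersion value of 500):
//
//   @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4)
//       constant i32 500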

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in the private and local address spaces, a null
// pointer in the generic address space is emitted which is cast to a
// pointer in the local or private address space.
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}
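
// Illustrative sketch: a null pointer constant in the private address space
// is emitted roughly as
//
//   addrspacecast (ptr null to ptr addrspace(5))
//
// instead of `ptr addrspace(5) null`, since the target's null value for
// private and local pointers is not the all-zero bit pattern.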

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}
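
// Illustrative sketch of the mapping: HIPAgent with acquire ordering yields
// the sync scope "agent-one-as", the same scope with seq_cst yields "agent",
// and a system-scope seq_cst operation uses the default (empty) scope name.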

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on the stack, stores the block literal
/// to it, and passes its pointer to the block invoke function. The kernel
/// has the "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}
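
// Illustrative sketch (hypothetical names, assumed lowering): for an invoke
// function
//
//   define internal void @__block_invoke(ptr %block, ptr addrspace(3) %lds)
//
// the wrapper built above looks roughly like
//
//   define internal amdgpu_kernel void @__block_invoke_kernel(
//       <block literal struct> %block_literal, ptr addrspace(3) %local_arg1) {
//   entry:
//     %block = alloca <block literal struct>, addrspace(5)
//     store <block literal struct> %block_literal, ptr addrspace(5) %block
//     %cast = addrspacecast ptr addrspace(5) %block to ptr
//     call void @__block_invoke(ptr %cast, ptr addrspace(3) %local_arg1)
//     ret void
//   }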

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal to Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}
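
// Illustrative sketch: __attribute__((amdgpu_flat_work_group_size(64, 256)))
// yields "amdgpu-flat-work-group-size"="64,256"; with only
// __attribute__((reqd_work_group_size(8, 8, 2))) under OpenCL, the flat size
// is the product of the dimensions and the attribute becomes "128,128".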

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal to Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}
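
// Illustrative sketch: __attribute__((amdgpu_waves_per_eu(2, 4))) yields
// "amdgpu-waves-per-eu"="2,4", while a minimum-only form such as
// __attribute__((amdgpu_waves_per_eu(2))) yields just "2".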

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}