1//===- AMDGPUAttributor.cpp -----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AMDGPU.h"
14#include "GCNSubtarget.h"
15#include "Utils/AMDGPUBaseInfo.h"
16#include "llvm/Analysis/CycleAnalysis.h"
17#include "llvm/CodeGen/TargetPassConfig.h"
18#include "llvm/IR/IntrinsicsAMDGPU.h"
19#include "llvm/IR/IntrinsicsR600.h"
20#include "llvm/InitializePasses.h"
21#include "llvm/Target/TargetMachine.h"
22#include "llvm/Transforms/IPO/Attributor.h"
23
24#define DEBUG_TYPE "amdgpu-attributor"
25
26using namespace llvm;
27
28static cl::opt<unsigned> IndirectCallSpecializationThreshold(
29 "amdgpu-indirect-call-specialization-threshold",
30 cl::desc(
31 "A threshold controls whether an indirect call will be specialized"),
32 cl::init(Val: 3));
33
34#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
35
36enum ImplicitArgumentPositions {
37#include "AMDGPUAttributes.def"
38 LAST_ARG_POS
39};
40
41#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
42
43enum ImplicitArgumentMask {
44 NOT_IMPLICIT_INPUT = 0,
45#include "AMDGPUAttributes.def"
46 ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
47};
48
49#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
50static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
51 ImplicitAttrs[] = {
52#include "AMDGPUAttributes.def"
53};
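// For illustration: assuming AMDGPUAttributes.def contains an entry such as
//   AMDGPU_ATTRIBUTE(DISPATCH_PTR, "amdgpu-no-dispatch-ptr")
// the expansions above assign DISPATCH_PTR its own bit (1 << DISPATCH_PTR_POS)
// in ImplicitArgumentMask and pair that bit with the corresponding
// "amdgpu-no-*" attribute string in the ImplicitAttrs table.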
54
55// We do not need to note the x workitem or workgroup id because they are always
56// initialized.
57//
// TODO: We should not add the attributes if the known compile-time workgroup
// size is 1 for y/z.
60static ImplicitArgumentMask
61intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
62 bool HasApertureRegs, bool SupportsGetDoorBellID,
63 unsigned CodeObjectVersion) {
64 switch (ID) {
65 case Intrinsic::amdgcn_workitem_id_x:
66 NonKernelOnly = true;
67 return WORKITEM_ID_X;
68 case Intrinsic::amdgcn_workgroup_id_x:
69 NonKernelOnly = true;
70 return WORKGROUP_ID_X;
71 case Intrinsic::amdgcn_workitem_id_y:
72 case Intrinsic::r600_read_tidig_y:
73 return WORKITEM_ID_Y;
74 case Intrinsic::amdgcn_workitem_id_z:
75 case Intrinsic::r600_read_tidig_z:
76 return WORKITEM_ID_Z;
77 case Intrinsic::amdgcn_workgroup_id_y:
78 case Intrinsic::r600_read_tgid_y:
79 return WORKGROUP_ID_Y;
80 case Intrinsic::amdgcn_workgroup_id_z:
81 case Intrinsic::r600_read_tgid_z:
82 return WORKGROUP_ID_Z;
83 case Intrinsic::amdgcn_lds_kernel_id:
84 return LDS_KERNEL_ID;
85 case Intrinsic::amdgcn_dispatch_ptr:
86 return DISPATCH_PTR;
87 case Intrinsic::amdgcn_dispatch_id:
88 return DISPATCH_ID;
89 case Intrinsic::amdgcn_implicitarg_ptr:
90 return IMPLICIT_ARG_PTR;
91 // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
92 // queue_ptr.
93 case Intrinsic::amdgcn_queue_ptr:
94 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
95 return QUEUE_PTR;
96 case Intrinsic::amdgcn_is_shared:
97 case Intrinsic::amdgcn_is_private:
98 if (HasApertureRegs)
99 return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. Pre-V5, however, we need to access them through queue_ptr +
    // offsets.
103 return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
104 : QUEUE_PTR;
105 case Intrinsic::trap:
106 case Intrinsic::debugtrap:
107 case Intrinsic::ubsantrap:
108 if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
109 return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
110 : QUEUE_PTR;
111 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
112 return QUEUE_PTR;
113 default:
114 return NOT_IMPLICIT_INPUT;
115 }
116}
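// For example, for a function that calls llvm.amdgcn.queue.ptr under code
// object V5, this returns QUEUE_PTR and also sets NeedsImplicit, since on V5
// the queue pointer itself is reached through the implicit argument pointer.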
117
118static bool castRequiresQueuePtr(unsigned SrcAS) {
119 return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
120}
121
122static bool isDSAddress(const Constant *C) {
123 const GlobalValue *GV = dyn_cast<GlobalValue>(Val: C);
124 if (!GV)
125 return false;
126 unsigned AS = GV->getAddressSpace();
127 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
128}
129
130/// Returns true if the function requires the implicit argument be passed
131/// regardless of the function contents.
132static bool funcRequiresHostcallPtr(const Function &F) {
133 // Sanitizers require the hostcall buffer passed in the implicit arguments.
134 return F.hasFnAttribute(Kind: Attribute::SanitizeAddress) ||
135 F.hasFnAttribute(Kind: Attribute::SanitizeThread) ||
136 F.hasFnAttribute(Kind: Attribute::SanitizeMemory) ||
137 F.hasFnAttribute(Kind: Attribute::SanitizeHWAddress) ||
138 F.hasFnAttribute(Kind: Attribute::SanitizeMemTag);
139}
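// For example, a function built with -fsanitize=address carries the
// sanitize_address attribute, so the hostcall buffer pointer must be kept in
// the implicit arguments even if nothing in the body appears to use it.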
140
141namespace {
142class AMDGPUInformationCache : public InformationCache {
143public:
144 AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
145 BumpPtrAllocator &Allocator,
146 SetVector<Function *> *CGSCC, TargetMachine &TM)
147 : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
148 CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}
149
150 TargetMachine &TM;
151
152 enum ConstantStatus : uint8_t {
153 NONE = 0,
154 DS_GLOBAL = 1 << 0,
155 ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
156 ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
157 ADDR_SPACE_CAST_BOTH_TO_FLAT =
158 ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
159 };
160
161 /// Check if the subtarget has aperture regs.
162 bool hasApertureRegs(Function &F) {
163 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
164 return ST.hasApertureRegs();
165 }
166
167 /// Check if the subtarget supports GetDoorbellID.
168 bool supportsGetDoorbellID(Function &F) {
169 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
170 return ST.supportsGetDoorbellID();
171 }
172
173 std::optional<std::pair<unsigned, unsigned>>
174 getFlatWorkGroupSizeAttr(const Function &F) const {
175 auto R = AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-flat-work-group-size");
176 if (!R)
177 return std::nullopt;
178 return std::make_pair(x&: R->first, y&: *(R->second));
179 }
180
181 std::pair<unsigned, unsigned>
182 getDefaultFlatWorkGroupSize(const Function &F) const {
183 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
184 return ST.getDefaultFlatWorkGroupSize(CC: F.getCallingConv());
185 }
186
187 std::pair<unsigned, unsigned>
188 getMaximumFlatWorkGroupRange(const Function &F) {
189 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
190 return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
191 }
192
193 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
194 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
195 return ST.getMaxNumWorkGroups(F);
196 }
197
198 /// Get code object version.
199 unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
200
201 /// Get the effective value of "amdgpu-waves-per-eu" for the function,
202 /// accounting for the interaction with the passed value to use for
203 /// "amdgpu-flat-work-group-size".
204 std::pair<unsigned, unsigned>
205 getWavesPerEU(const Function &F,
206 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
207 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
208 return ST.getWavesPerEU(FlatWorkGroupSizes: FlatWorkGroupSize, LDSBytes: getLDSSize(F), F);
209 }
210
211 std::optional<std::pair<unsigned, unsigned>>
212 getWavesPerEUAttr(const Function &F) {
213 auto Val = AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu",
214 /*OnlyFirstRequired=*/true);
215 if (!Val)
216 return std::nullopt;
217 if (!Val->second) {
218 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
219 Val->second = ST.getMaxWavesPerEU();
220 }
221 return std::make_pair(x&: Val->first, y&: *(Val->second));
222 }
223
224 std::pair<unsigned, unsigned>
225 getEffectiveWavesPerEU(const Function &F,
226 std::pair<unsigned, unsigned> WavesPerEU,
227 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
228 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
229 return ST.getEffectiveWavesPerEU(RequestedWavesPerEU: WavesPerEU, FlatWorkGroupSizes: FlatWorkGroupSize,
230 LDSBytes: getLDSSize(F));
231 }
232
233 unsigned getMaxWavesPerEU(const Function &F) {
234 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
235 return ST.getMaxWavesPerEU();
236 }
237
238private:
239 /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
240 /// local to flat. These casts may require the queue pointer.
241 static uint8_t visitConstExpr(const ConstantExpr *CE) {
242 uint8_t Status = NONE;
243
244 if (CE->getOpcode() == Instruction::AddrSpaceCast) {
245 unsigned SrcAS = CE->getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
246 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
247 Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
248 else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
249 Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
250 }
251
252 return Status;
253 }
254
255 /// Returns the minimum amount of LDS space used by a workgroup running
256 /// function \p F.
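  /// For example, a function annotated with "amdgpu-lds-size"="1024" (or
  /// "1024,4096") is treated as using at least 1024 bytes of LDS; only the
  /// first value of the pair is used here.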
257 static unsigned getLDSSize(const Function &F) {
258 return AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-lds-size",
259 Default: {0, UINT32_MAX}, OnlyFirstRequired: true)
260 .first;
261 }
262
263 /// Get the constant access bitmap for \p C.
264 uint8_t getConstantAccess(const Constant *C,
265 SmallPtrSetImpl<const Constant *> &Visited) {
266 auto It = ConstantStatus.find(Val: C);
267 if (It != ConstantStatus.end())
268 return It->second;
269
270 uint8_t Result = 0;
271 if (isDSAddress(C))
272 Result = DS_GLOBAL;
273
274 if (const auto *CE = dyn_cast<ConstantExpr>(Val: C))
275 Result |= visitConstExpr(CE);
276
277 for (const Use &U : C->operands()) {
278 const auto *OpC = dyn_cast<Constant>(Val: U);
279 if (!OpC || !Visited.insert(Ptr: OpC).second)
280 continue;
281
282 Result |= getConstantAccess(C: OpC, Visited);
283 }
284 return Result;
285 }
286
287public:
288 /// Returns true if \p Fn needs the queue pointer because of \p C.
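  /// For illustration: in a non-entry function, a constant expression such as
  ///   addrspacecast (ptr addrspace(3) @lds_var to ptr)
  /// references a DS global (which must trap) and therefore needs the queue
  /// pointer; private-to-flat or local-to-flat casts only need it when
  /// aperture registers are unavailable.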
289 bool needsQueuePtr(const Constant *C, Function &Fn) {
290 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: Fn.getCallingConv());
291 bool HasAperture = hasApertureRegs(F&: Fn);
292
293 // No need to explore the constants.
294 if (!IsNonEntryFunc && HasAperture)
295 return false;
296
297 SmallPtrSet<const Constant *, 8> Visited;
298 uint8_t Access = getConstantAccess(C, Visited);
299
300 // We need to trap on DS globals in non-entry functions.
301 if (IsNonEntryFunc && (Access & DS_GLOBAL))
302 return true;
303
304 return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
305 }
306
307 bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
308 SmallPtrSet<const Constant *, 8> Visited;
309 uint8_t Access = getConstantAccess(C, Visited);
310 return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
311 }
312
313private:
314 /// Used to determine if the Constant needs the queue pointer.
315 DenseMap<const Constant *, uint8_t> ConstantStatus;
316 const unsigned CodeObjectVersion;
317};
318
319struct AAAMDAttributes
320 : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
321 AbstractAttribute> {
322 using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
323 AbstractAttribute>;
324
325 AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
326
327 /// Create an abstract attribute view for the position \p IRP.
328 static AAAMDAttributes &createForPosition(const IRPosition &IRP,
329 Attributor &A);
330
331 /// See AbstractAttribute::getName().
332 StringRef getName() const override { return "AAAMDAttributes"; }
333
334 /// See AbstractAttribute::getIdAddr().
335 const char *getIdAddr() const override { return &ID; }
336
337 /// This function should return true if the type of the \p AA is
338 /// AAAMDAttributes.
339 static bool classof(const AbstractAttribute *AA) {
340 return (AA->getIdAddr() == &ID);
341 }
342
343 /// Unique ID (due to the unique address)
344 static const char ID;
345};
346const char AAAMDAttributes::ID = 0;
347
348struct AAUniformWorkGroupSize
349 : public StateWrapper<BooleanState, AbstractAttribute> {
350 using Base = StateWrapper<BooleanState, AbstractAttribute>;
351 AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
352
353 /// Create an abstract attribute view for the position \p IRP.
354 static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
355 Attributor &A);
356
357 /// See AbstractAttribute::getName().
358 StringRef getName() const override { return "AAUniformWorkGroupSize"; }
359
360 /// See AbstractAttribute::getIdAddr().
361 const char *getIdAddr() const override { return &ID; }
362
  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
365 static bool classof(const AbstractAttribute *AA) {
366 return (AA->getIdAddr() == &ID);
367 }
368
369 /// Unique ID (due to the unique address)
370 static const char ID;
371};
372const char AAUniformWorkGroupSize::ID = 0;
373
374struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
375 AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
376 : AAUniformWorkGroupSize(IRP, A) {}
377
378 void initialize(Attributor &A) override {
379 Function *F = getAssociatedFunction();
380 CallingConv::ID CC = F->getCallingConv();
381
382 if (CC != CallingConv::AMDGPU_KERNEL)
383 return;
384
385 bool InitialValue = false;
386 if (F->hasFnAttribute(Kind: "uniform-work-group-size"))
387 InitialValue =
388 F->getFnAttribute(Kind: "uniform-work-group-size").getValueAsString() ==
389 "true";
390
391 if (InitialValue)
392 indicateOptimisticFixpoint();
393 else
394 indicatePessimisticFixpoint();
395 }
396
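  // A function's uniform-work-group-size state is the meet over all of its
  // callers: e.g. if any caller has assumed "false", the clamp below drives
  // this function's state to "false" as well, and unknown call sites force the
  // pessimistic fixpoint.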
397 ChangeStatus updateImpl(Attributor &A) override {
398 ChangeStatus Change = ChangeStatus::UNCHANGED;
399
400 auto CheckCallSite = [&](AbstractCallSite CS) {
401 Function *Caller = CS.getInstruction()->getFunction();
402 LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
403 << "->" << getAssociatedFunction()->getName() << "\n");
404
405 const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
406 QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
407 if (!CallerInfo || !CallerInfo->isValidState())
408 return false;
409
410 Change = Change | clampStateAndIndicateChange(S&: this->getState(),
411 R: CallerInfo->getState());
412
413 return true;
414 };
415
416 bool AllCallSitesKnown = true;
417 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this, RequireAllCallSites: true, UsedAssumedInformation&: AllCallSitesKnown))
418 return indicatePessimisticFixpoint();
419
420 return Change;
421 }
422
423 ChangeStatus manifest(Attributor &A) override {
424 SmallVector<Attribute, 8> AttrList;
425 LLVMContext &Ctx = getAssociatedFunction()->getContext();
426
427 AttrList.push_back(Elt: Attribute::get(Context&: Ctx, Kind: "uniform-work-group-size",
428 Val: getAssumed() ? "true" : "false"));
429 return A.manifestAttrs(IRP: getIRPosition(), DeducedAttrs: AttrList,
430 /* ForceReplace */ true);
431 }
432
433 bool isValidState() const override {
434 // This state is always valid, even when the state is false.
435 return true;
436 }
437
438 const std::string getAsStr(Attributor *) const override {
439 return "AMDWorkGroupSize[" + std::to_string(val: getAssumed()) + "]";
440 }
441
442 /// See AbstractAttribute::trackStatistics()
443 void trackStatistics() const override {}
444};
445
446AAUniformWorkGroupSize &
447AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
448 Attributor &A) {
449 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
450 return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
451 llvm_unreachable(
452 "AAUniformWorkGroupSize is only valid for function position");
453}
454
455struct AAAMDAttributesFunction : public AAAMDAttributes {
456 AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
457 : AAAMDAttributes(IRP, A) {}
458
459 void initialize(Attributor &A) override {
460 Function *F = getAssociatedFunction();
461
462 // If the function requires the implicit arg pointer due to sanitizers,
463 // assume it's needed even if explicitly marked as not requiring it.
464 const bool NeedsHostcall = funcRequiresHostcallPtr(F: *F);
465 if (NeedsHostcall) {
466 removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
467 removeAssumedBits(BitsEncoding: HOSTCALL_PTR);
468 }
469
470 for (auto Attr : ImplicitAttrs) {
471 if (NeedsHostcall &&
472 (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
473 continue;
474
475 if (F->hasFnAttribute(Kind: Attr.second))
476 addKnownBits(Bits: Attr.first);
477 }
478
479 if (F->isDeclaration())
480 return;
481
    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
484 if (AMDGPU::isGraphics(CC: F->getCallingConv())) {
485 indicatePessimisticFixpoint();
486 return;
487 }
488 }
489
490 ChangeStatus updateImpl(Attributor &A) override {
491 Function *F = getAssociatedFunction();
492 // The current assumed state used to determine a change.
493 auto OrigAssumed = getAssumed();
494
495 // Check for Intrinsics and propagate attributes.
496 const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
497 QueryingAA: *this, IRP: this->getIRPosition(), DepClass: DepClassTy::REQUIRED);
498 if (!AAEdges || !AAEdges->isValidState() ||
499 AAEdges->hasNonAsmUnknownCallee())
500 return indicatePessimisticFixpoint();
501
502 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
503
504 bool NeedsImplicit = false;
505 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
506 bool HasApertureRegs = InfoCache.hasApertureRegs(F&: *F);
507 bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(F&: *F);
508 unsigned COV = InfoCache.getCodeObjectVersion();
509
510 for (Function *Callee : AAEdges->getOptimisticEdges()) {
511 Intrinsic::ID IID = Callee->getIntrinsicID();
512 if (IID == Intrinsic::not_intrinsic) {
513 const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
514 QueryingAA: *this, IRP: IRPosition::function(F: *Callee), DepClass: DepClassTy::REQUIRED);
515 if (!AAAMD || !AAAMD->isValidState())
516 return indicatePessimisticFixpoint();
517 *this &= *AAAMD;
518 continue;
519 }
520
521 bool NonKernelOnly = false;
522 ImplicitArgumentMask AttrMask =
523 intrinsicToAttrMask(ID: IID, NonKernelOnly, NeedsImplicit,
524 HasApertureRegs, SupportsGetDoorBellID: SupportsGetDoorbellID, CodeObjectVersion: COV);
525 if (AttrMask != NOT_IMPLICIT_INPUT) {
526 if ((IsNonEntryFunc || !NonKernelOnly))
527 removeAssumedBits(BitsEncoding: AttrMask);
528 }
529 }
530
    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
532 if (NeedsImplicit)
533 removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
534
535 if (isAssumed(BitsEncoding: QUEUE_PTR) && checkForQueuePtr(A)) {
536 // Under V5, we need implicitarg_ptr + offsets to access private_base or
537 // shared_base. We do not actually need queue_ptr.
538 if (COV >= 5)
539 removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
540 else
541 removeAssumedBits(BitsEncoding: QUEUE_PTR);
542 }
543
544 if (funcRetrievesMultigridSyncArg(A, COV)) {
545 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
546 "multigrid_sync_arg needs implicitarg_ptr");
547 removeAssumedBits(BitsEncoding: MULTIGRID_SYNC_ARG);
548 }
549
550 if (funcRetrievesHostcallPtr(A, COV)) {
551 assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
552 removeAssumedBits(BitsEncoding: HOSTCALL_PTR);
553 }
554
555 if (funcRetrievesHeapPtr(A, COV)) {
556 assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
557 removeAssumedBits(BitsEncoding: HEAP_PTR);
558 }
559
560 if (isAssumed(BitsEncoding: QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
561 assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
562 removeAssumedBits(BitsEncoding: QUEUE_PTR);
563 }
564
565 if (isAssumed(BitsEncoding: LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
566 removeAssumedBits(BitsEncoding: LDS_KERNEL_ID);
567 }
568
569 if (isAssumed(BitsEncoding: DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
570 removeAssumedBits(BitsEncoding: DEFAULT_QUEUE);
571
572 if (isAssumed(BitsEncoding: COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
573 removeAssumedBits(BitsEncoding: COMPLETION_ACTION);
574
575 if (isAssumed(BitsEncoding: FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
576 removeAssumedBits(BitsEncoding: FLAT_SCRATCH_INIT);
577
578 return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
579 : ChangeStatus::UNCHANGED;
580 }
581
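  // Manifest only the bits that are still known at the fixpoint; e.g. a kernel
  // whose (transitive) callees never touch any implicit input typically ends
  // up with the full set of "amdgpu-no-*" attributes such as
  // "amdgpu-no-dispatch-ptr" and "amdgpu-no-queue-ptr".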
582 ChangeStatus manifest(Attributor &A) override {
583 SmallVector<Attribute, 8> AttrList;
584 LLVMContext &Ctx = getAssociatedFunction()->getContext();
585
586 for (auto Attr : ImplicitAttrs) {
587 if (isKnown(BitsEncoding: Attr.first))
588 AttrList.push_back(Elt: Attribute::get(Context&: Ctx, Kind: Attr.second));
589 }
590
591 return A.manifestAttrs(IRP: getIRPosition(), DeducedAttrs: AttrList,
592 /* ForceReplace */ true);
593 }
594
595 const std::string getAsStr(Attributor *) const override {
596 std::string Str;
597 raw_string_ostream OS(Str);
598 OS << "AMDInfo[";
599 for (auto Attr : ImplicitAttrs)
600 if (isAssumed(BitsEncoding: Attr.first))
601 OS << ' ' << Attr.second;
602 OS << " ]";
603 return OS.str();
604 }
605
606 /// See AbstractAttribute::trackStatistics()
607 void trackStatistics() const override {}
608
609private:
610 bool checkForQueuePtr(Attributor &A) {
611 Function *F = getAssociatedFunction();
612 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
613
614 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
615
616 bool NeedsQueuePtr = false;
617
618 auto CheckAddrSpaceCasts = [&](Instruction &I) {
619 unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
620 if (castRequiresQueuePtr(SrcAS)) {
621 NeedsQueuePtr = true;
622 return false;
623 }
624 return true;
625 };
626
627 bool HasApertureRegs = InfoCache.hasApertureRegs(F&: *F);
628
    // `checkForAllInstructions` is much cheaper than walking all instructions
    // manually, so try it first.

    // The queue pointer is not needed if aperture registers are present.
633 if (!HasApertureRegs) {
634 bool UsedAssumedInformation = false;
635 A.checkForAllInstructions(Pred: CheckAddrSpaceCasts, QueryingAA: *this,
636 Opcodes: {Instruction::AddrSpaceCast},
637 UsedAssumedInformation);
638 }
639
640 // If we found that we need the queue pointer, nothing else to do.
641 if (NeedsQueuePtr)
642 return true;
643
644 if (!IsNonEntryFunc && HasApertureRegs)
645 return false;
646
647 for (BasicBlock &BB : *F) {
648 for (Instruction &I : BB) {
649 for (const Use &U : I.operands()) {
650 if (const auto *C = dyn_cast<Constant>(Val: U)) {
651 if (InfoCache.needsQueuePtr(C, Fn&: *F))
652 return true;
653 }
654 }
655 }
656 }
657
658 return false;
659 }
660
661 bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
662 auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
663 AA::RangeTy Range(Pos, 8);
664 return funcRetrievesImplicitKernelArg(A, Range);
665 }
666
667 bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
668 auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
669 AA::RangeTy Range(Pos, 8);
670 return funcRetrievesImplicitKernelArg(A, Range);
671 }
672
673 bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
674 auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
675 AA::RangeTy Range(Pos, 8);
676 return funcRetrievesImplicitKernelArg(A, Range);
677 }
678
679 bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
680 auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
681 AA::RangeTy Range(Pos, 8);
682 return funcRetrievesImplicitKernelArg(A, Range);
683 }
684
685 bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
686 if (COV < 5)
687 return false;
688 AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
689 return funcRetrievesImplicitKernelArg(A, Range);
690 }
691
692 bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
693 if (COV < 5)
694 return false;
695 AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
696 return funcRetrievesImplicitKernelArg(A, Range);
697 }
698
699 bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr intrinsic whose result is
    // used to retrieve the queried implicit kernel argument (e.g. the hostcall
    // pointer). The implicit argument is considered unused only if every use
    // of the implicitarg_ptr is a load that clearly does not touch any byte in
    // the queried range. We check this by tracing all the uses of the initial
    // call to the implicitarg_ptr intrinsic.
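    // For example, with a byte range starting at the hostcall pointer's offset
    // in the implicit argument buffer, an 8-byte load from (implicitarg_ptr +
    // that offset) overlaps the range and marks the hostcall pointer as
    // needed, while loads of other, disjoint implicit arguments do not.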
706 auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
707 auto &Call = cast<CallBase>(Val&: I);
708 if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
709 return true;
710
711 const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
712 QueryingAA: *this, IRP: IRPosition::callsite_returned(CB: Call), DepClass: DepClassTy::REQUIRED);
713 if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
714 return false;
715
716 return PointerInfoAA->forallInterferingAccesses(
717 Range, CB: [](const AAPointerInfo::Access &Acc, bool IsExact) {
718 return Acc.getRemoteInst()->isDroppable();
719 });
720 };
721
722 bool UsedAssumedInformation = false;
723 return !A.checkForAllCallLikeInstructions(Pred: DoesNotLeadToKernelArgLoc, QueryingAA: *this,
724 UsedAssumedInformation);
725 }
726
727 bool funcRetrievesLDSKernelId(Attributor &A) {
728 auto DoesNotRetrieve = [&](Instruction &I) {
729 auto &Call = cast<CallBase>(Val&: I);
730 return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
731 };
732 bool UsedAssumedInformation = false;
733 return !A.checkForAllCallLikeInstructions(Pred: DoesNotRetrieve, QueryingAA: *this,
734 UsedAssumedInformation);
735 }
736
  // Returns true if FlatScratchInit is needed, i.e., the no-flat-scratch-init
  // attribute is not to be added.
739 bool needFlatScratchInit(Attributor &A) {
740 assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
741
742 // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
743 // there is a cast from PRIVATE_ADDRESS.
744 auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
745 return cast<AddrSpaceCastInst>(Val&: I).getSrcAddressSpace() !=
746 AMDGPUAS::PRIVATE_ADDRESS;
747 };
748
749 bool UsedAssumedInformation = false;
750 if (!A.checkForAllInstructions(Pred: AddrSpaceCastNotFromPrivate, QueryingAA: *this,
751 Opcodes: {Instruction::AddrSpaceCast},
752 UsedAssumedInformation))
753 return true;
754
755 // Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
756 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
757
758 Function *F = getAssociatedFunction();
759 for (Instruction &I : instructions(F)) {
760 for (const Use &U : I.operands()) {
761 if (const auto *C = dyn_cast<Constant>(Val: U)) {
762 if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
763 return true;
764 }
765 }
766 }
767
768 // Finally check callees.
769
    // This is checked for each call-like instruction; returning false means
    // this function needs FlatScratchInit and must not get the
    // no-flat-scratch-init attribute.
772 auto CheckForNoFlatScratchInit = [&](Instruction &I) {
773 const auto &CB = cast<CallBase>(Val&: I);
774 const Function *Callee = CB.getCalledFunction();
775
776 // Callee == 0 for inline asm or indirect call with known callees.
777 // In the latter case, updateImpl() already checked the callees and we
778 // know their FLAT_SCRATCH_INIT bit is set.
779 // If function has indirect call with unknown callees, the bit is
780 // already removed in updateImpl() and execution won't reach here.
781 if (!Callee)
782 return true;
783
784 return Callee->getIntrinsicID() !=
785 Intrinsic::amdgcn_addrspacecast_nonnull;
786 };
787
788 UsedAssumedInformation = false;
    // If the check returns false for any call site (i.e. FlatScratchInit is
    // needed), checkForAllCallLikeInstructions returns false, in which case
    // this function returns true.
792 return !A.checkForAllCallLikeInstructions(Pred: CheckForNoFlatScratchInit, QueryingAA: *this,
793 UsedAssumedInformation);
794 }
795};
796
797AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
798 Attributor &A) {
799 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
800 return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
801 llvm_unreachable("AAAMDAttributes is only valid for function position");
802}
803
804/// Base class to derive different size ranges.
805struct AAAMDSizeRangeAttribute
806 : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
807 using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
808
809 StringRef AttrName;
810
811 AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
812 StringRef AttrName)
813 : Base(IRP, 32), AttrName(AttrName) {}
814
815 /// See AbstractAttribute::trackStatistics()
816 void trackStatistics() const override {}
817
818 template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
819 ChangeStatus Change = ChangeStatus::UNCHANGED;
820
821 auto CheckCallSite = [&](AbstractCallSite CS) {
822 Function *Caller = CS.getInstruction()->getFunction();
823 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
824 << "->" << getAssociatedFunction()->getName() << '\n');
825
826 const auto *CallerInfo = A.getAAFor<AttributeImpl>(
827 *this, IRPosition::function(F: *Caller), DepClassTy::REQUIRED);
828 if (!CallerInfo || !CallerInfo->isValidState())
829 return false;
830
831 Change |=
832 clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
833
834 return true;
835 };
836
837 bool AllCallSitesKnown = true;
838 if (!A.checkForAllCallSites(CheckCallSite, *this,
839 /*RequireAllCallSites=*/true,
840 AllCallSitesKnown))
841 return indicatePessimisticFixpoint();
842
843 return Change;
844 }
845
  /// Clamp the assumed range to the default value ([Min, Max]) and emit the
  /// attribute if it is not the same as the default.
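  /// For example, with \p Default = {1, 1024} and an assumed range of
  /// [128, 257), this emits AttrName="128,256"; an assumed range of [1, 1025)
  /// matches the default and nothing is emitted.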
848 ChangeStatus
849 emitAttributeIfNotDefaultAfterClamp(Attributor &A,
850 std::pair<unsigned, unsigned> Default) {
851 auto [Min, Max] = Default;
852 unsigned Lower = getAssumed().getLower().getZExtValue();
853 unsigned Upper = getAssumed().getUpper().getZExtValue();
854
855 // Clamp the range to the default value.
856 if (Lower < Min)
857 Lower = Min;
858 if (Upper > Max + 1)
859 Upper = Max + 1;
860
    // Don't manifest if the value is invalid or the same as the default after
    // clamping.
862 if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
863 return ChangeStatus::UNCHANGED;
864
865 Function *F = getAssociatedFunction();
866 LLVMContext &Ctx = F->getContext();
867 SmallString<10> Buffer;
868 raw_svector_ostream OS(Buffer);
869 OS << Lower << ',' << Upper - 1;
870 return A.manifestAttrs(IRP: getIRPosition(),
871 DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: AttrName, Val: OS.str())},
872 /*ForceReplace=*/true);
873 }
874
875 const std::string getAsStr(Attributor *) const override {
876 std::string Str;
877 raw_string_ostream OS(Str);
878 OS << getName() << '[';
879 OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
880 OS << ']';
881 return OS.str();
882 }
883};
884
885/// Propagate amdgpu-flat-work-group-size attribute.
886struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
887 AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
888 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}
889
890 void initialize(Attributor &A) override {
891 Function *F = getAssociatedFunction();
892 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
893
894 bool HasAttr = false;
895 auto Range = InfoCache.getDefaultFlatWorkGroupSize(F: *F);
896 auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(F: *F);
897
898 if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(F: *F)) {
      // Only honor the attribute if it is not the maximum range, because the
      // front end unfortunately always emits the attribute, and sometimes it
      // emits the maximum range.
902 if (*Attr != MaxRange) {
903 Range = *Attr;
904 HasAttr = true;
905 }
906 }
907
908 // We don't want to directly clamp the state if it's the max range because
909 // that is basically the worst state.
910 if (Range == MaxRange)
911 return;
912
913 auto [Min, Max] = Range;
914 ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
915 IntegerRangeState IRS(CR);
916 clampStateAndIndicateChange(S&: this->getState(), R: IRS);
917
918 if (HasAttr || AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
919 indicateOptimisticFixpoint();
920 }
921
922 ChangeStatus updateImpl(Attributor &A) override {
923 return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
924 }
925
926 /// Create an abstract attribute view for the position \p IRP.
927 static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
928 Attributor &A);
929
930 ChangeStatus manifest(Attributor &A) override {
931 Function *F = getAssociatedFunction();
932 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
933 return emitAttributeIfNotDefaultAfterClamp(
934 A, Default: InfoCache.getMaximumFlatWorkGroupRange(F: *F));
935 }
936
937 /// See AbstractAttribute::getName()
938 StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }
939
940 /// See AbstractAttribute::getIdAddr()
941 const char *getIdAddr() const override { return &ID; }
942
943 /// This function should return true if the type of the \p AA is
944 /// AAAMDFlatWorkGroupSize
945 static bool classof(const AbstractAttribute *AA) {
946 return (AA->getIdAddr() == &ID);
947 }
948
949 /// Unique ID (due to the unique address)
950 static const char ID;
951};
952
953const char AAAMDFlatWorkGroupSize::ID = 0;
954
955AAAMDFlatWorkGroupSize &
956AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
957 Attributor &A) {
958 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
959 return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
960 llvm_unreachable(
961 "AAAMDFlatWorkGroupSize is only valid for function position");
962}
963
964struct TupleDecIntegerRangeState : public AbstractState {
965 DecIntegerState<uint32_t> X, Y, Z;
966
967 bool isValidState() const override {
968 return X.isValidState() && Y.isValidState() && Z.isValidState();
969 }
970
971 bool isAtFixpoint() const override {
972 return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
973 }
974
975 ChangeStatus indicateOptimisticFixpoint() override {
976 return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
977 Z.indicateOptimisticFixpoint();
978 }
979
980 ChangeStatus indicatePessimisticFixpoint() override {
981 return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
982 Z.indicatePessimisticFixpoint();
983 }
984
985 TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
986 X ^= Other.X;
987 Y ^= Other.Y;
988 Z ^= Other.Z;
989 return *this;
990 }
991
992 bool operator==(const TupleDecIntegerRangeState &Other) const {
993 return X == Other.X && Y == Other.Y && Z == Other.Z;
994 }
995
996 TupleDecIntegerRangeState &getAssumed() { return *this; }
997 const TupleDecIntegerRangeState &getAssumed() const { return *this; }
998};
999
1000using AAAMDMaxNumWorkgroupsState =
1001 StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
1002
1003/// Propagate amdgpu-max-num-workgroups attribute.
1004struct AAAMDMaxNumWorkgroups
1005 : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
1006 using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
1007
1008 AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1009
1010 void initialize(Attributor &A) override {
1011 Function *F = getAssociatedFunction();
1012 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1013
1014 SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(F: *F);
1015
1016 X.takeKnownMinimum(Value: MaxNumWorkgroups[0]);
1017 Y.takeKnownMinimum(Value: MaxNumWorkgroups[1]);
1018 Z.takeKnownMinimum(Value: MaxNumWorkgroups[2]);
1019
1020 if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
1021 indicatePessimisticFixpoint();
1022 }
1023
1024 ChangeStatus updateImpl(Attributor &A) override {
1025 ChangeStatus Change = ChangeStatus::UNCHANGED;
1026
1027 auto CheckCallSite = [&](AbstractCallSite CS) {
1028 Function *Caller = CS.getInstruction()->getFunction();
1029 LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
1030 << "->" << getAssociatedFunction()->getName() << '\n');
1031
1032 const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
1033 QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
1034 if (!CallerInfo || !CallerInfo->isValidState())
1035 return false;
1036
1037 Change |=
1038 clampStateAndIndicateChange(S&: this->getState(), R: CallerInfo->getState());
1039 return true;
1040 };
1041
1042 bool AllCallSitesKnown = true;
1043 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this,
1044 /*RequireAllCallSites=*/true,
1045 UsedAssumedInformation&: AllCallSitesKnown))
1046 return indicatePessimisticFixpoint();
1047
1048 return Change;
1049 }
1050
1051 /// Create an abstract attribute view for the position \p IRP.
1052 static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
1053 Attributor &A);
1054
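  // Emits e.g. "amdgpu-max-num-workgroups"="16,8,1", using the per-dimension
  // bounds assumed after meeting the states of all callers.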
1055 ChangeStatus manifest(Attributor &A) override {
1056 Function *F = getAssociatedFunction();
1057 LLVMContext &Ctx = F->getContext();
1058 SmallString<32> Buffer;
1059 raw_svector_ostream OS(Buffer);
1060 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();
1061
1062 // TODO: Should annotate loads of the group size for this to do anything
1063 // useful.
1064 return A.manifestAttrs(
1065 IRP: getIRPosition(),
1066 DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: "amdgpu-max-num-workgroups", Val: OS.str())},
1067 /* ForceReplace= */ true);
1068 }
1069
1070 StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }
1071
1072 const std::string getAsStr(Attributor *) const override {
1073 std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
1074 raw_string_ostream OS(Buffer);
1075 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
1076 << ']';
1077 return OS.str();
1078 }
1079
1080 const char *getIdAddr() const override { return &ID; }
1081
1082 /// This function should return true if the type of the \p AA is
1083 /// AAAMDMaxNumWorkgroups
1084 static bool classof(const AbstractAttribute *AA) {
1085 return (AA->getIdAddr() == &ID);
1086 }
1087
1088 void trackStatistics() const override {}
1089
1090 /// Unique ID (due to the unique address)
1091 static const char ID;
1092};
1093
1094const char AAAMDMaxNumWorkgroups::ID = 0;
1095
1096AAAMDMaxNumWorkgroups &
1097AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
1098 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1099 return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
1100 llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
1101}
1102
1103/// Propagate amdgpu-waves-per-eu attribute.
1104struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1105 AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
1106 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}
1107
1108 void initialize(Attributor &A) override {
1109 Function *F = getAssociatedFunction();
1110 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1111
1112 // If the attribute exists, we will honor it if it is not the default.
1113 if (auto Attr = InfoCache.getWavesPerEUAttr(F: *F)) {
1114 std::pair<unsigned, unsigned> MaxWavesPerEURange{
1115 1U, InfoCache.getMaxWavesPerEU(F: *F)};
1116 if (*Attr != MaxWavesPerEURange) {
1117 auto [Min, Max] = *Attr;
1118 ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
1119 IntegerRangeState RangeState(Range);
1120 this->getState() = RangeState;
1121 indicateOptimisticFixpoint();
1122 return;
1123 }
1124 }
1125
1126 if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
1127 indicatePessimisticFixpoint();
1128 }
1129
1130 ChangeStatus updateImpl(Attributor &A) override {
1131 ChangeStatus Change = ChangeStatus::UNCHANGED;
1132
1133 auto CheckCallSite = [&](AbstractCallSite CS) {
1134 Function *Caller = CS.getInstruction()->getFunction();
1135 Function *Func = getAssociatedFunction();
1136 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
1137 << "->" << Func->getName() << '\n');
1138 (void)Func;
1139
1140 const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
1141 QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
1142 if (!CallerAA || !CallerAA->isValidState())
1143 return false;
1144
1145 ConstantRange Assumed = getAssumed();
1146 unsigned Min = std::max(a: Assumed.getLower().getZExtValue(),
1147 b: CallerAA->getAssumed().getLower().getZExtValue());
1148 unsigned Max = std::max(a: Assumed.getUpper().getZExtValue(),
1149 b: CallerAA->getAssumed().getUpper().getZExtValue());
1150 ConstantRange Range(APInt(32, Min), APInt(32, Max));
1151 IntegerRangeState RangeState(Range);
1152 getState() = RangeState;
1153 Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
1154 : ChangeStatus::CHANGED;
1155
1156 return true;
1157 };
1158
1159 bool AllCallSitesKnown = true;
1160 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this, RequireAllCallSites: true, UsedAssumedInformation&: AllCallSitesKnown))
1161 return indicatePessimisticFixpoint();
1162
1163 return Change;
1164 }
1165
1166 /// Create an abstract attribute view for the position \p IRP.
1167 static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
1168 Attributor &A);
1169
1170 ChangeStatus manifest(Attributor &A) override {
1171 Function *F = getAssociatedFunction();
1172 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1173 return emitAttributeIfNotDefaultAfterClamp(
1174 A, Default: {1U, InfoCache.getMaxWavesPerEU(F: *F)});
1175 }
1176
1177 /// See AbstractAttribute::getName()
1178 StringRef getName() const override { return "AAAMDWavesPerEU"; }
1179
1180 /// See AbstractAttribute::getIdAddr()
1181 const char *getIdAddr() const override { return &ID; }
1182
1183 /// This function should return true if the type of the \p AA is
1184 /// AAAMDWavesPerEU
1185 static bool classof(const AbstractAttribute *AA) {
1186 return (AA->getIdAddr() == &ID);
1187 }
1188
1189 /// Unique ID (due to the unique address)
1190 static const char ID;
1191};
1192
1193const char AAAMDWavesPerEU::ID = 0;
1194
1195AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
1196 Attributor &A) {
1197 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1198 return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
1199 llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
1200}
1201
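// Returns true if the inline asm references an AGPR, e.g. via an operand
// constraint written as "a" or "{a0}". The check is purely textual, so any
// constraint code beginning with 'a' (after an optional '{') is counted
// conservatively.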
1202static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
1203 for (const auto &CI : IA->ParseConstraints()) {
1204 for (StringRef Code : CI.Codes) {
1205 Code.consume_front(Prefix: "{");
1206 if (Code.starts_with(Prefix: "a"))
1207 return true;
1208 }
1209 }
1210
1211 return false;
1212}
1213
1214// TODO: Migrate to range merge of amdgpu-agpr-alloc.
1215// FIXME: Why is this using Attribute::NoUnwind?
1216struct AAAMDGPUNoAGPR
1217 : public IRAttribute<Attribute::NoUnwind,
1218 StateWrapper<BooleanState, AbstractAttribute>,
1219 AAAMDGPUNoAGPR> {
1220 AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
1221
1222 static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
1223 Attributor &A) {
1224 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1225 return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
1226 llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
1227 }
1228
1229 void initialize(Attributor &A) override {
1230 Function *F = getAssociatedFunction();
1231 auto [MinNumAGPR, MaxNumAGPR] =
1232 AMDGPU::getIntegerPairAttribute(F: *F, Name: "amdgpu-agpr-alloc", Default: {~0u, ~0u},
1233 /*OnlyFirstRequired=*/true);
1234 if (MinNumAGPR == 0)
1235 indicateOptimisticFixpoint();
1236 }
1237
1238 const std::string getAsStr(Attributor *A) const override {
1239 return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
1240 }
1241
1242 void trackStatistics() const override {}
1243
1244 ChangeStatus updateImpl(Attributor &A) override {
1245 // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
1246
1247 auto CheckForNoAGPRs = [&](Instruction &I) {
1248 const auto &CB = cast<CallBase>(Val&: I);
1249 const Value *CalleeOp = CB.getCalledOperand();
1250 const Function *Callee = dyn_cast<Function>(Val: CalleeOp);
1251 if (!Callee) {
1252 if (const InlineAsm *IA = dyn_cast<InlineAsm>(Val: CalleeOp))
1253 return !inlineAsmUsesAGPRs(IA);
1254 return false;
1255 }
1256
1257 // Some intrinsics may use AGPRs, but if we have a choice, we are not
1258 // required to use AGPRs.
1259 if (Callee->isIntrinsic())
1260 return true;
1261
1262 // TODO: Handle callsite attributes
1263 const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
1264 QueryingAA: *this, IRP: IRPosition::function(F: *Callee), DepClass: DepClassTy::REQUIRED);
1265 return CalleeInfo && CalleeInfo->isValidState() &&
1266 CalleeInfo->getAssumed();
1267 };
1268
1269 bool UsedAssumedInformation = false;
1270 if (!A.checkForAllCallLikeInstructions(Pred: CheckForNoAGPRs, QueryingAA: *this,
1271 UsedAssumedInformation))
1272 return indicatePessimisticFixpoint();
1273 return ChangeStatus::UNCHANGED;
1274 }
1275
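  // If no AGPR use was found, pin the allocation to zero AGPRs by emitting
  // "amdgpu-agpr-alloc"="0" on the function.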
1276 ChangeStatus manifest(Attributor &A) override {
1277 if (!getAssumed())
1278 return ChangeStatus::UNCHANGED;
1279 LLVMContext &Ctx = getAssociatedFunction()->getContext();
1280 return A.manifestAttrs(IRP: getIRPosition(),
1281 DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: "amdgpu-agpr-alloc", Val: "0")});
1282 }
1283
1284 StringRef getName() const override { return "AAAMDGPUNoAGPR"; }
1285 const char *getIdAddr() const override { return &ID; }
1286
  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUNoAGPR
1289 static bool classof(const AbstractAttribute *AA) {
1290 return (AA->getIdAddr() == &ID);
1291 }
1292
1293 static const char ID;
1294};
1295
1296const char AAAMDGPUNoAGPR::ID = 0;
1297
1298/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
1299/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
1300/// Both attributes start with narrow ranges that expand during iteration.
1301/// However, a narrower flat-workgroup-size leads to a wider waves-per-eu range,
1302/// preventing optimal updates later. Therefore, waves-per-eu can't be updated
1303/// with intermediate values during the attributor run. We defer the
1304/// finalization of waves-per-eu until after the flat-workgroup-size is
1305/// finalized.
1306/// TODO: Remove this and move similar logic back into the attributor run once
1307/// we have a better representation for waves-per-eu.
1308static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
1309 bool Changed = false;
1310
1311 LLVMContext &Ctx = M.getContext();
1312
1313 for (Function &F : M) {
1314 if (F.isDeclaration())
1315 continue;
1316
1317 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
1318
1319 std::optional<std::pair<unsigned, std::optional<unsigned>>>
1320 FlatWgrpSizeAttr =
1321 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-flat-work-group-size");
1322
1323 unsigned MinWavesPerEU = ST.getMinWavesPerEU();
1324 unsigned MaxWavesPerEU = ST.getMaxWavesPerEU();
1325
1326 unsigned MinFlatWgrpSize = ST.getMinFlatWorkGroupSize();
1327 unsigned MaxFlatWgrpSize = ST.getMaxFlatWorkGroupSize();
1328 if (FlatWgrpSizeAttr.has_value()) {
1329 MinFlatWgrpSize = FlatWgrpSizeAttr->first;
1330 MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second);
1331 }
1332
1333 // Start with the "best" range.
1334 unsigned Min = MinWavesPerEU;
1335 unsigned Max = MinWavesPerEU;
1336
    // Compute the range from the flat workgroup size. `getWavesPerEU` will
    // also account for the 'amdgpu-waves-per-eu' attribute.
1339 auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
1340 ST.getWavesPerEU(F, FlatWorkGroupSizes: {MinFlatWgrpSize, MaxFlatWgrpSize});
1341
1342 // For the lower bound, we have to "tighten" it.
1343 Min = std::max(a: Min, b: MinFromFlatWgrpSize);
1344 // For the upper bound, we have to "extend" it.
1345 Max = std::max(a: Max, b: MaxFromFlatWgrpSize);
1346
1347 // Clamp the range to the max range.
1348 Min = std::max(a: Min, b: MinWavesPerEU);
1349 Max = std::min(a: Max, b: MaxWavesPerEU);
1350
    // Update the attribute if the range is not the subtarget's full
    // [MinWavesPerEU, MaxWavesPerEU] range.
1352 if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
1353 SmallString<10> Buffer;
1354 raw_svector_ostream OS(Buffer);
1355 OS << Min << ',' << Max;
1356 Attribute OldAttr = F.getFnAttribute(Kind: "amdgpu-waves-per-eu");
1357 Attribute NewAttr = Attribute::get(Context&: Ctx, Kind: "amdgpu-waves-per-eu", Val: OS.str());
1358 F.addFnAttr(Attr: NewAttr);
      Changed |= OldAttr != NewAttr;
1360 }
1361 }
1362
1363 return Changed;
1364}
1365
1366static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1367 AMDGPUAttributorOptions Options,
1368 ThinOrFullLTOPhase LTOPhase) {
1369 SetVector<Function *> Functions;
1370 for (Function &F : M) {
1371 if (!F.isIntrinsic())
1372 Functions.insert(X: &F);
1373 }
1374
1375 CallGraphUpdater CGUpdater;
1376 BumpPtrAllocator Allocator;
1377 AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
1378 DenseSet<const char *> Allowed(
1379 {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
1380 &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
1381 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
1382 &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
1383 &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1384 &AAInstanceInfo::ID});
1385
1386 AttributorConfig AC(CGUpdater);
1387 AC.IsClosedWorldModule = Options.IsClosedWorld;
1388 AC.Allowed = &Allowed;
1389 AC.IsModulePass = true;
1390 AC.DefaultInitializeLiveInternals = false;
1391 AC.IndirectCalleeSpecializationCallback =
1392 [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1393 Function &Callee, unsigned NumAssumedCallees) {
1394 return !AMDGPU::isEntryFunctionCC(CC: Callee.getCallingConv()) &&
1395 (NumAssumedCallees <= IndirectCallSpecializationThreshold);
1396 };
1397 AC.IPOAmendableCB = [](const Function &F) {
1398 return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
1399 };
1400
1401 Attributor A(Functions, InfoCache, AC);
1402
1403 LLVM_DEBUG({
1404 StringRef LTOPhaseStr = to_string(LTOPhase);
1405 dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
1406 << "[AMDGPUAttributor] Module " << M.getName() << " is "
1407 << (AC.IsClosedWorldModule ? "" : "not ")
1408 << "assumed to be a closed world.\n";
1409 });
1410
1411 for (auto *F : Functions) {
1412 A.getOrCreateAAFor<AAAMDAttributes>(IRP: IRPosition::function(F: *F));
1413 A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRP: IRPosition::function(F: *F));
1414 A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRP: IRPosition::function(F: *F));
1415 A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRP: IRPosition::function(F: *F));
1416 CallingConv::ID CC = F->getCallingConv();
1417 if (!AMDGPU::isEntryFunctionCC(CC)) {
1418 A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRP: IRPosition::function(F: *F));
1419 A.getOrCreateAAFor<AAAMDWavesPerEU>(IRP: IRPosition::function(F: *F));
1420 }
1421
1422 for (auto &I : instructions(F)) {
1423 if (auto *LI = dyn_cast<LoadInst>(Val: &I)) {
1424 A.getOrCreateAAFor<AAAddressSpace>(
1425 IRP: IRPosition::value(V: *LI->getPointerOperand()));
1426 } else if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
1427 A.getOrCreateAAFor<AAAddressSpace>(
1428 IRP: IRPosition::value(V: *SI->getPointerOperand()));
1429 } else if (auto *RMW = dyn_cast<AtomicRMWInst>(Val: &I)) {
1430 A.getOrCreateAAFor<AAAddressSpace>(
1431 IRP: IRPosition::value(V: *RMW->getPointerOperand()));
1432 } else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Val: &I)) {
1433 A.getOrCreateAAFor<AAAddressSpace>(
1434 IRP: IRPosition::value(V: *CmpX->getPointerOperand()));
1435 }
1436 }
1437 }
1438
1439 bool Changed = A.run() == ChangeStatus::CHANGED;
1440
1441 Changed |= updateWavesPerEU(M, TM);
1442
1443 return Changed;
1444}
1445} // namespace
1446
1447PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
1448 ModuleAnalysisManager &AM) {
1449
1450 FunctionAnalysisManager &FAM =
1451 AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
1452 AnalysisGetter AG(FAM);
1453
1454 // TODO: Probably preserves CFG
1455 return runImpl(M, AG, TM, Options, LTOPhase) ? PreservedAnalyses::none()
1456 : PreservedAnalyses::all();
1457}
1458