AMDGPUAttributor.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp]

1	//===- AMDGPUAttributor.cpp -----------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "AMDGPU.h"
14	#include "AMDGPUTargetMachine.h"
15	#include "GCNSubtarget.h"
16	#include "Utils/AMDGPUBaseInfo.h"
17	#include "llvm/IR/IntrinsicsAMDGPU.h"
18	#include "llvm/IR/IntrinsicsR600.h"
19	#include "llvm/Target/TargetMachine.h"
20	#include "llvm/Transforms/IPO/Attributor.h"
21	#include <cstdint>
22
23	#define DEBUG_TYPE "amdgpu-attributor"
24
25	using namespace llvm;
26
27	static cl::opt<unsigned> IndirectCallSpecializationThreshold(
28	"amdgpu-indirect-call-specialization-threshold",
29	cl::desc(
30	"A threshold controls whether an indirect call will be specialized"),
31	cl::init(Val: `3`));
32
33	#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
34
35	enum ImplicitArgumentPositions {
36	#include "AMDGPUAttributes.def"
37	LAST_ARG_POS
38	};
39
40	#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
41
42	enum ImplicitArgumentMask {
43	UNKNOWN_INTRINSIC = `0`,
44	#include "AMDGPUAttributes.def"
45	ALL_ARGUMENT_MASK = (`1` << LAST_ARG_POS) - `1`,
46	NOT_IMPLICIT_INPUT
47	};
48
49	#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
50	static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
51	ImplicitAttrs[] = {
52	#include "AMDGPUAttributes.def"
53	};
54
55	// We do not need to note the x workitem or workgroup id because they are always
56	// initialized.
57	//
58	// TODO: We should not add the attributes if the known compile time workgroup
59	// size is 1 for y/z.
60	static ImplicitArgumentMask
61	intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
62	bool HasApertureRegs, bool SupportsGetDoorBellID,
63	unsigned CodeObjectVersion) {
64	switch (ID) {
65	case Intrinsic::amdgcn_workitem_id_x:
66	NonKernelOnly = true;
67	return WORKITEM_ID_X;
68	case Intrinsic::amdgcn_workgroup_id_x:
69	NonKernelOnly = true;
70	return WORKGROUP_ID_X;
71	case Intrinsic::amdgcn_workitem_id_y:
72	case Intrinsic::r600_read_tidig_y:
73	return WORKITEM_ID_Y;
74	case Intrinsic::amdgcn_workitem_id_z:
75	case Intrinsic::r600_read_tidig_z:
76	return WORKITEM_ID_Z;
77	case Intrinsic::amdgcn_workgroup_id_y:
78	case Intrinsic::r600_read_tgid_y:
79	return WORKGROUP_ID_Y;
80	case Intrinsic::amdgcn_workgroup_id_z:
81	case Intrinsic::r600_read_tgid_z:
82	return WORKGROUP_ID_Z;
83	case Intrinsic::amdgcn_cluster_id_x:
84	NonKernelOnly = true;
85	return CLUSTER_ID_X;
86	case Intrinsic::amdgcn_cluster_id_y:
87	return CLUSTER_ID_Y;
88	case Intrinsic::amdgcn_cluster_id_z:
89	return CLUSTER_ID_Z;
90	case Intrinsic::amdgcn_lds_kernel_id:
91	return LDS_KERNEL_ID;
92	case Intrinsic::amdgcn_dispatch_ptr:
93	return DISPATCH_PTR;
94	case Intrinsic::amdgcn_dispatch_id:
95	return DISPATCH_ID;
96	case Intrinsic::amdgcn_implicitarg_ptr:
97	return IMPLICIT_ARG_PTR;
98	// Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
99	// queue_ptr.
100	case Intrinsic::amdgcn_queue_ptr:
101	NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
102	return QUEUE_PTR;
103	case Intrinsic::amdgcn_is_shared:
104	case Intrinsic::amdgcn_is_private:
105	if (HasApertureRegs)
106	return NOT_IMPLICIT_INPUT;
107	// Under V5, we need implicitarg_ptr + offsets to access private_base or
108	// shared_base. For pre-V5, however, need to access them through queue_ptr +
109	// offsets.
110	return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
111	: QUEUE_PTR;
112	case Intrinsic::amdgcn_wwm:
113	case Intrinsic::amdgcn_strict_wwm:
114	return WHOLE_WAVE_MODE;
115	case Intrinsic::trap:
116	case Intrinsic::debugtrap:
117	case Intrinsic::ubsantrap:
118	if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
119	return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
120	: QUEUE_PTR;
121	NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
122	return QUEUE_PTR;
123	default:
124	return UNKNOWN_INTRINSIC;
125	}
126	}
127
128	static bool castRequiresQueuePtr(unsigned SrcAS) {
129	return SrcAS == AMDGPUAS::LOCAL_ADDRESS \|\| SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
130	}
131
132	static bool isDSAddress(const Constant *C) {
133	const GlobalValue *GV = dyn_cast<GlobalValue>(Val: C);
134	if (!GV)
135	return false;
136	unsigned AS = GV->getAddressSpace();
137	return AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::REGION_ADDRESS;
138	}
139
140	/// Returns true if sanitizer attributes are present on a function.
141	static bool hasSanitizerAttributes(const Function &F) {
142	return F.hasFnAttribute(Kind: Attribute::SanitizeAddress) \|\|
143	F.hasFnAttribute(Kind: Attribute::SanitizeThread) \|\|
144	F.hasFnAttribute(Kind: Attribute::SanitizeMemory) \|\|
145	F.hasFnAttribute(Kind: Attribute::SanitizeHWAddress) \|\|
146	F.hasFnAttribute(Kind: Attribute::SanitizeMemTag);
147	}
148
149	namespace {
150	class AMDGPUInformationCache : public InformationCache {
151	public:
152	AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
153	BumpPtrAllocator &Allocator,
154	SetVector<Function > CGSCC, TargetMachine &TM)
155	: InformationCache (M, AG, Allocator, CGSCC), TM(TM),
156	CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}
157
158	TargetMachine &TM;
159
160	enum ConstantStatus : uint8_t {
161	NONE = `0`,
162	DS_GLOBAL = `1` << `0`,
163	ADDR_SPACE_CAST_PRIVATE_TO_FLAT = `1` << `1`,
164	ADDR_SPACE_CAST_LOCAL_TO_FLAT = `1` << `2`,
165	ADDR_SPACE_CAST_BOTH_TO_FLAT =
166	ADDR_SPACE_CAST_PRIVATE_TO_FLAT \| ADDR_SPACE_CAST_LOCAL_TO_FLAT,
167	CS_WORST = DS_GLOBAL \| ADDR_SPACE_CAST_BOTH_TO_FLAT,
168	};
169
170	/// Check if the subtarget has aperture regs.
171	bool hasApertureRegs(Function &F) {
172	const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
173	return ST.hasApertureRegs();
174	}
175
176	/// Check if the subtarget supports GetDoorbellID.
177	bool supportsGetDoorbellID(Function &F) {
178	const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
179	return ST.supportsGetDoorbellID();
180	}
181
182	std::optional<std::pair<unsigned, unsigned>>
183	getFlatWorkGroupSizeAttr(const Function &F) const {
184	auto R = AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-flat-work-group-size");
185	if (!R)
186	return std::nullopt;
187	return std::make_pair(x&: R ->first, y&: *(R ->second));
188	}
189
190	std::pair<unsigned, unsigned>
191	getDefaultFlatWorkGroupSize(const Function &F) const {
192	const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
193	return ST.getDefaultFlatWorkGroupSize(CC: F.getCallingConv());
194	}
195
196	std::pair<unsigned, unsigned>
197	getMaximumFlatWorkGroupRange(const Function &F) {
198	const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
199	return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
200	}
201
202	/// Get code object version.
203	unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
204
205	std::optional<std::pair<unsigned, unsigned>>
206	getWavesPerEUAttr(const Function &F) {
207	auto Val = AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu",
208	/OnlyFirstRequired=/true);
209	if (!Val)
210	return std::nullopt;
211	if (!Val ->second) {
212	const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
213	Val ->second = ST.getMaxWavesPerEU();
214	}
215	return std::make_pair(x&: Val ->first, y&: *(Val ->second));
216	}
217
218	unsigned getMaxWavesPerEU(const Function &F) {
219	const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
220	return ST.getMaxWavesPerEU();
221	}
222
223	unsigned getMaxAddrSpace() const override {
224	return AMDGPUAS::MAX_AMDGPU_ADDRESS;
225	}
226
227	private:
228	/// Check if the ConstantExpr \p CE uses an addrspacecast from private or
229	/// local to flat. These casts may require the queue pointer.
230	static uint8_t visitConstExpr(const ConstantExpr *CE) {
231	uint8_t Status = NONE;
232
233	if (CE->getOpcode() == Instruction::AddrSpaceCast) {
234	unsigned SrcAS = CE->getOperand(i_nocapture: `0`)->getType()->getPointerAddressSpace();
235	if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
236	Status \|= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
237	else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
238	Status \|= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
239	}
240
241	return Status;
242	}
243
244	/// Get the constant access bitmap for \p C.
245	uint8_t getConstantAccess(const Constant *C) {
246	const auto &It = ConstantStatus.find(Val: C);
247	if (It != ConstantStatus.end())
248	return It ->second.value();
249
250	SmallPtrSet<const Constant *, `8`> Visited;
251	SmallVector<const Constant *> Worklist;
252	Worklist.push_back(Elt: C);
253	Visited.insert(Ptr: C);
254
255	uint8_t Result = `0`;
256	while (Result != CS_WORST && !Worklist.empty()) {
257	const Constant *CurC = Worklist.pop_back_val();
258
259	std::optional<uint8_t> &CurCResultOrNone = ConstantStatus [CurC];
260	if (CurCResultOrNone) {
261	Result \|= CurCResultOrNone.value();
262	continue;
263	}
264	uint8_t CurCResult = `0`;
265
266	if (isDSAddress(C: CurC))
267	CurCResult \|= DS_GLOBAL;
268
269	if (const auto *CE = dyn_cast<ConstantExpr>(Val: CurC))
270	CurCResult \|= visitConstExpr(CE);
271
272	for (const Use &U : CurC->operands()) {
273	if (const auto *OpC = dyn_cast<Constant>(Val: U)) {
274	if (Visited.insert(Ptr: OpC).second)
275	Worklist.push_back(Elt: OpC);
276	}
277	}
278
279	CurCResultOrNone = CurCResult;
280	Result \|= CurCResult;
281	}
282
283	ConstantStatus [C] = Result;
284	return Result;
285	}
286
287	public:
288	/// Returns true if \p Fn needs the queue pointer because of \p C.
289	bool needsQueuePtr(const Constant *C, Function &Fn) {
290	bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: Fn.getCallingConv());
291	bool HasAperture = hasApertureRegs(F&: Fn);
292
293	// No need to explore the constants.
294	if (!IsNonEntryFunc && HasAperture)
295	return false;
296
297	uint8_t Access = getConstantAccess(C);
298
299	// We need to trap on DS globals in non-entry functions.
300	if (IsNonEntryFunc && (Access & DS_GLOBAL))
301	return true;
302
303	return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
304	}
305
306	bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
307	uint8_t Access = getConstantAccess(C);
308	return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
309	}
310
311	private:
312	/// Used to determine if the Constant needs the queue pointer.
313	DenseMap<const Constant *, std::optional<uint8_t>> ConstantStatus;
314	const unsigned CodeObjectVersion;
315	};
316
317	struct AAAMDAttributes
318	: public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, `0`>,
319	AbstractAttribute> {
320	using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, `0`>,
321	AbstractAttribute>;
322
323	AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base (IRP) {}
324
325	/// Create an abstract attribute view for the position \p IRP.
326	static AAAMDAttributes &createForPosition(const IRPosition &IRP,
327	Attributor &A);
328
329	/// See AbstractAttribute::getName().
330	StringRef getName() const override { return "AAAMDAttributes"; }
331
332	/// See AbstractAttribute::getIdAddr().
333	const char getIdAddr() const* override { return &ID; }
334
335	/// This function should return true if the type of the \p AA is
336	/// AAAMDAttributes.
337	static bool classof(const AbstractAttribute *AA) {
338	return (AA->getIdAddr() == &ID);
339	}
340
341	/// Unique ID (due to the unique address)
342	static const char ID;
343	};
344	const char AAAMDAttributes::ID = `0`;
345
346	struct AAUniformWorkGroupSize
347	: public StateWrapper<BooleanState, AbstractAttribute> {
348	using Base = StateWrapper<BooleanState, AbstractAttribute>;
349	AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base (IRP) {}
350
351	/// Create an abstract attribute view for the position \p IRP.
352	static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
353	Attributor &A);
354
355	/// See AbstractAttribute::getName().
356	StringRef getName() const override { return "AAUniformWorkGroupSize"; }
357
358	/// See AbstractAttribute::getIdAddr().
359	const char getIdAddr() const* override { return &ID; }
360
361	/// This function should return true if the type of the \p AA is
362	/// AAAMDAttributes.
363	static bool classof(const AbstractAttribute *AA) {
364	return (AA->getIdAddr() == &ID);
365	}
366
367	/// Unique ID (due to the unique address)
368	static const char ID;
369	};
370	const char AAUniformWorkGroupSize::ID = `0`;
371
372	struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
373	AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
374	: AAUniformWorkGroupSize (IRP, A) {}
375
376	void initialize(Attributor &A) override {
377	Function *F = getAssociatedFunction();
378	CallingConv::ID CC = F->getCallingConv();
379
380	if (CC != CallingConv::AMDGPU_KERNEL)
381	return;
382
383	bool InitialValue = F->hasFnAttribute(Kind: "uniform-work-group-size");
384
385	if (InitialValue)
386	indicateOptimisticFixpoint();
387	else
388	indicatePessimisticFixpoint();
389	}
390
391	ChangeStatus updateImpl(Attributor &A) override {
392	ChangeStatus Change = ChangeStatus::UNCHANGED;
393
394	auto CheckCallSite = [&](AbstractCallSite CS) {
395	Function *Caller = CS.getInstruction()->getFunction();
396	LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
397	<< "->" << getAssociatedFunction()->getName() << "\n");
398
399	const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
400	QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
401	if (!CallerInfo \|\| !CallerInfo->isValidState())
402	return false;
403
404	Change = Change \| clampStateAndIndicateChange(S&: this->getState(),
405	R: CallerInfo->getState());
406
407	return true;
408	};
409
410	bool AllCallSitesKnown = true;
411	if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this, RequireAllCallSites: true, UsedAssumedInformation&: AllCallSitesKnown))
412	return indicatePessimisticFixpoint();
413
414	return Change;
415	}
416
417	ChangeStatus manifest(Attributor &A) override {
418	if (!getAssumed())
419	return ChangeStatus::UNCHANGED;
420
421	LLVMContext &Ctx = getAssociatedFunction()->getContext();
422	return A.manifestAttrs(IRP: getIRPosition(),
423	DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: "uniform-work-group-size")},
424	/ForceReplace=/true);
425	}
426
427	bool isValidState() const override {
428	// This state is always valid, even when the state is false.
429	return true;
430	}
431
432	const std::string getAsStr(Attributor ) const* override {
433	return "AMDWorkGroupSize[" + std::to_string(val: getAssumed()) + "]";
434	}
435
436	/// See AbstractAttribute::trackStatistics()
437	void trackStatistics() const override {}
438	};
439
440	AAUniformWorkGroupSize &
441	AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
442	Attributor &A) {
443	if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
444	return *new (A.Allocator) AAUniformWorkGroupSizeFunction (IRP, A);
445	llvm_unreachable(
446	"AAUniformWorkGroupSize is only valid for function position");
447	}
448
449	struct AAAMDAttributesFunction : public AAAMDAttributes {
450	AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
451	: AAAMDAttributes (IRP, A) {}
452
453	void initialize(Attributor &A) override {
454	Function *F = getAssociatedFunction();
455
456	// If the function requires the implicit arg pointer due to sanitizers,
457	// assume it's needed even if explicitly marked as not requiring it.
458	// Flat scratch initialization is needed because `asan_malloc_impl`
459	// calls introduced later in pipeline will have flat scratch accesses.
460	// FIXME: FLAT_SCRATCH_INIT will not be required here if device-libs
461	// implementation for `asan_malloc_impl` is updated.
462	const bool HasSanitizerAttrs = hasSanitizerAttributes(F: *F);
463	if (HasSanitizerAttrs) {
464	removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
465	removeAssumedBits(BitsEncoding: HOSTCALL_PTR);
466	removeAssumedBits(BitsEncoding: FLAT_SCRATCH_INIT);
467	}
468
469	for (auto Attr : ImplicitAttrs) {
470	if (HasSanitizerAttrs &&
471	(Attr.first == IMPLICIT_ARG_PTR \|\| Attr.first == HOSTCALL_PTR \|\|
472	Attr.first == FLAT_SCRATCH_INIT))
473	continue;
474
475	if (F->hasFnAttribute(Kind: Attr.second))
476	addKnownBits(Bits: Attr.first);
477	}
478
479	if (F->isDeclaration())
480	return;
481
482	// Ignore functions with graphics calling conventions, these are currently
483	// not allowed to have kernel arguments.
484	if (AMDGPU::isGraphics(CC: F->getCallingConv())) {
485	indicatePessimisticFixpoint();
486	return;
487	}
488	}
489
490	ChangeStatus updateImpl(Attributor &A) override {
491	Function *F = getAssociatedFunction();
492	// The current assumed state used to determine a change.
493	auto OrigAssumed = getAssumed();
494
495	// Check for Intrinsics and propagate attributes.
496	const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
497	QueryingAA: *this, IRP: this->getIRPosition(), DepClass: DepClassTy::REQUIRED);
498	if (!AAEdges \|\| !AAEdges->isValidState() \|\|
499	AAEdges->hasNonAsmUnknownCallee())
500	return indicatePessimisticFixpoint();
501
502	bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
503
504	bool NeedsImplicit = false;
505	auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
506	bool HasApertureRegs = InfoCache.hasApertureRegs(F&: *F);
507	bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(F&: *F);
508	unsigned COV = InfoCache.getCodeObjectVersion();
509
510	for (Function *Callee : AAEdges->getOptimisticEdges()) {
511	Intrinsic::ID IID = Callee->getIntrinsicID();
512	if (IID == Intrinsic::not_intrinsic) {
513	const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
514	QueryingAA: *this, IRP: IRPosition::function(F: *Callee), DepClass: DepClassTy::REQUIRED);
515	if (!AAAMD \|\| !AAAMD->isValidState())
516	return indicatePessimisticFixpoint();
517	*this &= *AAAMD;
518	continue;
519	}
520
521	bool NonKernelOnly = false;
522	ImplicitArgumentMask AttrMask =
523	intrinsicToAttrMask(ID: IID, NonKernelOnly, NeedsImplicit,
524	HasApertureRegs, SupportsGetDoorBellID: SupportsGetDoorbellID, CodeObjectVersion: COV);
525
526	if (AttrMask == UNKNOWN_INTRINSIC) {
527	// Assume not-nocallback intrinsics may invoke a function which accesses
528	// implicit arguments.
529	//
530	// FIXME: This isn't really the correct check. We want to ensure it
531	// isn't calling any function that may use implicit arguments regardless
532	// of whether it's internal to the module or not.
533	//
534	// TODO: Ignoring callsite attributes.
535	if (!Callee->hasFnAttribute(Kind: Attribute::NoCallback))
536	return indicatePessimisticFixpoint();
537	continue;
538	}
539
540	if (AttrMask != NOT_IMPLICIT_INPUT) {
541	if ((IsNonEntryFunc \|\| !NonKernelOnly))
542	removeAssumedBits(BitsEncoding: AttrMask);
543	}
544	}
545
546	// Need implicitarg_ptr to acess queue_ptr, private_base, and shared_base.
547	if (NeedsImplicit)
548	removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
549
550	if (isAssumed(BitsEncoding: QUEUE_PTR) && checkForQueuePtr(A)) {
551	// Under V5, we need implicitarg_ptr + offsets to access private_base or
552	// shared_base. We do not actually need queue_ptr.
553	if (COV >= `5`)
554	removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
555	else
556	removeAssumedBits(BitsEncoding: QUEUE_PTR);
557	}
558
559	if (funcRetrievesMultigridSyncArg(A, COV)) {
560	assert(!isAssumed(IMPLICIT_ARG_PTR) &&
561	"multigrid_sync_arg needs implicitarg_ptr");
562	removeAssumedBits(BitsEncoding: MULTIGRID_SYNC_ARG);
563	}
564
565	if (funcRetrievesHostcallPtr(A, COV)) {
566	assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
567	removeAssumedBits(BitsEncoding: HOSTCALL_PTR);
568	}
569
570	if (funcRetrievesHeapPtr(A, COV)) {
571	assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
572	removeAssumedBits(BitsEncoding: HEAP_PTR);
573	}
574
575	if (isAssumed(BitsEncoding: QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
576	assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
577	removeAssumedBits(BitsEncoding: QUEUE_PTR);
578	}
579
580	if (isAssumed(BitsEncoding: LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
581	removeAssumedBits(BitsEncoding: LDS_KERNEL_ID);
582	}
583
584	if (isAssumed(BitsEncoding: DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
585	removeAssumedBits(BitsEncoding: DEFAULT_QUEUE);
586
587	if (isAssumed(BitsEncoding: COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
588	removeAssumedBits(BitsEncoding: COMPLETION_ACTION);
589
590	if (isAssumed(BitsEncoding: FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
591	removeAssumedBits(BitsEncoding: FLAT_SCRATCH_INIT);
592
593	return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
594	: ChangeStatus::UNCHANGED;
595	}
596
597	ChangeStatus manifest(Attributor &A) override {
598	SmallVector<Attribute, `8`> AttrList;
599	LLVMContext &Ctx = getAssociatedFunction()->getContext();
600
601	for (auto Attr : ImplicitAttrs) {
602	if (isKnown(BitsEncoding: Attr.first))
603	AttrList.push_back(Elt: Attribute::get(Context&: Ctx, Kind: Attr.second));
604	}
605
606	return A.manifestAttrs(IRP: getIRPosition(), DeducedAttrs: AttrList,
607	/ ForceReplace / true);
608	}
609
610	const std::string getAsStr(Attributor ) const* override {
611	std::string Str;
612	raw_string_ostream OS(Str);
613	OS << "AMDInfo[";
614	for (auto Attr : ImplicitAttrs)
615	if (isAssumed(BitsEncoding: Attr.first))
616	OS << `' '` << Attr.second;
617	OS << " ]";
618	return OS.str();
619	}
620
621	/// See AbstractAttribute::trackStatistics()
622	void trackStatistics() const override {}
623
624	private:
625	bool checkForQueuePtr(Attributor &A) {
626	Function *F = getAssociatedFunction();
627	bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
628
629	auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
630
631	bool NeedsQueuePtr = false;
632
633	auto CheckAddrSpaceCasts = [&](Instruction &I) {
634	unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
635	if (castRequiresQueuePtr(SrcAS)) {
636	NeedsQueuePtr = true;
637	return false;
638	}
639	return true;
640	};
641
642	bool HasApertureRegs = InfoCache.hasApertureRegs(F&: *F);
643
644	// `checkForAllInstructions` is much more cheaper than going through all
645	// instructions, try it first.
646
647	// The queue pointer is not needed if aperture regs is present.
648	if (!HasApertureRegs) {
649	bool UsedAssumedInformation = false;
650	A.checkForAllInstructions(Pred: CheckAddrSpaceCasts, QueryingAA: *this,
651	Opcodes: {Instruction::AddrSpaceCast},
652	UsedAssumedInformation);
653	}
654
655	// If we found that we need the queue pointer, nothing else to do.
656	if (NeedsQueuePtr)
657	return true;
658
659	if (!IsNonEntryFunc && HasApertureRegs)
660	return false;
661
662	for (BasicBlock &BB : *F) {
663	for (Instruction &I : BB) {
664	for (const Use &U : I.operands()) {
665	if (const auto *C = dyn_cast<Constant>(Val: U)) {
666	if (InfoCache.needsQueuePtr(C, Fn&: *F))
667	return true;
668	}
669	}
670	}
671	}
672
673	return false;
674	}
675
676	bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
677	auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
678	AA::RangeTy Range(Pos, `8`);
679	return funcRetrievesImplicitKernelArg(A, Range);
680	}
681
682	bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
683	auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
684	AA::RangeTy Range(Pos, `8`);
685	return funcRetrievesImplicitKernelArg(A, Range);
686	}
687
688	bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
689	auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
690	AA::RangeTy Range(Pos, `8`);
691	return funcRetrievesImplicitKernelArg(A, Range);
692	}
693
694	bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
695	auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
696	AA::RangeTy Range(Pos, `8`);
697	return funcRetrievesImplicitKernelArg(A, Range);
698	}
699
700	bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
701	if (COV < `5`)
702	return false;
703	AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, `8`);
704	return funcRetrievesImplicitKernelArg(A, Range);
705	}
706
707	bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
708	if (COV < `5`)
709	return false;
710	AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, `8`);
711	return funcRetrievesImplicitKernelArg(A, Range);
712	}
713
714	bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
715	// Check if this is a call to the implicitarg_ptr builtin and it
716	// is used to retrieve the hostcall pointer. The implicit arg for
717	// hostcall is not used only if every use of the implicitarg_ptr
718	// is a load that clearly does not retrieve any byte of the
719	// hostcall pointer. We check this by tracing all the uses of the
720	// initial call to the implicitarg_ptr intrinsic.
721	auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
722	auto &Call = cast<CallBase>(Val&: I);
723	if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
724	return true;
725
726	const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
727	QueryingAA: *this, IRP: IRPosition::callsite_returned(CB: Call), DepClass: DepClassTy::REQUIRED);
728	if (!PointerInfoAA \|\| !PointerInfoAA->getState().isValidState())
729	return false;
730
731	return PointerInfoAA->forallInterferingAccesses(
732	Range, CB: [](const AAPointerInfo::Access &Acc, bool IsExact) {
733	return Acc.getRemoteInst()->isDroppable();
734	});
735	};
736
737	bool UsedAssumedInformation = false;
738	return !A.checkForAllCallLikeInstructions(Pred: DoesNotLeadToKernelArgLoc, QueryingAA: *this,
739	UsedAssumedInformation);
740	}
741
742	bool funcRetrievesLDSKernelId(Attributor &A) {
743	auto DoesNotRetrieve = [&](Instruction &I) {
744	auto &Call = cast<CallBase>(Val&: I);
745	return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
746	};
747	bool UsedAssumedInformation = false;
748	return !A.checkForAllCallLikeInstructions(Pred: DoesNotRetrieve, QueryingAA: *this,
749	UsedAssumedInformation);
750	}
751
752	// Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
753	// not to be set.
754	bool needFlatScratchInit(Attributor &A) {
755	assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
756
757	// Check all AddrSpaceCast instructions. FlatScratchInit is needed if
758	// there is a cast from PRIVATE_ADDRESS.
759	auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
760	return cast<AddrSpaceCastInst>(Val&: I).getSrcAddressSpace() !=
761	AMDGPUAS::PRIVATE_ADDRESS;
762	};
763
764	bool UsedAssumedInformation = false;
765	if (!A.checkForAllInstructions(Pred: AddrSpaceCastNotFromPrivate, QueryingAA: *this,
766	Opcodes: {Instruction::AddrSpaceCast},
767	UsedAssumedInformation))
768	return true;
769
770	// Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
771	auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
772
773	Function *F = getAssociatedFunction();
774	for (Instruction &I : instructions(F)) {
775	for (const Use &U : I.operands()) {
776	if (const auto *C = dyn_cast<Constant>(Val: U)) {
777	if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
778	return true;
779	}
780	}
781	}
782
783	// Finally check callees.
784
785	// This is called on each callee; false means callee shouldn't have
786	// no-flat-scratch-init.
787	auto CheckForNoFlatScratchInit = [&](Instruction &I) {
788	const auto &CB = cast<CallBase>(Val&: I);
789	const Function *Callee = CB.getCalledFunction();
790
791	// Callee == 0 for inline asm or indirect call with known callees.
792	// In the latter case, updateImpl() already checked the callees and we
793	// know their FLAT_SCRATCH_INIT bit is set.
794	// If function has indirect call with unknown callees, the bit is
795	// already removed in updateImpl() and execution won't reach here.
796	if (!Callee)
797	return true;
798
799	return Callee->getIntrinsicID() !=
800	Intrinsic::amdgcn_addrspacecast_nonnull;
801	};
802
803	UsedAssumedInformation = false;
804	// If any callee is false (i.e. need FlatScratchInit),
805	// checkForAllCallLikeInstructions returns false, in which case this
806	// function returns true.
807	return !A.checkForAllCallLikeInstructions(Pred: CheckForNoFlatScratchInit, QueryingAA: *this,
808	UsedAssumedInformation);
809	}
810	};
811
812	AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
813	Attributor &A) {
814	if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
815	return *new (A.Allocator) AAAMDAttributesFunction (IRP, A);
816	llvm_unreachable("AAAMDAttributes is only valid for function position");
817	}
818
819	/// Base class to derive different size ranges.
820	struct AAAMDSizeRangeAttribute
821	: public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
822	using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
823
824	StringRef AttrName;
825
826	AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
827	StringRef AttrName)
828	: Base (IRP, `32`), AttrName (AttrName) {}
829
830	/// See AbstractAttribute::trackStatistics()
831	void trackStatistics() const override {}
832
833	template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
834	ChangeStatus Change = ChangeStatus::UNCHANGED;
835
836	auto CheckCallSite = [&](AbstractCallSite CS) {
837	Function *Caller = CS.getInstruction()->getFunction();
838	LLVM_DEBUG(dbgs() << `'['` << getName() << "] Call " << Caller->getName()
839	<< "->" << getAssociatedFunction()->getName() << `'\n'`);
840
841	const auto *CallerInfo = A.getAAFor<AttributeImpl>(
842	*this, IRPosition::function(F: *Caller), DepClassTy::REQUIRED);
843	if (!CallerInfo \|\| !CallerInfo->isValidState())
844	return false;
845
846	Change \|=
847	clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
848
849	return true;
850	};
851
852	bool AllCallSitesKnown = true;
853	if (!A.checkForAllCallSites(CheckCallSite, *this,
854	/RequireAllCallSites=/true,
855	AllCallSitesKnown))
856	return indicatePessimisticFixpoint();
857
858	return Change;
859	}
860
861	/// Clamp the assumed range to the default value ([Min, Max]) and emit the
862	/// attribute if it is not same as default.
863	ChangeStatus
864	emitAttributeIfNotDefaultAfterClamp(Attributor &A,
865	std::pair<unsigned, unsigned> Default) {
866	auto [Min, Max] = Default;
867	unsigned Lower = getAssumed().getLower().getZExtValue();
868	unsigned Upper = getAssumed().getUpper().getZExtValue();
869
870	// Clamp the range to the default value.
871	if (Lower < Min)
872	Lower = Min;
873	if (Upper > Max + `1`)
874	Upper = Max + `1`;
875
876	// No manifest if the value is invalid or same as default after clamp.
877	if ((Lower == Min && Upper == Max + `1`) \|\| (Upper < Lower))
878	return ChangeStatus::UNCHANGED;
879
880	Function *F = getAssociatedFunction();
881	LLVMContext &Ctx = F->getContext();
882	SmallString<`10`> Buffer;
883	raw_svector_ostream OS(Buffer);
884	OS << Lower << `','` << Upper - `1`;
885	return A.manifestAttrs(IRP: getIRPosition(),
886	DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: AttrName, Val: OS.str())},
887	/ForceReplace=/true);
888	}
889
890	const std::string getAsStr(Attributor ) const* override {
891	std::string Str;
892	raw_string_ostream OS(Str);
893	OS << getName() << `'['`;
894	OS << getAssumed().getLower() << `','` << getAssumed().getUpper() - `1`;
895	OS << `']'`;
896	return OS.str();
897	}
898	};
899
900	/// Propagate amdgpu-flat-work-group-size attribute.
901	struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
902	AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
903	: AAAMDSizeRangeAttribute (IRP, A, "amdgpu-flat-work-group-size") {}
904
905	void initialize(Attributor &A) override {
906	Function *F = getAssociatedFunction();
907	auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
908
909	bool HasAttr = false;
910	auto Range = InfoCache.getDefaultFlatWorkGroupSize(F: *F);
911	auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(F: *F);
912
913	if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(F: *F)) {
914	// We only consider an attribute that is not max range because the front
915	// end always emits the attribute, unfortunately, and sometimes it emits
916	// the max range.
917	if (*Attr != MaxRange) {
918	Range = *Attr;
919	HasAttr = true;
920	}
921	}
922
923	// We don't want to directly clamp the state if it's the max range because
924	// that is basically the worst state.
925	if (Range == MaxRange)
926	return;
927
928	auto [Min, Max] = Range;
929	ConstantRange CR(APInt (`32`, Min), APInt (`32`, Max + `1`));
930	IntegerRangeState IRS(CR);
931	clampStateAndIndicateChange(S&: this->getState(), R: IRS);
932
933	if (HasAttr \|\| AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
934	indicateOptimisticFixpoint();
935	}
936
937	ChangeStatus updateImpl(Attributor &A) override {
938	return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
939	}
940
941	/// Create an abstract attribute view for the position \p IRP.
942	static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
943	Attributor &A);
944
945	ChangeStatus manifest(Attributor &A) override {
946	Function *F = getAssociatedFunction();
947	auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
948	return emitAttributeIfNotDefaultAfterClamp(
949	A, Default: InfoCache.getMaximumFlatWorkGroupRange(F: *F));
950	}
951
952	/// See AbstractAttribute::getName()
953	StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }
954
955	/// See AbstractAttribute::getIdAddr()
956	const char getIdAddr() const* override { return &ID; }
957
958	/// This function should return true if the type of the \p AA is
959	/// AAAMDFlatWorkGroupSize
960	static bool classof(const AbstractAttribute *AA) {
961	return (AA->getIdAddr() == &ID);
962	}
963
964	/// Unique ID (due to the unique address)
965	static const char ID;
966	};
967
968	const char AAAMDFlatWorkGroupSize::ID = `0`;
969
970	AAAMDFlatWorkGroupSize &
971	AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
972	Attributor &A) {
973	if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
974	return *new (A.Allocator) AAAMDFlatWorkGroupSize (IRP, A);
975	llvm_unreachable(
976	"AAAMDFlatWorkGroupSize is only valid for function position");
977	}
978
979	struct TupleDecIntegerRangeState : public AbstractState {
980	DecIntegerState<uint32_t> X, Y, Z;
981
982	bool isValidState() const override {
983	return X.isValidState() && Y.isValidState() && Z.isValidState();
984	}
985
986	bool isAtFixpoint() const override {
987	return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
988	}
989
990	ChangeStatus indicateOptimisticFixpoint() override {
991	return X.indicateOptimisticFixpoint() \| Y.indicateOptimisticFixpoint() \|
992	Z.indicateOptimisticFixpoint();
993	}
994
995	ChangeStatus indicatePessimisticFixpoint() override {
996	return X.indicatePessimisticFixpoint() \| Y.indicatePessimisticFixpoint() \|
997	Z.indicatePessimisticFixpoint();
998	}
999
1000	TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
1001	X ^= Other.X;
1002	Y ^= Other.Y;
1003	Z ^= Other.Z;
1004	return *this;
1005	}
1006
1007	bool operator==(const TupleDecIntegerRangeState &Other) const {
1008	return X == Other.X && Y == Other.Y && Z == Other.Z;
1009	}
1010
1011	TupleDecIntegerRangeState &getAssumed() { return *this; }
1012	const TupleDecIntegerRangeState &getAssumed() const { return *this; }
1013	};
1014
1015	using AAAMDMaxNumWorkgroupsState =
1016	StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
1017
1018	/// Propagate amdgpu-max-num-workgroups attribute.
1019	struct AAAMDMaxNumWorkgroups
1020	: public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
1021	using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
1022
1023	AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base (IRP) {}
1024
1025	void initialize(Attributor &A) override {
1026	Function *F = getAssociatedFunction();
1027
1028	SmallVector<unsigned> MaxNumWorkgroups = AMDGPU::getMaxNumWorkGroups(F: *F);
1029
1030	X.takeKnownMinimum(Value: MaxNumWorkgroups [`0`]);
1031	Y.takeKnownMinimum(Value: MaxNumWorkgroups [`1`]);
1032	Z.takeKnownMinimum(Value: MaxNumWorkgroups [`2`]);
1033
1034	if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
1035	indicatePessimisticFixpoint();
1036	}
1037
1038	ChangeStatus updateImpl(Attributor &A) override {
1039	ChangeStatus Change = ChangeStatus::UNCHANGED;
1040
1041	auto CheckCallSite = [&](AbstractCallSite CS) {
1042	Function *Caller = CS.getInstruction()->getFunction();
1043	LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
1044	<< "->" << getAssociatedFunction()->getName() << `'\n'`);
1045
1046	const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
1047	QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
1048	if (!CallerInfo \|\| !CallerInfo->isValidState())
1049	return false;
1050
1051	Change \|=
1052	clampStateAndIndicateChange(S&: this->getState(), R: CallerInfo->getState());
1053	return true;
1054	};
1055
1056	bool AllCallSitesKnown = true;
1057	if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this,
1058	/RequireAllCallSites=/true,
1059	UsedAssumedInformation&: AllCallSitesKnown))
1060	return indicatePessimisticFixpoint();
1061
1062	return Change;
1063	}
1064
1065	/// Create an abstract attribute view for the position \p IRP.
1066	static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
1067	Attributor &A);
1068
1069	ChangeStatus manifest(Attributor &A) override {
1070	Function *F = getAssociatedFunction();
1071	LLVMContext &Ctx = F->getContext();
1072	SmallString<`32`> Buffer;
1073	raw_svector_ostream OS(Buffer);
1074	OS << X.getAssumed() << `','` << Y.getAssumed() << `','` << Z.getAssumed();
1075
1076	// TODO: Should annotate loads of the group size for this to do anything
1077	// useful.
1078	return A.manifestAttrs(
1079	IRP: getIRPosition(),
1080	DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: "amdgpu-max-num-workgroups", Val: OS.str())},
1081	/ ForceReplace= / true);
1082	}
1083
1084	StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }
1085
1086	const std::string getAsStr(Attributor ) const* override {
1087	std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
1088	raw_string_ostream OS(Buffer);
1089	OS << X.getAssumed() << `','` << Y.getAssumed() << `','` << Z.getAssumed()
1090	<< `']'`;
1091	return OS.str();
1092	}
1093
1094	const char getIdAddr() const* override { return &ID; }
1095
1096	/// This function should return true if the type of the \p AA is
1097	/// AAAMDMaxNumWorkgroups
1098	static bool classof(const AbstractAttribute *AA) {
1099	return (AA->getIdAddr() == &ID);
1100	}
1101
1102	void trackStatistics() const override {}
1103
1104	/// Unique ID (due to the unique address)
1105	static const char ID;
1106	};
1107
1108	const char AAAMDMaxNumWorkgroups::ID = `0`;
1109
1110	AAAMDMaxNumWorkgroups &
1111	AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
1112	if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1113	return *new (A.Allocator) AAAMDMaxNumWorkgroups (IRP, A);
1114	llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
1115	}
1116
1117	/// Propagate amdgpu-waves-per-eu attribute.
1118	struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1119	AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
1120	: AAAMDSizeRangeAttribute (IRP, A, "amdgpu-waves-per-eu") {}
1121
1122	void initialize(Attributor &A) override {
1123	Function *F = getAssociatedFunction();
1124	auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1125
1126	// If the attribute exists, we will honor it if it is not the default.
1127	if (auto Attr = InfoCache.getWavesPerEUAttr(F: *F)) {
1128	std::pair<unsigned, unsigned> MaxWavesPerEURange{
1129	`1U`, InfoCache.getMaxWavesPerEU(F: *F)};
1130	if (*Attr != MaxWavesPerEURange) {
1131	auto [Min, Max] = *Attr;
1132	ConstantRange Range(APInt (`32`, Min), APInt (`32`, Max + `1`));
1133	IntegerRangeState RangeState(Range);
1134	this->getState() = RangeState;
1135	indicateOptimisticFixpoint();
1136	return;
1137	}
1138	}
1139
1140	if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
1141	indicatePessimisticFixpoint();
1142	}
1143
1144	ChangeStatus updateImpl(Attributor &A) override {
1145	ChangeStatus Change = ChangeStatus::UNCHANGED;
1146
1147	auto CheckCallSite = [&](AbstractCallSite CS) {
1148	Function *Caller = CS.getInstruction()->getFunction();
1149	Function *Func = getAssociatedFunction();
1150	LLVM_DEBUG(dbgs() << `'['` << getName() << "] Call " << Caller->getName()
1151	<< "->" << Func->getName() << `'\n'`);
1152	(void)Func;
1153
1154	const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
1155	QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
1156	if (!CallerAA \|\| !CallerAA->isValidState())
1157	return false;
1158
1159	ConstantRange Assumed = getAssumed();
1160	unsigned Min = std::max(a: Assumed.getLower().getZExtValue(),
1161	b: CallerAA->getAssumed().getLower().getZExtValue());
1162	unsigned Max = std::max(a: Assumed.getUpper().getZExtValue(),
1163	b: CallerAA->getAssumed().getUpper().getZExtValue());
1164	ConstantRange Range(APInt (`32`, Min), APInt (`32`, Max));
1165	IntegerRangeState RangeState(Range);
1166	getState() = RangeState;
1167	Change \|= getState() == Assumed ? ChangeStatus::UNCHANGED
1168	: ChangeStatus::CHANGED;
1169
1170	return true;
1171	};
1172
1173	bool AllCallSitesKnown = true;
1174	if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this, RequireAllCallSites: true, UsedAssumedInformation&: AllCallSitesKnown))
1175	return indicatePessimisticFixpoint();
1176
1177	return Change;
1178	}
1179
1180	/// Create an abstract attribute view for the position \p IRP.
1181	static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
1182	Attributor &A);
1183
1184	ChangeStatus manifest(Attributor &A) override {
1185	Function *F = getAssociatedFunction();
1186	auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1187	return emitAttributeIfNotDefaultAfterClamp(
1188	A, Default: {`1U`, InfoCache.getMaxWavesPerEU(F: *F)});
1189	}
1190
1191	/// See AbstractAttribute::getName()
1192	StringRef getName() const override { return "AAAMDWavesPerEU"; }
1193
1194	/// See AbstractAttribute::getIdAddr()
1195	const char getIdAddr() const* override { return &ID; }
1196
1197	/// This function should return true if the type of the \p AA is
1198	/// AAAMDWavesPerEU
1199	static bool classof(const AbstractAttribute *AA) {
1200	return (AA->getIdAddr() == &ID);
1201	}
1202
1203	/// Unique ID (due to the unique address)
1204	static const char ID;
1205	};
1206
1207	const char AAAMDWavesPerEU::ID = `0`;
1208
1209	AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
1210	Attributor &A) {
1211	if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1212	return *new (A.Allocator) AAAMDWavesPerEU (IRP, A);
1213	llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
1214	}
1215
1216	/// Compute the minimum number of AGPRs required to allocate the inline asm.
1217	static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
1218	const CallBase &Call) {
1219	unsigned ArgNo = `0`;
1220	unsigned ResNo = `0`;
1221	unsigned AGPRDefCount = `0`;
1222	unsigned AGPRUseCount = `0`;
1223	unsigned MaxPhysReg = `0`;
1224	const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout();
1225
1226	// TODO: Overestimates due to not accounting for tied operands
1227	for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
1228	Type Ty = nullptr*;
1229	switch (CI.Type) {
1230	case InlineAsm::isOutput: {
1231	Ty = Call.getType();
1232	if (auto *STy = dyn_cast<StructType>(Val: Ty))
1233	Ty = STy->getElementType(N: ResNo);
1234	++ResNo;
1235	break;
1236	}
1237	case InlineAsm::isInput: {
1238	Ty = Call.getArgOperand(i: ArgNo++)->getType();
1239	break;
1240	}
1241	case InlineAsm::isLabel:
1242	continue;
1243	case InlineAsm::isClobber:
1244	// Parse the physical register reference.
1245	break;
1246	}
1247
1248	for (StringRef Code : CI.Codes) {
1249	unsigned RegCount = `0`;
1250	if (Code.starts_with(Prefix: "a")) {
1251	// Virtual register, compute number of registers based on the type.
1252	//
1253	// We ought to be going through TargetLowering to get the number of
1254	// registers, but we should avoid the dependence on CodeGen here.
1255	RegCount = divideCeil(Numerator: DL.getTypeSizeInBits(Ty), Denominator: `32`);
1256	} else {
1257	// Physical register reference
1258	auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint: Code);
1259	if (Kind == `'a'`) {
1260	RegCount = NumRegs;
1261	MaxPhysReg = std::max(a: MaxPhysReg, b: std::min(a: RegIdx + NumRegs, b: `256u`));
1262	}
1263
1264	continue;
1265	}
1266
1267	if (CI.Type == InlineAsm::isOutput) {
1268	// Apply tuple alignment requirement
1269	//
1270	// TODO: This is more conservative than necessary.
1271	AGPRDefCount = alignTo(Value: AGPRDefCount, Align: RegCount);
1272
1273	AGPRDefCount += RegCount;
1274	if (CI.isEarlyClobber) {
1275	AGPRUseCount = alignTo(Value: AGPRUseCount, Align: RegCount);
1276	AGPRUseCount += RegCount;
1277	}
1278	} else {
1279	AGPRUseCount = alignTo(Value: AGPRUseCount, Align: RegCount);
1280	AGPRUseCount += RegCount;
1281	}
1282	}
1283	}
1284
1285	unsigned MaxVirtReg = std::max(a: AGPRUseCount, b: AGPRDefCount);
1286
1287	// TODO: This is overly conservative. If there are any physical registers,
1288	// allocate any virtual registers after them so we don't have to solve optimal
1289	// packing.
1290	return std::min(a: MaxVirtReg + MaxPhysReg, b: `256u`);
1291	}
1292
1293	struct AAAMDGPUMinAGPRAlloc
1294	: public StateWrapper<DecIntegerState<>, AbstractAttribute> {
1295	using Base = StateWrapper<DecIntegerState<>, AbstractAttribute>;
1296	AAAMDGPUMinAGPRAlloc(const IRPosition &IRP, Attributor &A) : Base (IRP) {}
1297
1298	static AAAMDGPUMinAGPRAlloc &createForPosition(const IRPosition &IRP,
1299	Attributor &A) {
1300	if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1301	return *new (A.Allocator) AAAMDGPUMinAGPRAlloc (IRP, A);
1302	llvm_unreachable(
1303	"AAAMDGPUMinAGPRAlloc is only valid for function position");
1304	}
1305
1306	void initialize(Attributor &A) override {
1307	Function *F = getAssociatedFunction();
1308	auto [MinNumAGPR, MaxNumAGPR] =
1309	AMDGPU::getIntegerPairAttribute(F: *F, Name: "amdgpu-agpr-alloc", Default: {~`0u`, ~`0u`},
1310	/OnlyFirstRequired=/true);
1311	if (MinNumAGPR == `0`) {
1312	indicateOptimisticFixpoint();
1313	return;
1314	}
1315
1316	if (hasSanitizerAttributes(F: *F))
1317	indicatePessimisticFixpoint();
1318	}
1319
1320	const std::string getAsStr(Attributor A) const* override {
1321	std::string Str = "amdgpu-agpr-alloc=";
1322	raw_string_ostream OS(Str);
1323	OS << getAssumed();
1324	return OS.str();
1325	}
1326
1327	void trackStatistics() const override {}
1328
1329	ChangeStatus updateImpl(Attributor &A) override {
1330	DecIntegerState<> Maximum;
1331
1332	// Check for cases which require allocation of AGPRs. The only cases where
1333	// AGPRs are required are if there are direct references to AGPRs, so inline
1334	// assembly and special intrinsics.
1335	auto CheckForMinAGPRAllocs = [&](Instruction &I) {
1336	const auto &CB = cast<CallBase>(Val&: I);
1337	const Value *CalleeOp = CB.getCalledOperand();
1338
1339	if (const InlineAsm *IA = dyn_cast<InlineAsm>(Val: CalleeOp)) {
1340	// Technically, the inline asm could be invoking a call to an unknown
1341	// external function that requires AGPRs, but ignore that.
1342	unsigned NumRegs = inlineAsmGetNumRequiredAGPRs(IA, Call: CB);
1343	Maximum.takeAssumedMaximum(Value: NumRegs);
1344	return true;
1345	}
1346	switch (CB.getIntrinsicID()) {
1347	case Intrinsic::not_intrinsic:
1348	break;
1349	case Intrinsic::write_register:
1350	case Intrinsic::read_register:
1351	case Intrinsic::read_volatile_register: {
1352	const MDString *RegName = cast<MDString>(
1353	Val: cast<MDNode>(
1354	Val: cast<MetadataAsValue>(Val: CB.getArgOperand(i: `0`))->getMetadata())
1355	->getOperand(I: `0`));
1356	auto [Kind, RegIdx, NumRegs] =
1357	AMDGPU::parseAsmPhysRegName(TupleString: RegName->getString());
1358	if (Kind == `'a'`)
1359	Maximum.takeAssumedMaximum(Value: std::min(a: RegIdx + NumRegs, b: `256u`));
1360
1361	return true;
1362	}
1363	// Trap-like intrinsics such as llvm.trap and llvm.debugtrap do not have
1364	// the nocallback attribute, so the AMDGPU attributor can conservatively
1365	// drop all implicitly-known inputs and AGPR allocation information. Make
1366	// sure we still infer that no implicit inputs are required and that the
1367	// AGPR allocation stays at zero. Trap-like intrinsics may invoke a
1368	// function which requires AGPRs, so we need to check if the called
1369	// function has the "trap-func-name" attribute.
1370	case Intrinsic::trap:
1371	case Intrinsic::debugtrap:
1372	case Intrinsic::ubsantrap:
1373	return CB.hasFnAttr(Kind: Attribute::NoCallback) \|\|
1374	!CB.hasFnAttr(Kind: "trap-func-name");
1375	default:
1376	// Some intrinsics may use AGPRs, but if we have a choice, we are not
1377	// required to use AGPRs.
1378	// Assume !nocallback intrinsics may call a function which requires
1379	// AGPRs.
1380	return CB.hasFnAttr(Kind: Attribute::NoCallback);
1381	}
1382
1383	// TODO: Handle callsite attributes
1384	auto *CBEdges = A.getAAFor<AACallEdges>(
1385	QueryingAA: *this, IRP: IRPosition::callsite_function(CB), DepClass: DepClassTy::REQUIRED);
1386	if (!CBEdges \|\| CBEdges->hasUnknownCallee()) {
1387	Maximum.indicatePessimisticFixpoint();
1388	return false;
1389	}
1390
1391	for (const Function *PossibleCallee : CBEdges->getOptimisticEdges()) {
1392	const auto *CalleeInfo = A.getAAFor<AAAMDGPUMinAGPRAlloc>(
1393	QueryingAA: *this, IRP: IRPosition::function(F: *PossibleCallee), DepClass: DepClassTy::REQUIRED);
1394	if (!CalleeInfo \|\| !CalleeInfo->isValidState()) {
1395	Maximum.indicatePessimisticFixpoint();
1396	return false;
1397	}
1398
1399	Maximum.takeAssumedMaximum(Value: CalleeInfo->getAssumed());
1400	}
1401
1402	return true;
1403	};
1404
1405	bool UsedAssumedInformation = false;
1406	if (!A.checkForAllCallLikeInstructions(Pred: CheckForMinAGPRAllocs, QueryingAA: *this,
1407	UsedAssumedInformation))
1408	return indicatePessimisticFixpoint();
1409
1410	return clampStateAndIndicateChange(S&: getState(), R: Maximum);
1411	}
1412
1413	ChangeStatus manifest(Attributor &A) override {
1414	LLVMContext &Ctx = getAssociatedFunction()->getContext();
1415	SmallString<`4`> Buffer;
1416	raw_svector_ostream OS(Buffer);
1417	OS << getAssumed();
1418
1419	return A.manifestAttrs(
1420	IRP: getIRPosition(), DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: "amdgpu-agpr-alloc", Val: OS.str())});
1421	}
1422
1423	StringRef getName() const override { return "AAAMDGPUMinAGPRAlloc"; }
1424	const char getIdAddr() const* override { return &ID; }
1425
1426	/// This function should return true if the type of the \p AA is
1427	/// AAAMDGPUMinAGPRAllocs
1428	static bool classof(const AbstractAttribute *AA) {
1429	return (AA->getIdAddr() == &ID);
1430	}
1431
1432	static const char ID;
1433	};
1434
1435	const char AAAMDGPUMinAGPRAlloc::ID = `0`;
1436
1437	/// An abstract attribute to propagate the function attribute
1438	/// "amdgpu-cluster-dims" from kernel entry functions to device functions.
1439	struct AAAMDGPUClusterDims
1440	: public StateWrapper<BooleanState, AbstractAttribute> {
1441	using Base = StateWrapper<BooleanState, AbstractAttribute>;
1442	AAAMDGPUClusterDims(const IRPosition &IRP, Attributor &A) : Base (IRP) {}
1443
1444	/// Create an abstract attribute view for the position \p IRP.
1445	static AAAMDGPUClusterDims &createForPosition(const IRPosition &IRP,
1446	Attributor &A);
1447
1448	/// See AbstractAttribute::getName().
1449	StringRef getName() const override { return "AAAMDGPUClusterDims"; }
1450
1451	/// See AbstractAttribute::getIdAddr().
1452	const char getIdAddr() const* override { return &ID; }
1453
1454	/// This function should return true if the type of the \p AA is
1455	/// AAAMDGPUClusterDims.
1456	static bool classof(const AbstractAttribute *AA) {
1457	return AA->getIdAddr() == &ID;
1458	}
1459
1460	virtual const AMDGPU::ClusterDimsAttr &getClusterDims() const = `0`;
1461
1462	/// Unique ID (due to the unique address)
1463	static const char ID;
1464	};
1465
1466	const char AAAMDGPUClusterDims::ID = `0`;
1467
1468	struct AAAMDGPUClusterDimsFunction : public AAAMDGPUClusterDims {
1469	AAAMDGPUClusterDimsFunction(const IRPosition &IRP, Attributor &A)
1470	: AAAMDGPUClusterDims (IRP, A) {}
1471
1472	void initialize(Attributor &A) override {
1473	Function *F = getAssociatedFunction();
1474	assert(F && "empty associated function");
1475
1476	Attr = AMDGPU::ClusterDimsAttr::get(F: *F);
1477
1478	// No matter what a kernel function has, it is final.
1479	if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv())) {
1480	if (Attr.isUnknown())
1481	indicatePessimisticFixpoint();
1482	else
1483	indicateOptimisticFixpoint();
1484	}
1485	}
1486
1487	const std::string getAsStr(Attributor A) const* override {
1488	if (!getAssumed() \|\| Attr.isUnknown())
1489	return "unknown";
1490	if (Attr.isNoCluster())
1491	return "no";
1492	if (Attr.isVariableDims())
1493	return "variable";
1494	return Attr.to_string();
1495	}
1496
1497	void trackStatistics() const override {}
1498
1499	ChangeStatus updateImpl(Attributor &A) override {
1500	auto OldState = Attr;
1501
1502	auto CheckCallSite = [&](AbstractCallSite CS) {
1503	const auto *CallerAA = A.getAAFor<AAAMDGPUClusterDims>(
1504	QueryingAA: *this, IRP: IRPosition::function(F: *CS.getInstruction()->getFunction()),
1505	DepClass: DepClassTy::REQUIRED);
1506	if (!CallerAA \|\| !CallerAA->isValidState())
1507	return false;
1508
1509	return merge(Other: CallerAA->getClusterDims());
1510	};
1511
1512	bool UsedAssumedInformation = false;
1513	if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this,
1514	/RequireAllCallSites=/true,
1515	UsedAssumedInformation))
1516	return indicatePessimisticFixpoint();
1517
1518	return OldState == Attr ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
1519	}
1520
1521	ChangeStatus manifest(Attributor &A) override {
1522	if (Attr.isUnknown())
1523	return ChangeStatus::UNCHANGED;
1524	return A.manifestAttrs(
1525	IRP: getIRPosition(),
1526	DeducedAttrs: {Attribute::get(Context&: getAssociatedFunction()->getContext(), Kind: AttrName,
1527	Val: Attr.to_string())},
1528	/ForceReplace=/true);
1529	}
1530
1531	const AMDGPU::ClusterDimsAttr &getClusterDims() const override {
1532	return Attr;
1533	}
1534
1535	private:
1536	bool merge(const AMDGPU::ClusterDimsAttr &Other) {
1537	// Case 1: Both of them are unknown yet, we do nothing and continue wait for
1538	// propagation.
1539	if (Attr.isUnknown() && Other.isUnknown())
1540	return true;
1541
1542	// Case 2: The other is determined, but we are unknown yet, we simply take
1543	// the other's value.
1544	if (Attr.isUnknown()) {
1545	Attr = Other;
1546	return true;
1547	}
1548
1549	// Case 3: We are determined but the other is unknown yet, we simply keep
1550	// everything unchanged.
1551	if (Other.isUnknown())
1552	return true;
1553
1554	// After this point, both are determined.
1555
1556	// Case 4: If they are same, we do nothing.
1557	if (Attr == Other)
1558	return true;
1559
1560	// Now they are not same.
1561
1562	// Case 5: If either of us uses cluster (but not both; otherwise case 4
1563	// would hold), then it is unknown whether cluster will be used, and the
1564	// state is final, unlike case 1.
1565	if (Attr.isNoCluster() \|\| Other.isNoCluster()) {
1566	Attr.setUnknown();
1567	return false;
1568	}
1569
1570	// Case 6: Both of us use cluster, but the dims are different, so the result
1571	// is, cluster is used, but we just don't have a fixed dims.
1572	Attr.setVariableDims();
1573	return true;
1574	}
1575
1576	AMDGPU::ClusterDimsAttr Attr;
1577
1578	static constexpr char AttrName[] = "amdgpu-cluster-dims";
1579	};
1580
1581	AAAMDGPUClusterDims &
1582	AAAMDGPUClusterDims::createForPosition(const IRPosition &IRP, Attributor &A) {
1583	if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1584	return *new (A.Allocator) AAAMDGPUClusterDimsFunction (IRP, A);
1585	llvm_unreachable("AAAMDGPUClusterDims is only valid for function position");
1586	}
1587
1588	static bool runImpl(SetVector<Function > &Functions, bool* IsModulePass,
1589	bool DeleteFns, Module &M, AnalysisGetter &AG,
1590	TargetMachine &TM, AMDGPUAttributorOptions Options,
1591	ThinOrFullLTOPhase LTOPhase) {
1592
1593	CallGraphUpdater CGUpdater;
1594	BumpPtrAllocator Allocator;
1595	AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
1596	DenseSet<const char *> Allowed(
1597	{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
1598	&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
1599	&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID,
1600	&AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID,
1601	&AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
1602	&AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1603	&AAAMDGPUClusterDims::ID, &AAAlign::ID});
1604
1605	AttributorConfig AC(CGUpdater);
1606	AC.IsClosedWorldModule = Options.IsClosedWorld;
1607	AC.Allowed = &Allowed;
1608	AC.IsModulePass = IsModulePass;
1609	AC.DeleteFns = DeleteFns;
1610	AC.DefaultInitializeLiveInternals = false;
1611	AC.IndirectCalleeSpecializationCallback =
1612	[](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1613	Function &Callee, unsigned NumAssumedCallees) {
1614	return !AMDGPU::isEntryFunctionCC(CC: Callee.getCallingConv()) &&
1615	(NumAssumedCallees <= IndirectCallSpecializationThreshold);
1616	};
1617	AC.IPOAmendableCB = [](const Function &F) {
1618	return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
1619	};
1620
1621	Attributor A(Functions, InfoCache, AC);
1622
1623	LLVM_DEBUG({
1624	StringRef LTOPhaseStr = to_string(LTOPhase);
1625	dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << `'\n'`
1626	<< "[AMDGPUAttributor] Module " << M.getName() << " is "
1627	<< (AC.IsClosedWorldModule ? "" : "not ")
1628	<< "assumed to be a closed world.\n";
1629	});
1630
1631	for (auto *F : Functions) {
1632	A.getOrCreateAAFor<AAAMDAttributes>(IRP: IRPosition::function(F: *F));
1633	A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRP: IRPosition::function(F: *F));
1634	A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRP: IRPosition::function(F: *F));
1635	CallingConv::ID CC = F->getCallingConv();
1636	if (!AMDGPU::isEntryFunctionCC(CC)) {
1637	A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRP: IRPosition::function(F: *F));
1638	A.getOrCreateAAFor<AAAMDWavesPerEU>(IRP: IRPosition::function(F: *F));
1639	}
1640
1641	const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F: *F);
1642	if (!F->isDeclaration() && ST.hasClusters())
1643	A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRP: IRPosition::function(F: *F));
1644
1645	if (ST.hasGFX90AInsts())
1646	A.getOrCreateAAFor<AAAMDGPUMinAGPRAlloc>(IRP: IRPosition::function(F: *F));
1647
1648	for (auto &I : instructions(F)) {
1649	Value Ptr = nullptr*;
1650	if (auto *LI = dyn_cast<LoadInst>(Val: &I))
1651	Ptr = LI->getPointerOperand();
1652	else if (auto *SI = dyn_cast<StoreInst>(Val: &I))
1653	Ptr = SI->getPointerOperand();
1654	else if (auto *RMW = dyn_cast<AtomicRMWInst>(Val: &I))
1655	Ptr = RMW->getPointerOperand();
1656	else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Val: &I))
1657	Ptr = CmpX->getPointerOperand();
1658
1659	if (Ptr) {
1660	A.getOrCreateAAFor<AAAddressSpace>(IRP: IRPosition::value(V: *Ptr));
1661	A.getOrCreateAAFor<AANoAliasAddrSpace>(IRP: IRPosition::value(V: *Ptr));
1662	if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: Ptr)) {
1663	if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
1664	A.getOrCreateAAFor<AAAlign>(IRP: IRPosition::value(V: *Ptr));
1665	}
1666	}
1667	}
1668	}
1669
1670	return A.run() == ChangeStatus::CHANGED;
1671	}
1672	} // namespace
1673
1674	PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
1675	ModuleAnalysisManager &AM) {
1676
1677	FunctionAnalysisManager &FAM =
1678	AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
1679	AnalysisGetter AG(FAM);
1680
1681	SetVector<Function *> Functions;
1682	for (Function &F : M) {
1683	if (!F.isDeclaration())
1684	Functions.insert(X: &F);
1685	}
1686
1687	// TODO: Probably preserves CFG
1688	return runImpl(Functions, /IsModulePass=/true, /DeleteFns=/true, M, AG,
1689	TM, Options, LTOPhase)
1690	? PreservedAnalyses::none()
1691	: PreservedAnalyses::all();
1692	}
1693
1694	PreservedAnalyses llvm::AMDGPUAttributorCGSCCPass::run(LazyCallGraph::SCC &C,
1695	CGSCCAnalysisManager &AM,
1696	LazyCallGraph &CG,
1697	CGSCCUpdateResult &UR) {
1698
1699	FunctionAnalysisManager &FAM =
1700	AM.getResult<FunctionAnalysisManagerCGSCCProxy>(IR&: C, ExtraArgs&: CG).getManager();
1701	AnalysisGetter AG(FAM);
1702
1703	SetVector<Function *> Functions;
1704	for (LazyCallGraph::Node &N : C) {
1705	Function *F = &N.getFunction();
1706	if (!F->isIntrinsic())
1707	Functions.insert(X: F);
1708	}
1709
1710	AMDGPUAttributorOptions Options;
1711	Module *M = C.begin()->getFunction().getParent();
1712	// In the CGSCC pipeline, avoid untracked call graph modifications by
1713	// disabling function deletion, mirroring the generic AttributorCGSCCPass.
1714	return runImpl(Functions, /IsModulePass=/false, /DeleteFns=/false, M&: *M, AG,
1715	TM, Options, LTOPhase: ThinOrFullLTOPhase::None)
1716	? PreservedAnalyses::none()
1717	: PreservedAnalyses::all();
1718	}
1719

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp