1//===- AMDGPUAttributor.cpp -----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AMDGPU.h"
14#include "GCNSubtarget.h"
15#include "Utils/AMDGPUBaseInfo.h"
16#include "llvm/IR/IntrinsicsAMDGPU.h"
17#include "llvm/IR/IntrinsicsR600.h"
18#include "llvm/Target/TargetMachine.h"
19#include "llvm/Transforms/IPO/Attributor.h"
20
21#define DEBUG_TYPE "amdgpu-attributor"
22
23using namespace llvm;
24
static cl::opt<unsigned> IndirectCallSpecializationThreshold(
    "amdgpu-indirect-call-specialization-threshold",
    cl::desc(
        "Threshold controlling whether an indirect call will be specialized"),
    cl::init(3));
30
31#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
32
33enum ImplicitArgumentPositions {
34#include "AMDGPUAttributes.def"
35 LAST_ARG_POS
36};
37
38#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
39
40enum ImplicitArgumentMask {
41 UNKNOWN_INTRINSIC = 0,
42#include "AMDGPUAttributes.def"
43 ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1,
44 NOT_IMPLICIT_INPUT
45};
46
47#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
48static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
49 ImplicitAttrs[] = {
50#include "AMDGPUAttributes.def"
51};
52
53// We do not need to note the x workitem or workgroup id because they are always
54// initialized.
55//
56// TODO: We should not add the attributes if the known compile time workgroup
57// size is 1 for y/z.
58static ImplicitArgumentMask
59intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
60 bool HasApertureRegs, bool SupportsGetDoorBellID,
61 unsigned CodeObjectVersion) {
62 switch (ID) {
63 case Intrinsic::amdgcn_workitem_id_x:
64 NonKernelOnly = true;
65 return WORKITEM_ID_X;
66 case Intrinsic::amdgcn_workgroup_id_x:
67 NonKernelOnly = true;
68 return WORKGROUP_ID_X;
69 case Intrinsic::amdgcn_workitem_id_y:
70 case Intrinsic::r600_read_tidig_y:
71 return WORKITEM_ID_Y;
72 case Intrinsic::amdgcn_workitem_id_z:
73 case Intrinsic::r600_read_tidig_z:
74 return WORKITEM_ID_Z;
75 case Intrinsic::amdgcn_workgroup_id_y:
76 case Intrinsic::r600_read_tgid_y:
77 return WORKGROUP_ID_Y;
78 case Intrinsic::amdgcn_workgroup_id_z:
79 case Intrinsic::r600_read_tgid_z:
80 return WORKGROUP_ID_Z;
81 case Intrinsic::amdgcn_cluster_id_x:
82 NonKernelOnly = true;
83 return CLUSTER_ID_X;
84 case Intrinsic::amdgcn_cluster_id_y:
85 return CLUSTER_ID_Y;
86 case Intrinsic::amdgcn_cluster_id_z:
87 return CLUSTER_ID_Z;
88 case Intrinsic::amdgcn_lds_kernel_id:
89 return LDS_KERNEL_ID;
90 case Intrinsic::amdgcn_dispatch_ptr:
91 return DISPATCH_PTR;
92 case Intrinsic::amdgcn_dispatch_id:
93 return DISPATCH_ID;
94 case Intrinsic::amdgcn_implicitarg_ptr:
95 return IMPLICIT_ARG_PTR;
96 // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
97 // queue_ptr.
98 case Intrinsic::amdgcn_queue_ptr:
99 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
100 return QUEUE_PTR;
101 case Intrinsic::amdgcn_is_shared:
102 case Intrinsic::amdgcn_is_private:
103 if (HasApertureRegs)
104 return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
108 return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
109 : QUEUE_PTR;
110 case Intrinsic::trap:
111 case Intrinsic::debugtrap:
112 case Intrinsic::ubsantrap:
113 if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
114 return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
115 : QUEUE_PTR;
116 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
117 return QUEUE_PTR;
118 default:
119 return UNKNOWN_INTRINSIC;
120 }
121}
122
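/// Returns true if an addrspacecast from \p SrcAS to flat may require the
/// queue pointer, i.e. the source address space is local or private.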
123static bool castRequiresQueuePtr(unsigned SrcAS) {
124 return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
125}
126
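/// Returns true if \p C is a global value in the local or region address
/// space.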
127static bool isDSAddress(const Constant *C) {
128 const GlobalValue *GV = dyn_cast<GlobalValue>(Val: C);
129 if (!GV)
130 return false;
131 unsigned AS = GV->getAddressSpace();
132 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
133}
134
135/// Returns true if sanitizer attributes are present on a function.
136static bool hasSanitizerAttributes(const Function &F) {
137 return F.hasFnAttribute(Kind: Attribute::SanitizeAddress) ||
138 F.hasFnAttribute(Kind: Attribute::SanitizeThread) ||
139 F.hasFnAttribute(Kind: Attribute::SanitizeMemory) ||
140 F.hasFnAttribute(Kind: Attribute::SanitizeHWAddress) ||
141 F.hasFnAttribute(Kind: Attribute::SanitizeMemTag);
142}
143
144namespace {
145class AMDGPUInformationCache : public InformationCache {
146public:
147 AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
148 BumpPtrAllocator &Allocator,
149 SetVector<Function *> *CGSCC, TargetMachine &TM)
150 : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
151 CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}
152
153 TargetMachine &TM;
154
155 enum ConstantStatus : uint8_t {
156 NONE = 0,
157 DS_GLOBAL = 1 << 0,
158 ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
159 ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
160 ADDR_SPACE_CAST_BOTH_TO_FLAT =
161 ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
162 };
163
164 /// Check if the subtarget has aperture regs.
165 bool hasApertureRegs(Function &F) {
166 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
167 return ST.hasApertureRegs();
168 }
169
170 /// Check if the subtarget supports GetDoorbellID.
171 bool supportsGetDoorbellID(Function &F) {
172 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
173 return ST.supportsGetDoorbellID();
174 }
175
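  /// Returns the value of the "amdgpu-flat-work-group-size" attribute of
  /// \p F, if present.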
176 std::optional<std::pair<unsigned, unsigned>>
177 getFlatWorkGroupSizeAttr(const Function &F) const {
178 auto R = AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-flat-work-group-size");
179 if (!R)
180 return std::nullopt;
181 return std::make_pair(x&: R->first, y&: *(R->second));
182 }
183
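  /// Get the default flat work group size for the calling convention of \p F.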
184 std::pair<unsigned, unsigned>
185 getDefaultFlatWorkGroupSize(const Function &F) const {
186 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
187 return ST.getDefaultFlatWorkGroupSize(CC: F.getCallingConv());
188 }
189
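  /// Get the widest flat work group size range supported by the subtarget of
  /// \p F.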
190 std::pair<unsigned, unsigned>
191 getMaximumFlatWorkGroupRange(const Function &F) {
192 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
193 return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
194 }
195
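  /// Get the maximum number of workgroups per dimension for \p F.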
196 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
197 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
198 return ST.getMaxNumWorkGroups(F);
199 }
200
201 /// Get code object version.
202 unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
203
204 /// Get the effective value of "amdgpu-waves-per-eu" for the function,
205 /// accounting for the interaction with the passed value to use for
206 /// "amdgpu-flat-work-group-size".
207 std::pair<unsigned, unsigned>
208 getWavesPerEU(const Function &F,
209 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
210 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
211 return ST.getWavesPerEU(FlatWorkGroupSizes: FlatWorkGroupSize, LDSBytes: getLDSSize(F), F);
212 }
213
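  /// Get the value of the "amdgpu-waves-per-eu" attribute of \p F, if present.
  /// A missing upper bound defaults to the subtarget maximum.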
214 std::optional<std::pair<unsigned, unsigned>>
215 getWavesPerEUAttr(const Function &F) {
216 auto Val = AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu",
217 /*OnlyFirstRequired=*/true);
218 if (!Val)
219 return std::nullopt;
220 if (!Val->second) {
221 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
222 Val->second = ST.getMaxWavesPerEU();
223 }
224 return std::make_pair(x&: Val->first, y&: *(Val->second));
225 }
226
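  /// Get the maximum number of waves per EU for the subtarget of \p F.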
227 unsigned getMaxWavesPerEU(const Function &F) {
228 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
229 return ST.getMaxWavesPerEU();
230 }
231
232 unsigned getMaxAddrSpace() const override {
233 return AMDGPUAS::MAX_AMDGPU_ADDRESS;
234 }
235
236private:
237 /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
238 /// local to flat. These casts may require the queue pointer.
239 static uint8_t visitConstExpr(const ConstantExpr *CE) {
240 uint8_t Status = NONE;
241
242 if (CE->getOpcode() == Instruction::AddrSpaceCast) {
243 unsigned SrcAS = CE->getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
244 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
245 Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
246 else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
247 Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
248 }
249
250 return Status;
251 }
252
253 /// Returns the minimum amount of LDS space used by a workgroup running
254 /// function \p F.
255 static unsigned getLDSSize(const Function &F) {
256 return AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-lds-size",
257 Default: {0, UINT32_MAX}, OnlyFirstRequired: true)
258 .first;
259 }
260
261 /// Get the constant access bitmap for \p C.
262 uint8_t getConstantAccess(const Constant *C,
263 SmallPtrSetImpl<const Constant *> &Visited) {
264 auto It = ConstantStatus.find(Val: C);
265 if (It != ConstantStatus.end())
266 return It->second;
267
268 uint8_t Result = 0;
269 if (isDSAddress(C))
270 Result = DS_GLOBAL;
271
272 if (const auto *CE = dyn_cast<ConstantExpr>(Val: C))
273 Result |= visitConstExpr(CE);
274
275 for (const Use &U : C->operands()) {
276 const auto *OpC = dyn_cast<Constant>(Val: U);
277 if (!OpC || !Visited.insert(Ptr: OpC).second)
278 continue;
279
280 Result |= getConstantAccess(C: OpC, Visited);
281 }
282 return Result;
283 }
284
285public:
286 /// Returns true if \p Fn needs the queue pointer because of \p C.
287 bool needsQueuePtr(const Constant *C, Function &Fn) {
288 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: Fn.getCallingConv());
289 bool HasAperture = hasApertureRegs(F&: Fn);
290
291 // No need to explore the constants.
292 if (!IsNonEntryFunc && HasAperture)
293 return false;
294
295 SmallPtrSet<const Constant *, 8> Visited;
296 uint8_t Access = getConstantAccess(C, Visited);
297
298 // We need to trap on DS globals in non-entry functions.
299 if (IsNonEntryFunc && (Access & DS_GLOBAL))
300 return true;
301
302 return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
303 }
304
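  /// Returns true if \p C contains an addrspacecast from the private address
  /// space to flat.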
305 bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
306 SmallPtrSet<const Constant *, 8> Visited;
307 uint8_t Access = getConstantAccess(C, Visited);
308 return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
309 }
310
311private:
312 /// Used to determine if the Constant needs the queue pointer.
313 DenseMap<const Constant *, uint8_t> ConstantStatus;
314 const unsigned CodeObjectVersion;
315};
316
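/// Deduce which implicit kernel inputs a function may require, tracked as bits
/// of ImplicitArgumentMask and manifested as the corresponding attributes.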
317struct AAAMDAttributes
318 : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
319 AbstractAttribute> {
320 using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
321 AbstractAttribute>;
322
323 AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
324
325 /// Create an abstract attribute view for the position \p IRP.
326 static AAAMDAttributes &createForPosition(const IRPosition &IRP,
327 Attributor &A);
328
329 /// See AbstractAttribute::getName().
330 StringRef getName() const override { return "AAAMDAttributes"; }
331
332 /// See AbstractAttribute::getIdAddr().
333 const char *getIdAddr() const override { return &ID; }
334
335 /// This function should return true if the type of the \p AA is
336 /// AAAMDAttributes.
337 static bool classof(const AbstractAttribute *AA) {
338 return (AA->getIdAddr() == &ID);
339 }
340
341 /// Unique ID (due to the unique address)
342 static const char ID;
343};
344const char AAAMDAttributes::ID = 0;
345
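/// Propagate the "uniform-work-group-size" attribute from callers to callees.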
346struct AAUniformWorkGroupSize
347 : public StateWrapper<BooleanState, AbstractAttribute> {
348 using Base = StateWrapper<BooleanState, AbstractAttribute>;
349 AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
350
351 /// Create an abstract attribute view for the position \p IRP.
352 static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
353 Attributor &A);
354
355 /// See AbstractAttribute::getName().
356 StringRef getName() const override { return "AAUniformWorkGroupSize"; }
357
358 /// See AbstractAttribute::getIdAddr().
359 const char *getIdAddr() const override { return &ID; }
360
  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
363 static bool classof(const AbstractAttribute *AA) {
364 return (AA->getIdAddr() == &ID);
365 }
366
367 /// Unique ID (due to the unique address)
368 static const char ID;
369};
370const char AAUniformWorkGroupSize::ID = 0;
371
372struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
373 AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
374 : AAUniformWorkGroupSize(IRP, A) {}
375
376 void initialize(Attributor &A) override {
377 Function *F = getAssociatedFunction();
378 CallingConv::ID CC = F->getCallingConv();
379
380 if (CC != CallingConv::AMDGPU_KERNEL)
381 return;
382
383 bool InitialValue = false;
384 if (F->hasFnAttribute(Kind: "uniform-work-group-size"))
385 InitialValue =
386 F->getFnAttribute(Kind: "uniform-work-group-size").getValueAsString() ==
387 "true";
388
389 if (InitialValue)
390 indicateOptimisticFixpoint();
391 else
392 indicatePessimisticFixpoint();
393 }
394
395 ChangeStatus updateImpl(Attributor &A) override {
396 ChangeStatus Change = ChangeStatus::UNCHANGED;
397
398 auto CheckCallSite = [&](AbstractCallSite CS) {
399 Function *Caller = CS.getInstruction()->getFunction();
400 LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
401 << "->" << getAssociatedFunction()->getName() << "\n");
402
403 const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
404 QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
405 if (!CallerInfo || !CallerInfo->isValidState())
406 return false;
407
408 Change = Change | clampStateAndIndicateChange(S&: this->getState(),
409 R: CallerInfo->getState());
410
411 return true;
412 };
413
414 bool AllCallSitesKnown = true;
415 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this, RequireAllCallSites: true, UsedAssumedInformation&: AllCallSitesKnown))
416 return indicatePessimisticFixpoint();
417
418 return Change;
419 }
420
421 ChangeStatus manifest(Attributor &A) override {
422 SmallVector<Attribute, 8> AttrList;
423 LLVMContext &Ctx = getAssociatedFunction()->getContext();
424
425 AttrList.push_back(Elt: Attribute::get(Context&: Ctx, Kind: "uniform-work-group-size",
426 Val: getAssumed() ? "true" : "false"));
427 return A.manifestAttrs(IRP: getIRPosition(), DeducedAttrs: AttrList,
428 /* ForceReplace */ true);
429 }
430
431 bool isValidState() const override {
432 // This state is always valid, even when the state is false.
433 return true;
434 }
435
436 const std::string getAsStr(Attributor *) const override {
437 return "AMDWorkGroupSize[" + std::to_string(val: getAssumed()) + "]";
438 }
439
440 /// See AbstractAttribute::trackStatistics()
441 void trackStatistics() const override {}
442};
443
444AAUniformWorkGroupSize &
445AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
446 Attributor &A) {
447 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
448 return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
449 llvm_unreachable(
450 "AAUniformWorkGroupSize is only valid for function position");
451}
452
453struct AAAMDAttributesFunction : public AAAMDAttributes {
454 AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
455 : AAAMDAttributes(IRP, A) {}
456
457 void initialize(Attributor &A) override {
458 Function *F = getAssociatedFunction();
459
    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    // Flat scratch initialization is needed because `asan_malloc_impl` calls
    // introduced later in the pipeline will have flat scratch accesses.
    // FIXME: FLAT_SCRATCH_INIT will not be required here if the device-libs
    // implementation of `asan_malloc_impl` is updated.
466 const bool HasSanitizerAttrs = hasSanitizerAttributes(F: *F);
467 if (HasSanitizerAttrs) {
468 removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
469 removeAssumedBits(BitsEncoding: HOSTCALL_PTR);
470 removeAssumedBits(BitsEncoding: FLAT_SCRATCH_INIT);
471 }
472
473 for (auto Attr : ImplicitAttrs) {
474 if (HasSanitizerAttrs &&
475 (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR ||
476 Attr.first == FLAT_SCRATCH_INIT))
477 continue;
478
479 if (F->hasFnAttribute(Kind: Attr.second))
480 addKnownBits(Bits: Attr.first);
481 }
482
483 if (F->isDeclaration())
484 return;
485
    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
488 if (AMDGPU::isGraphics(CC: F->getCallingConv())) {
489 indicatePessimisticFixpoint();
490 return;
491 }
492 }
493
494 ChangeStatus updateImpl(Attributor &A) override {
495 Function *F = getAssociatedFunction();
496 // The current assumed state used to determine a change.
497 auto OrigAssumed = getAssumed();
498
499 // Check for Intrinsics and propagate attributes.
500 const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
501 QueryingAA: *this, IRP: this->getIRPosition(), DepClass: DepClassTy::REQUIRED);
502 if (!AAEdges || !AAEdges->isValidState() ||
503 AAEdges->hasNonAsmUnknownCallee())
504 return indicatePessimisticFixpoint();
505
506 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
507
508 bool NeedsImplicit = false;
509 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
510 bool HasApertureRegs = InfoCache.hasApertureRegs(F&: *F);
511 bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(F&: *F);
512 unsigned COV = InfoCache.getCodeObjectVersion();
513
514 for (Function *Callee : AAEdges->getOptimisticEdges()) {
515 Intrinsic::ID IID = Callee->getIntrinsicID();
516 if (IID == Intrinsic::not_intrinsic) {
517 const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
518 QueryingAA: *this, IRP: IRPosition::function(F: *Callee), DepClass: DepClassTy::REQUIRED);
519 if (!AAAMD || !AAAMD->isValidState())
520 return indicatePessimisticFixpoint();
521 *this &= *AAAMD;
522 continue;
523 }
524
525 bool NonKernelOnly = false;
526 ImplicitArgumentMask AttrMask =
527 intrinsicToAttrMask(ID: IID, NonKernelOnly, NeedsImplicit,
528 HasApertureRegs, SupportsGetDoorBellID: SupportsGetDoorbellID, CodeObjectVersion: COV);
529
530 if (AttrMask == UNKNOWN_INTRINSIC) {
        // Assume intrinsics that lack the nocallback attribute may invoke a
        // function which accesses implicit arguments.
        //
        // FIXME: This isn't really the correct check. We want to ensure it
        // isn't calling any function that may use implicit arguments,
        // regardless of whether it's internal to the module or not.
        //
        // TODO: Ignoring callsite attributes.
539 if (!Callee->hasFnAttribute(Kind: Attribute::NoCallback))
540 return indicatePessimisticFixpoint();
541 continue;
542 }
543
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if (IsNonEntryFunc || !NonKernelOnly)
          removeAssumedBits(AttrMask);
      }
548 }
549
    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
551 if (NeedsImplicit)
552 removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
553
554 if (isAssumed(BitsEncoding: QUEUE_PTR) && checkForQueuePtr(A)) {
555 // Under V5, we need implicitarg_ptr + offsets to access private_base or
556 // shared_base. We do not actually need queue_ptr.
557 if (COV >= 5)
558 removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
559 else
560 removeAssumedBits(BitsEncoding: QUEUE_PTR);
561 }
562
563 if (funcRetrievesMultigridSyncArg(A, COV)) {
564 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
565 "multigrid_sync_arg needs implicitarg_ptr");
566 removeAssumedBits(BitsEncoding: MULTIGRID_SYNC_ARG);
567 }
568
569 if (funcRetrievesHostcallPtr(A, COV)) {
570 assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
571 removeAssumedBits(BitsEncoding: HOSTCALL_PTR);
572 }
573
574 if (funcRetrievesHeapPtr(A, COV)) {
575 assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
576 removeAssumedBits(BitsEncoding: HEAP_PTR);
577 }
578
579 if (isAssumed(BitsEncoding: QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
580 assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
581 removeAssumedBits(BitsEncoding: QUEUE_PTR);
582 }
583
584 if (isAssumed(BitsEncoding: LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
585 removeAssumedBits(BitsEncoding: LDS_KERNEL_ID);
586 }
587
588 if (isAssumed(BitsEncoding: DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
589 removeAssumedBits(BitsEncoding: DEFAULT_QUEUE);
590
591 if (isAssumed(BitsEncoding: COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
592 removeAssumedBits(BitsEncoding: COMPLETION_ACTION);
593
594 if (isAssumed(BitsEncoding: FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
595 removeAssumedBits(BitsEncoding: FLAT_SCRATCH_INIT);
596
597 return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
598 : ChangeStatus::UNCHANGED;
599 }
600
601 ChangeStatus manifest(Attributor &A) override {
602 SmallVector<Attribute, 8> AttrList;
603 LLVMContext &Ctx = getAssociatedFunction()->getContext();
604
605 for (auto Attr : ImplicitAttrs) {
606 if (isKnown(BitsEncoding: Attr.first))
607 AttrList.push_back(Elt: Attribute::get(Context&: Ctx, Kind: Attr.second));
608 }
609
610 return A.manifestAttrs(IRP: getIRPosition(), DeducedAttrs: AttrList,
611 /* ForceReplace */ true);
612 }
613
614 const std::string getAsStr(Attributor *) const override {
615 std::string Str;
616 raw_string_ostream OS(Str);
617 OS << "AMDInfo[";
618 for (auto Attr : ImplicitAttrs)
619 if (isAssumed(BitsEncoding: Attr.first))
620 OS << ' ' << Attr.second;
621 OS << " ]";
622 return OS.str();
623 }
624
625 /// See AbstractAttribute::trackStatistics()
626 void trackStatistics() const override {}
627
628private:
629 bool checkForQueuePtr(Attributor &A) {
630 Function *F = getAssociatedFunction();
631 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
632
633 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
634
635 bool NeedsQueuePtr = false;
636
637 auto CheckAddrSpaceCasts = [&](Instruction &I) {
638 unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
639 if (castRequiresQueuePtr(SrcAS)) {
640 NeedsQueuePtr = true;
641 return false;
642 }
643 return true;
644 };
645
646 bool HasApertureRegs = InfoCache.hasApertureRegs(F&: *F);
647
    // `checkForAllInstructions` is much cheaper than walking all instructions
    // manually, so try it first.

    // The queue pointer is not needed if aperture registers are present.
652 if (!HasApertureRegs) {
653 bool UsedAssumedInformation = false;
654 A.checkForAllInstructions(Pred: CheckAddrSpaceCasts, QueryingAA: *this,
655 Opcodes: {Instruction::AddrSpaceCast},
656 UsedAssumedInformation);
657 }
658
659 // If we found that we need the queue pointer, nothing else to do.
660 if (NeedsQueuePtr)
661 return true;
662
663 if (!IsNonEntryFunc && HasApertureRegs)
664 return false;
665
666 for (BasicBlock &BB : *F) {
667 for (Instruction &I : BB) {
668 for (const Use &U : I.operands()) {
669 if (const auto *C = dyn_cast<Constant>(Val: U)) {
670 if (InfoCache.needsQueuePtr(C, Fn&: *F))
671 return true;
672 }
673 }
674 }
675 }
676
677 return false;
678 }
679
680 bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
681 auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
682 AA::RangeTy Range(Pos, 8);
683 return funcRetrievesImplicitKernelArg(A, Range);
684 }
685
686 bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
687 auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
688 AA::RangeTy Range(Pos, 8);
689 return funcRetrievesImplicitKernelArg(A, Range);
690 }
691
692 bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
693 auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
694 AA::RangeTy Range(Pos, 8);
695 return funcRetrievesImplicitKernelArg(A, Range);
696 }
697
698 bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
699 auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
700 AA::RangeTy Range(Pos, 8);
701 return funcRetrievesImplicitKernelArg(A, Range);
702 }
703
704 bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
705 if (COV < 5)
706 return false;
707 AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
708 return funcRetrievesImplicitKernelArg(A, Range);
709 }
710
711 bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
712 if (COV < 5)
713 return false;
714 AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
715 return funcRetrievesImplicitKernelArg(A, Range);
716 }
717
718 bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
719 // Check if this is a call to the implicitarg_ptr builtin and it
720 // is used to retrieve the hostcall pointer. The implicit arg for
721 // hostcall is not used only if every use of the implicitarg_ptr
722 // is a load that clearly does not retrieve any byte of the
723 // hostcall pointer. We check this by tracing all the uses of the
724 // initial call to the implicitarg_ptr intrinsic.
725 auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
726 auto &Call = cast<CallBase>(Val&: I);
727 if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
728 return true;
729
730 const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
731 QueryingAA: *this, IRP: IRPosition::callsite_returned(CB: Call), DepClass: DepClassTy::REQUIRED);
732 if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
733 return false;
734
735 return PointerInfoAA->forallInterferingAccesses(
736 Range, CB: [](const AAPointerInfo::Access &Acc, bool IsExact) {
737 return Acc.getRemoteInst()->isDroppable();
738 });
739 };
740
741 bool UsedAssumedInformation = false;
742 return !A.checkForAllCallLikeInstructions(Pred: DoesNotLeadToKernelArgLoc, QueryingAA: *this,
743 UsedAssumedInformation);
744 }
745
746 bool funcRetrievesLDSKernelId(Attributor &A) {
747 auto DoesNotRetrieve = [&](Instruction &I) {
748 auto &Call = cast<CallBase>(Val&: I);
749 return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
750 };
751 bool UsedAssumedInformation = false;
752 return !A.checkForAllCallLikeInstructions(Pred: DoesNotRetrieve, QueryingAA: *this,
753 UsedAssumedInformation);
754 }
755
  // Returns true if FlatScratchInit is needed, i.e., the corresponding
  // no-flat-scratch-init attribute should not be set.
758 bool needFlatScratchInit(Attributor &A) {
759 assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
760
761 // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
762 // there is a cast from PRIVATE_ADDRESS.
763 auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
764 return cast<AddrSpaceCastInst>(Val&: I).getSrcAddressSpace() !=
765 AMDGPUAS::PRIVATE_ADDRESS;
766 };
767
768 bool UsedAssumedInformation = false;
769 if (!A.checkForAllInstructions(Pred: AddrSpaceCastNotFromPrivate, QueryingAA: *this,
770 Opcodes: {Instruction::AddrSpaceCast},
771 UsedAssumedInformation))
772 return true;
773
774 // Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
775 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
776
777 Function *F = getAssociatedFunction();
778 for (Instruction &I : instructions(F)) {
779 for (const Use &U : I.operands()) {
780 if (const auto *C = dyn_cast<Constant>(Val: U)) {
781 if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
782 return true;
783 }
784 }
785 }
786
787 // Finally check callees.
788
    // This is called on each call-like instruction; returning false means
    // FlatScratchInit is needed, i.e., this function should not get
    // no-flat-scratch-init.
791 auto CheckForNoFlatScratchInit = [&](Instruction &I) {
792 const auto &CB = cast<CallBase>(Val&: I);
793 const Function *Callee = CB.getCalledFunction();
794
795 // Callee == 0 for inline asm or indirect call with known callees.
796 // In the latter case, updateImpl() already checked the callees and we
797 // know their FLAT_SCRATCH_INIT bit is set.
798 // If function has indirect call with unknown callees, the bit is
799 // already removed in updateImpl() and execution won't reach here.
800 if (!Callee)
801 return true;
802
803 return Callee->getIntrinsicID() !=
804 Intrinsic::amdgcn_addrspacecast_nonnull;
805 };
806
807 UsedAssumedInformation = false;
    // If the predicate returns false for any call (i.e. FlatScratchInit is
    // needed), checkForAllCallLikeInstructions returns false, in which case
    // this function returns true.
    return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                              UsedAssumedInformation);
813 }
814};
815
816AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
817 Attributor &A) {
818 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
819 return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
820 llvm_unreachable("AAAMDAttributes is only valid for function position");
821}
822
823/// Base class to derive different size ranges.
824struct AAAMDSizeRangeAttribute
825 : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
826 using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
827
828 StringRef AttrName;
829
830 AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
831 StringRef AttrName)
832 : Base(IRP, 32), AttrName(AttrName) {}
833
834 /// See AbstractAttribute::trackStatistics()
835 void trackStatistics() const override {}
836
837 template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
838 ChangeStatus Change = ChangeStatus::UNCHANGED;
839
840 auto CheckCallSite = [&](AbstractCallSite CS) {
841 Function *Caller = CS.getInstruction()->getFunction();
842 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
843 << "->" << getAssociatedFunction()->getName() << '\n');
844
845 const auto *CallerInfo = A.getAAFor<AttributeImpl>(
846 *this, IRPosition::function(F: *Caller), DepClassTy::REQUIRED);
847 if (!CallerInfo || !CallerInfo->isValidState())
848 return false;
849
850 Change |=
851 clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
852
853 return true;
854 };
855
856 bool AllCallSitesKnown = true;
857 if (!A.checkForAllCallSites(CheckCallSite, *this,
858 /*RequireAllCallSites=*/true,
859 AllCallSitesKnown))
860 return indicatePessimisticFixpoint();
861
862 return Change;
863 }
864
  /// Clamp the assumed range to the default value ([Min, Max]) and emit the
  /// attribute if it is not the same as the default.
867 ChangeStatus
868 emitAttributeIfNotDefaultAfterClamp(Attributor &A,
869 std::pair<unsigned, unsigned> Default) {
870 auto [Min, Max] = Default;
871 unsigned Lower = getAssumed().getLower().getZExtValue();
872 unsigned Upper = getAssumed().getUpper().getZExtValue();
873
874 // Clamp the range to the default value.
875 if (Lower < Min)
876 Lower = Min;
877 if (Upper > Max + 1)
878 Upper = Max + 1;
879
880 // No manifest if the value is invalid or same as default after clamp.
881 if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
882 return ChangeStatus::UNCHANGED;
883
884 Function *F = getAssociatedFunction();
885 LLVMContext &Ctx = F->getContext();
886 SmallString<10> Buffer;
887 raw_svector_ostream OS(Buffer);
888 OS << Lower << ',' << Upper - 1;
889 return A.manifestAttrs(IRP: getIRPosition(),
890 DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: AttrName, Val: OS.str())},
891 /*ForceReplace=*/true);
892 }
893
894 const std::string getAsStr(Attributor *) const override {
895 std::string Str;
896 raw_string_ostream OS(Str);
897 OS << getName() << '[';
898 OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
899 OS << ']';
900 return OS.str();
901 }
902};
903
904/// Propagate amdgpu-flat-work-group-size attribute.
905struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
906 AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
907 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}
908
909 void initialize(Attributor &A) override {
910 Function *F = getAssociatedFunction();
911 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
912
913 bool HasAttr = false;
914 auto Range = InfoCache.getDefaultFlatWorkGroupSize(F: *F);
915 auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(F: *F);
916
    if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
      // Unfortunately, the front end always emits the attribute and sometimes
      // sets it to the maximum range, so we only honor an attribute that is
      // not the maximum range.
      if (*Attr != MaxRange) {
        Range = *Attr;
        HasAttr = true;
      }
    }
926
927 // We don't want to directly clamp the state if it's the max range because
928 // that is basically the worst state.
929 if (Range == MaxRange)
930 return;
931
932 auto [Min, Max] = Range;
933 ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
934 IntegerRangeState IRS(CR);
935 clampStateAndIndicateChange(S&: this->getState(), R: IRS);
936
937 if (HasAttr || AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
938 indicateOptimisticFixpoint();
939 }
940
941 ChangeStatus updateImpl(Attributor &A) override {
942 return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
943 }
944
945 /// Create an abstract attribute view for the position \p IRP.
946 static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
947 Attributor &A);
948
949 ChangeStatus manifest(Attributor &A) override {
950 Function *F = getAssociatedFunction();
951 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
952 return emitAttributeIfNotDefaultAfterClamp(
953 A, Default: InfoCache.getMaximumFlatWorkGroupRange(F: *F));
954 }
955
956 /// See AbstractAttribute::getName()
957 StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }
958
959 /// See AbstractAttribute::getIdAddr()
960 const char *getIdAddr() const override { return &ID; }
961
962 /// This function should return true if the type of the \p AA is
963 /// AAAMDFlatWorkGroupSize
964 static bool classof(const AbstractAttribute *AA) {
965 return (AA->getIdAddr() == &ID);
966 }
967
968 /// Unique ID (due to the unique address)
969 static const char ID;
970};
971
972const char AAAMDFlatWorkGroupSize::ID = 0;
973
974AAAMDFlatWorkGroupSize &
975AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
976 Attributor &A) {
977 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
978 return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
979 llvm_unreachable(
980 "AAAMDFlatWorkGroupSize is only valid for function position");
981}
982
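/// A tuple of three decreasing integer states, one per workgroup dimension
/// (X, Y, Z).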
983struct TupleDecIntegerRangeState : public AbstractState {
984 DecIntegerState<uint32_t> X, Y, Z;
985
986 bool isValidState() const override {
987 return X.isValidState() && Y.isValidState() && Z.isValidState();
988 }
989
990 bool isAtFixpoint() const override {
991 return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
992 }
993
994 ChangeStatus indicateOptimisticFixpoint() override {
995 return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
996 Z.indicateOptimisticFixpoint();
997 }
998
999 ChangeStatus indicatePessimisticFixpoint() override {
1000 return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
1001 Z.indicatePessimisticFixpoint();
1002 }
1003
1004 TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
1005 X ^= Other.X;
1006 Y ^= Other.Y;
1007 Z ^= Other.Z;
1008 return *this;
1009 }
1010
1011 bool operator==(const TupleDecIntegerRangeState &Other) const {
1012 return X == Other.X && Y == Other.Y && Z == Other.Z;
1013 }
1014
1015 TupleDecIntegerRangeState &getAssumed() { return *this; }
1016 const TupleDecIntegerRangeState &getAssumed() const { return *this; }
1017};
1018
1019using AAAMDMaxNumWorkgroupsState =
1020 StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
1021
1022/// Propagate amdgpu-max-num-workgroups attribute.
1023struct AAAMDMaxNumWorkgroups
1024 : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
1025 using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
1026
1027 AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1028
1029 void initialize(Attributor &A) override {
1030 Function *F = getAssociatedFunction();
1031 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1032
1033 SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(F: *F);
1034
1035 X.takeKnownMinimum(Value: MaxNumWorkgroups[0]);
1036 Y.takeKnownMinimum(Value: MaxNumWorkgroups[1]);
1037 Z.takeKnownMinimum(Value: MaxNumWorkgroups[2]);
1038
1039 if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
1040 indicatePessimisticFixpoint();
1041 }
1042
1043 ChangeStatus updateImpl(Attributor &A) override {
1044 ChangeStatus Change = ChangeStatus::UNCHANGED;
1045
1046 auto CheckCallSite = [&](AbstractCallSite CS) {
1047 Function *Caller = CS.getInstruction()->getFunction();
1048 LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
1049 << "->" << getAssociatedFunction()->getName() << '\n');
1050
1051 const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
1052 QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
1053 if (!CallerInfo || !CallerInfo->isValidState())
1054 return false;
1055
1056 Change |=
1057 clampStateAndIndicateChange(S&: this->getState(), R: CallerInfo->getState());
1058 return true;
1059 };
1060
1061 bool AllCallSitesKnown = true;
1062 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this,
1063 /*RequireAllCallSites=*/true,
1064 UsedAssumedInformation&: AllCallSitesKnown))
1065 return indicatePessimisticFixpoint();
1066
1067 return Change;
1068 }
1069
1070 /// Create an abstract attribute view for the position \p IRP.
1071 static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
1072 Attributor &A);
1073
1074 ChangeStatus manifest(Attributor &A) override {
1075 Function *F = getAssociatedFunction();
1076 LLVMContext &Ctx = F->getContext();
1077 SmallString<32> Buffer;
1078 raw_svector_ostream OS(Buffer);
1079 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();
1080
1081 // TODO: Should annotate loads of the group size for this to do anything
1082 // useful.
1083 return A.manifestAttrs(
1084 IRP: getIRPosition(),
1085 DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: "amdgpu-max-num-workgroups", Val: OS.str())},
1086 /* ForceReplace= */ true);
1087 }
1088
1089 StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }
1090
1091 const std::string getAsStr(Attributor *) const override {
1092 std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
1093 raw_string_ostream OS(Buffer);
1094 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
1095 << ']';
1096 return OS.str();
1097 }
1098
1099 const char *getIdAddr() const override { return &ID; }
1100
1101 /// This function should return true if the type of the \p AA is
1102 /// AAAMDMaxNumWorkgroups
1103 static bool classof(const AbstractAttribute *AA) {
1104 return (AA->getIdAddr() == &ID);
1105 }
1106
1107 void trackStatistics() const override {}
1108
1109 /// Unique ID (due to the unique address)
1110 static const char ID;
1111};
1112
1113const char AAAMDMaxNumWorkgroups::ID = 0;
1114
1115AAAMDMaxNumWorkgroups &
1116AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
1117 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1118 return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
1119 llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
1120}
1121
1122/// Propagate amdgpu-waves-per-eu attribute.
1123struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1124 AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
1125 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}
1126
1127 void initialize(Attributor &A) override {
1128 Function *F = getAssociatedFunction();
1129 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1130
1131 // If the attribute exists, we will honor it if it is not the default.
1132 if (auto Attr = InfoCache.getWavesPerEUAttr(F: *F)) {
1133 std::pair<unsigned, unsigned> MaxWavesPerEURange{
1134 1U, InfoCache.getMaxWavesPerEU(F: *F)};
1135 if (*Attr != MaxWavesPerEURange) {
1136 auto [Min, Max] = *Attr;
1137 ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
1138 IntegerRangeState RangeState(Range);
1139 this->getState() = RangeState;
1140 indicateOptimisticFixpoint();
1141 return;
1142 }
1143 }
1144
1145 if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
1146 indicatePessimisticFixpoint();
1147 }
1148
1149 ChangeStatus updateImpl(Attributor &A) override {
1150 ChangeStatus Change = ChangeStatus::UNCHANGED;
1151
1152 auto CheckCallSite = [&](AbstractCallSite CS) {
1153 Function *Caller = CS.getInstruction()->getFunction();
1154 Function *Func = getAssociatedFunction();
1155 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
1156 << "->" << Func->getName() << '\n');
1157 (void)Func;
1158
1159 const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
1160 QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
1161 if (!CallerAA || !CallerAA->isValidState())
1162 return false;
1163
1164 ConstantRange Assumed = getAssumed();
1165 unsigned Min = std::max(a: Assumed.getLower().getZExtValue(),
1166 b: CallerAA->getAssumed().getLower().getZExtValue());
1167 unsigned Max = std::max(a: Assumed.getUpper().getZExtValue(),
1168 b: CallerAA->getAssumed().getUpper().getZExtValue());
1169 ConstantRange Range(APInt(32, Min), APInt(32, Max));
1170 IntegerRangeState RangeState(Range);
1171 getState() = RangeState;
1172 Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
1173 : ChangeStatus::CHANGED;
1174
1175 return true;
1176 };
1177
1178 bool AllCallSitesKnown = true;
1179 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this, RequireAllCallSites: true, UsedAssumedInformation&: AllCallSitesKnown))
1180 return indicatePessimisticFixpoint();
1181
1182 return Change;
1183 }
1184
1185 /// Create an abstract attribute view for the position \p IRP.
1186 static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
1187 Attributor &A);
1188
1189 ChangeStatus manifest(Attributor &A) override {
1190 Function *F = getAssociatedFunction();
1191 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1192 return emitAttributeIfNotDefaultAfterClamp(
1193 A, Default: {1U, InfoCache.getMaxWavesPerEU(F: *F)});
1194 }
1195
1196 /// See AbstractAttribute::getName()
1197 StringRef getName() const override { return "AAAMDWavesPerEU"; }
1198
1199 /// See AbstractAttribute::getIdAddr()
1200 const char *getIdAddr() const override { return &ID; }
1201
1202 /// This function should return true if the type of the \p AA is
1203 /// AAAMDWavesPerEU
1204 static bool classof(const AbstractAttribute *AA) {
1205 return (AA->getIdAddr() == &ID);
1206 }
1207
1208 /// Unique ID (due to the unique address)
1209 static const char ID;
1210};
1211
1212const char AAAMDWavesPerEU::ID = 0;
1213
1214AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
1215 Attributor &A) {
1216 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1217 return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
1218 llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
1219}
1220
1221/// Compute the minimum number of AGPRs required to allocate the inline asm.
1222static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
1223 const CallBase &Call) {
1224 unsigned ArgNo = 0;
1225 unsigned ResNo = 0;
1226 unsigned AGPRDefCount = 0;
1227 unsigned AGPRUseCount = 0;
1228 unsigned MaxPhysReg = 0;
1229 const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout();
1230
1231 // TODO: Overestimates due to not accounting for tied operands
1232 for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
1233 Type *Ty = nullptr;
1234 switch (CI.Type) {
1235 case InlineAsm::isOutput: {
1236 Ty = Call.getType();
1237 if (auto *STy = dyn_cast<StructType>(Val: Ty))
1238 Ty = STy->getElementType(N: ResNo);
1239 ++ResNo;
1240 break;
1241 }
1242 case InlineAsm::isInput: {
1243 Ty = Call.getArgOperand(i: ArgNo++)->getType();
1244 break;
1245 }
1246 case InlineAsm::isLabel:
1247 continue;
1248 case InlineAsm::isClobber:
1249 // Parse the physical register reference.
1250 break;
1251 }
1252
1253 for (StringRef Code : CI.Codes) {
1254 unsigned RegCount = 0;
1255 if (Code.starts_with(Prefix: "a")) {
1256 // Virtual register, compute number of registers based on the type.
1257 //
1258 // We ought to be going through TargetLowering to get the number of
1259 // registers, but we should avoid the dependence on CodeGen here.
1260 RegCount = divideCeil(Numerator: DL.getTypeSizeInBits(Ty), Denominator: 32);
1261 } else {
1262 // Physical register reference
1263 auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint: Code);
1264 if (Kind == 'a') {
1265 RegCount = NumRegs;
1266 MaxPhysReg = std::max(a: MaxPhysReg, b: std::min(a: RegIdx + NumRegs, b: 256u));
1267 }
1268
1269 continue;
1270 }
1271
1272 if (CI.Type == InlineAsm::isOutput) {
1273 // Apply tuple alignment requirement
1274 //
1275 // TODO: This is more conservative than necessary.
1276 AGPRDefCount = alignTo(Value: AGPRDefCount, Align: RegCount);
1277
1278 AGPRDefCount += RegCount;
1279 if (CI.isEarlyClobber) {
1280 AGPRUseCount = alignTo(Value: AGPRUseCount, Align: RegCount);
1281 AGPRUseCount += RegCount;
1282 }
1283 } else {
1284 AGPRUseCount = alignTo(Value: AGPRUseCount, Align: RegCount);
1285 AGPRUseCount += RegCount;
1286 }
1287 }
1288 }
1289
1290 unsigned MaxVirtReg = std::max(a: AGPRUseCount, b: AGPRDefCount);
1291
1292 // TODO: This is overly conservative. If there are any physical registers,
1293 // allocate any virtual registers after them so we don't have to solve optimal
1294 // packing.
1295 return std::min(a: MaxVirtReg + MaxPhysReg, b: 256u);
1296}
1297
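/// Deduce the minimum number of AGPRs required by a function and its callees,
/// manifested as the "amdgpu-agpr-alloc" attribute.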
1298struct AAAMDGPUMinAGPRAlloc
1299 : public StateWrapper<DecIntegerState<>, AbstractAttribute> {
1300 using Base = StateWrapper<DecIntegerState<>, AbstractAttribute>;
1301 AAAMDGPUMinAGPRAlloc(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1302
1303 static AAAMDGPUMinAGPRAlloc &createForPosition(const IRPosition &IRP,
1304 Attributor &A) {
1305 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1306 return *new (A.Allocator) AAAMDGPUMinAGPRAlloc(IRP, A);
1307 llvm_unreachable(
1308 "AAAMDGPUMinAGPRAlloc is only valid for function position");
1309 }
1310
1311 void initialize(Attributor &A) override {
1312 Function *F = getAssociatedFunction();
1313 auto [MinNumAGPR, MaxNumAGPR] =
1314 AMDGPU::getIntegerPairAttribute(F: *F, Name: "amdgpu-agpr-alloc", Default: {~0u, ~0u},
1315 /*OnlyFirstRequired=*/true);
1316 if (MinNumAGPR == 0)
1317 indicateOptimisticFixpoint();
1318 }
1319
1320 const std::string getAsStr(Attributor *A) const override {
1321 std::string Str = "amdgpu-agpr-alloc=";
1322 raw_string_ostream OS(Str);
1323 OS << getAssumed();
1324 return OS.str();
1325 }
1326
1327 void trackStatistics() const override {}
1328
1329 ChangeStatus updateImpl(Attributor &A) override {
1330 DecIntegerState<> Maximum;
1331
    // Check for cases which require allocation of AGPRs. The only cases where
    // AGPRs are required are direct references to AGPRs, i.e., inline assembly
    // and special intrinsics.
1335 auto CheckForMinAGPRAllocs = [&](Instruction &I) {
1336 const auto &CB = cast<CallBase>(Val&: I);
1337 const Value *CalleeOp = CB.getCalledOperand();
1338
1339 if (const InlineAsm *IA = dyn_cast<InlineAsm>(Val: CalleeOp)) {
1340 // Technically, the inline asm could be invoking a call to an unknown
1341 // external function that requires AGPRs, but ignore that.
1342 unsigned NumRegs = inlineAsmGetNumRequiredAGPRs(IA, Call: CB);
1343 Maximum.takeAssumedMaximum(Value: NumRegs);
1344 return true;
1345 }
1346 switch (CB.getIntrinsicID()) {
1347 case Intrinsic::not_intrinsic:
1348 break;
1349 case Intrinsic::write_register:
1350 case Intrinsic::read_register:
1351 case Intrinsic::read_volatile_register: {
1352 const MDString *RegName = cast<MDString>(
1353 Val: cast<MDNode>(
1354 Val: cast<MetadataAsValue>(Val: CB.getArgOperand(i: 0))->getMetadata())
1355 ->getOperand(I: 0));
1356 auto [Kind, RegIdx, NumRegs] =
1357 AMDGPU::parseAsmPhysRegName(TupleString: RegName->getString());
1358 if (Kind == 'a')
1359 Maximum.takeAssumedMaximum(Value: std::min(a: RegIdx + NumRegs, b: 256u));
1360
1361 return true;
1362 }
1363 // Trap-like intrinsics such as llvm.trap and llvm.debugtrap do not have
1364 // the nocallback attribute, so the AMDGPU attributor can conservatively
1365 // drop all implicitly-known inputs and AGPR allocation information. Make
1366 // sure we still infer that no implicit inputs are required and that the
1367 // AGPR allocation stays at zero. Trap-like intrinsics may invoke a
1368 // function which requires AGPRs, so we need to check if the called
1369 // function has the "trap-func-name" attribute.
1370 case Intrinsic::trap:
1371 case Intrinsic::debugtrap:
1372 case Intrinsic::ubsantrap:
1373 return CB.hasFnAttr(Kind: Attribute::NoCallback) ||
1374 !CB.hasFnAttr(Kind: "trap-func-name");
1375 default:
1376 // Some intrinsics may use AGPRs, but if we have a choice, we are not
1377 // required to use AGPRs.
1378 // Assume !nocallback intrinsics may call a function which requires
1379 // AGPRs.
1380 return CB.hasFnAttr(Kind: Attribute::NoCallback);
1381 }
1382
1383 // TODO: Handle callsite attributes
1384 auto *CBEdges = A.getAAFor<AACallEdges>(
1385 QueryingAA: *this, IRP: IRPosition::callsite_function(CB), DepClass: DepClassTy::REQUIRED);
1386 if (!CBEdges || CBEdges->hasUnknownCallee()) {
1387 Maximum.indicatePessimisticFixpoint();
1388 return false;
1389 }
1390
1391 for (const Function *PossibleCallee : CBEdges->getOptimisticEdges()) {
1392 const auto *CalleeInfo = A.getAAFor<AAAMDGPUMinAGPRAlloc>(
1393 QueryingAA: *this, IRP: IRPosition::function(F: *PossibleCallee), DepClass: DepClassTy::REQUIRED);
1394 if (!CalleeInfo || !CalleeInfo->isValidState()) {
1395 Maximum.indicatePessimisticFixpoint();
1396 return false;
1397 }
1398
1399 Maximum.takeAssumedMaximum(Value: CalleeInfo->getAssumed());
1400 }
1401
1402 return true;
1403 };
1404
1405 bool UsedAssumedInformation = false;
1406 if (!A.checkForAllCallLikeInstructions(Pred: CheckForMinAGPRAllocs, QueryingAA: *this,
1407 UsedAssumedInformation))
1408 return indicatePessimisticFixpoint();
1409
1410 return clampStateAndIndicateChange(S&: getState(), R: Maximum);
1411 }
1412
1413 ChangeStatus manifest(Attributor &A) override {
1414 LLVMContext &Ctx = getAssociatedFunction()->getContext();
1415 SmallString<4> Buffer;
1416 raw_svector_ostream OS(Buffer);
1417 OS << getAssumed();
1418
1419 return A.manifestAttrs(
1420 IRP: getIRPosition(), DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: "amdgpu-agpr-alloc", Val: OS.str())});
1421 }
1422
1423 StringRef getName() const override { return "AAAMDGPUMinAGPRAlloc"; }
1424 const char *getIdAddr() const override { return &ID; }
1425
  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUMinAGPRAlloc.
1428 static bool classof(const AbstractAttribute *AA) {
1429 return (AA->getIdAddr() == &ID);
1430 }
1431
1432 static const char ID;
1433};
1434
1435const char AAAMDGPUMinAGPRAlloc::ID = 0;
1436
1437/// An abstract attribute to propagate the function attribute
1438/// "amdgpu-cluster-dims" from kernel entry functions to device functions.
1439struct AAAMDGPUClusterDims
1440 : public StateWrapper<BooleanState, AbstractAttribute> {
1441 using Base = StateWrapper<BooleanState, AbstractAttribute>;
1442 AAAMDGPUClusterDims(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1443
1444 /// Create an abstract attribute view for the position \p IRP.
1445 static AAAMDGPUClusterDims &createForPosition(const IRPosition &IRP,
1446 Attributor &A);
1447
1448 /// See AbstractAttribute::getName().
1449 StringRef getName() const override { return "AAAMDGPUClusterDims"; }
1450
1451 /// See AbstractAttribute::getIdAddr().
1452 const char *getIdAddr() const override { return &ID; }
1453
1454 /// This function should return true if the type of the \p AA is
1455 /// AAAMDGPUClusterDims.
1456 static bool classof(const AbstractAttribute *AA) {
1457 return AA->getIdAddr() == &ID;
1458 }
1459
1460 virtual const AMDGPU::ClusterDimsAttr &getClusterDims() const = 0;
1461
1462 /// Unique ID (due to the unique address)
1463 static const char ID;
1464};
1465
1466const char AAAMDGPUClusterDims::ID = 0;
1467
1468struct AAAMDGPUClusterDimsFunction : public AAAMDGPUClusterDims {
1469 AAAMDGPUClusterDimsFunction(const IRPosition &IRP, Attributor &A)
1470 : AAAMDGPUClusterDims(IRP, A) {}
1471
1472 void initialize(Attributor &A) override {
1473 Function *F = getAssociatedFunction();
1474 assert(F && "empty associated function");
1475
1476 Attr = AMDGPU::ClusterDimsAttr::get(F: *F);
1477
    // Whatever attribute a kernel function already has is final.
1479 if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv())) {
1480 if (Attr.isUnknown())
1481 indicatePessimisticFixpoint();
1482 else
1483 indicateOptimisticFixpoint();
1484 }
1485 }
1486
1487 const std::string getAsStr(Attributor *A) const override {
1488 if (!getAssumed() || Attr.isUnknown())
1489 return "unknown";
1490 if (Attr.isNoCluster())
1491 return "no";
1492 if (Attr.isVariableDims())
1493 return "variable";
1494 return Attr.to_string();
1495 }
1496
1497 void trackStatistics() const override {}
1498
1499 ChangeStatus updateImpl(Attributor &A) override {
1500 auto OldState = Attr;
1501
1502 auto CheckCallSite = [&](AbstractCallSite CS) {
1503 const auto *CallerAA = A.getAAFor<AAAMDGPUClusterDims>(
1504 QueryingAA: *this, IRP: IRPosition::function(F: *CS.getInstruction()->getFunction()),
1505 DepClass: DepClassTy::REQUIRED);
1506 if (!CallerAA || !CallerAA->isValidState())
1507 return false;
1508
1509 return merge(Other: CallerAA->getClusterDims());
1510 };
1511
1512 bool UsedAssumedInformation = false;
1513 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this,
1514 /*RequireAllCallSites=*/true,
1515 UsedAssumedInformation))
1516 return indicatePessimisticFixpoint();
1517
1518 return OldState == Attr ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
1519 }
1520
1521 ChangeStatus manifest(Attributor &A) override {
1522 if (Attr.isUnknown())
1523 return ChangeStatus::UNCHANGED;
1524 return A.manifestAttrs(
1525 IRP: getIRPosition(),
1526 DeducedAttrs: {Attribute::get(Context&: getAssociatedFunction()->getContext(), Kind: AttrName,
1527 Val: Attr.to_string())},
1528 /*ForceReplace=*/true);
1529 }
1530
1531 const AMDGPU::ClusterDimsAttr &getClusterDims() const override {
1532 return Attr;
1533 }
1534
1535private:
  bool merge(const AMDGPU::ClusterDimsAttr &Other) {
    // Case 1: Both of them are still unknown; do nothing and continue to wait
    // for propagation.
    if (Attr.isUnknown() && Other.isUnknown())
      return true;

    // Case 2: The other is determined but we are still unknown; simply take
    // the other's value.
    if (Attr.isUnknown()) {
      Attr = Other;
      return true;
    }

    // Case 3: We are determined but the other is still unknown; keep
    // everything unchanged.
    if (Other.isUnknown())
      return true;

    // After this point, both are determined.

    // Case 4: If they are the same, do nothing.
    if (Attr == Other)
      return true;

    // Now they are not the same.

    // Case 5: Exactly one of us uses clusters (if neither did, case 4 would
    // hold), so it is unknown whether clusters will be used, and unlike case
    // 1, this state is final.
    if (Attr.isNoCluster() || Other.isNoCluster()) {
      Attr.setUnknown();
      return false;
    }

    // Case 6: Both of us use clusters, but the dims are different, so the
    // result is that clusters are used, but we don't have fixed dims.
    Attr.setVariableDims();
    return true;
  }
1575
1576 AMDGPU::ClusterDimsAttr Attr;
1577
1578 static constexpr char AttrName[] = "amdgpu-cluster-dims";
1579};
1580
1581AAAMDGPUClusterDims &
1582AAAMDGPUClusterDims::createForPosition(const IRPosition &IRP, Attributor &A) {
1583 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1584 return *new (A.Allocator) AAAMDGPUClusterDimsFunction(IRP, A);
1585 llvm_unreachable("AAAMDGPUClusterDims is only valid for function position");
1586}
1587
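/// Deduce AMDGPU attributes for all functions in \p M using the Attributor.
/// Returns true if the module was changed.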
1588static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1589 AMDGPUAttributorOptions Options,
1590 ThinOrFullLTOPhase LTOPhase) {
1591 SetVector<Function *> Functions;
1592 for (Function &F : M) {
1593 if (!F.isIntrinsic())
1594 Functions.insert(X: &F);
1595 }
1596
1597 CallGraphUpdater CGUpdater;
1598 BumpPtrAllocator Allocator;
1599 AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
1600 DenseSet<const char *> Allowed(
1601 {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
1602 &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
1603 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID,
1604 &AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID,
1605 &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
1606 &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1607 &AAAMDGPUClusterDims::ID, &AAAlign::ID});
1608
1609 AttributorConfig AC(CGUpdater);
1610 AC.IsClosedWorldModule = Options.IsClosedWorld;
1611 AC.Allowed = &Allowed;
1612 AC.IsModulePass = true;
1613 AC.DefaultInitializeLiveInternals = false;
1614 AC.IndirectCalleeSpecializationCallback =
1615 [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1616 Function &Callee, unsigned NumAssumedCallees) {
1617 return !AMDGPU::isEntryFunctionCC(CC: Callee.getCallingConv()) &&
1618 (NumAssumedCallees <= IndirectCallSpecializationThreshold);
1619 };
1620 AC.IPOAmendableCB = [](const Function &F) {
1621 return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
1622 };
1623
1624 Attributor A(Functions, InfoCache, AC);
1625
1626 LLVM_DEBUG({
1627 StringRef LTOPhaseStr = to_string(LTOPhase);
1628 dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
1629 << "[AMDGPUAttributor] Module " << M.getName() << " is "
1630 << (AC.IsClosedWorldModule ? "" : "not ")
1631 << "assumed to be a closed world.\n";
1632 });
1633
1634 for (auto *F : Functions) {
1635 A.getOrCreateAAFor<AAAMDAttributes>(IRP: IRPosition::function(F: *F));
1636 A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRP: IRPosition::function(F: *F));
1637 A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRP: IRPosition::function(F: *F));
1638 CallingConv::ID CC = F->getCallingConv();
1639 if (!AMDGPU::isEntryFunctionCC(CC)) {
1640 A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRP: IRPosition::function(F: *F));
1641 A.getOrCreateAAFor<AAAMDWavesPerEU>(IRP: IRPosition::function(F: *F));
1642 }
1643
1644 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F: *F);
1645 if (!F->isDeclaration() && ST.hasClusters())
1646 A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRP: IRPosition::function(F: *F));
1647
1648 if (ST.hasGFX90AInsts())
1649 A.getOrCreateAAFor<AAAMDGPUMinAGPRAlloc>(IRP: IRPosition::function(F: *F));
1650
1651 for (auto &I : instructions(F)) {
1652 Value *Ptr = nullptr;
1653 if (auto *LI = dyn_cast<LoadInst>(Val: &I))
1654 Ptr = LI->getPointerOperand();
1655 else if (auto *SI = dyn_cast<StoreInst>(Val: &I))
1656 Ptr = SI->getPointerOperand();
1657 else if (auto *RMW = dyn_cast<AtomicRMWInst>(Val: &I))
1658 Ptr = RMW->getPointerOperand();
1659 else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Val: &I))
1660 Ptr = CmpX->getPointerOperand();
1661
1662 if (Ptr) {
1663 A.getOrCreateAAFor<AAAddressSpace>(IRP: IRPosition::value(V: *Ptr));
1664 A.getOrCreateAAFor<AANoAliasAddrSpace>(IRP: IRPosition::value(V: *Ptr));
1665 if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: Ptr)) {
1666 if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
1667 A.getOrCreateAAFor<AAAlign>(IRP: IRPosition::value(V: *Ptr));
1668 }
1669 }
1670 }
1671 }
1672
1673 return A.run() == ChangeStatus::CHANGED;
1674}
1675} // namespace
1676
1677PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
1678 ModuleAnalysisManager &AM) {
1679
1680 FunctionAnalysisManager &FAM =
1681 AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
1682 AnalysisGetter AG(FAM);
1683
1684 // TODO: Probably preserves CFG
1685 return runImpl(M, AG, TM, Options, LTOPhase) ? PreservedAnalyses::none()
1686 : PreservedAnalyses::all();
1687}
1688