1//===- AMDGPUAttributor.cpp -----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AMDGPU.h"
14#include "AMDGPUTargetMachine.h"
15#include "GCNSubtarget.h"
16#include "Utils/AMDGPUBaseInfo.h"
17#include "llvm/IR/IntrinsicsAMDGPU.h"
18#include "llvm/IR/IntrinsicsR600.h"
19#include "llvm/Target/TargetMachine.h"
20#include "llvm/Transforms/IPO/Attributor.h"
21#include <cstdint>
22
23#define DEBUG_TYPE "amdgpu-attributor"
24
25using namespace llvm;
26
// Command-line option "amdgpu-indirect-call-specialization-threshold": an
// unsigned value initialised to 3. Per its description it controls whether an
// indirect call will be specialized; the code that reads it is not in this part
// of the file.
27static cl::opt<unsigned> IndirectCallSpecializationThreshold(
28 "amdgpu-indirect-call-specialization-threshold",
29 cl::desc(
30 "A threshold controls whether an indirect call will be specialized"),
31 cl::init(Val: 3));
32
// X-macro, first expansion: every AMDGPU_ATTRIBUTE(Name, Str) entry pulled in
// from AMDGPUAttributes.def becomes an enumerator `Name_POS`.
// NOTE(review): the macro is defined again below without a visible #undef;
// presumably AMDGPUAttributes.def undefines it at its end -- confirm.
33#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
34
// Bit position of each implicit-argument attribute. LAST_ARG_POS follows the
// generated enumerators, so (assuming the .def file contains only
// AMDGPU_ATTRIBUTE entries) it equals the number of entries.
35enum ImplicitArgumentPositions {
36#include "AMDGPUAttributes.def"
37 LAST_ARG_POS
38};
39
// X-macro, second expansion: every entry becomes a one-hot enumerator
// `Name = 1 << Name_POS`.
40#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
41
// Bit-mask form of the implicit-argument attributes, plus the special values
// used as return codes by intrinsicToAttrMask().
42enum ImplicitArgumentMask {
// Returned for intrinsics that intrinsicToAttrMask() does not recognise.
43 UNKNOWN_INTRINSIC = 0,
44#include "AMDGPUAttributes.def"
// Every bit below LAST_ARG_POS set.
45 ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1,
// Sentinel (ALL_ARGUMENT_MASK + 1, so not a combination of attribute bits)
// returned for recognised intrinsics that need no implicit input.
46 NOT_IMPLICIT_INPUT
47};
48
// X-macro, third expansion: every entry becomes a `{Name, Str}` initializer,
// pairing the mask bit with the string given in the .def entry.
49#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
// Table of (mask bit, attribute string) pairs. AAAMDAttributesFunction walks it
// to test for existing function attributes and to create the attributes it
// manifests.
50static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
51 ImplicitAttrs[] = {
52#include "AMDGPUAttributes.def"
53};
54
55// We do not need to note the x workitem or workgroup id because they are always
56// initialized.
57//
58// TODO: We should not add the attributes if the known compile time workgroup
59// size is 1 for y/z.
60static ImplicitArgumentMask
61intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
62 bool HasApertureRegs, bool SupportsGetDoorBellID,
63 unsigned CodeObjectVersion) {
64 switch (ID) {
65 case Intrinsic::amdgcn_workitem_id_x:
66 NonKernelOnly = true;
67 return WORKITEM_ID_X;
68 case Intrinsic::amdgcn_workgroup_id_x:
69 NonKernelOnly = true;
70 return WORKGROUP_ID_X;
71 case Intrinsic::amdgcn_workitem_id_y:
72 case Intrinsic::r600_read_tidig_y:
73 return WORKITEM_ID_Y;
74 case Intrinsic::amdgcn_workitem_id_z:
75 case Intrinsic::r600_read_tidig_z:
76 return WORKITEM_ID_Z;
77 case Intrinsic::amdgcn_workgroup_id_y:
78 case Intrinsic::r600_read_tgid_y:
79 return WORKGROUP_ID_Y;
80 case Intrinsic::amdgcn_workgroup_id_z:
81 case Intrinsic::r600_read_tgid_z:
82 return WORKGROUP_ID_Z;
83 case Intrinsic::amdgcn_cluster_id_x:
84 NonKernelOnly = true;
85 return CLUSTER_ID_X;
86 case Intrinsic::amdgcn_cluster_id_y:
87 return CLUSTER_ID_Y;
88 case Intrinsic::amdgcn_cluster_id_z:
89 return CLUSTER_ID_Z;
90 case Intrinsic::amdgcn_lds_kernel_id:
91 return LDS_KERNEL_ID;
92 case Intrinsic::amdgcn_dispatch_ptr:
93 return DISPATCH_PTR;
94 case Intrinsic::amdgcn_dispatch_id:
95 return DISPATCH_ID;
96 case Intrinsic::amdgcn_implicitarg_ptr:
97 return IMPLICIT_ARG_PTR;
98 // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
99 // queue_ptr.
100 case Intrinsic::amdgcn_queue_ptr:
101 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
102 return QUEUE_PTR;
103 case Intrinsic::amdgcn_is_shared:
104 case Intrinsic::amdgcn_is_private:
105 if (HasApertureRegs)
106 return NOT_IMPLICIT_INPUT;
107 // Under V5, we need implicitarg_ptr + offsets to access private_base or
108 // shared_base. For pre-V5, however, need to access them through queue_ptr +
109 // offsets.
110 return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
111 : QUEUE_PTR;
112 case Intrinsic::amdgcn_wwm:
113 case Intrinsic::amdgcn_strict_wwm:
114 return WHOLE_WAVE_MODE;
115 case Intrinsic::trap:
116 case Intrinsic::debugtrap:
117 case Intrinsic::ubsantrap:
118 if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
119 return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
120 : QUEUE_PTR;
121 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
122 return QUEUE_PTR;
123 default:
124 return UNKNOWN_INTRINSIC;
125 }
126}
127
128static bool castRequiresQueuePtr(unsigned SrcAS) {
129 return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
130}
131
132static bool isDSAddress(const Constant *C) {
133 const GlobalValue *GV = dyn_cast<GlobalValue>(Val: C);
134 if (!GV)
135 return false;
136 unsigned AS = GV->getAddressSpace();
137 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
138}
139
140/// Returns true if sanitizer attributes are present on a function.
141static bool hasSanitizerAttributes(const Function &F) {
142 return F.hasFnAttribute(Kind: Attribute::SanitizeAddress) ||
143 F.hasFnAttribute(Kind: Attribute::SanitizeThread) ||
144 F.hasFnAttribute(Kind: Attribute::SanitizeMemory) ||
145 F.hasFnAttribute(Kind: Attribute::SanitizeHWAddress) ||
146 F.hasFnAttribute(Kind: Attribute::SanitizeMemTag);
147}
148
149namespace {
150class AMDGPUInformationCache : public InformationCache {
151public:
152 AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
153 BumpPtrAllocator &Allocator,
154 SetVector<Function *> *CGSCC, TargetMachine &TM)
155 : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
156 CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}
157
158 TargetMachine &TM;
159
160 enum ConstantStatus : uint8_t {
161 NONE = 0,
162 DS_GLOBAL = 1 << 0,
163 ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
164 ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
165 ADDR_SPACE_CAST_BOTH_TO_FLAT =
166 ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT,
167 CS_WORST = DS_GLOBAL | ADDR_SPACE_CAST_BOTH_TO_FLAT,
168 };
169
170 /// Check if the subtarget has aperture regs.
171 bool hasApertureRegs(Function &F) {
172 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
173 return ST.hasApertureRegs();
174 }
175
176 /// Check if the subtarget supports GetDoorbellID.
177 bool supportsGetDoorbellID(Function &F) {
178 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
179 return ST.supportsGetDoorbellID();
180 }
181
182 std::optional<std::pair<unsigned, unsigned>>
183 getFlatWorkGroupSizeAttr(const Function &F) const {
184 auto R = AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-flat-work-group-size");
185 if (!R)
186 return std::nullopt;
187 return std::make_pair(x&: R->first, y&: *(R->second));
188 }
189
190 std::pair<unsigned, unsigned>
191 getDefaultFlatWorkGroupSize(const Function &F) const {
192 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
193 return ST.getDefaultFlatWorkGroupSize(CC: F.getCallingConv());
194 }
195
196 std::pair<unsigned, unsigned>
197 getMaximumFlatWorkGroupRange(const Function &F) {
198 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
199 return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
200 }
201
202 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
203 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
204 return ST.getMaxNumWorkGroups(F);
205 }
206
207 /// Get code object version.
208 unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
209
210 std::optional<std::pair<unsigned, unsigned>>
211 getWavesPerEUAttr(const Function &F) {
212 auto Val = AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu",
213 /*OnlyFirstRequired=*/true);
214 if (!Val)
215 return std::nullopt;
216 if (!Val->second) {
217 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
218 Val->second = ST.getMaxWavesPerEU();
219 }
220 return std::make_pair(x&: Val->first, y&: *(Val->second));
221 }
222
223 unsigned getMaxWavesPerEU(const Function &F) {
224 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
225 return ST.getMaxWavesPerEU();
226 }
227
228 unsigned getMaxAddrSpace() const override {
229 return AMDGPUAS::MAX_AMDGPU_ADDRESS;
230 }
231
232private:
233 /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
234 /// local to flat. These casts may require the queue pointer.
235 static uint8_t visitConstExpr(const ConstantExpr *CE) {
236 uint8_t Status = NONE;
237
238 if (CE->getOpcode() == Instruction::AddrSpaceCast) {
239 unsigned SrcAS = CE->getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
240 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
241 Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
242 else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
243 Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
244 }
245
246 return Status;
247 }
248
249 /// Get the constant access bitmap for \p C.
250 uint8_t getConstantAccess(const Constant *C) {
251 const auto &It = ConstantStatus.find(Val: C);
252 if (It != ConstantStatus.end())
253 return It->second.value();
254
255 SmallPtrSet<const Constant *, 8> Visited;
256 SmallVector<const Constant *> Worklist;
257 Worklist.push_back(Elt: C);
258 Visited.insert(Ptr: C);
259
260 uint8_t Result = 0;
261 while (Result != CS_WORST && !Worklist.empty()) {
262 const Constant *CurC = Worklist.pop_back_val();
263
264 std::optional<uint8_t> &CurCResultOrNone = ConstantStatus[CurC];
265 if (CurCResultOrNone) {
266 Result |= CurCResultOrNone.value();
267 continue;
268 }
269 uint8_t CurCResult = 0;
270
271 if (isDSAddress(C: CurC))
272 CurCResult |= DS_GLOBAL;
273
274 if (const auto *CE = dyn_cast<ConstantExpr>(Val: CurC))
275 CurCResult |= visitConstExpr(CE);
276
277 for (const Use &U : CurC->operands()) {
278 if (const auto *OpC = dyn_cast<Constant>(Val: U)) {
279 if (Visited.insert(Ptr: OpC).second)
280 Worklist.push_back(Elt: OpC);
281 }
282 }
283
284 CurCResultOrNone = CurCResult;
285 Result |= CurCResult;
286 }
287
288 ConstantStatus[C] = Result;
289 return Result;
290 }
291
292public:
293 /// Returns true if \p Fn needs the queue pointer because of \p C.
294 bool needsQueuePtr(const Constant *C, Function &Fn) {
295 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: Fn.getCallingConv());
296 bool HasAperture = hasApertureRegs(F&: Fn);
297
298 // No need to explore the constants.
299 if (!IsNonEntryFunc && HasAperture)
300 return false;
301
302 uint8_t Access = getConstantAccess(C);
303
304 // We need to trap on DS globals in non-entry functions.
305 if (IsNonEntryFunc && (Access & DS_GLOBAL))
306 return true;
307
308 return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
309 }
310
311 bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
312 uint8_t Access = getConstantAccess(C);
313 return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
314 }
315
316private:
317 /// Used to determine if the Constant needs the queue pointer.
318 DenseMap<const Constant *, std::optional<uint8_t>> ConstantStatus;
319 const unsigned CodeObjectVersion;
320};
321
322struct AAAMDAttributes
323 : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
324 AbstractAttribute> {
325 using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
326 AbstractAttribute>;
327
328 AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
329
330 /// Create an abstract attribute view for the position \p IRP.
331 static AAAMDAttributes &createForPosition(const IRPosition &IRP,
332 Attributor &A);
333
334 /// See AbstractAttribute::getName().
335 StringRef getName() const override { return "AAAMDAttributes"; }
336
337 /// See AbstractAttribute::getIdAddr().
338 const char *getIdAddr() const override { return &ID; }
339
340 /// This function should return true if the type of the \p AA is
341 /// AAAMDAttributes.
342 static bool classof(const AbstractAttribute *AA) {
343 return (AA->getIdAddr() == &ID);
344 }
345
346 /// Unique ID (due to the unique address)
347 static const char ID;
348};
349const char AAAMDAttributes::ID = 0;
350
351struct AAUniformWorkGroupSize
352 : public StateWrapper<BooleanState, AbstractAttribute> {
353 using Base = StateWrapper<BooleanState, AbstractAttribute>;
354 AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
355
356 /// Create an abstract attribute view for the position \p IRP.
357 static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
358 Attributor &A);
359
360 /// See AbstractAttribute::getName().
361 StringRef getName() const override { return "AAUniformWorkGroupSize"; }
362
363 /// See AbstractAttribute::getIdAddr().
364 const char *getIdAddr() const override { return &ID; }
365
366 /// This function should return true if the type of the \p AA is
367 /// AAAMDAttributes.
368 static bool classof(const AbstractAttribute *AA) {
369 return (AA->getIdAddr() == &ID);
370 }
371
372 /// Unique ID (due to the unique address)
373 static const char ID;
374};
375const char AAUniformWorkGroupSize::ID = 0;
376
377struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
378 AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
379 : AAUniformWorkGroupSize(IRP, A) {}
380
381 void initialize(Attributor &A) override {
382 Function *F = getAssociatedFunction();
383 CallingConv::ID CC = F->getCallingConv();
384
385 if (CC != CallingConv::AMDGPU_KERNEL)
386 return;
387
388 bool InitialValue = F->hasFnAttribute(Kind: "uniform-work-group-size");
389
390 if (InitialValue)
391 indicateOptimisticFixpoint();
392 else
393 indicatePessimisticFixpoint();
394 }
395
396 ChangeStatus updateImpl(Attributor &A) override {
397 ChangeStatus Change = ChangeStatus::UNCHANGED;
398
399 auto CheckCallSite = [&](AbstractCallSite CS) {
400 Function *Caller = CS.getInstruction()->getFunction();
401 LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
402 << "->" << getAssociatedFunction()->getName() << "\n");
403
404 const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
405 QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
406 if (!CallerInfo || !CallerInfo->isValidState())
407 return false;
408
409 Change = Change | clampStateAndIndicateChange(S&: this->getState(),
410 R: CallerInfo->getState());
411
412 return true;
413 };
414
415 bool AllCallSitesKnown = true;
416 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this, RequireAllCallSites: true, UsedAssumedInformation&: AllCallSitesKnown))
417 return indicatePessimisticFixpoint();
418
419 return Change;
420 }
421
422 ChangeStatus manifest(Attributor &A) override {
423 if (!getAssumed())
424 return ChangeStatus::UNCHANGED;
425
426 LLVMContext &Ctx = getAssociatedFunction()->getContext();
427 return A.manifestAttrs(IRP: getIRPosition(),
428 DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: "uniform-work-group-size")},
429 /*ForceReplace=*/true);
430 }
431
432 bool isValidState() const override {
433 // This state is always valid, even when the state is false.
434 return true;
435 }
436
437 const std::string getAsStr(Attributor *) const override {
438 return "AMDWorkGroupSize[" + std::to_string(val: getAssumed()) + "]";
439 }
440
441 /// See AbstractAttribute::trackStatistics()
442 void trackStatistics() const override {}
443};
444
445AAUniformWorkGroupSize &
446AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
447 Attributor &A) {
448 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
449 return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
450 llvm_unreachable(
451 "AAUniformWorkGroupSize is only valid for function position");
452}
453
// Function-position implementation of AAAMDAttributes.
//  - initialize() seeds the known bits from attributes already present on the
//    function (with a sanitizer override).
//  - updateImpl() removes assumed bits for implicit arguments that the
//    function's intrinsic calls, address space casts, constants or
//    implicitarg_ptr accesses require, and intersects with the callees' state.
//  - manifest() adds the attribute string of every bit that ended up known.
454struct AAAMDAttributesFunction : public AAAMDAttributes {
455 AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
456 : AAAMDAttributes(IRP, A) {}
457
// Seed the state from the function's existing attributes. Functions with a
// graphics calling convention that have a body go straight to the
// pessimistic fixpoint.
458 void initialize(Attributor &A) override {
459 Function *F = getAssociatedFunction();
460
461 // If the function requires the implicit arg pointer due to sanitizers,
462 // assume it's needed even if explicitly marked as not requiring it.
463 // Flat scratch initialization is needed because `asan_malloc_impl`
464 // calls introduced later in pipeline will have flat scratch accesses.
465 // FIXME: FLAT_SCRATCH_INIT will not be required here if device-libs
466 // implementation for `asan_malloc_impl` is updated.
467 const bool HasSanitizerAttrs = hasSanitizerAttributes(F: *F);
468 if (HasSanitizerAttrs) {
469 removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
470 removeAssumedBits(BitsEncoding: HOSTCALL_PTR);
471 removeAssumedBits(BitsEncoding: FLAT_SCRATCH_INIT);
472 }
473
// Every attribute string already on the function becomes a known bit, except
// the three bits removed above when sanitizer attributes are present.
474 for (auto Attr : ImplicitAttrs) {
475 if (HasSanitizerAttrs &&
476 (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR ||
477 Attr.first == FLAT_SCRATCH_INIT))
478 continue;
479
480 if (F->hasFnAttribute(Kind: Attr.second))
481 addKnownBits(Bits: Attr.first);
482 }
483
484 if (F->isDeclaration())
485 return;
486
487 // Ignore functions with graphics calling conventions, these are currently
488 // not allowed to have kernel arguments.
489 if (AMDGPU::isGraphics(CC: F->getCallingConv())) {
490 indicatePessimisticFixpoint();
491 return;
492 }
493 }
494
// One fixpoint iteration. Returns CHANGED iff the assumed bit set differs from
// its value on entry; gives up (pessimistic fixpoint) when call edges are
// unknown, a callee's attribute is unavailable or invalid, or an unrecognised
// intrinsic lacks the nocallback attribute.
495 ChangeStatus updateImpl(Attributor &A) override {
496 Function *F = getAssociatedFunction();
497 // The current assumed state used to determine a change.
498 auto OrigAssumed = getAssumed();
499
500 // Check for Intrinsics and propagate attributes.
501 const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
502 QueryingAA: *this, IRP: this->getIRPosition(), DepClass: DepClassTy::REQUIRED);
503 if (!AAEdges || !AAEdges->isValidState() ||
504 AAEdges->hasNonAsmUnknownCallee())
505 return indicatePessimisticFixpoint();
506
507 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
508
509 bool NeedsImplicit = false;
510 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
511 bool HasApertureRegs = InfoCache.hasApertureRegs(F&: *F);
512 bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(F&: *F);
513 unsigned COV = InfoCache.getCodeObjectVersion();
514
515 for (Function *Callee : AAEdges->getOptimisticEdges()) {
516 Intrinsic::ID IID = Callee->getIntrinsicID();
// Ordinary callee: intersect our state with the callee's state.
517 if (IID == Intrinsic::not_intrinsic) {
518 const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
519 QueryingAA: *this, IRP: IRPosition::function(F: *Callee), DepClass: DepClassTy::REQUIRED);
520 if (!AAAMD || !AAAMD->isValidState())
521 return indicatePessimisticFixpoint();
522 *this &= *AAAMD;
523 continue;
524 }
525
526 bool NonKernelOnly = false;
527 ImplicitArgumentMask AttrMask =
528 intrinsicToAttrMask(ID: IID, NonKernelOnly, NeedsImplicit,
529 HasApertureRegs, SupportsGetDoorBellID: SupportsGetDoorbellID, CodeObjectVersion: COV);
530
531 if (AttrMask == UNKNOWN_INTRINSIC) {
532 // Assume not-nocallback intrinsics may invoke a function which accesses
533 // implicit arguments.
534 //
535 // FIXME: This isn't really the correct check. We want to ensure it
536 // isn't calling any function that may use implicit arguments regardless
537 // of whether it's internal to the module or not.
538 //
539 // TODO: Ignoring callsite attributes.
540 if (!Callee->hasFnAttribute(Kind: Attribute::NoCallback))
541 return indicatePessimisticFixpoint();
542 continue;
543 }
544
// Bits flagged NonKernelOnly are dropped only in non-entry functions.
545 if (AttrMask != NOT_IMPLICIT_INPUT) {
546 if ((IsNonEntryFunc || !NonKernelOnly))
547 removeAssumedBits(BitsEncoding: AttrMask);
548 }
549 }
550
551 // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
552 if (NeedsImplicit)
553 removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
554
555 if (isAssumed(BitsEncoding: QUEUE_PTR) && checkForQueuePtr(A)) {
556 // Under V5, we need implicitarg_ptr + offsets to access private_base or
557 // shared_base. We do not actually need queue_ptr.
558 if (COV >= 5)
559 removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
560 else
561 removeAssumedBits(BitsEncoding: QUEUE_PTR);
562 }
563
// The checks below look at accesses made through the implicitarg_ptr
// intrinsic; the asserts state that IMPLICIT_ARG_PTR must already have been
// removed whenever such an access is found.
564 if (funcRetrievesMultigridSyncArg(A, COV)) {
565 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
566 "multigrid_sync_arg needs implicitarg_ptr");
567 removeAssumedBits(BitsEncoding: MULTIGRID_SYNC_ARG);
568 }
569
570 if (funcRetrievesHostcallPtr(A, COV)) {
571 assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
572 removeAssumedBits(BitsEncoding: HOSTCALL_PTR);
573 }
574
575 if (funcRetrievesHeapPtr(A, COV)) {
576 assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
577 removeAssumedBits(BitsEncoding: HEAP_PTR);
578 }
579
580 if (isAssumed(BitsEncoding: QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
581 assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
582 removeAssumedBits(BitsEncoding: QUEUE_PTR);
583 }
584
585 if (isAssumed(BitsEncoding: LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
586 removeAssumedBits(BitsEncoding: LDS_KERNEL_ID);
587 }
588
589 if (isAssumed(BitsEncoding: DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
590 removeAssumedBits(BitsEncoding: DEFAULT_QUEUE);
591
592 if (isAssumed(BitsEncoding: COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
593 removeAssumedBits(BitsEncoding: COMPLETION_ACTION);
594
595 if (isAssumed(BitsEncoding: FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
596 removeAssumedBits(BitsEncoding: FLAT_SCRATCH_INIT);
597
598 return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
599 : ChangeStatus::UNCHANGED;
600 }
601
// Add the attribute string of every known bit to the function, replacing any
// existing attribute of the same kind.
602 ChangeStatus manifest(Attributor &A) override {
603 SmallVector<Attribute, 8> AttrList;
604 LLVMContext &Ctx = getAssociatedFunction()->getContext();
605
606 for (auto Attr : ImplicitAttrs) {
607 if (isKnown(BitsEncoding: Attr.first))
608 AttrList.push_back(Elt: Attribute::get(Context&: Ctx, Kind: Attr.second));
609 }
610
611 return A.manifestAttrs(IRP: getIRPosition(), DeducedAttrs: AttrList,
612 /* ForceReplace */ true);
613 }
614
// Debug string: "AMDInfo[" followed by the attribute string of each assumed
// bit (space separated) and " ]".
615 const std::string getAsStr(Attributor *) const override {
616 std::string Str;
617 raw_string_ostream OS(Str);
618 OS << "AMDInfo[";
619 for (auto Attr : ImplicitAttrs)
620 if (isAssumed(BitsEncoding: Attr.first))
621 OS << ' ' << Attr.second;
622 OS << " ]";
623 return OS.str();
624 }
625
626 /// See AbstractAttribute::trackStatistics()
627 void trackStatistics() const override {}
628
629private:
// Returns true if the function needs the queue pointer, either because of an
// addrspacecast instruction from local/private (only checked without aperture
// registers) or because of a constant operand for which
// AMDGPUInformationCache::needsQueuePtr() says so.
630 bool checkForQueuePtr(Attributor &A) {
631 Function *F = getAssociatedFunction();
632 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
633
634 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
635
636 bool NeedsQueuePtr = false;
637
// Predicate for checkForAllInstructions: records a hit in NeedsQueuePtr and
// returns false to stop the walk at the first offending cast.
638 auto CheckAddrSpaceCasts = [&](Instruction &I) {
639 unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
640 if (castRequiresQueuePtr(SrcAS)) {
641 NeedsQueuePtr = true;
642 return false;
643 }
644 return true;
645 };
646
647 bool HasApertureRegs = InfoCache.hasApertureRegs(F&: *F);
648
649 // `checkForAllInstructions` is much cheaper than going through all
650 // instructions, try it first.
651
652 // The queue pointer is not needed if aperture regs is present.
653 if (!HasApertureRegs) {
654 bool UsedAssumedInformation = false;
655 A.checkForAllInstructions(Pred: CheckAddrSpaceCasts, QueryingAA: *this,
656 Opcodes: {Instruction::AddrSpaceCast},
657 UsedAssumedInformation);
658 }
659
660 // If we found that we need the queue pointer, nothing else to do.
661 if (NeedsQueuePtr)
662 return true;
663
// Entry function with aperture registers: needsQueuePtr() would return false
// for every constant, so skip the scan.
664 if (!IsNonEntryFunc && HasApertureRegs)
665 return false;
666
// Scan every constant operand of every instruction.
667 for (BasicBlock &BB : *F) {
668 for (Instruction &I : BB) {
669 for (const Use &U : I.operands()) {
670 if (const auto *C = dyn_cast<Constant>(Val: U)) {
671 if (InfoCache.needsQueuePtr(C, Fn&: *F))
672 return true;
673 }
674 }
675 }
676 }
677
678 return false;
679 }
680
// The funcRetrieves* helpers below each build the 8-byte range of one implicit
// kernel argument and ask funcRetrievesImplicitKernelArg() whether the
// function may access it. The first four take the offset from a
// code-object-version dependent helper.
681 bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
682 auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
683 AA::RangeTy Range(Pos, 8);
684 return funcRetrievesImplicitKernelArg(A, Range);
685 }
686
687 bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
688 auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
689 AA::RangeTy Range(Pos, 8);
690 return funcRetrievesImplicitKernelArg(A, Range);
691 }
692
693 bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
694 auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
695 AA::RangeTy Range(Pos, 8);
696 return funcRetrievesImplicitKernelArg(A, Range);
697 }
698
699 bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
700 auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
701 AA::RangeTy Range(Pos, 8);
702 return funcRetrievesImplicitKernelArg(A, Range);
703 }
704
// Always false before code object V5; otherwise checks the fixed
// HEAP_PTR_OFFSET range.
705 bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
706 if (COV < 5)
707 return false;
708 AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
709 return funcRetrievesImplicitKernelArg(A, Range);
710 }
711
// Always false before code object V5; otherwise checks the fixed
// QUEUE_PTR_OFFSET range.
712 bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
713 if (COV < 5)
714 return false;
715 AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
716 return funcRetrievesImplicitKernelArg(A, Range);
717 }
718
// Returns true if the function may access the bytes in Range relative to the
// pointer returned by the implicitarg_ptr intrinsic.
719 bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
720 // Check every call to the implicitarg_ptr intrinsic and whether its
721 // result is used to access the implicit kernel argument at \p Range.
722 // The argument counts as unused only if every access that interferes
723 // with \p Range is made by a droppable instruction. We check this by
724 // querying AAPointerInfo for the value returned by each such call; if
725 // that information is unavailable or invalid we assume the worst.
726 auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
727 auto &Call = cast<CallBase>(Val&: I);
728 if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
729 return true;
730
731 const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
732 QueryingAA: *this, IRP: IRPosition::callsite_returned(CB: Call), DepClass: DepClassTy::REQUIRED);
733 if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
734 return false;
735
736 return PointerInfoAA->forallInterferingAccesses(
737 Range, CB: [](const AAPointerInfo::Access &Acc, bool IsExact) {
738 return Acc.getRemoteInst()->isDroppable();
739 });
740 };
741
// The predicate answers "does not retrieve", so the result is negated.
742 bool UsedAssumedInformation = false;
743 return !A.checkForAllCallLikeInstructions(Pred: DoesNotLeadToKernelArgLoc, QueryingAA: *this,
744 UsedAssumedInformation);
745 }
746
// Returns true unless checkForAllCallLikeInstructions confirms that no
// call-like instruction in the function calls the lds_kernel_id intrinsic.
747 bool funcRetrievesLDSKernelId(Attributor &A) {
748 auto DoesNotRetrieve = [&](Instruction &I) {
749 auto &Call = cast<CallBase>(Val&: I);
750 return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
751 };
752 bool UsedAssumedInformation = false;
753 return !A.checkForAllCallLikeInstructions(Pred: DoesNotRetrieve, QueryingAA: *this,
754 UsedAssumedInformation);
755 }
756
757 // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
758 // not to be set.
759 bool needFlatScratchInit(Attributor &A) {
760 assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
761
762 // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
763 // there is a cast from PRIVATE_ADDRESS.
764 auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
765 return cast<AddrSpaceCastInst>(Val&: I).getSrcAddressSpace() !=
766 AMDGPUAS::PRIVATE_ADDRESS;
767 };
768
769 bool UsedAssumedInformation = false;
770 if (!A.checkForAllInstructions(Pred: AddrSpaceCastNotFromPrivate, QueryingAA: *this,
771 Opcodes: {Instruction::AddrSpaceCast},
772 UsedAssumedInformation))
773 return true;
774
775 // Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
776 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
777
778 Function *F = getAssociatedFunction();
779 for (Instruction &I : instructions(F)) {
780 for (const Use &U : I.operands()) {
781 if (const auto *C = dyn_cast<Constant>(Val: U)) {
782 if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
783 return true;
784 }
785 }
786 }
787
788 // Finally check callees.
789
790 // This is called on each callee; false means callee shouldn't have
791 // no-flat-scratch-init.
792 auto CheckForNoFlatScratchInit = [&](Instruction &I) {
793 const auto &CB = cast<CallBase>(Val&: I);
794 const Function *Callee = CB.getCalledFunction();
795
796 // Callee == 0 for inline asm or indirect call with known callees.
797 // In the latter case, updateImpl() already checked the callees and we
798 // know their FLAT_SCRATCH_INIT bit is set.
799 // If function has indirect call with unknown callees, the bit is
800 // already removed in updateImpl() and execution won't reach here.
801 if (!Callee)
802 return true;
803
// For direct calls, only the addrspacecast_nonnull intrinsic is treated as
// requiring flat scratch init here.
804 return Callee->getIntrinsicID() !=
805 Intrinsic::amdgcn_addrspacecast_nonnull;
806 };
807
808 UsedAssumedInformation = false;
809 // If any callee is false (i.e. need FlatScratchInit),
810 // checkForAllCallLikeInstructions returns false, in which case this
811 // function returns true.
812 return !A.checkForAllCallLikeInstructions(Pred: CheckForNoFlatScratchInit, QueryingAA: *this,
813 UsedAssumedInformation);
814 }
815};
816
817AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
818 Attributor &A) {
819 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
820 return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
821 llvm_unreachable("AAAMDAttributes is only valid for function position");
822}
823
824/// Base class to derive different size ranges.
825struct AAAMDSizeRangeAttribute
826 : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
827 using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
828
829 StringRef AttrName;
830
831 AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
832 StringRef AttrName)
833 : Base(IRP, 32), AttrName(AttrName) {}
834
835 /// See AbstractAttribute::trackStatistics()
836 void trackStatistics() const override {}
837
838 template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
839 ChangeStatus Change = ChangeStatus::UNCHANGED;
840
841 auto CheckCallSite = [&](AbstractCallSite CS) {
842 Function *Caller = CS.getInstruction()->getFunction();
843 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
844 << "->" << getAssociatedFunction()->getName() << '\n');
845
846 const auto *CallerInfo = A.getAAFor<AttributeImpl>(
847 *this, IRPosition::function(F: *Caller), DepClassTy::REQUIRED);
848 if (!CallerInfo || !CallerInfo->isValidState())
849 return false;
850
851 Change |=
852 clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
853
854 return true;
855 };
856
857 bool AllCallSitesKnown = true;
858 if (!A.checkForAllCallSites(CheckCallSite, *this,
859 /*RequireAllCallSites=*/true,
860 AllCallSitesKnown))
861 return indicatePessimisticFixpoint();
862
863 return Change;
864 }
865
866 /// Clamp the assumed range to the default value ([Min, Max]) and emit the
867 /// attribute if it is not same as default.
868 ChangeStatus
869 emitAttributeIfNotDefaultAfterClamp(Attributor &A,
870 std::pair<unsigned, unsigned> Default) {
871 auto [Min, Max] = Default;
872 unsigned Lower = getAssumed().getLower().getZExtValue();
873 unsigned Upper = getAssumed().getUpper().getZExtValue();
874
875 // Clamp the range to the default value.
876 if (Lower < Min)
877 Lower = Min;
878 if (Upper > Max + 1)
879 Upper = Max + 1;
880
881 // No manifest if the value is invalid or same as default after clamp.
882 if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
883 return ChangeStatus::UNCHANGED;
884
885 Function *F = getAssociatedFunction();
886 LLVMContext &Ctx = F->getContext();
887 SmallString<10> Buffer;
888 raw_svector_ostream OS(Buffer);
889 OS << Lower << ',' << Upper - 1;
890 return A.manifestAttrs(IRP: getIRPosition(),
891 DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: AttrName, Val: OS.str())},
892 /*ForceReplace=*/true);
893 }
894
895 const std::string getAsStr(Attributor *) const override {
896 std::string Str;
897 raw_string_ostream OS(Str);
898 OS << getName() << '[';
899 OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
900 OS << ']';
901 return OS.str();
902 }
903};
904
905/// Propagate amdgpu-flat-work-group-size attribute.
906struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
907 AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
908 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}
909
910 void initialize(Attributor &A) override {
911 Function *F = getAssociatedFunction();
912 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
913
914 bool HasAttr = false;
915 auto Range = InfoCache.getDefaultFlatWorkGroupSize(F: *F);
916 auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(F: *F);
917
918 if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(F: *F)) {
919 // We only consider an attribute that is not max range because the front
920 // end always emits the attribute, unfortunately, and sometimes it emits
921 // the max range.
922 if (*Attr != MaxRange) {
923 Range = *Attr;
924 HasAttr = true;
925 }
926 }
927
928 // We don't want to directly clamp the state if it's the max range because
929 // that is basically the worst state.
930 if (Range == MaxRange)
931 return;
932
933 auto [Min, Max] = Range;
934 ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
935 IntegerRangeState IRS(CR);
936 clampStateAndIndicateChange(S&: this->getState(), R: IRS);
937
938 if (HasAttr || AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
939 indicateOptimisticFixpoint();
940 }
941
942 ChangeStatus updateImpl(Attributor &A) override {
943 return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
944 }
945
946 /// Create an abstract attribute view for the position \p IRP.
947 static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
948 Attributor &A);
949
950 ChangeStatus manifest(Attributor &A) override {
951 Function *F = getAssociatedFunction();
952 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
953 return emitAttributeIfNotDefaultAfterClamp(
954 A, Default: InfoCache.getMaximumFlatWorkGroupRange(F: *F));
955 }
956
957 /// See AbstractAttribute::getName()
958 StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }
959
960 /// See AbstractAttribute::getIdAddr()
961 const char *getIdAddr() const override { return &ID; }
962
963 /// This function should return true if the type of the \p AA is
964 /// AAAMDFlatWorkGroupSize
965 static bool classof(const AbstractAttribute *AA) {
966 return (AA->getIdAddr() == &ID);
967 }
968
969 /// Unique ID (due to the unique address)
970 static const char ID;
971};
972
973const char AAAMDFlatWorkGroupSize::ID = 0;
974
975AAAMDFlatWorkGroupSize &
976AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
977 Attributor &A) {
978 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
979 return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
980 llvm_unreachable(
981 "AAAMDFlatWorkGroupSize is only valid for function position");
982}
983
984struct TupleDecIntegerRangeState : public AbstractState {
985 DecIntegerState<uint32_t> X, Y, Z;
986
987 bool isValidState() const override {
988 return X.isValidState() && Y.isValidState() && Z.isValidState();
989 }
990
991 bool isAtFixpoint() const override {
992 return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
993 }
994
995 ChangeStatus indicateOptimisticFixpoint() override {
996 return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
997 Z.indicateOptimisticFixpoint();
998 }
999
1000 ChangeStatus indicatePessimisticFixpoint() override {
1001 return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
1002 Z.indicatePessimisticFixpoint();
1003 }
1004
1005 TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
1006 X ^= Other.X;
1007 Y ^= Other.Y;
1008 Z ^= Other.Z;
1009 return *this;
1010 }
1011
1012 bool operator==(const TupleDecIntegerRangeState &Other) const {
1013 return X == Other.X && Y == Other.Y && Z == Other.Z;
1014 }
1015
1016 TupleDecIntegerRangeState &getAssumed() { return *this; }
1017 const TupleDecIntegerRangeState &getAssumed() const { return *this; }
1018};
1019
/// Names a StateWrapper specialisation over TupleDecIntegerRangeState and
/// AbstractAttribute with an additional uint32_t template argument.
/// NOTE(review): AAAMDMaxNumWorkgroups below derives from the two-argument
/// StateWrapper form rather than from this alias; confirm the alias is still
/// needed.
using AAAMDMaxNumWorkgroupsState =
    StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
1022
1023/// Propagate amdgpu-max-num-workgroups attribute.
1024struct AAAMDMaxNumWorkgroups
1025 : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
1026 using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
1027
1028 AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1029
1030 void initialize(Attributor &A) override {
1031 Function *F = getAssociatedFunction();
1032 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1033
1034 SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(F: *F);
1035
1036 X.takeKnownMinimum(Value: MaxNumWorkgroups[0]);
1037 Y.takeKnownMinimum(Value: MaxNumWorkgroups[1]);
1038 Z.takeKnownMinimum(Value: MaxNumWorkgroups[2]);
1039
1040 if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
1041 indicatePessimisticFixpoint();
1042 }
1043
1044 ChangeStatus updateImpl(Attributor &A) override {
1045 ChangeStatus Change = ChangeStatus::UNCHANGED;
1046
1047 auto CheckCallSite = [&](AbstractCallSite CS) {
1048 Function *Caller = CS.getInstruction()->getFunction();
1049 LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
1050 << "->" << getAssociatedFunction()->getName() << '\n');
1051
1052 const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
1053 QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
1054 if (!CallerInfo || !CallerInfo->isValidState())
1055 return false;
1056
1057 Change |=
1058 clampStateAndIndicateChange(S&: this->getState(), R: CallerInfo->getState());
1059 return true;
1060 };
1061
1062 bool AllCallSitesKnown = true;
1063 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this,
1064 /*RequireAllCallSites=*/true,
1065 UsedAssumedInformation&: AllCallSitesKnown))
1066 return indicatePessimisticFixpoint();
1067
1068 return Change;
1069 }
1070
1071 /// Create an abstract attribute view for the position \p IRP.
1072 static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
1073 Attributor &A);
1074
1075 ChangeStatus manifest(Attributor &A) override {
1076 Function *F = getAssociatedFunction();
1077 LLVMContext &Ctx = F->getContext();
1078 SmallString<32> Buffer;
1079 raw_svector_ostream OS(Buffer);
1080 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();
1081
1082 // TODO: Should annotate loads of the group size for this to do anything
1083 // useful.
1084 return A.manifestAttrs(
1085 IRP: getIRPosition(),
1086 DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: "amdgpu-max-num-workgroups", Val: OS.str())},
1087 /* ForceReplace= */ true);
1088 }
1089
1090 StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }
1091
1092 const std::string getAsStr(Attributor *) const override {
1093 std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
1094 raw_string_ostream OS(Buffer);
1095 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
1096 << ']';
1097 return OS.str();
1098 }
1099
1100 const char *getIdAddr() const override { return &ID; }
1101
1102 /// This function should return true if the type of the \p AA is
1103 /// AAAMDMaxNumWorkgroups
1104 static bool classof(const AbstractAttribute *AA) {
1105 return (AA->getIdAddr() == &ID);
1106 }
1107
1108 void trackStatistics() const override {}
1109
1110 /// Unique ID (due to the unique address)
1111 static const char ID;
1112};
1113
1114const char AAAMDMaxNumWorkgroups::ID = 0;
1115
1116AAAMDMaxNumWorkgroups &
1117AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
1118 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1119 return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
1120 llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
1121}
1122
1123/// Propagate amdgpu-waves-per-eu attribute.
1124struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1125 AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
1126 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}
1127
1128 void initialize(Attributor &A) override {
1129 Function *F = getAssociatedFunction();
1130 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1131
1132 // If the attribute exists, we will honor it if it is not the default.
1133 if (auto Attr = InfoCache.getWavesPerEUAttr(F: *F)) {
1134 std::pair<unsigned, unsigned> MaxWavesPerEURange{
1135 1U, InfoCache.getMaxWavesPerEU(F: *F)};
1136 if (*Attr != MaxWavesPerEURange) {
1137 auto [Min, Max] = *Attr;
1138 ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
1139 IntegerRangeState RangeState(Range);
1140 this->getState() = RangeState;
1141 indicateOptimisticFixpoint();
1142 return;
1143 }
1144 }
1145
1146 if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
1147 indicatePessimisticFixpoint();
1148 }
1149
1150 ChangeStatus updateImpl(Attributor &A) override {
1151 ChangeStatus Change = ChangeStatus::UNCHANGED;
1152
1153 auto CheckCallSite = [&](AbstractCallSite CS) {
1154 Function *Caller = CS.getInstruction()->getFunction();
1155 Function *Func = getAssociatedFunction();
1156 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
1157 << "->" << Func->getName() << '\n');
1158 (void)Func;
1159
1160 const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
1161 QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
1162 if (!CallerAA || !CallerAA->isValidState())
1163 return false;
1164
1165 ConstantRange Assumed = getAssumed();
1166 unsigned Min = std::max(a: Assumed.getLower().getZExtValue(),
1167 b: CallerAA->getAssumed().getLower().getZExtValue());
1168 unsigned Max = std::max(a: Assumed.getUpper().getZExtValue(),
1169 b: CallerAA->getAssumed().getUpper().getZExtValue());
1170 ConstantRange Range(APInt(32, Min), APInt(32, Max));
1171 IntegerRangeState RangeState(Range);
1172 getState() = RangeState;
1173 Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
1174 : ChangeStatus::CHANGED;
1175
1176 return true;
1177 };
1178
1179 bool AllCallSitesKnown = true;
1180 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this, RequireAllCallSites: true, UsedAssumedInformation&: AllCallSitesKnown))
1181 return indicatePessimisticFixpoint();
1182
1183 return Change;
1184 }
1185
1186 /// Create an abstract attribute view for the position \p IRP.
1187 static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
1188 Attributor &A);
1189
1190 ChangeStatus manifest(Attributor &A) override {
1191 Function *F = getAssociatedFunction();
1192 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1193 return emitAttributeIfNotDefaultAfterClamp(
1194 A, Default: {1U, InfoCache.getMaxWavesPerEU(F: *F)});
1195 }
1196
1197 /// See AbstractAttribute::getName()
1198 StringRef getName() const override { return "AAAMDWavesPerEU"; }
1199
1200 /// See AbstractAttribute::getIdAddr()
1201 const char *getIdAddr() const override { return &ID; }
1202
1203 /// This function should return true if the type of the \p AA is
1204 /// AAAMDWavesPerEU
1205 static bool classof(const AbstractAttribute *AA) {
1206 return (AA->getIdAddr() == &ID);
1207 }
1208
1209 /// Unique ID (due to the unique address)
1210 static const char ID;
1211};
1212
1213const char AAAMDWavesPerEU::ID = 0;
1214
1215AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
1216 Attributor &A) {
1217 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1218 return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
1219 llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
1220}
1221
/// Compute the minimum number of AGPRs required to allocate the inline asm.
///
/// Walks the parsed constraints of \p IA and, for each constraint code:
///  - a code starting with "a" is treated as a virtual AGPR operand whose
///    register count is the operand type's size in bits divided by 32,
///    rounded up;
///  - any other code is parsed as a physical register reference, and when it
///    names an 'a' register the highest referenced end index is tracked.
/// Virtual operands are accumulated separately for defs (outputs) and uses
/// (inputs and early-clobber outputs). The result is the larger of the two
/// totals plus the physical-register bound, capped at 256.
///
/// \param IA   The inline assembly being called.
/// \param Call The call instruction invoking \p IA; supplies the result and
///             argument types.
static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
                                             const CallBase &Call) {
  // Index of the next call argument / struct result element to consume.
  unsigned ArgNo = 0;
  unsigned ResNo = 0;
  // Running totals of virtual AGPRs needed for definitions and for uses.
  unsigned AGPRDefCount = 0;
  unsigned AGPRUseCount = 0;
  // One past the highest physical AGPR index referenced (capped at 256).
  unsigned MaxPhysReg = 0;
  const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout();

  // TODO: Overestimates due to not accounting for tied operands
  for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
    // Type of the operand this constraint describes; stays null for clobbers.
    Type *Ty = nullptr;
    switch (CI.Type) {
    case InlineAsm::isOutput: {
      // With a struct return type, each output maps to one element in order.
      Ty = Call.getType();
      if (auto *STy = dyn_cast<StructType>(Val: Ty))
        Ty = STy->getElementType(N: ResNo);
      ++ResNo;
      break;
    }
    case InlineAsm::isInput: {
      Ty = Call.getArgOperand(i: ArgNo++)->getType();
      break;
    }
    case InlineAsm::isLabel:
      // Labels are skipped entirely.
      continue;
    case InlineAsm::isClobber:
      // Parse the physical register reference.
      break;
    }

    for (StringRef Code : CI.Codes) {
      unsigned RegCount = 0;
      if (Code.starts_with(Prefix: "a")) {
        // Virtual register, compute number of registers based on the type.
        //
        // We ought to be going through TargetLowering to get the number of
        // registers, but we should avoid the dependence on CodeGen here.
        RegCount = divideCeil(Numerator: DL.getTypeSizeInBits(Ty), Denominator: 32);
      } else {
        // Physical register reference
        auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint: Code);
        if (Kind == 'a') {
          // NOTE(review): RegCount is assigned here but the `continue` below
          // skips every later use of it in this iteration.
          RegCount = NumRegs;
          MaxPhysReg = std::max(a: MaxPhysReg, b: std::min(a: RegIdx + NumRegs, b: 256u));
        }

        // Physical references never contribute to the virtual counts.
        continue;
      }

      if (CI.Type == InlineAsm::isOutput) {
        // Apply tuple alignment requirement
        //
        // TODO: This is more conservative than necessary.
        AGPRDefCount = alignTo(Value: AGPRDefCount, Align: RegCount);

        AGPRDefCount += RegCount;
        // An early-clobber output is additionally counted as a use.
        if (CI.isEarlyClobber) {
          AGPRUseCount = alignTo(Value: AGPRUseCount, Align: RegCount);
          AGPRUseCount += RegCount;
        }
      } else {
        AGPRUseCount = alignTo(Value: AGPRUseCount, Align: RegCount);
        AGPRUseCount += RegCount;
      }
    }
  }

  // The larger of the use and def totals bounds the virtual register need.
  unsigned MaxVirtReg = std::max(a: AGPRUseCount, b: AGPRDefCount);

  // TODO: This is overly conservative. If there are any physical registers,
  // allocate any virtual registers after them so we don't have to solve optimal
  // packing.
  return std::min(a: MaxVirtReg + MaxPhysReg, b: 256u);
}
1298
/// Abstract attribute that deduces the value of the "amdgpu-agpr-alloc"
/// function attribute by inspecting every call-like instruction in the
/// function (inline asm, selected intrinsics, and callees).
struct AAAMDGPUMinAGPRAlloc
    : public StateWrapper<DecIntegerState<>, AbstractAttribute> {
  using Base = StateWrapper<DecIntegerState<>, AbstractAttribute>;
  AAAMDGPUMinAGPRAlloc(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP. Only function
  /// positions are supported.
  static AAAMDGPUMinAGPRAlloc &createForPosition(const IRPosition &IRP,
                                                 Attributor &A) {
    if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
      return *new (A.Allocator) AAAMDGPUMinAGPRAlloc(IRP, A);
    llvm_unreachable(
        "AAAMDGPUMinAGPRAlloc is only valid for function position");
  }

  /// Read any existing "amdgpu-agpr-alloc" attribute. A first value of 0
  /// settles the state optimistically; otherwise functions with sanitizer
  /// attributes are settled pessimistically.
  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // Only the first value of the pair is inspected below.
    auto [MinNumAGPR, MaxNumAGPR] =
        AMDGPU::getIntegerPairAttribute(F: *F, Name: "amdgpu-agpr-alloc", Default: {~0u, ~0u},
                                        /*OnlyFirstRequired=*/true);
    if (MinNumAGPR == 0) {
      indicateOptimisticFixpoint();
      return;
    }

    if (hasSanitizerAttributes(F: *F))
      indicatePessimisticFixpoint();
  }

  /// Render the state as "amdgpu-agpr-alloc=<assumed value>".
  const std::string getAsStr(Attributor *A) const override {
    std::string Str = "amdgpu-agpr-alloc=";
    raw_string_ostream OS(Str);
    OS << getAssumed();
    return OS.str();
  }

  void trackStatistics() const override {}

  /// Accumulate, over all call-like instructions, the largest AGPR
  /// requirement into a local state and clamp this attribute's state with it.
  /// Falls back to the pessimistic fixpoint if any instruction cannot be
  /// handled.
  ChangeStatus updateImpl(Attributor &A) override {
    // Collects the requirement of every instruction visited below.
    DecIntegerState<> Maximum;

    // Check for cases which require allocation of AGPRs. The only cases where
    // AGPRs are required are if there are direct references to AGPRs, so inline
    // assembly and special intrinsics.
    auto CheckForMinAGPRAllocs = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(Val&: I);
      const Value *CalleeOp = CB.getCalledOperand();

      if (const InlineAsm *IA = dyn_cast<InlineAsm>(Val: CalleeOp)) {
        // Technically, the inline asm could be invoking a call to an unknown
        // external function that requires AGPRs, but ignore that.
        unsigned NumRegs = inlineAsmGetNumRequiredAGPRs(IA, Call: CB);
        Maximum.takeAssumedMaximum(Value: NumRegs);
        return true;
      }
      switch (CB.getIntrinsicID()) {
      case Intrinsic::not_intrinsic:
        // Not an intrinsic: handled by the callee analysis after the switch.
        break;
      case Intrinsic::write_register:
      case Intrinsic::read_register:
      case Intrinsic::read_volatile_register: {
        // The register name is the first operand of the metadata node passed
        // as the first call argument.
        const MDString *RegName = cast<MDString>(
            Val: cast<MDNode>(
                Val: cast<MetadataAsValue>(Val: CB.getArgOperand(i: 0))->getMetadata())
                ->getOperand(I: 0));
        auto [Kind, RegIdx, NumRegs] =
            AMDGPU::parseAsmPhysRegName(TupleString: RegName->getString());
        if (Kind == 'a')
          Maximum.takeAssumedMaximum(Value: std::min(a: RegIdx + NumRegs, b: 256u));

        return true;
      }
      // Trap-like intrinsics such as llvm.trap and llvm.debugtrap do not have
      // the nocallback attribute, so the AMDGPU attributor can conservatively
      // drop all implicitly-known inputs and AGPR allocation information. Make
      // sure we still infer that no implicit inputs are required and that the
      // AGPR allocation stays at zero. Trap-like intrinsics may invoke a
      // function which requires AGPRs, so we need to check if the called
      // function has the "trap-func-name" attribute.
      case Intrinsic::trap:
      case Intrinsic::debugtrap:
      case Intrinsic::ubsantrap:
        return CB.hasFnAttr(Kind: Attribute::NoCallback) ||
               !CB.hasFnAttr(Kind: "trap-func-name");
      default:
        // Some intrinsics may use AGPRs, but if we have a choice, we are not
        // required to use AGPRs.
        // Assume !nocallback intrinsics may call a function which requires
        // AGPRs.
        return CB.hasFnAttr(Kind: Attribute::NoCallback);
      }

      // TODO: Handle callsite attributes
      auto *CBEdges = A.getAAFor<AACallEdges>(
          QueryingAA: *this, IRP: IRPosition::callsite_function(CB), DepClass: DepClassTy::REQUIRED);
      // Without call-edge information, or with an unknown callee, give up.
      if (!CBEdges || CBEdges->hasUnknownCallee()) {
        Maximum.indicatePessimisticFixpoint();
        return false;
      }

      // Fold in the requirement of every possible callee.
      for (const Function *PossibleCallee : CBEdges->getOptimisticEdges()) {
        const auto *CalleeInfo = A.getAAFor<AAAMDGPUMinAGPRAlloc>(
            QueryingAA: *this, IRP: IRPosition::function(F: *PossibleCallee), DepClass: DepClassTy::REQUIRED);
        if (!CalleeInfo || !CalleeInfo->isValidState()) {
          Maximum.indicatePessimisticFixpoint();
          return false;
        }

        Maximum.takeAssumedMaximum(Value: CalleeInfo->getAssumed());
      }

      return true;
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallLikeInstructions(Pred: CheckForMinAGPRAllocs, QueryingAA: *this,
                                           UsedAssumedInformation))
      return indicatePessimisticFixpoint();

    return clampStateAndIndicateChange(S&: getState(), R: Maximum);
  }

  /// Write the assumed value as the "amdgpu-agpr-alloc" attribute.
  ChangeStatus manifest(Attributor &A) override {
    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    SmallString<4> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed();

    return A.manifestAttrs(
        IRP: getIRPosition(), DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: "amdgpu-agpr-alloc", Val: OS.str())});
  }

  StringRef getName() const override { return "AAAMDGPUMinAGPRAlloc"; }
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUMinAGPRAllocs
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDGPUMinAGPRAlloc::ID = 0;
1442
1443/// An abstract attribute to propagate the function attribute
1444/// "amdgpu-cluster-dims" from kernel entry functions to device functions.
1445struct AAAMDGPUClusterDims
1446 : public StateWrapper<BooleanState, AbstractAttribute> {
1447 using Base = StateWrapper<BooleanState, AbstractAttribute>;
1448 AAAMDGPUClusterDims(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1449
1450 /// Create an abstract attribute view for the position \p IRP.
1451 static AAAMDGPUClusterDims &createForPosition(const IRPosition &IRP,
1452 Attributor &A);
1453
1454 /// See AbstractAttribute::getName().
1455 StringRef getName() const override { return "AAAMDGPUClusterDims"; }
1456
1457 /// See AbstractAttribute::getIdAddr().
1458 const char *getIdAddr() const override { return &ID; }
1459
1460 /// This function should return true if the type of the \p AA is
1461 /// AAAMDGPUClusterDims.
1462 static bool classof(const AbstractAttribute *AA) {
1463 return AA->getIdAddr() == &ID;
1464 }
1465
1466 virtual const AMDGPU::ClusterDimsAttr &getClusterDims() const = 0;
1467
1468 /// Unique ID (due to the unique address)
1469 static const char ID;
1470};
1471
1472const char AAAMDGPUClusterDims::ID = 0;
1473
1474struct AAAMDGPUClusterDimsFunction : public AAAMDGPUClusterDims {
1475 AAAMDGPUClusterDimsFunction(const IRPosition &IRP, Attributor &A)
1476 : AAAMDGPUClusterDims(IRP, A) {}
1477
1478 void initialize(Attributor &A) override {
1479 Function *F = getAssociatedFunction();
1480 assert(F && "empty associated function");
1481
1482 Attr = AMDGPU::ClusterDimsAttr::get(F: *F);
1483
1484 // No matter what a kernel function has, it is final.
1485 if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv())) {
1486 if (Attr.isUnknown())
1487 indicatePessimisticFixpoint();
1488 else
1489 indicateOptimisticFixpoint();
1490 }
1491 }
1492
1493 const std::string getAsStr(Attributor *A) const override {
1494 if (!getAssumed() || Attr.isUnknown())
1495 return "unknown";
1496 if (Attr.isNoCluster())
1497 return "no";
1498 if (Attr.isVariableDims())
1499 return "variable";
1500 return Attr.to_string();
1501 }
1502
1503 void trackStatistics() const override {}
1504
1505 ChangeStatus updateImpl(Attributor &A) override {
1506 auto OldState = Attr;
1507
1508 auto CheckCallSite = [&](AbstractCallSite CS) {
1509 const auto *CallerAA = A.getAAFor<AAAMDGPUClusterDims>(
1510 QueryingAA: *this, IRP: IRPosition::function(F: *CS.getInstruction()->getFunction()),
1511 DepClass: DepClassTy::REQUIRED);
1512 if (!CallerAA || !CallerAA->isValidState())
1513 return false;
1514
1515 return merge(Other: CallerAA->getClusterDims());
1516 };
1517
1518 bool UsedAssumedInformation = false;
1519 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this,
1520 /*RequireAllCallSites=*/true,
1521 UsedAssumedInformation))
1522 return indicatePessimisticFixpoint();
1523
1524 return OldState == Attr ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
1525 }
1526
1527 ChangeStatus manifest(Attributor &A) override {
1528 if (Attr.isUnknown())
1529 return ChangeStatus::UNCHANGED;
1530 return A.manifestAttrs(
1531 IRP: getIRPosition(),
1532 DeducedAttrs: {Attribute::get(Context&: getAssociatedFunction()->getContext(), Kind: AttrName,
1533 Val: Attr.to_string())},
1534 /*ForceReplace=*/true);
1535 }
1536
1537 const AMDGPU::ClusterDimsAttr &getClusterDims() const override {
1538 return Attr;
1539 }
1540
1541private:
1542 bool merge(const AMDGPU::ClusterDimsAttr &Other) {
1543 // Case 1: Both of them are unknown yet, we do nothing and continue wait for
1544 // propagation.
1545 if (Attr.isUnknown() && Other.isUnknown())
1546 return true;
1547
1548 // Case 2: The other is determined, but we are unknown yet, we simply take
1549 // the other's value.
1550 if (Attr.isUnknown()) {
1551 Attr = Other;
1552 return true;
1553 }
1554
1555 // Case 3: We are determined but the other is unknown yet, we simply keep
1556 // everything unchanged.
1557 if (Other.isUnknown())
1558 return true;
1559
1560 // After this point, both are determined.
1561
1562 // Case 4: If they are same, we do nothing.
1563 if (Attr == Other)
1564 return true;
1565
1566 // Now they are not same.
1567
1568 // Case 5: If either of us uses cluster (but not both; otherwise case 4
1569 // would hold), then it is unknown whether cluster will be used, and the
1570 // state is final, unlike case 1.
1571 if (Attr.isNoCluster() || Other.isNoCluster()) {
1572 Attr.setUnknown();
1573 return false;
1574 }
1575
1576 // Case 6: Both of us use cluster, but the dims are different, so the result
1577 // is, cluster is used, but we just don't have a fixed dims.
1578 Attr.setVariableDims();
1579 return true;
1580 }
1581
1582 AMDGPU::ClusterDimsAttr Attr;
1583
1584 static constexpr char AttrName[] = "amdgpu-cluster-dims";
1585};
1586
1587AAAMDGPUClusterDims &
1588AAAMDGPUClusterDims::createForPosition(const IRPosition &IRP, Attributor &A) {
1589 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1590 return *new (A.Allocator) AAAMDGPUClusterDimsFunction(IRP, A);
1591 llvm_unreachable("AAAMDGPUClusterDims is only valid for function position");
1592}
1593
/// Shared driver for the module and CGSCC attributor passes: configures an
/// Attributor over \p Functions, seeds the AMDGPU abstract attributes, and
/// runs it.
///
/// \param Functions    Functions handed to the Attributor and iterated to seed
///                     abstract attributes.
/// \param IsModulePass Copied into the Attributor configuration.
/// \param DeleteFns    Copied into the Attributor configuration.
/// \param M            Module passed to the information cache.
/// \param AG           Analysis getter passed to the information cache.
/// \param TM           Target machine passed to the information cache and used
///                     to look up each function's GCNSubtarget.
/// \param Options      Only IsClosedWorld is read here.
/// \param LTOPhase     Only used for the debug banner.
/// \returns true if the Attributor run reported a change.
static bool runImpl(SetVector<Function *> &Functions, bool IsModulePass,
                    bool DeleteFns, Module &M, AnalysisGetter &AG,
                    TargetMachine &TM, AMDGPUAttributorOptions Options,
                    ThinOrFullLTOPhase LTOPhase) {

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
  // IDs of the abstract attributes handed to the Attributor as AC.Allowed.
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID,
       &AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID,
       &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
       &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
       &AAAMDGPUClusterDims::ID, &AAAlign::ID});

  AttributorConfig AC(CGUpdater);
  AC.IsClosedWorldModule = Options.IsClosedWorld;
  AC.Allowed = &Allowed;
  AC.IsModulePass = IsModulePass;
  AC.DeleteFns = DeleteFns;
  AC.DefaultInitializeLiveInternals = false;
  // Answers true only for non-entry callees, and only while the number of
  // assumed callees does not exceed the command-line threshold.
  AC.IndirectCalleeSpecializationCallback =
      [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
         Function &Callee, unsigned NumAssumedCallees) {
        return !AMDGPU::isEntryFunctionCC(CC: Callee.getCallingConv()) &&
               (NumAssumedCallees <= IndirectCallSpecializationThreshold);
      };
  // Answers true only for functions with the AMDGPU_KERNEL calling convention.
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  LLVM_DEBUG({
    StringRef LTOPhaseStr = to_string(LTOPhase);
    dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
           << "[AMDGPUAttributor] Module " << M.getName() << " is "
           << (AC.IsClosedWorldModule ? "" : "not ")
           << "assumed to be a closed world.\n";
  });

  // Seed the abstract attributes for every function.
  for (auto *F : Functions) {
    A.getOrCreateAAFor<AAAMDAttributes>(IRP: IRPosition::function(F: *F));
    A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRP: IRPosition::function(F: *F));
    A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRP: IRPosition::function(F: *F));
    CallingConv::ID CC = F->getCallingConv();
    // Flat work group size and waves-per-EU are only seeded for non-entry
    // functions.
    if (!AMDGPU::isEntryFunctionCC(CC)) {
      A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRP: IRPosition::function(F: *F));
      A.getOrCreateAAFor<AAAMDWavesPerEU>(IRP: IRPosition::function(F: *F));
    }

    // The remaining function-level attributes depend on subtarget features.
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F: *F);
    if (!F->isDeclaration() && ST.hasClusters())
      A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRP: IRPosition::function(F: *F));

    if (ST.hasGFX90AInsts())
      A.getOrCreateAAFor<AAAMDGPUMinAGPRAlloc>(IRP: IRPosition::function(F: *F));

    // Seed address-space attributes for the pointer operand of every load,
    // store, atomicrmw and cmpxchg.
    for (auto &I : instructions(F)) {
      Value *Ptr = nullptr;
      if (auto *LI = dyn_cast<LoadInst>(Val: &I))
        Ptr = LI->getPointerOperand();
      else if (auto *SI = dyn_cast<StoreInst>(Val: &I))
        Ptr = SI->getPointerOperand();
      else if (auto *RMW = dyn_cast<AtomicRMWInst>(Val: &I))
        Ptr = RMW->getPointerOperand();
      else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Val: &I))
        Ptr = CmpX->getPointerOperand();

      if (Ptr) {
        A.getOrCreateAAFor<AAAddressSpace>(IRP: IRPosition::value(V: *Ptr));
        A.getOrCreateAAFor<AANoAliasAddrSpace>(IRP: IRPosition::value(V: *Ptr));
        // Alignment is additionally seeded when the pointer is produced by
        // the amdgcn_make_buffer_rsrc intrinsic.
        if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: Ptr)) {
          if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
            A.getOrCreateAAFor<AAAlign>(IRP: IRPosition::value(V: *Ptr));
        }
      }
    }
  }

  return A.run() == ChangeStatus::CHANGED;
}
1678} // namespace
1679
1680PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
1681 ModuleAnalysisManager &AM) {
1682
1683 FunctionAnalysisManager &FAM =
1684 AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
1685 AnalysisGetter AG(FAM);
1686
1687 SetVector<Function *> Functions;
1688 for (Function &F : M) {
1689 if (!F.isDeclaration())
1690 Functions.insert(X: &F);
1691 }
1692
1693 // TODO: Probably preserves CFG
1694 return runImpl(Functions, /*IsModulePass=*/true, /*DeleteFns=*/true, M, AG,
1695 TM, Options, LTOPhase)
1696 ? PreservedAnalyses::none()
1697 : PreservedAnalyses::all();
1698}
1699
1700PreservedAnalyses llvm::AMDGPUAttributorCGSCCPass::run(LazyCallGraph::SCC &C,
1701 CGSCCAnalysisManager &AM,
1702 LazyCallGraph &CG,
1703 CGSCCUpdateResult &UR) {
1704
1705 FunctionAnalysisManager &FAM =
1706 AM.getResult<FunctionAnalysisManagerCGSCCProxy>(IR&: C, ExtraArgs&: CG).getManager();
1707 AnalysisGetter AG(FAM);
1708
1709 SetVector<Function *> Functions;
1710 for (LazyCallGraph::Node &N : C) {
1711 Function *F = &N.getFunction();
1712 if (!F->isIntrinsic())
1713 Functions.insert(X: F);
1714 }
1715
1716 AMDGPUAttributorOptions Options;
1717 Module *M = C.begin()->getFunction().getParent();
1718 // In the CGSCC pipeline, avoid untracked call graph modifications by
1719 // disabling function deletion, mirroring the generic AttributorCGSCCPass.
1720 return runImpl(Functions, /*IsModulePass=*/false, /*DeleteFns=*/false, M&: *M, AG,
1721 TM, Options, LTOPhase: ThinOrFullLTOPhase::None)
1722 ? PreservedAnalyses::none()
1723 : PreservedAnalyses::all();
1724}
1725