1//===- AMDGPUAttributor.cpp -----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AMDGPU.h"
14#include "GCNSubtarget.h"
15#include "Utils/AMDGPUBaseInfo.h"
16#include "llvm/Analysis/CycleAnalysis.h"
17#include "llvm/CodeGen/TargetPassConfig.h"
18#include "llvm/IR/IntrinsicsAMDGPU.h"
19#include "llvm/IR/IntrinsicsR600.h"
20#include "llvm/InitializePasses.h"
21#include "llvm/Target/TargetMachine.h"
22#include "llvm/Transforms/IPO/Attributor.h"
23
24#define DEBUG_TYPE "amdgpu-attributor"
25
26using namespace llvm;
27
28static cl::opt<unsigned> IndirectCallSpecializationThreshold(
29 "amdgpu-indirect-call-specialization-threshold",
30 cl::desc(
31 "A threshold controls whether an indirect call will be specialized"),
32 cl::init(Val: 3));
33
34#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
35
36enum ImplicitArgumentPositions {
37#include "AMDGPUAttributes.def"
38 LAST_ARG_POS
39};
40
41#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
42
43enum ImplicitArgumentMask {
44 NOT_IMPLICIT_INPUT = 0,
45#include "AMDGPUAttributes.def"
46 ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
47};
48
49#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
50static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
51 ImplicitAttrs[] = {
52#include "AMDGPUAttributes.def"
53};
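// For illustration: assuming AMDGPUAttributes.def contains an entry such as
//   AMDGPU_ATTRIBUTE(DISPATCH_PTR, "amdgpu-no-dispatch-ptr")
// the expansions above assign DISPATCH_PTR its own bit (1 << DISPATCH_PTR_POS)
// in ImplicitArgumentMask and pair that bit with the corresponding
// "amdgpu-no-*" attribute string in the ImplicitAttrs table.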
54
55// We do not need to note the x workitem or workgroup id because they are always
56// initialized.
57//
// TODO: We should not add the attributes if the known compile-time workgroup
// size is 1 for y/z.
60static ImplicitArgumentMask
61intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
62 bool HasApertureRegs, bool SupportsGetDoorBellID,
63 unsigned CodeObjectVersion) {
64 switch (ID) {
65 case Intrinsic::amdgcn_workitem_id_x:
66 NonKernelOnly = true;
67 return WORKITEM_ID_X;
68 case Intrinsic::amdgcn_workgroup_id_x:
69 NonKernelOnly = true;
70 return WORKGROUP_ID_X;
71 case Intrinsic::amdgcn_workitem_id_y:
72 case Intrinsic::r600_read_tidig_y:
73 return WORKITEM_ID_Y;
74 case Intrinsic::amdgcn_workitem_id_z:
75 case Intrinsic::r600_read_tidig_z:
76 return WORKITEM_ID_Z;
77 case Intrinsic::amdgcn_workgroup_id_y:
78 case Intrinsic::r600_read_tgid_y:
79 return WORKGROUP_ID_Y;
80 case Intrinsic::amdgcn_workgroup_id_z:
81 case Intrinsic::r600_read_tgid_z:
82 return WORKGROUP_ID_Z;
83 case Intrinsic::amdgcn_lds_kernel_id:
84 return LDS_KERNEL_ID;
85 case Intrinsic::amdgcn_dispatch_ptr:
86 return DISPATCH_PTR;
87 case Intrinsic::amdgcn_dispatch_id:
88 return DISPATCH_ID;
89 case Intrinsic::amdgcn_implicitarg_ptr:
90 return IMPLICIT_ARG_PTR;
91 // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
92 // queue_ptr.
93 case Intrinsic::amdgcn_queue_ptr:
94 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
95 return QUEUE_PTR;
96 case Intrinsic::amdgcn_is_shared:
97 case Intrinsic::amdgcn_is_private:
98 if (HasApertureRegs)
99 return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. Pre-V5, however, we need to access them through queue_ptr +
    // offsets.
103 return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
104 : QUEUE_PTR;
105 case Intrinsic::trap:
106 case Intrinsic::debugtrap:
107 case Intrinsic::ubsantrap:
108 if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
109 return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
110 : QUEUE_PTR;
111 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
112 return QUEUE_PTR;
113 default:
114 return NOT_IMPLICIT_INPUT;
115 }
116}
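// For example, for a function that calls llvm.amdgcn.queue.ptr under code
// object V5, this returns QUEUE_PTR and also sets NeedsImplicit, since on V5
// the queue pointer itself is reached through the implicit argument pointer.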
117
118static bool castRequiresQueuePtr(unsigned SrcAS) {
119 return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
120}
121
122static bool isDSAddress(const Constant *C) {
123 const GlobalValue *GV = dyn_cast<GlobalValue>(Val: C);
124 if (!GV)
125 return false;
126 unsigned AS = GV->getAddressSpace();
127 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
128}
129
130/// Returns true if the function requires the implicit argument be passed
131/// regardless of the function contents.
132static bool funcRequiresHostcallPtr(const Function &F) {
133 // Sanitizers require the hostcall buffer passed in the implicit arguments.
134 return F.hasFnAttribute(Kind: Attribute::SanitizeAddress) ||
135 F.hasFnAttribute(Kind: Attribute::SanitizeThread) ||
136 F.hasFnAttribute(Kind: Attribute::SanitizeMemory) ||
137 F.hasFnAttribute(Kind: Attribute::SanitizeHWAddress) ||
138 F.hasFnAttribute(Kind: Attribute::SanitizeMemTag);
139}
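// For example, a function built with -fsanitize=address carries the
// sanitize_address attribute, so the hostcall buffer pointer must be kept in
// the implicit arguments even if nothing in the body appears to use it.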
140
141namespace {
142class AMDGPUInformationCache : public InformationCache {
143public:
144 AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
145 BumpPtrAllocator &Allocator,
146 SetVector<Function *> *CGSCC, TargetMachine &TM)
147 : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
148 CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}
149
150 TargetMachine &TM;
151
152 enum ConstantStatus : uint8_t {
153 NONE = 0,
154 DS_GLOBAL = 1 << 0,
155 ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
156 ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
157 ADDR_SPACE_CAST_BOTH_TO_FLAT =
158 ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
159 };
160
161 /// Check if the subtarget has aperture regs.
162 bool hasApertureRegs(Function &F) {
163 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
164 return ST.hasApertureRegs();
165 }
166
167 /// Check if the subtarget supports GetDoorbellID.
168 bool supportsGetDoorbellID(Function &F) {
169 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
170 return ST.supportsGetDoorbellID();
171 }
172
173 std::optional<std::pair<unsigned, unsigned>>
174 getFlatWorkGroupSizeAttr(const Function &F) const {
175 auto R = AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-flat-work-group-size");
176 if (!R)
177 return std::nullopt;
178 return std::make_pair(x&: R->first, y&: *(R->second));
179 }
180
181 std::pair<unsigned, unsigned>
182 getDefaultFlatWorkGroupSize(const Function &F) const {
183 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
184 return ST.getDefaultFlatWorkGroupSize(CC: F.getCallingConv());
185 }
186
187 std::pair<unsigned, unsigned>
188 getMaximumFlatWorkGroupRange(const Function &F) {
189 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
190 return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
191 }
192
193 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
194 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
195 return ST.getMaxNumWorkGroups(F);
196 }
197
198 /// Get code object version.
199 unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
200
201 /// Get the effective value of "amdgpu-waves-per-eu" for the function,
202 /// accounting for the interaction with the passed value to use for
203 /// "amdgpu-flat-work-group-size".
204 std::pair<unsigned, unsigned>
205 getWavesPerEU(const Function &F,
206 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
207 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
208 return ST.getWavesPerEU(FlatWorkGroupSizes: FlatWorkGroupSize, LDSBytes: getLDSSize(F), F);
209 }
210
211 std::optional<std::pair<unsigned, unsigned>>
212 getWavesPerEUAttr(const Function &F) {
213 auto Val = AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu",
214 /*OnlyFirstRequired=*/true);
215 if (!Val)
216 return std::nullopt;
217 if (!Val->second) {
218 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
219 Val->second = ST.getMaxWavesPerEU();
220 }
221 return std::make_pair(x&: Val->first, y&: *(Val->second));
222 }
223
224 std::pair<unsigned, unsigned>
225 getEffectiveWavesPerEU(const Function &F,
226 std::pair<unsigned, unsigned> WavesPerEU,
227 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
228 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
229 return ST.getEffectiveWavesPerEU(RequestedWavesPerEU: WavesPerEU, FlatWorkGroupSizes: FlatWorkGroupSize,
230 LDSBytes: getLDSSize(F));
231 }
232
233 unsigned getMaxWavesPerEU(const Function &F) {
234 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
235 return ST.getMaxWavesPerEU();
236 }
237
238private:
239 /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
240 /// local to flat. These casts may require the queue pointer.
241 static uint8_t visitConstExpr(const ConstantExpr *CE) {
242 uint8_t Status = NONE;
243
244 if (CE->getOpcode() == Instruction::AddrSpaceCast) {
245 unsigned SrcAS = CE->getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
246 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
247 Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
248 else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
249 Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
250 }
251
252 return Status;
253 }
254
255 /// Returns the minimum amount of LDS space used by a workgroup running
256 /// function \p F.
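  /// For example, a function annotated with "amdgpu-lds-size"="1024" (or
  /// "1024,4096") is treated as using at least 1024 bytes of LDS; only the
  /// first value of the pair is used here.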
257 static unsigned getLDSSize(const Function &F) {
258 return AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-lds-size",
259 Default: {0, UINT32_MAX}, OnlyFirstRequired: true)
260 .first;
261 }
262
263 /// Get the constant access bitmap for \p C.
264 uint8_t getConstantAccess(const Constant *C,
265 SmallPtrSetImpl<const Constant *> &Visited) {
266 auto It = ConstantStatus.find(Val: C);
267 if (It != ConstantStatus.end())
268 return It->second;
269
270 uint8_t Result = 0;
271 if (isDSAddress(C))
272 Result = DS_GLOBAL;
273
274 if (const auto *CE = dyn_cast<ConstantExpr>(Val: C))
275 Result |= visitConstExpr(CE);
276
277 for (const Use &U : C->operands()) {
278 const auto *OpC = dyn_cast<Constant>(Val: U);
279 if (!OpC || !Visited.insert(Ptr: OpC).second)
280 continue;
281
282 Result |= getConstantAccess(C: OpC, Visited);
283 }
284 return Result;
285 }
286
287public:
288 /// Returns true if \p Fn needs the queue pointer because of \p C.
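  /// For illustration: in a non-entry function, a constant expression such as
  ///   addrspacecast (ptr addrspace(3) @lds_var to ptr)
  /// references a DS global (which must trap) and therefore needs the queue
  /// pointer; private-to-flat or local-to-flat casts only need it when
  /// aperture registers are unavailable.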
289 bool needsQueuePtr(const Constant *C, Function &Fn) {
290 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: Fn.getCallingConv());
291 bool HasAperture = hasApertureRegs(F&: Fn);
292
293 // No need to explore the constants.
294 if (!IsNonEntryFunc && HasAperture)
295 return false;
296
297 SmallPtrSet<const Constant *, 8> Visited;
298 uint8_t Access = getConstantAccess(C, Visited);
299
300 // We need to trap on DS globals in non-entry functions.
301 if (IsNonEntryFunc && (Access & DS_GLOBAL))
302 return true;
303
304 return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
305 }
306
307 bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
308 SmallPtrSet<const Constant *, 8> Visited;
309 uint8_t Access = getConstantAccess(C, Visited);
310 return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
311 }
312
313private:
314 /// Used to determine if the Constant needs the queue pointer.
315 DenseMap<const Constant *, uint8_t> ConstantStatus;
316 const unsigned CodeObjectVersion;
317};
318
319struct AAAMDAttributes
320 : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
321 AbstractAttribute> {
322 using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
323 AbstractAttribute>;
324
325 AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
326
327 /// Create an abstract attribute view for the position \p IRP.
328 static AAAMDAttributes &createForPosition(const IRPosition &IRP,
329 Attributor &A);
330
331 /// See AbstractAttribute::getName().
332 StringRef getName() const override { return "AAAMDAttributes"; }
333
334 /// See AbstractAttribute::getIdAddr().
335 const char *getIdAddr() const override { return &ID; }
336
337 /// This function should return true if the type of the \p AA is
338 /// AAAMDAttributes.
339 static bool classof(const AbstractAttribute *AA) {
340 return (AA->getIdAddr() == &ID);
341 }
342
343 /// Unique ID (due to the unique address)
344 static const char ID;
345};
346const char AAAMDAttributes::ID = 0;
347
348struct AAUniformWorkGroupSize
349 : public StateWrapper<BooleanState, AbstractAttribute> {
350 using Base = StateWrapper<BooleanState, AbstractAttribute>;
351 AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
352
353 /// Create an abstract attribute view for the position \p IRP.
354 static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
355 Attributor &A);
356
357 /// See AbstractAttribute::getName().
358 StringRef getName() const override { return "AAUniformWorkGroupSize"; }
359
360 /// See AbstractAttribute::getIdAddr().
361 const char *getIdAddr() const override { return &ID; }
362
  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
365 static bool classof(const AbstractAttribute *AA) {
366 return (AA->getIdAddr() == &ID);
367 }
368
369 /// Unique ID (due to the unique address)
370 static const char ID;
371};
372const char AAUniformWorkGroupSize::ID = 0;
373
374struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
375 AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
376 : AAUniformWorkGroupSize(IRP, A) {}
377
378 void initialize(Attributor &A) override {
379 Function *F = getAssociatedFunction();
380 CallingConv::ID CC = F->getCallingConv();
381
382 if (CC != CallingConv::AMDGPU_KERNEL)
383 return;
384
385 bool InitialValue = false;
386 if (F->hasFnAttribute(Kind: "uniform-work-group-size"))
387 InitialValue =
388 F->getFnAttribute(Kind: "uniform-work-group-size").getValueAsString() ==
389 "true";
390
391 if (InitialValue)
392 indicateOptimisticFixpoint();
393 else
394 indicatePessimisticFixpoint();
395 }
396
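  // A function's uniform-work-group-size state is the meet over all of its
  // callers: e.g. if any caller has assumed "false", the clamp below drives
  // this function's state to "false" as well, and unknown call sites force the
  // pessimistic fixpoint.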
397 ChangeStatus updateImpl(Attributor &A) override {
398 ChangeStatus Change = ChangeStatus::UNCHANGED;
399
400 auto CheckCallSite = [&](AbstractCallSite CS) {
401 Function *Caller = CS.getInstruction()->getFunction();
402 LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
403 << "->" << getAssociatedFunction()->getName() << "\n");
404
405 const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
406 QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
407 if (!CallerInfo || !CallerInfo->isValidState())
408 return false;
409
410 Change = Change | clampStateAndIndicateChange(S&: this->getState(),
411 R: CallerInfo->getState());
412
413 return true;
414 };
415
416 bool AllCallSitesKnown = true;
417 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this, RequireAllCallSites: true, UsedAssumedInformation&: AllCallSitesKnown))
418 return indicatePessimisticFixpoint();
419
420 return Change;
421 }
422
423 ChangeStatus manifest(Attributor &A) override {
424 SmallVector<Attribute, 8> AttrList;
425 LLVMContext &Ctx = getAssociatedFunction()->getContext();
426
427 AttrList.push_back(Elt: Attribute::get(Context&: Ctx, Kind: "uniform-work-group-size",
428 Val: getAssumed() ? "true" : "false"));
429 return A.manifestAttrs(IRP: getIRPosition(), DeducedAttrs: AttrList,
430 /* ForceReplace */ true);
431 }
432
433 bool isValidState() const override {
434 // This state is always valid, even when the state is false.
435 return true;
436 }
437
438 const std::string getAsStr(Attributor *) const override {
439 return "AMDWorkGroupSize[" + std::to_string(val: getAssumed()) + "]";
440 }
441
442 /// See AbstractAttribute::trackStatistics()
443 void trackStatistics() const override {}
444};
445
446AAUniformWorkGroupSize &
447AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
448 Attributor &A) {
449 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
450 return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
451 llvm_unreachable(
452 "AAUniformWorkGroupSize is only valid for function position");
453}
454
455struct AAAMDAttributesFunction : public AAAMDAttributes {
456 AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
457 : AAAMDAttributes(IRP, A) {}
458
459 void initialize(Attributor &A) override {
460 Function *F = getAssociatedFunction();
461
462 // If the function requires the implicit arg pointer due to sanitizers,
463 // assume it's needed even if explicitly marked as not requiring it.
464 const bool NeedsHostcall = funcRequiresHostcallPtr(F: *F);
465 if (NeedsHostcall) {
466 removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
467 removeAssumedBits(BitsEncoding: HOSTCALL_PTR);
468 }
469
470 for (auto Attr : ImplicitAttrs) {
471 if (NeedsHostcall &&
472 (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
473 continue;
474
475 if (F->hasFnAttribute(Kind: Attr.second))
476 addKnownBits(Bits: Attr.first);
477 }
478
479 if (F->isDeclaration())
480 return;
481
    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
484 if (AMDGPU::isGraphics(CC: F->getCallingConv())) {
485 indicatePessimisticFixpoint();
486 return;
487 }
488 }
489
490 ChangeStatus updateImpl(Attributor &A) override {
491 Function *F = getAssociatedFunction();
492 // The current assumed state used to determine a change.
493 auto OrigAssumed = getAssumed();
494
495 // Check for Intrinsics and propagate attributes.
496 const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
497 QueryingAA: *this, IRP: this->getIRPosition(), DepClass: DepClassTy::REQUIRED);
498 if (!AAEdges || !AAEdges->isValidState() ||
499 AAEdges->hasNonAsmUnknownCallee())
500 return indicatePessimisticFixpoint();
501
502 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
503
504 bool NeedsImplicit = false;
505 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
506 bool HasApertureRegs = InfoCache.hasApertureRegs(F&: *F);
507 bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(F&: *F);
508 unsigned COV = InfoCache.getCodeObjectVersion();
509
510 for (Function *Callee : AAEdges->getOptimisticEdges()) {
511 Intrinsic::ID IID = Callee->getIntrinsicID();
512 if (IID == Intrinsic::not_intrinsic) {
513 const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
514 QueryingAA: *this, IRP: IRPosition::function(F: *Callee), DepClass: DepClassTy::REQUIRED);
515 if (!AAAMD || !AAAMD->isValidState())
516 return indicatePessimisticFixpoint();
517 *this &= *AAAMD;
518 continue;
519 }
520
521 bool NonKernelOnly = false;
522 ImplicitArgumentMask AttrMask =
523 intrinsicToAttrMask(ID: IID, NonKernelOnly, NeedsImplicit,
524 HasApertureRegs, SupportsGetDoorBellID: SupportsGetDoorbellID, CodeObjectVersion: COV);
525 if (AttrMask != NOT_IMPLICIT_INPUT) {
526 if ((IsNonEntryFunc || !NonKernelOnly))
527 removeAssumedBits(BitsEncoding: AttrMask);
528 }
529 }
530
    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
532 if (NeedsImplicit)
533 removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
534
535 if (isAssumed(BitsEncoding: QUEUE_PTR) && checkForQueuePtr(A)) {
536 // Under V5, we need implicitarg_ptr + offsets to access private_base or
537 // shared_base. We do not actually need queue_ptr.
538 if (COV >= 5)
539 removeAssumedBits(BitsEncoding: IMPLICIT_ARG_PTR);
540 else
541 removeAssumedBits(BitsEncoding: QUEUE_PTR);
542 }
543
544 if (funcRetrievesMultigridSyncArg(A, COV)) {
545 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
546 "multigrid_sync_arg needs implicitarg_ptr");
547 removeAssumedBits(BitsEncoding: MULTIGRID_SYNC_ARG);
548 }
549
550 if (funcRetrievesHostcallPtr(A, COV)) {
551 assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
552 removeAssumedBits(BitsEncoding: HOSTCALL_PTR);
553 }
554
555 if (funcRetrievesHeapPtr(A, COV)) {
556 assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
557 removeAssumedBits(BitsEncoding: HEAP_PTR);
558 }
559
560 if (isAssumed(BitsEncoding: QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
561 assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
562 removeAssumedBits(BitsEncoding: QUEUE_PTR);
563 }
564
565 if (isAssumed(BitsEncoding: LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
566 removeAssumedBits(BitsEncoding: LDS_KERNEL_ID);
567 }
568
569 if (isAssumed(BitsEncoding: DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
570 removeAssumedBits(BitsEncoding: DEFAULT_QUEUE);
571
572 if (isAssumed(BitsEncoding: COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
573 removeAssumedBits(BitsEncoding: COMPLETION_ACTION);
574
575 if (isAssumed(BitsEncoding: FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
576 removeAssumedBits(BitsEncoding: FLAT_SCRATCH_INIT);
577
578 return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
579 : ChangeStatus::UNCHANGED;
580 }
581
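  // Manifest only the bits that are still known at the fixpoint; e.g. a kernel
  // whose (transitive) callees never touch any implicit input typically ends
  // up with the full set of "amdgpu-no-*" attributes such as
  // "amdgpu-no-dispatch-ptr" and "amdgpu-no-queue-ptr".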
582 ChangeStatus manifest(Attributor &A) override {
583 SmallVector<Attribute, 8> AttrList;
584 LLVMContext &Ctx = getAssociatedFunction()->getContext();
585
586 for (auto Attr : ImplicitAttrs) {
587 if (isKnown(BitsEncoding: Attr.first))
588 AttrList.push_back(Elt: Attribute::get(Context&: Ctx, Kind: Attr.second));
589 }
590
591 return A.manifestAttrs(IRP: getIRPosition(), DeducedAttrs: AttrList,
592 /* ForceReplace */ true);
593 }
594
595 const std::string getAsStr(Attributor *) const override {
596 std::string Str;
597 raw_string_ostream OS(Str);
598 OS << "AMDInfo[";
599 for (auto Attr : ImplicitAttrs)
600 if (isAssumed(BitsEncoding: Attr.first))
601 OS << ' ' << Attr.second;
602 OS << " ]";
603 return OS.str();
604 }
605
606 /// See AbstractAttribute::trackStatistics()
607 void trackStatistics() const override {}
608
609private:
610 bool checkForQueuePtr(Attributor &A) {
611 Function *F = getAssociatedFunction();
612 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
613
614 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
615
616 bool NeedsQueuePtr = false;
617
618 auto CheckAddrSpaceCasts = [&](Instruction &I) {
619 unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
620 if (castRequiresQueuePtr(SrcAS)) {
621 NeedsQueuePtr = true;
622 return false;
623 }
624 return true;
625 };
626
627 bool HasApertureRegs = InfoCache.hasApertureRegs(F&: *F);
628
    // `checkForAllInstructions` is much cheaper than walking all instructions
    // manually, so try it first.

    // The queue pointer is not needed if aperture registers are present.
633 if (!HasApertureRegs) {
634 bool UsedAssumedInformation = false;
635 A.checkForAllInstructions(Pred: CheckAddrSpaceCasts, QueryingAA: *this,
636 Opcodes: {Instruction::AddrSpaceCast},
637 UsedAssumedInformation);
638 }
639
640 // If we found that we need the queue pointer, nothing else to do.
641 if (NeedsQueuePtr)
642 return true;
643
644 if (!IsNonEntryFunc && HasApertureRegs)
645 return false;
646
647 for (BasicBlock &BB : *F) {
648 for (Instruction &I : BB) {
649 for (const Use &U : I.operands()) {
650 if (const auto *C = dyn_cast<Constant>(Val: U)) {
651 if (InfoCache.needsQueuePtr(C, Fn&: *F))
652 return true;
653 }
654 }
655 }
656 }
657
658 return false;
659 }
660
661 bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
662 auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
663 AA::RangeTy Range(Pos, 8);
664 return funcRetrievesImplicitKernelArg(A, Range);
665 }
666
667 bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
668 auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
669 AA::RangeTy Range(Pos, 8);
670 return funcRetrievesImplicitKernelArg(A, Range);
671 }
672
673 bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
674 auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
675 AA::RangeTy Range(Pos, 8);
676 return funcRetrievesImplicitKernelArg(A, Range);
677 }
678
679 bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
680 auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
681 AA::RangeTy Range(Pos, 8);
682 return funcRetrievesImplicitKernelArg(A, Range);
683 }
684
685 bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
686 if (COV < 5)
687 return false;
688 AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
689 return funcRetrievesImplicitKernelArg(A, Range);
690 }
691
692 bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
693 if (COV < 5)
694 return false;
695 AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
696 return funcRetrievesImplicitKernelArg(A, Range);
697 }
698
699 bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr intrinsic whose result is
    // used to retrieve the queried implicit kernel argument (e.g. the hostcall
    // pointer). The implicit argument is considered unused only if every use
    // of the implicitarg_ptr is a load that clearly does not touch any byte in
    // the queried range. We check this by tracing all the uses of the initial
    // call to the implicitarg_ptr intrinsic.
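    // For example, with a byte range starting at the hostcall pointer's offset
    // in the implicit argument buffer, an 8-byte load from (implicitarg_ptr +
    // that offset) overlaps the range and marks the hostcall pointer as
    // needed, while loads of other, disjoint implicit arguments do not.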
706 auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
707 auto &Call = cast<CallBase>(Val&: I);
708 if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
709 return true;
710
711 const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
712 QueryingAA: *this, IRP: IRPosition::callsite_returned(CB: Call), DepClass: DepClassTy::REQUIRED);
713 if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
714 return false;
715
716 return PointerInfoAA->forallInterferingAccesses(
717 Range, CB: [](const AAPointerInfo::Access &Acc, bool IsExact) {
718 return Acc.getRemoteInst()->isDroppable();
719 });
720 };
721
722 bool UsedAssumedInformation = false;
723 return !A.checkForAllCallLikeInstructions(Pred: DoesNotLeadToKernelArgLoc, QueryingAA: *this,
724 UsedAssumedInformation);
725 }
726
727 bool funcRetrievesLDSKernelId(Attributor &A) {
728 auto DoesNotRetrieve = [&](Instruction &I) {
729 auto &Call = cast<CallBase>(Val&: I);
730 return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
731 };
732 bool UsedAssumedInformation = false;
733 return !A.checkForAllCallLikeInstructions(Pred: DoesNotRetrieve, QueryingAA: *this,
734 UsedAssumedInformation);
735 }
736
  // Returns true if FlatScratchInit is needed, i.e., the no-flat-scratch-init
  // attribute is not to be added.
739 bool needFlatScratchInit(Attributor &A) {
740 assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
741
742 // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
743 // there is a cast from PRIVATE_ADDRESS.
744 auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
745 return cast<AddrSpaceCastInst>(Val&: I).getSrcAddressSpace() !=
746 AMDGPUAS::PRIVATE_ADDRESS;
747 };
748
749 bool UsedAssumedInformation = false;
750 if (!A.checkForAllInstructions(Pred: AddrSpaceCastNotFromPrivate, QueryingAA: *this,
751 Opcodes: {Instruction::AddrSpaceCast},
752 UsedAssumedInformation))
753 return true;
754
755 // Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
756 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
757
758 Function *F = getAssociatedFunction();
759 for (Instruction &I : instructions(F)) {
760 for (const Use &U : I.operands()) {
761 if (const auto *C = dyn_cast<Constant>(Val: U)) {
762 if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
763 return true;
764 }
765 }
766 }
767
768 // Finally check callees.
769
    // This is checked for each call-like instruction; returning false means
    // this function needs FlatScratchInit and must not get the
    // no-flat-scratch-init attribute.
772 auto CheckForNoFlatScratchInit = [&](Instruction &I) {
773 const auto &CB = cast<CallBase>(Val&: I);
774 const Function *Callee = CB.getCalledFunction();
775
776 // Callee == 0 for inline asm or indirect call with known callees.
777 // In the latter case, updateImpl() already checked the callees and we
778 // know their FLAT_SCRATCH_INIT bit is set.
779 // If function has indirect call with unknown callees, the bit is
780 // already removed in updateImpl() and execution won't reach here.
781 if (!Callee)
782 return true;
783
784 return Callee->getIntrinsicID() !=
785 Intrinsic::amdgcn_addrspacecast_nonnull;
786 };
787
788 UsedAssumedInformation = false;
    // If the check returns false for any call site (i.e. FlatScratchInit is
    // needed), checkForAllCallLikeInstructions returns false, in which case
    // this function returns true.
792 return !A.checkForAllCallLikeInstructions(Pred: CheckForNoFlatScratchInit, QueryingAA: *this,
793 UsedAssumedInformation);
794 }
795};
796
797AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
798 Attributor &A) {
799 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
800 return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
801 llvm_unreachable("AAAMDAttributes is only valid for function position");
802}
803
804/// Base class to derive different size ranges.
805struct AAAMDSizeRangeAttribute
806 : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
807 using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
808
809 StringRef AttrName;
810
811 AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
812 StringRef AttrName)
813 : Base(IRP, 32), AttrName(AttrName) {}
814
815 /// See AbstractAttribute::trackStatistics()
816 void trackStatistics() const override {}
817
818 template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
819 ChangeStatus Change = ChangeStatus::UNCHANGED;
820
821 auto CheckCallSite = [&](AbstractCallSite CS) {
822 Function *Caller = CS.getInstruction()->getFunction();
823 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
824 << "->" << getAssociatedFunction()->getName() << '\n');
825
826 const auto *CallerInfo = A.getAAFor<AttributeImpl>(
827 *this, IRPosition::function(F: *Caller), DepClassTy::REQUIRED);
828 if (!CallerInfo || !CallerInfo->isValidState())
829 return false;
830
831 Change |=
832 clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
833
834 return true;
835 };
836
837 bool AllCallSitesKnown = true;
838 if (!A.checkForAllCallSites(CheckCallSite, *this,
839 /*RequireAllCallSites=*/true,
840 AllCallSitesKnown))
841 return indicatePessimisticFixpoint();
842
843 return Change;
844 }
845
  /// Clamp the assumed range to the default value ([Min, Max]) and emit the
  /// attribute if it is not the same as the default.
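  /// For example, with \p Default = {1, 1024} and an assumed range of
  /// [128, 257), this emits AttrName="128,256"; an assumed range of [1, 1025)
  /// matches the default and nothing is emitted.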
848 ChangeStatus
849 emitAttributeIfNotDefaultAfterClamp(Attributor &A,
850 std::pair<unsigned, unsigned> Default) {
851 auto [Min, Max] = Default;
852 unsigned Lower = getAssumed().getLower().getZExtValue();
853 unsigned Upper = getAssumed().getUpper().getZExtValue();
854
855 // Clamp the range to the default value.
856 if (Lower < Min)
857 Lower = Min;
858 if (Upper > Max + 1)
859 Upper = Max + 1;
860
    // Don't manifest if the value is invalid or the same as the default after
    // clamping.
862 if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
863 return ChangeStatus::UNCHANGED;
864
865 Function *F = getAssociatedFunction();
866 LLVMContext &Ctx = F->getContext();
867 SmallString<10> Buffer;
868 raw_svector_ostream OS(Buffer);
869 OS << Lower << ',' << Upper - 1;
870 return A.manifestAttrs(IRP: getIRPosition(),
871 DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: AttrName, Val: OS.str())},
872 /*ForceReplace=*/true);
873 }
874
875 const std::string getAsStr(Attributor *) const override {
876 std::string Str;
877 raw_string_ostream OS(Str);
878 OS << getName() << '[';
879 OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
880 OS << ']';
881 return OS.str();
882 }
883};
884
885/// Propagate amdgpu-flat-work-group-size attribute.
886struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
887 AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
888 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}
889
890 void initialize(Attributor &A) override {
891 Function *F = getAssociatedFunction();
892 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
893
894 bool HasAttr = false;
895 auto Range = InfoCache.getDefaultFlatWorkGroupSize(F: *F);
896 auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(F: *F);
897
898 if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(F: *F)) {
      // Only honor the attribute if it is not the maximum range, because the
      // front end unfortunately always emits the attribute, and sometimes it
      // emits the maximum range.
902 if (*Attr != MaxRange) {
903 Range = *Attr;
904 HasAttr = true;
905 }
906 }
907
908 // We don't want to directly clamp the state if it's the max range because
909 // that is basically the worst state.
910 if (Range == MaxRange)
911 return;
912
913 auto [Min, Max] = Range;
914 ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
915 IntegerRangeState IRS(CR);
916 clampStateAndIndicateChange(S&: this->getState(), R: IRS);
917
918 if (HasAttr || AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
919 indicateOptimisticFixpoint();
920 }
921
922 ChangeStatus updateImpl(Attributor &A) override {
923 return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
924 }
925
926 /// Create an abstract attribute view for the position \p IRP.
927 static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
928 Attributor &A);
929
930 ChangeStatus manifest(Attributor &A) override {
931 Function *F = getAssociatedFunction();
932 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
933 return emitAttributeIfNotDefaultAfterClamp(
934 A, Default: InfoCache.getMaximumFlatWorkGroupRange(F: *F));
935 }
936
937 /// See AbstractAttribute::getName()
938 StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }
939
940 /// See AbstractAttribute::getIdAddr()
941 const char *getIdAddr() const override { return &ID; }
942
943 /// This function should return true if the type of the \p AA is
944 /// AAAMDFlatWorkGroupSize
945 static bool classof(const AbstractAttribute *AA) {
946 return (AA->getIdAddr() == &ID);
947 }
948
949 /// Unique ID (due to the unique address)
950 static const char ID;
951};
952
953const char AAAMDFlatWorkGroupSize::ID = 0;
954
955AAAMDFlatWorkGroupSize &
956AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
957 Attributor &A) {
958 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
959 return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
960 llvm_unreachable(
961 "AAAMDFlatWorkGroupSize is only valid for function position");
962}
963
964struct TupleDecIntegerRangeState : public AbstractState {
965 DecIntegerState<uint32_t> X, Y, Z;
966
967 bool isValidState() const override {
968 return X.isValidState() && Y.isValidState() && Z.isValidState();
969 }
970
971 bool isAtFixpoint() const override {
972 return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
973 }
974
975 ChangeStatus indicateOptimisticFixpoint() override {
976 return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
977 Z.indicateOptimisticFixpoint();
978 }
979
980 ChangeStatus indicatePessimisticFixpoint() override {
981 return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
982 Z.indicatePessimisticFixpoint();
983 }
984
985 TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
986 X ^= Other.X;
987 Y ^= Other.Y;
988 Z ^= Other.Z;
989 return *this;
990 }
991
992 bool operator==(const TupleDecIntegerRangeState &Other) const {
993 return X == Other.X && Y == Other.Y && Z == Other.Z;
994 }
995
996 TupleDecIntegerRangeState &getAssumed() { return *this; }
997 const TupleDecIntegerRangeState &getAssumed() const { return *this; }
998};
999
1000using AAAMDMaxNumWorkgroupsState =
1001 StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
1002
1003/// Propagate amdgpu-max-num-workgroups attribute.
1004struct AAAMDMaxNumWorkgroups
1005 : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
1006 using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
1007
1008 AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1009
1010 void initialize(Attributor &A) override {
1011 Function *F = getAssociatedFunction();
1012 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1013
1014 SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(F: *F);
1015
1016 X.takeKnownMinimum(Value: MaxNumWorkgroups[0]);
1017 Y.takeKnownMinimum(Value: MaxNumWorkgroups[1]);
1018 Z.takeKnownMinimum(Value: MaxNumWorkgroups[2]);
1019
1020 if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
1021 indicatePessimisticFixpoint();
1022 }
1023
1024 ChangeStatus updateImpl(Attributor &A) override {
1025 ChangeStatus Change = ChangeStatus::UNCHANGED;
1026
1027 auto CheckCallSite = [&](AbstractCallSite CS) {
1028 Function *Caller = CS.getInstruction()->getFunction();
1029 LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
1030 << "->" << getAssociatedFunction()->getName() << '\n');
1031
1032 const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
1033 QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
1034 if (!CallerInfo || !CallerInfo->isValidState())
1035 return false;
1036
1037 Change |=
1038 clampStateAndIndicateChange(S&: this->getState(), R: CallerInfo->getState());
1039 return true;
1040 };
1041
1042 bool AllCallSitesKnown = true;
1043 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this,
1044 /*RequireAllCallSites=*/true,
1045 UsedAssumedInformation&: AllCallSitesKnown))
1046 return indicatePessimisticFixpoint();
1047
1048 return Change;
1049 }
1050
1051 /// Create an abstract attribute view for the position \p IRP.
1052 static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
1053 Attributor &A);
1054
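  // Emits e.g. "amdgpu-max-num-workgroups"="16,8,1", using the per-dimension
  // bounds assumed after meeting the states of all callers.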
1055 ChangeStatus manifest(Attributor &A) override {
1056 Function *F = getAssociatedFunction();
1057 LLVMContext &Ctx = F->getContext();
1058 SmallString<32> Buffer;
1059 raw_svector_ostream OS(Buffer);
1060 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();
1061
1062 // TODO: Should annotate loads of the group size for this to do anything
1063 // useful.
1064 return A.manifestAttrs(
1065 IRP: getIRPosition(),
1066 DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: "amdgpu-max-num-workgroups", Val: OS.str())},
1067 /* ForceReplace= */ true);
1068 }
1069
1070 StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }
1071
1072 const std::string getAsStr(Attributor *) const override {
1073 std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
1074 raw_string_ostream OS(Buffer);
1075 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
1076 << ']';
1077 return OS.str();
1078 }
1079
1080 const char *getIdAddr() const override { return &ID; }
1081
1082 /// This function should return true if the type of the \p AA is
1083 /// AAAMDMaxNumWorkgroups
1084 static bool classof(const AbstractAttribute *AA) {
1085 return (AA->getIdAddr() == &ID);
1086 }
1087
1088 void trackStatistics() const override {}
1089
1090 /// Unique ID (due to the unique address)
1091 static const char ID;
1092};
1093
1094const char AAAMDMaxNumWorkgroups::ID = 0;
1095
1096AAAMDMaxNumWorkgroups &
1097AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
1098 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1099 return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
1100 llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
1101}
1102
1103/// Propagate amdgpu-waves-per-eu attribute.
1104struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1105 AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
1106 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}
1107
1108 void initialize(Attributor &A) override {
1109 Function *F = getAssociatedFunction();
1110 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1111
1112 // If the attribute exists, we will honor it if it is not the default.
1113 if (auto Attr = InfoCache.getWavesPerEUAttr(F: *F)) {
1114 std::pair<unsigned, unsigned> MaxWavesPerEURange{
1115 1U, InfoCache.getMaxWavesPerEU(F: *F)};
1116 if (*Attr != MaxWavesPerEURange) {
1117 auto [Min, Max] = *Attr;
1118 ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
1119 IntegerRangeState RangeState(Range);
1120 this->getState() = RangeState;
1121 indicateOptimisticFixpoint();
1122 return;
1123 }
1124 }
1125
1126 if (AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()))
1127 indicatePessimisticFixpoint();
1128 }
1129
1130 ChangeStatus updateImpl(Attributor &A) override {
1131 ChangeStatus Change = ChangeStatus::UNCHANGED;
1132
1133 auto CheckCallSite = [&](AbstractCallSite CS) {
1134 Function *Caller = CS.getInstruction()->getFunction();
1135 Function *Func = getAssociatedFunction();
1136 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
1137 << "->" << Func->getName() << '\n');
1138 (void)Func;
1139
1140 const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
1141 QueryingAA: *this, IRP: IRPosition::function(F: *Caller), DepClass: DepClassTy::REQUIRED);
1142 if (!CallerAA || !CallerAA->isValidState())
1143 return false;
1144
1145 ConstantRange Assumed = getAssumed();
1146 unsigned Min = std::max(a: Assumed.getLower().getZExtValue(),
1147 b: CallerAA->getAssumed().getLower().getZExtValue());
1148 unsigned Max = std::max(a: Assumed.getUpper().getZExtValue(),
1149 b: CallerAA->getAssumed().getUpper().getZExtValue());
1150 ConstantRange Range(APInt(32, Min), APInt(32, Max));
1151 IntegerRangeState RangeState(Range);
1152 getState() = RangeState;
1153 Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
1154 : ChangeStatus::CHANGED;
1155
1156 return true;
1157 };
1158
1159 bool AllCallSitesKnown = true;
1160 if (!A.checkForAllCallSites(Pred: CheckCallSite, QueryingAA: *this, RequireAllCallSites: true, UsedAssumedInformation&: AllCallSitesKnown))
1161 return indicatePessimisticFixpoint();
1162
1163 return Change;
1164 }
1165
1166 /// Create an abstract attribute view for the position \p IRP.
1167 static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
1168 Attributor &A);
1169
1170 ChangeStatus manifest(Attributor &A) override {
1171 Function *F = getAssociatedFunction();
1172 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1173 return emitAttributeIfNotDefaultAfterClamp(
1174 A, Default: {1U, InfoCache.getMaxWavesPerEU(F: *F)});
1175 }
1176
1177 /// See AbstractAttribute::getName()
1178 StringRef getName() const override { return "AAAMDWavesPerEU"; }
1179
1180 /// See AbstractAttribute::getIdAddr()
1181 const char *getIdAddr() const override { return &ID; }
1182
1183 /// This function should return true if the type of the \p AA is
1184 /// AAAMDWavesPerEU
1185 static bool classof(const AbstractAttribute *AA) {
1186 return (AA->getIdAddr() == &ID);
1187 }
1188
1189 /// Unique ID (due to the unique address)
1190 static const char ID;
1191};
1192
1193const char AAAMDWavesPerEU::ID = 0;
1194
1195AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
1196 Attributor &A) {
1197 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1198 return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
1199 llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
1200}
1201
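// Returns true if the inline asm references an AGPR, e.g. via an operand
// constraint written as "a" or "{a0}". The check is purely textual, so any
// constraint code beginning with 'a' (after an optional '{') is counted
// conservatively.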
1202static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
1203 for (const auto &CI : IA->ParseConstraints()) {
1204 for (StringRef Code : CI.Codes) {
1205 Code.consume_front(Prefix: "{");
1206 if (Code.starts_with(Prefix: "a"))
1207 return true;
1208 }
1209 }
1210
1211 return false;
1212}
1213
1214// TODO: Migrate to range merge of amdgpu-agpr-alloc.
1215// FIXME: Why is this using Attribute::NoUnwind?
1216struct AAAMDGPUNoAGPR
1217 : public IRAttribute<Attribute::NoUnwind,
1218 StateWrapper<BooleanState, AbstractAttribute>,
1219 AAAMDGPUNoAGPR> {
1220 AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
1221
1222 static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
1223 Attributor &A) {
1224 if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1225 return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
1226 llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
1227 }
1228
1229 void initialize(Attributor &A) override {
1230 Function *F = getAssociatedFunction();
1231 auto [MinNumAGPR, MaxNumAGPR] =
1232 AMDGPU::getIntegerPairAttribute(F: *F, Name: "amdgpu-agpr-alloc", Default: {~0u, ~0u},
1233 /*OnlyFirstRequired=*/true);
1234 if (MinNumAGPR == 0)
1235 indicateOptimisticFixpoint();
1236 }
1237
1238 const std::string getAsStr(Attributor *A) const override {
1239 return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
1240 }
1241
1242 void trackStatistics() const override {}
1243
1244 ChangeStatus updateImpl(Attributor &A) override {
1245 // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
1246
1247 auto CheckForNoAGPRs = [&](Instruction &I) {
1248 const auto &CB = cast<CallBase>(Val&: I);
1249 const Value *CalleeOp = CB.getCalledOperand();
1250 const Function *Callee = dyn_cast<Function>(Val: CalleeOp);
1251 if (!Callee) {
1252 if (const InlineAsm *IA = dyn_cast<InlineAsm>(Val: CalleeOp))
1253 return !inlineAsmUsesAGPRs(IA);
1254 return false;
1255 }
1256
1257 // Some intrinsics may use AGPRs, but if we have a choice, we are not
1258 // required to use AGPRs.
1259 if (Callee->isIntrinsic())
1260 return true;
1261
1262 // TODO: Handle callsite attributes
1263 const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
1264 QueryingAA: *this, IRP: IRPosition::function(F: *Callee), DepClass: DepClassTy::REQUIRED);
1265 return CalleeInfo && CalleeInfo->isValidState() &&
1266 CalleeInfo->getAssumed();
1267 };
1268
1269 bool UsedAssumedInformation = false;
1270 if (!A.checkForAllCallLikeInstructions(Pred: CheckForNoAGPRs, QueryingAA: *this,
1271 UsedAssumedInformation))
1272 return indicatePessimisticFixpoint();
1273 return ChangeStatus::UNCHANGED;
1274 }
1275
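  // If no AGPR use was found, pin the allocation to zero AGPRs by emitting
  // "amdgpu-agpr-alloc"="0" on the function.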
1276 ChangeStatus manifest(Attributor &A) override {
1277 if (!getAssumed())
1278 return ChangeStatus::UNCHANGED;
1279 LLVMContext &Ctx = getAssociatedFunction()->getContext();
1280 return A.manifestAttrs(IRP: getIRPosition(),
1281 DeducedAttrs: {Attribute::get(Context&: Ctx, Kind: "amdgpu-agpr-alloc", Val: "0")});
1282 }
1283
1284 StringRef getName() const override { return "AAAMDGPUNoAGPR"; }
1285 const char *getIdAddr() const override { return &ID; }
1286
  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUNoAGPR
1289 static bool classof(const AbstractAttribute *AA) {
1290 return (AA->getIdAddr() == &ID);
1291 }
1292
1293 static const char ID;
1294};
1295
1296const char AAAMDGPUNoAGPR::ID = 0;
1297
1298/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
1299/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
1300/// Both attributes start with narrow ranges that expand during iteration.
1301/// However, a narrower flat-workgroup-size leads to a wider waves-per-eu range,
1302/// preventing optimal updates later. Therefore, waves-per-eu can't be updated
1303/// with intermediate values during the attributor run. We defer the
1304/// finalization of waves-per-eu until after the flat-workgroup-size is
1305/// finalized.
1306/// TODO: Remove this and move similar logic back into the attributor run once
1307/// we have a better representation for waves-per-eu.
1308static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
1309 bool Changed = false;
1310
1311 LLVMContext &Ctx = M.getContext();
1312
1313 for (Function &F : M) {
1314 if (F.isDeclaration())
1315 continue;
1316
1317 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
1318
1319 std::optional<std::pair<unsigned, std::optional<unsigned>>>
1320 FlatWgrpSizeAttr =
1321 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-flat-work-group-size");
1322
1323 unsigned MinWavesPerEU = ST.getMinWavesPerEU();
1324 unsigned MaxWavesPerEU = ST.getMaxWavesPerEU();
1325
1326 unsigned MinFlatWgrpSize = ST.getMinFlatWorkGroupSize();
1327 unsigned MaxFlatWgrpSize = ST.getMaxFlatWorkGroupSize();
1328 if (FlatWgrpSizeAttr.has_value()) {
1329 MinFlatWgrpSize = FlatWgrpSizeAttr->first;
1330 MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second);
1331 }
1332
1333 // Start with the "best" range.
1334 unsigned Min = MinWavesPerEU;
1335 unsigned Max = MinWavesPerEU;
1336
    // Compute the range from the flat workgroup size. `getWavesPerEU` will
    // also account for the 'amdgpu-waves-per-eu' attribute.
1339 auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
1340 ST.getWavesPerEU(F, FlatWorkGroupSizes: {MinFlatWgrpSize, MaxFlatWgrpSize});
1341
1342 // For the lower bound, we have to "tighten" it.
1343 Min = std::max(a: Min, b: MinFromFlatWgrpSize);
1344 // For the upper bound, we have to "extend" it.
1345 Max = std::max(a: Max, b: MaxFromFlatWgrpSize);
1346
1347 // Clamp the range to the max range.
1348 Min = std::max(a: Min, b: MinWavesPerEU);
1349 Max = std::min(a: Max, b: MaxWavesPerEU);
1350
    // Update the attribute if the range is not the subtarget's full
    // [MinWavesPerEU, MaxWavesPerEU] range.
1352 if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
1353 SmallString<10> Buffer;
1354 raw_svector_ostream OS(Buffer);
1355 OS << Min << ',' << Max;
1356 Attribute OldAttr = F.getFnAttribute(Kind: "amdgpu-waves-per-eu");
1357 Attribute NewAttr = Attribute::get(Context&: Ctx, Kind: "amdgpu-waves-per-eu", Val: OS.str());
1358 F.addFnAttr(Attr: NewAttr);
      Changed |= OldAttr != NewAttr;
1360 }
1361 }
1362
1363 return Changed;
1364}
1365
1366static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1367 AMDGPUAttributorOptions Options,
1368 ThinOrFullLTOPhase LTOPhase) {
1369 SetVector<Function *> Functions;
1370 for (Function &F : M) {
1371 if (!F.isIntrinsic())
1372 Functions.insert(X: &F);
1373 }
1374
1375 CallGraphUpdater CGUpdater;
1376 BumpPtrAllocator Allocator;
1377 AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
1378 DenseSet<const char *> Allowed(
1379 {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
1380 &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
1381 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
1382 &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
1383 &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1384 &AAInstanceInfo::ID});
1385
1386 AttributorConfig AC(CGUpdater);
1387 AC.IsClosedWorldModule = Options.IsClosedWorld;
1388 AC.Allowed = &Allowed;
1389 AC.IsModulePass = true;
1390 AC.DefaultInitializeLiveInternals = false;
1391 AC.IndirectCalleeSpecializationCallback =
1392 [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1393 Function &Callee, unsigned NumAssumedCallees) {
1394 return !AMDGPU::isEntryFunctionCC(CC: Callee.getCallingConv()) &&
1395 (NumAssumedCallees <= IndirectCallSpecializationThreshold);
1396 };
1397 AC.IPOAmendableCB = [](const Function &F) {
1398 return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
1399 };
1400
1401 Attributor A(Functions, InfoCache, AC);
1402
1403 LLVM_DEBUG({
1404 StringRef LTOPhaseStr = to_string(LTOPhase);
1405 dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
1406 << "[AMDGPUAttributor] Module " << M.getName() << " is "
1407 << (AC.IsClosedWorldModule ? "" : "not ")
1408 << "assumed to be a closed world.\n";
1409 });
1410
1411 for (auto *F : Functions) {
1412 A.getOrCreateAAFor<AAAMDAttributes>(IRP: IRPosition::function(F: *F));
1413 A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRP: IRPosition::function(F: *F));
1414 A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRP: IRPosition::function(F: *F));
1415 A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRP: IRPosition::function(F: *F));
1416 CallingConv::ID CC = F->getCallingConv();
1417 if (!AMDGPU::isEntryFunctionCC(CC)) {
1418 A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRP: IRPosition::function(F: *F));
1419 A.getOrCreateAAFor<AAAMDWavesPerEU>(IRP: IRPosition::function(F: *F));
1420 }
1421
1422 for (auto &I : instructions(F)) {
1423 if (auto *LI = dyn_cast<LoadInst>(Val: &I)) {
1424 A.getOrCreateAAFor<AAAddressSpace>(
1425 IRP: IRPosition::value(V: *LI->getPointerOperand()));
1426 } else if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
1427 A.getOrCreateAAFor<AAAddressSpace>(
1428 IRP: IRPosition::value(V: *SI->getPointerOperand()));
1429 } else if (auto *RMW = dyn_cast<AtomicRMWInst>(Val: &I)) {
1430 A.getOrCreateAAFor<AAAddressSpace>(
1431 IRP: IRPosition::value(V: *RMW->getPointerOperand()));
1432 } else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Val: &I)) {
1433 A.getOrCreateAAFor<AAAddressSpace>(
1434 IRP: IRPosition::value(V: *CmpX->getPointerOperand()));
1435 }
1436 }
1437 }
1438
1439 bool Changed = A.run() == ChangeStatus::CHANGED;
1440
1441 Changed |= updateWavesPerEU(M, TM);
1442
1443 return Changed;
1444}
1445} // namespace
1446
1447PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
1448 ModuleAnalysisManager &AM) {
1449
1450 FunctionAnalysisManager &FAM =
1451 AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
1452 AnalysisGetter AG(FAM);
1453
1454 // TODO: Probably preserves CFG
1455 return runImpl(M, AG, TM, Options, LTOPhase) ? PreservedAnalyses::none()
1456 : PreservedAnalyses::all();
1457}
1458