//===-- AMDGPULowerKernelAttributes.cpp------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
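///
/// For example (an illustrative case, not the only supported pattern), a
/// kernel carrying !reqd_work_group_size !{i32 64, i32 1, i32 1} can have its
/// work-group-size loads folded to the constants 64, 1 and 1.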
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
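// The workgroup_size_{x,y,z} fields are 16-bit and the grid_size_{x,y,z}
// fields are 32-bit; the LoadSize checks in processUse rely on these widths.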
enum DispatchPackedOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

// Field offsets to implicit kernel argument pointer.
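// The hidden block counts are 32-bit fields; the hidden group sizes and
// remainders are 16-bit fields, as the 2-byte spacing of their offsets shows.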
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override { return "AMDGPU Kernel Attributes"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
}

} // end anonymous namespace

static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
                                            uint32_t MaxNumGroups) {
  if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
    return;

  if (!Load->getType()->isIntegerTy(32))
    return;

  // TODO: If there is existing range metadata, preserve it if it is stricter.
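  // The attached metadata describes the half-open interval
  // [1, MaxNumGroups + 1), e.g. (illustrative IR) !range !{i32 1, i32 17}
  // for MaxNumGroups == 16.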
  MDBuilder MDB(Load->getContext());
  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
  Load->setMetadata(LLVMContext::MD_range, Range);
}

static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getFunction();

  auto *MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  SmallVector<unsigned> MaxNumWorkgroups =
      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
                                     /*Size=*/3, /*DefaultVal=*/0);

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
      !Intrinsic::getDeclarationIfExists(CI->getModule(),
                                         Intrinsic::amdgcn_dispatch_ptr) &&
      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, cast to the appropriate type and
  // loaded.
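  // A typical chain for the implicit-argument case looks like this
  // (illustrative IR; the GEP may also be folded away, or a bitcast may
  // appear on older IR):
  //   %gep  = getelementptr i8, ptr %implicitarg.ptr, i64 12
  //   %size = load i16, ptr %gep    ; HIDDEN_GROUP_SIZE_X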
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4) {
          BlockCounts[0] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4) {
          BlockCounts[1] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4) {
          BlockCounts[2] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
        }
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under v5 __ockl_get_local_size returns the value computed by the
    // expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size :
    //   hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, and thus
    // hidden_group_size is returned for __ockl_get_local_size.
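    // Concretely (illustrative IR), a compare such as
    //   %cmp = icmp ult i32 %workgroup.id.x, %hidden.block.count.x
    // is replaced by 'true', so the select picks the hidden group size.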
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
                                       m_Specific(BlockCount)))) {
          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(
          Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in
    // the library implementation of get_local_size, so the entire function can
    // be constant folded with a known group size.
    //
    //   uint r = grid_size - group_id * group_size;
    //   get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // the grid_size is required to be a multiple of group_size. In this case:
    //
    //   grid_size - (group_id * group_size) < group_size
    //   ->
    //   grid_size < group_size + (group_id * group_size)
    //
    //   (grid_size / group_size) < 1 + group_id
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
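    //
    // The matched IR is, schematically (illustrative value names):
    //   %r   = sub %grid.size, (mul %group.id, %zext.group.size)
    //   %lsz = umin %r, %zext.group.size
    // which folds to the (zero-extended) group size, or to the constant from
    // reqd_work_group_size when that metadata is present.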
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize),
                                       m_Mul(GroupIDIntrin,
                                             m_Specific(ZextGroupSize))),
                                 m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // Upgrade the old method of calculating the block size using the grid size.
  // We pattern match any case where the implicit-argument group size is the
  // divisor of a dispatch packet grid-size read of the same dimension.
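  // Schematically (illustrative names), a computation such as
  //   %n = udiv i32 %grid.size.x, %zext.group.size.x
  // is rewritten to a zero-extended load of the hidden block count instead.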
  if (IsV5OrAbove) {
    for (int I = 0; I < 3; I++) {
      Value *GroupSize = GroupSizes[I];
      if (!GroupSize || !GroupSize->getType()->isIntegerTy(16))
        continue;

      for (User *U : GroupSize->users()) {
        Instruction *Inst = cast<Instruction>(U);
        if (isa<ZExtInst>(Inst) && !Inst->use_empty())
          Inst = cast<Instruction>(*Inst->user_begin());

        using namespace llvm::PatternMatch;
        if (!match(
                Inst,
                m_UDiv(m_ZExtOrSelf(m_Load(m_GEP(
                           m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
                           m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))),
                       m_Value())))
          continue;

        IRBuilder<> Builder(Inst);

        Value *GEP = Builder.CreateInBoundsGEP(
            Builder.getInt8Ty(), CI,
            {ConstantInt::get(Type::getInt64Ty(CI->getContext()),
                              HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))});
        Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
        BlockCount->setMetadata(LLVMContext::MD_invariant_load,
                                MDNode::get(CI->getContext(), {}));
        BlockCount->setMetadata(LLVMContext::MD_noundef,
                                MDNode::get(CI->getContext(), {}));

        Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType());
        Inst->replaceAllUsesWith(BlockCountExt);
        Inst->eraseFromParent();
        MadeChange = true;
      }
    }
  }

  // If reqd_work_group_size is set, we can replace work group size with it.
  if (!HasReqdWorkGroupSize)
    return MadeChange;

  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. We don't seem to get a
// TargetPassConfig for the subtarget at this point.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  bool Changed = false;
  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        Changed |= processUse(CI, IsV5OrAbove);
    }
  }

  return !Changed ? PreservedAnalyses::all()
                  : PreservedAnalyses::none().preserveSet<CFGAnalyses>();
}