1//===-- AMDGPULowerKernelAttributes.cpp------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "Utils/AMDGPUBaseInfo.h"
17#include "llvm/Analysis/ConstantFolding.h"
18#include "llvm/Analysis/ValueTracking.h"
19#include "llvm/CodeGen/Passes.h"
20#include "llvm/IR/Constants.h"
21#include "llvm/IR/Function.h"
22#include "llvm/IR/IRBuilder.h"
23#include "llvm/IR/InstIterator.h"
24#include "llvm/IR/Instructions.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
26#include "llvm/IR/MDBuilder.h"
27#include "llvm/IR/PatternMatch.h"
28#include "llvm/Pass.h"
29
30#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"
31
32using namespace llvm;
33
34namespace {
35
36// Field offsets in hsa_kernel_dispatch_packet_t.
// Field offsets in hsa_kernel_dispatch_packet_t.
enum DispatchPackedOffsets {
  WORKGROUP_SIZE_X = 4, // Loaded as a 2-byte value below.
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12, // Loaded as a 4-byte value below.
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};
46
47// Field offsets to implicit kernel argument pointer.
// Field offsets to implicit kernel argument pointer (code object V5+ hidden
// kernel arguments).
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0, // Loaded as a 4-byte value below.
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12, // Loaded as a 2-byte value below.
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18, // Loaded as a 2-byte value below.
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,

  // Number of grid dimensions in use (expected to be in [1, 3]).
  GRID_DIMS = 64
};
63
// Legacy pass-manager wrapper; the actual folding/annotation work is done by
// processUse(), invoked from runOnModule().
class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID; // Pass identification.

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override { return "AMDGPU Kernel Attributes"; }

  // The pass only folds values / attaches metadata in place; it declares all
  // analyses preserved.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};
78
79Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
80 auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
81 : Intrinsic::amdgcn_dispatch_ptr;
82 return Intrinsic::getDeclarationIfExists(M: &M, id: IntrinsicId);
83}
84
85} // end anonymous namespace
86
87static bool annotateGridSizeLoadWithRangeMD(LoadInst *Load,
88 uint32_t MaxNumGroups) {
89 if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
90 return false;
91
92 if (!Load->getType()->isIntegerTy(Bitwidth: 32))
93 return false;
94
95 // TODO: If there is existing range metadata, preserve it if it is stricter.
96 if (Load->hasMetadata(KindID: LLVMContext::MD_range))
97 return false;
98
99 MDBuilder MDB(Load->getContext());
100 MDNode *Range = MDB.createRange(Lo: APInt(32, 1), Hi: APInt(32, MaxNumGroups + 1));
101 Load->setMetadata(KindID: LLVMContext::MD_range, Node: Range);
102 return true;
103}
104
105static bool annotateGroupSizeLoadWithRangeMD(LoadInst *Load, bool IsRemainder) {
106 if (!Load->getType()->isIntegerTy(Bitwidth: 16))
107 return false;
108
109 // TODO: If there is existing range metadata, preserve it if it is stricter.
110 if (Load->hasMetadata(KindID: LLVMContext::MD_range))
111 return false;
112
113 MDBuilder MDB(Load->getContext());
114 MDNode *Range = MDB.createRange(
115 Lo: APInt(16, !IsRemainder),
116 Hi: APInt(16, AMDGPU::IsaInfo::getMaxFlatWorkGroupSize() + 1 - IsRemainder));
117 Load->setMetadata(KindID: LLVMContext::MD_range, Node: Range);
118 return true;
119}
120
121static bool annotateGridDimsLoadWithRangeMD(LoadInst *Load,
122 unsigned KnownNumGridDims) {
123 IntegerType *Ty = dyn_cast<IntegerType>(Val: Load->getType());
124 if (!Ty || Ty->getBitWidth() < 3)
125 return false;
126
127 if (KnownNumGridDims != 0) {
128 Load->replaceAllUsesWith(
129 V: ConstantInt::get(Ty: Load->getType(), V: KnownNumGridDims));
130 return true;
131 }
132
133 // TODO: If there is existing range metadata, preserve it if it is stricter.
134 if (Load->hasMetadata(KindID: LLVMContext::MD_range))
135 return false;
136
137 MDBuilder MDB(Load->getContext());
138 MDNode *Range =
139 MDB.createRange(Lo: APInt(Ty->getBitWidth(), 1), Hi: APInt(Ty->getBitWidth(), 4));
140 Load->setMetadata(KindID: LLVMContext::MD_range, Node: Range);
141 return true;
142}
143
144/// Compute the number of grid dimensions based on !reqd_work_group_size
145/// metadata
146static unsigned computeNumGridDims(const MDNode *ReqdWorkGroupSize) {
147 ConstantInt *KnownZ =
148 mdconst::extract<ConstantInt>(MD: ReqdWorkGroupSize->getOperand(I: 2));
149 if (KnownZ->getZExtValue() != 1)
150 return 3;
151
152 ConstantInt *KnownY =
153 mdconst::extract<ConstantInt>(MD: ReqdWorkGroupSize->getOperand(I: 1));
154 if (KnownY->getZExtValue() != 1)
155 return 2;
156
157 return 1;
158}
159
160static bool processUse(CallInst *CI, bool IsV5OrAbove) {
161 Function *F = CI->getFunction();
162
163 auto *MD = F->getMetadata(Kind: "reqd_work_group_size");
164 const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
165
166 const bool HasUniformWorkGroupSize =
167 F->hasFnAttribute(Kind: "uniform-work-group-size");
168
169 SmallVector<unsigned> MaxNumWorkgroups =
170 AMDGPU::getIntegerVecAttribute(F: *F, Name: "amdgpu-max-num-workgroups",
171 /*Size=*/3, /*DefaultVal=*/0);
172
173 Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
174 Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
175 Value *Remainders[3] = {nullptr, nullptr, nullptr};
176 Value *GridSizes[3] = {nullptr, nullptr, nullptr};
177
178 const DataLayout &DL = F->getDataLayout();
179 bool MadeChange = false;
180
181 unsigned KnownNumGridDims = HasReqdWorkGroupSize ? computeNumGridDims(ReqdWorkGroupSize: MD) : 0;
182
183 // We expect to see several GEP users, casted to the appropriate type and
184 // loaded.
185 for (User *U : CI->users()) {
186 if (!U->hasOneUse())
187 continue;
188
189 int64_t Offset = 0;
190 auto *Load = dyn_cast<LoadInst>(Val: U); // Load from ImplicitArgPtr/DispatchPtr?
191 auto *BCI = dyn_cast<BitCastInst>(Val: U);
192 if (!Load && !BCI) {
193 if (GetPointerBaseWithConstantOffset(Ptr: U, Offset, DL) != CI)
194 continue;
195 Load = dyn_cast<LoadInst>(Val: *U->user_begin()); // Load from GEP?
196 BCI = dyn_cast<BitCastInst>(Val: *U->user_begin());
197 }
198
199 if (BCI) {
200 if (!BCI->hasOneUse())
201 continue;
202 Load = dyn_cast<LoadInst>(Val: *BCI->user_begin()); // Load from BCI?
203 }
204
205 if (!Load || !Load->isSimple())
206 continue;
207
208 unsigned LoadSize = DL.getTypeStoreSize(Ty: Load->getType());
209
210 // TODO: Handle merged loads.
211 if (IsV5OrAbove) { // Base is ImplicitArgPtr.
212 switch (Offset) {
213 case HIDDEN_BLOCK_COUNT_X:
214 if (LoadSize == 4) {
215 BlockCounts[0] = Load;
216 MadeChange |=
217 annotateGridSizeLoadWithRangeMD(Load, MaxNumGroups: MaxNumWorkgroups[0]);
218 }
219 break;
220 case HIDDEN_BLOCK_COUNT_Y:
221 if (LoadSize == 4) {
222 BlockCounts[1] = Load;
223 MadeChange |=
224 annotateGridSizeLoadWithRangeMD(Load, MaxNumGroups: MaxNumWorkgroups[1]);
225 }
226 break;
227 case HIDDEN_BLOCK_COUNT_Z:
228 if (LoadSize == 4) {
229 BlockCounts[2] = Load;
230 MadeChange |=
231 annotateGridSizeLoadWithRangeMD(Load, MaxNumGroups: MaxNumWorkgroups[2]);
232 }
233 break;
234 case HIDDEN_GROUP_SIZE_X:
235 if (LoadSize == 2) {
236 GroupSizes[0] = Load;
237 MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, IsRemainder: false);
238 }
239 break;
240 case HIDDEN_GROUP_SIZE_Y:
241 if (LoadSize == 2) {
242 GroupSizes[1] = Load;
243 MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, IsRemainder: false);
244 }
245 break;
246 case HIDDEN_GROUP_SIZE_Z:
247 if (LoadSize == 2) {
248 GroupSizes[2] = Load;
249 MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, IsRemainder: false);
250 }
251 break;
252 case HIDDEN_REMAINDER_X:
253 if (LoadSize == 2) {
254 Remainders[0] = Load;
255 MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, IsRemainder: true);
256 }
257 break;
258 case HIDDEN_REMAINDER_Y:
259 if (LoadSize == 2) {
260 Remainders[1] = Load;
261 MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, IsRemainder: true);
262 }
263 break;
264 case HIDDEN_REMAINDER_Z:
265 if (LoadSize == 2) {
266 Remainders[2] = Load;
267 MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, IsRemainder: true);
268 }
269 break;
270
271 case GRID_DIMS:
272 if (LoadSize <= 2)
273 MadeChange |= annotateGridDimsLoadWithRangeMD(Load, KnownNumGridDims);
274 break;
275 default:
276 break;
277 }
278 } else { // Base is DispatchPtr.
279 switch (Offset) {
280 case WORKGROUP_SIZE_X:
281 if (LoadSize == 2)
282 GroupSizes[0] = Load;
283 break;
284 case WORKGROUP_SIZE_Y:
285 if (LoadSize == 2)
286 GroupSizes[1] = Load;
287 break;
288 case WORKGROUP_SIZE_Z:
289 if (LoadSize == 2)
290 GroupSizes[2] = Load;
291 break;
292 case GRID_SIZE_X:
293 if (LoadSize == 4)
294 GridSizes[0] = Load;
295 break;
296 case GRID_SIZE_Y:
297 if (LoadSize == 4)
298 GridSizes[1] = Load;
299 break;
300 case GRID_SIZE_Z:
301 if (LoadSize == 4)
302 GridSizes[2] = Load;
303 break;
304 default:
305 break;
306 }
307 }
308 }
309
310 if (IsV5OrAbove && HasUniformWorkGroupSize) {
311 // Under v5 __ockl_get_local_size returns the value computed by the
312 // expression:
313 //
314 // workgroup_id < hidden_block_count ? hidden_group_size :
315 // hidden_remainder
316 //
317 // For functions with the attribute uniform-work-group-size=true. we can
318 // evaluate workgroup_id < hidden_block_count as true, and thus
319 // hidden_group_size is returned for __ockl_get_local_size.
320 for (int I = 0; I < 3; ++I) {
321 Value *BlockCount = BlockCounts[I];
322 if (!BlockCount)
323 continue;
324
325 using namespace llvm::PatternMatch;
326 auto GroupIDIntrin =
327 I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
328 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
329 : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
330
331 for (User *ICmp : BlockCount->users()) {
332 if (match(V: ICmp, P: m_SpecificICmp(MatchPred: ICmpInst::ICMP_ULT, L: GroupIDIntrin,
333 R: m_Specific(V: BlockCount)))) {
334 ICmp->replaceAllUsesWith(V: llvm::ConstantInt::getTrue(Ty: ICmp->getType()));
335 MadeChange = true;
336 }
337 }
338 }
339
340 // All remainders should be 0 with uniform work group size.
341 for (Value *Remainder : Remainders) {
342 if (!Remainder)
343 continue;
344 Remainder->replaceAllUsesWith(
345 V: Constant::getNullValue(Ty: Remainder->getType()));
346 MadeChange = true;
347 }
348 } else if (HasUniformWorkGroupSize) { // Pre-V5.
349 // Pattern match the code used to handle partial workgroup dispatches in the
350 // library implementation of get_local_size, so the entire function can be
351 // constant folded with a known group size.
352 //
353 // uint r = grid_size - group_id * group_size;
354 // get_local_size = (r < group_size) ? r : group_size;
355 //
356 // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
357 // the grid_size is required to be a multiple of group_size). In this case:
358 //
359 // grid_size - (group_id * group_size) < group_size
360 // ->
361 // grid_size < group_size + (group_id * group_size)
362 //
363 // (grid_size / group_size) < 1 + group_id
364 //
365 // grid_size / group_size is at least 1, so we can conclude the select
366 // condition is false (except for group_id == 0, where the select result is
367 // the same).
368 for (int I = 0; I < 3; ++I) {
369 Value *GroupSize = GroupSizes[I];
370 Value *GridSize = GridSizes[I];
371 if (!GroupSize || !GridSize)
372 continue;
373
374 using namespace llvm::PatternMatch;
375 auto GroupIDIntrin =
376 I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
377 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
378 : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
379
380 for (User *U : GroupSize->users()) {
381 auto *ZextGroupSize = dyn_cast<ZExtInst>(Val: U);
382 if (!ZextGroupSize)
383 continue;
384
385 for (User *UMin : ZextGroupSize->users()) {
386 if (match(V: UMin, P: m_UMin(L: m_Sub(L: m_Specific(V: GridSize),
387 R: m_Mul(L: GroupIDIntrin,
388 R: m_Specific(V: ZextGroupSize))),
389 R: m_Specific(V: ZextGroupSize)))) {
390 if (HasReqdWorkGroupSize) {
391 ConstantInt *KnownSize =
392 mdconst::extract<ConstantInt>(MD: MD->getOperand(I));
393 UMin->replaceAllUsesWith(V: ConstantFoldIntegerCast(
394 C: KnownSize, DestTy: UMin->getType(), IsSigned: false, DL));
395 } else {
396 UMin->replaceAllUsesWith(V: ZextGroupSize);
397 }
398
399 MadeChange = true;
400 }
401 }
402 }
403 }
404 }
405
406 // Upgrade the old method of calculating the block size using the grid size.
407 // We pattern match any case where the implicit argument group size is the
408 // divisor to a dispatch packet grid size read of the same dimension.
409 if (IsV5OrAbove) {
410 for (int I = 0; I < 3; I++) {
411 Value *GroupSize = GroupSizes[I];
412 if (!GroupSize || !GroupSize->getType()->isIntegerTy(Bitwidth: 16))
413 continue;
414
415 for (User *U : GroupSize->users()) {
416 Instruction *Inst = cast<Instruction>(Val: U);
417 if (isa<ZExtInst>(Val: Inst) && !Inst->use_empty())
418 Inst = cast<Instruction>(Val: *Inst->user_begin());
419
420 using namespace llvm::PatternMatch;
421 if (!match(
422 V: Inst,
423 P: m_UDiv(L: m_ZExtOrSelf(Op: m_Load(Op: m_GEP(
424 Ops: m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
425 Ops: m_SpecificInt(V: GRID_SIZE_X + I * sizeof(uint32_t))))),
426 R: m_Value())))
427 continue;
428
429 IRBuilder<> Builder(Inst);
430
431 Value *GEP = Builder.CreateInBoundsGEP(
432 Ty: Builder.getInt8Ty(), Ptr: CI,
433 IdxList: {ConstantInt::get(Ty: Type::getInt64Ty(C&: CI->getContext()),
434 V: HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))});
435 Instruction *BlockCount = Builder.CreateLoad(Ty: Builder.getInt32Ty(), Ptr: GEP);
436 BlockCount->setMetadata(KindID: LLVMContext::MD_invariant_load,
437 Node: MDNode::get(Context&: CI->getContext(), MDs: {}));
438 BlockCount->setMetadata(KindID: LLVMContext::MD_noundef,
439 Node: MDNode::get(Context&: CI->getContext(), MDs: {}));
440
441 Value *BlockCountExt = Builder.CreateZExt(V: BlockCount, DestTy: Inst->getType());
442 Inst->replaceAllUsesWith(V: BlockCountExt);
443 Inst->eraseFromParent();
444 MadeChange = true;
445 }
446 }
447 }
448
449 // If reqd_work_group_size is set, we can replace work group size with it.
450 if (!HasReqdWorkGroupSize)
451 return MadeChange;
452
453 for (int I = 0; I < 3; I++) {
454 Value *GroupSize = GroupSizes[I];
455 if (!GroupSize)
456 continue;
457
458 ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD: MD->getOperand(I));
459 GroupSize->replaceAllUsesWith(
460 V: ConstantFoldIntegerCast(C: KnownSize, DestTy: GroupSize->getType(), IsSigned: false, DL));
461 MadeChange = true;
462 }
463
464 return MadeChange;
465}
466
467// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
468// TargetPassConfig for subtarget.
469bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
470 bool MadeChange = false;
471 bool IsV5OrAbove =
472 AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
473 Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
474
475 if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
476 return false;
477
478 SmallPtrSet<Instruction *, 4> HandledUses;
479 for (auto *U : BasePtr->users()) {
480 CallInst *CI = cast<CallInst>(Val: U);
481 if (HandledUses.insert(Ptr: CI).second) {
482 if (processUse(CI, IsV5OrAbove))
483 MadeChange = true;
484 }
485 }
486
487 return MadeChange;
488}
489
490INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
491 "AMDGPU Kernel Attributes", false, false)
492INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
493 "AMDGPU Kernel Attributes", false, false)
494
495char AMDGPULowerKernelAttributes::ID = 0;
496
// Factory for the legacy pass-manager pipeline.
ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}
500
501PreservedAnalyses
502AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
503 bool IsV5OrAbove =
504 AMDGPU::getAMDHSACodeObjectVersion(M: *F.getParent()) >= AMDGPU::AMDHSA_COV5;
505 Function *BasePtr = getBasePtrIntrinsic(M&: *F.getParent(), IsV5OrAbove);
506
507 if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
508 return PreservedAnalyses::all();
509
510 bool Changed = false;
511 for (Instruction &I : instructions(F)) {
512 if (CallInst *CI = dyn_cast<CallInst>(Val: &I)) {
513 if (CI->getCalledFunction() == BasePtr)
514 Changed |= processUse(CI, IsV5OrAbove);
515 }
516 }
517
518 return !Changed ? PreservedAnalyses::all()
519 : PreservedAnalyses::none().preserveSet<CFGAnalyses>();
520}
521