//===-- AMDGPULowerKernelAttributes.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
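// The fields read here (per the HSA system architecture specification) are the
// three uint16_t workgroup_size values at bytes 4/6/8 and the three uint32_t
// grid_size values at bytes 12/16/20.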
enum DispatchPacketOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

// Field offsets relative to the implicit kernel argument pointer.
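// With code object v5 and above, the runtime preloads these values into the
// hidden kernel arguments: the per-dimension workgroup (block) counts, the
// uniform workgroup size, and the size of a trailing partial workgroup, if
// any (see the hidden-argument layout in AMDGPUUsage).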
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

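// Returns the declaration of the base-pointer intrinsic the loads key off of:
// llvm.amdgcn.implicitarg.ptr for code object v5 and above, and
// llvm.amdgcn.dispatch.ptr otherwise. Returns null if the module never
// declares the intrinsic, i.e. the base pointer is unused.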
Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  StringRef Name = Intrinsic::getName(IntrinsicId);
  return M.getFunction(Name);
}

} // end anonymous namespace

static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto *MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, cast to the appropriate type and
  // loaded.
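  //
  // For example (illustrative IR; value names are hypothetical):
  //
  //   %imp = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  //   %gep = getelementptr i8, ptr addrspace(4) %imp, i64 12
  //   %group.size.x = load i16, ptr addrspace(4) %gep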
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4)
          BlockCounts[0] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4)
          BlockCounts[1] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4)
          BlockCounts[2] = Load;
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under v5, __ockl_get_local_size returns the value computed by the
    // expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, and thus
    // hidden_group_size is returned for __ockl_get_local_size.
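    //
    // Concretely, this folds a compare such as the following to true
    // (illustrative IR; value names are hypothetical):
    //
    //   %id.x = call i32 @llvm.amdgcn.workgroup.id.x()
    //   %cmp = icmp ult i32 %id.x, %block.count.x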
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        ICmpInst::Predicate Pred;
        if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
          if (Pred != ICmpInst::ICMP_ULT)
            continue;
          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(
          Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in
    // the library implementation of get_local_size, so the entire function can
    // be constant folded with a known group size.
    //
    //   uint r = grid_size - group_id * group_size;
    //   get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // the grid_size is required to be a multiple of group_size. In this case:
    //
    //   grid_size - (group_id * group_size) < group_size
    //   ->
    //   grid_size < group_size + (group_id * group_size)
    //   ->
    //   (grid_size / group_size) < (1 + group_id)
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
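    //
    // In IR the matched pattern typically looks like (illustrative; value
    // names are hypothetical):
    //
    //   %zext = zext i16 %group.size.x to i32
    //   %mul = mul i32 %group.id.x, %zext
    //   %sub = sub i32 %grid.size.x, %mul
    //   %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %zext)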
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin,
                                       m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // If reqd_work_group_size is set, we can replace the work group size with it.
  if (!HasReqdWorkGroupSize)
    return MadeChange;

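  // For example, !reqd_work_group_size !{i32 64, i32 1, i32 1} folds the
  // group-size loads to the constants 64, 1, and 1 (illustrative values).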
  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. We don't seem to get
// TargetPassConfig for the subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }

  return PreservedAnalyses::all();
}