//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
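///
/// For example (illustrative), a kernel carrying the metadata
/// !reqd_work_group_size !{i32 64, i32 1, i32 1} loads workgroup_size_x from
/// the dispatch packet (or from the hidden kernel arguments on code object
/// v5); this pass can replace that load with the constant 64 so that the
/// surrounding get_local_size computation folds away.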
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
enum DispatchPackedOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

// Field offsets to implicit kernel argument pointer.
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  StringRef Name = Intrinsic::getName(IntrinsicId);
  return M.getFunction(Name);
}

} // end anonymous namespace

static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, cast to the appropriate type and
  // loaded.
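  //
  // For illustration only (assuming a pre-v5 dispatch-packet base; the value
  // names are hypothetical), a typical access chain recognized below looks
  // like:
  //
  //   %dp  = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
  //   %gep = getelementptr i8, ptr addrspace(4) %dp, i64 4
  //   %ld  = load i16, ptr addrspace(4) %gep   ; workgroup_size_x
  //
  // i.e. a constant-offset GEP from the base intrinsic call followed by a
  // simple load of the field.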
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4)
          BlockCounts[0] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4)
          BlockCounts[1] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4)
          BlockCounts[2] = Load;
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under v5, __ockl_get_local_size returns the value computed by the
    // expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, and thus
    // hidden_group_size is returned for __ockl_get_local_size.
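    //
    // Illustrative IR for the x dimension (value names are hypothetical):
    //
    //   %iap = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
    //   %id  = call i32 @llvm.amdgcn.workgroup.id.x()
    //   %bc  = load i32, ptr addrspace(4) %iap        ; hidden_block_count_x
    //   %in  = icmp ult i32 %id, %bc
    //   %sz  = select i1 %in, i16 %group_size_x, i16 %remainder_x
    //
    // The icmp's uses are replaced with true and the remainder load's uses
    // with 0, after which later passes can simplify the select down to the
    // group size.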
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        ICmpInst::Predicate Pred;
        if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
          if (Pred != ICmpInst::ICMP_ULT)
            continue;
          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in the
    // library implementation of get_local_size, so the entire function can be
    // constant folded with a known group size.
    //
    // uint r = grid_size - group_id * group_size;
    // get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // the grid_size is required to be a multiple of group_size. In this case:
    //
    // grid_size - (group_id * group_size) < group_size
    //   ->
    // grid_size < group_size + (group_id * group_size)
    //
    // (grid_size / group_size) < 1 + group_id
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
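    //
    // Illustrative IR for one dimension (value names are hypothetical; the
    // matcher also accepts the equivalent icmp/select form of the umin):
    //
    //   %zext = zext i16 %group_size_x to i32
    //   %mul  = mul i32 %group_id_x, %zext
    //   %sub  = sub i32 %grid_size_x, %mul
    //   %r    = call i32 @llvm.umin.i32(i32 %sub, i32 %zext)
    //
    // %r's uses are replaced with %zext (or with the known constant size when
    // reqd_work_group_size is present).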
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // If reqd_work_group_size is set, we can replace work group size with it.
  if (!HasReqdWorkGroupSize)
    return MadeChange;

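  // For example (illustrative), with metadata of the form
  //   !reqd_work_group_size !{i32 64, i32 1, i32 1}
  // each group-size load found above has its uses replaced by the constant 64
  // (for the x dimension) below.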
  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. Seems to not get
// TargetPassConfig for subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }

  return PreservedAnalyses::all();
}