//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
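///
/// A minimal illustrative sketch (not taken from a real test case): given a
/// kernel carrying !reqd_work_group_size !{i32 64, i32 1, i32 1} and a pre-V5
/// load of the workgroup_size_x field of the dispatch packet,
///
///   %dp = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
///   %gep = getelementptr i8, ptr addrspace(4) %dp, i64 4
///   %size.x = load i16, ptr addrspace(4) %gep, align 4
///
/// all uses of %size.x are replaced with the constant i16 64.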
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include <limits>

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
enum DispatchPacketOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

// Field offsets from the implicit kernel argument pointer.
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

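// Return the declaration of the intrinsic the lowered loads are based on: the
// implicit kernel argument pointer for code object V5 and above, the dispatch
// packet pointer otherwise. Returns null if the module does not reference the
// intrinsic.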
Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
}

} // end anonymous namespace

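// Attach !range metadata [1, MaxNumGroups] to a 32-bit load of a hidden
// block-count field, using the bound from "amdgpu-max-num-workgroups". A bound
// of 0 (unset) or UINT32_MAX conveys no information and is ignored.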
static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
                                            uint32_t MaxNumGroups) {
  if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
    return;

  if (!Load->getType()->isIntegerTy(32))
    return;

  // TODO: If there is existing range metadata, preserve it if it is stricter.
  MDBuilder MDB(Load->getContext());
  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
  Load->setMetadata(LLVMContext::MD_range, Range);
}

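// Fold loads reachable from one call to the implicitarg/dispatch pointer
// intrinsic using what the kernel's attributes guarantee: reqd_work_group_size
// constant-folds group-size loads, uniform-work-group-size simplifies the
// partial-workgroup handling in get_local_size, and amdgpu-max-num-workgroups
// adds !range metadata to block-count loads. Returns true if anything changed.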
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto *MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  SmallVector<unsigned> MaxNumWorkgroups =
      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
                                     /*Size=*/3, /*DefaultVal=*/0);

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, casted to the appropriate type and
  // loaded.
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4) {
          BlockCounts[0] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4) {
          BlockCounts[1] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4) {
          BlockCounts[2] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
        }
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under V5, __ockl_get_local_size returns the value computed by the
    // expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, and thus
    // __ockl_get_local_size returns hidden_group_size.
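    //
    // A small illustrative example (not from a real dispatch): with a uniform
    // launch where hidden_block_count.x is 4, workgroup_id_x only takes the
    // values 0..3, so workgroup_id_x < hidden_block_count.x always holds and
    // the select always produces hidden_group_size; hidden_remainder is dead
    // and is folded to 0 below.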
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
                                       m_Specific(BlockCount)))) {
          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(
          Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in
    // the library implementation of get_local_size, so the entire function can
    // be constant folded with a known group size.
    //
    //   uint r = grid_size - group_id * group_size;
    //   get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // the grid_size is required to be a multiple of group_size. In this case:
    //
    //   grid_size - (group_id * group_size) < group_size
    //     ->
    //   grid_size < group_size + (group_id * group_size)
    //     ->
    //   (grid_size / group_size) < 1 + group_id
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
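    //
    // A small illustrative example (not from a real dispatch): grid_size = 256
    // and group_size = 64 give group_id in [0, 3], so
    // r = 256 - 64 * group_id is one of {256, 192, 128, 64} and is never less
    // than 64; the select therefore always yields group_size.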
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin,
                                       m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // If reqd_work_group_size is set, we can replace the work-group size loads
  // with the known constant values.
  if (!HasReqdWorkGroupSize)
    return MadeChange;

  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. We do not seem to get
// TargetPassConfig for the subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }

  return PreservedAnalyses::all();
}