//===-- AMDGPULowerKernelAttributes.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
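///
/// For example, a kernel declared with
/// __attribute__((reqd_work_group_size(64, 1, 1))) is guaranteed to launch
/// with 64x1x1 workgroups, so a get_local_size(0) call inside it can be
/// folded to the constant 64 instead of loading the size at runtime.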
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
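// These correspond to the uint16_t workgroup_size_{x,y,z} fields and the
// uint32_t grid_size_{x,y,z} fields of the packet as defined in the HSA
// runtime's hsa.h.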
enum DispatchPacketOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

// Field offsets relative to the implicit kernel argument pointer.
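// These match the code object v5 hidden kernel arguments
// (hidden_block_count_*, hidden_group_size_*, hidden_remainder_*) appended
// after the explicit kernel arguments.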
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

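// Return the declaration of the base pointer intrinsic if it is used in M:
// llvm.amdgcn.implicitarg.ptr for code object v5 and above,
// llvm.amdgcn.dispatch.ptr otherwise.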
Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
}

} // end anonymous namespace

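// Annotate a 32-bit block count load with !range metadata derived from the
// amdgpu-max-num-workgroups attribute. The upper bound is exclusive, e.g.
// MaxNumGroups == 16 yields !range !{i32 1, i32 17}.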
static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
                                            uint32_t MaxNumGroups) {
  if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
    return;

  if (!Load->getType()->isIntegerTy(32))
    return;

  // TODO: If there is existing range metadata, preserve it if it is stricter.
  MDBuilder MDB(Load->getContext());
  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
  Load->setMetadata(LLVMContext::MD_range, Range);
}

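// Fold uses of a single call to the base pointer intrinsic using
// reqd_work_group_size metadata, the uniform-work-group-size attribute, and
// the amdgpu-max-num-workgroups attribute. Returns true if anything changed.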
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto *MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  SmallVector<unsigned> MaxNumWorkgroups =
      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
                                     /*Size=*/3, /*DefaultVal=*/0);

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, cast to the appropriate type and
  // loaded.
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4) {
          BlockCounts[0] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4) {
          BlockCounts[1] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4) {
          BlockCounts[2] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
        }
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under v5, __ockl_get_local_size returns the value computed by the
    // expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, so
    // __ockl_get_local_size returns hidden_group_size.
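    //
    // For example, a 128-item dispatch with a group size of 32 has
    // hidden_block_count = 4 and hidden_remainder = 0; every workgroup id
    // (0..3) is less than 4, so the compare folds to true and the remainder
    // loads fold to 0.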
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

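      // Look for compares of the form (value names illustrative):
      //   %cmp = icmp ult i32 %workgroup.id, %block.count
      // which fold to true under uniform-work-group-size.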
      for (User *ICmp : BlockCount->users()) {
        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
                                       m_Specific(BlockCount)))) {
          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in
    // the library implementation of get_local_size, so the entire function can
    // be constant folded with a known group size.
    //
    //   uint r = grid_size - group_id * group_size;
    //   get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // grid_size is required to be a multiple of group_size. In this case the
    // select condition
    //
    //   grid_size - (group_id * group_size) < group_size
    //
    // is equivalent to
    //
    //   grid_size < group_size + (group_id * group_size)
    //
    // Dividing both sides by group_size (exact, since grid_size is a multiple
    // of it):
    //
    //   (grid_size / group_size) < 1 + group_id
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
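    //
    // Concretely: grid_size = 256, group_size = 64 gives, for group_id = 1,
    // r = 256 - 64 = 192, and 192 < 64 is false, so the select yields
    // group_size as expected.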
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

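        // Match the partial-workgroup computation above (value names
        // illustrative):
        //   %r = sub i32 %grid.size, (mul i32 %workgroup.id, %zext.group.size)
        //   %umin = umin %r, %zext.group.size
        // where the umin may appear as a select/icmp pair or as the llvm.umin
        // intrinsic.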
        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // If reqd_work_group_size is set, we can replace the work group size loads
  // with it.
  if (!HasReqdWorkGroupSize)
    return MadeChange;

  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. We don't seem to get
// TargetPassConfig for the subtarget here.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }

  return PreservedAnalyses::all();
}