//===- AMDGPUPreloadKernelArguments.cpp - Preload Kernel Arguments --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass preloads kernel arguments into user_data SGPRs before kernel
/// execution begins. The number of registers available for preloading depends
/// on the number of free user SGPRs, up to the hardware's maximum limit.
/// Implicit arguments enabled in the kernel descriptor are allocated first,
/// followed by SGPRs used for preloaded kernel arguments. (Reference:
/// https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state)
/// Additionally, hidden kernel arguments may be preloaded, in which case they
/// are appended to the kernel signature after explicit arguments. Preloaded
/// arguments will be marked with `inreg`.
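///
/// For example (an illustrative sketch, not output from an actual run), a
/// kernel such as
///   define amdgpu_kernel void @k(i32 %x)
/// could, after preloading %x and the first hidden block count, become
///   define amdgpu_kernel void @k(i32 inreg %x,
///       i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x)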
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"

#define DEBUG_TYPE "amdgpu-preload-kernel-arguments"

using namespace llvm;

static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));

namespace {

class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
  const GCNTargetMachine *TM;

public:
  static char ID;
  explicit AMDGPUPreloadKernelArgumentsLegacy(
      const GCNTargetMachine *TM = nullptr);

  StringRef getPassName() const override {
    return "AMDGPU Preload Kernel Arguments";
  }

  bool runOnModule(Module &M) override;
};

class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  unsigned NumFreeUserSGPRs;

  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X,
    HIDDEN_BLOCK_COUNT_Y,
    HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X,
    HIDDEN_GROUP_SIZE_Y,
    HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X,
    HIDDEN_REMAINDER_Y,
    HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS
  };

  // Stores information about a specific hidden argument.
  struct HiddenArgInfo {
    // Offset in bytes from the location in the kernarg segment pointed to by
    // the implicitarg pointer.
    uint8_t Offset;
    // The size of the hidden argument in bytes.
    uint8_t Size;
    // The name of the hidden argument in the kernel signature.
    const char *Name;
  };

  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};

  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }

  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

    llvm_unreachable("Unexpected hidden argument.");
  }

  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return HiddenArgs[HA].Name;

    llvm_unreachable("Unexpected hidden argument.");
  }

  // Clones the function after adding implicit arguments to the argument list
  // and returns the new updated function. Preloaded implicit arguments are
  // added up to and including the last one that will be preloaded, indicated
  // by LastPreloadIndex. Preloading currently only covers a contiguous run of
  // data from the start of the kernarg segment, including implicit (hidden)
  // arguments, so every argument up to the last preloaded one is preloaded
  // even if its data is unused.
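  //
  // For example (an illustrative sketch, not output from an actual run),
  // preloading through HIDDEN_GROUP_SIZE_X would turn
  //   void @k(i32 %a)
  // into
  //   void @k(i32 %a, i32 inreg %_hidden_block_count_x,
  //           i32 inreg %_hidden_block_count_y,
  //           i32 inreg %_hidden_block_count_z,
  //           i16 inreg %_hidden_group_size_x)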
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getParent()->getContext();
    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFT =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF =
        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);

    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    Function::arg_iterator NFArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(&*NFArg);
      NFArg->takeName(&Arg);
      ++NFArg;
    }

    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }

    NF->setAttributes(AL);
    F.replaceAllUsesWith(NF);

    return NF;
  }

public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Set the number of user SGPRs that are free to preload kernel arguments.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }

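  // Returns true if the kernarg segment data up to byte offset
  // ExplicitArgOffset fits in the free user SGPRs. Each user SGPR is 32 bits
  // wide, so it holds four bytes of kernarg data.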
  bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
    return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
  }

  // Try to allocate SGPRs to preload hidden kernel arguments.
  void
  tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                SmallVectorImpl<Function *> &FunctionsToErase) {
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Pair is the load and the load offset.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
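    // Collect simple loads off of the implicitarg pointer, either directly or
    // through a constant-offset GEP, e.g. (illustrative IR only):
    //   %imp = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
    //   %gep = getelementptr i8, ptr addrspace(4) %imp, i64 12
    //   %gsx = load i16, ptr addrspace(4) %gep ; offset 12: _hidden_group_size_x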
    for (auto *U : ImplicitArgPtr->users()) {
      Instruction *CI = dyn_cast<Instruction>(U);
      if (!CI || CI->getParent()->getParent() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
        if (!Load) {
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
        }

        if (!Load || !Load->isSimple())
          continue;

        // FIXME: Expand to handle merged loads.
        LLVMContext &Ctx = F.getParent()->getContext();
        Type *LoadTy = Load->getType();
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;

    // Sort the loads by offset so that we check, in kernarg-segment order,
    // whether each implicit argument can actually be preloaded.
    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

    // If we fail to preload any implicit argument we know we don't have SGPRs
    // to preload any subsequent ones with larger offsets. Find the first
    // argument that we cannot preload.
    auto *PreloadEnd = llvm::find_if(
        ImplicitArgLoads, [&](const std::pair<LoadInst *, unsigned> &Load) {
          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
          unsigned LoadOffset = Load.second;
          if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
                                         ImplicitArgsBaseOffset))
            return true;

          return false;
        });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;

    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    assert(NF);
    FunctionsToErase.push_back(&F);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      LoadInst *LoadInst = I->first;
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
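      // Hidden arguments were appended in order, so the clone's last parameter
      // corresponds to LastHiddenArgIndex; index backwards from there to find
      // the parameter for HiddenArgIndex.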
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
      Argument *Arg = NF->getArg(Index);
      LoadInst->replaceAllUsesWith(Arg);
    }
  }
};

} // end anonymous namespace

char AMDGPUPreloadKernelArgumentsLegacy::ID = 0;

INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE,
                "AMDGPU Preload Kernel Arguments", false, false)

ModulePass *
llvm::createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *TM) {
  return new AMDGPUPreloadKernelArgumentsLegacy(
      static_cast<const GCNTargetMachine *>(TM));
}

AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
    const GCNTargetMachine *TM)
    : ModulePass(ID), TM(TM) {}

static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) {
  SmallVector<Function *, 4> FunctionsToErase;
  bool Changed = false;
  for (auto &F : M) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    if (!ST.hasKernargPreload() ||
        F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
      continue;

    PreloadKernelArgInfo PreloadInfo(F, ST);
    uint64_t ExplicitArgOffset = 0;
    const DataLayout &DL = F.getDataLayout();
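    // Byte offset from the base of the kernarg segment to the first explicit
    // argument.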
    const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
    unsigned NumPreloadsRequested = KernargPreloadCount;
    unsigned NumPreloadedExplicitArgs = 0;
    for (Argument &Arg : F.args()) {
      // Avoid incompatible attributes and guard against running this pass
      // twice.
      //
      // TODO: Preload byref kernel arguments
      if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
          Arg.hasAttribute("amdgpu-hidden-argument"))
        break;

      // Inreg may be pre-existing on some arguments; try to preload those as
      // well.
      if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
        break;

      // FIXME: Preload aggregates.
      if (Arg.getType()->isAggregateType())
        break;

      Type *ArgTy = Arg.getType();
      Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
      uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
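      // ExplicitArgOffset tracks the byte offset just past this argument in
      // the kernarg segment; everything up to that point must fit in free user
      // SGPRs for the argument to be preloaded.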
      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

      if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
        break;

      Arg.addAttr(Attribute::InReg);
      NumPreloadedExplicitArgs++;
      if (NumPreloadsRequested > 0)
        NumPreloadsRequested--;
    }

    // Only try preloading hidden arguments if we can successfully preload the
    // last explicit argument.
    if (NumPreloadedExplicitArgs == F.arg_size()) {
      uint64_t ImplicitArgsBaseOffset =
          alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
          BaseOffset;
      PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
                                                FunctionsToErase);
    }

    Changed |= NumPreloadedExplicitArgs > 0;
  }

  // Erase the original functions that were cloned with an updated signature to
  // support preloading hidden kernel arguments.
  for (auto *F : FunctionsToErase)
    F->eraseFromParent();

  return Changed;
}

bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) {
  if (skipModule(M) || !TM)
    return false;

  return markKernelArgsAsInreg(M, *TM);
}

PreservedAnalyses
AMDGPUPreloadKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) {
  bool Changed = markKernelArgsAsInreg(M, TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}