//===- AMDGPUPreloadKernelArguments.cpp - Preload Kernel Arguments --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass preloads kernel arguments into user_data SGPRs before kernel
/// execution begins. The number of registers available for preloading depends
/// on the number of free user SGPRs, up to the hardware's maximum limit.
/// Implicit arguments enabled in the kernel descriptor are allocated first,
/// followed by SGPRs used for preloaded kernel arguments. (Reference:
/// https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state)
/// Additionally, hidden kernel arguments may be preloaded, in which case they
/// are appended to the kernel signature after explicit arguments. Preloaded
/// arguments will be marked with `inreg`.
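///
/// For example, given sufficient free user SGPRs, a kernel such as
///   define amdgpu_kernel void @f(i32 %a, i32 %b)
/// has both parameters marked `inreg`, and if its body loads the hidden
/// block-count-x value through llvm.amdgcn.implicitarg.ptr, the kernel may be
/// cloned with an appended `i32 inreg %_hidden_block_count_x` parameter that
/// replaces the load. (Illustrative sketch; the exact IR depends on the target
/// and on SGPR availability.)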
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"

#define DEBUG_TYPE "amdgpu-preload-kernel-arguments"

using namespace llvm;

static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));

namespace {

class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
  const GCNTargetMachine *TM;

public:
  static char ID;
  explicit AMDGPUPreloadKernelArgumentsLegacy(
      const GCNTargetMachine *TM = nullptr);

  StringRef getPassName() const override {
    return "AMDGPU Preload Kernel Arguments";
  }

  bool runOnModule(Module &M) override;
};

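// Tracks the user SGPRs available to a kernel and rewrites the kernel to
// preload hidden (implicit) arguments into them, cloning the function when
// its signature must change.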
class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  unsigned NumFreeUserSGPRs;

  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X,
    HIDDEN_BLOCK_COUNT_Y,
    HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X,
    HIDDEN_GROUP_SIZE_Y,
    HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X,
    HIDDEN_REMAINDER_Y,
    HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS
  };

  // Stores information about a specific hidden argument.
  struct HiddenArgInfo {
    // Offset in bytes from the location in the kernarg segment pointed to by
    // the implicitarg pointer.
    uint8_t Offset;
    // The size of the hidden argument in bytes.
    uint8_t Size;
    // The name of the hidden argument in the kernel signature.
    const char *Name;
  };

  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};

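  // Map a byte offset from the implicitarg pointer to the hidden argument that
  // starts there, or END_HIDDEN_ARGS if no hidden argument begins at Offset.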
  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }

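  // Hidden arguments are materialized as integers whose width matches their
  // size in the kernarg segment.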
  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

    llvm_unreachable("Unexpected hidden argument.");
  }

  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return HiddenArgs[HA].Name;

    llvm_unreachable("Unexpected hidden argument.");
  }

  // Clones the function after appending hidden arguments to its argument list
  // and returns the updated function. Hidden arguments are added up to and
  // including the last one that will be preloaded, indicated by
  // LastPreloadIndex. Preloading currently covers only a contiguous prefix of
  // the kernarg segment, including implicit (hidden) arguments, so every
  // argument up to the last preloaded one is preloaded even if its data is
  // unused.
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getParent()->getContext();
    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFT =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF =
        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);

    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    Function::arg_iterator NFArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(&*NFArg);
      NFArg->takeName(&Arg);
      ++NFArg;
    }

    // Mark the appended hidden arguments inreg and tag them so a rerun of the
    // pass can recognize them.
    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }

    NF->setAttributes(AL);
    F.replaceAllUsesWith(NF);

    return NF;
  }

public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Set NumFreeUserSGPRs to the number of user SGPRs that remain available for
  // preloading arguments.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }

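  // A kernarg can be preloaded only if it ends within the bytes the free user
  // SGPRs can hold; each SGPR covers one 4-byte dword of the kernarg segment.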
  bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
    return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
  }

  // Try to allocate SGPRs to preload hidden kernel arguments.
  void
  tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                SmallVectorImpl<Function *> &FunctionsToErase) {
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Pair is the load and the load offset.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
    for (auto *U : ImplicitArgPtr->users()) {
      Instruction *CI = dyn_cast<Instruction>(U);
      if (!CI || CI->getParent()->getParent() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
        if (!Load) {
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
        }

        if (!Load || !Load->isSimple())
          continue;

        // FIXME: Extend this to handle merged loads.
        LLVMContext &Ctx = F.getParent()->getContext();
        Type *LoadTy = Load->getType();
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;

    // Allocate loads in order of offset. We need to be sure that the implicit
    // argument can actually be preloaded.
    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

    // If we fail to preload any implicit argument we know we don't have SGPRs
    // to preload any subsequent ones with larger offsets. Find the first
    // argument that we cannot preload.
    auto *PreloadEnd = llvm::find_if(
        ImplicitArgLoads, [&](const std::pair<LoadInst *, unsigned> &Load) {
          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
          unsigned LoadOffset = Load.second;
          if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
                                         ImplicitArgsBaseOffset))
            return true;

          return false;
        });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;

    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    assert(NF);
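    // The clone has replaced F everywhere; queue the original for deletion
    // once the whole module has been processed.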
    FunctionsToErase.push_back(&F);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      LoadInst *Load = I->first;
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
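      // Hidden arguments were appended to NF in enum order, so the one with
      // enum index HiddenArgIndex sits (LastHiddenArgIndex + 1 -
      // HiddenArgIndex) slots from the end of the argument list.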
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
      Argument *Arg = NF->getArg(Index);
      Load->replaceAllUsesWith(Arg);
    }
  }
};

} // end anonymous namespace

char AMDGPUPreloadKernelArgumentsLegacy::ID = 0;

INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE,
                "AMDGPU Preload Kernel Arguments", false, false)

ModulePass *
llvm::createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *TM) {
  return new AMDGPUPreloadKernelArgumentsLegacy(
      static_cast<const GCNTargetMachine *>(TM));
}

AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
    const GCNTargetMachine *TM)
    : ModulePass(ID), TM(TM) {}

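// Shared implementation for both pass managers: mark preloadable explicit
// kernel arguments inreg and, when every explicit argument fits, try to
// preload hidden arguments as well.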
static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) {
  SmallVector<Function *, 4> FunctionsToErase;
  bool Changed = false;
  for (auto &F : M) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    if (!ST.hasKernargPreload() ||
        F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
      continue;

    PreloadKernelArgInfo PreloadInfo(F, ST);
    uint64_t ExplicitArgOffset = 0;
    const DataLayout &DL = F.getDataLayout();
    const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
    unsigned NumPreloadsRequested = KernargPreloadCount;
    unsigned NumPreloadedExplicitArgs = 0;
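    // Walk the explicit arguments in kernarg order, marking each one inreg
    // until the request count, an incompatible argument, or the SGPR budget
    // stops us.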
    for (Argument &Arg : F.args()) {
      // Avoid incompatible attributes and guard against running this pass
      // twice.
      //
      // TODO: Preload byref kernel arguments
      if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
          Arg.hasAttribute("amdgpu-hidden-argument"))
        break;

      // Inreg may be pre-existing on some arguments; try to preload these even
      // after the requested count is exhausted.
      if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
        break;

      // FIXME: Preload aggregates.
      if (Arg.getType()->isAggregateType())
        break;

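      // Advance the running offset past this argument; the entire argument
      // must fit within the free user SGPRs for it to be preloaded.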
      Type *ArgTy = Arg.getType();
      Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
      uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

      if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
        break;

      Arg.addAttr(Attribute::InReg);
      NumPreloadedExplicitArgs++;
      if (NumPreloadsRequested > 0)
        NumPreloadsRequested--;
    }

    // Only try preloading hidden arguments if we can successfully preload the
    // last explicit argument.
    if (NumPreloadedExplicitArgs == F.arg_size()) {
      uint64_t ImplicitArgsBaseOffset =
          alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
          BaseOffset;
      PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
                                                FunctionsToErase);
    }

    Changed |= NumPreloadedExplicitArgs > 0;
  }

  // Erase the original functions that were replaced by clones with updated
  // signatures while preloading hidden kernel arguments. Cloning changed the
  // module even for kernels with no explicit arguments, so record that here.
  for (auto *F : FunctionsToErase) {
    F->eraseFromParent();
    Changed = true;
  }

  return Changed;
}

bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) {
  if (skipModule(M) || !TM)
    return false;

  return markKernelArgsAsInreg(M, *TM);
}

PreservedAnalyses
AMDGPUPreloadKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) {
  bool Changed = markKernelArgsAsInreg(M, TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}