//===- AMDGPUPreloadKernelArguments.cpp - Preload Kernel Arguments --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass preloads kernel arguments into user_data SGPRs before kernel
/// execution begins. The number of registers available for preloading depends
/// on the number of free user SGPRs, up to the hardware's maximum limit.
/// Implicit arguments enabled in the kernel descriptor are allocated first,
/// followed by SGPRs used for preloaded kernel arguments. (Reference:
/// https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state)
/// Additionally, hidden kernel arguments may be preloaded, in which case they
/// are appended to the kernel signature after explicit arguments. Preloaded
/// arguments will be marked with `inreg`.
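///
/// For example (illustrative IR; exact types and offsets depend on the
/// target), a kernel that loads a hidden argument through
/// @llvm.amdgcn.implicitarg.ptr:
///
///   define amdgpu_kernel void @k(i32 %x) {
///     %imp = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
///     %bcx = load i32, ptr addrspace(4) %imp ; hidden_block_count_x
///     ...
///   }
///
/// may be rewritten so the load is replaced by a preloaded hidden argument
/// appended to the signature:
///
///   define amdgpu_kernel void @k(i32 inreg %x,
///       i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) {
///     ...
///   }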
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"

#define DEBUG_TYPE "amdgpu-preload-kernel-arguments"

using namespace llvm;

static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
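// Example invocation (illustrative; assumes a subtarget where
// hasKernargPreload() is true, e.g. a gfx940-class target):
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 \
//       -amdgpu-kernarg-preload-count=2 kernel.ll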

namespace {

class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
  const GCNTargetMachine *TM;

public:
  static char ID;
  explicit AMDGPUPreloadKernelArgumentsLegacy(
      const GCNTargetMachine *TM = nullptr);

  StringRef getPassName() const override {
    return "AMDGPU Preload Kernel Arguments";
  }

  bool runOnModule(Module &M) override;
};

class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  unsigned NumFreeUserSGPRs;

  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X,
    HIDDEN_BLOCK_COUNT_Y,
    HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X,
    HIDDEN_GROUP_SIZE_Y,
    HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X,
    HIDDEN_REMAINDER_Y,
    HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS
  };

  // Stores information about a specific hidden argument.
  struct HiddenArgInfo {
    // Offset in bytes from the location in the kernarg segment pointed to by
    // the implicitarg pointer.
    uint8_t Offset;
    // The size of the hidden argument in bytes.
    uint8_t Size;
    // The name of the hidden argument in the kernel signature.
    const char *Name;
  };

  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};

  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }

  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

    llvm_unreachable("Unexpected hidden argument.");
  }

  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return HiddenArgs[HA].Name;

    llvm_unreachable("Unexpected hidden argument.");
  }

  // Clones the function after adding implicit arguments to the argument list
  // and returns the new, updated function. Preloaded implicit arguments are
  // added up to and including the last one that will be preloaded, indicated
  // by LastPreloadIndex. Currently, preloading is only performed on a
  // contiguous, sequential range of data from the start of the kernarg
  // segment, including implicit (hidden) arguments. This means that all
  // arguments up to the last preloaded argument will also be preloaded, even
  // if that data is unused.
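  //
  // For example (illustrative): if the only hidden-argument load in the
  // kernel reads _hidden_group_size_x (index 3), the three block counts
  // (indices 0-2) are still appended and preloaded as well, since the
  // preloaded region is a single contiguous prefix of the hidden arguments.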
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getParent()->getContext();
    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFT =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF =
        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);

    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    Function::arg_iterator NFArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(&*NFArg);
      NFArg->takeName(&Arg);
      ++NFArg;
    }

    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }

    NF->setAttributes(AL);
    F.replaceAllUsesWith(NF);

    return NF;
  }

public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Sets NumFreeUserSGPRs to the maximum number of user SGPRs available to
  // preload arguments.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }

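  // Each user SGPR is 32 bits wide, so a kernarg byte range can be preloaded
  // only if it lies entirely within the first NumFreeUserSGPRs * 4 bytes of
  // the kernarg segment.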
  bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
    return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
  }

  // Try to allocate SGPRs to preload hidden kernel arguments.
  void
  tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                SmallVectorImpl<Function *> &FunctionsToErase) {
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Pair is the load and the load offset.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
    for (auto *U : ImplicitArgPtr->users()) {
      Instruction *CI = dyn_cast<Instruction>(U);
      if (!CI || CI->getParent()->getParent() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
        if (!Load) {
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
        }

        if (!Load || !Load->isSimple())
          continue;

        // FIXME: Expand to handle merged loads.
        LLVMContext &Ctx = F.getParent()->getContext();
        Type *LoadTy = Load->getType();
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;

    // Allocate loads in order of offset. We need to be sure that the implicit
    // argument can actually be preloaded.
    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

    // If we fail to preload any implicit argument we know we don't have SGPRs
    // to preload any subsequent ones with larger offsets. Find the first
    // argument that we cannot preload.
    auto *PreloadEnd = llvm::find_if(
        ImplicitArgLoads, [&](const std::pair<LoadInst *, unsigned> &Load) {
          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
          unsigned LoadOffset = Load.second;
          if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
                                         ImplicitArgsBaseOffset))
            return true;

          return false;
        });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;

    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    assert(NF);
    FunctionsToErase.push_back(&F);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      LoadInst *LoadInst = I->first;
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
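      // Map the load's hidden-arg index to its position among the appended
      // parameters. Worked example (illustrative): with 2 explicit args and
      // hidden args preloaded through index 1 (_hidden_block_count_y),
      // NF->arg_size() == 4; a load of _hidden_block_count_x (index 0) maps
      // to argument 4 - 1 + 0 - 1 == 2, the first appended parameter.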
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
      Argument *Arg = NF->getArg(Index);
      LoadInst->replaceAllUsesWith(Arg);
    }
  }
};

} // end anonymous namespace

char AMDGPUPreloadKernelArgumentsLegacy::ID = 0;

INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE,
                "AMDGPU Preload Kernel Arguments", false, false)

ModulePass *
llvm::createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *TM) {
  return new AMDGPUPreloadKernelArgumentsLegacy(
      static_cast<const GCNTargetMachine *>(TM));
}

AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
    const GCNTargetMachine *TM)
    : ModulePass(ID), TM(TM) {}

static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) {
  SmallVector<Function *, 4> FunctionsToErase;
  bool Changed = false;
  for (auto &F : M) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    if (!ST.hasKernargPreload() ||
        F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
      continue;

    PreloadKernelArgInfo PreloadInfo(F, ST);
    uint64_t ExplicitArgOffset = 0;
    const DataLayout &DL = F.getDataLayout();
    const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
    unsigned NumPreloadsRequested = KernargPreloadCount;
    unsigned NumPreloadedExplicitArgs = 0;
    for (Argument &Arg : F.args()) {
      // Avoid incompatible attributes and guard against running this pass
      // twice.
      //
      // TODO: Preload byref kernel arguments
      if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
          Arg.hasAttribute("amdgpu-hidden-argument"))
        break;

      // Inreg may be pre-existing on some arguments; try to preload these.
      if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
        break;

      // FIXME: Preload aggregates.
      if (Arg.getType()->isAggregateType())
        break;

      Type *ArgTy = Arg.getType();
      Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
      uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
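      // Advance the running offset past this argument. Worked example
      // (illustrative): an i8 at offset 0 followed by an i32 leaves
      // ExplicitArgOffset at 1 after the i8; aligning to the i32's ABI
      // alignment of 4 and adding its alloc size gives 4 + 4 == 8.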
      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

      if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
        break;

      Arg.addAttr(Attribute::InReg);
      NumPreloadedExplicitArgs++;
      if (NumPreloadsRequested > 0)
        NumPreloadsRequested--;
    }

    // Only try preloading hidden arguments if we can successfully preload the
    // last explicit argument.
    if (NumPreloadedExplicitArgs == F.arg_size()) {
      uint64_t ImplicitArgsBaseOffset =
          alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
          BaseOffset;
      PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
                                                FunctionsToErase);
    }

    Changed |= NumPreloadedExplicitArgs > 0;
  }

  // Erase the original functions that were cloned and replaced when a kernel
  // signature had to be updated to support preloading hidden kernel
  // arguments.
  for (auto *F : FunctionsToErase)
    F->eraseFromParent();

  return Changed;
}

bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) {
  if (skipModule(M) || !TM)
    return false;

  return markKernelArgsAsInreg(M, *TM);
}

PreservedAnalyses
AMDGPUPreloadKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) {
  bool Changed = markKernelArgsAsInreg(M, TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}