1 | //===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass eliminates local data store, LDS, uses from non-kernel functions. |
10 | // LDS is contiguous memory allocated per kernel execution. |
11 | // |
12 | // Background. |
13 | // |
14 | // The programming model is global variables, or equivalently function local |
15 | // static variables, accessible from kernels or other functions. For uses from |
// kernels this is straightforward: record on the kernel the total memory
// required by all the variables combined and allocate each variable at an
// offset within that block.
18 | // For uses from functions there are performance tradeoffs to choose between. |
19 | // |
20 | // This model means the GPU runtime can specify the amount of memory allocated. |
21 | // If this is more than the kernel assumed, the excess can be made available |
22 | // using a language specific feature, which IR represents as a variable with |
23 | // no initializer. This feature is referred to here as "Dynamic LDS" and is |
24 | // lowered slightly differently to the normal case. |
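// (For example, such a variable typically looks like
//   @dynamic_lds = external addrspace(3) global [0 x i8]
// - an illustrative name; the relevant property is the absent initializer.)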
25 | // |
26 | // Consequences of this GPU feature: |
27 | // - memory is limited and exceeding it halts compilation |
28 | // - a global accessed by one kernel exists independent of other kernels |
29 | // - a global exists independent of simultaneous execution of the same kernel |
// - the address of a given global may differ between kernels; the per-kernel
//   instances do not alias, which permits allocating only the variables each
//   kernel uses
32 | // - if the address is allowed to differ, functions need help to find it |
33 | // |
34 | // Uses from kernels are implemented here by grouping them in a per-kernel |
35 | // struct instance. This duplicates the variables, accurately modelling their |
36 | // aliasing properties relative to a single global representation. It also |
37 | // permits control over alignment via padding. |
38 | // |
39 | // Uses from functions are more complicated and the primary purpose of this |
// IR pass. Several different lowerings are chosen between to meet requirements
41 | // to avoid allocating any LDS where it is not necessary, as that impacts |
42 | // occupancy and may fail the compilation, while not imposing overhead on a |
43 | // feature whose primary advantage over global memory is performance. The basic |
44 | // design goal is to avoid one kernel imposing overhead on another. |
45 | // |
46 | // Implementation. |
47 | // |
48 | // LDS variables with constant annotation or non-undef initializer are passed |
49 | // through unchanged for simplification or error diagnostics in later passes. |
50 | // Non-undef initializers are not yet implemented for LDS. |
51 | // |
52 | // LDS variables that are always allocated at the same address can be found |
53 | // by lookup at that address. Otherwise runtime information/cost is required. |
54 | // |
55 | // The simplest strategy possible is to group all LDS variables in a single |
56 | // struct and allocate that struct in every kernel such that the original |
57 | // variables are always at the same address. LDS is however a limited resource |
58 | // so this strategy is unusable in practice. It is not implemented here. |
59 | // |
60 | // Strategy | Precise allocation | Zero runtime cost | General purpose | |
61 | // --------+--------------------+-------------------+-----------------+ |
62 | // Module | No | Yes | Yes | |
63 | // Table | Yes | No | Yes | |
64 | // Kernel | Yes | Yes | No | |
65 | // Hybrid | Yes | Partial | Yes | |
66 | // |
67 | // "Module" spends LDS memory to save cycles. "Table" spends cycles and global |
68 | // memory to save LDS. "Kernel" is as fast as kernel allocation but only works |
69 | // for variables that are known reachable from a single kernel. "Hybrid" picks |
70 | // between all three. When forced to choose between LDS and cycles we minimise |
71 | // LDS use. |
//
73 | // The "module" lowering implemented here finds LDS variables which are used by |
74 | // non-kernel functions and creates a new struct with a field for each of those |
75 | // LDS variables. Variables that are only used from kernels are excluded. |
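// As an illustrative sketch (the struct name matches the one created later in
// this file, @var is a hypothetical LDS variable), a use such as
//   %v = load i32, ptr addrspace(3) @var
// becomes a load through a constant GEP to the corresponding field of
// @llvm.amdgcn.module.lds, which is allocated at address zero in every kernel
// that needs it.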
76 | // |
77 | // The "table" lowering implemented here has three components. |
// First, each kernel is assigned a unique integer identifier which is
// available in the functions it calls through the intrinsic
// amdgcn_lds_kernel_id. The integer is passed through a specific SGPR and thus
// works with indirect calls.
81 | // Second, each kernel allocates LDS variables independent of other kernels and |
// writes the addresses it chose for each variable into an array in a
// consistent order. If the kernel does not allocate a given variable, it
// writes poison to
84 | // the corresponding array location. These arrays are written to a constant |
85 | // table in the order matching the kernel unique integer identifier. |
86 | // Third, uses from non-kernel functions are replaced with a table lookup using |
87 | // the intrinsic function to find the address of the variable. |
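// Illustrative sketch of the result, for N kernels and M variables (the table
// and intrinsic names match those used later in this file, other names are
// hypothetical):
//   @llvm.amdgcn.lds.offset.table = internal addrspace(4) constant [N x [M x i32]] ...
//   %id  = call i32 @llvm.amdgcn.lds.kernel.id()
//   %gep = getelementptr inbounds [N x [M x i32]], ptr addrspace(4)
//          @llvm.amdgcn.lds.offset.table, i32 0, i32 %id, i32 <variable index>
//   %off = load i32, ptr addrspace(4) %gep
//   %ptr = inttoptr i32 %off to ptr addrspace(3)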
88 | // |
89 | // "Kernel" lowering is only applicable for variables that are unambiguously |
90 | // reachable from exactly one kernel. For those cases, accesses to the variable |
91 | // can be lowered to ConstantExpr address of a struct instance specific to that |
92 | // one kernel. This is zero cost in space and in compute. It will raise a fatal |
93 | // error on any variable that might be reachable from multiple kernels and is |
94 | // thus most easily used as part of the hybrid lowering strategy. |
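// Sketch: a use of such a variable is rewritten to a constant GEP into the
// per-kernel struct @llvm.amdgcn.kernel.<kernelname>.lds created below, so no
// runtime lookup is needed.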
95 | // |
96 | // Hybrid lowering is a mixture of the above. It uses the zero cost kernel |
97 | // lowering where it can. It lowers the variable accessed by the greatest |
98 | // number of kernels using the module strategy as that is free for the first |
// variable. Any further variables that can be lowered with the module strategy
// without incurring LDS memory overhead are lowered that way. The remaining
// ones are lowered via table.
102 | // |
103 | // Consequences |
104 | // - No heuristics or user controlled magic numbers, hybrid is the right choice |
105 | // - Kernels that don't use functions (or have had them all inlined) are not |
106 | // affected by any lowering for kernels that do. |
107 | // - Kernels that don't make indirect function calls are not affected by those |
108 | // that do. |
109 | // - Variables which are used by lots of kernels, e.g. those injected by a |
110 | // language runtime in most kernels, are expected to have no overhead |
111 | // - Implementations that instantiate templates per-kernel where those templates |
112 | // use LDS are expected to hit the "Kernel" lowering strategy |
113 | // - The runtime properties impose a cost in compiler implementation complexity |
114 | // |
115 | // Dynamic LDS implementation |
116 | // Dynamic LDS is lowered similarly to the "table" strategy above and uses the |
117 | // same intrinsic to identify which kernel is at the root of the dynamic call |
118 | // graph. This relies on the specified behaviour that all dynamic LDS variables |
119 | // alias one another, i.e. are at the same address, with respect to a given |
120 | // kernel. Therefore this pass creates new dynamic LDS variables for each kernel |
121 | // that allocates any dynamic LDS and builds a table of addresses out of those. |
122 | // The AMDGPUPromoteAlloca pass skips kernels that use dynamic LDS. |
123 | // The corresponding optimisation for "kernel" lowering where the table lookup |
124 | // is elided is not implemented. |
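// Sketch: a kernel K that allocates dynamic LDS gets a representative variable
// @llvm.amdgcn.K.dynlds, and @llvm.amdgcn.dynlds.offset.table records the
// address chosen for it, indexed by the same kernel id intrinsic as above
// (names match those created later in this file).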
125 | // |
126 | // |
127 | // Implementation notes / limitations |
// A single LDS global variable represents an instance per kernel that can
// reach it. This pass essentially specialises those variables per kernel.
130 | // Handling ConstantExpr during the pass complicated this significantly so now |
131 | // all ConstantExpr uses of LDS variables are expanded to instructions. This |
132 | // may need amending when implementing non-undef initialisers. |
133 | // |
134 | // Lowering is split between this IR pass and the back end. This pass chooses |
135 | // where given variables should be allocated and marks them with metadata, |
136 | // MD_absolute_symbol. The backend places the variables in coincidentally the |
137 | // same location and raises a fatal error if something has gone awry. This works |
138 | // in practice because the only pass between this one and the backend that |
139 | // changes LDS is PromoteAlloca and the changes it makes do not conflict. |
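// For example, a variable assigned address 8 carries
//   !absolute_symbol !{i32 8, i32 9}
// i.e. the half open range [8, 9); see recordLDSAbsoluteAddress below.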
140 | // |
141 | // Addresses are written to constant global arrays based on the same metadata. |
142 | // |
143 | // The backend lowers LDS variables in the order of traversal of the function. |
144 | // This is at odds with the deterministic layout required. The workaround is to |
145 | // allocate the fixed-address variables immediately upon starting the function |
146 | // where they can be placed as intended. This requires a means of mapping from |
147 | // the function to the variables that it allocates. For the module scope lds, |
148 | // this is via metadata indicating whether the variable is not required. If a |
149 | // pass deletes that metadata, a fatal error on disagreement with the absolute |
150 | // symbol metadata will occur. For kernel scope and dynamic, this is by _name_ |
151 | // correspondence between the function and the variable. It requires the |
152 | // kernel to have a name (which is only a limitation for tests in practice) and |
153 | // for nothing to rename the corresponding symbols. This is a hazard if the pass |
154 | // is run multiple times during debugging. Alternative schemes considered all |
155 | // involve bespoke metadata. |
156 | // |
157 | // If the name correspondence can be replaced, multiple distinct kernels that |
158 | // have the same memory layout can map to the same kernel id (as the address |
159 | // itself is handled by the absolute symbol metadata) and that will allow more |
160 | // uses of the "kernel" style faster lowering and reduce the size of the lookup |
161 | // tables. |
162 | // |
163 | // There is a test that checks this does not fire for a graphics shader. This |
164 | // lowering is expected to work for graphics if the isKernel test is changed. |
165 | // |
166 | // The current markUsedByKernel is sufficient for PromoteAlloca but is elided |
167 | // before codegen. Replacing this with an equivalent intrinsic which lasts until |
168 | // shortly after the machine function lowering of LDS would help break the name |
169 | // mapping. The other part needed is probably to amend PromoteAlloca to embed |
170 | // the LDS variables it creates in the same struct created here. That avoids the |
171 | // current hazard where a PromoteAlloca LDS variable might be allocated before |
172 | // the kernel scope (and thus error on the address check). Given a new invariant |
173 | // that no LDS variables exist outside of the structs managed here, and an |
174 | // intrinsic that lasts until after the LDS frame lowering, it should be |
175 | // possible to drop the name mapping and fold equivalent memory layouts. |
176 | // |
177 | //===----------------------------------------------------------------------===// |
178 | |
179 | #include "AMDGPU.h" |
180 | #include "AMDGPUTargetMachine.h" |
181 | #include "Utils/AMDGPUBaseInfo.h" |
182 | #include "Utils/AMDGPUMemoryUtils.h" |
183 | #include "llvm/ADT/BitVector.h" |
184 | #include "llvm/ADT/DenseMap.h" |
185 | #include "llvm/ADT/DenseSet.h" |
186 | #include "llvm/ADT/STLExtras.h" |
187 | #include "llvm/ADT/SetOperations.h" |
188 | #include "llvm/Analysis/CallGraph.h" |
189 | #include "llvm/CodeGen/TargetPassConfig.h" |
190 | #include "llvm/IR/Constants.h" |
191 | #include "llvm/IR/DerivedTypes.h" |
192 | #include "llvm/IR/IRBuilder.h" |
193 | #include "llvm/IR/InlineAsm.h" |
194 | #include "llvm/IR/Instructions.h" |
195 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
196 | #include "llvm/IR/MDBuilder.h" |
197 | #include "llvm/IR/ReplaceConstant.h" |
198 | #include "llvm/InitializePasses.h" |
199 | #include "llvm/Pass.h" |
200 | #include "llvm/Support/CommandLine.h" |
201 | #include "llvm/Support/Debug.h" |
202 | #include "llvm/Support/Format.h" |
203 | #include "llvm/Support/OptimizedStructLayout.h" |
204 | #include "llvm/Support/raw_ostream.h" |
205 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
206 | #include "llvm/Transforms/Utils/ModuleUtils.h" |
207 | |
208 | #include <vector> |
209 | |
210 | #include <cstdio> |
211 | |
212 | #define DEBUG_TYPE "amdgpu-lower-module-lds" |
213 | |
214 | using namespace llvm; |
215 | using namespace AMDGPU; |
216 | |
217 | namespace { |
218 | |
219 | cl::opt<bool> SuperAlignLDSGlobals( |
220 | "amdgpu-super-align-lds-globals" , |
221 | cl::desc("Increase alignment of LDS if it is not on align boundary" ), |
222 | cl::init(Val: true), cl::Hidden); |
223 | |
224 | enum class LoweringKind { module, table, kernel, hybrid }; |
225 | cl::opt<LoweringKind> LoweringKindLoc( |
226 | "amdgpu-lower-module-lds-strategy" , |
227 | cl::desc("Specify lowering strategy for function LDS access:" ), cl::Hidden, |
228 | cl::init(Val: LoweringKind::hybrid), |
229 | cl::values( |
230 | clEnumValN(LoweringKind::table, "table" , "Lower via table lookup" ), |
231 | clEnumValN(LoweringKind::module, "module" , "Lower via module struct" ), |
232 | clEnumValN( |
233 | LoweringKind::kernel, "kernel" , |
234 | "Lower variables reachable from one kernel, otherwise abort" ), |
235 | clEnumValN(LoweringKind::hybrid, "hybrid" , |
236 | "Lower via mixture of above strategies" ))); |
237 | |
238 | template <typename T> std::vector<T> sortByName(std::vector<T> &&V) { |
239 | llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) { |
240 | return L->getName() < R->getName(); |
241 | }); |
242 | return {std::move(V)}; |
243 | } |
244 | |
245 | class AMDGPULowerModuleLDS { |
246 | const AMDGPUTargetMachine &TM; |
247 | |
248 | static void |
249 | removeLocalVarsFromUsedLists(Module &M, |
250 | const DenseSet<GlobalVariable *> &LocalVars) { |
251 | // The verifier rejects used lists containing an inttoptr of a constant |
252 | // so remove the variables from these lists before replaceAllUsesWith |
SmallPtrSet<Constant *, 8> LocalVarsSet;
254 | for (GlobalVariable *LocalVar : LocalVars) |
255 | LocalVarsSet.insert(Ptr: cast<Constant>(Val: LocalVar->stripPointerCasts())); |
256 | |
257 | removeFromUsedLists( |
258 | M, ShouldRemove: [&LocalVarsSet](Constant *C) { return LocalVarsSet.count(Ptr: C); }); |
259 | |
260 | for (GlobalVariable *LocalVar : LocalVars) |
261 | LocalVar->removeDeadConstantUsers(); |
262 | } |
263 | |
264 | static void markUsedByKernel(Function *Func, GlobalVariable *SGV) { |
265 | // The llvm.amdgcn.module.lds instance is implicitly used by all kernels |
266 | // that might call a function which accesses a field within it. This is |
267 | // presently approximated to 'all kernels' if there are any such functions |
268 | // in the module. This implicit use is redefined as an explicit use here so |
269 | // that later passes, specifically PromoteAlloca, account for the required |
270 | // memory without any knowledge of this transform. |
271 | |
272 | // An operand bundle on llvm.donothing works because the call instruction |
273 | // survives until after the last pass that needs to account for LDS. It is |
274 | // better than inline asm as the latter survives until the end of codegen. A |
275 | // totally robust solution would be a function with the same semantics as |
276 | // llvm.donothing that takes a pointer to the instance and is lowered to a |
277 | // no-op after LDS is allocated, but that is not presently necessary. |
278 | |
279 | // This intrinsic is eliminated shortly before instruction selection. It |
280 | // does not suffice to indicate to ISel that a given global which is not |
281 | // immediately used by the kernel must still be allocated by it. An |
282 | // equivalent target specific intrinsic which lasts until immediately after |
283 | // codegen would suffice for that, but one would still need to ensure that |
284 | // the variables are allocated in the anticipated order. |
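// The emitted IR is roughly (a sketch; %gep stands for the pointer created
// just below):
//   call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) %gep) ]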
285 | BasicBlock *Entry = &Func->getEntryBlock(); |
286 | IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); |
287 | |
288 | Function *Decl = |
289 | Intrinsic::getDeclaration(M: Func->getParent(), id: Intrinsic::donothing, Tys: {}); |
290 | |
291 | Value *UseInstance[1] = { |
292 | Builder.CreateConstInBoundsGEP1_32(Ty: SGV->getValueType(), Ptr: SGV, Idx0: 0)}; |
293 | |
294 | Builder.CreateCall( |
295 | Callee: Decl, Args: {}, OpBundles: {OperandBundleDefT<Value *>("ExplicitUse" , UseInstance)}); |
296 | } |
297 | |
298 | public: |
299 | AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {} |
300 | |
301 | struct LDSVariableReplacement { |
302 | GlobalVariable *SGV = nullptr; |
303 | DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP; |
304 | }; |
305 | |
306 | // remap from lds global to a constantexpr gep to where it has been moved to |
307 | // for each kernel |
308 | // an array with an element for each kernel containing where the corresponding |
309 | // variable was remapped to |
310 | |
311 | static Constant *getAddressesOfVariablesInKernel( |
312 | LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables, |
313 | const DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) { |
314 | // Create a ConstantArray containing the address of each Variable within the |
315 | // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel |
316 | // does not allocate it |
317 | // TODO: Drop the ptrtoint conversion |
318 | |
319 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
320 | |
321 | ArrayType *KernelOffsetsType = ArrayType::get(ElementType: I32, NumElements: Variables.size()); |
322 | |
323 | SmallVector<Constant *> Elements; |
324 | for (GlobalVariable *GV : Variables) { |
325 | auto ConstantGepIt = LDSVarsToConstantGEP.find(Val: GV); |
326 | if (ConstantGepIt != LDSVarsToConstantGEP.end()) { |
327 | auto elt = ConstantExpr::getPtrToInt(C: ConstantGepIt->second, Ty: I32); |
328 | Elements.push_back(Elt: elt); |
329 | } else { |
330 | Elements.push_back(Elt: PoisonValue::get(T: I32)); |
331 | } |
332 | } |
333 | return ConstantArray::get(T: KernelOffsetsType, V: Elements); |
334 | } |
335 | |
336 | static GlobalVariable *buildLookupTable( |
337 | Module &M, ArrayRef<GlobalVariable *> Variables, |
338 | ArrayRef<Function *> kernels, |
339 | DenseMap<Function *, LDSVariableReplacement> &KernelToReplacement) { |
340 | if (Variables.empty()) { |
341 | return nullptr; |
342 | } |
343 | LLVMContext &Ctx = M.getContext(); |
344 | |
345 | const size_t NumberVariables = Variables.size(); |
346 | const size_t NumberKernels = kernels.size(); |
347 | |
348 | ArrayType *KernelOffsetsType = |
349 | ArrayType::get(ElementType: Type::getInt32Ty(C&: Ctx), NumElements: NumberVariables); |
350 | |
351 | ArrayType *AllKernelsOffsetsType = |
352 | ArrayType::get(ElementType: KernelOffsetsType, NumElements: NumberKernels); |
353 | |
354 | Constant *Missing = PoisonValue::get(T: KernelOffsetsType); |
355 | std::vector<Constant *> overallConstantExprElts(NumberKernels); |
356 | for (size_t i = 0; i < NumberKernels; i++) { |
357 | auto Replacement = KernelToReplacement.find(Val: kernels[i]); |
358 | overallConstantExprElts[i] = |
359 | (Replacement == KernelToReplacement.end()) |
360 | ? Missing |
361 | : getAddressesOfVariablesInKernel( |
362 | Ctx, Variables, LDSVarsToConstantGEP: Replacement->second.LDSVarsToConstantGEP); |
363 | } |
364 | |
365 | Constant *init = |
366 | ConstantArray::get(T: AllKernelsOffsetsType, V: overallConstantExprElts); |
367 | |
368 | return new GlobalVariable( |
369 | M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init, |
370 | "llvm.amdgcn.lds.offset.table" , nullptr, GlobalValue::NotThreadLocal, |
371 | AMDGPUAS::CONSTANT_ADDRESS); |
372 | } |
373 | |
374 | void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder, |
375 | GlobalVariable *LookupTable, |
376 | GlobalVariable *GV, Use &U, |
377 | Value *OptionalIndex) { |
378 | // Table is a constant array of the same length as OrderedKernels |
379 | LLVMContext &Ctx = M.getContext(); |
380 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
381 | auto *I = cast<Instruction>(Val: U.getUser()); |
382 | |
383 | Value *tableKernelIndex = getTableLookupKernelIndex(M, F: I->getFunction()); |
384 | |
385 | if (auto *Phi = dyn_cast<PHINode>(Val: I)) { |
386 | BasicBlock *BB = Phi->getIncomingBlock(U); |
387 | Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt()))); |
388 | } else { |
389 | Builder.SetInsertPoint(I); |
390 | } |
391 | |
392 | SmallVector<Value *, 3> GEPIdx = { |
393 | ConstantInt::get(Ty: I32, V: 0), |
394 | tableKernelIndex, |
395 | }; |
396 | if (OptionalIndex) |
397 | GEPIdx.push_back(Elt: OptionalIndex); |
398 | |
399 | Value *Address = Builder.CreateInBoundsGEP( |
400 | Ty: LookupTable->getValueType(), Ptr: LookupTable, IdxList: GEPIdx, Name: GV->getName()); |
401 | |
402 | Value *loaded = Builder.CreateLoad(Ty: I32, Ptr: Address); |
403 | |
404 | Value *replacement = |
405 | Builder.CreateIntToPtr(V: loaded, DestTy: GV->getType(), Name: GV->getName()); |
406 | |
407 | U.set(replacement); |
408 | } |
409 | |
410 | void replaceUsesInInstructionsWithTableLookup( |
411 | Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables, |
412 | GlobalVariable *LookupTable) { |
413 | |
414 | LLVMContext &Ctx = M.getContext(); |
415 | IRBuilder<> Builder(Ctx); |
416 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
417 | |
418 | for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) { |
419 | auto *GV = ModuleScopeVariables[Index]; |
420 | |
421 | for (Use &U : make_early_inc_range(Range: GV->uses())) { |
422 | auto *I = dyn_cast<Instruction>(Val: U.getUser()); |
423 | if (!I) |
424 | continue; |
425 | |
426 | replaceUseWithTableLookup(M, Builder, LookupTable, GV, U, |
427 | OptionalIndex: ConstantInt::get(Ty: I32, V: Index)); |
428 | } |
429 | } |
430 | } |
431 | |
432 | static DenseSet<Function *> kernelsThatIndirectlyAccessAnyOfPassedVariables( |
433 | Module &M, LDSUsesInfoTy &LDSUsesInfo, |
434 | DenseSet<GlobalVariable *> const &VariableSet) { |
435 | |
436 | DenseSet<Function *> KernelSet; |
437 | |
438 | if (VariableSet.empty()) |
439 | return KernelSet; |
440 | |
441 | for (Function &Func : M.functions()) { |
442 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
443 | continue; |
444 | for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) { |
445 | if (VariableSet.contains(V: GV)) { |
446 | KernelSet.insert(V: &Func); |
447 | break; |
448 | } |
449 | } |
450 | } |
451 | |
452 | return KernelSet; |
453 | } |
454 | |
455 | static GlobalVariable * |
456 | chooseBestVariableForModuleStrategy(const DataLayout &DL, |
457 | VariableFunctionMap &LDSVars) { |
458 | // Find the global variable with the most indirect uses from kernels |
459 | |
460 | struct CandidateTy { |
461 | GlobalVariable *GV = nullptr; |
462 | size_t UserCount = 0; |
463 | size_t Size = 0; |
464 | |
465 | CandidateTy() = default; |
466 | |
467 | CandidateTy(GlobalVariable *GV, uint64_t UserCount, uint64_t AllocSize) |
468 | : GV(GV), UserCount(UserCount), Size(AllocSize) {} |
469 | |
470 | bool operator<(const CandidateTy &Other) const { |
471 | // Fewer users makes module scope variable less attractive |
472 | if (UserCount < Other.UserCount) { |
473 | return true; |
474 | } |
475 | if (UserCount > Other.UserCount) { |
476 | return false; |
477 | } |
478 | |
479 | // Bigger makes module scope variable less attractive |
480 | if (Size < Other.Size) { |
481 | return false; |
482 | } |
483 | |
484 | if (Size > Other.Size) { |
485 | return true; |
486 | } |
487 | |
488 | // Arbitrary but consistent |
489 | return GV->getName() < Other.GV->getName(); |
490 | } |
491 | }; |
492 | |
493 | CandidateTy MostUsed; |
494 | |
495 | for (auto &K : LDSVars) { |
496 | GlobalVariable *GV = K.first; |
497 | if (K.second.size() <= 1) { |
498 | // A variable reachable by only one kernel is best lowered with kernel |
499 | // strategy |
500 | continue; |
501 | } |
502 | CandidateTy Candidate( |
503 | GV, K.second.size(), |
504 | DL.getTypeAllocSize(Ty: GV->getValueType()).getFixedValue()); |
505 | if (MostUsed < Candidate) |
506 | MostUsed = Candidate; |
507 | } |
508 | |
509 | return MostUsed.GV; |
510 | } |
511 | |
512 | static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, |
513 | uint32_t Address) { |
514 | // Write the specified address into metadata where it can be retrieved by |
515 | // the assembler. Format is a half open range, [Address Address+1) |
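// e.g. Address == 8 is recorded as !absolute_symbol !{i32 8, i32 9}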
516 | LLVMContext &Ctx = M->getContext(); |
517 | auto *IntTy = |
518 | M->getDataLayout().getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS); |
519 | auto *MinC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address)); |
520 | auto *MaxC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address + 1)); |
521 | GV->setMetadata(KindID: LLVMContext::MD_absolute_symbol, |
522 | Node: MDNode::get(Context&: Ctx, MDs: {MinC, MaxC})); |
523 | } |
524 | |
525 | DenseMap<Function *, Value *> tableKernelIndexCache; |
526 | Value *getTableLookupKernelIndex(Module &M, Function *F) { |
527 | // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which |
528 | // lowers to a read from a live in register. Emit it once in the entry |
529 | // block to spare deduplicating it later. |
530 | auto [It, Inserted] = tableKernelIndexCache.try_emplace(Key: F); |
531 | if (Inserted) { |
532 | Function *Decl = |
533 | Intrinsic::getDeclaration(M: &M, id: Intrinsic::amdgcn_lds_kernel_id, Tys: {}); |
534 | |
535 | auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); |
536 | IRBuilder<> Builder(&*InsertAt); |
537 | |
538 | It->second = Builder.CreateCall(Callee: Decl, Args: {}); |
539 | } |
540 | |
541 | return It->second; |
542 | } |
543 | |
544 | static std::vector<Function *> assignLDSKernelIDToEachKernel( |
545 | Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS, |
546 | DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) { |
547 | // Associate kernels in the set with an arbitrary but reproducible order and |
548 | // annotate them with that order in metadata. This metadata is recognised by |
549 | // the backend and lowered to a SGPR which can be read from using |
550 | // amdgcn_lds_kernel_id. |
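// For example, the kernel assigned id 2 ends up annotated with (illustrative):
//   !llvm.amdgcn.lds.kernel.id !{i32 2}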
551 | |
552 | std::vector<Function *> OrderedKernels; |
553 | if (!KernelsThatAllocateTableLDS.empty() || |
554 | !KernelsThatIndirectlyAllocateDynamicLDS.empty()) { |
555 | |
556 | for (Function &Func : M->functions()) { |
557 | if (Func.isDeclaration()) |
558 | continue; |
559 | if (!isKernelLDS(F: &Func)) |
560 | continue; |
561 | |
562 | if (KernelsThatAllocateTableLDS.contains(V: &Func) || |
563 | KernelsThatIndirectlyAllocateDynamicLDS.contains(V: &Func)) { |
564 | assert(Func.hasName()); // else fatal error earlier |
565 | OrderedKernels.push_back(x: &Func); |
566 | } |
567 | } |
568 | |
569 | // Put them in an arbitrary but reproducible order |
570 | OrderedKernels = sortByName(V: std::move(OrderedKernels)); |
571 | |
572 | // Annotate the kernels with their order in this vector |
573 | LLVMContext &Ctx = M->getContext(); |
574 | IRBuilder<> Builder(Ctx); |
575 | |
576 | if (OrderedKernels.size() > UINT32_MAX) { |
577 | // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU |
578 | report_fatal_error(reason: "Unimplemented LDS lowering for > 2**32 kernels" ); |
579 | } |
580 | |
581 | for (size_t i = 0; i < OrderedKernels.size(); i++) { |
582 | Metadata *AttrMDArgs[1] = { |
583 | ConstantAsMetadata::get(C: Builder.getInt32(C: i)), |
584 | }; |
585 | OrderedKernels[i]->setMetadata(Kind: "llvm.amdgcn.lds.kernel.id" , |
586 | Node: MDNode::get(Context&: Ctx, MDs: AttrMDArgs)); |
587 | } |
588 | } |
589 | return OrderedKernels; |
590 | } |
591 | |
592 | static void partitionVariablesIntoIndirectStrategies( |
593 | Module &M, LDSUsesInfoTy const &LDSUsesInfo, |
594 | VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly, |
595 | DenseSet<GlobalVariable *> &ModuleScopeVariables, |
596 | DenseSet<GlobalVariable *> &TableLookupVariables, |
597 | DenseSet<GlobalVariable *> &KernelAccessVariables, |
598 | DenseSet<GlobalVariable *> &DynamicVariables) { |
599 | |
600 | GlobalVariable *HybridModuleRoot = |
601 | LoweringKindLoc != LoweringKind::hybrid |
602 | ? nullptr |
603 | : chooseBestVariableForModuleStrategy( |
604 | DL: M.getDataLayout(), LDSVars&: LDSToKernelsThatNeedToAccessItIndirectly); |
605 | |
606 | DenseSet<Function *> const EmptySet; |
607 | DenseSet<Function *> const &HybridModuleRootKernels = |
608 | HybridModuleRoot |
609 | ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot] |
610 | : EmptySet; |
611 | |
612 | for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { |
613 | // Each iteration of this loop assigns exactly one global variable to |
614 | // exactly one of the implementation strategies. |
615 | |
616 | GlobalVariable *GV = K.first; |
617 | assert(AMDGPU::isLDSVariableToLower(*GV)); |
618 | assert(K.second.size() != 0); |
619 | |
620 | if (AMDGPU::isDynamicLDS(GV: *GV)) { |
621 | DynamicVariables.insert(V: GV); |
622 | continue; |
623 | } |
624 | |
625 | switch (LoweringKindLoc) { |
626 | case LoweringKind::module: |
627 | ModuleScopeVariables.insert(V: GV); |
628 | break; |
629 | |
630 | case LoweringKind::table: |
631 | TableLookupVariables.insert(V: GV); |
632 | break; |
633 | |
634 | case LoweringKind::kernel: |
635 | if (K.second.size() == 1) { |
636 | KernelAccessVariables.insert(V: GV); |
637 | } else { |
638 | report_fatal_error( |
639 | reason: "cannot lower LDS '" + GV->getName() + |
640 | "' to kernel access as it is reachable from multiple kernels" ); |
641 | } |
642 | break; |
643 | |
644 | case LoweringKind::hybrid: { |
645 | if (GV == HybridModuleRoot) { |
646 | assert(K.second.size() != 1); |
647 | ModuleScopeVariables.insert(V: GV); |
648 | } else if (K.second.size() == 1) { |
649 | KernelAccessVariables.insert(V: GV); |
650 | } else if (set_is_subset(S1: K.second, S2: HybridModuleRootKernels)) { |
651 | ModuleScopeVariables.insert(V: GV); |
652 | } else { |
653 | TableLookupVariables.insert(V: GV); |
654 | } |
655 | break; |
656 | } |
657 | } |
658 | } |
659 | |
660 | // All LDS variables accessed indirectly have now been partitioned into |
661 | // the distinct lowering strategies. |
662 | assert(ModuleScopeVariables.size() + TableLookupVariables.size() + |
663 | KernelAccessVariables.size() + DynamicVariables.size() == |
664 | LDSToKernelsThatNeedToAccessItIndirectly.size()); |
665 | } |
666 | |
667 | static GlobalVariable *lowerModuleScopeStructVariables( |
668 | Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables, |
669 | DenseSet<Function *> const &KernelsThatAllocateModuleLDS) { |
670 | // Create a struct to hold the ModuleScopeVariables |
671 | // Replace all uses of those variables from non-kernel functions with the |
// new struct instance. Replace only the uses from kernel functions that will
673 | // allocate this instance. That is a space optimisation - kernels that use a |
674 | // subset of the module scope struct and do not need to allocate it for |
675 | // indirect calls will only allocate the subset they use (they do so as part |
676 | // of the per-kernel lowering). |
677 | if (ModuleScopeVariables.empty()) { |
678 | return nullptr; |
679 | } |
680 | |
681 | LLVMContext &Ctx = M.getContext(); |
682 | |
683 | LDSVariableReplacement ModuleScopeReplacement = |
684 | createLDSVariableReplacement(M, VarName: "llvm.amdgcn.module.lds" , |
685 | LDSVarsToTransform: ModuleScopeVariables); |
686 | |
687 | appendToCompilerUsed(M, Values: {static_cast<GlobalValue *>( |
688 | ConstantExpr::getPointerBitCastOrAddrSpaceCast( |
689 | C: cast<Constant>(Val: ModuleScopeReplacement.SGV), |
690 | Ty: PointerType::getUnqual(C&: Ctx)))}); |
691 | |
692 | // module.lds will be allocated at zero in any kernel that allocates it |
693 | recordLDSAbsoluteAddress(M: &M, GV: ModuleScopeReplacement.SGV, Address: 0); |
694 | |
695 | // historic |
696 | removeLocalVarsFromUsedLists(M, LocalVars: ModuleScopeVariables); |
697 | |
698 | // Replace all uses of module scope variable from non-kernel functions |
699 | replaceLDSVariablesWithStruct( |
700 | M, LDSVarsToTransformArg: ModuleScopeVariables, Replacement: ModuleScopeReplacement, Predicate: [&](Use &U) { |
701 | Instruction *I = dyn_cast<Instruction>(Val: U.getUser()); |
702 | if (!I) { |
703 | return false; |
704 | } |
705 | Function *F = I->getFunction(); |
706 | return !isKernelLDS(F); |
707 | }); |
708 | |
709 | // Replace uses of module scope variable from kernel functions that |
710 | // allocate the module scope variable, otherwise leave them unchanged |
711 | // Record on each kernel whether the module scope global is used by it |
712 | |
713 | for (Function &Func : M.functions()) { |
714 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
715 | continue; |
716 | |
717 | if (KernelsThatAllocateModuleLDS.contains(V: &Func)) { |
718 | replaceLDSVariablesWithStruct( |
719 | M, LDSVarsToTransformArg: ModuleScopeVariables, Replacement: ModuleScopeReplacement, Predicate: [&](Use &U) { |
720 | Instruction *I = dyn_cast<Instruction>(Val: U.getUser()); |
721 | if (!I) { |
722 | return false; |
723 | } |
724 | Function *F = I->getFunction(); |
725 | return F == &Func; |
726 | }); |
727 | |
728 | markUsedByKernel(Func: &Func, SGV: ModuleScopeReplacement.SGV); |
729 | } |
730 | } |
731 | |
732 | return ModuleScopeReplacement.SGV; |
733 | } |
734 | |
735 | static DenseMap<Function *, LDSVariableReplacement> |
736 | lowerKernelScopeStructVariables( |
737 | Module &M, LDSUsesInfoTy &LDSUsesInfo, |
738 | DenseSet<GlobalVariable *> const &ModuleScopeVariables, |
739 | DenseSet<Function *> const &KernelsThatAllocateModuleLDS, |
740 | GlobalVariable *MaybeModuleScopeStruct) { |
741 | |
742 | // Create a struct for each kernel for the non-module-scope variables. |
743 | |
744 | DenseMap<Function *, LDSVariableReplacement> KernelToReplacement; |
745 | for (Function &Func : M.functions()) { |
746 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
747 | continue; |
748 | |
749 | DenseSet<GlobalVariable *> KernelUsedVariables; |
750 | // Allocating variables that are used directly in this struct to get |
751 | // alignment aware allocation and predictable frame size. |
752 | for (auto &v : LDSUsesInfo.direct_access[&Func]) { |
753 | if (!AMDGPU::isDynamicLDS(GV: *v)) { |
754 | KernelUsedVariables.insert(V: v); |
755 | } |
756 | } |
757 | |
758 | // Allocating variables that are accessed indirectly so that a lookup of |
759 | // this struct instance can find them from nested functions. |
760 | for (auto &v : LDSUsesInfo.indirect_access[&Func]) { |
761 | if (!AMDGPU::isDynamicLDS(GV: *v)) { |
762 | KernelUsedVariables.insert(V: v); |
763 | } |
764 | } |
765 | |
766 | // Variables allocated in module lds must all resolve to that struct, |
767 | // not to the per-kernel instance. |
768 | if (KernelsThatAllocateModuleLDS.contains(V: &Func)) { |
769 | for (GlobalVariable *v : ModuleScopeVariables) { |
770 | KernelUsedVariables.erase(V: v); |
771 | } |
772 | } |
773 | |
774 | if (KernelUsedVariables.empty()) { |
775 | // Either used no LDS, or the LDS it used was all in the module struct |
776 | // or dynamically sized |
777 | continue; |
778 | } |
779 | |
780 | // The association between kernel function and LDS struct is done by |
781 | // symbol name, which only works if the function in question has a |
// name. This is not expected to be a problem in practice as kernels
783 | // are called by name making anonymous ones (which are named by the |
784 | // backend) difficult to use. This does mean that llvm test cases need |
785 | // to name the kernels. |
786 | if (!Func.hasName()) { |
787 | report_fatal_error(reason: "Anonymous kernels cannot use LDS variables" ); |
788 | } |
789 | |
790 | std::string VarName = |
791 | (Twine("llvm.amdgcn.kernel." ) + Func.getName() + ".lds" ).str(); |
792 | |
793 | auto Replacement = |
794 | createLDSVariableReplacement(M, VarName, LDSVarsToTransform: KernelUsedVariables); |
795 | |
796 | // If any indirect uses, create a direct use to ensure allocation |
797 | // TODO: Simpler to unconditionally mark used but that regresses |
798 | // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll |
799 | auto Accesses = LDSUsesInfo.indirect_access.find(Val: &Func); |
800 | if ((Accesses != LDSUsesInfo.indirect_access.end()) && |
801 | !Accesses->second.empty()) |
802 | markUsedByKernel(Func: &Func, SGV: Replacement.SGV); |
803 | |
804 | // remove preserves existing codegen |
805 | removeLocalVarsFromUsedLists(M, LocalVars: KernelUsedVariables); |
806 | KernelToReplacement[&Func] = Replacement; |
807 | |
808 | // Rewrite uses within kernel to the new struct |
809 | replaceLDSVariablesWithStruct( |
810 | M, LDSVarsToTransformArg: KernelUsedVariables, Replacement, Predicate: [&Func](Use &U) { |
811 | Instruction *I = dyn_cast<Instruction>(Val: U.getUser()); |
812 | return I && I->getFunction() == &Func; |
813 | }); |
814 | } |
815 | return KernelToReplacement; |
816 | } |
817 | |
818 | static GlobalVariable * |
819 | buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo, |
820 | Function *func) { |
821 | // Create a dynamic lds variable with a name associated with the passed |
822 | // function that has the maximum alignment of any dynamic lds variable |
823 | // reachable from this kernel. Dynamic LDS is allocated after the static LDS |
824 | // allocation, possibly after alignment padding. The representative variable |
825 | // created here has the maximum alignment of any other dynamic variable |
826 | // reachable by that kernel. All dynamic LDS variables are allocated at the |
827 | // same address in each kernel in order to provide the documented aliasing |
828 | // semantics. Setting the alignment here allows this IR pass to accurately |
829 | // predict the exact constant at which it will be allocated. |
830 | |
831 | assert(isKernelLDS(func)); |
832 | |
833 | LLVMContext &Ctx = M.getContext(); |
834 | const DataLayout &DL = M.getDataLayout(); |
835 | Align MaxDynamicAlignment(1); |
836 | |
837 | auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) { |
838 | if (AMDGPU::isDynamicLDS(GV: *GV)) { |
839 | MaxDynamicAlignment = |
840 | std::max(a: MaxDynamicAlignment, b: AMDGPU::getAlign(DL, GV)); |
841 | } |
842 | }; |
843 | |
844 | for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) { |
845 | UpdateMaxAlignment(GV); |
846 | } |
847 | |
848 | for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) { |
849 | UpdateMaxAlignment(GV); |
850 | } |
851 | |
852 | assert(func->hasName()); // Checked by caller |
853 | auto emptyCharArray = ArrayType::get(ElementType: Type::getInt8Ty(C&: Ctx), NumElements: 0); |
GlobalVariable *N = new GlobalVariable(
    M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
    Twine("llvm.amdgcn." + func->getName() + ".dynlds"), nullptr,
    GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
858 | N->setAlignment(MaxDynamicAlignment); |
859 | |
860 | assert(AMDGPU::isDynamicLDS(*N)); |
861 | return N; |
862 | } |
863 | |
864 | DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables( |
865 | Module &M, LDSUsesInfoTy &LDSUsesInfo, |
866 | DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS, |
867 | DenseSet<GlobalVariable *> const &DynamicVariables, |
868 | std::vector<Function *> const &OrderedKernels) { |
869 | DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS; |
870 | if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) { |
871 | LLVMContext &Ctx = M.getContext(); |
872 | IRBuilder<> Builder(Ctx); |
873 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
874 | |
875 | std::vector<Constant *> newDynamicLDS; |
876 | |
877 | // Table is built in the same order as OrderedKernels |
878 | for (auto &func : OrderedKernels) { |
879 | |
880 | if (KernelsThatIndirectlyAllocateDynamicLDS.contains(V: func)) { |
881 | assert(isKernelLDS(func)); |
882 | if (!func->hasName()) { |
883 | report_fatal_error(reason: "Anonymous kernels cannot use LDS variables" ); |
884 | } |
885 | |
886 | GlobalVariable *N = |
887 | buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func); |
888 | |
889 | KernelToCreatedDynamicLDS[func] = N; |
890 | |
891 | markUsedByKernel(Func: func, SGV: N); |
892 | |
893 | auto emptyCharArray = ArrayType::get(ElementType: Type::getInt8Ty(C&: Ctx), NumElements: 0); |
894 | auto GEP = ConstantExpr::getGetElementPtr( |
895 | Ty: emptyCharArray, C: N, Idx: ConstantInt::get(Ty: I32, V: 0), NW: true); |
896 | newDynamicLDS.push_back(x: ConstantExpr::getPtrToInt(C: GEP, Ty: I32)); |
897 | } else { |
898 | newDynamicLDS.push_back(x: PoisonValue::get(T: I32)); |
899 | } |
900 | } |
901 | assert(OrderedKernels.size() == newDynamicLDS.size()); |
902 | |
903 | ArrayType *t = ArrayType::get(ElementType: I32, NumElements: newDynamicLDS.size()); |
904 | Constant *init = ConstantArray::get(T: t, V: newDynamicLDS); |
905 | GlobalVariable *table = new GlobalVariable( |
906 | M, t, true, GlobalValue::InternalLinkage, init, |
907 | "llvm.amdgcn.dynlds.offset.table" , nullptr, |
908 | GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS); |
909 | |
910 | for (GlobalVariable *GV : DynamicVariables) { |
911 | for (Use &U : make_early_inc_range(Range: GV->uses())) { |
912 | auto *I = dyn_cast<Instruction>(Val: U.getUser()); |
913 | if (!I) |
914 | continue; |
915 | if (isKernelLDS(F: I->getFunction())) |
916 | continue; |
917 | |
918 | replaceUseWithTableLookup(M, Builder, LookupTable: table, GV, U, OptionalIndex: nullptr); |
919 | } |
920 | } |
921 | } |
922 | return KernelToCreatedDynamicLDS; |
923 | } |
924 | |
925 | bool runOnModule(Module &M) { |
926 | CallGraph CG = CallGraph(M); |
927 | bool Changed = superAlignLDSGlobals(M); |
928 | |
929 | Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); |
930 | |
931 | Changed = true; // todo: narrow this down |
932 | |
933 | // For each kernel, what variables does it access directly or through |
934 | // callees |
935 | LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); |
936 | |
937 | // For each variable accessed through callees, which kernels access it |
938 | VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; |
939 | for (auto &K : LDSUsesInfo.indirect_access) { |
940 | Function *F = K.first; |
941 | assert(isKernelLDS(F)); |
942 | for (GlobalVariable *GV : K.second) { |
943 | LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(V: F); |
944 | } |
945 | } |
946 | |
947 | // Partition variables accessed indirectly into the different strategies |
948 | DenseSet<GlobalVariable *> ModuleScopeVariables; |
949 | DenseSet<GlobalVariable *> TableLookupVariables; |
950 | DenseSet<GlobalVariable *> KernelAccessVariables; |
951 | DenseSet<GlobalVariable *> DynamicVariables; |
952 | partitionVariablesIntoIndirectStrategies( |
953 | M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly, |
954 | ModuleScopeVariables, TableLookupVariables, KernelAccessVariables, |
955 | DynamicVariables); |
956 | |
957 | // If the kernel accesses a variable that is going to be stored in the |
958 | // module instance through a call then that kernel needs to allocate the |
959 | // module instance |
960 | const DenseSet<Function *> KernelsThatAllocateModuleLDS = |
961 | kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, |
962 | VariableSet: ModuleScopeVariables); |
963 | const DenseSet<Function *> KernelsThatAllocateTableLDS = |
964 | kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, |
965 | VariableSet: TableLookupVariables); |
966 | |
967 | const DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS = |
968 | kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, |
969 | VariableSet: DynamicVariables); |
970 | |
971 | GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables( |
972 | M, ModuleScopeVariables, KernelsThatAllocateModuleLDS); |
973 | |
974 | DenseMap<Function *, LDSVariableReplacement> KernelToReplacement = |
975 | lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables, |
976 | KernelsThatAllocateModuleLDS, |
977 | MaybeModuleScopeStruct); |
978 | |
979 | // Lower zero cost accesses to the kernel instances just created |
980 | for (auto &GV : KernelAccessVariables) { |
981 | auto &funcs = LDSToKernelsThatNeedToAccessItIndirectly[GV]; |
982 | assert(funcs.size() == 1); // Only one kernel can access it |
983 | LDSVariableReplacement Replacement = |
984 | KernelToReplacement[*(funcs.begin())]; |
985 | |
986 | DenseSet<GlobalVariable *> Vec; |
987 | Vec.insert(V: GV); |
988 | |
989 | replaceLDSVariablesWithStruct(M, LDSVarsToTransformArg: Vec, Replacement, Predicate: [](Use &U) { |
990 | return isa<Instruction>(Val: U.getUser()); |
991 | }); |
992 | } |
993 | |
994 | // The ith element of this vector is kernel id i |
995 | std::vector<Function *> OrderedKernels = |
996 | assignLDSKernelIDToEachKernel(M: &M, KernelsThatAllocateTableLDS, |
997 | KernelsThatIndirectlyAllocateDynamicLDS); |
998 | |
999 | if (!KernelsThatAllocateTableLDS.empty()) { |
1000 | LLVMContext &Ctx = M.getContext(); |
1001 | IRBuilder<> Builder(Ctx); |
1002 | |
1003 | // The order must be consistent between lookup table and accesses to |
1004 | // lookup table |
1005 | auto TableLookupVariablesOrdered = |
1006 | sortByName(V: std::vector<GlobalVariable *>(TableLookupVariables.begin(), |
1007 | TableLookupVariables.end())); |
1008 | |
1009 | GlobalVariable *LookupTable = buildLookupTable( |
1010 | M, Variables: TableLookupVariablesOrdered, kernels: OrderedKernels, KernelToReplacement); |
1011 | replaceUsesInInstructionsWithTableLookup(M, ModuleScopeVariables: TableLookupVariablesOrdered, |
1012 | LookupTable); |
1013 | |
1014 | // Strip amdgpu-no-lds-kernel-id from all functions reachable from the |
1015 | // kernel. We may have inferred this wasn't used prior to the pass. |
1016 | // |
1017 | // TODO: We could filter out subgraphs that do not access LDS globals. |
1018 | for (Function *F : KernelsThatAllocateTableLDS) |
1019 | removeFnAttrFromReachable(CG, KernelRoot: F, FnAttrs: {"amdgpu-no-lds-kernel-id" }); |
1020 | } |
1021 | |
1022 | DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS = |
1023 | lowerDynamicLDSVariables(M, LDSUsesInfo, |
1024 | KernelsThatIndirectlyAllocateDynamicLDS, |
1025 | DynamicVariables, OrderedKernels); |
1026 | |
1027 | // All kernel frames have been allocated. Calculate and record the |
1028 | // addresses. |
1029 | { |
1030 | const DataLayout &DL = M.getDataLayout(); |
1031 | |
1032 | for (Function &Func : M.functions()) { |
1033 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
1034 | continue; |
1035 | |
1036 | // All three of these are optional. The first variable is allocated at |
1037 | // zero. They are allocated by AMDGPUMachineFunction as one block. |
1038 | // Layout: |
1039 | //{ |
1040 | // module.lds |
1041 | // alignment padding |
1042 | // kernel instance |
1043 | // alignment padding |
1044 | // dynamic lds variables |
1045 | //} |
1046 | |
1047 | const bool AllocateModuleScopeStruct = |
1048 | MaybeModuleScopeStruct && |
1049 | KernelsThatAllocateModuleLDS.contains(V: &Func); |
1050 | |
1051 | auto Replacement = KernelToReplacement.find(Val: &Func); |
1052 | const bool AllocateKernelScopeStruct = |
1053 | Replacement != KernelToReplacement.end(); |
1054 | |
1055 | const bool AllocateDynamicVariable = |
1056 | KernelToCreatedDynamicLDS.contains(Val: &Func); |
1057 | |
1058 | uint32_t Offset = 0; |
1059 | |
1060 | if (AllocateModuleScopeStruct) { |
1061 | // Allocated at zero, recorded once on construction, not once per |
1062 | // kernel |
1063 | Offset += DL.getTypeAllocSize(Ty: MaybeModuleScopeStruct->getValueType()); |
1064 | } |
1065 | |
1066 | if (AllocateKernelScopeStruct) { |
1067 | GlobalVariable *KernelStruct = Replacement->second.SGV; |
1068 | Offset = alignTo(Size: Offset, A: AMDGPU::getAlign(DL, GV: KernelStruct)); |
1069 | recordLDSAbsoluteAddress(M: &M, GV: KernelStruct, Address: Offset); |
1070 | Offset += DL.getTypeAllocSize(Ty: KernelStruct->getValueType()); |
1071 | } |
1072 | |
1073 | // If there is dynamic allocation, the alignment needed is included in |
1074 | // the static frame size. There may be no reference to the dynamic |
1075 | // variable in the kernel itself, so without including it here, that |
1076 | // alignment padding could be missed. |
1077 | if (AllocateDynamicVariable) { |
1078 | GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func]; |
1079 | Offset = alignTo(Size: Offset, A: AMDGPU::getAlign(DL, GV: DynamicVariable)); |
1080 | recordLDSAbsoluteAddress(M: &M, GV: DynamicVariable, Address: Offset); |
1081 | } |
1082 | |
1083 | if (Offset != 0) { |
1084 | (void)TM; // TODO: Account for target maximum LDS |
1085 | std::string Buffer; |
1086 | raw_string_ostream SS{Buffer}; |
1087 | SS << format(Fmt: "%u" , Vals: Offset); |
1088 | |
1089 | // Instead of explicitly marking kernels that access dynamic variables |
1090 | // using special case metadata, annotate with min-lds == max-lds, i.e. |
1091 | // that there is no more space available for allocating more static |
1092 | // LDS variables. That is the right condition to prevent allocating |
1093 | // more variables which would collide with the addresses assigned to |
1094 | // dynamic variables. |
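// e.g. "amdgpu-lds-size"="1024", or "amdgpu-lds-size"="1024,1024" when a
// dynamic variable is allocated (illustrative values).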
1095 | if (AllocateDynamicVariable) |
1096 | SS << format(Fmt: ",%u" , Vals: Offset); |
1097 | |
1098 | Func.addFnAttr(Kind: "amdgpu-lds-size" , Val: Buffer); |
1099 | } |
1100 | } |
1101 | } |
1102 | |
1103 | for (auto &GV : make_early_inc_range(Range: M.globals())) |
1104 | if (AMDGPU::isLDSVariableToLower(GV)) { |
1105 | // probably want to remove from used lists |
1106 | GV.removeDeadConstantUsers(); |
1107 | if (GV.use_empty()) |
1108 | GV.eraseFromParent(); |
1109 | } |
1110 | |
1111 | return Changed; |
1112 | } |
1113 | |
1114 | private: |
1115 | // Increase the alignment of LDS globals if necessary to maximise the chance |
1116 | // that we can use aligned LDS instructions to access them. |
1117 | static bool superAlignLDSGlobals(Module &M) { |
1118 | const DataLayout &DL = M.getDataLayout(); |
1119 | bool Changed = false; |
1120 | if (!SuperAlignLDSGlobals) { |
1121 | return Changed; |
1122 | } |
1123 | |
1124 | for (auto &GV : M.globals()) { |
1125 | if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { |
1126 | // Only changing alignment of LDS variables |
1127 | continue; |
1128 | } |
1129 | if (!GV.hasInitializer()) { |
1130 | // cuda/hip extern __shared__ variable, leave alignment alone |
1131 | continue; |
1132 | } |
1133 | |
1134 | Align Alignment = AMDGPU::getAlign(DL, GV: &GV); |
1135 | TypeSize GVSize = DL.getTypeAllocSize(Ty: GV.getValueType()); |
1136 | |
1137 | if (GVSize > 8) { |
1138 | // We might want to use a b96 or b128 load/store |
1139 | Alignment = std::max(a: Alignment, b: Align(16)); |
1140 | } else if (GVSize > 4) { |
1141 | // We might want to use a b64 load/store |
1142 | Alignment = std::max(a: Alignment, b: Align(8)); |
1143 | } else if (GVSize > 2) { |
1144 | // We might want to use a b32 load/store |
1145 | Alignment = std::max(a: Alignment, b: Align(4)); |
1146 | } else if (GVSize > 1) { |
1147 | // We might want to use a b16 load/store |
1148 | Alignment = std::max(a: Alignment, b: Align(2)); |
1149 | } |
1150 | |
1151 | if (Alignment != AMDGPU::getAlign(DL, GV: &GV)) { |
1152 | Changed = true; |
1153 | GV.setAlignment(Alignment); |
1154 | } |
1155 | } |
1156 | return Changed; |
1157 | } |
1158 | |
1159 | static LDSVariableReplacement createLDSVariableReplacement( |
1160 | Module &M, std::string VarName, |
1161 | DenseSet<GlobalVariable *> const &LDSVarsToTransform) { |
1162 | // Create a struct instance containing LDSVarsToTransform and map from those |
1163 | // variables to ConstantExprGEP |
1164 | // Variables may be introduced to meet alignment requirements. No aliasing |
1165 | // metadata is useful for these as they have no uses. Erased before return. |
1166 | |
1167 | LLVMContext &Ctx = M.getContext(); |
1168 | const DataLayout &DL = M.getDataLayout(); |
1169 | assert(!LDSVarsToTransform.empty()); |
1170 | |
1171 | SmallVector<OptimizedStructLayoutField, 8> LayoutFields; |
1172 | LayoutFields.reserve(N: LDSVarsToTransform.size()); |
1173 | { |
1174 | // The order of fields in this struct depends on the order of |
1175 | // variables in the argument which varies when changing how they |
1176 | // are identified, leading to spurious test breakage. |
1177 | auto Sorted = sortByName(V: std::vector<GlobalVariable *>( |
1178 | LDSVarsToTransform.begin(), LDSVarsToTransform.end())); |
1179 | |
1180 | for (GlobalVariable *GV : Sorted) { |
1181 | OptimizedStructLayoutField F(GV, |
1182 | DL.getTypeAllocSize(Ty: GV->getValueType()), |
1183 | AMDGPU::getAlign(DL, GV)); |
1184 | LayoutFields.emplace_back(Args&: F); |
1185 | } |
1186 | } |
1187 | |
1188 | performOptimizedStructLayout(Fields: LayoutFields); |
1189 | |
1190 | std::vector<GlobalVariable *> LocalVars; |
1191 | BitVector IsPaddingField; |
1192 | LocalVars.reserve(n: LDSVarsToTransform.size()); // will be at least this large |
1193 | IsPaddingField.reserve(N: LDSVarsToTransform.size()); |
1194 | { |
1195 | uint64_t CurrentOffset = 0; |
1196 | for (auto &F : LayoutFields) { |
1197 | GlobalVariable *FGV = |
1198 | static_cast<GlobalVariable *>(const_cast<void *>(F.Id)); |
1199 | Align DataAlign = F.Alignment; |
1200 | |
1201 | uint64_t DataAlignV = DataAlign.value(); |
1202 | if (uint64_t Rem = CurrentOffset % DataAlignV) { |
1203 | uint64_t Padding = DataAlignV - Rem; |
1204 | |
1205 | // Append an array of padding bytes to meet alignment requested |
1206 | // Note (o + (a - (o % a)) ) % a == 0 |
1207 | // (offset + Padding ) % align == 0 |
1208 | |
1209 | Type *ATy = ArrayType::get(ElementType: Type::getInt8Ty(C&: Ctx), NumElements: Padding); |
1210 | LocalVars.push_back(x: new GlobalVariable( |
1211 | M, ATy, false, GlobalValue::InternalLinkage, |
1212 | PoisonValue::get(T: ATy), "" , nullptr, GlobalValue::NotThreadLocal, |
1213 | AMDGPUAS::LOCAL_ADDRESS, false)); |
1214 | IsPaddingField.push_back(Val: true); |
1215 | CurrentOffset += Padding; |
1216 | } |
1217 | |
1218 | LocalVars.push_back(x: FGV); |
1219 | IsPaddingField.push_back(Val: false); |
1220 | CurrentOffset += F.Size; |
1221 | } |
1222 | } |
1223 | |
1224 | std::vector<Type *> LocalVarTypes; |
1225 | LocalVarTypes.reserve(n: LocalVars.size()); |
1226 | std::transform( |
1227 | first: LocalVars.cbegin(), last: LocalVars.cend(), result: std::back_inserter(x&: LocalVarTypes), |
1228 | unary_op: [](const GlobalVariable *V) -> Type * { return V->getValueType(); }); |
1229 | |
1230 | StructType *LDSTy = StructType::create(Context&: Ctx, Elements: LocalVarTypes, Name: VarName + ".t" ); |
1231 | |
1232 | Align StructAlign = AMDGPU::getAlign(DL, GV: LocalVars[0]); |
1233 | |
1234 | GlobalVariable *SGV = new GlobalVariable( |
1235 | M, LDSTy, false, GlobalValue::InternalLinkage, PoisonValue::get(T: LDSTy), |
1236 | VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, |
1237 | false); |
1238 | SGV->setAlignment(StructAlign); |
1239 | |
1240 | DenseMap<GlobalVariable *, Constant *> Map; |
1241 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
1242 | for (size_t I = 0; I < LocalVars.size(); I++) { |
1243 | GlobalVariable *GV = LocalVars[I]; |
1244 | Constant *GEPIdx[] = {ConstantInt::get(Ty: I32, V: 0), ConstantInt::get(Ty: I32, V: I)}; |
1245 | Constant *GEP = ConstantExpr::getGetElementPtr(Ty: LDSTy, C: SGV, IdxList: GEPIdx, NW: true); |
1246 | if (IsPaddingField[I]) { |
1247 | assert(GV->use_empty()); |
1248 | GV->eraseFromParent(); |
1249 | } else { |
1250 | Map[GV] = GEP; |
1251 | } |
1252 | } |
1253 | assert(Map.size() == LDSVarsToTransform.size()); |
1254 | return {.SGV: SGV, .LDSVarsToConstantGEP: std::move(Map)}; |
1255 | } |
1256 | |
1257 | template <typename PredicateTy> |
1258 | static void replaceLDSVariablesWithStruct( |
1259 | Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg, |
1260 | const LDSVariableReplacement &Replacement, PredicateTy Predicate) { |
1261 | LLVMContext &Ctx = M.getContext(); |
1262 | const DataLayout &DL = M.getDataLayout(); |
1263 | |
1264 | // A hack... we need to insert the aliasing info in a predictable order for |
1265 | // lit tests. Would like to have them in a stable order already, ideally the |
1266 | // same order they get allocated, which might mean an ordered set container |
1267 | auto LDSVarsToTransform = sortByName(V: std::vector<GlobalVariable *>( |
1268 | LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end())); |
1269 | |
1270 | // Create alias.scope and their lists. Each field in the new structure |
1271 | // does not alias with all other fields. |
1272 | SmallVector<MDNode *> AliasScopes; |
1273 | SmallVector<Metadata *> NoAliasList; |
1274 | const size_t NumberVars = LDSVarsToTransform.size(); |
1275 | if (NumberVars > 1) { |
1276 | MDBuilder MDB(Ctx); |
1277 | AliasScopes.reserve(N: NumberVars); |
1278 | MDNode *Domain = MDB.createAnonymousAliasScopeDomain(); |
1279 | for (size_t I = 0; I < NumberVars; I++) { |
1280 | MDNode *Scope = MDB.createAnonymousAliasScope(Domain); |
1281 | AliasScopes.push_back(Elt: Scope); |
1282 | } |
1283 | NoAliasList.append(in_start: &AliasScopes[1], in_end: AliasScopes.end()); |
1284 | } |
1285 | |
1286 | // Replace uses of ith variable with a constantexpr to the corresponding |
1287 | // field of the instance that will be allocated by AMDGPUMachineFunction |
1288 | for (size_t I = 0; I < NumberVars; I++) { |
1289 | GlobalVariable *GV = LDSVarsToTransform[I]; |
1290 | Constant *GEP = Replacement.LDSVarsToConstantGEP.at(Val: GV); |
1291 | |
1292 | GV->replaceUsesWithIf(New: GEP, ShouldReplace: Predicate); |
1293 | |
1294 | APInt APOff(DL.getIndexTypeSizeInBits(Ty: GEP->getType()), 0); |
1295 | GEP->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: APOff); |
1296 | uint64_t Offset = APOff.getZExtValue(); |
1297 | |
1298 | Align A = |
1299 | commonAlignment(A: Replacement.SGV->getAlign().valueOrOne(), Offset); |
1300 | |
1301 | if (I) |
1302 | NoAliasList[I - 1] = AliasScopes[I - 1]; |
1303 | MDNode *NoAlias = |
1304 | NoAliasList.empty() ? nullptr : MDNode::get(Context&: Ctx, MDs: NoAliasList); |
1305 | MDNode *AliasScope = |
1306 | AliasScopes.empty() ? nullptr : MDNode::get(Context&: Ctx, MDs: {AliasScopes[I]}); |
1307 | |
1308 | refineUsesAlignmentAndAA(Ptr: GEP, A, DL, AliasScope, NoAlias); |
1309 | } |
1310 | } |
1311 | |
1312 | static void refineUsesAlignmentAndAA(Value *Ptr, Align A, |
1313 | const DataLayout &DL, MDNode *AliasScope, |
1314 | MDNode *NoAlias, unsigned MaxDepth = 5) { |
1315 | if (!MaxDepth || (A == 1 && !AliasScope)) |
1316 | return; |
1317 | |
1318 | for (User *U : Ptr->users()) { |
1319 | if (auto *I = dyn_cast<Instruction>(Val: U)) { |
1320 | if (AliasScope && I->mayReadOrWriteMemory()) { |
1321 | MDNode *AS = I->getMetadata(KindID: LLVMContext::MD_alias_scope); |
1322 | AS = (AS ? MDNode::getMostGenericAliasScope(A: AS, B: AliasScope) |
1323 | : AliasScope); |
1324 | I->setMetadata(KindID: LLVMContext::MD_alias_scope, Node: AS); |
1325 | |
1326 | MDNode *NA = I->getMetadata(KindID: LLVMContext::MD_noalias); |
1327 | NA = (NA ? MDNode::intersect(A: NA, B: NoAlias) : NoAlias); |
1328 | I->setMetadata(KindID: LLVMContext::MD_noalias, Node: NA); |
1329 | } |
1330 | } |
1331 | |
1332 | if (auto *LI = dyn_cast<LoadInst>(Val: U)) { |
1333 | LI->setAlignment(std::max(a: A, b: LI->getAlign())); |
1334 | continue; |
1335 | } |
1336 | if (auto *SI = dyn_cast<StoreInst>(Val: U)) { |
1337 | if (SI->getPointerOperand() == Ptr) |
1338 | SI->setAlignment(std::max(a: A, b: SI->getAlign())); |
1339 | continue; |
1340 | } |
1341 | if (auto *AI = dyn_cast<AtomicRMWInst>(Val: U)) { |
1342 | // None of atomicrmw operations can work on pointers, but let's |
1343 | // check it anyway in case it will or we will process ConstantExpr. |
1344 | if (AI->getPointerOperand() == Ptr) |
1345 | AI->setAlignment(std::max(a: A, b: AI->getAlign())); |
1346 | continue; |
1347 | } |
1348 | if (auto *AI = dyn_cast<AtomicCmpXchgInst>(Val: U)) { |
1349 | if (AI->getPointerOperand() == Ptr) |
1350 | AI->setAlignment(std::max(a: A, b: AI->getAlign())); |
1351 | continue; |
1352 | } |
1353 | if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: U)) { |
1354 | unsigned BitWidth = DL.getIndexTypeSizeInBits(Ty: GEP->getType()); |
1355 | APInt Off(BitWidth, 0); |
1356 | if (GEP->getPointerOperand() == Ptr) { |
1357 | Align GA; |
1358 | if (GEP->accumulateConstantOffset(DL, Offset&: Off)) |
1359 | GA = commonAlignment(A, Offset: Off.getLimitedValue()); |
1360 | refineUsesAlignmentAndAA(Ptr: GEP, A: GA, DL, AliasScope, NoAlias, |
1361 | MaxDepth: MaxDepth - 1); |
1362 | } |
1363 | continue; |
1364 | } |
1365 | if (auto *I = dyn_cast<Instruction>(Val: U)) { |
1366 | if (I->getOpcode() == Instruction::BitCast || |
1367 | I->getOpcode() == Instruction::AddrSpaceCast) |
1368 | refineUsesAlignmentAndAA(Ptr: I, A, DL, AliasScope, NoAlias, MaxDepth: MaxDepth - 1); |
1369 | } |
1370 | } |
1371 | } |
1372 | }; |
1373 | |
1374 | class AMDGPULowerModuleLDSLegacy : public ModulePass { |
1375 | public: |
1376 | const AMDGPUTargetMachine *TM; |
1377 | static char ID; |
1378 | |
1379 | AMDGPULowerModuleLDSLegacy(const AMDGPUTargetMachine *TM_ = nullptr) |
1380 | : ModulePass(ID), TM(TM_) { |
1381 | initializeAMDGPULowerModuleLDSLegacyPass(*PassRegistry::getPassRegistry()); |
1382 | } |
1383 | |
1384 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
1385 | if (!TM) |
1386 | AU.addRequired<TargetPassConfig>(); |
1387 | } |
1388 | |
1389 | bool runOnModule(Module &M) override { |
1390 | if (!TM) { |
1391 | auto &TPC = getAnalysis<TargetPassConfig>(); |
1392 | TM = &TPC.getTM<AMDGPUTargetMachine>(); |
1393 | } |
1394 | |
1395 | return AMDGPULowerModuleLDS(*TM).runOnModule(M); |
1396 | } |
1397 | }; |
1398 | |
1399 | } // namespace |
1400 | char AMDGPULowerModuleLDSLegacy::ID = 0; |
1401 | |
1402 | char &llvm::AMDGPULowerModuleLDSLegacyPassID = AMDGPULowerModuleLDSLegacy::ID; |
1403 | |
1404 | INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE, |
1405 | "Lower uses of LDS variables from non-kernel functions" , |
1406 | false, false) |
1407 | INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) |
1408 | INITIALIZE_PASS_END(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE, |
1409 | "Lower uses of LDS variables from non-kernel functions" , |
1410 | false, false) |
1411 | |
1412 | ModulePass * |
1413 | llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) { |
1414 | return new AMDGPULowerModuleLDSLegacy(TM); |
1415 | } |
1416 | |
1417 | PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M, |
1418 | ModuleAnalysisManager &) { |
1419 | return AMDGPULowerModuleLDS(TM).runOnModule(M) ? PreservedAnalyses::none() |
1420 | : PreservedAnalyses::all(); |
1421 | } |
1422 | |