1 | //===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass eliminates local data store, LDS, uses from non-kernel functions. |
10 | // LDS is contiguous memory allocated per kernel execution. |
11 | // |
12 | // Background. |
13 | // |
14 | // The programming model is global variables, or equivalently function local |
15 | // static variables, accessible from kernels or other functions. For uses from |
16 | // kernels this is straightforward - assign an integer to the kernel for the |
17 | // memory required by all the variables combined, allocate them within that. |
18 | // For uses from functions there are performance tradeoffs to choose between. |
19 | // |
20 | // This model means the GPU runtime can specify the amount of memory allocated. |
21 | // If this is more than the kernel assumed, the excess can be made available |
22 | // using a language specific feature, which IR represents as a variable with |
23 | // no initializer. This feature is referred to here as "Dynamic LDS" and is |
24 | // lowered slightly differently to the normal case. |
25 | // |
26 | // Consequences of this GPU feature: |
27 | // - memory is limited and exceeding it halts compilation |
28 | // - a global accessed by one kernel exists independent of other kernels |
29 | // - a global exists independent of simultaneous execution of the same kernel |
// - the address of the global may differ from kernel to kernel, as the
//   per-kernel instances do not alias, which permits allocating only the
//   variables each kernel actually uses
32 | // - if the address is allowed to differ, functions need help to find it |
33 | // |
34 | // Uses from kernels are implemented here by grouping them in a per-kernel |
35 | // struct instance. This duplicates the variables, accurately modelling their |
36 | // aliasing properties relative to a single global representation. It also |
37 | // permits control over alignment via padding. |
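//
// For example (illustrative IR, variable and kernel names invented), direct
// uses from a kernel k0 of
//   @a = internal addrspace(3) global i32 poison
//   @b = internal addrspace(3) global [4 x float] poison
// are rewritten to constant GEPs into fields of a per-kernel instance such as
//   %llvm.amdgcn.kernel.k0.lds.t = type { [4 x float], i32 }
//   @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global
//       %llvm.amdgcn.kernel.k0.lds.t poison, align 16
// where field order and padding come from the struct layout chosen below.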
38 | // |
39 | // Uses from functions are more complicated and the primary purpose of this |
// IR pass. Several different lowerings are chosen between to meet requirements
41 | // to avoid allocating any LDS where it is not necessary, as that impacts |
42 | // occupancy and may fail the compilation, while not imposing overhead on a |
43 | // feature whose primary advantage over global memory is performance. The basic |
44 | // design goal is to avoid one kernel imposing overhead on another. |
45 | // |
46 | // Implementation. |
47 | // |
48 | // LDS variables with constant annotation or non-undef initializer are passed |
49 | // through unchanged for simplification or error diagnostics in later passes. |
50 | // Non-undef initializers are not yet implemented for LDS. |
51 | // |
52 | // LDS variables that are always allocated at the same address can be found |
53 | // by lookup at that address. Otherwise runtime information/cost is required. |
54 | // |
55 | // The simplest strategy possible is to group all LDS variables in a single |
56 | // struct and allocate that struct in every kernel such that the original |
57 | // variables are always at the same address. LDS is however a limited resource |
58 | // so this strategy is unusable in practice. It is not implemented here. |
59 | // |
60 | // Strategy | Precise allocation | Zero runtime cost | General purpose | |
61 | // --------+--------------------+-------------------+-----------------+ |
62 | // Module | No | Yes | Yes | |
63 | // Table | Yes | No | Yes | |
64 | // Kernel | Yes | Yes | No | |
65 | // Hybrid | Yes | Partial | Yes | |
66 | // |
67 | // "Module" spends LDS memory to save cycles. "Table" spends cycles and global |
68 | // memory to save LDS. "Kernel" is as fast as kernel allocation but only works |
69 | // for variables that are known reachable from a single kernel. "Hybrid" picks |
70 | // between all three. When forced to choose between LDS and cycles we minimise |
71 | // LDS use. |
72 | |
73 | // The "module" lowering implemented here finds LDS variables which are used by |
74 | // non-kernel functions and creates a new struct with a field for each of those |
75 | // LDS variables. Variables that are only used from kernels are excluded. |
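//
// For example (illustrative, names invented), if @x and @y are both reachable
// from non-kernel functions they become fields of one module scope instance:
//   %llvm.amdgcn.module.lds.t = type { i32, float }
//   @llvm.amdgcn.module.lds = internal addrspace(3) global
//       %llvm.amdgcn.module.lds.t poison
// which is allocated at address zero in every kernel that might reach those
// functions, so function code can use fixed addresses for the fields.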
76 | // |
77 | // The "table" lowering implemented here has three components. |
78 | // First kernels are assigned a unique integer identifier which is available in |
79 | // functions it calls through the intrinsic amdgcn_lds_kernel_id. The integer |
80 | // is passed through a specific SGPR, thus works with indirect calls. |
81 | // Second, each kernel allocates LDS variables independent of other kernels and |
82 | // writes the addresses it chose for each variable into an array in consistent |
// order. If the kernel does not allocate a given variable, it writes poison to
// the corresponding array location. These arrays are written to a constant
85 | // table in the order matching the kernel unique integer identifier. |
86 | // Third, uses from non-kernel functions are replaced with a table lookup using |
87 | // the intrinsic function to find the address of the variable. |
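//
// A rough sketch of the resulting IR (illustrative; exact names, indices and
// types are as built later in this file):
//   @llvm.amdgcn.lds.offset.table = internal addrspace(4) constant
//       [NKernels x [NVars x i32]] [...]
//   ; in a non-kernel function accessing variable number <idx>:
//   %id   = call i32 @llvm.amdgcn.lds.kernel.id()
//   %slot = getelementptr inbounds [...], ptr addrspace(4)
//           @llvm.amdgcn.lds.offset.table, i32 0, i32 %id, i32 <idx>
//   %off  = load i32, ptr addrspace(4) %slot
//   %ptr  = inttoptr i32 %off to ptr addrspace(3)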
88 | // |
89 | // "Kernel" lowering is only applicable for variables that are unambiguously |
90 | // reachable from exactly one kernel. For those cases, accesses to the variable |
91 | // can be lowered to ConstantExpr address of a struct instance specific to that |
92 | // one kernel. This is zero cost in space and in compute. It will raise a fatal |
93 | // error on any variable that might be reachable from multiple kernels and is |
94 | // thus most easily used as part of the hybrid lowering strategy. |
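//
// For example (illustrative), a variable only reachable from kernel k0 is
// folded to a constant expression such as
//   getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t,
//       ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 <field>)
// so no table lookup and no additional LDS allocation is required.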
95 | // |
96 | // Hybrid lowering is a mixture of the above. It uses the zero cost kernel |
97 | // lowering where it can. It lowers the variable accessed by the greatest |
98 | // number of kernels using the module strategy as that is free for the first |
// variable. Any further variables that can be lowered with the module strategy
// without incurring LDS memory overhead are lowered that way. The remaining
// ones are lowered via table.
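//
// For example (illustrative), given variables reachable from these kernel sets
//   A: {k0, k1, k2}   B: {k0}   C: {k0, k1}   D: {k1, k3}
// hybrid lowers A (reachable from the most kernels) via module, B via kernel,
// C via module as well (its kernels are a subset of A's so no extra LDS is
// allocated), and D via table.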
102 | // |
103 | // Consequences |
104 | // - No heuristics or user controlled magic numbers, hybrid is the right choice |
105 | // - Kernels that don't use functions (or have had them all inlined) are not |
106 | // affected by any lowering for kernels that do. |
107 | // - Kernels that don't make indirect function calls are not affected by those |
108 | // that do. |
109 | // - Variables which are used by lots of kernels, e.g. those injected by a |
110 | // language runtime in most kernels, are expected to have no overhead |
111 | // - Implementations that instantiate templates per-kernel where those templates |
112 | // use LDS are expected to hit the "Kernel" lowering strategy |
113 | // - The runtime properties impose a cost in compiler implementation complexity |
114 | // |
115 | // Dynamic LDS implementation |
116 | // Dynamic LDS is lowered similarly to the "table" strategy above and uses the |
117 | // same intrinsic to identify which kernel is at the root of the dynamic call |
118 | // graph. This relies on the specified behaviour that all dynamic LDS variables |
119 | // alias one another, i.e. are at the same address, with respect to a given |
120 | // kernel. Therefore this pass creates new dynamic LDS variables for each kernel |
121 | // that allocates any dynamic LDS and builds a table of addresses out of those. |
122 | // The AMDGPUPromoteAlloca pass skips kernels that use dynamic LDS. |
123 | // The corresponding optimisation for "kernel" lowering where the table lookup |
124 | // is elided is not implemented. |
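//
// Sketch (illustrative names): each kernel K that reaches dynamic LDS gets a
// representative variable, and the per-kernel addresses form a second table:
//   @llvm.amdgcn.K.dynlds = external addrspace(3) global [0 x i8], align <max>
//   @llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant
//       [NKernels x i32] [...]
// Non-kernel uses of any dynamic LDS variable load the address for the current
// kernel from that table via llvm.amdgcn.lds.kernel.id.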
125 | // |
126 | // |
127 | // Implementation notes / limitations |
// A single LDS global variable represents an instance per kernel that can reach
// said variable. This pass essentially specialises those variables per kernel.
130 | // Handling ConstantExpr during the pass complicated this significantly so now |
131 | // all ConstantExpr uses of LDS variables are expanded to instructions. This |
132 | // may need amending when implementing non-undef initialisers. |
133 | // |
134 | // Lowering is split between this IR pass and the back end. This pass chooses |
135 | // where given variables should be allocated and marks them with metadata, |
136 | // MD_absolute_symbol. The backend places the variables in coincidentally the |
137 | // same location and raises a fatal error if something has gone awry. This works |
138 | // in practice because the only pass between this one and the backend that |
139 | // changes LDS is PromoteAlloca and the changes it makes do not conflict. |
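//
// The metadata encodes a half open [Lo, Hi) range, so a variable placed at
// address N carries, roughly:
//   @var = ..., !absolute_symbol !0
//   !0 = !{i32 N, i32 N+1}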
140 | // |
141 | // Addresses are written to constant global arrays based on the same metadata. |
142 | // |
143 | // The backend lowers LDS variables in the order of traversal of the function. |
144 | // This is at odds with the deterministic layout required. The workaround is to |
145 | // allocate the fixed-address variables immediately upon starting the function |
146 | // where they can be placed as intended. This requires a means of mapping from |
147 | // the function to the variables that it allocates. For the module scope lds, |
148 | // this is via metadata indicating whether the variable is not required. If a |
149 | // pass deletes that metadata, a fatal error on disagreement with the absolute |
150 | // symbol metadata will occur. For kernel scope and dynamic, this is by _name_ |
151 | // correspondence between the function and the variable. It requires the |
152 | // kernel to have a name (which is only a limitation for tests in practice) and |
153 | // for nothing to rename the corresponding symbols. This is a hazard if the pass |
154 | // is run multiple times during debugging. Alternative schemes considered all |
155 | // involve bespoke metadata. |
156 | // |
157 | // If the name correspondence can be replaced, multiple distinct kernels that |
158 | // have the same memory layout can map to the same kernel id (as the address |
159 | // itself is handled by the absolute symbol metadata) and that will allow more |
160 | // uses of the "kernel" style faster lowering and reduce the size of the lookup |
161 | // tables. |
162 | // |
163 | // There is a test that checks this does not fire for a graphics shader. This |
164 | // lowering is expected to work for graphics if the isKernel test is changed. |
165 | // |
166 | // The current markUsedByKernel is sufficient for PromoteAlloca but is elided |
167 | // before codegen. Replacing this with an equivalent intrinsic which lasts until |
168 | // shortly after the machine function lowering of LDS would help break the name |
169 | // mapping. The other part needed is probably to amend PromoteAlloca to embed |
170 | // the LDS variables it creates in the same struct created here. That avoids the |
171 | // current hazard where a PromoteAlloca LDS variable might be allocated before |
172 | // the kernel scope (and thus error on the address check). Given a new invariant |
173 | // that no LDS variables exist outside of the structs managed here, and an |
174 | // intrinsic that lasts until after the LDS frame lowering, it should be |
175 | // possible to drop the name mapping and fold equivalent memory layouts. |
176 | // |
177 | //===----------------------------------------------------------------------===// |
178 | |
179 | #include "AMDGPU.h" |
180 | #include "AMDGPUMemoryUtils.h" |
181 | #include "AMDGPUTargetMachine.h" |
182 | #include "Utils/AMDGPUBaseInfo.h" |
183 | #include "llvm/ADT/BitVector.h" |
184 | #include "llvm/ADT/DenseMap.h" |
185 | #include "llvm/ADT/DenseSet.h" |
186 | #include "llvm/ADT/STLExtras.h" |
187 | #include "llvm/ADT/SetOperations.h" |
188 | #include "llvm/Analysis/CallGraph.h" |
189 | #include "llvm/Analysis/ScopedNoAliasAA.h" |
190 | #include "llvm/CodeGen/TargetPassConfig.h" |
191 | #include "llvm/IR/Constants.h" |
192 | #include "llvm/IR/DerivedTypes.h" |
193 | #include "llvm/IR/IRBuilder.h" |
194 | #include "llvm/IR/InlineAsm.h" |
195 | #include "llvm/IR/Instructions.h" |
196 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
197 | #include "llvm/IR/MDBuilder.h" |
198 | #include "llvm/IR/ReplaceConstant.h" |
199 | #include "llvm/Pass.h" |
200 | #include "llvm/Support/CommandLine.h" |
201 | #include "llvm/Support/Debug.h" |
202 | #include "llvm/Support/Format.h" |
203 | #include "llvm/Support/OptimizedStructLayout.h" |
204 | #include "llvm/Support/raw_ostream.h" |
205 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
206 | #include "llvm/Transforms/Utils/ModuleUtils.h" |
207 | |
208 | #include <vector> |
209 | |
210 | #include <cstdio> |
211 | |
212 | #define DEBUG_TYPE "amdgpu-lower-module-lds" |
213 | |
214 | using namespace llvm; |
215 | using namespace AMDGPU; |
216 | |
217 | namespace { |
218 | |
219 | cl::opt<bool> SuperAlignLDSGlobals( |
220 | "amdgpu-super-align-lds-globals" , |
221 | cl::desc("Increase alignment of LDS if it is not on align boundary" ), |
222 | cl::init(Val: true), cl::Hidden); |
223 | |
224 | enum class LoweringKind { module, table, kernel, hybrid }; |
225 | cl::opt<LoweringKind> LoweringKindLoc( |
226 | "amdgpu-lower-module-lds-strategy" , |
227 | cl::desc("Specify lowering strategy for function LDS access:" ), cl::Hidden, |
228 | cl::init(Val: LoweringKind::hybrid), |
229 | cl::values( |
230 | clEnumValN(LoweringKind::table, "table" , "Lower via table lookup" ), |
231 | clEnumValN(LoweringKind::module, "module" , "Lower via module struct" ), |
232 | clEnumValN( |
233 | LoweringKind::kernel, "kernel" , |
234 | "Lower variables reachable from one kernel, otherwise abort" ), |
235 | clEnumValN(LoweringKind::hybrid, "hybrid" , |
236 | "Lower via mixture of above strategies" ))); |
237 | |
238 | template <typename T> std::vector<T> sortByName(std::vector<T> &&V) { |
239 | llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) { |
240 | return L->getName() < R->getName(); |
241 | }); |
242 | return {std::move(V)}; |
243 | } |
244 | |
245 | class AMDGPULowerModuleLDS { |
246 | const AMDGPUTargetMachine &TM; |
247 | |
248 | static void |
249 | removeLocalVarsFromUsedLists(Module &M, |
250 | const DenseSet<GlobalVariable *> &LocalVars) { |
251 | // The verifier rejects used lists containing an inttoptr of a constant |
252 | // so remove the variables from these lists before replaceAllUsesWith |
SmallPtrSet<Constant *, 8> LocalVarsSet;
254 | for (GlobalVariable *LocalVar : LocalVars) |
255 | LocalVarsSet.insert(Ptr: cast<Constant>(Val: LocalVar->stripPointerCasts())); |
256 | |
257 | removeFromUsedLists( |
258 | M, ShouldRemove: [&LocalVarsSet](Constant *C) { return LocalVarsSet.count(Ptr: C); }); |
259 | |
260 | for (GlobalVariable *LocalVar : LocalVars) |
261 | LocalVar->removeDeadConstantUsers(); |
262 | } |
263 | |
264 | static void markUsedByKernel(Function *Func, GlobalVariable *SGV) { |
265 | // The llvm.amdgcn.module.lds instance is implicitly used by all kernels |
266 | // that might call a function which accesses a field within it. This is |
267 | // presently approximated to 'all kernels' if there are any such functions |
268 | // in the module. This implicit use is redefined as an explicit use here so |
269 | // that later passes, specifically PromoteAlloca, account for the required |
270 | // memory without any knowledge of this transform. |
271 | |
272 | // An operand bundle on llvm.donothing works because the call instruction |
273 | // survives until after the last pass that needs to account for LDS. It is |
274 | // better than inline asm as the latter survives until the end of codegen. A |
275 | // totally robust solution would be a function with the same semantics as |
276 | // llvm.donothing that takes a pointer to the instance and is lowered to a |
277 | // no-op after LDS is allocated, but that is not presently necessary. |
278 | |
279 | // This intrinsic is eliminated shortly before instruction selection. It |
280 | // does not suffice to indicate to ISel that a given global which is not |
281 | // immediately used by the kernel must still be allocated by it. An |
282 | // equivalent target specific intrinsic which lasts until immediately after |
283 | // codegen would suffice for that, but one would still need to ensure that |
284 | // the variables are allocated in the anticipated order. |
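// The IR emitted below is approximately (names illustrative):
//   %gep = getelementptr inbounds %llvm.amdgcn.module.lds.t,
//            ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0
//   call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) %gep) ]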
285 | BasicBlock *Entry = &Func->getEntryBlock(); |
286 | IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); |
287 | |
288 | Function *Decl = Intrinsic::getOrInsertDeclaration( |
289 | M: Func->getParent(), id: Intrinsic::donothing, Tys: {}); |
290 | |
291 | Value *UseInstance[1] = { |
292 | Builder.CreateConstInBoundsGEP1_32(Ty: SGV->getValueType(), Ptr: SGV, Idx0: 0)}; |
293 | |
294 | Builder.CreateCall( |
295 | Callee: Decl, Args: {}, OpBundles: {OperandBundleDefT<Value *>("ExplicitUse" , UseInstance)}); |
296 | } |
297 | |
298 | public: |
299 | AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {} |
300 | |
301 | struct LDSVariableReplacement { |
302 | GlobalVariable *SGV = nullptr; |
303 | DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP; |
304 | }; |
305 | |
306 | // remap from lds global to a constantexpr gep to where it has been moved to |
307 | // for each kernel |
308 | // an array with an element for each kernel containing where the corresponding |
309 | // variable was remapped to |
310 | |
311 | static Constant *getAddressesOfVariablesInKernel( |
312 | LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables, |
313 | const DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) { |
314 | // Create a ConstantArray containing the address of each Variable within the |
315 | // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel |
316 | // does not allocate it |
317 | // TODO: Drop the ptrtoint conversion |
318 | |
319 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
320 | |
321 | ArrayType *KernelOffsetsType = ArrayType::get(ElementType: I32, NumElements: Variables.size()); |
322 | |
323 | SmallVector<Constant *> Elements; |
324 | for (GlobalVariable *GV : Variables) { |
325 | auto ConstantGepIt = LDSVarsToConstantGEP.find(Val: GV); |
326 | if (ConstantGepIt != LDSVarsToConstantGEP.end()) { |
327 | auto *elt = ConstantExpr::getPtrToInt(C: ConstantGepIt->second, Ty: I32); |
328 | Elements.push_back(Elt: elt); |
329 | } else { |
330 | Elements.push_back(Elt: PoisonValue::get(T: I32)); |
331 | } |
332 | } |
333 | return ConstantArray::get(T: KernelOffsetsType, V: Elements); |
334 | } |
335 | |
336 | static GlobalVariable *buildLookupTable( |
337 | Module &M, ArrayRef<GlobalVariable *> Variables, |
338 | ArrayRef<Function *> kernels, |
339 | DenseMap<Function *, LDSVariableReplacement> &KernelToReplacement) { |
340 | if (Variables.empty()) { |
341 | return nullptr; |
342 | } |
343 | LLVMContext &Ctx = M.getContext(); |
344 | |
345 | const size_t NumberVariables = Variables.size(); |
346 | const size_t NumberKernels = kernels.size(); |
347 | |
348 | ArrayType *KernelOffsetsType = |
349 | ArrayType::get(ElementType: Type::getInt32Ty(C&: Ctx), NumElements: NumberVariables); |
350 | |
351 | ArrayType *AllKernelsOffsetsType = |
352 | ArrayType::get(ElementType: KernelOffsetsType, NumElements: NumberKernels); |
353 | |
354 | Constant *Missing = PoisonValue::get(T: KernelOffsetsType); |
355 | std::vector<Constant *> overallConstantExprElts(NumberKernels); |
356 | for (size_t i = 0; i < NumberKernels; i++) { |
357 | auto Replacement = KernelToReplacement.find(Val: kernels[i]); |
358 | overallConstantExprElts[i] = |
359 | (Replacement == KernelToReplacement.end()) |
360 | ? Missing |
361 | : getAddressesOfVariablesInKernel( |
362 | Ctx, Variables, LDSVarsToConstantGEP: Replacement->second.LDSVarsToConstantGEP); |
363 | } |
364 | |
365 | Constant *init = |
366 | ConstantArray::get(T: AllKernelsOffsetsType, V: overallConstantExprElts); |
367 | |
368 | return new GlobalVariable( |
369 | M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init, |
370 | "llvm.amdgcn.lds.offset.table" , nullptr, GlobalValue::NotThreadLocal, |
371 | AMDGPUAS::CONSTANT_ADDRESS); |
372 | } |
373 | |
374 | void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder, |
375 | GlobalVariable *LookupTable, |
376 | GlobalVariable *GV, Use &U, |
377 | Value *OptionalIndex) { |
378 | // Table is a constant array of the same length as OrderedKernels |
379 | LLVMContext &Ctx = M.getContext(); |
380 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
381 | auto *I = cast<Instruction>(Val: U.getUser()); |
382 | |
383 | Value *tableKernelIndex = getTableLookupKernelIndex(M, F: I->getFunction()); |
384 | |
385 | if (auto *Phi = dyn_cast<PHINode>(Val: I)) { |
386 | BasicBlock *BB = Phi->getIncomingBlock(U); |
387 | Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt()))); |
388 | } else { |
389 | Builder.SetInsertPoint(I); |
390 | } |
391 | |
392 | SmallVector<Value *, 3> GEPIdx = { |
393 | ConstantInt::get(Ty: I32, V: 0), |
394 | tableKernelIndex, |
395 | }; |
396 | if (OptionalIndex) |
397 | GEPIdx.push_back(Elt: OptionalIndex); |
398 | |
399 | Value *Address = Builder.CreateInBoundsGEP( |
400 | Ty: LookupTable->getValueType(), Ptr: LookupTable, IdxList: GEPIdx, Name: GV->getName()); |
401 | |
402 | Value *loaded = Builder.CreateLoad(Ty: I32, Ptr: Address); |
403 | |
404 | Value *replacement = |
405 | Builder.CreateIntToPtr(V: loaded, DestTy: GV->getType(), Name: GV->getName()); |
406 | |
407 | U.set(replacement); |
408 | } |
409 | |
410 | void replaceUsesInInstructionsWithTableLookup( |
411 | Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables, |
412 | GlobalVariable *LookupTable) { |
413 | |
414 | LLVMContext &Ctx = M.getContext(); |
415 | IRBuilder<> Builder(Ctx); |
416 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
417 | |
418 | for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) { |
419 | auto *GV = ModuleScopeVariables[Index]; |
420 | |
421 | for (Use &U : make_early_inc_range(Range: GV->uses())) { |
422 | auto *I = dyn_cast<Instruction>(Val: U.getUser()); |
423 | if (!I) |
424 | continue; |
425 | |
426 | replaceUseWithTableLookup(M, Builder, LookupTable, GV, U, |
427 | OptionalIndex: ConstantInt::get(Ty: I32, V: Index)); |
428 | } |
429 | } |
430 | } |
431 | |
432 | static DenseSet<Function *> kernelsThatIndirectlyAccessAnyOfPassedVariables( |
433 | Module &M, LDSUsesInfoTy &LDSUsesInfo, |
434 | DenseSet<GlobalVariable *> const &VariableSet) { |
435 | |
436 | DenseSet<Function *> KernelSet; |
437 | |
438 | if (VariableSet.empty()) |
439 | return KernelSet; |
440 | |
441 | for (Function &Func : M.functions()) { |
442 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
443 | continue; |
444 | for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) { |
445 | if (VariableSet.contains(V: GV)) { |
446 | KernelSet.insert(V: &Func); |
447 | break; |
448 | } |
449 | } |
450 | } |
451 | |
452 | return KernelSet; |
453 | } |
454 | |
455 | static GlobalVariable * |
456 | chooseBestVariableForModuleStrategy(const DataLayout &DL, |
457 | VariableFunctionMap &LDSVars) { |
458 | // Find the global variable with the most indirect uses from kernels |
459 | |
460 | struct CandidateTy { |
461 | GlobalVariable *GV = nullptr; |
462 | size_t UserCount = 0; |
463 | size_t Size = 0; |
464 | |
465 | CandidateTy() = default; |
466 | |
467 | CandidateTy(GlobalVariable *GV, uint64_t UserCount, uint64_t AllocSize) |
468 | : GV(GV), UserCount(UserCount), Size(AllocSize) {} |
469 | |
470 | bool operator<(const CandidateTy &Other) const { |
471 | // Fewer users makes module scope variable less attractive |
472 | if (UserCount < Other.UserCount) { |
473 | return true; |
474 | } |
475 | if (UserCount > Other.UserCount) { |
476 | return false; |
477 | } |
478 | |
479 | // Bigger makes module scope variable less attractive |
480 | if (Size < Other.Size) { |
481 | return false; |
482 | } |
483 | |
484 | if (Size > Other.Size) { |
485 | return true; |
486 | } |
487 | |
488 | // Arbitrary but consistent |
489 | return GV->getName() < Other.GV->getName(); |
490 | } |
491 | }; |
492 | |
493 | CandidateTy MostUsed; |
494 | |
495 | for (auto &K : LDSVars) { |
496 | GlobalVariable *GV = K.first; |
497 | if (K.second.size() <= 1) { |
498 | // A variable reachable by only one kernel is best lowered with kernel |
499 | // strategy |
500 | continue; |
501 | } |
502 | CandidateTy Candidate( |
503 | GV, K.second.size(), |
504 | DL.getTypeAllocSize(Ty: GV->getValueType()).getFixedValue()); |
505 | if (MostUsed < Candidate) |
506 | MostUsed = Candidate; |
507 | } |
508 | |
509 | return MostUsed.GV; |
510 | } |
511 | |
512 | static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, |
513 | uint32_t Address) { |
514 | // Write the specified address into metadata where it can be retrieved by |
515 | // the assembler. Format is a half open range, [Address Address+1) |
516 | LLVMContext &Ctx = M->getContext(); |
517 | auto *IntTy = |
518 | M->getDataLayout().getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS); |
519 | auto *MinC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address)); |
520 | auto *MaxC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address + 1)); |
521 | GV->setMetadata(KindID: LLVMContext::MD_absolute_symbol, |
522 | Node: MDNode::get(Context&: Ctx, MDs: {MinC, MaxC})); |
523 | } |
524 | |
525 | DenseMap<Function *, Value *> tableKernelIndexCache; |
526 | Value *getTableLookupKernelIndex(Module &M, Function *F) { |
527 | // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which |
528 | // lowers to a read from a live in register. Emit it once in the entry |
529 | // block to spare deduplicating it later. |
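// In IR this is a single call in the entry block, roughly
//   %kernel.id = call i32 @llvm.amdgcn.lds.kernel.id()
// with the result name being illustrative.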
530 | auto [It, Inserted] = tableKernelIndexCache.try_emplace(Key: F); |
531 | if (Inserted) { |
532 | auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); |
533 | IRBuilder<> Builder(&*InsertAt); |
534 | |
535 | It->second = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_lds_kernel_id, Args: {}); |
536 | } |
537 | |
538 | return It->second; |
539 | } |
540 | |
541 | static std::vector<Function *> assignLDSKernelIDToEachKernel( |
542 | Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS, |
543 | DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) { |
544 | // Associate kernels in the set with an arbitrary but reproducible order and |
545 | // annotate them with that order in metadata. This metadata is recognised by |
546 | // the backend and lowered to a SGPR which can be read from using |
547 | // amdgcn_lds_kernel_id. |
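// The annotation is a metadata node attached to the kernel, for example
// (kernel name and id illustrative):
//   define amdgpu_kernel void @k1() !llvm.amdgcn.lds.kernel.id !0 { ... }
//   !0 = !{i32 1}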
548 | |
549 | std::vector<Function *> OrderedKernels; |
550 | if (!KernelsThatAllocateTableLDS.empty() || |
551 | !KernelsThatIndirectlyAllocateDynamicLDS.empty()) { |
552 | |
553 | for (Function &Func : M->functions()) { |
554 | if (Func.isDeclaration()) |
555 | continue; |
556 | if (!isKernelLDS(F: &Func)) |
557 | continue; |
558 | |
559 | if (KernelsThatAllocateTableLDS.contains(V: &Func) || |
560 | KernelsThatIndirectlyAllocateDynamicLDS.contains(V: &Func)) { |
561 | assert(Func.hasName()); // else fatal error earlier |
562 | OrderedKernels.push_back(x: &Func); |
563 | } |
564 | } |
565 | |
566 | // Put them in an arbitrary but reproducible order |
567 | OrderedKernels = sortByName(V: std::move(OrderedKernels)); |
568 | |
569 | // Annotate the kernels with their order in this vector |
570 | LLVMContext &Ctx = M->getContext(); |
571 | IRBuilder<> Builder(Ctx); |
572 | |
573 | if (OrderedKernels.size() > UINT32_MAX) { |
574 | // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU |
575 | reportFatalUsageError(reason: "unimplemented LDS lowering for > 2**32 kernels" ); |
576 | } |
577 | |
578 | for (size_t i = 0; i < OrderedKernels.size(); i++) { |
579 | Metadata *AttrMDArgs[1] = { |
580 | ConstantAsMetadata::get(C: Builder.getInt32(C: i)), |
581 | }; |
582 | OrderedKernels[i]->setMetadata(Kind: "llvm.amdgcn.lds.kernel.id" , |
583 | Node: MDNode::get(Context&: Ctx, MDs: AttrMDArgs)); |
584 | } |
585 | } |
586 | return OrderedKernels; |
587 | } |
588 | |
589 | static void partitionVariablesIntoIndirectStrategies( |
590 | Module &M, LDSUsesInfoTy const &LDSUsesInfo, |
591 | VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly, |
592 | DenseSet<GlobalVariable *> &ModuleScopeVariables, |
593 | DenseSet<GlobalVariable *> &TableLookupVariables, |
594 | DenseSet<GlobalVariable *> &KernelAccessVariables, |
595 | DenseSet<GlobalVariable *> &DynamicVariables) { |
596 | |
597 | GlobalVariable *HybridModuleRoot = |
598 | LoweringKindLoc != LoweringKind::hybrid |
599 | ? nullptr |
600 | : chooseBestVariableForModuleStrategy( |
601 | DL: M.getDataLayout(), LDSVars&: LDSToKernelsThatNeedToAccessItIndirectly); |
602 | |
603 | DenseSet<Function *> const EmptySet; |
604 | DenseSet<Function *> const &HybridModuleRootKernels = |
605 | HybridModuleRoot |
606 | ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot] |
607 | : EmptySet; |
608 | |
609 | for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { |
610 | // Each iteration of this loop assigns exactly one global variable to |
611 | // exactly one of the implementation strategies. |
612 | |
613 | GlobalVariable *GV = K.first; |
614 | assert(AMDGPU::isLDSVariableToLower(*GV)); |
615 | assert(K.second.size() != 0); |
616 | |
617 | if (AMDGPU::isDynamicLDS(GV: *GV)) { |
618 | DynamicVariables.insert(V: GV); |
619 | continue; |
620 | } |
621 | |
622 | switch (LoweringKindLoc) { |
623 | case LoweringKind::module: |
624 | ModuleScopeVariables.insert(V: GV); |
625 | break; |
626 | |
627 | case LoweringKind::table: |
628 | TableLookupVariables.insert(V: GV); |
629 | break; |
630 | |
631 | case LoweringKind::kernel: |
632 | if (K.second.size() == 1) { |
633 | KernelAccessVariables.insert(V: GV); |
634 | } else { |
635 | // FIXME: This should use DiagnosticInfo |
636 | reportFatalUsageError( |
637 | reason: "cannot lower LDS '" + GV->getName() + |
638 | "' to kernel access as it is reachable from multiple kernels" ); |
639 | } |
640 | break; |
641 | |
642 | case LoweringKind::hybrid: { |
643 | if (GV == HybridModuleRoot) { |
644 | assert(K.second.size() != 1); |
645 | ModuleScopeVariables.insert(V: GV); |
646 | } else if (K.second.size() == 1) { |
647 | KernelAccessVariables.insert(V: GV); |
648 | } else if (set_is_subset(S1: K.second, S2: HybridModuleRootKernels)) { |
649 | ModuleScopeVariables.insert(V: GV); |
650 | } else { |
651 | TableLookupVariables.insert(V: GV); |
652 | } |
653 | break; |
654 | } |
655 | } |
656 | } |
657 | |
658 | // All LDS variables accessed indirectly have now been partitioned into |
659 | // the distinct lowering strategies. |
660 | assert(ModuleScopeVariables.size() + TableLookupVariables.size() + |
661 | KernelAccessVariables.size() + DynamicVariables.size() == |
662 | LDSToKernelsThatNeedToAccessItIndirectly.size()); |
663 | } |
664 | |
665 | static GlobalVariable *lowerModuleScopeStructVariables( |
666 | Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables, |
667 | DenseSet<Function *> const &KernelsThatAllocateModuleLDS) { |
668 | // Create a struct to hold the ModuleScopeVariables |
669 | // Replace all uses of those variables from non-kernel functions with the |
670 | // new struct instance Replace only the uses from kernel functions that will |
671 | // allocate this instance. That is a space optimisation - kernels that use a |
672 | // subset of the module scope struct and do not need to allocate it for |
673 | // indirect calls will only allocate the subset they use (they do so as part |
674 | // of the per-kernel lowering). |
675 | if (ModuleScopeVariables.empty()) { |
676 | return nullptr; |
677 | } |
678 | |
679 | LLVMContext &Ctx = M.getContext(); |
680 | |
681 | LDSVariableReplacement ModuleScopeReplacement = |
682 | createLDSVariableReplacement(M, VarName: "llvm.amdgcn.module.lds" , |
683 | LDSVarsToTransform: ModuleScopeVariables); |
684 | |
685 | appendToCompilerUsed(M, Values: {static_cast<GlobalValue *>( |
686 | ConstantExpr::getPointerBitCastOrAddrSpaceCast( |
687 | C: cast<Constant>(Val: ModuleScopeReplacement.SGV), |
688 | Ty: PointerType::getUnqual(C&: Ctx)))}); |
689 | |
690 | // module.lds will be allocated at zero in any kernel that allocates it |
691 | recordLDSAbsoluteAddress(M: &M, GV: ModuleScopeReplacement.SGV, Address: 0); |
692 | |
693 | // historic |
694 | removeLocalVarsFromUsedLists(M, LocalVars: ModuleScopeVariables); |
695 | |
696 | // Replace all uses of module scope variable from non-kernel functions |
697 | replaceLDSVariablesWithStruct( |
698 | M, LDSVarsToTransformArg: ModuleScopeVariables, Replacement: ModuleScopeReplacement, Predicate: [&](Use &U) { |
699 | Instruction *I = dyn_cast<Instruction>(Val: U.getUser()); |
700 | if (!I) { |
701 | return false; |
702 | } |
703 | Function *F = I->getFunction(); |
704 | return !isKernelLDS(F); |
705 | }); |
706 | |
707 | // Replace uses of module scope variable from kernel functions that |
708 | // allocate the module scope variable, otherwise leave them unchanged |
709 | // Record on each kernel whether the module scope global is used by it |
710 | |
711 | for (Function &Func : M.functions()) { |
712 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
713 | continue; |
714 | |
715 | if (KernelsThatAllocateModuleLDS.contains(V: &Func)) { |
716 | replaceLDSVariablesWithStruct( |
717 | M, LDSVarsToTransformArg: ModuleScopeVariables, Replacement: ModuleScopeReplacement, Predicate: [&](Use &U) { |
718 | Instruction *I = dyn_cast<Instruction>(Val: U.getUser()); |
719 | if (!I) { |
720 | return false; |
721 | } |
722 | Function *F = I->getFunction(); |
723 | return F == &Func; |
724 | }); |
725 | |
726 | markUsedByKernel(Func: &Func, SGV: ModuleScopeReplacement.SGV); |
727 | } |
728 | } |
729 | |
730 | return ModuleScopeReplacement.SGV; |
731 | } |
732 | |
733 | static DenseMap<Function *, LDSVariableReplacement> |
734 | lowerKernelScopeStructVariables( |
735 | Module &M, LDSUsesInfoTy &LDSUsesInfo, |
736 | DenseSet<GlobalVariable *> const &ModuleScopeVariables, |
737 | DenseSet<Function *> const &KernelsThatAllocateModuleLDS, |
738 | GlobalVariable *MaybeModuleScopeStruct) { |
739 | |
740 | // Create a struct for each kernel for the non-module-scope variables. |
741 | |
742 | DenseMap<Function *, LDSVariableReplacement> KernelToReplacement; |
743 | for (Function &Func : M.functions()) { |
744 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
745 | continue; |
746 | |
747 | DenseSet<GlobalVariable *> KernelUsedVariables; |
748 | // Allocating variables that are used directly in this struct to get |
749 | // alignment aware allocation and predictable frame size. |
750 | for (auto &v : LDSUsesInfo.direct_access[&Func]) { |
751 | if (!AMDGPU::isDynamicLDS(GV: *v)) { |
752 | KernelUsedVariables.insert(V: v); |
753 | } |
754 | } |
755 | |
756 | // Allocating variables that are accessed indirectly so that a lookup of |
757 | // this struct instance can find them from nested functions. |
758 | for (auto &v : LDSUsesInfo.indirect_access[&Func]) { |
759 | if (!AMDGPU::isDynamicLDS(GV: *v)) { |
760 | KernelUsedVariables.insert(V: v); |
761 | } |
762 | } |
763 | |
764 | // Variables allocated in module lds must all resolve to that struct, |
765 | // not to the per-kernel instance. |
766 | if (KernelsThatAllocateModuleLDS.contains(V: &Func)) { |
767 | for (GlobalVariable *v : ModuleScopeVariables) { |
768 | KernelUsedVariables.erase(V: v); |
769 | } |
770 | } |
771 | |
772 | if (KernelUsedVariables.empty()) { |
773 | // Either used no LDS, or the LDS it used was all in the module struct |
774 | // or dynamically sized |
775 | continue; |
776 | } |
777 | |
778 | // The association between kernel function and LDS struct is done by |
779 | // symbol name, which only works if the function in question has a |
780 | // name This is not expected to be a problem in practice as kernels |
781 | // are called by name making anonymous ones (which are named by the |
782 | // backend) difficult to use. This does mean that llvm test cases need |
783 | // to name the kernels. |
784 | if (!Func.hasName()) { |
785 | reportFatalUsageError(reason: "anonymous kernels cannot use LDS variables" ); |
786 | } |
787 | |
788 | std::string VarName = |
789 | (Twine("llvm.amdgcn.kernel." ) + Func.getName() + ".lds" ).str(); |
790 | |
791 | auto Replacement = |
792 | createLDSVariableReplacement(M, VarName, LDSVarsToTransform: KernelUsedVariables); |
793 | |
794 | // If any indirect uses, create a direct use to ensure allocation |
795 | // TODO: Simpler to unconditionally mark used but that regresses |
796 | // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll |
797 | auto Accesses = LDSUsesInfo.indirect_access.find(Val: &Func); |
798 | if ((Accesses != LDSUsesInfo.indirect_access.end()) && |
799 | !Accesses->second.empty()) |
800 | markUsedByKernel(Func: &Func, SGV: Replacement.SGV); |
801 | |
802 | // remove preserves existing codegen |
803 | removeLocalVarsFromUsedLists(M, LocalVars: KernelUsedVariables); |
804 | KernelToReplacement[&Func] = Replacement; |
805 | |
806 | // Rewrite uses within kernel to the new struct |
807 | replaceLDSVariablesWithStruct( |
808 | M, LDSVarsToTransformArg: KernelUsedVariables, Replacement, Predicate: [&Func](Use &U) { |
809 | Instruction *I = dyn_cast<Instruction>(Val: U.getUser()); |
810 | return I && I->getFunction() == &Func; |
811 | }); |
812 | } |
813 | return KernelToReplacement; |
814 | } |
815 | |
816 | static GlobalVariable * |
817 | buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo, |
818 | Function *func) { |
819 | // Create a dynamic lds variable with a name associated with the passed |
820 | // function that has the maximum alignment of any dynamic lds variable |
821 | // reachable from this kernel. Dynamic LDS is allocated after the static LDS |
822 | // allocation, possibly after alignment padding. The representative variable |
823 | // created here has the maximum alignment of any other dynamic variable |
824 | // reachable by that kernel. All dynamic LDS variables are allocated at the |
825 | // same address in each kernel in order to provide the documented aliasing |
826 | // semantics. Setting the alignment here allows this IR pass to accurately |
827 | // predict the exact constant at which it will be allocated. |
828 | |
829 | assert(isKernelLDS(func)); |
830 | |
831 | LLVMContext &Ctx = M.getContext(); |
832 | const DataLayout &DL = M.getDataLayout(); |
833 | Align MaxDynamicAlignment(1); |
834 | |
835 | auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) { |
836 | if (AMDGPU::isDynamicLDS(GV: *GV)) { |
837 | MaxDynamicAlignment = |
838 | std::max(a: MaxDynamicAlignment, b: AMDGPU::getAlign(DL, GV)); |
839 | } |
840 | }; |
841 | |
842 | for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) { |
843 | UpdateMaxAlignment(GV); |
844 | } |
845 | |
846 | for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) { |
847 | UpdateMaxAlignment(GV); |
848 | } |
849 | |
850 | assert(func->hasName()); // Checked by caller |
851 | auto *emptyCharArray = ArrayType::get(ElementType: Type::getInt8Ty(C&: Ctx), NumElements: 0); |
852 | GlobalVariable *N = new GlobalVariable( |
853 | M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr, |
Twine("llvm.amdgcn." + func->getName() + ".dynlds"), nullptr,
GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
856 | N->setAlignment(MaxDynamicAlignment); |
857 | |
858 | assert(AMDGPU::isDynamicLDS(*N)); |
859 | return N; |
860 | } |
861 | |
862 | DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables( |
863 | Module &M, LDSUsesInfoTy &LDSUsesInfo, |
864 | DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS, |
865 | DenseSet<GlobalVariable *> const &DynamicVariables, |
866 | std::vector<Function *> const &OrderedKernels) { |
867 | DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS; |
868 | if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) { |
869 | LLVMContext &Ctx = M.getContext(); |
870 | IRBuilder<> Builder(Ctx); |
871 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
872 | |
873 | std::vector<Constant *> newDynamicLDS; |
874 | |
875 | // Table is built in the same order as OrderedKernels |
876 | for (auto &func : OrderedKernels) { |
877 | |
878 | if (KernelsThatIndirectlyAllocateDynamicLDS.contains(V: func)) { |
879 | assert(isKernelLDS(func)); |
880 | if (!func->hasName()) { |
881 | reportFatalUsageError(reason: "anonymous kernels cannot use LDS variables" ); |
882 | } |
883 | |
884 | GlobalVariable *N = |
885 | buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func); |
886 | |
887 | KernelToCreatedDynamicLDS[func] = N; |
888 | |
889 | markUsedByKernel(Func: func, SGV: N); |
890 | |
891 | auto *emptyCharArray = ArrayType::get(ElementType: Type::getInt8Ty(C&: Ctx), NumElements: 0); |
892 | auto *GEP = ConstantExpr::getGetElementPtr( |
893 | Ty: emptyCharArray, C: N, Idx: ConstantInt::get(Ty: I32, V: 0), NW: true); |
894 | newDynamicLDS.push_back(x: ConstantExpr::getPtrToInt(C: GEP, Ty: I32)); |
895 | } else { |
896 | newDynamicLDS.push_back(x: PoisonValue::get(T: I32)); |
897 | } |
898 | } |
899 | assert(OrderedKernels.size() == newDynamicLDS.size()); |
900 | |
901 | ArrayType *t = ArrayType::get(ElementType: I32, NumElements: newDynamicLDS.size()); |
902 | Constant *init = ConstantArray::get(T: t, V: newDynamicLDS); |
903 | GlobalVariable *table = new GlobalVariable( |
904 | M, t, true, GlobalValue::InternalLinkage, init, |
905 | "llvm.amdgcn.dynlds.offset.table" , nullptr, |
906 | GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS); |
907 | |
908 | for (GlobalVariable *GV : DynamicVariables) { |
909 | for (Use &U : make_early_inc_range(Range: GV->uses())) { |
910 | auto *I = dyn_cast<Instruction>(Val: U.getUser()); |
911 | if (!I) |
912 | continue; |
913 | if (isKernelLDS(F: I->getFunction())) |
914 | continue; |
915 | |
916 | replaceUseWithTableLookup(M, Builder, LookupTable: table, GV, U, OptionalIndex: nullptr); |
917 | } |
918 | } |
919 | } |
920 | return KernelToCreatedDynamicLDS; |
921 | } |
922 | |
923 | static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, |
924 | Function *KF) { |
925 | bool NeedsReplacement = false; |
926 | for (Use &U : GV->uses()) { |
927 | if (auto *I = dyn_cast<Instruction>(Val: U.getUser())) { |
928 | Function *F = I->getFunction(); |
929 | if (isKernelLDS(F) && F != KF) { |
930 | NeedsReplacement = true; |
931 | break; |
932 | } |
933 | } |
934 | } |
935 | if (!NeedsReplacement) |
936 | return GV; |
937 | // Create a new GV used only by this kernel and its function |
938 | GlobalVariable *NewGV = new GlobalVariable( |
939 | M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), |
940 | GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, |
941 | GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); |
942 | NewGV->copyAttributesFrom(Src: GV); |
943 | for (Use &U : make_early_inc_range(Range: GV->uses())) { |
944 | if (auto *I = dyn_cast<Instruction>(Val: U.getUser())) { |
945 | Function *F = I->getFunction(); |
946 | if (!isKernelLDS(F) || F == KF) { |
947 | U.getUser()->replaceUsesOfWith(From: GV, To: NewGV); |
948 | } |
949 | } |
950 | } |
951 | return NewGV; |
952 | } |
953 | |
954 | bool lowerSpecialLDSVariables( |
955 | Module &M, LDSUsesInfoTy &LDSUsesInfo, |
956 | VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { |
957 | bool Changed = false; |
958 | // The 1st round: give module-absolute assignments |
959 | int NumAbsolutes = 0; |
960 | std::vector<GlobalVariable *> OrderedGVs; |
961 | for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { |
962 | GlobalVariable *GV = K.first; |
963 | if (!isNamedBarrier(GV: *GV)) |
964 | continue; |
965 | // give a module-absolute assignment if it is indirectly accessed by |
966 | // multiple kernels. This is not precise, but we don't want to duplicate |
967 | // a function when it is called by multiple kernels. |
968 | if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { |
969 | OrderedGVs.push_back(x: GV); |
970 | } else { |
971 | // leave it to the 2nd round, which will give a kernel-relative |
972 | // assignment if it is only indirectly accessed by one kernel |
973 | LDSUsesInfo.direct_access[*K.second.begin()].insert(V: GV); |
974 | } |
975 | LDSToKernelsThatNeedToAccessItIndirectly.erase(Val: GV); |
976 | } |
977 | OrderedGVs = sortByName(V: std::move(OrderedGVs)); |
978 | for (GlobalVariable *GV : OrderedGVs) { |
979 | int BarId = ++NumAbsolutes; |
980 | unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; |
981 | // 4 bits for alignment, 5 bits for the barrier num, |
982 | // 3 bits for the barrier scope |
983 | unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; |
984 | recordLDSAbsoluteAddress(M: &M, GV, Address: Offset); |
985 | } |
986 | OrderedGVs.clear(); |
987 | |
988 | // The 2nd round: give a kernel-relative assignment for GV that |
989 | // either only indirectly accessed by single kernel or only directly |
990 | // accessed by multiple kernels. |
991 | std::vector<Function *> OrderedKernels; |
992 | for (auto &K : LDSUsesInfo.direct_access) { |
993 | Function *F = K.first; |
994 | assert(isKernelLDS(F)); |
995 | OrderedKernels.push_back(x: F); |
996 | } |
997 | OrderedKernels = sortByName(V: std::move(OrderedKernels)); |
998 | |
999 | llvm::DenseMap<Function *, uint32_t> Kernel2BarId; |
1000 | for (Function *F : OrderedKernels) { |
1001 | for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { |
1002 | if (!isNamedBarrier(GV: *GV)) |
1003 | continue; |
1004 | |
1005 | LDSUsesInfo.direct_access[F].erase(V: GV); |
1006 | if (GV->isAbsoluteSymbolRef()) { |
1007 | // already assigned |
1008 | continue; |
1009 | } |
1010 | OrderedGVs.push_back(x: GV); |
1011 | } |
1012 | OrderedGVs = sortByName(V: std::move(OrderedGVs)); |
1013 | for (GlobalVariable *GV : OrderedGVs) { |
1014 | // GV could also be used directly by other kernels. If so, we need to |
1015 | // create a new GV used only by this kernel and its function. |
1016 | auto NewGV = uniquifyGVPerKernel(M, GV, KF: F); |
1017 | Changed |= (NewGV != GV); |
1018 | int BarId = (NumAbsolutes + 1); |
1019 | if (Kernel2BarId.contains(Val: F)) { |
1020 | BarId = (Kernel2BarId[F] + 1); |
1021 | } |
1022 | Kernel2BarId[F] = BarId; |
1023 | unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; |
1024 | unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; |
1025 | recordLDSAbsoluteAddress(M: &M, GV: NewGV, Address: Offset); |
1026 | } |
1027 | OrderedGVs.clear(); |
1028 | } |
1029 | // Also erase those special LDS variables from indirect_access. |
1030 | for (auto &K : LDSUsesInfo.indirect_access) { |
1031 | assert(isKernelLDS(K.first)); |
1032 | for (GlobalVariable *GV : K.second) { |
1033 | if (isNamedBarrier(GV: *GV)) |
1034 | K.second.erase(V: GV); |
1035 | } |
1036 | } |
1037 | return Changed; |
1038 | } |
1039 | |
1040 | bool runOnModule(Module &M) { |
1041 | CallGraph CG = CallGraph(M); |
1042 | bool Changed = superAlignLDSGlobals(M); |
1043 | |
1044 | Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); |
1045 | |
1046 | Changed = true; // todo: narrow this down |
1047 | |
1048 | // For each kernel, what variables does it access directly or through |
1049 | // callees |
1050 | LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); |
1051 | |
1052 | // For each variable accessed through callees, which kernels access it |
1053 | VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; |
1054 | for (auto &K : LDSUsesInfo.indirect_access) { |
1055 | Function *F = K.first; |
1056 | assert(isKernelLDS(F)); |
1057 | for (GlobalVariable *GV : K.second) { |
1058 | LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(V: F); |
1059 | } |
1060 | } |
1061 | |
1062 | if (LDSUsesInfo.HasSpecialGVs) { |
1063 | // Special LDS variables need special address assignment |
1064 | Changed |= lowerSpecialLDSVariables( |
1065 | M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); |
1066 | } |
1067 | |
1068 | // Partition variables accessed indirectly into the different strategies |
1069 | DenseSet<GlobalVariable *> ModuleScopeVariables; |
1070 | DenseSet<GlobalVariable *> TableLookupVariables; |
1071 | DenseSet<GlobalVariable *> KernelAccessVariables; |
1072 | DenseSet<GlobalVariable *> DynamicVariables; |
1073 | partitionVariablesIntoIndirectStrategies( |
1074 | M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly, |
1075 | ModuleScopeVariables, TableLookupVariables, KernelAccessVariables, |
1076 | DynamicVariables); |
1077 | |
1078 | // If the kernel accesses a variable that is going to be stored in the |
1079 | // module instance through a call then that kernel needs to allocate the |
1080 | // module instance |
1081 | const DenseSet<Function *> KernelsThatAllocateModuleLDS = |
1082 | kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, |
1083 | VariableSet: ModuleScopeVariables); |
1084 | const DenseSet<Function *> KernelsThatAllocateTableLDS = |
1085 | kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, |
1086 | VariableSet: TableLookupVariables); |
1087 | |
1088 | const DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS = |
1089 | kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, |
1090 | VariableSet: DynamicVariables); |
1091 | |
1092 | GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables( |
1093 | M, ModuleScopeVariables, KernelsThatAllocateModuleLDS); |
1094 | |
1095 | DenseMap<Function *, LDSVariableReplacement> KernelToReplacement = |
1096 | lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables, |
1097 | KernelsThatAllocateModuleLDS, |
1098 | MaybeModuleScopeStruct); |
1099 | |
1100 | // Lower zero cost accesses to the kernel instances just created |
1101 | for (auto &GV : KernelAccessVariables) { |
1102 | auto &funcs = LDSToKernelsThatNeedToAccessItIndirectly[GV]; |
1103 | assert(funcs.size() == 1); // Only one kernel can access it |
1104 | LDSVariableReplacement Replacement = |
1105 | KernelToReplacement[*(funcs.begin())]; |
1106 | |
1107 | DenseSet<GlobalVariable *> Vec; |
1108 | Vec.insert(V: GV); |
1109 | |
1110 | replaceLDSVariablesWithStruct(M, LDSVarsToTransformArg: Vec, Replacement, Predicate: [](Use &U) { |
1111 | return isa<Instruction>(Val: U.getUser()); |
1112 | }); |
1113 | } |
1114 | |
1115 | // The ith element of this vector is kernel id i |
1116 | std::vector<Function *> OrderedKernels = |
1117 | assignLDSKernelIDToEachKernel(M: &M, KernelsThatAllocateTableLDS, |
1118 | KernelsThatIndirectlyAllocateDynamicLDS); |
1119 | |
1120 | if (!KernelsThatAllocateTableLDS.empty()) { |
1121 | LLVMContext &Ctx = M.getContext(); |
1122 | IRBuilder<> Builder(Ctx); |
1123 | |
1124 | // The order must be consistent between lookup table and accesses to |
1125 | // lookup table |
1126 | auto TableLookupVariablesOrdered = |
1127 | sortByName(V: std::vector<GlobalVariable *>(TableLookupVariables.begin(), |
1128 | TableLookupVariables.end())); |
1129 | |
1130 | GlobalVariable *LookupTable = buildLookupTable( |
1131 | M, Variables: TableLookupVariablesOrdered, kernels: OrderedKernels, KernelToReplacement); |
1132 | replaceUsesInInstructionsWithTableLookup(M, ModuleScopeVariables: TableLookupVariablesOrdered, |
1133 | LookupTable); |
1134 | } |
1135 | |
1136 | DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS = |
1137 | lowerDynamicLDSVariables(M, LDSUsesInfo, |
1138 | KernelsThatIndirectlyAllocateDynamicLDS, |
1139 | DynamicVariables, OrderedKernels); |
1140 | |
1141 | // Strip amdgpu-no-lds-kernel-id from all functions reachable from the |
1142 | // kernel. We may have inferred this wasn't used prior to the pass. |
1143 | // TODO: We could filter out subgraphs that do not access LDS globals. |
1144 | for (auto *KernelSet : {&KernelsThatIndirectlyAllocateDynamicLDS, |
1145 | &KernelsThatAllocateTableLDS}) |
1146 | for (Function *F : *KernelSet) |
1147 | removeFnAttrFromReachable(CG, KernelRoot: F, FnAttrs: {"amdgpu-no-lds-kernel-id" }); |
1148 | |
1149 | // All kernel frames have been allocated. Calculate and record the |
1150 | // addresses. |
1151 | { |
1152 | const DataLayout &DL = M.getDataLayout(); |
1153 | |
1154 | for (Function &Func : M.functions()) { |
1155 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
1156 | continue; |
1157 | |
1158 | // All three of these are optional. The first variable is allocated at |
1159 | // zero. They are allocated by AMDGPUMachineFunction as one block. |
1160 | // Layout: |
1161 | //{ |
1162 | // module.lds |
1163 | // alignment padding |
1164 | // kernel instance |
1165 | // alignment padding |
1166 | // dynamic lds variables |
1167 | //} |
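// For example (illustrative sizes): a 4 byte module struct, a kernel struct
// with alignment 8 and size 12, and dynamic LDS with alignment 4 are placed
// at offsets 0, 8 and 20 respectively, giving a 20 byte static frame.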
1168 | |
1169 | const bool AllocateModuleScopeStruct = |
1170 | MaybeModuleScopeStruct && |
1171 | KernelsThatAllocateModuleLDS.contains(V: &Func); |
1172 | |
1173 | auto Replacement = KernelToReplacement.find(Val: &Func); |
1174 | const bool AllocateKernelScopeStruct = |
1175 | Replacement != KernelToReplacement.end(); |
1176 | |
1177 | const bool AllocateDynamicVariable = |
1178 | KernelToCreatedDynamicLDS.contains(Val: &Func); |
1179 | |
1180 | uint32_t Offset = 0; |
1181 | |
1182 | if (AllocateModuleScopeStruct) { |
1183 | // Allocated at zero, recorded once on construction, not once per |
1184 | // kernel |
1185 | Offset += DL.getTypeAllocSize(Ty: MaybeModuleScopeStruct->getValueType()); |
1186 | } |
1187 | |
1188 | if (AllocateKernelScopeStruct) { |
1189 | GlobalVariable *KernelStruct = Replacement->second.SGV; |
1190 | Offset = alignTo(Size: Offset, A: AMDGPU::getAlign(DL, GV: KernelStruct)); |
1191 | recordLDSAbsoluteAddress(M: &M, GV: KernelStruct, Address: Offset); |
1192 | Offset += DL.getTypeAllocSize(Ty: KernelStruct->getValueType()); |
1193 | } |
1194 | |
1195 | // If there is dynamic allocation, the alignment needed is included in |
1196 | // the static frame size. There may be no reference to the dynamic |
1197 | // variable in the kernel itself, so without including it here, that |
1198 | // alignment padding could be missed. |
1199 | if (AllocateDynamicVariable) { |
1200 | GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func]; |
1201 | Offset = alignTo(Size: Offset, A: AMDGPU::getAlign(DL, GV: DynamicVariable)); |
1202 | recordLDSAbsoluteAddress(M: &M, GV: DynamicVariable, Address: Offset); |
1203 | } |
1204 | |
1205 | if (Offset != 0) { |
1206 | (void)TM; // TODO: Account for target maximum LDS |
1207 | std::string Buffer; |
1208 | raw_string_ostream SS{Buffer}; |
1209 | SS << format(Fmt: "%u" , Vals: Offset); |
1210 | |
1211 | // Instead of explicitly marking kernels that access dynamic variables |
1212 | // using special case metadata, annotate with min-lds == max-lds, i.e. |
1213 | // that there is no more space available for allocating more static |
1214 | // LDS variables. That is the right condition to prevent allocating |
1215 | // more variables which would collide with the addresses assigned to |
1216 | // dynamic variables. |
1217 | if (AllocateDynamicVariable) |
1218 | SS << format(Fmt: ",%u" , Vals: Offset); |
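// For example (illustrative number), a kernel with a 256 byte static frame
// gets "amdgpu-lds-size"="256", or "amdgpu-lds-size"="256,256" if it also
// reaches dynamic LDS.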
1219 | |
1220 | Func.addFnAttr(Kind: "amdgpu-lds-size" , Val: Buffer); |
1221 | } |
1222 | } |
1223 | } |
1224 | |
1225 | for (auto &GV : make_early_inc_range(Range: M.globals())) |
1226 | if (AMDGPU::isLDSVariableToLower(GV)) { |
1227 | // probably want to remove from used lists |
1228 | GV.removeDeadConstantUsers(); |
1229 | if (GV.use_empty()) |
1230 | GV.eraseFromParent(); |
1231 | } |
1232 | |
1233 | return Changed; |
1234 | } |
1235 | |
1236 | private: |
1237 | // Increase the alignment of LDS globals if necessary to maximise the chance |
1238 | // that we can use aligned LDS instructions to access them. |
1239 | static bool superAlignLDSGlobals(Module &M) { |
1240 | const DataLayout &DL = M.getDataLayout(); |
1241 | bool Changed = false; |
1242 | if (!SuperAlignLDSGlobals) { |
1243 | return Changed; |
1244 | } |
1245 | |
1246 | for (auto &GV : M.globals()) { |
1247 | if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { |
1248 | // Only changing alignment of LDS variables |
1249 | continue; |
1250 | } |
1251 | if (!GV.hasInitializer()) { |
1252 | // cuda/hip extern __shared__ variable, leave alignment alone |
1253 | continue; |
1254 | } |
1255 | |
1256 | if (GV.isAbsoluteSymbolRef()) { |
1257 | // If the variable is already allocated, don't change the alignment |
1258 | continue; |
1259 | } |
1260 | |
      Align Alignment = AMDGPU::getAlign(DL, &GV);
      TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());

      if (GVSize > 8) {
        // We might want to use a b96 or b128 load/store
        Alignment = std::max(Alignment, Align(16));
      } else if (GVSize > 4) {
        // We might want to use a b64 load/store
        Alignment = std::max(Alignment, Align(8));
      } else if (GVSize > 2) {
        // We might want to use a b32 load/store
        Alignment = std::max(Alignment, Align(4));
      } else if (GVSize > 1) {
        // We might want to use a b16 load/store
        Alignment = std::max(Alignment, Align(2));
      }
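      // For instance (hypothetical values), a 12 byte variable declared with
      // align 4 is raised to align 16 here so that a single b96/b128 access
      // can cover it, while a 2 byte variable is only raised to align 2.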

      if (Alignment != AMDGPU::getAlign(DL, &GV)) {
        Changed = true;
        GV.setAlignment(Alignment);
      }
    }
    return Changed;
  }

  static LDSVariableReplacement createLDSVariableReplacement(
      Module &M, std::string VarName,
      DenseSet<GlobalVariable *> const &LDSVarsToTransform) {
    // Create a struct instance containing LDSVarsToTransform and a map from
    // each of those variables to the corresponding constant GEP into it.
    // Variables may be introduced to meet alignment requirements. No aliasing
    // metadata is useful for these as they have no uses. Erased before return.
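    // Illustrative sketch (hypothetical names and layout): given
    //   @lds.a = ... i64, align 8   and   @lds.b = ... i16, align 2
    // the result could be
    //   %llvm.amdgcn.kernel.k.lds.t = type { i64, i16 }
    //   @llvm.amdgcn.kernel.k.lds = ... %llvm.amdgcn.kernel.k.lds.t, align 8
    // with @lds.a mapped to a constant GEP selecting field 0 and @lds.b to
    // one selecting field 1. The actual field order is decided by
    // performOptimizedStructLayout below.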

    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();
    assert(!LDSVarsToTransform.empty());

    SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
    LayoutFields.reserve(LDSVarsToTransform.size());
    {
      // The order of fields in this struct depends on the order of
      // variables in the argument which varies when changing how they
      // are identified, leading to spurious test breakage.
      auto Sorted = sortByName(std::vector<GlobalVariable *>(
          LDSVarsToTransform.begin(), LDSVarsToTransform.end()));

      for (GlobalVariable *GV : Sorted) {
        OptimizedStructLayoutField F(GV,
                                     DL.getTypeAllocSize(GV->getValueType()),
                                     AMDGPU::getAlign(DL, GV));
        LayoutFields.emplace_back(F);
      }
    }

    performOptimizedStructLayout(LayoutFields);

    std::vector<GlobalVariable *> LocalVars;
    BitVector IsPaddingField;
    LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
    IsPaddingField.reserve(LDSVarsToTransform.size());
    {
      uint64_t CurrentOffset = 0;
      for (auto &F : LayoutFields) {
        GlobalVariable *FGV =
            static_cast<GlobalVariable *>(const_cast<void *>(F.Id));
        Align DataAlign = F.Alignment;

        uint64_t DataAlignV = DataAlign.value();
        if (uint64_t Rem = CurrentOffset % DataAlignV) {
          uint64_t Padding = DataAlignV - Rem;

          // Append an array of padding bytes to meet alignment requested
          // Note (o + (a - (o % a)) ) % a == 0
          //      (offset + Padding ) % align == 0
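          // Worked example (hypothetical numbers): CurrentOffset == 10 and
          // DataAlignV == 8 give Rem == 2 and Padding == 6, so a [6 x i8]
          // padding field is emitted and the next field starts at offset 16.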

          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
          LocalVars.push_back(new GlobalVariable(
              M, ATy, false, GlobalValue::InternalLinkage,
              PoisonValue::get(ATy), "", nullptr, GlobalValue::NotThreadLocal,
              AMDGPUAS::LOCAL_ADDRESS, false));
          IsPaddingField.push_back(true);
          CurrentOffset += Padding;
        }

        LocalVars.push_back(FGV);
        IsPaddingField.push_back(false);
        CurrentOffset += F.Size;
      }
    }

    std::vector<Type *> LocalVarTypes;
    LocalVarTypes.reserve(LocalVars.size());
    std::transform(
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });

    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");

    Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);

    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, PoisonValue::get(LDSTy),
        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
        false);
    SGV->setAlignment(StructAlign);

    DenseMap<GlobalVariable *, Constant *> Map;
    Type *I32 = Type::getInt32Ty(Ctx);
    for (size_t I = 0; I < LocalVars.size(); I++) {
      GlobalVariable *GV = LocalVars[I];
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
      if (IsPaddingField[I]) {
        assert(GV->use_empty());
        GV->eraseFromParent();
      } else {
        Map[GV] = GEP;
      }
    }
    assert(Map.size() == LDSVarsToTransform.size());
    return {SGV, std::move(Map)};
  }

  template <typename PredicateTy>
  static void replaceLDSVariablesWithStruct(
      Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
      const LDSVariableReplacement &Replacement, PredicateTy Predicate) {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();

    // A hack... we need to insert the aliasing info in a predictable order for
    // lit tests. It would be better to have them in a stable order already,
    // ideally the same order they get allocated, which might mean an ordered
    // set container.
    auto LDSVarsToTransform = sortByName(std::vector<GlobalVariable *>(
        LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end()));

    // Create alias.scope and their lists. Each field in the new structure
    // does not alias with all other fields.
    SmallVector<MDNode *> AliasScopes;
    SmallVector<Metadata *> NoAliasList;
    const size_t NumberVars = LDSVarsToTransform.size();
    if (NumberVars > 1) {
      MDBuilder MDB(Ctx);
      AliasScopes.reserve(NumberVars);
      MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
      for (size_t I = 0; I < NumberVars; I++) {
        MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
        AliasScopes.push_back(Scope);
      }
      NoAliasList.append(&AliasScopes[1], AliasScopes.end());
    }

    // Replace uses of ith variable with a constantexpr to the corresponding
    // field of the instance that will be allocated by AMDGPUMachineFunction
    for (size_t I = 0; I < NumberVars; I++) {
      GlobalVariable *GV = LDSVarsToTransform[I];
      Constant *GEP = Replacement.LDSVarsToConstantGEP.at(GV);

      GV->replaceUsesWithIf(GEP, Predicate);

      APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
      GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
      uint64_t Offset = APOff.getZExtValue();

      Align A =
          commonAlignment(Replacement.SGV->getAlign().valueOrOne(), Offset);

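      // NoAliasList is maintained as the set of all scopes except the Ith:
      // slot I-1 currently holds AliasScopes[I] (left over from initialising
      // the list with AliasScopes[1..]), so overwriting it with
      // AliasScopes[I-1] restores the previous scope and drops the current
      // one in a single store.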
      if (I)
        NoAliasList[I - 1] = AliasScopes[I - 1];
      MDNode *NoAlias =
          NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
      MDNode *AliasScope =
          AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});

      refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
    }
  }

  static void refineUsesAlignmentAndAA(Value *Ptr, Align A,
                                       const DataLayout &DL, MDNode *AliasScope,
                                       MDNode *NoAlias, unsigned MaxDepth = 5) {
    if (!MaxDepth || (A == 1 && !AliasScope))
      return;
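    // Walk the users of Ptr, raising the alignment of memory accesses through
    // it and attaching the alias metadata, recursing through GEPs and address
    // space casts up to MaxDepth levels.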

    ScopedNoAliasAAResult ScopedNoAlias;

    for (User *U : Ptr->users()) {
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (AliasScope && I->mayReadOrWriteMemory()) {
          MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
          AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
                   : AliasScope);
          I->setMetadata(LLVMContext::MD_alias_scope, AS);

          MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);

          // Scoped aliases can originate from two different domains.
          // The first is the LDS domain created by this pass; all entries
          // (LDS variables) placed into the LDS struct share that domain.

          // The second is any pre-existing domain, e.g. scoped aliases that
          // result from noalias parameters and subsequent optimizations that
          // may alter those sets.

          // We need to be careful how we create new alias sets so that
          // loads/stores of the new LDS variables get the right scopes and
          // domains. If the alias sets belong to the same domain we intersect
          // the NoAlias sets. This is the case for a memcpy between LDS
          // variables: both the source and the destination belong to the LDS
          // struct, so they do not alias.
          // On the other hand, if one of the domains is the LDS domain and
          // the other is a pre-existing domain, we take the union of the
          // alias sets to preserve the existing aliasing information.
          SmallPtrSet<const MDNode *, 16> ExistingDomains, LDSDomains;
          ScopedNoAlias.collectScopedDomains(NA, ExistingDomains);
          ScopedNoAlias.collectScopedDomains(NoAlias, LDSDomains);
          auto Intersection = set_intersection(ExistingDomains, LDSDomains);
          if (Intersection.empty()) {
            NA = NA ? MDNode::concatenate(NA, NoAlias) : NoAlias;
          } else {
            NA = NA ? MDNode::intersect(NA, NoAlias) : NoAlias;
          }
          I->setMetadata(LLVMContext::MD_noalias, NA);
        }
      }

      if (auto *LI = dyn_cast<LoadInst>(U)) {
        LI->setAlignment(std::max(A, LI->getAlign()));
        continue;
      }
      if (auto *SI = dyn_cast<StoreInst>(U)) {
        if (SI->getPointerOperand() == Ptr)
          SI->setAlignment(std::max(A, SI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
        // atomicrmw operations cannot currently operate on pointer values, so
        // Ptr should always be the pointer operand here; check anyway in case
        // that changes or we are processing a ConstantExpr.
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
        unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
        APInt Off(BitWidth, 0);
        if (GEP->getPointerOperand() == Ptr) {
          Align GA;
          if (GEP->accumulateConstantOffset(DL, Off))
            GA = commonAlignment(A, Off.getLimitedValue());
          refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
                                   MaxDepth - 1);
        }
        continue;
      }
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (I->getOpcode() == Instruction::BitCast ||
            I->getOpcode() == Instruction::AddrSpaceCast)
          refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
      }
    }
  }
};

class AMDGPULowerModuleLDSLegacy : public ModulePass {
public:
  const AMDGPUTargetMachine *TM;
  static char ID;

  AMDGPULowerModuleLDSLegacy(const AMDGPUTargetMachine *TM = nullptr)
      : ModulePass(ID), TM(TM) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    if (!TM)
      AU.addRequired<TargetPassConfig>();
  }

  bool runOnModule(Module &M) override {
    if (!TM) {
      auto &TPC = getAnalysis<TargetPassConfig>();
      TM = &TPC.getTM<AMDGPUTargetMachine>();
    }

    return AMDGPULowerModuleLDS(*TM).runOnModule(M);
  }
};

} // namespace
char AMDGPULowerModuleLDSLegacy::ID = 0;

char &llvm::AMDGPULowerModuleLDSLegacyPassID = AMDGPULowerModuleLDSLegacy::ID;

INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE,
                      "Lower uses of LDS variables from non-kernel functions",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE,
                    "Lower uses of LDS variables from non-kernel functions",
                    false, false)

ModulePass *
llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) {
  return new AMDGPULowerModuleLDSLegacy(TM);
}

PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &) {
  return AMDGPULowerModuleLDS(TM).runOnModule(M) ? PreservedAnalyses::none()
                                                 : PreservedAnalyses::all();
}
