1//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass eliminates local data store, LDS, uses from non-kernel functions.
10// LDS is contiguous memory allocated per kernel execution.
11//
12// Background.
13//
14// The programming model is global variables, or equivalently function local
15// static variables, accessible from kernels or other functions. For uses from
16// kernels this is straightforward - assign an integer to the kernel for the
17// memory required by all the variables combined, allocate them within that.
18// For uses from functions there are performance tradeoffs to choose between.
19//
20// This model means the GPU runtime can specify the amount of memory allocated.
21// If this is more than the kernel assumed, the excess can be made available
22// using a language specific feature, which IR represents as a variable with
23// no initializer. This feature is referred to here as "Dynamic LDS" and is
24// lowered slightly differently to the normal case.
25//
26// Consequences of this GPU feature:
27// - memory is limited and exceeding it halts compilation
28// - a global accessed by one kernel exists independent of other kernels
29// - a global exists independent of simultaneous execution of the same kernel
30// - the address of the global may be different from different kernels as they
31// do not alias, which permits only allocating variables they use
32// - if the address is allowed to differ, functions need help to find it
33//
34// Uses from kernels are implemented here by grouping them in a per-kernel
35// struct instance. This duplicates the variables, accurately modelling their
36// aliasing properties relative to a single global representation. It also
37// permits control over alignment via padding.
38//
39// Uses from functions are more complicated and the primary purpose of this
40// IR pass. Several different lowering are chosen between to meet requirements
41// to avoid allocating any LDS where it is not necessary, as that impacts
42// occupancy and may fail the compilation, while not imposing overhead on a
43// feature whose primary advantage over global memory is performance. The basic
44// design goal is to avoid one kernel imposing overhead on another.
45//
46// Implementation.
47//
48// LDS variables with constant annotation or non-undef initializer are passed
49// through unchanged for simplification or error diagnostics in later passes.
50// Non-undef initializers are not yet implemented for LDS.
51//
52// LDS variables that are always allocated at the same address can be found
53// by lookup at that address. Otherwise runtime information/cost is required.
54//
55// The simplest strategy possible is to group all LDS variables in a single
56// struct and allocate that struct in every kernel such that the original
57// variables are always at the same address. LDS is however a limited resource
58// so this strategy is unusable in practice. It is not implemented here.
59//
60// Strategy | Precise allocation | Zero runtime cost | General purpose |
61// --------+--------------------+-------------------+-----------------+
62// Module | No | Yes | Yes |
63// Table | Yes | No | Yes |
64// Kernel | Yes | Yes | No |
65// Hybrid | Yes | Partial | Yes |
66//
67// "Module" spends LDS memory to save cycles. "Table" spends cycles and global
68// memory to save LDS. "Kernel" is as fast as kernel allocation but only works
69// for variables that are known reachable from a single kernel. "Hybrid" picks
70// between all three. When forced to choose between LDS and cycles we minimise
71// LDS use.
72
73// The "module" lowering implemented here finds LDS variables which are used by
74// non-kernel functions and creates a new struct with a field for each of those
75// LDS variables. Variables that are only used from kernels are excluded.
76//
77// The "table" lowering implemented here has three components.
78// First kernels are assigned a unique integer identifier which is available in
79// functions it calls through the intrinsic amdgcn_lds_kernel_id. The integer
80// is passed through a specific SGPR, thus works with indirect calls.
81// Second, each kernel allocates LDS variables independent of other kernels and
82// writes the addresses it chose for each variable into an array in consistent
83// order. If the kernel does not allocate a given variable, it writes undef to
84// the corresponding array location. These arrays are written to a constant
85// table in the order matching the kernel unique integer identifier.
86// Third, uses from non-kernel functions are replaced with a table lookup using
87// the intrinsic function to find the address of the variable.
88//
89// "Kernel" lowering is only applicable for variables that are unambiguously
90// reachable from exactly one kernel. For those cases, accesses to the variable
91// can be lowered to ConstantExpr address of a struct instance specific to that
92// one kernel. This is zero cost in space and in compute. It will raise a fatal
93// error on any variable that might be reachable from multiple kernels and is
94// thus most easily used as part of the hybrid lowering strategy.
95//
96// Hybrid lowering is a mixture of the above. It uses the zero cost kernel
97// lowering where it can. It lowers the variable accessed by the greatest
98// number of kernels using the module strategy as that is free for the first
99// variable. Any futher variables that can be lowered with the module strategy
100// without incurring LDS memory overhead are. The remaining ones are lowered
101// via table.
102//
103// Consequences
104// - No heuristics or user controlled magic numbers, hybrid is the right choice
105// - Kernels that don't use functions (or have had them all inlined) are not
106// affected by any lowering for kernels that do.
107// - Kernels that don't make indirect function calls are not affected by those
108// that do.
109// - Variables which are used by lots of kernels, e.g. those injected by a
110// language runtime in most kernels, are expected to have no overhead
111// - Implementations that instantiate templates per-kernel where those templates
112// use LDS are expected to hit the "Kernel" lowering strategy
113// - The runtime properties impose a cost in compiler implementation complexity
114//
115// Dynamic LDS implementation
116// Dynamic LDS is lowered similarly to the "table" strategy above and uses the
117// same intrinsic to identify which kernel is at the root of the dynamic call
118// graph. This relies on the specified behaviour that all dynamic LDS variables
119// alias one another, i.e. are at the same address, with respect to a given
120// kernel. Therefore this pass creates new dynamic LDS variables for each kernel
121// that allocates any dynamic LDS and builds a table of addresses out of those.
122// The AMDGPUPromoteAlloca pass skips kernels that use dynamic LDS.
123// The corresponding optimisation for "kernel" lowering where the table lookup
124// is elided is not implemented.
125//
126//
127// Implementation notes / limitations
128// A single LDS global variable represents an instance per kernel that can reach
129// said variables. This pass essentially specialises said variables per kernel.
130// Handling ConstantExpr during the pass complicated this significantly so now
131// all ConstantExpr uses of LDS variables are expanded to instructions. This
132// may need amending when implementing non-undef initialisers.
133//
134// Lowering is split between this IR pass and the back end. This pass chooses
135// where given variables should be allocated and marks them with metadata,
136// MD_absolute_symbol. The backend places the variables in coincidentally the
137// same location and raises a fatal error if something has gone awry. This works
138// in practice because the only pass between this one and the backend that
139// changes LDS is PromoteAlloca and the changes it makes do not conflict.
140//
141// Addresses are written to constant global arrays based on the same metadata.
142//
143// The backend lowers LDS variables in the order of traversal of the function.
144// This is at odds with the deterministic layout required. The workaround is to
145// allocate the fixed-address variables immediately upon starting the function
146// where they can be placed as intended. This requires a means of mapping from
147// the function to the variables that it allocates. For the module scope lds,
148// this is via metadata indicating whether the variable is not required. If a
149// pass deletes that metadata, a fatal error on disagreement with the absolute
150// symbol metadata will occur. For kernel scope and dynamic, this is by _name_
151// correspondence between the function and the variable. It requires the
152// kernel to have a name (which is only a limitation for tests in practice) and
153// for nothing to rename the corresponding symbols. This is a hazard if the pass
154// is run multiple times during debugging. Alternative schemes considered all
155// involve bespoke metadata.
156//
157// If the name correspondence can be replaced, multiple distinct kernels that
158// have the same memory layout can map to the same kernel id (as the address
159// itself is handled by the absolute symbol metadata) and that will allow more
160// uses of the "kernel" style faster lowering and reduce the size of the lookup
161// tables.
162//
163// There is a test that checks this does not fire for a graphics shader. This
164// lowering is expected to work for graphics if the isKernel test is changed.
165//
166// The current markUsedByKernel is sufficient for PromoteAlloca but is elided
167// before codegen. Replacing this with an equivalent intrinsic which lasts until
168// shortly after the machine function lowering of LDS would help break the name
169// mapping. The other part needed is probably to amend PromoteAlloca to embed
170// the LDS variables it creates in the same struct created here. That avoids the
171// current hazard where a PromoteAlloca LDS variable might be allocated before
172// the kernel scope (and thus error on the address check). Given a new invariant
173// that no LDS variables exist outside of the structs managed here, and an
174// intrinsic that lasts until after the LDS frame lowering, it should be
175// possible to drop the name mapping and fold equivalent memory layouts.
176//
177//===----------------------------------------------------------------------===//
178
179#include "AMDGPU.h"
180#include "AMDGPUMemoryUtils.h"
181#include "AMDGPUTargetMachine.h"
182#include "Utils/AMDGPUBaseInfo.h"
183#include "llvm/ADT/BitVector.h"
184#include "llvm/ADT/DenseMap.h"
185#include "llvm/ADT/DenseSet.h"
186#include "llvm/ADT/STLExtras.h"
187#include "llvm/ADT/SetOperations.h"
188#include "llvm/ADT/SmallString.h"
189#include "llvm/Analysis/CallGraph.h"
190#include "llvm/Analysis/ScopedNoAliasAA.h"
191#include "llvm/CodeGen/TargetPassConfig.h"
192#include "llvm/IR/Constants.h"
193#include "llvm/IR/DerivedTypes.h"
194#include "llvm/IR/Dominators.h"
195#include "llvm/IR/IRBuilder.h"
196#include "llvm/IR/InlineAsm.h"
197#include "llvm/IR/Instructions.h"
198#include "llvm/IR/IntrinsicsAMDGPU.h"
199#include "llvm/IR/MDBuilder.h"
200#include "llvm/IR/ReplaceConstant.h"
201#include "llvm/InitializePasses.h"
202#include "llvm/Pass.h"
203#include "llvm/Support/CommandLine.h"
204#include "llvm/Support/Debug.h"
205#include "llvm/Support/Format.h"
206#include "llvm/Support/OptimizedStructLayout.h"
207#include "llvm/Support/raw_ostream.h"
208#include "llvm/Transforms/Utils/BasicBlockUtils.h"
209#include "llvm/Transforms/Utils/ModuleUtils.h"
210
211#include <vector>
212
213#include <cstdio>
214
215#define DEBUG_TYPE "amdgpu-lower-module-lds"
216
217using namespace llvm;
218using namespace AMDGPU;
219
220namespace {
221
222cl::opt<bool> SuperAlignLDSGlobals(
223 "amdgpu-super-align-lds-globals",
224 cl::desc("Increase alignment of LDS if it is not on align boundary"),
225 cl::init(Val: true), cl::Hidden);
226
227enum class LoweringKind { module, table, kernel, hybrid };
228cl::opt<LoweringKind> LoweringKindLoc(
229 "amdgpu-lower-module-lds-strategy",
230 cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden,
231 cl::init(Val: LoweringKind::hybrid),
232 cl::values(
233 clEnumValN(LoweringKind::table, "table", "Lower via table lookup"),
234 clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
235 clEnumValN(
236 LoweringKind::kernel, "kernel",
237 "Lower variables reachable from one kernel, otherwise abort"),
238 clEnumValN(LoweringKind::hybrid, "hybrid",
239 "Lower via mixture of above strategies")));
240
241template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
242 llvm::sort(V, [](const auto *L, const auto *R) {
243 return L->getName() < R->getName();
244 });
245 return {std::move(V)};
246}
247
248class AMDGPULowerModuleLDS {
249 const AMDGPUTargetMachine &TM;
250
251 static void
252 removeLocalVarsFromUsedLists(Module &M,
253 const DenseSet<GlobalVariable *> &LocalVars) {
254 // The verifier rejects used lists containing an inttoptr of a constant
255 // so remove the variables from these lists before replaceAllUsesWith
256 SmallPtrSet<Constant *, 8> LocalVarsSet;
257 for (GlobalVariable *LocalVar : LocalVars)
258 LocalVarsSet.insert(Ptr: cast<Constant>(Val: LocalVar->stripPointerCasts()));
259
260 removeFromUsedLists(
261 M, ShouldRemove: [&LocalVarsSet](Constant *C) { return LocalVarsSet.count(Ptr: C); });
262
263 for (GlobalVariable *LocalVar : LocalVars)
264 LocalVar->removeDeadConstantUsers();
265 }
266
267 static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
268 // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
269 // that might call a function which accesses a field within it. This is
270 // presently approximated to 'all kernels' if there are any such functions
271 // in the module. This implicit use is redefined as an explicit use here so
272 // that later passes, specifically PromoteAlloca, account for the required
273 // memory without any knowledge of this transform.
274
275 // An operand bundle on llvm.donothing works because the call instruction
276 // survives until after the last pass that needs to account for LDS. It is
277 // better than inline asm as the latter survives until the end of codegen. A
278 // totally robust solution would be a function with the same semantics as
279 // llvm.donothing that takes a pointer to the instance and is lowered to a
280 // no-op after LDS is allocated, but that is not presently necessary.
281
282 // This intrinsic is eliminated shortly before instruction selection. It
283 // does not suffice to indicate to ISel that a given global which is not
284 // immediately used by the kernel must still be allocated by it. An
285 // equivalent target specific intrinsic which lasts until immediately after
286 // codegen would suffice for that, but one would still need to ensure that
287 // the variables are allocated in the anticipated order.
288 BasicBlock *Entry = &Func->getEntryBlock();
289 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
290
291 Function *Decl = Intrinsic::getOrInsertDeclaration(
292 M: Func->getParent(), id: Intrinsic::donothing, OverloadTys: {});
293
294 Value *UseInstance[1] = {
295 Builder.CreateConstInBoundsGEP1_32(Ty: SGV->getValueType(), Ptr: SGV, Idx0: 0)};
296
297 Builder.CreateCall(
298 Callee: Decl, Args: {}, OpBundles: {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
299 }
300
301public:
302 AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
303
304 struct LDSVariableReplacement {
305 GlobalVariable *SGV = nullptr;
306 DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
307 };
308
309 // remap from lds global to a constantexpr gep to where it has been moved to
310 // for each kernel
311 // an array with an element for each kernel containing where the corresponding
312 // variable was remapped to
313
314 static Constant *getAddressesOfVariablesInKernel(
315 LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables,
316 const DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
317 // Create a ConstantArray containing the address of each Variable within the
318 // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel
319 // does not allocate it
320
321 Type *LocalPtrTy = PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
322 ArrayType *KernelOffsetsType = ArrayType::get(ElementType: LocalPtrTy, NumElements: Variables.size());
323
324 SmallVector<Constant *> Elements;
325 for (GlobalVariable *GV : Variables) {
326 auto ConstantGepIt = LDSVarsToConstantGEP.find(Val: GV);
327 if (ConstantGepIt != LDSVarsToConstantGEP.end()) {
328 Elements.push_back(Elt: ConstantGepIt->second);
329 } else {
330 Elements.push_back(Elt: PoisonValue::get(T: LocalPtrTy));
331 }
332 }
333 return ConstantArray::get(T: KernelOffsetsType, V: Elements);
334 }
335
336 static GlobalVariable *buildLookupTable(
337 Module &M, ArrayRef<GlobalVariable *> Variables,
338 ArrayRef<Function *> kernels,
339 DenseMap<Function *, LDSVariableReplacement> &KernelToReplacement) {
340 if (Variables.empty()) {
341 return nullptr;
342 }
343 LLVMContext &Ctx = M.getContext();
344
345 const size_t NumberVariables = Variables.size();
346 const size_t NumberKernels = kernels.size();
347
348 Type *LocalPtrTy = PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
349 ArrayType *KernelOffsetsType = ArrayType::get(ElementType: LocalPtrTy, NumElements: NumberVariables);
350
351 ArrayType *AllKernelsOffsetsType =
352 ArrayType::get(ElementType: KernelOffsetsType, NumElements: NumberKernels);
353
354 Constant *Missing = PoisonValue::get(T: KernelOffsetsType);
355 std::vector<Constant *> overallConstantExprElts(NumberKernels);
356 for (size_t i = 0; i < NumberKernels; i++) {
357 auto Replacement = KernelToReplacement.find(Val: kernels[i]);
358 overallConstantExprElts[i] =
359 (Replacement == KernelToReplacement.end())
360 ? Missing
361 : getAddressesOfVariablesInKernel(
362 Ctx, Variables, LDSVarsToConstantGEP: Replacement->second.LDSVarsToConstantGEP);
363 }
364
365 Constant *init =
366 ConstantArray::get(T: AllKernelsOffsetsType, V: overallConstantExprElts);
367
368 return new GlobalVariable(
369 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
370 "llvm.amdgcn.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
371 AMDGPUAS::CONSTANT_ADDRESS);
372 }
373
374 void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder,
375 GlobalVariable *LookupTable,
376 GlobalVariable *GV, Use &U,
377 Value *OptionalIndex) {
378 // Table is a constant array of the same length as OrderedKernels
379 LLVMContext &Ctx = M.getContext();
380 Type *I32 = Type::getInt32Ty(C&: Ctx);
381 auto *I = cast<Instruction>(Val: U.getUser());
382
383 Value *tableKernelIndex = getTableLookupKernelIndex(M, F: I->getFunction());
384
385 if (auto *Phi = dyn_cast<PHINode>(Val: I)) {
386 BasicBlock *BB = Phi->getIncomingBlock(U);
387 Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
388 } else {
389 Builder.SetInsertPoint(I);
390 }
391
392 SmallVector<Value *, 3> GEPIdx = {
393 ConstantInt::get(Ty: I32, V: 0),
394 tableKernelIndex,
395 };
396 if (OptionalIndex)
397 GEPIdx.push_back(Elt: OptionalIndex);
398
399 Value *Address = Builder.CreateInBoundsGEP(
400 Ty: LookupTable->getValueType(), Ptr: LookupTable, IdxList: GEPIdx, Name: GV->getName());
401
402 Value *Loaded = Builder.CreateLoad(Ty: GV->getType(), Ptr: Address);
403 U.set(Loaded);
404 }
405
406 void replaceUsesInInstructionsWithTableLookup(
407 Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
408 GlobalVariable *LookupTable) {
409
410 LLVMContext &Ctx = M.getContext();
411 IRBuilder<> Builder(Ctx);
412 Type *I32 = Type::getInt32Ty(C&: Ctx);
413
414 for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) {
415 auto *GV = ModuleScopeVariables[Index];
416
417 for (Use &U : make_early_inc_range(Range: GV->uses())) {
418 auto *I = dyn_cast<Instruction>(Val: U.getUser());
419 if (!I)
420 continue;
421
422 replaceUseWithTableLookup(M, Builder, LookupTable, GV, U,
423 OptionalIndex: ConstantInt::get(Ty: I32, V: Index));
424 }
425 }
426 }
427
428 static DenseSet<Function *> kernelsThatIndirectlyAccessAnyOfPassedVariables(
429 Module &M, GVUsesInfoTy &LDSUsesInfo,
430 DenseSet<GlobalVariable *> const &VariableSet) {
431
432 DenseSet<Function *> KernelSet;
433
434 if (VariableSet.empty())
435 return KernelSet;
436
437 for (Function &Func : M.functions()) {
438 if (Func.isDeclaration() || !isKernel(F: Func))
439 continue;
440 for (GlobalVariable *GV : LDSUsesInfo.IndirectAccess[&Func]) {
441 if (VariableSet.contains(V: GV)) {
442 KernelSet.insert(V: &Func);
443 break;
444 }
445 }
446 }
447
448 return KernelSet;
449 }
450
451 static GlobalVariable *
452 chooseBestVariableForModuleStrategy(const DataLayout &DL,
453 VariableFunctionMap &LDSVars) {
454 // Find the global variable with the most indirect uses from kernels
455
456 struct CandidateTy {
457 GlobalVariable *GV = nullptr;
458 size_t UserCount = 0;
459 size_t Size = 0;
460
461 CandidateTy() = default;
462
463 CandidateTy(GlobalVariable *GV, uint64_t UserCount, uint64_t AllocSize)
464 : GV(GV), UserCount(UserCount), Size(AllocSize) {}
465
466 bool operator<(const CandidateTy &Other) const {
467 // Fewer users makes module scope variable less attractive
468 if (UserCount < Other.UserCount) {
469 return true;
470 }
471 if (UserCount > Other.UserCount) {
472 return false;
473 }
474
475 // Bigger makes module scope variable less attractive
476 if (Size < Other.Size) {
477 return false;
478 }
479
480 if (Size > Other.Size) {
481 return true;
482 }
483
484 // Arbitrary but consistent
485 return GV->getName() < Other.GV->getName();
486 }
487 };
488
489 CandidateTy MostUsed;
490
491 for (auto &K : LDSVars) {
492 GlobalVariable *GV = K.first;
493 if (K.second.size() <= 1) {
494 // A variable reachable by only one kernel is best lowered with kernel
495 // strategy
496 continue;
497 }
498 CandidateTy Candidate(GV, K.second.size(), GV->getGlobalSize(DL));
499 if (MostUsed < Candidate)
500 MostUsed = Candidate;
501 }
502
503 return MostUsed.GV;
504 }
505
506 static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
507 uint32_t Address) {
508 // Write the specified address into metadata where it can be retrieved by
509 // the assembler. Format is a half open range, [Address Address+1)
510 LLVMContext &Ctx = M->getContext();
511 auto *IntTy =
512 M->getDataLayout().getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
513 auto *MinC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address));
514 auto *MaxC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address + 1));
515 GV->setMetadata(KindID: LLVMContext::MD_absolute_symbol,
516 Node: MDNode::get(Context&: Ctx, MDs: {MinC, MaxC}));
517 }
518
519 DenseMap<Function *, Value *> tableKernelIndexCache;
520 Value *getTableLookupKernelIndex(Module &M, Function *F) {
521 // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
522 // lowers to a read from a live in register. Emit it once in the entry
523 // block to spare deduplicating it later.
524 auto [It, Inserted] = tableKernelIndexCache.try_emplace(Key: F);
525 if (Inserted) {
526 auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
527 IRBuilder<> Builder(&*InsertAt);
528
529 It->second = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_lds_kernel_id, Args: {});
530 }
531
532 return It->second;
533 }
534
535 static std::vector<Function *> assignLDSKernelIDToEachKernel(
536 Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS,
537 DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) {
538 // Associate kernels in the set with an arbitrary but reproducible order and
539 // annotate them with that order in metadata. This metadata is recognised by
540 // the backend and lowered to a SGPR which can be read from using
541 // amdgcn_lds_kernel_id.
542
543 std::vector<Function *> OrderedKernels;
544 if (!KernelsThatAllocateTableLDS.empty() ||
545 !KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
546
547 for (Function &Func : M->functions()) {
548 if (Func.isDeclaration())
549 continue;
550 if (!isKernel(F: Func))
551 continue;
552
553 if (KernelsThatAllocateTableLDS.contains(V: &Func) ||
554 KernelsThatIndirectlyAllocateDynamicLDS.contains(V: &Func)) {
555 assert(Func.hasName()); // else fatal error earlier
556 OrderedKernels.push_back(x: &Func);
557 }
558 }
559
560 // Put them in an arbitrary but reproducible order
561 OrderedKernels = sortByName(V: std::move(OrderedKernels));
562
563 // Annotate the kernels with their order in this vector
564 LLVMContext &Ctx = M->getContext();
565 IRBuilder<> Builder(Ctx);
566
567 if (OrderedKernels.size() > UINT32_MAX) {
568 // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
569 reportFatalUsageError(reason: "unimplemented LDS lowering for > 2**32 kernels");
570 }
571
572 for (size_t i = 0; i < OrderedKernels.size(); i++) {
573 Metadata *AttrMDArgs[1] = {
574 ConstantAsMetadata::get(C: Builder.getInt32(C: i)),
575 };
576 OrderedKernels[i]->setMetadata(Kind: "llvm.amdgcn.lds.kernel.id",
577 Node: MDNode::get(Context&: Ctx, MDs: AttrMDArgs));
578 }
579 }
580 return OrderedKernels;
581 }
582
583 static void partitionVariablesIntoIndirectStrategies(
584 Module &M, GVUsesInfoTy const &LDSUsesInfo,
585 VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
586 DenseSet<GlobalVariable *> &ModuleScopeVariables,
587 DenseSet<GlobalVariable *> &TableLookupVariables,
588 DenseSet<GlobalVariable *> &KernelAccessVariables,
589 DenseSet<GlobalVariable *> &DynamicVariables) {
590
591 GlobalVariable *HybridModuleRoot =
592 LoweringKindLoc != LoweringKind::hybrid
593 ? nullptr
594 : chooseBestVariableForModuleStrategy(
595 DL: M.getDataLayout(), LDSVars&: LDSToKernelsThatNeedToAccessItIndirectly);
596
597 DenseSet<Function *> const EmptySet;
598 DenseSet<Function *> const &HybridModuleRootKernels =
599 HybridModuleRoot
600 ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
601 : EmptySet;
602
603 for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
604 // Each iteration of this loop assigns exactly one global variable to
605 // exactly one of the implementation strategies.
606
607 GlobalVariable *GV = K.first;
608 assert(AMDGPU::isLDSVariableToLower(*GV));
609 assert(!K.second.empty());
610
611 if (AMDGPU::isDynamicLDS(GV: *GV)) {
612 DynamicVariables.insert(V: GV);
613 continue;
614 }
615
616 switch (LoweringKindLoc) {
617 case LoweringKind::module:
618 ModuleScopeVariables.insert(V: GV);
619 break;
620
621 case LoweringKind::table:
622 TableLookupVariables.insert(V: GV);
623 break;
624
625 case LoweringKind::kernel:
626 if (K.second.size() == 1) {
627 KernelAccessVariables.insert(V: GV);
628 } else {
629 // FIXME: This should use DiagnosticInfo
630 reportFatalUsageError(
631 reason: "cannot lower LDS '" + GV->getName() +
632 "' to kernel access as it is reachable from multiple kernels");
633 }
634 break;
635
636 case LoweringKind::hybrid: {
637 if (GV == HybridModuleRoot) {
638 assert(K.second.size() != 1);
639 ModuleScopeVariables.insert(V: GV);
640 } else if (K.second.size() == 1) {
641 KernelAccessVariables.insert(V: GV);
642 } else if (K.second == HybridModuleRootKernels) {
643 ModuleScopeVariables.insert(V: GV);
644 } else {
645 TableLookupVariables.insert(V: GV);
646 }
647 break;
648 }
649 }
650 }
651
652 // All LDS variables accessed indirectly have now been partitioned into
653 // the distinct lowering strategies.
654 assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
655 KernelAccessVariables.size() + DynamicVariables.size() ==
656 LDSToKernelsThatNeedToAccessItIndirectly.size());
657 }
658
659 static GlobalVariable *lowerModuleScopeStructVariables(
660 Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables,
661 DenseSet<Function *> const &KernelsThatAllocateModuleLDS) {
662 // Create a struct to hold the ModuleScopeVariables
663 // Replace all uses of those variables from non-kernel functions with the
664 // new struct instance Replace only the uses from kernel functions that will
665 // allocate this instance. That is a space optimisation - kernels that use a
666 // subset of the module scope struct and do not need to allocate it for
667 // indirect calls will only allocate the subset they use (they do so as part
668 // of the per-kernel lowering).
669 if (ModuleScopeVariables.empty()) {
670 return nullptr;
671 }
672
673 LLVMContext &Ctx = M.getContext();
674
675 LDSVariableReplacement ModuleScopeReplacement =
676 createLDSVariableReplacement(M, VarName: "llvm.amdgcn.module.lds",
677 LDSVarsToTransform: ModuleScopeVariables);
678
679 appendToCompilerUsed(M, Values: {static_cast<GlobalValue *>(
680 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
681 C: cast<Constant>(Val: ModuleScopeReplacement.SGV),
682 Ty: PointerType::getUnqual(C&: Ctx)))});
683
684 // module.lds will be allocated at zero in any kernel that allocates it
685 recordLDSAbsoluteAddress(M: &M, GV: ModuleScopeReplacement.SGV, Address: 0);
686
687 // historic
688 removeLocalVarsFromUsedLists(M, LocalVars: ModuleScopeVariables);
689
690 // Replace all uses of module scope variable from non-kernel functions
691 replaceLDSVariablesWithStruct(
692 M, LDSVarsToTransformArg: ModuleScopeVariables, Replacement: ModuleScopeReplacement, Predicate: [&](Use &U) {
693 Instruction *I = dyn_cast<Instruction>(Val: U.getUser());
694 if (!I) {
695 return false;
696 }
697 Function *F = I->getFunction();
698 return !isKernel(F: *F);
699 });
700
701 // Replace uses of module scope variable from kernel functions that
702 // allocate the module scope variable, otherwise leave them unchanged
703 // Record on each kernel whether the module scope global is used by it
704
705 for (Function &Func : M.functions()) {
706 if (Func.isDeclaration() || !isKernel(F: Func))
707 continue;
708
709 if (KernelsThatAllocateModuleLDS.contains(V: &Func)) {
710 replaceLDSVariablesWithStruct(
711 M, LDSVarsToTransformArg: ModuleScopeVariables, Replacement: ModuleScopeReplacement, Predicate: [&](Use &U) {
712 Instruction *I = dyn_cast<Instruction>(Val: U.getUser());
713 if (!I) {
714 return false;
715 }
716 Function *F = I->getFunction();
717 return F == &Func;
718 });
719
720 markUsedByKernel(Func: &Func, SGV: ModuleScopeReplacement.SGV);
721 }
722 }
723
724 return ModuleScopeReplacement.SGV;
725 }
726
727 static DenseMap<Function *, LDSVariableReplacement>
728 lowerKernelScopeStructVariables(
729 Module &M, GVUsesInfoTy &LDSUsesInfo,
730 DenseSet<GlobalVariable *> const &ModuleScopeVariables,
731 DenseSet<Function *> const &KernelsThatAllocateModuleLDS,
732 GlobalVariable *MaybeModuleScopeStruct) {
733
734 // Create a struct for each kernel for the non-module-scope variables.
735
736 DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
737 for (Function &Func : M.functions()) {
738 if (Func.isDeclaration() || !isKernel(F: Func))
739 continue;
740
741 DenseSet<GlobalVariable *> KernelUsedVariables;
742 // Allocating variables that are used directly in this struct to get
743 // alignment aware allocation and predictable frame size.
744 for (auto &v : LDSUsesInfo.DirectAccess[&Func]) {
745 if (!AMDGPU::isDynamicLDS(GV: *v)) {
746 KernelUsedVariables.insert(V: v);
747 }
748 }
749
750 // Allocating variables that are accessed indirectly so that a lookup of
751 // this struct instance can find them from nested functions.
752 for (auto &v : LDSUsesInfo.IndirectAccess[&Func]) {
753 if (!AMDGPU::isDynamicLDS(GV: *v)) {
754 KernelUsedVariables.insert(V: v);
755 }
756 }
757
758 // Variables allocated in module lds must all resolve to that struct,
759 // not to the per-kernel instance.
760 if (KernelsThatAllocateModuleLDS.contains(V: &Func)) {
761 for (GlobalVariable *v : ModuleScopeVariables) {
762 KernelUsedVariables.erase(V: v);
763 }
764 }
765
766 if (KernelUsedVariables.empty()) {
767 // Either used no LDS, or the LDS it used was all in the module struct
768 // or dynamically sized
769 continue;
770 }
771
772 // The association between kernel function and LDS struct is done by
773 // symbol name, which only works if the function in question has a
774 // name This is not expected to be a problem in practice as kernels
775 // are called by name making anonymous ones (which are named by the
776 // backend) difficult to use. This does mean that llvm test cases need
777 // to name the kernels.
778 if (!Func.hasName()) {
779 reportFatalUsageError(reason: "anonymous kernels cannot use LDS variables");
780 }
781
782 std::string VarName =
783 (Twine("llvm.amdgcn.kernel.") + Func.getName() + ".lds").str();
784
785 auto Replacement =
786 createLDSVariableReplacement(M, VarName, LDSVarsToTransform: KernelUsedVariables);
787
788 // If any indirect uses, create a direct use to ensure allocation
789 // TODO: Simpler to unconditionally mark used but that regresses
790 // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll
791 auto Accesses = LDSUsesInfo.IndirectAccess.find(Val: &Func);
792 if ((Accesses != LDSUsesInfo.IndirectAccess.end()) &&
793 !Accesses->second.empty())
794 markUsedByKernel(Func: &Func, SGV: Replacement.SGV);
795
796 // remove preserves existing codegen
797 removeLocalVarsFromUsedLists(M, LocalVars: KernelUsedVariables);
798 KernelToReplacement[&Func] = Replacement;
799
800 // Rewrite uses within kernel to the new struct
801 replaceLDSVariablesWithStruct(
802 M, LDSVarsToTransformArg: KernelUsedVariables, Replacement, Predicate: [&Func](Use &U) {
803 Instruction *I = dyn_cast<Instruction>(Val: U.getUser());
804 return I && I->getFunction() == &Func;
805 });
806 }
807 return KernelToReplacement;
808 }
809
810 static GlobalVariable *
811 buildRepresentativeDynamicLDSInstance(Module &M, GVUsesInfoTy &LDSUsesInfo,
812 Function *func) {
813 // Create a dynamic lds variable with a name associated with the passed
814 // function that has the maximum alignment of any dynamic lds variable
815 // reachable from this kernel. Dynamic LDS is allocated after the static LDS
816 // allocation, possibly after alignment padding. The representative variable
817 // created here has the maximum alignment of any other dynamic variable
818 // reachable by that kernel. All dynamic LDS variables are allocated at the
819 // same address in each kernel in order to provide the documented aliasing
820 // semantics. Setting the alignment here allows this IR pass to accurately
821 // predict the exact constant at which it will be allocated.
822
823 assert(isKernel(*func));
824
825 LLVMContext &Ctx = M.getContext();
826 const DataLayout &DL = M.getDataLayout();
827 Align MaxDynamicAlignment(1);
828
829 auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) {
830 if (AMDGPU::isDynamicLDS(GV: *GV)) {
831 MaxDynamicAlignment =
832 std::max(a: MaxDynamicAlignment, b: AMDGPU::getAlign(DL, GV));
833 }
834 };
835
836 for (GlobalVariable *GV : LDSUsesInfo.IndirectAccess[func]) {
837 UpdateMaxAlignment(GV);
838 }
839
840 for (GlobalVariable *GV : LDSUsesInfo.DirectAccess[func]) {
841 UpdateMaxAlignment(GV);
842 }
843
844 assert(func->hasName()); // Checked by caller
845 auto *emptyCharArray = ArrayType::get(ElementType: Type::getInt8Ty(C&: Ctx), NumElements: 0);
846 GlobalVariable *N = new GlobalVariable(
847 M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
848 Twine("llvm.amdgcn." + func->getName() + ".dynlds"), nullptr,
849 GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
850 N->setAlignment(MaxDynamicAlignment);
851
852 assert(AMDGPU::isDynamicLDS(*N));
853 return N;
854 }
855
856 DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
857 Module &M, GVUsesInfoTy &LDSUsesInfo,
858 DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
859 DenseSet<GlobalVariable *> const &DynamicVariables,
860 std::vector<Function *> const &OrderedKernels) {
861 DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS;
862 if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
863 LLVMContext &Ctx = M.getContext();
864 IRBuilder<> Builder(Ctx);
865 Type *LocalPtrTy = PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
866
867 std::vector<Constant *> newDynamicLDS;
868
869 // Table is built in the same order as OrderedKernels
870 for (auto &func : OrderedKernels) {
871
872 if (KernelsThatIndirectlyAllocateDynamicLDS.contains(V: func)) {
873 assert(isKernel(*func));
874 if (!func->hasName()) {
875 reportFatalUsageError(reason: "anonymous kernels cannot use LDS variables");
876 }
877
878 GlobalVariable *N =
879 buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func);
880
881 KernelToCreatedDynamicLDS[func] = N;
882
883 markUsedByKernel(Func: func, SGV: N);
884
885 newDynamicLDS.push_back(x: N);
886 } else {
887 newDynamicLDS.push_back(x: PoisonValue::get(T: LocalPtrTy));
888 }
889 }
890 assert(OrderedKernels.size() == newDynamicLDS.size());
891
892 ArrayType *t = ArrayType::get(ElementType: LocalPtrTy, NumElements: newDynamicLDS.size());
893 Constant *init = ConstantArray::get(T: t, V: newDynamicLDS);
894 GlobalVariable *table = new GlobalVariable(
895 M, t, true, GlobalValue::InternalLinkage, init,
896 "llvm.amdgcn.dynlds.offset.table", nullptr,
897 GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS);
898
899 for (GlobalVariable *GV : DynamicVariables) {
900 for (Use &U : make_early_inc_range(Range: GV->uses())) {
901 auto *I = dyn_cast<Instruction>(Val: U.getUser());
902 if (!I)
903 continue;
904 if (isKernel(F: *I->getFunction()))
905 continue;
906
907 replaceUseWithTableLookup(M, Builder, LookupTable: table, GV, U, OptionalIndex: nullptr);
908 }
909 }
910 }
911 return KernelToCreatedDynamicLDS;
912 }
913
914 // Per-TU mode for link-time LDS resolution. Instead of computing a global
915 // layout, create per-function LDS struct declarations so the linker can
916 // assign offsets across TUs.
917 bool runOnModuleLinkTime(Module &M) {
918 bool Changed = superAlignLDSGlobals(M);
919 Changed |=
920 eliminateGVConstantExprUsesFromAllInstructions(M, Filter: isLDSVariableToLower);
921
922 CallGraph CG(M);
923 FunctionVariableMap KernelLDSUses, FunctionLDSUses;
924 getUsesOfGVByFunction(CG, M, Filter: isLDSVariableToLower, Kernels&: KernelLDSUses,
925 Functions&: FunctionLDSUses);
926
927 if (KernelLDSUses.empty() && FunctionLDSUses.empty())
928 return Changed;
929
930 std::string ModuleId = getUniqueModuleId(M: &M);
931 assert(!ModuleId.empty() &&
932 "modules with LDS variables should have a unique ID");
933
934 FunctionVariableMap AllLDSUses;
935 for (auto &[F, Vars] : KernelLDSUses)
936 AllLDSUses[F].insert(I: Vars.begin(), E: Vars.end());
937 for (auto &[F, Vars] : FunctionLDSUses)
938 AllLDSUses[F].insert(I: Vars.begin(), E: Vars.end());
939
940 // Named barriers are handled by AMDGPULowerExecSync; filter them out.
941 for (auto &[F, Vars] : AllLDSUses) {
942 SmallVector<GlobalVariable *> Barriers;
943 for (GlobalVariable *V : Vars)
944 if (AMDGPU::isNamedBarrier(GV: *V))
945 Barriers.push_back(Elt: V);
946 for (GlobalVariable *V : Barriers)
947 Vars.erase(V);
948 }
949
950 // Build reverse map: LDS variable -> functions that use it.
951 DenseMap<GlobalVariable *, SmallVector<Function *, 4>> VarToFuncs;
952 for (auto &[F, Vars] : AllLDSUses) {
953 for (GlobalVariable *V : Vars)
954 VarToFuncs[V].push_back(Elt: F);
955 }
956
957 // A variable is function-scope iff it has local linkage and exactly one
958 // user function. Everything else is global-scope and must remain as a
959 // standalone external declaration so the linker can assign a single shared
960 // offset.
961 DenseSet<GlobalVariable *> GlobalScopeVars;
962 DenseSet<GlobalVariable *> InternalMultiUserVars;
963 for (auto &[V, Funcs] : VarToFuncs) {
964 if (!V->hasLocalLinkage() || Funcs.size() > 1) {
965 GlobalScopeVars.insert(V);
966 if (V->hasLocalLinkage())
967 InternalMultiUserVars.insert(V);
968 }
969 }
970
971 // Wrap function-scope LDS into per-function structs (unchanged logic,
972 // but global-scope variables are excluded from the set).
973 SmallVector<std::pair<Function *, GlobalVariable *>, 4> FuncToLdsStruct;
974 DenseSet<GlobalVariable *> AllReplacedVars;
975 for (auto &KV : AllLDSUses) {
976 Function *F = KV.first;
977 DenseSet<GlobalVariable *> FuncScopeVars;
978 for (GlobalVariable *V : KV.second) {
979 if (!GlobalScopeVars.count(V))
980 FuncScopeVars.insert(V);
981 }
982
983 if (FuncScopeVars.empty())
984 continue;
985
986 std::string StructName =
987 F->hasLocalLinkage()
988 ? ("__amdgpu_lds." + F->getName() + ModuleId).str()
989 : ("__amdgpu_lds." + F->getName()).str();
990 LDSVariableReplacement Replacement =
991 createLDSVariableReplacement(M, VarName: StructName, LDSVarsToTransform: FuncScopeVars);
992
993 GlobalVariable *SGV = Replacement.SGV;
994 SGV->setLinkage(GlobalValue::ExternalLinkage);
995 SGV->setInitializer(nullptr);
996 FuncToLdsStruct.push_back(Elt: {F, SGV});
997
998 replaceLDSVariablesWithStruct(
999 M, LDSVarsToTransformArg: FuncScopeVars, Replacement, Predicate: [F](const Use &U) {
1000 auto *I = dyn_cast<Instruction>(Val: U.getUser());
1001 return I && I->getFunction() == F;
1002 });
1003
1004 AllReplacedVars.insert(I: FuncScopeVars.begin(), E: FuncScopeVars.end());
1005 }
1006
1007 // Internal-linkage LDS variables used by multiple functions would collide
1008 // across TUs if promoted individually to external linkage (same name in
1009 // different TUs). Pack them into a single per-module struct with a
1010 // module-unique name so the linker treats them as one allocation unit.
1011 if (!InternalMultiUserVars.empty()) {
1012 std::string StructName = "__amdgpu_lds.__internal" + ModuleId;
1013 LDSVariableReplacement Replacement =
1014 createLDSVariableReplacement(M, VarName: StructName, LDSVarsToTransform: InternalMultiUserVars);
1015
1016 GlobalVariable *SGV = Replacement.SGV;
1017 SGV->setLinkage(GlobalValue::ExternalLinkage);
1018 SGV->setInitializer(nullptr);
1019
1020 replaceLDSVariablesWithStruct(
1021 M, LDSVarsToTransformArg: InternalMultiUserVars, Replacement,
1022 Predicate: [](const Use &U) { return isa<Instruction>(Val: U.getUser()); });
1023
1024 DenseSet<Function *> FuncsUsingInternalVars;
1025 for (GlobalVariable *V : InternalMultiUserVars) {
1026 for (Function *F : VarToFuncs[V])
1027 FuncsUsingInternalVars.insert(V: F);
1028 }
1029 for (Function *F : FuncsUsingInternalVars)
1030 FuncToLdsStruct.push_back(Elt: {F, SGV});
1031
1032 AllReplacedVars.insert(I: InternalMultiUserVars.begin(),
1033 E: InternalMultiUserVars.end());
1034 }
1035
1036 // Convert global-scope LDS to external declarations. Their uses remain
1037 // intact and ISel generates R_AMDGPU_ABS32_LO relocations for them.
1038 for (GlobalVariable *V : GlobalScopeVars) {
1039 V->setInitializer(nullptr);
1040 V->setLinkage(GlobalValue::ExternalLinkage);
1041 }
1042
1043 // Emit amdgpu.lds.uses metadata for struct and global-scope LDS.
1044 {
1045 LLVMContext &Ctx = M.getContext();
1046 NamedMDNode *LdsMD = M.getOrInsertNamedMetadata(Name: "amdgpu.lds.uses");
1047
1048 for (auto &[F, SGV] : FuncToLdsStruct)
1049 LdsMD->addOperand(M: MDNode::get(
1050 Context&: Ctx, MDs: {ValueAsMetadata::get(V: F), ValueAsMetadata::get(V: SGV)}));
1051
1052 for (auto &[V, Funcs] : VarToFuncs) {
1053 if (GlobalScopeVars.count(V) && !InternalMultiUserVars.count(V)) {
1054 for (Function *F : Funcs) {
1055 LdsMD->addOperand(M: MDNode::get(
1056 Context&: Ctx, MDs: {ValueAsMetadata::get(V: F), ValueAsMetadata::get(V)}));
1057 }
1058 }
1059 }
1060 }
1061
1062 DenseSet<GlobalVariable *> AllLDSVarsForCleanup = AllReplacedVars;
1063 AllLDSVarsForCleanup.insert(I: GlobalScopeVars.begin(), E: GlobalScopeVars.end());
1064 removeLocalVarsFromUsedLists(M, LocalVars: AllLDSVarsForCleanup);
1065 for (GlobalVariable *GV : AllReplacedVars) {
1066 GV->removeDeadConstantUsers();
1067 if (GV->use_empty())
1068 GV->eraseFromParent();
1069 }
1070
1071 return true;
1072 }
1073
1074 bool runOnModule(Module &M) {
1075 if (AMDGPUTargetMachine::EnableObjectLinking)
1076 return runOnModuleLinkTime(M);
1077 return runOnModuleNormal(M);
1078 }
1079
1080 bool runOnModuleNormal(Module &M) {
1081 CallGraph CG = CallGraph(M);
1082 bool Changed = superAlignLDSGlobals(M);
1083
1084 Changed |=
1085 eliminateGVConstantExprUsesFromAllInstructions(M, Filter: isLDSVariableToLower);
1086
1087 Changed = true; // todo: narrow this down
1088
1089 // For each kernel, what variables does it access directly or through
1090 // callees
1091 GVUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDSForLowering(CG, M);
1092
1093 // For each variable accessed through callees, which kernels access it
1094 VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
1095 for (auto &K : LDSUsesInfo.IndirectAccess) {
1096 Function *F = K.first;
1097 assert(isKernel(*F));
1098 for (GlobalVariable *GV : K.second) {
1099 LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(V: F);
1100 }
1101 }
1102
1103 // Partition variables accessed indirectly into the different strategies
1104 DenseSet<GlobalVariable *> ModuleScopeVariables;
1105 DenseSet<GlobalVariable *> TableLookupVariables;
1106 DenseSet<GlobalVariable *> KernelAccessVariables;
1107 DenseSet<GlobalVariable *> DynamicVariables;
1108 partitionVariablesIntoIndirectStrategies(
1109 M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
1110 ModuleScopeVariables, TableLookupVariables, KernelAccessVariables,
1111 DynamicVariables);
1112
1113 // If the kernel accesses a variable that is going to be stored in the
1114 // module instance through a call then that kernel needs to allocate the
1115 // module instance
1116 const DenseSet<Function *> KernelsThatAllocateModuleLDS =
1117 kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
1118 VariableSet: ModuleScopeVariables);
1119 const DenseSet<Function *> KernelsThatAllocateTableLDS =
1120 kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
1121 VariableSet: TableLookupVariables);
1122
1123 const DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS =
1124 kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
1125 VariableSet: DynamicVariables);
1126
1127 GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables(
1128 M, ModuleScopeVariables, KernelsThatAllocateModuleLDS);
1129
1130 DenseMap<Function *, LDSVariableReplacement> KernelToReplacement =
1131 lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables,
1132 KernelsThatAllocateModuleLDS,
1133 MaybeModuleScopeStruct);
1134
1135 // Lower zero cost accesses to the kernel instances just created
1136 for (auto &GV : KernelAccessVariables) {
1137 auto &funcs = LDSToKernelsThatNeedToAccessItIndirectly[GV];
1138 assert(funcs.size() == 1); // Only one kernel can access it
1139 LDSVariableReplacement Replacement =
1140 KernelToReplacement[*(funcs.begin())];
1141
1142 DenseSet<GlobalVariable *> Vec;
1143 Vec.insert(V: GV);
1144
1145 replaceLDSVariablesWithStruct(M, LDSVarsToTransformArg: Vec, Replacement, Predicate: [](Use &U) {
1146 return isa<Instruction>(Val: U.getUser());
1147 });
1148 }
1149
1150 // The ith element of this vector is kernel id i
1151 std::vector<Function *> OrderedKernels =
1152 assignLDSKernelIDToEachKernel(M: &M, KernelsThatAllocateTableLDS,
1153 KernelsThatIndirectlyAllocateDynamicLDS);
1154
1155 if (!KernelsThatAllocateTableLDS.empty()) {
1156 LLVMContext &Ctx = M.getContext();
1157 IRBuilder<> Builder(Ctx);
1158
1159 // The order must be consistent between lookup table and accesses to
1160 // lookup table
1161 auto TableLookupVariablesOrdered =
1162 sortByName(V: std::vector<GlobalVariable *>(TableLookupVariables.begin(),
1163 TableLookupVariables.end()));
1164
1165 GlobalVariable *LookupTable = buildLookupTable(
1166 M, Variables: TableLookupVariablesOrdered, kernels: OrderedKernels, KernelToReplacement);
1167 replaceUsesInInstructionsWithTableLookup(M, ModuleScopeVariables: TableLookupVariablesOrdered,
1168 LookupTable);
1169 }
1170
1171 DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
1172 lowerDynamicLDSVariables(M, LDSUsesInfo,
1173 KernelsThatIndirectlyAllocateDynamicLDS,
1174 DynamicVariables, OrderedKernels);
1175
1176 // Strip amdgpu-no-lds-kernel-id from all functions reachable from the
1177 // kernel. We may have inferred this wasn't used prior to the pass.
1178 // TODO: We could filter out subgraphs that do not access LDS globals.
1179 for (auto *KernelSet : {&KernelsThatIndirectlyAllocateDynamicLDS,
1180 &KernelsThatAllocateTableLDS})
1181 for (Function *F : *KernelSet)
1182 removeFnAttrFromReachable(CG, KernelRoot: F, FnAttrs: {"amdgpu-no-lds-kernel-id"});
1183
1184 // All kernel frames have been allocated. Calculate and record the
1185 // addresses.
1186 {
1187 const DataLayout &DL = M.getDataLayout();
1188
1189 for (Function &Func : M.functions()) {
1190 if (Func.isDeclaration() || !isKernel(F: Func))
1191 continue;
1192
1193 // All three of these are optional. The first variable is allocated at
1194 // zero. They are allocated by AMDGPUMachineFunctionInfo as one block.
1195 // Layout:
1196 //{
1197 // module.lds
1198 // alignment padding
1199 // kernel instance
1200 // alignment padding
1201 // dynamic lds variables
1202 //}
1203
1204 const bool AllocateModuleScopeStruct =
1205 MaybeModuleScopeStruct &&
1206 KernelsThatAllocateModuleLDS.contains(V: &Func);
1207
1208 auto Replacement = KernelToReplacement.find(Val: &Func);
1209 const bool AllocateKernelScopeStruct =
1210 Replacement != KernelToReplacement.end();
1211
1212 const bool AllocateDynamicVariable =
1213 KernelToCreatedDynamicLDS.contains(Val: &Func);
1214
1215 uint32_t Offset = 0;
1216
1217 if (AllocateModuleScopeStruct) {
1218 // Allocated at zero, recorded once on construction, not once per
1219 // kernel
1220 Offset += MaybeModuleScopeStruct->getGlobalSize(DL);
1221 }
1222
1223 if (AllocateKernelScopeStruct) {
1224 GlobalVariable *KernelStruct = Replacement->second.SGV;
1225 Offset = alignTo(Size: Offset, A: AMDGPU::getAlign(DL, GV: KernelStruct));
1226 recordLDSAbsoluteAddress(M: &M, GV: KernelStruct, Address: Offset);
1227 Offset += KernelStruct->getGlobalSize(DL);
1228 }
1229
1230 // If there is dynamic allocation, the alignment needed is included in
1231 // the static frame size. There may be no reference to the dynamic
1232 // variable in the kernel itself, so without including it here, that
1233 // alignment padding could be missed.
1234 if (AllocateDynamicVariable) {
1235 GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
1236 Offset = alignTo(Size: Offset, A: AMDGPU::getAlign(DL, GV: DynamicVariable));
1237 recordLDSAbsoluteAddress(M: &M, GV: DynamicVariable, Address: Offset);
1238 }
1239
1240 if (Offset != 0) {
1241 (void)TM; // TODO: Account for target maximum LDS
1242 std::string Buffer;
1243 raw_string_ostream SS{Buffer};
1244 SS << format(Fmt: "%u", Vals: Offset);
1245
1246 // Instead of explicitly marking kernels that access dynamic variables
1247 // using special case metadata, annotate with min-lds == max-lds, i.e.
1248 // that there is no more space available for allocating more static
1249 // LDS variables. That is the right condition to prevent allocating
1250 // more variables which would collide with the addresses assigned to
1251 // dynamic variables.
1252 if (AllocateDynamicVariable)
1253 SS << format(Fmt: ",%u", Vals: Offset);
1254
1255 Func.addFnAttr(Kind: "amdgpu-lds-size", Val: Buffer);
1256 }
1257 }
1258 }
1259
1260 for (auto &GV : make_early_inc_range(Range: M.globals()))
1261 if (AMDGPU::isLDSVariableToLower(GV)) {
1262 // probably want to remove from used lists
1263 GV.removeDeadConstantUsers();
1264 if (GV.use_empty())
1265 GV.eraseFromParent();
1266 }
1267
1268 return Changed;
1269 }
1270
1271private:
1272 // Increase the alignment of LDS globals if necessary to maximise the chance
1273 // that we can use aligned LDS instructions to access them.
1274 static bool superAlignLDSGlobals(Module &M) {
1275 const DataLayout &DL = M.getDataLayout();
1276 bool Changed = false;
1277 if (!SuperAlignLDSGlobals) {
1278 return Changed;
1279 }
1280
1281 for (auto &GV : M.globals()) {
1282 if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
1283 // Only changing alignment of LDS variables
1284 continue;
1285 }
1286 if (!GV.hasInitializer()) {
1287 // cuda/hip extern __shared__ variable, leave alignment alone
1288 continue;
1289 }
1290
1291 if (GV.isAbsoluteSymbolRef()) {
1292 // If the variable is already allocated, don't change the alignment
1293 continue;
1294 }
1295
1296 Align Alignment = AMDGPU::getAlign(DL, GV: &GV);
1297 uint64_t GVSize = GV.getGlobalSize(DL);
1298
1299 if (GVSize > 8) {
1300 // We might want to use a b96 or b128 load/store
1301 Alignment = std::max(a: Alignment, b: Align(16));
1302 } else if (GVSize > 4) {
1303 // We might want to use a b64 load/store
1304 Alignment = std::max(a: Alignment, b: Align(8));
1305 } else if (GVSize > 2) {
1306 // We might want to use a b32 load/store
1307 Alignment = std::max(a: Alignment, b: Align(4));
1308 } else if (GVSize > 1) {
1309 // We might want to use a b16 load/store
1310 Alignment = std::max(a: Alignment, b: Align(2));
1311 }
1312
1313 if (Alignment != AMDGPU::getAlign(DL, GV: &GV)) {
1314 Changed = true;
1315 GV.setAlignment(Alignment);
1316 }
1317 }
1318 return Changed;
1319 }
1320
1321 static LDSVariableReplacement createLDSVariableReplacement(
1322 Module &M, std::string VarName,
1323 DenseSet<GlobalVariable *> const &LDSVarsToTransform) {
1324 // Create a struct instance containing LDSVarsToTransform and map from those
1325 // variables to ConstantExprGEP
1326 // Variables may be introduced to meet alignment requirements. No aliasing
1327 // metadata is useful for these as they have no uses. Erased before return.
1328
1329 LLVMContext &Ctx = M.getContext();
1330 const DataLayout &DL = M.getDataLayout();
1331 assert(!LDSVarsToTransform.empty());
1332
1333 SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
1334 LayoutFields.reserve(N: LDSVarsToTransform.size());
1335 {
1336 // The order of fields in this struct depends on the order of
1337 // variables in the argument which varies when changing how they
1338 // are identified, leading to spurious test breakage.
1339 auto Sorted = sortByName(V: std::vector<GlobalVariable *>(
1340 LDSVarsToTransform.begin(), LDSVarsToTransform.end()));
1341
1342 for (GlobalVariable *GV : Sorted) {
1343 OptimizedStructLayoutField F(GV, GV->getGlobalSize(DL),
1344 AMDGPU::getAlign(DL, GV));
1345 LayoutFields.emplace_back(Args&: F);
1346 }
1347 }
1348
1349 performOptimizedStructLayout(Fields: LayoutFields);
1350
1351 std::vector<GlobalVariable *> LocalVars;
1352 BitVector IsPaddingField;
1353 LocalVars.reserve(n: LDSVarsToTransform.size()); // will be at least this large
1354 IsPaddingField.reserve(N: LDSVarsToTransform.size());
1355 {
1356 uint64_t CurrentOffset = 0;
1357 for (auto &F : LayoutFields) {
1358 GlobalVariable *FGV =
1359 static_cast<GlobalVariable *>(const_cast<void *>(F.Id));
1360 Align DataAlign = F.Alignment;
1361
1362 uint64_t DataAlignV = DataAlign.value();
1363 if (uint64_t Rem = CurrentOffset % DataAlignV) {
1364 uint64_t Padding = DataAlignV - Rem;
1365
1366 // Append an array of padding bytes to meet alignment requested
1367 // Note (o + (a - (o % a)) ) % a == 0
1368 // (offset + Padding ) % align == 0
1369
1370 Type *ATy = ArrayType::get(ElementType: Type::getInt8Ty(C&: Ctx), NumElements: Padding);
1371 LocalVars.push_back(x: new GlobalVariable(
1372 M, ATy, false, GlobalValue::InternalLinkage,
1373 PoisonValue::get(T: ATy), "", nullptr, GlobalValue::NotThreadLocal,
1374 AMDGPUAS::LOCAL_ADDRESS, false));
1375 IsPaddingField.push_back(Val: true);
1376 CurrentOffset += Padding;
1377 }
1378
1379 LocalVars.push_back(x: FGV);
1380 IsPaddingField.push_back(Val: false);
1381 CurrentOffset += F.Size;
1382 }
1383 }
1384
1385 std::vector<Type *> LocalVarTypes;
1386 LocalVarTypes.reserve(n: LocalVars.size());
1387 std::transform(
1388 first: LocalVars.cbegin(), last: LocalVars.cend(), result: std::back_inserter(x&: LocalVarTypes),
1389 unary_op: [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
1390
1391 StructType *LDSTy = StructType::create(Context&: Ctx, Elements: LocalVarTypes, Name: VarName + ".t");
1392
1393 Align StructAlign = AMDGPU::getAlign(DL, GV: LocalVars[0]);
1394
1395 GlobalVariable *SGV = new GlobalVariable(
1396 M, LDSTy, false, GlobalValue::InternalLinkage, PoisonValue::get(T: LDSTy),
1397 VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
1398 false);
1399 SGV->setAlignment(StructAlign);
1400
1401 DenseMap<GlobalVariable *, Constant *> Map;
1402 Type *I32 = Type::getInt32Ty(C&: Ctx);
1403 for (size_t I = 0; I < LocalVars.size(); I++) {
1404 GlobalVariable *GV = LocalVars[I];
1405 Constant *GEPIdx[] = {ConstantInt::get(Ty: I32, V: 0), ConstantInt::get(Ty: I32, V: I)};
1406 Constant *GEP = ConstantExpr::getGetElementPtr(Ty: LDSTy, C: SGV, IdxList: GEPIdx, NW: true);
1407 if (IsPaddingField[I]) {
1408 assert(GV->use_empty());
1409 GV->eraseFromParent();
1410 } else {
1411 Map[GV] = GEP;
1412 }
1413 }
1414 assert(Map.size() == LDSVarsToTransform.size());
1415 return {.SGV: SGV, .LDSVarsToConstantGEP: std::move(Map)};
1416 }
1417
1418 template <typename PredicateTy>
1419 static void replaceLDSVariablesWithStruct(
1420 Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
1421 const LDSVariableReplacement &Replacement, PredicateTy Predicate) {
1422 LLVMContext &Ctx = M.getContext();
1423 const DataLayout &DL = M.getDataLayout();
1424
1425 // A hack... we need to insert the aliasing info in a predictable order for
1426 // lit tests. Would like to have them in a stable order already, ideally the
1427 // same order they get allocated, which might mean an ordered set container
1428 auto LDSVarsToTransform = sortByName(V: std::vector<GlobalVariable *>(
1429 LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end()));
1430
1431 // Create alias.scope and their lists. Each field in the new structure
1432 // does not alias with all other fields.
1433 SmallVector<MDNode *> AliasScopes;
1434 SmallVector<Metadata *> NoAliasList;
1435 const size_t NumberVars = LDSVarsToTransform.size();
1436 if (NumberVars > 1) {
1437 MDBuilder MDB(Ctx);
1438 AliasScopes.reserve(N: NumberVars);
1439 MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
1440 for (size_t I = 0; I < NumberVars; I++) {
1441 MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
1442 AliasScopes.push_back(Elt: Scope);
1443 }
1444 NoAliasList.append(in_start: &AliasScopes[1], in_end: AliasScopes.end());
1445 }
1446
1447 // Replace uses of ith variable with a constantexpr to the corresponding
1448 // field of the instance that will be allocated by AMDGPUMachineFunctionInfo
1449 for (size_t I = 0; I < NumberVars; I++) {
1450 GlobalVariable *GV = LDSVarsToTransform[I];
1451 Constant *GEP = Replacement.LDSVarsToConstantGEP.at(Val: GV);
1452
1453 GV->replaceUsesWithIf(New: GEP, ShouldReplace: Predicate);
1454
1455 APInt APOff(DL.getIndexTypeSizeInBits(Ty: GEP->getType()), 0);
1456 GEP->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: APOff);
1457 uint64_t Offset = APOff.getZExtValue();
1458
1459 Align A =
1460 commonAlignment(A: Replacement.SGV->getAlign().valueOrOne(), Offset);
1461
1462 if (I)
1463 NoAliasList[I - 1] = AliasScopes[I - 1];
1464 MDNode *NoAlias =
1465 NoAliasList.empty() ? nullptr : MDNode::get(Context&: Ctx, MDs: NoAliasList);
1466 MDNode *AliasScope =
1467 AliasScopes.empty() ? nullptr : MDNode::get(Context&: Ctx, MDs: {AliasScopes[I]});
1468
1469 refineUsesAlignmentAndAA(Ptr: GEP, A, DL, AliasScope, NoAlias);
1470 }
1471 }
1472
1473 static void refineUsesAlignmentAndAA(Value *Ptr, Align A,
1474 const DataLayout &DL, MDNode *AliasScope,
1475 MDNode *NoAlias, unsigned MaxDepth = 5) {
1476 if (!MaxDepth || (A == 1 && !AliasScope))
1477 return;
1478
1479 ScopedNoAliasAAResult ScopedNoAlias;
1480
1481 for (User *U : Ptr->users()) {
1482 if (auto *I = dyn_cast<Instruction>(Val: U)) {
1483 if (AliasScope && I->mayReadOrWriteMemory()) {
1484 MDNode *AS = I->getMetadata(KindID: LLVMContext::MD_alias_scope);
1485 AS = (AS ? MDNode::getMostGenericAliasScope(A: AS, B: AliasScope)
1486 : AliasScope);
1487 I->setMetadata(KindID: LLVMContext::MD_alias_scope, Node: AS);
1488
1489 MDNode *NA = I->getMetadata(KindID: LLVMContext::MD_noalias);
1490
1491 // Scoped aliases can originate from two different domains.
1492 // First domain would be from LDS domain (created by this pass).
1493 // All entries (LDS vars) into LDS struct will have same domain.
1494
1495 // Second domain could be existing scoped aliases that are the
1496 // results of noalias params and subsequent optimizations that
1497 // may alter thesse sets.
1498
1499 // We need to be careful how we create new alias sets, and
1500 // have right scopes and domains for loads/stores of these new
1501 // LDS variables. We intersect NoAlias set if alias sets belong
1502 // to the same domain. This is the case if we have memcpy using
1503 // LDS variables. Both src and dst of memcpy would belong to
1504 // LDS struct, they donot alias.
1505 // On the other hand, if one of the domains is LDS and other is
1506 // existing domain prior to LDS, we need to have a union of all
1507 // these aliases set to preserve existing aliasing information.
1508
1509 SmallPtrSet<const MDNode *, 16> ExistingDomains, LDSDomains;
1510 ScopedNoAlias.collectScopedDomains(NoAlias: NA, Domains&: ExistingDomains);
1511 ScopedNoAlias.collectScopedDomains(NoAlias, Domains&: LDSDomains);
1512 auto Intersection = set_intersection(S1: ExistingDomains, S2: LDSDomains);
1513 if (Intersection.empty()) {
1514 NA = NA ? MDNode::concatenate(A: NA, B: NoAlias) : NoAlias;
1515 } else {
1516 NA = NA ? MDNode::intersect(A: NA, B: NoAlias) : NoAlias;
1517 }
1518 I->setMetadata(KindID: LLVMContext::MD_noalias, Node: NA);
1519 }
1520 }
1521
1522 if (auto *LI = dyn_cast<LoadInst>(Val: U)) {
1523 LI->setAlignment(std::max(a: A, b: LI->getAlign()));
1524 continue;
1525 }
1526 if (auto *SI = dyn_cast<StoreInst>(Val: U)) {
1527 if (SI->getPointerOperand() == Ptr)
1528 SI->setAlignment(std::max(a: A, b: SI->getAlign()));
1529 continue;
1530 }
1531 if (auto *AI = dyn_cast<AtomicRMWInst>(Val: U)) {
1532 // None of atomicrmw operations can work on pointers, but let's
1533 // check it anyway in case it will or we will process ConstantExpr.
1534 if (AI->getPointerOperand() == Ptr)
1535 AI->setAlignment(std::max(a: A, b: AI->getAlign()));
1536 continue;
1537 }
1538 if (auto *AI = dyn_cast<AtomicCmpXchgInst>(Val: U)) {
1539 if (AI->getPointerOperand() == Ptr)
1540 AI->setAlignment(std::max(a: A, b: AI->getAlign()));
1541 continue;
1542 }
1543 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: U)) {
1544 unsigned BitWidth = DL.getIndexTypeSizeInBits(Ty: GEP->getType());
1545 APInt Off(BitWidth, 0);
1546 if (GEP->getPointerOperand() == Ptr) {
1547 Align GA;
1548 if (GEP->accumulateConstantOffset(DL, Offset&: Off))
1549 GA = commonAlignment(A, Offset: Off.getLimitedValue());
1550 refineUsesAlignmentAndAA(Ptr: GEP, A: GA, DL, AliasScope, NoAlias,
1551 MaxDepth: MaxDepth - 1);
1552 }
1553 continue;
1554 }
1555 if (auto *I = dyn_cast<Instruction>(Val: U)) {
1556 if (I->getOpcode() == Instruction::BitCast ||
1557 I->getOpcode() == Instruction::AddrSpaceCast)
1558 refineUsesAlignmentAndAA(Ptr: I, A, DL, AliasScope, NoAlias, MaxDepth: MaxDepth - 1);
1559 }
1560 }
1561 }
1562};
1563
1564class AMDGPULowerModuleLDSLegacy : public ModulePass {
1565public:
1566 const AMDGPUTargetMachine *TM;
1567 static char ID;
1568
1569 AMDGPULowerModuleLDSLegacy(const AMDGPUTargetMachine *TM = nullptr)
1570 : ModulePass(ID), TM(TM) {}
1571
1572 void getAnalysisUsage(AnalysisUsage &AU) const override {
1573 if (!TM)
1574 AU.addRequired<TargetPassConfig>();
1575 }
1576
1577 bool runOnModule(Module &M) override {
1578 if (!TM) {
1579 auto &TPC = getAnalysis<TargetPassConfig>();
1580 TM = &TPC.getTM<AMDGPUTargetMachine>();
1581 }
1582
1583 return AMDGPULowerModuleLDS(*TM).runOnModule(M);
1584 }
1585};
1586
1587} // namespace
1588char AMDGPULowerModuleLDSLegacy::ID = 0;
1589
1590char &llvm::AMDGPULowerModuleLDSLegacyPassID = AMDGPULowerModuleLDSLegacy::ID;
1591
1592INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE,
1593 "Lower uses of LDS variables from non-kernel functions",
1594 false, false)
1595INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
1596INITIALIZE_PASS_END(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE,
1597 "Lower uses of LDS variables from non-kernel functions",
1598 false, false)
1599
1600ModulePass *
1601llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) {
1602 return new AMDGPULowerModuleLDSLegacy(TM);
1603}
1604
1605PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
1606 ModuleAnalysisManager &) {
1607 return AMDGPULowerModuleLDS(TM).runOnModule(M) ? PreservedAnalyses::none()
1608 : PreservedAnalyses::all();
1609}
1610