1//===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
// This pass lowers the local data store, LDS, uses in kernel and non-kernel
// functions in module to use dynamically allocated global memory.
// Packed LDS Layout is emulated in the global memory.
// The lowered memory instructions from LDS to global memory are then
// instrumented for address sanitizer, to catch addressing errors.
// This pass only works when address sanitizer has been enabled and has
// instrumented the IR. It identifies that IR has been instrumented using
// "nosanitize_address" module flag.
17//
// Replacement of Kernel LDS accesses:
// For a kernel, LDS access can be static or dynamic which are direct
// (accessed within kernel) and indirect (accessed through non-kernels).
// All these LDS accesses corresponding to kernel will be packed together,
// where all static LDS accesses will be allocated first and then dynamic
// LDS follows. The total size with alignment is calculated. A new LDS global
// will be created for the kernel called "SW LDS" and it will have the
// attribute "amdgpu-lds-size" attached with value of the size calculated.
// All the LDS accesses in the module will be replaced by GEP with offset
// into the "SW LDS".
// A new "llvm.amdgcn.<kernel>.dynlds" is created per kernel accessing
// the dynamic LDS. This will be marked used by kernel and will have
// MD_absolute_symbol metadata set to total static LDS size, since dynamic
// LDS allocation starts after all static LDS allocation.
32//
33// A device global memory equal to the total LDS size will be allocated.
34// At the prologue of the kernel, a single work-item from the
35// work-group, does a "malloc" and stores the pointer of the
36// allocation in "SW LDS".
37//
// To store the offsets corresponding to all LDS accesses, another global
// variable is created which will be called "SW LDS metadata" in this pass.
// - SW LDS Global:
// It is LDS global of ptr type with name
// "llvm.amdgcn.sw.lds.<kernel-name>".
// - Metadata Global:
// It is of struct type, with n members. n equals the number of LDS
// globals accessed by the kernel (direct and indirect). Each member of
// struct is another struct of type {i32, i32, i32}. First member
// corresponds to offset, second member corresponds to size of LDS global
// being replaced and third represents the total aligned size. It will
// have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
// an initializer with static LDS related offsets and sizes initialized.
// But for dynamic LDS related entries, offsets will be initialized to
// previous static LDS allocation end offset. Sizes for them will be zero
// initially. These dynamic LDS offset and size values will be updated
// within the kernel, since kernel can read the dynamic LDS size
// allocation done at runtime with query to "hidden_dynamic_lds_size"
// hidden kernel argument.
57//
58// At the epilogue of kernel, allocated memory would be made free by the same
59// single work-item.
60//
// Replacement of non-kernel LDS accesses:
// Multiple kernels can access the same non-kernel function.
// All the kernels accessing LDS through non-kernels are sorted and
// assigned a kernel-id. All the LDS globals accessed by non-kernels
// are sorted. This information is used to build two tables:
// - Base table:
// Base table will have single row, with elements of the row
// placed as per kernel ID. Each element in the row corresponds
// to ptr of "SW LDS" variable created for that kernel.
// - Offset table:
// Offset table will have multiple rows and columns.
// Rows are assumed to be from 0 to (n-1). n is total number
// of kernels accessing the LDS through non-kernels.
// Each row will have m elements. m is the total number of
// unique LDS globals accessed by all non-kernels.
// Each element in the row corresponds to the ptr of
// the replacement of LDS global done by that particular kernel.
// An LDS variable in non-kernel will be replaced based on the information
// from base and offset tables. Based on kernel-id query, ptr of "SW
// LDS" for that corresponding kernel is obtained from base table.
// The offset into the base "SW LDS" is obtained from
// corresponding element in offset table. With this information, replacement
// value is obtained.
84//===----------------------------------------------------------------------===//
85
86#include "AMDGPU.h"
87#include "AMDGPUAsanInstrumentation.h"
88#include "AMDGPUMemoryUtils.h"
89#include "AMDGPUTargetMachine.h"
90#include "llvm/ADT/DenseMap.h"
91#include "llvm/ADT/DenseSet.h"
92#include "llvm/ADT/SetVector.h"
93#include "llvm/ADT/StringExtras.h"
94#include "llvm/ADT/StringRef.h"
95#include "llvm/Analysis/CallGraph.h"
96#include "llvm/Analysis/DomTreeUpdater.h"
97#include "llvm/CodeGen/TargetPassConfig.h"
98#include "llvm/IR/Constants.h"
99#include "llvm/IR/DIBuilder.h"
100#include "llvm/IR/DebugInfo.h"
101#include "llvm/IR/DebugInfoMetadata.h"
102#include "llvm/IR/IRBuilder.h"
103#include "llvm/IR/Instructions.h"
104#include "llvm/IR/IntrinsicsAMDGPU.h"
105#include "llvm/IR/MDBuilder.h"
106#include "llvm/IR/ReplaceConstant.h"
107#include "llvm/Pass.h"
108#include "llvm/Support/raw_ostream.h"
109#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
110#include "llvm/Transforms/Utils/ModuleUtils.h"
111
112#include <algorithm>
113
114#define DEBUG_TYPE "amdgpu-sw-lower-lds"
115#define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15
116
117using namespace llvm;
118using namespace AMDGPU;
119
120namespace {
121
122cl::opt<bool>
123 AsanInstrumentLDS("amdgpu-asan-instrument-lds",
124 cl::desc("Run asan instrumentation on LDS instructions "
125 "lowered to global memory"),
126 cl::init(Val: true), cl::Hidden);
127
128using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
129
130struct LDSAccessTypeInfo {
131 SetVector<GlobalVariable *> StaticLDSGlobals;
132 SetVector<GlobalVariable *> DynamicLDSGlobals;
133};
134
135// Struct to hold all the Metadata required for a kernel
136// to replace a LDS global uses with corresponding offset
137// in to device global memory.
138struct KernelLDSParameters {
139 GlobalVariable *SwLDS = nullptr;
140 GlobalVariable *SwDynLDS = nullptr;
141 GlobalVariable *SwLDSMetadata = nullptr;
142 LDSAccessTypeInfo DirectAccess;
143 LDSAccessTypeInfo IndirectAccess;
144 DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>>
145 LDSToReplacementIndicesMap;
146 uint32_t MallocSize = 0;
147 uint32_t LDSSize = 0;
148 SmallVector<std::pair<uint32_t, uint32_t>, 64> RedzoneOffsetAndSizeVector;
149};
150
151// Struct to store information for creation of offset table
152// for all the non-kernel LDS accesses.
153struct NonKernelLDSParameters {
154 GlobalVariable *LDSBaseTable = nullptr;
155 GlobalVariable *LDSOffsetTable = nullptr;
156 SetVector<Function *> OrderedKernels;
157 SetVector<GlobalVariable *> OrdereLDSGlobals;
158};
159
160struct AsanInstrumentInfo {
161 int Scale = 0;
162 uint32_t Offset = 0;
163 SetVector<Instruction *> Instructions;
164};
165
166struct FunctionsAndLDSAccess {
167 DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
168 SetVector<Function *> KernelsWithIndirectLDSAccess;
169 SetVector<Function *> NonKernelsWithLDSArgument;
170 SetVector<GlobalVariable *> AllNonKernelLDSAccess;
171 FunctionVariableMap NonKernelToLDSAccessMap;
172};
173
174class AMDGPUSwLowerLDS {
175public:
176 AMDGPUSwLowerLDS(Module &Mod, const AMDGPUTargetMachine &TM,
177 DomTreeCallback Callback)
178 : M(Mod), AMDGPUTM(TM), IRB(M.getContext()), DTCallback(Callback) {}
179 bool run();
180 void getUsesOfLDSByNonKernels();
181 void getNonKernelsWithLDSArguments(const CallGraph &CG);
182 SetVector<Function *>
183 getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &Kernels);
184 SetVector<GlobalVariable *>
185 getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &Variables);
186 void buildSwLDSGlobal(Function *Func);
187 void buildSwDynLDSGlobal(Function *Func);
188 void populateSwMetadataGlobal(Function *Func);
189 void populateSwLDSAttributeAndMetadata(Function *Func);
190 void populateLDSToReplacementIndicesMap(Function *Func);
191 void getLDSMemoryInstructions(Function *Func,
192 SetVector<Instruction *> &LDSInstructions);
193 void replaceKernelLDSAccesses(Function *Func);
194 Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr);
195 void translateLDSMemoryOperationsToGlobalMemory(
196 Function *Func, Value *LoadMallocPtr,
197 SetVector<Instruction *> &LDSInstructions);
198 void poisonRedzones(Function *Func, Value *MallocPtr);
199 void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
200 void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
201 void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
202 Constant *
203 getAddressesOfVariablesInKernel(Function *Func,
204 SetVector<GlobalVariable *> &Variables);
205 void lowerNonKernelLDSAccesses(Function *Func,
206 SetVector<GlobalVariable *> &LDSGlobals,
207 NonKernelLDSParameters &NKLDSParams);
208 void
209 updateMallocSizeForDynamicLDS(Function *Func, Value **CurrMallocSize,
210 Value *HiddenDynLDSSize,
211 SetVector<GlobalVariable *> &DynamicLDSGlobals);
212 void initAsanInfo();
213
214private:
215 Module &M;
216 const AMDGPUTargetMachine &AMDGPUTM;
217 IRBuilder<> IRB;
218 DomTreeCallback DTCallback;
219 FunctionsAndLDSAccess FuncLDSAccessInfo;
220 AsanInstrumentInfo AsanInfo;
221};
222
223template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
224 // Sort the vector of globals or Functions based on their name.
225 // Returns a SetVector of globals/Functions.
226 sort(V, [](const auto *L, const auto *R) {
227 return L->getName() < R->getName();
228 });
229 return {SetVector<T>(llvm::from_range, V)};
230}
231
232SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
233 SetVector<GlobalVariable *> &Variables) {
234 // Sort all the non-kernel LDS accesses based on their name.
235 return sortByName(
236 V: std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
237}
238
239SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
240 SetVector<Function *> &Kernels) {
241 // Sort the non-kernels accessing LDS based on their name.
242 // Also assign a kernel ID metadata based on the sorted order.
243 LLVMContext &Ctx = M.getContext();
244 if (Kernels.size() > UINT32_MAX) {
245 report_fatal_error(reason: "Unimplemented SW LDS lowering for > 2**32 kernels");
246 }
247 SetVector<Function *> OrderedKernels =
248 sortByName(V: std::vector<Function *>(Kernels.begin(), Kernels.end()));
249 for (size_t i = 0; i < Kernels.size(); i++) {
250 Metadata *AttrMDArgs[1] = {
251 ConstantAsMetadata::get(C: IRB.getInt32(C: i)),
252 };
253 Function *Func = OrderedKernels[i];
254 Func->setMetadata(Kind: "llvm.amdgcn.lds.kernel.id",
255 Node: MDNode::get(Context&: Ctx, MDs: AttrMDArgs));
256 }
257 return OrderedKernels;
258}
259
260void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) {
261 // Among the kernels accessing LDS, get list of
262 // Non-kernels to which a call is made and a ptr
263 // to addrspace(3) is passed as argument.
264 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
265 Function *Func = K.first;
266 const CallGraphNode *CGN = CG[Func];
267 if (!CGN)
268 continue;
269 for (auto &I : *CGN) {
270 CallGraphNode *CallerCGN = I.second;
271 Function *CalledFunc = CallerCGN->getFunction();
272 if (!CalledFunc || CalledFunc->isDeclaration())
273 continue;
274 if (AMDGPU::isKernelLDS(F: CalledFunc))
275 continue;
276 for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end();
277 AI != E; ++AI) {
278 Type *ArgTy = (*AI).getType();
279 if (!ArgTy->isPointerTy())
280 continue;
281 if (ArgTy->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
282 continue;
283 FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(X: CalledFunc);
284 // Also add the Calling function to KernelsWithIndirectLDSAccess list
285 // so that base table of LDS is generated.
286 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(X: Func);
287 }
288 }
289 }
290}
291
292void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
293 for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) {
294 if (!AMDGPU::isLDSVariableToLower(GV: *GV))
295 continue;
296
297 for (User *V : GV->users()) {
298 if (auto *I = dyn_cast<Instruction>(Val: V)) {
299 Function *F = I->getFunction();
300 if (!isKernelLDS(F) && !F->isDeclaration())
301 FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(V: GV);
302 }
303 }
304 }
305}
306
307static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
308 uint32_t Address) {
309 // Write the specified address into metadata where it can be retrieved by
310 // the assembler. Format is a half open range, [Address Address+1)
311 LLVMContext &Ctx = M.getContext();
312 auto *IntTy = M.getDataLayout().getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
313 MDBuilder MDB(Ctx);
314 MDNode *MetadataNode = MDB.createRange(Lo: ConstantInt::get(Ty: IntTy, V: Address),
315 Hi: ConstantInt::get(Ty: IntTy, V: Address + 1));
316 GV->setMetadata(KindID: LLVMContext::MD_absolute_symbol, Node: MetadataNode);
317}
318
319static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
320 bool IsDynLDS) {
321 if (Offset != 0) {
322 std::string Buffer;
323 raw_string_ostream SS{Buffer};
324 SS << Offset;
325 if (IsDynLDS)
326 SS << "," << Offset;
327 Func->addFnAttr(Kind: "amdgpu-lds-size", Val: Buffer);
328 }
329}
330
331static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
332 BasicBlock *Entry = &Func->getEntryBlock();
333 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
334
335 Function *Decl = Intrinsic::getOrInsertDeclaration(M: Func->getParent(),
336 id: Intrinsic::donothing, Tys: {});
337
338 Value *UseInstance[1] = {
339 Builder.CreateConstInBoundsGEP1_32(Ty: SGV->getValueType(), Ptr: SGV, Idx0: 0)};
340
341 Builder.CreateCall(Callee: Decl, Args: {},
342 OpBundles: {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
343}
344
345void AMDGPUSwLowerLDS::buildSwLDSGlobal(Function *Func) {
346 // Create new LDS global required for each kernel to store
347 // device global memory pointer.
348 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
349 // Create new global pointer variable
350 LDSParams.SwLDS = new GlobalVariable(
351 M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
352 PoisonValue::get(T: IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
353 nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
354 GlobalValue::SanitizerMetadata MD;
355 MD.NoAddress = true;
356 LDSParams.SwLDS->setSanitizerMetadata(MD);
357}
358
359void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(Function *Func) {
360 // Create new Dyn LDS global if kernel accesses dyn LDS.
361 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
362 if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
363 LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
364 return;
365 // Create new global pointer variable
366 auto *emptyCharArray = ArrayType::get(ElementType: IRB.getInt8Ty(), NumElements: 0);
367 LDSParams.SwDynLDS = new GlobalVariable(
368 M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
369 "llvm.amdgcn." + Func->getName() + ".dynlds", nullptr,
370 GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
371 markUsedByKernel(Func, SGV: LDSParams.SwDynLDS);
372 GlobalValue::SanitizerMetadata MD;
373 MD.NoAddress = true;
374 LDSParams.SwDynLDS->setSanitizerMetadata(MD);
375}
376
377void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
378 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
379 bool IsDynLDSUsed = LDSParams.SwDynLDS;
380 uint32_t Offset = LDSParams.LDSSize;
381 recordLDSAbsoluteAddress(M, GV: LDSParams.SwLDS, Address: 0);
382 addLDSSizeAttribute(Func, Offset, IsDynLDS: IsDynLDSUsed);
383 if (LDSParams.SwDynLDS)
384 recordLDSAbsoluteAddress(M, GV: LDSParams.SwDynLDS, Address: Offset);
385}
386
387void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
388 // Create new metadata global for every kernel and initialize the
389 // start offsets and sizes corresponding to each LDS accesses.
390 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
391 auto &Ctx = M.getContext();
392 auto &DL = M.getDataLayout();
393 std::vector<Type *> Items;
394 Type *Int32Ty = IRB.getInt32Ty();
395 std::vector<Constant *> Initializers;
396 Align MaxAlignment(1);
397 auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
398 Align GVAlign = AMDGPU::getAlign(DL, GV);
399 MaxAlignment = std::max(a: MaxAlignment, b: GVAlign);
400 };
401
402 for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
403 UpdateMaxAlignment(GV);
404
405 for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
406 UpdateMaxAlignment(GV);
407
408 for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
409 UpdateMaxAlignment(GV);
410
411 for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
412 UpdateMaxAlignment(GV);
413
414 //{StartOffset, AlignedSizeInBytes}
415 SmallString<128> MDItemStr;
416 raw_svector_ostream MDItemOS(MDItemStr);
417 MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.item";
418
419 StructType *LDSItemTy =
420 StructType::create(Context&: Ctx, Elements: {Int32Ty, Int32Ty, Int32Ty}, Name: MDItemOS.str());
421 uint32_t &MallocSize = LDSParams.MallocSize;
422 SetVector<GlobalVariable *> UniqueLDSGlobals;
423 int AsanScale = AsanInfo.Scale;
424 auto buildInitializerForSwLDSMD =
425 [&](SetVector<GlobalVariable *> &LDSGlobals) {
426 for (auto &GV : LDSGlobals) {
427 if (is_contained(Range&: UniqueLDSGlobals, Element: GV))
428 continue;
429 UniqueLDSGlobals.insert(X: GV);
430
431 Type *Ty = GV->getValueType();
432 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
433 Items.push_back(x: LDSItemTy);
434 Constant *ItemStartOffset = ConstantInt::get(Ty: Int32Ty, V: MallocSize);
435 Constant *SizeInBytesConst = ConstantInt::get(Ty: Int32Ty, V: SizeInBytes);
436 // Get redzone size corresponding a size.
437 const uint64_t RightRedzoneSize =
438 AMDGPU::getRedzoneSizeForGlobal(Scale: AsanScale, SizeInBytes);
439 // Update MallocSize with current size and redzone size.
440 MallocSize += SizeInBytes;
441 if (!AMDGPU::isDynamicLDS(GV: *GV))
442 LDSParams.RedzoneOffsetAndSizeVector.emplace_back(Args&: MallocSize,
443 Args: RightRedzoneSize);
444 MallocSize += RightRedzoneSize;
445 // Align current size plus redzone.
446 uint64_t AlignedSize =
447 alignTo(Size: SizeInBytes + RightRedzoneSize, A: MaxAlignment);
448 Constant *AlignedSizeInBytesConst =
449 ConstantInt::get(Ty: Int32Ty, V: AlignedSize);
450 // Align MallocSize
451 MallocSize = alignTo(Size: MallocSize, A: MaxAlignment);
452 Constant *InitItem =
453 ConstantStruct::get(T: LDSItemTy, V: {ItemStartOffset, SizeInBytesConst,
454 AlignedSizeInBytesConst});
455 Initializers.push_back(x: InitItem);
456 }
457 };
458 SetVector<GlobalVariable *> SwLDSVector;
459 SwLDSVector.insert(X: LDSParams.SwLDS);
460 buildInitializerForSwLDSMD(SwLDSVector);
461 buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
462 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
463 buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
464 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);
465
466 // Update the LDS size used by the kernel.
467 Type *Ty = LDSParams.SwLDS->getValueType();
468 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
469 uint64_t AlignedSize = alignTo(Size: SizeInBytes, A: MaxAlignment);
470 LDSParams.LDSSize = AlignedSize;
471 SmallString<128> MDTypeStr;
472 raw_svector_ostream MDTypeOS(MDTypeStr);
473 MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.type";
474 StructType *MetadataStructType =
475 StructType::create(Context&: Ctx, Elements: Items, Name: MDTypeOS.str());
476 SmallString<128> MDStr;
477 raw_svector_ostream MDOS(MDStr);
478 MDOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md";
479 LDSParams.SwLDSMetadata = new GlobalVariable(
480 M, MetadataStructType, false, GlobalValue::InternalLinkage,
481 PoisonValue::get(T: MetadataStructType), MDOS.str(), nullptr,
482 GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false);
483 Constant *data = ConstantStruct::get(T: MetadataStructType, V: Initializers);
484 LDSParams.SwLDSMetadata->setInitializer(data);
485 assert(LDSParams.SwLDS);
486 // Set the alignment to MaxAlignment for SwLDS.
487 LDSParams.SwLDS->setAlignment(MaxAlignment);
488 if (LDSParams.SwDynLDS)
489 LDSParams.SwDynLDS->setAlignment(MaxAlignment);
490 GlobalValue::SanitizerMetadata MD;
491 MD.NoAddress = true;
492 LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
493}
494
495void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) {
496 // Fill the corresponding LDS replacement indices for each LDS access
497 // related to this kernel.
498 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
499 SetVector<GlobalVariable *> UniqueLDSGlobals;
500 auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
501 uint32_t &Idx) {
502 for (auto &GV : LDSGlobals) {
503 if (is_contained(Range&: UniqueLDSGlobals, Element: GV))
504 continue;
505 UniqueLDSGlobals.insert(X: GV);
506 LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
507 ++Idx;
508 }
509 };
510 uint32_t Idx = 0;
511 SetVector<GlobalVariable *> SwLDSVector;
512 SwLDSVector.insert(X: LDSParams.SwLDS);
513 PopulateIndices(SwLDSVector, Idx);
514 PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx);
515 PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx);
516 PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx);
517 PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx);
518}
519
520static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
521 Value *Replacement) {
522 // Replace all uses of LDS global in this Function with a Replacement.
523 auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
524 auto *V = U.getUser();
525 if (auto *Inst = dyn_cast<Instruction>(Val: V)) {
526 auto *Func1 = Inst->getParent()->getParent();
527 if (Func == Func1)
528 return true;
529 }
530 return false;
531 };
532 GV->replaceUsesWithIf(New: Replacement, ShouldReplace: ReplaceUsesLambda);
533}
534
535void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
536 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
537 GlobalVariable *SwLDS = LDSParams.SwLDS;
538 assert(SwLDS);
539 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
540 assert(SwLDSMetadata);
541 StructType *SwLDSMetadataStructType =
542 cast<StructType>(Val: SwLDSMetadata->getValueType());
543 Type *Int32Ty = IRB.getInt32Ty();
544 auto &IndirectAccess = LDSParams.IndirectAccess;
545 auto &DirectAccess = LDSParams.DirectAccess;
546 // Replace all uses of LDS global in this Function with a Replacement.
547 SetVector<GlobalVariable *> UniqueLDSGlobals;
548 auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
549 for (auto &GV : LDSGlobals) {
550 // Do not generate instructions if LDS access is in non-kernel
551 // i.e indirect-access.
552 if ((IndirectAccess.StaticLDSGlobals.contains(key: GV) ||
553 IndirectAccess.DynamicLDSGlobals.contains(key: GV)) &&
554 (!DirectAccess.StaticLDSGlobals.contains(key: GV) &&
555 !DirectAccess.DynamicLDSGlobals.contains(key: GV)))
556 continue;
557 if (is_contained(Range&: UniqueLDSGlobals, Element: GV))
558 continue;
559 UniqueLDSGlobals.insert(X: GV);
560 auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
561 assert(Indices.size() == 3);
562 Constant *GEPIdx[] = {ConstantInt::get(Ty: Int32Ty, V: Indices[0]),
563 ConstantInt::get(Ty: Int32Ty, V: Indices[1]),
564 ConstantInt::get(Ty: Int32Ty, V: Indices[2])};
565 Constant *GEP = ConstantExpr::getGetElementPtr(
566 Ty: SwLDSMetadataStructType, C: SwLDSMetadata, IdxList: GEPIdx, NW: true);
567 Value *Offset = IRB.CreateLoad(Ty: Int32Ty, Ptr: GEP);
568 Value *BasePlusOffset =
569 IRB.CreateInBoundsGEP(Ty: IRB.getInt8Ty(), Ptr: SwLDS, IdxList: {Offset});
570 LLVM_DEBUG(GV->printAsOperand(dbgs() << "Sw LDS Lowering, Replacing LDS ",
571 false));
572 replacesUsesOfGlobalInFunction(Func, GV, Replacement: BasePlusOffset);
573 }
574 };
575 ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
576 ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
577 ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
578 ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
579}
580
581void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS(
582 Function *Func, Value **CurrMallocSize, Value *HiddenDynLDSSize,
583 SetVector<GlobalVariable *> &DynamicLDSGlobals) {
584 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
585 Type *Int32Ty = IRB.getInt32Ty();
586
587 GlobalVariable *SwLDS = LDSParams.SwLDS;
588 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
589 assert(SwLDS && SwLDSMetadata);
590 StructType *MetadataStructType =
591 cast<StructType>(Val: SwLDSMetadata->getValueType());
592 unsigned MaxAlignment = SwLDS->getAlignment();
593 Value *MaxAlignValue = IRB.getInt32(C: MaxAlignment);
594 Value *MaxAlignValueMinusOne = IRB.getInt32(C: MaxAlignment - 1);
595
596 for (GlobalVariable *DynGV : DynamicLDSGlobals) {
597 auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
598 // Update the Offset metadata.
599 Constant *Index0 = ConstantInt::get(Ty: Int32Ty, V: 0);
600 Constant *Index1 = ConstantInt::get(Ty: Int32Ty, V: Indices[1]);
601
602 Constant *Index2Offset = ConstantInt::get(Ty: Int32Ty, V: 0);
603 auto *GEPForOffset = IRB.CreateInBoundsGEP(
604 Ty: MetadataStructType, Ptr: SwLDSMetadata, IdxList: {Index0, Index1, Index2Offset});
605
606 IRB.CreateStore(Val: *CurrMallocSize, Ptr: GEPForOffset);
607 // Update the size and Aligned Size metadata.
608 Constant *Index2Size = ConstantInt::get(Ty: Int32Ty, V: 1);
609 auto *GEPForSize = IRB.CreateInBoundsGEP(Ty: MetadataStructType, Ptr: SwLDSMetadata,
610 IdxList: {Index0, Index1, Index2Size});
611
612 Value *CurrDynLDSSize = IRB.CreateLoad(Ty: Int32Ty, Ptr: HiddenDynLDSSize);
613 IRB.CreateStore(Val: CurrDynLDSSize, Ptr: GEPForSize);
614 Constant *Index2AlignedSize = ConstantInt::get(Ty: Int32Ty, V: 2);
615 auto *GEPForAlignedSize = IRB.CreateInBoundsGEP(
616 Ty: MetadataStructType, Ptr: SwLDSMetadata, IdxList: {Index0, Index1, Index2AlignedSize});
617
618 Value *AlignedDynLDSSize =
619 IRB.CreateAdd(LHS: CurrDynLDSSize, RHS: MaxAlignValueMinusOne);
620 AlignedDynLDSSize = IRB.CreateUDiv(LHS: AlignedDynLDSSize, RHS: MaxAlignValue);
621 AlignedDynLDSSize = IRB.CreateMul(LHS: AlignedDynLDSSize, RHS: MaxAlignValue);
622 IRB.CreateStore(Val: AlignedDynLDSSize, Ptr: GEPForAlignedSize);
623
624 // Update the Current Malloc Size
625 *CurrMallocSize = IRB.CreateAdd(LHS: *CurrMallocSize, RHS: AlignedDynLDSSize);
626 }
627}
628
629static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore,
630 DISubprogram *SP) {
631 assert(InsertBefore);
632 if (InsertBefore->getDebugLoc())
633 return InsertBefore->getDebugLoc();
634 if (SP)
635 return DILocation::get(Context&: SP->getContext(), Line: SP->getLine(), Column: 1, Scope: SP);
636 return DebugLoc();
637}
638
639void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
640 Function *Func, SetVector<Instruction *> &LDSInstructions) {
641 for (BasicBlock &BB : *Func) {
642 for (Instruction &Inst : BB) {
643 if (LoadInst *LI = dyn_cast<LoadInst>(Val: &Inst)) {
644 if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
645 LDSInstructions.insert(X: &Inst);
646 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: &Inst)) {
647 if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
648 LDSInstructions.insert(X: &Inst);
649 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: &Inst)) {
650 if (RMW->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
651 LDSInstructions.insert(X: &Inst);
652 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Val: &Inst)) {
653 if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
654 LDSInstructions.insert(X: &Inst);
655 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Val: &Inst)) {
656 if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
657 ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS)
658 LDSInstructions.insert(X: &Inst);
659 } else
660 continue;
661 }
662 }
663}
664
665Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr,
666 Value *LDSPtr) {
667 assert(LDSPtr && "Invalid LDS pointer operand");
668 Type *LDSPtrType = LDSPtr->getType();
669 LLVMContext &Ctx = M.getContext();
670 const DataLayout &DL = M.getDataLayout();
671 Type *IntTy = DL.getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
672 if (auto *VecPtrTy = dyn_cast<VectorType>(Val: LDSPtrType)) {
673 // Handle vector of pointers
674 ElementCount NumElements = VecPtrTy->getElementCount();
675 IntTy = VectorType::get(ElementType: IntTy, EC: NumElements);
676 }
677 Value *GepIndex = IRB.CreatePtrToInt(V: LDSPtr, DestTy: IntTy);
678 return IRB.CreateInBoundsGEP(Ty: IRB.getInt8Ty(), Ptr: LoadMallocPtr, IdxList: {GepIndex});
679}
680
681void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
682 Function *Func, Value *LoadMallocPtr,
683 SetVector<Instruction *> &LDSInstructions) {
684 LLVM_DEBUG(dbgs() << "Translating LDS memory operations to global memory : "
685 << Func->getName());
686 for (Instruction *Inst : LDSInstructions) {
687 IRB.SetInsertPoint(Inst);
688 if (LoadInst *LI = dyn_cast<LoadInst>(Val: Inst)) {
689 Value *LIOperand = LI->getPointerOperand();
690 Value *Replacement =
691 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: LIOperand);
692 LoadInst *NewLI = IRB.CreateAlignedLoad(Ty: LI->getType(), Ptr: Replacement,
693 Align: LI->getAlign(), isVolatile: LI->isVolatile());
694 NewLI->setAtomic(Ordering: LI->getOrdering(), SSID: LI->getSyncScopeID());
695 AsanInfo.Instructions.insert(X: NewLI);
696 LI->replaceAllUsesWith(V: NewLI);
697 LI->eraseFromParent();
698 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: Inst)) {
699 Value *SIOperand = SI->getPointerOperand();
700 Value *Replacement =
701 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: SIOperand);
702 StoreInst *NewSI = IRB.CreateAlignedStore(
703 Val: SI->getValueOperand(), Ptr: Replacement, Align: SI->getAlign(), isVolatile: SI->isVolatile());
704 NewSI->setAtomic(Ordering: SI->getOrdering(), SSID: SI->getSyncScopeID());
705 AsanInfo.Instructions.insert(X: NewSI);
706 SI->replaceAllUsesWith(V: NewSI);
707 SI->eraseFromParent();
708 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: Inst)) {
709 Value *RMWPtrOperand = RMW->getPointerOperand();
710 Value *RMWValOperand = RMW->getValOperand();
711 Value *Replacement =
712 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: RMWPtrOperand);
713 AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW(
714 Op: RMW->getOperation(), Ptr: Replacement, Val: RMWValOperand, Align: RMW->getAlign(),
715 Ordering: RMW->getOrdering(), SSID: RMW->getSyncScopeID());
716 NewRMW->setVolatile(RMW->isVolatile());
717 AsanInfo.Instructions.insert(X: NewRMW);
718 RMW->replaceAllUsesWith(V: NewRMW);
719 RMW->eraseFromParent();
720 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Val: Inst)) {
721 Value *XCHGPtrOperand = XCHG->getPointerOperand();
722 Value *Replacement =
723 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: XCHGPtrOperand);
724 AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg(
725 Ptr: Replacement, Cmp: XCHG->getCompareOperand(), New: XCHG->getNewValOperand(),
726 Align: XCHG->getAlign(), SuccessOrdering: XCHG->getSuccessOrdering(),
727 FailureOrdering: XCHG->getFailureOrdering(), SSID: XCHG->getSyncScopeID());
728 NewXCHG->setVolatile(XCHG->isVolatile());
729 AsanInfo.Instructions.insert(X: NewXCHG);
730 XCHG->replaceAllUsesWith(V: NewXCHG);
731 XCHG->eraseFromParent();
732 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Val: Inst)) {
733 Value *AIOperand = ASC->getPointerOperand();
734 Value *Replacement =
735 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: AIOperand);
736 Value *NewAI = IRB.CreateAddrSpaceCast(V: Replacement, DestTy: ASC->getType());
737 // Note: No need to add the instruction to AsanInfo instructions to be
738 // instrumented list. FLAT_ADDRESS ptr would have been already
739 // instrumented by asan pass prior to this pass.
740 ASC->replaceAllUsesWith(V: NewAI);
741 ASC->eraseFromParent();
742 } else
743 report_fatal_error(reason: "Unimplemented LDS lowering instruction");
744 }
745}
746
747void AMDGPUSwLowerLDS::poisonRedzones(Function *Func, Value *MallocPtr) {
748 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
749 Type *Int64Ty = IRB.getInt64Ty();
750 Type *VoidTy = IRB.getVoidTy();
751 FunctionCallee AsanPoisonRegion = M.getOrInsertFunction(
752 Name: "__asan_poison_region",
753 T: FunctionType::get(Result: VoidTy, Params: {Int64Ty, Int64Ty}, isVarArg: false));
754
755 auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector;
756 size_t VecSize = RedzonesVec.size();
757 for (unsigned i = 0; i < VecSize; i++) {
758 auto &RedzonePair = RedzonesVec[i];
759 uint64_t RedzoneOffset = RedzonePair.first;
760 uint64_t RedzoneSize = RedzonePair.second;
761 Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP(
762 Ty: IRB.getInt8Ty(), Ptr: MallocPtr, IdxList: {IRB.getInt64(C: RedzoneOffset)});
763 Value *RedzoneAddress = IRB.CreatePtrToInt(V: RedzoneAddrOffset, DestTy: Int64Ty);
764 IRB.CreateCall(Callee: AsanPoisonRegion,
765 Args: {RedzoneAddress, IRB.getInt64(C: RedzoneSize)});
766 }
767}
768
/// Rewrite the LDS accesses of \p Func so that they go through memory obtained
/// from `__asan_malloc_impl`, and restructure the function's control flow
/// around that allocation.
///
/// Blocks created or modified here:
///   WId       - new entry; tests whether the work item id is {0,0,0}.
///   Malloc    - reached only when that test is true; computes the allocation
///               size, calls `__asan_malloc_impl`, stores the result to the
///               "SW LDS" global and poisons the redzones.
///   <previous entry block>
///             - gets a PHI recording which path was taken, an `s_barrier`,
///               and a load of the malloc pointer from the "SW LDS" global.
///   CondFree  - every `ret` in the function is replaced by a branch here;
///               `s_barrier`, then branch on the PHI.
///   Free      - calls `__asan_free_impl` on the loaded malloc pointer.
///   End       - `ret void`.
///
/// \param Func Function whose entry in `KernelToLDSParametersMap` is used.
/// \param DTU  Dominator tree updater that is told about the new CFG edges.
769 void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
770 DomTreeUpdater &DTU) {
771 LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for : " << Func->getName());
772 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
773 auto &Ctx = M.getContext();
774 auto *PrevEntryBlock = &Func->getEntryBlock();
// Collect the LDS memory instructions up front, before any new IR is emitted.
775 SetVector<Instruction *> LDSInstructions;
776 getLDSMemoryInstructions(Func, LDSInstructions);
777
778 // Create malloc block.
779 auto *MallocBlock = BasicBlock::Create(Context&: Ctx, Name: "Malloc", Parent: Func, InsertBefore: PrevEntryBlock);
780
781 // Create WIdBlock block which has instructions related to selection of
782 // {0,0,0} index work item in the work group.
783 auto *WIdBlock = BasicBlock::Create(Context&: Ctx, Name: "WId", Parent: Func, InsertBefore: MallocBlock);
784 IRB.SetInsertPoint(TheBB: WIdBlock, IP: WIdBlock->begin());
785 DebugLoc FirstDL =
786 getOrCreateDebugLoc(InsertBefore: &*PrevEntryBlock->begin(), SP: Func->getSubprogram());
787 IRB.SetCurrentDebugLocation(FirstDL);
788 Value *WIdx = IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_workitem_id_x, Args: {});
789 Value *WIdy = IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_workitem_id_y, Args: {});
790 Value *WIdz = IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_workitem_id_z, Args: {});
// OR-ing the three ids yields zero only when all of them are zero.
791 Value *XYOr = IRB.CreateOr(LHS: WIdx, RHS: WIdy);
792 Value *XYZOr = IRB.CreateOr(LHS: XYOr, RHS: WIdz);
793 Value *WIdzCond = IRB.CreateICmpEQ(LHS: XYZOr, RHS: IRB.getInt32(C: 0));
794
795 // All work items will branch to PrevEntryBlock except {0,0,0} index
796 // work item which will branch to malloc block.
797 IRB.CreateCondBr(Cond: WIdzCond, True: MallocBlock, False: PrevEntryBlock);
798
799 // Malloc block
800 IRB.SetInsertPoint(TheBB: MallocBlock, IP: MallocBlock->begin());
801
802 // If Dynamic LDS globals are accessed by the kernel,
803 // Get the size of dyn lds from hidden dyn_lds_size kernel arg.
804 // Update the corresponding metadata global entries for this dyn lds global.
805 GlobalVariable *SwLDS = LDSParams.SwLDS;
806 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
807 assert(SwLDS && SwLDSMetadata);
808 StructType *MetadataStructType =
809 cast<StructType>(Val: SwLDSMetadata->getValueType());
810 uint32_t MallocSize = 0;
811 Value *CurrMallocSize;
812 Type *Int32Ty = IRB.getInt32Ty();
813 Type *Int64Ty = IRB.getInt64Ty();
814
// Helper that merges a list of LDS globals into UniqueLDSGlobals without
// duplicates.
815 SetVector<GlobalVariable *> UniqueLDSGlobals;
816 auto GetUniqueLDSGlobals = [&](SetVector<GlobalVariable *> &LDSGlobals) {
817 for (auto &GV : LDSGlobals) {
818 if (is_contained(Range&: UniqueLDSGlobals, Element: GV))
819 continue;
820 UniqueLDSGlobals.insert(X: GV);
821 }
822 };
823
// Count the distinct static LDS globals accessed directly or indirectly.
// NOTE(review): the extra `1 +` presumably accounts for a leading entry in
// the metadata struct - confirm against the code that builds SwLDSMetadata.
824 GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals);
825 GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals);
826 unsigned NumStaticLDS = 1 + UniqueLDSGlobals.size();
827 UniqueLDSGlobals.clear();
828
// The initial malloc size is field 0 plus field 2 of metadata entry
// (NumStaticLDS - 1), both loaded at run time.
// NOTE(review): NumStaticLDS is `1 + size`, so the else branch looks
// unreachable as written - confirm.
829 if (NumStaticLDS) {
830 auto *GEPForEndStaticLDSOffset =
831 IRB.CreateInBoundsGEP(Ty: MetadataStructType, Ptr: SwLDSMetadata,
832 IdxList: {ConstantInt::get(Ty: Int32Ty, V: 0),
833 ConstantInt::get(Ty: Int32Ty, V: NumStaticLDS - 1),
834 ConstantInt::get(Ty: Int32Ty, V: 0)});
835
836 auto *GEPForEndStaticLDSSize =
837 IRB.CreateInBoundsGEP(Ty: MetadataStructType, Ptr: SwLDSMetadata,
838 IdxList: {ConstantInt::get(Ty: Int32Ty, V: 0),
839 ConstantInt::get(Ty: Int32Ty, V: NumStaticLDS - 1),
840 ConstantInt::get(Ty: Int32Ty, V: 2)});
841
842 Value *EndStaticLDSOffset =
843 IRB.CreateLoad(Ty: Int32Ty, Ptr: GEPForEndStaticLDSOffset);
844 Value *EndStaticLDSSize = IRB.CreateLoad(Ty: Int32Ty, Ptr: GEPForEndStaticLDSSize);
845 CurrMallocSize = IRB.CreateAdd(LHS: EndStaticLDSOffset, RHS: EndStaticLDSSize);
846 } else
847 CurrMallocSize = IRB.getInt32(C: MallocSize);
848
849 if (LDSParams.SwDynLDS) {
850 if (!(AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5))
851 report_fatal_error(
852 reason: "Dynamic LDS size query is only supported for CO V5 and later.");
853 // Get size from hidden dyn_lds_size argument of kernel
854 Value *ImplicitArg =
855 IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_implicitarg_ptr, Args: {});
// The GEP element type is ImplicitArg's own (pointer) type, so the constant
// index is scaled by the size of that type.
856 Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
857 Ty: ImplicitArg->getType(), Ptr: ImplicitArg,
858 IdxList: {ConstantInt::get(Ty: Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)});
859 UniqueLDSGlobals.clear();
860 GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals);
861 GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals);
// CurrMallocSize is passed by address so the callee can replace it.
862 updateMallocSizeForDynamicLDS(Func, CurrMallocSize: &CurrMallocSize, HiddenDynLDSSize,
863 DynamicLDSGlobals&: UniqueLDSGlobals);
864 }
865
// The malloc function declared below takes i64 arguments; widen the size.
866 CurrMallocSize = IRB.CreateZExt(V: CurrMallocSize, DestTy: Int64Ty);
867
868 // Create a call to malloc function which does device global memory allocation
869 // with size equals to all LDS global accesses size in this kernel.
870 Value *ReturnAddress =
871 IRB.CreateIntrinsic(ID: Intrinsic::returnaddress, Args: {IRB.getInt32(C: 0)});
872 FunctionCallee MallocFunc = M.getOrInsertFunction(
873 Name: StringRef("__asan_malloc_impl"),
874 T: FunctionType::get(Result: Int64Ty, Params: {Int64Ty, Int64Ty}, isVarArg: false));
875 Value *RAPtrToInt = IRB.CreatePtrToInt(V: ReturnAddress, DestTy: Int64Ty);
876 Value *MallocCall = IRB.CreateCall(Callee: MallocFunc, Args: {CurrMallocSize, RAPtrToInt});
877
878 Value *MallocPtr =
879 IRB.CreateIntToPtr(V: MallocCall, DestTy: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS));
880
881 // Create store of malloc to new global
882 IRB.CreateStore(Val: MallocPtr, Ptr: SwLDS);
883
884 // Create calls to __asan_poison_region to poison redzones.
885 poisonRedzones(Func, MallocPtr);
886
887 // Create branch to PrevEntryBlock
888 IRB.CreateBr(Dest: PrevEntryBlock);
889
890 // Create wave-group barrier at the start of the previous entry block
891 Type *Int1Ty = IRB.getInt1Ty();
892 IRB.SetInsertPoint(TheBB: PrevEntryBlock, IP: PrevEntryBlock->begin());
// The PHI is 1 when control arrived through the Malloc block and 0 when it
// came straight from WId; it later selects whether the Free block runs.
893 auto *XYZCondPhi = IRB.CreatePHI(Ty: Int1Ty, NumReservedValues: 2, Name: "xyzCond");
894 XYZCondPhi->addIncoming(V: IRB.getInt1(V: 0), BB: WIdBlock);
895 XYZCondPhi->addIncoming(V: IRB.getInt1(V: 1), BB: MallocBlock);
896
897 IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_s_barrier, Args: {});
898
899 // Load malloc pointer from Sw LDS.
900 Value *LoadMallocPtr =
901 IRB.CreateLoad(Ty: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), Ptr: SwLDS);
902
903 // Replace All uses of LDS globals with new LDS pointers.
904 replaceKernelLDSAccesses(Func);
905
906 // Replace Memory Operations on LDS with corresponding
907 // global memory pointers.
908 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
909 LDSInstructions);
910
// Append the epilogue blocks and redirect every `ret` to CondFree.
911 auto *CondFreeBlock = BasicBlock::Create(Context&: Ctx, Name: "CondFree", Parent: Func);
912 auto *FreeBlock = BasicBlock::Create(Context&: Ctx, Name: "Free", Parent: Func);
913 auto *EndBlock = BasicBlock::Create(Context&: Ctx, Name: "End", Parent: Func);
914 for (BasicBlock &BB : *Func) {
915 if (!BB.empty()) {
916 if (ReturnInst *RI = dyn_cast<ReturnInst>(Val: &BB.back())) {
917 RI->eraseFromParent();
918 IRB.SetInsertPoint(TheBB: &BB, IP: BB.end());
919 IRB.CreateBr(Dest: CondFreeBlock);
920 }
921 }
922 }
923
924 // Cond Free Block
925 IRB.SetInsertPoint(TheBB: CondFreeBlock, IP: CondFreeBlock->begin());
926 IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_s_barrier, Args: {});
927 IRB.CreateCondBr(Cond: XYZCondPhi, True: FreeBlock, False: EndBlock);
928
929 // Free Block
930 IRB.SetInsertPoint(TheBB: FreeBlock, IP: FreeBlock->begin());
931
932 // Free the previously allocated device global memory.
933 FunctionCallee AsanFreeFunc = M.getOrInsertFunction(
934 Name: StringRef("__asan_free_impl"),
935 T: FunctionType::get(Result: IRB.getVoidTy(), Params: {Int64Ty, Int64Ty}, isVarArg: false));
936 Value *ReturnAddr =
937 IRB.CreateIntrinsic(ID: Intrinsic::returnaddress, Args: IRB.getInt32(C: 0));
938 Value *RAPToInt = IRB.CreatePtrToInt(V: ReturnAddr, DestTy: Int64Ty);
939 Value *MallocPtrToInt = IRB.CreatePtrToInt(V: LoadMallocPtr, DestTy: Int64Ty);
940 IRB.CreateCall(Callee: AsanFreeFunc, Args: {MallocPtrToInt, RAPToInt});
941
942 IRB.CreateBr(Dest: EndBlock);
943
944 // End Block
945 IRB.SetInsertPoint(TheBB: EndBlock, IP: EndBlock->begin());
946 IRB.CreateRetVoid();
947 // Update the DomTree with corresponding links to basic blocks.
// NOTE(review): the WId -> previous-entry edge, the CondFree -> End edge and
// the branches that replaced the returns are not listed here - confirm
// whether the updater needs to be told about them as well.
948 DTU.applyUpdates(Updates: {{DominatorTree::Insert, WIdBlock, MallocBlock},
949 {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
950 {DominatorTree::Insert, CondFreeBlock, FreeBlock},
951 {DominatorTree::Insert, FreeBlock, EndBlock}});
952 }
953
954Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
955 Function *Func, SetVector<GlobalVariable *> &Variables) {
956 Type *Int32Ty = IRB.getInt32Ty();
957 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
958
959 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
960 assert(SwLDSMetadata);
961 auto *SwLDSMetadataStructType =
962 cast<StructType>(Val: SwLDSMetadata->getValueType());
963 ArrayType *KernelOffsetsType =
964 ArrayType::get(ElementType: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), NumElements: Variables.size());
965
966 SmallVector<Constant *> Elements;
967 for (auto *GV : Variables) {
968 auto It = LDSParams.LDSToReplacementIndicesMap.find(Val: GV);
969 if (It == LDSParams.LDSToReplacementIndicesMap.end()) {
970 Elements.push_back(
971 Elt: PoisonValue::get(T: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS)));
972 continue;
973 }
974 auto &Indices = It->second;
975 Constant *GEPIdx[] = {ConstantInt::get(Ty: Int32Ty, V: Indices[0]),
976 ConstantInt::get(Ty: Int32Ty, V: Indices[1]),
977 ConstantInt::get(Ty: Int32Ty, V: Indices[2])};
978 Constant *GEP = ConstantExpr::getGetElementPtr(Ty: SwLDSMetadataStructType,
979 C: SwLDSMetadata, IdxList: GEPIdx, NW: true);
980 Elements.push_back(Elt: GEP);
981 }
982 return ConstantArray::get(T: KernelOffsetsType, V: Elements);
983}
984
985void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
986 NonKernelLDSParameters &NKLDSParams) {
987 // Base table will have single row, with elements of the row
988 // placed as per kernel ID. Each element in the row corresponds
989 // to addresss of "SW LDS" global of the kernel.
990 auto &Kernels = NKLDSParams.OrderedKernels;
991 if (Kernels.empty())
992 return;
993 Type *Int32Ty = IRB.getInt32Ty();
994 const size_t NumberKernels = Kernels.size();
995 ArrayType *AllKernelsOffsetsType =
996 ArrayType::get(ElementType: IRB.getPtrTy(AddrSpace: AMDGPUAS::LOCAL_ADDRESS), NumElements: NumberKernels);
997 std::vector<Constant *> OverallConstantExprElts(NumberKernels);
998 for (size_t i = 0; i < NumberKernels; i++) {
999 Function *Func = Kernels[i];
1000 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1001 GlobalVariable *SwLDS = LDSParams.SwLDS;
1002 assert(SwLDS);
1003 Constant *GEPIdx[] = {ConstantInt::get(Ty: Int32Ty, V: 0)};
1004 Constant *GEP =
1005 ConstantExpr::getGetElementPtr(Ty: SwLDS->getType(), C: SwLDS, IdxList: GEPIdx, NW: true);
1006 OverallConstantExprElts[i] = GEP;
1007 }
1008 Constant *init =
1009 ConstantArray::get(T: AllKernelsOffsetsType, V: OverallConstantExprElts);
1010 NKLDSParams.LDSBaseTable = new GlobalVariable(
1011 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
1012 "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
1013 AMDGPUAS::GLOBAL_ADDRESS);
1014 GlobalValue::SanitizerMetadata MD;
1015 MD.NoAddress = true;
1016 NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD);
1017}
1018
1019void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
1020 NonKernelLDSParameters &NKLDSParams) {
1021 // Offset table will have multiple rows and columns.
1022 // Rows are assumed to be from 0 to (n-1). n is total number
1023 // of kernels accessing the LDS through non-kernels.
1024 // Each row will have m elements. m is the total number of
1025 // unique LDS globals accessed by non-kernels.
1026 // Each element in the row correspond to the address of
1027 // the replacement of LDS global done by that particular kernel.
1028 auto &Variables = NKLDSParams.OrdereLDSGlobals;
1029 auto &Kernels = NKLDSParams.OrderedKernels;
1030 if (Variables.empty() || Kernels.empty())
1031 return;
1032 const size_t NumberVariables = Variables.size();
1033 const size_t NumberKernels = Kernels.size();
1034
1035 ArrayType *KernelOffsetsType =
1036 ArrayType::get(ElementType: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), NumElements: NumberVariables);
1037
1038 ArrayType *AllKernelsOffsetsType =
1039 ArrayType::get(ElementType: KernelOffsetsType, NumElements: NumberKernels);
1040 std::vector<Constant *> overallConstantExprElts(NumberKernels);
1041 for (size_t i = 0; i < NumberKernels; i++) {
1042 Function *Func = Kernels[i];
1043 overallConstantExprElts[i] =
1044 getAddressesOfVariablesInKernel(Func, Variables);
1045 }
1046 Constant *Init =
1047 ConstantArray::get(T: AllKernelsOffsetsType, V: overallConstantExprElts);
1048 NKLDSParams.LDSOffsetTable = new GlobalVariable(
1049 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
1050 "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
1051 AMDGPUAS::GLOBAL_ADDRESS);
1052 GlobalValue::SanitizerMetadata MD;
1053 MD.NoAddress = true;
1054 NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD);
1055}
1056
1057void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
1058 Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
1059 NonKernelLDSParameters &NKLDSParams) {
1060 // Replace LDS access in non-kernel with replacement queried from
1061 // Base table and offset from offset table.
1062 LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for : "
1063 << Func->getName());
1064 auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
1065 IRB.SetInsertPoint(InsertAt);
1066
1067 // Get LDS memory instructions.
1068 SetVector<Instruction *> LDSInstructions;
1069 getLDSMemoryInstructions(Func, LDSInstructions);
1070
1071 auto *KernelId = IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_lds_kernel_id, Args: {});
1072 GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable;
1073 GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable;
1074 auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
1075 Value *BaseGEP = IRB.CreateInBoundsGEP(
1076 Ty: LDSBaseTable->getValueType(), Ptr: LDSBaseTable, IdxList: {IRB.getInt32(C: 0), KernelId});
1077 Value *BaseLoad =
1078 IRB.CreateLoad(Ty: IRB.getPtrTy(AddrSpace: AMDGPUAS::LOCAL_ADDRESS), Ptr: BaseGEP);
1079 Value *LoadMallocPtr =
1080 IRB.CreateLoad(Ty: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), Ptr: BaseLoad);
1081
1082 for (GlobalVariable *GV : LDSGlobals) {
1083 const auto *GVIt = llvm::find(Range&: OrdereLDSGlobals, Val: GV);
1084 assert(GVIt != OrdereLDSGlobals.end());
1085 uint32_t GVOffset = std::distance(first: OrdereLDSGlobals.begin(), last: GVIt);
1086
1087 Value *OffsetGEP = IRB.CreateInBoundsGEP(
1088 Ty: LDSOffsetTable->getValueType(), Ptr: LDSOffsetTable,
1089 IdxList: {IRB.getInt32(C: 0), KernelId, IRB.getInt32(C: GVOffset)});
1090 Value *OffsetLoad =
1091 IRB.CreateLoad(Ty: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), Ptr: OffsetGEP);
1092 Value *Offset = IRB.CreateLoad(Ty: IRB.getInt32Ty(), Ptr: OffsetLoad);
1093 Value *BasePlusOffset =
1094 IRB.CreateInBoundsGEP(Ty: IRB.getInt8Ty(), Ptr: BaseLoad, IdxList: {Offset});
1095 LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for "
1096 << GV->getName());
1097 replacesUsesOfGlobalInFunction(Func, GV, Replacement: BasePlusOffset);
1098 }
1099 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
1100 LDSInstructions);
1101}
1102
1103static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
1104 // Sort Static, dynamic LDS globals which are either
1105 // direct or indirect access on basis of name.
1106 auto &DirectAccess = LDSParams.DirectAccess;
1107 auto &IndirectAccess = LDSParams.IndirectAccess;
1108 LDSParams.DirectAccess.StaticLDSGlobals = sortByName(
1109 V: std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(),
1110 DirectAccess.StaticLDSGlobals.end()));
1111 LDSParams.DirectAccess.DynamicLDSGlobals = sortByName(
1112 V: std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(),
1113 DirectAccess.DynamicLDSGlobals.end()));
1114 LDSParams.IndirectAccess.StaticLDSGlobals = sortByName(
1115 V: std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(),
1116 IndirectAccess.StaticLDSGlobals.end()));
1117 LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName(
1118 V: std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(),
1119 IndirectAccess.DynamicLDSGlobals.end()));
1120}
1121
1122void AMDGPUSwLowerLDS::initAsanInfo() {
1123 // Get Shadow mapping scale and offset.
1124 unsigned LongSize =
1125 M.getDataLayout().getPointerSizeInBits(AS: AMDGPUAS::GLOBAL_ADDRESS);
1126 uint64_t Offset;
1127 int Scale;
1128 bool OrShadowOffset;
1129 llvm::getAddressSanitizerParams(TargetTriple: AMDGPUTM.getTargetTriple(), LongSize, IsKasan: false,
1130 ShadowBase: &Offset, MappingScale: &Scale, OrShadowOffset: &OrShadowOffset);
1131 AsanInfo.Scale = Scale;
1132 AsanInfo.Offset = Offset;
1133}
1134
1135static bool hasFnWithSanitizeAddressAttr(FunctionVariableMap &LDSAccesses) {
1136 for (auto &K : LDSAccesses) {
1137 Function *F = K.first;
1138 if (!F)
1139 continue;
1140 if (F->hasFnAttribute(Kind: Attribute::SanitizeAddress))
1141 return true;
1142 }
1143 return false;
1144}
1145
1146bool AMDGPUSwLowerLDS::run() {
1147 bool Changed = false;
1148
1149 CallGraph CG = CallGraph(M);
1150
1151 Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
1152
1153 // Get all the direct and indirect access of LDS for all the kernels.
1154 LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
1155
1156 // Flag to decide whether to lower all the LDS accesses
1157 // based on sanitize_address attribute.
1158 bool LowerAllLDS = hasFnWithSanitizeAddressAttr(LDSAccesses&: LDSUsesInfo.direct_access) ||
1159 hasFnWithSanitizeAddressAttr(LDSAccesses&: LDSUsesInfo.indirect_access);
1160
1161 if (!LowerAllLDS)
1162 return Changed;
1163
1164 // Utility to group LDS access into direct, indirect, static and dynamic.
1165 auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
1166 bool DirectAccess) {
1167 for (auto &K : LDSAccesses) {
1168 Function *F = K.first;
1169 if (!F || K.second.empty())
1170 continue;
1171
1172 assert(isKernelLDS(F));
1173
1174 // Only inserts if key isn't already in the map.
1175 FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
1176 KV: {F, KernelLDSParameters()});
1177
1178 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[F];
1179 if (!DirectAccess)
1180 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(X: F);
1181 for (GlobalVariable *GV : K.second) {
1182 if (!DirectAccess) {
1183 if (AMDGPU::isDynamicLDS(GV: *GV))
1184 LDSParams.IndirectAccess.DynamicLDSGlobals.insert(X: GV);
1185 else
1186 LDSParams.IndirectAccess.StaticLDSGlobals.insert(X: GV);
1187 FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(X: GV);
1188 } else {
1189 if (AMDGPU::isDynamicLDS(GV: *GV))
1190 LDSParams.DirectAccess.DynamicLDSGlobals.insert(X: GV);
1191 else
1192 LDSParams.DirectAccess.StaticLDSGlobals.insert(X: GV);
1193 }
1194 }
1195 }
1196 };
1197
1198 PopulateKernelStaticDynamicLDS(LDSUsesInfo.direct_access, true);
1199 PopulateKernelStaticDynamicLDS(LDSUsesInfo.indirect_access, false);
1200
1201 // Get address sanitizer scale.
1202 initAsanInfo();
1203
1204 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
1205 Function *Func = K.first;
1206 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1207 if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
1208 LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
1209 LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
1210 LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
1211 Changed = false;
1212 } else {
1213 removeFnAttrFromReachable(
1214 CG, KernelRoot: Func,
1215 FnAttrs: {"amdgpu-no-workitem-id-x", "amdgpu-no-workitem-id-y",
1216 "amdgpu-no-workitem-id-z", "amdgpu-no-heap-ptr"});
1217 if (!LDSParams.IndirectAccess.StaticLDSGlobals.empty() ||
1218 !LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
1219 removeFnAttrFromReachable(CG, KernelRoot: Func, FnAttrs: {"amdgpu-no-lds-kernel-id"});
1220 reorderStaticDynamicIndirectLDSSet(LDSParams);
1221 buildSwLDSGlobal(Func);
1222 buildSwDynLDSGlobal(Func);
1223 populateSwMetadataGlobal(Func);
1224 populateSwLDSAttributeAndMetadata(Func);
1225 populateLDSToReplacementIndicesMap(Func);
1226 DomTreeUpdater DTU(DTCallback(*Func),
1227 DomTreeUpdater::UpdateStrategy::Lazy);
1228 lowerKernelLDSAccesses(Func, DTU);
1229 Changed = true;
1230 }
1231 }
1232
1233 // Get the Uses of LDS from non-kernels.
1234 getUsesOfLDSByNonKernels();
1235
1236 // Get non-kernels with LDS ptr as argument and called by kernels.
1237 getNonKernelsWithLDSArguments(CG);
1238
1239 // Lower LDS accesses in non-kernels.
1240 if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() ||
1241 !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) {
1242 NonKernelLDSParameters NKLDSParams;
1243 NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
1244 Kernels&: FuncLDSAccessInfo.KernelsWithIndirectLDSAccess);
1245 NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals(
1246 Variables&: FuncLDSAccessInfo.AllNonKernelLDSAccess);
1247 buildNonKernelLDSBaseTable(NKLDSParams);
1248 buildNonKernelLDSOffsetTable(NKLDSParams);
1249 for (auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) {
1250 Function *Func = K.first;
1251 DenseSet<GlobalVariable *> &LDSGlobals = K.second;
1252 SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName(
1253 V: std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end()));
1254 lowerNonKernelLDSAccesses(Func, LDSGlobals&: OrderedLDSGlobals, NKLDSParams);
1255 }
1256 for (Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) {
1257 auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap;
1258 if (K.contains(Val: Func))
1259 continue;
1260 SetVector<llvm::GlobalVariable *> Vec;
1261 lowerNonKernelLDSAccesses(Func, LDSGlobals&: Vec, NKLDSParams);
1262 }
1263 Changed = true;
1264 }
1265
1266 if (!Changed)
1267 return Changed;
1268
1269 for (auto &GV : make_early_inc_range(Range: M.globals())) {
1270 if (AMDGPU::isLDSVariableToLower(GV)) {
1271 // probably want to remove from used lists
1272 GV.removeDeadConstantUsers();
1273 if (GV.use_empty())
1274 GV.eraseFromParent();
1275 }
1276 }
1277
1278 if (AsanInstrumentLDS) {
1279 SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
1280 for (Instruction *Inst : AsanInfo.Instructions) {
1281 SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
1282 getInterestingMemoryOperands(M, I: Inst, Interesting&: InterestingOperands);
1283 llvm::append_range(C&: OperandsToInstrument, R&: InterestingOperands);
1284 }
1285 for (auto &Operand : OperandsToInstrument) {
1286 Value *Addr = Operand.getPtr();
1287 instrumentAddress(M, IRB, OrigIns: Operand.getInsn(), InsertBefore: Operand.getInsn(), Addr,
1288 Alignment: Operand.Alignment.valueOrOne(), TypeStoreSize: Operand.TypeStoreSize,
1289 IsWrite: Operand.IsWrite, SizeArgument: nullptr, UseCalls: false, Recover: false, Scale: AsanInfo.Scale,
1290 Offset: AsanInfo.Offset);
1291 Changed = true;
1292 }
1293 }
1294
1295 return Changed;
1296}
1297
1298class AMDGPUSwLowerLDSLegacy : public ModulePass {
1299public:
1300 const AMDGPUTargetMachine *AMDGPUTM;
1301 static char ID;
1302 AMDGPUSwLowerLDSLegacy(const AMDGPUTargetMachine *TM)
1303 : ModulePass(ID), AMDGPUTM(TM) {}
1304 bool runOnModule(Module &M) override;
1305 void getAnalysisUsage(AnalysisUsage &AU) const override {
1306 AU.addPreserved<DominatorTreeWrapperPass>();
1307 }
1308};
1309} // namespace
1310
// Out-of-class definition of the legacy pass's static ID member.
1311 char AMDGPUSwLowerLDSLegacy::ID = 0;
// llvm::AMDGPUSwLowerLDSLegacyPassID is a reference bound to that ID.
1312 char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID;
1313
// Register the legacy pass under the name "amdgpu-sw-lower-lds" and declare
// its dependency on TargetPassConfig.
1314 INITIALIZE_PASS_BEGIN(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1315 "AMDGPU Software lowering of LDS", false, false)
1316 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
1317 INITIALIZE_PASS_END(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1318 "AMDGPU Software lowering of LDS", false, false)
1319
1320bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) {
1321 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1322 // instrumented the IR. Return early if the flag is not present.
1323 if (!M.getModuleFlag(Key: "nosanitize_address"))
1324 return false;
1325 DominatorTreeWrapperPass *const DTW =
1326 getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1327 auto DTCallback = [&DTW](Function &F) -> DominatorTree * {
1328 return DTW ? &DTW->getDomTree() : nullptr;
1329 };
1330 if (!AMDGPUTM) {
1331 auto &TPC = getAnalysis<TargetPassConfig>();
1332 AMDGPUTM = &TPC.getTM<AMDGPUTargetMachine>();
1333 }
1334 AMDGPUSwLowerLDS SwLowerLDSImpl(M, *AMDGPUTM, DTCallback);
1335 bool IsChanged = SwLowerLDSImpl.run();
1336 return IsChanged;
1337}
1338
1339ModulePass *
1340llvm::createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM) {
1341 return new AMDGPUSwLowerLDSLegacy(TM);
1342}
1343
1344PreservedAnalyses AMDGPUSwLowerLDSPass::run(Module &M,
1345 ModuleAnalysisManager &AM) {
1346 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1347 // instrumented the IR. Return early if the flag is not present.
1348 if (!M.getModuleFlag(Key: "nosanitize_address"))
1349 return PreservedAnalyses::all();
1350 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
1351 auto DTCallback = [&FAM](Function &F) -> DominatorTree * {
1352 return &FAM.getResult<DominatorTreeAnalysis>(IR&: F);
1353 };
1354 AMDGPUSwLowerLDS SwLowerLDSImpl(M, TM, DTCallback);
1355 bool IsChanged = SwLowerLDSImpl.run();
1356 if (!IsChanged)
1357 return PreservedAnalyses::all();
1358
1359 PreservedAnalyses PA;
1360 PA.preserve<DominatorTreeAnalysis>();
1361 return PA;
1362}
1363