1//===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass lowers the local data store, LDS, uses in kernel and non-kernel
10// functions in module to use dynamically allocated global memory.
11// Packed LDS Layout is emulated in the global memory.
12// The lowered memory instructions from LDS to global memory are then
13// instrumented for address sanitizer, to catch addressing errors.
// This pass only works when address sanitizer has been enabled and has
// instrumented the IR. It identifies that IR has been instrumented using
// "nosanitize_address" module flag.
17//
18// Replacement of Kernel LDS accesses:
19// For a kernel, LDS access can be static or dynamic which are direct
20// (accessed within kernel) and indirect (accessed through non-kernels).
21// All these LDS accesses corresponding to kernel will be packed together,
22// where all static LDS accesses will be allocated first and then dynamic
23// LDS follows. The total size with alignment is calculated. A new LDS global
24// will be created for the kernel called "SW LDS" and it will have the
25// attribute "amdgpu-lds-size" attached with value of the size calculated.
// All the LDS accesses in the module will be replaced by GEP with offset
// into the "SW LDS".
// A new "llvm.amdgcn.<kernel>.dynlds" is created per kernel accessing
// the dynamic LDS. This will be marked used by kernel and will have
// MD_absolute_symbol metadata set to total static LDS size, since dynamic
// LDS allocation starts after all static LDS allocation.
32//
33// A device global memory equal to the total LDS size will be allocated.
34// At the prologue of the kernel, a single work-item from the
35// work-group, does a "malloc" and stores the pointer of the
36// allocation in "SW LDS".
37//
38// To store the offsets corresponding to all LDS accesses, another global
39// variable is created which will be called "SW LDS metadata" in this pass.
40// - SW LDS Global:
41// It is LDS global of ptr type with name
42// "llvm.amdgcn.sw.lds.<kernel-name>".
43// - Metadata Global:
44// It is of struct type, with n members. n equals the number of LDS
45// globals accessed by the kernel(direct and indirect). Each member of
46// struct is another struct of type {i32, i32, i32}. First member
47// corresponds to offset, second member corresponds to size of LDS global
48// being replaced and third represents the total aligned size. It will
49// have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
50// an initializer with static LDS related offsets and sizes initialized.
51// But for dynamic LDS related entries, offsets will be initialized to
52// previous static LDS allocation end offset. Sizes for them will be zero
53// initially. These dynamic LDS offset and size values will be updated
54// within the kernel, since kernel can read the dynamic LDS size
55// allocation done at runtime with query to "hidden_dynamic_lds_size"
56// hidden kernel argument.
57//
58// At the epilogue of kernel, allocated memory would be made free by the same
59// single work-item.
60//
61// Replacement of non-kernel LDS accesses:
62// Multiple kernels can access the same non-kernel function.
63// All the kernels accessing LDS through non-kernels are sorted and
64// assigned a kernel-id. All the LDS globals accessed by non-kernels
65// are sorted. This information is used to build two tables:
66// - Base table:
67// Base table will have single row, with elements of the row
68// placed as per kernel ID. Each element in the row corresponds
69// to ptr of "SW LDS" variable created for that kernel.
70// - Offset table:
71// Offset table will have multiple rows and columns.
72// Rows are assumed to be from 0 to (n-1). n is total number
73// of kernels accessing the LDS through non-kernels.
74// Each row will have m elements. m is the total number of
75// unique LDS globals accessed by all non-kernels.
// Each element in the row corresponds to the ptr of
// the replacement of LDS global done by that particular kernel.
78// A LDS variable in non-kernel will be replaced based on the information
79// from base and offset tables. Based on kernel-id query, ptr of "SW
80// LDS" for that corresponding kernel is obtained from base table.
81// The Offset into the base "SW LDS" is obtained from
82// corresponding element in offset table. With this information, replacement
83// value is obtained.
84//===----------------------------------------------------------------------===//
85
86#include "AMDGPU.h"
87#include "AMDGPUAsanInstrumentation.h"
88#include "AMDGPUMemoryUtils.h"
89#include "AMDGPUTargetMachine.h"
90#include "llvm/ADT/DenseMap.h"
91#include "llvm/ADT/DenseSet.h"
92#include "llvm/ADT/SetVector.h"
93#include "llvm/ADT/StringExtras.h"
94#include "llvm/ADT/StringRef.h"
95#include "llvm/Analysis/CallGraph.h"
96#include "llvm/Analysis/DomTreeUpdater.h"
97#include "llvm/CodeGen/TargetPassConfig.h"
98#include "llvm/IR/Constants.h"
99#include "llvm/IR/DIBuilder.h"
100#include "llvm/IR/DebugInfo.h"
101#include "llvm/IR/DebugInfoMetadata.h"
102#include "llvm/IR/IRBuilder.h"
103#include "llvm/IR/Instructions.h"
104#include "llvm/IR/IntrinsicsAMDGPU.h"
105#include "llvm/IR/MDBuilder.h"
106#include "llvm/IR/ReplaceConstant.h"
107#include "llvm/Pass.h"
108#include "llvm/Support/raw_ostream.h"
109#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
110#include "llvm/Transforms/Utils/ModuleUtils.h"
111
112#include <algorithm>
113
114#define DEBUG_TYPE "amdgpu-sw-lower-lds"
115#define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15
116
117using namespace llvm;
118using namespace AMDGPU;
119
120namespace {
121
122cl::opt<bool>
123 AsanInstrumentLDS("amdgpu-asan-instrument-lds",
124 cl::desc("Run asan instrumentation on LDS instructions "
125 "lowered to global memory"),
126 cl::init(Val: true), cl::Hidden);
127
128using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
129
130struct LDSAccessTypeInfo {
131 SetVector<GlobalVariable *> StaticLDSGlobals;
132 SetVector<GlobalVariable *> DynamicLDSGlobals;
133};
134
135// Struct to hold all the Metadata required for a kernel
136// to replace a LDS global uses with corresponding offset
137// in to device global memory.
138struct KernelLDSParameters {
139 GlobalVariable *SwLDS = nullptr;
140 GlobalVariable *SwDynLDS = nullptr;
141 GlobalVariable *SwLDSMetadata = nullptr;
142 LDSAccessTypeInfo DirectAccess;
143 LDSAccessTypeInfo IndirectAccess;
144 DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>>
145 LDSToReplacementIndicesMap;
146 uint32_t MallocSize = 0;
147 uint32_t LDSSize = 0;
148 SmallVector<std::pair<uint32_t, uint32_t>, 64> RedzoneOffsetAndSizeVector;
149};
150
151// Struct to store information for creation of offset table
152// for all the non-kernel LDS accesses.
153struct NonKernelLDSParameters {
154 GlobalVariable *LDSBaseTable = nullptr;
155 GlobalVariable *LDSOffsetTable = nullptr;
156 SetVector<Function *> OrderedKernels;
157 SetVector<GlobalVariable *> OrdereLDSGlobals;
158};
159
160struct AsanInstrumentInfo {
161 int Scale = 0;
162 uint32_t Offset = 0;
163 SetVector<Instruction *> Instructions;
164};
165
166struct FunctionsAndLDSAccess {
167 DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
168 SetVector<Function *> KernelsWithIndirectLDSAccess;
169 SetVector<Function *> NonKernelsWithLDSArgument;
170 SetVector<GlobalVariable *> AllNonKernelLDSAccess;
171 FunctionVariableMap NonKernelToLDSAccessMap;
172};
173
174class AMDGPUSwLowerLDS {
175public:
176 AMDGPUSwLowerLDS(Module &Mod, const AMDGPUTargetMachine &TM,
177 DomTreeCallback Callback)
178 : M(Mod), AMDGPUTM(TM), IRB(M.getContext()), DTCallback(Callback) {}
179 bool run();
180 void getUsesOfLDSByNonKernels();
181 void getNonKernelsWithLDSArguments(const CallGraph &CG);
182 SetVector<Function *>
183 getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &Kernels);
184 SetVector<GlobalVariable *>
185 getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &Variables);
186 void buildSwLDSGlobal(Function *Func);
187 void buildSwDynLDSGlobal(Function *Func);
188 void populateSwMetadataGlobal(Function *Func);
189 void populateSwLDSAttributeAndMetadata(Function *Func);
190 void populateLDSToReplacementIndicesMap(Function *Func);
191 void getLDSMemoryInstructions(Function *Func,
192 SetVector<Instruction *> &LDSInstructions);
193 void replaceKernelLDSAccesses(Function *Func);
194 Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr);
195 void translateLDSMemoryOperationsToGlobalMemory(
196 Function *Func, Value *LoadMallocPtr,
197 SetVector<Instruction *> &LDSInstructions);
198 void poisonRedzones(Function *Func, Value *MallocPtr);
199 void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
200 void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
201 void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
202 Constant *
203 getAddressesOfVariablesInKernel(Function *Func,
204 SetVector<GlobalVariable *> &Variables);
205 void lowerNonKernelLDSAccesses(Function *Func,
206 SetVector<GlobalVariable *> &LDSGlobals,
207 NonKernelLDSParameters &NKLDSParams);
208 void
209 updateMallocSizeForDynamicLDS(Function *Func, Value **CurrMallocSize,
210 Value *HiddenDynLDSSize,
211 SetVector<GlobalVariable *> &DynamicLDSGlobals);
212 void initAsanInfo();
213
214private:
215 Module &M;
216 const AMDGPUTargetMachine &AMDGPUTM;
217 IRBuilder<> IRB;
218 DomTreeCallback DTCallback;
219 FunctionsAndLDSAccess FuncLDSAccessInfo;
220 AsanInstrumentInfo AsanInfo;
221};
222
223template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
224 // Sort the vector of globals or Functions based on their name.
225 // Returns a SetVector of globals/Functions.
226 sort(V, [](const auto *L, const auto *R) {
227 return L->getName() < R->getName();
228 });
229 return {SetVector<T>(llvm::from_range, V)};
230}
231
232SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
233 SetVector<GlobalVariable *> &Variables) {
234 // Sort all the non-kernel LDS accesses based on their name.
235 return sortByName(
236 V: std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
237}
238
239SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
240 SetVector<Function *> &Kernels) {
241 // Sort the non-kernels accessing LDS based on their name.
242 // Also assign a kernel ID metadata based on the sorted order.
243 LLVMContext &Ctx = M.getContext();
244 if (Kernels.size() > UINT32_MAX) {
245 report_fatal_error(reason: "Unimplemented SW LDS lowering for > 2**32 kernels");
246 }
247 SetVector<Function *> OrderedKernels =
248 sortByName(V: std::vector<Function *>(Kernels.begin(), Kernels.end()));
249 for (size_t i = 0; i < Kernels.size(); i++) {
250 Metadata *AttrMDArgs[1] = {
251 ConstantAsMetadata::get(C: IRB.getInt32(C: i)),
252 };
253 Function *Func = OrderedKernels[i];
254 Func->setMetadata(Kind: "llvm.amdgcn.lds.kernel.id",
255 Node: MDNode::get(Context&: Ctx, MDs: AttrMDArgs));
256 }
257 return OrderedKernels;
258}
259
260void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) {
261 // Among the kernels accessing LDS, get list of
262 // Non-kernels to which a call is made and a ptr
263 // to addrspace(3) is passed as argument.
264 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
265 Function *Func = K.first;
266 const CallGraphNode *CGN = CG[Func];
267 if (!CGN)
268 continue;
269 for (auto &I : *CGN) {
270 CallGraphNode *CallerCGN = I.second;
271 Function *CalledFunc = CallerCGN->getFunction();
272 if (!CalledFunc || CalledFunc->isDeclaration())
273 continue;
274 if (AMDGPU::isKernel(F: *CalledFunc))
275 continue;
276 for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end();
277 AI != E; ++AI) {
278 Type *ArgTy = (*AI).getType();
279 if (!ArgTy->isPointerTy())
280 continue;
281 if (ArgTy->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
282 continue;
283 FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(X: CalledFunc);
284 // Also add the Calling function to KernelsWithIndirectLDSAccess list
285 // so that base table of LDS is generated.
286 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(X: Func);
287 }
288 }
289 }
290}
291
292void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
293 for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) {
294 if (!AMDGPU::isLDSVariableToLower(GV: *GV))
295 continue;
296
297 for (User *V : GV->users()) {
298 if (auto *I = dyn_cast<Instruction>(Val: V)) {
299 Function *F = I->getFunction();
300 if (!isKernel(F: *F) && !F->isDeclaration())
301 FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(V: GV);
302 }
303 }
304 }
305}
306
307static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
308 uint32_t Address) {
309 // Write the specified address into metadata where it can be retrieved by
310 // the assembler. Format is a half open range, [Address Address+1)
311 LLVMContext &Ctx = M.getContext();
312 auto *IntTy = M.getDataLayout().getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
313 MDBuilder MDB(Ctx);
314 MDNode *MetadataNode = MDB.createRange(Lo: ConstantInt::get(Ty: IntTy, V: Address),
315 Hi: ConstantInt::get(Ty: IntTy, V: Address + 1));
316 GV->setMetadata(KindID: LLVMContext::MD_absolute_symbol, Node: MetadataNode);
317}
318
319static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
320 bool IsDynLDS) {
321 if (Offset != 0) {
322 std::string Buffer;
323 raw_string_ostream SS{Buffer};
324 SS << Offset;
325 if (IsDynLDS)
326 SS << "," << Offset;
327 Func->addFnAttr(Kind: "amdgpu-lds-size", Val: Buffer);
328 }
329}
330
331static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
332 BasicBlock *Entry = &Func->getEntryBlock();
333 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
334
335 Function *Decl = Intrinsic::getOrInsertDeclaration(M: Func->getParent(),
336 id: Intrinsic::donothing, Tys: {});
337
338 Value *UseInstance[1] = {
339 Builder.CreateConstInBoundsGEP1_32(Ty: SGV->getValueType(), Ptr: SGV, Idx0: 0)};
340
341 Builder.CreateCall(Callee: Decl, Args: {},
342 OpBundles: {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
343}
344
345void AMDGPUSwLowerLDS::buildSwLDSGlobal(Function *Func) {
346 // Create new LDS global required for each kernel to store
347 // device global memory pointer.
348 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
349 // Create new global pointer variable
350 LDSParams.SwLDS = new GlobalVariable(
351 M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
352 PoisonValue::get(T: IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
353 nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
354 GlobalValue::SanitizerMetadata MD;
355 MD.NoAddress = true;
356 LDSParams.SwLDS->setSanitizerMetadata(MD);
357}
358
359void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(Function *Func) {
360 // Create new Dyn LDS global if kernel accesses dyn LDS.
361 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
362 if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
363 LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
364 return;
365 // Create new global pointer variable
366 auto *emptyCharArray = ArrayType::get(ElementType: IRB.getInt8Ty(), NumElements: 0);
367 LDSParams.SwDynLDS = new GlobalVariable(
368 M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
369 "llvm.amdgcn." + Func->getName() + ".dynlds", nullptr,
370 GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
371 markUsedByKernel(Func, SGV: LDSParams.SwDynLDS);
372 GlobalValue::SanitizerMetadata MD;
373 MD.NoAddress = true;
374 LDSParams.SwDynLDS->setSanitizerMetadata(MD);
375}
376
377void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
378 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
379 bool IsDynLDSUsed = LDSParams.SwDynLDS;
380 uint32_t Offset = LDSParams.LDSSize;
381 recordLDSAbsoluteAddress(M, GV: LDSParams.SwLDS, Address: 0);
382 addLDSSizeAttribute(Func, Offset, IsDynLDS: IsDynLDSUsed);
383 if (LDSParams.SwDynLDS)
384 recordLDSAbsoluteAddress(M, GV: LDSParams.SwDynLDS, Address: Offset);
385}
386
387void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
388 // Create new metadata global for every kernel and initialize the
389 // start offsets and sizes corresponding to each LDS accesses.
390 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
391 auto &Ctx = M.getContext();
392 auto &DL = M.getDataLayout();
393 std::vector<Type *> Items;
394 Type *Int32Ty = IRB.getInt32Ty();
395 std::vector<Constant *> Initializers;
396 Align MaxAlignment(1);
397 auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
398 Align GVAlign = AMDGPU::getAlign(DL, GV);
399 MaxAlignment = std::max(a: MaxAlignment, b: GVAlign);
400 };
401
402 for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
403 UpdateMaxAlignment(GV);
404
405 for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
406 UpdateMaxAlignment(GV);
407
408 for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
409 UpdateMaxAlignment(GV);
410
411 for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
412 UpdateMaxAlignment(GV);
413
414 //{StartOffset, AlignedSizeInBytes}
415 SmallString<128> MDItemStr;
416 raw_svector_ostream MDItemOS(MDItemStr);
417 MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.item";
418
419 StructType *LDSItemTy =
420 StructType::create(Context&: Ctx, Elements: {Int32Ty, Int32Ty, Int32Ty}, Name: MDItemOS.str());
421 uint32_t &MallocSize = LDSParams.MallocSize;
422 SetVector<GlobalVariable *> UniqueLDSGlobals;
423 int AsanScale = AsanInfo.Scale;
424 auto buildInitializerForSwLDSMD =
425 [&](SetVector<GlobalVariable *> &LDSGlobals) {
426 for (auto &GV : LDSGlobals) {
427 if (is_contained(Range&: UniqueLDSGlobals, Element: GV))
428 continue;
429 UniqueLDSGlobals.insert(X: GV);
430
431 Type *Ty = GV->getValueType();
432 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
433 Items.push_back(x: LDSItemTy);
434 Constant *ItemStartOffset = ConstantInt::get(Ty: Int32Ty, V: MallocSize);
435 Constant *SizeInBytesConst = ConstantInt::get(Ty: Int32Ty, V: SizeInBytes);
436 // Get redzone size corresponding a size.
437 const uint64_t RightRedzoneSize =
438 AMDGPU::getRedzoneSizeForGlobal(Scale: AsanScale, SizeInBytes);
439 // Update MallocSize with current size and redzone size.
440 MallocSize += SizeInBytes;
441 if (!AMDGPU::isDynamicLDS(GV: *GV))
442 LDSParams.RedzoneOffsetAndSizeVector.emplace_back(Args&: MallocSize,
443 Args: RightRedzoneSize);
444 MallocSize += RightRedzoneSize;
445 // Align current size plus redzone.
446 uint64_t AlignedSize =
447 alignTo(Size: SizeInBytes + RightRedzoneSize, A: MaxAlignment);
448 Constant *AlignedSizeInBytesConst =
449 ConstantInt::get(Ty: Int32Ty, V: AlignedSize);
450 // Align MallocSize
451 MallocSize = alignTo(Size: MallocSize, A: MaxAlignment);
452 Constant *InitItem =
453 ConstantStruct::get(T: LDSItemTy, V: {ItemStartOffset, SizeInBytesConst,
454 AlignedSizeInBytesConst});
455 Initializers.push_back(x: InitItem);
456 }
457 };
458 SetVector<GlobalVariable *> SwLDSVector;
459 SwLDSVector.insert(X: LDSParams.SwLDS);
460 buildInitializerForSwLDSMD(SwLDSVector);
461 buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
462 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
463 buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
464 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);
465
466 // Update the LDS size used by the kernel.
467 Type *Ty = LDSParams.SwLDS->getValueType();
468 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
469 uint64_t AlignedSize = alignTo(Size: SizeInBytes, A: MaxAlignment);
470 LDSParams.LDSSize = AlignedSize;
471 SmallString<128> MDTypeStr;
472 raw_svector_ostream MDTypeOS(MDTypeStr);
473 MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.type";
474 StructType *MetadataStructType =
475 StructType::create(Context&: Ctx, Elements: Items, Name: MDTypeOS.str());
476 SmallString<128> MDStr;
477 raw_svector_ostream MDOS(MDStr);
478 MDOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md";
479 LDSParams.SwLDSMetadata = new GlobalVariable(
480 M, MetadataStructType, false, GlobalValue::InternalLinkage,
481 PoisonValue::get(T: MetadataStructType), MDOS.str(), nullptr,
482 GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false);
483 Constant *data = ConstantStruct::get(T: MetadataStructType, V: Initializers);
484 LDSParams.SwLDSMetadata->setInitializer(data);
485 assert(LDSParams.SwLDS);
486 // Set the alignment to MaxAlignment for SwLDS.
487 LDSParams.SwLDS->setAlignment(MaxAlignment);
488 if (LDSParams.SwDynLDS)
489 LDSParams.SwDynLDS->setAlignment(MaxAlignment);
490 GlobalValue::SanitizerMetadata MD;
491 MD.NoAddress = true;
492 LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
493}
494
495void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) {
496 // Fill the corresponding LDS replacement indices for each LDS access
497 // related to this kernel.
498 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
499 SetVector<GlobalVariable *> UniqueLDSGlobals;
500 auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
501 uint32_t &Idx) {
502 for (auto &GV : LDSGlobals) {
503 if (is_contained(Range&: UniqueLDSGlobals, Element: GV))
504 continue;
505 UniqueLDSGlobals.insert(X: GV);
506 LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
507 ++Idx;
508 }
509 };
510 uint32_t Idx = 0;
511 SetVector<GlobalVariable *> SwLDSVector;
512 SwLDSVector.insert(X: LDSParams.SwLDS);
513 PopulateIndices(SwLDSVector, Idx);
514 PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx);
515 PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx);
516 PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx);
517 PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx);
518}
519
520static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
521 Value *Replacement) {
522 // Replace all uses of LDS global in this Function with a Replacement.
523 auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
524 auto *V = U.getUser();
525 if (auto *Inst = dyn_cast<Instruction>(Val: V)) {
526 auto *Func1 = Inst->getFunction();
527 if (Func == Func1)
528 return true;
529 }
530 return false;
531 };
532 GV->replaceUsesWithIf(New: Replacement, ShouldReplace: ReplaceUsesLambda);
533}
534
535void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
536 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
537 GlobalVariable *SwLDS = LDSParams.SwLDS;
538 assert(SwLDS);
539 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
540 assert(SwLDSMetadata);
541 StructType *SwLDSMetadataStructType =
542 cast<StructType>(Val: SwLDSMetadata->getValueType());
543 Type *Int32Ty = IRB.getInt32Ty();
544 auto &IndirectAccess = LDSParams.IndirectAccess;
545 auto &DirectAccess = LDSParams.DirectAccess;
546 // Replace all uses of LDS global in this Function with a Replacement.
547 SetVector<GlobalVariable *> UniqueLDSGlobals;
548 auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
549 for (auto &GV : LDSGlobals) {
550 // Do not generate instructions if LDS access is in non-kernel
551 // i.e indirect-access.
552 if ((IndirectAccess.StaticLDSGlobals.contains(key: GV) ||
553 IndirectAccess.DynamicLDSGlobals.contains(key: GV)) &&
554 (!DirectAccess.StaticLDSGlobals.contains(key: GV) &&
555 !DirectAccess.DynamicLDSGlobals.contains(key: GV)))
556 continue;
557 if (is_contained(Range&: UniqueLDSGlobals, Element: GV))
558 continue;
559 UniqueLDSGlobals.insert(X: GV);
560 auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
561 assert(Indices.size() == 3);
562 Constant *GEPIdx[] = {ConstantInt::get(Ty: Int32Ty, V: Indices[0]),
563 ConstantInt::get(Ty: Int32Ty, V: Indices[1]),
564 ConstantInt::get(Ty: Int32Ty, V: Indices[2])};
565 Constant *GEP = ConstantExpr::getGetElementPtr(
566 Ty: SwLDSMetadataStructType, C: SwLDSMetadata, IdxList: GEPIdx, NW: true);
567 Value *Offset = IRB.CreateLoad(Ty: Int32Ty, Ptr: GEP);
568 Value *BasePlusOffset =
569 IRB.CreateInBoundsGEP(Ty: IRB.getInt8Ty(), Ptr: SwLDS, IdxList: {Offset});
570 LLVM_DEBUG(GV->printAsOperand(dbgs() << "Sw LDS Lowering, Replacing LDS ",
571 false));
572 replacesUsesOfGlobalInFunction(Func, GV, Replacement: BasePlusOffset);
573 }
574 };
575 ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
576 ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
577 ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
578 ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
579}
580
581void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS(
582 Function *Func, Value **CurrMallocSize, Value *HiddenDynLDSSize,
583 SetVector<GlobalVariable *> &DynamicLDSGlobals) {
584 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
585 Type *Int32Ty = IRB.getInt32Ty();
586
587 GlobalVariable *SwLDS = LDSParams.SwLDS;
588 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
589 assert(SwLDS && SwLDSMetadata);
590 StructType *MetadataStructType =
591 cast<StructType>(Val: SwLDSMetadata->getValueType());
592 unsigned MaxAlignment = SwLDS->getAlignment();
593 Value *MaxAlignValue = IRB.getInt32(C: MaxAlignment);
594 Value *MaxAlignValueMinusOne = IRB.getInt32(C: MaxAlignment - 1);
595
596 for (GlobalVariable *DynGV : DynamicLDSGlobals) {
597 auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
598 // Update the Offset metadata.
599 Constant *Index0 = ConstantInt::get(Ty: Int32Ty, V: 0);
600 Constant *Index1 = ConstantInt::get(Ty: Int32Ty, V: Indices[1]);
601
602 Constant *Index2Offset = ConstantInt::get(Ty: Int32Ty, V: 0);
603 auto *GEPForOffset = IRB.CreateInBoundsGEP(
604 Ty: MetadataStructType, Ptr: SwLDSMetadata, IdxList: {Index0, Index1, Index2Offset});
605
606 IRB.CreateStore(Val: *CurrMallocSize, Ptr: GEPForOffset);
607 // Update the size and Aligned Size metadata.
608 Constant *Index2Size = ConstantInt::get(Ty: Int32Ty, V: 1);
609 auto *GEPForSize = IRB.CreateInBoundsGEP(Ty: MetadataStructType, Ptr: SwLDSMetadata,
610 IdxList: {Index0, Index1, Index2Size});
611
612 Value *CurrDynLDSSize = IRB.CreateLoad(Ty: Int32Ty, Ptr: HiddenDynLDSSize);
613 IRB.CreateStore(Val: CurrDynLDSSize, Ptr: GEPForSize);
614 Constant *Index2AlignedSize = ConstantInt::get(Ty: Int32Ty, V: 2);
615 auto *GEPForAlignedSize = IRB.CreateInBoundsGEP(
616 Ty: MetadataStructType, Ptr: SwLDSMetadata, IdxList: {Index0, Index1, Index2AlignedSize});
617
618 Value *AlignedDynLDSSize =
619 IRB.CreateAdd(LHS: CurrDynLDSSize, RHS: MaxAlignValueMinusOne);
620 AlignedDynLDSSize = IRB.CreateUDiv(LHS: AlignedDynLDSSize, RHS: MaxAlignValue);
621 AlignedDynLDSSize = IRB.CreateMul(LHS: AlignedDynLDSSize, RHS: MaxAlignValue);
622 IRB.CreateStore(Val: AlignedDynLDSSize, Ptr: GEPForAlignedSize);
623
624 // Update the Current Malloc Size
625 *CurrMallocSize = IRB.CreateAdd(LHS: *CurrMallocSize, RHS: AlignedDynLDSSize);
626 }
627}
628
629static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore,
630 DISubprogram *SP) {
631 assert(InsertBefore);
632 if (InsertBefore->getDebugLoc())
633 return InsertBefore->getDebugLoc();
634 if (SP)
635 return DILocation::get(Context&: SP->getContext(), Line: SP->getLine(), Column: 1, Scope: SP);
636 return DebugLoc();
637}
638
639void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
640 Function *Func, SetVector<Instruction *> &LDSInstructions) {
641 for (BasicBlock &BB : *Func) {
642 for (Instruction &Inst : BB) {
643 if (LoadInst *LI = dyn_cast<LoadInst>(Val: &Inst)) {
644 if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
645 LDSInstructions.insert(X: &Inst);
646 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: &Inst)) {
647 if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
648 LDSInstructions.insert(X: &Inst);
649 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: &Inst)) {
650 if (RMW->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
651 LDSInstructions.insert(X: &Inst);
652 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Val: &Inst)) {
653 if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
654 LDSInstructions.insert(X: &Inst);
655 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Val: &Inst)) {
656 if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
657 ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS)
658 LDSInstructions.insert(X: &Inst);
659 } else
660 continue;
661 }
662 }
663}
664
665Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr,
666 Value *LDSPtr) {
667 assert(LDSPtr && "Invalid LDS pointer operand");
668 Type *LDSPtrType = LDSPtr->getType();
669 LLVMContext &Ctx = M.getContext();
670 const DataLayout &DL = M.getDataLayout();
671 Type *IntTy = DL.getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
672 if (auto *VecPtrTy = dyn_cast<VectorType>(Val: LDSPtrType)) {
673 // Handle vector of pointers
674 ElementCount NumElements = VecPtrTy->getElementCount();
675 IntTy = VectorType::get(ElementType: IntTy, EC: NumElements);
676 }
677 Value *GepIndex = IRB.CreatePtrToInt(V: LDSPtr, DestTy: IntTy);
678 return IRB.CreateInBoundsGEP(Ty: IRB.getInt8Ty(), Ptr: LoadMallocPtr, IdxList: {GepIndex});
679}
680
681void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
682 Function *Func, Value *LoadMallocPtr,
683 SetVector<Instruction *> &LDSInstructions) {
684 LLVM_DEBUG(dbgs() << "Translating LDS memory operations to global memory : "
685 << Func->getName());
686 for (Instruction *Inst : LDSInstructions) {
687 IRB.SetInsertPoint(Inst);
688 if (LoadInst *LI = dyn_cast<LoadInst>(Val: Inst)) {
689 Value *LIOperand = LI->getPointerOperand();
690 Value *Replacement =
691 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: LIOperand);
692 LoadInst *NewLI = IRB.CreateAlignedLoad(Ty: LI->getType(), Ptr: Replacement,
693 Align: LI->getAlign(), isVolatile: LI->isVolatile());
694 NewLI->setAtomic(Ordering: LI->getOrdering(), SSID: LI->getSyncScopeID());
695 AsanInfo.Instructions.insert(X: NewLI);
696 LI->replaceAllUsesWith(V: NewLI);
697 LI->eraseFromParent();
698 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: Inst)) {
699 Value *SIOperand = SI->getPointerOperand();
700 Value *Replacement =
701 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: SIOperand);
702 StoreInst *NewSI = IRB.CreateAlignedStore(
703 Val: SI->getValueOperand(), Ptr: Replacement, Align: SI->getAlign(), isVolatile: SI->isVolatile());
704 NewSI->setAtomic(Ordering: SI->getOrdering(), SSID: SI->getSyncScopeID());
705 AsanInfo.Instructions.insert(X: NewSI);
706 SI->replaceAllUsesWith(V: NewSI);
707 SI->eraseFromParent();
708 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: Inst)) {
709 Value *RMWPtrOperand = RMW->getPointerOperand();
710 Value *RMWValOperand = RMW->getValOperand();
711 Value *Replacement =
712 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: RMWPtrOperand);
713 AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW(
714 Op: RMW->getOperation(), Ptr: Replacement, Val: RMWValOperand, Align: RMW->getAlign(),
715 Ordering: RMW->getOrdering(), SSID: RMW->getSyncScopeID());
716 NewRMW->setVolatile(RMW->isVolatile());
717 AsanInfo.Instructions.insert(X: NewRMW);
718 RMW->replaceAllUsesWith(V: NewRMW);
719 RMW->eraseFromParent();
720 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Val: Inst)) {
721 Value *XCHGPtrOperand = XCHG->getPointerOperand();
722 Value *Replacement =
723 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: XCHGPtrOperand);
724 AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg(
725 Ptr: Replacement, Cmp: XCHG->getCompareOperand(), New: XCHG->getNewValOperand(),
726 Align: XCHG->getAlign(), SuccessOrdering: XCHG->getSuccessOrdering(),
727 FailureOrdering: XCHG->getFailureOrdering(), SSID: XCHG->getSyncScopeID());
728 NewXCHG->setVolatile(XCHG->isVolatile());
729 AsanInfo.Instructions.insert(X: NewXCHG);
730 XCHG->replaceAllUsesWith(V: NewXCHG);
731 XCHG->eraseFromParent();
732 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Val: Inst)) {
733 Value *AIOperand = ASC->getPointerOperand();
734 Value *Replacement =
735 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: AIOperand);
736 Value *NewAI = IRB.CreateAddrSpaceCast(V: Replacement, DestTy: ASC->getType());
737 // Note: No need to add the instruction to AsanInfo instructions to be
738 // instrumented list. FLAT_ADDRESS ptr would have been already
739 // instrumented by asan pass prior to this pass.
740 ASC->replaceAllUsesWith(V: NewAI);
741 ASC->eraseFromParent();
742 } else
743 report_fatal_error(reason: "Unimplemented LDS lowering instruction");
744 }
745}
746
747void AMDGPUSwLowerLDS::poisonRedzones(Function *Func, Value *MallocPtr) {
748 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
749 Type *Int64Ty = IRB.getInt64Ty();
750 Type *VoidTy = IRB.getVoidTy();
751 FunctionCallee AsanPoisonRegion = M.getOrInsertFunction(
752 Name: "__asan_poison_region",
753 T: FunctionType::get(Result: VoidTy, Params: {Int64Ty, Int64Ty}, isVarArg: false));
754
755 auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector;
756 size_t VecSize = RedzonesVec.size();
757 for (unsigned i = 0; i < VecSize; i++) {
758 auto &RedzonePair = RedzonesVec[i];
759 uint64_t RedzoneOffset = RedzonePair.first;
760 uint64_t RedzoneSize = RedzonePair.second;
761 Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP(
762 Ty: IRB.getInt8Ty(), Ptr: MallocPtr, IdxList: {IRB.getInt64(C: RedzoneOffset)});
763 Value *RedzoneAddress = IRB.CreatePtrToInt(V: RedzoneAddrOffset, DestTy: Int64Ty);
764 IRB.CreateCall(Callee: AsanPoisonRegion,
765 Args: {RedzoneAddress, IRB.getInt64(C: RedzoneSize)});
766 }
767}
768
769void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
770 DomTreeUpdater &DTU) {
771 LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for : " << Func->getName());
772 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
773 auto &Ctx = M.getContext();
774 auto *PrevEntryBlock = &Func->getEntryBlock();
775 SetVector<Instruction *> LDSInstructions;
776 getLDSMemoryInstructions(Func, LDSInstructions);
777
778 // Create malloc block.
779 auto *MallocBlock = BasicBlock::Create(Context&: Ctx, Name: "Malloc", Parent: Func, InsertBefore: PrevEntryBlock);
780
781 // Create WIdBlock block which has instructions related to selection of
782 // {0,0,0} indiex work item in the work group.
783 auto *WIdBlock = BasicBlock::Create(Context&: Ctx, Name: "WId", Parent: Func, InsertBefore: MallocBlock);
784 IRB.SetInsertPoint(TheBB: WIdBlock, IP: WIdBlock->begin());
785 DebugLoc FirstDL =
786 getOrCreateDebugLoc(InsertBefore: &*PrevEntryBlock->begin(), SP: Func->getSubprogram());
787 IRB.SetCurrentDebugLocation(FirstDL);
788 Value *WIdx = IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_workitem_id_x, Args: {});
789 Value *WIdy = IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_workitem_id_y, Args: {});
790 Value *WIdz = IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_workitem_id_z, Args: {});
791 Value *XYOr = IRB.CreateOr(LHS: WIdx, RHS: WIdy);
792 Value *XYZOr = IRB.CreateOr(LHS: XYOr, RHS: WIdz);
793 Value *WIdzCond = IRB.CreateICmpEQ(LHS: XYZOr, RHS: IRB.getInt32(C: 0));
794
795 // All work items will branch to PrevEntryBlock except {0,0,0} index
796 // work item which will branch to malloc block.
797 IRB.CreateCondBr(Cond: WIdzCond, True: MallocBlock, False: PrevEntryBlock);
798
799 // Malloc block
800 IRB.SetInsertPoint(TheBB: MallocBlock, IP: MallocBlock->begin());
801
802 // If Dynamic LDS globals are accessed by the kernel,
803 // Get the size of dyn lds from hidden dyn_lds_size kernel arg.
804 // Update the corresponding metadata global entries for this dyn lds global.
805 GlobalVariable *SwLDS = LDSParams.SwLDS;
806 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
807 assert(SwLDS && SwLDSMetadata);
808 StructType *MetadataStructType =
809 cast<StructType>(Val: SwLDSMetadata->getValueType());
810 uint32_t MallocSize = 0;
811 Value *CurrMallocSize;
812 Type *Int32Ty = IRB.getInt32Ty();
813 Type *Int64Ty = IRB.getInt64Ty();
814
815 SetVector<GlobalVariable *> UniqueLDSGlobals;
816 auto GetUniqueLDSGlobals = [&](SetVector<GlobalVariable *> &LDSGlobals) {
817 for (auto &GV : LDSGlobals) {
818 if (is_contained(Range&: UniqueLDSGlobals, Element: GV))
819 continue;
820 UniqueLDSGlobals.insert(X: GV);
821 }
822 };
823
824 GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals);
825 GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals);
826 unsigned NumStaticLDS = 1 + UniqueLDSGlobals.size();
827 UniqueLDSGlobals.clear();
828
829 if (NumStaticLDS) {
830 auto *GEPForEndStaticLDSOffset =
831 IRB.CreateInBoundsGEP(Ty: MetadataStructType, Ptr: SwLDSMetadata,
832 IdxList: {ConstantInt::get(Ty: Int32Ty, V: 0),
833 ConstantInt::get(Ty: Int32Ty, V: NumStaticLDS - 1),
834 ConstantInt::get(Ty: Int32Ty, V: 0)});
835
836 auto *GEPForEndStaticLDSSize =
837 IRB.CreateInBoundsGEP(Ty: MetadataStructType, Ptr: SwLDSMetadata,
838 IdxList: {ConstantInt::get(Ty: Int32Ty, V: 0),
839 ConstantInt::get(Ty: Int32Ty, V: NumStaticLDS - 1),
840 ConstantInt::get(Ty: Int32Ty, V: 2)});
841
842 Value *EndStaticLDSOffset =
843 IRB.CreateLoad(Ty: Int32Ty, Ptr: GEPForEndStaticLDSOffset);
844 Value *EndStaticLDSSize = IRB.CreateLoad(Ty: Int32Ty, Ptr: GEPForEndStaticLDSSize);
845 CurrMallocSize = IRB.CreateAdd(LHS: EndStaticLDSOffset, RHS: EndStaticLDSSize);
846 } else
847 CurrMallocSize = IRB.getInt32(C: MallocSize);
848
849 if (LDSParams.SwDynLDS) {
850 if (!(AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5))
851 report_fatal_error(
852 reason: "Dynamic LDS size query is only supported for CO V5 and later.");
853 // Get size from hidden dyn_lds_size argument of kernel
854 Value *ImplicitArg =
855 IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_implicitarg_ptr, Args: {});
856 Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
857 Ty: ImplicitArg->getType(), Ptr: ImplicitArg,
858 IdxList: {ConstantInt::get(Ty: Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)});
859 UniqueLDSGlobals.clear();
860 GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals);
861 GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals);
862 updateMallocSizeForDynamicLDS(Func, CurrMallocSize: &CurrMallocSize, HiddenDynLDSSize,
863 DynamicLDSGlobals&: UniqueLDSGlobals);
864 }
865
866 CurrMallocSize = IRB.CreateZExt(V: CurrMallocSize, DestTy: Int64Ty);
867
868 // Create a call to malloc function which does device global memory allocation
869 // with size equals to all LDS global accesses size in this kernel.
870 Value *ReturnAddress =
871 IRB.CreateIntrinsic(ID: Intrinsic::returnaddress, Args: {IRB.getInt32(C: 0)});
872 FunctionCallee MallocFunc = M.getOrInsertFunction(
873 Name: StringRef("__asan_malloc_impl"),
874 T: FunctionType::get(Result: Int64Ty, Params: {Int64Ty, Int64Ty}, isVarArg: false));
875 Value *RAPtrToInt = IRB.CreatePtrToInt(V: ReturnAddress, DestTy: Int64Ty);
876 Value *MallocCall = IRB.CreateCall(Callee: MallocFunc, Args: {CurrMallocSize, RAPtrToInt});
877
878 Value *MallocPtr =
879 IRB.CreateIntToPtr(V: MallocCall, DestTy: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS));
880
881 // Create store of malloc to new global
882 IRB.CreateStore(Val: MallocPtr, Ptr: SwLDS);
883
884 // Create calls to __asan_poison_region to poison redzones.
885 poisonRedzones(Func, MallocPtr);
886
887 // Create branch to PrevEntryBlock
888 IRB.CreateBr(Dest: PrevEntryBlock);
889
890 // Create wave-group barrier at the starting of Previous entry block
891 Type *Int1Ty = IRB.getInt1Ty();
892 IRB.SetInsertPoint(TheBB: PrevEntryBlock, IP: PrevEntryBlock->begin());
893 auto *XYZCondPhi = IRB.CreatePHI(Ty: Int1Ty, NumReservedValues: 2, Name: "xyzCond");
894 XYZCondPhi->addIncoming(V: IRB.getInt1(V: 0), BB: WIdBlock);
895 XYZCondPhi->addIncoming(V: IRB.getInt1(V: 1), BB: MallocBlock);
896
897 IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_s_barrier, Args: {});
898
899 // Load malloc pointer from Sw LDS.
900 Value *LoadMallocPtr =
901 IRB.CreateLoad(Ty: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), Ptr: SwLDS);
902
903 // Replace All uses of LDS globals with new LDS pointers.
904 replaceKernelLDSAccesses(Func);
905
906 // Replace Memory Operations on LDS with corresponding
907 // global memory pointers.
908 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
909 LDSInstructions);
910
911 auto *CondFreeBlock = BasicBlock::Create(Context&: Ctx, Name: "CondFree", Parent: Func);
912 auto *FreeBlock = BasicBlock::Create(Context&: Ctx, Name: "Free", Parent: Func);
913 auto *EndBlock = BasicBlock::Create(Context&: Ctx, Name: "End", Parent: Func);
914 for (BasicBlock &BB : *Func) {
915 if (!BB.empty()) {
916 if (ReturnInst *RI = dyn_cast<ReturnInst>(Val: &BB.back())) {
917 RI->eraseFromParent();
918 IRB.SetInsertPoint(TheBB: &BB, IP: BB.end());
919 IRB.CreateBr(Dest: CondFreeBlock);
920 }
921 }
922 }
923
924 // Cond Free Block
925 IRB.SetInsertPoint(TheBB: CondFreeBlock, IP: CondFreeBlock->begin());
926 IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_s_barrier, Args: {});
927 IRB.CreateCondBr(Cond: XYZCondPhi, True: FreeBlock, False: EndBlock);
928
929 // Free Block
930 IRB.SetInsertPoint(TheBB: FreeBlock, IP: FreeBlock->begin());
931
932 // Free the previously allocate device global memory.
933 FunctionCallee AsanFreeFunc = M.getOrInsertFunction(
934 Name: StringRef("__asan_free_impl"),
935 T: FunctionType::get(Result: IRB.getVoidTy(), Params: {Int64Ty, Int64Ty}, isVarArg: false));
936 Value *ReturnAddr =
937 IRB.CreateIntrinsic(ID: Intrinsic::returnaddress, Args: IRB.getInt32(C: 0));
938 Value *RAPToInt = IRB.CreatePtrToInt(V: ReturnAddr, DestTy: Int64Ty);
939 Value *MallocPtrToInt = IRB.CreatePtrToInt(V: LoadMallocPtr, DestTy: Int64Ty);
940 IRB.CreateCall(Callee: AsanFreeFunc, Args: {MallocPtrToInt, RAPToInt});
941
942 IRB.CreateBr(Dest: EndBlock);
943
944 // End Block
945 IRB.SetInsertPoint(TheBB: EndBlock, IP: EndBlock->begin());
946 IRB.CreateRetVoid();
947 // Update the DomTree with corresponding links to basic blocks.
948 DTU.applyUpdates(Updates: {{DominatorTree::Insert, WIdBlock, MallocBlock},
949 {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
950 {DominatorTree::Insert, CondFreeBlock, FreeBlock},
951 {DominatorTree::Insert, FreeBlock, EndBlock}});
952}
953
954Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
955 Function *Func, SetVector<GlobalVariable *> &Variables) {
956 Type *Int32Ty = IRB.getInt32Ty();
957 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
958
959 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
960 assert(SwLDSMetadata);
961 auto *SwLDSMetadataStructType =
962 cast<StructType>(Val: SwLDSMetadata->getValueType());
963 ArrayType *KernelOffsetsType =
964 ArrayType::get(ElementType: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), NumElements: Variables.size());
965
966 SmallVector<Constant *> Elements;
967 for (auto *GV : Variables) {
968 auto It = LDSParams.LDSToReplacementIndicesMap.find(Val: GV);
969 if (It == LDSParams.LDSToReplacementIndicesMap.end()) {
970 Elements.push_back(
971 Elt: PoisonValue::get(T: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS)));
972 continue;
973 }
974 auto &Indices = It->second;
975 Constant *GEPIdx[] = {ConstantInt::get(Ty: Int32Ty, V: Indices[0]),
976 ConstantInt::get(Ty: Int32Ty, V: Indices[1]),
977 ConstantInt::get(Ty: Int32Ty, V: Indices[2])};
978 Constant *GEP = ConstantExpr::getGetElementPtr(Ty: SwLDSMetadataStructType,
979 C: SwLDSMetadata, IdxList: GEPIdx, NW: true);
980 Elements.push_back(Elt: GEP);
981 }
982 return ConstantArray::get(T: KernelOffsetsType, V: Elements);
983}
984
985void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
986 NonKernelLDSParameters &NKLDSParams) {
987 // Base table will have single row, with elements of the row
988 // placed as per kernel ID. Each element in the row corresponds
989 // to addresss of "SW LDS" global of the kernel.
990 auto &Kernels = NKLDSParams.OrderedKernels;
991 if (Kernels.empty())
992 return;
993 const size_t NumberKernels = Kernels.size();
994 ArrayType *AllKernelsOffsetsType =
995 ArrayType::get(ElementType: IRB.getPtrTy(AddrSpace: AMDGPUAS::LOCAL_ADDRESS), NumElements: NumberKernels);
996 std::vector<Constant *> OverallConstantExprElts(NumberKernels);
997 for (size_t i = 0; i < NumberKernels; i++) {
998 Function *Func = Kernels[i];
999 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1000 OverallConstantExprElts[i] = LDSParams.SwLDS;
1001 }
1002 Constant *init =
1003 ConstantArray::get(T: AllKernelsOffsetsType, V: OverallConstantExprElts);
1004 NKLDSParams.LDSBaseTable = new GlobalVariable(
1005 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
1006 "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
1007 AMDGPUAS::GLOBAL_ADDRESS);
1008 GlobalValue::SanitizerMetadata MD;
1009 MD.NoAddress = true;
1010 NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD);
1011}
1012
1013void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
1014 NonKernelLDSParameters &NKLDSParams) {
1015 // Offset table will have multiple rows and columns.
1016 // Rows are assumed to be from 0 to (n-1). n is total number
1017 // of kernels accessing the LDS through non-kernels.
1018 // Each row will have m elements. m is the total number of
1019 // unique LDS globals accessed by non-kernels.
1020 // Each element in the row correspond to the address of
1021 // the replacement of LDS global done by that particular kernel.
1022 auto &Variables = NKLDSParams.OrdereLDSGlobals;
1023 auto &Kernels = NKLDSParams.OrderedKernels;
1024 if (Variables.empty() || Kernels.empty())
1025 return;
1026 const size_t NumberVariables = Variables.size();
1027 const size_t NumberKernels = Kernels.size();
1028
1029 ArrayType *KernelOffsetsType =
1030 ArrayType::get(ElementType: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), NumElements: NumberVariables);
1031
1032 ArrayType *AllKernelsOffsetsType =
1033 ArrayType::get(ElementType: KernelOffsetsType, NumElements: NumberKernels);
1034 std::vector<Constant *> overallConstantExprElts(NumberKernels);
1035 for (size_t i = 0; i < NumberKernels; i++) {
1036 Function *Func = Kernels[i];
1037 overallConstantExprElts[i] =
1038 getAddressesOfVariablesInKernel(Func, Variables);
1039 }
1040 Constant *Init =
1041 ConstantArray::get(T: AllKernelsOffsetsType, V: overallConstantExprElts);
1042 NKLDSParams.LDSOffsetTable = new GlobalVariable(
1043 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
1044 "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
1045 AMDGPUAS::GLOBAL_ADDRESS);
1046 GlobalValue::SanitizerMetadata MD;
1047 MD.NoAddress = true;
1048 NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD);
1049}
1050
1051void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
1052 Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
1053 NonKernelLDSParameters &NKLDSParams) {
1054 // Replace LDS access in non-kernel with replacement queried from
1055 // Base table and offset from offset table.
1056 LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for : "
1057 << Func->getName());
1058 auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
1059 IRB.SetInsertPoint(InsertAt);
1060
1061 // Get LDS memory instructions.
1062 SetVector<Instruction *> LDSInstructions;
1063 getLDSMemoryInstructions(Func, LDSInstructions);
1064
1065 auto *KernelId = IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_lds_kernel_id, Args: {});
1066 GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable;
1067 GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable;
1068 auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
1069 Value *BaseGEP = IRB.CreateInBoundsGEP(
1070 Ty: LDSBaseTable->getValueType(), Ptr: LDSBaseTable, IdxList: {IRB.getInt32(C: 0), KernelId});
1071 Value *BaseLoad =
1072 IRB.CreateLoad(Ty: IRB.getPtrTy(AddrSpace: AMDGPUAS::LOCAL_ADDRESS), Ptr: BaseGEP);
1073 Value *LoadMallocPtr =
1074 IRB.CreateLoad(Ty: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), Ptr: BaseLoad);
1075
1076 for (GlobalVariable *GV : LDSGlobals) {
1077 const auto *GVIt = llvm::find(Range&: OrdereLDSGlobals, Val: GV);
1078 assert(GVIt != OrdereLDSGlobals.end());
1079 uint32_t GVOffset = std::distance(first: OrdereLDSGlobals.begin(), last: GVIt);
1080
1081 Value *OffsetGEP = IRB.CreateInBoundsGEP(
1082 Ty: LDSOffsetTable->getValueType(), Ptr: LDSOffsetTable,
1083 IdxList: {IRB.getInt32(C: 0), KernelId, IRB.getInt32(C: GVOffset)});
1084 Value *OffsetLoad =
1085 IRB.CreateLoad(Ty: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), Ptr: OffsetGEP);
1086 Value *Offset = IRB.CreateLoad(Ty: IRB.getInt32Ty(), Ptr: OffsetLoad);
1087 Value *BasePlusOffset =
1088 IRB.CreateInBoundsGEP(Ty: IRB.getInt8Ty(), Ptr: BaseLoad, IdxList: {Offset});
1089 LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for "
1090 << GV->getName());
1091 replacesUsesOfGlobalInFunction(Func, GV, Replacement: BasePlusOffset);
1092 }
1093 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
1094 LDSInstructions);
1095}
1096
1097static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
1098 // Sort Static, dynamic LDS globals which are either
1099 // direct or indirect access on basis of name.
1100 auto &DirectAccess = LDSParams.DirectAccess;
1101 auto &IndirectAccess = LDSParams.IndirectAccess;
1102 LDSParams.DirectAccess.StaticLDSGlobals = sortByName(
1103 V: std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(),
1104 DirectAccess.StaticLDSGlobals.end()));
1105 LDSParams.DirectAccess.DynamicLDSGlobals = sortByName(
1106 V: std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(),
1107 DirectAccess.DynamicLDSGlobals.end()));
1108 LDSParams.IndirectAccess.StaticLDSGlobals = sortByName(
1109 V: std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(),
1110 IndirectAccess.StaticLDSGlobals.end()));
1111 LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName(
1112 V: std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(),
1113 IndirectAccess.DynamicLDSGlobals.end()));
1114}
1115
1116void AMDGPUSwLowerLDS::initAsanInfo() {
1117 // Get Shadow mapping scale and offset.
1118 unsigned LongSize =
1119 M.getDataLayout().getPointerSizeInBits(AS: AMDGPUAS::GLOBAL_ADDRESS);
1120 uint64_t Offset;
1121 int Scale;
1122 bool OrShadowOffset;
1123 llvm::getAddressSanitizerParams(TargetTriple: AMDGPUTM.getTargetTriple(), LongSize, IsKasan: false,
1124 ShadowBase: &Offset, MappingScale: &Scale, OrShadowOffset: &OrShadowOffset);
1125 AsanInfo.Scale = Scale;
1126 AsanInfo.Offset = Offset;
1127}
1128
1129static bool hasFnWithSanitizeAddressAttr(FunctionVariableMap &LDSAccesses) {
1130 for (auto &K : LDSAccesses) {
1131 Function *F = K.first;
1132 if (!F)
1133 continue;
1134 if (F->hasFnAttribute(Kind: Attribute::SanitizeAddress))
1135 return true;
1136 }
1137 return false;
1138}
1139
1140bool AMDGPUSwLowerLDS::run() {
1141 bool Changed = false;
1142
1143 CallGraph CG = CallGraph(M);
1144
1145 Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
1146
1147 // Get all the direct and indirect access of LDS for all the kernels.
1148 LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
1149
1150 // Flag to decide whether to lower all the LDS accesses
1151 // based on sanitize_address attribute.
1152 bool LowerAllLDS = hasFnWithSanitizeAddressAttr(LDSAccesses&: LDSUsesInfo.direct_access) ||
1153 hasFnWithSanitizeAddressAttr(LDSAccesses&: LDSUsesInfo.indirect_access);
1154
1155 if (!LowerAllLDS)
1156 return Changed;
1157
1158 // Utility to group LDS access into direct, indirect, static and dynamic.
1159 auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
1160 bool DirectAccess) {
1161 for (auto &K : LDSAccesses) {
1162 Function *F = K.first;
1163 if (!F || K.second.empty())
1164 continue;
1165
1166 assert(isKernel(*F));
1167
1168 // Only inserts if key isn't already in the map.
1169 FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
1170 KV: {F, KernelLDSParameters()});
1171
1172 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[F];
1173 if (!DirectAccess)
1174 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(X: F);
1175 for (GlobalVariable *GV : K.second) {
1176 if (!DirectAccess) {
1177 if (AMDGPU::isDynamicLDS(GV: *GV))
1178 LDSParams.IndirectAccess.DynamicLDSGlobals.insert(X: GV);
1179 else
1180 LDSParams.IndirectAccess.StaticLDSGlobals.insert(X: GV);
1181 FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(X: GV);
1182 } else {
1183 if (AMDGPU::isDynamicLDS(GV: *GV))
1184 LDSParams.DirectAccess.DynamicLDSGlobals.insert(X: GV);
1185 else
1186 LDSParams.DirectAccess.StaticLDSGlobals.insert(X: GV);
1187 }
1188 }
1189 }
1190 };
1191
1192 PopulateKernelStaticDynamicLDS(LDSUsesInfo.direct_access, true);
1193 PopulateKernelStaticDynamicLDS(LDSUsesInfo.indirect_access, false);
1194
1195 // Get address sanitizer scale.
1196 initAsanInfo();
1197
1198 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
1199 Function *Func = K.first;
1200 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1201 if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
1202 LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
1203 LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
1204 LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
1205 Changed = false;
1206 } else {
1207 removeFnAttrFromReachable(
1208 CG, KernelRoot: Func,
1209 FnAttrs: {"amdgpu-no-workitem-id-x", "amdgpu-no-workitem-id-y",
1210 "amdgpu-no-workitem-id-z", "amdgpu-no-heap-ptr"});
1211 if (!LDSParams.IndirectAccess.StaticLDSGlobals.empty() ||
1212 !LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
1213 removeFnAttrFromReachable(CG, KernelRoot: Func, FnAttrs: {"amdgpu-no-lds-kernel-id"});
1214 reorderStaticDynamicIndirectLDSSet(LDSParams);
1215 buildSwLDSGlobal(Func);
1216 buildSwDynLDSGlobal(Func);
1217 populateSwMetadataGlobal(Func);
1218 populateSwLDSAttributeAndMetadata(Func);
1219 populateLDSToReplacementIndicesMap(Func);
1220 DomTreeUpdater DTU(DTCallback(*Func),
1221 DomTreeUpdater::UpdateStrategy::Lazy);
1222 lowerKernelLDSAccesses(Func, DTU);
1223 Changed = true;
1224 }
1225 }
1226
1227 // Get the Uses of LDS from non-kernels.
1228 getUsesOfLDSByNonKernels();
1229
1230 // Get non-kernels with LDS ptr as argument and called by kernels.
1231 getNonKernelsWithLDSArguments(CG);
1232
1233 // Lower LDS accesses in non-kernels.
1234 if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() ||
1235 !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) {
1236 NonKernelLDSParameters NKLDSParams;
1237 NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
1238 Kernels&: FuncLDSAccessInfo.KernelsWithIndirectLDSAccess);
1239 NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals(
1240 Variables&: FuncLDSAccessInfo.AllNonKernelLDSAccess);
1241 buildNonKernelLDSBaseTable(NKLDSParams);
1242 buildNonKernelLDSOffsetTable(NKLDSParams);
1243 for (auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) {
1244 Function *Func = K.first;
1245 DenseSet<GlobalVariable *> &LDSGlobals = K.second;
1246 SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName(
1247 V: std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end()));
1248 lowerNonKernelLDSAccesses(Func, LDSGlobals&: OrderedLDSGlobals, NKLDSParams);
1249 }
1250 for (Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) {
1251 auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap;
1252 if (K.contains(Val: Func))
1253 continue;
1254 SetVector<llvm::GlobalVariable *> Vec;
1255 lowerNonKernelLDSAccesses(Func, LDSGlobals&: Vec, NKLDSParams);
1256 }
1257 Changed = true;
1258 }
1259
1260 if (!Changed)
1261 return Changed;
1262
1263 for (auto &GV : make_early_inc_range(Range: M.globals())) {
1264 if (AMDGPU::isLDSVariableToLower(GV)) {
1265 // probably want to remove from used lists
1266 GV.removeDeadConstantUsers();
1267 if (GV.use_empty())
1268 GV.eraseFromParent();
1269 }
1270 }
1271
1272 if (AsanInstrumentLDS) {
1273 SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
1274 for (Instruction *Inst : AsanInfo.Instructions) {
1275 SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
1276 getInterestingMemoryOperands(M, I: Inst, Interesting&: InterestingOperands);
1277 llvm::append_range(C&: OperandsToInstrument, R&: InterestingOperands);
1278 }
1279 for (auto &Operand : OperandsToInstrument) {
1280 Value *Addr = Operand.getPtr();
1281 instrumentAddress(M, IRB, OrigIns: Operand.getInsn(), InsertBefore: Operand.getInsn(), Addr,
1282 Alignment: Operand.Alignment.valueOrOne(), TypeStoreSize: Operand.TypeStoreSize,
1283 IsWrite: Operand.IsWrite, SizeArgument: nullptr, UseCalls: false, Recover: false, Scale: AsanInfo.Scale,
1284 Offset: AsanInfo.Offset);
1285 Changed = true;
1286 }
1287 }
1288
1289 return Changed;
1290}
1291
1292class AMDGPUSwLowerLDSLegacy : public ModulePass {
1293public:
1294 const AMDGPUTargetMachine *AMDGPUTM;
1295 static char ID;
1296 AMDGPUSwLowerLDSLegacy(const AMDGPUTargetMachine *TM)
1297 : ModulePass(ID), AMDGPUTM(TM) {}
1298 bool runOnModule(Module &M) override;
1299 void getAnalysisUsage(AnalysisUsage &AU) const override {
1300 AU.addPreserved<DominatorTreeWrapperPass>();
1301 }
1302};
1303} // namespace
1304
// Storage for the legacy pass identifier, also exposed to the rest of LLVM as
// llvm::AMDGPUSwLowerLDSLegacyPassID.
1305char AMDGPUSwLowerLDSLegacy::ID = 0;
1306char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID;
1307
// Register the legacy pass under the name "amdgpu-sw-lower-lds" and declare
// TargetPassConfig as a pass it depends on.
1308INITIALIZE_PASS_BEGIN(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1309 "AMDGPU Software lowering of LDS", false, false)
1310INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
1311INITIALIZE_PASS_END(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1312 "AMDGPU Software lowering of LDS", false, false)
1313
1314bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) {
1315 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1316 // instrumented the IR. Return early if the flag is not present.
1317 if (!M.getModuleFlag(Key: "nosanitize_address"))
1318 return false;
1319 DominatorTreeWrapperPass *const DTW =
1320 getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1321 auto DTCallback = [&DTW](Function &F) -> DominatorTree * {
1322 return DTW ? &DTW->getDomTree() : nullptr;
1323 };
1324 if (!AMDGPUTM) {
1325 auto &TPC = getAnalysis<TargetPassConfig>();
1326 AMDGPUTM = &TPC.getTM<AMDGPUTargetMachine>();
1327 }
1328 AMDGPUSwLowerLDS SwLowerLDSImpl(M, *AMDGPUTM, DTCallback);
1329 bool IsChanged = SwLowerLDSImpl.run();
1330 return IsChanged;
1331}
1332
1333ModulePass *
1334llvm::createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM) {
1335 return new AMDGPUSwLowerLDSLegacy(TM);
1336}
1337
1338PreservedAnalyses AMDGPUSwLowerLDSPass::run(Module &M,
1339 ModuleAnalysisManager &AM) {
1340 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1341 // instrumented the IR. Return early if the flag is not present.
1342 if (!M.getModuleFlag(Key: "nosanitize_address"))
1343 return PreservedAnalyses::all();
1344 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
1345 auto DTCallback = [&FAM](Function &F) -> DominatorTree * {
1346 return &FAM.getResult<DominatorTreeAnalysis>(IR&: F);
1347 };
1348 AMDGPUSwLowerLDS SwLowerLDSImpl(M, TM, DTCallback);
1349 bool IsChanged = SwLowerLDSImpl.run();
1350 if (!IsChanged)
1351 return PreservedAnalyses::all();
1352
1353 PreservedAnalyses PA;
1354 PA.preserve<DominatorTreeAnalysis>();
1355 return PA;
1356}
1357