1//===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass lowers the local data store, LDS, uses in kernel and non-kernel
10// functions in module to use dynamically allocated global memory.
11// Packed LDS Layout is emulated in the global memory.
12// The lowered memory instructions from LDS to global memory are then
13// instrumented for address sanitizer, to catch addressing errors.
// This pass works only when address sanitizer has been enabled and has
// instrumented the IR. It identifies that the IR has been instrumented using
// the "nosanitize_address" module flag.
17//
18// Replacement of Kernel LDS accesses:
// For a kernel, an LDS access can be static or dynamic, and either kind can
// be direct (accessed within the kernel) or indirect (accessed through
// non-kernels).
21// All these LDS accesses corresponding to kernel will be packed together,
22// where all static LDS accesses will be allocated first and then dynamic
23// LDS follows. The total size with alignment is calculated. A new LDS global
24// will be created for the kernel called "SW LDS" and it will have the
25// attribute "amdgpu-lds-size" attached with value of the size calculated.
// All the LDS accesses in the module will be replaced by a GEP with an offset
// into the "SW LDS".
// A new "llvm.amdgcn.<kernel>.dynlds" is created per kernel accessing
// the dynamic LDS. This will be marked used by the kernel and will have
// MD_absolute_symbol metadata set to the total static LDS size, since dynamic
// LDS allocation starts after all static LDS allocation.
32//
33// A device global memory equal to the total LDS size will be allocated.
34// At the prologue of the kernel, a single work-item from the
35// work-group, does a "malloc" and stores the pointer of the
36// allocation in "SW LDS".
37//
38// To store the offsets corresponding to all LDS accesses, another global
39// variable is created which will be called "SW LDS metadata" in this pass.
40// - SW LDS Global:
41// It is LDS global of ptr type with name
42// "llvm.amdgcn.sw.lds.<kernel-name>".
43// - Metadata Global:
44// It is of struct type, with n members. n equals the number of LDS
45// globals accessed by the kernel(direct and indirect). Each member of
46// struct is another struct of type {i32, i32, i32}. First member
47// corresponds to offset, second member corresponds to size of LDS global
48// being replaced and third represents the total aligned size. It will
49// have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
50// an initializer with static LDS related offsets and sizes initialized.
51// But for dynamic LDS related entries, offsets will be initialized to
52// previous static LDS allocation end offset. Sizes for them will be zero
53// initially. These dynamic LDS offset and size values will be updated
54// within the kernel, since kernel can read the dynamic LDS size
55// allocation done at runtime with query to "hidden_dynamic_lds_size"
56// hidden kernel argument.
57//
58// At the epilogue of kernel, allocated memory would be made free by the same
59// single work-item.
60//
61// Replacement of non-kernel LDS accesses:
62// Multiple kernels can access the same non-kernel function.
63// All the kernels accessing LDS through non-kernels are sorted and
64// assigned a kernel-id. All the LDS globals accessed by non-kernels
65// are sorted. This information is used to build two tables:
66// - Base table:
67// Base table will have single row, with elements of the row
68// placed as per kernel ID. Each element in the row corresponds
69// to ptr of "SW LDS" variable created for that kernel.
70// - Offset table:
71// Offset table will have multiple rows and columns.
72// Rows are assumed to be from 0 to (n-1). n is total number
73// of kernels accessing the LDS through non-kernels.
74// Each row will have m elements. m is the total number of
75// unique LDS globals accessed by all non-kernels.
// Each element in the row corresponds to the ptr of
// the replacement of LDS global done by that particular kernel.
// An LDS variable in a non-kernel will be replaced based on the information
// from the base and offset tables. Based on the kernel-id query, the ptr of
// "SW LDS" for the corresponding kernel is obtained from the base table.
81// The Offset into the base "SW LDS" is obtained from
82// corresponding element in offset table. With this information, replacement
83// value is obtained.
84//===----------------------------------------------------------------------===//
85
86#include "AMDGPU.h"
87#include "AMDGPUAsanInstrumentation.h"
88#include "AMDGPUMemoryUtils.h"
89#include "AMDGPUTargetMachine.h"
90#include "llvm/ADT/DenseMap.h"
91#include "llvm/ADT/DenseSet.h"
92#include "llvm/ADT/SetVector.h"
93#include "llvm/ADT/StringExtras.h"
94#include "llvm/ADT/StringRef.h"
95#include "llvm/Analysis/CallGraph.h"
96#include "llvm/Analysis/DomTreeUpdater.h"
97#include "llvm/CodeGen/TargetPassConfig.h"
98#include "llvm/IR/Constants.h"
99#include "llvm/IR/DIBuilder.h"
100#include "llvm/IR/DebugInfo.h"
101#include "llvm/IR/DebugInfoMetadata.h"
102#include "llvm/IR/IRBuilder.h"
103#include "llvm/IR/Instructions.h"
104#include "llvm/IR/IntrinsicsAMDGPU.h"
105#include "llvm/IR/MDBuilder.h"
106#include "llvm/IR/ReplaceConstant.h"
107#include "llvm/Pass.h"
108#include "llvm/Support/raw_ostream.h"
109#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
110#include "llvm/Transforms/Utils/ModuleUtils.h"
111
112#include <algorithm>
113
114#define DEBUG_TYPE "amdgpu-sw-lower-lds"
115#define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15
116
117using namespace llvm;
118using namespace AMDGPU;
119
120namespace {
121
122cl::opt<bool>
123 AsanInstrumentLDS("amdgpu-asan-instrument-lds",
124 cl::desc("Run asan instrumentation on LDS instructions "
125 "lowered to global memory"),
126 cl::init(Val: true), cl::Hidden);
127
128using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
129
130struct LDSAccessTypeInfo {
131 SetVector<GlobalVariable *> StaticLDSGlobals;
132 SetVector<GlobalVariable *> DynamicLDSGlobals;
133};
134
135// Struct to hold all the Metadata required for a kernel
136// to replace a LDS global uses with corresponding offset
137// in to device global memory.
138struct KernelLDSParameters {
139 GlobalVariable *SwLDS = nullptr;
140 GlobalVariable *SwDynLDS = nullptr;
141 GlobalVariable *SwLDSMetadata = nullptr;
142 LDSAccessTypeInfo DirectAccess;
143 LDSAccessTypeInfo IndirectAccess;
144 DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>>
145 LDSToReplacementIndicesMap;
146 uint32_t MallocSize = 0;
147 uint32_t LDSSize = 0;
148 SmallVector<std::pair<uint32_t, uint32_t>, 64> RedzoneOffsetAndSizeVector;
149};
150
151// Struct to store information for creation of offset table
152// for all the non-kernel LDS accesses.
153struct NonKernelLDSParameters {
154 GlobalVariable *LDSBaseTable = nullptr;
155 GlobalVariable *LDSOffsetTable = nullptr;
156 SetVector<Function *> OrderedKernels;
157 SetVector<GlobalVariable *> OrdereLDSGlobals;
158};
159
160struct AsanInstrumentInfo {
161 int Scale = 0;
162 uint32_t Offset = 0;
163 SetVector<Instruction *> Instructions;
164};
165
166struct FunctionsAndLDSAccess {
167 DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
168 SetVector<Function *> KernelsWithIndirectLDSAccess;
169 SetVector<Function *> NonKernelsWithLDSArgument;
170 SetVector<GlobalVariable *> AllNonKernelLDSAccess;
171 FunctionVariableMap NonKernelToLDSAccessMap;
172};
173
174class AMDGPUSwLowerLDS {
175public:
176 AMDGPUSwLowerLDS(Module &Mod, DomTreeCallback Callback)
177 : M(Mod), IRB(M.getContext()), DTCallback(Callback) {}
178 bool run();
179 void getUsesOfLDSByNonKernels();
180 void getNonKernelsWithLDSArguments(const CallGraph &CG);
181 SetVector<Function *>
182 getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &Kernels);
183 SetVector<GlobalVariable *>
184 getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &Variables);
185 void buildSwLDSGlobal(Function *Func);
186 void buildSwDynLDSGlobal(Function *Func);
187 void populateSwMetadataGlobal(Function *Func);
188 void populateSwLDSAttributeAndMetadata(Function *Func);
189 void populateLDSToReplacementIndicesMap(Function *Func);
190 void getLDSMemoryInstructions(Function *Func,
191 SetVector<Instruction *> &LDSInstructions);
192 void replaceKernelLDSAccesses(Function *Func);
193 Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr);
194 void translateLDSMemoryOperationsToGlobalMemory(
195 Function *Func, Value *LoadMallocPtr,
196 SetVector<Instruction *> &LDSInstructions);
197 void poisonRedzones(Function *Func, Value *MallocPtr);
198 void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
199 void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
200 void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
201 Constant *
202 getAddressesOfVariablesInKernel(Function *Func,
203 SetVector<GlobalVariable *> &Variables);
204 void lowerNonKernelLDSAccesses(Function *Func,
205 SetVector<GlobalVariable *> &LDSGlobals,
206 NonKernelLDSParameters &NKLDSParams);
207 void
208 updateMallocSizeForDynamicLDS(Function *Func, Value **CurrMallocSize,
209 Value *HiddenDynLDSSize,
210 SetVector<GlobalVariable *> &DynamicLDSGlobals);
211 void initAsanInfo();
212
213private:
214 Module &M;
215 IRBuilder<> IRB;
216 DomTreeCallback DTCallback;
217 FunctionsAndLDSAccess FuncLDSAccessInfo;
218 AsanInstrumentInfo AsanInfo;
219};
220
221template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
222 // Sort the vector of globals or Functions based on their name.
223 // Returns a SetVector of globals/Functions.
224 sort(V, [](const auto *L, const auto *R) {
225 return L->getName() < R->getName();
226 });
227 return {SetVector<T>(llvm::from_range, V)};
228}
229
230SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
231 SetVector<GlobalVariable *> &Variables) {
232 // Sort all the non-kernel LDS accesses based on their name.
233 return sortByName(
234 V: std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
235}
236
237SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
238 SetVector<Function *> &Kernels) {
239 // Sort the non-kernels accessing LDS based on their name.
240 // Also assign a kernel ID metadata based on the sorted order.
241 LLVMContext &Ctx = M.getContext();
242 if (Kernels.size() > UINT32_MAX) {
243 report_fatal_error(reason: "Unimplemented SW LDS lowering for > 2**32 kernels");
244 }
245 SetVector<Function *> OrderedKernels =
246 sortByName(V: std::vector<Function *>(Kernels.begin(), Kernels.end()));
247 for (size_t i = 0; i < Kernels.size(); i++) {
248 Metadata *AttrMDArgs[1] = {
249 ConstantAsMetadata::get(C: IRB.getInt32(C: i)),
250 };
251 Function *Func = OrderedKernels[i];
252 Func->setMetadata(Kind: "llvm.amdgcn.lds.kernel.id",
253 Node: MDNode::get(Context&: Ctx, MDs: AttrMDArgs));
254 }
255 return OrderedKernels;
256}
257
258void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) {
259 // Among the kernels accessing LDS, get list of
260 // Non-kernels to which a call is made and a ptr
261 // to addrspace(3) is passed as argument.
262 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
263 Function *Func = K.first;
264 const CallGraphNode *CGN = CG[Func];
265 if (!CGN)
266 continue;
267 for (auto &I : *CGN) {
268 CallGraphNode *CallerCGN = I.second;
269 Function *CalledFunc = CallerCGN->getFunction();
270 if (!CalledFunc || CalledFunc->isDeclaration())
271 continue;
272 if (AMDGPU::isKernel(F: *CalledFunc))
273 continue;
274 for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end();
275 AI != E; ++AI) {
276 Type *ArgTy = (*AI).getType();
277 if (!ArgTy->isPointerTy())
278 continue;
279 if (ArgTy->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
280 continue;
281 FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(X: CalledFunc);
282 // Also add the Calling function to KernelsWithIndirectLDSAccess list
283 // so that base table of LDS is generated.
284 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(X: Func);
285 }
286 }
287 }
288}
289
290void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
291 for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) {
292 if (!AMDGPU::isLDSVariableToLower(GV: *GV))
293 continue;
294
295 for (User *V : GV->users()) {
296 if (auto *I = dyn_cast<Instruction>(Val: V)) {
297 Function *F = I->getFunction();
298 if (!isKernel(F: *F) && !F->isDeclaration())
299 FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(V: GV);
300 }
301 }
302 }
303}
304
305static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
306 uint32_t Address) {
307 // Write the specified address into metadata where it can be retrieved by
308 // the assembler. Format is a half open range, [Address Address+1)
309 LLVMContext &Ctx = M.getContext();
310 auto *IntTy = M.getDataLayout().getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
311 MDBuilder MDB(Ctx);
312 MDNode *MetadataNode = MDB.createRange(Lo: ConstantInt::get(Ty: IntTy, V: Address),
313 Hi: ConstantInt::get(Ty: IntTy, V: Address + 1));
314 GV->setMetadata(KindID: LLVMContext::MD_absolute_symbol, Node: MetadataNode);
315}
316
317static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
318 bool IsDynLDS) {
319 if (Offset != 0) {
320 std::string Buffer;
321 raw_string_ostream SS{Buffer};
322 SS << Offset;
323 if (IsDynLDS)
324 SS << "," << Offset;
325 Func->addFnAttr(Kind: "amdgpu-lds-size", Val: Buffer);
326 }
327}
328
329static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
330 BasicBlock *Entry = &Func->getEntryBlock();
331 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
332
333 Function *Decl = Intrinsic::getOrInsertDeclaration(M: Func->getParent(),
334 id: Intrinsic::donothing, OverloadTys: {});
335
336 Value *UseInstance[1] = {
337 Builder.CreateConstInBoundsGEP1_32(Ty: SGV->getValueType(), Ptr: SGV, Idx0: 0)};
338
339 Builder.CreateCall(Callee: Decl, Args: {},
340 OpBundles: {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
341}
342
343void AMDGPUSwLowerLDS::buildSwLDSGlobal(Function *Func) {
344 // Create new LDS global required for each kernel to store
345 // device global memory pointer.
346 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
347 // Create new global pointer variable
348 LDSParams.SwLDS = new GlobalVariable(
349 M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
350 PoisonValue::get(T: IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
351 nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
352 GlobalValue::SanitizerMetadata MD;
353 MD.NoAddress = true;
354 LDSParams.SwLDS->setSanitizerMetadata(MD);
355}
356
357void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(Function *Func) {
358 // Create new Dyn LDS global if kernel accesses dyn LDS.
359 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
360 if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
361 LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
362 return;
363 // Create new global pointer variable
364 auto *emptyCharArray = ArrayType::get(ElementType: IRB.getInt8Ty(), NumElements: 0);
365 LDSParams.SwDynLDS = new GlobalVariable(
366 M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
367 "llvm.amdgcn." + Func->getName() + ".dynlds", nullptr,
368 GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
369 markUsedByKernel(Func, SGV: LDSParams.SwDynLDS);
370 GlobalValue::SanitizerMetadata MD;
371 MD.NoAddress = true;
372 LDSParams.SwDynLDS->setSanitizerMetadata(MD);
373}
374
375void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
376 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
377 bool IsDynLDSUsed = LDSParams.SwDynLDS;
378 uint32_t Offset = LDSParams.LDSSize;
379 recordLDSAbsoluteAddress(M, GV: LDSParams.SwLDS, Address: 0);
380 addLDSSizeAttribute(Func, Offset, IsDynLDS: IsDynLDSUsed);
381 if (LDSParams.SwDynLDS)
382 recordLDSAbsoluteAddress(M, GV: LDSParams.SwDynLDS, Address: Offset);
383}
384
385void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
386 // Create new metadata global for every kernel and initialize the
387 // start offsets and sizes corresponding to each LDS accesses.
388 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
389 auto &Ctx = M.getContext();
390 auto &DL = M.getDataLayout();
391 std::vector<Type *> Items;
392 Type *Int32Ty = IRB.getInt32Ty();
393 std::vector<Constant *> Initializers;
394 Align MaxAlignment(1);
395 auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
396 Align GVAlign = AMDGPU::getAlign(DL, GV);
397 MaxAlignment = std::max(a: MaxAlignment, b: GVAlign);
398 };
399
400 for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
401 UpdateMaxAlignment(GV);
402
403 for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
404 UpdateMaxAlignment(GV);
405
406 for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
407 UpdateMaxAlignment(GV);
408
409 for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
410 UpdateMaxAlignment(GV);
411
412 //{StartOffset, AlignedSizeInBytes}
413 SmallString<128> MDItemStr;
414 raw_svector_ostream MDItemOS(MDItemStr);
415 MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.item";
416
417 StructType *LDSItemTy =
418 StructType::create(Context&: Ctx, Elements: {Int32Ty, Int32Ty, Int32Ty}, Name: MDItemOS.str());
419 uint32_t &MallocSize = LDSParams.MallocSize;
420 SetVector<GlobalVariable *> UniqueLDSGlobals;
421 int AsanScale = AsanInfo.Scale;
422 auto buildInitializerForSwLDSMD =
423 [&](SetVector<GlobalVariable *> &LDSGlobals) {
424 for (auto &GV : LDSGlobals) {
425 if (is_contained(Range&: UniqueLDSGlobals, Element: GV))
426 continue;
427 UniqueLDSGlobals.insert(X: GV);
428
429 Type *Ty = GV->getValueType();
430 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
431 Items.push_back(x: LDSItemTy);
432 Constant *ItemStartOffset = ConstantInt::get(Ty: Int32Ty, V: MallocSize);
433 Constant *SizeInBytesConst = ConstantInt::get(Ty: Int32Ty, V: SizeInBytes);
434 // Get redzone size corresponding a size.
435 const uint64_t RightRedzoneSize =
436 AMDGPU::getRedzoneSizeForGlobal(Scale: AsanScale, SizeInBytes);
437 // Update MallocSize with current size and redzone size.
438 MallocSize += SizeInBytes;
439 if (!AMDGPU::isDynamicLDS(GV: *GV))
440 LDSParams.RedzoneOffsetAndSizeVector.emplace_back(Args&: MallocSize,
441 Args: RightRedzoneSize);
442 MallocSize += RightRedzoneSize;
443 // Align current size plus redzone.
444 uint64_t AlignedSize =
445 alignTo(Size: SizeInBytes + RightRedzoneSize, A: MaxAlignment);
446 Constant *AlignedSizeInBytesConst =
447 ConstantInt::get(Ty: Int32Ty, V: AlignedSize);
448 // Align MallocSize
449 MallocSize = alignTo(Size: MallocSize, A: MaxAlignment);
450 Constant *InitItem =
451 ConstantStruct::get(T: LDSItemTy, V: {ItemStartOffset, SizeInBytesConst,
452 AlignedSizeInBytesConst});
453 Initializers.push_back(x: InitItem);
454 }
455 };
456 SetVector<GlobalVariable *> SwLDSVector;
457 SwLDSVector.insert(X: LDSParams.SwLDS);
458 buildInitializerForSwLDSMD(SwLDSVector);
459 buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
460 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
461 buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
462 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);
463
464 // Update the LDS size used by the kernel.
465 Type *Ty = LDSParams.SwLDS->getValueType();
466 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
467 uint64_t AlignedSize = alignTo(Size: SizeInBytes, A: MaxAlignment);
468 LDSParams.LDSSize = AlignedSize;
469 SmallString<128> MDTypeStr;
470 raw_svector_ostream MDTypeOS(MDTypeStr);
471 MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.type";
472 StructType *MetadataStructType =
473 StructType::create(Context&: Ctx, Elements: Items, Name: MDTypeOS.str());
474 SmallString<128> MDStr;
475 raw_svector_ostream MDOS(MDStr);
476 MDOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md";
477 LDSParams.SwLDSMetadata = new GlobalVariable(
478 M, MetadataStructType, false, GlobalValue::InternalLinkage,
479 PoisonValue::get(T: MetadataStructType), MDOS.str(), nullptr,
480 GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false);
481 Constant *data = ConstantStruct::get(T: MetadataStructType, V: Initializers);
482 LDSParams.SwLDSMetadata->setInitializer(data);
483 assert(LDSParams.SwLDS);
484 // Set the alignment to MaxAlignment for SwLDS.
485 LDSParams.SwLDS->setAlignment(MaxAlignment);
486 if (LDSParams.SwDynLDS)
487 LDSParams.SwDynLDS->setAlignment(MaxAlignment);
488 GlobalValue::SanitizerMetadata MD;
489 MD.NoAddress = true;
490 LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
491}
492
493void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) {
494 // Fill the corresponding LDS replacement indices for each LDS access
495 // related to this kernel.
496 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
497 SetVector<GlobalVariable *> UniqueLDSGlobals;
498 auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
499 uint32_t &Idx) {
500 for (auto &GV : LDSGlobals) {
501 if (is_contained(Range&: UniqueLDSGlobals, Element: GV))
502 continue;
503 UniqueLDSGlobals.insert(X: GV);
504 LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
505 ++Idx;
506 }
507 };
508 uint32_t Idx = 0;
509 SetVector<GlobalVariable *> SwLDSVector;
510 SwLDSVector.insert(X: LDSParams.SwLDS);
511 PopulateIndices(SwLDSVector, Idx);
512 PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx);
513 PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx);
514 PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx);
515 PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx);
516}
517
518static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
519 Value *Replacement) {
520 // Replace all uses of LDS global in this Function with a Replacement.
521 auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
522 auto *V = U.getUser();
523 if (auto *Inst = dyn_cast<Instruction>(Val: V)) {
524 auto *Func1 = Inst->getFunction();
525 if (Func == Func1)
526 return true;
527 }
528 return false;
529 };
530 GV->replaceUsesWithIf(New: Replacement, ShouldReplace: ReplaceUsesLambda);
531}
532
533void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
534 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
535 GlobalVariable *SwLDS = LDSParams.SwLDS;
536 assert(SwLDS);
537 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
538 assert(SwLDSMetadata);
539 StructType *SwLDSMetadataStructType =
540 cast<StructType>(Val: SwLDSMetadata->getValueType());
541 Type *Int32Ty = IRB.getInt32Ty();
542 auto &IndirectAccess = LDSParams.IndirectAccess;
543 auto &DirectAccess = LDSParams.DirectAccess;
544 // Replace all uses of LDS global in this Function with a Replacement.
545 SetVector<GlobalVariable *> UniqueLDSGlobals;
546 auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
547 for (auto &GV : LDSGlobals) {
548 // Do not generate instructions if LDS access is in non-kernel
549 // i.e indirect-access.
550 if ((IndirectAccess.StaticLDSGlobals.contains(key: GV) ||
551 IndirectAccess.DynamicLDSGlobals.contains(key: GV)) &&
552 (!DirectAccess.StaticLDSGlobals.contains(key: GV) &&
553 !DirectAccess.DynamicLDSGlobals.contains(key: GV)))
554 continue;
555 if (is_contained(Range&: UniqueLDSGlobals, Element: GV))
556 continue;
557 UniqueLDSGlobals.insert(X: GV);
558 auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
559 assert(Indices.size() == 3);
560 Constant *GEPIdx[] = {ConstantInt::get(Ty: Int32Ty, V: Indices[0]),
561 ConstantInt::get(Ty: Int32Ty, V: Indices[1]),
562 ConstantInt::get(Ty: Int32Ty, V: Indices[2])};
563 Constant *GEP = ConstantExpr::getGetElementPtr(
564 Ty: SwLDSMetadataStructType, C: SwLDSMetadata, IdxList: GEPIdx, NW: true);
565 Value *Offset = IRB.CreateLoad(Ty: Int32Ty, Ptr: GEP);
566 Value *BasePlusOffset =
567 IRB.CreateInBoundsGEP(Ty: IRB.getInt8Ty(), Ptr: SwLDS, IdxList: {Offset});
568 LLVM_DEBUG(GV->printAsOperand(dbgs() << "Sw LDS Lowering, Replacing LDS ",
569 false));
570 replacesUsesOfGlobalInFunction(Func, GV, Replacement: BasePlusOffset);
571 }
572 };
573 ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
574 ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
575 ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
576 ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
577}
578
579void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS(
580 Function *Func, Value **CurrMallocSize, Value *HiddenDynLDSSize,
581 SetVector<GlobalVariable *> &DynamicLDSGlobals) {
582 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
583 Type *Int32Ty = IRB.getInt32Ty();
584
585 GlobalVariable *SwLDS = LDSParams.SwLDS;
586 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
587 assert(SwLDS && SwLDSMetadata);
588 StructType *MetadataStructType =
589 cast<StructType>(Val: SwLDSMetadata->getValueType());
590 unsigned MaxAlignment = SwLDS->getAlignment();
591 Value *MaxAlignValue = IRB.getInt32(C: MaxAlignment);
592 Value *MaxAlignValueMinusOne = IRB.getInt32(C: MaxAlignment - 1);
593
594 for (GlobalVariable *DynGV : DynamicLDSGlobals) {
595 auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
596 // Update the Offset metadata.
597 Constant *Index0 = ConstantInt::get(Ty: Int32Ty, V: 0);
598 Constant *Index1 = ConstantInt::get(Ty: Int32Ty, V: Indices[1]);
599
600 Constant *Index2Offset = ConstantInt::get(Ty: Int32Ty, V: 0);
601 auto *GEPForOffset = IRB.CreateInBoundsGEP(
602 Ty: MetadataStructType, Ptr: SwLDSMetadata, IdxList: {Index0, Index1, Index2Offset});
603
604 IRB.CreateStore(Val: *CurrMallocSize, Ptr: GEPForOffset);
605 // Update the size and Aligned Size metadata.
606 Constant *Index2Size = ConstantInt::get(Ty: Int32Ty, V: 1);
607 auto *GEPForSize = IRB.CreateInBoundsGEP(Ty: MetadataStructType, Ptr: SwLDSMetadata,
608 IdxList: {Index0, Index1, Index2Size});
609
610 Value *CurrDynLDSSize = IRB.CreateLoad(Ty: Int32Ty, Ptr: HiddenDynLDSSize);
611 IRB.CreateStore(Val: CurrDynLDSSize, Ptr: GEPForSize);
612 Constant *Index2AlignedSize = ConstantInt::get(Ty: Int32Ty, V: 2);
613 auto *GEPForAlignedSize = IRB.CreateInBoundsGEP(
614 Ty: MetadataStructType, Ptr: SwLDSMetadata, IdxList: {Index0, Index1, Index2AlignedSize});
615
616 Value *AlignedDynLDSSize =
617 IRB.CreateAdd(LHS: CurrDynLDSSize, RHS: MaxAlignValueMinusOne);
618 AlignedDynLDSSize = IRB.CreateUDiv(LHS: AlignedDynLDSSize, RHS: MaxAlignValue);
619 AlignedDynLDSSize = IRB.CreateMul(LHS: AlignedDynLDSSize, RHS: MaxAlignValue);
620 IRB.CreateStore(Val: AlignedDynLDSSize, Ptr: GEPForAlignedSize);
621
622 // Update the Current Malloc Size
623 *CurrMallocSize = IRB.CreateAdd(LHS: *CurrMallocSize, RHS: AlignedDynLDSSize);
624 }
625}
626
627static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore,
628 DISubprogram *SP) {
629 assert(InsertBefore);
630 if (InsertBefore->getDebugLoc())
631 return InsertBefore->getDebugLoc();
632 if (SP)
633 return DILocation::get(Context&: SP->getContext(), Line: SP->getLine(), Column: 1, Scope: SP);
634 return DebugLoc();
635}
636
637void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
638 Function *Func, SetVector<Instruction *> &LDSInstructions) {
639 for (BasicBlock &BB : *Func) {
640 for (Instruction &Inst : BB) {
641 if (LoadInst *LI = dyn_cast<LoadInst>(Val: &Inst)) {
642 if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
643 LDSInstructions.insert(X: &Inst);
644 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: &Inst)) {
645 if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
646 LDSInstructions.insert(X: &Inst);
647 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: &Inst)) {
648 if (RMW->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
649 LDSInstructions.insert(X: &Inst);
650 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Val: &Inst)) {
651 if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
652 LDSInstructions.insert(X: &Inst);
653 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Val: &Inst)) {
654 if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
655 ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS)
656 LDSInstructions.insert(X: &Inst);
657 } else if (AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(Val: &Inst)) {
658 if (MI->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
659 LDSInstructions.insert(X: &Inst);
660 } else if (auto *MTI = dyn_cast<AnyMemTransferInst>(Val: MI)) {
661 if (MTI->getSourceAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
662 LDSInstructions.insert(X: &Inst);
663 }
664 } else
665 continue;
666 }
667 }
668}
669
670Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr,
671 Value *LDSPtr) {
672 assert(LDSPtr && "Invalid LDS pointer operand");
673 Type *LDSPtrType = LDSPtr->getType();
674 LLVMContext &Ctx = M.getContext();
675 const DataLayout &DL = M.getDataLayout();
676 Type *IntTy = DL.getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
677 if (auto *VecPtrTy = dyn_cast<VectorType>(Val: LDSPtrType)) {
678 // Handle vector of pointers
679 ElementCount NumElements = VecPtrTy->getElementCount();
680 IntTy = VectorType::get(ElementType: IntTy, EC: NumElements);
681 }
682 Value *GepIndex = IRB.CreatePtrToInt(V: LDSPtr, DestTy: IntTy);
683 return IRB.CreateInBoundsGEP(Ty: IRB.getInt8Ty(), Ptr: LoadMallocPtr, IdxList: {GepIndex});
684}
685
686void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
687 Function *Func, Value *LoadMallocPtr,
688 SetVector<Instruction *> &LDSInstructions) {
689 LLVM_DEBUG(dbgs() << "Translating LDS memory operations to global memory : "
690 << Func->getName());
691 for (Instruction *Inst : LDSInstructions) {
692 IRB.SetInsertPoint(Inst);
693 if (LoadInst *LI = dyn_cast<LoadInst>(Val: Inst)) {
694 Value *LIOperand = LI->getPointerOperand();
695 Value *Replacement =
696 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: LIOperand);
697 LoadInst *NewLI = IRB.CreateAlignedLoad(Ty: LI->getType(), Ptr: Replacement,
698 Align: LI->getAlign(), isVolatile: LI->isVolatile());
699 NewLI->setAtomic(Ordering: LI->getOrdering(), SSID: LI->getSyncScopeID());
700 AsanInfo.Instructions.insert(X: NewLI);
701 LI->replaceAllUsesWith(V: NewLI);
702 LI->eraseFromParent();
703 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: Inst)) {
704 Value *SIOperand = SI->getPointerOperand();
705 Value *Replacement =
706 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: SIOperand);
707 StoreInst *NewSI = IRB.CreateAlignedStore(
708 Val: SI->getValueOperand(), Ptr: Replacement, Align: SI->getAlign(), isVolatile: SI->isVolatile());
709 NewSI->setAtomic(Ordering: SI->getOrdering(), SSID: SI->getSyncScopeID());
710 AsanInfo.Instructions.insert(X: NewSI);
711 SI->replaceAllUsesWith(V: NewSI);
712 SI->eraseFromParent();
713 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: Inst)) {
714 Value *RMWPtrOperand = RMW->getPointerOperand();
715 Value *RMWValOperand = RMW->getValOperand();
716 Value *Replacement =
717 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: RMWPtrOperand);
718 AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW(
719 Op: RMW->getOperation(), Ptr: Replacement, Val: RMWValOperand, Align: RMW->getAlign(),
720 Ordering: RMW->getOrdering(), SSID: RMW->getSyncScopeID());
721 NewRMW->setVolatile(RMW->isVolatile());
722 AsanInfo.Instructions.insert(X: NewRMW);
723 RMW->replaceAllUsesWith(V: NewRMW);
724 RMW->eraseFromParent();
725 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Val: Inst)) {
726 Value *XCHGPtrOperand = XCHG->getPointerOperand();
727 Value *Replacement =
728 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: XCHGPtrOperand);
729 AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg(
730 Ptr: Replacement, Cmp: XCHG->getCompareOperand(), New: XCHG->getNewValOperand(),
731 Align: XCHG->getAlign(), SuccessOrdering: XCHG->getSuccessOrdering(),
732 FailureOrdering: XCHG->getFailureOrdering(), SSID: XCHG->getSyncScopeID());
733 NewXCHG->setVolatile(XCHG->isVolatile());
734 AsanInfo.Instructions.insert(X: NewXCHG);
735 XCHG->replaceAllUsesWith(V: NewXCHG);
736 XCHG->eraseFromParent();
737 } else if (AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(Val: Inst)) {
738 Value *NewDest = MI->getRawDest();
739 if (MI->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
740 NewDest = getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: NewDest);
741 CallInst *NewMI = nullptr;
742 if (AnyMemSetInst *MSI = dyn_cast<AnyMemSetInst>(Val: MI)) {
743 if (MI->isAtomic()) {
744 NewMI = IRB.CreateElementUnorderedAtomicMemSet(
745 Ptr: NewDest, Val: MSI->getValue(), Size: MSI->getLength(),
746 Alignment: MSI->getDestAlign().valueOrOne(), ElementSize: MSI->getElementSizeInBytes());
747 } else {
748 NewMI = IRB.CreateMemSet(Ptr: NewDest, Val: MSI->getValue(), Size: MSI->getLength(),
749 Align: MSI->getDestAlign(),
750 isVolatile: cast<MemSetInst>(Val: MI)->isVolatile());
751 }
752 } else if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(Val: MI)) {
753 Value *NewSrc = MTI->getRawSource();
754 if (MTI->getSourceAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
755 NewSrc = getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: NewSrc);
756 if (MI->isAtomic()) {
757 if (MI->getIntrinsicID() ==
758 Intrinsic::memmove_element_unordered_atomic) {
759 NewMI = IRB.CreateElementUnorderedAtomicMemMove(
760 Dst: NewDest, DstAlign: MTI->getDestAlign().valueOrOne(), Src: NewSrc,
761 SrcAlign: MTI->getSourceAlign().valueOrOne(), Size: MTI->getLength(),
762 ElementSize: MTI->getElementSizeInBytes());
763 } else {
764 NewMI = IRB.CreateElementUnorderedAtomicMemCpy(
765 Dst: NewDest, DstAlign: MTI->getDestAlign().valueOrOne(), Src: NewSrc,
766 SrcAlign: MTI->getSourceAlign().valueOrOne(), Size: MTI->getLength(),
767 ElementSize: MTI->getElementSizeInBytes());
768 }
769 } else {
770 NewMI = IRB.CreateMemTransferInst(
771 IntrID: MI->getIntrinsicID(), Dst: NewDest, DstAlign: MTI->getDestAlign(), Src: NewSrc,
772 SrcAlign: MTI->getSourceAlign(), Size: MTI->getLength(),
773 isVolatile: cast<MemTransferInst>(Val: MI)->isVolatile());
774 }
775 } else
776 reportFatalUsageError(reason: "Unimplemented LDS lowering memory intrinsic");
777 AsanInfo.Instructions.insert(X: NewMI);
778 MI->replaceAllUsesWith(V: NewMI);
779 MI->eraseFromParent();
780 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Val: Inst)) {
781 Value *AIOperand = ASC->getPointerOperand();
782 Value *Replacement =
783 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LDSPtr: AIOperand);
784 Value *NewAI = IRB.CreateAddrSpaceCast(V: Replacement, DestTy: ASC->getType());
785 // Note: No need to add the instruction to AsanInfo instructions to be
786 // instrumented list. FLAT_ADDRESS ptr would have been already
787 // instrumented by asan pass prior to this pass.
788 ASC->replaceAllUsesWith(V: NewAI);
789 ASC->eraseFromParent();
790 } else
791 report_fatal_error(reason: "Unimplemented LDS lowering instruction");
792 }
793}
794
795void AMDGPUSwLowerLDS::poisonRedzones(Function *Func, Value *MallocPtr) {
796 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
797 Type *Int64Ty = IRB.getInt64Ty();
798 Type *VoidTy = IRB.getVoidTy();
799 FunctionCallee AsanPoisonRegion = M.getOrInsertFunction(
800 Name: "__asan_poison_region",
801 T: FunctionType::get(Result: VoidTy, Params: {Int64Ty, Int64Ty}, isVarArg: false));
802
803 auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector;
804 size_t VecSize = RedzonesVec.size();
805 for (unsigned i = 0; i < VecSize; i++) {
806 auto &RedzonePair = RedzonesVec[i];
807 uint64_t RedzoneOffset = RedzonePair.first;
808 uint64_t RedzoneSize = RedzonePair.second;
809 Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP(
810 Ty: IRB.getInt8Ty(), Ptr: MallocPtr, IdxList: {IRB.getInt64(C: RedzoneOffset)});
811 Value *RedzoneAddress = IRB.CreatePtrToInt(V: RedzoneAddrOffset, DestTy: Int64Ty);
812 IRB.CreateCall(Callee: AsanPoisonRegion,
813 Args: {RedzoneAddress, IRB.getInt64(C: RedzoneSize)});
814 }
815}
816
817void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
818 DomTreeUpdater &DTU) {
819 LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for : " << Func->getName());
820 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
821 auto &Ctx = M.getContext();
822 auto *PrevEntryBlock = &Func->getEntryBlock();
823 SetVector<Instruction *> LDSInstructions;
824 getLDSMemoryInstructions(Func, LDSInstructions);
825 const DataLayout &DL = M.getDataLayout();
826
827 // Create malloc block.
828 auto *MallocBlock = BasicBlock::Create(Context&: Ctx, Name: "Malloc", Parent: Func, InsertBefore: PrevEntryBlock);
829
830 // Create WIdBlock block which has instructions related to selection of
831 // {0,0,0} indiex work item in the work group.
832 auto *WIdBlock = BasicBlock::Create(Context&: Ctx, Name: "WId", Parent: Func, InsertBefore: MallocBlock);
833
834 // Move constant-size allocas from the original entry block to the new entry
835 // block (WIdBlock) so they remain static allocas. Splice the leading cluster
836 // in bulk, then move any stragglers that are interleaved with other
837 // instructions.
838 auto SplitIt = PrevEntryBlock->getFirstNonPHIOrDbgOrAlloca();
839 WIdBlock->splice(ToIt: WIdBlock->end(), FromBB: PrevEntryBlock, FromBeginIt: PrevEntryBlock->begin(),
840 FromEndIt: SplitIt);
841 for (Instruction &I : make_early_inc_range(Range&: *PrevEntryBlock))
842 if (auto *AI = dyn_cast<AllocaInst>(Val: &I))
843 if (isa<ConstantInt>(Val: AI->getArraySize()))
844 AI->moveBefore(BB&: *WIdBlock, I: WIdBlock->end());
845
846 IRB.SetInsertPoint(TheBB: WIdBlock, IP: WIdBlock->end());
847 DebugLoc FirstDL =
848 getOrCreateDebugLoc(InsertBefore: &*PrevEntryBlock->begin(), SP: Func->getSubprogram());
849 IRB.SetCurrentDebugLocation(FirstDL);
850 Value *WIdx = IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_workitem_id_x, Args: {});
851 Value *WIdy = IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_workitem_id_y, Args: {});
852 Value *WIdz = IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_workitem_id_z, Args: {});
853 Value *XYOr = IRB.CreateOr(LHS: WIdx, RHS: WIdy);
854 Value *XYZOr = IRB.CreateOr(LHS: XYOr, RHS: WIdz);
855 Value *WIdzCond = IRB.CreateICmpEQ(LHS: XYZOr, RHS: IRB.getInt32(C: 0));
856
857 // All work items will branch to PrevEntryBlock except {0,0,0} index
858 // work item which will branch to malloc block.
859 IRB.CreateCondBr(Cond: WIdzCond, True: MallocBlock, False: PrevEntryBlock);
860
861 // Malloc block
862 IRB.SetInsertPoint(TheBB: MallocBlock, IP: MallocBlock->begin());
863
864 // If Dynamic LDS globals are accessed by the kernel,
865 // Get the size of dyn lds from hidden dyn_lds_size kernel arg.
866 // Update the corresponding metadata global entries for this dyn lds global.
867 GlobalVariable *SwLDS = LDSParams.SwLDS;
868 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
869 assert(SwLDS && SwLDSMetadata);
870 StructType *MetadataStructType =
871 cast<StructType>(Val: SwLDSMetadata->getValueType());
872 uint32_t MallocSize = 0;
873 Value *CurrMallocSize;
874 Type *Int32Ty = IRB.getInt32Ty();
875 Type *Int64Ty = IRB.getInt64Ty();
876
877 SetVector<GlobalVariable *> UniqueLDSGlobals;
878 auto GetUniqueLDSGlobals = [&](SetVector<GlobalVariable *> &LDSGlobals) {
879 for (auto &GV : LDSGlobals) {
880 if (is_contained(Range&: UniqueLDSGlobals, Element: GV))
881 continue;
882 UniqueLDSGlobals.insert(X: GV);
883 }
884 };
885
886 GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals);
887 GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals);
888 unsigned NumStaticLDS = 1 + UniqueLDSGlobals.size();
889 UniqueLDSGlobals.clear();
890
891 if (NumStaticLDS) {
892 auto *GEPForEndStaticLDSOffset =
893 IRB.CreateInBoundsGEP(Ty: MetadataStructType, Ptr: SwLDSMetadata,
894 IdxList: {ConstantInt::get(Ty: Int32Ty, V: 0),
895 ConstantInt::get(Ty: Int32Ty, V: NumStaticLDS - 1),
896 ConstantInt::get(Ty: Int32Ty, V: 0)});
897
898 auto *GEPForEndStaticLDSSize =
899 IRB.CreateInBoundsGEP(Ty: MetadataStructType, Ptr: SwLDSMetadata,
900 IdxList: {ConstantInt::get(Ty: Int32Ty, V: 0),
901 ConstantInt::get(Ty: Int32Ty, V: NumStaticLDS - 1),
902 ConstantInt::get(Ty: Int32Ty, V: 2)});
903
904 Value *EndStaticLDSOffset =
905 IRB.CreateLoad(Ty: Int32Ty, Ptr: GEPForEndStaticLDSOffset);
906 Value *EndStaticLDSSize = IRB.CreateLoad(Ty: Int32Ty, Ptr: GEPForEndStaticLDSSize);
907 CurrMallocSize = IRB.CreateAdd(LHS: EndStaticLDSOffset, RHS: EndStaticLDSSize);
908 } else
909 CurrMallocSize = IRB.getInt32(C: MallocSize);
910
911 if (LDSParams.SwDynLDS) {
912 if (!(AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5))
913 report_fatal_error(
914 reason: "Dynamic LDS size query is only supported for CO V5 and later.");
915 // Get size from hidden dyn_lds_size argument of kernel
916 Value *ImplicitArg =
917 IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_implicitarg_ptr, Args: {});
918 Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
919 Ty: ImplicitArg->getType(), Ptr: ImplicitArg,
920 IdxList: {ConstantInt::get(Ty: Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)});
921 UniqueLDSGlobals.clear();
922 GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals);
923 GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals);
924 updateMallocSizeForDynamicLDS(Func, CurrMallocSize: &CurrMallocSize, HiddenDynLDSSize,
925 DynamicLDSGlobals&: UniqueLDSGlobals);
926 }
927
928 CurrMallocSize = IRB.CreateZExt(V: CurrMallocSize, DestTy: Int64Ty);
929
930 // Create a call to malloc function which does device global memory allocation
931 // with size equals to all LDS global accesses size in this kernel.
932 Value *ReturnAddress = IRB.CreateIntrinsic(
933 ID: Intrinsic::returnaddress, OverloadTypes: IRB.getPtrTy(AddrSpace: DL.getProgramAddressSpace()),
934 Args: {IRB.getInt32(C: 0)});
935 FunctionCallee MallocFunc = M.getOrInsertFunction(
936 Name: StringRef("__asan_malloc_impl"),
937 T: FunctionType::get(Result: Int64Ty, Params: {Int64Ty, Int64Ty}, isVarArg: false));
938 Value *RAPtrToInt = IRB.CreatePtrToInt(V: ReturnAddress, DestTy: Int64Ty);
939 Value *MallocCall = IRB.CreateCall(Callee: MallocFunc, Args: {CurrMallocSize, RAPtrToInt});
940
941 Value *MallocPtr =
942 IRB.CreateIntToPtr(V: MallocCall, DestTy: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS));
943
944 // Create store of malloc to new global
945 IRB.CreateStore(Val: MallocPtr, Ptr: SwLDS);
946
947 // Create calls to __asan_poison_region to poison redzones.
948 poisonRedzones(Func, MallocPtr);
949
950 // Create branch to PrevEntryBlock
951 IRB.CreateBr(Dest: PrevEntryBlock);
952
953 // Create wave-group barrier at the starting of Previous entry block
954 Type *Int1Ty = IRB.getInt1Ty();
955 IRB.SetInsertPoint(TheBB: PrevEntryBlock, IP: PrevEntryBlock->begin());
956 auto *XYZCondPhi = IRB.CreatePHI(Ty: Int1Ty, NumReservedValues: 2, Name: "xyzCond");
957 XYZCondPhi->addIncoming(V: IRB.getInt1(V: 0), BB: WIdBlock);
958 XYZCondPhi->addIncoming(V: IRB.getInt1(V: 1), BB: MallocBlock);
959
960 IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_s_barrier, Args: {});
961
962 // Load malloc pointer from Sw LDS.
963 Value *LoadMallocPtr =
964 IRB.CreateLoad(Ty: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), Ptr: SwLDS);
965
966 // Replace All uses of LDS globals with new LDS pointers.
967 replaceKernelLDSAccesses(Func);
968
969 // Replace Memory Operations on LDS with corresponding
970 // global memory pointers.
971 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
972 LDSInstructions);
973
974 auto *CondFreeBlock = BasicBlock::Create(Context&: Ctx, Name: "CondFree", Parent: Func);
975 auto *FreeBlock = BasicBlock::Create(Context&: Ctx, Name: "Free", Parent: Func);
976 auto *EndBlock = BasicBlock::Create(Context&: Ctx, Name: "End", Parent: Func);
977 for (BasicBlock &BB : *Func) {
978 if (!BB.empty()) {
979 if (ReturnInst *RI = dyn_cast<ReturnInst>(Val: &BB.back())) {
980 RI->eraseFromParent();
981 IRB.SetInsertPoint(TheBB: &BB, IP: BB.end());
982 IRB.CreateBr(Dest: CondFreeBlock);
983 }
984 }
985 }
986
987 // Cond Free Block
988 IRB.SetInsertPoint(TheBB: CondFreeBlock, IP: CondFreeBlock->begin());
989 IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_s_barrier, Args: {});
990 IRB.CreateCondBr(Cond: XYZCondPhi, True: FreeBlock, False: EndBlock);
991
992 // Free Block
993 IRB.SetInsertPoint(TheBB: FreeBlock, IP: FreeBlock->begin());
994
995 // Free the previously allocate device global memory.
996 FunctionCallee AsanFreeFunc = M.getOrInsertFunction(
997 Name: StringRef("__asan_free_impl"),
998 T: FunctionType::get(Result: IRB.getVoidTy(), Params: {Int64Ty, Int64Ty}, isVarArg: false));
999 Value *ReturnAddr = IRB.CreateIntrinsic(
1000 ID: Intrinsic::returnaddress, OverloadTypes: IRB.getPtrTy(AddrSpace: DL.getProgramAddressSpace()),
1001 Args: IRB.getInt32(C: 0));
1002 Value *RAPToInt = IRB.CreatePtrToInt(V: ReturnAddr, DestTy: Int64Ty);
1003 Value *MallocPtrToInt = IRB.CreatePtrToInt(V: LoadMallocPtr, DestTy: Int64Ty);
1004 IRB.CreateCall(Callee: AsanFreeFunc, Args: {MallocPtrToInt, RAPToInt});
1005
1006 IRB.CreateBr(Dest: EndBlock);
1007
1008 // End Block
1009 IRB.SetInsertPoint(TheBB: EndBlock, IP: EndBlock->begin());
1010 IRB.CreateRetVoid();
1011 // Update the DomTree with corresponding links to basic blocks.
1012 DTU.applyUpdates(Updates: {{DominatorTree::Insert, WIdBlock, MallocBlock},
1013 {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
1014 {DominatorTree::Insert, CondFreeBlock, FreeBlock},
1015 {DominatorTree::Insert, FreeBlock, EndBlock}});
1016}
1017
1018Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
1019 Function *Func, SetVector<GlobalVariable *> &Variables) {
1020 Type *Int32Ty = IRB.getInt32Ty();
1021 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1022
1023 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
1024 assert(SwLDSMetadata);
1025 auto *SwLDSMetadataStructType =
1026 cast<StructType>(Val: SwLDSMetadata->getValueType());
1027 ArrayType *KernelOffsetsType =
1028 ArrayType::get(ElementType: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), NumElements: Variables.size());
1029
1030 SmallVector<Constant *> Elements;
1031 for (auto *GV : Variables) {
1032 auto It = LDSParams.LDSToReplacementIndicesMap.find(Val: GV);
1033 if (It == LDSParams.LDSToReplacementIndicesMap.end()) {
1034 Elements.push_back(
1035 Elt: PoisonValue::get(T: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS)));
1036 continue;
1037 }
1038 auto &Indices = It->second;
1039 Constant *GEPIdx[] = {ConstantInt::get(Ty: Int32Ty, V: Indices[0]),
1040 ConstantInt::get(Ty: Int32Ty, V: Indices[1]),
1041 ConstantInt::get(Ty: Int32Ty, V: Indices[2])};
1042 Constant *GEP = ConstantExpr::getGetElementPtr(Ty: SwLDSMetadataStructType,
1043 C: SwLDSMetadata, IdxList: GEPIdx, NW: true);
1044 Elements.push_back(Elt: GEP);
1045 }
1046 return ConstantArray::get(T: KernelOffsetsType, V: Elements);
1047}
1048
1049void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
1050 NonKernelLDSParameters &NKLDSParams) {
1051 // Base table will have single row, with elements of the row
1052 // placed as per kernel ID. Each element in the row corresponds
1053 // to addresss of "SW LDS" global of the kernel.
1054 auto &Kernels = NKLDSParams.OrderedKernels;
1055 if (Kernels.empty())
1056 return;
1057 const size_t NumberKernels = Kernels.size();
1058 ArrayType *AllKernelsOffsetsType =
1059 ArrayType::get(ElementType: IRB.getPtrTy(AddrSpace: AMDGPUAS::LOCAL_ADDRESS), NumElements: NumberKernels);
1060 std::vector<Constant *> OverallConstantExprElts(NumberKernels);
1061 for (size_t i = 0; i < NumberKernels; i++) {
1062 Function *Func = Kernels[i];
1063 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1064 OverallConstantExprElts[i] = LDSParams.SwLDS;
1065 }
1066 Constant *init =
1067 ConstantArray::get(T: AllKernelsOffsetsType, V: OverallConstantExprElts);
1068 NKLDSParams.LDSBaseTable = new GlobalVariable(
1069 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
1070 "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
1071 AMDGPUAS::GLOBAL_ADDRESS);
1072 GlobalValue::SanitizerMetadata MD;
1073 MD.NoAddress = true;
1074 NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD);
1075}
1076
1077void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
1078 NonKernelLDSParameters &NKLDSParams) {
1079 // Offset table will have multiple rows and columns.
1080 // Rows are assumed to be from 0 to (n-1). n is total number
1081 // of kernels accessing the LDS through non-kernels.
1082 // Each row will have m elements. m is the total number of
1083 // unique LDS globals accessed by non-kernels.
1084 // Each element in the row correspond to the address of
1085 // the replacement of LDS global done by that particular kernel.
1086 auto &Variables = NKLDSParams.OrdereLDSGlobals;
1087 auto &Kernels = NKLDSParams.OrderedKernels;
1088 if (Variables.empty() || Kernels.empty())
1089 return;
1090 const size_t NumberVariables = Variables.size();
1091 const size_t NumberKernels = Kernels.size();
1092
1093 ArrayType *KernelOffsetsType =
1094 ArrayType::get(ElementType: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), NumElements: NumberVariables);
1095
1096 ArrayType *AllKernelsOffsetsType =
1097 ArrayType::get(ElementType: KernelOffsetsType, NumElements: NumberKernels);
1098 std::vector<Constant *> overallConstantExprElts(NumberKernels);
1099 for (size_t i = 0; i < NumberKernels; i++) {
1100 Function *Func = Kernels[i];
1101 overallConstantExprElts[i] =
1102 getAddressesOfVariablesInKernel(Func, Variables);
1103 }
1104 Constant *Init =
1105 ConstantArray::get(T: AllKernelsOffsetsType, V: overallConstantExprElts);
1106 NKLDSParams.LDSOffsetTable = new GlobalVariable(
1107 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
1108 "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
1109 AMDGPUAS::GLOBAL_ADDRESS);
1110 GlobalValue::SanitizerMetadata MD;
1111 MD.NoAddress = true;
1112 NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD);
1113}
1114
1115void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
1116 Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
1117 NonKernelLDSParameters &NKLDSParams) {
1118 // Replace LDS access in non-kernel with replacement queried from
1119 // Base table and offset from offset table.
1120 LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for : "
1121 << Func->getName());
1122 auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
1123 IRB.SetInsertPoint(InsertAt);
1124
1125 // Get LDS memory instructions.
1126 SetVector<Instruction *> LDSInstructions;
1127 getLDSMemoryInstructions(Func, LDSInstructions);
1128
1129 auto *KernelId = IRB.CreateIntrinsic(ID: Intrinsic::amdgcn_lds_kernel_id, Args: {});
1130 GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable;
1131 GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable;
1132 auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
1133 Value *BaseGEP = IRB.CreateInBoundsGEP(
1134 Ty: LDSBaseTable->getValueType(), Ptr: LDSBaseTable, IdxList: {IRB.getInt32(C: 0), KernelId});
1135 Value *BaseLoad =
1136 IRB.CreateLoad(Ty: IRB.getPtrTy(AddrSpace: AMDGPUAS::LOCAL_ADDRESS), Ptr: BaseGEP);
1137 Value *LoadMallocPtr =
1138 IRB.CreateLoad(Ty: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), Ptr: BaseLoad);
1139
1140 for (GlobalVariable *GV : LDSGlobals) {
1141 const auto *GVIt = llvm::find(Range&: OrdereLDSGlobals, Val: GV);
1142 assert(GVIt != OrdereLDSGlobals.end());
1143 uint32_t GVOffset = std::distance(first: OrdereLDSGlobals.begin(), last: GVIt);
1144
1145 Value *OffsetGEP = IRB.CreateInBoundsGEP(
1146 Ty: LDSOffsetTable->getValueType(), Ptr: LDSOffsetTable,
1147 IdxList: {IRB.getInt32(C: 0), KernelId, IRB.getInt32(C: GVOffset)});
1148 Value *OffsetLoad =
1149 IRB.CreateLoad(Ty: IRB.getPtrTy(AddrSpace: AMDGPUAS::GLOBAL_ADDRESS), Ptr: OffsetGEP);
1150 Value *Offset = IRB.CreateLoad(Ty: IRB.getInt32Ty(), Ptr: OffsetLoad);
1151 Value *BasePlusOffset =
1152 IRB.CreateInBoundsGEP(Ty: IRB.getInt8Ty(), Ptr: BaseLoad, IdxList: {Offset});
1153 LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for "
1154 << GV->getName());
1155 replacesUsesOfGlobalInFunction(Func, GV, Replacement: BasePlusOffset);
1156 }
1157 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
1158 LDSInstructions);
1159}
1160
1161static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
1162 // Sort Static, dynamic LDS globals which are either
1163 // direct or indirect access on basis of name.
1164 auto &DirectAccess = LDSParams.DirectAccess;
1165 auto &IndirectAccess = LDSParams.IndirectAccess;
1166 LDSParams.DirectAccess.StaticLDSGlobals = sortByName(
1167 V: std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(),
1168 DirectAccess.StaticLDSGlobals.end()));
1169 LDSParams.DirectAccess.DynamicLDSGlobals = sortByName(
1170 V: std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(),
1171 DirectAccess.DynamicLDSGlobals.end()));
1172 LDSParams.IndirectAccess.StaticLDSGlobals = sortByName(
1173 V: std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(),
1174 IndirectAccess.StaticLDSGlobals.end()));
1175 LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName(
1176 V: std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(),
1177 IndirectAccess.DynamicLDSGlobals.end()));
1178}
1179
1180void AMDGPUSwLowerLDS::initAsanInfo() {
1181 // Get Shadow mapping scale and offset.
1182 unsigned LongSize =
1183 M.getDataLayout().getPointerSizeInBits(AS: AMDGPUAS::GLOBAL_ADDRESS);
1184 uint64_t Offset;
1185 int Scale;
1186 bool OrShadowOffset;
1187 llvm::getAddressSanitizerParams(TargetTriple: M.getTargetTriple(), LongSize, IsKasan: false, ShadowBase: &Offset,
1188 MappingScale: &Scale, OrShadowOffset: &OrShadowOffset);
1189 AsanInfo.Scale = Scale;
1190 AsanInfo.Offset = Offset;
1191}
1192
1193static bool hasFnWithSanitizeAddressAttr(FunctionVariableMap &LDSAccesses) {
1194 for (auto &K : LDSAccesses) {
1195 Function *F = K.first;
1196 if (!F)
1197 continue;
1198 if (F->hasFnAttribute(Kind: Attribute::SanitizeAddress))
1199 return true;
1200 }
1201 return false;
1202}
1203
1204bool AMDGPUSwLowerLDS::run() {
1205 bool Changed = false;
1206
1207 CallGraph CG = CallGraph(M);
1208
1209 Changed |=
1210 eliminateGVConstantExprUsesFromAllInstructions(M, Filter: isLDSVariableToLower);
1211
1212 // Get all the direct and indirect access of LDS for all the kernels.
1213 GVUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDSForLowering(CG, M);
1214
1215 // Flag to decide whether to lower all the LDS accesses
1216 // based on sanitize_address attribute.
1217 bool LowerAllLDS = hasFnWithSanitizeAddressAttr(LDSAccesses&: LDSUsesInfo.DirectAccess) ||
1218 hasFnWithSanitizeAddressAttr(LDSAccesses&: LDSUsesInfo.IndirectAccess);
1219
1220 if (!LowerAllLDS)
1221 return Changed;
1222
1223 // Utility to group LDS access into direct, indirect, static and dynamic.
1224 auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
1225 bool DirectAccess) {
1226 for (auto &K : LDSAccesses) {
1227 Function *F = K.first;
1228 if (!F || K.second.empty())
1229 continue;
1230
1231 assert(isKernel(*F));
1232
1233 // Only inserts if key isn't already in the map.
1234 FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
1235 KV: {F, KernelLDSParameters()});
1236
1237 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[F];
1238 if (!DirectAccess)
1239 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(X: F);
1240 for (GlobalVariable *GV : K.second) {
1241 if (!DirectAccess) {
1242 if (AMDGPU::isDynamicLDS(GV: *GV))
1243 LDSParams.IndirectAccess.DynamicLDSGlobals.insert(X: GV);
1244 else
1245 LDSParams.IndirectAccess.StaticLDSGlobals.insert(X: GV);
1246 FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(X: GV);
1247 } else {
1248 if (AMDGPU::isDynamicLDS(GV: *GV))
1249 LDSParams.DirectAccess.DynamicLDSGlobals.insert(X: GV);
1250 else
1251 LDSParams.DirectAccess.StaticLDSGlobals.insert(X: GV);
1252 }
1253 }
1254 }
1255 };
1256
1257 PopulateKernelStaticDynamicLDS(LDSUsesInfo.DirectAccess, true);
1258 PopulateKernelStaticDynamicLDS(LDSUsesInfo.IndirectAccess, false);
1259
1260 // Get address sanitizer scale.
1261 initAsanInfo();
1262
1263 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
1264 Function *Func = K.first;
1265 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1266 if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
1267 LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
1268 LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
1269 LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
1270 Changed = false;
1271 } else {
1272 removeFnAttrFromReachable(
1273 CG, KernelRoot: Func,
1274 FnAttrs: {"amdgpu-no-workitem-id-x", "amdgpu-no-workitem-id-y",
1275 "amdgpu-no-workitem-id-z", "amdgpu-no-heap-ptr"});
1276 if (!LDSParams.IndirectAccess.StaticLDSGlobals.empty() ||
1277 !LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
1278 removeFnAttrFromReachable(CG, KernelRoot: Func, FnAttrs: {"amdgpu-no-lds-kernel-id"});
1279 reorderStaticDynamicIndirectLDSSet(LDSParams);
1280 buildSwLDSGlobal(Func);
1281 buildSwDynLDSGlobal(Func);
1282 populateSwMetadataGlobal(Func);
1283 populateSwLDSAttributeAndMetadata(Func);
1284 populateLDSToReplacementIndicesMap(Func);
1285 DomTreeUpdater DTU(DTCallback(*Func),
1286 DomTreeUpdater::UpdateStrategy::Lazy);
1287 lowerKernelLDSAccesses(Func, DTU);
1288 Changed = true;
1289 }
1290 }
1291
1292 // Get the Uses of LDS from non-kernels.
1293 getUsesOfLDSByNonKernels();
1294
1295 // Get non-kernels with LDS ptr as argument and called by kernels.
1296 getNonKernelsWithLDSArguments(CG);
1297
1298 // Lower LDS accesses in non-kernels.
1299 if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() ||
1300 !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) {
1301 NonKernelLDSParameters NKLDSParams;
1302 NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
1303 Kernels&: FuncLDSAccessInfo.KernelsWithIndirectLDSAccess);
1304 NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals(
1305 Variables&: FuncLDSAccessInfo.AllNonKernelLDSAccess);
1306 buildNonKernelLDSBaseTable(NKLDSParams);
1307 buildNonKernelLDSOffsetTable(NKLDSParams);
1308 for (auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) {
1309 Function *Func = K.first;
1310 DenseSet<GlobalVariable *> &LDSGlobals = K.second;
1311 SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName(
1312 V: std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end()));
1313 lowerNonKernelLDSAccesses(Func, LDSGlobals&: OrderedLDSGlobals, NKLDSParams);
1314 }
1315 for (Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) {
1316 auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap;
1317 if (K.contains(Val: Func))
1318 continue;
1319 SetVector<llvm::GlobalVariable *> Vec;
1320 lowerNonKernelLDSAccesses(Func, LDSGlobals&: Vec, NKLDSParams);
1321 }
1322 Changed = true;
1323 }
1324
1325 if (!Changed)
1326 return Changed;
1327
1328 for (auto &GV : make_early_inc_range(Range: M.globals())) {
1329 if (AMDGPU::isLDSVariableToLower(GV)) {
1330 // probably want to remove from used lists
1331 GV.removeDeadConstantUsers();
1332 if (GV.use_empty())
1333 GV.eraseFromParent();
1334 }
1335 }
1336
1337 if (AsanInstrumentLDS) {
1338 SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
1339 for (Instruction *Inst : AsanInfo.Instructions) {
1340 SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
1341 getInterestingMemoryOperands(M, I: Inst, Interesting&: InterestingOperands);
1342 llvm::append_range(C&: OperandsToInstrument, R&: InterestingOperands);
1343 }
1344 for (auto &Operand : OperandsToInstrument) {
1345 Value *Addr = Operand.getPtr();
1346 instrumentAddress(M, IRB, OrigIns: Operand.getInsn(), InsertBefore: Operand.getInsn(), Addr,
1347 Alignment: Operand.Alignment.valueOrOne(), TypeStoreSize: Operand.TypeStoreSize,
1348 IsWrite: Operand.IsWrite, SizeArgument: nullptr, UseCalls: false, Recover: false, Scale: AsanInfo.Scale,
1349 Offset: AsanInfo.Offset);
1350 Changed = true;
1351 }
1352 }
1353
1354 return Changed;
1355}
1356
1357class AMDGPUSwLowerLDSLegacy : public ModulePass {
1358public:
1359 static char ID;
1360 AMDGPUSwLowerLDSLegacy() : ModulePass(ID) {}
1361 bool runOnModule(Module &M) override;
1362 void getAnalysisUsage(AnalysisUsage &AU) const override {
1363 AU.addPreserved<DominatorTreeWrapperPass>();
1364 }
1365};
1366} // namespace
1367
// Definition of the legacy pass identifier, and the llvm-namespace reference
// that aliases it.
1368char AMDGPUSwLowerLDSLegacy::ID = 0;
1369char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID;
1370
// Pass initialization for AMDGPUSwLowerLDSLegacy: argument string
// "amdgpu-sw-lower-lds", description "AMDGPU Software lowering of LDS", with
// TargetPassConfig declared as a dependency.
1371INITIALIZE_PASS_BEGIN(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1372 "AMDGPU Software lowering of LDS", false, false)
1373INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
1374INITIALIZE_PASS_END(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1375 "AMDGPU Software lowering of LDS", false, false)
1376
1377bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) {
1378 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1379 // instrumented the IR. Return early if the flag is not present.
1380 if (!M.getModuleFlag(Key: "nosanitize_address"))
1381 return false;
1382 DominatorTreeWrapperPass *const DTW =
1383 getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1384 auto DTCallback = [&DTW](Function &F) -> DominatorTree * {
1385 return DTW ? &DTW->getDomTree() : nullptr;
1386 };
1387
1388 AMDGPUSwLowerLDS SwLowerLDSImpl(M, DTCallback);
1389 bool IsChanged = SwLowerLDSImpl.run();
1390 return IsChanged;
1391}
1392
1393ModulePass *llvm::createAMDGPUSwLowerLDSLegacyPass() {
1394 return new AMDGPUSwLowerLDSLegacy();
1395}
1396
1397PreservedAnalyses AMDGPUSwLowerLDSPass::run(Module &M,
1398 ModuleAnalysisManager &AM) {
1399 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1400 // instrumented the IR. Return early if the flag is not present.
1401 if (!M.getModuleFlag(Key: "nosanitize_address"))
1402 return PreservedAnalyses::all();
1403 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
1404 auto DTCallback = [&FAM](Function &F) -> DominatorTree * {
1405 return &FAM.getResult<DominatorTreeAnalysis>(IR&: F);
1406 };
1407 AMDGPUSwLowerLDS SwLowerLDSImpl(M, DTCallback);
1408 bool IsChanged = SwLowerLDSImpl.run();
1409 if (!IsChanged)
1410 return PreservedAnalyses::all();
1411
1412 PreservedAnalyses PA;
1413 PA.preserve<DominatorTreeAnalysis>();
1414 return PA;
1415}
1416