//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/ReplaceConstant.h"

#define DEBUG_TYPE "amdgpu-memory-utils"

using namespace llvm;

namespace llvm::AMDGPU {

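// Return the alignment to assume for \p GV: its declared or inferred pointer
// alignment when available, otherwise the ABI alignment of its value type.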
Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

TargetExtType *isNamedBarrier(const GlobalVariable &GV) {
  // TODO: Allow arrays and structs, if all members are barriers
  // in the same scope.
  // TODO: Disallow other uses of target("amdgcn.named.barrier") including:
  // - Structs containing barriers in different scope.
  // - Structs containing a mixture of barriers and other data.
  // - Globals in other address spaces.
  // - Allocas.
  Type *Ty = GV.getValueType();
  while (true) {
    if (auto *TTy = dyn_cast<TargetExtType>(Ty))
      return TTy->getName() == "amdgcn.named.barrier" ? TTy : nullptr;
    if (auto *STy = dyn_cast<StructType>(Ty)) {
      if (STy->getNumElements() == 0)
        return nullptr;
      Ty = STy->getElementType(0);
      continue;
    }
    return nullptr;
  }
}

bool isDynamicLDS(const GlobalVariable &GV) {
  // An external, zero-size addrspace(3) global without an initializer is
  // dynamic LDS.
  const Module *M = GV.getParent();
  const DataLayout &DL = M->getDataLayout();
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
    return false;
  return DL.getTypeAllocSize(GV.getValueType()) == 0;
}

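// Decide whether an addrspace(3) global should be lowered by the LDS passes:
// dynamic LDS always is; constant globals and globals with a real (non-undef)
// initializer are left in place.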
bool isLDSVariableToLower(const GlobalVariable &GV) {
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
    return false;
  }
  if (isDynamicLDS(GV)) {
    return true;
  }
  if (GV.isConstant()) {
    // A constant undef variable can't be written to, and any load is
    // undef, so it should be eliminated by the optimizer. It could be
    // dropped by the back end if not. This pass skips over it.
    return false;
  }
  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
    // Initializers are unimplemented for LDS address space.
    // Leave such variables in place for consistent error reporting.
    return false;
  }
  return true;
}

bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
  // Constants are uniqued within LLVM. A ConstantExpr referring to an LDS
  // global may have uses from multiple different functions as a result.
  // This pass specialises LDS variables with respect to the kernel that
  // allocates them.

  // This is semantically equivalent to the following, which is not
  // implemented that way because it would be slow:
  // for (auto &F : M.functions())
  //   for (auto &BB : F)
  //     for (auto &I : BB)
  //       for (Use &Op : I.operands())
  //         if (constantExprUsesLDS(Op))
  //           replaceConstantExprInFunction(I, Op);

  SmallVector<Constant *> LDSGlobals;
  for (auto &GV : M.globals())
    if (AMDGPU::isLDSVariableToLower(GV))
      LDSGlobals.push_back(&GV);
  return convertUsersOfConstantsToInstructions(LDSGlobals);
}

void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
                            FunctionVariableMap &kernels,
                            FunctionVariableMap &Functions) {
  // Get uses from the current function, excluding uses by called functions.
  // Two output variables to avoid walking the globals list twice.
  for (auto &GV : M.globals()) {
    if (!AMDGPU::isLDSVariableToLower(GV))
      continue;
    for (User *V : GV.users()) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        Function *F = I->getFunction();
        if (isKernelLDS(F))
          kernels[F].insert(&GV);
        else
          Functions[F].insert(&GV);
      }
    }
  }
}

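// LDS is allocated per kernel launch, so any function with a kernel calling
// convention is treated as a kernel for LDS purposes.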
bool isKernelLDS(const Function *F) {
  return AMDGPU::isKernel(F->getCallingConv());
}

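// Compute, for each kernel, the LDS variables it uses directly and those it
// can reach indirectly through the call graph. Indirect calls are handled
// conservatively via the set of variables used by address-taken functions.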
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {

  FunctionVariableMap DirectMapKernel;
  FunctionVariableMap DirectMapFunction;
  getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);

  // Collect functions whose address has escaped
  DenseSet<Function *> AddressTakenFuncs;
  for (Function &F : M.functions()) {
    if (!isKernelLDS(&F))
      if (F.hasAddressTaken(nullptr,
                            /* IgnoreCallbackUses */ false,
                            /* IgnoreAssumeLikeCalls */ false,
                            /* IgnoreLLVMUsed */ true,
                            /* IgnoreArcAttachedCall */ false)) {
        AddressTakenFuncs.insert(&F);
      }
  }

  // Collect variables that are used by functions whose address has escaped
  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
  for (Function *F : AddressTakenFuncs) {
    set_union(VariablesReachableThroughFunctionPointer, DirectMapFunction[F]);
  }

  auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
    assert(!F->isDeclaration());
    for (const CallGraphNode::CallRecord &R : *CG[F]) {
      if (!R.second->getFunction())
        return true;
    }
    return false;
  };

  // Work out which variables are reachable through function calls
  FunctionVariableMap TransitiveMapFunction = DirectMapFunction;

  // If the function makes any unknown call, assume the worst case that it can
  // access all variables accessed by functions whose address escaped
  for (Function &F : M.functions()) {
    if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
      if (!isKernelLDS(&F)) {
        set_union(TransitiveMapFunction[&F],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Direct implementation of collecting all variables reachable from each
  // function
  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || isKernelLDS(&Func))
      continue;

    DenseSet<Function *> seen; // catches cycles
    SmallVector<Function *, 4> wip = {&Func};

    while (!wip.empty()) {
      Function *F = wip.pop_back_val();

      // Can accelerate this by referring to transitive map for functions that
      // have already been computed, with more care than this
      set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);

      for (const CallGraphNode::CallRecord &R : *CG[F]) {
        Function *Ith = R.second->getFunction();
        if (Ith) {
          if (!seen.contains(Ith)) {
            seen.insert(Ith);
            wip.push_back(Ith);
          }
        }
      }
    }
  }

  // Collect variables that are transitively used by functions whose address
  // has escaped
  for (Function *F : AddressTakenFuncs) {
    set_union(VariablesReachableThroughFunctionPointer,
              TransitiveMapFunction[F]);
  }

  // DirectMapKernel lists which variables are used by the kernel.
  // Find the variables which are used through a function call.
  FunctionVariableMap IndirectMapKernel;

  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || !isKernelLDS(&Func))
      continue;

    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
      Function *Ith = R.second->getFunction();
      if (Ith) {
        set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
      }
    }

    // Check if the kernel encounters unknown calls, whether directly or
    // indirectly.
    bool SeesUnknownCalls = [&]() {
      SmallVector<Function *> WorkList = {CG[&Func]->getFunction()};
      SmallPtrSet<Function *, 8> Visited;

      while (!WorkList.empty()) {
        Function *F = WorkList.pop_back_val();

        for (const CallGraphNode::CallRecord &CallRecord : *CG[F]) {
          if (!CallRecord.second)
            continue;

          Function *Callee = CallRecord.second->getFunction();
          if (!Callee)
            return true;

          if (Visited.insert(Callee).second)
            WorkList.push_back(Callee);
        }
      }
      return false;
    }();

    if (SeesUnknownCalls) {
      set_union(IndirectMapKernel[&Func],
                VariablesReachableThroughFunctionPointer);
    }
  }

  // Verify that we fall into one of 2 cases:
  // - All variables are either absolute
  //   or direct mapped dynamic LDS that is not lowered.
  //   This means the pass has already run and there is nothing left to do.
  // - No variables are absolute.
  std::optional<bool> HasAbsoluteGVs;
  bool HasSpecialGVs = false;
  for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
    for (auto &[Fn, GVs] : Map) {
      for (auto *GV : GVs) {
        bool IsAbsolute = GV->isAbsoluteSymbolRef();
        bool IsDirectMapDynLDSGV =
            AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
        if (IsDirectMapDynLDSGV)
          continue;
        if (isNamedBarrier(*GV)) {
          HasSpecialGVs = true;
          continue;
        }
        if (HasAbsoluteGVs.has_value()) {
          if (*HasAbsoluteGVs != IsAbsolute) {
            reportFatalUsageError(
                "module cannot mix absolute and non-absolute LDS GVs");
          }
        } else
          HasAbsoluteGVs = IsAbsolute;
      }
    }
  }

  // If we only had absolute GVs, we have nothing to do; return an empty
  // result.
  if (HasAbsoluteGVs && *HasAbsoluteGVs)
    return {FunctionVariableMap(), FunctionVariableMap(), false};

  return {std::move(DirectMapKernel), std::move(IndirectMapKernel),
          HasSpecialGVs};
}

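// Strip each attribute in \p FnAttrs from \p KernelRoot and from every
// function reachable from it in the call graph. If an indirect call is seen,
// conservatively strip the attributes from all non-kernel functions known to
// the external calling node as well.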
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
                               ArrayRef<StringRef> FnAttrs) {
  for (StringRef Attr : FnAttrs)
    KernelRoot->removeFnAttr(Attr);

  SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
  SmallPtrSet<Function *, 8> Visited;
  bool SeenUnknownCall = false;

  while (!WorkList.empty()) {
    Function *F = WorkList.pop_back_val();

    for (auto &CallRecord : *CG[F]) {
      if (!CallRecord.second)
        continue;

      Function *Callee = CallRecord.second->getFunction();
      if (!Callee) {
        if (!SeenUnknownCall) {
          SeenUnknownCall = true;

          // If we see any indirect calls, assume nothing about potential
          // targets.
          // TODO: This could be refined to possible LDS global users.
          for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
            Function *PotentialCallee =
                ExternalCallRecord.second->getFunction();
            assert(PotentialCallee);
            if (!isKernelLDS(PotentialCallee)) {
              for (StringRef Attr : FnAttrs)
                PotentialCallee->removeFnAttr(Attr);
            }
          }
        }
      } else {
        for (StringRef Attr : FnAttrs)
          Callee->removeFnAttr(Attr);
        if (Visited.insert(Callee).second)
          WorkList.push_back(Callee);
      }
    }
  }
}

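// Given a MemoryDef that MemorySSA reports as clobbering \p Ptr, check whether
// it can actually write memory aliasing \p Ptr. Fences, AMDGPU barrier and
// scheduling intrinsics, and atomics proven not to alias the pointer are not
// treated as real clobbers.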
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
  Instruction *DefInst = Def->getMemoryInst();

  if (isa<FenceInst>(DefInst))
    return false;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_s_barrier_signal:
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_wait:
    case Intrinsic::amdgcn_s_get_barrier_state:
    case Intrinsic::amdgcn_wave_barrier:
    case Intrinsic::amdgcn_sched_barrier:
    case Intrinsic::amdgcn_sched_group_barrier:
    case Intrinsic::amdgcn_iglp_opt:
      return false;
    default:
      break;
    }
  }

  // Ignore atomics that do not alias the original load; any atomic is a
  // universal MemoryDef from MemorySSA's point of view, just like a fence.
  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
  };

  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
    return false;

  return true;
}

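// Walk MemorySSA upwards from the nearest dominating clobbering access of
// \p Load and report whether any real clobber (per isReallyAClobber) of the
// load's memory location is reachable before the live-on-entry state.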
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                           AAResults *AA) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access. It will be either
  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
  // a MemoryPhi if several MemoryDefs can define this memory state. In that
  // case add all Defs to the WorkList and continue going up, checking all
  // the definitions of this memory location until the root. When all the
  // defs are exhausted and we have reached the entry state, there is no
  // clobber. Along the scan, ignore barriers and fences, which MemorySSA
  // considers clobbers but which do not really write anything into memory.
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      LLVM_DEBUG(dbgs() << " Def: " << *Def->getMemoryInst() << '\n');

      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
        LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
        return true;
      }

      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (const auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << " -> no clobber\n");
  return false;
}

} // end namespace llvm::AMDGPU