//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/ReplaceConstant.h"

#define DEBUG_TYPE "amdgpu-memory-utils"

using namespace llvm;

namespace llvm::AMDGPU {

Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

// Returns the target extension type of a global variable, which can only be
// a TargetExtType, an array or single-element struct of it, or a nested
// combination of those.
// TODO: Allow structs of multiple TargetExtType elements of the same type.
// TODO: Disallow other uses of target("amdgcn.named.barrier") including:
//     - Structs containing barriers in different scope/rank.
//     - Structs containing a mixture of barriers and other data.
//     - Globals in other address spaces.
//     - Allocas.
static TargetExtType *getTargetExtType(const GlobalVariable &GV) {
  Type *Ty = GV.getValueType();
  while (true) {
    if (auto *TTy = dyn_cast<TargetExtType>(Ty))
      return TTy;
    if (auto *STy = dyn_cast<StructType>(Ty)) {
      if (STy->getNumElements() != 1)
        return nullptr;
      Ty = STy->getElementType(0);
      continue;
    }
    if (auto *ATy = dyn_cast<ArrayType>(Ty)) {
      Ty = ATy->getElementType();
      continue;
    }
    return nullptr;
  }
}

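// Returns the underlying target("amdgcn.named.barrier") type if GV wraps one,
// and nullptr otherwise. For example (illustrative IR, an assumed shape rather
// than one quoted from a specific test):
//   @bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
// resolves to target("amdgcn.named.barrier", 0) here.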
TargetExtType *isNamedBarrier(const GlobalVariable &GV) {
  if (TargetExtType *Ty = getTargetExtType(GV))
    return Ty->getName() == "amdgcn.named.barrier" ? Ty : nullptr;
  return nullptr;
}

bool isDynamicLDS(const GlobalVariable &GV) {
  // An external, zero-sized addrspace(3) variable without an initializer is
  // dynamic LDS.
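  // For example (illustrative IR, an assumed shape rather than one quoted
  // from a specific test):
  //   @dynlds = external addrspace(3) global [0 x i32], align 4
  // has no initializer and a zero allocation size, so it is dynamic LDS.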
  const Module *M = GV.getParent();
  const DataLayout &DL = M->getDataLayout();
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
    return false;
  return GV.getGlobalSize(DL) == 0;
}

bool isLDSVariableToLower(const GlobalVariable &GV) {
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
    return false;
  }
  if (isDynamicLDS(GV)) {
    return true;
  }
  if (GV.isConstant()) {
    // A constant undef variable can't be written to, and any load is
    // undef, so it should be eliminated by the optimizer. It could be
    // dropped by the back end if not. This pass skips over it.
    return false;
  }
  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
    // Initializers are unimplemented for LDS address space.
    // Leave such variables in place for consistent error reporting.
    return false;
  }
  return true;
}

bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
  // Constants are uniqued within LLVM. A ConstantExpr referring to an LDS
  // global may therefore have uses in multiple different functions. This pass
  // specialises LDS variables with respect to the kernel that allocates them.

  // This is semantically equivalent to (but unimplemented because it would be
  // slow):
  // for (auto &F : M.functions())
  //   for (auto &BB : F)
  //     for (auto &I : BB)
  //       for (Use &Op : I.operands())
  //         if (constantExprUsesLDS(Op))
  //           replaceConstantExprInFunction(I, Op);
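  //
  // For example (illustrative, hypothetical IR): a constant expression such as
  //   getelementptr (i32, ptr addrspace(3) @lds, i32 1)
  // used directly as an instruction operand in two different functions is
  // rewritten so that each function materialises its own getelementptr
  // instruction, giving every use of @lds a well-defined parent function.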

  SmallVector<Constant *> LDSGlobals;
  for (auto &GV : M.globals())
    if (AMDGPU::isLDSVariableToLower(GV))
      LDSGlobals.push_back(&GV);
  return convertUsersOfConstantsToInstructions(LDSGlobals);
}

void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
                            FunctionVariableMap &kernels,
                            FunctionVariableMap &Functions) {
  // Get uses from the current function, excluding uses by called functions.
  // Two output variables to avoid walking the globals list twice.
  for (auto &GV : M.globals()) {
    if (!AMDGPU::isLDSVariableToLower(GV))
      continue;
    for (User *V : GV.users()) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        Function *F = I->getFunction();
        if (isKernel(*F))
          kernels[F].insert(&GV);
        else
          Functions[F].insert(&GV);
      }
    }
  }
}

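// Computes which LDS variables each kernel can reach, either directly or
// through the functions it may call. Indirect and unknown calls are handled
// conservatively: a function that makes an unknown call is assumed to reach
// every variable used by any address-taken function.
//
// Illustrative example (hypothetical functions and globals): if kernel @k
// calls @f, @f uses @lds0 and also makes an indirect call, and the
// address-taken function @h uses @lds1, then the indirect map for @k ends up
// containing both @lds0 and @lds1.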
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {

  FunctionVariableMap DirectMapKernel;
  FunctionVariableMap DirectMapFunction;
  getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);

  // Collect functions whose address has escaped
  DenseSet<Function *> AddressTakenFuncs;
  for (Function &F : M.functions()) {
    if (!isKernel(F))
      if (F.hasAddressTaken(nullptr,
                            /* IgnoreCallbackUses */ false,
                            /* IgnoreAssumeLikeCalls */ false,
                            /* IgnoreLLVMUsed */ true,
                            /* IgnoreArcAttachedCall */ false)) {
        AddressTakenFuncs.insert(&F);
      }
  }

  // Collect variables that are used by functions whose address has escaped
  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
  for (Function *F : AddressTakenFuncs) {
    set_union(VariablesReachableThroughFunctionPointer, DirectMapFunction[F]);
  }

  auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
    assert(!F->isDeclaration());
    for (const CallGraphNode::CallRecord &R : *CG[F]) {
      if (!R.second->getFunction())
        return true;
    }
    return false;
  };

  // Work out which variables are reachable through function calls
  FunctionVariableMap TransitiveMapFunction = DirectMapFunction;

  // If a function makes any unknown call, assume the worst case: it can access
  // all variables accessed by functions whose address has escaped.
  for (Function &F : M.functions()) {
    if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
      if (!isKernel(F)) {
        set_union(TransitiveMapFunction[&F],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Direct implementation of collecting all variables reachable from each
  // function
  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || isKernel(Func))
      continue;

    DenseSet<Function *> seen; // catches cycles
    SmallVector<Function *, 4> wip = {&Func};

    while (!wip.empty()) {
      Function *F = wip.pop_back_val();

      // This could be accelerated by consulting the transitive map for
      // functions that have already been computed, with more care than this.
      set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);

      for (const CallGraphNode::CallRecord &R : *CG[F]) {
        Function *Ith = R.second->getFunction();
        if (Ith) {
          if (!seen.contains(Ith)) {
            seen.insert(Ith);
            wip.push_back(Ith);
          }
        }
      }
    }
  }


  // Collect variables that are transitively used by functions whose address
  // has escaped.
  for (Function *F : AddressTakenFuncs) {
    set_union(VariablesReachableThroughFunctionPointer,
              TransitiveMapFunction[F]);
  }

  // DirectMapKernel lists which variables are used directly by each kernel;
  // now find the variables which are used through a function call.
  FunctionVariableMap IndirectMapKernel;

  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || !isKernel(Func))
      continue;

    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
      Function *Ith = R.second->getFunction();
      if (Ith) {
        set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
      }
    }

    // Check if the kernel encounters unknown calls, whether directly or
    // indirectly.
    bool SeesUnknownCalls = [&]() {
      SmallVector<Function *> WorkList = {CG[&Func]->getFunction()};
      SmallPtrSet<Function *, 8> Visited;

      while (!WorkList.empty()) {
        Function *F = WorkList.pop_back_val();

        for (const CallGraphNode::CallRecord &CallRecord : *CG[F]) {
          if (!CallRecord.second)
            continue;

          Function *Callee = CallRecord.second->getFunction();
          if (!Callee)
            return true;

          if (Visited.insert(Callee).second)
            WorkList.push_back(Callee);
        }
      }
      return false;
    }();

    if (SeesUnknownCalls) {
      set_union(IndirectMapKernel[&Func],
                VariablesReachableThroughFunctionPointer);
    }
  }


  // Verify that we fall into one of 2 cases:
  //    - All variables are either absolute
  //      or direct mapped dynamic LDS that is not lowered.
  //      This is a re-run of the pass, so there is nothing to do.
  //    - No variables are absolute.
  // Named barriers that are absolute symbols are removed from the maps.
  std::optional<bool> HasAbsoluteGVs;
  bool HasSpecialGVs = false;
  for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
    for (auto &[Fn, GVs] : Map) {
      for (auto *GV : GVs) {
        bool IsAbsolute = GV->isAbsoluteSymbolRef();
        bool IsDirectMapDynLDSGV =
            AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
        if (IsDirectMapDynLDSGV)
          continue;
        if (isNamedBarrier(*GV)) {
          if (IsAbsolute) {
            DirectMapKernel[Fn].erase(GV);
            IndirectMapKernel[Fn].erase(GV);
          }
          HasSpecialGVs = true;
          continue;
        }
        if (HasAbsoluteGVs.has_value()) {
          if (*HasAbsoluteGVs != IsAbsolute) {
            reportFatalUsageError(
                "module cannot mix absolute and non-absolute LDS GVs");
          }
        } else
          HasAbsoluteGVs = IsAbsolute;
      }
    }
  }

  // If we only had absolute GVs, there is nothing to do; return an empty
  // result.
  if (HasAbsoluteGVs && *HasAbsoluteGVs)
    return {FunctionVariableMap(), FunctionVariableMap(), false};

  return {std::move(DirectMapKernel), std::move(IndirectMapKernel),
          HasSpecialGVs};
}

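// Removes the given function attributes from KernelRoot and from every
// function reachable from it in the call graph. If a call with an unknown
// callee is encountered, the attributes are conservatively removed from every
// non-kernel function reachable from the call graph's external calling node
// as well.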
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
                               ArrayRef<StringRef> FnAttrs) {
  for (StringRef Attr : FnAttrs)
    KernelRoot->removeFnAttr(Attr);

  SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
  SmallPtrSet<Function *, 8> Visited;
  bool SeenUnknownCall = false;

  while (!WorkList.empty()) {
    Function *F = WorkList.pop_back_val();

    for (auto &CallRecord : *CG[F]) {
      if (!CallRecord.second)
        continue;

      Function *Callee = CallRecord.second->getFunction();
      if (!Callee) {
        if (!SeenUnknownCall) {
          SeenUnknownCall = true;

          // If we see any indirect calls, assume nothing about potential
          // targets.
          // TODO: This could be refined to possible LDS global users.
          for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
            Function *PotentialCallee =
                ExternalCallRecord.second->getFunction();
            assert(PotentialCallee);
            if (!isKernel(*PotentialCallee)) {
              for (StringRef Attr : FnAttrs)
                PotentialCallee->removeFnAttr(Attr);
            }
          }
        }
      } else {
        for (StringRef Attr : FnAttrs)
          Callee->removeFnAttr(Attr);
        if (Visited.insert(Callee).second)
          WorkList.push_back(Callee);
      }
    }
  }
}

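// Returns true if the MemoryDef may actually write memory that a load from
// Ptr could observe. MemorySSA conservatively models fences, AMDGPU barrier
// and scheduling intrinsics, and atomics as universal clobbers; this filters
// out the ones that cannot modify the loaded location.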
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
  Instruction *DefInst = Def->getMemoryInst();

  if (isa<FenceInst>(DefInst))
    return false;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_s_cluster_barrier:
    case Intrinsic::amdgcn_s_barrier_signal:
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_init:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_barrier_wait:
    case Intrinsic::amdgcn_s_barrier_leave:
    case Intrinsic::amdgcn_s_get_barrier_state:
    case Intrinsic::amdgcn_s_wakeup_barrier:
    case Intrinsic::amdgcn_wave_barrier:
    case Intrinsic::amdgcn_sched_barrier:
    case Intrinsic::amdgcn_sched_group_barrier:
    case Intrinsic::amdgcn_iglp_opt:
      return false;
    default:
      break;
    }
  }

  // Ignore atomics that do not alias the original load. Any atomic is a
  // universal MemoryDef from MemorySSA's point of view, just like a fence.
  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
  };

  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
    return false;

  return true;
}

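// Returns true if the load may be clobbered by a real write (as decided by
// isReallyAClobber) anywhere between function entry and the load, by walking
// MemorySSA defs and phis upwards from the load's clobbering access.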
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                           AAResults *AA) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallPtrSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access; it will be either
  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
  // a MemoryPhi if several MemoryDefs can define this memory state. In that
  // case add all Defs to the WorkList and continue going up, checking all
  // the definitions of this memory location until the root. When all the
  // defs are exhausted and we have reached the entry state, there is no
  // clobber. Along the way, ignore barriers and fences, which MemorySSA
  // considers clobbers even though they do not actually write any memory.
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      LLVM_DEBUG(dbgs() << " Def: " << *Def->getMemoryInst() << '\n');

      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
        LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
        return true;
      }

      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (const auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << " -> no clobber\n");
  return false;
}

} // end namespace llvm::AMDGPU