//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
#include "AMDGPUBaseInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/ReplaceConstant.h"

#define DEBUG_TYPE "amdgpu-memory-utils"

using namespace llvm;

namespace llvm::AMDGPU {

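// Return the alignment to use for an LDS global, derived from its declared
// alignment and the ABI alignment of its value type.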
Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

bool isDynamicLDS(const GlobalVariable &GV) {
  // An external, zero-sized addrspace(3) variable without an initializer is
  // dynamic LDS.
  const Module *M = GV.getParent();
  const DataLayout &DL = M->getDataLayout();
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
    return false;
  return DL.getTypeAllocSize(GV.getValueType()) == 0;
}

bool isLDSVariableToLower(const GlobalVariable &GV) {
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
    return false;
  }
  if (isDynamicLDS(GV)) {
    return true;
  }
  if (GV.isConstant()) {
    // A constant undef variable can't be written to, and any load is
    // undef, so it should be eliminated by the optimizer. It could be
    // dropped by the back end if not. This pass skips over it.
    return false;
  }
  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
    // Initializers are unimplemented for LDS address space.
    // Leave such variables in place for consistent error reporting.
    return false;
  }
  return true;
}

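// Rewrite constant-expression uses of LDS globals into instructions so that
// every use is anchored in a single function. Returns true if any change was
// made.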
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
  // Constants are uniqued within LLVM. A ConstantExpr referring to an LDS
  // global may therefore have uses in multiple different functions. This pass
  // specialises LDS variables with respect to the kernel that allocates them.

  // This is semantically equivalent to the following (not implemented that
  // way because it would be slow):
  // for (auto &F : M.functions())
  //   for (auto &BB : F)
  //     for (auto &I : BB)
  //       for (Use &Op : I.operands())
  //         if (constantExprUsesLDS(Op))
  //           replaceConstantExprInFunction(I, Op);

  SmallVector<Constant *> LDSGlobals;
  for (auto &GV : M.globals())
    if (AMDGPU::isLDSVariableToLower(GV))
      LDSGlobals.push_back(&GV);
  return convertUsersOfConstantsToInstructions(LDSGlobals);
}

void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
                            FunctionVariableMap &kernels,
                            FunctionVariableMap &Functions) {
  // Get uses from the current function, excluding uses by called functions.
  // Two output variables avoid walking the globals list twice.
  for (auto &GV : M.globals()) {
    if (!AMDGPU::isLDSVariableToLower(GV))
      continue;
    for (User *V : GV.users()) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        Function *F = I->getFunction();
        if (isKernelLDS(F))
          kernels[F].insert(&GV);
        else
          Functions[F].insert(&GV);
      }
    }
  }
}

bool isKernelLDS(const Function *F) {
  // Some weirdness here. AMDGPU::isKernelCC does not call into
  // AMDGPU::isKernel with the calling convention; it instead calls into
  // isModuleEntryFunction, which returns true for more calling conventions
  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
  // There's also a test that checks that the LDS lowering does not hit on
  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
  // Putting LDS in the name of the function to draw attention to this.
  return AMDGPU::isKernel(F->getCallingConv());
}

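// Compute, for every kernel, which LDS variables it accesses directly and
// which it can reach indirectly through calls (conservatively including
// variables used by address-taken functions when an unknown callee is seen).
//
// Sketch of typical use (illustrative only):
//   LDSUsesInfoTy Uses = getTransitiveUsesOfLDS(CG, M);
//   for (auto &[Kernel, Vars] : Uses.direct_access) {
//     // decide how to allocate Vars for Kernel
//   }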
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {

  FunctionVariableMap DirectMapKernel;
  FunctionVariableMap DirectMapFunction;
  getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);

  // Collect variables that are used by functions whose address has escaped.
  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
  for (Function &F : M.functions()) {
    if (!isKernelLDS(&F))
      if (F.hasAddressTaken(nullptr,
                            /* IgnoreCallbackUses */ false,
                            /* IgnoreAssumeLikeCalls */ false,
                            /* IgnoreLLVMUsed */ true,
                            /* IgnoreArcAttachedCall */ false)) {
        set_union(VariablesReachableThroughFunctionPointer,
                  DirectMapFunction[&F]);
      }
  }

  auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
    assert(!F->isDeclaration());
    for (const CallGraphNode::CallRecord &R : *CG[F]) {
      if (!R.second->getFunction())
        return true;
    }
    return false;
  };

  // Work out which variables are reachable through function calls.
  FunctionVariableMap TransitiveMapFunction = DirectMapFunction;

  // If a function makes any unknown call, assume the worst case: it can
  // access all variables accessed by functions whose address escaped.
  for (Function &F : M.functions()) {
    if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
      if (!isKernelLDS(&F)) {
        set_union(TransitiveMapFunction[&F],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Direct implementation of collecting all variables reachable from each
  // function.
  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || isKernelLDS(&Func))
      continue;

    DenseSet<Function *> seen; // catches cycles
    SmallVector<Function *, 4> wip = {&Func};

    while (!wip.empty()) {
      Function *F = wip.pop_back_val();

      // This could be accelerated by consulting the transitive map for
      // functions that have already been computed, but that needs more care
      // than is taken here.
      set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);

      for (const CallGraphNode::CallRecord &R : *CG[F]) {
        Function *Ith = R.second->getFunction();
        if (Ith) {
          if (!seen.contains(Ith)) {
            seen.insert(Ith);
            wip.push_back(Ith);
          }
        }
      }
    }
  }

  // DirectMapKernel lists which variables each kernel uses directly; now find
  // the variables that are reached through a function call.
  FunctionVariableMap IndirectMapKernel;

  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || !isKernelLDS(&Func))
      continue;

    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
      Function *Ith = R.second->getFunction();
      if (Ith) {
        set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
      } else {
        set_union(IndirectMapKernel[&Func],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Verify that we fall into one of 2 cases:
  //    - All variables are either absolute or direct-mapped dynamic LDS that
  //      is not lowered. This is a re-run of the pass, so there is nothing
  //      to do.
  //    - No variables are absolute.
  std::optional<bool> HasAbsoluteGVs;
  for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
    for (auto &[Fn, GVs] : Map) {
      for (auto *GV : GVs) {
        bool IsAbsolute = GV->isAbsoluteSymbolRef();
        bool IsDirectMapDynLDSGV =
            AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
        if (IsDirectMapDynLDSGV)
          continue;
        if (HasAbsoluteGVs.has_value()) {
          if (*HasAbsoluteGVs != IsAbsolute) {
            report_fatal_error(
                "Module cannot mix absolute and non-absolute LDS GVs");
          }
        } else
          HasAbsoluteGVs = IsAbsolute;
      }
    }
  }

  // If we only had absolute GVs, we have nothing to do; return an empty
  // result.
  if (HasAbsoluteGVs && *HasAbsoluteGVs)
    return {FunctionVariableMap(), FunctionVariableMap()};

  return {std::move(DirectMapKernel), std::move(IndirectMapKernel)};
}

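// Remove each attribute in FnAttrs from KernelRoot and from every function
// reachable from it in the call graph. If an indirect call is encountered,
// conservatively remove the attributes from every non-kernel function
// attached to the call graph's external-calling node as well.
//
// Illustrative call (attribute name chosen for the example):
//   removeFnAttrFromReachable(CG, Kernel, {"amdgpu-no-lds-kernel-id"});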
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
                               ArrayRef<StringRef> FnAttrs) {
  for (StringRef Attr : FnAttrs)
    KernelRoot->removeFnAttr(Attr);

  SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
  SmallPtrSet<Function *, 8> Visited;
  bool SeenUnknownCall = false;

  while (!WorkList.empty()) {
    Function *F = WorkList.pop_back_val();

    for (auto &CallRecord : *CG[F]) {
      if (!CallRecord.second)
        continue;

      Function *Callee = CallRecord.second->getFunction();
      if (!Callee) {
        if (!SeenUnknownCall) {
          SeenUnknownCall = true;

          // If we see any indirect calls, assume nothing about potential
          // targets.
          // TODO: This could be refined to possible LDS global users.
          for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
            Function *PotentialCallee =
                ExternalCallRecord.second->getFunction();
            assert(PotentialCallee);
            if (!isKernelLDS(PotentialCallee)) {
              for (StringRef Attr : FnAttrs)
                PotentialCallee->removeFnAttr(Attr);
            }
          }
        }
      } else {
        for (StringRef Attr : FnAttrs)
          Callee->removeFnAttr(Attr);
        if (Visited.insert(Callee).second)
          WorkList.push_back(Callee);
      }
    }
  }
}

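// Given a MemoryDef that MemorySSA reports as clobbering Ptr, decide whether
// it can actually write the memory Ptr points to. Fences, AMDGPU barrier
// intrinsics, and atomics proven not to alias Ptr are not real clobbers.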
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
  Instruction *DefInst = Def->getMemoryInst();

  if (isa<FenceInst>(DefInst))
    return false;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_s_barrier_signal:
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
    case Intrinsic::amdgcn_s_barrier_init:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_barrier_wait:
    case Intrinsic::amdgcn_s_barrier_leave:
    case Intrinsic::amdgcn_s_get_barrier_state:
    case Intrinsic::amdgcn_s_wakeup_barrier:
    case Intrinsic::amdgcn_wave_barrier:
    case Intrinsic::amdgcn_sched_barrier:
    case Intrinsic::amdgcn_sched_group_barrier:
      return false;
    default:
      break;
    }
  }

  // Ignore atomics that do not alias the original load; any atomic is a
  // universal MemoryDef from MSSA's point of view too, just like a fence.
  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
  };

  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
    return false;

  return true;
}

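// Return true if anything between function entry and Load may write to the
// memory that Load reads, walking MemorySSA upwards from the load and
// discarding accesses that isReallyAClobber rules out.
//
// Sketch of typical use (LI, MSSA and AA are the caller's objects):
//   if (!isClobberedInFunction(&LI, &MSSA, &AA))
//     ; // the loaded location is not written to before the load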
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                           AAResults *AA) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access; it will be either
  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
  // a MemoryPhi if several MemoryDefs can define this memory state. In the
  // latter case add all Defs to the WorkList and keep walking up, checking
  // every definition of this memory location, until the root. If all defs are
  // exhausted and we reach the entry state, there is no clobber. Along the
  // scan, ignore barriers and fences, which MemorySSA treats as clobbers even
  // though they do not actually write anything to memory.
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      LLVM_DEBUG(dbgs() << " Def: " << *Def->getMemoryInst() << '\n');

      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
        LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
        return true;
      }

      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (const auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << " -> no clobber\n");
  return false;
}

} // end namespace llvm::AMDGPU