//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
#include "AMDGPUBaseInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/ReplaceConstant.h"

#define DEBUG_TYPE "amdgpu-memory-utils"

using namespace llvm;

namespace llvm::AMDGPU {

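// Return the alignment to use for an LDS global, derived from its declared
// alignment and the ABI alignment of its value type.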
Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

bool isDynamicLDS(const GlobalVariable &GV) {
  // An external, zero-sized addrspace(3) variable without an initializer is
  // dynamic LDS.
  const Module *M = GV.getParent();
  const DataLayout &DL = M->getDataLayout();
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
    return false;
  return DL.getTypeAllocSize(GV.getValueType()) == 0;
}

bool isLDSVariableToLower(const GlobalVariable &GV) {
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
    return false;
  }
  if (isDynamicLDS(GV)) {
    return true;
  }
  if (GV.isConstant()) {
    // A constant undef variable can't be written to, and any load is
    // undef, so it should be eliminated by the optimizer. It could be
    // dropped by the back end if not. This pass skips over it.
    return false;
  }
  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
    // Initializers are unimplemented for LDS address space.
    // Leave such variables in place for consistent error reporting.
    return false;
  }
  return true;
}

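// Rewrite constant-expression uses of LDS globals into instructions so that
// every use is anchored in a single function. Returns true if any change was
// made.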
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
  // Constants are uniqued within LLVM. A ConstantExpr referring to an LDS
  // global may therefore have uses in multiple different functions. This pass
  // specialises LDS variables with respect to the kernel that allocates them.

  // This is semantically equivalent to the following (not implemented that
  // way because it would be slow):
  // for (auto &F : M.functions())
  //   for (auto &BB : F)
  //     for (auto &I : BB)
  //       for (Use &Op : I.operands())
  //         if (constantExprUsesLDS(Op))
  //           replaceConstantExprInFunction(I, Op);

  SmallVector<Constant *> LDSGlobals;
  for (auto &GV : M.globals())
    if (AMDGPU::isLDSVariableToLower(GV))
      LDSGlobals.push_back(&GV);
  return convertUsersOfConstantsToInstructions(LDSGlobals);
}

void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
                            FunctionVariableMap &kernels,
                            FunctionVariableMap &Functions) {
  // Get uses from the current function, excluding uses by called functions.
  // Two output variables avoid walking the globals list twice.
  for (auto &GV : M.globals()) {
    if (!AMDGPU::isLDSVariableToLower(GV))
      continue;
    for (User *V : GV.users()) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        Function *F = I->getFunction();
        if (isKernelLDS(F))
          kernels[F].insert(&GV);
        else
          Functions[F].insert(&GV);
      }
    }
  }
}

bool isKernelLDS(const Function *F) {
  // Some weirdness here. AMDGPU::isKernelCC does not call into
  // AMDGPU::isKernel with the calling convention; it instead calls into
  // isModuleEntryFunction, which returns true for more calling conventions
  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
  // There's also a test that checks that the LDS lowering does not hit on
  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
  // Putting LDS in the name of the function to draw attention to this.
  return AMDGPU::isKernel(F->getCallingConv());
}

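// Compute, for every kernel, which LDS variables it accesses directly and
// which it can reach indirectly through calls (conservatively including
// variables used by address-taken functions when an unknown callee is seen).
//
// Sketch of typical use (illustrative only):
//   LDSUsesInfoTy Uses = getTransitiveUsesOfLDS(CG, M);
//   for (auto &[Kernel, Vars] : Uses.direct_access) {
//     // decide how to allocate Vars for Kernel
//   }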
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {

  FunctionVariableMap DirectMapKernel;
  FunctionVariableMap DirectMapFunction;
  getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);

  // Collect variables that are used by functions whose address has escaped.
  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
  for (Function &F : M.functions()) {
    if (!isKernelLDS(&F))
      if (F.hasAddressTaken(nullptr,
                            /* IgnoreCallbackUses */ false,
                            /* IgnoreAssumeLikeCalls */ false,
                            /* IgnoreLLVMUsed */ true,
                            /* IgnoreArcAttachedCall */ false)) {
        set_union(VariablesReachableThroughFunctionPointer,
                  DirectMapFunction[&F]);
      }
  }

  auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
    assert(!F->isDeclaration());
    for (const CallGraphNode::CallRecord &R : *CG[F]) {
      if (!R.second->getFunction())
        return true;
    }
    return false;
  };

  // Work out which variables are reachable through function calls.
  FunctionVariableMap TransitiveMapFunction = DirectMapFunction;

  // If a function makes any unknown call, assume the worst case: it can
  // access all variables accessed by functions whose address escaped.
  for (Function &F : M.functions()) {
    if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
      if (!isKernelLDS(&F)) {
        set_union(TransitiveMapFunction[&F],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Direct implementation of collecting all variables reachable from each
  // function.
  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || isKernelLDS(&Func))
      continue;

    DenseSet<Function *> seen; // catches cycles
    SmallVector<Function *, 4> wip = {&Func};

    while (!wip.empty()) {
      Function *F = wip.pop_back_val();

      // This could be accelerated by consulting the transitive map for
      // functions that have already been computed, but that needs more care
      // than is taken here.
      set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);

      for (const CallGraphNode::CallRecord &R : *CG[F]) {
        Function *Ith = R.second->getFunction();
        if (Ith) {
          if (!seen.contains(Ith)) {
            seen.insert(Ith);
            wip.push_back(Ith);
          }
        }
      }
    }
  }

  // DirectMapKernel lists which variables each kernel uses directly; now find
  // the variables that are reached through a function call.
  FunctionVariableMap IndirectMapKernel;

  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || !isKernelLDS(&Func))
      continue;

    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
      Function *Ith = R.second->getFunction();
      if (Ith) {
        set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
      } else {
        set_union(IndirectMapKernel[&Func],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Verify that we fall into one of 2 cases:
  //    - All variables are either absolute or direct-mapped dynamic LDS that
  //      is not lowered. This is a re-run of the pass, so there is nothing
  //      to do.
  //    - No variables are absolute.
  std::optional<bool> HasAbsoluteGVs;
  for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
    for (auto &[Fn, GVs] : Map) {
      for (auto *GV : GVs) {
        bool IsAbsolute = GV->isAbsoluteSymbolRef();
        bool IsDirectMapDynLDSGV =
            AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
        if (IsDirectMapDynLDSGV)
          continue;
        if (HasAbsoluteGVs.has_value()) {
          if (*HasAbsoluteGVs != IsAbsolute) {
            report_fatal_error(
                "Module cannot mix absolute and non-absolute LDS GVs");
          }
        } else
          HasAbsoluteGVs = IsAbsolute;
      }
    }
  }

  // If we only had absolute GVs, we have nothing to do; return an empty
  // result.
  if (HasAbsoluteGVs && *HasAbsoluteGVs)
    return {FunctionVariableMap(), FunctionVariableMap()};

  return {std::move(DirectMapKernel), std::move(IndirectMapKernel)};
}

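// Remove each attribute in FnAttrs from KernelRoot and from every function
// reachable from it in the call graph. If an indirect call is encountered,
// conservatively remove the attributes from every non-kernel function
// attached to the call graph's external-calling node as well.
//
// Illustrative call (attribute name chosen for the example):
//   removeFnAttrFromReachable(CG, Kernel, {"amdgpu-no-lds-kernel-id"});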
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
                               ArrayRef<StringRef> FnAttrs) {
  for (StringRef Attr : FnAttrs)
    KernelRoot->removeFnAttr(Attr);

  SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
  SmallPtrSet<Function *, 8> Visited;
  bool SeenUnknownCall = false;

  while (!WorkList.empty()) {
    Function *F = WorkList.pop_back_val();

    for (auto &CallRecord : *CG[F]) {
      if (!CallRecord.second)
        continue;

      Function *Callee = CallRecord.second->getFunction();
      if (!Callee) {
        if (!SeenUnknownCall) {
          SeenUnknownCall = true;

          // If we see any indirect calls, assume nothing about potential
          // targets.
          // TODO: This could be refined to possible LDS global users.
          for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
            Function *PotentialCallee =
                ExternalCallRecord.second->getFunction();
            assert(PotentialCallee);
            if (!isKernelLDS(PotentialCallee)) {
              for (StringRef Attr : FnAttrs)
                PotentialCallee->removeFnAttr(Attr);
            }
          }
        }
      } else {
        for (StringRef Attr : FnAttrs)
          Callee->removeFnAttr(Attr);
        if (Visited.insert(Callee).second)
          WorkList.push_back(Callee);
      }
    }
  }
}

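// Given a MemoryDef that MemorySSA reports as clobbering Ptr, decide whether
// it can actually write the memory Ptr points to. Fences, AMDGPU barrier
// intrinsics, and atomics proven not to alias Ptr are not real clobbers.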
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
  Instruction *DefInst = Def->getMemoryInst();

  if (isa<FenceInst>(DefInst))
    return false;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_s_barrier_signal:
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
    case Intrinsic::amdgcn_s_barrier_init:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_barrier_wait:
    case Intrinsic::amdgcn_s_barrier_leave:
    case Intrinsic::amdgcn_s_get_barrier_state:
    case Intrinsic::amdgcn_s_wakeup_barrier:
    case Intrinsic::amdgcn_wave_barrier:
    case Intrinsic::amdgcn_sched_barrier:
    case Intrinsic::amdgcn_sched_group_barrier:
      return false;
    default:
      break;
    }
  }

  // Ignore atomics that do not alias the original load; any atomic is a
  // universal MemoryDef from MSSA's point of view too, just like a fence.
  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
  };

  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
    return false;

  return true;
}

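// Return true if anything between function entry and Load may write to the
// memory that Load reads, walking MemorySSA upwards from the load and
// discarding accesses that isReallyAClobber rules out.
//
// Sketch of typical use (LI, MSSA and AA are the caller's objects):
//   if (!isClobberedInFunction(&LI, &MSSA, &AA))
//     ; // the loaded location is not written to before the load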
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                           AAResults *AA) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access; it will be either
  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
  // a MemoryPhi if several MemoryDefs can define this memory state. In the
  // latter case add all Defs to the WorkList and keep walking up, checking
  // every definition of this memory location, until the root. If all defs are
  // exhausted and we reach the entry state, there is no clobber. Along the
  // scan, ignore barriers and fences, which MemorySSA treats as clobbers even
  // though they do not actually write anything to memory.
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      LLVM_DEBUG(dbgs() << " Def: " << *Def->getMemoryInst() << '\n');

      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
        LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
        return true;
      }

      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (const auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << " -> no clobber\n");
  return false;
}

} // end namespace llvm::AMDGPU