AMDGPUPerfHintAnalysis.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp]

1	//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// \brief Analyzes if a function is potentially memory bound and if a
11	/// kernel may benefit from limiting number of waves to reduce cache thrashing.
12	///
13	//===----------------------------------------------------------------------===//
14
15	#include "AMDGPUPerfHintAnalysis.h"
16	#include "AMDGPU.h"
17	#include "AMDGPUTargetMachine.h"
18	#include "Utils/AMDGPUBaseInfo.h"
19	#include "llvm/ADT/Statistic.h"
20	#include "llvm/Analysis/CallGraph.h"
21	#include "llvm/Analysis/CallGraphSCCPass.h"
22	#include "llvm/Analysis/LazyCallGraph.h"
23	#include "llvm/Analysis/ValueTracking.h"
24	#include "llvm/CodeGen/TargetLowering.h"
25	#include "llvm/CodeGen/TargetPassConfig.h"
26	#include "llvm/CodeGen/TargetSubtargetInfo.h"
27	#include "llvm/IR/Instructions.h"
28	#include "llvm/IR/IntrinsicInst.h"
29	#include "llvm/Support/CommandLine.h"
30	#include "llvm/Target/TargetMachine.h"
31
32	using namespace llvm;
33
34	#define DEBUG_TYPE "amdgpu-perf-hint"
35
36	static cl::opt<unsigned>
37	MemBoundThresh("amdgpu-membound-threshold", cl::init(Val: `50`), cl::Hidden,
38	cl::desc ("Function mem bound threshold in %"));
39
40	static cl::opt<unsigned>
41	LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(Val: `50`), cl::Hidden,
42	cl::desc ("Kernel limit wave threshold in %"));
43
44	static cl::opt<unsigned>
45	IAWeight("amdgpu-indirect-access-weight", cl::init(Val: `1000`), cl::Hidden,
46	cl::desc ("Indirect access memory instruction weight"));
47
48	static cl::opt<unsigned>
49	LSWeight("amdgpu-large-stride-weight", cl::init(Val: `1000`), cl::Hidden,
50	cl::desc ("Large stride memory access weight"));
51
52	static cl::opt<unsigned>
53	LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(Val: `64`), cl::Hidden,
54	cl::desc ("Large stride memory access threshold"));
55
56	STATISTIC(NumMemBound, "Number of functions marked as memory bound");
57	STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
58
59	namespace {
60
61	struct AMDGPUPerfHint {
62	friend AMDGPUPerfHintAnalysis;
63
64	public:
65	AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
66	const SITargetLowering *TLI_)
67	: FIM(FIM_), TLI(TLI_) {}
68
69	bool runOnFunction(Function &F);
70
71	private:
72	struct MemAccessInfo {
73	const Value V = nullptr*;
74	const Value Base = nullptr*;
75	int64_t Offset = `0`;
76	MemAccessInfo() = default;
77	bool isLargeStride(MemAccessInfo &Reference) const;
78	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
79	Printable print() const {
80	return Printable([this](raw_ostream &OS) {
81	OS << "Value: " << *V << `'\n'`
82	<< "Base: " << *Base << " Offset: " << Offset << `'\n'`;
83	});
84	}
85	#endif
86	};
87
88	MemAccessInfo makeMemAccessInfo(Instruction ) const*;
89
90	MemAccessInfo LastAccess; // Last memory access info
91
92	AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;
93
94	const DataLayout DL = nullptr*;
95
96	const SITargetLowering *TLI;
97
98	AMDGPUPerfHintAnalysis::FuncInfo visit(const* Function &F);
99	static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
100	static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);
101
102	bool isIndirectAccess(const Instruction Inst) const*;
103
104	/// Check if the instruction is large stride.
105	/// The purpose is to identify memory access pattern like:
106	/// x = a[i];
107	/// y = a[i+1000];
108	/// z = a[i+2000];
109	/// In the above example, the second and third memory access will be marked
110	/// large stride memory access.
111	bool isLargeStride(const Instruction *Inst);
112
113	bool isGlobalAddr(const Value V) const*;
114	bool isLocalAddr(const Value V) const*;
115	bool isGlobalLoadUsedInBB(const Instruction &) const;
116	};
117
118	static std::pair<const Value , const* Type *> getMemoryInstrPtrAndType(
119	const Instruction *Inst) {
120	if (const auto *LI = dyn_cast<LoadInst>(Val: Inst))
121	return {LI->getPointerOperand(), LI->getType()};
122	if (const auto *SI = dyn_cast<StoreInst>(Val: Inst))
123	return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
124	if (const auto *AI = dyn_cast<AtomicCmpXchgInst>(Val: Inst))
125	return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
126	if (const auto *AI = dyn_cast<AtomicRMWInst>(Val: Inst))
127	return {AI->getPointerOperand(), AI->getValOperand()->getType()};
128	if (const auto *MI = dyn_cast<AnyMemIntrinsic>(Val: Inst))
129	return {MI->getRawDest(), Type::getInt8Ty(C&: MI->getContext())};
130
131	return {nullptr, nullptr};
132	}
133
134	bool AMDGPUPerfHint::isIndirectAccess(const Instruction Inst) const* {
135	LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << `'\n'`);
136	SmallPtrSet<const Value *, `32`> WorkSet;
137	SmallPtrSet<const Value *, `32`> Visited;
138	if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
139	if (isGlobalAddr(V: MO))
140	WorkSet.insert(Ptr: MO);
141	}
142
143	while (!WorkSet.empty()) {
144	const Value V = WorkSet.begin();
145	WorkSet.erase(Ptr: *WorkSet.begin());
146	if (!Visited.insert(Ptr: V).second)
147	continue;
148	LLVM_DEBUG(dbgs() << " check: " << *V << `'\n'`);
149
150	if (const auto *LD = dyn_cast<LoadInst>(Val: V)) {
151	const auto *M = LD->getPointerOperand();
152	if (isGlobalAddr(V: M)) {
153	LLVM_DEBUG(dbgs() << " is IA\n");
154	return true;
155	}
156	continue;
157	}
158
159	if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: V)) {
160	const auto *P = GEP->getPointerOperand();
161	WorkSet.insert(Ptr: P);
162	for (unsigned I = `1`, E = GEP->getNumIndices() + `1`; I != E; ++I)
163	WorkSet.insert(Ptr: GEP->getOperand(i_nocapture: I));
164	continue;
165	}
166
167	if (const auto *U = dyn_cast<UnaryInstruction>(Val: V)) {
168	WorkSet.insert(Ptr: U->getOperand(i_nocapture: `0`));
169	continue;
170	}
171
172	if (const auto *BO = dyn_cast<BinaryOperator>(Val: V)) {
173	WorkSet.insert(Ptr: BO->getOperand(i_nocapture: `0`));
174	WorkSet.insert(Ptr: BO->getOperand(i_nocapture: `1`));
175	continue;
176	}
177
178	if (const auto *S = dyn_cast<SelectInst>(Val: V)) {
179	WorkSet.insert(Ptr: S->getFalseValue());
180	WorkSet.insert(Ptr: S->getTrueValue());
181	continue;
182	}
183
184	if (const auto *E = dyn_cast<ExtractElementInst>(Val: V)) {
185	WorkSet.insert(Ptr: E->getVectorOperand());
186	continue;
187	}
188
189	LLVM_DEBUG(dbgs() << " dropped\n");
190	}
191
192	LLVM_DEBUG(dbgs() << " is not IA\n");
193	return false;
194	}
195
196	// Returns true if the global load `I` is used in its own basic block.
197	bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
198	const auto *Ld = dyn_cast<LoadInst>(Val: &I);
199	if (!Ld)
200	return false;
201	if (!isGlobalAddr(V: Ld->getPointerOperand()))
202	return false;
203
204	for (const User *Usr : Ld->users()) {
205	if (const Instruction *UsrInst = dyn_cast<Instruction>(Val: Usr)) {
206	if (UsrInst->getParent() == I.getParent())
207	return true;
208	}
209	}
210
211	return false;
212	}
213
214	AMDGPUPerfHintAnalysis::FuncInfo AMDGPUPerfHint::visit(const* Function &F) {
215	AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM [&F];
216
217	LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << `'\n'`);
218
219	for (auto &B : F) {
220	LastAccess = MemAccessInfo ();
221	unsigned UsedGlobalLoadsInBB = `0`;
222	for (auto &I : B) {
223	if (const Type *Ty = getMemoryInstrPtrAndType(Inst: &I).second) {
224	unsigned Size = divideCeil(Numerator: Ty->getPrimitiveSizeInBits(), Denominator: `32`);
225	// TODO: Check if the global load and its user are close to each other
226	// instead (Or do this analysis in GCNSchedStrategy?).
227	if (isGlobalLoadUsedInBB(I))
228	UsedGlobalLoadsInBB += Size;
229	if (isIndirectAccess(Inst: &I))
230	FI.IAMInstCost += Size;
231	if (isLargeStride(Inst: &I))
232	FI.LSMInstCost += Size;
233	FI.MemInstCost += Size;
234	FI.InstCost += Size;
235	continue;
236	}
237	if (auto *CB = dyn_cast<CallBase>(Val: &I)) {
238	Function *Callee = CB->getCalledFunction();
239	if (!Callee \|\| Callee->isDeclaration()) {
240	++FI.InstCost;
241	continue;
242	}
243	if (&F == Callee) // Handle immediate recursion
244	continue;
245
246	auto Loc = FIM.find(Val: Callee);
247	if (Loc == FIM.end())
248	continue;
249
250	FI.MemInstCost += Loc ->second.MemInstCost;
251	FI.InstCost += Loc ->second.InstCost;
252	FI.IAMInstCost += Loc ->second.IAMInstCost;
253	FI.LSMInstCost += Loc ->second.LSMInstCost;
254	} else if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) {
255	TargetLoweringBase::AddrMode AM;
256	auto Ptr = GetPointerBaseWithConstantOffset(Ptr: GEP, Offset&: AM.BaseOffs, DL: DL);
257	AM.BaseGV = dyn_cast_or_null<GlobalValue>(Val: const_cast<Value *>(Ptr));
258	AM.HasBaseReg = !AM.BaseGV;
259	if (TLI->isLegalAddressingMode(DL: *DL, AM, Ty: GEP->getResultElementType(),
260	AS: GEP->getPointerAddressSpace()))
261	// Offset will likely be folded into load or store
262	continue;
263	++FI.InstCost;
264	} else {
265	++FI.InstCost;
266	}
267	}
268
269	if (!FI.HasDenseGlobalMemAcc) {
270	unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * `100` / B.size();
271	if (GlobalMemAccPercentage > `50`) {
272	LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
273	<< B.getName() << " has " << GlobalMemAccPercentage
274	<< "% global memory access\n");
275	FI.HasDenseGlobalMemAcc = true;
276	}
277	}
278	}
279
280	return &FI;
281	}
282
283	bool AMDGPUPerfHint::runOnFunction(Function &F) {
284	const Module &M = *F.getParent();
285	DL = &M.getDataLayout();
286
287	if (F.hasFnAttribute(Kind: "amdgpu-wave-limiter") &&
288	F.hasFnAttribute(Kind: "amdgpu-memory-bound"))
289	return false;
290
291	const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);
292
293	LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
294	<< `'\n'`
295	<< " IAMInst cost: " << Info->IAMInstCost << `'\n'`
296	<< " LSMInst cost: " << Info->LSMInstCost << `'\n'`
297	<< " TotalInst cost: " << Info->InstCost << `'\n'`);
298
299	bool Changed = false;
300
301	if (isMemBound(F: *Info)) {
302	LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
303	NumMemBound ++;
304	F.addFnAttr(Kind: "amdgpu-memory-bound", Val: "true");
305	Changed = true;
306	}
307
308	if (AMDGPU::isEntryFunctionCC(CC: F.getCallingConv()) && needLimitWave(F: *Info)) {
309	LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
310	NumLimitWave ++;
311	F.addFnAttr(Kind: "amdgpu-wave-limiter", Val: "true");
312	Changed = true;
313	}
314
315	return Changed;
316	}
317
318	bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
319	// Reverting optimal scheduling in favour of occupancy with basic block(s)
320	// having dense global memory access can potentially hurt performance.
321	if (FI.HasDenseGlobalMemAcc)
322	return true;
323
324	return FI.MemInstCost * `100` / FI.InstCost > MemBoundThresh;
325	}
326
327	bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
328	return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
329	FI.LSMInstCost * LSWeight) * `100` / FI.InstCost) > LimitWaveThresh;
330	}
331
332	bool AMDGPUPerfHint::isGlobalAddr(const Value V) const* {
333	if (auto *PT = dyn_cast<PointerType>(Val: V->getType())) {
334	unsigned As = PT->getAddressSpace();
335	// Flat likely points to global too.
336	return As == AMDGPUAS::GLOBAL_ADDRESS \|\| As == AMDGPUAS::FLAT_ADDRESS;
337	}
338	return false;
339	}
340
341	bool AMDGPUPerfHint::isLocalAddr(const Value V) const* {
342	if (auto *PT = dyn_cast<PointerType>(Val: V->getType()))
343	return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
344	return false;
345	}
346
347	bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
348	LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << `'\n'`);
349
350	MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
351	bool IsLargeStride = MAI.isLargeStride(Reference&: LastAccess);
352	if (MAI.Base)
353	LastAccess = std::move(MAI);
354
355	return IsLargeStride;
356	}
357
358	AMDGPUPerfHint::MemAccessInfo
359	AMDGPUPerfHint::makeMemAccessInfo(Instruction Inst) const* {
360	MemAccessInfo MAI;
361	const Value *MO = getMemoryInstrPtrAndType(Inst).first;
362
363	LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << `'\n'`);
364	// Do not treat local-addr memory access as large stride.
365	if (isLocalAddr(V: MO))
366	return MAI;
367
368	MAI.V = MO;
369	MAI.Base = GetPointerBaseWithConstantOffset(Ptr: MO, Offset&: MAI.Offset, DL: *DL);
370	return MAI;
371	}
372
373	bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
374	MemAccessInfo &Reference) const {
375
376	if (!Base \|\| !Reference.Base \|\| Base != Reference.Base)
377	return false;
378
379	uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
380	: Reference.Offset - Offset;
381	bool Result = Diff > LargeStrideThresh;
382	LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
383	<< print() << "<=>\n"
384	<< Reference.print() << "Result:" << Result << `'\n'`);
385	return Result;
386	}
387
388	class AMDGPUPerfHintAnalysisLegacy : public CallGraphSCCPass {
389	private:
390	// FIXME: This is relying on maintaining state between different SCCs.
391	AMDGPUPerfHintAnalysis Impl;
392
393	public:
394	static char ID;
395
396	AMDGPUPerfHintAnalysisLegacy() : CallGraphSCCPass (ID) {}
397
398	bool runOnSCC(CallGraphSCC &SCC) override;
399
400	void getAnalysisUsage(AnalysisUsage &AU) const override {
401	AU.setPreservesAll();
402	}
403	};
404
405	} // namespace
406
407	bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function F) const* {
408	auto FI = FIM.find(Val: F);
409	if (FI == FIM.end())
410	return false;
411
412	return AMDGPUPerfHint::isMemBound(FI: FI ->second);
413	}
414
415	bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function F) const* {
416	auto FI = FIM.find(Val: F);
417	if (FI == FIM.end())
418	return false;
419
420	return AMDGPUPerfHint::needLimitWave(FI: FI ->second);
421	}
422
423	bool AMDGPUPerfHintAnalysis::runOnSCC(const GCNTargetMachine &TM,
424	CallGraphSCC &SCC) {
425	bool Changed = false;
426	for (CallGraphNode *I : SCC) {
427	Function *F = I->getFunction();
428	if (!F \|\| F->isDeclaration())
429	continue;
430
431	const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F: *F);
432	AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
433
434	if (Analyzer.runOnFunction(F&: *F))
435	Changed = true;
436	}
437
438	return Changed;
439	}
440
441	bool AMDGPUPerfHintAnalysis::run(const GCNTargetMachine &TM,
442	LazyCallGraph &CG) {
443	bool Changed = false;
444
445	CG.buildRefSCCs();
446
447	for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
448	for (LazyCallGraph::SCC &SCC : RC) {
449	if (SCC.size() != `1`)
450	continue;
451	Function &F = SCC.begin()->getFunction();
452	// TODO: Skip without norecurse, or interposable?
453	if (F.isDeclaration())
454	continue;
455
456	const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
457	AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
458	if (Analyzer.runOnFunction(F))
459	Changed = true;
460	}
461	}
462
463	return Changed;
464	}
465
466	char AMDGPUPerfHintAnalysisLegacy::ID = `0`;
467	char &llvm::AMDGPUPerfHintAnalysisLegacyID = AMDGPUPerfHintAnalysisLegacy::ID;
468
469	INITIALIZE_PASS(AMDGPUPerfHintAnalysisLegacy, DEBUG_TYPE,
470	"Analysis if a function is memory bound", true, true)
471
472	bool AMDGPUPerfHintAnalysisLegacy::runOnSCC(CallGraphSCC &SCC) {
473	auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
474	if (!TPC)
475	return false;
476
477	const GCNTargetMachine &TM = TPC->getTM<GCNTargetMachine>();
478	return Impl.runOnSCC(TM, SCC);
479	}
480
481	PreservedAnalyses AMDGPUPerfHintAnalysisPass::run(Module &M,
482	ModuleAnalysisManager &AM) {
483	auto &CG = AM.getResult<LazyCallGraphAnalysis>(IR&: M);
484
485	bool Changed = Impl ->run(TM, CG);
486	if (!Changed)
487	return PreservedAnalyses::all();
488
489	PreservedAnalyses PA;
490	PA.preserve<LazyCallGraphAnalysis>();
491	return PA;
492	}
493

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp