//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const SITargetLowering *TLI_)
      : FIM(FIM_), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V = nullptr;
    const Value *Base = nullptr;
    int64_t Offset = 0;
    MemAccessInfo() = default;
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL = nullptr;

  const SITargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is large stride.
  /// The purpose is to identify memory access pattern like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory access will be marked
  /// large stride memory access.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isGlobalLoadUsedInBB(const Instruction &) const;
};

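// Return the pointer operand and the type of the accessed memory for a memory
// instruction, or {nullptr, nullptr} if Inst does not access memory.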
static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
    const Instruction *Inst) {
  if (const auto *LI = dyn_cast<LoadInst>(Inst))
    return {LI->getPointerOperand(), LI->getType()};
  if (const auto *SI = dyn_cast<StoreInst>(Inst))
    return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
  if (const auto *AI = dyn_cast<AtomicCmpXchgInst>(Inst))
    return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
  if (const auto *AI = dyn_cast<AtomicRMWInst>(Inst))
    return {AI->getPointerOperand(), AI->getValOperand()->getType()};
  if (const auto *MI = dyn_cast<AnyMemIntrinsic>(Inst))
    return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};

  return {nullptr, nullptr};
}

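// A memory access is considered indirect when its address is (transitively)
// computed from a value that is itself loaded from global memory. Walk the
// address computation backwards (GEPs, unary casts, selects, binary operators,
// extractelement) looking for such a load.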
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << " check: " << *V << '\n');

    if (const auto *LD = dyn_cast<LoadInst>(V)) {
      const auto *M = LD->getPointerOperand();
      if (isGlobalAddr(M)) {
        LLVM_DEBUG(dbgs() << " is IA\n");
        return true;
      }
      continue;
    }

    if (const auto *GEP = dyn_cast<GetElementPtrInst>(V)) {
      const auto *P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (const auto *U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (const auto *BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (const auto *S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (const auto *E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << " dropped\n");
  }

  LLVM_DEBUG(dbgs() << " is not IA\n");
  return false;
}

// Returns true if the global load `I` is used in its own basic block.
bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
  const auto *Ld = dyn_cast<LoadInst>(&I);
  if (!Ld)
    return false;
  if (!isGlobalAddr(Ld->getPointerOperand()))
    return false;

  for (const User *Usr : Ld->users()) {
    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
      if (UsrInst->getParent() == I.getParent())
        return true;
    }
  }

  return false;
}

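// Build the cost summary (FuncInfo) for F: accumulate estimated memory and
// total instruction costs, count indirect and large-stride accesses, fold in
// the summaries of already-visited callees, and flag basic blocks with dense
// global memory access.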
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    unsigned UsedGlobalLoadsInBB = 0;
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        // TODO: Check if the global load and its user are close to each other
        // instead (Or do this analysis in GCNSchedStrategy?).
        if (isGlobalLoadUsedInBB(I))
          UsedGlobalLoadsInBB += Size;
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
          FI.LSMInstCost += Size;
        FI.MemInstCost += Size;
        FI.InstCost += Size;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCost;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCost += Loc->second.MemInstCost;
        FI.InstCost += Loc->second.InstCost;
        FI.IAMInstCost += Loc->second.IAMInstCost;
        FI.LSMInstCost += Loc->second.LSMInstCost;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCost;
      } else {
        ++FI.InstCost;
      }
    }

    if (!FI.HasDenseGlobalMemAcc) {
      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
      if (GlobalMemAccPercentage > 50) {
        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
                          << B.getName() << " has " << GlobalMemAccPercentage
                          << "% global memory access\n");
        FI.HasDenseGlobalMemAcc = true;
      }
    }
  }

  return &FI;
}

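// Analyze F and, when the heuristics fire, mark it with the
// "amdgpu-memory-bound" attribute and/or (for entry functions) the
// "amdgpu-wave-limiter" attribute. Returns true if any attribute was added.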
bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
                    << '\n'
                    << " IAMInst cost: " << Info->IAMInstCost << '\n'
                    << " LSMInst cost: " << Info->LSMInstCost << '\n'
                    << " TotalInst cost: " << Info->InstCost << '\n');

  bool Changed = false;

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
    Changed = true;
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
    Changed = true;
  }

  return Changed;
}

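// Consider a function memory bound if it contains a basic block with dense
// global memory access, or if memory instructions account for more than
// MemBoundThresh percent of the estimated total instruction cost.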
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  // Reverting optimal scheduling in favour of occupancy with basic block(s)
  // having dense global memory access can potentially hurt performance.
  if (FI.HasDenseGlobalMemAcc)
    return true;

  return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}

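// Suggest limiting waves when the weighted memory cost (indirect accesses
// weighted by IAWeight, large-stride accesses by LSWeight) exceeds
// LimitWaveThresh percent of the estimated total instruction cost.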
bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
           FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto *PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto *PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

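// Detect a large-stride access by comparing against the previous memory
// access recorded for the current basic block (LastAccess).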
bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

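// Decompose the address of Inst into a base pointer and constant offset.
// Local (LDS) accesses are not tracked for large-stride detection.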
358 | |
359 | AMDGPUPerfHint::MemAccessInfo |
360 | AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { |
361 | MemAccessInfo MAI; |
362 | const Value *MO = getMemoryInstrPtrAndType(Inst).first; |
363 | |
364 | LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n'); |
365 | // Do not treat local-addr memory access as large stride. |
366 | if (isLocalAddr(V: MO)) |
367 | return MAI; |
368 | |
369 | MAI.V = MO; |
370 | MAI.Base = GetPointerBaseWithConstantOffset(Ptr: MO, Offset&: MAI.Offset, DL: *DL); |
371 | return MAI; |
372 | } |
373 | |
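// Two accesses form a large stride when they share the same base pointer and
// their constant offsets differ by more than LargeStrideThresh.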
bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}

class AMDGPUPerfHintAnalysisLegacy : public CallGraphSCCPass {
private:
  // FIXME: This is relying on maintaining state between different SCCs.
  AMDGPUPerfHintAnalysis Impl;

public:
  static char ID;

  AMDGPUPerfHintAnalysisLegacy() : CallGraphSCCPass(ID) {}

  bool runOnSCC(CallGraphSCC &SCC) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

} // namespace

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}

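// Legacy pass manager entry point. SCCs are visited bottom-up, so callee
// summaries are already present in FIM when their callers are analyzed.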
bool AMDGPUPerfHintAnalysis::runOnSCC(const GCNTargetMachine &TM,
                                      CallGraphSCC &SCC) {
  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
    AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}

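// New pass manager entry point: walk RefSCCs in post-order and analyze each
// single-function SCC; recursive (multi-function) SCCs are skipped.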
bool AMDGPUPerfHintAnalysis::run(const GCNTargetMachine &TM,
                                 LazyCallGraph &CG) {
  bool Changed = false;

  CG.buildRefSCCs();

  for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
    for (LazyCallGraph::SCC &SCC : RC) {
      if (SCC.size() != 1)
        continue;
      Function &F = SCC.begin()->getFunction();
      // TODO: Skip without norecurse, or interposable?
      if (F.isDeclaration())
        continue;

      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
      if (Analyzer.runOnFunction(F))
        Changed = true;
    }
  }

  return Changed;
}

char AMDGPUPerfHintAnalysisLegacy::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisLegacyID = AMDGPUPerfHintAnalysisLegacy::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysisLegacy, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

bool AMDGPUPerfHintAnalysisLegacy::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const GCNTargetMachine &TM = TPC->getTM<GCNTargetMachine>();
  return Impl.runOnSCC(TM, SCC);
}

PreservedAnalyses AMDGPUPerfHintAnalysisPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {
  auto &CG = AM.getResult<LazyCallGraphAnalysis>(M);

  bool Changed = Impl->run(TM, CG);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserve<LazyCallGraphAnalysis>();
  return PA;
}