1 | //===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
12 | /// |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "AMDGPU.h" |
16 | #include "AMDGPUPerfHintAnalysis.h" |
17 | #include "Utils/AMDGPUBaseInfo.h" |
18 | #include "llvm/ADT/SmallSet.h" |
19 | #include "llvm/ADT/Statistic.h" |
20 | #include "llvm/Analysis/CallGraph.h" |
21 | #include "llvm/Analysis/ValueTracking.h" |
22 | #include "llvm/CodeGen/TargetLowering.h" |
23 | #include "llvm/CodeGen/TargetPassConfig.h" |
24 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
25 | #include "llvm/IR/Instructions.h" |
26 | #include "llvm/IR/IntrinsicInst.h" |
27 | #include "llvm/Support/CommandLine.h" |
28 | #include "llvm/Target/TargetMachine.h" |
29 | |
30 | using namespace llvm; |
31 | |
32 | #define DEBUG_TYPE "amdgpu-perf-hint" |
33 | |
static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64),
                      cl::Hidden,
                      cl::desc("Large stride memory access threshold"));
53 | |
STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
56 | |
57 | char llvm::AMDGPUPerfHintAnalysis::ID = 0; |
58 | char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID; |
59 | |
60 | INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE, |
61 | "Analysis if a function is memory bound" , true, true) |
62 | |
63 | namespace { |
64 | |
65 | struct AMDGPUPerfHint { |
66 | friend AMDGPUPerfHintAnalysis; |
67 | |
68 | public: |
69 | AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_, |
70 | const TargetLowering *TLI_) |
71 | : FIM(FIM_), TLI(TLI_) {} |
72 | |
73 | bool runOnFunction(Function &F); |
74 | |
75 | private: |
76 | struct MemAccessInfo { |
77 | const Value *V = nullptr; |
78 | const Value *Base = nullptr; |
79 | int64_t Offset = 0; |
80 | MemAccessInfo() = default; |
81 | bool isLargeStride(MemAccessInfo &Reference) const; |
82 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
83 | Printable print() const { |
84 | return Printable([this](raw_ostream &OS) { |
85 | OS << "Value: " << *V << '\n' |
86 | << "Base: " << *Base << " Offset: " << Offset << '\n'; |
87 | }); |
88 | } |
89 | #endif |
90 | }; |
91 | |
92 | MemAccessInfo makeMemAccessInfo(Instruction *) const; |
93 | |
94 | MemAccessInfo LastAccess; // Last memory access info |
95 | |
96 | AMDGPUPerfHintAnalysis::FuncInfoMap &FIM; |
97 | |
98 | const DataLayout *DL = nullptr; |
99 | |
100 | const TargetLowering *TLI; |
101 | |
102 | AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F); |
103 | static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F); |
104 | static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F); |
105 | |
106 | bool isIndirectAccess(const Instruction *Inst) const; |
107 | |
108 | /// Check if the instruction is large stride. |
109 | /// The purpose is to identify memory access pattern like: |
110 | /// x = a[i]; |
111 | /// y = a[i+1000]; |
112 | /// z = a[i+2000]; |
113 | /// In the above example, the second and third memory access will be marked |
114 | /// large stride memory access. |
115 | bool isLargeStride(const Instruction *Inst); |
116 | |
117 | bool isGlobalAddr(const Value *V) const; |
118 | bool isLocalAddr(const Value *V) const; |
119 | bool isGlobalLoadUsedInBB(const Instruction &) const; |
120 | }; |
121 | |
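// Return the pointer operand of a memory instruction together with the type
// of the value it loads or stores; for memory intrinsics the raw destination
// pointer and i8 are used. Returns {nullptr, nullptr} if Inst does not
// access memory.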
122 | static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType( |
123 | const Instruction *Inst) { |
  if (auto LI = dyn_cast<LoadInst>(Inst))
    return {LI->getPointerOperand(), LI->getType()};
  if (auto SI = dyn_cast<StoreInst>(Inst))
    return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst))
    return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst))
    return {AI->getPointerOperand(), AI->getValOperand()->getType()};
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst))
    return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};
134 | |
135 | return {nullptr, nullptr}; |
136 | } |
137 | |
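// An access is "indirect" when its address is itself (transitively) produced
// by a load from global memory, e.g. a pointer or index fetched from a
// table. In C-like terms:
//
//   int *P = Table[I]; // load of a pointer from global memory
//   X = *P;            // "*P" is an indirect access
//
// Starting from the pointer operand of a global access, walk backwards
// through GEPs, casts, binary operators, selects and extractelements; if any
// contributing value is loaded from a global address, classify the access as
// indirect.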
138 | bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { |
139 | LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); |
140 | SmallSet<const Value *, 32> WorkSet; |
141 | SmallSet<const Value *, 32> Visited; |
142 | if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) { |
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
145 | } |
146 | |
147 | while (!WorkSet.empty()) { |
148 | const Value *V = *WorkSet.begin(); |
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
151 | continue; |
152 | LLVM_DEBUG(dbgs() << " check: " << *V << '\n'); |
153 | |
    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M)) {
        LLVM_DEBUG(dbgs() << " is IA\n");
158 | return true; |
159 | } |
160 | continue; |
161 | } |
162 | |
    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
168 | continue; |
169 | } |
170 | |
    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
173 | continue; |
174 | } |
175 | |
    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
179 | continue; |
180 | } |
181 | |
    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
185 | continue; |
186 | } |
187 | |
    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
190 | continue; |
191 | } |
192 | |
193 | LLVM_DEBUG(dbgs() << " dropped\n" ); |
194 | } |
195 | |
196 | LLVM_DEBUG(dbgs() << " is not IA\n" ); |
197 | return false; |
198 | } |
199 | |
200 | // Returns true if the global load `I` is used in its own basic block. |
201 | bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const { |
  const auto *Ld = dyn_cast<LoadInst>(&I);
203 | if (!Ld) |
204 | return false; |
  if (!isGlobalAddr(Ld->getPointerOperand()))
206 | return false; |
207 | |
208 | for (const User *Usr : Ld->users()) { |
    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
210 | if (UsrInst->getParent() == I.getParent()) |
211 | return true; |
212 | } |
213 | } |
214 | |
215 | return false; |
216 | } |
217 | |
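// Compute (and cache in FIM) the cost info for F. Memory instructions are
// costed in 32-bit words moved; calls fold the callee's previously computed
// costs into the caller, which works because the SCC pass visits callees
// before callers.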
218 | AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { |
219 | AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F]; |
220 | |
221 | LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n'); |
222 | |
223 | for (auto &B : F) { |
224 | LastAccess = MemAccessInfo(); |
225 | unsigned UsedGlobalLoadsInBB = 0; |
226 | for (auto &I : B) { |
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
229 | // TODO: Check if the global load and its user are close to each other |
230 | // instead (Or do this analysis in GCNSchedStrategy?). |
231 | if (isGlobalLoadUsedInBB(I)) |
232 | UsedGlobalLoadsInBB += Size; |
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
236 | FI.LSMInstCost += Size; |
237 | FI.MemInstCost += Size; |
238 | FI.InstCost += Size; |
239 | continue; |
240 | } |
      if (auto *CB = dyn_cast<CallBase>(&I)) {
242 | Function *Callee = CB->getCalledFunction(); |
243 | if (!Callee || Callee->isDeclaration()) { |
244 | ++FI.InstCost; |
245 | continue; |
246 | } |
247 | if (&F == Callee) // Handle immediate recursion |
248 | continue; |
249 | |
        auto Loc = FIM.find(Callee);
251 | if (Loc == FIM.end()) |
252 | continue; |
253 | |
254 | FI.MemInstCost += Loc->second.MemInstCost; |
255 | FI.InstCost += Loc->second.InstCost; |
256 | FI.IAMInstCost += Loc->second.IAMInstCost; |
257 | FI.LSMInstCost += Loc->second.LSMInstCost; |
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
265 | // Offset will likely be folded into load or store |
266 | continue; |
267 | ++FI.InstCost; |
268 | } else { |
269 | ++FI.InstCost; |
270 | } |
271 | } |
272 | |
273 | if (!FI.HasDenseGlobalMemAcc) { |
274 | unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size(); |
275 | if (GlobalMemAccPercentage > 50) { |
276 | LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since " |
277 | << B.getName() << " has " << GlobalMemAccPercentage |
278 | << "% global memory access\n" ); |
279 | FI.HasDenseGlobalMemAcc = true; |
280 | } |
281 | } |
282 | } |
283 | |
284 | return &FI; |
285 | } |
286 | |
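// Analyze F and attach the "amdgpu-memory-bound" and/or "amdgpu-wave-limiter"
// attributes when the corresponding thresholds are exceeded. Functions
// already carrying both attributes are skipped.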
287 | bool AMDGPUPerfHint::runOnFunction(Function &F) { |
288 | const Module &M = *F.getParent(); |
289 | DL = &M.getDataLayout(); |
290 | |
  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
293 | return false; |
294 | |
295 | const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F); |
296 | |
297 | LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost |
298 | << '\n' |
299 | << " IAMInst cost: " << Info->IAMInstCost << '\n' |
300 | << " LSMInst cost: " << Info->LSMInstCost << '\n' |
301 | << " TotalInst cost: " << Info->InstCost << '\n'); |
302 | |
303 | bool Changed = false; |
304 | |
  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
307 | NumMemBound++; |
308 | F.addFnAttr(Kind: "amdgpu-memory-bound" , Val: "true" ); |
309 | Changed = true; |
310 | } |
311 | |
  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
314 | NumLimitWave++; |
315 | F.addFnAttr(Kind: "amdgpu-wave-limiter" , Val: "true" ); |
316 | Changed = true; |
317 | } |
318 | |
319 | return Changed; |
320 | } |
321 | |
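// A function is considered memory bound if any of its blocks is dense with
// global loads (HasDenseGlobalMemAcc), or if memory instructions account for
// more than MemBoundThresh percent of its total instruction cost.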
322 | bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { |
323 | // Reverting optimal scheduling in favour of occupancy with basic block(s) |
324 | // having dense global memory access can potentially hurt performance. |
325 | if (FI.HasDenseGlobalMemAcc) |
326 | return true; |
327 | |
328 | return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh; |
329 | } |
330 | |
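// Weighted occupancy heuristic: indirect and large-stride accesses are far
// more likely to thrash caches, so they are weighted by IAWeight and
// LSWeight (1000x each by default) before comparing the aggregate memory
// cost, as a percentage of total instruction cost, against LimitWaveThresh.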
331 | bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { |
332 | return ((FI.MemInstCost + FI.IAMInstCost * IAWeight + |
333 | FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh; |
334 | } |
335 | |
336 | bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const { |
337 | if (auto PT = dyn_cast<PointerType>(Val: V->getType())) { |
338 | unsigned As = PT->getAddressSpace(); |
339 | // Flat likely points to global too. |
340 | return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS; |
341 | } |
342 | return false; |
343 | } |
344 | |
345 | bool AMDGPUPerfHint::isLocalAddr(const Value *V) const { |
  if (auto PT = dyn_cast<PointerType>(V->getType()))
347 | return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; |
348 | return false; |
349 | } |
350 | |
351 | bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) { |
352 | LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n'); |
353 | |
354 | MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst)); |
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
356 | if (MAI.Base) |
357 | LastAccess = std::move(MAI); |
358 | |
359 | return IsLargeStride; |
360 | } |
361 | |
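// Decompose the address of Inst into base pointer plus constant byte offset.
// Local (LDS) accesses are left with a null Base, so they never participate
// in stride comparisons.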
362 | AMDGPUPerfHint::MemAccessInfo |
363 | AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { |
364 | MemAccessInfo MAI; |
365 | const Value *MO = getMemoryInstrPtrAndType(Inst).first; |
366 | |
367 | LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n'); |
368 | // Do not treat local-addr memory access as large stride. |
  if (isLocalAddr(MO))
370 | return MAI; |
371 | |
372 | MAI.V = MO; |
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
374 | return MAI; |
375 | } |
376 | |
377 | bool AMDGPUPerfHint::MemAccessInfo::isLargeStride( |
378 | MemAccessInfo &Reference) const { |
379 | |
380 | if (!Base || !Reference.Base || Base != Reference.Base) |
381 | return false; |
382 | |
383 | uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset |
384 | : Reference.Offset - Offset; |
385 | bool Result = Diff > LargeStrideThresh; |
386 | LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n" |
387 | << print() << "<=>\n" |
388 | << Reference.print() << "Result:" << Result << '\n'); |
389 | return Result; |
390 | } |
391 | } // namespace |
392 | |
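// SCCs are visited in bottom-up call-graph order, so when a caller is
// analyzed, FuncInfo for its (non-recursive) callees is already in FIM.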
393 | bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) { |
394 | auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); |
395 | if (!TPC) |
396 | return false; |
397 | |
398 | const TargetMachine &TM = TPC->getTM<TargetMachine>(); |
399 | |
400 | bool Changed = false; |
401 | for (CallGraphNode *I : SCC) { |
402 | Function *F = I->getFunction(); |
403 | if (!F || F->isDeclaration()) |
404 | continue; |
405 | |
406 | const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F); |
407 | AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering()); |
408 | |
    if (Analyzer.runOnFunction(*F))
410 | Changed = true; |
411 | } |
412 | |
413 | return Changed; |
414 | } |
415 | |
416 | bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const { |
  auto FI = FIM.find(F);
418 | if (FI == FIM.end()) |
419 | return false; |
420 | |
  return AMDGPUPerfHint::isMemBound(FI->second);
422 | } |
423 | |
424 | bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const { |
  auto FI = FIM.find(F);
426 | if (FI == FIM.end()) |
427 | return false; |
428 | |
  return AMDGPUPerfHint::needLimitWave(FI->second);
430 | } |
431 | |