1 | //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Eliminates allocas by either converting them into vectors or by migrating |
10 | // them to local address space. |
11 | // |
12 | // Two passes are exposed by this file: |
13 | // - "promote-alloca-to-vector", which runs early in the pipeline and only |
14 | // promotes to vector. Promotion to vector is almost always profitable |
15 | // except when the alloca is too big and the promotion would result in |
16 | // very high register pressure. |
17 | // - "promote-alloca", which does both promotion to vector and LDS and runs |
18 | // much later in the pipeline. This runs after SROA because promoting to |
19 | // LDS is of course less profitable than getting rid of the alloca or |
20 | // vectorizing it, thus we only want to do it when the only alternative is |
21 | // lowering the alloca to stack. |
22 | // |
23 | // Note that both of them exist for the old and new PMs. The new PM passes are |
// declared in AMDGPU.h and the legacy PM ones are declared here.
25 | // |
26 | //===----------------------------------------------------------------------===// |
27 | |
28 | #include "AMDGPU.h" |
29 | #include "GCNSubtarget.h" |
30 | #include "Utils/AMDGPUBaseInfo.h" |
31 | #include "llvm/ADT/STLExtras.h" |
32 | #include "llvm/Analysis/CaptureTracking.h" |
33 | #include "llvm/Analysis/InstSimplifyFolder.h" |
34 | #include "llvm/Analysis/InstructionSimplify.h" |
35 | #include "llvm/Analysis/LoopInfo.h" |
36 | #include "llvm/Analysis/ValueTracking.h" |
37 | #include "llvm/CodeGen/TargetPassConfig.h" |
38 | #include "llvm/IR/IRBuilder.h" |
39 | #include "llvm/IR/IntrinsicInst.h" |
40 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
41 | #include "llvm/IR/IntrinsicsR600.h" |
42 | #include "llvm/IR/PatternMatch.h" |
43 | #include "llvm/InitializePasses.h" |
44 | #include "llvm/Pass.h" |
45 | #include "llvm/Target/TargetMachine.h" |
46 | #include "llvm/Transforms/Utils/SSAUpdater.h" |
47 | |
48 | #define DEBUG_TYPE "amdgpu-promote-alloca" |
49 | |
50 | using namespace llvm; |
51 | |
52 | namespace { |
53 | |
static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(false));

static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));

static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
    "amdgpu-promote-alloca-to-vector-max-regs",
    cl::desc(
        "Maximum vector size (in 32b registers) to use when promoting alloca"),
    cl::init(16));

// Use up to 1/4 of available register budget for vectorization.
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
    cl::init(4));

static cl::opt<unsigned>
    LoopUserWeight("promote-alloca-vector-loop-user-weight",
                   cl::desc("The bonus weight of users of allocas within loop "
                            "when sorting profitable allocas"),
                   cl::init(4));
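
// For example, with the default LoopUserWeight of 4, an alloca with two users
// outside any loop and one user at loop depth 1 scores
// (1 + 4*0) + (1 + 4*0) + (1 + 4*1) = 7 when sortAllocasToPromote runs below,
// and is visited before an alloca whose three users all sit outside loops
// (score 3).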
87 | |
88 | // Shared implementation which can do both promotion to vector and to LDS. |
89 | class AMDGPUPromoteAllocaImpl { |
90 | private: |
91 | const TargetMachine &TM; |
92 | LoopInfo &LI; |
93 | Module *Mod = nullptr; |
94 | const DataLayout *DL = nullptr; |
95 | |
96 | // FIXME: This should be per-kernel. |
97 | uint32_t LocalMemLimit = 0; |
98 | uint32_t CurrentLocalMemUsage = 0; |
99 | unsigned MaxVGPRs; |
100 | unsigned VGPRBudgetRatio; |
101 | unsigned MaxVectorRegs; |
102 | |
103 | bool IsAMDGCN = false; |
104 | bool IsAMDHSA = false; |
105 | |
106 | std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder); |
107 | Value *getWorkitemID(IRBuilder<> &Builder, unsigned N); |
108 | |
109 | /// BaseAlloca is the alloca root the search started from. |
110 | /// Val may be that alloca or a recursive user of it. |
111 | bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val, |
112 | std::vector<Value *> &WorkList) const; |
113 | |
114 | /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand |
115 | /// indices to an instruction with 2 pointer inputs (e.g. select, icmp). |
116 | /// Returns true if both operands are derived from the same alloca. Val should |
117 | /// be the same value as one of the input operands of UseInst. |
118 | bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val, |
119 | Instruction *UseInst, int OpIdx0, |
120 | int OpIdx1) const; |
121 | |
122 | /// Check whether we have enough local memory for promotion. |
123 | bool hasSufficientLocalMem(const Function &F); |
124 | |
125 | bool tryPromoteAllocaToVector(AllocaInst &I); |
126 | bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS); |
127 | |
128 | void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas); |
129 | |
130 | void setFunctionLimits(const Function &F); |
131 | |
132 | public: |
133 | AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) { |
134 | |
135 | const Triple &TT = TM.getTargetTriple(); |
136 | IsAMDGCN = TT.isAMDGCN(); |
137 | IsAMDHSA = TT.getOS() == Triple::AMDHSA; |
138 | } |
139 | |
140 | bool run(Function &F, bool PromoteToLDS); |
141 | }; |
142 | |
143 | // FIXME: This can create globals so should be a module pass. |
144 | class AMDGPUPromoteAlloca : public FunctionPass { |
145 | public: |
146 | static char ID; |
147 | |
148 | AMDGPUPromoteAlloca() : FunctionPass(ID) {} |
149 | |
150 | bool runOnFunction(Function &F) override { |
151 | if (skipFunction(F)) |
152 | return false; |
153 | if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) |
154 | return AMDGPUPromoteAllocaImpl( |
155 | TPC->getTM<TargetMachine>(), |
156 | getAnalysis<LoopInfoWrapperPass>().getLoopInfo()) |
157 | .run(F, /*PromoteToLDS*/ true); |
158 | return false; |
159 | } |
160 | |
  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
162 | |
163 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
164 | AU.setPreservesCFG(); |
165 | AU.addRequired<LoopInfoWrapperPass>(); |
166 | FunctionPass::getAnalysisUsage(AU); |
167 | } |
168 | }; |
169 | |
170 | static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM, |
171 | const Function &F) { |
172 | if (!TM.getTargetTriple().isAMDGCN()) |
173 | return 128; |
174 | |
175 | const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); |
176 | |
177 | unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F); |
178 | // Temporarily check both the attribute and the subtarget feature, until the |
179 | // latter is removed. |
180 | if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled()) |
181 | DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize(); |
182 | |
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
      DynamicVGPRBlockSize);

  // A non-entry function has only 32 caller-preserved registers.
  // Do not promote an alloca that would force spilling unless we know the
  // function will be inlined.
  if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
      !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    MaxVGPRs = std::min(MaxVGPRs, 32u);
193 | return MaxVGPRs; |
194 | } |
195 | |
196 | } // end anonymous namespace |
197 | |
198 | char AMDGPUPromoteAlloca::ID = 0; |
199 | |
INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promoting allocas, so the
// amount of LDS still available can be estimated accurately.
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)
208 | |
209 | char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; |
210 | |
211 | PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F, |
212 | FunctionAnalysisManager &AM) { |
  auto &LI = AM.getResult<LoopAnalysis>(F);
214 | bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true); |
215 | if (Changed) { |
216 | PreservedAnalyses PA; |
217 | PA.preserveSet<CFGAnalyses>(); |
218 | return PA; |
219 | } |
220 | return PreservedAnalyses::all(); |
221 | } |
222 | |
223 | PreservedAnalyses |
224 | AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) { |
  auto &LI = AM.getResult<LoopAnalysis>(F);
226 | bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false); |
227 | if (Changed) { |
228 | PreservedAnalyses PA; |
229 | PA.preserveSet<CFGAnalyses>(); |
230 | return PA; |
231 | } |
232 | return PreservedAnalyses::all(); |
233 | } |
234 | |
235 | FunctionPass *llvm::createAMDGPUPromoteAlloca() { |
236 | return new AMDGPUPromoteAlloca(); |
237 | } |
238 | |
239 | static void collectAllocaUses(AllocaInst &Alloca, |
240 | SmallVectorImpl<Use *> &Uses) { |
241 | SmallVector<Instruction *, 4> WorkList({&Alloca}); |
242 | while (!WorkList.empty()) { |
243 | auto *Cur = WorkList.pop_back_val(); |
244 | for (auto &U : Cur->uses()) { |
      Uses.push_back(&U);

      if (isa<GetElementPtrInst>(U.getUser()))
        WorkList.push_back(cast<Instruction>(U.getUser()));
249 | } |
250 | } |
251 | } |
252 | |
253 | void AMDGPUPromoteAllocaImpl::sortAllocasToPromote( |
254 | SmallVectorImpl<AllocaInst *> &Allocas) { |
255 | DenseMap<AllocaInst *, unsigned> Scores; |
256 | |
257 | for (auto *Alloca : Allocas) { |
258 | LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n" ); |
259 | unsigned &Score = Scores[Alloca]; |
260 | // Increment score by one for each user + a bonus for users within loops. |
261 | SmallVector<Use *, 8> Uses; |
262 | collectAllocaUses(Alloca&: *Alloca, Uses); |
263 | for (auto *U : Uses) { |
264 | Instruction *Inst = cast<Instruction>(Val: U->getUser()); |
265 | if (isa<GetElementPtrInst>(Val: Inst)) |
266 | continue; |
267 | unsigned UserScore = |
268 | 1 + (LoopUserWeight * LI.getLoopDepth(BB: Inst->getParent())); |
269 | LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n" ); |
270 | Score += UserScore; |
271 | } |
272 | LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n" ); |
273 | } |
274 | |
275 | stable_sort(Range&: Allocas, C: [&](AllocaInst *A, AllocaInst *B) { |
276 | return Scores.at(Val: A) > Scores.at(Val: B); |
277 | }); |
278 | |
279 | // clang-format off |
280 | LLVM_DEBUG( |
281 | dbgs() << "Sorted Worklist:\n" ; |
282 | for (auto *A: Allocas) |
283 | dbgs() << " " << *A << "\n" ; |
284 | ); |
285 | // clang-format on |
286 | } |
287 | |
288 | void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) { |
289 | // Load per function limits, overriding with global options where appropriate. |
  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
      PromoteAllocaToVectorVGPRRatio);
297 | if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences()) |
298 | VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio; |
299 | } |
300 | |
301 | bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { |
302 | Mod = F.getParent(); |
303 | DL = &Mod->getDataLayout(); |
304 | |
305 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); |
306 | if (!ST.isPromoteAllocaEnabled()) |
307 | return false; |
308 | |
309 | bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F); |
310 | MaxVGPRs = getMaxVGPRs(LDSBytes: CurrentLocalMemUsage, TM, F); |
311 | setFunctionLimits(F); |
312 | |
313 | unsigned VectorizationBudget = |
314 | (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8 |
315 | : (MaxVGPRs * 32)) / |
316 | VGPRBudgetRatio; |
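  // For example, with the default options (no byte-size limit and a VGPR
  // ratio of 4), a kernel that is granted MaxVGPRs = 256 starts with a budget
  // of 256 * 32 / 4 = 2048 bits, i.e. enough to vectorize sixteen 128-bit
  // (<4 x i32>) allocas.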
317 | |
318 | SmallVector<AllocaInst *, 16> Allocas; |
319 | for (Instruction &I : F.getEntryBlock()) { |
320 | if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: &I)) { |
321 | // Array allocations are probably not worth handling, since an allocation |
322 | // of the array type is the canonical form. |
323 | if (!AI->isStaticAlloca() || AI->isArrayAllocation()) |
324 | continue; |
325 | Allocas.push_back(Elt: AI); |
326 | } |
327 | } |
328 | |
329 | sortAllocasToPromote(Allocas); |
330 | |
331 | bool Changed = false; |
332 | for (AllocaInst *AI : Allocas) { |
333 | const unsigned AllocaCost = DL->getTypeSizeInBits(Ty: AI->getAllocatedType()); |
334 | // First, check if we have enough budget to vectorize this alloca. |
335 | if (AllocaCost <= VectorizationBudget) { |
      // If we do, attempt vectorization; otherwise, fall through and try
      // promoting to LDS instead.
338 | if (tryPromoteAllocaToVector(I&: *AI)) { |
339 | Changed = true; |
340 | assert((VectorizationBudget - AllocaCost) < VectorizationBudget && |
341 | "Underflow!" ); |
342 | VectorizationBudget -= AllocaCost; |
343 | LLVM_DEBUG(dbgs() << " Remaining vectorization budget:" |
344 | << VectorizationBudget << "\n" ); |
345 | continue; |
346 | } |
347 | } else { |
348 | LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" |
349 | << AllocaCost << ", budget:" << VectorizationBudget |
350 | << "): " << *AI << "\n" ); |
351 | } |
352 | |
353 | if (PromoteToLDS && tryPromoteAllocaToLDS(I&: *AI, SufficientLDS)) |
354 | Changed = true; |
355 | } |
356 | |
357 | // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains |
358 | // dangling pointers. If we want to reuse it past this point, the loop above |
359 | // would need to be updated to remove successfully promoted allocas. |
360 | |
361 | return Changed; |
362 | } |
363 | |
364 | struct MemTransferInfo { |
365 | ConstantInt *SrcIndex = nullptr; |
366 | ConstantInt *DestIndex = nullptr; |
367 | }; |
368 | |
369 | // Checks if the instruction I is a memset user of the alloca AI that we can |
370 | // deal with. Currently, only non-volatile memsets that affect the whole alloca |
371 | // are handled. |
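// For illustration, given "%a = alloca [8 x i16], addrspace(5)" (a 16-byte
// store size), a call along the lines of
//   call void @llvm.memset.p5.i32(ptr addrspace(5) %a, i8 0, i32 16, i1 false)
// would be accepted, while a memset covering only the first 8 bytes would not.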
372 | static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI, |
373 | const DataLayout &DL) { |
374 | using namespace PatternMatch; |
375 | // For now we only care about non-volatile memsets that affect the whole type |
376 | // (start at index 0 and fill the whole alloca). |
377 | // |
378 | // TODO: Now that we moved to PromoteAlloca we could handle any memsets |
379 | // (except maybe volatile ones?) - we just need to use shufflevector if it |
380 | // only affects a subset of the vector. |
381 | const unsigned Size = DL.getTypeStoreSize(Ty: AI->getAllocatedType()); |
382 | return I->getOperand(i_nocapture: 0) == AI && |
383 | match(V: I->getOperand(i_nocapture: 2), P: m_SpecificInt(V: Size)) && !I->isVolatile(); |
384 | } |
385 | |
386 | static Value *calculateVectorIndex( |
387 | Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) { |
388 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr->stripPointerCasts()); |
389 | if (!GEP) |
390 | return ConstantInt::getNullValue(Ty: Type::getInt32Ty(C&: Ptr->getContext())); |
391 | |
392 | auto I = GEPIdx.find(x: GEP); |
393 | assert(I != GEPIdx.end() && "Must have entry for GEP!" ); |
394 | |
395 | Value *IndexValue = I->second; |
396 | assert(IndexValue && "index value missing from GEP index map" ); |
397 | return IndexValue; |
398 | } |
399 | |
400 | static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, |
401 | Type *VecElemTy, const DataLayout &DL, |
402 | SmallVector<Instruction *> &NewInsts) { |
403 | // TODO: Extracting a "multiple of X" from a GEP might be a useful generic |
404 | // helper. |
405 | unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType()); |
406 | SmallMapVector<Value *, APInt, 4> VarOffsets; |
407 | APInt ConstOffset(BW, 0); |
408 | |
409 | // Walk backwards through nested GEPs to collect both constant and variable |
410 | // offsets, so that nested vector GEP chains can be lowered in one step. |
411 | // |
412 | // Given this IR fragment as input: |
413 | // |
414 | // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5) |
415 | // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j |
416 | // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4 |
417 | // %3 = load i32, ptr addrspace(5) %2, align 4 |
418 | // |
419 | // Combine both GEP operations in a single pass, producing: |
420 | // BasePtr = %0 |
421 | // ConstOffset = 4 |
422 | // VarOffsets = { %j -> element_size(<2 x i32>) } |
423 | // |
424 | // That lets us emit a single buffer_load directly into a VGPR, without ever |
425 | // allocating scratch memory for the intermediate pointer. |
426 | Value *CurPtr = GEP; |
427 | while (auto *CurGEP = dyn_cast<GetElementPtrInst>(Val: CurPtr)) { |
428 | if (!CurGEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ConstOffset)) |
429 | return nullptr; |
430 | |
431 | // Move to the next outer pointer. |
432 | CurPtr = CurGEP->getPointerOperand(); |
433 | } |
434 | |
435 | assert(CurPtr == Alloca && "GEP not based on alloca" ); |
436 | |
437 | unsigned VecElemSize = DL.getTypeAllocSize(Ty: VecElemTy); |
438 | if (VarOffsets.size() > 1) |
439 | return nullptr; |
440 | |
441 | APInt IndexQuot; |
442 | uint64_t Rem; |
443 | APInt::udivrem(LHS: ConstOffset, RHS: VecElemSize, Quotient&: IndexQuot, Remainder&: Rem); |
444 | if (Rem != 0) |
445 | return nullptr; |
446 | if (VarOffsets.size() == 0) |
447 | return ConstantInt::get(Context&: GEP->getContext(), V: IndexQuot); |
448 | |
449 | IRBuilder<> Builder(GEP); |
450 | |
451 | const auto &VarOffset = VarOffsets.front(); |
452 | APInt OffsetQuot; |
453 | APInt::udivrem(LHS: VarOffset.second, RHS: VecElemSize, Quotient&: OffsetQuot, Remainder&: Rem); |
454 | if (Rem != 0 || OffsetQuot.isZero()) |
455 | return nullptr; |
456 | |
457 | Value *Offset = VarOffset.first; |
458 | auto *OffsetType = dyn_cast<IntegerType>(Val: Offset->getType()); |
459 | if (!OffsetType) |
460 | return nullptr; |
461 | |
462 | if (!OffsetQuot.isOne()) { |
463 | ConstantInt *ConstMul = |
464 | ConstantInt::get(Ty: OffsetType, V: OffsetQuot.getZExtValue()); |
465 | Offset = Builder.CreateMul(LHS: Offset, RHS: ConstMul); |
466 | if (Instruction *NewInst = dyn_cast<Instruction>(Val: Offset)) |
467 | NewInsts.push_back(Elt: NewInst); |
468 | } |
469 | if (ConstOffset.isZero()) |
470 | return Offset; |
471 | |
472 | ConstantInt *ConstIndex = |
473 | ConstantInt::get(Ty: OffsetType, V: IndexQuot.getZExtValue()); |
474 | Value *IndexAdd = Builder.CreateAdd(LHS: ConstIndex, RHS: Offset); |
475 | if (Instruction *NewInst = dyn_cast<Instruction>(Val: IndexAdd)) |
476 | NewInsts.push_back(Elt: NewInst); |
477 | return IndexAdd; |
478 | } |
479 | |
480 | /// Promotes a single user of the alloca to a vector form. |
481 | /// |
482 | /// \param Inst Instruction to be promoted. |
483 | /// \param DL Module Data Layout. |
484 | /// \param VectorTy Vectorized Type. |
485 | /// \param VecStoreSize Size of \p VectorTy in bytes. |
486 | /// \param ElementSize Size of \p VectorTy element type in bytes. |
487 | /// \param TransferInfo MemTransferInst info map. |
488 | /// \param GEPVectorIdx GEP -> VectorIdx cache. |
489 | /// \param CurVal Current value of the vector (e.g. last stored value) |
490 | /// \param[out] DeferredLoads \p Inst is added to this vector if it can't |
491 | /// be promoted now. This happens when promoting requires \p |
492 | /// CurVal, but \p CurVal is nullptr. |
493 | /// \return the stored value if \p Inst would have written to the alloca, or |
494 | /// nullptr otherwise. |
495 | static Value *promoteAllocaUserToVector( |
496 | Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy, |
497 | unsigned VecStoreSize, unsigned ElementSize, |
498 | DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo, |
499 | std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal, |
500 | SmallVectorImpl<LoadInst *> &DeferredLoads) { |
501 | // Note: we use InstSimplifyFolder because it can leverage the DataLayout |
502 | // to do more folding, especially in the case of vector splats. |
503 | IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(), |
504 | InstSimplifyFolder(DL)); |
505 | Builder.SetInsertPoint(Inst); |
506 | |
507 | const auto GetOrLoadCurrentVectorValue = [&]() -> Value * { |
508 | if (CurVal) |
509 | return CurVal; |
510 | |
511 | // If the current value is not known, insert a dummy load and lower it on |
512 | // the second pass. |
513 | LoadInst *Dummy = |
514 | Builder.CreateLoad(Ty: VectorTy, Ptr: PoisonValue::get(T: Builder.getPtrTy()), |
515 | Name: "promotealloca.dummyload" ); |
516 | DeferredLoads.push_back(Elt: Dummy); |
517 | return Dummy; |
518 | }; |
519 | |
520 | const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val, |
521 | Type *PtrTy) -> Value * { |
522 | assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy)); |
523 | const unsigned Size = DL.getTypeStoreSizeInBits(Ty: PtrTy); |
524 | if (!PtrTy->isVectorTy()) |
525 | return Builder.CreateBitOrPointerCast(V: Val, DestTy: Builder.getIntNTy(N: Size)); |
526 | const unsigned NumPtrElts = cast<FixedVectorType>(Val: PtrTy)->getNumElements(); |
    // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to first
    // cast the ptr vector to <2 x i64>.
    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
530 | Type *EltTy = Builder.getIntNTy(N: Size / NumPtrElts); |
531 | return Builder.CreateBitOrPointerCast( |
532 | V: Val, DestTy: FixedVectorType::get(ElementType: EltTy, NumElts: NumPtrElts)); |
533 | }; |
534 | |
535 | Type *VecEltTy = VectorTy->getElementType(); |
536 | |
537 | switch (Inst->getOpcode()) { |
538 | case Instruction::Load: { |
539 | // Loads can only be lowered if the value is known. |
540 | if (!CurVal) { |
541 | DeferredLoads.push_back(Elt: cast<LoadInst>(Val: Inst)); |
542 | return nullptr; |
543 | } |
544 | |
545 | Value *Index = calculateVectorIndex( |
546 | Ptr: cast<LoadInst>(Val: Inst)->getPointerOperand(), GEPIdx: GEPVectorIdx); |
547 | |
548 | // We're loading the full vector. |
549 | Type *AccessTy = Inst->getType(); |
550 | TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy); |
551 | if (Constant *CI = dyn_cast<Constant>(Val: Index)) { |
552 | if (CI->isZeroValue() && AccessSize == VecStoreSize) { |
553 | if (AccessTy->isPtrOrPtrVectorTy()) |
554 | CurVal = CreateTempPtrIntCast(CurVal, AccessTy); |
555 | else if (CurVal->getType()->isPtrOrPtrVectorTy()) |
556 | CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType()); |
557 | Value *NewVal = Builder.CreateBitOrPointerCast(V: CurVal, DestTy: AccessTy); |
558 | Inst->replaceAllUsesWith(V: NewVal); |
559 | return nullptr; |
560 | } |
561 | } |
562 | |
563 | // Loading a subvector. |
564 | if (isa<FixedVectorType>(Val: AccessTy)) { |
565 | assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); |
566 | const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(Ty: VecEltTy); |
567 | auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumLoadedElts); |
568 | assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); |
569 | |
570 | Value *SubVec = PoisonValue::get(T: SubVecTy); |
571 | for (unsigned K = 0; K < NumLoadedElts; ++K) { |
572 | Value *CurIdx = |
573 | Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K)); |
574 | SubVec = Builder.CreateInsertElement( |
575 | Vec: SubVec, NewElt: Builder.CreateExtractElement(Vec: CurVal, Idx: CurIdx), Idx: K); |
576 | } |
577 | |
578 | if (AccessTy->isPtrOrPtrVectorTy()) |
579 | SubVec = CreateTempPtrIntCast(SubVec, AccessTy); |
580 | else if (SubVecTy->isPtrOrPtrVectorTy()) |
581 | SubVec = CreateTempPtrIntCast(SubVec, SubVecTy); |
582 | |
583 | SubVec = Builder.CreateBitOrPointerCast(V: SubVec, DestTy: AccessTy); |
584 | Inst->replaceAllUsesWith(V: SubVec); |
585 | return nullptr; |
586 | } |
587 | |
588 | // We're loading one element. |
    Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
    if (AccessTy != VecEltTy)
      ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);

    Inst->replaceAllUsesWith(ExtractElement);
594 | return nullptr; |
595 | } |
596 | case Instruction::Store: { |
597 | // For stores, it's a bit trickier and it depends on whether we're storing |
598 | // the full vector or not. If we're storing the full vector, we don't need |
599 | // to know the current value. If this is a store of a single element, we |
600 | // need to know the value. |
601 | StoreInst *SI = cast<StoreInst>(Val: Inst); |
602 | Value *Index = calculateVectorIndex(Ptr: SI->getPointerOperand(), GEPIdx: GEPVectorIdx); |
603 | Value *Val = SI->getValueOperand(); |
604 | |
605 | // We're storing the full vector, we can handle this without knowing CurVal. |
606 | Type *AccessTy = Val->getType(); |
607 | TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy); |
608 | if (Constant *CI = dyn_cast<Constant>(Val: Index)) { |
609 | if (CI->isZeroValue() && AccessSize == VecStoreSize) { |
610 | if (AccessTy->isPtrOrPtrVectorTy()) |
611 | Val = CreateTempPtrIntCast(Val, AccessTy); |
612 | else if (VectorTy->isPtrOrPtrVectorTy()) |
613 | Val = CreateTempPtrIntCast(Val, VectorTy); |
614 | return Builder.CreateBitOrPointerCast(V: Val, DestTy: VectorTy); |
615 | } |
616 | } |
617 | |
618 | // Storing a subvector. |
619 | if (isa<FixedVectorType>(Val: AccessTy)) { |
620 | assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); |
621 | const unsigned NumWrittenElts = |
622 | AccessSize / DL.getTypeStoreSize(Ty: VecEltTy); |
623 | const unsigned NumVecElts = VectorTy->getNumElements(); |
624 | auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumWrittenElts); |
625 | assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); |
626 | |
627 | if (SubVecTy->isPtrOrPtrVectorTy()) |
628 | Val = CreateTempPtrIntCast(Val, SubVecTy); |
629 | else if (AccessTy->isPtrOrPtrVectorTy()) |
630 | Val = CreateTempPtrIntCast(Val, AccessTy); |
631 | |
632 | Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: SubVecTy); |
633 | |
634 | Value *CurVec = GetOrLoadCurrentVectorValue(); |
635 | for (unsigned K = 0, NumElts = std::min(a: NumWrittenElts, b: NumVecElts); |
636 | K < NumElts; ++K) { |
637 | Value *CurIdx = |
638 | Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K)); |
639 | CurVec = Builder.CreateInsertElement( |
640 | Vec: CurVec, NewElt: Builder.CreateExtractElement(Vec: Val, Idx: K), Idx: CurIdx); |
641 | } |
642 | return CurVec; |
643 | } |
644 | |
645 | if (Val->getType() != VecEltTy) |
646 | Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: VecEltTy); |
647 | return Builder.CreateInsertElement(Vec: GetOrLoadCurrentVectorValue(), NewElt: Val, |
648 | Idx: Index); |
649 | } |
650 | case Instruction::Call: { |
651 | if (auto *MTI = dyn_cast<MemTransferInst>(Val: Inst)) { |
652 | // For memcpy, we need to know curval. |
653 | ConstantInt *Length = cast<ConstantInt>(Val: MTI->getLength()); |
654 | unsigned NumCopied = Length->getZExtValue() / ElementSize; |
655 | MemTransferInfo *TI = &TransferInfo[MTI]; |
656 | unsigned SrcBegin = TI->SrcIndex->getZExtValue(); |
657 | unsigned DestBegin = TI->DestIndex->getZExtValue(); |
658 | |
659 | SmallVector<int> Mask; |
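      // For example, for an <8 x i32> alloca vector, a copy of two elements
      // from source index 1 to dest index 4 yields the mask
      // {0, 1, 2, 3, 1, 2, 6, 7}: lanes outside the destination range keep
      // their own index, and destination lanes read from the source range.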
660 | for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { |
661 | if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { |
662 | Mask.push_back(Elt: SrcBegin < VectorTy->getNumElements() |
663 | ? SrcBegin++ |
664 | : PoisonMaskElem); |
665 | } else { |
666 | Mask.push_back(Elt: Idx); |
667 | } |
668 | } |
669 | |
670 | return Builder.CreateShuffleVector(V: GetOrLoadCurrentVectorValue(), Mask); |
671 | } |
672 | |
673 | if (auto *MSI = dyn_cast<MemSetInst>(Val: Inst)) { |
674 | // For memset, we don't need to know the previous value because we |
675 | // currently only allow memsets that cover the whole alloca. |
676 | Value *Elt = MSI->getOperand(i_nocapture: 1); |
677 | const unsigned BytesPerElt = DL.getTypeStoreSize(Ty: VecEltTy); |
678 | if (BytesPerElt > 1) { |
679 | Value *EltBytes = Builder.CreateVectorSplat(NumElts: BytesPerElt, V: Elt); |
680 | |
681 | // If the element type of the vector is a pointer, we need to first cast |
682 | // to an integer, then use a PtrCast. |
683 | if (VecEltTy->isPointerTy()) { |
684 | Type *PtrInt = Builder.getIntNTy(N: BytesPerElt * 8); |
685 | Elt = Builder.CreateBitCast(V: EltBytes, DestTy: PtrInt); |
686 | Elt = Builder.CreateIntToPtr(V: Elt, DestTy: VecEltTy); |
687 | } else |
688 | Elt = Builder.CreateBitCast(V: EltBytes, DestTy: VecEltTy); |
689 | } |
690 | |
691 | return Builder.CreateVectorSplat(EC: VectorTy->getElementCount(), V: Elt); |
692 | } |
693 | |
694 | if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) { |
695 | if (Intr->getIntrinsicID() == Intrinsic::objectsize) { |
696 | Intr->replaceAllUsesWith( |
697 | V: Builder.getIntN(N: Intr->getType()->getIntegerBitWidth(), |
698 | C: DL.getTypeAllocSize(Ty: VectorTy))); |
699 | return nullptr; |
700 | } |
701 | } |
702 | |
703 | llvm_unreachable("Unsupported call when promoting alloca to vector" ); |
704 | } |
705 | |
706 | default: |
707 | llvm_unreachable("Inconsistency in instructions promotable to vector" ); |
708 | } |
709 | |
710 | llvm_unreachable("Did not return after promoting instruction!" ); |
711 | } |
712 | |
713 | static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy, |
714 | const DataLayout &DL) { |
715 | // Access as a vector type can work if the size of the access vector is a |
716 | // multiple of the size of the alloca's vector element type. |
717 | // |
718 | // Examples: |
719 | // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK |
720 | // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK |
721 | // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK |
722 | // - 3*32 is not a multiple of 64 |
723 | // |
724 | // We could handle more complicated cases, but it'd make things a lot more |
725 | // complicated. |
726 | if (isa<FixedVectorType>(Val: AccessTy)) { |
727 | TypeSize AccTS = DL.getTypeStoreSize(Ty: AccessTy); |
    // If the type size and the store size don't match, we would need to do
    // more than just a bitcast to translate between extracted/insertable
    // subvectors and the accessed value.
    if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
732 | return false; |
733 | TypeSize VecTS = DL.getTypeStoreSize(Ty: VecTy->getElementType()); |
734 | return AccTS.isKnownMultipleOf(RHS: VecTS); |
735 | } |
736 | |
737 | return CastInst::isBitOrNoopPointerCastable(SrcTy: VecTy->getElementType(), DestTy: AccessTy, |
738 | DL); |
739 | } |
740 | |
741 | /// Iterates over an instruction worklist that may contain multiple instructions |
742 | /// from the same basic block, but in a different order. |
743 | template <typename InstContainer> |
744 | static void forEachWorkListItem(const InstContainer &WorkList, |
745 | std::function<void(Instruction *)> Fn) { |
746 | // Bucket up uses of the alloca by the block they occur in. |
747 | // This is important because we have to handle multiple defs/uses in a block |
748 | // ourselves: SSAUpdater is purely for cross-block references. |
749 | DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock; |
750 | for (Instruction *User : WorkList) |
751 | UsesByBlock[User->getParent()].insert(V: User); |
752 | |
753 | for (Instruction *User : WorkList) { |
754 | BasicBlock *BB = User->getParent(); |
755 | auto &BlockUses = UsesByBlock[BB]; |
756 | |
757 | // Already processed, skip. |
758 | if (BlockUses.empty()) |
759 | continue; |
760 | |
761 | // Only user in the block, directly process it. |
762 | if (BlockUses.size() == 1) { |
763 | Fn(User); |
764 | continue; |
765 | } |
766 | |
767 | // Multiple users in the block, do a linear scan to see users in order. |
768 | for (Instruction &Inst : *BB) { |
769 | if (!BlockUses.contains(V: &Inst)) |
770 | continue; |
771 | |
772 | Fn(&Inst); |
773 | } |
774 | |
775 | // Clear the block so we know it's been processed. |
776 | BlockUses.clear(); |
777 | } |
778 | } |
779 | |
780 | /// Find an insert point after an alloca, after all other allocas clustered at |
781 | /// the start of the block. |
782 | static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, |
783 | BasicBlock::iterator I) { |
784 | for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(Val: *I); ++I) |
785 | ; |
786 | return I; |
787 | } |
788 | |
789 | // FIXME: Should try to pick the most likely to be profitable allocas first. |
790 | bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { |
791 | LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); |
792 | |
793 | if (DisablePromoteAllocaToVector) { |
794 | LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n" ); |
795 | return false; |
796 | } |
797 | |
798 | Type *AllocaTy = Alloca.getAllocatedType(); |
799 | auto *VectorTy = dyn_cast<FixedVectorType>(Val: AllocaTy); |
800 | if (auto *ArrayTy = dyn_cast<ArrayType>(Val: AllocaTy)) { |
801 | uint64_t NumElems = 1; |
802 | Type *ElemTy; |
803 | do { |
804 | NumElems *= ArrayTy->getNumElements(); |
805 | ElemTy = ArrayTy->getElementType(); |
806 | } while ((ArrayTy = dyn_cast<ArrayType>(Val: ElemTy))); |
807 | |
808 | // Check for array of vectors |
809 | auto *InnerVectorTy = dyn_cast<FixedVectorType>(Val: ElemTy); |
810 | if (InnerVectorTy) { |
811 | NumElems *= InnerVectorTy->getNumElements(); |
812 | ElemTy = InnerVectorTy->getElementType(); |
813 | } |
814 | |
815 | if (VectorType::isValidElementType(ElemTy) && NumElems > 0) { |
816 | unsigned ElementSize = DL->getTypeSizeInBits(Ty: ElemTy) / 8; |
817 | if (ElementSize > 0) { |
818 | unsigned AllocaSize = DL->getTypeStoreSize(Ty: AllocaTy); |
819 | // Expand vector if required to match padding of inner type, |
820 | // i.e. odd size subvectors. |
821 | // Storage size of new vector must match that of alloca for correct |
822 | // behaviour of byte offsets and GEP computation. |
823 | if (NumElems * ElementSize != AllocaSize) |
824 | NumElems = AllocaSize / ElementSize; |
825 | if (NumElems > 0 && (AllocaSize % ElementSize) == 0) |
826 | VectorTy = FixedVectorType::get(ElementType: ElemTy, NumElts: NumElems); |
827 | } |
828 | } |
829 | } |
830 | |
831 | if (!VectorTy) { |
832 | LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n" ); |
833 | return false; |
834 | } |
835 | |
836 | const unsigned MaxElements = |
837 | (MaxVectorRegs * 32) / DL->getTypeSizeInBits(Ty: VectorTy->getElementType()); |
838 | |
839 | if (VectorTy->getNumElements() > MaxElements || |
840 | VectorTy->getNumElements() < 2) { |
841 | LLVM_DEBUG(dbgs() << " " << *VectorTy |
842 | << " has an unsupported number of elements\n" ); |
843 | return false; |
844 | } |
845 | |
846 | std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx; |
847 | SmallVector<Instruction *> WorkList; |
848 | SmallVector<Instruction *> UsersToRemove; |
849 | SmallVector<Instruction *> DeferredInsts; |
850 | SmallVector<Instruction *> NewGEPInsts; |
851 | DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo; |
852 | |
853 | const auto RejectUser = [&](Instruction *Inst, Twine Msg) { |
854 | LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" |
855 | << " " << *Inst << "\n" ); |
856 | for (auto *Inst : reverse(C&: NewGEPInsts)) |
857 | Inst->eraseFromParent(); |
858 | return false; |
859 | }; |
860 | |
861 | SmallVector<Use *, 8> Uses; |
862 | collectAllocaUses(Alloca, Uses); |
863 | |
864 | LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n" ); |
865 | |
866 | Type *VecEltTy = VectorTy->getElementType(); |
867 | unsigned ElementSizeInBits = DL->getTypeSizeInBits(Ty: VecEltTy); |
868 | if (ElementSizeInBits != DL->getTypeAllocSizeInBits(Ty: VecEltTy)) { |
869 | LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " |
870 | "does not match the type's size\n" ); |
871 | return false; |
872 | } |
873 | unsigned ElementSize = ElementSizeInBits / 8; |
874 | assert(ElementSize > 0); |
875 | for (auto *U : Uses) { |
876 | Instruction *Inst = cast<Instruction>(Val: U->getUser()); |
877 | |
878 | if (Value *Ptr = getLoadStorePointerOperand(V: Inst)) { |
879 | // This is a store of the pointer, not to the pointer. |
880 | if (isa<StoreInst>(Val: Inst) && |
881 | U->getOperandNo() != StoreInst::getPointerOperandIndex()) |
882 | return RejectUser(Inst, "pointer is being stored" ); |
883 | |
884 | Type *AccessTy = getLoadStoreType(I: Inst); |
885 | if (AccessTy->isAggregateType()) |
886 | return RejectUser(Inst, "unsupported load/store as aggregate" ); |
887 | assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy()); |
888 | |
889 | // Check that this is a simple access of a vector element. |
890 | bool IsSimple = isa<LoadInst>(Val: Inst) ? cast<LoadInst>(Val: Inst)->isSimple() |
891 | : cast<StoreInst>(Val: Inst)->isSimple(); |
892 | if (!IsSimple) |
893 | return RejectUser(Inst, "not a simple load or store" ); |
894 | |
895 | Ptr = Ptr->stripPointerCasts(); |
896 | |
897 | // Alloca already accessed as vector. |
898 | if (Ptr == &Alloca && DL->getTypeStoreSize(Ty: Alloca.getAllocatedType()) == |
899 | DL->getTypeStoreSize(Ty: AccessTy)) { |
900 | WorkList.push_back(Elt: Inst); |
901 | continue; |
902 | } |
903 | |
904 | if (!isSupportedAccessType(VecTy: VectorTy, AccessTy, DL: *DL)) |
905 | return RejectUser(Inst, "not a supported access type" ); |
906 | |
907 | WorkList.push_back(Elt: Inst); |
908 | continue; |
909 | } |
910 | |
911 | if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) { |
912 | // If we can't compute a vector index from this GEP, then we can't |
913 | // promote this alloca to vector. |
914 | Value *Index = GEPToVectorIndex(GEP, Alloca: &Alloca, VecElemTy: VecEltTy, DL: *DL, NewInsts&: NewGEPInsts); |
915 | if (!Index) |
916 | return RejectUser(Inst, "cannot compute vector index for GEP" ); |
917 | |
918 | GEPVectorIdx[GEP] = Index; |
919 | UsersToRemove.push_back(Elt: Inst); |
920 | continue; |
921 | } |
922 | |
923 | if (MemSetInst *MSI = dyn_cast<MemSetInst>(Val: Inst); |
924 | MSI && isSupportedMemset(I: MSI, AI: &Alloca, DL: *DL)) { |
925 | WorkList.push_back(Elt: Inst); |
926 | continue; |
927 | } |
928 | |
929 | if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Val: Inst)) { |
930 | if (TransferInst->isVolatile()) |
931 | return RejectUser(Inst, "mem transfer inst is volatile" ); |
932 | |
933 | ConstantInt *Len = dyn_cast<ConstantInt>(Val: TransferInst->getLength()); |
934 | if (!Len || (Len->getZExtValue() % ElementSize)) |
935 | return RejectUser(Inst, "mem transfer inst length is non-constant or " |
936 | "not a multiple of the vector element size" ); |
937 | |
938 | if (TransferInfo.try_emplace(Key: TransferInst).second) { |
939 | DeferredInsts.push_back(Elt: Inst); |
940 | WorkList.push_back(Elt: Inst); |
941 | } |
942 | |
943 | auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * { |
944 | GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
945 | if (Ptr != &Alloca && !GEPVectorIdx.count(x: GEP)) |
946 | return nullptr; |
947 | |
948 | return dyn_cast<ConstantInt>(Val: calculateVectorIndex(Ptr, GEPIdx: GEPVectorIdx)); |
949 | }; |
950 | |
951 | unsigned OpNum = U->getOperandNo(); |
952 | MemTransferInfo *TI = &TransferInfo[TransferInst]; |
953 | if (OpNum == 0) { |
954 | Value *Dest = TransferInst->getDest(); |
955 | ConstantInt *Index = getPointerIndexOfAlloca(Dest); |
956 | if (!Index) |
957 | return RejectUser(Inst, "could not calculate constant dest index" ); |
958 | TI->DestIndex = Index; |
959 | } else { |
960 | assert(OpNum == 1); |
961 | Value *Src = TransferInst->getSource(); |
962 | ConstantInt *Index = getPointerIndexOfAlloca(Src); |
963 | if (!Index) |
964 | return RejectUser(Inst, "could not calculate constant src index" ); |
965 | TI->SrcIndex = Index; |
966 | } |
967 | continue; |
968 | } |
969 | |
970 | if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) { |
971 | if (Intr->getIntrinsicID() == Intrinsic::objectsize) { |
972 | WorkList.push_back(Elt: Inst); |
973 | continue; |
974 | } |
975 | } |
976 | |
977 | // Ignore assume-like intrinsics and comparisons used in assumes. |
978 | if (isAssumeLikeIntrinsic(I: Inst)) { |
979 | if (!Inst->use_empty()) |
980 | return RejectUser(Inst, "assume-like intrinsic cannot have any users" ); |
981 | UsersToRemove.push_back(Elt: Inst); |
982 | continue; |
983 | } |
984 | |
985 | if (isa<ICmpInst>(Val: Inst) && all_of(Range: Inst->users(), P: [](User *U) { |
986 | return isAssumeLikeIntrinsic(I: cast<Instruction>(Val: U)); |
987 | })) { |
988 | UsersToRemove.push_back(Elt: Inst); |
989 | continue; |
990 | } |
991 | |
992 | return RejectUser(Inst, "unhandled alloca user" ); |
993 | } |
994 | |
995 | while (!DeferredInsts.empty()) { |
996 | Instruction *Inst = DeferredInsts.pop_back_val(); |
997 | MemTransferInst *TransferInst = cast<MemTransferInst>(Val: Inst); |
998 | // TODO: Support the case if the pointers are from different alloca or |
999 | // from different address spaces. |
1000 | MemTransferInfo &Info = TransferInfo[TransferInst]; |
1001 | if (!Info.SrcIndex || !Info.DestIndex) |
1002 | return RejectUser( |
1003 | Inst, "mem transfer inst is missing constant src and/or dst index" ); |
1004 | } |
1005 | |
1006 | LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " |
1007 | << *VectorTy << '\n'); |
1008 | const unsigned VecStoreSize = DL->getTypeStoreSize(Ty: VectorTy); |
1009 | |
1010 | // Alloca is uninitialized memory. Imitate that by making the first value |
1011 | // undef. |
1012 | SSAUpdater Updater; |
1013 | Updater.Initialize(Ty: VectorTy, Name: "promotealloca" ); |
1014 | |
1015 | BasicBlock *EntryBB = Alloca.getParent(); |
1016 | BasicBlock::iterator InitInsertPos = |
1017 | skipToNonAllocaInsertPt(BB&: *EntryBB, I: Alloca.getIterator()); |
  // Alloca memory is undefined to begin with, not poison.
1019 | Value *AllocaInitValue = |
1020 | new FreezeInst(PoisonValue::get(T: VectorTy), "" , InitInsertPos); |
1021 | AllocaInitValue->takeName(V: &Alloca); |
1022 | |
1023 | Updater.AddAvailableValue(BB: EntryBB, V: AllocaInitValue); |
1024 | |
1025 | // First handle the initial worklist. |
1026 | SmallVector<LoadInst *, 4> DeferredLoads; |
1027 | forEachWorkListItem(WorkList, Fn: [&](Instruction *I) { |
1028 | BasicBlock *BB = I->getParent(); |
1029 | // On the first pass, we only take values that are trivially known, i.e. |
1030 | // where AddAvailableValue was already called in this block. |
1031 | Value *Result = promoteAllocaUserToVector( |
1032 | Inst: I, DL: *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, |
1033 | CurVal: Updater.FindValueForBlock(BB), DeferredLoads); |
1034 | if (Result) |
1035 | Updater.AddAvailableValue(BB, V: Result); |
1036 | }); |
1037 | |
1038 | // Then handle deferred loads. |
1039 | forEachWorkListItem(WorkList: DeferredLoads, Fn: [&](Instruction *I) { |
1040 | SmallVector<LoadInst *, 0> NewDLs; |
1041 | BasicBlock *BB = I->getParent(); |
1042 | // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always |
1043 | // get a value, inserting PHIs as needed. |
1044 | Value *Result = promoteAllocaUserToVector( |
1045 | Inst: I, DL: *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, |
1046 | CurVal: Updater.GetValueInMiddleOfBlock(BB: I->getParent()), DeferredLoads&: NewDLs); |
1047 | if (Result) |
1048 | Updater.AddAvailableValue(BB, V: Result); |
1049 | assert(NewDLs.empty() && "No more deferred loads should be queued!" ); |
1050 | }); |
1051 | |
1052 | // Delete all instructions. On the first pass, new dummy loads may have been |
1053 | // added so we need to collect them too. |
1054 | DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end()); |
1055 | InstsToDelete.insert_range(R&: DeferredLoads); |
1056 | for (Instruction *I : InstsToDelete) { |
1057 | assert(I->use_empty()); |
1058 | I->eraseFromParent(); |
1059 | } |
1060 | |
  // Delete all the users that are known to be removable.
1062 | for (Instruction *I : reverse(C&: UsersToRemove)) { |
1063 | I->dropDroppableUses(); |
1064 | assert(I->use_empty()); |
1065 | I->eraseFromParent(); |
1066 | } |
1067 | |
1068 | // Alloca should now be dead too. |
1069 | assert(Alloca.use_empty()); |
1070 | Alloca.eraseFromParent(); |
1071 | return true; |
1072 | } |
1073 | |
1074 | std::pair<Value *, Value *> |
1075 | AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { |
1076 | Function &F = *Builder.GetInsertBlock()->getParent(); |
1077 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); |
1078 | |
1079 | if (!IsAMDHSA) { |
1080 | CallInst *LocalSizeY = |
1081 | Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_y, Args: {}); |
1082 | CallInst *LocalSizeZ = |
1083 | Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_z, Args: {}); |
1084 | |
1085 | ST.makeLIDRangeMetadata(I: LocalSizeY); |
1086 | ST.makeLIDRangeMetadata(I: LocalSizeZ); |
1087 | |
1088 | return std::pair(LocalSizeY, LocalSizeZ); |
1089 | } |
1090 | |
1091 | // We must read the size out of the dispatch pointer. |
1092 | assert(IsAMDGCN); |
1093 | |
1094 | // We are indexing into this struct, and want to extract the workgroup_size_* |
1095 | // fields. |
1096 | // |
1097 | // typedef struct hsa_kernel_dispatch_packet_s { |
1098 | // uint16_t header; |
1099 | // uint16_t setup; |
  // uint16_t workgroup_size_x;
  // uint16_t workgroup_size_y;
  // uint16_t workgroup_size_z;
  // uint16_t reserved0;
  // uint32_t grid_size_x;
  // uint32_t grid_size_y;
  // uint32_t grid_size_z;
1107 | // |
1108 | // uint32_t private_segment_size; |
1109 | // uint32_t group_segment_size; |
1110 | // uint64_t kernel_object; |
1111 | // |
1112 | // #ifdef HSA_LARGE_MODEL |
1113 | // void *kernarg_address; |
1114 | // #elif defined HSA_LITTLE_ENDIAN |
1115 | // void *kernarg_address; |
1116 | // uint32_t reserved1; |
1117 | // #else |
1118 | // uint32_t reserved1; |
1119 | // void *kernarg_address; |
1120 | // #endif |
1121 | // uint64_t reserved2; |
1122 | // hsa_signal_t completion_signal; // uint64_t wrapper |
1123 | // } hsa_kernel_dispatch_packet_t |
1124 | // |
1125 | CallInst *DispatchPtr = |
1126 | Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_dispatch_ptr, Args: {}); |
1127 | DispatchPtr->addRetAttr(Kind: Attribute::NoAlias); |
1128 | DispatchPtr->addRetAttr(Kind: Attribute::NonNull); |
1129 | F.removeFnAttr(Kind: "amdgpu-no-dispatch-ptr" ); |
1130 | |
1131 | // Size of the dispatch packet struct. |
1132 | DispatchPtr->addDereferenceableRetAttr(Bytes: 64); |
1133 | |
1134 | Type *I32Ty = Type::getInt32Ty(C&: Mod->getContext()); |
1135 | |
1136 | // We could do a single 64-bit load here, but it's likely that the basic |
1137 | // 32-bit and extract sequence is already present, and it is probably easier |
1138 | // to CSE this. The loads should be mergeable later anyway. |
1139 | Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 1); |
1140 | LoadInst *LoadXY = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPXY, Align: Align(4)); |
1141 | |
1142 | Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 2); |
1143 | LoadInst *LoadZU = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPZU, Align: Align(4)); |
1144 | |
1145 | MDNode *MD = MDNode::get(Context&: Mod->getContext(), MDs: {}); |
1146 | LoadXY->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD); |
1147 | LoadZU->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD); |
1148 | ST.makeLIDRangeMetadata(I: LoadZU); |
1149 | |
1150 | // Extract y component. Upper half of LoadZU should be zero already. |
1151 | Value *Y = Builder.CreateLShr(LHS: LoadXY, RHS: 16); |
1152 | |
1153 | return std::pair(Y, LoadZU); |
1154 | } |
1155 | |
1156 | Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder, |
1157 | unsigned N) { |
1158 | Function *F = Builder.GetInsertBlock()->getParent(); |
1159 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: *F); |
1160 | Intrinsic::ID IntrID = Intrinsic::not_intrinsic; |
1161 | StringRef AttrName; |
1162 | |
1163 | switch (N) { |
1164 | case 0: |
1165 | IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x |
1166 | : (Intrinsic::ID)Intrinsic::r600_read_tidig_x; |
1167 | AttrName = "amdgpu-no-workitem-id-x" ; |
1168 | break; |
1169 | case 1: |
1170 | IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y |
1171 | : (Intrinsic::ID)Intrinsic::r600_read_tidig_y; |
1172 | AttrName = "amdgpu-no-workitem-id-y" ; |
1173 | break; |
1174 | |
1175 | case 2: |
1176 | IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z |
1177 | : (Intrinsic::ID)Intrinsic::r600_read_tidig_z; |
1178 | AttrName = "amdgpu-no-workitem-id-z" ; |
1179 | break; |
1180 | default: |
1181 | llvm_unreachable("invalid dimension" ); |
1182 | } |
1183 | |
1184 | Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(M: Mod, id: IntrID); |
1185 | CallInst *CI = Builder.CreateCall(Callee: WorkitemIdFn); |
1186 | ST.makeLIDRangeMetadata(I: CI); |
1187 | F->removeFnAttr(Kind: AttrName); |
1188 | |
1189 | return CI; |
1190 | } |
1191 | |
1192 | static bool isCallPromotable(CallInst *CI) { |
1193 | IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI); |
1194 | if (!II) |
1195 | return false; |
1196 | |
1197 | switch (II->getIntrinsicID()) { |
1198 | case Intrinsic::memcpy: |
1199 | case Intrinsic::memmove: |
1200 | case Intrinsic::memset: |
1201 | case Intrinsic::lifetime_start: |
1202 | case Intrinsic::lifetime_end: |
1203 | case Intrinsic::invariant_start: |
1204 | case Intrinsic::invariant_end: |
1205 | case Intrinsic::launder_invariant_group: |
1206 | case Intrinsic::strip_invariant_group: |
1207 | case Intrinsic::objectsize: |
1208 | return true; |
1209 | default: |
1210 | return false; |
1211 | } |
1212 | } |
1213 | |
1214 | bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca( |
1215 | Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0, |
1216 | int OpIdx1) const { |
1217 | // Figure out which operand is the one we might not be promoting. |
1218 | Value *OtherOp = Inst->getOperand(i: OpIdx0); |
1219 | if (Val == OtherOp) |
1220 | OtherOp = Inst->getOperand(i: OpIdx1); |
1221 | |
1222 | if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: OtherOp)) |
1223 | return true; |
1224 | |
1225 | // TODO: getUnderlyingObject will not work on a vector getelementptr |
1226 | Value *OtherObj = getUnderlyingObject(V: OtherOp); |
1227 | if (!isa<AllocaInst>(Val: OtherObj)) |
1228 | return false; |
1229 | |
1230 | // TODO: We should be able to replace undefs with the right pointer type. |
1231 | |
1232 | // TODO: If we know the other base object is another promotable |
1233 | // alloca, not necessarily this alloca, we can do this. The |
1234 | // important part is both must have the same address space at |
1235 | // the end. |
1236 | if (OtherObj != BaseAlloca) { |
1237 | LLVM_DEBUG( |
1238 | dbgs() << "Found a binary instruction with another alloca object\n" ); |
1239 | return false; |
1240 | } |
1241 | |
1242 | return true; |
1243 | } |
1244 | |
1245 | bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( |
1246 | Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const { |
1247 | |
1248 | for (User *User : Val->users()) { |
1249 | if (is_contained(Range&: WorkList, Element: User)) |
1250 | continue; |
1251 | |
1252 | if (CallInst *CI = dyn_cast<CallInst>(Val: User)) { |
1253 | if (!isCallPromotable(CI)) |
1254 | return false; |
1255 | |
1256 | WorkList.push_back(x: User); |
1257 | continue; |
1258 | } |
1259 | |
1260 | Instruction *UseInst = cast<Instruction>(Val: User); |
1261 | if (UseInst->getOpcode() == Instruction::PtrToInt) |
1262 | return false; |
1263 | |
1264 | if (LoadInst *LI = dyn_cast<LoadInst>(Val: UseInst)) { |
1265 | if (LI->isVolatile()) |
1266 | return false; |
1267 | continue; |
1268 | } |
1269 | |
1270 | if (StoreInst *SI = dyn_cast<StoreInst>(Val: UseInst)) { |
1271 | if (SI->isVolatile()) |
1272 | return false; |
1273 | |
1274 | // Reject if the stored value is not the pointer operand. |
1275 | if (SI->getPointerOperand() != Val) |
1276 | return false; |
1277 | continue; |
1278 | } |
1279 | |
1280 | if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UseInst)) { |
1281 | if (RMW->isVolatile()) |
1282 | return false; |
1283 | continue; |
1284 | } |
1285 | |
1286 | if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(Val: UseInst)) { |
1287 | if (CAS->isVolatile()) |
1288 | return false; |
1289 | continue; |
1290 | } |
1291 | |
    // Only promote an icmp if we know that the other operand is derived from
    // a pointer that will also be promoted.
1294 | if (ICmpInst *ICmp = dyn_cast<ICmpInst>(Val: UseInst)) { |
1295 | if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: ICmp, OpIdx0: 0, OpIdx1: 1)) |
1296 | return false; |
1297 | |
1298 | // May need to rewrite constant operands. |
1299 | WorkList.push_back(x: ICmp); |
1300 | continue; |
1301 | } |
1302 | |
1303 | if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: UseInst)) { |
1304 | // Be conservative if an address could be computed outside the bounds of |
1305 | // the alloca. |
1306 | if (!GEP->isInBounds()) |
1307 | return false; |
1308 | } else if (SelectInst *SI = dyn_cast<SelectInst>(Val: UseInst)) { |
1309 | // Only promote a select if we know that the other select operand is from |
1310 | // another pointer that will also be promoted. |
1311 | if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: SI, OpIdx0: 1, OpIdx1: 2)) |
1312 | return false; |
1313 | } else if (PHINode *Phi = dyn_cast<PHINode>(Val: UseInst)) { |
1314 | // Repeat for phis. |
1315 | |
1316 | // TODO: Handle more complex cases. We should be able to replace loops |
1317 | // over arrays. |
1318 | switch (Phi->getNumIncomingValues()) { |
1319 | case 1: |
1320 | break; |
1321 | case 2: |
1322 | if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: Phi, OpIdx0: 0, OpIdx1: 1)) |
1323 | return false; |
1324 | break; |
1325 | default: |
1326 | return false; |
1327 | } |
1328 | } else if (!isa<ExtractElementInst>(Val: User)) { |
1329 | // Do not promote vector/aggregate type instructions. It is hard to track |
1330 | // their users. |
1331 | |
1332 | // Do not promote addrspacecast. |
1333 | // |
1334 | // TODO: If we know the address is only observed through flat pointers, we |
1335 | // could still promote. |
1336 | return false; |
1337 | } |
1338 | |
1339 | WorkList.push_back(x: User); |
1340 | if (!collectUsesWithPtrTypes(BaseAlloca, Val: User, WorkList)) |
1341 | return false; |
1342 | } |
1343 | |
1344 | return true; |
1345 | } |
1346 | |
1347 | bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { |
1348 | |
1349 | FunctionType *FTy = F.getFunctionType(); |
1350 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); |
1351 | |
1352 | // If the function has any arguments in the local address space, then it's |
1353 | // possible these arguments require the entire local memory space, so |
1354 | // we cannot use local memory in the pass. |
1355 | for (Type *ParamTy : FTy->params()) { |
1356 | PointerType *PtrTy = dyn_cast<PointerType>(Val: ParamTy); |
1357 | if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { |
1358 | LocalMemLimit = 0; |
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
1361 | return false; |
1362 | } |
1363 | } |
1364 | |
1365 | LocalMemLimit = ST.getAddressableLocalMemorySize(); |
1366 | if (LocalMemLimit == 0) |
1367 | return false; |
1368 | |
1369 | SmallVector<const Constant *, 16> Stack; |
1370 | SmallPtrSet<const Constant *, 8> VisitedConstants; |
1371 | SmallPtrSet<const GlobalVariable *, 8> UsedLDS; |
1372 | |
1373 | auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool { |
1374 | for (const User *U : Val->users()) { |
      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
1376 | if (Use->getParent()->getParent() == &F) |
1377 | return true; |
1378 | } else { |
        const Constant *C = cast<Constant>(U);
        if (VisitedConstants.insert(C).second)
          Stack.push_back(C);
1382 | } |
1383 | } |
1384 | |
1385 | return false; |
1386 | }; |
1387 | |
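  // Conservatively record every LDS global that is referenced from this
  // function, either directly or transitively through ConstantExprs; its size
  // counts against the budget available for promotion.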
1388 | for (GlobalVariable &GV : Mod->globals()) { |
1389 | if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) |
1390 | continue; |
1391 | |
1392 | if (visitUsers(&GV, &GV)) { |
      UsedLDS.insert(&GV);
1394 | Stack.clear(); |
1395 | continue; |
1396 | } |
1397 | |
1398 | // For any ConstantExpr uses, we need to recursively search the users until |
1399 | // we see a function. |
1400 | while (!Stack.empty()) { |
1401 | const Constant *C = Stack.pop_back_val(); |
1402 | if (visitUsers(&GV, C)) { |
        UsedLDS.insert(&GV);
1404 | Stack.clear(); |
1405 | break; |
1406 | } |
1407 | } |
1408 | } |
1409 | |
1410 | const DataLayout &DL = Mod->getDataLayout(); |
1411 | SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes; |
  AllocatedSizes.reserve(UsedLDS.size());
1413 | |
1414 | for (const GlobalVariable *GV : UsedLDS) { |
    Align Alignment =
        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
1418 | |
1419 | // HIP uses an extern unsized array in local address space for dynamically |
1420 | // allocated shared memory. In that case, we have to disable the promotion. |
1421 | if (GV->hasExternalLinkage() && AllocSize == 0) { |
1422 | LocalMemLimit = 0; |
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
1426 | return false; |
1427 | } |
1428 | |
    AllocatedSizes.emplace_back(AllocSize, Alignment);
1430 | } |
1431 | |
1432 | // Sort to try to estimate the worst case alignment padding |
1433 | // |
1434 | // FIXME: We should really do something to fix the addresses to a more optimal |
1435 | // value instead |
  llvm::sort(AllocatedSizes, llvm::less_second());
1437 | |
1438 | // Check how much local memory is being used by global objects |
1439 | CurrentLocalMemUsage = 0; |
1440 | |
1441 | // FIXME: Try to account for padding here. The real padding and address is |
1442 | // currently determined from the inverse order of uses in the function when |
1443 | // legalizing, which could also potentially change. We try to estimate the |
1444 | // worst case here, but we probably should fix the addresses earlier. |
1445 | for (auto Alloc : AllocatedSizes) { |
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
1447 | CurrentLocalMemUsage += Alloc.first; |
1448 | } |
1449 | |
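  // Derive the LDS budget in two steps: first ask how many waves per EU the
  // flat workgroup size and the LDS already claimed by globals still permit,
  // then take the full LDS size usable at that occupancy as the limit for
  // promotion.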
  unsigned MaxOccupancy =
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
          .second;
1453 | |
1454 | // Round up to the next tier of usage. |
  unsigned MaxSizeWithWaveCount =
      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
1457 | |
1458 | // Program may already use more LDS than is usable at maximum occupancy. |
1459 | if (CurrentLocalMemUsage > MaxSizeWithWaveCount) |
1460 | return false; |
1461 | |
1462 | LocalMemLimit = MaxSizeWithWaveCount; |
1463 | |
  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");
1470 | |
1471 | return true; |
1472 | } |
1473 | |
1474 | // FIXME: Should try to pick the most likely to be profitable allocas first. |
1475 | bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, |
1476 | bool SufficientLDS) { |
1477 | LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n'); |
1478 | |
1479 | if (DisablePromoteAllocaToLDS) { |
    LLVM_DEBUG(dbgs() << "  Promote alloca to LDS is disabled\n");
1481 | return false; |
1482 | } |
1483 | |
1484 | const DataLayout &DL = Mod->getDataLayout(); |
1485 | IRBuilder<> Builder(&I); |
1486 | |
1487 | const Function &ContainingFunction = *I.getParent()->getParent(); |
1488 | CallingConv::ID CC = ContainingFunction.getCallingConv(); |
1489 | |
  // Don't promote the alloca to LDS for shader calling conventions, as the
  // work item ID intrinsics are not supported for them. Furthermore, not all
  // LDS is available for some of the stages.
1493 | switch (CC) { |
1494 | case CallingConv::AMDGPU_KERNEL: |
1495 | case CallingConv::SPIR_KERNEL: |
1496 | break; |
1497 | default: |
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
1501 | return false; |
1502 | } |
1503 | |
1504 | // Not likely to have sufficient local memory for promotion. |
1505 | if (!SufficientLDS) |
1506 | return false; |
1507 | |
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
1510 | |
  Align Alignment =
      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
1513 | |
1514 | // FIXME: This computed padding is likely wrong since it depends on inverse |
1515 | // usage order. |
1516 | // |
  // FIXME: It is also possible that if we're allowed to use all of the memory,
  // we could end up using more than the maximum due to alignment padding.
1519 | |
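  // Every work item needs its own copy of the alloca, so the extra LDS
  // footprint is the per-thread allocation size scaled by the maximum flat
  // workgroup size, laid out after the LDS already used by globals (plus
  // alignment padding).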
  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize =
      WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
1523 | NewSize += AllocSize; |
1524 | |
1525 | if (NewSize > LocalMemLimit) { |
    LLVM_DEBUG(dbgs() << "  " << AllocSize
                      << " bytes of local memory not available to promote\n");
1528 | return false; |
1529 | } |
1530 | |
1531 | CurrentLocalMemUsage = NewSize; |
1532 | |
1533 | std::vector<Value *> WorkList; |
1534 | |
  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
1537 | return false; |
1538 | } |
1539 | |
  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1541 | |
1542 | Function *F = I.getParent()->getParent(); |
1543 | |
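  // Replace the per-thread alloca with a per-workgroup LDS array holding one
  // element per work item. Roughly (illustrative IR, not emitted verbatim):
  //   %tmp = alloca [4 x i32], align 4, addrspace(5)
  // becomes
  //   @kernel.tmp = internal unnamed_addr addrspace(3)
  //       global [256 x [4 x i32]] poison, align 4
  // and each use of %tmp is rewritten to a GEP into @kernel.tmp indexed by
  // the linearized work item ID.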
  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
1547 | Twine(F->getName()) + Twine('.') + I.getName(), nullptr, |
1548 | GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); |
1549 | GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); |
1550 | GV->setAlignment(I.getAlign()); |
1551 | |
1552 | Value *TCntY, *TCntZ; |
1553 | |
  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);
1558 | |
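  // Linearize the 3D work item ID, X-major:
  //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ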
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);
1564 | |
1565 | LLVMContext &Context = Mod->getContext(); |
  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
1567 | |
  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
1571 | I.eraseFromParent(); |
1572 | |
1573 | SmallVector<IntrinsicInst *> DeferredIntrs; |
1574 | |
  PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
1576 | |
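  // Rewrite every collected pointer user: non-call users are retyped in place
  // to the LDS pointer type (fixing up any null-pointer constant operands),
  // while intrinsic calls are erased, recreated for the new address space, or
  // deferred until all operands have been rewritten.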
1577 | for (Value *V : WorkList) { |
    CallInst *Call = dyn_cast<CallInst>(V);
1579 | if (!Call) { |
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        Value *LHS = CI->getOperand(0);
        Value *RHS = CI->getOperand(1);

        Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
        if (isa<ConstantPointerNull, ConstantAggregateZero>(LHS))
          CI->setOperand(0, Constant::getNullValue(NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(RHS))
          CI->setOperand(1, Constant::getNullValue(NewTy));
1590 | |
1591 | continue; |
1592 | } |
1593 | |
1594 | // The operand's value should be corrected on its own and we don't want to |
1595 | // touch the users. |
      if (isa<AddrSpaceCastInst>(V))
1597 | continue; |
1598 | |
1599 | assert(V->getType()->isPtrOrPtrVectorTy()); |
1600 | |
      Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
      V->mutateType(NewTy);
1603 | |
1604 | // Adjust the types of any constant operands. |
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(1)))
          SI->setOperand(1, Constant::getNullValue(NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(2)))
          SI->setOperand(2, Constant::getNullValue(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull, ConstantAggregateZero>(
                  Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
1616 | } |
1617 | } |
1618 | |
1619 | continue; |
1620 | } |
1621 | |
    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
1623 | Builder.SetInsertPoint(Intr); |
1624 | switch (Intr->getIntrinsicID()) { |
1625 | case Intrinsic::lifetime_start: |
1626 | case Intrinsic::lifetime_end: |
1627 | // These intrinsics are for address space 0 only |
1628 | Intr->eraseFromParent(); |
1629 | continue; |
1630 | case Intrinsic::memcpy: |
1631 | case Intrinsic::memmove: |
      // These have two pointer operands. If the second pointer also needs to
      // be replaced, we defer processing of these intrinsics until all other
      // values have been processed.
      DeferredIntrs.push_back(Intr);
1636 | continue; |
1637 | case Intrinsic::memset: { |
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getDestAlign(),
                           MemSet->isVolatile());
1642 | Intr->eraseFromParent(); |
1643 | continue; |
1644 | } |
1645 | case Intrinsic::invariant_start: |
1646 | case Intrinsic::invariant_end: |
1647 | case Intrinsic::launder_invariant_group: |
1648 | case Intrinsic::strip_invariant_group: { |
      SmallVector<Value *> Args;
      if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
        Args.emplace_back(Intr->getArgOperand(0));
      } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
        Args.emplace_back(Intr->getArgOperand(0));
        Args.emplace_back(Intr->getArgOperand(1));
      }
      Args.emplace_back(Offset);
      Function *F = Intrinsic::getOrInsertDeclaration(
          Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
      CallInst *NewIntr =
          CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
      Intr->mutateType(NewIntr->getType());
      Intr->replaceAllUsesWith(NewIntr);
1663 | Intr->eraseFromParent(); |
1664 | continue; |
1665 | } |
1666 | case Intrinsic::objectsize: { |
      Value *Src = Intr->getOperand(0);
1668 | |
      CallInst *NewCall = Builder.CreateIntrinsic(
          Intrinsic::objectsize,
          {Intr->getType(), PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)},
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
1674 | Intr->eraseFromParent(); |
1675 | continue; |
1676 | } |
1677 | default: |
      Intr->print(errs());
1679 | llvm_unreachable("Don't know how to promote alloca intrinsic use." ); |
1680 | } |
1681 | } |
1682 | |
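  // With all pointer operands now retyped, recreate the deferred
  // memcpy/memmove calls so they are re-declared for the new address spaces,
  // carrying over the dereferenceable attributes on both pointer arguments.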
1683 | for (IntrinsicInst *Intr : DeferredIntrs) { |
1684 | Builder.SetInsertPoint(Intr); |
1685 | Intrinsic::ID ID = Intr->getIntrinsicID(); |
1686 | assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove); |
1687 | |
    MemTransferInst *MI = cast<MemTransferInst>(Intr);
    auto *B = Builder.CreateMemTransferInst(
        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1692 | |
1693 | for (unsigned I = 0; I != 2; ++I) { |
      if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
        B->addDereferenceableParamAttr(I, Bytes);
1696 | } |
1697 | } |
1698 | |
1699 | Intr->eraseFromParent(); |
1700 | } |
1701 | |
1702 | return true; |
1703 | } |
1704 | |