//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Eliminates allocas by either converting them into vectors or by migrating
// them to local address space.
//
// Two passes are exposed by this file:
//    - "promote-alloca-to-vector", which runs early in the pipeline and only
//      promotes to vector. Promotion to vector is almost always profitable
//      except when the alloca is too big and the promotion would result in
//      very high register pressure.
//    - "promote-alloca", which does both promotion to vector and LDS and runs
//      much later in the pipeline. This runs after SROA because promoting to
//      LDS is of course less profitable than getting rid of the alloca or
//      vectorizing it, thus we only want to do it when the only alternative is
//      lowering the alloca to stack.
//
// Note that both of them exist for the old and new PMs. The new PM passes are
// declared in AMDGPU.h and the legacy PM ones are declared here.
//
//===----------------------------------------------------------------------===//
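
// For reference, the passes can be exercised individually with opt, e.g.
// (assuming they are registered under their DEBUG_TYPE names, as the legacy
// pass registrations below suggest):
//
//   opt -passes=amdgpu-promote-alloca-to-vector -mtriple=amdgcn -S in.ll
//   opt -passes=amdgpu-promote-alloca -mtriple=amdgcn -S in.ll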
27 | |
28 | #include "AMDGPU.h" |
29 | #include "GCNSubtarget.h" |
30 | #include "Utils/AMDGPUBaseInfo.h" |
31 | #include "llvm/ADT/STLExtras.h" |
32 | #include "llvm/Analysis/CaptureTracking.h" |
33 | #include "llvm/Analysis/InstSimplifyFolder.h" |
34 | #include "llvm/Analysis/InstructionSimplify.h" |
35 | #include "llvm/Analysis/LoopInfo.h" |
36 | #include "llvm/Analysis/ValueTracking.h" |
37 | #include "llvm/CodeGen/TargetPassConfig.h" |
38 | #include "llvm/IR/IRBuilder.h" |
39 | #include "llvm/IR/IntrinsicInst.h" |
40 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
41 | #include "llvm/IR/IntrinsicsR600.h" |
42 | #include "llvm/IR/PatternMatch.h" |
43 | #include "llvm/InitializePasses.h" |
44 | #include "llvm/Pass.h" |
45 | #include "llvm/Target/TargetMachine.h" |
46 | #include "llvm/Transforms/Utils/SSAUpdater.h" |
47 | |
48 | #define DEBUG_TYPE "amdgpu-promote-alloca" |
49 | |
50 | using namespace llvm; |
51 | |
52 | namespace { |
53 | |
static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(false));

static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));

static cl::opt<unsigned>
    LoopUserWeight("promote-alloca-vector-loop-user-weight",
                   cl::desc("The bonus weight of users of allocas within loop "
                            "when sorting profitable allocas"),
                   cl::init(4));
74 | |
75 | // Shared implementation which can do both promotion to vector and to LDS. |
76 | class AMDGPUPromoteAllocaImpl { |
77 | private: |
78 | const TargetMachine &TM; |
79 | LoopInfo &LI; |
80 | Module *Mod = nullptr; |
81 | const DataLayout *DL = nullptr; |
82 | |
83 | // FIXME: This should be per-kernel. |
84 | uint32_t LocalMemLimit = 0; |
85 | uint32_t CurrentLocalMemUsage = 0; |
86 | unsigned MaxVGPRs; |
87 | |
88 | bool IsAMDGCN = false; |
89 | bool IsAMDHSA = false; |
90 | |
91 | std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder); |
92 | Value *getWorkitemID(IRBuilder<> &Builder, unsigned N); |
93 | |
94 | /// BaseAlloca is the alloca root the search started from. |
95 | /// Val may be that alloca or a recursive user of it. |
96 | bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val, |
97 | std::vector<Value *> &WorkList) const; |
98 | |
99 | /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand |
100 | /// indices to an instruction with 2 pointer inputs (e.g. select, icmp). |
101 | /// Returns true if both operands are derived from the same alloca. Val should |
102 | /// be the same value as one of the input operands of UseInst. |
103 | bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val, |
104 | Instruction *UseInst, int OpIdx0, |
105 | int OpIdx1) const; |
106 | |
107 | /// Check whether we have enough local memory for promotion. |
108 | bool hasSufficientLocalMem(const Function &F); |
109 | |
110 | bool tryPromoteAllocaToVector(AllocaInst &I); |
111 | bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS); |
112 | |
113 | void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas); |
114 | |
115 | public: |
116 | AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) { |
117 | |
118 | const Triple &TT = TM.getTargetTriple(); |
119 | IsAMDGCN = TT.getArch() == Triple::amdgcn; |
120 | IsAMDHSA = TT.getOS() == Triple::AMDHSA; |
121 | } |
122 | |
123 | bool run(Function &F, bool PromoteToLDS); |
124 | }; |
125 | |
126 | // FIXME: This can create globals so should be a module pass. |
127 | class AMDGPUPromoteAlloca : public FunctionPass { |
128 | public: |
129 | static char ID; |
130 | |
131 | AMDGPUPromoteAlloca() : FunctionPass(ID) {} |
132 | |
133 | bool runOnFunction(Function &F) override { |
134 | if (skipFunction(F)) |
135 | return false; |
136 | if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) |
137 | return AMDGPUPromoteAllocaImpl( |
138 | TPC->getTM<TargetMachine>(), |
139 | getAnalysis<LoopInfoWrapperPass>().getLoopInfo()) |
140 | .run(F, /*PromoteToLDS*/ true); |
141 | return false; |
142 | } |
143 | |
144 | StringRef getPassName() const override { return "AMDGPU Promote Alloca" ; } |
145 | |
146 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
147 | AU.setPreservesCFG(); |
148 | AU.addRequired<LoopInfoWrapperPass>(); |
149 | FunctionPass::getAnalysisUsage(AU); |
150 | } |
151 | }; |
152 | |
153 | class AMDGPUPromoteAllocaToVector : public FunctionPass { |
154 | public: |
155 | static char ID; |
156 | |
157 | AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {} |
158 | |
159 | bool runOnFunction(Function &F) override { |
160 | if (skipFunction(F)) |
161 | return false; |
162 | if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) |
163 | return AMDGPUPromoteAllocaImpl( |
164 | TPC->getTM<TargetMachine>(), |
165 | getAnalysis<LoopInfoWrapperPass>().getLoopInfo()) |
166 | .run(F, /*PromoteToLDS*/ false); |
167 | return false; |
168 | } |
169 | |
170 | StringRef getPassName() const override { |
171 | return "AMDGPU Promote Alloca to vector" ; |
172 | } |
173 | |
174 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
175 | AU.setPreservesCFG(); |
176 | AU.addRequired<LoopInfoWrapperPass>(); |
177 | FunctionPass::getAnalysisUsage(AU); |
178 | } |
179 | }; |
180 | |
unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) {
  if (!TM.getTargetTriple().isAMDGCN())
    return 128;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);

  // A non-entry function has only 32 caller-preserved registers. Do not
  // promote an alloca that would force spilling unless we know the function
  // will be inlined.
  if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
      !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    MaxVGPRs = std::min(MaxVGPRs, 32u);
  return MaxVGPRs;
}
196 | |
197 | } // end anonymous namespace |
198 | |
199 | char AMDGPUPromoteAlloca::ID = 0; |
200 | char AMDGPUPromoteAllocaToVector::ID = 0; |
201 | |
INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                      "AMDGPU promote alloca to vector", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                    "AMDGPU promote alloca to vector", false, false)
216 | |
217 | char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; |
218 | char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID; |
219 | |
220 | PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F, |
221 | FunctionAnalysisManager &AM) { |
222 | auto &LI = AM.getResult<LoopAnalysis>(IR&: F); |
223 | bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true); |
224 | if (Changed) { |
225 | PreservedAnalyses PA; |
226 | PA.preserveSet<CFGAnalyses>(); |
227 | return PA; |
228 | } |
229 | return PreservedAnalyses::all(); |
230 | } |
231 | |
232 | PreservedAnalyses |
233 | AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) { |
234 | auto &LI = AM.getResult<LoopAnalysis>(IR&: F); |
235 | bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false); |
236 | if (Changed) { |
237 | PreservedAnalyses PA; |
238 | PA.preserveSet<CFGAnalyses>(); |
239 | return PA; |
240 | } |
241 | return PreservedAnalyses::all(); |
242 | } |
243 | |
244 | FunctionPass *llvm::createAMDGPUPromoteAlloca() { |
245 | return new AMDGPUPromoteAlloca(); |
246 | } |
247 | |
248 | FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() { |
249 | return new AMDGPUPromoteAllocaToVector(); |
250 | } |
251 | |
252 | static void collectAllocaUses(AllocaInst &Alloca, |
253 | SmallVectorImpl<Use *> &Uses) { |
254 | SmallVector<Instruction *, 4> WorkList({&Alloca}); |
255 | while (!WorkList.empty()) { |
256 | auto *Cur = WorkList.pop_back_val(); |
257 | for (auto &U : Cur->uses()) { |
258 | Uses.push_back(Elt: &U); |
259 | |
260 | if (isa<GetElementPtrInst>(Val: U.getUser())) |
261 | WorkList.push_back(Elt: cast<Instruction>(Val: U.getUser())); |
262 | } |
263 | } |
264 | } |
265 | |
266 | void AMDGPUPromoteAllocaImpl::sortAllocasToPromote( |
267 | SmallVectorImpl<AllocaInst *> &Allocas) { |
268 | DenseMap<AllocaInst *, unsigned> Scores; |
269 | |
270 | for (auto *Alloca : Allocas) { |
271 | LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n" ); |
272 | unsigned &Score = Scores[Alloca]; |
273 | // Increment score by one for each user + a bonus for users within loops. |
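    // For example, with the default LoopUserWeight of 4, a user at loop
    // depth 2 contributes 1 + 4 * 2 = 9 to the score, while a user outside
    // any loop contributes 1.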
274 | SmallVector<Use *, 8> Uses; |
275 | collectAllocaUses(Alloca&: *Alloca, Uses); |
276 | for (auto *U : Uses) { |
277 | Instruction *Inst = cast<Instruction>(Val: U->getUser()); |
278 | if (isa<GetElementPtrInst>(Val: Inst)) |
279 | continue; |
280 | unsigned UserScore = |
281 | 1 + (LoopUserWeight * LI.getLoopDepth(BB: Inst->getParent())); |
282 | LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n" ); |
283 | Score += UserScore; |
284 | } |
285 | LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n" ); |
286 | } |
287 | |
288 | stable_sort(Range&: Allocas, C: [&](AllocaInst *A, AllocaInst *B) { |
289 | return Scores.at(Val: A) > Scores.at(Val: B); |
290 | }); |
291 | |
292 | // clang-format off |
293 | LLVM_DEBUG( |
294 | dbgs() << "Sorted Worklist:\n" ; |
295 | for (auto *A: Allocas) |
296 | dbgs() << " " << *A << "\n" ; |
297 | ); |
298 | // clang-format on |
299 | } |
300 | |
301 | bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { |
302 | Mod = F.getParent(); |
303 | DL = &Mod->getDataLayout(); |
304 | |
305 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); |
306 | if (!ST.isPromoteAllocaEnabled()) |
307 | return false; |
308 | |
309 | MaxVGPRs = getMaxVGPRs(TM, F); |
310 | |
311 | bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false; |
312 | |
313 | // Use up to 1/4 of available register budget for vectorization. |
314 | // FIXME: Increase the limit for whole function budgets? Perhaps x2? |
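  // The budget is tracked in bits: a VGPR holds 32 bits, and the
  // amdgpu-promote-alloca-to-vector-limit option is given in bytes (hence the
  // multiplication by 8). For example, with MaxVGPRs = 256 the default budget
  // is 256 * 32 / 4 = 2048 bits, enough for e.g. a <64 x i32> alloca.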
315 | unsigned VectorizationBudget = |
316 | (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8 |
317 | : (MaxVGPRs * 32)) / |
318 | 4; |
319 | |
320 | SmallVector<AllocaInst *, 16> Allocas; |
321 | for (Instruction &I : F.getEntryBlock()) { |
322 | if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: &I)) { |
323 | // Array allocations are probably not worth handling, since an allocation |
324 | // of the array type is the canonical form. |
325 | if (!AI->isStaticAlloca() || AI->isArrayAllocation()) |
326 | continue; |
327 | Allocas.push_back(Elt: AI); |
328 | } |
329 | } |
330 | |
331 | sortAllocasToPromote(Allocas); |
332 | |
333 | bool Changed = false; |
334 | for (AllocaInst *AI : Allocas) { |
335 | const unsigned AllocaCost = DL->getTypeSizeInBits(Ty: AI->getAllocatedType()); |
336 | // First, check if we have enough budget to vectorize this alloca. |
337 | if (AllocaCost <= VectorizationBudget) { |
338 | // If we do, attempt vectorization, otherwise, fall through and try |
339 | // promoting to LDS instead. |
340 | if (tryPromoteAllocaToVector(I&: *AI)) { |
341 | Changed = true; |
342 | assert((VectorizationBudget - AllocaCost) < VectorizationBudget && |
343 | "Underflow!" ); |
344 | VectorizationBudget -= AllocaCost; |
345 | LLVM_DEBUG(dbgs() << " Remaining vectorization budget:" |
346 | << VectorizationBudget << "\n" ); |
347 | continue; |
348 | } |
349 | } else { |
350 | LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" |
351 | << AllocaCost << ", budget:" << VectorizationBudget |
352 | << "): " << *AI << "\n" ); |
353 | } |
354 | |
355 | if (PromoteToLDS && tryPromoteAllocaToLDS(I&: *AI, SufficientLDS)) |
356 | Changed = true; |
357 | } |
358 | |
359 | // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains |
360 | // dangling pointers. If we want to reuse it past this point, the loop above |
361 | // would need to be updated to remove successfully promoted allocas. |
362 | |
363 | return Changed; |
364 | } |
365 | |
366 | struct MemTransferInfo { |
367 | ConstantInt *SrcIndex = nullptr; |
368 | ConstantInt *DestIndex = nullptr; |
369 | }; |
370 | |
371 | // Checks if the instruction I is a memset user of the alloca AI that we can |
372 | // deal with. Currently, only non-volatile memsets that affect the whole alloca |
373 | // are handled. |
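// For example, for an `alloca [16 x i32]` (64 bytes), a memset of the whole
// 64 bytes, e.g.
//   call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 64,
//                                 i1 false)
// is supported, while a memset covering only the first 32 bytes is not (yet).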
374 | static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI, |
375 | const DataLayout &DL) { |
376 | using namespace PatternMatch; |
377 | // For now we only care about non-volatile memsets that affect the whole type |
378 | // (start at index 0 and fill the whole alloca). |
379 | // |
380 | // TODO: Now that we moved to PromoteAlloca we could handle any memsets |
381 | // (except maybe volatile ones?) - we just need to use shufflevector if it |
382 | // only affects a subset of the vector. |
383 | const unsigned Size = DL.getTypeStoreSize(Ty: AI->getAllocatedType()); |
384 | return I->getOperand(i_nocapture: 0) == AI && |
385 | match(V: I->getOperand(i_nocapture: 2), P: m_SpecificInt(V: Size)) && !I->isVolatile(); |
386 | } |
387 | |
388 | static Value * |
389 | calculateVectorIndex(Value *Ptr, |
390 | const std::map<GetElementPtrInst *, Value *> &GEPIdx) { |
391 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr->stripPointerCasts()); |
392 | if (!GEP) |
393 | return ConstantInt::getNullValue(Ty: Type::getInt32Ty(C&: Ptr->getContext())); |
394 | |
395 | auto I = GEPIdx.find(x: GEP); |
396 | assert(I != GEPIdx.end() && "Must have entry for GEP!" ); |
397 | return I->second; |
398 | } |
399 | |
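// GEPToVectorIndex computes the vector-element index that a GEP on the alloca
// corresponds to, or returns nullptr if no single, cheap index exists. For
// example, with i32 vector elements (VecElemSize == 4):
//   getelementptr i8, ptr %alloca, i32 8    -> constant index 2
//   getelementptr i32, ptr %alloca, i32 %i  -> variable index %i
//   getelementptr i8, ptr %alloca, i32 6    -> rejected (offset is not a
//                                              multiple of 4)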
400 | static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, |
401 | Type *VecElemTy, const DataLayout &DL) { |
402 | // TODO: Extracting a "multiple of X" from a GEP might be a useful generic |
403 | // helper. |
404 | unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType()); |
405 | MapVector<Value *, APInt> VarOffsets; |
406 | APInt ConstOffset(BW, 0); |
407 | if (GEP->getPointerOperand()->stripPointerCasts() != Alloca || |
408 | !GEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ConstOffset)) |
409 | return nullptr; |
410 | |
411 | unsigned VecElemSize = DL.getTypeAllocSize(Ty: VecElemTy); |
412 | if (VarOffsets.size() > 1) |
413 | return nullptr; |
414 | |
415 | if (VarOffsets.size() == 1) { |
416 | // Only handle cases where we don't need to insert extra arithmetic |
417 | // instructions. |
418 | const auto &VarOffset = VarOffsets.front(); |
419 | if (!ConstOffset.isZero() || VarOffset.second != VecElemSize) |
420 | return nullptr; |
421 | return VarOffset.first; |
422 | } |
423 | |
424 | APInt Quot; |
425 | uint64_t Rem; |
426 | APInt::udivrem(LHS: ConstOffset, RHS: VecElemSize, Quotient&: Quot, Remainder&: Rem); |
427 | if (Rem != 0) |
428 | return nullptr; |
429 | |
430 | return ConstantInt::get(Context&: GEP->getContext(), V: Quot); |
431 | } |
432 | |
433 | /// Promotes a single user of the alloca to a vector form. |
434 | /// |
435 | /// \param Inst Instruction to be promoted. |
436 | /// \param DL Module Data Layout. |
437 | /// \param VectorTy Vectorized Type. |
438 | /// \param VecStoreSize Size of \p VectorTy in bytes. |
439 | /// \param ElementSize Size of \p VectorTy element type in bytes. |
440 | /// \param TransferInfo MemTransferInst info map. |
441 | /// \param GEPVectorIdx GEP -> VectorIdx cache. |
442 | /// \param CurVal Current value of the vector (e.g. last stored value) |
443 | /// \param[out] DeferredLoads \p Inst is added to this vector if it can't |
444 | /// be promoted now. This happens when promoting requires \p |
445 | /// CurVal, but \p CurVal is nullptr. |
446 | /// \return the stored value if \p Inst would have written to the alloca, or |
447 | /// nullptr otherwise. |
448 | static Value *promoteAllocaUserToVector( |
449 | Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy, |
450 | unsigned VecStoreSize, unsigned ElementSize, |
451 | DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo, |
452 | std::map<GetElementPtrInst *, Value *> &GEPVectorIdx, Value *CurVal, |
453 | SmallVectorImpl<LoadInst *> &DeferredLoads) { |
454 | // Note: we use InstSimplifyFolder because it can leverage the DataLayout |
455 | // to do more folding, especially in the case of vector splats. |
456 | IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(), |
457 | InstSimplifyFolder(DL)); |
458 | Builder.SetInsertPoint(Inst); |
459 | |
460 | const auto GetOrLoadCurrentVectorValue = [&]() -> Value * { |
461 | if (CurVal) |
462 | return CurVal; |
463 | |
464 | // If the current value is not known, insert a dummy load and lower it on |
465 | // the second pass. |
466 | LoadInst *Dummy = |
467 | Builder.CreateLoad(Ty: VectorTy, Ptr: PoisonValue::get(T: Builder.getPtrTy()), |
468 | Name: "promotealloca.dummyload" ); |
469 | DeferredLoads.push_back(Elt: Dummy); |
470 | return Dummy; |
471 | }; |
472 | |
473 | const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val, |
474 | Type *PtrTy) -> Value * { |
475 | assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy)); |
476 | const unsigned Size = DL.getTypeStoreSizeInBits(Ty: PtrTy); |
477 | if (!PtrTy->isVectorTy()) |
478 | return Builder.CreateBitOrPointerCast(V: Val, DestTy: Builder.getIntNTy(N: Size)); |
479 | const unsigned NumPtrElts = cast<FixedVectorType>(Val: PtrTy)->getNumElements(); |
    // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to first
    // cast the ptr vector to <2 x i64>.
    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
483 | Type *EltTy = Builder.getIntNTy(N: Size / NumPtrElts); |
484 | return Builder.CreateBitOrPointerCast( |
485 | V: Val, DestTy: FixedVectorType::get(ElementType: EltTy, NumElts: NumPtrElts)); |
486 | }; |
487 | |
488 | Type *VecEltTy = VectorTy->getElementType(); |
489 | |
490 | switch (Inst->getOpcode()) { |
491 | case Instruction::Load: { |
492 | // Loads can only be lowered if the value is known. |
493 | if (!CurVal) { |
494 | DeferredLoads.push_back(Elt: cast<LoadInst>(Val: Inst)); |
495 | return nullptr; |
496 | } |
497 | |
498 | Value *Index = calculateVectorIndex( |
499 | Ptr: cast<LoadInst>(Val: Inst)->getPointerOperand(), GEPIdx: GEPVectorIdx); |
500 | |
501 | // We're loading the full vector. |
502 | Type *AccessTy = Inst->getType(); |
503 | TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy); |
504 | if (Constant *CI = dyn_cast<Constant>(Val: Index)) { |
505 | if (CI->isZeroValue() && AccessSize == VecStoreSize) { |
506 | if (AccessTy->isPtrOrPtrVectorTy()) |
507 | CurVal = CreateTempPtrIntCast(CurVal, AccessTy); |
508 | else if (CurVal->getType()->isPtrOrPtrVectorTy()) |
509 | CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType()); |
510 | Value *NewVal = Builder.CreateBitOrPointerCast(V: CurVal, DestTy: AccessTy); |
511 | Inst->replaceAllUsesWith(V: NewVal); |
512 | return nullptr; |
513 | } |
514 | } |
515 | |
516 | // Loading a subvector. |
517 | if (isa<FixedVectorType>(Val: AccessTy)) { |
518 | assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); |
519 | const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(Ty: VecEltTy); |
520 | auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumLoadedElts); |
521 | assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); |
522 | |
523 | Value *SubVec = PoisonValue::get(T: SubVecTy); |
524 | for (unsigned K = 0; K < NumLoadedElts; ++K) { |
525 | Value *CurIdx = |
526 | Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K)); |
527 | SubVec = Builder.CreateInsertElement( |
528 | Vec: SubVec, NewElt: Builder.CreateExtractElement(Vec: CurVal, Idx: CurIdx), Idx: K); |
529 | } |
530 | |
531 | if (AccessTy->isPtrOrPtrVectorTy()) |
532 | SubVec = CreateTempPtrIntCast(SubVec, AccessTy); |
533 | else if (SubVecTy->isPtrOrPtrVectorTy()) |
534 | SubVec = CreateTempPtrIntCast(SubVec, SubVecTy); |
535 | |
536 | SubVec = Builder.CreateBitOrPointerCast(V: SubVec, DestTy: AccessTy); |
537 | Inst->replaceAllUsesWith(V: SubVec); |
538 | return nullptr; |
539 | } |
540 | |
    // We're loading one element.
    Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
    if (AccessTy != VecEltTy)
      ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);

    Inst->replaceAllUsesWith(ExtractElement);
    return nullptr;
548 | } |
549 | case Instruction::Store: { |
550 | // For stores, it's a bit trickier and it depends on whether we're storing |
551 | // the full vector or not. If we're storing the full vector, we don't need |
552 | // to know the current value. If this is a store of a single element, we |
553 | // need to know the value. |
554 | StoreInst *SI = cast<StoreInst>(Val: Inst); |
555 | Value *Index = calculateVectorIndex(Ptr: SI->getPointerOperand(), GEPIdx: GEPVectorIdx); |
556 | Value *Val = SI->getValueOperand(); |
557 | |
558 | // We're storing the full vector, we can handle this without knowing CurVal. |
559 | Type *AccessTy = Val->getType(); |
560 | TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy); |
561 | if (Constant *CI = dyn_cast<Constant>(Val: Index)) { |
562 | if (CI->isZeroValue() && AccessSize == VecStoreSize) { |
563 | if (AccessTy->isPtrOrPtrVectorTy()) |
564 | Val = CreateTempPtrIntCast(Val, AccessTy); |
565 | else if (VectorTy->isPtrOrPtrVectorTy()) |
566 | Val = CreateTempPtrIntCast(Val, VectorTy); |
567 | return Builder.CreateBitOrPointerCast(V: Val, DestTy: VectorTy); |
568 | } |
569 | } |
570 | |
571 | // Storing a subvector. |
572 | if (isa<FixedVectorType>(Val: AccessTy)) { |
573 | assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); |
574 | const unsigned NumWrittenElts = |
575 | AccessSize / DL.getTypeStoreSize(Ty: VecEltTy); |
576 | const unsigned NumVecElts = VectorTy->getNumElements(); |
577 | auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumWrittenElts); |
578 | assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); |
579 | |
580 | if (SubVecTy->isPtrOrPtrVectorTy()) |
581 | Val = CreateTempPtrIntCast(Val, SubVecTy); |
582 | else if (AccessTy->isPtrOrPtrVectorTy()) |
583 | Val = CreateTempPtrIntCast(Val, AccessTy); |
584 | |
585 | Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: SubVecTy); |
586 | |
587 | Value *CurVec = GetOrLoadCurrentVectorValue(); |
588 | for (unsigned K = 0, NumElts = std::min(a: NumWrittenElts, b: NumVecElts); |
589 | K < NumElts; ++K) { |
590 | Value *CurIdx = |
591 | Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K)); |
592 | CurVec = Builder.CreateInsertElement( |
593 | Vec: CurVec, NewElt: Builder.CreateExtractElement(Vec: Val, Idx: K), Idx: CurIdx); |
594 | } |
595 | return CurVec; |
596 | } |
597 | |
598 | if (Val->getType() != VecEltTy) |
599 | Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: VecEltTy); |
600 | return Builder.CreateInsertElement(Vec: GetOrLoadCurrentVectorValue(), NewElt: Val, |
601 | Idx: Index); |
602 | } |
603 | case Instruction::Call: { |
604 | if (auto *MTI = dyn_cast<MemTransferInst>(Val: Inst)) { |
605 | // For memcpy, we need to know curval. |
606 | ConstantInt *Length = cast<ConstantInt>(Val: MTI->getLength()); |
607 | unsigned NumCopied = Length->getZExtValue() / ElementSize; |
608 | MemTransferInfo *TI = &TransferInfo[MTI]; |
609 | unsigned SrcBegin = TI->SrcIndex->getZExtValue(); |
610 | unsigned DestBegin = TI->DestIndex->getZExtValue(); |
611 | |
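      // The copy is expressed as a shufflevector over the current vector
      // value. For example, with an 8-element vector, copying 2 elements from
      // source index 1 to dest index 4 produces the mask <0, 1, 2, 3, 1, 2,
      // 6, 7>: lanes 4 and 5 take their values from source lanes 1 and 2,
      // and all other lanes keep their current value.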
612 | SmallVector<int> Mask; |
613 | for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { |
614 | if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { |
615 | Mask.push_back(Elt: SrcBegin++); |
616 | } else { |
617 | Mask.push_back(Elt: Idx); |
618 | } |
619 | } |
620 | |
621 | return Builder.CreateShuffleVector(V: GetOrLoadCurrentVectorValue(), Mask); |
622 | } |
623 | |
624 | if (auto *MSI = dyn_cast<MemSetInst>(Val: Inst)) { |
625 | // For memset, we don't need to know the previous value because we |
626 | // currently only allow memsets that cover the whole alloca. |
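      // The memset byte is widened to one vector element by splatting it
      // BytesPerElt times and bitcasting; e.g. for i32 elements, a memset
      // value of 0xAB becomes the element 0xABABABAB, which is then splat
      // across the whole vector below.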
627 | Value *Elt = MSI->getOperand(i_nocapture: 1); |
628 | const unsigned BytesPerElt = DL.getTypeStoreSize(Ty: VecEltTy); |
629 | if (BytesPerElt > 1) { |
630 | Value *EltBytes = Builder.CreateVectorSplat(NumElts: BytesPerElt, V: Elt); |
631 | |
632 | // If the element type of the vector is a pointer, we need to first cast |
633 | // to an integer, then use a PtrCast. |
634 | if (VecEltTy->isPointerTy()) { |
635 | Type *PtrInt = Builder.getIntNTy(N: BytesPerElt * 8); |
636 | Elt = Builder.CreateBitCast(V: EltBytes, DestTy: PtrInt); |
637 | Elt = Builder.CreateIntToPtr(V: Elt, DestTy: VecEltTy); |
638 | } else |
639 | Elt = Builder.CreateBitCast(V: EltBytes, DestTy: VecEltTy); |
640 | } |
641 | |
642 | return Builder.CreateVectorSplat(EC: VectorTy->getElementCount(), V: Elt); |
643 | } |
644 | |
645 | if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) { |
646 | if (Intr->getIntrinsicID() == Intrinsic::objectsize) { |
647 | Intr->replaceAllUsesWith( |
648 | V: Builder.getIntN(N: Intr->getType()->getIntegerBitWidth(), |
649 | C: DL.getTypeAllocSize(Ty: VectorTy))); |
650 | return nullptr; |
651 | } |
652 | } |
653 | |
654 | llvm_unreachable("Unsupported call when promoting alloca to vector" ); |
655 | } |
656 | |
657 | default: |
658 | llvm_unreachable("Inconsistency in instructions promotable to vector" ); |
659 | } |
660 | |
661 | llvm_unreachable("Did not return after promoting instruction!" ); |
662 | } |
663 | |
664 | static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy, |
665 | const DataLayout &DL) { |
666 | // Access as a vector type can work if the size of the access vector is a |
667 | // multiple of the size of the alloca's vector element type. |
668 | // |
669 | // Examples: |
670 | // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK |
671 | // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK |
672 | // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK |
673 | // - 3*32 is not a multiple of 64 |
674 | // |
675 | // We could handle more complicated cases, but it'd make things a lot more |
676 | // complicated. |
677 | if (isa<FixedVectorType>(Val: AccessTy)) { |
678 | TypeSize AccTS = DL.getTypeStoreSize(Ty: AccessTy); |
679 | TypeSize VecTS = DL.getTypeStoreSize(Ty: VecTy->getElementType()); |
680 | return AccTS.isKnownMultipleOf(RHS: VecTS); |
681 | } |
682 | |
683 | return CastInst::isBitOrNoopPointerCastable(SrcTy: VecTy->getElementType(), DestTy: AccessTy, |
684 | DL); |
685 | } |
686 | |
/// Iterates over an instruction worklist that may contain multiple
/// instructions from the same basic block, possibly out of program order.
/// Within each block, \p Fn is applied to the instructions in program order.
689 | template <typename InstContainer> |
690 | static void forEachWorkListItem(const InstContainer &WorkList, |
691 | std::function<void(Instruction *)> Fn) { |
692 | // Bucket up uses of the alloca by the block they occur in. |
693 | // This is important because we have to handle multiple defs/uses in a block |
694 | // ourselves: SSAUpdater is purely for cross-block references. |
695 | DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock; |
696 | for (Instruction *User : WorkList) |
697 | UsesByBlock[User->getParent()].insert(V: User); |
698 | |
699 | for (Instruction *User : WorkList) { |
700 | BasicBlock *BB = User->getParent(); |
701 | auto &BlockUses = UsesByBlock[BB]; |
702 | |
703 | // Already processed, skip. |
704 | if (BlockUses.empty()) |
705 | continue; |
706 | |
707 | // Only user in the block, directly process it. |
708 | if (BlockUses.size() == 1) { |
709 | Fn(User); |
710 | continue; |
711 | } |
712 | |
713 | // Multiple users in the block, do a linear scan to see users in order. |
714 | for (Instruction &Inst : *BB) { |
715 | if (!BlockUses.contains(V: &Inst)) |
716 | continue; |
717 | |
718 | Fn(&Inst); |
719 | } |
720 | |
721 | // Clear the block so we know it's been processed. |
722 | BlockUses.clear(); |
723 | } |
724 | } |
725 | |
726 | // FIXME: Should try to pick the most likely to be profitable allocas first. |
727 | bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { |
728 | LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); |
729 | |
730 | if (DisablePromoteAllocaToVector) { |
731 | LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n" ); |
732 | return false; |
733 | } |
734 | |
735 | Type *AllocaTy = Alloca.getAllocatedType(); |
736 | auto *VectorTy = dyn_cast<FixedVectorType>(Val: AllocaTy); |
737 | if (auto *ArrayTy = dyn_cast<ArrayType>(Val: AllocaTy)) { |
738 | if (VectorType::isValidElementType(ElemTy: ArrayTy->getElementType()) && |
739 | ArrayTy->getNumElements() > 0) |
740 | VectorTy = FixedVectorType::get(ElementType: ArrayTy->getElementType(), |
741 | NumElts: ArrayTy->getNumElements()); |
742 | } |
743 | |
  // FIXME: There is no reason why we can't support larger arrays; we are just
  // being conservative for now.
  // FIXME: We also reject allocas of the form [2 x [2 x i32]] or equivalent.
  // Potentially these could also be promoted, but we don't currently handle
  // this case.
749 | if (!VectorTy) { |
750 | LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n" ); |
751 | return false; |
752 | } |
753 | |
754 | if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) { |
755 | LLVM_DEBUG(dbgs() << " " << *VectorTy |
756 | << " has an unsupported number of elements\n" ); |
757 | return false; |
758 | } |
759 | |
760 | std::map<GetElementPtrInst *, Value *> GEPVectorIdx; |
761 | SmallVector<Instruction *> WorkList; |
762 | SmallVector<Instruction *> UsersToRemove; |
763 | SmallVector<Instruction *> DeferredInsts; |
764 | DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo; |
765 | |
766 | const auto RejectUser = [&](Instruction *Inst, Twine Msg) { |
767 | LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" |
768 | << " " << *Inst << "\n" ); |
769 | return false; |
770 | }; |
771 | |
772 | SmallVector<Use *, 8> Uses; |
773 | collectAllocaUses(Alloca, Uses); |
774 | |
775 | LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n" ); |
776 | |
777 | Type *VecEltTy = VectorTy->getElementType(); |
778 | unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / 8; |
779 | for (auto *U : Uses) { |
780 | Instruction *Inst = cast<Instruction>(Val: U->getUser()); |
781 | |
782 | if (Value *Ptr = getLoadStorePointerOperand(V: Inst)) { |
783 | // This is a store of the pointer, not to the pointer. |
784 | if (isa<StoreInst>(Val: Inst) && |
785 | U->getOperandNo() != StoreInst::getPointerOperandIndex()) |
786 | return RejectUser(Inst, "pointer is being stored" ); |
787 | |
788 | Type *AccessTy = getLoadStoreType(I: Inst); |
789 | if (AccessTy->isAggregateType()) |
790 | return RejectUser(Inst, "unsupported load/store as aggregate" ); |
791 | assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy()); |
792 | |
793 | // Check that this is a simple access of a vector element. |
794 | bool IsSimple = isa<LoadInst>(Val: Inst) ? cast<LoadInst>(Val: Inst)->isSimple() |
795 | : cast<StoreInst>(Val: Inst)->isSimple(); |
796 | if (!IsSimple) |
797 | return RejectUser(Inst, "not a simple load or store" ); |
798 | |
799 | Ptr = Ptr->stripPointerCasts(); |
800 | |
801 | // Alloca already accessed as vector. |
802 | if (Ptr == &Alloca && DL->getTypeStoreSize(Ty: Alloca.getAllocatedType()) == |
803 | DL->getTypeStoreSize(Ty: AccessTy)) { |
804 | WorkList.push_back(Elt: Inst); |
805 | continue; |
806 | } |
807 | |
808 | if (!isSupportedAccessType(VecTy: VectorTy, AccessTy, DL: *DL)) |
809 | return RejectUser(Inst, "not a supported access type" ); |
810 | |
811 | WorkList.push_back(Elt: Inst); |
812 | continue; |
813 | } |
814 | |
815 | if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) { |
816 | // If we can't compute a vector index from this GEP, then we can't |
817 | // promote this alloca to vector. |
818 | Value *Index = GEPToVectorIndex(GEP, Alloca: &Alloca, VecElemTy: VecEltTy, DL: *DL); |
819 | if (!Index) |
820 | return RejectUser(Inst, "cannot compute vector index for GEP" ); |
821 | |
822 | GEPVectorIdx[GEP] = Index; |
823 | UsersToRemove.push_back(Elt: Inst); |
824 | continue; |
825 | } |
826 | |
827 | if (MemSetInst *MSI = dyn_cast<MemSetInst>(Val: Inst); |
828 | MSI && isSupportedMemset(I: MSI, AI: &Alloca, DL: *DL)) { |
829 | WorkList.push_back(Elt: Inst); |
830 | continue; |
831 | } |
832 | |
833 | if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Val: Inst)) { |
834 | if (TransferInst->isVolatile()) |
835 | return RejectUser(Inst, "mem transfer inst is volatile" ); |
836 | |
837 | ConstantInt *Len = dyn_cast<ConstantInt>(Val: TransferInst->getLength()); |
838 | if (!Len || (Len->getZExtValue() % ElementSize)) |
839 | return RejectUser(Inst, "mem transfer inst length is non-constant or " |
840 | "not a multiple of the vector element size" ); |
841 | |
842 | if (!TransferInfo.count(Val: TransferInst)) { |
843 | DeferredInsts.push_back(Elt: Inst); |
844 | WorkList.push_back(Elt: Inst); |
845 | TransferInfo[TransferInst] = MemTransferInfo(); |
846 | } |
847 | |
848 | auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * { |
849 | GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
850 | if (Ptr != &Alloca && !GEPVectorIdx.count(x: GEP)) |
851 | return nullptr; |
852 | |
853 | return dyn_cast<ConstantInt>(Val: calculateVectorIndex(Ptr, GEPIdx: GEPVectorIdx)); |
854 | }; |
855 | |
856 | unsigned OpNum = U->getOperandNo(); |
857 | MemTransferInfo *TI = &TransferInfo[TransferInst]; |
858 | if (OpNum == 0) { |
859 | Value *Dest = TransferInst->getDest(); |
860 | ConstantInt *Index = getPointerIndexOfAlloca(Dest); |
861 | if (!Index) |
862 | return RejectUser(Inst, "could not calculate constant dest index" ); |
863 | TI->DestIndex = Index; |
864 | } else { |
865 | assert(OpNum == 1); |
866 | Value *Src = TransferInst->getSource(); |
867 | ConstantInt *Index = getPointerIndexOfAlloca(Src); |
868 | if (!Index) |
869 | return RejectUser(Inst, "could not calculate constant src index" ); |
870 | TI->SrcIndex = Index; |
871 | } |
872 | continue; |
873 | } |
874 | |
875 | if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) { |
876 | if (Intr->getIntrinsicID() == Intrinsic::objectsize) { |
877 | WorkList.push_back(Elt: Inst); |
878 | continue; |
879 | } |
880 | } |
881 | |
882 | // Ignore assume-like intrinsics and comparisons used in assumes. |
883 | if (isAssumeLikeIntrinsic(I: Inst)) { |
884 | if (!Inst->use_empty()) |
885 | return RejectUser(Inst, "assume-like intrinsic cannot have any users" ); |
886 | UsersToRemove.push_back(Elt: Inst); |
887 | continue; |
888 | } |
889 | |
890 | if (isa<ICmpInst>(Val: Inst) && all_of(Range: Inst->users(), P: [](User *U) { |
891 | return isAssumeLikeIntrinsic(I: cast<Instruction>(Val: U)); |
892 | })) { |
893 | UsersToRemove.push_back(Elt: Inst); |
894 | continue; |
895 | } |
896 | |
897 | return RejectUser(Inst, "unhandled alloca user" ); |
898 | } |
899 | |
900 | while (!DeferredInsts.empty()) { |
901 | Instruction *Inst = DeferredInsts.pop_back_val(); |
902 | MemTransferInst *TransferInst = cast<MemTransferInst>(Val: Inst); |
903 | // TODO: Support the case if the pointers are from different alloca or |
904 | // from different address spaces. |
905 | MemTransferInfo &Info = TransferInfo[TransferInst]; |
906 | if (!Info.SrcIndex || !Info.DestIndex) |
907 | return RejectUser( |
908 | Inst, "mem transfer inst is missing constant src and/or dst index" ); |
909 | } |
910 | |
911 | LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " |
912 | << *VectorTy << '\n'); |
913 | const unsigned VecStoreSize = DL->getTypeStoreSize(Ty: VectorTy); |
914 | |
915 | // Alloca is uninitialized memory. Imitate that by making the first value |
916 | // undef. |
917 | SSAUpdater Updater; |
918 | Updater.Initialize(Ty: VectorTy, Name: "promotealloca" ); |
919 | Updater.AddAvailableValue(BB: Alloca.getParent(), V: UndefValue::get(T: VectorTy)); |
920 | |
921 | // First handle the initial worklist. |
922 | SmallVector<LoadInst *, 4> DeferredLoads; |
923 | forEachWorkListItem(WorkList, Fn: [&](Instruction *I) { |
924 | BasicBlock *BB = I->getParent(); |
925 | // On the first pass, we only take values that are trivially known, i.e. |
926 | // where AddAvailableValue was already called in this block. |
927 | Value *Result = promoteAllocaUserToVector( |
928 | Inst: I, DL: *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, |
929 | CurVal: Updater.FindValueForBlock(BB), DeferredLoads); |
930 | if (Result) |
931 | Updater.AddAvailableValue(BB, V: Result); |
932 | }); |
933 | |
934 | // Then handle deferred loads. |
935 | forEachWorkListItem(WorkList: DeferredLoads, Fn: [&](Instruction *I) { |
936 | SmallVector<LoadInst *, 0> NewDLs; |
937 | BasicBlock *BB = I->getParent(); |
938 | // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always |
939 | // get a value, inserting PHIs as needed. |
940 | Value *Result = promoteAllocaUserToVector( |
941 | Inst: I, DL: *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, |
942 | CurVal: Updater.GetValueInMiddleOfBlock(BB: I->getParent()), DeferredLoads&: NewDLs); |
943 | if (Result) |
944 | Updater.AddAvailableValue(BB, V: Result); |
945 | assert(NewDLs.empty() && "No more deferred loads should be queued!" ); |
946 | }); |
947 | |
948 | // Delete all instructions. On the first pass, new dummy loads may have been |
949 | // added so we need to collect them too. |
950 | DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end()); |
951 | InstsToDelete.insert(I: DeferredLoads.begin(), E: DeferredLoads.end()); |
952 | for (Instruction *I : InstsToDelete) { |
953 | assert(I->use_empty()); |
954 | I->eraseFromParent(); |
955 | } |
956 | |
  // Delete all the users that are known to be removable.
958 | for (Instruction *I : reverse(C&: UsersToRemove)) { |
959 | I->dropDroppableUses(); |
960 | assert(I->use_empty()); |
961 | I->eraseFromParent(); |
962 | } |
963 | |
964 | // Alloca should now be dead too. |
965 | assert(Alloca.use_empty()); |
966 | Alloca.eraseFromParent(); |
967 | return true; |
968 | } |
969 | |
970 | std::pair<Value *, Value *> |
971 | AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { |
972 | Function &F = *Builder.GetInsertBlock()->getParent(); |
973 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); |
974 | |
975 | if (!IsAMDHSA) { |
976 | Function *LocalSizeYFn = |
977 | Intrinsic::getDeclaration(M: Mod, id: Intrinsic::r600_read_local_size_y); |
978 | Function *LocalSizeZFn = |
979 | Intrinsic::getDeclaration(M: Mod, id: Intrinsic::r600_read_local_size_z); |
980 | |
981 | CallInst *LocalSizeY = Builder.CreateCall(Callee: LocalSizeYFn, Args: {}); |
982 | CallInst *LocalSizeZ = Builder.CreateCall(Callee: LocalSizeZFn, Args: {}); |
983 | |
984 | ST.makeLIDRangeMetadata(I: LocalSizeY); |
985 | ST.makeLIDRangeMetadata(I: LocalSizeZ); |
986 | |
987 | return std::pair(LocalSizeY, LocalSizeZ); |
988 | } |
989 | |
990 | // We must read the size out of the dispatch pointer. |
991 | assert(IsAMDGCN); |
992 | |
993 | // We are indexing into this struct, and want to extract the workgroup_size_* |
994 | // fields. |
995 | // |
996 | // typedef struct hsa_kernel_dispatch_packet_s { |
997 | // uint16_t header; |
998 | // uint16_t setup; |
999 | // uint16_t workgroup_size_x ; |
1000 | // uint16_t workgroup_size_y; |
1001 | // uint16_t workgroup_size_z; |
1002 | // uint16_t reserved0; |
1003 | // uint32_t grid_size_x ; |
1004 | // uint32_t grid_size_y ; |
1005 | // uint32_t grid_size_z; |
1006 | // |
1007 | // uint32_t private_segment_size; |
1008 | // uint32_t group_segment_size; |
1009 | // uint64_t kernel_object; |
1010 | // |
1011 | // #ifdef HSA_LARGE_MODEL |
1012 | // void *kernarg_address; |
1013 | // #elif defined HSA_LITTLE_ENDIAN |
1014 | // void *kernarg_address; |
1015 | // uint32_t reserved1; |
1016 | // #else |
1017 | // uint32_t reserved1; |
1018 | // void *kernarg_address; |
1019 | // #endif |
1020 | // uint64_t reserved2; |
1021 | // hsa_signal_t completion_signal; // uint64_t wrapper |
1022 | // } hsa_kernel_dispatch_packet_t |
1023 | // |
1024 | Function *DispatchPtrFn = |
1025 | Intrinsic::getDeclaration(M: Mod, id: Intrinsic::amdgcn_dispatch_ptr); |
1026 | |
1027 | CallInst *DispatchPtr = Builder.CreateCall(Callee: DispatchPtrFn, Args: {}); |
1028 | DispatchPtr->addRetAttr(Kind: Attribute::NoAlias); |
1029 | DispatchPtr->addRetAttr(Kind: Attribute::NonNull); |
1030 | F.removeFnAttr(Kind: "amdgpu-no-dispatch-ptr" ); |
1031 | |
1032 | // Size of the dispatch packet struct. |
1033 | DispatchPtr->addDereferenceableRetAttr(Bytes: 64); |
1034 | |
1035 | Type *I32Ty = Type::getInt32Ty(C&: Mod->getContext()); |
1036 | Value *CastDispatchPtr = Builder.CreateBitCast( |
1037 | V: DispatchPtr, DestTy: PointerType::get(ElementType: I32Ty, AddressSpace: AMDGPUAS::CONSTANT_ADDRESS)); |
1038 | |
1039 | // We could do a single 64-bit load here, but it's likely that the basic |
1040 | // 32-bit and extract sequence is already present, and it is probably easier |
1041 | // to CSE this. The loads should be mergeable later anyway. |
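  // GEP index 1 into the i32 view of the packet is byte offset 4, i.e. the
  // dword holding workgroup_size_x (low 16 bits) and workgroup_size_y (high
  // 16 bits); index 2 is byte offset 8, holding workgroup_size_z and
  // reserved0.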
1042 | Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: CastDispatchPtr, Idx0: 1); |
1043 | LoadInst *LoadXY = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPXY, Align: Align(4)); |
1044 | |
1045 | Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: CastDispatchPtr, Idx0: 2); |
1046 | LoadInst *LoadZU = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPZU, Align: Align(4)); |
1047 | |
1048 | MDNode *MD = MDNode::get(Context&: Mod->getContext(), MDs: std::nullopt); |
1049 | LoadXY->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD); |
1050 | LoadZU->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD); |
1051 | ST.makeLIDRangeMetadata(I: LoadZU); |
1052 | |
1053 | // Extract y component. Upper half of LoadZU should be zero already. |
1054 | Value *Y = Builder.CreateLShr(LHS: LoadXY, RHS: 16); |
1055 | |
1056 | return std::pair(Y, LoadZU); |
1057 | } |
1058 | |
1059 | Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder, |
1060 | unsigned N) { |
1061 | Function *F = Builder.GetInsertBlock()->getParent(); |
1062 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: *F); |
1063 | Intrinsic::ID IntrID = Intrinsic::not_intrinsic; |
1064 | StringRef AttrName; |
1065 | |
1066 | switch (N) { |
1067 | case 0: |
1068 | IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x |
1069 | : (Intrinsic::ID)Intrinsic::r600_read_tidig_x; |
1070 | AttrName = "amdgpu-no-workitem-id-x" ; |
1071 | break; |
1072 | case 1: |
1073 | IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y |
1074 | : (Intrinsic::ID)Intrinsic::r600_read_tidig_y; |
1075 | AttrName = "amdgpu-no-workitem-id-y" ; |
1076 | break; |
1077 | |
1078 | case 2: |
1079 | IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z |
1080 | : (Intrinsic::ID)Intrinsic::r600_read_tidig_z; |
1081 | AttrName = "amdgpu-no-workitem-id-z" ; |
1082 | break; |
1083 | default: |
1084 | llvm_unreachable("invalid dimension" ); |
1085 | } |
1086 | |
1087 | Function *WorkitemIdFn = Intrinsic::getDeclaration(M: Mod, id: IntrID); |
1088 | CallInst *CI = Builder.CreateCall(Callee: WorkitemIdFn); |
1089 | ST.makeLIDRangeMetadata(I: CI); |
1090 | F->removeFnAttr(Kind: AttrName); |
1091 | |
1092 | return CI; |
1093 | } |
1094 | |
1095 | static bool isCallPromotable(CallInst *CI) { |
1096 | IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI); |
1097 | if (!II) |
1098 | return false; |
1099 | |
1100 | switch (II->getIntrinsicID()) { |
1101 | case Intrinsic::memcpy: |
1102 | case Intrinsic::memmove: |
1103 | case Intrinsic::memset: |
1104 | case Intrinsic::lifetime_start: |
1105 | case Intrinsic::lifetime_end: |
1106 | case Intrinsic::invariant_start: |
1107 | case Intrinsic::invariant_end: |
1108 | case Intrinsic::launder_invariant_group: |
1109 | case Intrinsic::strip_invariant_group: |
1110 | case Intrinsic::objectsize: |
1111 | return true; |
1112 | default: |
1113 | return false; |
1114 | } |
1115 | } |
1116 | |
1117 | bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca( |
1118 | Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0, |
1119 | int OpIdx1) const { |
1120 | // Figure out which operand is the one we might not be promoting. |
1121 | Value *OtherOp = Inst->getOperand(i: OpIdx0); |
1122 | if (Val == OtherOp) |
1123 | OtherOp = Inst->getOperand(i: OpIdx1); |
1124 | |
1125 | if (isa<ConstantPointerNull>(Val: OtherOp)) |
1126 | return true; |
1127 | |
1128 | Value *OtherObj = getUnderlyingObject(V: OtherOp); |
1129 | if (!isa<AllocaInst>(Val: OtherObj)) |
1130 | return false; |
1131 | |
1132 | // TODO: We should be able to replace undefs with the right pointer type. |
1133 | |
1134 | // TODO: If we know the other base object is another promotable |
1135 | // alloca, not necessarily this alloca, we can do this. The |
1136 | // important part is both must have the same address space at |
1137 | // the end. |
1138 | if (OtherObj != BaseAlloca) { |
1139 | LLVM_DEBUG( |
1140 | dbgs() << "Found a binary instruction with another alloca object\n" ); |
1141 | return false; |
1142 | } |
1143 | |
1144 | return true; |
1145 | } |
1146 | |
1147 | bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( |
1148 | Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const { |
1149 | |
1150 | for (User *User : Val->users()) { |
1151 | if (is_contained(Range&: WorkList, Element: User)) |
1152 | continue; |
1153 | |
1154 | if (CallInst *CI = dyn_cast<CallInst>(Val: User)) { |
1155 | if (!isCallPromotable(CI)) |
1156 | return false; |
1157 | |
1158 | WorkList.push_back(x: User); |
1159 | continue; |
1160 | } |
1161 | |
1162 | Instruction *UseInst = cast<Instruction>(Val: User); |
1163 | if (UseInst->getOpcode() == Instruction::PtrToInt) |
1164 | return false; |
1165 | |
1166 | if (LoadInst *LI = dyn_cast<LoadInst>(Val: UseInst)) { |
1167 | if (LI->isVolatile()) |
1168 | return false; |
1169 | |
1170 | continue; |
1171 | } |
1172 | |
1173 | if (StoreInst *SI = dyn_cast<StoreInst>(Val: UseInst)) { |
1174 | if (SI->isVolatile()) |
1175 | return false; |
1176 | |
1177 | // Reject if the stored value is not the pointer operand. |
1178 | if (SI->getPointerOperand() != Val) |
1179 | return false; |
1180 | } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UseInst)) { |
1181 | if (RMW->isVolatile()) |
1182 | return false; |
1183 | } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(Val: UseInst)) { |
1184 | if (CAS->isVolatile()) |
1185 | return false; |
1186 | } |
1187 | |
1188 | // Only promote a select if we know that the other select operand |
1189 | // is from another pointer that will also be promoted. |
1190 | if (ICmpInst *ICmp = dyn_cast<ICmpInst>(Val: UseInst)) { |
1191 | if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: ICmp, OpIdx0: 0, OpIdx1: 1)) |
1192 | return false; |
1193 | |
1194 | // May need to rewrite constant operands. |
1195 | WorkList.push_back(x: ICmp); |
1196 | } |
1197 | |
1198 | if (UseInst->getOpcode() == Instruction::AddrSpaceCast) { |
1199 | // Give up if the pointer may be captured. |
1200 | if (PointerMayBeCaptured(V: UseInst, ReturnCaptures: true, StoreCaptures: true)) |
1201 | return false; |
1202 | // Don't collect the users of this. |
1203 | WorkList.push_back(x: User); |
1204 | continue; |
1205 | } |
1206 | |
1207 | // Do not promote vector/aggregate type instructions. It is hard to track |
1208 | // their users. |
1209 | if (isa<InsertValueInst>(Val: User) || isa<InsertElementInst>(Val: User)) |
1210 | return false; |
1211 | |
1212 | if (!User->getType()->isPointerTy()) |
1213 | continue; |
1214 | |
1215 | if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: UseInst)) { |
1216 | // Be conservative if an address could be computed outside the bounds of |
1217 | // the alloca. |
1218 | if (!GEP->isInBounds()) |
1219 | return false; |
1220 | } |
1221 | |
1222 | // Only promote a select if we know that the other select operand is from |
1223 | // another pointer that will also be promoted. |
1224 | if (SelectInst *SI = dyn_cast<SelectInst>(Val: UseInst)) { |
1225 | if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: SI, OpIdx0: 1, OpIdx1: 2)) |
1226 | return false; |
1227 | } |
1228 | |
1229 | // Repeat for phis. |
1230 | if (PHINode *Phi = dyn_cast<PHINode>(Val: UseInst)) { |
1231 | // TODO: Handle more complex cases. We should be able to replace loops |
1232 | // over arrays. |
1233 | switch (Phi->getNumIncomingValues()) { |
1234 | case 1: |
1235 | break; |
1236 | case 2: |
1237 | if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: Phi, OpIdx0: 0, OpIdx1: 1)) |
1238 | return false; |
1239 | break; |
1240 | default: |
1241 | return false; |
1242 | } |
1243 | } |
1244 | |
1245 | WorkList.push_back(x: User); |
1246 | if (!collectUsesWithPtrTypes(BaseAlloca, Val: User, WorkList)) |
1247 | return false; |
1248 | } |
1249 | |
1250 | return true; |
1251 | } |
1252 | |
1253 | bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { |
1254 | |
1255 | FunctionType *FTy = F.getFunctionType(); |
1256 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); |
1257 | |
1258 | // If the function has any arguments in the local address space, then it's |
1259 | // possible these arguments require the entire local memory space, so |
1260 | // we cannot use local memory in the pass. |
1261 | for (Type *ParamTy : FTy->params()) { |
1262 | PointerType *PtrTy = dyn_cast<PointerType>(Val: ParamTy); |
1263 | if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { |
1264 | LocalMemLimit = 0; |
1265 | LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to " |
1266 | "local memory disabled.\n" ); |
1267 | return false; |
1268 | } |
1269 | } |
1270 | |
1271 | LocalMemLimit = ST.getAddressableLocalMemorySize(); |
1272 | if (LocalMemLimit == 0) |
1273 | return false; |
1274 | |
1275 | SmallVector<const Constant *, 16> Stack; |
1276 | SmallPtrSet<const Constant *, 8> VisitedConstants; |
1277 | SmallPtrSet<const GlobalVariable *, 8> UsedLDS; |
1278 | |
1279 | auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool { |
1280 | for (const User *U : Val->users()) { |
1281 | if (const Instruction *Use = dyn_cast<Instruction>(Val: U)) { |
1282 | if (Use->getParent()->getParent() == &F) |
1283 | return true; |
1284 | } else { |
1285 | const Constant *C = cast<Constant>(Val: U); |
1286 | if (VisitedConstants.insert(Ptr: C).second) |
1287 | Stack.push_back(Elt: C); |
1288 | } |
1289 | } |
1290 | |
1291 | return false; |
1292 | }; |
1293 | |
1294 | for (GlobalVariable &GV : Mod->globals()) { |
1295 | if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) |
1296 | continue; |
1297 | |
1298 | if (visitUsers(&GV, &GV)) { |
1299 | UsedLDS.insert(Ptr: &GV); |
1300 | Stack.clear(); |
1301 | continue; |
1302 | } |
1303 | |
1304 | // For any ConstantExpr uses, we need to recursively search the users until |
1305 | // we see a function. |
1306 | while (!Stack.empty()) { |
1307 | const Constant *C = Stack.pop_back_val(); |
1308 | if (visitUsers(&GV, C)) { |
1309 | UsedLDS.insert(Ptr: &GV); |
1310 | Stack.clear(); |
1311 | break; |
1312 | } |
1313 | } |
1314 | } |
1315 | |
1316 | const DataLayout &DL = Mod->getDataLayout(); |
1317 | SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes; |
1318 | AllocatedSizes.reserve(N: UsedLDS.size()); |
1319 | |
1320 | for (const GlobalVariable *GV : UsedLDS) { |
1321 | Align Alignment = |
1322 | DL.getValueOrABITypeAlignment(Alignment: GV->getAlign(), Ty: GV->getValueType()); |
1323 | uint64_t AllocSize = DL.getTypeAllocSize(Ty: GV->getValueType()); |
1324 | |
1325 | // HIP uses an extern unsized array in local address space for dynamically |
1326 | // allocated shared memory. In that case, we have to disable the promotion. |
1327 | if (GV->hasExternalLinkage() && AllocSize == 0) { |
1328 | LocalMemLimit = 0; |
1329 | LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated " |
1330 | "local memory. Promoting to local memory " |
1331 | "disabled.\n" ); |
1332 | return false; |
1333 | } |
1334 | |
1335 | AllocatedSizes.emplace_back(Args&: AllocSize, Args&: Alignment); |
1336 | } |
1337 | |
1338 | // Sort to try to estimate the worst case alignment padding |
1339 | // |
1340 | // FIXME: We should really do something to fix the addresses to a more optimal |
1341 | // value instead |
1342 | llvm::sort(C&: AllocatedSizes, Comp: llvm::less_second()); |
1343 | |
1344 | // Check how much local memory is being used by global objects |
1345 | CurrentLocalMemUsage = 0; |
1346 | |
1347 | // FIXME: Try to account for padding here. The real padding and address is |
1348 | // currently determined from the inverse order of uses in the function when |
1349 | // legalizing, which could also potentially change. We try to estimate the |
1350 | // worst case here, but we probably should fix the addresses earlier. |
1351 | for (auto Alloc : AllocatedSizes) { |
1352 | CurrentLocalMemUsage = alignTo(Size: CurrentLocalMemUsage, A: Alloc.second); |
1353 | CurrentLocalMemUsage += Alloc.first; |
1354 | } |
1355 | |
1356 | unsigned MaxOccupancy = |
1357 | ST.getOccupancyWithLocalMemSize(Bytes: CurrentLocalMemUsage, F); |
1358 | |
1359 | // Restrict local memory usage so that we don't drastically reduce occupancy, |
1360 | // unless it is already significantly reduced. |
1361 | |
  // TODO: Have some sort of hint or other heuristics to guess occupancy based
  // on other factors.
1364 | unsigned OccupancyHint = ST.getWavesPerEU(F).second; |
1365 | if (OccupancyHint == 0) |
1366 | OccupancyHint = 7; |

  // Clamp to max value.
  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());

  // Check the hint but ignore it if it's obviously wrong from the existing LDS
  // usage.
  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount =
      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // Program is possibly broken by using more local mem than available.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << " Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}

// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
                                                    bool SufficientLDS) {
  LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');

  if (DisablePromoteAllocaToLDS) {
    LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
    return false;
  }

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(&I);

  const Function &ContainingFunction = *I.getParent()->getParent();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  // Don't promote the alloca to LDS for shader calling conventions, as the
  // work item ID intrinsics are not supported for these calling conventions.
  // Furthermore, not all LDS is available for some of the stages.
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return false;
  }

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

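  // Size the LDS replacement for the maximum flat work group size, since every
  // work item in the group needs its own copy of the alloca.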
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  Align Alignment =
      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the memory,
  // we could end up using more than the maximum due to alignment padding.

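  // The LDS cost of this alloca is the current usage rounded up to its
  // alignment, plus one copy of the allocated type per work item.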
  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize =
      WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << " " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  std::vector<Value *> WorkList;

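  // Collect every pointer-typed use of the alloca that we know how to rewrite
  // into the local address space; give up on the promotion otherwise.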
  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();

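  // Replace the alloca with an LDS array holding one element per work item.
  // For example (a sketch, assuming a 256-wide work group and
  // `%a = alloca [4 x i32], align 4` in function `foo`), this creates:
  //   @foo.a = internal unnamed_addr addrspace(3) global [256 x [4 x i32]]
  //              poison, align 4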
  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(I.getAlign());

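  // Compute a linearized work item ID with which to index this work item's
  // slot in the array: TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ.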
  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

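  // Index the per-work-item element of the new LDS array and replace every use
  // of the alloca with that address, which now lives in the local address
  // space.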
  LLVMContext &Context = Mod->getContext();
  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

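  // Fix up the uses collected earlier: plain pointer instructions get their
  // types and constant operands switched to the local address space, while
  // intrinsic calls are recreated so their signatures match the new pointers.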
  SmallVector<IntrinsicInst *> DeferredIntrs;

  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);

        if (isa<ConstantPointerNull>(CI->getOperand(0)))
          CI->setOperand(0, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(CI->getOperand(1)))
          CI->setOperand(1, ConstantPointerNull::get(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want to
      // touch the users.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull>(SI->getOperand(1)))
          SI->setOperand(1, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(SI->getOperand(2)))
          SI->setOperand(2, ConstantPointerNull::get(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
        }
      }

      continue;
    }

    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // These have 2 pointer operands. In case the second pointer also needs
      // to be replaced, we defer processing of these intrinsics until all
      // other values are processed.
      DeferredIntrs.push_back(Intr);
      continue;
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getDestAlign(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);
      Function *ObjectSize = Intrinsic::getDeclaration(
          Mod, Intrinsic::objectsize,
          {Intr->getType(),
           PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)});

      CallInst *NewCall = Builder.CreateCall(
          ObjectSize,
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }

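  // Now that every pointer operand has been rewritten, recreate the deferred
  // memcpy/memmove calls so both source and destination carry their final
  // address spaces.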
  for (IntrinsicInst *Intr : DeferredIntrs) {
    Builder.SetInsertPoint(Intr);
    Intrinsic::ID ID = Intr->getIntrinsicID();
    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

    MemTransferInst *MI = cast<MemTransferInst>(Intr);
    auto *B = Builder.CreateMemTransferInst(
        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());

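    // Carry over the dereferenceable bytes of both pointer arguments from the
    // original intrinsic to the newly created call.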
    for (unsigned I = 0; I != 2; ++I) {
      if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
        B->addDereferenceableParamAttr(I, Bytes);
      }
    }

    Intr->eraseFromParent();
  }

  return true;
}
