1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
// declared in AMDGPU.h and the legacy PM ones are declared here.
25//
26//===----------------------------------------------------------------------===//
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/Analysis/CaptureTracking.h"
33#include "llvm/Analysis/InstSimplifyFolder.h"
34#include "llvm/Analysis/InstructionSimplify.h"
35#include "llvm/Analysis/LoopInfo.h"
36#include "llvm/Analysis/ValueTracking.h"
37#include "llvm/CodeGen/TargetPassConfig.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/PatternMatch.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Support/MathExtras.h"
46#include "llvm/Target/TargetMachine.h"
47#include "llvm/Transforms/Utils/SSAUpdater.h"
48
49#define DEBUG_TYPE "amdgpu-promote-alloca"
50
51using namespace llvm;
52
53namespace {
54
// Escape hatches and tuning knobs for the two promotion strategies.
static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(Val: false));

static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(Val: false));

// 0 means "derive the limit from the VGPR budget instead" (see run()).
static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(Val: 0));

static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
    "amdgpu-promote-alloca-to-vector-max-regs",
    cl::desc(
        "Maximum vector size (in 32b registers) to use when promoting alloca"),
    cl::init(Val: 32));

// Use up to 1/4 of available register budget for vectorization.
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
    cl::init(Val: 4));

// Bonus score added per loop-nest level for users inside loops; consumed by
// scoreAlloca() when ordering allocas by profitability.
static cl::opt<unsigned>
    LoopUserWeight("promote-alloca-vector-loop-user-weight",
                   cl::desc("The bonus weight of users of allocas within loop "
                            "when sorting profitable allocas"),
                   cl::init(Val: 4));
88
// We support vector indices of the form ((A * stride) >> shift) + B
// VarIndex is A, VarMul is stride, VarShift is shift and ConstIndex is B. All
// parts are optional.
struct GEPToVectorIndex {
  // WeakTrackingVH tracks RAUW and nulls itself on deletion, so a stale index
  // value cannot dangle between analysis and rewriting.
  WeakTrackingVH VarIndex = nullptr;   // defaults to 0
  ConstantInt *VarMul = nullptr;       // defaults to 1
  ConstantInt *VarShift = nullptr;     // defaults to 0
  ConstantInt *ConstIndex = nullptr;   // defaults to 0
  // Memoized materialization of the whole index expression; built lazily on
  // first use by calculateVectorIndex().
  Value *Full = nullptr;
};
99
// Element indices at which a memcpy/memmove reads from and writes to the
// promoted vector (used as SrcBegin/DestBegin in promoteAllocaUserToVector).
struct MemTransferInfo {
  ConstantInt *SrcIndex = nullptr;
  ConstantInt *DestIndex = nullptr;
};
104
// Analysis for planning the different strategies of alloca promotion.
// Filled in by collectAllocaUses() plus the analyze* methods; consumed by the
// promotion routines.
struct AllocaAnalysis {
  AllocaInst *Alloca = nullptr;
  // The alloca plus every pointer transitively derived from it (via GEPs,
  // selects and phis); see collectAllocaUses().
  DenseSet<Value *> Pointers;
  // Every use of any pointer in Pointers.
  SmallVector<Use *> Uses;
  // Profitability score used to order allocas; see scoreAlloca().
  unsigned Score = 0;
  // True if some derived pointer flows through a select or phi.
  bool HaveSelectOrPHI = false;
  // State for promotion to a vector; Ty is null when vector promotion is not
  // possible.
  struct {
    FixedVectorType *Ty = nullptr;
    SmallVector<Instruction *> Worklist;
    SmallVector<Instruction *> UsersToRemove;
    MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
    MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
  } Vector;
  // State for promotion to LDS; Enable is false when LDS promotion is not
  // possible.
  struct {
    bool Enable = false;
    SmallVector<User *> Worklist;
  } LDS;

  explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
};
126
// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  LoopInfo &LI;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;

  // Per-function limits; recomputed in run() via getMaxVGPRs() and
  // setFunctionLimits().
  unsigned MaxVGPRs;
  unsigned VGPRBudgetRatio;
  unsigned MaxVectorRegs;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  // IR-emission helpers; definitions are outside this chunk (presumably used
  // by the LDS path — confirm against the rest of the file).
  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  // Gather all transitive uses of AA.Alloca into AA.Uses/AA.Pointers.
  // Returns false if the pointer escapes analysis (e.g. is stored somewhere).
  bool collectAllocaUses(AllocaAnalysis &AA) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val should
  /// be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst, int OpIdx0,
                                       int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  // Vector-promotion planning (fills AA.Vector) and rewriting.
  FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
  void analyzePromoteToVector(AllocaAnalysis &AA) const;
  void promoteAllocaToVector(AllocaAnalysis &AA);
  // LDS-promotion planning (fills AA.LDS) and rewriting.
  void analyzePromoteToLDS(AllocaAnalysis &AA) const;
  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
                             SetVector<IntrinsicInst *> &DeferredIntrs);
  void
  finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);

  // Compute AA.Score, used to process the most profitable allocas first.
  void scoreAlloca(AllocaAnalysis &AA) const;

  // Load per-function attribute overrides for MaxVectorRegs/VGPRBudgetRatio.
  void setFunctionLimits(const Function &F);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {

    const Triple &TT = TM.getTargetTriple();
    IsAMDGCN = TT.isAMDGCN();
    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
  }

  // Entry point shared by all four pass wrappers. Returns true if the
  // function was modified.
  bool run(Function &F, bool PromoteToLDS);
};
184
// Legacy-PM wrapper around AMDGPUPromoteAllocaImpl (vector + LDS promotion).
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;
    // Without a TargetPassConfig there is no TargetMachine to query, so the
    // pass silently does nothing.
    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
      return AMDGPUPromoteAllocaImpl(
                 TPC->getTM<TargetMachine>(),
                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
          .run(F, /*PromoteToLDS*/ true);
    return false;
  }

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Allocas are rewritten in place; the CFG is never restructured.
    AU.setPreservesCFG();
    AU.addRequired<LoopInfoWrapperPass>();
    FunctionPass::getAnalysisUsage(AU);
  }
};
211
212static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
213 const Function &F) {
214 if (!TM.getTargetTriple().isAMDGCN())
215 return 128;
216
217 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
218
219 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
220 // Temporarily check both the attribute and the subtarget feature, until the
221 // latter is removed.
222 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
223 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
224
225 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
226 WavesPerEU: ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
227 DynamicVGPRBlockSize);
228
229 // A non-entry function has only 32 caller preserved registers.
230 // Do not promote alloca which will force spilling unless we know the function
231 // will be inlined.
232 if (!F.hasFnAttribute(Kind: Attribute::AlwaysInline) &&
233 !AMDGPU::isEntryFunctionCC(CC: F.getCallingConv()))
234 MaxVGPRs = std::min(a: MaxVGPRs, b: 32u);
235 return MaxVGPRs;
236}
237
238} // end anonymous namespace
239
char AMDGPUPromoteAlloca::ID = 0;

// Register the legacy pass and its analysis dependencies.
INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

// Pass ID exposed for pipeline construction.
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
252
253PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
254 FunctionAnalysisManager &AM) {
255 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
256 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
257 if (Changed) {
258 PreservedAnalyses PA;
259 PA.preserveSet<CFGAnalyses>();
260 return PA;
261 }
262 return PreservedAnalyses::all();
263}
264
265PreservedAnalyses
266AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
267 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
268 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
269 if (Changed) {
270 PreservedAnalyses PA;
271 PA.preserveSet<CFGAnalyses>();
272 return PA;
273 }
274 return PreservedAnalyses::all();
275}
276
// Factory for the legacy-PM pass.
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  return new AMDGPUPromoteAlloca();
}
280
281bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
282 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
283 LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
284 << " " << *Inst << "\n");
285 return false;
286 };
287
288 SmallVector<Instruction *, 4> WorkList({AA.Alloca});
289 while (!WorkList.empty()) {
290 auto *Cur = WorkList.pop_back_val();
291 if (find(Range&: AA.Pointers, Val: Cur) != AA.Pointers.end())
292 continue;
293 AA.Pointers.insert(V: Cur);
294 for (auto &U : Cur->uses()) {
295 auto *Inst = cast<Instruction>(Val: U.getUser());
296 if (isa<StoreInst>(Val: Inst)) {
297 if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
298 return RejectUser(Inst, "pointer escapes via store");
299 }
300 }
301 AA.Uses.push_back(Elt: &U);
302
303 if (isa<GetElementPtrInst>(Val: U.getUser())) {
304 WorkList.push_back(Elt: Inst);
305 } else if (auto *SI = dyn_cast<SelectInst>(Val: Inst)) {
306 // Only promote a select if we know that the other select operand is
307 // from another pointer that will also be promoted.
308 if (!binaryOpIsDerivedFromSameAlloca(Alloca: AA.Alloca, Val: Cur, UseInst: SI, OpIdx0: 1, OpIdx1: 2))
309 return RejectUser(Inst, "select from mixed objects");
310 WorkList.push_back(Elt: Inst);
311 AA.HaveSelectOrPHI = true;
312 } else if (auto *Phi = dyn_cast<PHINode>(Val: Inst)) {
313 // Repeat for phis.
314
315 // TODO: Handle more complex cases. We should be able to replace loops
316 // over arrays.
317 switch (Phi->getNumIncomingValues()) {
318 case 1:
319 break;
320 case 2:
321 if (!binaryOpIsDerivedFromSameAlloca(Alloca: AA.Alloca, Val: Cur, UseInst: Phi, OpIdx0: 0, OpIdx1: 1))
322 return RejectUser(Inst, "phi from mixed objects");
323 break;
324 default:
325 return RejectUser(Inst, "phi with too many operands");
326 }
327
328 WorkList.push_back(Elt: Inst);
329 AA.HaveSelectOrPHI = true;
330 }
331 }
332 }
333 return true;
334}
335
336void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
337 LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
338 unsigned Score = 0;
339 // Increment score by one for each user + a bonus for users within loops.
340 for (auto *U : AA.Uses) {
341 Instruction *Inst = cast<Instruction>(Val: U->getUser());
342 if (isa<GetElementPtrInst>(Val: Inst) || isa<SelectInst>(Val: Inst) ||
343 isa<PHINode>(Val: Inst))
344 continue;
345 unsigned UserScore =
346 1 + (LoopUserWeight * LI.getLoopDepth(BB: Inst->getParent()));
347 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
348 Score += UserScore;
349 }
350 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
351 AA.Score = Score;
352}
353
354void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
355 // Load per function limits, overriding with global options where appropriate.
356 // R600 register tuples/aliasing are fragile with large vector promotions so
357 // apply architecture specific limit here.
358 const int R600MaxVectorRegs = 16;
359 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
360 Kind: "amdgpu-promote-alloca-to-vector-max-regs",
361 Default: IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
362 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
363 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
364 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
365 Kind: "amdgpu-promote-alloca-to-vector-vgpr-ratio",
366 Default: PromoteAllocaToVectorVGPRRatio);
367 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
368 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
369}
370
// Drive promotion for every eligible alloca in \p F: analyze, score, sort by
// profitability, then spend the vectorization budget greedily, falling back
// to LDS promotion when enabled.
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
  if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
    return false;

  Mod = F.getParent();
  DL = &Mod->getDataLayout();

  // hasSufficientLocalMem also updates CurrentLocalMemUsage, which in turn
  // constrains the VGPR budget below.
  bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
  MaxVGPRs = getMaxVGPRs(LDSBytes: CurrentLocalMemUsage, TM, F);
  setFunctionLimits(F);

  // Budget is in bits: either the user-provided byte limit, or a
  // 1/VGPRBudgetRatio share of the available 32-bit VGPRs.
  unsigned VectorizationBudget =
      (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                  : (MaxVGPRs * 32)) /
      VGPRBudgetRatio;

  std::vector<AllocaAnalysis> Allocas;
  for (Instruction &I : F.getEntryBlock()) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: &I)) {
      // Array allocations are probably not worth handling, since an allocation
      // of the array type is the canonical form.
      if (!AI->isStaticAlloca() || AI->isArrayAllocation())
        continue;

      LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');

      AllocaAnalysis AA{AI};
      if (collectAllocaUses(AA)) {
        analyzePromoteToVector(AA);
        if (PromoteToLDS)
          analyzePromoteToLDS(AA);
        // Keep only allocas at least one strategy can handle.
        if (AA.Vector.Ty || AA.LDS.Enable) {
          scoreAlloca(AA);
          Allocas.push_back(x: std::move(AA));
        }
      }
    }
  }

  // Highest score first, so the most profitable allocas get the budget.
  stable_sort(Range&: Allocas,
              C: [](const auto &A, const auto &B) { return A.Score > B.Score; });

  // clang-format off
  LLVM_DEBUG(
    dbgs() << "Sorted Worklist:\n";
    for (const auto &AA : Allocas)
      dbgs() << " " << *AA.Alloca << "\n";
  );
  // clang-format on

  bool Changed = false;
  SetVector<IntrinsicInst *> DeferredIntrs;
  for (AllocaAnalysis &AA : Allocas) {
    if (AA.Vector.Ty) {
      std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(DL: *DL);
      assert(Size); // Expected to succeed on non-array alloca.
      const unsigned AllocaCost = Size->getFixedValue() * 8;
      // First, check if we have enough budget to vectorize this alloca.
      if (AllocaCost <= VectorizationBudget) {
        promoteAllocaToVector(AA);
        Changed = true;
        assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
               "Underflow!");
        VectorizationBudget -= AllocaCost;
        LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
                          << VectorizationBudget << "\n");
        continue;
      } else {
        LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
                          << AllocaCost << ", budget:" << VectorizationBudget
                          << "): " << *AA.Alloca << "\n");
      }
    }

    // Vectorization was impossible or over budget; try LDS instead.
    if (AA.LDS.Enable &&
        tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
      Changed = true;
  }
  finishDeferredAllocaToLDSPromotion(DeferredIntrs);

  // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
  // dangling pointers. If we want to reuse it past this point, the loop above
  // would need to be updated to remove successfully promoted allocas.

  return Changed;
}
457
458// Checks if the instruction I is a memset user of the alloca AI that we can
459// deal with. Currently, only non-volatile memsets that affect the whole alloca
460// are handled.
461static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
462 const DataLayout &DL) {
463 using namespace PatternMatch;
464 // For now we only care about non-volatile memsets that affect the whole type
465 // (start at index 0 and fill the whole alloca).
466 //
467 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
468 // (except maybe volatile ones?) - we just need to use shufflevector if it
469 // only affects a subset of the vector.
470 const unsigned Size = DL.getTypeStoreSize(Ty: AI->getAllocatedType());
471 return I->getOperand(i_nocapture: 0) == AI &&
472 match(V: I->getOperand(i_nocapture: 2), P: m_SpecificInt(V: Size)) && !I->isVolatile();
473}
474
/// Compute the i32 element index that \p Ptr addresses within the promoted
/// vector. \p Ptr must be the alloca itself or a GEP recorded in
/// AA.Vector.GEPVectorIdx. The materialized index expression is memoized in
/// GEPToVectorIndex::Full so later queries for the same GEP reuse it.
static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
  IRBuilder<> B(Ptr->getContext());

  Ptr = Ptr->stripPointerCasts();
  // Direct access through the alloca addresses element 0.
  if (Ptr == AA.Alloca)
    return B.getInt32(C: 0);

  auto *GEP = cast<GetElementPtrInst>(Val: Ptr);
  auto I = AA.Vector.GEPVectorIdx.find(Key: GEP);
  assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");

  if (!I->second.Full) {
    // First query: materialize ((VarIndex * VarMul) >> VarShift) + ConstIndex,
    // omitting whichever parts are absent. Instructions are inserted right at
    // the GEP so the index dominates every use of the GEP.
    Value *Result = nullptr;
    B.SetInsertPoint(GEP);

    if (I->second.VarIndex) {
      Result = I->second.VarIndex;
      Result = B.CreateSExtOrTrunc(V: Result, DestTy: B.getInt32Ty());

      if (I->second.VarMul)
        Result = B.CreateMul(LHS: Result, RHS: I->second.VarMul);

      if (I->second.VarShift)
        Result = B.CreateAShr(LHS: Result, RHS: I->second.VarShift, Name: "", /*isExact*/ true);
    }

    if (I->second.ConstIndex) {
      if (Result)
        Result = B.CreateAdd(LHS: Result, RHS: I->second.ConstIndex);
      else
        Result = I->second.ConstIndex;
    }

    // No parts at all: the GEP has a zero offset, index 0.
    if (!Result)
      Result = B.getInt32(C: 0);

    I->second.Full = Result;
  }

  return I->second.Full;
}
516
/// Try to express \p GEP (possibly a chain of GEPs ultimately based on
/// \p Alloca) as a vector index of the form ((A * stride) >> shift) + B,
/// measured in \p VecElemTy elements. Returns an empty optional when the
/// offsets cannot be put into that shape.
static std::optional<GEPToVectorIndex>
computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
                        Type *VecElemTy, const DataLayout &DL) {
  // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
  // helper.
  LLVMContext &Ctx = GEP->getContext();
  unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType());
  SmallMapVector<Value *, APInt, 4> VarOffsets;
  APInt ConstOffset(BW, 0);

  // Walk backwards through nested GEPs to collect both constant and variable
  // offsets, so that nested vector GEP chains can be lowered in one step.
  //
  // Given this IR fragment as input:
  //
  //   %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
  //   %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
  //   %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
  //   %3 = load i32, ptr addrspace(5) %2, align 4
  //
  // Combine both GEP operations in a single pass, producing:
  //   BasePtr      = %0
  //   ConstOffset  = 4
  //   VarOffsets   = { %j -> element_size(<2 x i32>) }
  //
  // That lets us emit a single buffer_load directly into a VGPR, without ever
  // allocating scratch memory for the intermediate pointer.
  Value *CurPtr = GEP;
  while (auto *CurGEP = dyn_cast<GetElementPtrInst>(Val: CurPtr)) {
    // Give up if this GEP's offset cannot be decomposed into the maps.
    if (!CurGEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ConstOffset))
      return {};

    // Move to the next outer pointer.
    CurPtr = CurGEP->getPointerOperand();
  }

  assert(CurPtr == Alloca && "GEP not based on alloca");

  int64_t VecElemSize = DL.getTypeAllocSize(Ty: VecElemTy);
  // The supported form has a single variable term at most.
  if (VarOffsets.size() > 1)
    return {};

  // We support vector indices of the form ((VarIndex * stride) >> shift) + B.
  // IndexQuot represents B. Check that the constant offset is a multiple
  // of the vector element size.
  if (ConstOffset.srem(RHS: VecElemSize) != 0)
    return {};
  APInt IndexQuot = ConstOffset.sdiv(RHS: VecElemSize);

  GEPToVectorIndex Result;

  if (!ConstOffset.isZero())
    Result.ConstIndex = ConstantInt::get(Context&: Ctx, V: IndexQuot.sextOrTrunc(width: BW));

  // If there are no variable offsets, only a constant offset, then we're done.
  if (VarOffsets.empty())
    return Result;

  // Scale is the stride in the (A * stride) part. Check that there is only one
  // variable offset and extract the scale factor.
  const auto &VarOffset = VarOffsets.front();
  auto ScaleOpt = VarOffset.second.tryZExtValue();
  // Reject strides that are zero or do not fit in 64 bits.
  if (!ScaleOpt || *ScaleOpt == 0)
    return {};

  uint64_t Scale = *ScaleOpt;
  Result.VarIndex = VarOffset.first;
  auto *OffsetType = dyn_cast<IntegerType>(Val: Result.VarIndex->getType());
  if (!OffsetType)
    return {};

  // The vector index for the variable part is: VarIndex * Scale / VecElemSize.
  if (Scale >= (uint64_t)VecElemSize) {
    if (Scale % VecElemSize != 0)
      return {};

    // Scale is a multiple of VecElemSize, so the index is just: VarIndex *
    // (Scale / VecElemSize).
    uint64_t VarMul = Scale / VecElemSize;
    // Only the multiplier is needed.
    if (VarMul != 1)
      Result.VarMul = ConstantInt::get(Context&: Ctx, V: APInt(BW, VarMul));
  } else {
    if ((uint64_t)VecElemSize % Scale != 0)
      return {};

    // VecElemSize is a multiple of Scale, so the index is just: VarIndex /
    // (VecElemSize / Scale).
    uint64_t Divisor = VecElemSize / Scale;
    // The divisor must be a power of 2 so we can use a right shift.
    if (!isPowerOf2_64(Value: Divisor))
      return {};

    // VarIndex must be known to be divisible by that divisor.
    KnownBits KB = computeKnownBits(V: VarOffset.first, DL);
    if (KB.countMinTrailingZeros() < Log2_64(Value: Divisor))
      return {};

    Result.VarShift = ConstantInt::get(Context&: Ctx, V: APInt(BW, Log2_64(Value: Divisor)));
  }

  return Result;
}
620
/// Promotes a single user of the alloca to a vector form.
///
/// \param Inst Instruction to be promoted.
/// \param DL Module Data Layout.
/// \param AA Alloca Analysis.
/// \param VecStoreSize Size of \p VectorTy in bytes.
/// \param ElementSize Size of \p VectorTy element type in bytes.
/// \param GetCurVal Lazily produces the current value of the vector (e.g. the
///        last stored value); only invoked when the rewrite actually needs it.
/// \return the stored value if \p Inst would have written to the alloca, or
///         nullptr otherwise.
static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
                                        AllocaAnalysis &AA,
                                        unsigned VecStoreSize,
                                        unsigned ElementSize,
                                        function_ref<Value *()> GetCurVal) {
  // Note: we use InstSimplifyFolder because it can leverage the DataLayout
  // to do more folding, especially in the case of vector splats.
  IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
                                        InstSimplifyFolder(DL));
  Builder.SetInsertPoint(Inst);

  Type *VecEltTy = AA.Vector.Ty->getElementType();

  switch (Inst->getOpcode()) {
  case Instruction::Load: {
    Value *CurVal = GetCurVal();
    Value *Index =
        calculateVectorIndex(Ptr: cast<LoadInst>(Val: Inst)->getPointerOperand(), AA);

    // We're loading the full vector.
    Type *AccessTy = Inst->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
    if (Constant *CI = dyn_cast<Constant>(Val: Index)) {
      if (CI->isNullValue() && AccessSize == VecStoreSize) {
        Inst->replaceAllUsesWith(
            V: Builder.CreateBitPreservingCastChain(DL, V: CurVal, NewTy: AccessTy));
        return nullptr;
      }
    }

    // Loading a subvector.
    if (isa<FixedVectorType>(Val: AccessTy)) {
      assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
      const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
      auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumLoadedElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      // If idx is dynamic, then sandwich load with bitcasts.
      // ie. VectorTy          SubVecTy    AccessTy
      //     <64 x i8>      -> <16 x i8>   <8 x i16>
      //     <64 x i8> -> <4 x i128> -> i128 -> <8 x i16>
      // Extracting subvector with dynamic index has very large expansion in
      // the amdgpu backend. Limit to pow2.
      FixedVectorType *VectorTy = AA.Vector.Ty;
      TypeSize NumBits = DL.getTypeStoreSize(Ty: SubVecTy) * 8u;
      uint64_t LoadAlign = cast<LoadInst>(Val: Inst)->getAlign().value();
      bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
      unsigned TotalNumElts = VectorTy->getNumElements();
      bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
      if (!isa<ConstantInt>(Val: Index) &&
          llvm::isPowerOf2_32(Value: SubVecTy->getNumElements()) &&
          IsProperlyDivisible && IsAlignedLoad) {
        // View the whole vector as NewNumElts lanes of NumBits each; the
        // subvector load becomes a single dynamic extract of one wide lane.
        IntegerType *NewElemTy = Builder.getIntNTy(N: NumBits);
        const unsigned NewNumElts =
            DL.getTypeStoreSize(Ty: VectorTy) * 8u / NumBits;
        const unsigned LShrAmt = llvm::Log2_32(Value: SubVecTy->getNumElements());
        FixedVectorType *BitCastTy =
            FixedVectorType::get(ElementType: NewElemTy, NumElts: NewNumElts);
        Value *BCVal = Builder.CreateBitCast(V: CurVal, DestTy: BitCastTy);
        Value *NewIdx = Builder.CreateLShr(
            LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: LShrAmt));
        Value *ExtVal = Builder.CreateExtractElement(Vec: BCVal, Idx: NewIdx);
        Value *BCOut = Builder.CreateBitCast(V: ExtVal, DestTy: AccessTy);
        Inst->replaceAllUsesWith(V: BCOut);
        return nullptr;
      }

      // Otherwise extract the loaded elements one by one.
      Value *SubVec = PoisonValue::get(T: SubVecTy);
      for (unsigned K = 0; K < NumLoadedElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
        SubVec = Builder.CreateInsertElement(
            Vec: SubVec, NewElt: Builder.CreateExtractElement(Vec: CurVal, Idx: CurIdx), Idx: K);
      }

      Inst->replaceAllUsesWith(
          V: Builder.CreateBitPreservingCastChain(DL, V: SubVec, NewTy: AccessTy));
      return nullptr;
    }

    // We're loading one element.
    Value *ExtractElement = Builder.CreateExtractElement(Vec: CurVal, Idx: Index);
    if (AccessTy != VecEltTy)
      ExtractElement = Builder.CreateBitOrPointerCast(V: ExtractElement, DestTy: AccessTy);

    Inst->replaceAllUsesWith(V: ExtractElement);
    return nullptr;
  }
  case Instruction::Store: {
    // For stores, it's a bit trickier and it depends on whether we're storing
    // the full vector or not. If we're storing the full vector, we don't need
    // to know the current value. If this is a store of a single element, we
    // need to know the value.
    StoreInst *SI = cast<StoreInst>(Val: Inst);
    Value *Index = calculateVectorIndex(Ptr: SI->getPointerOperand(), AA);
    Value *Val = SI->getValueOperand();

    // We're storing the full vector, we can handle this without knowing CurVal.
    Type *AccessTy = Val->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
    if (Constant *CI = dyn_cast<Constant>(Val: Index))
      if (CI->isNullValue() && AccessSize == VecStoreSize)
        return Builder.CreateBitPreservingCastChain(DL, V: Val, NewTy: AA.Vector.Ty);

    // Storing a subvector.
    if (isa<FixedVectorType>(Val: AccessTy)) {
      assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
      const unsigned NumWrittenElts =
          AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
      const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
      auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumWrittenElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      Val = Builder.CreateBitPreservingCastChain(DL, V: Val, NewTy: SubVecTy);
      Value *CurVec = GetCurVal();
      // Clamp to the vector size so an over-long write cannot index past the
      // end of the promoted vector.
      for (unsigned K = 0, NumElts = std::min(a: NumWrittenElts, b: NumVecElts);
           K < NumElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
        CurVec = Builder.CreateInsertElement(
            Vec: CurVec, NewElt: Builder.CreateExtractElement(Vec: Val, Idx: K), Idx: CurIdx);
      }
      return CurVec;
    }

    // Single-element store.
    if (Val->getType() != VecEltTy)
      Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: VecEltTy);
    return Builder.CreateInsertElement(Vec: GetCurVal(), NewElt: Val, Idx: Index);
  }
  case Instruction::Call: {
    if (auto *MTI = dyn_cast<MemTransferInst>(Val: Inst)) {
      // For memcpy, we need to know curval.
      ConstantInt *Length = cast<ConstantInt>(Val: MTI->getLength());
      unsigned NumCopied = Length->getZExtValue() / ElementSize;
      MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
      unsigned SrcBegin = TI->SrcIndex->getZExtValue();
      unsigned DestBegin = TI->DestIndex->getZExtValue();

      // Express the copy as a shuffle: destination lanes read from the source
      // range, all other lanes keep their current value.
      SmallVector<int> Mask;
      for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
        if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
          Mask.push_back(Elt: SrcBegin < AA.Vector.Ty->getNumElements()
                             ? SrcBegin++
                             : PoisonMaskElem);
        } else {
          Mask.push_back(Elt: Idx);
        }
      }

      return Builder.CreateShuffleVector(V: GetCurVal(), Mask);
    }

    if (auto *MSI = dyn_cast<MemSetInst>(Val: Inst)) {
      // For memset, we don't need to know the previous value because we
      // currently only allow memsets that cover the whole alloca.
      Value *Elt = MSI->getOperand(i_nocapture: 1);
      const unsigned BytesPerElt = DL.getTypeStoreSize(Ty: VecEltTy);
      if (BytesPerElt > 1) {
        // Build one vector element out of the repeated memset byte.
        Value *EltBytes = Builder.CreateVectorSplat(NumElts: BytesPerElt, V: Elt);

        // If the element type of the vector is a pointer, we need to first cast
        // to an integer, then use a PtrCast.
        if (VecEltTy->isPointerTy()) {
          Type *PtrInt = Builder.getIntNTy(N: BytesPerElt * 8);
          Elt = Builder.CreateBitCast(V: EltBytes, DestTy: PtrInt);
          Elt = Builder.CreateIntToPtr(V: Elt, DestTy: VecEltTy);
        } else
          Elt = Builder.CreateBitCast(V: EltBytes, DestTy: VecEltTy);
      }

      return Builder.CreateVectorSplat(EC: AA.Vector.Ty->getElementCount(), V: Elt);
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        // The promoted vector has a known, fixed size.
        Intr->replaceAllUsesWith(
            V: Builder.getIntN(N: Intr->getType()->getIntegerBitWidth(),
                             C: DL.getTypeAllocSize(Ty: AA.Vector.Ty)));
        return nullptr;
      }
    }

    llvm_unreachable("Unsupported call when promoting alloca to vector");
  }

  default:
    llvm_unreachable("Inconsistency in instructions promotable to vector");
  }

  llvm_unreachable("Did not return after promoting instruction!");
}
825
826static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
827 const DataLayout &DL) {
828 // Access as a vector type can work if the size of the access vector is a
829 // multiple of the size of the alloca's vector element type.
830 //
831 // Examples:
832 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
833 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
834 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
835 // - 3*32 is not a multiple of 64
836 //
837 // We could handle more complicated cases, but it'd make things a lot more
838 // complicated.
839 if (isa<FixedVectorType>(Val: AccessTy)) {
840 TypeSize AccTS = DL.getTypeStoreSize(Ty: AccessTy);
841 // If the type size and the store size don't match, we would need to do more
842 // than just bitcast to translate between an extracted/insertable subvectors
843 // and the accessed value.
844 if (AccTS * 8 != DL.getTypeSizeInBits(Ty: AccessTy))
845 return false;
846 TypeSize VecTS = DL.getTypeStoreSize(Ty: VecTy->getElementType());
847 return AccTS.isKnownMultipleOf(RHS: VecTS);
848 }
849
850 return CastInst::isBitOrNoopPointerCastable(SrcTy: VecTy->getElementType(), DestTy: AccessTy,
851 DL);
852}
853
854/// Iterates over an instruction worklist that may contain multiple instructions
855/// from the same basic block, but in a different order.
856template <typename InstContainer>
857static void forEachWorkListItem(const InstContainer &WorkList,
858 std::function<void(Instruction *)> Fn) {
859 // Bucket up uses of the alloca by the block they occur in.
860 // This is important because we have to handle multiple defs/uses in a block
861 // ourselves: SSAUpdater is purely for cross-block references.
862 DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
863 for (Instruction *User : WorkList)
864 UsesByBlock[User->getParent()].insert(V: User);
865
866 for (Instruction *User : WorkList) {
867 BasicBlock *BB = User->getParent();
868 auto &BlockUses = UsesByBlock[BB];
869
870 // Already processed, skip.
871 if (BlockUses.empty())
872 continue;
873
874 // Only user in the block, directly process it.
875 if (BlockUses.size() == 1) {
876 Fn(User);
877 continue;
878 }
879
880 // Multiple users in the block, do a linear scan to see users in order.
881 for (Instruction &Inst : *BB) {
882 if (!BlockUses.contains(V: &Inst))
883 continue;
884
885 Fn(&Inst);
886 }
887
888 // Clear the block so we know it's been processed.
889 BlockUses.clear();
890 }
891}
892
893/// Find an insert point after an alloca, after all other allocas clustered at
894/// the start of the block.
895static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
896 BasicBlock::iterator I) {
897 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(Val: *I); ++I)
898 ;
899 return I;
900}
901
/// Compute the FixedVectorType an alloca of type \p AllocaTy would be promoted
/// to, or return null when promotion is disabled or unsupported.
///
/// Fixed-vector allocas are used as-is. (Nested) arrays - including arrays of
/// vectors - are flattened into a single vector of the innermost element type,
/// with the element count possibly adjusted so the vector's storage size
/// matches the alloca's storage size.
FixedVectorType *
AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
  // Respect the command-line override disabling vector promotion.
  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
    return nullptr;
  }

  auto *VectorTy = dyn_cast<FixedVectorType>(Val: AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(Val: AllocaTy)) {
    // Flatten nested arrays into a total element count over the innermost
    // element type.
    uint64_t NumElems = 1;
    Type *ElemTy;
    do {
      NumElems *= ArrayTy->getNumElements();
      ElemTy = ArrayTy->getElementType();
    } while ((ArrayTy = dyn_cast<ArrayType>(Val: ElemTy)));

    // Check for array of vectors
    auto *InnerVectorTy = dyn_cast<FixedVectorType>(Val: ElemTy);
    if (InnerVectorTy) {
      NumElems *= InnerVectorTy->getNumElements();
      ElemTy = InnerVectorTy->getElementType();
    }

    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
      unsigned ElementSize = DL->getTypeSizeInBits(Ty: ElemTy) / 8;
      if (ElementSize > 0) {
        unsigned AllocaSize = DL->getTypeStoreSize(Ty: AllocaTy);
        // Expand vector if required to match padding of inner type,
        // i.e. odd size subvectors.
        // Storage size of new vector must match that of alloca for correct
        // behaviour of byte offsets and GEP computation.
        if (NumElems * ElementSize != AllocaSize)
          NumElems = AllocaSize / ElementSize;
        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
          VectorTy = FixedVectorType::get(ElementType: ElemTy, NumElts: NumElems);
      }
    }
  }
  if (!VectorTy) {
    LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
    return nullptr;
  }

  // Cap the element count by the number of 32-bit lanes in the vector
  // register budget.
  const unsigned MaxElements =
      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(Ty: VectorTy->getElementType());

  // Single-element vectors are not worth promoting; oversized vectors would
  // cause excessive register pressure.
  if (VectorTy->getNumElements() > MaxElements ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << " " << *VectorTy
                      << " has an unsupported number of elements\n");
    return nullptr;
  }

  // Padded element types (size in bits != alloc size in bits) would break the
  // byte-offset arithmetic used when rewriting accesses, so reject them.
  Type *VecEltTy = VectorTy->getElementType();
  unsigned ElementSizeInBits = DL->getTypeSizeInBits(Ty: VecEltTy);
  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(Ty: VecEltTy)) {
    LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
                         "does not match the type's size\n");
    return nullptr;
  }

  return VectorTy;
}
965
/// Analyze whether the alloca described by \p AA can be promoted to a vector
/// and, if so, record the rewrite plan in AA.Vector (target type, worklist of
/// instructions to rewrite, per-GEP vector indices, memcpy/memmove info, and
/// users that become dead). Any unsupported user resets AA.Vector.Ty to null.
void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
  // Promotion tracks a single SSA vector value; select/phi of the alloca
  // pointer is not handled.
  if (AA.HaveSelectOrPHI) {
    LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n");
    return;
  }

  Type *AllocaTy = AA.Alloca->getAllocatedType();
  AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
  if (!AA.Vector.Ty)
    return;

  // Abandon the promotion: log the reason and clear the chosen vector type.
  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
    LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
                      << " " << *Inst << "\n");
    AA.Vector.Ty = nullptr;
  };

  Type *VecEltTy = AA.Vector.Ty->getElementType();
  unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / 8;
  assert(ElementSize > 0);
  for (auto *U : AA.Uses) {
    Instruction *Inst = cast<Instruction>(Val: U->getUser());

    if (Value *Ptr = getLoadStorePointerOperand(V: Inst)) {
      // The alloca must be the address operand of a store, not the stored
      // value (that would escape the pointer).
      assert(!isa<StoreInst>(Inst) ||
             U->getOperandNo() == StoreInst::getPointerOperandIndex());

      Type *AccessTy = getLoadStoreType(I: Inst);
      if (AccessTy->isAggregateType())
        return RejectUser(Inst, "unsupported load/store as aggregate");
      assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());

      // Check that this is a simple access of a vector element.
      bool IsSimple = isa<LoadInst>(Val: Inst) ? cast<LoadInst>(Val: Inst)->isSimple()
                                           : cast<StoreInst>(Val: Inst)->isSimple();
      if (!IsSimple)
        return RejectUser(Inst, "not a simple load or store");

      Ptr = Ptr->stripPointerCasts();

      // Alloca already accessed as vector.
      if (Ptr == AA.Alloca &&
          DL->getTypeStoreSize(Ty: AA.Alloca->getAllocatedType()) ==
              DL->getTypeStoreSize(Ty: AccessTy)) {
        AA.Vector.Worklist.push_back(Elt: Inst);
        continue;
      }

      if (!isSupportedAccessType(VecTy: AA.Vector.Ty, AccessTy, DL: *DL))
        return RejectUser(Inst, "not a supported access type");

      AA.Vector.Worklist.push_back(Elt: Inst);
      continue;
    }

    if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
      // If we can't compute a vector index from this GEP, then we can't
      // promote this alloca to vector.
      auto Index = computeGEPToVectorIndex(GEP, Alloca: AA.Alloca, VecElemTy: VecEltTy, DL: *DL);
      if (!Index)
        return RejectUser(Inst, "cannot compute vector index for GEP");

      // The GEP itself becomes dead once its users are rewritten against the
      // computed vector index.
      AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
      AA.Vector.UsersToRemove.push_back(Elt: Inst);
      continue;
    }

    // Memsets are only supported when they overwrite the whole alloca.
    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Val: Inst);
        MSI && isSupportedMemset(I: MSI, AI: AA.Alloca, DL: *DL)) {
      AA.Vector.Worklist.push_back(Elt: Inst);
      continue;
    }

    if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Val: Inst)) {
      if (TransferInst->isVolatile())
        return RejectUser(Inst, "mem transfer inst is volatile");

      // The copied byte count must be a compile-time multiple of the element
      // size so it maps to a whole number of vector lanes.
      ConstantInt *Len = dyn_cast<ConstantInt>(Val: TransferInst->getLength());
      if (!Len || (Len->getZExtValue() % ElementSize))
        return RejectUser(Inst, "mem transfer inst length is non-constant or "
                                "not a multiple of the vector element size");

      // Map a src/dest pointer to a constant lane index into the alloca's
      // vector; null when the index is variable (unsupported here).
      auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
        if (Ptr == AA.Alloca)
          return ConstantInt::get(Context&: Ptr->getContext(), V: APInt(32, 0));

        GetElementPtrInst *GEP = cast<GetElementPtrInst>(Val: Ptr);
        const auto &GEPI = AA.Vector.GEPVectorIdx.find(Key: GEP)->second;
        if (GEPI.VarIndex)
          return nullptr;
        if (GEPI.ConstIndex)
          return GEPI.ConstIndex;
        return ConstantInt::get(Context&: Ptr->getContext(), V: APInt(32, 0));
      };

      // Each use records only one side (src or dest) of the transfer; the
      // post-loop check below verifies both sides were seen.
      MemTransferInfo *TI =
          &AA.Vector.TransferInfo.try_emplace(Key: TransferInst).first->second;
      unsigned OpNum = U->getOperandNo();
      if (OpNum == 0) {
        Value *Dest = TransferInst->getDest();
        ConstantInt *Index = getConstIndexIntoAlloca(Dest);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant dest index");
        TI->DestIndex = Index;
      } else {
        assert(OpNum == 1);
        Value *Src = TransferInst->getSource();
        ConstantInt *Index = getConstIndexIntoAlloca(Src);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant src index");
        TI->SrcIndex = Index;
      }
      continue;
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
      // objectsize is folded to the alloca's size during the rewrite.
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        AA.Vector.Worklist.push_back(Elt: Inst);
        continue;
      }
    }

    // Ignore assume-like intrinsics and comparisons used in assumes.
    if (isAssumeLikeIntrinsic(I: Inst)) {
      if (!Inst->use_empty())
        return RejectUser(Inst, "assume-like intrinsic cannot have any users");
      AA.Vector.UsersToRemove.push_back(Elt: Inst);
      continue;
    }

    if (isa<ICmpInst>(Val: Inst) && all_of(Range: Inst->users(), P: [](User *U) {
          return isAssumeLikeIntrinsic(I: cast<Instruction>(Val: U));
        })) {
      AA.Vector.UsersToRemove.push_back(Elt: Inst);
      continue;
    }

    return RejectUser(Inst, "unhandled alloca user");
  }

  // Follow-up check to ensure we've seen both sides of all transfer insts.
  for (const auto &Entry : AA.Vector.TransferInfo) {
    const MemTransferInfo &TI = Entry.second;
    if (!TI.SrcIndex || !TI.DestIndex)
      return RejectUser(Entry.first,
                        "mem transfer inst between different objects");
    AA.Vector.Worklist.push_back(Elt: Entry.first);
  }
}
1115
/// Rewrite the alloca described by \p AA and every recorded user into
/// operations on an SSA value of type AA.Vector.Ty, then delete the alloca.
/// analyzePromoteToVector() must have succeeded on \p AA first.
void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
  LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
  LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType()
                    << " -> " << *AA.Vector.Ty << '\n');
  const unsigned VecStoreSize = DL->getTypeStoreSize(Ty: AA.Vector.Ty);

  Type *VecEltTy = AA.Vector.Ty->getElementType();
  const unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / 8;

  // Alloca is uninitialized memory. Imitate that by making the first value
  // undef.
  SSAUpdater Updater;
  Updater.Initialize(Ty: AA.Vector.Ty, Name: "promotealloca");

  // Place the initial (frozen poison) value after the alloca cluster at the
  // top of the entry block.
  BasicBlock *EntryBB = AA.Alloca->getParent();
  BasicBlock::iterator InitInsertPos =
      skipToNonAllocaInsertPt(BB&: *EntryBB, I: AA.Alloca->getIterator());
  IRBuilder<> Builder(&*InitInsertPos);
  Value *AllocaInitValue = Builder.CreateFreeze(V: PoisonValue::get(T: AA.Vector.Ty));
  AllocaInitValue->takeName(V: AA.Alloca);

  Updater.AddAvailableValue(BB: AA.Alloca->getParent(), V: AllocaInitValue);

  // First handle the initial worklist, in basic block order.
  //
  // Insert a placeholder whenever we need the vector value at the top of a
  // basic block.
  SmallSetVector<Instruction *, 8> Placeholders;
  forEachWorkListItem(WorkList: AA.Vector.Worklist, Fn: [&](Instruction *I) {
    BasicBlock *BB = I->getParent();
    // Returns the current vector value at this point in BB, materializing a
    // placeholder when the incoming value is not known yet.
    auto GetCurVal = [&]() -> Value * {
      if (Value *CurVal = Updater.FindValueForBlock(BB))
        return CurVal;

      if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
        return Placeholders.back();

      // If the current value in the basic block is not yet known, insert a
      // placeholder that we will replace later.
      IRBuilder<> Builder(I);
      auto *Placeholder = cast<Instruction>(Val: Builder.CreateFreeze(
          V: PoisonValue::get(T: AA.Vector.Ty), Name: "promotealloca.placeholder"));
      Placeholders.insert(X: Placeholder);
      return Placeholders.back();
    };

    Value *Result = promoteAllocaUserToVector(Inst: I, DL: *DL, AA, VecStoreSize,
                                              ElementSize, GetCurVal);
    // If the returned result is a placeholder, it means the instruction does
    // not really modify the alloca. So no need to make it being available value
    // to SSAUpdater.
    // This will stop placeholder being cached in SSAUpdater. The cached
    // placeholder may cause stale pointer being referenced when doing
    // placeholder replacement.
    if (Result && (!isa<Instruction>(Val: Result) ||
                   !Placeholders.contains(key: cast<Instruction>(Val: Result))))
      Updater.AddAvailableValue(BB, V: Result);
  });

  // Now fixup the placeholders.
  for (Instruction *Placeholder : Placeholders) {
    Placeholder->replaceAllUsesWith(
        V: Updater.GetValueInMiddleOfBlock(BB: Placeholder->getParent()));
    Placeholder->eraseFromParent();
  }

  // Delete all instructions.
  for (Instruction *I : AA.Vector.Worklist) {
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Delete all the users that are known to be removeable.
  for (Instruction *I : reverse(C&: AA.Vector.UsersToRemove)) {
    I->dropDroppableUses();
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Alloca should now be dead too.
  assert(AA.Alloca->use_empty());
  AA.Alloca->eraseFromParent();
}
1199
/// Materialize the workgroup sizes in the Y and Z dimensions at the current
/// insert point of \p Builder, returning the pair {LocalSizeY, LocalSizeZ}.
///
/// On non-HSA targets this emits the r600 read intrinsics; on AMDHSA the
/// sizes are loaded from the kernel dispatch packet.
std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  if (!IsAMDHSA) {
    CallInst *LocalSizeY =
        Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_y, Args: {});
    CallInst *LocalSizeZ =
        Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_z, Args: {});

    // Annotate the results with the known local size range.
    ST.makeLIDRangeMetadata(I: LocalSizeY);
    ST.makeLIDRangeMetadata(I: LocalSizeZ);

    return std::pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  // typedef struct hsa_kernel_dispatch_packet_s {
  // uint16_t header;
  // uint16_t setup;
  // uint16_t workgroup_size_x ;
  // uint16_t workgroup_size_y;
  // uint16_t workgroup_size_z;
  // uint16_t reserved0;
  // uint32_t grid_size_x ;
  // uint32_t grid_size_y ;
  // uint32_t grid_size_z;
  //
  // uint32_t private_segment_size;
  // uint32_t group_segment_size;
  // uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  // void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  // void *kernarg_address;
  // uint32_t reserved1;
  // #else
  // uint32_t reserved1;
  // void *kernarg_address;
  // #endif
  // uint64_t reserved2;
  // hsa_signal_t completion_signal; // uint64_t wrapper
  // } hsa_kernel_dispatch_packet_t
  //
  CallInst *DispatchPtr =
      Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_dispatch_ptr, Args: {});
  DispatchPtr->addRetAttr(Kind: Attribute::NoAlias);
  DispatchPtr->addRetAttr(Kind: Attribute::NonNull);
  // The dispatch pointer is now genuinely used, so the function may no longer
  // carry the "does not use it" attribute.
  F.removeFnAttr(Kind: "amdgpu-no-dispatch-ptr");

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableRetAttr(Bytes: 64);

  Type *I32Ty = Type::getInt32Ty(C&: Mod->getContext());

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPXY, Align: Align(4));

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPZU, Align: Align(4));

  // The packet fields do not change during kernel execution, so mark the
  // loads invariant.
  MDNode *MD = MDNode::get(Context&: Mod->getContext(), MDs: {});
  LoadXY->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
  LoadZU->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
  ST.makeLIDRangeMetadata(I: LoadZU);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LHS: LoadXY, RHS: 16);

  return std::pair(Y, LoadZU);
}
1281
1282Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1283 unsigned N) {
1284 Function *F = Builder.GetInsertBlock()->getParent();
1285 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: *F);
1286 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1287 StringRef AttrName;
1288
1289 switch (N) {
1290 case 0:
1291 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1292 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1293 AttrName = "amdgpu-no-workitem-id-x";
1294 break;
1295 case 1:
1296 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1297 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1298 AttrName = "amdgpu-no-workitem-id-y";
1299 break;
1300
1301 case 2:
1302 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1303 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1304 AttrName = "amdgpu-no-workitem-id-z";
1305 break;
1306 default:
1307 llvm_unreachable("invalid dimension");
1308 }
1309
1310 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(M: Mod, id: IntrID);
1311 CallInst *CI = Builder.CreateCall(Callee: WorkitemIdFn);
1312 ST.makeLIDRangeMetadata(I: CI);
1313 F->removeFnAttr(Kind: AttrName);
1314
1315 return CI;
1316}
1317
1318static bool isCallPromotable(CallInst *CI) {
1319 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI);
1320 if (!II)
1321 return false;
1322
1323 switch (II->getIntrinsicID()) {
1324 case Intrinsic::memcpy:
1325 case Intrinsic::memmove:
1326 case Intrinsic::memset:
1327 case Intrinsic::lifetime_start:
1328 case Intrinsic::lifetime_end:
1329 case Intrinsic::invariant_start:
1330 case Intrinsic::invariant_end:
1331 case Intrinsic::launder_invariant_group:
1332 case Intrinsic::strip_invariant_group:
1333 case Intrinsic::objectsize:
1334 return true;
1335 default:
1336 return false;
1337 }
1338}
1339
1340bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1341 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1342 int OpIdx1) const {
1343 // Figure out which operand is the one we might not be promoting.
1344 Value *OtherOp = Inst->getOperand(i: OpIdx0);
1345 if (Val == OtherOp)
1346 OtherOp = Inst->getOperand(i: OpIdx1);
1347
1348 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: OtherOp))
1349 return true;
1350
1351 // TODO: getUnderlyingObject will not work on a vector getelementptr
1352 Value *OtherObj = getUnderlyingObject(V: OtherOp);
1353 if (!isa<AllocaInst>(Val: OtherObj))
1354 return false;
1355
1356 // TODO: We should be able to replace undefs with the right pointer type.
1357
1358 // TODO: If we know the other base object is another promotable
1359 // alloca, not necessarily this alloca, we can do this. The
1360 // important part is both must have the same address space at
1361 // the end.
1362 if (OtherObj != BaseAlloca) {
1363 LLVM_DEBUG(
1364 dbgs() << "Found a binary instruction with another alloca object\n");
1365 return false;
1366 }
1367
1368 return true;
1369}
1370
/// Analyze whether the alloca described by \p AA can be migrated to LDS
/// (local/group memory). On success, users needing a pointer-type rewrite are
/// collected in AA.LDS.Worklist and AA.LDS.Enable is set; any unsupported
/// user simply returns with AA.LDS.Enable left false.
void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
  if (DisablePromoteAllocaToLDS) {
    LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
    return;
  }

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  const Function &ContainingFunction = *AA.Alloca->getFunction();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return;
  }

  for (Use *Use : AA.Uses) {
    auto *User = Use->getUser();

    // Calls are only acceptable when they are known-rewritable intrinsics.
    if (CallInst *CI = dyn_cast<CallInst>(Val: User)) {
      if (!isCallPromotable(CI))
        return;

      if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(Elt: User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(Val: User);
    // ptrtoint would let the address escape in a form we cannot rewrite.
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return;

    // Volatile accesses must keep their original memory semantics.
    if (LoadInst *LI = dyn_cast<LoadInst>(Val: UseInst)) {
      if (LI->isVolatile())
        return;
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(Val: UseInst)) {
      if (SI->isVolatile())
        return;
      continue;
    }

    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UseInst)) {
      if (RMW->isVolatile())
        return;
      continue;
    }

    if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(Val: UseInst)) {
      if (CAS->isVolatile())
        return;
      continue;
    }

    // Only promote a select if we know that the other select operand
    // is from another pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(Val: UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca: AA.Alloca, Val: Use->get(), Inst: ICmp, OpIdx0: 0, OpIdx1: 1))
        return;

      // May need to rewrite constant operands.
      if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(Elt: ICmp);
      continue;
    }

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return;
    } else if (!isa<ExtractElementInst, SelectInst, PHINode>(Val: User)) {
      // Do not promote vector/aggregate type instructions. It is hard to track
      // their users.

      // Do not promote addrspacecast.
      //
      // TODO: If we know the address is only observed through flat pointers, we
      // could still promote.
      return;
    }

    // Inbounds GEPs, extractelements, selects and phis reach here: queue for
    // pointer-type rewriting, avoiding duplicates.
    if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
      AA.LDS.Worklist.push_back(Elt: User);
  }

  AA.LDS.Enable = true;
}
1468
/// Estimate how much LDS is already consumed by global variables reachable
/// from \p F and decide whether LDS promotion can proceed at all.
///
/// Side effects: sets LocalMemLimit (usable LDS budget; 0 disables LDS
/// promotion entirely) and CurrentLocalMemUsage (worst-case estimate of bytes
/// already allocated, including alignment padding). Returns true when LDS
/// remains available for promotion.
bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(Val: ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getAddressableLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  // Worklist of constants whose user chains still need to be searched for a
  // use inside F.
  SmallVector<const Constant *, 16> Stack;
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  // LDS globals found to be used (possibly through constants) inside F.
  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;

  // Returns true as soon as a direct use of Val inside F is found; otherwise
  // queues Val's constant users for a deeper search.
  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(Val: U)) {
        if (Use->getFunction() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(Val: U);
        if (VisitedConstants.insert(Ptr: C).second)
          Stack.push_back(Elt: C);
      }
    }

    return false;
  };

  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(Ptr: &GV);
      Stack.clear();
      continue;
    }

    // For any ConstantExpr uses, we need to recursively search the users until
    // we see a function.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(Ptr: &GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(N: UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(Alignment: GV->getAlign(), Ty: GV->getValueType());
    uint64_t AllocSize = GV->getGlobalSize(DL);

    // HIP uses an extern unsized array in local address space for dynamically
    // allocated shared memory. In that case, we have to disable the promotion.
    if (GV->hasExternalLinkage() && AllocSize == 0) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
      return false;
    }

    AllocatedSizes.emplace_back(Args&: AllocSize, Args&: Alignment);
  }

  // Sort to try to estimate the worst case alignment padding
  //
  // FIXME: We should really do something to fix the addresses to a more optimal
  // value instead
  llvm::sort(C&: AllocatedSizes, Comp: llvm::less_second());

  // Check how much local memory is being used by global objects
  CurrentLocalMemUsage = 0;

  // FIXME: Try to account for padding here. The real padding and address is
  // currently determined from the inverse order of uses in the function when
  // legalizing, which could also potentially change. We try to estimate the
  // worst case here, but we probably should fix the addresses earlier.
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(Size: CurrentLocalMemUsage, A: Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }

  unsigned MaxOccupancy =
      ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes: CurrentLocalMemUsage, F)
          .second;

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount =
      ST.getMaxLocalMemSizeWithWaveCount(WaveCount: MaxOccupancy, F);

  // Program may already use more LDS than is usable at maximum occupancy.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << " Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}
1595
1596// FIXME: Should try to pick the most likely to be profitable allocas first.
1597bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
1598 AllocaAnalysis &AA, bool SufficientLDS,
1599 SetVector<IntrinsicInst *> &DeferredIntrs) {
1600 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
1601
1602 // Not likely to have sufficient local memory for promotion.
1603 if (!SufficientLDS)
1604 return false;
1605
1606 const DataLayout &DL = Mod->getDataLayout();
1607 IRBuilder<> Builder(AA.Alloca);
1608
1609 const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
1610 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: ContainingFunction);
1611 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F: ContainingFunction).second;
1612
1613 Align Alignment = AA.Alloca->getAlign();
1614
1615 // FIXME: This computed padding is likely wrong since it depends on inverse
1616 // usage order.
1617 //
1618 // FIXME: It is also possible that if we're allowed to use all of the memory
1619 // could end up using more than the maximum due to alignment padding.
1620
1621 uint32_t NewSize = alignTo(Size: CurrentLocalMemUsage, A: Alignment);
1622 std::optional<TypeSize> ElemSize = AA.Alloca->getAllocationSize(DL);
1623 if (!ElemSize || ElemSize->isScalable())
1624 return false;
1625 TypeSize AllocSize = WorkGroupSize * *ElemSize;
1626 NewSize += AllocSize.getFixedValue();
1627
1628 if (NewSize > LocalMemLimit) {
1629 LLVM_DEBUG(dbgs() << " " << AllocSize
1630 << " bytes of local memory not available to promote\n");
1631 return false;
1632 }
1633
1634 CurrentLocalMemUsage = NewSize;
1635
1636 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1637
1638 Function *F = AA.Alloca->getFunction();
1639
1640 Type *GVTy = ArrayType::get(ElementType: AA.Alloca->getAllocatedType(), NumElements: WorkGroupSize);
1641 GlobalVariable *GV = new GlobalVariable(
1642 *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(T: GVTy),
1643 Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
1644 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1645 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1646 GV->setAlignment(AA.Alloca->getAlign());
1647
1648 Value *TCntY, *TCntZ;
1649
1650 std::tie(args&: TCntY, args&: TCntZ) = getLocalSizeYZ(Builder);
1651 Value *TIdX = getWorkitemID(Builder, N: 0);
1652 Value *TIdY = getWorkitemID(Builder, N: 1);
1653 Value *TIdZ = getWorkitemID(Builder, N: 2);
1654
1655 Value *Tmp0 = Builder.CreateMul(LHS: TCntY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true);
1656 Tmp0 = Builder.CreateMul(LHS: Tmp0, RHS: TIdX);
1657 Value *Tmp1 = Builder.CreateMul(LHS: TIdY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true);
1658 Value *TID = Builder.CreateAdd(LHS: Tmp0, RHS: Tmp1);
1659 TID = Builder.CreateAdd(LHS: TID, RHS: TIdZ);
1660
1661 LLVMContext &Context = Mod->getContext();
1662 Value *Indices[] = {Constant::getNullValue(Ty: Type::getInt32Ty(C&: Context)), TID};
1663
1664 Value *Offset = Builder.CreateInBoundsGEP(Ty: GVTy, Ptr: GV, IdxList: Indices);
1665 AA.Alloca->mutateType(Ty: Offset->getType());
1666 AA.Alloca->replaceAllUsesWith(V: Offset);
1667 AA.Alloca->eraseFromParent();
1668
1669 PointerType *NewPtrTy = PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
1670
1671 for (Value *V : AA.LDS.Worklist) {
1672 CallInst *Call = dyn_cast<CallInst>(Val: V);
1673 if (!Call) {
1674 if (ICmpInst *CI = dyn_cast<ICmpInst>(Val: V)) {
1675 Value *LHS = CI->getOperand(i_nocapture: 0);
1676 Value *RHS = CI->getOperand(i_nocapture: 1);
1677
1678 Type *NewTy = LHS->getType()->getWithNewType(EltTy: NewPtrTy);
1679 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: LHS))
1680 CI->setOperand(i_nocapture: 0, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1681
1682 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: RHS))
1683 CI->setOperand(i_nocapture: 1, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1684
1685 continue;
1686 }
1687
1688 // The operand's value should be corrected on its own and we don't want to
1689 // touch the users.
1690 if (isa<AddrSpaceCastInst>(Val: V))
1691 continue;
1692
1693 assert(V->getType()->isPtrOrPtrVectorTy());
1694
1695 Type *NewTy = V->getType()->getWithNewType(EltTy: NewPtrTy);
1696 V->mutateType(Ty: NewTy);
1697
1698 // Adjust the types of any constant operands.
1699 if (SelectInst *SI = dyn_cast<SelectInst>(Val: V)) {
1700 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: SI->getOperand(i_nocapture: 1)))
1701 SI->setOperand(i_nocapture: 1, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1702
1703 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: SI->getOperand(i_nocapture: 2)))
1704 SI->setOperand(i_nocapture: 2, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1705 } else if (PHINode *Phi = dyn_cast<PHINode>(Val: V)) {
1706 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1707 if (isa<ConstantPointerNull, ConstantAggregateZero>(
1708 Val: Phi->getIncomingValue(i: I)))
1709 Phi->setIncomingValue(i: I, V: Constant::getNullValue(Ty: NewTy));
1710 }
1711 }
1712
1713 continue;
1714 }
1715
1716 IntrinsicInst *Intr = cast<IntrinsicInst>(Val: Call);
1717 Builder.SetInsertPoint(Intr);
1718 switch (Intr->getIntrinsicID()) {
1719 case Intrinsic::lifetime_start:
1720 case Intrinsic::lifetime_end:
1721 // These intrinsics are for address space 0 only
1722 Intr->eraseFromParent();
1723 continue;
1724 case Intrinsic::memcpy:
1725 case Intrinsic::memmove:
1726 // These have 2 pointer operands. In case if second pointer also needs
1727 // to be replaced we defer processing of these intrinsics until all
1728 // other values are processed.
1729 DeferredIntrs.insert(X: Intr);
1730 continue;
1731 case Intrinsic::memset: {
1732 MemSetInst *MemSet = cast<MemSetInst>(Val: Intr);
1733 Builder.CreateMemSet(Ptr: MemSet->getRawDest(), Val: MemSet->getValue(),
1734 Size: MemSet->getLength(), Align: MemSet->getDestAlign(),
1735 isVolatile: MemSet->isVolatile());
1736 Intr->eraseFromParent();
1737 continue;
1738 }
1739 case Intrinsic::invariant_start:
1740 case Intrinsic::invariant_end:
1741 case Intrinsic::launder_invariant_group:
1742 case Intrinsic::strip_invariant_group: {
1743 SmallVector<Value *> Args;
1744 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1745 Args.emplace_back(Args: Intr->getArgOperand(i: 0));
1746 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1747 Args.emplace_back(Args: Intr->getArgOperand(i: 0));
1748 Args.emplace_back(Args: Intr->getArgOperand(i: 1));
1749 }
1750 Args.emplace_back(Args&: Offset);
1751 Function *F = Intrinsic::getOrInsertDeclaration(
1752 M: Intr->getModule(), id: Intr->getIntrinsicID(), OverloadTys: Offset->getType());
1753 CallInst *NewIntr =
1754 CallInst::Create(Func: F, Args, NameStr: Intr->getName(), InsertBefore: Intr->getIterator());
1755 Intr->mutateType(Ty: NewIntr->getType());
1756 Intr->replaceAllUsesWith(V: NewIntr);
1757 Intr->eraseFromParent();
1758 continue;
1759 }
1760 case Intrinsic::objectsize: {
1761 Value *Src = Intr->getOperand(i_nocapture: 0);
1762
1763 CallInst *NewCall = Builder.CreateIntrinsic(
1764 ID: Intrinsic::objectsize,
1765 Types: {Intr->getType(), PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS)},
1766 Args: {Src, Intr->getOperand(i_nocapture: 1), Intr->getOperand(i_nocapture: 2), Intr->getOperand(i_nocapture: 3)});
1767 Intr->replaceAllUsesWith(V: NewCall);
1768 Intr->eraseFromParent();
1769 continue;
1770 }
1771 default:
1772 Intr->print(O&: errs());
1773 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1774 }
1775 }
1776
1777 return true;
1778}
1779
1780void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1781 SetVector<IntrinsicInst *> &DeferredIntrs) {
1782
1783 for (IntrinsicInst *Intr : DeferredIntrs) {
1784 IRBuilder<> Builder(Intr);
1785 Builder.SetInsertPoint(Intr);
1786 Intrinsic::ID ID = Intr->getIntrinsicID();
1787 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1788
1789 MemTransferInst *MI = cast<MemTransferInst>(Val: Intr);
1790 auto *B = Builder.CreateMemTransferInst(
1791 IntrID: ID, Dst: MI->getRawDest(), DstAlign: MI->getDestAlign(), Src: MI->getRawSource(),
1792 SrcAlign: MI->getSourceAlign(), Size: MI->getLength(), isVolatile: MI->isVolatile());
1793
1794 for (unsigned I = 0; I != 2; ++I) {
1795 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(i: I)) {
1796 B->addDereferenceableParamAttr(i: I, Bytes);
1797 }
1798 }
1799
1800 Intr->eraseFromParent();
1801 }
1802}
1803