1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
24// declared in AMDGPU.h and the legacy PM ones are declared here.s
25//
26//===----------------------------------------------------------------------===//
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/Analysis/CaptureTracking.h"
33#include "llvm/Analysis/InstSimplifyFolder.h"
34#include "llvm/Analysis/InstructionSimplify.h"
35#include "llvm/Analysis/LoopInfo.h"
36#include "llvm/Analysis/ValueTracking.h"
37#include "llvm/CodeGen/TargetPassConfig.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/PatternMatch.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Support/MathExtras.h"
46#include "llvm/Target/TargetMachine.h"
47#include "llvm/Transforms/Utils/SSAUpdater.h"
48
49#define DEBUG_TYPE "amdgpu-promote-alloca"
50
51using namespace llvm;
52
53namespace {
54
55static cl::opt<bool>
56 DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
57 cl::desc("Disable promote alloca to vector"),
58 cl::init(Val: false));
59
60static cl::opt<bool>
61 DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
62 cl::desc("Disable promote alloca to LDS"),
63 cl::init(Val: false));
64
65static cl::opt<unsigned> PromoteAllocaToVectorLimit(
66 "amdgpu-promote-alloca-to-vector-limit",
67 cl::desc("Maximum byte size to consider promote alloca to vector"),
68 cl::init(Val: 0));
69
70static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
71 "amdgpu-promote-alloca-to-vector-max-regs",
72 cl::desc(
73 "Maximum vector size (in 32b registers) to use when promoting alloca"),
74 cl::init(Val: 32));
75
76// Use up to 1/4 of available register budget for vectorization.
77// FIXME: Increase the limit for whole function budgets? Perhaps x2?
78static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
79 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
80 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
81 cl::init(Val: 4));
82
83static cl::opt<unsigned>
84 LoopUserWeight("promote-alloca-vector-loop-user-weight",
85 cl::desc("The bonus weight of users of allocas within loop "
86 "when sorting profitable allocas"),
87 cl::init(Val: 4));
88
89// We support vector indices of the form ((A * stride) >> shift) + B
90// VarIndex is A, VarMul is stride, VarShift is shift and ConstIndex is B. All
91// parts are optional.
92struct GEPToVectorIndex {
93 WeakTrackingVH VarIndex = nullptr; // defaults to 0
94 ConstantInt *VarMul = nullptr; // defaults to 1
95 ConstantInt *VarShift = nullptr; // defaults to 0
96 ConstantInt *ConstIndex = nullptr; // defaults to 0
97 Value *Full = nullptr;
98};
99
100struct MemTransferInfo {
101 ConstantInt *SrcIndex = nullptr;
102 ConstantInt *DestIndex = nullptr;
103};
104
105// Analysis for planning the different strategies of alloca promotion.
106struct AllocaAnalysis {
107 AllocaInst *Alloca = nullptr;
108 DenseSet<Value *> Pointers;
109 SmallVector<Use *> Uses;
110 unsigned Score = 0;
111 bool HaveSelectOrPHI = false;
112 struct {
113 FixedVectorType *Ty = nullptr;
114 SmallVector<Instruction *> Worklist;
115 SmallVector<Instruction *> UsersToRemove;
116 MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
117 MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
118 } Vector;
119 struct {
120 bool Enable = false;
121 SmallVector<User *> Worklist;
122 } LDS;
123
124 explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
125};
126
127// Shared implementation which can do both promotion to vector and to LDS.
128class AMDGPUPromoteAllocaImpl {
129private:
130 const TargetMachine &TM;
131 LoopInfo &LI;
132 Module &Mod;
133 const DataLayout &DL;
134
135 // FIXME: This should be per-kernel.
136 uint32_t LocalMemLimit = 0;
137 uint32_t CurrentLocalMemUsage = 0;
138 unsigned MaxVGPRs;
139 unsigned VGPRBudgetRatio;
140 unsigned MaxVectorRegs;
141
142 bool IsAMDGCN = false;
143 bool IsAMDHSA = false;
144
145 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
146 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
147
148 bool collectAllocaUses(AllocaAnalysis &AA) const;
149
150 /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
151 /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
152 /// Returns true if both operands are derived from the same alloca. Val should
153 /// be the same value as one of the input operands of UseInst.
154 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
155 Instruction *UseInst, int OpIdx0,
156 int OpIdx1) const;
157
158 /// Check whether we have enough local memory for promotion.
159 bool hasSufficientLocalMem(const Function &F);
160
161 FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
162 void analyzePromoteToVector(AllocaAnalysis &AA) const;
163 void promoteAllocaToVector(AllocaAnalysis &AA);
164 void analyzePromoteToLDS(AllocaAnalysis &AA) const;
165 bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
166 SetVector<IntrinsicInst *> &DeferredIntrs);
167 void
168 finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);
169
170 void scoreAlloca(AllocaAnalysis &AA) const;
171
172 void setFunctionLimits(const Function &F);
173
174public:
175 AMDGPUPromoteAllocaImpl(TargetMachine &TM, Module &M, LoopInfo &LI)
176 : TM(TM), LI(LI), Mod(M), DL(M.getDataLayout()) {
177 const Triple &TT = M.getTargetTriple();
178 IsAMDGCN = TT.isAMDGCN();
179 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
180 }
181
182 bool run(Function &F, bool PromoteToLDS);
183};
184
185// FIXME: This can create globals so should be a module pass.
186class AMDGPUPromoteAlloca : public FunctionPass {
187public:
188 static char ID;
189
190 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
191
192 bool runOnFunction(Function &F) override {
193 if (skipFunction(F))
194 return false;
195 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
196 return AMDGPUPromoteAllocaImpl(
197 TPC->getTM<TargetMachine>(), *F.getParent(),
198 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
199 .run(F, /*PromoteToLDS*/ true);
200 return false;
201 }
202
203 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
204
205 void getAnalysisUsage(AnalysisUsage &AU) const override {
206 AU.setPreservesCFG();
207 AU.addRequired<LoopInfoWrapperPass>();
208 FunctionPass::getAnalysisUsage(AU);
209 }
210};
211
212static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
213 const Function &F) {
214 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
215
216 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
217 // Temporarily check both the attribute and the subtarget feature, until the
218 // latter is removed.
219 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
220 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
221
222 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
223 WavesPerEU: ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
224 DynamicVGPRBlockSize);
225
226 // A non-entry function has only 32 caller preserved registers.
227 // Do not promote alloca which will force spilling unless we know the function
228 // will be inlined.
229 if (!F.hasFnAttribute(Kind: Attribute::AlwaysInline) &&
230 !AMDGPU::isEntryFunctionCC(CC: F.getCallingConv()))
231 MaxVGPRs = std::min(a: MaxVGPRs, b: 32u);
232 return MaxVGPRs;
233}
234
235} // end anonymous namespace
236
237char AMDGPUPromoteAlloca::ID = 0;
238
239INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
240 "AMDGPU promote alloca to vector or LDS", false, false)
241// Move LDS uses from functions to kernels before promote alloca for accurate
242// estimation of LDS available
243INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
244INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
245INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
246 "AMDGPU promote alloca to vector or LDS", false, false)
247
248char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
249
250PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
251 FunctionAnalysisManager &AM) {
252 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
253 bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
254 .run(F, /*PromoteToLDS=*/true);
255 if (Changed) {
256 PreservedAnalyses PA;
257 PA.preserveSet<CFGAnalyses>();
258 return PA;
259 }
260 return PreservedAnalyses::all();
261}
262
263PreservedAnalyses
264AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
265 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
266 bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
267 .run(F, /*PromoteToLDS=*/false);
268 if (Changed) {
269 PreservedAnalyses PA;
270 PA.preserveSet<CFGAnalyses>();
271 return PA;
272 }
273 return PreservedAnalyses::all();
274}
275
276FunctionPass *llvm::createAMDGPUPromoteAlloca() {
277 return new AMDGPUPromoteAlloca();
278}
279
280bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
281 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
282 LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
283 << " " << *Inst << "\n");
284 return false;
285 };
286
287 SmallVector<Instruction *, 4> WorkList({AA.Alloca});
288 while (!WorkList.empty()) {
289 auto *Cur = WorkList.pop_back_val();
290 if (find(Range&: AA.Pointers, Val: Cur) != AA.Pointers.end())
291 continue;
292 AA.Pointers.insert(V: Cur);
293 for (auto &U : Cur->uses()) {
294 auto *Inst = cast<Instruction>(Val: U.getUser());
295 if (isa<StoreInst>(Val: Inst)) {
296 if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
297 return RejectUser(Inst, "pointer escapes via store");
298 }
299 }
300 AA.Uses.push_back(Elt: &U);
301
302 if (isa<GetElementPtrInst>(Val: U.getUser())) {
303 WorkList.push_back(Elt: Inst);
304 } else if (auto *SI = dyn_cast<SelectInst>(Val: Inst)) {
305 // Only promote a select if we know that the other select operand is
306 // from another pointer that will also be promoted.
307 if (!binaryOpIsDerivedFromSameAlloca(Alloca: AA.Alloca, Val: Cur, UseInst: SI, OpIdx0: 1, OpIdx1: 2))
308 return RejectUser(Inst, "select from mixed objects");
309 WorkList.push_back(Elt: Inst);
310 AA.HaveSelectOrPHI = true;
311 } else if (auto *Phi = dyn_cast<PHINode>(Val: Inst)) {
312 // Repeat for phis.
313
314 // TODO: Handle more complex cases. We should be able to replace loops
315 // over arrays.
316 switch (Phi->getNumIncomingValues()) {
317 case 1:
318 break;
319 case 2:
320 if (!binaryOpIsDerivedFromSameAlloca(Alloca: AA.Alloca, Val: Cur, UseInst: Phi, OpIdx0: 0, OpIdx1: 1))
321 return RejectUser(Inst, "phi from mixed objects");
322 break;
323 default:
324 return RejectUser(Inst, "phi with too many operands");
325 }
326
327 WorkList.push_back(Elt: Inst);
328 AA.HaveSelectOrPHI = true;
329 }
330 }
331 }
332 return true;
333}
334
335void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
336 LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
337 unsigned Score = 0;
338 // Increment score by one for each user + a bonus for users within loops.
339 for (auto *U : AA.Uses) {
340 Instruction *Inst = cast<Instruction>(Val: U->getUser());
341 if (isa<GetElementPtrInst>(Val: Inst) || isa<SelectInst>(Val: Inst) ||
342 isa<PHINode>(Val: Inst))
343 continue;
344 unsigned UserScore =
345 1 + (LoopUserWeight * LI.getLoopDepth(BB: Inst->getParent()));
346 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
347 Score += UserScore;
348 }
349 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
350 AA.Score = Score;
351}
352
353void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
354 // Load per function limits, overriding with global options where appropriate.
355 // R600 register tuples/aliasing are fragile with large vector promotions so
356 // apply architecture specific limit here.
357 const int R600MaxVectorRegs = 16;
358 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
359 Kind: "amdgpu-promote-alloca-to-vector-max-regs",
360 Default: IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
361 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
362 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
363 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
364 Kind: "amdgpu-promote-alloca-to-vector-vgpr-ratio",
365 Default: PromoteAllocaToVectorVGPRRatio);
366 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
367 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
368}
369
370bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
371 if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
372 return false;
373
374 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
375 MaxVGPRs = IsAMDGCN ? getMaxVGPRs(LDSBytes: CurrentLocalMemUsage, TM, F) : 128;
376 setFunctionLimits(F);
377
378 unsigned VectorizationBudget =
379 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
380 : (MaxVGPRs * 32)) /
381 VGPRBudgetRatio;
382
383 std::vector<AllocaAnalysis> Allocas;
384 for (Instruction &I : F.getEntryBlock()) {
385 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: &I)) {
386 // Array allocations are probably not worth handling, since an allocation
387 // of the array type is the canonical form.
388 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
389 continue;
390
391 LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
392
393 AllocaAnalysis AA{AI};
394 if (collectAllocaUses(AA)) {
395 analyzePromoteToVector(AA);
396 if (PromoteToLDS)
397 analyzePromoteToLDS(AA);
398 if (AA.Vector.Ty || AA.LDS.Enable) {
399 scoreAlloca(AA);
400 Allocas.push_back(x: std::move(AA));
401 }
402 }
403 }
404 }
405
406 stable_sort(Range&: Allocas,
407 C: [](const auto &A, const auto &B) { return A.Score > B.Score; });
408
409 // clang-format off
410 LLVM_DEBUG(
411 dbgs() << "Sorted Worklist:\n";
412 for (const auto &AA : Allocas)
413 dbgs() << " " << *AA.Alloca << "\n";
414 );
415 // clang-format on
416
417 bool Changed = false;
418 SetVector<IntrinsicInst *> DeferredIntrs;
419 for (AllocaAnalysis &AA : Allocas) {
420 if (AA.Vector.Ty) {
421 std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(DL);
422 assert(Size); // Expected to succeed on non-array alloca.
423 const unsigned AllocaCost = Size->getFixedValue() * 8;
424 // First, check if we have enough budget to vectorize this alloca.
425 if (AllocaCost <= VectorizationBudget) {
426 promoteAllocaToVector(AA);
427 Changed = true;
428 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
429 "Underflow!");
430 VectorizationBudget -= AllocaCost;
431 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
432 << VectorizationBudget << "\n");
433 continue;
434 } else {
435 LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
436 << AllocaCost << ", budget:" << VectorizationBudget
437 << "): " << *AA.Alloca << "\n");
438 }
439 }
440
441 if (AA.LDS.Enable &&
442 tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
443 Changed = true;
444 }
445 finishDeferredAllocaToLDSPromotion(DeferredIntrs);
446
447 // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
448 // dangling pointers. If we want to reuse it past this point, the loop above
449 // would need to be updated to remove successfully promoted allocas.
450
451 return Changed;
452}
453
454// Checks if the instruction I is a memset user of the alloca AI that we can
455// deal with. Currently, only non-volatile memsets that affect the whole alloca
456// are handled.
457static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
458 const DataLayout &DL) {
459 using namespace PatternMatch;
460 // For now we only care about non-volatile memsets that affect the whole type
461 // (start at index 0 and fill the whole alloca).
462 //
463 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
464 // (except maybe volatile ones?) - we just need to use shufflevector if it
465 // only affects a subset of the vector.
466 const unsigned Size = DL.getTypeStoreSize(Ty: AI->getAllocatedType());
467 return I->getOperand(i_nocapture: 0) == AI &&
468 match(V: I->getOperand(i_nocapture: 2), P: m_SpecificInt(V: Size)) && !I->isVolatile();
469}
470
471static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
472 IRBuilder<> B(Ptr->getContext());
473
474 Ptr = Ptr->stripPointerCasts();
475 if (Ptr == AA.Alloca)
476 return B.getInt32(C: 0);
477
478 auto *GEP = cast<GetElementPtrInst>(Val: Ptr);
479 auto I = AA.Vector.GEPVectorIdx.find(Key: GEP);
480 assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
481
482 if (!I->second.Full) {
483 Value *Result = nullptr;
484 B.SetInsertPoint(GEP);
485
486 if (I->second.VarIndex) {
487 Result = I->second.VarIndex;
488 Result = B.CreateSExtOrTrunc(V: Result, DestTy: B.getInt32Ty());
489
490 if (I->second.VarMul)
491 Result = B.CreateMul(LHS: Result, RHS: I->second.VarMul);
492
493 if (I->second.VarShift)
494 Result = B.CreateAShr(LHS: Result, RHS: I->second.VarShift, Name: "", /*isExact*/ true);
495 }
496
497 if (I->second.ConstIndex) {
498 if (Result)
499 Result = B.CreateAdd(LHS: Result, RHS: I->second.ConstIndex);
500 else
501 Result = I->second.ConstIndex;
502 }
503
504 if (!Result)
505 Result = B.getInt32(C: 0);
506
507 I->second.Full = Result;
508 }
509
510 return I->second.Full;
511}
512
513static std::optional<GEPToVectorIndex>
514computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
515 Type *VecElemTy, const DataLayout &DL) {
516 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
517 // helper.
518 LLVMContext &Ctx = GEP->getContext();
519 unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType());
520 SmallMapVector<Value *, APInt, 4> VarOffsets;
521 APInt ConstOffset(BW, 0);
522
523 // Walk backwards through nested GEPs to collect both constant and variable
524 // offsets, so that nested vector GEP chains can be lowered in one step.
525 //
526 // Given this IR fragment as input:
527 //
528 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
529 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
530 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
531 // %3 = load i32, ptr addrspace(5) %2, align 4
532 //
533 // Combine both GEP operations in a single pass, producing:
534 // BasePtr = %0
535 // ConstOffset = 4
536 // VarOffsets = { %j -> element_size(<2 x i32>) }
537 //
538 // That lets us emit a single buffer_load directly into a VGPR, without ever
539 // allocating scratch memory for the intermediate pointer.
540 Value *CurPtr = GEP;
541 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(Val: CurPtr)) {
542 if (!CurGEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ConstOffset))
543 return {};
544
545 // Move to the next outer pointer.
546 CurPtr = CurGEP->getPointerOperand();
547 }
548
549 assert(CurPtr == Alloca && "GEP not based on alloca");
550
551 int64_t VecElemSize = DL.getTypeAllocSize(Ty: VecElemTy);
552 if (VarOffsets.size() > 1)
553 return {};
554
555 // We support vector indices of the form ((VarIndex * stride) >> shift) + B.
556 // IndexQuot represents B. Check that the constant offset is a multiple
557 // of the vector element size.
558 if (ConstOffset.srem(RHS: VecElemSize) != 0)
559 return {};
560 APInt IndexQuot = ConstOffset.sdiv(RHS: VecElemSize);
561
562 GEPToVectorIndex Result;
563
564 if (!ConstOffset.isZero())
565 Result.ConstIndex = ConstantInt::get(Context&: Ctx, V: IndexQuot.sextOrTrunc(width: BW));
566
567 // If there are no variable offsets, only a constant offset, then we're done.
568 if (VarOffsets.empty())
569 return Result;
570
571 // Scale is the stride in the (A * stride) part. Check that there is only one
572 // variable offset and extract the scale factor.
573 const auto &VarOffset = VarOffsets.front();
574 auto ScaleOpt = VarOffset.second.tryZExtValue();
575 if (!ScaleOpt || *ScaleOpt == 0)
576 return {};
577
578 uint64_t Scale = *ScaleOpt;
579 Result.VarIndex = VarOffset.first;
580 auto *OffsetType = dyn_cast<IntegerType>(Val: Result.VarIndex->getType());
581 if (!OffsetType)
582 return {};
583
584 // The vector index for the variable part is: VarIndex * Scale / VecElemSize.
585 if (Scale >= (uint64_t)VecElemSize) {
586 if (Scale % VecElemSize != 0)
587 return {};
588
589 // Scale is a multiple of VecElemSize, so the index is just: VarIndex *
590 // (Scale / VecElemSize).
591 uint64_t VarMul = Scale / VecElemSize;
592 // Only the multiplier is needed.
593 if (VarMul != 1)
594 Result.VarMul = ConstantInt::get(Context&: Ctx, V: APInt(BW, VarMul));
595 } else {
596 if ((uint64_t)VecElemSize % Scale != 0)
597 return {};
598
599 // VecElemSize is a multiple of Scale, so the index is just: VarIndex /
600 // (VecElemSize / Scale).
601 uint64_t Divisor = VecElemSize / Scale;
602 // The divisor must be a power of 2 so we can use a right shift.
603 if (!isPowerOf2_64(Value: Divisor))
604 return {};
605
606 // VarIndex must be known to be divisible by that divisor.
607 KnownBits KB = computeKnownBits(V: VarOffset.first, DL);
608 if (KB.countMinTrailingZeros() < Log2_64(Value: Divisor))
609 return {};
610
611 Result.VarShift = ConstantInt::get(Context&: Ctx, V: APInt(BW, Log2_64(Value: Divisor)));
612 }
613
614 return Result;
615}
616
617/// Promotes a single user of the alloca to a vector form.
618///
619/// \param Inst Instruction to be promoted.
620/// \param DL Module Data Layout.
621/// \param AA Alloca Analysis.
622/// \param VecStoreSize Size of \p VectorTy in bytes.
623/// \param ElementSize Size of \p VectorTy element type in bytes.
624/// \param CurVal Current value of the vector (e.g. last stored value)
625/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
626/// be promoted now. This happens when promoting requires \p
627/// CurVal, but \p CurVal is nullptr.
628/// \return the stored value if \p Inst would have written to the alloca, or
629/// nullptr otherwise.
630static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
631 AllocaAnalysis &AA,
632 unsigned VecStoreSize,
633 unsigned ElementSize,
634 function_ref<Value *()> GetCurVal) {
635 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
636 // to do more folding, especially in the case of vector splats.
637 IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
638 InstSimplifyFolder(DL));
639 Builder.SetInsertPoint(Inst);
640
641 Type *VecEltTy = AA.Vector.Ty->getElementType();
642
643 switch (Inst->getOpcode()) {
644 case Instruction::Load: {
645 Value *CurVal = GetCurVal();
646 Value *Index =
647 calculateVectorIndex(Ptr: cast<LoadInst>(Val: Inst)->getPointerOperand(), AA);
648
649 // We're loading the full vector.
650 Type *AccessTy = Inst->getType();
651 TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
652 if (Constant *CI = dyn_cast<Constant>(Val: Index)) {
653 if (CI->isNullValue() && AccessSize == VecStoreSize) {
654 Inst->replaceAllUsesWith(
655 V: Builder.CreateBitPreservingCastChain(DL, V: CurVal, NewTy: AccessTy));
656 return nullptr;
657 }
658 }
659
660 // Loading a subvector.
661 if (isa<FixedVectorType>(Val: AccessTy)) {
662 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
663 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
664 auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumLoadedElts);
665 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
666
667 // If idx is dynamic, then sandwich load with bitcasts.
668 // ie. VectorTy SubVecTy AccessTy
669 // <64 x i8> -> <16 x i8> <8 x i16>
670 // <64 x i8> -> <4 x i128> -> i128 -> <8 x i16>
671 // Extracting subvector with dynamic index has very large expansion in
672 // the amdgpu backend. Limit to pow2.
673 FixedVectorType *VectorTy = AA.Vector.Ty;
674 TypeSize NumBits = DL.getTypeStoreSize(Ty: SubVecTy) * 8u;
675 uint64_t LoadAlign = cast<LoadInst>(Val: Inst)->getAlign().value();
676 bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
677 unsigned TotalNumElts = VectorTy->getNumElements();
678 bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
679 if (!isa<ConstantInt>(Val: Index) &&
680 llvm::isPowerOf2_32(Value: SubVecTy->getNumElements()) &&
681 IsProperlyDivisible && IsAlignedLoad) {
682 IntegerType *NewElemTy = Builder.getIntNTy(N: NumBits);
683 const unsigned NewNumElts =
684 DL.getTypeStoreSize(Ty: VectorTy) * 8u / NumBits;
685 const unsigned LShrAmt = llvm::Log2_32(Value: SubVecTy->getNumElements());
686 FixedVectorType *BitCastTy =
687 FixedVectorType::get(ElementType: NewElemTy, NumElts: NewNumElts);
688 Value *BCVal = Builder.CreateBitCast(V: CurVal, DestTy: BitCastTy);
689 Value *NewIdx = Builder.CreateLShr(
690 LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: LShrAmt));
691 Value *ExtVal = Builder.CreateExtractElement(Vec: BCVal, Idx: NewIdx);
692 Value *BCOut = Builder.CreateBitCast(V: ExtVal, DestTy: AccessTy);
693 Inst->replaceAllUsesWith(V: BCOut);
694 return nullptr;
695 }
696
697 Value *SubVec = PoisonValue::get(T: SubVecTy);
698 for (unsigned K = 0; K < NumLoadedElts; ++K) {
699 Value *CurIdx =
700 Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
701 SubVec = Builder.CreateInsertElement(
702 Vec: SubVec, NewElt: Builder.CreateExtractElement(Vec: CurVal, Idx: CurIdx), Idx: K);
703 }
704
705 Inst->replaceAllUsesWith(
706 V: Builder.CreateBitPreservingCastChain(DL, V: SubVec, NewTy: AccessTy));
707 return nullptr;
708 }
709
710 // We're loading one element.
711 Value *ExtractElement = Builder.CreateExtractElement(Vec: CurVal, Idx: Index);
712 if (AccessTy != VecEltTy)
713 ExtractElement = Builder.CreateBitOrPointerCast(V: ExtractElement, DestTy: AccessTy);
714
715 Inst->replaceAllUsesWith(V: ExtractElement);
716 return nullptr;
717 }
718 case Instruction::Store: {
719 // For stores, it's a bit trickier and it depends on whether we're storing
720 // the full vector or not. If we're storing the full vector, we don't need
721 // to know the current value. If this is a store of a single element, we
722 // need to know the value.
723 StoreInst *SI = cast<StoreInst>(Val: Inst);
724 Value *Index = calculateVectorIndex(Ptr: SI->getPointerOperand(), AA);
725 Value *Val = SI->getValueOperand();
726
727 // We're storing the full vector, we can handle this without knowing CurVal.
728 Type *AccessTy = Val->getType();
729 TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
730 if (Constant *CI = dyn_cast<Constant>(Val: Index))
731 if (CI->isNullValue() && AccessSize == VecStoreSize)
732 return Builder.CreateBitPreservingCastChain(DL, V: Val, NewTy: AA.Vector.Ty);
733
734 // Storing a subvector.
735 if (isa<FixedVectorType>(Val: AccessTy)) {
736 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
737 const unsigned NumWrittenElts =
738 AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
739 const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
740 auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumWrittenElts);
741 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
742
743 Val = Builder.CreateBitPreservingCastChain(DL, V: Val, NewTy: SubVecTy);
744 Value *CurVec = GetCurVal();
745 for (unsigned K = 0, NumElts = std::min(a: NumWrittenElts, b: NumVecElts);
746 K < NumElts; ++K) {
747 Value *CurIdx =
748 Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
749 CurVec = Builder.CreateInsertElement(
750 Vec: CurVec, NewElt: Builder.CreateExtractElement(Vec: Val, Idx: K), Idx: CurIdx);
751 }
752 return CurVec;
753 }
754
755 if (Val->getType() != VecEltTy)
756 Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: VecEltTy);
757 return Builder.CreateInsertElement(Vec: GetCurVal(), NewElt: Val, Idx: Index);
758 }
759 case Instruction::Call: {
760 if (auto *MTI = dyn_cast<MemTransferInst>(Val: Inst)) {
761 // For memcpy, we need to know curval.
762 ConstantInt *Length = cast<ConstantInt>(Val: MTI->getLength());
763 unsigned NumCopied = Length->getZExtValue() / ElementSize;
764 MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
765 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
766 unsigned DestBegin = TI->DestIndex->getZExtValue();
767
768 SmallVector<int> Mask;
769 for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
770 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
771 Mask.push_back(Elt: SrcBegin < AA.Vector.Ty->getNumElements()
772 ? SrcBegin++
773 : PoisonMaskElem);
774 } else {
775 Mask.push_back(Elt: Idx);
776 }
777 }
778
779 return Builder.CreateShuffleVector(V: GetCurVal(), Mask);
780 }
781
782 if (auto *MSI = dyn_cast<MemSetInst>(Val: Inst)) {
783 // For memset, we don't need to know the previous value because we
784 // currently only allow memsets that cover the whole alloca.
785 Value *Elt = MSI->getOperand(i_nocapture: 1);
786 const unsigned BytesPerElt = DL.getTypeStoreSize(Ty: VecEltTy);
787 if (BytesPerElt > 1) {
788 Value *EltBytes = Builder.CreateVectorSplat(NumElts: BytesPerElt, V: Elt);
789
790 // If the element type of the vector is a pointer, we need to first cast
791 // to an integer, then use a PtrCast.
792 if (VecEltTy->isPointerTy()) {
793 Type *PtrInt = Builder.getIntNTy(N: BytesPerElt * 8);
794 Elt = Builder.CreateBitCast(V: EltBytes, DestTy: PtrInt);
795 Elt = Builder.CreateIntToPtr(V: Elt, DestTy: VecEltTy);
796 } else
797 Elt = Builder.CreateBitCast(V: EltBytes, DestTy: VecEltTy);
798 }
799
800 return Builder.CreateVectorSplat(EC: AA.Vector.Ty->getElementCount(), V: Elt);
801 }
802
803 if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
804 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
805 Intr->replaceAllUsesWith(
806 V: Builder.getIntN(N: Intr->getType()->getIntegerBitWidth(),
807 C: DL.getTypeAllocSize(Ty: AA.Vector.Ty)));
808 return nullptr;
809 }
810 }
811
812 llvm_unreachable("Unsupported call when promoting alloca to vector");
813 }
814
815 default:
816 llvm_unreachable("Inconsistency in instructions promotable to vector");
817 }
818
819 llvm_unreachable("Did not return after promoting instruction!");
820}
821
822static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
823 const DataLayout &DL) {
824 // Access as a vector type can work if the size of the access vector is a
825 // multiple of the size of the alloca's vector element type.
826 //
827 // Examples:
828 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
829 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
830 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
831 // - 3*32 is not a multiple of 64
832 //
833 // We could handle more complicated cases, but it'd make things a lot more
834 // complicated.
835 if (isa<FixedVectorType>(Val: AccessTy)) {
836 TypeSize AccTS = DL.getTypeStoreSize(Ty: AccessTy);
837 // If the type size and the store size don't match, we would need to do more
838 // than just bitcast to translate between an extracted/insertable subvectors
839 // and the accessed value.
840 if (AccTS * 8 != DL.getTypeSizeInBits(Ty: AccessTy))
841 return false;
842 TypeSize VecTS = DL.getTypeStoreSize(Ty: VecTy->getElementType());
843 return AccTS.isKnownMultipleOf(RHS: VecTS);
844 }
845
846 return CastInst::isBitOrNoopPointerCastable(SrcTy: VecTy->getElementType(), DestTy: AccessTy,
847 DL);
848}
849
850/// Iterates over an instruction worklist that may contain multiple instructions
851/// from the same basic block, but in a different order.
852template <typename InstContainer>
853static void forEachWorkListItem(const InstContainer &WorkList,
854 std::function<void(Instruction *)> Fn) {
855 // Bucket up uses of the alloca by the block they occur in.
856 // This is important because we have to handle multiple defs/uses in a block
857 // ourselves: SSAUpdater is purely for cross-block references.
858 DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
859 for (Instruction *User : WorkList)
860 UsesByBlock[User->getParent()].insert(V: User);
861
862 for (Instruction *User : WorkList) {
863 BasicBlock *BB = User->getParent();
864 auto &BlockUses = UsesByBlock[BB];
865
866 // Already processed, skip.
867 if (BlockUses.empty())
868 continue;
869
870 // Only user in the block, directly process it.
871 if (BlockUses.size() == 1) {
872 Fn(User);
873 continue;
874 }
875
876 // Multiple users in the block, do a linear scan to see users in order.
877 for (Instruction &Inst : *BB) {
878 if (!BlockUses.contains(V: &Inst))
879 continue;
880
881 Fn(&Inst);
882 }
883
884 // Clear the block so we know it's been processed.
885 BlockUses.clear();
886 }
887}
888
889/// Find an insert point after an alloca, after all other allocas clustered at
890/// the start of the block.
891static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
892 BasicBlock::iterator I) {
893 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(Val: *I); ++I)
894 ;
895 return I;
896}
897
898FixedVectorType *
899AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
900 if (DisablePromoteAllocaToVector) {
901 LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
902 return nullptr;
903 }
904
905 auto *VectorTy = dyn_cast<FixedVectorType>(Val: AllocaTy);
906 if (auto *ArrayTy = dyn_cast<ArrayType>(Val: AllocaTy)) {
907 uint64_t NumElems = 1;
908 Type *ElemTy;
909 do {
910 NumElems *= ArrayTy->getNumElements();
911 ElemTy = ArrayTy->getElementType();
912 } while ((ArrayTy = dyn_cast<ArrayType>(Val: ElemTy)));
913
914 // Check for array of vectors
915 auto *InnerVectorTy = dyn_cast<FixedVectorType>(Val: ElemTy);
916 if (InnerVectorTy) {
917 NumElems *= InnerVectorTy->getNumElements();
918 ElemTy = InnerVectorTy->getElementType();
919 }
920
921 if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
922 unsigned ElementSize = DL.getTypeSizeInBits(Ty: ElemTy) / 8;
923 if (ElementSize > 0) {
924 unsigned AllocaSize = DL.getTypeStoreSize(Ty: AllocaTy);
925 // Expand vector if required to match padding of inner type,
926 // i.e. odd size subvectors.
927 // Storage size of new vector must match that of alloca for correct
928 // behaviour of byte offsets and GEP computation.
929 if (NumElems * ElementSize != AllocaSize)
930 NumElems = AllocaSize / ElementSize;
931 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
932 VectorTy = FixedVectorType::get(ElementType: ElemTy, NumElts: NumElems);
933 }
934 }
935 }
936 if (!VectorTy) {
937 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
938 return nullptr;
939 }
940
941 const unsigned MaxElements =
942 (MaxVectorRegs * 32) / DL.getTypeSizeInBits(Ty: VectorTy->getElementType());
943
944 if (VectorTy->getNumElements() > MaxElements ||
945 VectorTy->getNumElements() < 2) {
946 LLVM_DEBUG(dbgs() << " " << *VectorTy
947 << " has an unsupported number of elements\n");
948 return nullptr;
949 }
950
951 Type *VecEltTy = VectorTy->getElementType();
952 unsigned ElementSizeInBits = DL.getTypeSizeInBits(Ty: VecEltTy);
953 if (ElementSizeInBits != DL.getTypeAllocSizeInBits(Ty: VecEltTy)) {
954 LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
955 "does not match the type's size\n");
956 return nullptr;
957 }
958
959 return VectorTy;
960}
961
962void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
963 if (AA.HaveSelectOrPHI) {
964 LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n");
965 return;
966 }
967
968 Type *AllocaTy = AA.Alloca->getAllocatedType();
969 AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
970 if (!AA.Vector.Ty)
971 return;
972
973 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
974 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
975 << " " << *Inst << "\n");
976 AA.Vector.Ty = nullptr;
977 };
978
979 Type *VecEltTy = AA.Vector.Ty->getElementType();
980 unsigned ElementSize = DL.getTypeSizeInBits(Ty: VecEltTy) / 8;
981 assert(ElementSize > 0);
982 for (auto *U : AA.Uses) {
983 Instruction *Inst = cast<Instruction>(Val: U->getUser());
984
985 if (Value *Ptr = getLoadStorePointerOperand(V: Inst)) {
986 assert(!isa<StoreInst>(Inst) ||
987 U->getOperandNo() == StoreInst::getPointerOperandIndex());
988
989 Type *AccessTy = getLoadStoreType(I: Inst);
990 if (AccessTy->isAggregateType())
991 return RejectUser(Inst, "unsupported load/store as aggregate");
992 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
993
994 // Check that this is a simple access of a vector element.
995 bool IsSimple = isa<LoadInst>(Val: Inst) ? cast<LoadInst>(Val: Inst)->isSimple()
996 : cast<StoreInst>(Val: Inst)->isSimple();
997 if (!IsSimple)
998 return RejectUser(Inst, "not a simple load or store");
999
1000 Ptr = Ptr->stripPointerCasts();
1001
1002 // Alloca already accessed as vector.
1003 if (Ptr == AA.Alloca &&
1004 DL.getTypeStoreSize(Ty: AA.Alloca->getAllocatedType()) ==
1005 DL.getTypeStoreSize(Ty: AccessTy)) {
1006 AA.Vector.Worklist.push_back(Elt: Inst);
1007 continue;
1008 }
1009
1010 if (!isSupportedAccessType(VecTy: AA.Vector.Ty, AccessTy, DL))
1011 return RejectUser(Inst, "not a supported access type");
1012
1013 AA.Vector.Worklist.push_back(Elt: Inst);
1014 continue;
1015 }
1016
1017 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
1018 // If we can't compute a vector index from this GEP, then we can't
1019 // promote this alloca to vector.
1020 auto Index = computeGEPToVectorIndex(GEP, Alloca: AA.Alloca, VecElemTy: VecEltTy, DL);
1021 if (!Index)
1022 return RejectUser(Inst, "cannot compute vector index for GEP");
1023
1024 AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
1025 AA.Vector.UsersToRemove.push_back(Elt: Inst);
1026 continue;
1027 }
1028
1029 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Val: Inst);
1030 MSI && isSupportedMemset(I: MSI, AI: AA.Alloca, DL)) {
1031 AA.Vector.Worklist.push_back(Elt: Inst);
1032 continue;
1033 }
1034
1035 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Val: Inst)) {
1036 if (TransferInst->isVolatile())
1037 return RejectUser(Inst, "mem transfer inst is volatile");
1038
1039 ConstantInt *Len = dyn_cast<ConstantInt>(Val: TransferInst->getLength());
1040 if (!Len || (Len->getZExtValue() % ElementSize))
1041 return RejectUser(Inst, "mem transfer inst length is non-constant or "
1042 "not a multiple of the vector element size");
1043
1044 auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
1045 if (Ptr == AA.Alloca)
1046 return ConstantInt::get(Context&: Ptr->getContext(), V: APInt(32, 0));
1047
1048 GetElementPtrInst *GEP = cast<GetElementPtrInst>(Val: Ptr);
1049 const auto &GEPI = AA.Vector.GEPVectorIdx.find(Key: GEP)->second;
1050 if (GEPI.VarIndex)
1051 return nullptr;
1052 if (GEPI.ConstIndex)
1053 return GEPI.ConstIndex;
1054 return ConstantInt::get(Context&: Ptr->getContext(), V: APInt(32, 0));
1055 };
1056
1057 MemTransferInfo *TI =
1058 &AA.Vector.TransferInfo.try_emplace(Key: TransferInst).first->second;
1059 unsigned OpNum = U->getOperandNo();
1060 if (OpNum == 0) {
1061 Value *Dest = TransferInst->getDest();
1062 ConstantInt *Index = getConstIndexIntoAlloca(Dest);
1063 if (!Index)
1064 return RejectUser(Inst, "could not calculate constant dest index");
1065 TI->DestIndex = Index;
1066 } else {
1067 assert(OpNum == 1);
1068 Value *Src = TransferInst->getSource();
1069 ConstantInt *Index = getConstIndexIntoAlloca(Src);
1070 if (!Index)
1071 return RejectUser(Inst, "could not calculate constant src index");
1072 TI->SrcIndex = Index;
1073 }
1074 continue;
1075 }
1076
1077 if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
1078 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
1079 AA.Vector.Worklist.push_back(Elt: Inst);
1080 continue;
1081 }
1082 }
1083
1084 // Ignore assume-like intrinsics and comparisons used in assumes.
1085 if (isAssumeLikeIntrinsic(I: Inst)) {
1086 if (!Inst->use_empty())
1087 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
1088 AA.Vector.UsersToRemove.push_back(Elt: Inst);
1089 continue;
1090 }
1091
1092 if (isa<ICmpInst>(Val: Inst) && all_of(Range: Inst->users(), P: [](User *U) {
1093 return isAssumeLikeIntrinsic(I: cast<Instruction>(Val: U));
1094 })) {
1095 AA.Vector.UsersToRemove.push_back(Elt: Inst);
1096 continue;
1097 }
1098
1099 return RejectUser(Inst, "unhandled alloca user");
1100 }
1101
1102 // Follow-up check to ensure we've seen both sides of all transfer insts.
1103 for (const auto &Entry : AA.Vector.TransferInfo) {
1104 const MemTransferInfo &TI = Entry.second;
1105 if (!TI.SrcIndex || !TI.DestIndex)
1106 return RejectUser(Entry.first,
1107 "mem transfer inst between different objects");
1108 AA.Vector.Worklist.push_back(Elt: Entry.first);
1109 }
1110}
1111
1112void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
1113 LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
1114 LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType()
1115 << " -> " << *AA.Vector.Ty << '\n');
1116 const unsigned VecStoreSize = DL.getTypeStoreSize(Ty: AA.Vector.Ty);
1117
1118 Type *VecEltTy = AA.Vector.Ty->getElementType();
1119 const unsigned ElementSize = DL.getTypeSizeInBits(Ty: VecEltTy) / 8;
1120
1121 // Alloca is uninitialized memory. Imitate that by making the first value
1122 // undef.
1123 SSAUpdater Updater;
1124 Updater.Initialize(Ty: AA.Vector.Ty, Name: "promotealloca");
1125
1126 BasicBlock *EntryBB = AA.Alloca->getParent();
1127 BasicBlock::iterator InitInsertPos =
1128 skipToNonAllocaInsertPt(BB&: *EntryBB, I: AA.Alloca->getIterator());
1129 IRBuilder<> Builder(&*InitInsertPos);
1130 Value *AllocaInitValue = Builder.CreateFreeze(V: PoisonValue::get(T: AA.Vector.Ty));
1131 AllocaInitValue->takeName(V: AA.Alloca);
1132
1133 Updater.AddAvailableValue(BB: AA.Alloca->getParent(), V: AllocaInitValue);
1134
1135 // First handle the initial worklist, in basic block order.
1136 //
1137 // Insert a placeholder whenever we need the vector value at the top of a
1138 // basic block.
1139 SmallSetVector<Instruction *, 8> Placeholders;
1140 forEachWorkListItem(WorkList: AA.Vector.Worklist, Fn: [&](Instruction *I) {
1141 BasicBlock *BB = I->getParent();
1142 auto GetCurVal = [&]() -> Value * {
1143 if (Value *CurVal = Updater.FindValueForBlock(BB))
1144 return CurVal;
1145
1146 if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
1147 return Placeholders.back();
1148
1149 // If the current value in the basic block is not yet known, insert a
1150 // placeholder that we will replace later.
1151 IRBuilder<> Builder(I);
1152 auto *Placeholder = cast<Instruction>(Val: Builder.CreateFreeze(
1153 V: PoisonValue::get(T: AA.Vector.Ty), Name: "promotealloca.placeholder"));
1154 Placeholders.insert(X: Placeholder);
1155 return Placeholders.back();
1156 };
1157
1158 Value *Result = promoteAllocaUserToVector(Inst: I, DL, AA, VecStoreSize,
1159 ElementSize, GetCurVal);
1160 // If the returned result is a placeholder, it means the instruction does
1161 // not really modify the alloca. So no need to make it being available value
1162 // to SSAUpdater.
1163 // This will stop placeholder being cached in SSAUpdater. The cached
1164 // placeholder may cause stale pointer being referenced when doing
1165 // placeholder replacement.
1166 if (Result && (!isa<Instruction>(Val: Result) ||
1167 !Placeholders.contains(key: cast<Instruction>(Val: Result))))
1168 Updater.AddAvailableValue(BB, V: Result);
1169 });
1170
1171 // Now fixup the placeholders.
1172 for (Instruction *Placeholder : Placeholders) {
1173 Placeholder->replaceAllUsesWith(
1174 V: Updater.GetValueInMiddleOfBlock(BB: Placeholder->getParent()));
1175 Placeholder->eraseFromParent();
1176 }
1177
1178 // Delete all instructions.
1179 for (Instruction *I : AA.Vector.Worklist) {
1180 assert(I->use_empty());
1181 I->eraseFromParent();
1182 }
1183
1184 // Delete all the users that are known to be removeable.
1185 for (Instruction *I : reverse(C&: AA.Vector.UsersToRemove)) {
1186 I->dropDroppableUses();
1187 assert(I->use_empty());
1188 I->eraseFromParent();
1189 }
1190
1191 // Alloca should now be dead too.
1192 assert(AA.Alloca->use_empty());
1193 AA.Alloca->eraseFromParent();
1194}
1195
1196std::pair<Value *, Value *>
1197AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1198 Function &F = *Builder.GetInsertBlock()->getParent();
1199 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1200
1201 if (!IsAMDHSA) {
1202 CallInst *LocalSizeY = Builder.CreateIntrinsicWithoutFolding(
1203 ID: Intrinsic::r600_read_local_size_y, Args: {});
1204 CallInst *LocalSizeZ = Builder.CreateIntrinsicWithoutFolding(
1205 ID: Intrinsic::r600_read_local_size_z, Args: {});
1206
1207 ST.makeLIDRangeMetadata(I: LocalSizeY);
1208 ST.makeLIDRangeMetadata(I: LocalSizeZ);
1209
1210 return std::pair(LocalSizeY, LocalSizeZ);
1211 }
1212
1213 // We must read the size out of the dispatch pointer.
1214 assert(IsAMDGCN);
1215
1216 // We are indexing into this struct, and want to extract the workgroup_size_*
1217 // fields.
1218 //
1219 // typedef struct hsa_kernel_dispatch_packet_s {
1220 // uint16_t header;
1221 // uint16_t setup;
1222 // uint16_t workgroup_size_x ;
1223 // uint16_t workgroup_size_y;
1224 // uint16_t workgroup_size_z;
1225 // uint16_t reserved0;
1226 // uint32_t grid_size_x ;
1227 // uint32_t grid_size_y ;
1228 // uint32_t grid_size_z;
1229 //
1230 // uint32_t private_segment_size;
1231 // uint32_t group_segment_size;
1232 // uint64_t kernel_object;
1233 //
1234 // #ifdef HSA_LARGE_MODEL
1235 // void *kernarg_address;
1236 // #elif defined HSA_LITTLE_ENDIAN
1237 // void *kernarg_address;
1238 // uint32_t reserved1;
1239 // #else
1240 // uint32_t reserved1;
1241 // void *kernarg_address;
1242 // #endif
1243 // uint64_t reserved2;
1244 // hsa_signal_t completion_signal; // uint64_t wrapper
1245 // } hsa_kernel_dispatch_packet_t
1246 //
1247 CallInst *DispatchPtr =
1248 Builder.CreateIntrinsicWithoutFolding(ID: Intrinsic::amdgcn_dispatch_ptr, Args: {});
1249 DispatchPtr->addRetAttr(Kind: Attribute::NoAlias);
1250 DispatchPtr->addRetAttr(Kind: Attribute::NonNull);
1251 F.removeFnAttr(Kind: "amdgpu-no-dispatch-ptr");
1252
1253 // Size of the dispatch packet struct.
1254 DispatchPtr->addDereferenceableRetAttr(Bytes: 64);
1255
1256 Type *I32Ty = Type::getInt32Ty(C&: Mod.getContext());
1257
1258 // We could do a single 64-bit load here, but it's likely that the basic
1259 // 32-bit and extract sequence is already present, and it is probably easier
1260 // to CSE this. The loads should be mergeable later anyway.
1261 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 1);
1262 LoadInst *LoadXY = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPXY, Align: Align(4));
1263
1264 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 2);
1265 LoadInst *LoadZU = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPZU, Align: Align(4));
1266
1267 MDNode *MD = MDNode::get(Context&: Mod.getContext(), MDs: {});
1268 LoadXY->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
1269 LoadZU->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
1270 ST.makeLIDRangeMetadata(I: LoadZU);
1271
1272 // Extract y component. Upper half of LoadZU should be zero already.
1273 Value *Y = Builder.CreateLShr(LHS: LoadXY, RHS: 16);
1274
1275 return std::pair(Y, LoadZU);
1276}
1277
1278Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1279 unsigned N) {
1280 Function *F = Builder.GetInsertBlock()->getParent();
1281 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: *F);
1282 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1283 StringRef AttrName;
1284
1285 switch (N) {
1286 case 0:
1287 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1288 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1289 AttrName = "amdgpu-no-workitem-id-x";
1290 break;
1291 case 1:
1292 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1293 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1294 AttrName = "amdgpu-no-workitem-id-y";
1295 break;
1296
1297 case 2:
1298 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1299 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1300 AttrName = "amdgpu-no-workitem-id-z";
1301 break;
1302 default:
1303 llvm_unreachable("invalid dimension");
1304 }
1305
1306 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(M: &Mod, id: IntrID);
1307 CallInst *CI = Builder.CreateCall(Callee: WorkitemIdFn);
1308 ST.makeLIDRangeMetadata(I: CI);
1309 F->removeFnAttr(Kind: AttrName);
1310
1311 return CI;
1312}
1313
1314static bool isCallPromotable(CallInst *CI) {
1315 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI);
1316 if (!II)
1317 return false;
1318
1319 switch (II->getIntrinsicID()) {
1320 case Intrinsic::memcpy:
1321 case Intrinsic::memmove:
1322 case Intrinsic::memset:
1323 case Intrinsic::lifetime_start:
1324 case Intrinsic::lifetime_end:
1325 case Intrinsic::invariant_start:
1326 case Intrinsic::invariant_end:
1327 case Intrinsic::launder_invariant_group:
1328 case Intrinsic::strip_invariant_group:
1329 case Intrinsic::objectsize:
1330 return true;
1331 default:
1332 return false;
1333 }
1334}
1335
1336bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1337 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1338 int OpIdx1) const {
1339 // Figure out which operand is the one we might not be promoting.
1340 Value *OtherOp = Inst->getOperand(i: OpIdx0);
1341 if (Val == OtherOp)
1342 OtherOp = Inst->getOperand(i: OpIdx1);
1343
1344 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: OtherOp))
1345 return true;
1346
1347 // TODO: getUnderlyingObject will not work on a vector getelementptr
1348 Value *OtherObj = getUnderlyingObject(V: OtherOp);
1349 if (!isa<AllocaInst>(Val: OtherObj))
1350 return false;
1351
1352 // TODO: We should be able to replace undefs with the right pointer type.
1353
1354 // TODO: If we know the other base object is another promotable
1355 // alloca, not necessarily this alloca, we can do this. The
1356 // important part is both must have the same address space at
1357 // the end.
1358 if (OtherObj != BaseAlloca) {
1359 LLVM_DEBUG(
1360 dbgs() << "Found a binary instruction with another alloca object\n");
1361 return false;
1362 }
1363
1364 return true;
1365}
1366
1367void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
1368 if (DisablePromoteAllocaToLDS) {
1369 LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1370 return;
1371 }
1372
1373 // Don't promote the alloca to LDS for shader calling conventions as the work
1374 // item ID intrinsics are not supported for these calling conventions.
1375 // Furthermore not all LDS is available for some of the stages.
1376 const Function &ContainingFunction = *AA.Alloca->getFunction();
1377 CallingConv::ID CC = ContainingFunction.getCallingConv();
1378
1379 switch (CC) {
1380 case CallingConv::AMDGPU_KERNEL:
1381 case CallingConv::SPIR_KERNEL:
1382 break;
1383 default:
1384 LLVM_DEBUG(
1385 dbgs()
1386 << " promote alloca to LDS not supported with calling convention.\n");
1387 return;
1388 }
1389
1390 for (Use *Use : AA.Uses) {
1391 auto *User = Use->getUser();
1392
1393 if (CallInst *CI = dyn_cast<CallInst>(Val: User)) {
1394 if (!isCallPromotable(CI))
1395 return;
1396
1397 if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
1398 AA.LDS.Worklist.push_back(Elt: User);
1399 continue;
1400 }
1401
1402 Instruction *UseInst = cast<Instruction>(Val: User);
1403 if (UseInst->getOpcode() == Instruction::PtrToInt)
1404 return;
1405
1406 if (LoadInst *LI = dyn_cast<LoadInst>(Val: UseInst)) {
1407 if (LI->isVolatile())
1408 return;
1409 continue;
1410 }
1411
1412 if (StoreInst *SI = dyn_cast<StoreInst>(Val: UseInst)) {
1413 if (SI->isVolatile())
1414 return;
1415 continue;
1416 }
1417
1418 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UseInst)) {
1419 if (RMW->isVolatile())
1420 return;
1421 continue;
1422 }
1423
1424 if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(Val: UseInst)) {
1425 if (CAS->isVolatile())
1426 return;
1427 continue;
1428 }
1429
1430 // Only promote a select if we know that the other select operand
1431 // is from another pointer that will also be promoted.
1432 if (ICmpInst *ICmp = dyn_cast<ICmpInst>(Val: UseInst)) {
1433 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca: AA.Alloca, Val: Use->get(), Inst: ICmp, OpIdx0: 0, OpIdx1: 1))
1434 return;
1435
1436 // May need to rewrite constant operands.
1437 if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
1438 AA.LDS.Worklist.push_back(Elt: ICmp);
1439 continue;
1440 }
1441
1442 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: UseInst)) {
1443 // Be conservative if an address could be computed outside the bounds of
1444 // the alloca.
1445 if (!GEP->isInBounds())
1446 return;
1447 } else if (!isa<ExtractElementInst, SelectInst, PHINode>(Val: User)) {
1448 // Do not promote vector/aggregate type instructions. It is hard to track
1449 // their users.
1450
1451 // Do not promote addrspacecast.
1452 //
1453 // TODO: If we know the address is only observed through flat pointers, we
1454 // could still promote.
1455 return;
1456 }
1457
1458 if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
1459 AA.LDS.Worklist.push_back(Elt: User);
1460 }
1461
1462 AA.LDS.Enable = true;
1463}
1464
1465bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1466
1467 FunctionType *FTy = F.getFunctionType();
1468 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1469
1470 // If the function has any arguments in the local address space, then it's
1471 // possible these arguments require the entire local memory space, so
1472 // we cannot use local memory in the pass.
1473 for (Type *ParamTy : FTy->params()) {
1474 PointerType *PtrTy = dyn_cast<PointerType>(Val: ParamTy);
1475 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1476 LocalMemLimit = 0;
1477 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1478 "local memory disabled.\n");
1479 return false;
1480 }
1481 }
1482
1483 LocalMemLimit = ST.getAddressableLocalMemorySize();
1484 if (LocalMemLimit == 0)
1485 return false;
1486
1487 SmallVector<const Constant *, 16> Stack;
1488 SmallPtrSet<const Constant *, 8> VisitedConstants;
1489 SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
1490
1491 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1492 for (const User *U : Val->users()) {
1493 if (const Instruction *Use = dyn_cast<Instruction>(Val: U)) {
1494 if (Use->getFunction() == &F)
1495 return true;
1496 } else {
1497 const Constant *C = cast<Constant>(Val: U);
1498 if (VisitedConstants.insert(Ptr: C).second)
1499 Stack.push_back(Elt: C);
1500 }
1501 }
1502
1503 return false;
1504 };
1505
1506 for (GlobalVariable &GV : Mod.globals()) {
1507 if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
1508 continue;
1509
1510 if (visitUsers(&GV, &GV)) {
1511 UsedLDS.insert(Ptr: &GV);
1512 Stack.clear();
1513 continue;
1514 }
1515
1516 // For any ConstantExpr uses, we need to recursively search the users until
1517 // we see a function.
1518 while (!Stack.empty()) {
1519 const Constant *C = Stack.pop_back_val();
1520 if (visitUsers(&GV, C)) {
1521 UsedLDS.insert(Ptr: &GV);
1522 Stack.clear();
1523 break;
1524 }
1525 }
1526 }
1527
1528 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1529 AllocatedSizes.reserve(N: UsedLDS.size());
1530
1531 for (const GlobalVariable *GV : UsedLDS) {
1532 Align Alignment =
1533 DL.getValueOrABITypeAlignment(Alignment: GV->getAlign(), Ty: GV->getValueType());
1534 uint64_t AllocSize = GV->getGlobalSize(DL);
1535
1536 // HIP uses an extern unsized array in local address space for dynamically
1537 // allocated shared memory. In that case, we have to disable the promotion.
1538 if (GV->hasExternalLinkage() && AllocSize == 0) {
1539 LocalMemLimit = 0;
1540 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1541 "local memory. Promoting to local memory "
1542 "disabled.\n");
1543 return false;
1544 }
1545
1546 AllocatedSizes.emplace_back(Args&: AllocSize, Args&: Alignment);
1547 }
1548
1549 // Sort to try to estimate the worst case alignment padding
1550 //
1551 // FIXME: We should really do something to fix the addresses to a more optimal
1552 // value instead
1553 llvm::sort(C&: AllocatedSizes, Comp: llvm::less_second());
1554
1555 // Check how much local memory is being used by global objects
1556 CurrentLocalMemUsage = 0;
1557
1558 // FIXME: Try to account for padding here. The real padding and address is
1559 // currently determined from the inverse order of uses in the function when
1560 // legalizing, which could also potentially change. We try to estimate the
1561 // worst case here, but we probably should fix the addresses earlier.
1562 for (auto Alloc : AllocatedSizes) {
1563 CurrentLocalMemUsage = alignTo(Size: CurrentLocalMemUsage, A: Alloc.second);
1564 CurrentLocalMemUsage += Alloc.first;
1565 }
1566
1567 unsigned MaxOccupancy =
1568 ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes: CurrentLocalMemUsage, F)
1569 .second;
1570
1571 // Round up to the next tier of usage.
1572 unsigned MaxSizeWithWaveCount =
1573 ST.getMaxLocalMemSizeWithWaveCount(WaveCount: MaxOccupancy, F);
1574
1575 // Program may already use more LDS than is usable at maximum occupancy.
1576 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1577 return false;
1578
1579 LocalMemLimit = MaxSizeWithWaveCount;
1580
1581 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1582 << " bytes of LDS\n"
1583 << " Rounding size to " << MaxSizeWithWaveCount
1584 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1585 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1586 << " available for promotion\n");
1587
1588 return true;
1589}
1590
1591// FIXME: Should try to pick the most likely to be profitable allocas first.
1592bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
1593 AllocaAnalysis &AA, bool SufficientLDS,
1594 SetVector<IntrinsicInst *> &DeferredIntrs) {
1595 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
1596
1597 // Not likely to have sufficient local memory for promotion.
1598 if (!SufficientLDS)
1599 return false;
1600
1601 IRBuilder<> Builder(AA.Alloca);
1602
1603 const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
1604 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: ContainingFunction);
1605 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F: ContainingFunction).second;
1606
1607 Align Alignment = AA.Alloca->getAlign();
1608
1609 // FIXME: This computed padding is likely wrong since it depends on inverse
1610 // usage order.
1611 //
1612 // FIXME: It is also possible that if we're allowed to use all of the memory
1613 // could end up using more than the maximum due to alignment padding.
1614
1615 uint32_t NewSize = alignTo(Size: CurrentLocalMemUsage, A: Alignment);
1616 std::optional<TypeSize> ElemSize = AA.Alloca->getAllocationSize(DL);
1617 if (!ElemSize || ElemSize->isScalable())
1618 return false;
1619 TypeSize AllocSize = WorkGroupSize * *ElemSize;
1620 NewSize += AllocSize.getFixedValue();
1621
1622 if (NewSize > LocalMemLimit) {
1623 LLVM_DEBUG(dbgs() << " " << AllocSize
1624 << " bytes of local memory not available to promote\n");
1625 return false;
1626 }
1627
1628 CurrentLocalMemUsage = NewSize;
1629
1630 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1631
1632 Function *F = AA.Alloca->getFunction();
1633
1634 Type *GVTy = ArrayType::get(ElementType: AA.Alloca->getAllocatedType(), NumElements: WorkGroupSize);
1635 GlobalVariable *GV = new GlobalVariable(
1636 Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(T: GVTy),
1637 Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
1638 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1639 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1640 GV->setAlignment(AA.Alloca->getAlign());
1641
1642 Value *TCntY, *TCntZ;
1643
1644 std::tie(args&: TCntY, args&: TCntZ) = getLocalSizeYZ(Builder);
1645 Value *TIdX = getWorkitemID(Builder, N: 0);
1646 Value *TIdY = getWorkitemID(Builder, N: 1);
1647 Value *TIdZ = getWorkitemID(Builder, N: 2);
1648
1649 Value *Tmp0 = Builder.CreateMul(LHS: TCntY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true);
1650 Tmp0 = Builder.CreateMul(LHS: Tmp0, RHS: TIdX);
1651 Value *Tmp1 = Builder.CreateMul(LHS: TIdY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true);
1652 Value *TID = Builder.CreateAdd(LHS: Tmp0, RHS: Tmp1);
1653 TID = Builder.CreateAdd(LHS: TID, RHS: TIdZ);
1654
1655 LLVMContext &Context = Mod.getContext();
1656 Value *Indices[] = {Constant::getNullValue(Ty: Type::getInt32Ty(C&: Context)), TID};
1657
1658 Value *Offset = Builder.CreateInBoundsGEP(Ty: GVTy, Ptr: GV, IdxList: Indices);
1659 AA.Alloca->mutateType(Ty: Offset->getType());
1660 AA.Alloca->replaceAllUsesWith(V: Offset);
1661 AA.Alloca->eraseFromParent();
1662
1663 PointerType *NewPtrTy = PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
1664
1665 for (Value *V : AA.LDS.Worklist) {
1666 CallInst *Call = dyn_cast<CallInst>(Val: V);
1667 if (!Call) {
1668 if (ICmpInst *CI = dyn_cast<ICmpInst>(Val: V)) {
1669 Value *LHS = CI->getOperand(i_nocapture: 0);
1670 Value *RHS = CI->getOperand(i_nocapture: 1);
1671
1672 Type *NewTy = LHS->getType()->getWithNewType(EltTy: NewPtrTy);
1673 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: LHS))
1674 CI->setOperand(i_nocapture: 0, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1675
1676 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: RHS))
1677 CI->setOperand(i_nocapture: 1, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1678
1679 continue;
1680 }
1681
1682 // The operand's value should be corrected on its own and we don't want to
1683 // touch the users.
1684 if (isa<AddrSpaceCastInst>(Val: V))
1685 continue;
1686
1687 assert(V->getType()->isPtrOrPtrVectorTy());
1688
1689 Type *NewTy = V->getType()->getWithNewType(EltTy: NewPtrTy);
1690 V->mutateType(Ty: NewTy);
1691
1692 // Adjust the types of any constant operands.
1693 if (SelectInst *SI = dyn_cast<SelectInst>(Val: V)) {
1694 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: SI->getOperand(i_nocapture: 1)))
1695 SI->setOperand(i_nocapture: 1, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1696
1697 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: SI->getOperand(i_nocapture: 2)))
1698 SI->setOperand(i_nocapture: 2, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1699 } else if (PHINode *Phi = dyn_cast<PHINode>(Val: V)) {
1700 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1701 if (isa<ConstantPointerNull, ConstantAggregateZero>(
1702 Val: Phi->getIncomingValue(i: I)))
1703 Phi->setIncomingValue(i: I, V: Constant::getNullValue(Ty: NewTy));
1704 }
1705 }
1706
1707 continue;
1708 }
1709
1710 IntrinsicInst *Intr = cast<IntrinsicInst>(Val: Call);
1711 Builder.SetInsertPoint(Intr);
1712 switch (Intr->getIntrinsicID()) {
1713 case Intrinsic::lifetime_start:
1714 case Intrinsic::lifetime_end:
1715 // These intrinsics are for address space 0 only
1716 Intr->eraseFromParent();
1717 continue;
1718 case Intrinsic::memcpy:
1719 case Intrinsic::memmove:
1720 // These have 2 pointer operands. In case if second pointer also needs
1721 // to be replaced we defer processing of these intrinsics until all
1722 // other values are processed.
1723 DeferredIntrs.insert(X: Intr);
1724 continue;
1725 case Intrinsic::memset: {
1726 MemSetInst *MemSet = cast<MemSetInst>(Val: Intr);
1727 Builder.CreateMemSet(Ptr: MemSet->getRawDest(), Val: MemSet->getValue(),
1728 Size: MemSet->getLength(), Align: MemSet->getDestAlign(),
1729 isVolatile: MemSet->isVolatile());
1730 Intr->eraseFromParent();
1731 continue;
1732 }
1733 case Intrinsic::invariant_start:
1734 case Intrinsic::invariant_end:
1735 case Intrinsic::launder_invariant_group:
1736 case Intrinsic::strip_invariant_group: {
1737 SmallVector<Value *> Args;
1738 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1739 Args.emplace_back(Args: Intr->getArgOperand(i: 0));
1740 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1741 Args.emplace_back(Args: Intr->getArgOperand(i: 0));
1742 Args.emplace_back(Args: Intr->getArgOperand(i: 1));
1743 }
1744 Args.emplace_back(Args&: Offset);
1745 Function *F = Intrinsic::getOrInsertDeclaration(
1746 M: Intr->getModule(), id: Intr->getIntrinsicID(), OverloadTys: Offset->getType());
1747 CallInst *NewIntr =
1748 CallInst::Create(Func: F, Args, NameStr: Intr->getName(), InsertBefore: Intr->getIterator());
1749 Intr->mutateType(Ty: NewIntr->getType());
1750 Intr->replaceAllUsesWith(V: NewIntr);
1751 Intr->eraseFromParent();
1752 continue;
1753 }
1754 case Intrinsic::objectsize: {
1755 Value *Src = Intr->getOperand(i_nocapture: 0);
1756
1757 Value *NewCall = Builder.CreateIntrinsic(
1758 ID: Intrinsic::objectsize,
1759 OverloadTypes: {Intr->getType(), PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS)},
1760 Args: {Src, Intr->getOperand(i_nocapture: 1), Intr->getOperand(i_nocapture: 2), Intr->getOperand(i_nocapture: 3)});
1761 Intr->replaceAllUsesWith(V: NewCall);
1762 Intr->eraseFromParent();
1763 continue;
1764 }
1765 default:
1766 Intr->print(O&: errs());
1767 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1768 }
1769 }
1770
1771 return true;
1772}
1773
1774void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1775 SetVector<IntrinsicInst *> &DeferredIntrs) {
1776
1777 for (IntrinsicInst *Intr : DeferredIntrs) {
1778 IRBuilder<> Builder(Intr);
1779 Builder.SetInsertPoint(Intr);
1780 Intrinsic::ID ID = Intr->getIntrinsicID();
1781 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1782
1783 MemTransferInst *MI = cast<MemTransferInst>(Val: Intr);
1784 auto *B = Builder.CreateMemTransferInst(
1785 IntrID: ID, Dst: MI->getRawDest(), DstAlign: MI->getDestAlign(), Src: MI->getRawSource(),
1786 SrcAlign: MI->getSourceAlign(), Size: MI->getLength(), isVolatile: MI->isVolatile());
1787
1788 for (unsigned I = 0; I != 2; ++I) {
1789 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(i: I)) {
1790 B->addDereferenceableParamAttr(i: I, Bytes);
1791 }
1792 }
1793
1794 Intr->eraseFromParent();
1795 }
1796}
1797