1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
// declared in AMDGPU.h and the legacy PM ones are declared here.
25//
26//===----------------------------------------------------------------------===//
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/Analysis/CaptureTracking.h"
33#include "llvm/Analysis/InstSimplifyFolder.h"
34#include "llvm/Analysis/InstructionSimplify.h"
35#include "llvm/Analysis/LoopInfo.h"
36#include "llvm/Analysis/ValueTracking.h"
37#include "llvm/CodeGen/TargetPassConfig.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/PatternMatch.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Support/MathExtras.h"
46#include "llvm/Target/TargetMachine.h"
47#include "llvm/Transforms/Utils/SSAUpdater.h"
48
49#define DEBUG_TYPE "amdgpu-promote-alloca"
50
51using namespace llvm;
52
53namespace {
54
55static cl::opt<bool>
56 DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
57 cl::desc("Disable promote alloca to vector"),
58 cl::init(Val: false));
59
60static cl::opt<bool>
61 DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
62 cl::desc("Disable promote alloca to LDS"),
63 cl::init(Val: false));
64
65static cl::opt<unsigned> PromoteAllocaToVectorLimit(
66 "amdgpu-promote-alloca-to-vector-limit",
67 cl::desc("Maximum byte size to consider promote alloca to vector"),
68 cl::init(Val: 0));
69
70static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
71 "amdgpu-promote-alloca-to-vector-max-regs",
72 cl::desc(
73 "Maximum vector size (in 32b registers) to use when promoting alloca"),
74 cl::init(Val: 32));
75
76// Use up to 1/4 of available register budget for vectorization.
77// FIXME: Increase the limit for whole function budgets? Perhaps x2?
78static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
79 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
80 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
81 cl::init(Val: 4));
82
83static cl::opt<unsigned>
84 LoopUserWeight("promote-alloca-vector-loop-user-weight",
85 cl::desc("The bonus weight of users of allocas within loop "
86 "when sorting profitable allocas"),
87 cl::init(Val: 4));
88
89// We support vector indices of the form ((A * stride) >> shift) + B
90// VarIndex is A, VarMul is stride, VarShift is shift and ConstIndex is B. All
91// parts are optional.
92struct GEPToVectorIndex {
93 Value *VarIndex = nullptr; // defaults to 0
94 ConstantInt *VarMul = nullptr; // defaults to 1
95 ConstantInt *VarShift = nullptr; // defaults to 0
96 ConstantInt *ConstIndex = nullptr; // defaults to 0
97 Value *Full = nullptr;
98};
99
100struct MemTransferInfo {
101 ConstantInt *SrcIndex = nullptr;
102 ConstantInt *DestIndex = nullptr;
103};
104
105// Analysis for planning the different strategies of alloca promotion.
106struct AllocaAnalysis {
107 AllocaInst *Alloca = nullptr;
108 DenseSet<Value *> Pointers;
109 SmallVector<Use *> Uses;
110 unsigned Score = 0;
111 bool HaveSelectOrPHI = false;
112 struct {
113 FixedVectorType *Ty = nullptr;
114 SmallVector<Instruction *> Worklist;
115 SmallVector<Instruction *> UsersToRemove;
116 MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
117 MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
118 } Vector;
119 struct {
120 bool Enable = false;
121 SmallVector<User *> Worklist;
122 } LDS;
123
124 explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
125};
126
127// Shared implementation which can do both promotion to vector and to LDS.
128class AMDGPUPromoteAllocaImpl {
129private:
130 const TargetMachine &TM;
131 LoopInfo &LI;
132 Module *Mod = nullptr;
133 const DataLayout *DL = nullptr;
134
135 // FIXME: This should be per-kernel.
136 uint32_t LocalMemLimit = 0;
137 uint32_t CurrentLocalMemUsage = 0;
138 unsigned MaxVGPRs;
139 unsigned VGPRBudgetRatio;
140 unsigned MaxVectorRegs;
141
142 bool IsAMDGCN = false;
143 bool IsAMDHSA = false;
144
145 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
146 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
147
148 bool collectAllocaUses(AllocaAnalysis &AA) const;
149
150 /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
151 /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
152 /// Returns true if both operands are derived from the same alloca. Val should
153 /// be the same value as one of the input operands of UseInst.
154 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
155 Instruction *UseInst, int OpIdx0,
156 int OpIdx1) const;
157
158 /// Check whether we have enough local memory for promotion.
159 bool hasSufficientLocalMem(const Function &F);
160
161 FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
162 void analyzePromoteToVector(AllocaAnalysis &AA) const;
163 void promoteAllocaToVector(AllocaAnalysis &AA);
164 void analyzePromoteToLDS(AllocaAnalysis &AA) const;
165 bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
166 SetVector<IntrinsicInst *> &DeferredIntrs);
167 void
168 finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);
169
170 void scoreAlloca(AllocaAnalysis &AA) const;
171
172 void setFunctionLimits(const Function &F);
173
174public:
175 AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
176
177 const Triple &TT = TM.getTargetTriple();
178 IsAMDGCN = TT.isAMDGCN();
179 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
180 }
181
182 bool run(Function &F, bool PromoteToLDS);
183};
184
185// FIXME: This can create globals so should be a module pass.
186class AMDGPUPromoteAlloca : public FunctionPass {
187public:
188 static char ID;
189
190 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
191
192 bool runOnFunction(Function &F) override {
193 if (skipFunction(F))
194 return false;
195 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
196 return AMDGPUPromoteAllocaImpl(
197 TPC->getTM<TargetMachine>(),
198 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
199 .run(F, /*PromoteToLDS*/ true);
200 return false;
201 }
202
203 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
204
205 void getAnalysisUsage(AnalysisUsage &AU) const override {
206 AU.setPreservesCFG();
207 AU.addRequired<LoopInfoWrapperPass>();
208 FunctionPass::getAnalysisUsage(AU);
209 }
210};
211
212static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
213 const Function &F) {
214 if (!TM.getTargetTriple().isAMDGCN())
215 return 128;
216
217 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
218
219 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
220 // Temporarily check both the attribute and the subtarget feature, until the
221 // latter is removed.
222 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
223 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
224
225 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
226 WavesPerEU: ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
227 DynamicVGPRBlockSize);
228
229 // A non-entry function has only 32 caller preserved registers.
230 // Do not promote alloca which will force spilling unless we know the function
231 // will be inlined.
232 if (!F.hasFnAttribute(Kind: Attribute::AlwaysInline) &&
233 !AMDGPU::isEntryFunctionCC(CC: F.getCallingConv()))
234 MaxVGPRs = std::min(a: MaxVGPRs, b: 32u);
235 return MaxVGPRs;
236}
237
238} // end anonymous namespace
239
240char AMDGPUPromoteAlloca::ID = 0;
241
242INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
243 "AMDGPU promote alloca to vector or LDS", false, false)
244// Move LDS uses from functions to kernels before promote alloca for accurate
245// estimation of LDS available
246INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
247INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
248INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
249 "AMDGPU promote alloca to vector or LDS", false, false)
250
251char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
252
253PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
254 FunctionAnalysisManager &AM) {
255 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
256 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
257 if (Changed) {
258 PreservedAnalyses PA;
259 PA.preserveSet<CFGAnalyses>();
260 return PA;
261 }
262 return PreservedAnalyses::all();
263}
264
265PreservedAnalyses
266AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
267 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
268 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
269 if (Changed) {
270 PreservedAnalyses PA;
271 PA.preserveSet<CFGAnalyses>();
272 return PA;
273 }
274 return PreservedAnalyses::all();
275}
276
277FunctionPass *llvm::createAMDGPUPromoteAlloca() {
278 return new AMDGPUPromoteAlloca();
279}
280
281bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
282 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
283 LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
284 << " " << *Inst << "\n");
285 return false;
286 };
287
288 SmallVector<Instruction *, 4> WorkList({AA.Alloca});
289 while (!WorkList.empty()) {
290 auto *Cur = WorkList.pop_back_val();
291 if (find(Range&: AA.Pointers, Val: Cur) != AA.Pointers.end())
292 continue;
293 AA.Pointers.insert(V: Cur);
294 for (auto &U : Cur->uses()) {
295 auto *Inst = cast<Instruction>(Val: U.getUser());
296 if (isa<StoreInst>(Val: Inst)) {
297 if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
298 return RejectUser(Inst, "pointer escapes via store");
299 }
300 }
301 AA.Uses.push_back(Elt: &U);
302
303 if (isa<GetElementPtrInst>(Val: U.getUser())) {
304 WorkList.push_back(Elt: Inst);
305 } else if (auto *SI = dyn_cast<SelectInst>(Val: Inst)) {
306 // Only promote a select if we know that the other select operand is
307 // from another pointer that will also be promoted.
308 if (!binaryOpIsDerivedFromSameAlloca(Alloca: AA.Alloca, Val: Cur, UseInst: SI, OpIdx0: 1, OpIdx1: 2))
309 return RejectUser(Inst, "select from mixed objects");
310 WorkList.push_back(Elt: Inst);
311 AA.HaveSelectOrPHI = true;
312 } else if (auto *Phi = dyn_cast<PHINode>(Val: Inst)) {
313 // Repeat for phis.
314
315 // TODO: Handle more complex cases. We should be able to replace loops
316 // over arrays.
317 switch (Phi->getNumIncomingValues()) {
318 case 1:
319 break;
320 case 2:
321 if (!binaryOpIsDerivedFromSameAlloca(Alloca: AA.Alloca, Val: Cur, UseInst: Phi, OpIdx0: 0, OpIdx1: 1))
322 return RejectUser(Inst, "phi from mixed objects");
323 break;
324 default:
325 return RejectUser(Inst, "phi with too many operands");
326 }
327
328 WorkList.push_back(Elt: Inst);
329 AA.HaveSelectOrPHI = true;
330 }
331 }
332 }
333 return true;
334}
335
336void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
337 LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
338 unsigned Score = 0;
339 // Increment score by one for each user + a bonus for users within loops.
340 for (auto *U : AA.Uses) {
341 Instruction *Inst = cast<Instruction>(Val: U->getUser());
342 if (isa<GetElementPtrInst>(Val: Inst) || isa<SelectInst>(Val: Inst) ||
343 isa<PHINode>(Val: Inst))
344 continue;
345 unsigned UserScore =
346 1 + (LoopUserWeight * LI.getLoopDepth(BB: Inst->getParent()));
347 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
348 Score += UserScore;
349 }
350 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
351 AA.Score = Score;
352}
353
354void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
355 // Load per function limits, overriding with global options where appropriate.
356 // R600 register tuples/aliasing are fragile with large vector promotions so
357 // apply architecture specific limit here.
358 const int R600MaxVectorRegs = 16;
359 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
360 Kind: "amdgpu-promote-alloca-to-vector-max-regs",
361 Default: IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
362 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
363 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
364 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
365 Kind: "amdgpu-promote-alloca-to-vector-vgpr-ratio",
366 Default: PromoteAllocaToVectorVGPRRatio);
367 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
368 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
369}
370
371bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
372 if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
373 return false;
374
375 Mod = F.getParent();
376 DL = &Mod->getDataLayout();
377
378 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
379 MaxVGPRs = getMaxVGPRs(LDSBytes: CurrentLocalMemUsage, TM, F);
380 setFunctionLimits(F);
381
382 unsigned VectorizationBudget =
383 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
384 : (MaxVGPRs * 32)) /
385 VGPRBudgetRatio;
386
387 std::vector<AllocaAnalysis> Allocas;
388 for (Instruction &I : F.getEntryBlock()) {
389 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: &I)) {
390 // Array allocations are probably not worth handling, since an allocation
391 // of the array type is the canonical form.
392 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
393 continue;
394
395 LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
396
397 AllocaAnalysis AA{AI};
398 if (collectAllocaUses(AA)) {
399 analyzePromoteToVector(AA);
400 if (PromoteToLDS)
401 analyzePromoteToLDS(AA);
402 if (AA.Vector.Ty || AA.LDS.Enable) {
403 scoreAlloca(AA);
404 Allocas.push_back(x: std::move(AA));
405 }
406 }
407 }
408 }
409
410 stable_sort(Range&: Allocas,
411 C: [](const auto &A, const auto &B) { return A.Score > B.Score; });
412
413 // clang-format off
414 LLVM_DEBUG(
415 dbgs() << "Sorted Worklist:\n";
416 for (const auto &AA : Allocas)
417 dbgs() << " " << *AA.Alloca << "\n";
418 );
419 // clang-format on
420
421 bool Changed = false;
422 SetVector<IntrinsicInst *> DeferredIntrs;
423 for (AllocaAnalysis &AA : Allocas) {
424 if (AA.Vector.Ty) {
425 std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(DL: *DL);
426 assert(Size); // Expected to succeed on non-array alloca.
427 const unsigned AllocaCost = Size->getFixedValue() * 8;
428 // First, check if we have enough budget to vectorize this alloca.
429 if (AllocaCost <= VectorizationBudget) {
430 promoteAllocaToVector(AA);
431 Changed = true;
432 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
433 "Underflow!");
434 VectorizationBudget -= AllocaCost;
435 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
436 << VectorizationBudget << "\n");
437 continue;
438 } else {
439 LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
440 << AllocaCost << ", budget:" << VectorizationBudget
441 << "): " << *AA.Alloca << "\n");
442 }
443 }
444
445 if (AA.LDS.Enable &&
446 tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
447 Changed = true;
448 }
449 finishDeferredAllocaToLDSPromotion(DeferredIntrs);
450
451 // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
452 // dangling pointers. If we want to reuse it past this point, the loop above
453 // would need to be updated to remove successfully promoted allocas.
454
455 return Changed;
456}
457
458// Checks if the instruction I is a memset user of the alloca AI that we can
459// deal with. Currently, only non-volatile memsets that affect the whole alloca
460// are handled.
461static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
462 const DataLayout &DL) {
463 using namespace PatternMatch;
464 // For now we only care about non-volatile memsets that affect the whole type
465 // (start at index 0 and fill the whole alloca).
466 //
467 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
468 // (except maybe volatile ones?) - we just need to use shufflevector if it
469 // only affects a subset of the vector.
470 const unsigned Size = DL.getTypeStoreSize(Ty: AI->getAllocatedType());
471 return I->getOperand(i_nocapture: 0) == AI &&
472 match(V: I->getOperand(i_nocapture: 2), P: m_SpecificInt(V: Size)) && !I->isVolatile();
473}
474
475static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
476 IRBuilder<> B(Ptr->getContext());
477
478 Ptr = Ptr->stripPointerCasts();
479 if (Ptr == AA.Alloca)
480 return B.getInt32(C: 0);
481
482 auto *GEP = cast<GetElementPtrInst>(Val: Ptr);
483 auto I = AA.Vector.GEPVectorIdx.find(Key: GEP);
484 assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
485
486 if (!I->second.Full) {
487 Value *Result = nullptr;
488 B.SetInsertPoint(GEP);
489
490 if (I->second.VarIndex) {
491 Result = I->second.VarIndex;
492 Result = B.CreateSExtOrTrunc(V: Result, DestTy: B.getInt32Ty());
493
494 if (I->second.VarMul)
495 Result = B.CreateMul(LHS: Result, RHS: I->second.VarMul);
496
497 if (I->second.VarShift)
498 Result = B.CreateAShr(LHS: Result, RHS: I->second.VarShift, Name: "", /*isExact*/ true);
499 }
500
501 if (I->second.ConstIndex) {
502 if (Result)
503 Result = B.CreateAdd(LHS: Result, RHS: I->second.ConstIndex);
504 else
505 Result = I->second.ConstIndex;
506 }
507
508 if (!Result)
509 Result = B.getInt32(C: 0);
510
511 I->second.Full = Result;
512 }
513
514 return I->second.Full;
515}
516
517static std::optional<GEPToVectorIndex>
518computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
519 Type *VecElemTy, const DataLayout &DL) {
520 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
521 // helper.
522 LLVMContext &Ctx = GEP->getContext();
523 unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType());
524 SmallMapVector<Value *, APInt, 4> VarOffsets;
525 APInt ConstOffset(BW, 0);
526
527 // Walk backwards through nested GEPs to collect both constant and variable
528 // offsets, so that nested vector GEP chains can be lowered in one step.
529 //
530 // Given this IR fragment as input:
531 //
532 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
533 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
534 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
535 // %3 = load i32, ptr addrspace(5) %2, align 4
536 //
537 // Combine both GEP operations in a single pass, producing:
538 // BasePtr = %0
539 // ConstOffset = 4
540 // VarOffsets = { %j -> element_size(<2 x i32>) }
541 //
542 // That lets us emit a single buffer_load directly into a VGPR, without ever
543 // allocating scratch memory for the intermediate pointer.
544 Value *CurPtr = GEP;
545 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(Val: CurPtr)) {
546 if (!CurGEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ConstOffset))
547 return {};
548
549 // Move to the next outer pointer.
550 CurPtr = CurGEP->getPointerOperand();
551 }
552
553 assert(CurPtr == Alloca && "GEP not based on alloca");
554
555 int64_t VecElemSize = DL.getTypeAllocSize(Ty: VecElemTy);
556 if (VarOffsets.size() > 1)
557 return {};
558
559 // We support vector indices of the form ((VarIndex * stride) >> shift) + B.
560 // IndexQuot represents B. Check that the constant offset is a multiple
561 // of the vector element size.
562 if (ConstOffset.srem(RHS: VecElemSize) != 0)
563 return {};
564 APInt IndexQuot = ConstOffset.sdiv(RHS: VecElemSize);
565
566 GEPToVectorIndex Result;
567
568 if (!ConstOffset.isZero())
569 Result.ConstIndex = ConstantInt::get(Context&: Ctx, V: IndexQuot.sextOrTrunc(width: BW));
570
571 // If there are no variable offsets, only a constant offset, then we're done.
572 if (VarOffsets.empty())
573 return Result;
574
575 // Scale is the stride in the (A * stride) part. Check that there is only one
576 // variable offset and extract the scale factor.
577 const auto &VarOffset = VarOffsets.front();
578 auto ScaleOpt = VarOffset.second.tryZExtValue();
579 if (!ScaleOpt || *ScaleOpt == 0)
580 return {};
581
582 uint64_t Scale = *ScaleOpt;
583 Result.VarIndex = VarOffset.first;
584 auto *OffsetType = dyn_cast<IntegerType>(Val: Result.VarIndex->getType());
585 if (!OffsetType)
586 return {};
587
588 // The vector index for the variable part is: VarIndex * Scale / VecElemSize.
589 if (Scale >= (uint64_t)VecElemSize) {
590 if (Scale % VecElemSize != 0)
591 return {};
592
593 // Scale is a multiple of VecElemSize, so the index is just: VarIndex *
594 // (Scale / VecElemSize).
595 uint64_t VarMul = Scale / VecElemSize;
596 // Only the multiplier is needed.
597 if (VarMul != 1)
598 Result.VarMul = ConstantInt::get(Context&: Ctx, V: APInt(BW, VarMul));
599 } else {
600 if ((uint64_t)VecElemSize % Scale != 0)
601 return {};
602
603 // VecElemSize is a multiple of Scale, so the index is just: VarIndex /
604 // (VecElemSize / Scale).
605 uint64_t Divisor = VecElemSize / Scale;
606 // The divisor must be a power of 2 so we can use a right shift.
607 if (!isPowerOf2_64(Value: Divisor))
608 return {};
609
610 // VarIndex must be known to be divisible by that divisor.
611 KnownBits KB = computeKnownBits(V: VarOffset.first, DL);
612 if (KB.countMinTrailingZeros() < Log2_64(Value: Divisor))
613 return {};
614
615 Result.VarShift = ConstantInt::get(Context&: Ctx, V: APInt(BW, Log2_64(Value: Divisor)));
616 }
617
618 return Result;
619}
620
621/// Promotes a single user of the alloca to a vector form.
622///
623/// \param Inst Instruction to be promoted.
624/// \param DL Module Data Layout.
625/// \param AA Alloca Analysis.
626/// \param VecStoreSize Size of \p VectorTy in bytes.
627/// \param ElementSize Size of \p VectorTy element type in bytes.
628/// \param CurVal Current value of the vector (e.g. last stored value)
629/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
630/// be promoted now. This happens when promoting requires \p
631/// CurVal, but \p CurVal is nullptr.
632/// \return the stored value if \p Inst would have written to the alloca, or
633/// nullptr otherwise.
634static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
635 AllocaAnalysis &AA,
636 unsigned VecStoreSize,
637 unsigned ElementSize,
638 function_ref<Value *()> GetCurVal) {
639 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
640 // to do more folding, especially in the case of vector splats.
641 IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
642 InstSimplifyFolder(DL));
643 Builder.SetInsertPoint(Inst);
644
645 Type *VecEltTy = AA.Vector.Ty->getElementType();
646
647 switch (Inst->getOpcode()) {
648 case Instruction::Load: {
649 Value *CurVal = GetCurVal();
650 Value *Index =
651 calculateVectorIndex(Ptr: cast<LoadInst>(Val: Inst)->getPointerOperand(), AA);
652
653 // We're loading the full vector.
654 Type *AccessTy = Inst->getType();
655 TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
656 if (Constant *CI = dyn_cast<Constant>(Val: Index)) {
657 if (CI->isNullValue() && AccessSize == VecStoreSize) {
658 Inst->replaceAllUsesWith(
659 V: Builder.CreateBitPreservingCastChain(DL, V: CurVal, NewTy: AccessTy));
660 return nullptr;
661 }
662 }
663
664 // Loading a subvector.
665 if (isa<FixedVectorType>(Val: AccessTy)) {
666 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
667 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
668 auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumLoadedElts);
669 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
670
671 // If idx is dynamic, then sandwich load with bitcasts.
672 // ie. VectorTy SubVecTy AccessTy
673 // <64 x i8> -> <16 x i8> <8 x i16>
674 // <64 x i8> -> <4 x i128> -> i128 -> <8 x i16>
675 // Extracting subvector with dynamic index has very large expansion in
676 // the amdgpu backend. Limit to pow2.
677 FixedVectorType *VectorTy = AA.Vector.Ty;
678 TypeSize NumBits = DL.getTypeStoreSize(Ty: SubVecTy) * 8u;
679 uint64_t LoadAlign = cast<LoadInst>(Val: Inst)->getAlign().value();
680 bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
681 unsigned TotalNumElts = VectorTy->getNumElements();
682 bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
683 if (!isa<ConstantInt>(Val: Index) &&
684 llvm::isPowerOf2_32(Value: SubVecTy->getNumElements()) &&
685 IsProperlyDivisible && IsAlignedLoad) {
686 IntegerType *NewElemTy = Builder.getIntNTy(N: NumBits);
687 const unsigned NewNumElts =
688 DL.getTypeStoreSize(Ty: VectorTy) * 8u / NumBits;
689 const unsigned LShrAmt = llvm::Log2_32(Value: SubVecTy->getNumElements());
690 FixedVectorType *BitCastTy =
691 FixedVectorType::get(ElementType: NewElemTy, NumElts: NewNumElts);
692 Value *BCVal = Builder.CreateBitCast(V: CurVal, DestTy: BitCastTy);
693 Value *NewIdx = Builder.CreateLShr(
694 LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: LShrAmt));
695 Value *ExtVal = Builder.CreateExtractElement(Vec: BCVal, Idx: NewIdx);
696 Value *BCOut = Builder.CreateBitCast(V: ExtVal, DestTy: AccessTy);
697 Inst->replaceAllUsesWith(V: BCOut);
698 return nullptr;
699 }
700
701 Value *SubVec = PoisonValue::get(T: SubVecTy);
702 for (unsigned K = 0; K < NumLoadedElts; ++K) {
703 Value *CurIdx =
704 Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
705 SubVec = Builder.CreateInsertElement(
706 Vec: SubVec, NewElt: Builder.CreateExtractElement(Vec: CurVal, Idx: CurIdx), Idx: K);
707 }
708
709 Inst->replaceAllUsesWith(
710 V: Builder.CreateBitPreservingCastChain(DL, V: SubVec, NewTy: AccessTy));
711 return nullptr;
712 }
713
714 // We're loading one element.
715 Value *ExtractElement = Builder.CreateExtractElement(Vec: CurVal, Idx: Index);
716 if (AccessTy != VecEltTy)
717 ExtractElement = Builder.CreateBitOrPointerCast(V: ExtractElement, DestTy: AccessTy);
718
719 Inst->replaceAllUsesWith(V: ExtractElement);
720 return nullptr;
721 }
722 case Instruction::Store: {
723 // For stores, it's a bit trickier and it depends on whether we're storing
724 // the full vector or not. If we're storing the full vector, we don't need
725 // to know the current value. If this is a store of a single element, we
726 // need to know the value.
727 StoreInst *SI = cast<StoreInst>(Val: Inst);
728 Value *Index = calculateVectorIndex(Ptr: SI->getPointerOperand(), AA);
729 Value *Val = SI->getValueOperand();
730
731 // We're storing the full vector, we can handle this without knowing CurVal.
732 Type *AccessTy = Val->getType();
733 TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
734 if (Constant *CI = dyn_cast<Constant>(Val: Index))
735 if (CI->isNullValue() && AccessSize == VecStoreSize)
736 return Builder.CreateBitPreservingCastChain(DL, V: Val, NewTy: AA.Vector.Ty);
737
738 // Storing a subvector.
739 if (isa<FixedVectorType>(Val: AccessTy)) {
740 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
741 const unsigned NumWrittenElts =
742 AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
743 const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
744 auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumWrittenElts);
745 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
746
747 Val = Builder.CreateBitPreservingCastChain(DL, V: Val, NewTy: SubVecTy);
748 Value *CurVec = GetCurVal();
749 for (unsigned K = 0, NumElts = std::min(a: NumWrittenElts, b: NumVecElts);
750 K < NumElts; ++K) {
751 Value *CurIdx =
752 Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
753 CurVec = Builder.CreateInsertElement(
754 Vec: CurVec, NewElt: Builder.CreateExtractElement(Vec: Val, Idx: K), Idx: CurIdx);
755 }
756 return CurVec;
757 }
758
759 if (Val->getType() != VecEltTy)
760 Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: VecEltTy);
761 return Builder.CreateInsertElement(Vec: GetCurVal(), NewElt: Val, Idx: Index);
762 }
763 case Instruction::Call: {
764 if (auto *MTI = dyn_cast<MemTransferInst>(Val: Inst)) {
765 // For memcpy, we need to know curval.
766 ConstantInt *Length = cast<ConstantInt>(Val: MTI->getLength());
767 unsigned NumCopied = Length->getZExtValue() / ElementSize;
768 MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
769 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
770 unsigned DestBegin = TI->DestIndex->getZExtValue();
771
772 SmallVector<int> Mask;
773 for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
774 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
775 Mask.push_back(Elt: SrcBegin < AA.Vector.Ty->getNumElements()
776 ? SrcBegin++
777 : PoisonMaskElem);
778 } else {
779 Mask.push_back(Elt: Idx);
780 }
781 }
782
783 return Builder.CreateShuffleVector(V: GetCurVal(), Mask);
784 }
785
786 if (auto *MSI = dyn_cast<MemSetInst>(Val: Inst)) {
787 // For memset, we don't need to know the previous value because we
788 // currently only allow memsets that cover the whole alloca.
789 Value *Elt = MSI->getOperand(i_nocapture: 1);
790 const unsigned BytesPerElt = DL.getTypeStoreSize(Ty: VecEltTy);
791 if (BytesPerElt > 1) {
792 Value *EltBytes = Builder.CreateVectorSplat(NumElts: BytesPerElt, V: Elt);
793
794 // If the element type of the vector is a pointer, we need to first cast
795 // to an integer, then use a PtrCast.
796 if (VecEltTy->isPointerTy()) {
797 Type *PtrInt = Builder.getIntNTy(N: BytesPerElt * 8);
798 Elt = Builder.CreateBitCast(V: EltBytes, DestTy: PtrInt);
799 Elt = Builder.CreateIntToPtr(V: Elt, DestTy: VecEltTy);
800 } else
801 Elt = Builder.CreateBitCast(V: EltBytes, DestTy: VecEltTy);
802 }
803
804 return Builder.CreateVectorSplat(EC: AA.Vector.Ty->getElementCount(), V: Elt);
805 }
806
807 if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
808 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
809 Intr->replaceAllUsesWith(
810 V: Builder.getIntN(N: Intr->getType()->getIntegerBitWidth(),
811 C: DL.getTypeAllocSize(Ty: AA.Vector.Ty)));
812 return nullptr;
813 }
814 }
815
816 llvm_unreachable("Unsupported call when promoting alloca to vector");
817 }
818
819 default:
820 llvm_unreachable("Inconsistency in instructions promotable to vector");
821 }
822
823 llvm_unreachable("Did not return after promoting instruction!");
824}
825
826static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
827 const DataLayout &DL) {
828 // Access as a vector type can work if the size of the access vector is a
829 // multiple of the size of the alloca's vector element type.
830 //
831 // Examples:
832 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
833 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
834 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
835 // - 3*32 is not a multiple of 64
836 //
837 // We could handle more complicated cases, but it'd make things a lot more
838 // complicated.
839 if (isa<FixedVectorType>(Val: AccessTy)) {
840 TypeSize AccTS = DL.getTypeStoreSize(Ty: AccessTy);
841 // If the type size and the store size don't match, we would need to do more
842 // than just bitcast to translate between an extracted/insertable subvectors
843 // and the accessed value.
844 if (AccTS * 8 != DL.getTypeSizeInBits(Ty: AccessTy))
845 return false;
846 TypeSize VecTS = DL.getTypeStoreSize(Ty: VecTy->getElementType());
847 return AccTS.isKnownMultipleOf(RHS: VecTS);
848 }
849
850 return CastInst::isBitOrNoopPointerCastable(SrcTy: VecTy->getElementType(), DestTy: AccessTy,
851 DL);
852}
853
854/// Iterates over an instruction worklist that may contain multiple instructions
855/// from the same basic block, but in a different order.
856template <typename InstContainer>
857static void forEachWorkListItem(const InstContainer &WorkList,
858 std::function<void(Instruction *)> Fn) {
859 // Bucket up uses of the alloca by the block they occur in.
860 // This is important because we have to handle multiple defs/uses in a block
861 // ourselves: SSAUpdater is purely for cross-block references.
862 DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
863 for (Instruction *User : WorkList)
864 UsesByBlock[User->getParent()].insert(V: User);
865
866 for (Instruction *User : WorkList) {
867 BasicBlock *BB = User->getParent();
868 auto &BlockUses = UsesByBlock[BB];
869
870 // Already processed, skip.
871 if (BlockUses.empty())
872 continue;
873
874 // Only user in the block, directly process it.
875 if (BlockUses.size() == 1) {
876 Fn(User);
877 continue;
878 }
879
880 // Multiple users in the block, do a linear scan to see users in order.
881 for (Instruction &Inst : *BB) {
882 if (!BlockUses.contains(V: &Inst))
883 continue;
884
885 Fn(&Inst);
886 }
887
888 // Clear the block so we know it's been processed.
889 BlockUses.clear();
890 }
891}
892
893/// Find an insert point after an alloca, after all other allocas clustered at
894/// the start of the block.
895static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
896 BasicBlock::iterator I) {
897 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(Val: *I); ++I)
898 ;
899 return I;
900}
901
/// Computes the FixedVectorType an alloca of type \p AllocaTy could be
/// promoted to, or returns nullptr when promotion is disabled or no suitable
/// vector type exists (non-vectorizable element type, too many/few elements,
/// or a padded element type).
FixedVectorType *
AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
  // Honor the flag that disables vector promotion entirely.
  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
    return nullptr;
  }

  // An alloca that is already a fixed vector can be used as-is.
  auto *VectorTy = dyn_cast<FixedVectorType>(Val: AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(Val: AllocaTy)) {
    // Flatten nested arrays, e.g. [N x [M x T]] -> N*M elements of T.
    uint64_t NumElems = 1;
    Type *ElemTy;
    do {
      NumElems *= ArrayTy->getNumElements();
      ElemTy = ArrayTy->getElementType();
    } while ((ArrayTy = dyn_cast<ArrayType>(Val: ElemTy)));

    // Check for array of vectors
    auto *InnerVectorTy = dyn_cast<FixedVectorType>(Val: ElemTy);
    if (InnerVectorTy) {
      NumElems *= InnerVectorTy->getNumElements();
      ElemTy = InnerVectorTy->getElementType();
    }

    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
      unsigned ElementSize = DL->getTypeSizeInBits(Ty: ElemTy) / 8;
      if (ElementSize > 0) {
        unsigned AllocaSize = DL->getTypeStoreSize(Ty: AllocaTy);
        // Expand vector if required to match padding of inner type,
        // i.e. odd size subvectors.
        // Storage size of new vector must match that of alloca for correct
        // behaviour of byte offsets and GEP computation.
        if (NumElems * ElementSize != AllocaSize)
          NumElems = AllocaSize / ElementSize;
        // Only form the vector when the element size divides the alloca
        // size evenly; otherwise leave VectorTy null and fail below.
        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
          VectorTy = FixedVectorType::get(ElementType: ElemTy, NumElts: NumElems);
      }
    }
  }
  if (!VectorTy) {
    LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
    return nullptr;
  }

  // Cap the element count by the register budget: MaxVectorRegs 32-bit
  // registers' worth of elements.
  const unsigned MaxElements =
      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(Ty: VectorTy->getElementType());

  // Reject vectors that are too large for the budget, or degenerate
  // (fewer than 2 elements).
  if (VectorTy->getNumElements() > MaxElements ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << " " << *VectorTy
                      << " has an unsupported number of elements\n");
    return nullptr;
  }

  // Padded element types (bit size != alloc size) would break the
  // byte-offset to vector-index correspondence, so reject them.
  Type *VecEltTy = VectorTy->getElementType();
  unsigned ElementSizeInBits = DL->getTypeSizeInBits(Ty: VecEltTy);
  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(Ty: VecEltTy)) {
    LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
                         "does not match the type's size\n");
    return nullptr;
  }

  return VectorTy;
}
965
/// Analyzes whether every use of AA.Alloca can be rewritten as a vector
/// operation. On success, AA.Vector holds the chosen vector type, the
/// rewrite worklist, GEP index map, and mem-transfer info; on failure,
/// AA.Vector.Ty is reset to null.
void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
  // Selects/phis over the alloca pointer would require merging vector
  // values; not supported.
  if (AA.HaveSelectOrPHI) {
    LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n");
    return;
  }

  Type *AllocaTy = AA.Alloca->getAllocatedType();
  AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
  if (!AA.Vector.Ty)
    return;

  // Records the rejection reason and clears the chosen type so callers see
  // the analysis as failed.
  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
    LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
                      << " " << *Inst << "\n");
    AA.Vector.Ty = nullptr;
  };

  Type *VecEltTy = AA.Vector.Ty->getElementType();
  unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / 8;
  assert(ElementSize > 0);
  // Classify every recorded use; any unsupported use rejects the promotion.
  for (auto *U : AA.Uses) {
    Instruction *Inst = cast<Instruction>(Val: U->getUser());

    if (Value *Ptr = getLoadStorePointerOperand(V: Inst)) {
      // For stores, the alloca must be the pointer operand, not the value
      // being stored (which would let the address escape).
      assert(!isa<StoreInst>(Inst) ||
             U->getOperandNo() == StoreInst::getPointerOperandIndex());

      Type *AccessTy = getLoadStoreType(I: Inst);
      if (AccessTy->isAggregateType())
        return RejectUser(Inst, "unsupported load/store as aggregate");
      assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());

      // Check that this is a simple access of a vector element.
      bool IsSimple = isa<LoadInst>(Val: Inst) ? cast<LoadInst>(Val: Inst)->isSimple()
                                            : cast<StoreInst>(Val: Inst)->isSimple();
      if (!IsSimple)
        return RejectUser(Inst, "not a simple load or store");

      Ptr = Ptr->stripPointerCasts();

      // Alloca already accessed as vector.
      if (Ptr == AA.Alloca &&
          DL->getTypeStoreSize(Ty: AA.Alloca->getAllocatedType()) ==
              DL->getTypeStoreSize(Ty: AccessTy)) {
        AA.Vector.Worklist.push_back(Elt: Inst);
        continue;
      }

      if (!isSupportedAccessType(VecTy: AA.Vector.Ty, AccessTy, DL: *DL))
        return RejectUser(Inst, "not a supported access type");

      AA.Vector.Worklist.push_back(Elt: Inst);
      continue;
    }

    if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
      // If we can't compute a vector index from this GEP, then we can't
      // promote this alloca to vector.
      auto Index = computeGEPToVectorIndex(GEP, Alloca: AA.Alloca, VecElemTy: VecEltTy, DL: *DL);
      if (!Index)
        return RejectUser(Inst, "cannot compute vector index for GEP");

      // Record the index; the GEP itself becomes dead after its accesses
      // are rewritten, so mark it for removal.
      AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
      AA.Vector.UsersToRemove.push_back(Elt: Inst);
      continue;
    }

    // Whole-alloca memsets are supported (rewritten as a splat later).
    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Val: Inst);
        MSI && isSupportedMemset(I: MSI, AI: AA.Alloca, DL: *DL)) {
      AA.Vector.Worklist.push_back(Elt: Inst);
      continue;
    }

    if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Val: Inst)) {
      if (TransferInst->isVolatile())
        return RejectUser(Inst, "mem transfer inst is volatile");

      // Length must be a constant multiple of the element size so the copy
      // can be expressed in whole vector elements.
      ConstantInt *Len = dyn_cast<ConstantInt>(Val: TransferInst->getLength());
      if (!Len || (Len->getZExtValue() % ElementSize))
        return RejectUser(Inst, "mem transfer inst length is non-constant or "
                                "not a multiple of the vector element size");

      // Maps a pointer into the alloca to its constant starting element
      // index; returns nullptr when the index is variable.
      auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
        if (Ptr == AA.Alloca)
          return ConstantInt::get(Context&: Ptr->getContext(), V: APInt(32, 0));

        GetElementPtrInst *GEP = cast<GetElementPtrInst>(Val: Ptr);
        const auto &GEPI = AA.Vector.GEPVectorIdx.find(Key: GEP)->second;
        if (GEPI.VarIndex)
          return nullptr;
        if (GEPI.ConstIndex)
          return GEPI.ConstIndex;
        return ConstantInt::get(Context&: Ptr->getContext(), V: APInt(32, 0));
      };

      // A transfer inst shows up once per operand that points into the
      // alloca; accumulate src/dest indices per instruction and verify both
      // are present after the loop.
      MemTransferInfo *TI =
          &AA.Vector.TransferInfo.try_emplace(Key: TransferInst).first->second;
      unsigned OpNum = U->getOperandNo();
      if (OpNum == 0) {
        Value *Dest = TransferInst->getDest();
        ConstantInt *Index = getConstIndexIntoAlloca(Dest);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant dest index");
        TI->DestIndex = Index;
      } else {
        assert(OpNum == 1);
        Value *Src = TransferInst->getSource();
        ConstantInt *Index = getConstIndexIntoAlloca(Src);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant src index");
        TI->SrcIndex = Index;
      }
      continue;
    }

    // llvm.objectsize is supported: it folds to a constant once the vector
    // type is known (handled during rewriting).
    if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        AA.Vector.Worklist.push_back(Elt: Inst);
        continue;
      }
    }

    // Ignore assume-like intrinsics and comparisons used in assumes.
    if (isAssumeLikeIntrinsic(I: Inst)) {
      if (!Inst->use_empty())
        return RejectUser(Inst, "assume-like intrinsic cannot have any users");
      AA.Vector.UsersToRemove.push_back(Elt: Inst);
      continue;
    }

    if (isa<ICmpInst>(Val: Inst) && all_of(Range: Inst->users(), P: [](User *U) {
          return isAssumeLikeIntrinsic(I: cast<Instruction>(Val: U));
        })) {
      AA.Vector.UsersToRemove.push_back(Elt: Inst);
      continue;
    }

    return RejectUser(Inst, "unhandled alloca user");
  }

  // Follow-up check to ensure we've seen both sides of all transfer insts.
  for (const auto &Entry : AA.Vector.TransferInfo) {
    const MemTransferInfo &TI = Entry.second;
    if (!TI.SrcIndex || !TI.DestIndex)
      return RejectUser(Entry.first,
                        "mem transfer inst between different objects");
    AA.Vector.Worklist.push_back(Elt: Entry.first);
  }
}
1115
/// Rewrites AA.Alloca and all users recorded by analyzePromoteToVector()
/// into SSA vector values, then erases the alloca and the rewritten users.
void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
  LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
  LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType()
                    << " -> " << *AA.Vector.Ty << '\n');
  const unsigned VecStoreSize = DL->getTypeStoreSize(Ty: AA.Vector.Ty);

  Type *VecEltTy = AA.Vector.Ty->getElementType();
  const unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / 8;

  // Alloca is uninitialized memory. Imitate that by making the first value
  // undef.
  SSAUpdater Updater;
  Updater.Initialize(Ty: AA.Vector.Ty, Name: "promotealloca");

  // Insert the initial (frozen poison) value after the cluster of allocas at
  // the top of the entry block.
  BasicBlock *EntryBB = AA.Alloca->getParent();
  BasicBlock::iterator InitInsertPos =
      skipToNonAllocaInsertPt(BB&: *EntryBB, I: AA.Alloca->getIterator());
  IRBuilder<> Builder(&*InitInsertPos);
  Value *AllocaInitValue = Builder.CreateFreeze(V: PoisonValue::get(T: AA.Vector.Ty));
  AllocaInitValue->takeName(V: AA.Alloca);

  Updater.AddAvailableValue(BB: AA.Alloca->getParent(), V: AllocaInitValue);

  // First handle the initial worklist, in basic block order.
  //
  // Insert a placeholder whenever we need the vector value at the top of a
  // basic block.
  SmallVector<Instruction *> Placeholders;
  forEachWorkListItem(WorkList: AA.Vector.Worklist, Fn: [&](Instruction *I) {
    BasicBlock *BB = I->getParent();
    // Lazily returns the current vector value for this block: the value
    // already known to the updater, an existing placeholder for this block,
    // or a freshly inserted freeze-of-poison placeholder.
    auto GetCurVal = [&]() -> Value * {
      if (Value *CurVal = Updater.FindValueForBlock(BB))
        return CurVal;

      if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
        return Placeholders.back();

      // If the current value in the basic block is not yet known, insert a
      // placeholder that we will replace later.
      IRBuilder<> Builder(I);
      auto *Placeholder = cast<Instruction>(Val: Builder.CreateFreeze(
          V: PoisonValue::get(T: AA.Vector.Ty), Name: "promotealloca.placeholder"));
      Placeholders.push_back(Elt: Placeholder);
      return Placeholders.back();
    };

    Value *Result = promoteAllocaUserToVector(Inst: I, DL: *DL, AA, VecStoreSize,
                                              ElementSize, GetCurVal);
    // A non-null result is the new vector value produced by this
    // instruction; make it the live value for the block.
    if (Result)
      Updater.AddAvailableValue(BB, V: Result);
  });

  // Now fixup the placeholders.
  SmallVector<Value *> PlaceholderToNewVal(Placeholders.size());
  for (auto [Index, Placeholder] : enumerate(First&: Placeholders)) {
    Value *NewVal = Updater.GetValueInMiddleOfBlock(BB: Placeholder->getParent());
    PlaceholderToNewVal[Index] = NewVal;
    Placeholder->replaceAllUsesWith(V: NewVal);
  }
  // Note: we cannot merge this loop with the previous one because it is
  // possible that the placeholder itself can be used in the SSAUpdater. The
  // replaceAllUsesWith doesn't replace those uses.
  for (auto [Index, Placeholder] : enumerate(First&: Placeholders)) {
    if (!Placeholder->use_empty())
      Placeholder->replaceAllUsesWith(V: PlaceholderToNewVal[Index]);
    Placeholder->eraseFromParent();
  }

  // Delete all instructions.
  for (Instruction *I : AA.Vector.Worklist) {
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Delete all the users that are known to be removeable.
  // NOTE(review): iterated in reverse — presumably so dependent users (e.g.
  // users of a GEP) are erased before the instructions they use.
  for (Instruction *I : reverse(C&: AA.Vector.UsersToRemove)) {
    I->dropDroppableUses();
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Alloca should now be dead too.
  assert(AA.Alloca->use_empty());
  AA.Alloca->eraseFromParent();
}
1201
/// Emits IR that yields the workgroup size in the Y and Z dimensions
/// (as a {Y, Z} pair of i32 values) at the builder's insertion point.
///
/// Non-HSA targets read dedicated r600 intrinsics; AMDHSA reads the fields
/// out of the hsa_kernel_dispatch_packet_t via the dispatch pointer.
std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  if (!IsAMDHSA) {
    CallInst *LocalSizeY =
        Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_y, Args: {});
    CallInst *LocalSizeZ =
        Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_z, Args: {});

    // Attach known-range metadata for the local size values.
    ST.makeLIDRangeMetadata(I: LocalSizeY);
    ST.makeLIDRangeMetadata(I: LocalSizeZ);

    return std::pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x ;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x ;
  //     uint32_t grid_size_y ;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  // #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  // #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t
  //
  CallInst *DispatchPtr =
      Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_dispatch_ptr, Args: {});
  DispatchPtr->addRetAttr(Kind: Attribute::NoAlias);
  DispatchPtr->addRetAttr(Kind: Attribute::NonNull);
  // The function now demonstrably requires the dispatch pointer input.
  F.removeFnAttr(Kind: "amdgpu-no-dispatch-ptr");

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableRetAttr(Bytes: 64);

  Type *I32Ty = Type::getInt32Ty(C&: Mod->getContext());

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
  // Dword 1 (bytes 4-7): workgroup_size_x in the low half, _y in the high.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPXY, Align: Align(4));

  // Dword 2 (bytes 8-11): workgroup_size_z in the low half, reserved0 above.
  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPZU, Align: Align(4));

  // The packet contents do not change during the kernel's execution.
  MDNode *MD = MDNode::get(Context&: Mod->getContext(), MDs: {});
  LoadXY->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
  LoadZU->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
  ST.makeLIDRangeMetadata(I: LoadZU);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LHS: LoadXY, RHS: 16);

  return std::pair(Y, LoadZU);
}
1283
1284Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1285 unsigned N) {
1286 Function *F = Builder.GetInsertBlock()->getParent();
1287 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: *F);
1288 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1289 StringRef AttrName;
1290
1291 switch (N) {
1292 case 0:
1293 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1294 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1295 AttrName = "amdgpu-no-workitem-id-x";
1296 break;
1297 case 1:
1298 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1299 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1300 AttrName = "amdgpu-no-workitem-id-y";
1301 break;
1302
1303 case 2:
1304 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1305 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1306 AttrName = "amdgpu-no-workitem-id-z";
1307 break;
1308 default:
1309 llvm_unreachable("invalid dimension");
1310 }
1311
1312 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(M: Mod, id: IntrID);
1313 CallInst *CI = Builder.CreateCall(Callee: WorkitemIdFn);
1314 ST.makeLIDRangeMetadata(I: CI);
1315 F->removeFnAttr(Kind: AttrName);
1316
1317 return CI;
1318}
1319
1320static bool isCallPromotable(CallInst *CI) {
1321 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI);
1322 if (!II)
1323 return false;
1324
1325 switch (II->getIntrinsicID()) {
1326 case Intrinsic::memcpy:
1327 case Intrinsic::memmove:
1328 case Intrinsic::memset:
1329 case Intrinsic::lifetime_start:
1330 case Intrinsic::lifetime_end:
1331 case Intrinsic::invariant_start:
1332 case Intrinsic::invariant_end:
1333 case Intrinsic::launder_invariant_group:
1334 case Intrinsic::strip_invariant_group:
1335 case Intrinsic::objectsize:
1336 return true;
1337 default:
1338 return false;
1339 }
1340}
1341
1342bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1343 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1344 int OpIdx1) const {
1345 // Figure out which operand is the one we might not be promoting.
1346 Value *OtherOp = Inst->getOperand(i: OpIdx0);
1347 if (Val == OtherOp)
1348 OtherOp = Inst->getOperand(i: OpIdx1);
1349
1350 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: OtherOp))
1351 return true;
1352
1353 // TODO: getUnderlyingObject will not work on a vector getelementptr
1354 Value *OtherObj = getUnderlyingObject(V: OtherOp);
1355 if (!isa<AllocaInst>(Val: OtherObj))
1356 return false;
1357
1358 // TODO: We should be able to replace undefs with the right pointer type.
1359
1360 // TODO: If we know the other base object is another promotable
1361 // alloca, not necessarily this alloca, we can do this. The
1362 // important part is both must have the same address space at
1363 // the end.
1364 if (OtherObj != BaseAlloca) {
1365 LLVM_DEBUG(
1366 dbgs() << "Found a binary instruction with another alloca object\n");
1367 return false;
1368 }
1369
1370 return true;
1371}
1372
/// Checks whether every use of AA.Alloca can be rewritten after the
/// allocation is moved to LDS. On success, fills AA.LDS.Worklist with the
/// users needing rewriting and sets AA.LDS.Enable; any unsupported use
/// returns early with AA.LDS.Enable left false.
void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
  if (DisablePromoteAllocaToLDS) {
    LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
    return;
  }

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  const Function &ContainingFunction = *AA.Alloca->getFunction();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return;
  }

  for (Use *Use : AA.Uses) {
    auto *User = Use->getUser();

    // Calls: only the known-promotable intrinsics are allowed.
    if (CallInst *CI = dyn_cast<CallInst>(Val: User)) {
      if (!isCallPromotable(CI))
        return;

      // Guard against duplicate worklist entries (a user can reference the
      // alloca through more than one operand).
      if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(Elt: User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(Val: User);
    // ptrtoint observes the raw address, which changes under promotion.
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return;

    // Volatile memory operations are never promoted.
    if (LoadInst *LI = dyn_cast<LoadInst>(Val: UseInst)) {
      if (LI->isVolatile())
        return;
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(Val: UseInst)) {
      if (SI->isVolatile())
        return;
      continue;
    }

    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UseInst)) {
      if (RMW->isVolatile())
        return;
      continue;
    }

    if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(Val: UseInst)) {
      if (CAS->isVolatile())
        return;
      continue;
    }

    // Only promote a select if we know that the other select operand
    // is from another pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(Val: UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca: AA.Alloca, Val: Use->get(), Inst: ICmp, OpIdx0: 0, OpIdx1: 1))
        return;

      // May need to rewrite constant operands.
      if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(Elt: ICmp);
      continue;
    }

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return;
    } else if (!isa<ExtractElementInst, SelectInst, PHINode>(Val: User)) {
      // Do not promote vector/aggregate type instructions. It is hard to track
      // their users.

      // Do not promote addrspacecast.
      //
      // TODO: If we know the address is only observed through flat pointers, we
      // could still promote.
      return;
    }

    // GEP / extractelement / select / phi users reach here and need their
    // pointer type rewritten later.
    if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
      AA.LDS.Worklist.push_back(Elt: User);
  }

  AA.LDS.Enable = true;
}
1470
/// Computes how much LDS is available for alloca promotion in \p F.
///
/// Scans the module for LDS globals used (directly or through constant
/// expressions) by \p F, estimates their total footprint into
/// CurrentLocalMemUsage, and sets LocalMemLimit to the usable LDS size at the
/// occupancy implied by the existing usage. Returns false when LDS promotion
/// must be disabled entirely.
bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(Val: ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getAddressableLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  // Worklist/visited set for walking constant-expression users of LDS
  // globals down to the instructions that ultimately use them.
  SmallVector<const Constant *, 16> Stack;
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;

  // Returns true if some user of Val is an instruction in F; constant users
  // are queued on Stack for the caller to process recursively.
  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(Val: U)) {
        if (Use->getFunction() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(Val: U);
        if (VisitedConstants.insert(Ptr: C).second)
          Stack.push_back(Elt: C);
      }
    }

    return false;
  };

  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(Ptr: &GV);
      Stack.clear();
      continue;
    }

    // For any ConstantExpr uses, we need to recursively search the users until
    // we see a function.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(Ptr: &GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(N: UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(Alignment: GV->getAlign(), Ty: GV->getValueType());
    uint64_t AllocSize = GV->getGlobalSize(DL);

    // HIP uses an extern unsized array in local address space for dynamically
    // allocated shared memory. In that case, we have to disable the promotion.
    if (GV->hasExternalLinkage() && AllocSize == 0) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
      return false;
    }

    AllocatedSizes.emplace_back(Args&: AllocSize, Args&: Alignment);
  }

  // Sort to try to estimate the worst case alignment padding
  //
  // FIXME: We should really do something to fix the addresses to a more optimal
  // value instead
  llvm::sort(C&: AllocatedSizes, Comp: llvm::less_second());

  // Check how much local memory is being used by global objects
  CurrentLocalMemUsage = 0;

  // FIXME: Try to account for padding here. The real padding and address is
  // currently determined from the inverse order of uses in the function when
  // legalizing, which could also potentially change. We try to estimate the
  // worst case here, but we probably should fix the addresses earlier.
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(Size: CurrentLocalMemUsage, A: Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }

  // Occupancy (waves per EU) achievable given the LDS already in use.
  unsigned MaxOccupancy =
      ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes: CurrentLocalMemUsage, F)
          .second;

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount =
      ST.getMaxLocalMemSizeWithWaveCount(WaveCount: MaxOccupancy, F);

  // Program may already use more LDS than is usable at maximum occupancy.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << " Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}
1597
1598// FIXME: Should try to pick the most likely to be profitable allocas first.
1599bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
1600 AllocaAnalysis &AA, bool SufficientLDS,
1601 SetVector<IntrinsicInst *> &DeferredIntrs) {
1602 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
1603
1604 // Not likely to have sufficient local memory for promotion.
1605 if (!SufficientLDS)
1606 return false;
1607
1608 const DataLayout &DL = Mod->getDataLayout();
1609 IRBuilder<> Builder(AA.Alloca);
1610
1611 const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
1612 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: ContainingFunction);
1613 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F: ContainingFunction).second;
1614
1615 Align Alignment = AA.Alloca->getAlign();
1616
1617 // FIXME: This computed padding is likely wrong since it depends on inverse
1618 // usage order.
1619 //
1620 // FIXME: It is also possible that if we're allowed to use all of the memory
1621 // could end up using more than the maximum due to alignment padding.
1622
1623 uint32_t NewSize = alignTo(Size: CurrentLocalMemUsage, A: Alignment);
1624 std::optional<TypeSize> ElemSize = AA.Alloca->getAllocationSize(DL);
1625 if (!ElemSize || ElemSize->isScalable())
1626 return false;
1627 TypeSize AllocSize = WorkGroupSize * *ElemSize;
1628 NewSize += AllocSize.getFixedValue();
1629
1630 if (NewSize > LocalMemLimit) {
1631 LLVM_DEBUG(dbgs() << " " << AllocSize
1632 << " bytes of local memory not available to promote\n");
1633 return false;
1634 }
1635
1636 CurrentLocalMemUsage = NewSize;
1637
1638 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1639
1640 Function *F = AA.Alloca->getFunction();
1641
1642 Type *GVTy = ArrayType::get(ElementType: AA.Alloca->getAllocatedType(), NumElements: WorkGroupSize);
1643 GlobalVariable *GV = new GlobalVariable(
1644 *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(T: GVTy),
1645 Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
1646 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1647 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1648 GV->setAlignment(AA.Alloca->getAlign());
1649
1650 Value *TCntY, *TCntZ;
1651
1652 std::tie(args&: TCntY, args&: TCntZ) = getLocalSizeYZ(Builder);
1653 Value *TIdX = getWorkitemID(Builder, N: 0);
1654 Value *TIdY = getWorkitemID(Builder, N: 1);
1655 Value *TIdZ = getWorkitemID(Builder, N: 2);
1656
1657 Value *Tmp0 = Builder.CreateMul(LHS: TCntY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true);
1658 Tmp0 = Builder.CreateMul(LHS: Tmp0, RHS: TIdX);
1659 Value *Tmp1 = Builder.CreateMul(LHS: TIdY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true);
1660 Value *TID = Builder.CreateAdd(LHS: Tmp0, RHS: Tmp1);
1661 TID = Builder.CreateAdd(LHS: TID, RHS: TIdZ);
1662
1663 LLVMContext &Context = Mod->getContext();
1664 Value *Indices[] = {Constant::getNullValue(Ty: Type::getInt32Ty(C&: Context)), TID};
1665
1666 Value *Offset = Builder.CreateInBoundsGEP(Ty: GVTy, Ptr: GV, IdxList: Indices);
1667 AA.Alloca->mutateType(Ty: Offset->getType());
1668 AA.Alloca->replaceAllUsesWith(V: Offset);
1669 AA.Alloca->eraseFromParent();
1670
1671 PointerType *NewPtrTy = PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
1672
1673 for (Value *V : AA.LDS.Worklist) {
1674 CallInst *Call = dyn_cast<CallInst>(Val: V);
1675 if (!Call) {
1676 if (ICmpInst *CI = dyn_cast<ICmpInst>(Val: V)) {
1677 Value *LHS = CI->getOperand(i_nocapture: 0);
1678 Value *RHS = CI->getOperand(i_nocapture: 1);
1679
1680 Type *NewTy = LHS->getType()->getWithNewType(EltTy: NewPtrTy);
1681 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: LHS))
1682 CI->setOperand(i_nocapture: 0, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1683
1684 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: RHS))
1685 CI->setOperand(i_nocapture: 1, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1686
1687 continue;
1688 }
1689
1690 // The operand's value should be corrected on its own and we don't want to
1691 // touch the users.
1692 if (isa<AddrSpaceCastInst>(Val: V))
1693 continue;
1694
1695 assert(V->getType()->isPtrOrPtrVectorTy());
1696
1697 Type *NewTy = V->getType()->getWithNewType(EltTy: NewPtrTy);
1698 V->mutateType(Ty: NewTy);
1699
1700 // Adjust the types of any constant operands.
1701 if (SelectInst *SI = dyn_cast<SelectInst>(Val: V)) {
1702 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: SI->getOperand(i_nocapture: 1)))
1703 SI->setOperand(i_nocapture: 1, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1704
1705 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: SI->getOperand(i_nocapture: 2)))
1706 SI->setOperand(i_nocapture: 2, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1707 } else if (PHINode *Phi = dyn_cast<PHINode>(Val: V)) {
1708 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1709 if (isa<ConstantPointerNull, ConstantAggregateZero>(
1710 Val: Phi->getIncomingValue(i: I)))
1711 Phi->setIncomingValue(i: I, V: Constant::getNullValue(Ty: NewTy));
1712 }
1713 }
1714
1715 continue;
1716 }
1717
1718 IntrinsicInst *Intr = cast<IntrinsicInst>(Val: Call);
1719 Builder.SetInsertPoint(Intr);
1720 switch (Intr->getIntrinsicID()) {
1721 case Intrinsic::lifetime_start:
1722 case Intrinsic::lifetime_end:
1723 // These intrinsics are for address space 0 only
1724 Intr->eraseFromParent();
1725 continue;
1726 case Intrinsic::memcpy:
1727 case Intrinsic::memmove:
1728 // These have 2 pointer operands. In case if second pointer also needs
1729 // to be replaced we defer processing of these intrinsics until all
1730 // other values are processed.
1731 DeferredIntrs.insert(X: Intr);
1732 continue;
1733 case Intrinsic::memset: {
1734 MemSetInst *MemSet = cast<MemSetInst>(Val: Intr);
1735 Builder.CreateMemSet(Ptr: MemSet->getRawDest(), Val: MemSet->getValue(),
1736 Size: MemSet->getLength(), Align: MemSet->getDestAlign(),
1737 isVolatile: MemSet->isVolatile());
1738 Intr->eraseFromParent();
1739 continue;
1740 }
1741 case Intrinsic::invariant_start:
1742 case Intrinsic::invariant_end:
1743 case Intrinsic::launder_invariant_group:
1744 case Intrinsic::strip_invariant_group: {
1745 SmallVector<Value *> Args;
1746 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1747 Args.emplace_back(Args: Intr->getArgOperand(i: 0));
1748 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1749 Args.emplace_back(Args: Intr->getArgOperand(i: 0));
1750 Args.emplace_back(Args: Intr->getArgOperand(i: 1));
1751 }
1752 Args.emplace_back(Args&: Offset);
1753 Function *F = Intrinsic::getOrInsertDeclaration(
1754 M: Intr->getModule(), id: Intr->getIntrinsicID(), Tys: Offset->getType());
1755 CallInst *NewIntr =
1756 CallInst::Create(Func: F, Args, NameStr: Intr->getName(), InsertBefore: Intr->getIterator());
1757 Intr->mutateType(Ty: NewIntr->getType());
1758 Intr->replaceAllUsesWith(V: NewIntr);
1759 Intr->eraseFromParent();
1760 continue;
1761 }
1762 case Intrinsic::objectsize: {
1763 Value *Src = Intr->getOperand(i_nocapture: 0);
1764
1765 CallInst *NewCall = Builder.CreateIntrinsic(
1766 ID: Intrinsic::objectsize,
1767 Types: {Intr->getType(), PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS)},
1768 Args: {Src, Intr->getOperand(i_nocapture: 1), Intr->getOperand(i_nocapture: 2), Intr->getOperand(i_nocapture: 3)});
1769 Intr->replaceAllUsesWith(V: NewCall);
1770 Intr->eraseFromParent();
1771 continue;
1772 }
1773 default:
1774 Intr->print(O&: errs());
1775 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1776 }
1777 }
1778
1779 return true;
1780}
1781
1782void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1783 SetVector<IntrinsicInst *> &DeferredIntrs) {
1784
1785 for (IntrinsicInst *Intr : DeferredIntrs) {
1786 IRBuilder<> Builder(Intr);
1787 Builder.SetInsertPoint(Intr);
1788 Intrinsic::ID ID = Intr->getIntrinsicID();
1789 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1790
1791 MemTransferInst *MI = cast<MemTransferInst>(Val: Intr);
1792 auto *B = Builder.CreateMemTransferInst(
1793 IntrID: ID, Dst: MI->getRawDest(), DstAlign: MI->getDestAlign(), Src: MI->getRawSource(),
1794 SrcAlign: MI->getSourceAlign(), Size: MI->getLength(), isVolatile: MI->isVolatile());
1795
1796 for (unsigned I = 0; I != 2; ++I) {
1797 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(i: I)) {
1798 B->addDereferenceableParamAttr(i: I, Bytes);
1799 }
1800 }
1801
1802 Intr->eraseFromParent();
1803 }
1804}
1805