1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
// Note that both passes exist for the legacy and new pass managers. The new PM
// passes are declared in AMDGPU.h and the legacy PM ones are declared here.
25//
26//===----------------------------------------------------------------------===//
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/Analysis/CaptureTracking.h"
33#include "llvm/Analysis/InstSimplifyFolder.h"
34#include "llvm/Analysis/InstructionSimplify.h"
35#include "llvm/Analysis/LoopInfo.h"
36#include "llvm/Analysis/ValueTracking.h"
37#include "llvm/CodeGen/TargetPassConfig.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/PatternMatch.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Support/MathExtras.h"
46#include "llvm/Target/TargetMachine.h"
47#include "llvm/Transforms/Utils/SSAUpdater.h"
48
49#define DEBUG_TYPE "amdgpu-promote-alloca"
50
51using namespace llvm;
52
53namespace {
54
55static cl::opt<bool>
56 DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
57 cl::desc("Disable promote alloca to vector"),
58 cl::init(Val: false));
59
60static cl::opt<bool>
61 DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
62 cl::desc("Disable promote alloca to LDS"),
63 cl::init(Val: false));
64
65static cl::opt<unsigned> PromoteAllocaToVectorLimit(
66 "amdgpu-promote-alloca-to-vector-limit",
67 cl::desc("Maximum byte size to consider promote alloca to vector"),
68 cl::init(Val: 0));
69
70static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
71 "amdgpu-promote-alloca-to-vector-max-regs",
72 cl::desc(
73 "Maximum vector size (in 32b registers) to use when promoting alloca"),
74 cl::init(Val: 32));
75
76// Use up to 1/4 of available register budget for vectorization.
77// FIXME: Increase the limit for whole function budgets? Perhaps x2?
78static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
79 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
80 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
81 cl::init(Val: 4));
82
83static cl::opt<unsigned>
84 LoopUserWeight("promote-alloca-vector-loop-user-weight",
85 cl::desc("The bonus weight of users of allocas within loop "
86 "when sorting profitable allocas"),
87 cl::init(Val: 4));
88
// We support vector indices of the form (VarIndex * VarMul) + ConstIndex.
// All parts are optional.
91struct GEPToVectorIndex {
92 Value *VarIndex = nullptr; // defaults to 0
93 ConstantInt *VarMul = nullptr; // defaults to 1
94 ConstantInt *ConstIndex = nullptr; // defaults to 0
95 Value *Full = nullptr;
96};
97
98struct MemTransferInfo {
99 ConstantInt *SrcIndex = nullptr;
100 ConstantInt *DestIndex = nullptr;
101};
102
103// Analysis for planning the different strategies of alloca promotion.
104struct AllocaAnalysis {
105 AllocaInst *Alloca = nullptr;
106 DenseSet<Value *> Pointers;
107 SmallVector<Use *> Uses;
108 unsigned Score = 0;
109 bool HaveSelectOrPHI = false;
110 struct {
111 FixedVectorType *Ty = nullptr;
112 SmallVector<Instruction *> Worklist;
113 SmallVector<Instruction *> UsersToRemove;
114 MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
115 MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
116 } Vector;
117 struct {
118 bool Enable = false;
119 SmallVector<User *> Worklist;
120 } LDS;
121
122 explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
123};
124
125// Shared implementation which can do both promotion to vector and to LDS.
126class AMDGPUPromoteAllocaImpl {
127private:
128 const TargetMachine &TM;
129 LoopInfo &LI;
130 Module *Mod = nullptr;
131 const DataLayout *DL = nullptr;
132
133 // FIXME: This should be per-kernel.
134 uint32_t LocalMemLimit = 0;
135 uint32_t CurrentLocalMemUsage = 0;
136 unsigned MaxVGPRs;
137 unsigned VGPRBudgetRatio;
138 unsigned MaxVectorRegs;
139
140 bool IsAMDGCN = false;
141 bool IsAMDHSA = false;
142
143 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
144 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
145
146 bool collectAllocaUses(AllocaAnalysis &AA) const;
147
  /// \p Val is a pointer derived from \p Alloca. \p OpIdx0 and \p OpIdx1 are
  /// the operand indices of an instruction with two pointer inputs (e.g.
  /// select, icmp). Returns true if both pointer operands are derived from the
  /// same alloca. \p Val must be the same value as one of the pointer operands
  /// of \p UseInst.
152 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
153 Instruction *UseInst, int OpIdx0,
154 int OpIdx1) const;
155
156 /// Check whether we have enough local memory for promotion.
157 bool hasSufficientLocalMem(const Function &F);
158
159 FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
160 void analyzePromoteToVector(AllocaAnalysis &AA) const;
161 void promoteAllocaToVector(AllocaAnalysis &AA);
162 void analyzePromoteToLDS(AllocaAnalysis &AA) const;
163 bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
164 SetVector<IntrinsicInst *> &DeferredIntrs);
165 void
166 finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);
167
168 void scoreAlloca(AllocaAnalysis &AA) const;
169
170 void setFunctionLimits(const Function &F);
171
172public:
173 AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
174
175 const Triple &TT = TM.getTargetTriple();
176 IsAMDGCN = TT.isAMDGCN();
177 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
178 }
179
180 bool run(Function &F, bool PromoteToLDS);
181};
182
183// FIXME: This can create globals so should be a module pass.
184class AMDGPUPromoteAlloca : public FunctionPass {
185public:
186 static char ID;
187
188 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
189
190 bool runOnFunction(Function &F) override {
191 if (skipFunction(F))
192 return false;
193 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
194 return AMDGPUPromoteAllocaImpl(
195 TPC->getTM<TargetMachine>(),
196 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
197 .run(F, /*PromoteToLDS*/ true);
198 return false;
199 }
200
201 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
202
203 void getAnalysisUsage(AnalysisUsage &AU) const override {
204 AU.setPreservesCFG();
205 AU.addRequired<LoopInfoWrapperPass>();
206 FunctionPass::getAnalysisUsage(AU);
207 }
208};
209
210static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
211 const Function &F) {
212 if (!TM.getTargetTriple().isAMDGCN())
213 return 128;
214
215 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
216
217 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
218 // Temporarily check both the attribute and the subtarget feature, until the
219 // latter is removed.
220 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
221 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
222
223 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
224 WavesPerEU: ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
225 DynamicVGPRBlockSize);
226
  // A non-entry function has only 32 caller-preserved registers.
  // Do not promote an alloca that would force spilling unless we know the
  // function will be inlined.
230 if (!F.hasFnAttribute(Kind: Attribute::AlwaysInline) &&
231 !AMDGPU::isEntryFunctionCC(CC: F.getCallingConv()))
232 MaxVGPRs = std::min(a: MaxVGPRs, b: 32u);
233 return MaxVGPRs;
234}
235
236} // end anonymous namespace
237
238char AMDGPUPromoteAlloca::ID = 0;
239
240INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
241 "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promoting allocas, for an
// accurate estimate of the LDS available.
244INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
245INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
246INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
247 "AMDGPU promote alloca to vector or LDS", false, false)
248
249char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
250
251PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
252 FunctionAnalysisManager &AM) {
253 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
254 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
255 if (Changed) {
256 PreservedAnalyses PA;
257 PA.preserveSet<CFGAnalyses>();
258 return PA;
259 }
260 return PreservedAnalyses::all();
261}
262
263PreservedAnalyses
264AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
265 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
266 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
267 if (Changed) {
268 PreservedAnalyses PA;
269 PA.preserveSet<CFGAnalyses>();
270 return PA;
271 }
272 return PreservedAnalyses::all();
273}
274
275FunctionPass *llvm::createAMDGPUPromoteAlloca() {
276 return new AMDGPUPromoteAlloca();
277}
278
279bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
280 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
281 LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
282 << " " << *Inst << "\n");
283 return false;
284 };
285
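  // Walk the alloca and every pointer derived from it (through GEPs, selects
  // and phis), recording each use. Bail out if the pointer escapes, e.g. by
  // being stored somewhere as a value.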
286 SmallVector<Instruction *, 4> WorkList({AA.Alloca});
287 while (!WorkList.empty()) {
288 auto *Cur = WorkList.pop_back_val();
289 if (find(Range&: AA.Pointers, Val: Cur) != AA.Pointers.end())
290 continue;
291 AA.Pointers.insert(V: Cur);
292 for (auto &U : Cur->uses()) {
293 auto *Inst = cast<Instruction>(Val: U.getUser());
294 if (isa<StoreInst>(Val: Inst)) {
295 if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
296 return RejectUser(Inst, "pointer escapes via store");
297 }
298 }
299 AA.Uses.push_back(Elt: &U);
300
301 if (isa<GetElementPtrInst>(Val: U.getUser())) {
302 WorkList.push_back(Elt: Inst);
303 } else if (auto *SI = dyn_cast<SelectInst>(Val: Inst)) {
304 // Only promote a select if we know that the other select operand is
305 // from another pointer that will also be promoted.
306 if (!binaryOpIsDerivedFromSameAlloca(Alloca: AA.Alloca, Val: Cur, UseInst: SI, OpIdx0: 1, OpIdx1: 2))
307 return RejectUser(Inst, "select from mixed objects");
308 WorkList.push_back(Elt: Inst);
309 AA.HaveSelectOrPHI = true;
310 } else if (auto *Phi = dyn_cast<PHINode>(Val: Inst)) {
311 // Repeat for phis.
312
313 // TODO: Handle more complex cases. We should be able to replace loops
314 // over arrays.
315 switch (Phi->getNumIncomingValues()) {
316 case 1:
317 break;
318 case 2:
319 if (!binaryOpIsDerivedFromSameAlloca(Alloca: AA.Alloca, Val: Cur, UseInst: Phi, OpIdx0: 0, OpIdx1: 1))
320 return RejectUser(Inst, "phi from mixed objects");
321 break;
322 default:
323 return RejectUser(Inst, "phi with too many operands");
324 }
325
326 WorkList.push_back(Elt: Inst);
327 AA.HaveSelectOrPHI = true;
328 }
329 }
330 }
331 return true;
332}
333
334void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
335 LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
336 unsigned Score = 0;
337 // Increment score by one for each user + a bonus for users within loops.
338 for (auto *U : AA.Uses) {
339 Instruction *Inst = cast<Instruction>(Val: U->getUser());
340 if (isa<GetElementPtrInst>(Val: Inst) || isa<SelectInst>(Val: Inst) ||
341 isa<PHINode>(Val: Inst))
342 continue;
343 unsigned UserScore =
344 1 + (LoopUserWeight * LI.getLoopDepth(BB: Inst->getParent()));
345 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
346 Score += UserScore;
347 }
348 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
349 AA.Score = Score;
350}
351
352void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
  // Load per-function limits, overriding them with global options where those
  // are explicitly set. R600 register tuples/aliasing are fragile with large
  // vector promotions, so apply an architecture-specific limit here.
356 const int R600MaxVectorRegs = 16;
357 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
358 Kind: "amdgpu-promote-alloca-to-vector-max-regs",
359 Default: IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
360 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
361 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
362 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
363 Kind: "amdgpu-promote-alloca-to-vector-vgpr-ratio",
364 Default: PromoteAllocaToVectorVGPRRatio);
365 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
366 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
367}
368
369bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
370 Mod = F.getParent();
371 DL = &Mod->getDataLayout();
372
373 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
374 if (!ST.enablePromoteAlloca())
375 return false;
376
377 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
378 MaxVGPRs = getMaxVGPRs(LDSBytes: CurrentLocalMemUsage, TM, F);
379 setFunctionLimits(F);
380
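  // The vectorization budget is measured in bits: either the user-provided
  // byte limit converted to bits, or the number of usable VGPRs times 32 bits;
  // in both cases the result is divided by the VGPR budget ratio.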
381 unsigned VectorizationBudget =
382 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
383 : (MaxVGPRs * 32)) /
384 VGPRBudgetRatio;
385
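  // Collect every promotable static alloca from the entry block and analyze
  // which promotion strategies (vector and/or LDS) apply to it.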
386 std::vector<AllocaAnalysis> Allocas;
387 for (Instruction &I : F.getEntryBlock()) {
388 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: &I)) {
389 // Array allocations are probably not worth handling, since an allocation
390 // of the array type is the canonical form.
391 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
392 continue;
393
394 LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
395
396 AllocaAnalysis AA{AI};
397 if (collectAllocaUses(AA)) {
398 analyzePromoteToVector(AA);
399 if (PromoteToLDS)
400 analyzePromoteToLDS(AA);
401 if (AA.Vector.Ty || AA.LDS.Enable) {
402 scoreAlloca(AA);
403 Allocas.push_back(x: std::move(AA));
404 }
405 }
406 }
407 }
408
409 stable_sort(Range&: Allocas,
410 C: [](const auto &A, const auto &B) { return A.Score > B.Score; });
411
412 // clang-format off
413 LLVM_DEBUG(
414 dbgs() << "Sorted Worklist:\n";
415 for (const auto &AA : Allocas)
416 dbgs() << " " << *AA.Alloca << "\n";
417 );
418 // clang-format on
419
420 bool Changed = false;
421 SetVector<IntrinsicInst *> DeferredIntrs;
422 for (AllocaAnalysis &AA : Allocas) {
423 if (AA.Vector.Ty) {
424 const unsigned AllocaCost =
425 DL->getTypeSizeInBits(Ty: AA.Alloca->getAllocatedType());
426 // First, check if we have enough budget to vectorize this alloca.
427 if (AllocaCost <= VectorizationBudget) {
428 promoteAllocaToVector(AA);
429 Changed = true;
430 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
431 "Underflow!");
432 VectorizationBudget -= AllocaCost;
433 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
434 << VectorizationBudget << "\n");
435 continue;
436 } else {
437 LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
438 << AllocaCost << ", budget:" << VectorizationBudget
439 << "): " << *AA.Alloca << "\n");
440 }
441 }
442
443 if (AA.LDS.Enable &&
444 tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
445 Changed = true;
446 }
447 finishDeferredAllocaToLDSPromotion(DeferredIntrs);
448
  // NOTE: promoteAllocaToVector removes the alloca, so Allocas contains
  // dangling pointers. If we want to reuse it past this point, the loop above
  // would need to be updated to remove successfully promoted allocas.
452
453 return Changed;
454}
455
456// Checks if the instruction I is a memset user of the alloca AI that we can
457// deal with. Currently, only non-volatile memsets that affect the whole alloca
458// are handled.
459static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
460 const DataLayout &DL) {
461 using namespace PatternMatch;
462 // For now we only care about non-volatile memsets that affect the whole type
463 // (start at index 0 and fill the whole alloca).
464 //
465 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
466 // (except maybe volatile ones?) - we just need to use shufflevector if it
467 // only affects a subset of the vector.
468 const unsigned Size = DL.getTypeStoreSize(Ty: AI->getAllocatedType());
469 return I->getOperand(i_nocapture: 0) == AI &&
470 match(V: I->getOperand(i_nocapture: 2), P: m_SpecificInt(V: Size)) && !I->isVolatile();
471}
472
473static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
474 IRBuilder<> B(Ptr->getContext());
475
476 Ptr = Ptr->stripPointerCasts();
477 if (Ptr == AA.Alloca)
478 return B.getInt32(C: 0);
479
480 auto *GEP = cast<GetElementPtrInst>(Val: Ptr);
481 auto I = AA.Vector.GEPVectorIdx.find(Key: GEP);
482 assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
483
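  // Lazily materialize the index expression (VarIndex * VarMul + ConstIndex)
  // for this GEP and cache it in Full so repeated queries reuse the same
  // value.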
484 if (!I->second.Full) {
485 Value *Result = nullptr;
486 B.SetInsertPoint(GEP);
487
488 if (I->second.VarIndex) {
489 Result = I->second.VarIndex;
490 Result = B.CreateSExtOrTrunc(V: Result, DestTy: B.getInt32Ty());
491
492 if (I->second.VarMul)
493 Result = B.CreateMul(LHS: Result, RHS: I->second.VarMul);
494 }
495
496 if (I->second.ConstIndex) {
497 if (Result)
498 Result = B.CreateAdd(LHS: Result, RHS: I->second.ConstIndex);
499 else
500 Result = I->second.ConstIndex;
501 }
502
503 if (!Result)
504 Result = B.getInt32(C: 0);
505
506 I->second.Full = Result;
507 }
508
509 return I->second.Full;
510}
511
512static std::optional<GEPToVectorIndex>
513computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
514 Type *VecElemTy, const DataLayout &DL) {
515 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
516 // helper.
517 LLVMContext &Ctx = GEP->getContext();
518 unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType());
519 SmallMapVector<Value *, APInt, 4> VarOffsets;
520 APInt ConstOffset(BW, 0);
521
522 // Walk backwards through nested GEPs to collect both constant and variable
523 // offsets, so that nested vector GEP chains can be lowered in one step.
524 //
525 // Given this IR fragment as input:
526 //
527 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
528 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
529 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
530 // %3 = load i32, ptr addrspace(5) %2, align 4
531 //
532 // Combine both GEP operations in a single pass, producing:
533 // BasePtr = %0
534 // ConstOffset = 4
535 // VarOffsets = { %j -> element_size(<2 x i32>) }
536 //
537 // That lets us emit a single buffer_load directly into a VGPR, without ever
538 // allocating scratch memory for the intermediate pointer.
539 Value *CurPtr = GEP;
540 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(Val: CurPtr)) {
541 if (!CurGEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ConstOffset))
542 return {};
543
544 // Move to the next outer pointer.
545 CurPtr = CurGEP->getPointerOperand();
546 }
547
548 assert(CurPtr == Alloca && "GEP not based on alloca");
549
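  // The collected byte offsets must be exact multiples of the vector element
  // size so they can be converted into element indices.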
550 int64_t VecElemSize = DL.getTypeAllocSize(Ty: VecElemTy);
551 if (VarOffsets.size() > 1)
552 return {};
553
554 APInt IndexQuot;
555 int64_t Rem;
556 APInt::sdivrem(LHS: ConstOffset, RHS: VecElemSize, Quotient&: IndexQuot, Remainder&: Rem);
557 if (Rem != 0)
558 return {};
559
560 GEPToVectorIndex Result;
561
562 if (!ConstOffset.isZero())
563 Result.ConstIndex = ConstantInt::get(Context&: Ctx, V: IndexQuot.sextOrTrunc(width: BW));
564
565 if (VarOffsets.empty())
566 return Result;
567
568 const auto &VarOffset = VarOffsets.front();
569 APInt OffsetQuot;
570 APInt::sdivrem(LHS: VarOffset.second, RHS: VecElemSize, Quotient&: OffsetQuot, Remainder&: Rem);
571 if (Rem != 0 || OffsetQuot.isZero())
572 return {};
573
574 Result.VarIndex = VarOffset.first;
575 auto *OffsetType = dyn_cast<IntegerType>(Val: Result.VarIndex->getType());
576 if (!OffsetType)
577 return {};
578
579 if (!OffsetQuot.isOne())
580 Result.VarMul = ConstantInt::get(Context&: Ctx, V: OffsetQuot.sextOrTrunc(width: BW));
581
582 return Result;
583}
584
/// Promotes a single user of the alloca to a vector form.
///
/// \param Inst Instruction to be promoted.
/// \param DL Module Data Layout.
/// \param AA Alloca Analysis.
/// \param VecStoreSize Size of the promoted vector type in bytes.
/// \param ElementSize Size of the promoted vector's element type in bytes.
/// \param GetCurVal Callback returning the current value of the vector
///        (e.g. the last stored value), queried on demand.
/// \return the updated vector value if \p Inst would have written to the
///         alloca, or nullptr otherwise.
598static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
599 AllocaAnalysis &AA,
600 unsigned VecStoreSize,
601 unsigned ElementSize,
602 function_ref<Value *()> GetCurVal) {
603 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
604 // to do more folding, especially in the case of vector splats.
605 IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
606 InstSimplifyFolder(DL));
607 Builder.SetInsertPoint(Inst);
608
609 Type *VecEltTy = AA.Vector.Ty->getElementType();
610
611 switch (Inst->getOpcode()) {
612 case Instruction::Load: {
613 Value *CurVal = GetCurVal();
614 Value *Index =
615 calculateVectorIndex(Ptr: cast<LoadInst>(Val: Inst)->getPointerOperand(), AA);
616
617 // We're loading the full vector.
618 Type *AccessTy = Inst->getType();
619 TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
620 if (Constant *CI = dyn_cast<Constant>(Val: Index)) {
621 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
622 Inst->replaceAllUsesWith(
623 V: Builder.CreateBitPreservingCastChain(DL, V: CurVal, NewTy: AccessTy));
624 return nullptr;
625 }
626 }
627
628 // Loading a subvector.
629 if (isa<FixedVectorType>(Val: AccessTy)) {
630 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
631 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
632 auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumLoadedElts);
633 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
634
      // If the index is dynamic, sandwich the extract between bitcasts, e.g.
      // for VectorTy = <64 x i8>, SubVecTy = <16 x i8>, AccessTy = <8 x i16>:
      //   <64 x i8> -> <4 x i128> -> extract i128 -> bitcast to <8 x i16>
      // Extracting a subvector with a dynamic index has a very large expansion
      // in the AMDGPU backend, so limit this to power-of-2 subvector sizes.
641 FixedVectorType *VectorTy = AA.Vector.Ty;
642 TypeSize NumBits = DL.getTypeStoreSize(Ty: SubVecTy) * 8u;
643 uint64_t LoadAlign = cast<LoadInst>(Val: Inst)->getAlign().value();
644 bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
645 unsigned TotalNumElts = VectorTy->getNumElements();
646 bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
647 if (!isa<ConstantInt>(Val: Index) &&
648 llvm::isPowerOf2_32(Value: SubVecTy->getNumElements()) &&
649 IsProperlyDivisible && IsAlignedLoad) {
650 IntegerType *NewElemTy = Builder.getIntNTy(N: NumBits);
651 const unsigned NewNumElts =
652 DL.getTypeStoreSize(Ty: VectorTy) * 8u / NumBits;
653 const unsigned LShrAmt = llvm::Log2_32(Value: SubVecTy->getNumElements());
654 FixedVectorType *BitCastTy =
655 FixedVectorType::get(ElementType: NewElemTy, NumElts: NewNumElts);
656 Value *BCVal = Builder.CreateBitCast(V: CurVal, DestTy: BitCastTy);
657 Value *NewIdx = Builder.CreateLShr(
658 LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: LShrAmt));
659 Value *ExtVal = Builder.CreateExtractElement(Vec: BCVal, Idx: NewIdx);
660 Value *BCOut = Builder.CreateBitCast(V: ExtVal, DestTy: AccessTy);
661 Inst->replaceAllUsesWith(V: BCOut);
662 return nullptr;
663 }
664
665 Value *SubVec = PoisonValue::get(T: SubVecTy);
666 for (unsigned K = 0; K < NumLoadedElts; ++K) {
667 Value *CurIdx =
668 Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
669 SubVec = Builder.CreateInsertElement(
670 Vec: SubVec, NewElt: Builder.CreateExtractElement(Vec: CurVal, Idx: CurIdx), Idx: K);
671 }
672
673 Inst->replaceAllUsesWith(
674 V: Builder.CreateBitPreservingCastChain(DL, V: SubVec, NewTy: AccessTy));
675 return nullptr;
676 }
677
678 // We're loading one element.
679 Value *ExtractElement = Builder.CreateExtractElement(Vec: CurVal, Idx: Index);
680 if (AccessTy != VecEltTy)
681 ExtractElement = Builder.CreateBitOrPointerCast(V: ExtractElement, DestTy: AccessTy);
682
683 Inst->replaceAllUsesWith(V: ExtractElement);
684 return nullptr;
685 }
686 case Instruction::Store: {
687 // For stores, it's a bit trickier and it depends on whether we're storing
688 // the full vector or not. If we're storing the full vector, we don't need
689 // to know the current value. If this is a store of a single element, we
690 // need to know the value.
691 StoreInst *SI = cast<StoreInst>(Val: Inst);
692 Value *Index = calculateVectorIndex(Ptr: SI->getPointerOperand(), AA);
693 Value *Val = SI->getValueOperand();
694
    // If we're storing the full vector, we can handle it without knowing
    // CurVal.
696 Type *AccessTy = Val->getType();
697 TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
698 if (Constant *CI = dyn_cast<Constant>(Val: Index))
699 if (CI->isZeroValue() && AccessSize == VecStoreSize)
700 return Builder.CreateBitPreservingCastChain(DL, V: Val, NewTy: AA.Vector.Ty);
701
702 // Storing a subvector.
703 if (isa<FixedVectorType>(Val: AccessTy)) {
704 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
705 const unsigned NumWrittenElts =
706 AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
707 const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
708 auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumWrittenElts);
709 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
710
711 Val = Builder.CreateBitPreservingCastChain(DL, V: Val, NewTy: SubVecTy);
712 Value *CurVec = GetCurVal();
713 for (unsigned K = 0, NumElts = std::min(a: NumWrittenElts, b: NumVecElts);
714 K < NumElts; ++K) {
715 Value *CurIdx =
716 Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
717 CurVec = Builder.CreateInsertElement(
718 Vec: CurVec, NewElt: Builder.CreateExtractElement(Vec: Val, Idx: K), Idx: CurIdx);
719 }
720 return CurVec;
721 }
722
723 if (Val->getType() != VecEltTy)
724 Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: VecEltTy);
725 return Builder.CreateInsertElement(Vec: GetCurVal(), NewElt: Val, Idx: Index);
726 }
727 case Instruction::Call: {
728 if (auto *MTI = dyn_cast<MemTransferInst>(Val: Inst)) {
729 // For memcpy, we need to know curval.
730 ConstantInt *Length = cast<ConstantInt>(Val: MTI->getLength());
731 unsigned NumCopied = Length->getZExtValue() / ElementSize;
732 MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
733 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
734 unsigned DestBegin = TI->DestIndex->getZExtValue();
735
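      // Build a shuffle mask that copies NumCopied elements starting at
      // SrcBegin into the lanes [DestBegin, DestBegin + NumCopied) and keeps
      // every other lane unchanged.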
736 SmallVector<int> Mask;
737 for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
738 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
739 Mask.push_back(Elt: SrcBegin < AA.Vector.Ty->getNumElements()
740 ? SrcBegin++
741 : PoisonMaskElem);
742 } else {
743 Mask.push_back(Elt: Idx);
744 }
745 }
746
747 return Builder.CreateShuffleVector(V: GetCurVal(), Mask);
748 }
749
750 if (auto *MSI = dyn_cast<MemSetInst>(Val: Inst)) {
751 // For memset, we don't need to know the previous value because we
752 // currently only allow memsets that cover the whole alloca.
753 Value *Elt = MSI->getOperand(i_nocapture: 1);
754 const unsigned BytesPerElt = DL.getTypeStoreSize(Ty: VecEltTy);
755 if (BytesPerElt > 1) {
756 Value *EltBytes = Builder.CreateVectorSplat(NumElts: BytesPerElt, V: Elt);
757
        // If the element type of the vector is a pointer, we need to first
        // bitcast the splatted bytes to an integer of pointer width, then use
        // an inttoptr to get the pointer element.
760 if (VecEltTy->isPointerTy()) {
761 Type *PtrInt = Builder.getIntNTy(N: BytesPerElt * 8);
762 Elt = Builder.CreateBitCast(V: EltBytes, DestTy: PtrInt);
763 Elt = Builder.CreateIntToPtr(V: Elt, DestTy: VecEltTy);
764 } else
765 Elt = Builder.CreateBitCast(V: EltBytes, DestTy: VecEltTy);
766 }
767
768 return Builder.CreateVectorSplat(EC: AA.Vector.Ty->getElementCount(), V: Elt);
769 }
770
771 if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
772 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
773 Intr->replaceAllUsesWith(
774 V: Builder.getIntN(N: Intr->getType()->getIntegerBitWidth(),
775 C: DL.getTypeAllocSize(Ty: AA.Vector.Ty)));
776 return nullptr;
777 }
778 }
779
780 llvm_unreachable("Unsupported call when promoting alloca to vector");
781 }
782
783 default:
784 llvm_unreachable("Inconsistency in instructions promotable to vector");
785 }
786
787 llvm_unreachable("Did not return after promoting instruction!");
788}
789
790static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
791 const DataLayout &DL) {
792 // Access as a vector type can work if the size of the access vector is a
793 // multiple of the size of the alloca's vector element type.
794 //
795 // Examples:
796 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
797 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
798 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
799 // - 3*32 is not a multiple of 64
800 //
801 // We could handle more complicated cases, but it'd make things a lot more
802 // complicated.
803 if (isa<FixedVectorType>(Val: AccessTy)) {
804 TypeSize AccTS = DL.getTypeStoreSize(Ty: AccessTy);
    // If the type size and the store size don't match, we would need to do
    // more than just a bitcast to translate between an extracted/inserted
    // subvector and the accessed value.
808 if (AccTS * 8 != DL.getTypeSizeInBits(Ty: AccessTy))
809 return false;
810 TypeSize VecTS = DL.getTypeStoreSize(Ty: VecTy->getElementType());
811 return AccTS.isKnownMultipleOf(RHS: VecTS);
812 }
813
814 return CastInst::isBitOrNoopPointerCastable(SrcTy: VecTy->getElementType(), DestTy: AccessTy,
815 DL);
816}
817
/// Iterates over an instruction worklist that may contain multiple
/// instructions from the same basic block, possibly listed out of program
/// order; instructions within a block are visited in program order.
820template <typename InstContainer>
821static void forEachWorkListItem(const InstContainer &WorkList,
822 std::function<void(Instruction *)> Fn) {
823 // Bucket up uses of the alloca by the block they occur in.
824 // This is important because we have to handle multiple defs/uses in a block
825 // ourselves: SSAUpdater is purely for cross-block references.
826 DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
827 for (Instruction *User : WorkList)
828 UsesByBlock[User->getParent()].insert(V: User);
829
830 for (Instruction *User : WorkList) {
831 BasicBlock *BB = User->getParent();
832 auto &BlockUses = UsesByBlock[BB];
833
834 // Already processed, skip.
835 if (BlockUses.empty())
836 continue;
837
838 // Only user in the block, directly process it.
839 if (BlockUses.size() == 1) {
840 Fn(User);
841 continue;
842 }
843
844 // Multiple users in the block, do a linear scan to see users in order.
845 for (Instruction &Inst : *BB) {
846 if (!BlockUses.contains(V: &Inst))
847 continue;
848
849 Fn(&Inst);
850 }
851
852 // Clear the block so we know it's been processed.
853 BlockUses.clear();
854 }
855}
856
857/// Find an insert point after an alloca, after all other allocas clustered at
858/// the start of the block.
859static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
860 BasicBlock::iterator I) {
861 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(Val: *I); ++I)
862 ;
863 return I;
864}
865
866FixedVectorType *
867AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
868 if (DisablePromoteAllocaToVector) {
869 LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
870 return nullptr;
871 }
872
873 auto *VectorTy = dyn_cast<FixedVectorType>(Val: AllocaTy);
874 if (auto *ArrayTy = dyn_cast<ArrayType>(Val: AllocaTy)) {
875 uint64_t NumElems = 1;
876 Type *ElemTy;
877 do {
878 NumElems *= ArrayTy->getNumElements();
879 ElemTy = ArrayTy->getElementType();
880 } while ((ArrayTy = dyn_cast<ArrayType>(Val: ElemTy)));
881
882 // Check for array of vectors
883 auto *InnerVectorTy = dyn_cast<FixedVectorType>(Val: ElemTy);
884 if (InnerVectorTy) {
885 NumElems *= InnerVectorTy->getNumElements();
886 ElemTy = InnerVectorTy->getElementType();
887 }
888
889 if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
890 unsigned ElementSize = DL->getTypeSizeInBits(Ty: ElemTy) / 8;
891 if (ElementSize > 0) {
892 unsigned AllocaSize = DL->getTypeStoreSize(Ty: AllocaTy);
893 // Expand vector if required to match padding of inner type,
894 // i.e. odd size subvectors.
895 // Storage size of new vector must match that of alloca for correct
896 // behaviour of byte offsets and GEP computation.
897 if (NumElems * ElementSize != AllocaSize)
898 NumElems = AllocaSize / ElementSize;
899 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
900 VectorTy = FixedVectorType::get(ElementType: ElemTy, NumElts: NumElems);
901 }
902 }
903 }
904 if (!VectorTy) {
905 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
906 return nullptr;
907 }
908
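  // A VGPR holds 32 bits, so cap the element count at what fits into
  // MaxVectorRegs 32-bit registers.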
909 const unsigned MaxElements =
910 (MaxVectorRegs * 32) / DL->getTypeSizeInBits(Ty: VectorTy->getElementType());
911
912 if (VectorTy->getNumElements() > MaxElements ||
913 VectorTy->getNumElements() < 2) {
914 LLVM_DEBUG(dbgs() << " " << *VectorTy
915 << " has an unsupported number of elements\n");
916 return nullptr;
917 }
918
919 Type *VecEltTy = VectorTy->getElementType();
920 unsigned ElementSizeInBits = DL->getTypeSizeInBits(Ty: VecEltTy);
921 if (ElementSizeInBits != DL->getTypeAllocSizeInBits(Ty: VecEltTy)) {
922 LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
923 "does not match the type's size\n");
924 return nullptr;
925 }
926
927 return VectorTy;
928}
929
930void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
931 if (AA.HaveSelectOrPHI) {
932 LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n");
933 return;
934 }
935
936 Type *AllocaTy = AA.Alloca->getAllocatedType();
937 AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
938 if (!AA.Vector.Ty)
939 return;
940
941 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
942 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
943 << " " << *Inst << "\n");
944 AA.Vector.Ty = nullptr;
945 };
946
947 Type *VecEltTy = AA.Vector.Ty->getElementType();
948 unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / 8;
949 assert(ElementSize > 0);
950 for (auto *U : AA.Uses) {
951 Instruction *Inst = cast<Instruction>(Val: U->getUser());
952
953 if (Value *Ptr = getLoadStorePointerOperand(V: Inst)) {
954 assert(!isa<StoreInst>(Inst) ||
955 U->getOperandNo() == StoreInst::getPointerOperandIndex());
956
957 Type *AccessTy = getLoadStoreType(I: Inst);
958 if (AccessTy->isAggregateType())
959 return RejectUser(Inst, "unsupported load/store as aggregate");
960 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
961
962 // Check that this is a simple access of a vector element.
963 bool IsSimple = isa<LoadInst>(Val: Inst) ? cast<LoadInst>(Val: Inst)->isSimple()
964 : cast<StoreInst>(Val: Inst)->isSimple();
965 if (!IsSimple)
966 return RejectUser(Inst, "not a simple load or store");
967
968 Ptr = Ptr->stripPointerCasts();
969
970 // Alloca already accessed as vector.
971 if (Ptr == AA.Alloca &&
972 DL->getTypeStoreSize(Ty: AA.Alloca->getAllocatedType()) ==
973 DL->getTypeStoreSize(Ty: AccessTy)) {
974 AA.Vector.Worklist.push_back(Elt: Inst);
975 continue;
976 }
977
978 if (!isSupportedAccessType(VecTy: AA.Vector.Ty, AccessTy, DL: *DL))
979 return RejectUser(Inst, "not a supported access type");
980
981 AA.Vector.Worklist.push_back(Elt: Inst);
982 continue;
983 }
984
985 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
986 // If we can't compute a vector index from this GEP, then we can't
987 // promote this alloca to vector.
988 auto Index = computeGEPToVectorIndex(GEP, Alloca: AA.Alloca, VecElemTy: VecEltTy, DL: *DL);
989 if (!Index)
990 return RejectUser(Inst, "cannot compute vector index for GEP");
991
992 AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
993 AA.Vector.UsersToRemove.push_back(Elt: Inst);
994 continue;
995 }
996
997 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Val: Inst);
998 MSI && isSupportedMemset(I: MSI, AI: AA.Alloca, DL: *DL)) {
999 AA.Vector.Worklist.push_back(Elt: Inst);
1000 continue;
1001 }
1002
1003 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Val: Inst)) {
1004 if (TransferInst->isVolatile())
1005 return RejectUser(Inst, "mem transfer inst is volatile");
1006
1007 ConstantInt *Len = dyn_cast<ConstantInt>(Val: TransferInst->getLength());
1008 if (!Len || (Len->getZExtValue() % ElementSize))
1009 return RejectUser(Inst, "mem transfer inst length is non-constant or "
1010 "not a multiple of the vector element size");
1011
1012 auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
1013 if (Ptr == AA.Alloca)
1014 return ConstantInt::get(Context&: Ptr->getContext(), V: APInt(32, 0));
1015
1016 GetElementPtrInst *GEP = cast<GetElementPtrInst>(Val: Ptr);
1017 const auto &GEPI = AA.Vector.GEPVectorIdx.find(Key: GEP)->second;
1018 if (GEPI.VarIndex)
1019 return nullptr;
1020 if (GEPI.ConstIndex)
1021 return GEPI.ConstIndex;
1022 return ConstantInt::get(Context&: Ptr->getContext(), V: APInt(32, 0));
1023 };
1024
1025 MemTransferInfo *TI =
1026 &AA.Vector.TransferInfo.try_emplace(Key: TransferInst).first->second;
1027 unsigned OpNum = U->getOperandNo();
1028 if (OpNum == 0) {
1029 Value *Dest = TransferInst->getDest();
1030 ConstantInt *Index = getConstIndexIntoAlloca(Dest);
1031 if (!Index)
1032 return RejectUser(Inst, "could not calculate constant dest index");
1033 TI->DestIndex = Index;
1034 } else {
1035 assert(OpNum == 1);
1036 Value *Src = TransferInst->getSource();
1037 ConstantInt *Index = getConstIndexIntoAlloca(Src);
1038 if (!Index)
1039 return RejectUser(Inst, "could not calculate constant src index");
1040 TI->SrcIndex = Index;
1041 }
1042 continue;
1043 }
1044
1045 if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
1046 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
1047 AA.Vector.Worklist.push_back(Elt: Inst);
1048 continue;
1049 }
1050 }
1051
1052 // Ignore assume-like intrinsics and comparisons used in assumes.
1053 if (isAssumeLikeIntrinsic(I: Inst)) {
1054 if (!Inst->use_empty())
1055 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
1056 AA.Vector.UsersToRemove.push_back(Elt: Inst);
1057 continue;
1058 }
1059
1060 if (isa<ICmpInst>(Val: Inst) && all_of(Range: Inst->users(), P: [](User *U) {
1061 return isAssumeLikeIntrinsic(I: cast<Instruction>(Val: U));
1062 })) {
1063 AA.Vector.UsersToRemove.push_back(Elt: Inst);
1064 continue;
1065 }
1066
1067 return RejectUser(Inst, "unhandled alloca user");
1068 }
1069
1070 // Follow-up check to ensure we've seen both sides of all transfer insts.
1071 for (const auto &Entry : AA.Vector.TransferInfo) {
1072 const MemTransferInfo &TI = Entry.second;
1073 if (!TI.SrcIndex || !TI.DestIndex)
1074 return RejectUser(Entry.first,
1075 "mem transfer inst between different objects");
1076 AA.Vector.Worklist.push_back(Elt: Entry.first);
1077 }
1078}
1079
1080void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
1081 LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
1082 LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType()
1083 << " -> " << *AA.Vector.Ty << '\n');
1084 const unsigned VecStoreSize = DL->getTypeStoreSize(Ty: AA.Vector.Ty);
1085
1086 Type *VecEltTy = AA.Vector.Ty->getElementType();
1087 const unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / 8;
1088
  // Alloca is uninitialized memory; model that by seeding the SSA value with a
  // frozen poison value.
1091 SSAUpdater Updater;
1092 Updater.Initialize(Ty: AA.Vector.Ty, Name: "promotealloca");
1093
1094 BasicBlock *EntryBB = AA.Alloca->getParent();
1095 BasicBlock::iterator InitInsertPos =
1096 skipToNonAllocaInsertPt(BB&: *EntryBB, I: AA.Alloca->getIterator());
1097 IRBuilder<> Builder(&*InitInsertPos);
1098 Value *AllocaInitValue = Builder.CreateFreeze(V: PoisonValue::get(T: AA.Vector.Ty));
1099 AllocaInitValue->takeName(V: AA.Alloca);
1100
1101 Updater.AddAvailableValue(BB: AA.Alloca->getParent(), V: AllocaInitValue);
1102
1103 // First handle the initial worklist, in basic block order.
1104 //
1105 // Insert a placeholder whenever we need the vector value at the top of a
1106 // basic block.
1107 SmallVector<Instruction *> Placeholders;
1108 forEachWorkListItem(WorkList: AA.Vector.Worklist, Fn: [&](Instruction *I) {
1109 BasicBlock *BB = I->getParent();
1110 auto GetCurVal = [&]() -> Value * {
1111 if (Value *CurVal = Updater.FindValueForBlock(BB))
1112 return CurVal;
1113
1114 if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
1115 return Placeholders.back();
1116
1117 // If the current value in the basic block is not yet known, insert a
1118 // placeholder that we will replace later.
1119 IRBuilder<> Builder(I);
1120 auto *Placeholder = cast<Instruction>(Val: Builder.CreateFreeze(
1121 V: PoisonValue::get(T: AA.Vector.Ty), Name: "promotealloca.placeholder"));
1122 Placeholders.push_back(Elt: Placeholder);
1123 return Placeholders.back();
1124 };
1125
1126 Value *Result = promoteAllocaUserToVector(Inst: I, DL: *DL, AA, VecStoreSize,
1127 ElementSize, GetCurVal);
1128 if (Result)
1129 Updater.AddAvailableValue(BB, V: Result);
1130 });
1131
1132 // Now fixup the placeholders.
1133 SmallVector<Value *> PlaceholderToNewVal(Placeholders.size());
1134 for (auto [Index, Placeholder] : enumerate(First&: Placeholders)) {
1135 Value *NewVal = Updater.GetValueInMiddleOfBlock(BB: Placeholder->getParent());
1136 PlaceholderToNewVal[Index] = NewVal;
1137 Placeholder->replaceAllUsesWith(V: NewVal);
1138 }
1139 // Note: we cannot merge this loop with the previous one because it is
1140 // possible that the placeholder itself can be used in the SSAUpdater. The
1141 // replaceAllUsesWith doesn't replace those uses.
1142 for (auto [Index, Placeholder] : enumerate(First&: Placeholders)) {
1143 if (!Placeholder->use_empty())
1144 Placeholder->replaceAllUsesWith(V: PlaceholderToNewVal[Index]);
1145 Placeholder->eraseFromParent();
1146 }
1147
1148 // Delete all instructions.
1149 for (Instruction *I : AA.Vector.Worklist) {
1150 assert(I->use_empty());
1151 I->eraseFromParent();
1152 }
1153
  // Delete all the users that are known to be removable.
1155 for (Instruction *I : reverse(C&: AA.Vector.UsersToRemove)) {
1156 I->dropDroppableUses();
1157 assert(I->use_empty());
1158 I->eraseFromParent();
1159 }
1160
1161 // Alloca should now be dead too.
1162 assert(AA.Alloca->use_empty());
1163 AA.Alloca->eraseFromParent();
1164}
1165
1166std::pair<Value *, Value *>
1167AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1168 Function &F = *Builder.GetInsertBlock()->getParent();
1169 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1170
1171 if (!IsAMDHSA) {
1172 CallInst *LocalSizeY =
1173 Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_y, Args: {});
1174 CallInst *LocalSizeZ =
1175 Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_z, Args: {});
1176
1177 ST.makeLIDRangeMetadata(I: LocalSizeY);
1178 ST.makeLIDRangeMetadata(I: LocalSizeZ);
1179
1180 return std::pair(LocalSizeY, LocalSizeZ);
1181 }
1182
1183 // We must read the size out of the dispatch pointer.
1184 assert(IsAMDGCN);
1185
1186 // We are indexing into this struct, and want to extract the workgroup_size_*
1187 // fields.
1188 //
1189 // typedef struct hsa_kernel_dispatch_packet_s {
1190 // uint16_t header;
1191 // uint16_t setup;
1192 // uint16_t workgroup_size_x ;
1193 // uint16_t workgroup_size_y;
1194 // uint16_t workgroup_size_z;
1195 // uint16_t reserved0;
1196 // uint32_t grid_size_x ;
1197 // uint32_t grid_size_y ;
1198 // uint32_t grid_size_z;
1199 //
1200 // uint32_t private_segment_size;
1201 // uint32_t group_segment_size;
1202 // uint64_t kernel_object;
1203 //
1204 // #ifdef HSA_LARGE_MODEL
1205 // void *kernarg_address;
1206 // #elif defined HSA_LITTLE_ENDIAN
1207 // void *kernarg_address;
1208 // uint32_t reserved1;
1209 // #else
1210 // uint32_t reserved1;
1211 // void *kernarg_address;
1212 // #endif
1213 // uint64_t reserved2;
1214 // hsa_signal_t completion_signal; // uint64_t wrapper
1215 // } hsa_kernel_dispatch_packet_t
1216 //
1217 CallInst *DispatchPtr =
1218 Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_dispatch_ptr, Args: {});
1219 DispatchPtr->addRetAttr(Kind: Attribute::NoAlias);
1220 DispatchPtr->addRetAttr(Kind: Attribute::NonNull);
1221 F.removeFnAttr(Kind: "amdgpu-no-dispatch-ptr");
1222
1223 // Size of the dispatch packet struct.
1224 DispatchPtr->addDereferenceableRetAttr(Bytes: 64);
1225
1226 Type *I32Ty = Type::getInt32Ty(C&: Mod->getContext());
1227
1228 // We could do a single 64-bit load here, but it's likely that the basic
1229 // 32-bit and extract sequence is already present, and it is probably easier
1230 // to CSE this. The loads should be mergeable later anyway.
1231 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 1);
1232 LoadInst *LoadXY = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPXY, Align: Align(4));
1233
1234 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 2);
1235 LoadInst *LoadZU = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPZU, Align: Align(4));
1236
1237 MDNode *MD = MDNode::get(Context&: Mod->getContext(), MDs: {});
1238 LoadXY->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
1239 LoadZU->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
1240 ST.makeLIDRangeMetadata(I: LoadZU);
1241
1242 // Extract y component. Upper half of LoadZU should be zero already.
1243 Value *Y = Builder.CreateLShr(LHS: LoadXY, RHS: 16);
1244
1245 return std::pair(Y, LoadZU);
1246}
1247
1248Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1249 unsigned N) {
1250 Function *F = Builder.GetInsertBlock()->getParent();
1251 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: *F);
1252 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1253 StringRef AttrName;
1254
1255 switch (N) {
1256 case 0:
1257 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1258 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1259 AttrName = "amdgpu-no-workitem-id-x";
1260 break;
1261 case 1:
1262 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1263 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1264 AttrName = "amdgpu-no-workitem-id-y";
1265 break;
1266
1267 case 2:
1268 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1269 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1270 AttrName = "amdgpu-no-workitem-id-z";
1271 break;
1272 default:
1273 llvm_unreachable("invalid dimension");
1274 }
1275
1276 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(M: Mod, id: IntrID);
1277 CallInst *CI = Builder.CreateCall(Callee: WorkitemIdFn);
1278 ST.makeLIDRangeMetadata(I: CI);
1279 F->removeFnAttr(Kind: AttrName);
1280
1281 return CI;
1282}
1283
1284static bool isCallPromotable(CallInst *CI) {
1285 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI);
1286 if (!II)
1287 return false;
1288
1289 switch (II->getIntrinsicID()) {
1290 case Intrinsic::memcpy:
1291 case Intrinsic::memmove:
1292 case Intrinsic::memset:
1293 case Intrinsic::lifetime_start:
1294 case Intrinsic::lifetime_end:
1295 case Intrinsic::invariant_start:
1296 case Intrinsic::invariant_end:
1297 case Intrinsic::launder_invariant_group:
1298 case Intrinsic::strip_invariant_group:
1299 case Intrinsic::objectsize:
1300 return true;
1301 default:
1302 return false;
1303 }
1304}
1305
1306bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1307 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1308 int OpIdx1) const {
1309 // Figure out which operand is the one we might not be promoting.
1310 Value *OtherOp = Inst->getOperand(i: OpIdx0);
1311 if (Val == OtherOp)
1312 OtherOp = Inst->getOperand(i: OpIdx1);
1313
1314 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: OtherOp))
1315 return true;
1316
1317 // TODO: getUnderlyingObject will not work on a vector getelementptr
1318 Value *OtherObj = getUnderlyingObject(V: OtherOp);
1319 if (!isa<AllocaInst>(Val: OtherObj))
1320 return false;
1321
1322 // TODO: We should be able to replace undefs with the right pointer type.
1323
1324 // TODO: If we know the other base object is another promotable
1325 // alloca, not necessarily this alloca, we can do this. The
1326 // important part is both must have the same address space at
1327 // the end.
1328 if (OtherObj != BaseAlloca) {
1329 LLVM_DEBUG(
1330 dbgs() << "Found a binary instruction with another alloca object\n");
1331 return false;
1332 }
1333
1334 return true;
1335}
1336
1337void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
1338 if (DisablePromoteAllocaToLDS) {
1339 LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1340 return;
1341 }
1342
  // Don't promote the alloca to LDS for shader calling conventions, as the
  // work item ID intrinsics are not supported for these calling conventions.
  // Furthermore, not all LDS is available in some of the shader stages.
1346 const Function &ContainingFunction = *AA.Alloca->getFunction();
1347 CallingConv::ID CC = ContainingFunction.getCallingConv();
1348
1349 switch (CC) {
1350 case CallingConv::AMDGPU_KERNEL:
1351 case CallingConv::SPIR_KERNEL:
1352 break;
1353 default:
1354 LLVM_DEBUG(
1355 dbgs()
1356 << " promote alloca to LDS not supported with calling convention.\n");
1357 return;
1358 }
1359
1360 for (Use *Use : AA.Uses) {
1361 auto *User = Use->getUser();
1362
1363 if (CallInst *CI = dyn_cast<CallInst>(Val: User)) {
1364 if (!isCallPromotable(CI))
1365 return;
1366
1367 if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
1368 AA.LDS.Worklist.push_back(Elt: User);
1369 continue;
1370 }
1371
1372 Instruction *UseInst = cast<Instruction>(Val: User);
1373 if (UseInst->getOpcode() == Instruction::PtrToInt)
1374 return;
1375
1376 if (LoadInst *LI = dyn_cast<LoadInst>(Val: UseInst)) {
1377 if (LI->isVolatile())
1378 return;
1379 continue;
1380 }
1381
1382 if (StoreInst *SI = dyn_cast<StoreInst>(Val: UseInst)) {
1383 if (SI->isVolatile())
1384 return;
1385 continue;
1386 }
1387
1388 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UseInst)) {
1389 if (RMW->isVolatile())
1390 return;
1391 continue;
1392 }
1393
1394 if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(Val: UseInst)) {
1395 if (CAS->isVolatile())
1396 return;
1397 continue;
1398 }
1399
1400 // Only promote a select if we know that the other select operand
1401 // is from another pointer that will also be promoted.
1402 if (ICmpInst *ICmp = dyn_cast<ICmpInst>(Val: UseInst)) {
1403 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca: AA.Alloca, Val: Use->get(), Inst: ICmp, OpIdx0: 0, OpIdx1: 1))
1404 return;
1405
1406 // May need to rewrite constant operands.
1407 if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
1408 AA.LDS.Worklist.push_back(Elt: ICmp);
1409 continue;
1410 }
1411
1412 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: UseInst)) {
1413 // Be conservative if an address could be computed outside the bounds of
1414 // the alloca.
1415 if (!GEP->isInBounds())
1416 return;
1417 } else if (!isa<ExtractElementInst, SelectInst, PHINode>(Val: User)) {
1418 // Do not promote vector/aggregate type instructions. It is hard to track
1419 // their users.
1420
1421 // Do not promote addrspacecast.
1422 //
1423 // TODO: If we know the address is only observed through flat pointers, we
1424 // could still promote.
1425 return;
1426 }
1427
1428 if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
1429 AA.LDS.Worklist.push_back(Elt: User);
1430 }
1431
1432 AA.LDS.Enable = true;
1433}
1434
1435bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1436
1437 FunctionType *FTy = F.getFunctionType();
1438 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1439
1440 // If the function has any arguments in the local address space, then it's
1441 // possible these arguments require the entire local memory space, so
1442 // we cannot use local memory in the pass.
1443 for (Type *ParamTy : FTy->params()) {
1444 PointerType *PtrTy = dyn_cast<PointerType>(Val: ParamTy);
1445 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1446 LocalMemLimit = 0;
1447 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1448 "local memory disabled.\n");
1449 return false;
1450 }
1451 }
1452
1453 LocalMemLimit = ST.getAddressableLocalMemorySize();
1454 if (LocalMemLimit == 0)
1455 return false;
1456
1457 SmallVector<const Constant *, 16> Stack;
1458 SmallPtrSet<const Constant *, 8> VisitedConstants;
1459 SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
1460
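  // Returns true if the given constant (rooted at GV) is used by an
  // instruction in F; any constant users found along the way are pushed onto
  // Stack for later traversal.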
1461 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1462 for (const User *U : Val->users()) {
1463 if (const Instruction *Use = dyn_cast<Instruction>(Val: U)) {
1464 if (Use->getFunction() == &F)
1465 return true;
1466 } else {
1467 const Constant *C = cast<Constant>(Val: U);
1468 if (VisitedConstants.insert(Ptr: C).second)
1469 Stack.push_back(Elt: C);
1470 }
1471 }
1472
1473 return false;
1474 };
1475
1476 for (GlobalVariable &GV : Mod->globals()) {
1477 if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
1478 continue;
1479
1480 if (visitUsers(&GV, &GV)) {
1481 UsedLDS.insert(Ptr: &GV);
1482 Stack.clear();
1483 continue;
1484 }
1485
    // For any ConstantExpr uses, recursively search through their users until
    // we find an instruction inside this function.
1488 while (!Stack.empty()) {
1489 const Constant *C = Stack.pop_back_val();
1490 if (visitUsers(&GV, C)) {
1491 UsedLDS.insert(Ptr: &GV);
1492 Stack.clear();
1493 break;
1494 }
1495 }
1496 }
1497
1498 const DataLayout &DL = Mod->getDataLayout();
1499 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1500 AllocatedSizes.reserve(N: UsedLDS.size());
1501
1502 for (const GlobalVariable *GV : UsedLDS) {
1503 Align Alignment =
1504 DL.getValueOrABITypeAlignment(Alignment: GV->getAlign(), Ty: GV->getValueType());
1505 uint64_t AllocSize = GV->getGlobalSize(DL);
1506
1507 // HIP uses an extern unsized array in local address space for dynamically
1508 // allocated shared memory. In that case, we have to disable the promotion.
1509 if (GV->hasExternalLinkage() && AllocSize == 0) {
1510 LocalMemLimit = 0;
1511 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1512 "local memory. Promoting to local memory "
1513 "disabled.\n");
1514 return false;
1515 }
1516
1517 AllocatedSizes.emplace_back(Args&: AllocSize, Args&: Alignment);
1518 }
1519
1520 // Sort to try to estimate the worst case alignment padding
1521 //
1522 // FIXME: We should really do something to fix the addresses to a more optimal
1523 // value instead
1524 llvm::sort(C&: AllocatedSizes, Comp: llvm::less_second());
1525
1526 // Check how much local memory is being used by global objects
1527 CurrentLocalMemUsage = 0;
1528
1529 // FIXME: Try to account for padding here. The real padding and address is
1530 // currently determined from the inverse order of uses in the function when
1531 // legalizing, which could also potentially change. We try to estimate the
1532 // worst case here, but we probably should fix the addresses earlier.
1533 for (auto Alloc : AllocatedSizes) {
1534 CurrentLocalMemUsage = alignTo(Size: CurrentLocalMemUsage, A: Alloc.second);
1535 CurrentLocalMemUsage += Alloc.first;
1536 }
1537
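// Ask the subtarget how many waves per EU the current LDS usage still allows;
// the second element of the returned range is the achievable maximum.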
1538 unsigned MaxOccupancy =
1539 ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes: CurrentLocalMemUsage, F)
1540 .second;
1541
1542 // Round up to the next tier of usage.
1543 unsigned MaxSizeWithWaveCount =
1544 ST.getMaxLocalMemSizeWithWaveCount(WaveCount: MaxOccupancy, F);
1545
1546 // Program may already use more LDS than is usable at maximum occupancy.
1547 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1548 return false;
1549
1550 LocalMemLimit = MaxSizeWithWaveCount;
1551
1552 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1553 << " bytes of LDS\n"
1554 << " Rounding size to " << MaxSizeWithWaveCount
1555 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1556 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1557 << " available for promotion\n");
1558
1559 return true;
1560}
1561
1562 // FIXME: Should try to pick the allocas most likely to be profitable first.
1563bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
1564 AllocaAnalysis &AA, bool SufficientLDS,
1565 SetVector<IntrinsicInst *> &DeferredIntrs) {
1566 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
1567
1568 // Not likely to have sufficient local memory for promotion.
1569 if (!SufficientLDS)
1570 return false;
1571
1572 const DataLayout &DL = Mod->getDataLayout();
1573 IRBuilder<> Builder(AA.Alloca);
1574
1575 const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
1576 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: ContainingFunction);
1577 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F: ContainingFunction).second;
1578
1579 Align Alignment = DL.getValueOrABITypeAlignment(
1580 Alignment: AA.Alloca->getAlign(), Ty: AA.Alloca->getAllocatedType());
1581
1582 // FIXME: This computed padding is likely wrong since it depends on inverse
1583 // usage order.
1584 //
1585 // FIXME: It is also possible that if we're allowed to use all of the memory,
1586 // we could end up using more than the maximum due to alignment padding.
1587
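// Reserve one copy of the alloca per workitem in the flat workgroup on top of
// the LDS already in use, and give up if that exceeds the limit.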
1588 uint32_t NewSize = alignTo(Size: CurrentLocalMemUsage, A: Alignment);
1589 uint32_t AllocSize =
1590 WorkGroupSize * DL.getTypeAllocSize(Ty: AA.Alloca->getAllocatedType());
1591 NewSize += AllocSize;
1592
1593 if (NewSize > LocalMemLimit) {
1594 LLVM_DEBUG(dbgs() << " " << AllocSize
1595 << " bytes of local memory not available to promote\n");
1596 return false;
1597 }
1598
1599 CurrentLocalMemUsage = NewSize;
1600
1601 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1602
1603 Function *F = AA.Alloca->getFunction();
1604
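// Materialize the promoted storage as an internal LDS array with one element
// per workitem; each lane indexes its own slot via the workitem id below.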
1605 Type *GVTy = ArrayType::get(ElementType: AA.Alloca->getAllocatedType(), NumElements: WorkGroupSize);
1606 GlobalVariable *GV = new GlobalVariable(
1607 *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(T: GVTy),
1608 Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
1609 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1610 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1611 GV->setAlignment(AA.Alloca->getAlign());
1612
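// Compute the linearized workitem id used to index into the array:
//   TID = TIdX * TCntY * TCntZ + TIdY * TCntZ + TIdZ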
1613 Value *TCntY, *TCntZ;
1614
1615 std::tie(args&: TCntY, args&: TCntZ) = getLocalSizeYZ(Builder);
1616 Value *TIdX = getWorkitemID(Builder, N: 0);
1617 Value *TIdY = getWorkitemID(Builder, N: 1);
1618 Value *TIdZ = getWorkitemID(Builder, N: 2);
1619
1620 Value *Tmp0 = Builder.CreateMul(LHS: TCntY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true);
1621 Tmp0 = Builder.CreateMul(LHS: Tmp0, RHS: TIdX);
1622 Value *Tmp1 = Builder.CreateMul(LHS: TIdY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true);
1623 Value *TID = Builder.CreateAdd(LHS: Tmp0, RHS: Tmp1);
1624 TID = Builder.CreateAdd(LHS: TID, RHS: TIdZ);
1625
1626 LLVMContext &Context = Mod->getContext();
1627 Value *Indices[] = {Constant::getNullValue(Ty: Type::getInt32Ty(C&: Context)), TID};
1628
1629 Value *Offset = Builder.CreateInBoundsGEP(Ty: GVTy, Ptr: GV, IdxList: Indices);
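// Mutate the alloca's type to the LDS pointer type first so that replacing all
// of its uses with the GEP into the new global is type-correct.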
1630 AA.Alloca->mutateType(Ty: Offset->getType());
1631 AA.Alloca->replaceAllUsesWith(V: Offset);
1632 AA.Alloca->eraseFromParent();
1633
1634 PointerType *NewPtrTy = PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
1635
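// Rewrite the recorded users of the alloca: null pointer constants in compares,
// selects, and phis are re-created in the LDS address space, other pointer
// values are mutated in place, and intrinsic calls are handled case by case in
// the switch below.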
1636 for (Value *V : AA.LDS.Worklist) {
1637 CallInst *Call = dyn_cast<CallInst>(Val: V);
1638 if (!Call) {
1639 if (ICmpInst *CI = dyn_cast<ICmpInst>(Val: V)) {
1640 Value *LHS = CI->getOperand(i_nocapture: 0);
1641 Value *RHS = CI->getOperand(i_nocapture: 1);
1642
1643 Type *NewTy = LHS->getType()->getWithNewType(EltTy: NewPtrTy);
1644 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: LHS))
1645 CI->setOperand(i_nocapture: 0, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1646
1647 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: RHS))
1648 CI->setOperand(i_nocapture: 1, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1649
1650 continue;
1651 }
1652
1653 // An addrspacecast's pointer operand will be rewritten on its own; the cast
1654 // and its users do not need to be touched here.
1655 if (isa<AddrSpaceCastInst>(Val: V))
1656 continue;
1657
1658 assert(V->getType()->isPtrOrPtrVectorTy());
1659
1660 Type *NewTy = V->getType()->getWithNewType(EltTy: NewPtrTy);
1661 V->mutateType(Ty: NewTy);
1662
1663 // Adjust the types of any constant operands.
1664 if (SelectInst *SI = dyn_cast<SelectInst>(Val: V)) {
1665 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: SI->getOperand(i_nocapture: 1)))
1666 SI->setOperand(i_nocapture: 1, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1667
1668 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: SI->getOperand(i_nocapture: 2)))
1669 SI->setOperand(i_nocapture: 2, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1670 } else if (PHINode *Phi = dyn_cast<PHINode>(Val: V)) {
1671 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1672 if (isa<ConstantPointerNull, ConstantAggregateZero>(
1673 Val: Phi->getIncomingValue(i: I)))
1674 Phi->setIncomingValue(i: I, V: Constant::getNullValue(Ty: NewTy));
1675 }
1676 }
1677
1678 continue;
1679 }
1680
1681 IntrinsicInst *Intr = cast<IntrinsicInst>(Val: Call);
1682 Builder.SetInsertPoint(Intr);
1683 switch (Intr->getIntrinsicID()) {
1684 case Intrinsic::lifetime_start:
1685 case Intrinsic::lifetime_end:
1686 // These intrinsics are for address space 0 only
1687 Intr->eraseFromParent();
1688 continue;
1689 case Intrinsic::memcpy:
1690 case Intrinsic::memmove:
1691 // These have two pointer operands. If the second pointer also needs to
1692 // be replaced, we defer processing of these intrinsics until all other
1693 // values have been processed.
1694 DeferredIntrs.insert(X: Intr);
1695 continue;
1696 case Intrinsic::memset: {
1697 MemSetInst *MemSet = cast<MemSetInst>(Val: Intr);
1698 Builder.CreateMemSet(Ptr: MemSet->getRawDest(), Val: MemSet->getValue(),
1699 Size: MemSet->getLength(), Align: MemSet->getDestAlign(),
1700 isVolatile: MemSet->isVolatile());
1701 Intr->eraseFromParent();
1702 continue;
1703 }
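// These intrinsics are overloaded on the pointer type, so they must be
// re-created with the new LDS pointer operand.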
1704 case Intrinsic::invariant_start:
1705 case Intrinsic::invariant_end:
1706 case Intrinsic::launder_invariant_group:
1707 case Intrinsic::strip_invariant_group: {
1708 SmallVector<Value *> Args;
1709 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1710 Args.emplace_back(Args: Intr->getArgOperand(i: 0));
1711 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1712 Args.emplace_back(Args: Intr->getArgOperand(i: 0));
1713 Args.emplace_back(Args: Intr->getArgOperand(i: 1));
1714 }
1715 Args.emplace_back(Args&: Offset);
1716 Function *F = Intrinsic::getOrInsertDeclaration(
1717 M: Intr->getModule(), id: Intr->getIntrinsicID(), Tys: Offset->getType());
1718 CallInst *NewIntr =
1719 CallInst::Create(Func: F, Args, NameStr: Intr->getName(), InsertBefore: Intr->getIterator());
1720 Intr->mutateType(Ty: NewIntr->getType());
1721 Intr->replaceAllUsesWith(V: NewIntr);
1722 Intr->eraseFromParent();
1723 continue;
1724 }
1725 case Intrinsic::objectsize: {
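// Rebuild llvm.objectsize with the LDS pointer overload, forwarding the
// remaining arguments unchanged.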
1726 Value *Src = Intr->getOperand(i_nocapture: 0);
1727
1728 CallInst *NewCall = Builder.CreateIntrinsic(
1729 ID: Intrinsic::objectsize,
1730 Types: {Intr->getType(), PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS)},
1731 Args: {Src, Intr->getOperand(i_nocapture: 1), Intr->getOperand(i_nocapture: 2), Intr->getOperand(i_nocapture: 3)});
1732 Intr->replaceAllUsesWith(V: NewCall);
1733 Intr->eraseFromParent();
1734 continue;
1735 }
1736 default:
1737 Intr->print(O&: errs());
1738 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1739 }
1740 }
1741
1742 return true;
1743}
1744
1745void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1746 SetVector<IntrinsicInst *> &DeferredIntrs) {
1747
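// memcpy and memmove were deferred because both pointer operands may have been
// rewritten; re-emit them now and preserve any dereferenceable attributes.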
1748 for (IntrinsicInst *Intr : DeferredIntrs) {
1749 // Constructing the builder at Intr already sets the insertion point there.
1750 IRBuilder<> Builder(Intr);
1751 Intrinsic::ID ID = Intr->getIntrinsicID();
1752 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1753
1754 MemTransferInst *MI = cast<MemTransferInst>(Val: Intr);
1755 auto *B = Builder.CreateMemTransferInst(
1756 IntrID: ID, Dst: MI->getRawDest(), DstAlign: MI->getDestAlign(), Src: MI->getRawSource(),
1757 SrcAlign: MI->getSourceAlign(), Size: MI->getLength(), isVolatile: MI->isVolatile());
1758
1759 for (unsigned I = 0; I != 2; ++I) {
1760 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(i: I)) {
1761 B->addDereferenceableParamAttr(i: I, Bytes);
1762 }
1763 }
1764
1765 Intr->eraseFromParent();
1766 }
1767}
1768