1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
// declared in AMDGPU.h and the legacy PM ones are declared here.
25//
26//===----------------------------------------------------------------------===//
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/Analysis/CaptureTracking.h"
33#include "llvm/Analysis/InstSimplifyFolder.h"
34#include "llvm/Analysis/InstructionSimplify.h"
35#include "llvm/Analysis/LoopInfo.h"
36#include "llvm/Analysis/ValueTracking.h"
37#include "llvm/CodeGen/TargetPassConfig.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/PatternMatch.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Support/MathExtras.h"
46#include "llvm/Target/TargetMachine.h"
47#include "llvm/Transforms/Utils/SSAUpdater.h"
48
49#define DEBUG_TYPE "amdgpu-promote-alloca"
50
51using namespace llvm;
52
53namespace {
54
// Escape hatch: turn off the promote-to-vector strategy entirely.
static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(Val: false));

// Escape hatch: turn off the promote-to-LDS strategy entirely.
static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(Val: false));

// When non-zero, overrides the VGPR-derived vectorization budget with a fixed
// byte limit (see the budget computation in run()).
static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(Val: 0));

// Caps the size of any single promoted vector; may also be set per-function
// via the "amdgpu-promote-alloca-to-vector-max-regs" attribute (see
// setFunctionLimits).
static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
    "amdgpu-promote-alloca-to-vector-max-regs",
    cl::desc(
        "Maximum vector size (in 32b registers) to use when promoting alloca"),
    cl::init(Val: 32));

// Use up to 1/4 of available register budget for vectorization.
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
    cl::init(Val: 4));

// Extra score per use located inside a loop, multiplied by loop depth
// (see scoreAlloca).
static cl::opt<unsigned>
    LoopUserWeight("promote-alloca-vector-loop-user-weight",
                   cl::desc("The bonus weight of users of allocas within loop "
                            "when sorting profitable allocas"),
                   cl::init(Val: 4));
88
// We support vector indices of the form (A * stride) + B
// All parts are optional.
struct GEPToVectorIndex {
  Value *VarIndex = nullptr;         // Variable part A; defaults to 0.
  ConstantInt *VarMul = nullptr;     // Stride multiplier for A; defaults to 1.
  ConstantInt *ConstIndex = nullptr; // Constant part B; defaults to 0.
  // Fully materialized index expression; built lazily and cached by
  // calculateVectorIndex().
  Value *Full = nullptr;
};
97
// Element indices of the source and destination of a memtransfer
// (memcpy/memmove) expressed in elements of the promoted vector; consumed
// when lowering the transfer to a shufflevector.
struct MemTransferInfo {
  ConstantInt *SrcIndex = nullptr;
  ConstantInt *DestIndex = nullptr;
};
102
// Analysis for planning the different strategies of alloca promotion.
struct AllocaAnalysis {
  AllocaInst *Alloca = nullptr; // The alloca under analysis.
  DenseSet<Value *> Pointers;   // Alloca plus all pointers derived from it.
  SmallVector<Use *> Uses;      // Every use of any pointer in Pointers.
  unsigned Score = 0;           // Profitability score; see scoreAlloca().
  bool HaveSelectOrPHI = false; // A derived pointer flows through select/phi.
  // State for promotion to a vector register.
  struct {
    FixedVectorType *Ty = nullptr; // Non-null once vector promotion is viable.
    SmallVector<Instruction *> Worklist;
    SmallVector<Instruction *> UsersToRemove;
    MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
    MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
  } Vector;
  // State for promotion to LDS (local memory).
  struct {
    bool Enable = false; // True if promotion to LDS is viable.
    SmallVector<User *> Worklist;
  } LDS;

  explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
};
124
// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  LoopInfo &LI;
  Module *Mod = nullptr;          // Module of the function being processed.
  const DataLayout *DL = nullptr; // Cached data layout of Mod.

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;        // LDS bytes available for promotion.
  uint32_t CurrentLocalMemUsage = 0; // LDS bytes already accounted for.
  unsigned MaxVGPRs;        // VGPR budget for the current function.
  unsigned VGPRBudgetRatio; // Use 1/VGPRBudgetRatio of VGPRs for vectors.
  unsigned MaxVectorRegs;   // Max promoted-vector size, in 32-bit registers.

  bool IsAMDGCN = false; // Target is AMDGCN (as opposed to R600).
  bool IsAMDHSA = false; // Target OS is AMDHSA.

  /// Returns the local workgroup size in the Y and Z dimensions.
  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  /// Returns the workitem ID in dimension \p N.
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  /// Collects all transitive uses of AA.Alloca into AA.Uses/AA.Pointers;
  /// returns false if a use through which the pointer could escape is found.
  bool collectAllocaUses(AllocaAnalysis &AA) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val should
  /// be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst, int OpIdx0,
                                       int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  /// Computes a vector type covering \p AllocaTy, or null if none is suitable.
  FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
  /// Analysis / transform entry points for the vector strategy.
  void analyzePromoteToVector(AllocaAnalysis &AA) const;
  void promoteAllocaToVector(AllocaAnalysis &AA);
  /// Analysis / transform entry points for the LDS strategy.
  void analyzePromoteToLDS(AllocaAnalysis &AA) const;
  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
                             SetVector<IntrinsicInst *> &DeferredIntrs);
  void
  finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);

  /// Computes AA.Score, the profitability estimate used to order allocas.
  void scoreAlloca(AllocaAnalysis &AA) const;

  /// Loads MaxVectorRegs/VGPRBudgetRatio from attributes and cl::opts.
  void setFunctionLimits(const Function &F);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {

    const Triple &TT = TM.getTargetTriple();
    IsAMDGCN = TT.isAMDGCN();
    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
  }

  /// Runs promotion over \p F; \p PromoteToLDS enables the LDS fallback.
  /// Returns true if the IR was changed.
  bool run(Function &F, bool PromoteToLDS);
};
182
// Legacy-PM wrapper around AMDGPUPromoteAllocaImpl (vector + LDS promotion).
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
public:
  static char ID; // Pass identification.

  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;
    // Without a TargetPassConfig there is no target machine to query, so the
    // pass conservatively does nothing.
    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
      return AMDGPUPromoteAllocaImpl(
                 TPC->getTM<TargetMachine>(),
                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
          .run(F, /*PromoteToLDS*/ true);
    return false;
  }

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Allocas are rewritten in place; the CFG is never restructured.
    AU.setPreservesCFG();
    AU.addRequired<LoopInfoWrapperPass>();
    FunctionPass::getAnalysisUsage(AU);
  }
};
209
210static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
211 const Function &F) {
212 if (!TM.getTargetTriple().isAMDGCN())
213 return 128;
214
215 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
216
217 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
218 // Temporarily check both the attribute and the subtarget feature, until the
219 // latter is removed.
220 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
221 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
222
223 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
224 WavesPerEU: ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
225 DynamicVGPRBlockSize);
226
227 // A non-entry function has only 32 caller preserved registers.
228 // Do not promote alloca which will force spilling unless we know the function
229 // will be inlined.
230 if (!F.hasFnAttribute(Kind: Attribute::AlwaysInline) &&
231 !AMDGPU::isEntryFunctionCC(CC: F.getCallingConv()))
232 MaxVGPRs = std::min(a: MaxVGPRs, b: 32u);
233 return MaxVGPRs;
234}
235
236} // end anonymous namespace
237
char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

// Externally visible handle used to reference the legacy pass by ID.
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
250
251PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
252 FunctionAnalysisManager &AM) {
253 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
254 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
255 if (Changed) {
256 PreservedAnalyses PA;
257 PA.preserveSet<CFGAnalyses>();
258 return PA;
259 }
260 return PreservedAnalyses::all();
261}
262
263PreservedAnalyses
264AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
265 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
266 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
267 if (Changed) {
268 PreservedAnalyses PA;
269 PA.preserveSet<CFGAnalyses>();
270 return PA;
271 }
272 return PreservedAnalyses::all();
273}
274
// Factory for the legacy-PM pass wrapper.
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  return new AMDGPUPromoteAlloca();
}
278
279bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
280 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
281 LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
282 << " " << *Inst << "\n");
283 return false;
284 };
285
286 SmallVector<Instruction *, 4> WorkList({AA.Alloca});
287 while (!WorkList.empty()) {
288 auto *Cur = WorkList.pop_back_val();
289 if (find(Range&: AA.Pointers, Val: Cur) != AA.Pointers.end())
290 continue;
291 AA.Pointers.insert(V: Cur);
292 for (auto &U : Cur->uses()) {
293 auto *Inst = cast<Instruction>(Val: U.getUser());
294 if (isa<StoreInst>(Val: Inst)) {
295 if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
296 return RejectUser(Inst, "pointer escapes via store");
297 }
298 }
299 AA.Uses.push_back(Elt: &U);
300
301 if (isa<GetElementPtrInst>(Val: U.getUser())) {
302 WorkList.push_back(Elt: Inst);
303 } else if (auto *SI = dyn_cast<SelectInst>(Val: Inst)) {
304 // Only promote a select if we know that the other select operand is
305 // from another pointer that will also be promoted.
306 if (!binaryOpIsDerivedFromSameAlloca(Alloca: AA.Alloca, Val: Cur, UseInst: SI, OpIdx0: 1, OpIdx1: 2))
307 return RejectUser(Inst, "select from mixed objects");
308 WorkList.push_back(Elt: Inst);
309 AA.HaveSelectOrPHI = true;
310 } else if (auto *Phi = dyn_cast<PHINode>(Val: Inst)) {
311 // Repeat for phis.
312
313 // TODO: Handle more complex cases. We should be able to replace loops
314 // over arrays.
315 switch (Phi->getNumIncomingValues()) {
316 case 1:
317 break;
318 case 2:
319 if (!binaryOpIsDerivedFromSameAlloca(Alloca: AA.Alloca, Val: Cur, UseInst: Phi, OpIdx0: 0, OpIdx1: 1))
320 return RejectUser(Inst, "phi from mixed objects");
321 break;
322 default:
323 return RejectUser(Inst, "phi with too many operands");
324 }
325
326 WorkList.push_back(Elt: Inst);
327 AA.HaveSelectOrPHI = true;
328 }
329 }
330 }
331 return true;
332}
333
334void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
335 LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
336 unsigned Score = 0;
337 // Increment score by one for each user + a bonus for users within loops.
338 for (auto *U : AA.Uses) {
339 Instruction *Inst = cast<Instruction>(Val: U->getUser());
340 if (isa<GetElementPtrInst>(Val: Inst) || isa<SelectInst>(Val: Inst) ||
341 isa<PHINode>(Val: Inst))
342 continue;
343 unsigned UserScore =
344 1 + (LoopUserWeight * LI.getLoopDepth(BB: Inst->getParent()));
345 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
346 Score += UserScore;
347 }
348 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
349 AA.Score = Score;
350}
351
352void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
353 // Load per function limits, overriding with global options where appropriate.
354 // R600 register tuples/aliasing are fragile with large vector promotions so
355 // apply architecture specific limit here.
356 const int R600MaxVectorRegs = 16;
357 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
358 Kind: "amdgpu-promote-alloca-to-vector-max-regs",
359 Default: IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
360 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
361 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
362 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
363 Kind: "amdgpu-promote-alloca-to-vector-vgpr-ratio",
364 Default: PromoteAllocaToVectorVGPRRatio);
365 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
366 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
367}
368
// Driver: analyzes every static alloca in the entry block, scores and sorts
// the promotable ones, then promotes to vector within a register budget and
// falls back to LDS (when \p PromoteToLDS) for the rest.
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
  if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
    return false;

  Mod = F.getParent();
  DL = &Mod->getDataLayout();

  bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
  MaxVGPRs = getMaxVGPRs(LDSBytes: CurrentLocalMemUsage, TM, F);
  setFunctionLimits(F);

  // Budget is measured in bits: either the user-supplied byte limit, or a
  // 1/VGPRBudgetRatio fraction of the available VGPR bits (32 per VGPR).
  unsigned VectorizationBudget =
      (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                  : (MaxVGPRs * 32)) /
      VGPRBudgetRatio;

  std::vector<AllocaAnalysis> Allocas;
  for (Instruction &I : F.getEntryBlock()) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: &I)) {
      // Array allocations are probably not worth handling, since an allocation
      // of the array type is the canonical form.
      if (!AI->isStaticAlloca() || AI->isArrayAllocation())
        continue;

      LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');

      AllocaAnalysis AA{AI};
      if (collectAllocaUses(AA)) {
        analyzePromoteToVector(AA);
        if (PromoteToLDS)
          analyzePromoteToLDS(AA);
        // Keep only allocas at least one strategy can handle.
        if (AA.Vector.Ty || AA.LDS.Enable) {
          scoreAlloca(AA);
          Allocas.push_back(x: std::move(AA));
        }
      }
    }
  }

  // Highest score first so the most profitable allocas get the budget.
  stable_sort(Range&: Allocas,
              C: [](const auto &A, const auto &B) { return A.Score > B.Score; });

  // clang-format off
  LLVM_DEBUG(
    dbgs() << "Sorted Worklist:\n";
    for (const auto &AA : Allocas)
      dbgs() << " " << *AA.Alloca << "\n";
  );
  // clang-format on

  bool Changed = false;
  SetVector<IntrinsicInst *> DeferredIntrs;
  for (AllocaAnalysis &AA : Allocas) {
    if (AA.Vector.Ty) {
      std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(DL: *DL);
      if (!Size)
        continue; // Skip dynamic allocas
      // Cost in bits, same unit as the budget.
      const unsigned AllocaCost = Size->getFixedValue() * 8;
      // First, check if we have enough budget to vectorize this alloca.
      if (AllocaCost <= VectorizationBudget) {
        promoteAllocaToVector(AA);
        Changed = true;
        assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
               "Underflow!");
        VectorizationBudget -= AllocaCost;
        LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
                          << VectorizationBudget << "\n");
        continue;
      } else {
        LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
                          << AllocaCost << ", budget:" << VectorizationBudget
                          << "): " << *AA.Alloca << "\n");
      }
    }

    // LDS fallback: reached when vectorization was not viable or over budget.
    if (AA.LDS.Enable &&
        tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
      Changed = true;
  }
  finishDeferredAllocaToLDSPromotion(DeferredIntrs);

  // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
  // dangling pointers. If we want to reuse it past this point, the loop above
  // would need to be updated to remove successfully promoted allocas.

  return Changed;
}
456
457// Checks if the instruction I is a memset user of the alloca AI that we can
458// deal with. Currently, only non-volatile memsets that affect the whole alloca
459// are handled.
460static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
461 const DataLayout &DL) {
462 using namespace PatternMatch;
463 // For now we only care about non-volatile memsets that affect the whole type
464 // (start at index 0 and fill the whole alloca).
465 //
466 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
467 // (except maybe volatile ones?) - we just need to use shufflevector if it
468 // only affects a subset of the vector.
469 const unsigned Size = DL.getTypeStoreSize(Ty: AI->getAllocatedType());
470 return I->getOperand(i_nocapture: 0) == AI &&
471 match(V: I->getOperand(i_nocapture: 2), P: m_SpecificInt(V: Size)) && !I->isVolatile();
472}
473
// Returns the i32 element index that \p Ptr addresses within the promoted
// vector of AA.Alloca. The alloca itself maps to index 0; any GEP must have a
// precomputed entry in AA.Vector.GEPVectorIdx. The materialized index
// expression is cached in GEPToVectorIndex::Full so repeated queries reuse
// the same instructions.
static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
  IRBuilder<> B(Ptr->getContext());

  Ptr = Ptr->stripPointerCasts();
  if (Ptr == AA.Alloca)
    return B.getInt32(C: 0);

  auto *GEP = cast<GetElementPtrInst>(Val: Ptr);
  auto I = AA.Vector.GEPVectorIdx.find(Key: GEP);
  assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");

  if (!I->second.Full) {
    // Materialize (VarIndex * VarMul) + ConstIndex at the GEP, omitting the
    // parts that are absent.
    Value *Result = nullptr;
    B.SetInsertPoint(GEP);

    if (I->second.VarIndex) {
      Result = I->second.VarIndex;
      // Vector element indices are 32-bit; normalize the width.
      Result = B.CreateSExtOrTrunc(V: Result, DestTy: B.getInt32Ty());

      if (I->second.VarMul)
        Result = B.CreateMul(LHS: Result, RHS: I->second.VarMul);
    }

    if (I->second.ConstIndex) {
      if (Result)
        Result = B.CreateAdd(LHS: Result, RHS: I->second.ConstIndex);
      else
        Result = I->second.ConstIndex;
    }

    if (!Result)
      Result = B.getInt32(C: 0);

    // Cache so later users of the same GEP share the computation.
    I->second.Full = Result;
  }

  return I->second.Full;
}
512
/// Decomposes a (possibly nested) GEP chain rooted at \p Alloca into a vector
/// index of the form (VarIndex * VarMul) + ConstIndex, measured in elements
/// of \p VecElemTy. Returns an empty optional when the byte offset cannot be
/// expressed that way (more than one variable term, or an offset that is not
/// a multiple of the element size).
static std::optional<GEPToVectorIndex>
computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
                        Type *VecElemTy, const DataLayout &DL) {
  // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
  // helper.
  LLVMContext &Ctx = GEP->getContext();
  unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType());
  SmallMapVector<Value *, APInt, 4> VarOffsets;
  APInt ConstOffset(BW, 0);

  // Walk backwards through nested GEPs to collect both constant and variable
  // offsets, so that nested vector GEP chains can be lowered in one step.
  //
  // Given this IR fragment as input:
  //
  //   %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
  //   %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
  //   %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
  //   %3 = load i32, ptr addrspace(5) %2, align 4
  //
  // Combine both GEP operations in a single pass, producing:
  //   BasePtr      = %0
  //   ConstOffset  = 4
  //   VarOffsets   = { %j -> element_size(<2 x i32>) }
  //
  // That lets us emit a single buffer_load directly into a VGPR, without ever
  // allocating scratch memory for the intermediate pointer.
  Value *CurPtr = GEP;
  while (auto *CurGEP = dyn_cast<GetElementPtrInst>(Val: CurPtr)) {
    if (!CurGEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ConstOffset))
      return {};

    // Move to the next outer pointer.
    CurPtr = CurGEP->getPointerOperand();
  }

  assert(CurPtr == Alloca && "GEP not based on alloca");

  int64_t VecElemSize = DL.getTypeAllocSize(Ty: VecElemTy);
  // At most one variable term is representable in GEPToVectorIndex.
  if (VarOffsets.size() > 1)
    return {};

  // The constant byte offset must be an exact element count.
  APInt IndexQuot;
  int64_t Rem;
  APInt::sdivrem(LHS: ConstOffset, RHS: VecElemSize, Quotient&: IndexQuot, Remainder&: Rem);
  if (Rem != 0)
    return {};

  GEPToVectorIndex Result;

  if (!ConstOffset.isZero())
    Result.ConstIndex = ConstantInt::get(Context&: Ctx, V: IndexQuot.sextOrTrunc(width: BW));

  if (VarOffsets.empty())
    return Result;

  // Likewise the variable term's scale must be a whole (non-zero) number of
  // elements.
  const auto &VarOffset = VarOffsets.front();
  APInt OffsetQuot;
  APInt::sdivrem(LHS: VarOffset.second, RHS: VecElemSize, Quotient&: OffsetQuot, Remainder&: Rem);
  if (Rem != 0 || OffsetQuot.isZero())
    return {};

  Result.VarIndex = VarOffset.first;
  auto *OffsetType = dyn_cast<IntegerType>(Val: Result.VarIndex->getType());
  if (!OffsetType)
    return {};

  // Omit the multiplier when it is one so callers can skip emitting the mul.
  if (!OffsetQuot.isOne())
    Result.VarMul = ConstantInt::get(Context&: Ctx, V: OffsetQuot.sextOrTrunc(width: BW));

  return Result;
}
585
/// Promotes a single user of the alloca to a vector form.
///
/// \param Inst Instruction to be promoted.
/// \param DL Module Data Layout.
/// \param AA Alloca Analysis.
/// \param VecStoreSize Size of AA.Vector.Ty in bytes.
/// \param ElementSize Size of AA.Vector.Ty's element type in bytes.
/// \param GetCurVal Callback returning the current value of the vector
///        (e.g. the last stored value); only invoked when the rewrite
///        actually needs the current contents.
/// \return the stored value if \p Inst would have written to the alloca, or
///         nullptr otherwise.
static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
                                        AllocaAnalysis &AA,
                                        unsigned VecStoreSize,
                                        unsigned ElementSize,
                                        function_ref<Value *()> GetCurVal) {
  // Note: we use InstSimplifyFolder because it can leverage the DataLayout
  // to do more folding, especially in the case of vector splats.
  IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
                                        InstSimplifyFolder(DL));
  Builder.SetInsertPoint(Inst);

  Type *VecEltTy = AA.Vector.Ty->getElementType();

  switch (Inst->getOpcode()) {
  case Instruction::Load: {
    Value *CurVal = GetCurVal();
    Value *Index =
        calculateVectorIndex(Ptr: cast<LoadInst>(Val: Inst)->getPointerOperand(), AA);

    // We're loading the full vector.
    Type *AccessTy = Inst->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
    if (Constant *CI = dyn_cast<Constant>(Val: Index)) {
      if (CI->isNullValue() && AccessSize == VecStoreSize) {
        Inst->replaceAllUsesWith(
            V: Builder.CreateBitPreservingCastChain(DL, V: CurVal, NewTy: AccessTy));
        return nullptr;
      }
    }

    // Loading a subvector.
    if (isa<FixedVectorType>(Val: AccessTy)) {
      assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
      const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
      auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumLoadedElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      // If idx is dynamic, then sandwich load with bitcasts.
      // ie. VectorTy SubVecTy AccessTy
      // <64 x i8> -> <16 x i8> <8 x i16>
      // <64 x i8> -> <4 x i128> -> i128 -> <8 x i16>
      // Extracting subvector with dynamic index has very large expansion in
      // the amdgpu backend. Limit to pow2.
      FixedVectorType *VectorTy = AA.Vector.Ty;
      TypeSize NumBits = DL.getTypeStoreSize(Ty: SubVecTy) * 8u;
      uint64_t LoadAlign = cast<LoadInst>(Val: Inst)->getAlign().value();
      bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
      unsigned TotalNumElts = VectorTy->getNumElements();
      bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
      if (!isa<ConstantInt>(Val: Index) &&
          llvm::isPowerOf2_32(Value: SubVecTy->getNumElements()) &&
          IsProperlyDivisible && IsAlignedLoad) {
        // Reinterpret the vector as fewer, wider elements so the whole
        // subvector is a single dynamically-indexed extractelement.
        IntegerType *NewElemTy = Builder.getIntNTy(N: NumBits);
        const unsigned NewNumElts =
            DL.getTypeStoreSize(Ty: VectorTy) * 8u / NumBits;
        const unsigned LShrAmt = llvm::Log2_32(Value: SubVecTy->getNumElements());
        FixedVectorType *BitCastTy =
            FixedVectorType::get(ElementType: NewElemTy, NumElts: NewNumElts);
        Value *BCVal = Builder.CreateBitCast(V: CurVal, DestTy: BitCastTy);
        Value *NewIdx = Builder.CreateLShr(
            LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: LShrAmt));
        Value *ExtVal = Builder.CreateExtractElement(Vec: BCVal, Idx: NewIdx);
        Value *BCOut = Builder.CreateBitCast(V: ExtVal, DestTy: AccessTy);
        Inst->replaceAllUsesWith(V: BCOut);
        return nullptr;
      }

      // Otherwise gather the subvector element by element.
      Value *SubVec = PoisonValue::get(T: SubVecTy);
      for (unsigned K = 0; K < NumLoadedElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
        SubVec = Builder.CreateInsertElement(
            Vec: SubVec, NewElt: Builder.CreateExtractElement(Vec: CurVal, Idx: CurIdx), Idx: K);
      }

      Inst->replaceAllUsesWith(
          V: Builder.CreateBitPreservingCastChain(DL, V: SubVec, NewTy: AccessTy));
      return nullptr;
    }

    // We're loading one element.
    Value *ExtractElement = Builder.CreateExtractElement(Vec: CurVal, Idx: Index);
    if (AccessTy != VecEltTy)
      ExtractElement = Builder.CreateBitOrPointerCast(V: ExtractElement, DestTy: AccessTy);

    Inst->replaceAllUsesWith(V: ExtractElement);
    return nullptr;
  }
  case Instruction::Store: {
    // For stores, it's a bit trickier and it depends on whether we're storing
    // the full vector or not. If we're storing the full vector, we don't need
    // to know the current value. If this is a store of a single element, we
    // need to know the value.
    StoreInst *SI = cast<StoreInst>(Val: Inst);
    Value *Index = calculateVectorIndex(Ptr: SI->getPointerOperand(), AA);
    Value *Val = SI->getValueOperand();

    // We're storing the full vector, we can handle this without knowing CurVal.
    Type *AccessTy = Val->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
    if (Constant *CI = dyn_cast<Constant>(Val: Index))
      if (CI->isNullValue() && AccessSize == VecStoreSize)
        return Builder.CreateBitPreservingCastChain(DL, V: Val, NewTy: AA.Vector.Ty);

    // Storing a subvector.
    if (isa<FixedVectorType>(Val: AccessTy)) {
      assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
      const unsigned NumWrittenElts =
          AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
      const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
      auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumWrittenElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      Val = Builder.CreateBitPreservingCastChain(DL, V: Val, NewTy: SubVecTy);
      Value *CurVec = GetCurVal();
      // Clamp at NumVecElts so the insert never indexes past the vector end.
      for (unsigned K = 0, NumElts = std::min(a: NumWrittenElts, b: NumVecElts);
           K < NumElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
        CurVec = Builder.CreateInsertElement(
            Vec: CurVec, NewElt: Builder.CreateExtractElement(Vec: Val, Idx: K), Idx: CurIdx);
      }
      return CurVec;
    }

    // Storing one element.
    if (Val->getType() != VecEltTy)
      Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: VecEltTy);
    return Builder.CreateInsertElement(Vec: GetCurVal(), NewElt: Val, Idx: Index);
  }
  case Instruction::Call: {
    if (auto *MTI = dyn_cast<MemTransferInst>(Val: Inst)) {
      // For memcpy, we need to know curval.
      ConstantInt *Length = cast<ConstantInt>(Val: MTI->getLength());
      unsigned NumCopied = Length->getZExtValue() / ElementSize;
      MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
      unsigned SrcBegin = TI->SrcIndex->getZExtValue();
      unsigned DestBegin = TI->DestIndex->getZExtValue();

      // Express the copy as a shuffle: lanes inside the destination range
      // read from the source range, every other lane keeps its value.
      SmallVector<int> Mask;
      for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
        if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
          Mask.push_back(Elt: SrcBegin < AA.Vector.Ty->getNumElements()
                                  ? SrcBegin++
                                  : PoisonMaskElem);
        } else {
          Mask.push_back(Elt: Idx);
        }
      }

      return Builder.CreateShuffleVector(V: GetCurVal(), Mask);
    }

    if (auto *MSI = dyn_cast<MemSetInst>(Val: Inst)) {
      // For memset, we don't need to know the previous value because we
      // currently only allow memsets that cover the whole alloca.
      Value *Elt = MSI->getOperand(i_nocapture: 1);
      const unsigned BytesPerElt = DL.getTypeStoreSize(Ty: VecEltTy);
      if (BytesPerElt > 1) {
        // Widen the single memset byte into a full element.
        Value *EltBytes = Builder.CreateVectorSplat(NumElts: BytesPerElt, V: Elt);

        // If the element type of the vector is a pointer, we need to first cast
        // to an integer, then use a PtrCast.
        if (VecEltTy->isPointerTy()) {
          Type *PtrInt = Builder.getIntNTy(N: BytesPerElt * 8);
          Elt = Builder.CreateBitCast(V: EltBytes, DestTy: PtrInt);
          Elt = Builder.CreateIntToPtr(V: Elt, DestTy: VecEltTy);
        } else
          Elt = Builder.CreateBitCast(V: EltBytes, DestTy: VecEltTy);
      }

      return Builder.CreateVectorSplat(EC: AA.Vector.Ty->getElementCount(), V: Elt);
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        // llvm.objectsize folds to the (fixed) alloc size of the vector.
        Intr->replaceAllUsesWith(
            V: Builder.getIntN(N: Intr->getType()->getIntegerBitWidth(),
                             C: DL.getTypeAllocSize(Ty: AA.Vector.Ty)));
        return nullptr;
      }
    }

    llvm_unreachable("Unsupported call when promoting alloca to vector");
  }

  default:
    llvm_unreachable("Inconsistency in instructions promotable to vector");
  }

  llvm_unreachable("Did not return after promoting instruction!");
}
790
791static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
792 const DataLayout &DL) {
793 // Access as a vector type can work if the size of the access vector is a
794 // multiple of the size of the alloca's vector element type.
795 //
796 // Examples:
797 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
798 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
799 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
800 // - 3*32 is not a multiple of 64
801 //
802 // We could handle more complicated cases, but it'd make things a lot more
803 // complicated.
804 if (isa<FixedVectorType>(Val: AccessTy)) {
805 TypeSize AccTS = DL.getTypeStoreSize(Ty: AccessTy);
806 // If the type size and the store size don't match, we would need to do more
807 // than just bitcast to translate between an extracted/insertable subvectors
808 // and the accessed value.
809 if (AccTS * 8 != DL.getTypeSizeInBits(Ty: AccessTy))
810 return false;
811 TypeSize VecTS = DL.getTypeStoreSize(Ty: VecTy->getElementType());
812 return AccTS.isKnownMultipleOf(RHS: VecTS);
813 }
814
815 return CastInst::isBitOrNoopPointerCastable(SrcTy: VecTy->getElementType(), DestTy: AccessTy,
816 DL);
817}
818
/// Iterates over an instruction worklist that may contain multiple
/// instructions from the same basic block, but in a different order.
/// \p Fn is invoked exactly once per worklist entry, and entries sharing a
/// basic block are visited in program order within that block.
template <typename InstContainer>
static void forEachWorkListItem(const InstContainer &WorkList,
                                std::function<void(Instruction *)> Fn) {
  // Bucket up uses of the alloca by the block they occur in.
  // This is important because we have to handle multiple defs/uses in a block
  // ourselves: SSAUpdater is purely for cross-block references.
  DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
  for (Instruction *User : WorkList)
    UsesByBlock[User->getParent()].insert(V: User);

  for (Instruction *User : WorkList) {
    BasicBlock *BB = User->getParent();
    auto &BlockUses = UsesByBlock[BB];

    // Already processed, skip.
    if (BlockUses.empty())
      continue;

    // Only user in the block, directly process it.
    if (BlockUses.size() == 1) {
      Fn(User);
      continue;
    }

    // Multiple users in the block, do a linear scan to see users in order.
    for (Instruction &Inst : *BB) {
      if (!BlockUses.contains(V: &Inst))
        continue;

      Fn(&Inst);
    }

    // Clear the block so we know it's been processed.
    BlockUses.clear();
  }
}
857
858/// Find an insert point after an alloca, after all other allocas clustered at
859/// the start of the block.
860static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
861 BasicBlock::iterator I) {
862 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(Val: *I); ++I)
863 ;
864 return I;
865}
866
/// Compute the FixedVectorType an alloca of type \p AllocaTy would be
/// promoted to, or nullptr if no usable vector type exists.
///
/// Allocas that already are fixed vectors are used as-is. (Nested) arrays,
/// including arrays of vectors, are flattened into one vector of the
/// innermost element type; the element count is expanded if needed so the
/// new vector's storage size matches the alloca's.
FixedVectorType *
AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
    return nullptr;
  }

  auto *VectorTy = dyn_cast<FixedVectorType>(Val: AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(Val: AllocaTy)) {
    // Flatten nested arrays into a total element count over the innermost
    // element type.
    uint64_t NumElems = 1;
    Type *ElemTy;
    do {
      NumElems *= ArrayTy->getNumElements();
      ElemTy = ArrayTy->getElementType();
    } while ((ArrayTy = dyn_cast<ArrayType>(Val: ElemTy)));

    // Check for array of vectors
    auto *InnerVectorTy = dyn_cast<FixedVectorType>(Val: ElemTy);
    if (InnerVectorTy) {
      NumElems *= InnerVectorTy->getNumElements();
      ElemTy = InnerVectorTy->getElementType();
    }

    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
      unsigned ElementSize = DL->getTypeSizeInBits(Ty: ElemTy) / 8;
      if (ElementSize > 0) {
        unsigned AllocaSize = DL->getTypeStoreSize(Ty: AllocaTy);
        // Expand vector if required to match padding of inner type,
        // i.e. odd size subvectors.
        // Storage size of new vector must match that of alloca for correct
        // behaviour of byte offsets and GEP computation.
        if (NumElems * ElementSize != AllocaSize)
          NumElems = AllocaSize / ElementSize;
        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
          VectorTy = FixedVectorType::get(ElementType: ElemTy, NumElts: NumElems);
      }
    }
  }
  if (!VectorTy) {
    LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
    return nullptr;
  }

  // Cap the element count by the register budget (MaxVectorRegs 32-bit
  // registers worth of elements); single-element vectors are rejected too.
  const unsigned MaxElements =
      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(Ty: VectorTy->getElementType());

  if (VectorTy->getNumElements() > MaxElements ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << " " << *VectorTy
                      << " has an unsupported number of elements\n");
    return nullptr;
  }

  // Reject element types whose bit size differs from their alloc size; byte
  // offsets into the vector would otherwise diverge from the original alloca.
  Type *VecEltTy = VectorTy->getElementType();
  unsigned ElementSizeInBits = DL->getTypeSizeInBits(Ty: VecEltTy);
  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(Ty: VecEltTy)) {
    LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
                         "does not match the type's size\n");
    return nullptr;
  }

  return VectorTy;
}
930
/// Decide whether the alloca in \p AA can be promoted to a vector.
///
/// On success, AA.Vector holds the target vector type, a vector index for
/// every GEP user, src/dest element indices for every mem transfer, and the
/// worklist of instructions to rewrite. On failure, AA.Vector.Ty is reset to
/// nullptr.
void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
  // Select/phi users of the alloca pointer are not handled; give up early.
  if (AA.HaveSelectOrPHI) {
    LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n");
    return;
  }

  Type *AllocaTy = AA.Alloca->getAllocatedType();
  AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
  if (!AA.Vector.Ty)
    return;

  // Failure path: log the offending user and invalidate the analysis.
  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
    LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
                      << " " << *Inst << "\n");
    AA.Vector.Ty = nullptr;
  };

  Type *VecEltTy = AA.Vector.Ty->getElementType();
  unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / 8;
  assert(ElementSize > 0);
  // Classify every recorded use of the alloca; any unsupported user rejects
  // the whole promotion.
  for (auto *U : AA.Uses) {
    Instruction *Inst = cast<Instruction>(Val: U->getUser());

    if (Value *Ptr = getLoadStorePointerOperand(V: Inst)) {
      // For stores, the alloca may only appear as the pointer operand here
      // (not as the stored value).
      assert(!isa<StoreInst>(Inst) ||
             U->getOperandNo() == StoreInst::getPointerOperandIndex());

      Type *AccessTy = getLoadStoreType(I: Inst);
      if (AccessTy->isAggregateType())
        return RejectUser(Inst, "unsupported load/store as aggregate");
      // NOTE(review): the early return above makes this assert trivially
      // true for any type that reaches it.
      assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());

      // Check that this is a simple access of a vector element.
      bool IsSimple = isa<LoadInst>(Val: Inst) ? cast<LoadInst>(Val: Inst)->isSimple()
                                            : cast<StoreInst>(Val: Inst)->isSimple();
      if (!IsSimple)
        return RejectUser(Inst, "not a simple load or store");

      Ptr = Ptr->stripPointerCasts();

      // Alloca already accessed as vector.
      if (Ptr == AA.Alloca &&
          DL->getTypeStoreSize(Ty: AA.Alloca->getAllocatedType()) ==
              DL->getTypeStoreSize(Ty: AccessTy)) {
        AA.Vector.Worklist.push_back(Elt: Inst);
        continue;
      }

      if (!isSupportedAccessType(VecTy: AA.Vector.Ty, AccessTy, DL: *DL))
        return RejectUser(Inst, "not a supported access type");

      AA.Vector.Worklist.push_back(Elt: Inst);
      continue;
    }

    if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
      // If we can't compute a vector index from this GEP, then we can't
      // promote this alloca to vector.
      auto Index = computeGEPToVectorIndex(GEP, Alloca: AA.Alloca, VecElemTy: VecEltTy, DL: *DL);
      if (!Index)
        return RejectUser(Inst, "cannot compute vector index for GEP");

      // GEPs become pure index computations; the GEP itself is erased later.
      AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
      AA.Vector.UsersToRemove.push_back(Elt: Inst);
      continue;
    }

    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Val: Inst);
        MSI && isSupportedMemset(I: MSI, AI: AA.Alloca, DL: *DL)) {
      AA.Vector.Worklist.push_back(Elt: Inst);
      continue;
    }

    if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Val: Inst)) {
      if (TransferInst->isVolatile())
        return RejectUser(Inst, "mem transfer inst is volatile");

      ConstantInt *Len = dyn_cast<ConstantInt>(Val: TransferInst->getLength());
      if (!Len || (Len->getZExtValue() % ElementSize))
        return RejectUser(Inst, "mem transfer inst length is non-constant or "
                                "not a multiple of the vector element size");

      // Resolve a transfer operand (the alloca itself or a GEP over it) to a
      // constant element index, or nullptr if the index is variable.
      auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
        if (Ptr == AA.Alloca)
          return ConstantInt::get(Context&: Ptr->getContext(), V: APInt(32, 0));

        GetElementPtrInst *GEP = cast<GetElementPtrInst>(Val: Ptr);
        const auto &GEPI = AA.Vector.GEPVectorIdx.find(Key: GEP)->second;
        if (GEPI.VarIndex)
          return nullptr;
        if (GEPI.ConstIndex)
          return GEPI.ConstIndex;
        return ConstantInt::get(Context&: Ptr->getContext(), V: APInt(32, 0));
      };

      // Each side of a transfer is seen through a separate Use; accumulate
      // both indices into one MemTransferInfo entry.
      MemTransferInfo *TI =
          &AA.Vector.TransferInfo.try_emplace(Key: TransferInst).first->second;
      unsigned OpNum = U->getOperandNo();
      if (OpNum == 0) {
        Value *Dest = TransferInst->getDest();
        ConstantInt *Index = getConstIndexIntoAlloca(Dest);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant dest index");
        TI->DestIndex = Index;
      } else {
        assert(OpNum == 1);
        Value *Src = TransferInst->getSource();
        ConstantInt *Index = getConstIndexIntoAlloca(Src);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant src index");
        TI->SrcIndex = Index;
      }
      continue;
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        AA.Vector.Worklist.push_back(Elt: Inst);
        continue;
      }
    }

    // Ignore assume-like intrinsics and comparisons used in assumes.
    if (isAssumeLikeIntrinsic(I: Inst)) {
      if (!Inst->use_empty())
        return RejectUser(Inst, "assume-like intrinsic cannot have any users");
      AA.Vector.UsersToRemove.push_back(Elt: Inst);
      continue;
    }

    if (isa<ICmpInst>(Val: Inst) && all_of(Range: Inst->users(), P: [](User *U) {
          return isAssumeLikeIntrinsic(I: cast<Instruction>(Val: U));
        })) {
      AA.Vector.UsersToRemove.push_back(Elt: Inst);
      continue;
    }

    return RejectUser(Inst, "unhandled alloca user");
  }

  // Follow-up check to ensure we've seen both sides of all transfer insts.
  for (const auto &Entry : AA.Vector.TransferInfo) {
    const MemTransferInfo &TI = Entry.second;
    if (!TI.SrcIndex || !TI.DestIndex)
      return RejectUser(Entry.first,
                        "mem transfer inst between different objects");
    AA.Vector.Worklist.push_back(Elt: Entry.first);
  }
}
1080
/// Rewrite the alloca described by \p AA into SSA values of AA.Vector.Ty
/// using SSAUpdater, then erase the alloca and all of its (now dead) users.
/// Must only be called after analyzePromoteToVector succeeded.
void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
  LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
  LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType()
                    << " -> " << *AA.Vector.Ty << '\n');
  const unsigned VecStoreSize = DL->getTypeStoreSize(Ty: AA.Vector.Ty);

  Type *VecEltTy = AA.Vector.Ty->getElementType();
  const unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / 8;

  // Alloca is uninitialized memory. Imitate that by making the first value
  // undef.
  SSAUpdater Updater;
  Updater.Initialize(Ty: AA.Vector.Ty, Name: "promotealloca");

  // Place the initial (frozen poison) value after the cluster of allocas at
  // the top of the entry block.
  BasicBlock *EntryBB = AA.Alloca->getParent();
  BasicBlock::iterator InitInsertPos =
      skipToNonAllocaInsertPt(BB&: *EntryBB, I: AA.Alloca->getIterator());
  IRBuilder<> Builder(&*InitInsertPos);
  Value *AllocaInitValue = Builder.CreateFreeze(V: PoisonValue::get(T: AA.Vector.Ty));
  AllocaInitValue->takeName(V: AA.Alloca);

  Updater.AddAvailableValue(BB: AA.Alloca->getParent(), V: AllocaInitValue);

  // First handle the initial worklist, in basic block order.
  //
  // Insert a placeholder whenever we need the vector value at the top of a
  // basic block.
  SmallVector<Instruction *> Placeholders;
  forEachWorkListItem(WorkList: AA.Vector.Worklist, Fn: [&](Instruction *I) {
    BasicBlock *BB = I->getParent();
    // Returns the current vector value in this block, creating at most one
    // placeholder per block when the value is not yet known.
    auto GetCurVal = [&]() -> Value * {
      if (Value *CurVal = Updater.FindValueForBlock(BB))
        return CurVal;

      if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
        return Placeholders.back();

      // If the current value in the basic block is not yet known, insert a
      // placeholder that we will replace later.
      IRBuilder<> Builder(I);
      auto *Placeholder = cast<Instruction>(Val: Builder.CreateFreeze(
          V: PoisonValue::get(T: AA.Vector.Ty), Name: "promotealloca.placeholder"));
      Placeholders.push_back(Elt: Placeholder);
      return Placeholders.back();
    };

    Value *Result = promoteAllocaUserToVector(Inst: I, DL: *DL, AA, VecStoreSize,
                                              ElementSize, GetCurVal);
    if (Result)
      Updater.AddAvailableValue(BB, V: Result);
  });

  // Now fixup the placeholders.
  SmallVector<Value *> PlaceholderToNewVal(Placeholders.size());
  for (auto [Index, Placeholder] : enumerate(First&: Placeholders)) {
    Value *NewVal = Updater.GetValueInMiddleOfBlock(BB: Placeholder->getParent());
    PlaceholderToNewVal[Index] = NewVal;
    Placeholder->replaceAllUsesWith(V: NewVal);
  }
  // Note: we cannot merge this loop with the previous one because it is
  // possible that the placeholder itself can be used in the SSAUpdater. The
  // replaceAllUsesWith doesn't replace those uses.
  for (auto [Index, Placeholder] : enumerate(First&: Placeholders)) {
    if (!Placeholder->use_empty())
      Placeholder->replaceAllUsesWith(V: PlaceholderToNewVal[Index]);
    Placeholder->eraseFromParent();
  }

  // Delete all instructions.
  for (Instruction *I : AA.Vector.Worklist) {
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Delete all the users that are known to be removeable.
  // Reverse order visits users before their defs (e.g. GEP chains).
  for (Instruction *I : reverse(C&: AA.Vector.UsersToRemove)) {
    I->dropDroppableUses();
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Alloca should now be dead too.
  assert(AA.Alloca->use_empty());
  AA.Alloca->eraseFromParent();
}
1166
/// Build IR that computes the workgroup local size in the Y and Z dimensions,
/// returned as {SizeY, SizeZ}.
///
/// Non-HSA targets read the r600 local-size intrinsics; AMDHSA targets load
/// the workgroup_size_y/z fields out of the kernel dispatch packet.
std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  if (!IsAMDHSA) {
    CallInst *LocalSizeY =
        Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_y, Args: {});
    CallInst *LocalSizeZ =
        Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_z, Args: {});

    // Attach range metadata bounding the local IDs/sizes for the subtarget.
    ST.makeLIDRangeMetadata(I: LocalSizeY);
    ST.makeLIDRangeMetadata(I: LocalSizeZ);

    return std::pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  // typedef struct hsa_kernel_dispatch_packet_s {
  //   uint16_t header;
  //   uint16_t setup;
  //   uint16_t workgroup_size_x ;
  //   uint16_t workgroup_size_y;
  //   uint16_t workgroup_size_z;
  //   uint16_t reserved0;
  //   uint32_t grid_size_x ;
  //   uint32_t grid_size_y ;
  //   uint32_t grid_size_z;
  //
  //   uint32_t private_segment_size;
  //   uint32_t group_segment_size;
  //   uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //   void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //   void *kernarg_address;
  //   uint32_t reserved1;
  // #else
  //   uint32_t reserved1;
  //   void *kernarg_address;
  // #endif
  //   uint64_t reserved2;
  //   hsa_signal_t completion_signal; // uint64_t wrapper
  // } hsa_kernel_dispatch_packet_t
  //
  CallInst *DispatchPtr =
      Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_dispatch_ptr, Args: {});
  DispatchPtr->addRetAttr(Kind: Attribute::NoAlias);
  DispatchPtr->addRetAttr(Kind: Attribute::NonNull);
  // The function now provably reads the dispatch pointer.
  F.removeFnAttr(Kind: "amdgpu-no-dispatch-ptr");

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableRetAttr(Bytes: 64);

  Type *I32Ty = Type::getInt32Ty(C&: Mod->getContext());

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPXY, Align: Align(4));

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPZU, Align: Align(4));

  // The dispatch packet never changes during a kernel's execution.
  MDNode *MD = MDNode::get(Context&: Mod->getContext(), MDs: {});
  LoadXY->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
  LoadZU->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
  ST.makeLIDRangeMetadata(I: LoadZU);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LHS: LoadXY, RHS: 16);

  return std::pair(Y, LoadZU);
}
1248
1249Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1250 unsigned N) {
1251 Function *F = Builder.GetInsertBlock()->getParent();
1252 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: *F);
1253 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1254 StringRef AttrName;
1255
1256 switch (N) {
1257 case 0:
1258 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1259 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1260 AttrName = "amdgpu-no-workitem-id-x";
1261 break;
1262 case 1:
1263 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1264 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1265 AttrName = "amdgpu-no-workitem-id-y";
1266 break;
1267
1268 case 2:
1269 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1270 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1271 AttrName = "amdgpu-no-workitem-id-z";
1272 break;
1273 default:
1274 llvm_unreachable("invalid dimension");
1275 }
1276
1277 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(M: Mod, id: IntrID);
1278 CallInst *CI = Builder.CreateCall(Callee: WorkitemIdFn);
1279 ST.makeLIDRangeMetadata(I: CI);
1280 F->removeFnAttr(Kind: AttrName);
1281
1282 return CI;
1283}
1284
1285static bool isCallPromotable(CallInst *CI) {
1286 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI);
1287 if (!II)
1288 return false;
1289
1290 switch (II->getIntrinsicID()) {
1291 case Intrinsic::memcpy:
1292 case Intrinsic::memmove:
1293 case Intrinsic::memset:
1294 case Intrinsic::lifetime_start:
1295 case Intrinsic::lifetime_end:
1296 case Intrinsic::invariant_start:
1297 case Intrinsic::invariant_end:
1298 case Intrinsic::launder_invariant_group:
1299 case Intrinsic::strip_invariant_group:
1300 case Intrinsic::objectsize:
1301 return true;
1302 default:
1303 return false;
1304 }
1305}
1306
1307bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1308 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1309 int OpIdx1) const {
1310 // Figure out which operand is the one we might not be promoting.
1311 Value *OtherOp = Inst->getOperand(i: OpIdx0);
1312 if (Val == OtherOp)
1313 OtherOp = Inst->getOperand(i: OpIdx1);
1314
1315 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: OtherOp))
1316 return true;
1317
1318 // TODO: getUnderlyingObject will not work on a vector getelementptr
1319 Value *OtherObj = getUnderlyingObject(V: OtherOp);
1320 if (!isa<AllocaInst>(Val: OtherObj))
1321 return false;
1322
1323 // TODO: We should be able to replace undefs with the right pointer type.
1324
1325 // TODO: If we know the other base object is another promotable
1326 // alloca, not necessarily this alloca, we can do this. The
1327 // important part is both must have the same address space at
1328 // the end.
1329 if (OtherObj != BaseAlloca) {
1330 LLVM_DEBUG(
1331 dbgs() << "Found a binary instruction with another alloca object\n");
1332 return false;
1333 }
1334
1335 return true;
1336}
1337
/// Analyze whether the alloca in \p AA can be migrated to LDS, collecting the
/// users that will need rewriting into AA.LDS.Worklist and setting
/// AA.LDS.Enable on success. Any unsupported user (volatile access,
/// ptrtoint, out-of-bounds GEP, non-promotable call, ...) aborts the
/// analysis with AA.LDS.Enable left false.
void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
  if (DisablePromoteAllocaToLDS) {
    LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
    return;
  }

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  const Function &ContainingFunction = *AA.Alloca->getFunction();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return;
  }

  for (Use *Use : AA.Uses) {
    auto *User = Use->getUser();

    if (CallInst *CI = dyn_cast<CallInst>(Val: User)) {
      if (!isCallPromotable(CI))
        return;

      // Linear-scan membership check keeps the worklist duplicate-free.
      if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(Elt: User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(Val: User);
    // A ptrtoint would let the address escape as an integer; give up.
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return;

    // Non-volatile loads/stores/atomics need no rewriting beyond the pointer
    // type change, so they are not added to the worklist.
    if (LoadInst *LI = dyn_cast<LoadInst>(Val: UseInst)) {
      if (LI->isVolatile())
        return;
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(Val: UseInst)) {
      if (SI->isVolatile())
        return;
      continue;
    }

    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UseInst)) {
      if (RMW->isVolatile())
        return;
      continue;
    }

    if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(Val: UseInst)) {
      if (CAS->isVolatile())
        return;
      continue;
    }

    // Only promote a select if we know that the other select operand
    // is from another pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(Val: UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca: AA.Alloca, Val: Use->get(), Inst: ICmp, OpIdx0: 0, OpIdx1: 1))
        return;

      // May need to rewrite constant operands.
      if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(Elt: ICmp)
;
      continue;
    }

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return;
    } else if (!isa<ExtractElementInst, SelectInst, PHINode>(Val: User)) {
      // Do not promote vector/aggregate type instructions. It is hard to track
      // their users.

      // Do not promote addrspacecast.
      //
      // TODO: If we know the address is only observed through flat pointers, we
      // could still promote.
      return;
    }

    if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
      AA.LDS.Worklist.push_back(Elt: User);
  }

  // All users were acceptable; promotion to LDS may proceed.
  AA.LDS.Enable = true;
}
1435
/// Estimate how much LDS is already consumed by local-address-space globals
/// reachable from \p F, then derive the usable budget for alloca promotion.
///
/// Sets CurrentLocalMemUsage (estimated existing usage, with worst-case
/// alignment padding) and LocalMemLimit (usable LDS at the resulting
/// occupancy). Returns false when promotion to LDS is impossible: a local
/// pointer argument, no addressable LDS, an extern unsized LDS array
/// (dynamic shared memory), or existing usage already over budget.
bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(Val: ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getAddressableLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  SmallVector<const Constant *, 16> Stack;
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;

  // Returns true as soon as a use of Val inside F is found; otherwise pushes
  // unvisited constant users onto Stack for later expansion.
  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(Val: U)) {
        if (Use->getFunction() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(Val: U);
        if (VisitedConstants.insert(Ptr: C).second)
          Stack.push_back(Elt: C);
      }
    }

    return false;
  };

  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(Ptr: &GV);
      Stack.clear();
      continue;
    }

    // For any ConstantExpr uses, we need to recursively search the users until
    // we see a function.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(Ptr: &GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(N: UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(Alignment: GV->getAlign(), Ty: GV->getValueType());
    uint64_t AllocSize = GV->getGlobalSize(DL);

    // HIP uses an extern unsized array in local address space for dynamically
    // allocated shared memory. In that case, we have to disable the promotion.
    if (GV->hasExternalLinkage() && AllocSize == 0) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
      return false;
    }

    AllocatedSizes.emplace_back(Args&: AllocSize, Args&: Alignment);
  }

  // Sort to try to estimate the worst case alignment padding
  //
  // FIXME: We should really do something to fix the addresses to a more optimal
  // value instead
  llvm::sort(C&: AllocatedSizes, Comp: llvm::less_second());

  // Check how much local memory is being used by global objects
  CurrentLocalMemUsage = 0;

  // FIXME: Try to account for padding here. The real padding and address is
  // currently determined from the inverse order of uses in the function when
  // legalizing, which could also potentially change. We try to estimate the
  // worst case here, but we probably should fix the addresses earlier.
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(Size: CurrentLocalMemUsage, A: Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }

  unsigned MaxOccupancy =
      ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes: CurrentLocalMemUsage, F)
          .second;

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount =
      ST.getMaxLocalMemSizeWithWaveCount(WaveCount: MaxOccupancy, F);

  // Program may already use more LDS than is usable at maximum occupancy.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << " Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}
1562
// FIXME: Should try to pick the most likely to be profitable allocas first.
/// Replace the alloca in \p AA with a per-workitem slot of a new LDS global
/// (an array with one element per workitem of the maximum workgroup size),
/// rewriting all recorded users. memcpy/memmove users are deferred into
/// \p DeferredIntrs so both of their pointer operands can be fixed up first.
/// Returns true if the promotion was performed.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
    AllocaAnalysis &AA, bool SufficientLDS,
    SetVector<IntrinsicInst *> &DeferredIntrs) {
  LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(AA.Alloca);

  const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: ContainingFunction);
  // One alloca-sized slot per workitem of the largest possible workgroup.
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F: ContainingFunction).second;

  Align Alignment = AA.Alloca->getAlign();

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the memory
  // could end up using more than the maximum due to alignment padding.

  uint32_t NewSize = alignTo(Size: CurrentLocalMemUsage, A: Alignment);
  std::optional<TypeSize> ElemSize = AA.Alloca->getAllocationSize(DL);
  if (!ElemSize || ElemSize->isScalable())
    return false;
  TypeSize AllocSize = WorkGroupSize * *ElemSize;
  NewSize += AllocSize.getFixedValue();

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << " " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = AA.Alloca->getFunction();

  // Create the backing LDS array: WorkGroupSize copies of the alloca's type.
  Type *GVTy = ArrayType::get(ElementType: AA.Alloca->getAllocatedType(), NumElements: WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(T: GVTy),
      Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(AA.Alloca->getAlign());

  Value *TCntY, *TCntZ;

  // Compute a flat workitem index:
  //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ
  std::tie(args&: TCntY, args&: TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, N: 0);
  Value *TIdY = getWorkitemID(Builder, N: 1);
  Value *TIdZ = getWorkitemID(Builder, N: 2);

  Value *Tmp0 = Builder.CreateMul(LHS: TCntY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true);
  Tmp0 = Builder.CreateMul(LHS: Tmp0, RHS: TIdX);
  Value *Tmp1 = Builder.CreateMul(LHS: TIdY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true);
  Value *TID = Builder.CreateAdd(LHS: Tmp0, RHS: Tmp1);
  TID = Builder.CreateAdd(LHS: TID, RHS: TIdZ);

  LLVMContext &Context = Mod->getContext();
  Value *Indices[] = {Constant::getNullValue(Ty: Type::getInt32Ty(C&: Context)), TID};

  // Each workitem uses its own slot of the LDS array in place of the alloca.
  Value *Offset = Builder.CreateInBoundsGEP(Ty: GVTy, Ptr: GV, IdxList: Indices);
  AA.Alloca->mutateType(Ty: Offset->getType());
  AA.Alloca->replaceAllUsesWith(V: Offset);
  AA.Alloca->eraseFromParent();

  PointerType *NewPtrTy = PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);

  // Fix up every collected user to operate on the LDS pointer.
  for (Value *V : AA.LDS.Worklist) {
    CallInst *Call = dyn_cast<CallInst>(Val: V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(Val: V)) {
        // Only constant operands need rewriting to the new address space.
        Value *LHS = CI->getOperand(i_nocapture: 0);
        Value *RHS = CI->getOperand(i_nocapture: 1);

        Type *NewTy = LHS->getType()->getWithNewType(EltTy: NewPtrTy);
        if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: LHS))
          CI->setOperand(i_nocapture: 0, Val_nocapture: Constant::getNullValue(Ty: NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: RHS))
          CI->setOperand(i_nocapture: 1, Val_nocapture: Constant::getNullValue(Ty: NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want to
      // touch the users.
      if (isa<AddrSpaceCastInst>(Val: V))
        continue;

      assert(V->getType()->isPtrOrPtrVectorTy());

      // Retype the pointer-producing instruction in place.
      Type *NewTy = V->getType()->getWithNewType(EltTy: NewPtrTy);
      V->mutateType(Ty: NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(Val: V)) {
        if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: SI->getOperand(i_nocapture: 1)))
          SI->setOperand(i_nocapture: 1, Val_nocapture: Constant::getNullValue(Ty: NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: SI->getOperand(i_nocapture: 2)))
          SI->setOperand(i_nocapture: 2, Val_nocapture: Constant::getNullValue(Ty: NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(Val: V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull, ConstantAggregateZero>(
                  Val: Phi->getIncomingValue(i: I)))
            Phi->setIncomingValue(i: I, V: Constant::getNullValue(Ty: NewTy));
        }
      }

      continue;
    }

    IntrinsicInst *Intr = cast<IntrinsicInst>(Val: Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // These have 2 pointer operands. In case if second pointer also needs
      // to be replaced we defer processing of these intrinsics until all
      // other values are processed.
      DeferredIntrs.insert(X: Intr);
      continue;
    case Intrinsic::memset: {
      // Recreate the memset so its pointer-type overload matches the new
      // address space.
      MemSetInst *MemSet = cast<MemSetInst>(Val: Intr);
      Builder.CreateMemSet(Ptr: MemSet->getRawDest(), Val: MemSet->getValue(),
                           Size: MemSet->getLength(), Align: MemSet->getDestAlign(),
                           isVolatile: MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group: {
      // Re-declare the intrinsic for the LDS pointer type and swap the call.
      SmallVector<Value *> Args;
      if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
        Args.emplace_back(Args: Intr->getArgOperand(i: 0));
      } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
        Args.emplace_back(Args: Intr->getArgOperand(i: 0));
        Args.emplace_back(Args: Intr->getArgOperand(i: 1));
      }
      Args.emplace_back(Args&: Offset);
      Function *F = Intrinsic::getOrInsertDeclaration(
          M: Intr->getModule(), id: Intr->getIntrinsicID(), Tys: Offset->getType());
      CallInst *NewIntr =
          CallInst::Create(Func: F, Args, NameStr: Intr->getName(), InsertBefore: Intr->getIterator());
      Intr->mutateType(Ty: NewIntr->getType());
      Intr->replaceAllUsesWith(V: NewIntr);
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(i_nocapture: 0);

      CallInst *NewCall = Builder.CreateIntrinsic(
          ID: Intrinsic::objectsize,
          Types: {Intr->getType(), PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS)},
          Args: {Src, Intr->getOperand(i_nocapture: 1), Intr->getOperand(i_nocapture: 2), Intr->getOperand(i_nocapture: 3)});
      Intr->replaceAllUsesWith(V: NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(O&: errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }

  return true;
}
1746
1747void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1748 SetVector<IntrinsicInst *> &DeferredIntrs) {
1749
1750 for (IntrinsicInst *Intr : DeferredIntrs) {
1751 IRBuilder<> Builder(Intr);
1752 Builder.SetInsertPoint(Intr);
1753 Intrinsic::ID ID = Intr->getIntrinsicID();
1754 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1755
1756 MemTransferInst *MI = cast<MemTransferInst>(Val: Intr);
1757 auto *B = Builder.CreateMemTransferInst(
1758 IntrID: ID, Dst: MI->getRawDest(), DstAlign: MI->getDestAlign(), Src: MI->getRawSource(),
1759 SrcAlign: MI->getSourceAlign(), Size: MI->getLength(), isVolatile: MI->isVolatile());
1760
1761 for (unsigned I = 0; I != 2; ++I) {
1762 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(i: I)) {
1763 B->addDereferenceableParamAttr(i: I, Bytes);
1764 }
1765 }
1766
1767 Intr->eraseFromParent();
1768 }
1769}
1770