1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
// declared in AMDGPU.h and the legacy PM ones are declared here.
25//
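// As an illustration (hypothetical IR, not taken from a real test), a
// promotable alloca such as:
//
//   %stack = alloca [4 x i32], align 4, addrspace(5)
//   %gep = getelementptr [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
//   store i32 %v, ptr addrspace(5) %gep
//   %ld = load i32, ptr addrspace(5) %gep
//
// is rewritten by the vector path into insertelement/extractelement on a
// <4 x i32> SSA value, so no scratch memory is ever allocated for it.
//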
26//===----------------------------------------------------------------------===//
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/Analysis/CaptureTracking.h"
33#include "llvm/Analysis/InstSimplifyFolder.h"
34#include "llvm/Analysis/InstructionSimplify.h"
35#include "llvm/Analysis/LoopInfo.h"
36#include "llvm/Analysis/ValueTracking.h"
37#include "llvm/CodeGen/TargetPassConfig.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/PatternMatch.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Target/TargetMachine.h"
46#include "llvm/Transforms/Utils/SSAUpdater.h"
47
48#define DEBUG_TYPE "amdgpu-promote-alloca"
49
50using namespace llvm;
51
52namespace {
53
54static cl::opt<bool>
55 DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
56 cl::desc("Disable promote alloca to vector"),
57 cl::init(Val: false));
58
59static cl::opt<bool>
60 DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
61 cl::desc("Disable promote alloca to LDS"),
62 cl::init(Val: false));
63
64static cl::opt<unsigned> PromoteAllocaToVectorLimit(
65 "amdgpu-promote-alloca-to-vector-limit",
66 cl::desc("Maximum byte size to consider promote alloca to vector"),
67 cl::init(Val: 0));
68
69static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
70 "amdgpu-promote-alloca-to-vector-max-regs",
71 cl::desc(
72 "Maximum vector size (in 32b registers) to use when promoting alloca"),
73 cl::init(Val: 16));
74
75// Use up to 1/4 of available register budget for vectorization.
76// FIXME: Increase the limit for whole function budgets? Perhaps x2?
77static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
78 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
79 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
80 cl::init(Val: 4));
81
82static cl::opt<unsigned>
83 LoopUserWeight("promote-alloca-vector-loop-user-weight",
                   cl::desc("The bonus weight of users of allocas within loops "
                            "when sorting profitable allocas"),
86 cl::init(Val: 4));
87
88// Shared implementation which can do both promotion to vector and to LDS.
89class AMDGPUPromoteAllocaImpl {
90private:
91 const TargetMachine &TM;
92 LoopInfo &LI;
93 Module *Mod = nullptr;
94 const DataLayout *DL = nullptr;
95
96 // FIXME: This should be per-kernel.
97 uint32_t LocalMemLimit = 0;
98 uint32_t CurrentLocalMemUsage = 0;
99 unsigned MaxVGPRs;
100 unsigned VGPRBudgetRatio;
101 unsigned MaxVectorRegs;
102
103 bool IsAMDGCN = false;
104 bool IsAMDHSA = false;
105
106 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
107 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
108
109 /// BaseAlloca is the alloca root the search started from.
110 /// Val may be that alloca or a recursive user of it.
111 bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
112 std::vector<Value *> &WorkList) const;
113
114 /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
115 /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
116 /// Returns true if both operands are derived from the same alloca. Val should
117 /// be the same value as one of the input operands of UseInst.
118 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
119 Instruction *UseInst, int OpIdx0,
120 int OpIdx1) const;
121
122 /// Check whether we have enough local memory for promotion.
123 bool hasSufficientLocalMem(const Function &F);
124
125 bool tryPromoteAllocaToVector(AllocaInst &I);
126 bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
127
128 void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
129
130 void setFunctionLimits(const Function &F);
131
132public:
133 AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
134
135 const Triple &TT = TM.getTargetTriple();
136 IsAMDGCN = TT.isAMDGCN();
137 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
138 }
139
140 bool run(Function &F, bool PromoteToLDS);
141};
142
// FIXME: This can create globals, so it should be a module pass.
144class AMDGPUPromoteAlloca : public FunctionPass {
145public:
146 static char ID;
147
148 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
149
150 bool runOnFunction(Function &F) override {
151 if (skipFunction(F))
152 return false;
153 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
154 return AMDGPUPromoteAllocaImpl(
155 TPC->getTM<TargetMachine>(),
156 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
157 .run(F, /*PromoteToLDS*/ true);
158 return false;
159 }
160
161 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
162
163 void getAnalysisUsage(AnalysisUsage &AU) const override {
164 AU.setPreservesCFG();
165 AU.addRequired<LoopInfoWrapperPass>();
166 FunctionPass::getAnalysisUsage(AU);
167 }
168};
169
170static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
171 const Function &F) {
172 if (!TM.getTargetTriple().isAMDGCN())
173 return 128;
174
175 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
176
177 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
178 // Temporarily check both the attribute and the subtarget feature, until the
179 // latter is removed.
180 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
181 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
182
183 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
184 WavesPerEU: ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
185 DynamicVGPRBlockSize);
186
  // A non-entry function has only 32 caller-preserved registers.
  // Do not promote an alloca if doing so would force spilling, unless we know
  // the function will be inlined.
190 if (!F.hasFnAttribute(Kind: Attribute::AlwaysInline) &&
191 !AMDGPU::isEntryFunctionCC(CC: F.getCallingConv()))
192 MaxVGPRs = std::min(a: MaxVGPRs, b: 32u);
193 return MaxVGPRs;
194}
195
196} // end anonymous namespace
197
198char AMDGPUPromoteAlloca::ID = 0;
199
200INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
201 "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promoting allocas, for an
// accurate estimate of the LDS available.
204INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
205INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
206INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
207 "AMDGPU promote alloca to vector or LDS", false, false)
208
209char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
210
211PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
212 FunctionAnalysisManager &AM) {
213 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
214 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
215 if (Changed) {
216 PreservedAnalyses PA;
217 PA.preserveSet<CFGAnalyses>();
218 return PA;
219 }
220 return PreservedAnalyses::all();
221}
222
223PreservedAnalyses
224AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
225 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
226 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
227 if (Changed) {
228 PreservedAnalyses PA;
229 PA.preserveSet<CFGAnalyses>();
230 return PA;
231 }
232 return PreservedAnalyses::all();
233}
234
235FunctionPass *llvm::createAMDGPUPromoteAlloca() {
236 return new AMDGPUPromoteAlloca();
237}
238
239static void collectAllocaUses(AllocaInst &Alloca,
240 SmallVectorImpl<Use *> &Uses) {
241 SmallVector<Instruction *, 4> WorkList({&Alloca});
242 while (!WorkList.empty()) {
243 auto *Cur = WorkList.pop_back_val();
244 for (auto &U : Cur->uses()) {
245 Uses.push_back(Elt: &U);
246
247 if (isa<GetElementPtrInst>(Val: U.getUser()))
248 WorkList.push_back(Elt: cast<Instruction>(Val: U.getUser()));
249 }
250 }
251}
252
253void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
254 SmallVectorImpl<AllocaInst *> &Allocas) {
255 DenseMap<AllocaInst *, unsigned> Scores;
256
257 for (auto *Alloca : Allocas) {
258 LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
259 unsigned &Score = Scores[Alloca];
260 // Increment score by one for each user + a bonus for users within loops.
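    // For example, with the default LoopUserWeight of 4, a load at loop depth
    // 2 contributes 1 + 4 * 2 = 9 to the score, while a user outside any loop
    // contributes 1.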
261 SmallVector<Use *, 8> Uses;
262 collectAllocaUses(Alloca&: *Alloca, Uses);
263 for (auto *U : Uses) {
264 Instruction *Inst = cast<Instruction>(Val: U->getUser());
265 if (isa<GetElementPtrInst>(Val: Inst))
266 continue;
267 unsigned UserScore =
268 1 + (LoopUserWeight * LI.getLoopDepth(BB: Inst->getParent()));
269 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
270 Score += UserScore;
271 }
272 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
273 }
274
275 stable_sort(Range&: Allocas, C: [&](AllocaInst *A, AllocaInst *B) {
276 return Scores.at(Val: A) > Scores.at(Val: B);
277 });
278
279 // clang-format off
280 LLVM_DEBUG(
281 dbgs() << "Sorted Worklist:\n";
282 for (auto *A: Allocas)
283 dbgs() << " " << *A << "\n";
284 );
285 // clang-format on
286}
287
288void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
  // Load per-function limits, overriding them with the global cl::opt values
  // whenever those are explicitly set on the command line.
290 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
291 Kind: "amdgpu-promote-alloca-to-vector-max-regs", Default: PromoteAllocaToVectorMaxRegs);
292 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
293 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
294 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
295 Kind: "amdgpu-promote-alloca-to-vector-vgpr-ratio",
296 Default: PromoteAllocaToVectorVGPRRatio);
297 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
298 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
299}
300
301bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
302 Mod = F.getParent();
303 DL = &Mod->getDataLayout();
304
305 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
306 if (!ST.isPromoteAllocaEnabled())
307 return false;
308
309 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
310 MaxVGPRs = getMaxVGPRs(LDSBytes: CurrentLocalMemUsage, TM, F);
311 setFunctionLimits(F);
312
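  // As a worked example (hypothetical numbers): with MaxVGPRs = 256 and the
  // default VGPRBudgetRatio of 4, the budget below is (256 * 32) / 4 = 2048
  // bits, i.e. up to 64 32-bit elements worth of allocas can be vectorized.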
313 unsigned VectorizationBudget =
314 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
315 : (MaxVGPRs * 32)) /
316 VGPRBudgetRatio;
317
318 SmallVector<AllocaInst *, 16> Allocas;
319 for (Instruction &I : F.getEntryBlock()) {
320 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: &I)) {
321 // Array allocations are probably not worth handling, since an allocation
322 // of the array type is the canonical form.
323 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
324 continue;
325 Allocas.push_back(Elt: AI);
326 }
327 }
328
329 sortAllocasToPromote(Allocas);
330
331 bool Changed = false;
332 for (AllocaInst *AI : Allocas) {
333 const unsigned AllocaCost = DL->getTypeSizeInBits(Ty: AI->getAllocatedType());
334 // First, check if we have enough budget to vectorize this alloca.
335 if (AllocaCost <= VectorizationBudget) {
336 // If we do, attempt vectorization, otherwise, fall through and try
337 // promoting to LDS instead.
338 if (tryPromoteAllocaToVector(I&: *AI)) {
339 Changed = true;
340 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
341 "Underflow!");
342 VectorizationBudget -= AllocaCost;
343 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
344 << VectorizationBudget << "\n");
345 continue;
346 }
347 } else {
348 LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
349 << AllocaCost << ", budget:" << VectorizationBudget
350 << "): " << *AI << "\n");
351 }
352
353 if (PromoteToLDS && tryPromoteAllocaToLDS(I&: *AI, SufficientLDS))
354 Changed = true;
355 }
356
357 // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
358 // dangling pointers. If we want to reuse it past this point, the loop above
359 // would need to be updated to remove successfully promoted allocas.
360
361 return Changed;
362}
363
364struct MemTransferInfo {
365 ConstantInt *SrcIndex = nullptr;
366 ConstantInt *DestIndex = nullptr;
367};
368
369// Checks if the instruction I is a memset user of the alloca AI that we can
370// deal with. Currently, only non-volatile memsets that affect the whole alloca
371// are handled.
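// For illustration (hypothetical IR): with an [8 x i32] alloca (32 bytes),
//   call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 32,
//                                 i1 false)
// is supported because it overwrites the entire alloca, whereas a memset of
// only 16 bytes, or a volatile memset, is rejected.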
372static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
373 const DataLayout &DL) {
374 using namespace PatternMatch;
375 // For now we only care about non-volatile memsets that affect the whole type
376 // (start at index 0 and fill the whole alloca).
377 //
378 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
379 // (except maybe volatile ones?) - we just need to use shufflevector if it
380 // only affects a subset of the vector.
381 const unsigned Size = DL.getTypeStoreSize(Ty: AI->getAllocatedType());
382 return I->getOperand(i_nocapture: 0) == AI &&
383 match(V: I->getOperand(i_nocapture: 2), P: m_SpecificInt(V: Size)) && !I->isVolatile();
384}
385
386static Value *calculateVectorIndex(
387 Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) {
388 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr->stripPointerCasts());
389 if (!GEP)
390 return ConstantInt::getNullValue(Ty: Type::getInt32Ty(C&: Ptr->getContext()));
391
392 auto I = GEPIdx.find(x: GEP);
393 assert(I != GEPIdx.end() && "Must have entry for GEP!");
394
395 Value *IndexValue = I->second;
396 assert(IndexValue && "index value missing from GEP index map");
397 return IndexValue;
398}
399
400static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
401 Type *VecElemTy, const DataLayout &DL,
402 SmallVector<Instruction *> &NewInsts) {
403 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
404 // helper.
405 unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType());
406 SmallMapVector<Value *, APInt, 4> VarOffsets;
407 APInt ConstOffset(BW, 0);
408
409 // Walk backwards through nested GEPs to collect both constant and variable
410 // offsets, so that nested vector GEP chains can be lowered in one step.
411 //
412 // Given this IR fragment as input:
413 //
414 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
415 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
416 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
417 // %3 = load i32, ptr addrspace(5) %2, align 4
418 //
419 // Combine both GEP operations in a single pass, producing:
420 // BasePtr = %0
421 // ConstOffset = 4
422 // VarOffsets = { %j -> element_size(<2 x i32>) }
423 //
424 // That lets us emit a single buffer_load directly into a VGPR, without ever
425 // allocating scratch memory for the intermediate pointer.
426 Value *CurPtr = GEP;
427 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(Val: CurPtr)) {
428 if (!CurGEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ConstOffset))
429 return nullptr;
430
431 // Move to the next outer pointer.
432 CurPtr = CurGEP->getPointerOperand();
433 }
434
435 assert(CurPtr == Alloca && "GEP not based on alloca");
436
437 unsigned VecElemSize = DL.getTypeAllocSize(Ty: VecElemTy);
438 if (VarOffsets.size() > 1)
439 return nullptr;
440
441 APInt IndexQuot;
442 uint64_t Rem;
443 APInt::udivrem(LHS: ConstOffset, RHS: VecElemSize, Quotient&: IndexQuot, Remainder&: Rem);
444 if (Rem != 0)
445 return nullptr;
446 if (VarOffsets.size() == 0)
447 return ConstantInt::get(Context&: GEP->getContext(), V: IndexQuot);
448
449 IRBuilder<> Builder(GEP);
450
451 const auto &VarOffset = VarOffsets.front();
452 APInt OffsetQuot;
453 APInt::udivrem(LHS: VarOffset.second, RHS: VecElemSize, Quotient&: OffsetQuot, Remainder&: Rem);
454 if (Rem != 0 || OffsetQuot.isZero())
455 return nullptr;
456
457 Value *Offset = VarOffset.first;
458 auto *OffsetType = dyn_cast<IntegerType>(Val: Offset->getType());
459 if (!OffsetType)
460 return nullptr;
461
462 if (!OffsetQuot.isOne()) {
463 ConstantInt *ConstMul =
464 ConstantInt::get(Ty: OffsetType, V: OffsetQuot.getZExtValue());
465 Offset = Builder.CreateMul(LHS: Offset, RHS: ConstMul);
466 if (Instruction *NewInst = dyn_cast<Instruction>(Val: Offset))
467 NewInsts.push_back(Elt: NewInst);
468 }
469 if (ConstOffset.isZero())
470 return Offset;
471
472 ConstantInt *ConstIndex =
473 ConstantInt::get(Ty: OffsetType, V: IndexQuot.getZExtValue());
474 Value *IndexAdd = Builder.CreateAdd(LHS: ConstIndex, RHS: Offset);
475 if (Instruction *NewInst = dyn_cast<Instruction>(Val: IndexAdd))
476 NewInsts.push_back(Elt: NewInst);
477 return IndexAdd;
478}
479
480/// Promotes a single user of the alloca to a vector form.
481///
482/// \param Inst Instruction to be promoted.
483/// \param DL Module Data Layout.
484/// \param VectorTy Vectorized Type.
485/// \param VecStoreSize Size of \p VectorTy in bytes.
486/// \param ElementSize Size of \p VectorTy element type in bytes.
487/// \param TransferInfo MemTransferInst info map.
488/// \param GEPVectorIdx GEP -> VectorIdx cache.
489/// \param CurVal Current value of the vector (e.g. last stored value)
490/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
491/// be promoted now. This happens when promoting requires \p
492/// CurVal, but \p CurVal is nullptr.
493/// \return the stored value if \p Inst would have written to the alloca, or
494/// nullptr otherwise.
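/// For example, a load in a block for which no vector value is known yet (no
/// store in that block has been visited so far) is simply queued in
/// \p DeferredLoads on the first pass and only rewritten on the second pass,
/// once SSAUpdater can provide the incoming value (inserting PHIs if needed).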
495static Value *promoteAllocaUserToVector(
496 Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
497 unsigned VecStoreSize, unsigned ElementSize,
498 DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
499 std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
500 SmallVectorImpl<LoadInst *> &DeferredLoads) {
501 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
502 // to do more folding, especially in the case of vector splats.
503 IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
504 InstSimplifyFolder(DL));
505 Builder.SetInsertPoint(Inst);
506
507 const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
508 if (CurVal)
509 return CurVal;
510
511 // If the current value is not known, insert a dummy load and lower it on
512 // the second pass.
513 LoadInst *Dummy =
514 Builder.CreateLoad(Ty: VectorTy, Ptr: PoisonValue::get(T: Builder.getPtrTy()),
515 Name: "promotealloca.dummyload");
516 DeferredLoads.push_back(Elt: Dummy);
517 return Dummy;
518 };
519
520 const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
521 Type *PtrTy) -> Value * {
522 assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
523 const unsigned Size = DL.getTypeStoreSizeInBits(Ty: PtrTy);
524 if (!PtrTy->isVectorTy())
525 return Builder.CreateBitOrPointerCast(V: Val, DestTy: Builder.getIntNTy(N: Size));
526 const unsigned NumPtrElts = cast<FixedVectorType>(Val: PtrTy)->getNumElements();
    // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to first
    // cast the ptr vector to <2 x i64>.
    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
530 Type *EltTy = Builder.getIntNTy(N: Size / NumPtrElts);
531 return Builder.CreateBitOrPointerCast(
532 V: Val, DestTy: FixedVectorType::get(ElementType: EltTy, NumElts: NumPtrElts));
533 };
534
535 Type *VecEltTy = VectorTy->getElementType();
536
537 switch (Inst->getOpcode()) {
538 case Instruction::Load: {
539 // Loads can only be lowered if the value is known.
540 if (!CurVal) {
541 DeferredLoads.push_back(Elt: cast<LoadInst>(Val: Inst));
542 return nullptr;
543 }
544
545 Value *Index = calculateVectorIndex(
546 Ptr: cast<LoadInst>(Val: Inst)->getPointerOperand(), GEPIdx: GEPVectorIdx);
547
548 // We're loading the full vector.
549 Type *AccessTy = Inst->getType();
550 TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
551 if (Constant *CI = dyn_cast<Constant>(Val: Index)) {
552 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
553 if (AccessTy->isPtrOrPtrVectorTy())
554 CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
555 else if (CurVal->getType()->isPtrOrPtrVectorTy())
556 CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
557 Value *NewVal = Builder.CreateBitOrPointerCast(V: CurVal, DestTy: AccessTy);
558 Inst->replaceAllUsesWith(V: NewVal);
559 return nullptr;
560 }
561 }
562
563 // Loading a subvector.
564 if (isa<FixedVectorType>(Val: AccessTy)) {
565 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
566 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
567 auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumLoadedElts);
568 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
569
570 Value *SubVec = PoisonValue::get(T: SubVecTy);
571 for (unsigned K = 0; K < NumLoadedElts; ++K) {
572 Value *CurIdx =
573 Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
574 SubVec = Builder.CreateInsertElement(
575 Vec: SubVec, NewElt: Builder.CreateExtractElement(Vec: CurVal, Idx: CurIdx), Idx: K);
576 }
577
578 if (AccessTy->isPtrOrPtrVectorTy())
579 SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
580 else if (SubVecTy->isPtrOrPtrVectorTy())
581 SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
582
583 SubVec = Builder.CreateBitOrPointerCast(V: SubVec, DestTy: AccessTy);
584 Inst->replaceAllUsesWith(V: SubVec);
585 return nullptr;
586 }
587
588 // We're loading one element.
589 Value *ExtractElement = Builder.CreateExtractElement(Vec: CurVal, Idx: Index);
590 if (AccessTy != VecEltTy)
591 ExtractElement = Builder.CreateBitOrPointerCast(V: ExtractElement, DestTy: AccessTy);
592
593 Inst->replaceAllUsesWith(V: ExtractElement);
594 return nullptr;
595 }
596 case Instruction::Store: {
    // For stores, it's a bit trickier: if we're storing the full vector, we
    // don't need to know the current value; if we're storing a single element
    // or a subvector, we do.
601 StoreInst *SI = cast<StoreInst>(Val: Inst);
602 Value *Index = calculateVectorIndex(Ptr: SI->getPointerOperand(), GEPIdx: GEPVectorIdx);
603 Value *Val = SI->getValueOperand();
604
    // We're storing the full vector, so we can handle this without knowing
    // CurVal.
606 Type *AccessTy = Val->getType();
607 TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
608 if (Constant *CI = dyn_cast<Constant>(Val: Index)) {
609 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
610 if (AccessTy->isPtrOrPtrVectorTy())
611 Val = CreateTempPtrIntCast(Val, AccessTy);
612 else if (VectorTy->isPtrOrPtrVectorTy())
613 Val = CreateTempPtrIntCast(Val, VectorTy);
614 return Builder.CreateBitOrPointerCast(V: Val, DestTy: VectorTy);
615 }
616 }
617
618 // Storing a subvector.
619 if (isa<FixedVectorType>(Val: AccessTy)) {
620 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
621 const unsigned NumWrittenElts =
622 AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
623 const unsigned NumVecElts = VectorTy->getNumElements();
624 auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumWrittenElts);
625 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
626
627 if (SubVecTy->isPtrOrPtrVectorTy())
628 Val = CreateTempPtrIntCast(Val, SubVecTy);
629 else if (AccessTy->isPtrOrPtrVectorTy())
630 Val = CreateTempPtrIntCast(Val, AccessTy);
631
632 Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: SubVecTy);
633
634 Value *CurVec = GetOrLoadCurrentVectorValue();
635 for (unsigned K = 0, NumElts = std::min(a: NumWrittenElts, b: NumVecElts);
636 K < NumElts; ++K) {
637 Value *CurIdx =
638 Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
639 CurVec = Builder.CreateInsertElement(
640 Vec: CurVec, NewElt: Builder.CreateExtractElement(Vec: Val, Idx: K), Idx: CurIdx);
641 }
642 return CurVec;
643 }
644
645 if (Val->getType() != VecEltTy)
646 Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: VecEltTy);
647 return Builder.CreateInsertElement(Vec: GetOrLoadCurrentVectorValue(), NewElt: Val,
648 Idx: Index);
649 }
650 case Instruction::Call: {
651 if (auto *MTI = dyn_cast<MemTransferInst>(Val: Inst)) {
      // For memcpy, we need to know the current value (CurVal).
653 ConstantInt *Length = cast<ConstantInt>(Val: MTI->getLength());
654 unsigned NumCopied = Length->getZExtValue() / ElementSize;
655 MemTransferInfo *TI = &TransferInfo[MTI];
656 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
657 unsigned DestBegin = TI->DestIndex->getZExtValue();
658
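      // For example (hypothetical sizes): copying 2 elements from index 1 to
      // index 3 of a 6-element vector produces the mask <0, 1, 2, 1, 2, 5>,
      // i.e. lanes 3-4 are taken from lanes 1-2 and all other lanes keep
      // their current value.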
659 SmallVector<int> Mask;
660 for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
661 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
662 Mask.push_back(Elt: SrcBegin < VectorTy->getNumElements()
663 ? SrcBegin++
664 : PoisonMaskElem);
665 } else {
666 Mask.push_back(Elt: Idx);
667 }
668 }
669
670 return Builder.CreateShuffleVector(V: GetOrLoadCurrentVectorValue(), Mask);
671 }
672
673 if (auto *MSI = dyn_cast<MemSetInst>(Val: Inst)) {
674 // For memset, we don't need to know the previous value because we
675 // currently only allow memsets that cover the whole alloca.
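      // For example (hypothetical): memset'ing a <4 x i32> alloca with the
      // byte 0xAB splats it to a <4 x i8>, bitcasts that to the i32
      // 0xABABABAB, and then splats this value across all four elements.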
676 Value *Elt = MSI->getOperand(i_nocapture: 1);
677 const unsigned BytesPerElt = DL.getTypeStoreSize(Ty: VecEltTy);
678 if (BytesPerElt > 1) {
679 Value *EltBytes = Builder.CreateVectorSplat(NumElts: BytesPerElt, V: Elt);
680
        // If the element type of the vector is a pointer, we need to first
        // bitcast to an integer, then use an inttoptr cast.
683 if (VecEltTy->isPointerTy()) {
684 Type *PtrInt = Builder.getIntNTy(N: BytesPerElt * 8);
685 Elt = Builder.CreateBitCast(V: EltBytes, DestTy: PtrInt);
686 Elt = Builder.CreateIntToPtr(V: Elt, DestTy: VecEltTy);
687 } else
688 Elt = Builder.CreateBitCast(V: EltBytes, DestTy: VecEltTy);
689 }
690
691 return Builder.CreateVectorSplat(EC: VectorTy->getElementCount(), V: Elt);
692 }
693
694 if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
695 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
696 Intr->replaceAllUsesWith(
697 V: Builder.getIntN(N: Intr->getType()->getIntegerBitWidth(),
698 C: DL.getTypeAllocSize(Ty: VectorTy)));
699 return nullptr;
700 }
701 }
702
703 llvm_unreachable("Unsupported call when promoting alloca to vector");
704 }
705
706 default:
707 llvm_unreachable("Inconsistency in instructions promotable to vector");
708 }
709
710 llvm_unreachable("Did not return after promoting instruction!");
711}
712
713static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
714 const DataLayout &DL) {
715 // Access as a vector type can work if the size of the access vector is a
716 // multiple of the size of the alloca's vector element type.
717 //
718 // Examples:
719 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
720 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
721 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
722 // - 3*32 is not a multiple of 64
723 //
724 // We could handle more complicated cases, but it'd make things a lot more
725 // complicated.
726 if (isa<FixedVectorType>(Val: AccessTy)) {
727 TypeSize AccTS = DL.getTypeStoreSize(Ty: AccessTy);
    // If the type size and the store size don't match, we would need to do
    // more than just a bitcast to translate between an extracted/insertable
    // subvector and the accessed value.
731 if (AccTS * 8 != DL.getTypeSizeInBits(Ty: AccessTy))
732 return false;
733 TypeSize VecTS = DL.getTypeStoreSize(Ty: VecTy->getElementType());
734 return AccTS.isKnownMultipleOf(RHS: VecTS);
735 }
736
737 return CastInst::isBitOrNoopPointerCastable(SrcTy: VecTy->getElementType(), DestTy: AccessTy,
738 DL);
739}
740
/// Iterates over an instruction worklist that may contain multiple
/// instructions from the same basic block, possibly out of program order.
/// Within each block, instructions are still visited in program order.
743template <typename InstContainer>
744static void forEachWorkListItem(const InstContainer &WorkList,
745 std::function<void(Instruction *)> Fn) {
746 // Bucket up uses of the alloca by the block they occur in.
747 // This is important because we have to handle multiple defs/uses in a block
748 // ourselves: SSAUpdater is purely for cross-block references.
749 DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
750 for (Instruction *User : WorkList)
751 UsesByBlock[User->getParent()].insert(V: User);
752
753 for (Instruction *User : WorkList) {
754 BasicBlock *BB = User->getParent();
755 auto &BlockUses = UsesByBlock[BB];
756
757 // Already processed, skip.
758 if (BlockUses.empty())
759 continue;
760
761 // Only user in the block, directly process it.
762 if (BlockUses.size() == 1) {
763 Fn(User);
764 continue;
765 }
766
767 // Multiple users in the block, do a linear scan to see users in order.
768 for (Instruction &Inst : *BB) {
769 if (!BlockUses.contains(V: &Inst))
770 continue;
771
772 Fn(&Inst);
773 }
774
775 // Clear the block so we know it's been processed.
776 BlockUses.clear();
777 }
778}
779
780/// Find an insert point after an alloca, after all other allocas clustered at
781/// the start of the block.
782static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
783 BasicBlock::iterator I) {
784 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(Val: *I); ++I)
785 ;
786 return I;
787}
788
789// FIXME: Should try to pick the most likely to be profitable allocas first.
790bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
791 LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
792
793 if (DisablePromoteAllocaToVector) {
794 LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n");
795 return false;
796 }
797
798 Type *AllocaTy = Alloca.getAllocatedType();
799 auto *VectorTy = dyn_cast<FixedVectorType>(Val: AllocaTy);
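  // For illustration (hypothetical types), the array flattening below turns
  // [2 x [4 x i16]] into NumElems = 8 with ElemTy = i16, and turns
  // [3 x <2 x float>] into NumElems = 6 with ElemTy = float, giving
  // <8 x i16> and <6 x float> respectively.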
800 if (auto *ArrayTy = dyn_cast<ArrayType>(Val: AllocaTy)) {
801 uint64_t NumElems = 1;
802 Type *ElemTy;
803 do {
804 NumElems *= ArrayTy->getNumElements();
805 ElemTy = ArrayTy->getElementType();
806 } while ((ArrayTy = dyn_cast<ArrayType>(Val: ElemTy)));
807
808 // Check for array of vectors
809 auto *InnerVectorTy = dyn_cast<FixedVectorType>(Val: ElemTy);
810 if (InnerVectorTy) {
811 NumElems *= InnerVectorTy->getNumElements();
812 ElemTy = InnerVectorTy->getElementType();
813 }
814
815 if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
816 unsigned ElementSize = DL->getTypeSizeInBits(Ty: ElemTy) / 8;
817 if (ElementSize > 0) {
818 unsigned AllocaSize = DL->getTypeStoreSize(Ty: AllocaTy);
        // Expand the vector if required to match the padding of the inner
        // type, i.e. odd-sized subvectors. The storage size of the new vector
        // must match that of the alloca so that byte offsets and GEP
        // computation behave correctly.
823 if (NumElems * ElementSize != AllocaSize)
824 NumElems = AllocaSize / ElementSize;
825 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
826 VectorTy = FixedVectorType::get(ElementType: ElemTy, NumElts: NumElems);
827 }
828 }
829 }
830
831 if (!VectorTy) {
832 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
833 return false;
834 }
835
836 const unsigned MaxElements =
837 (MaxVectorRegs * 32) / DL->getTypeSizeInBits(Ty: VectorTy->getElementType());
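  // E.g. with the default of 16 vector registers and a 32-bit element type,
  // MaxElements is (16 * 32) / 32 = 16, so anything wider than 16 elements
  // (or narrower than 2) is rejected below.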
838
839 if (VectorTy->getNumElements() > MaxElements ||
840 VectorTy->getNumElements() < 2) {
841 LLVM_DEBUG(dbgs() << " " << *VectorTy
842 << " has an unsupported number of elements\n");
843 return false;
844 }
845
846 std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
847 SmallVector<Instruction *> WorkList;
848 SmallVector<Instruction *> UsersToRemove;
849 SmallVector<Instruction *> DeferredInsts;
850 SmallVector<Instruction *> NewGEPInsts;
851 DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
852
853 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
854 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
855 << " " << *Inst << "\n");
856 for (auto *Inst : reverse(C&: NewGEPInsts))
857 Inst->eraseFromParent();
858 return false;
859 };
860
861 SmallVector<Use *, 8> Uses;
862 collectAllocaUses(Alloca, Uses);
863
864 LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
865
866 Type *VecEltTy = VectorTy->getElementType();
867 unsigned ElementSizeInBits = DL->getTypeSizeInBits(Ty: VecEltTy);
868 if (ElementSizeInBits != DL->getTypeAllocSizeInBits(Ty: VecEltTy)) {
869 LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
870 "does not match the type's size\n");
871 return false;
872 }
873 unsigned ElementSize = ElementSizeInBits / 8;
874 assert(ElementSize > 0);
875 for (auto *U : Uses) {
876 Instruction *Inst = cast<Instruction>(Val: U->getUser());
877
878 if (Value *Ptr = getLoadStorePointerOperand(V: Inst)) {
879 // This is a store of the pointer, not to the pointer.
880 if (isa<StoreInst>(Val: Inst) &&
881 U->getOperandNo() != StoreInst::getPointerOperandIndex())
882 return RejectUser(Inst, "pointer is being stored");
883
884 Type *AccessTy = getLoadStoreType(I: Inst);
885 if (AccessTy->isAggregateType())
886 return RejectUser(Inst, "unsupported load/store as aggregate");
887 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
888
889 // Check that this is a simple access of a vector element.
890 bool IsSimple = isa<LoadInst>(Val: Inst) ? cast<LoadInst>(Val: Inst)->isSimple()
891 : cast<StoreInst>(Val: Inst)->isSimple();
892 if (!IsSimple)
893 return RejectUser(Inst, "not a simple load or store");
894
895 Ptr = Ptr->stripPointerCasts();
896
897 // Alloca already accessed as vector.
898 if (Ptr == &Alloca && DL->getTypeStoreSize(Ty: Alloca.getAllocatedType()) ==
899 DL->getTypeStoreSize(Ty: AccessTy)) {
900 WorkList.push_back(Elt: Inst);
901 continue;
902 }
903
904 if (!isSupportedAccessType(VecTy: VectorTy, AccessTy, DL: *DL))
905 return RejectUser(Inst, "not a supported access type");
906
907 WorkList.push_back(Elt: Inst);
908 continue;
909 }
910
911 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
912 // If we can't compute a vector index from this GEP, then we can't
913 // promote this alloca to vector.
914 Value *Index = GEPToVectorIndex(GEP, Alloca: &Alloca, VecElemTy: VecEltTy, DL: *DL, NewInsts&: NewGEPInsts);
915 if (!Index)
916 return RejectUser(Inst, "cannot compute vector index for GEP");
917
918 GEPVectorIdx[GEP] = Index;
919 UsersToRemove.push_back(Elt: Inst);
920 continue;
921 }
922
923 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Val: Inst);
924 MSI && isSupportedMemset(I: MSI, AI: &Alloca, DL: *DL)) {
925 WorkList.push_back(Elt: Inst);
926 continue;
927 }
928
929 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Val: Inst)) {
930 if (TransferInst->isVolatile())
931 return RejectUser(Inst, "mem transfer inst is volatile");
932
933 ConstantInt *Len = dyn_cast<ConstantInt>(Val: TransferInst->getLength());
934 if (!Len || (Len->getZExtValue() % ElementSize))
935 return RejectUser(Inst, "mem transfer inst length is non-constant or "
936 "not a multiple of the vector element size");
937
938 if (TransferInfo.try_emplace(Key: TransferInst).second) {
939 DeferredInsts.push_back(Elt: Inst);
940 WorkList.push_back(Elt: Inst);
941 }
942
943 auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
944 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr);
945 if (Ptr != &Alloca && !GEPVectorIdx.count(x: GEP))
946 return nullptr;
947
948 return dyn_cast<ConstantInt>(Val: calculateVectorIndex(Ptr, GEPIdx: GEPVectorIdx));
949 };
950
951 unsigned OpNum = U->getOperandNo();
952 MemTransferInfo *TI = &TransferInfo[TransferInst];
953 if (OpNum == 0) {
954 Value *Dest = TransferInst->getDest();
955 ConstantInt *Index = getPointerIndexOfAlloca(Dest);
956 if (!Index)
957 return RejectUser(Inst, "could not calculate constant dest index");
958 TI->DestIndex = Index;
959 } else {
960 assert(OpNum == 1);
961 Value *Src = TransferInst->getSource();
962 ConstantInt *Index = getPointerIndexOfAlloca(Src);
963 if (!Index)
964 return RejectUser(Inst, "could not calculate constant src index");
965 TI->SrcIndex = Index;
966 }
967 continue;
968 }
969
970 if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
971 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
972 WorkList.push_back(Elt: Inst);
973 continue;
974 }
975 }
976
977 // Ignore assume-like intrinsics and comparisons used in assumes.
978 if (isAssumeLikeIntrinsic(I: Inst)) {
979 if (!Inst->use_empty())
980 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
981 UsersToRemove.push_back(Elt: Inst);
982 continue;
983 }
984
985 if (isa<ICmpInst>(Val: Inst) && all_of(Range: Inst->users(), P: [](User *U) {
986 return isAssumeLikeIntrinsic(I: cast<Instruction>(Val: U));
987 })) {
988 UsersToRemove.push_back(Elt: Inst);
989 continue;
990 }
991
992 return RejectUser(Inst, "unhandled alloca user");
993 }
994
995 while (!DeferredInsts.empty()) {
996 Instruction *Inst = DeferredInsts.pop_back_val();
997 MemTransferInst *TransferInst = cast<MemTransferInst>(Val: Inst);
    // TODO: Support the case where the pointers are from different allocas or
    // from different address spaces.
1000 MemTransferInfo &Info = TransferInfo[TransferInst];
1001 if (!Info.SrcIndex || !Info.DestIndex)
1002 return RejectUser(
1003 Inst, "mem transfer inst is missing constant src and/or dst index");
1004 }
1005
1006 LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
1007 << *VectorTy << '\n');
1008 const unsigned VecStoreSize = DL->getTypeStoreSize(Ty: VectorTy);
1009
1010 // Alloca is uninitialized memory. Imitate that by making the first value
1011 // undef.
1012 SSAUpdater Updater;
1013 Updater.Initialize(Ty: VectorTy, Name: "promotealloca");
1014
1015 BasicBlock *EntryBB = Alloca.getParent();
1016 BasicBlock::iterator InitInsertPos =
1017 skipToNonAllocaInsertPt(BB&: *EntryBB, I: Alloca.getIterator());
  // Alloca memory is undefined to begin with, not poison.
1019 Value *AllocaInitValue =
1020 new FreezeInst(PoisonValue::get(T: VectorTy), "", InitInsertPos);
1021 AllocaInitValue->takeName(V: &Alloca);
1022
1023 Updater.AddAvailableValue(BB: EntryBB, V: AllocaInitValue);
1024
1025 // First handle the initial worklist.
1026 SmallVector<LoadInst *, 4> DeferredLoads;
1027 forEachWorkListItem(WorkList, Fn: [&](Instruction *I) {
1028 BasicBlock *BB = I->getParent();
1029 // On the first pass, we only take values that are trivially known, i.e.
1030 // where AddAvailableValue was already called in this block.
1031 Value *Result = promoteAllocaUserToVector(
1032 Inst: I, DL: *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
1033 CurVal: Updater.FindValueForBlock(BB), DeferredLoads);
1034 if (Result)
1035 Updater.AddAvailableValue(BB, V: Result);
1036 });
1037
1038 // Then handle deferred loads.
1039 forEachWorkListItem(WorkList: DeferredLoads, Fn: [&](Instruction *I) {
1040 SmallVector<LoadInst *, 0> NewDLs;
1041 BasicBlock *BB = I->getParent();
1042 // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
1043 // get a value, inserting PHIs as needed.
1044 Value *Result = promoteAllocaUserToVector(
1045 Inst: I, DL: *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
1046 CurVal: Updater.GetValueInMiddleOfBlock(BB: I->getParent()), DeferredLoads&: NewDLs);
1047 if (Result)
1048 Updater.AddAvailableValue(BB, V: Result);
1049 assert(NewDLs.empty() && "No more deferred loads should be queued!");
1050 });
1051
  // Delete all instructions. On the first pass, new dummy loads may have been
  // added, so we need to collect them too.
1054 DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
1055 InstsToDelete.insert_range(R&: DeferredLoads);
1056 for (Instruction *I : InstsToDelete) {
1057 assert(I->use_empty());
1058 I->eraseFromParent();
1059 }
1060
  // Delete all the users that are known to be removable.
1062 for (Instruction *I : reverse(C&: UsersToRemove)) {
1063 I->dropDroppableUses();
1064 assert(I->use_empty());
1065 I->eraseFromParent();
1066 }
1067
1068 // Alloca should now be dead too.
1069 assert(Alloca.use_empty());
1070 Alloca.eraseFromParent();
1071 return true;
1072}
1073
1074std::pair<Value *, Value *>
1075AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1076 Function &F = *Builder.GetInsertBlock()->getParent();
1077 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1078
1079 if (!IsAMDHSA) {
1080 CallInst *LocalSizeY =
1081 Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_y, Args: {});
1082 CallInst *LocalSizeZ =
1083 Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_z, Args: {});
1084
1085 ST.makeLIDRangeMetadata(I: LocalSizeY);
1086 ST.makeLIDRangeMetadata(I: LocalSizeZ);
1087
1088 return std::pair(LocalSizeY, LocalSizeZ);
1089 }
1090
1091 // We must read the size out of the dispatch pointer.
1092 assert(IsAMDGCN);
1093
1094 // We are indexing into this struct, and want to extract the workgroup_size_*
1095 // fields.
1096 //
1097 // typedef struct hsa_kernel_dispatch_packet_s {
1098 // uint16_t header;
1099 // uint16_t setup;
  //   uint16_t workgroup_size_x;
  //   uint16_t workgroup_size_y;
  //   uint16_t workgroup_size_z;
  //   uint16_t reserved0;
  //   uint32_t grid_size_x;
  //   uint32_t grid_size_y;
  //   uint32_t grid_size_z;
1107 //
1108 // uint32_t private_segment_size;
1109 // uint32_t group_segment_size;
1110 // uint64_t kernel_object;
1111 //
1112 // #ifdef HSA_LARGE_MODEL
1113 // void *kernarg_address;
1114 // #elif defined HSA_LITTLE_ENDIAN
1115 // void *kernarg_address;
1116 // uint32_t reserved1;
1117 // #else
1118 // uint32_t reserved1;
1119 // void *kernarg_address;
1120 // #endif
1121 // uint64_t reserved2;
1122 // hsa_signal_t completion_signal; // uint64_t wrapper
1123 // } hsa_kernel_dispatch_packet_t
1124 //
1125 CallInst *DispatchPtr =
1126 Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_dispatch_ptr, Args: {});
1127 DispatchPtr->addRetAttr(Kind: Attribute::NoAlias);
1128 DispatchPtr->addRetAttr(Kind: Attribute::NonNull);
1129 F.removeFnAttr(Kind: "amdgpu-no-dispatch-ptr");
1130
1131 // Size of the dispatch packet struct.
1132 DispatchPtr->addDereferenceableRetAttr(Bytes: 64);
1133
1134 Type *I32Ty = Type::getInt32Ty(C&: Mod->getContext());
1135
1136 // We could do a single 64-bit load here, but it's likely that the basic
1137 // 32-bit and extract sequence is already present, and it is probably easier
1138 // to CSE this. The loads should be mergeable later anyway.
1139 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 1);
1140 LoadInst *LoadXY = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPXY, Align: Align(4));
1141
1142 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 2);
1143 LoadInst *LoadZU = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPZU, Align: Align(4));
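  // Dword 1 packs workgroup_size_x (low 16 bits) with workgroup_size_y (high
  // 16 bits); dword 2 packs workgroup_size_z (low 16 bits) with reserved0,
  // which is expected to be zero (see the comment on the lshr below).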
1144
1145 MDNode *MD = MDNode::get(Context&: Mod->getContext(), MDs: {});
1146 LoadXY->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
1147 LoadZU->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
1148 ST.makeLIDRangeMetadata(I: LoadZU);
1149
1150 // Extract y component. Upper half of LoadZU should be zero already.
1151 Value *Y = Builder.CreateLShr(LHS: LoadXY, RHS: 16);
1152
1153 return std::pair(Y, LoadZU);
1154}
1155
1156Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1157 unsigned N) {
1158 Function *F = Builder.GetInsertBlock()->getParent();
1159 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: *F);
1160 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1161 StringRef AttrName;
1162
1163 switch (N) {
1164 case 0:
1165 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1166 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1167 AttrName = "amdgpu-no-workitem-id-x";
1168 break;
1169 case 1:
1170 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1171 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1172 AttrName = "amdgpu-no-workitem-id-y";
1173 break;
1174
1175 case 2:
1176 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1177 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1178 AttrName = "amdgpu-no-workitem-id-z";
1179 break;
1180 default:
1181 llvm_unreachable("invalid dimension");
1182 }
1183
1184 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(M: Mod, id: IntrID);
1185 CallInst *CI = Builder.CreateCall(Callee: WorkitemIdFn);
1186 ST.makeLIDRangeMetadata(I: CI);
1187 F->removeFnAttr(Kind: AttrName);
1188
1189 return CI;
1190}
1191
1192static bool isCallPromotable(CallInst *CI) {
1193 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI);
1194 if (!II)
1195 return false;
1196
1197 switch (II->getIntrinsicID()) {
1198 case Intrinsic::memcpy:
1199 case Intrinsic::memmove:
1200 case Intrinsic::memset:
1201 case Intrinsic::lifetime_start:
1202 case Intrinsic::lifetime_end:
1203 case Intrinsic::invariant_start:
1204 case Intrinsic::invariant_end:
1205 case Intrinsic::launder_invariant_group:
1206 case Intrinsic::strip_invariant_group:
1207 case Intrinsic::objectsize:
1208 return true;
1209 default:
1210 return false;
1211 }
1212}
1213
1214bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1215 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1216 int OpIdx1) const {
1217 // Figure out which operand is the one we might not be promoting.
1218 Value *OtherOp = Inst->getOperand(i: OpIdx0);
1219 if (Val == OtherOp)
1220 OtherOp = Inst->getOperand(i: OpIdx1);
1221
1222 if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: OtherOp))
1223 return true;
1224
1225 // TODO: getUnderlyingObject will not work on a vector getelementptr
1226 Value *OtherObj = getUnderlyingObject(V: OtherOp);
1227 if (!isa<AllocaInst>(Val: OtherObj))
1228 return false;
1229
1230 // TODO: We should be able to replace undefs with the right pointer type.
1231
1232 // TODO: If we know the other base object is another promotable
1233 // alloca, not necessarily this alloca, we can do this. The
1234 // important part is both must have the same address space at
1235 // the end.
1236 if (OtherObj != BaseAlloca) {
1237 LLVM_DEBUG(
1238 dbgs() << "Found a binary instruction with another alloca object\n");
1239 return false;
1240 }
1241
1242 return true;
1243}
1244
1245bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
1246 Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
1247
1248 for (User *User : Val->users()) {
1249 if (is_contained(Range&: WorkList, Element: User))
1250 continue;
1251
1252 if (CallInst *CI = dyn_cast<CallInst>(Val: User)) {
1253 if (!isCallPromotable(CI))
1254 return false;
1255
1256 WorkList.push_back(x: User);
1257 continue;
1258 }
1259
1260 Instruction *UseInst = cast<Instruction>(Val: User);
1261 if (UseInst->getOpcode() == Instruction::PtrToInt)
1262 return false;
1263
1264 if (LoadInst *LI = dyn_cast<LoadInst>(Val: UseInst)) {
1265 if (LI->isVolatile())
1266 return false;
1267 continue;
1268 }
1269
1270 if (StoreInst *SI = dyn_cast<StoreInst>(Val: UseInst)) {
1271 if (SI->isVolatile())
1272 return false;
1273
1274 // Reject if the stored value is not the pointer operand.
1275 if (SI->getPointerOperand() != Val)
1276 return false;
1277 continue;
1278 }
1279
1280 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UseInst)) {
1281 if (RMW->isVolatile())
1282 return false;
1283 continue;
1284 }
1285
1286 if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(Val: UseInst)) {
1287 if (CAS->isVolatile())
1288 return false;
1289 continue;
1290 }
1291
1292 // Only promote a select if we know that the other select operand
1293 // is from another pointer that will also be promoted.
1294 if (ICmpInst *ICmp = dyn_cast<ICmpInst>(Val: UseInst)) {
1295 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: ICmp, OpIdx0: 0, OpIdx1: 1))
1296 return false;
1297
1298 // May need to rewrite constant operands.
1299 WorkList.push_back(x: ICmp);
1300 continue;
1301 }
1302
1303 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: UseInst)) {
1304 // Be conservative if an address could be computed outside the bounds of
1305 // the alloca.
1306 if (!GEP->isInBounds())
1307 return false;
1308 } else if (SelectInst *SI = dyn_cast<SelectInst>(Val: UseInst)) {
1309 // Only promote a select if we know that the other select operand is from
1310 // another pointer that will also be promoted.
1311 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: SI, OpIdx0: 1, OpIdx1: 2))
1312 return false;
1313 } else if (PHINode *Phi = dyn_cast<PHINode>(Val: UseInst)) {
1314 // Repeat for phis.
1315
1316 // TODO: Handle more complex cases. We should be able to replace loops
1317 // over arrays.
1318 switch (Phi->getNumIncomingValues()) {
1319 case 1:
1320 break;
1321 case 2:
1322 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: Phi, OpIdx0: 0, OpIdx1: 1))
1323 return false;
1324 break;
1325 default:
1326 return false;
1327 }
1328 } else if (!isa<ExtractElementInst>(Val: User)) {
1329 // Do not promote vector/aggregate type instructions. It is hard to track
1330 // their users.
1331
1332 // Do not promote addrspacecast.
1333 //
1334 // TODO: If we know the address is only observed through flat pointers, we
1335 // could still promote.
1336 return false;
1337 }
1338
1339 WorkList.push_back(x: User);
1340 if (!collectUsesWithPtrTypes(BaseAlloca, Val: User, WorkList))
1341 return false;
1342 }
1343
1344 return true;
1345}
1346
1347bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1348
1349 FunctionType *FTy = F.getFunctionType();
1350 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1351
1352 // If the function has any arguments in the local address space, then it's
1353 // possible these arguments require the entire local memory space, so
1354 // we cannot use local memory in the pass.
1355 for (Type *ParamTy : FTy->params()) {
1356 PointerType *PtrTy = dyn_cast<PointerType>(Val: ParamTy);
1357 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1358 LocalMemLimit = 0;
1359 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1360 "local memory disabled.\n");
1361 return false;
1362 }
1363 }
1364
1365 LocalMemLimit = ST.getAddressableLocalMemorySize();
1366 if (LocalMemLimit == 0)
1367 return false;
1368
1369 SmallVector<const Constant *, 16> Stack;
1370 SmallPtrSet<const Constant *, 8> VisitedConstants;
1371 SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
1372
1373 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1374 for (const User *U : Val->users()) {
1375 if (const Instruction *Use = dyn_cast<Instruction>(Val: U)) {
1376 if (Use->getParent()->getParent() == &F)
1377 return true;
1378 } else {
1379 const Constant *C = cast<Constant>(Val: U);
1380 if (VisitedConstants.insert(Ptr: C).second)
1381 Stack.push_back(Elt: C);
1382 }
1383 }
1384
1385 return false;
1386 };
1387
1388 for (GlobalVariable &GV : Mod->globals()) {
1389 if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
1390 continue;
1391
1392 if (visitUsers(&GV, &GV)) {
1393 UsedLDS.insert(Ptr: &GV);
1394 Stack.clear();
1395 continue;
1396 }
1397
1398 // For any ConstantExpr uses, we need to recursively search the users until
1399 // we see a function.
1400 while (!Stack.empty()) {
1401 const Constant *C = Stack.pop_back_val();
1402 if (visitUsers(&GV, C)) {
1403 UsedLDS.insert(Ptr: &GV);
1404 Stack.clear();
1405 break;
1406 }
1407 }
1408 }
1409
1410 const DataLayout &DL = Mod->getDataLayout();
1411 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1412 AllocatedSizes.reserve(N: UsedLDS.size());
1413
1414 for (const GlobalVariable *GV : UsedLDS) {
1415 Align Alignment =
1416 DL.getValueOrABITypeAlignment(Alignment: GV->getAlign(), Ty: GV->getValueType());
1417 uint64_t AllocSize = DL.getTypeAllocSize(Ty: GV->getValueType());
1418
1419 // HIP uses an extern unsized array in local address space for dynamically
1420 // allocated shared memory. In that case, we have to disable the promotion.
1421 if (GV->hasExternalLinkage() && AllocSize == 0) {
1422 LocalMemLimit = 0;
1423 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1424 "local memory. Promoting to local memory "
1425 "disabled.\n");
1426 return false;
1427 }
1428
1429 AllocatedSizes.emplace_back(Args&: AllocSize, Args&: Alignment);
1430 }
1431
  // Sort to try to estimate the worst-case alignment padding.
  //
  // FIXME: We should really do something to fix the addresses to a more
  // optimal value instead.
1436 llvm::sort(C&: AllocatedSizes, Comp: llvm::less_second());
1437
1438 // Check how much local memory is being used by global objects
1439 CurrentLocalMemUsage = 0;
1440
  // FIXME: Try to account for padding here. The real padding and address are
  // currently determined from the inverse order of uses in the function when
  // legalizing, which could also potentially change. We try to estimate the
  // worst case here, but we probably should fix the addresses earlier.
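  // For example (hypothetical globals): a 4-byte global with align 4 followed
  // by a 32-byte global with align 16 is counted as 16 + 32 = 48 bytes, since
  // 12 bytes of padding are assumed to reach the second global's alignment.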
1445 for (auto Alloc : AllocatedSizes) {
1446 CurrentLocalMemUsage = alignTo(Size: CurrentLocalMemUsage, A: Alloc.second);
1447 CurrentLocalMemUsage += Alloc.first;
1448 }
1449
1450 unsigned MaxOccupancy =
1451 ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes: CurrentLocalMemUsage, F)
1452 .second;
1453
1454 // Round up to the next tier of usage.
1455 unsigned MaxSizeWithWaveCount =
1456 ST.getMaxLocalMemSizeWithWaveCount(WaveCount: MaxOccupancy, F);
1457
1458 // Program may already use more LDS than is usable at maximum occupancy.
1459 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1460 return false;
1461
1462 LocalMemLimit = MaxSizeWithWaveCount;
1463
1464 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1465 << " bytes of LDS\n"
1466 << " Rounding size to " << MaxSizeWithWaveCount
1467 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1468 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1469 << " available for promotion\n");
1470
1471 return true;
1472}
1473
1474// FIXME: Should try to pick the most likely to be profitable allocas first.
1475bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
1476 bool SufficientLDS) {
1477 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');
1478
1479 if (DisablePromoteAllocaToLDS) {
1480 LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1481 return false;
1482 }
1483
1484 const DataLayout &DL = Mod->getDataLayout();
1485 IRBuilder<> Builder(&I);
1486
1487 const Function &ContainingFunction = *I.getParent()->getParent();
1488 CallingConv::ID CC = ContainingFunction.getCallingConv();
1489
  // Don't promote the alloca to LDS for shader calling conventions, as the
  // work item ID intrinsics are not supported for these calling conventions.
  // Furthermore, not all LDS is available for some of the stages.
1493 switch (CC) {
1494 case CallingConv::AMDGPU_KERNEL:
1495 case CallingConv::SPIR_KERNEL:
1496 break;
1497 default:
1498 LLVM_DEBUG(
1499 dbgs()
1500 << " promote alloca to LDS not supported with calling convention.\n");
1501 return false;
1502 }
1503
1504 // Not likely to have sufficient local memory for promotion.
1505 if (!SufficientLDS)
1506 return false;
1507
1508 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: ContainingFunction);
1509 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F: ContainingFunction).second;
1510
1511 Align Alignment =
1512 DL.getValueOrABITypeAlignment(Alignment: I.getAlign(), Ty: I.getAllocatedType());
1513
1514 // FIXME: This computed padding is likely wrong since it depends on inverse
1515 // usage order.
1516 //
  // FIXME: It is also possible that, if we're allowed to use all of the
  // memory, we could end up using more than the maximum due to alignment
  // padding.
1519
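  // LDS is shared across the workgroup, so promoting this alloca requires one
  // copy of the allocated type per workitem in the largest possible workgroup.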
  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize =
      WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << " " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  std::vector<Value *> WorkList;

  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();

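  // Back the alloca with a module-level LDS array containing one element per
  // workitem; each lane will address its own slot below.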
  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(I.getAlign());

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

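  // Linearize the 3D workitem ID:
  //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ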
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  LLVMContext &Context = Mod->getContext();
  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};

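  // GEP into the LDS array at [0, TID] so every lane gets a distinct slot,
  // then replace the alloca with that address.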
  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  SmallVector<IntrinsicInst *> DeferredIntrs;

  PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);

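  // Rewrite the collected pointer users so their types (and any constant
  // pointer operands) refer to the LDS address space.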
  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        Value *LHS = CI->getOperand(0);
        Value *RHS = CI->getOperand(1);

        Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
        if (isa<ConstantPointerNull, ConstantAggregateZero>(LHS))
          CI->setOperand(0, Constant::getNullValue(NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(RHS))
          CI->setOperand(1, Constant::getNullValue(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want to
      // touch the users.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      assert(V->getType()->isPtrOrPtrVectorTy());

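      // For other pointer-typed users (e.g. GEPs, selects, phis), switch the
      // pointer type to the LDS address space in place.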
      Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(1)))
          SI->setOperand(1, Constant::getNullValue(NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(2)))
          SI->setOperand(2, Constant::getNullValue(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull, ConstantAggregateZero>(
                  Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
        }
      }

      continue;
    }

    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // These have 2 pointer operands. In case the second pointer also needs
      // to be replaced, we defer processing of these intrinsics until all
      // other values are processed.
      DeferredIntrs.push_back(Intr);
      continue;
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getDestAlign(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group: {
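      // Rebuild the call against an intrinsic declaration overloaded on the
      // new LDS pointer type; the original leading operands are kept and the
      // promoted pointer is passed in place of the old alloca pointer.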
      SmallVector<Value *> Args;
      if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
        Args.emplace_back(Intr->getArgOperand(0));
      } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
        Args.emplace_back(Intr->getArgOperand(0));
        Args.emplace_back(Intr->getArgOperand(1));
      }
      Args.emplace_back(Offset);
      Function *F = Intrinsic::getOrInsertDeclaration(
          Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
      CallInst *NewIntr =
          CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
      Intr->mutateType(NewIntr->getType());
      Intr->replaceAllUsesWith(NewIntr);
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);

      CallInst *NewCall = Builder.CreateIntrinsic(
          Intrinsic::objectsize,
          {Intr->getType(), PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)},
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }

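  // All other pointer users have been retyped now, so the deferred
  // memcpy/memmove calls can be recreated with their (possibly updated)
  // source and destination operands.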
  for (IntrinsicInst *Intr : DeferredIntrs) {
    Builder.SetInsertPoint(Intr);
    Intrinsic::ID ID = Intr->getIntrinsicID();
    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

    MemTransferInst *MI = cast<MemTransferInst>(Intr);
    auto *B = Builder.CreateMemTransferInst(
        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());

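    // Preserve the dereferenceable byte counts on both pointer arguments of
    // the new call.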
    for (unsigned I = 0; I != 2; ++I) {
      if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
        B->addDereferenceableParamAttr(I, Bytes);
      }
    }

    Intr->eraseFromParent();
  }

  return true;
}
