AMDGPUPromoteAlloca.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp]

1	//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// Eliminates allocas by either converting them into vectors or by migrating
10	// them to local address space.
11	//
12	// Two passes are exposed by this file:
13	// - "promote-alloca-to-vector", which runs early in the pipeline and only
14	// promotes to vector. Promotion to vector is almost always profitable
15	// except when the alloca is too big and the promotion would result in
16	// very high register pressure.
17	// - "promote-alloca", which does both promotion to vector and LDS and runs
18	// much later in the pipeline. This runs after SROA because promoting to
19	// LDS is of course less profitable than getting rid of the alloca or
20	// vectorizing it, thus we only want to do it when the only alternative is
21	// lowering the alloca to stack.
22	//
23	// Note that both of them exist for the old and new PMs. The new PM passes are
24	// declared in AMDGPU.h and the legacy PM ones are declared here.s
25	//
26	//===----------------------------------------------------------------------===//
27
28	#include "AMDGPU.h"
29	#include "GCNSubtarget.h"
30	#include "Utils/AMDGPUBaseInfo.h"
31	#include "llvm/ADT/STLExtras.h"
32	#include "llvm/Analysis/CaptureTracking.h"
33	#include "llvm/Analysis/InstSimplifyFolder.h"
34	#include "llvm/Analysis/InstructionSimplify.h"
35	#include "llvm/Analysis/LoopInfo.h"
36	#include "llvm/Analysis/ValueTracking.h"
37	#include "llvm/CodeGen/TargetPassConfig.h"
38	#include "llvm/IR/IRBuilder.h"
39	#include "llvm/IR/IntrinsicInst.h"
40	#include "llvm/IR/IntrinsicsAMDGPU.h"
41	#include "llvm/IR/IntrinsicsR600.h"
42	#include "llvm/IR/PatternMatch.h"
43	#include "llvm/InitializePasses.h"
44	#include "llvm/Pass.h"
45	#include "llvm/Target/TargetMachine.h"
46	#include "llvm/Transforms/Utils/SSAUpdater.h"
47
48	#define DEBUG_TYPE "amdgpu-promote-alloca"
49
50	using namespace llvm;
51
52	namespace {
53
54	static cl::opt<bool>
55	DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
56	cl::desc ("Disable promote alloca to vector"),
57	cl::init(Val: false));
58
59	static cl::opt<bool>
60	DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
61	cl::desc ("Disable promote alloca to LDS"),
62	cl::init(Val: false));
63
64	static cl::opt<unsigned> PromoteAllocaToVectorLimit(
65	"amdgpu-promote-alloca-to-vector-limit",
66	cl::desc ("Maximum byte size to consider promote alloca to vector"),
67	cl::init(Val: `0`));
68
69	static cl::opt<unsigned>
70	LoopUserWeight("promote-alloca-vector-loop-user-weight",
71	cl::desc ("The bonus weight of users of allocas within loop "
72	"when sorting profitable allocas"),
73	cl::init(Val: `4`));
74
75	// Shared implementation which can do both promotion to vector and to LDS.
76	class AMDGPUPromoteAllocaImpl {
77	private:
78	const TargetMachine &TM;
79	LoopInfo &LI;
80	Module Mod = nullptr*;
81	const DataLayout DL = nullptr*;
82
83	// FIXME: This should be per-kernel.
84	uint32_t LocalMemLimit = `0`;
85	uint32_t CurrentLocalMemUsage = `0`;
86	unsigned MaxVGPRs;
87
88	bool IsAMDGCN = false;
89	bool IsAMDHSA = false;
90
91	std::pair<Value , Value > getLocalSizeYZ(IRBuilder<> &Builder);
92	Value getWorkitemID(IRBuilder<> &Builder, unsigned* N);
93
94	/// BaseAlloca is the alloca root the search started from.
95	/// Val may be that alloca or a recursive user of it.
96	bool collectUsesWithPtrTypes(Value BaseAlloca, Value Val,
97	std::vector<Value > &WorkList) const*;
98
99	/// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
100	/// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
101	/// Returns true if both operands are derived from the same alloca. Val should
102	/// be the same value as one of the input operands of UseInst.
103	bool binaryOpIsDerivedFromSameAlloca(Value Alloca, Value Val,
104	Instruction UseInst, int* OpIdx0,
105	int OpIdx1) const;
106
107	/// Check whether we have enough local memory for promotion.
108	bool hasSufficientLocalMem(const Function &F);
109
110	bool tryPromoteAllocaToVector(AllocaInst &I);
111	bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
112
113	void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
114
115	public:
116	AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
117
118	const Triple &TT = TM.getTargetTriple();
119	IsAMDGCN = TT.getArch() == Triple::amdgcn;
120	IsAMDHSA = TT.getOS() == Triple::AMDHSA;
121	}
122
123	bool run(Function &F, bool PromoteToLDS);
124	};
125
126	// FIXME: This can create globals so should be a module pass.
127	class AMDGPUPromoteAlloca : public FunctionPass {
128	public:
129	static char ID;
130
131	AMDGPUPromoteAlloca() : FunctionPass (ID) {}
132
133	bool runOnFunction(Function &F) override {
134	if (skipFunction(F))
135	return false;
136	if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
137	return AMDGPUPromoteAllocaImpl (
138	TPC->getTM<TargetMachine>(),
139	getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
140	.run(F, /PromoteToLDS/ true);
141	return false;
142	}
143
144	StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
145
146	void getAnalysisUsage(AnalysisUsage &AU) const override {
147	AU.setPreservesCFG();
148	AU.addRequired<LoopInfoWrapperPass>();
149	FunctionPass::getAnalysisUsage(AU);
150	}
151	};
152
153	class AMDGPUPromoteAllocaToVector : public FunctionPass {
154	public:
155	static char ID;
156
157	AMDGPUPromoteAllocaToVector() : FunctionPass (ID) {}
158
159	bool runOnFunction(Function &F) override {
160	if (skipFunction(F))
161	return false;
162	if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
163	return AMDGPUPromoteAllocaImpl (
164	TPC->getTM<TargetMachine>(),
165	getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
166	.run(F, /PromoteToLDS/ false);
167	return false;
168	}
169
170	StringRef getPassName() const override {
171	return "AMDGPU Promote Alloca to vector";
172	}
173
174	void getAnalysisUsage(AnalysisUsage &AU) const override {
175	AU.setPreservesCFG();
176	AU.addRequired<LoopInfoWrapperPass>();
177	FunctionPass::getAnalysisUsage(AU);
178	}
179	};
180
181	unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) {
182	if (!TM.getTargetTriple().isAMDGCN())
183	return `128`;
184
185	const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
186	unsigned MaxVGPRs = ST.getMaxNumVGPRs(WavesPerEU: ST.getWavesPerEU(F).first);
187
188	// A non-entry function has only 32 caller preserved registers.
189	// Do not promote alloca which will force spilling unless we know the function
190	// will be inlined.
191	if (!F.hasFnAttribute(Kind: Attribute::AlwaysInline) &&
192	!AMDGPU::isEntryFunctionCC(CC: F.getCallingConv()))
193	MaxVGPRs = std::min(a: MaxVGPRs, b: `32u`);
194	return MaxVGPRs;
195	}
196
197	} // end anonymous namespace
198
199	char AMDGPUPromoteAlloca::ID = `0`;
200	char AMDGPUPromoteAllocaToVector::ID = `0`;
201
202	INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
203	"AMDGPU promote alloca to vector or LDS", false, false)
204	// Move LDS uses from functions to kernels before promote alloca for accurate
205	// estimation of LDS available
206	INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
207	INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
208	INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
209	"AMDGPU promote alloca to vector or LDS", false, false)
210
211	INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
212	"AMDGPU promote alloca to vector", false, false)
213	INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
214	INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
215	"AMDGPU promote alloca to vector", false, false)
216
217	char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
218	char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
219
220	PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
221	FunctionAnalysisManager &AM) {
222	auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
223	bool Changed = AMDGPUPromoteAllocaImpl (TM, LI).run(F, /PromoteToLDS=/true);
224	if (Changed) {
225	PreservedAnalyses PA;
226	PA.preserveSet<CFGAnalyses>();
227	return PA;
228	}
229	return PreservedAnalyses::all();
230	}
231
232	PreservedAnalyses
233	AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
234	auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
235	bool Changed = AMDGPUPromoteAllocaImpl (TM, LI).run(F, /PromoteToLDS=/false);
236	if (Changed) {
237	PreservedAnalyses PA;
238	PA.preserveSet<CFGAnalyses>();
239	return PA;
240	}
241	return PreservedAnalyses::all();
242	}
243
244	FunctionPass *llvm::createAMDGPUPromoteAlloca() {
245	return new AMDGPUPromoteAlloca ();
246	}
247
248	FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
249	return new AMDGPUPromoteAllocaToVector ();
250	}
251
252	static void collectAllocaUses(AllocaInst &Alloca,
253	SmallVectorImpl<Use *> &Uses) {
254	SmallVector<Instruction *, `4`> WorkList({&Alloca});
255	while (!WorkList.empty()) {
256	auto *Cur = WorkList.pop_back_val();
257	for (auto &U : Cur->uses()) {
258	Uses.push_back(Elt: &U);
259
260	if (isa<GetElementPtrInst>(Val: U.getUser()))
261	WorkList.push_back(Elt: cast<Instruction>(Val: U.getUser()));
262	}
263	}
264	}
265
266	void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
267	SmallVectorImpl<AllocaInst *> &Allocas) {
268	DenseMap<AllocaInst , unsigned*> Scores;
269
270	for (auto *Alloca : Allocas) {
271	LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
272	unsigned &Score = Scores [Alloca];
273	// Increment score by one for each user + a bonus for users within loops.
274	SmallVector<Use *, `8`> Uses;
275	collectAllocaUses(Alloca&: *Alloca, Uses);
276	for (auto *U : Uses) {
277	Instruction *Inst = cast<Instruction>(Val: U->getUser());
278	if (isa<GetElementPtrInst>(Val: Inst))
279	continue;
280	unsigned UserScore =
281	`1` + (LoopUserWeight * LI.getLoopDepth(BB: Inst->getParent()));
282	LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
283	Score += UserScore;
284	}
285	LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
286	}
287
288	stable_sort(Range&: Allocas, C: [&](AllocaInst A, AllocaInst B) {
289	return Scores.at(Val: A) > Scores.at(Val: B);
290	});
291
292	// clang-format off
293	LLVM_DEBUG(
294	dbgs() << "Sorted Worklist:\n";
295	for (auto *A: Allocas)
296	dbgs() << " " << *A << "\n";
297	);
298	// clang-format on
299	}
300
301	bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
302	Mod = F.getParent();
303	DL = &Mod->getDataLayout();
304
305	const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
306	if (!ST.isPromoteAllocaEnabled())
307	return false;
308
309	MaxVGPRs = getMaxVGPRs(TM, F);
310
311	bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
312
313	// Use up to 1/4 of available register budget for vectorization.
314	// FIXME: Increase the limit for whole function budgets? Perhaps x2?
315	unsigned VectorizationBudget =
316	(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * `8`
317	: (MaxVGPRs * `32`)) /
318	`4`;
319
320	SmallVector<AllocaInst *, `16`> Allocas;
321	for (Instruction &I : F.getEntryBlock()) {
322	if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: &I)) {
323	// Array allocations are probably not worth handling, since an allocation
324	// of the array type is the canonical form.
325	if (!AI->isStaticAlloca() \|\| AI->isArrayAllocation())
326	continue;
327	Allocas.push_back(Elt: AI);
328	}
329	}
330
331	sortAllocasToPromote(Allocas);
332
333	bool Changed = false;
334	for (AllocaInst *AI : Allocas) {
335	const unsigned AllocaCost = DL->getTypeSizeInBits(Ty: AI->getAllocatedType());
336	// First, check if we have enough budget to vectorize this alloca.
337	if (AllocaCost <= VectorizationBudget) {
338	// If we do, attempt vectorization, otherwise, fall through and try
339	// promoting to LDS instead.
340	if (tryPromoteAllocaToVector(I&: *AI)) {
341	Changed = true;
342	assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
343	"Underflow!");
344	VectorizationBudget -= AllocaCost;
345	LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
346	<< VectorizationBudget << "\n");
347	continue;
348	}
349	} else {
350	LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
351	<< AllocaCost << ", budget:" << VectorizationBudget
352	<< "): " << *AI << "\n");
353	}
354
355	if (PromoteToLDS && tryPromoteAllocaToLDS(I&: *AI, SufficientLDS))
356	Changed = true;
357	}
358
359	// NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
360	// dangling pointers. If we want to reuse it past this point, the loop above
361	// would need to be updated to remove successfully promoted allocas.
362
363	return Changed;
364	}
365
366	struct MemTransferInfo {
367	ConstantInt SrcIndex = nullptr*;
368	ConstantInt DestIndex = nullptr*;
369	};
370
371	// Checks if the instruction I is a memset user of the alloca AI that we can
372	// deal with. Currently, only non-volatile memsets that affect the whole alloca
373	// are handled.
374	static bool isSupportedMemset(MemSetInst I, AllocaInst AI,
375	const DataLayout &DL) {
376	using namespace PatternMatch;
377	// For now we only care about non-volatile memsets that affect the whole type
378	// (start at index 0 and fill the whole alloca).
379	//
380	// TODO: Now that we moved to PromoteAlloca we could handle any memsets
381	// (except maybe volatile ones?) - we just need to use shufflevector if it
382	// only affects a subset of the vector.
383	const unsigned Size = DL.getTypeStoreSize(Ty: AI->getAllocatedType());
384	return I->getOperand(i_nocapture: `0`) == AI &&
385	match(V: I->getOperand(i_nocapture: `2`), P: m_SpecificInt(V: Size)) && !I->isVolatile();
386	}
387
388	static Value *
389	calculateVectorIndex(Value *Ptr,
390	const std::map<GetElementPtrInst , Value > &GEPIdx) {
391	auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr->stripPointerCasts());
392	if (!GEP)
393	return ConstantInt::getNullValue(Ty: Type::getInt32Ty(C&: Ptr->getContext()));
394
395	auto I = GEPIdx.find(x: GEP);
396	assert(I != GEPIdx.end() && "Must have entry for GEP!");
397	return I ->second;
398	}
399
400	static Value GEPToVectorIndex(GetElementPtrInst GEP, AllocaInst *Alloca,
401	Type VecElemTy, const* DataLayout &DL) {
402	// TODO: Extracting a "multiple of X" from a GEP might be a useful generic
403	// helper.
404	unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType());
405	MapVector<Value *, APInt> VarOffsets;
406	APInt ConstOffset(BW, `0`);
407	if (GEP->getPointerOperand()->stripPointerCasts() != Alloca \|\|
408	!GEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ConstOffset))
409	return nullptr;
410
411	unsigned VecElemSize = DL.getTypeAllocSize(Ty: VecElemTy);
412	if (VarOffsets.size() > `1`)
413	return nullptr;
414
415	if (VarOffsets.size() == `1`) {
416	// Only handle cases where we don't need to insert extra arithmetic
417	// instructions.
418	const auto &VarOffset = VarOffsets.front();
419	if (!ConstOffset.isZero() \|\| VarOffset.second != VecElemSize)
420	return nullptr;
421	return VarOffset.first;
422	}
423
424	APInt Quot;
425	uint64_t Rem;
426	APInt::udivrem(LHS: ConstOffset, RHS: VecElemSize, Quotient&: Quot, Remainder&: Rem);
427	if (Rem != `0`)
428	return nullptr;
429
430	return ConstantInt::get(Context&: GEP->getContext(), V: Quot);
431	}
432
433	/// Promotes a single user of the alloca to a vector form.
434	///
435	/// \param Inst Instruction to be promoted.
436	/// \param DL Module Data Layout.
437	/// \param VectorTy Vectorized Type.
438	/// \param VecStoreSize Size of \p VectorTy in bytes.
439	/// \param ElementSize Size of \p VectorTy element type in bytes.
440	/// \param TransferInfo MemTransferInst info map.
441	/// \param GEPVectorIdx GEP -> VectorIdx cache.
442	/// \param CurVal Current value of the vector (e.g. last stored value)
443	/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
444	/// be promoted now. This happens when promoting requires \p
445	/// CurVal, but \p CurVal is nullptr.
446	/// \return the stored value if \p Inst would have written to the alloca, or
447	/// nullptr otherwise.
448	static Value *promoteAllocaUserToVector(
449	Instruction Inst, const* DataLayout &DL, FixedVectorType *VectorTy,
450	unsigned VecStoreSize, unsigned ElementSize,
451	DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
452	std::map<GetElementPtrInst , Value > &GEPVectorIdx, Value *CurVal,
453	SmallVectorImpl<LoadInst *> &DeferredLoads) {
454	// Note: we use InstSimplifyFolder because it can leverage the DataLayout
455	// to do more folding, especially in the case of vector splats.
456	IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
457	InstSimplifyFolder (DL));
458	Builder.SetInsertPoint(Inst);
459
460	const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
461	if (CurVal)
462	return CurVal;
463
464	// If the current value is not known, insert a dummy load and lower it on
465	// the second pass.
466	LoadInst *Dummy =
467	Builder.CreateLoad(Ty: VectorTy, Ptr: PoisonValue::get(T: Builder.getPtrTy()),
468	Name: "promotealloca.dummyload");
469	DeferredLoads.push_back(Elt: Dummy);
470	return Dummy;
471	};
472
473	const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
474	Type PtrTy) -> Value {
475	assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
476	const unsigned Size = DL.getTypeStoreSizeInBits(Ty: PtrTy);
477	if (!PtrTy->isVectorTy())
478	return Builder.CreateBitOrPointerCast(V: Val, DestTy: Builder.getIntNTy(N: Size));
479	const unsigned NumPtrElts = cast<FixedVectorType>(Val: PtrTy)->getNumElements();
480	// If we want to cast to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to
481	// first cast the ptr vector to <2 x i64>.
482	assert((Size % NumPtrElts == `0`) && "Vector size not divisble");
483	Type *EltTy = Builder.getIntNTy(N: Size / NumPtrElts);
484	return Builder.CreateBitOrPointerCast(
485	V: Val, DestTy: FixedVectorType::get(ElementType: EltTy, NumElts: NumPtrElts));
486	};
487
488	Type *VecEltTy = VectorTy->getElementType();
489
490	switch (Inst->getOpcode()) {
491	case Instruction::Load: {
492	// Loads can only be lowered if the value is known.
493	if (!CurVal) {
494	DeferredLoads.push_back(Elt: cast<LoadInst>(Val: Inst));
495	return nullptr;
496	}
497
498	Value *Index = calculateVectorIndex(
499	Ptr: cast<LoadInst>(Val: Inst)->getPointerOperand(), GEPIdx: GEPVectorIdx);
500
501	// We're loading the full vector.
502	Type *AccessTy = Inst->getType();
503	TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
504	if (Constant *CI = dyn_cast<Constant>(Val: Index)) {
505	if (CI->isZeroValue() && AccessSize == VecStoreSize) {
506	if (AccessTy->isPtrOrPtrVectorTy())
507	CurVal = CreateTempPtrIntCast (CurVal, AccessTy);
508	else if (CurVal->getType()->isPtrOrPtrVectorTy())
509	CurVal = CreateTempPtrIntCast (CurVal, CurVal->getType());
510	Value *NewVal = Builder.CreateBitOrPointerCast(V: CurVal, DestTy: AccessTy);
511	Inst->replaceAllUsesWith(V: NewVal);
512	return nullptr;
513	}
514	}
515
516	// Loading a subvector.
517	if (isa<FixedVectorType>(Val: AccessTy)) {
518	assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
519	const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
520	auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumLoadedElts);
521	assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
522
523	Value *SubVec = PoisonValue::get(T: SubVecTy);
524	for (unsigned K = `0`; K < NumLoadedElts; ++K) {
525	Value *CurIdx =
526	Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
527	SubVec = Builder.CreateInsertElement(
528	Vec: SubVec, NewElt: Builder.CreateExtractElement(Vec: CurVal, Idx: CurIdx), Idx: K);
529	}
530
531	if (AccessTy->isPtrOrPtrVectorTy())
532	SubVec = CreateTempPtrIntCast (SubVec, AccessTy);
533	else if (SubVecTy->isPtrOrPtrVectorTy())
534	SubVec = CreateTempPtrIntCast (SubVec, SubVecTy);
535
536	SubVec = Builder.CreateBitOrPointerCast(V: SubVec, DestTy: AccessTy);
537	Inst->replaceAllUsesWith(V: SubVec);
538	return nullptr;
539	}
540
541	// We're loading one element.
542	Value *ExtractElement = Builder.CreateExtractElement(Vec: CurVal, Idx: Index);
543	if (AccessTy != VecEltTy)
544	ExtractElement = Builder.CreateBitOrPointerCast(V: ExtractElement, DestTy: AccessTy);
545
546	Inst->replaceAllUsesWith(V: ExtractElement);
547	return nullptr;
548	}
549	case Instruction::Store: {
550	// For stores, it's a bit trickier and it depends on whether we're storing
551	// the full vector or not. If we're storing the full vector, we don't need
552	// to know the current value. If this is a store of a single element, we
553	// need to know the value.
554	StoreInst *SI = cast<StoreInst>(Val: Inst);
555	Value *Index = calculateVectorIndex(Ptr: SI->getPointerOperand(), GEPIdx: GEPVectorIdx);
556	Value *Val = SI->getValueOperand();
557
558	// We're storing the full vector, we can handle this without knowing CurVal.
559	Type *AccessTy = Val->getType();
560	TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
561	if (Constant *CI = dyn_cast<Constant>(Val: Index)) {
562	if (CI->isZeroValue() && AccessSize == VecStoreSize) {
563	if (AccessTy->isPtrOrPtrVectorTy())
564	Val = CreateTempPtrIntCast (Val, AccessTy);
565	else if (VectorTy->isPtrOrPtrVectorTy())
566	Val = CreateTempPtrIntCast (Val, VectorTy);
567	return Builder.CreateBitOrPointerCast(V: Val, DestTy: VectorTy);
568	}
569	}
570
571	// Storing a subvector.
572	if (isa<FixedVectorType>(Val: AccessTy)) {
573	assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
574	const unsigned NumWrittenElts =
575	AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
576	const unsigned NumVecElts = VectorTy->getNumElements();
577	auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumWrittenElts);
578	assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
579
580	if (SubVecTy->isPtrOrPtrVectorTy())
581	Val = CreateTempPtrIntCast (Val, SubVecTy);
582	else if (AccessTy->isPtrOrPtrVectorTy())
583	Val = CreateTempPtrIntCast (Val, AccessTy);
584
585	Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: SubVecTy);
586
587	Value *CurVec = GetOrLoadCurrentVectorValue ();
588	for (unsigned K = `0`, NumElts = std::min(a: NumWrittenElts, b: NumVecElts);
589	K < NumElts; ++K) {
590	Value *CurIdx =
591	Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
592	CurVec = Builder.CreateInsertElement(
593	Vec: CurVec, NewElt: Builder.CreateExtractElement(Vec: Val, Idx: K), Idx: CurIdx);
594	}
595	return CurVec;
596	}
597
598	if (Val->getType() != VecEltTy)
599	Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: VecEltTy);
600	return Builder.CreateInsertElement(Vec: GetOrLoadCurrentVectorValue (), NewElt: Val,
601	Idx: Index);
602	}
603	case Instruction::Call: {
604	if (auto *MTI = dyn_cast<MemTransferInst>(Val: Inst)) {
605	// For memcpy, we need to know curval.
606	ConstantInt *Length = cast<ConstantInt>(Val: MTI->getLength());
607	unsigned NumCopied = Length->getZExtValue() / ElementSize;
608	MemTransferInfo *TI = &TransferInfo [MTI];
609	unsigned SrcBegin = TI->SrcIndex->getZExtValue();
610	unsigned DestBegin = TI->DestIndex->getZExtValue();
611
612	SmallVector<int> Mask;
613	for (unsigned Idx = `0`; Idx < VectorTy->getNumElements(); ++Idx) {
614	if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
615	Mask.push_back(Elt: SrcBegin++);
616	} else {
617	Mask.push_back(Elt: Idx);
618	}
619	}
620
621	return Builder.CreateShuffleVector(V: GetOrLoadCurrentVectorValue (), Mask);
622	}
623
624	if (auto *MSI = dyn_cast<MemSetInst>(Val: Inst)) {
625	// For memset, we don't need to know the previous value because we
626	// currently only allow memsets that cover the whole alloca.
627	Value *Elt = MSI->getOperand(i_nocapture: `1`);
628	const unsigned BytesPerElt = DL.getTypeStoreSize(Ty: VecEltTy);
629	if (BytesPerElt > `1`) {
630	Value *EltBytes = Builder.CreateVectorSplat(NumElts: BytesPerElt, V: Elt);
631
632	// If the element type of the vector is a pointer, we need to first cast
633	// to an integer, then use a PtrCast.
634	if (VecEltTy->isPointerTy()) {
635	Type PtrInt = Builder.getIntNTy(N: BytesPerElt `8`);
636	Elt = Builder.CreateBitCast(V: EltBytes, DestTy: PtrInt);
637	Elt = Builder.CreateIntToPtr(V: Elt, DestTy: VecEltTy);
638	} else
639	Elt = Builder.CreateBitCast(V: EltBytes, DestTy: VecEltTy);
640	}
641
642	return Builder.CreateVectorSplat(EC: VectorTy->getElementCount(), V: Elt);
643	}
644
645	if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
646	if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
647	Intr->replaceAllUsesWith(
648	V: Builder.getIntN(N: Intr->getType()->getIntegerBitWidth(),
649	C: DL.getTypeAllocSize(Ty: VectorTy)));
650	return nullptr;
651	}
652	}
653
654	llvm_unreachable("Unsupported call when promoting alloca to vector");
655	}
656
657	default:
658	llvm_unreachable("Inconsistency in instructions promotable to vector");
659	}
660
661	llvm_unreachable("Did not return after promoting instruction!");
662	}
663
664	static bool isSupportedAccessType(FixedVectorType VecTy, Type AccessTy,
665	const DataLayout &DL) {
666	// Access as a vector type can work if the size of the access vector is a
667	// multiple of the size of the alloca's vector element type.
668	//
669	// Examples:
670	// - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
671	// - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
672	// - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
673	// - 332 is not a multiple of 64*
674	//
675	// We could handle more complicated cases, but it'd make things a lot more
676	// complicated.
677	if (isa<FixedVectorType>(Val: AccessTy)) {
678	TypeSize AccTS = DL.getTypeStoreSize(Ty: AccessTy);
679	TypeSize VecTS = DL.getTypeStoreSize(Ty: VecTy->getElementType());
680	return AccTS.isKnownMultipleOf(RHS: VecTS);
681	}
682
683	return CastInst::isBitOrNoopPointerCastable(SrcTy: VecTy->getElementType(), DestTy: AccessTy,
684	DL);
685	}
686
687	/// Iterates over an instruction worklist that may contain multiple instructions
688	/// from the same basic block, but in a different order.
689	template <typename InstContainer>
690	static void forEachWorkListItem(const InstContainer &WorkList,
691	std::function<void(Instruction *)> Fn) {
692	// Bucket up uses of the alloca by the block they occur in.
693	// This is important because we have to handle multiple defs/uses in a block
694	// ourselves: SSAUpdater is purely for cross-block references.
695	DenseMap<BasicBlock , SmallDenseSet<Instruction >> UsesByBlock;
696	for (Instruction *User : WorkList)
697	UsesByBlock [User->getParent()].insert(V: User);
698
699	for (Instruction *User : WorkList) {
700	BasicBlock *BB = User->getParent();
701	auto &BlockUses = UsesByBlock [BB];
702
703	// Already processed, skip.
704	if (BlockUses.empty())
705	continue;
706
707	// Only user in the block, directly process it.
708	if (BlockUses.size() == `1`) {
709	Fn (User);
710	continue;
711	}
712
713	// Multiple users in the block, do a linear scan to see users in order.
714	for (Instruction &Inst : *BB) {
715	if (!BlockUses.contains(V: &Inst))
716	continue;
717
718	Fn (&Inst);
719	}
720
721	// Clear the block so we know it's been processed.
722	BlockUses.clear();
723	}
724	}
725
726	// FIXME: Should try to pick the most likely to be profitable allocas first.
727	bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
728	LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << `'\n'`);
729
730	if (DisablePromoteAllocaToVector) {
731	LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n");
732	return false;
733	}
734
735	Type *AllocaTy = Alloca.getAllocatedType();
736	auto *VectorTy = dyn_cast<FixedVectorType>(Val: AllocaTy);
737	if (auto *ArrayTy = dyn_cast<ArrayType>(Val: AllocaTy)) {
738	if (VectorType::isValidElementType(ElemTy: ArrayTy->getElementType()) &&
739	ArrayTy->getNumElements() > `0`)
740	VectorTy = FixedVectorType::get(ElementType: ArrayTy->getElementType(),
741	NumElts: ArrayTy->getNumElements());
742	}
743
744	// FIXME: There is no reason why we can't support larger arrays, we
745	// are just being conservative for now.
746	// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
747	// equivalent. Potentially these could also be promoted but we don't currently
748	// handle this case
749	if (!VectorTy) {
750	LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
751	return false;
752	}
753
754	if (VectorTy->getNumElements() > `16` \|\| VectorTy->getNumElements() < `2`) {
755	LLVM_DEBUG(dbgs() << " " << *VectorTy
756	<< " has an unsupported number of elements\n");
757	return false;
758	}
759
760	std::map<GetElementPtrInst , Value > GEPVectorIdx;
761	SmallVector<Instruction *> WorkList;
762	SmallVector<Instruction *> UsersToRemove;
763	SmallVector<Instruction *> DeferredInsts;
764	DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
765
766	const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
767	LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
768	<< " " << *Inst << "\n");
769	return false;
770	};
771
772	SmallVector<Use *, `8`> Uses;
773	collectAllocaUses(Alloca, Uses);
774
775	LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
776
777	Type *VecEltTy = VectorTy->getElementType();
778	unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / `8`;
779	for (auto *U : Uses) {
780	Instruction *Inst = cast<Instruction>(Val: U->getUser());
781
782	if (Value *Ptr = getLoadStorePointerOperand(V: Inst)) {
783	// This is a store of the pointer, not to the pointer.
784	if (isa<StoreInst>(Val: Inst) &&
785	U->getOperandNo() != StoreInst::getPointerOperandIndex())
786	return RejectUser (Inst, "pointer is being stored");
787
788	Type *AccessTy = getLoadStoreType(I: Inst);
789	if (AccessTy->isAggregateType())
790	return RejectUser (Inst, "unsupported load/store as aggregate");
791	assert(!AccessTy->isAggregateType() \|\| AccessTy->isArrayTy());
792
793	// Check that this is a simple access of a vector element.
794	bool IsSimple = isa<LoadInst>(Val: Inst) ? cast<LoadInst>(Val: Inst)->isSimple()
795	: cast<StoreInst>(Val: Inst)->isSimple();
796	if (!IsSimple)
797	return RejectUser (Inst, "not a simple load or store");
798
799	Ptr = Ptr->stripPointerCasts();
800
801	// Alloca already accessed as vector.
802	if (Ptr == &Alloca && DL->getTypeStoreSize(Ty: Alloca.getAllocatedType()) ==
803	DL->getTypeStoreSize(Ty: AccessTy)) {
804	WorkList.push_back(Elt: Inst);
805	continue;
806	}
807
808	if (!isSupportedAccessType(VecTy: VectorTy, AccessTy, DL: *DL))
809	return RejectUser (Inst, "not a supported access type");
810
811	WorkList.push_back(Elt: Inst);
812	continue;
813	}
814
815	if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
816	// If we can't compute a vector index from this GEP, then we can't
817	// promote this alloca to vector.
818	Value Index = GEPToVectorIndex(GEP, Alloca: &Alloca, VecElemTy: VecEltTy, DL: DL);
819	if (!Index)
820	return RejectUser (Inst, "cannot compute vector index for GEP");
821
822	GEPVectorIdx [GEP] = Index;
823	UsersToRemove.push_back(Elt: Inst);
824	continue;
825	}
826
827	if (MemSetInst *MSI = dyn_cast<MemSetInst>(Val: Inst);
828	MSI && isSupportedMemset(I: MSI, AI: &Alloca, DL: *DL)) {
829	WorkList.push_back(Elt: Inst);
830	continue;
831	}
832
833	if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Val: Inst)) {
834	if (TransferInst->isVolatile())
835	return RejectUser (Inst, "mem transfer inst is volatile");
836
837	ConstantInt *Len = dyn_cast<ConstantInt>(Val: TransferInst->getLength());
838	if (!Len \|\| (Len->getZExtValue() % ElementSize))
839	return RejectUser (Inst, "mem transfer inst length is non-constant or "
840	"not a multiple of the vector element size");
841
842	if (!TransferInfo.count(Val: TransferInst)) {
843	DeferredInsts.push_back(Elt: Inst);
844	WorkList.push_back(Elt: Inst);
845	TransferInfo [TransferInst] = MemTransferInfo ();
846	}
847
848	auto getPointerIndexOfAlloca = [&](Value Ptr) -> ConstantInt {
849	GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr);
850	if (Ptr != &Alloca && !GEPVectorIdx.count(x: GEP))
851	return nullptr;
852
853	return dyn_cast<ConstantInt>(Val: calculateVectorIndex(Ptr, GEPIdx: GEPVectorIdx));
854	};
855
856	unsigned OpNum = U->getOperandNo();
857	MemTransferInfo *TI = &TransferInfo [TransferInst];
858	if (OpNum == `0`) {
859	Value *Dest = TransferInst->getDest();
860	ConstantInt *Index = getPointerIndexOfAlloca (Dest);
861	if (!Index)
862	return RejectUser (Inst, "could not calculate constant dest index");
863	TI->DestIndex = Index;
864	} else {
865	assert(OpNum == `1`);
866	Value *Src = TransferInst->getSource();
867	ConstantInt *Index = getPointerIndexOfAlloca (Src);
868	if (!Index)
869	return RejectUser (Inst, "could not calculate constant src index");
870	TI->SrcIndex = Index;
871	}
872	continue;
873	}
874
875	if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
876	if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
877	WorkList.push_back(Elt: Inst);
878	continue;
879	}
880	}
881
882	// Ignore assume-like intrinsics and comparisons used in assumes.
883	if (isAssumeLikeIntrinsic(I: Inst)) {
884	if (!Inst->use_empty())
885	return RejectUser (Inst, "assume-like intrinsic cannot have any users");
886	UsersToRemove.push_back(Elt: Inst);
887	continue;
888	}
889
890	if (isa<ICmpInst>(Val: Inst) && all_of(Range: Inst->users(), P: [](User *U) {
891	return isAssumeLikeIntrinsic(I: cast<Instruction>(Val: U));
892	})) {
893	UsersToRemove.push_back(Elt: Inst);
894	continue;
895	}
896
897	return RejectUser (Inst, "unhandled alloca user");
898	}
899
900	while (!DeferredInsts.empty()) {
901	Instruction *Inst = DeferredInsts.pop_back_val();
902	MemTransferInst *TransferInst = cast<MemTransferInst>(Val: Inst);
903	// TODO: Support the case if the pointers are from different alloca or
904	// from different address spaces.
905	MemTransferInfo &Info = TransferInfo [TransferInst];
906	if (!Info.SrcIndex \|\| !Info.DestIndex)
907	return RejectUser (
908	Inst, "mem transfer inst is missing constant src and/or dst index");
909	}
910
911	LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
912	<< *VectorTy << `'\n'`);
913	const unsigned VecStoreSize = DL->getTypeStoreSize(Ty: VectorTy);
914
915	// Alloca is uninitialized memory. Imitate that by making the first value
916	// undef.
917	SSAUpdater Updater;
918	Updater.Initialize(Ty: VectorTy, Name: "promotealloca");
919	Updater.AddAvailableValue(BB: Alloca.getParent(), V: UndefValue::get(T: VectorTy));
920
921	// First handle the initial worklist.
922	SmallVector<LoadInst *, `4`> DeferredLoads;
923	forEachWorkListItem(WorkList, Fn: [&](Instruction *I) {
924	BasicBlock *BB = I->getParent();
925	// On the first pass, we only take values that are trivially known, i.e.
926	// where AddAvailableValue was already called in this block.
927	Value *Result = promoteAllocaUserToVector(
928	Inst: I, DL: *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
929	CurVal: Updater.FindValueForBlock(BB), DeferredLoads);
930	if (Result)
931	Updater.AddAvailableValue(BB, V: Result);
932	});
933
934	// Then handle deferred loads.
935	forEachWorkListItem(WorkList: DeferredLoads, Fn: [&](Instruction *I) {
936	SmallVector<LoadInst *, `0`> NewDLs;
937	BasicBlock *BB = I->getParent();
938	// On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
939	// get a value, inserting PHIs as needed.
940	Value *Result = promoteAllocaUserToVector(
941	Inst: I, DL: *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
942	CurVal: Updater.GetValueInMiddleOfBlock(BB: I->getParent()), DeferredLoads&: NewDLs);
943	if (Result)
944	Updater.AddAvailableValue(BB, V: Result);
945	assert(NewDLs.empty() && "No more deferred loads should be queued!");
946	});
947
948	// Delete all instructions. On the first pass, new dummy loads may have been
949	// added so we need to collect them too.
950	DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
951	InstsToDelete.insert(I: DeferredLoads.begin(), E: DeferredLoads.end());
952	for (Instruction *I : InstsToDelete) {
953	assert(I->use_empty());
954	I->eraseFromParent();
955	}
956
957	// Delete all the users that are known to be removeable.
958	for (Instruction *I : reverse(C&: UsersToRemove)) {
959	I->dropDroppableUses();
960	assert(I->use_empty());
961	I->eraseFromParent();
962	}
963
964	// Alloca should now be dead too.
965	assert(Alloca.use_empty());
966	Alloca.eraseFromParent();
967	return true;
968	}
969
970	std::pair<Value , Value >
971	AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
972	Function &F = *Builder.GetInsertBlock()->getParent();
973	const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
974
975	if (!IsAMDHSA) {
976	Function *LocalSizeYFn =
977	Intrinsic::getDeclaration(M: Mod, id: Intrinsic::r600_read_local_size_y);
978	Function *LocalSizeZFn =
979	Intrinsic::getDeclaration(M: Mod, id: Intrinsic::r600_read_local_size_z);
980
981	CallInst *LocalSizeY = Builder.CreateCall(Callee: LocalSizeYFn, Args: {});
982	CallInst *LocalSizeZ = Builder.CreateCall(Callee: LocalSizeZFn, Args: {});
983
984	ST.makeLIDRangeMetadata(I: LocalSizeY);
985	ST.makeLIDRangeMetadata(I: LocalSizeZ);
986
987	return std::pair(LocalSizeY, LocalSizeZ);
988	}
989
990	// We must read the size out of the dispatch pointer.
991	assert(IsAMDGCN);
992
993	// We are indexing into this struct, and want to extract the workgroup_size_*
994	// fields.
995	//
996	// typedef struct hsa_kernel_dispatch_packet_s {
997	// uint16_t header;
998	// uint16_t setup;
999	// uint16_t workgroup_size_x ;
1000	// uint16_t workgroup_size_y;
1001	// uint16_t workgroup_size_z;
1002	// uint16_t reserved0;
1003	// uint32_t grid_size_x ;
1004	// uint32_t grid_size_y ;
1005	// uint32_t grid_size_z;
1006	//
1007	// uint32_t private_segment_size;
1008	// uint32_t group_segment_size;
1009	// uint64_t kernel_object;
1010	//
1011	// #ifdef HSA_LARGE_MODEL
1012	// void kernarg_address;*
1013	// #elif defined HSA_LITTLE_ENDIAN
1014	// void kernarg_address;*
1015	// uint32_t reserved1;
1016	// #else
1017	// uint32_t reserved1;
1018	// void kernarg_address;*
1019	// #endif
1020	// uint64_t reserved2;
1021	// hsa_signal_t completion_signal; // uint64_t wrapper
1022	// } hsa_kernel_dispatch_packet_t
1023	//
1024	Function *DispatchPtrFn =
1025	Intrinsic::getDeclaration(M: Mod, id: Intrinsic::amdgcn_dispatch_ptr);
1026
1027	CallInst *DispatchPtr = Builder.CreateCall(Callee: DispatchPtrFn, Args: {});
1028	DispatchPtr->addRetAttr(Kind: Attribute::NoAlias);
1029	DispatchPtr->addRetAttr(Kind: Attribute::NonNull);
1030	F.removeFnAttr(Kind: "amdgpu-no-dispatch-ptr");
1031
1032	// Size of the dispatch packet struct.
1033	DispatchPtr->addDereferenceableRetAttr(Bytes: `64`);
1034
1035	Type *I32Ty = Type::getInt32Ty(C&: Mod->getContext());
1036	Value *CastDispatchPtr = Builder.CreateBitCast(
1037	V: DispatchPtr, DestTy: PointerType::get(ElementType: I32Ty, AddressSpace: AMDGPUAS::CONSTANT_ADDRESS));
1038
1039	// We could do a single 64-bit load here, but it's likely that the basic
1040	// 32-bit and extract sequence is already present, and it is probably easier
1041	// to CSE this. The loads should be mergeable later anyway.
1042	Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: CastDispatchPtr, Idx0: `1`);
1043	LoadInst *LoadXY = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPXY, Align: Align (`4`));
1044
1045	Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: CastDispatchPtr, Idx0: `2`);
1046	LoadInst *LoadZU = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPZU, Align: Align (`4`));
1047
1048	MDNode *MD = MDNode::get(Context&: Mod->getContext(), MDs: std::nullopt);
1049	LoadXY->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
1050	LoadZU->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
1051	ST.makeLIDRangeMetadata(I: LoadZU);
1052
1053	// Extract y component. Upper half of LoadZU should be zero already.
1054	Value *Y = Builder.CreateLShr(LHS: LoadXY, RHS: `16`);
1055
1056	return std::pair(Y, LoadZU);
1057	}
1058
1059	Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1060	unsigned N) {
1061	Function *F = Builder.GetInsertBlock()->getParent();
1062	const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: *F);
1063	Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1064	StringRef AttrName;
1065
1066	switch (N) {
1067	case `0`:
1068	IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1069	: (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1070	AttrName = "amdgpu-no-workitem-id-x";
1071	break;
1072	case `1`:
1073	IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1074	: (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1075	AttrName = "amdgpu-no-workitem-id-y";
1076	break;
1077
1078	case `2`:
1079	IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1080	: (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1081	AttrName = "amdgpu-no-workitem-id-z";
1082	break;
1083	default:
1084	llvm_unreachable("invalid dimension");
1085	}
1086
1087	Function *WorkitemIdFn = Intrinsic::getDeclaration(M: Mod, id: IntrID);
1088	CallInst *CI = Builder.CreateCall(Callee: WorkitemIdFn);
1089	ST.makeLIDRangeMetadata(I: CI);
1090	F->removeFnAttr(Kind: AttrName);
1091
1092	return CI;
1093	}
1094
1095	static bool isCallPromotable(CallInst *CI) {
1096	IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI);
1097	if (!II)
1098	return false;
1099
1100	switch (II->getIntrinsicID()) {
1101	case Intrinsic::memcpy:
1102	case Intrinsic::memmove:
1103	case Intrinsic::memset:
1104	case Intrinsic::lifetime_start:
1105	case Intrinsic::lifetime_end:
1106	case Intrinsic::invariant_start:
1107	case Intrinsic::invariant_end:
1108	case Intrinsic::launder_invariant_group:
1109	case Intrinsic::strip_invariant_group:
1110	case Intrinsic::objectsize:
1111	return true;
1112	default:
1113	return false;
1114	}
1115	}
1116
1117	bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1118	Value BaseAlloca, Value Val, Instruction Inst, int* OpIdx0,
1119	int OpIdx1) const {
1120	// Figure out which operand is the one we might not be promoting.
1121	Value *OtherOp = Inst->getOperand(i: OpIdx0);
1122	if (Val == OtherOp)
1123	OtherOp = Inst->getOperand(i: OpIdx1);
1124
1125	if (isa<ConstantPointerNull>(Val: OtherOp))
1126	return true;
1127
1128	Value *OtherObj = getUnderlyingObject(V: OtherOp);
1129	if (!isa<AllocaInst>(Val: OtherObj))
1130	return false;
1131
1132	// TODO: We should be able to replace undefs with the right pointer type.
1133
1134	// TODO: If we know the other base object is another promotable
1135	// alloca, not necessarily this alloca, we can do this. The
1136	// important part is both must have the same address space at
1137	// the end.
1138	if (OtherObj != BaseAlloca) {
1139	LLVM_DEBUG(
1140	dbgs() << "Found a binary instruction with another alloca object\n");
1141	return false;
1142	}
1143
1144	return true;
1145	}
1146
1147	bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
1148	Value BaseAlloca, Value Val, std::vector<Value > &WorkList) const* {
1149
1150	for (User *User : Val->users()) {
1151	if (is_contained(Range&: WorkList, Element: User))
1152	continue;
1153
1154	if (CallInst *CI = dyn_cast<CallInst>(Val: User)) {
1155	if (!isCallPromotable(CI))
1156	return false;
1157
1158	WorkList.push_back(x: User);
1159	continue;
1160	}
1161
1162	Instruction *UseInst = cast<Instruction>(Val: User);
1163	if (UseInst->getOpcode() == Instruction::PtrToInt)
1164	return false;
1165
1166	if (LoadInst *LI = dyn_cast<LoadInst>(Val: UseInst)) {
1167	if (LI->isVolatile())
1168	return false;
1169
1170	continue;
1171	}
1172
1173	if (StoreInst *SI = dyn_cast<StoreInst>(Val: UseInst)) {
1174	if (SI->isVolatile())
1175	return false;
1176
1177	// Reject if the stored value is not the pointer operand.
1178	if (SI->getPointerOperand() != Val)
1179	return false;
1180	} else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UseInst)) {
1181	if (RMW->isVolatile())
1182	return false;
1183	} else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(Val: UseInst)) {
1184	if (CAS->isVolatile())
1185	return false;
1186	}
1187
1188	// Only promote a select if we know that the other select operand
1189	// is from another pointer that will also be promoted.
1190	if (ICmpInst *ICmp = dyn_cast<ICmpInst>(Val: UseInst)) {
1191	if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: ICmp, OpIdx0: `0`, OpIdx1: `1`))
1192	return false;
1193
1194	// May need to rewrite constant operands.
1195	WorkList.push_back(x: ICmp);
1196	}
1197
1198	if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
1199	// Give up if the pointer may be captured.
1200	if (PointerMayBeCaptured(V: UseInst, ReturnCaptures: true, StoreCaptures: true))
1201	return false;
1202	// Don't collect the users of this.
1203	WorkList.push_back(x: User);
1204	continue;
1205	}
1206
1207	// Do not promote vector/aggregate type instructions. It is hard to track
1208	// their users.
1209	if (isa<InsertValueInst>(Val: User) \|\| isa<InsertElementInst>(Val: User))
1210	return false;
1211
1212	if (!User->getType()->isPointerTy())
1213	continue;
1214
1215	if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: UseInst)) {
1216	// Be conservative if an address could be computed outside the bounds of
1217	// the alloca.
1218	if (!GEP->isInBounds())
1219	return false;
1220	}
1221
1222	// Only promote a select if we know that the other select operand is from
1223	// another pointer that will also be promoted.
1224	if (SelectInst *SI = dyn_cast<SelectInst>(Val: UseInst)) {
1225	if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: SI, OpIdx0: `1`, OpIdx1: `2`))
1226	return false;
1227	}
1228
1229	// Repeat for phis.
1230	if (PHINode *Phi = dyn_cast<PHINode>(Val: UseInst)) {
1231	// TODO: Handle more complex cases. We should be able to replace loops
1232	// over arrays.
1233	switch (Phi->getNumIncomingValues()) {
1234	case `1`:
1235	break;
1236	case `2`:
1237	if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: Phi, OpIdx0: `0`, OpIdx1: `1`))
1238	return false;
1239	break;
1240	default:
1241	return false;
1242	}
1243	}
1244
1245	WorkList.push_back(x: User);
1246	if (!collectUsesWithPtrTypes(BaseAlloca, Val: User, WorkList))
1247	return false;
1248	}
1249
1250	return true;
1251	}
1252
1253	bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1254
1255	FunctionType *FTy = F.getFunctionType();
1256	const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1257
1258	// If the function has any arguments in the local address space, then it's
1259	// possible these arguments require the entire local memory space, so
1260	// we cannot use local memory in the pass.
1261	for (Type *ParamTy : FTy->params()) {
1262	PointerType *PtrTy = dyn_cast<PointerType>(Val: ParamTy);
1263	if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1264	LocalMemLimit = `0`;
1265	LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1266	"local memory disabled.\n");
1267	return false;
1268	}
1269	}
1270
1271	LocalMemLimit = ST.getAddressableLocalMemorySize();
1272	if (LocalMemLimit == `0`)
1273	return false;
1274
1275	SmallVector<const Constant *, `16`> Stack;
1276	SmallPtrSet<const Constant *, `8`> VisitedConstants;
1277	SmallPtrSet<const GlobalVariable *, `8`> UsedLDS;
1278
1279	auto visitUsers = [&](const GlobalVariable GV, const* Constant Val) -> bool* {
1280	for (const User *U : Val->users()) {
1281	if (const Instruction *Use = dyn_cast<Instruction>(Val: U)) {
1282	if (Use->getParent()->getParent() == &F)
1283	return true;
1284	} else {
1285	const Constant *C = cast<Constant>(Val: U);
1286	if (VisitedConstants.insert(Ptr: C).second)
1287	Stack.push_back(Elt: C);
1288	}
1289	}
1290
1291	return false;
1292	};
1293
1294	for (GlobalVariable &GV : Mod->globals()) {
1295	if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
1296	continue;
1297
1298	if (visitUsers (&GV, &GV)) {
1299	UsedLDS.insert(Ptr: &GV);
1300	Stack.clear();
1301	continue;
1302	}
1303
1304	// For any ConstantExpr uses, we need to recursively search the users until
1305	// we see a function.
1306	while (!Stack.empty()) {
1307	const Constant *C = Stack.pop_back_val();
1308	if (visitUsers (&GV, C)) {
1309	UsedLDS.insert(Ptr: &GV);
1310	Stack.clear();
1311	break;
1312	}
1313	}
1314	}
1315
1316	const DataLayout &DL = Mod->getDataLayout();
1317	SmallVector<std::pair<uint64_t, Align>, `16`> AllocatedSizes;
1318	AllocatedSizes.reserve(N: UsedLDS.size());
1319
1320	for (const GlobalVariable *GV : UsedLDS) {
1321	Align Alignment =
1322	DL.getValueOrABITypeAlignment(Alignment: GV->getAlign(), Ty: GV->getValueType());
1323	uint64_t AllocSize = DL.getTypeAllocSize(Ty: GV->getValueType());
1324
1325	// HIP uses an extern unsized array in local address space for dynamically
1326	// allocated shared memory. In that case, we have to disable the promotion.
1327	if (GV->hasExternalLinkage() && AllocSize == `0`) {
1328	LocalMemLimit = `0`;
1329	LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1330	"local memory. Promoting to local memory "
1331	"disabled.\n");
1332	return false;
1333	}
1334
1335	AllocatedSizes.emplace_back(Args&: AllocSize, Args&: Alignment);
1336	}
1337
1338	// Sort to try to estimate the worst case alignment padding
1339	//
1340	// FIXME: We should really do something to fix the addresses to a more optimal
1341	// value instead
1342	llvm::sort(C&: AllocatedSizes, Comp: llvm::less_second ());
1343
1344	// Check how much local memory is being used by global objects
1345	CurrentLocalMemUsage = `0`;
1346
1347	// FIXME: Try to account for padding here. The real padding and address is
1348	// currently determined from the inverse order of uses in the function when
1349	// legalizing, which could also potentially change. We try to estimate the
1350	// worst case here, but we probably should fix the addresses earlier.
1351	for (auto Alloc : AllocatedSizes) {
1352	CurrentLocalMemUsage = alignTo(Size: CurrentLocalMemUsage, A: Alloc.second);
1353	CurrentLocalMemUsage += Alloc.first;
1354	}
1355
1356	unsigned MaxOccupancy =
1357	ST.getOccupancyWithLocalMemSize(Bytes: CurrentLocalMemUsage, F);
1358
1359	// Restrict local memory usage so that we don't drastically reduce occupancy,
1360	// unless it is already significantly reduced.
1361
1362	// TODO: Have some sort of hint or other heuristics to guess occupancy based
1363	// on other factors..
1364	unsigned OccupancyHint = ST.getWavesPerEU(F).second;
1365	if (OccupancyHint == `0`)
1366	OccupancyHint = `7`;
1367
1368	// Clamp to max value.
1369	OccupancyHint = std::min(a: OccupancyHint, b: ST.getMaxWavesPerEU());
1370
1371	// Check the hint but ignore it if it's obviously wrong from the existing LDS
1372	// usage.
1373	MaxOccupancy = std::min(a: OccupancyHint, b: MaxOccupancy);
1374
1375	// Round up to the next tier of usage.
1376	unsigned MaxSizeWithWaveCount =
1377	ST.getMaxLocalMemSizeWithWaveCount(WaveCount: MaxOccupancy, F);
1378
1379	// Program is possibly broken by using more local mem than available.
1380	if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1381	return false;
1382
1383	LocalMemLimit = MaxSizeWithWaveCount;
1384
1385	LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1386	<< " bytes of LDS\n"
1387	<< " Rounding size to " << MaxSizeWithWaveCount
1388	<< " with a maximum occupancy of " << MaxOccupancy << `'\n'`
1389	<< " and " << (LocalMemLimit - CurrentLocalMemUsage)
1390	<< " available for promotion\n");
1391
1392	return true;
1393	}
1394
1395	// FIXME: Should try to pick the most likely to be profitable allocas first.
1396	bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
1397	bool SufficientLDS) {
1398	LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << `'\n'`);
1399
1400	if (DisablePromoteAllocaToLDS) {
1401	LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1402	return false;
1403	}
1404
1405	const DataLayout &DL = Mod->getDataLayout();
1406	IRBuilder<> Builder(&I);
1407
1408	const Function &ContainingFunction = *I.getParent()->getParent();
1409	CallingConv::ID CC = ContainingFunction.getCallingConv();
1410
1411	// Don't promote the alloca to LDS for shader calling conventions as the work
1412	// item ID intrinsics are not supported for these calling conventions.
1413	// Furthermore not all LDS is available for some of the stages.
1414	switch (CC) {
1415	case CallingConv::AMDGPU_KERNEL:
1416	case CallingConv::SPIR_KERNEL:
1417	break;
1418	default:
1419	LLVM_DEBUG(
1420	dbgs()
1421	<< " promote alloca to LDS not supported with calling convention.\n");
1422	return false;
1423	}
1424
1425	// Not likely to have sufficient local memory for promotion.
1426	if (!SufficientLDS)
1427	return false;
1428
1429	const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: ContainingFunction);
1430	unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F: ContainingFunction).second;
1431
1432	Align Alignment =
1433	DL.getValueOrABITypeAlignment(Alignment: I.getAlign(), Ty: I.getAllocatedType());
1434
1435	// FIXME: This computed padding is likely wrong since it depends on inverse
1436	// usage order.
1437	//
1438	// FIXME: It is also possible that if we're allowed to use all of the memory
1439	// could end up using more than the maximum due to alignment padding.
1440
1441	uint32_t NewSize = alignTo(Size: CurrentLocalMemUsage, A: Alignment);
1442	uint32_t AllocSize =
1443	WorkGroupSize * DL.getTypeAllocSize(Ty: I.getAllocatedType());
1444	NewSize += AllocSize;
1445
1446	if (NewSize > LocalMemLimit) {
1447	LLVM_DEBUG(dbgs() << " " << AllocSize
1448	<< " bytes of local memory not available to promote\n");
1449	return false;
1450	}
1451
1452	CurrentLocalMemUsage = NewSize;
1453
1454	std::vector<Value *> WorkList;
1455
1456	if (!collectUsesWithPtrTypes(BaseAlloca: &I, Val: &I, WorkList)) {
1457	LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
1458	return false;
1459	}
1460
1461	LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1462
1463	Function *F = I.getParent()->getParent();
1464
1465	Type *GVTy = ArrayType::get(ElementType: I.getAllocatedType(), NumElements: WorkGroupSize);
1466	GlobalVariable GV = new* GlobalVariable (
1467	Mod, GVTy, false*, GlobalValue::InternalLinkage, PoisonValue::get(T: GVTy),
1468	Twine (F->getName()) + Twine (`'.'`) + I.getName(), nullptr,
1469	GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1470	GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1471	GV->setAlignment(I.getAlign());
1472
1473	Value TCntY, TCntZ;
1474
1475	std::tie(args&: TCntY, args&: TCntZ) = getLocalSizeYZ(Builder);
1476	Value *TIdX = getWorkitemID(Builder, N: `0`);
1477	Value *TIdY = getWorkitemID(Builder, N: `1`);
1478	Value *TIdZ = getWorkitemID(Builder, N: `2`);
1479
1480	Value Tmp0 = Builder.CreateMul(LHS: TCntY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true*);
1481	Tmp0 = Builder.CreateMul(LHS: Tmp0, RHS: TIdX);
1482	Value Tmp1 = Builder.CreateMul(LHS: TIdY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true*);
1483	Value *TID = Builder.CreateAdd(LHS: Tmp0, RHS: Tmp1);
1484	TID = Builder.CreateAdd(LHS: TID, RHS: TIdZ);
1485
1486	LLVMContext &Context = Mod->getContext();
1487	Value *Indices[] = {Constant::getNullValue(Ty: Type::getInt32Ty(C&: Context)), TID};
1488
1489	Value *Offset = Builder.CreateInBoundsGEP(Ty: GVTy, Ptr: GV, IdxList: Indices);
1490	I.mutateType(Ty: Offset->getType());
1491	I.replaceAllUsesWith(V: Offset);
1492	I.eraseFromParent();
1493
1494	SmallVector<IntrinsicInst *> DeferredIntrs;
1495
1496	for (Value *V : WorkList) {
1497	CallInst *Call = dyn_cast<CallInst>(Val: V);
1498	if (!Call) {
1499	if (ICmpInst *CI = dyn_cast<ICmpInst>(Val: V)) {
1500	PointerType *NewTy = PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
1501
1502	if (isa<ConstantPointerNull>(Val: CI->getOperand(i_nocapture: `0`)))
1503	CI->setOperand(i_nocapture: `0`, Val_nocapture: ConstantPointerNull::get(T: NewTy));
1504
1505	if (isa<ConstantPointerNull>(Val: CI->getOperand(i_nocapture: `1`)))
1506	CI->setOperand(i_nocapture: `1`, Val_nocapture: ConstantPointerNull::get(T: NewTy));
1507
1508	continue;
1509	}
1510
1511	// The operand's value should be corrected on its own and we don't want to
1512	// touch the users.
1513	if (isa<AddrSpaceCastInst>(Val: V))
1514	continue;
1515
1516	PointerType *NewTy = PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
1517
1518	// FIXME: It doesn't really make sense to try to do this for all
1519	// instructions.
1520	V->mutateType(Ty: NewTy);
1521
1522	// Adjust the types of any constant operands.
1523	if (SelectInst *SI = dyn_cast<SelectInst>(Val: V)) {
1524	if (isa<ConstantPointerNull>(Val: SI->getOperand(i_nocapture: `1`)))
1525	SI->setOperand(i_nocapture: `1`, Val_nocapture: ConstantPointerNull::get(T: NewTy));
1526
1527	if (isa<ConstantPointerNull>(Val: SI->getOperand(i_nocapture: `2`)))
1528	SI->setOperand(i_nocapture: `2`, Val_nocapture: ConstantPointerNull::get(T: NewTy));
1529	} else if (PHINode *Phi = dyn_cast<PHINode>(Val: V)) {
1530	for (unsigned I = `0`, E = Phi->getNumIncomingValues(); I != E; ++I) {
1531	if (isa<ConstantPointerNull>(Val: Phi->getIncomingValue(i: I)))
1532	Phi->setIncomingValue(i: I, V: ConstantPointerNull::get(T: NewTy));
1533	}
1534	}
1535
1536	continue;
1537	}
1538
1539	IntrinsicInst *Intr = cast<IntrinsicInst>(Val: Call);
1540	Builder.SetInsertPoint(Intr);
1541	switch (Intr->getIntrinsicID()) {
1542	case Intrinsic::lifetime_start:
1543	case Intrinsic::lifetime_end:
1544	// These intrinsics are for address space 0 only
1545	Intr->eraseFromParent();
1546	continue;
1547	case Intrinsic::memcpy:
1548	case Intrinsic::memmove:
1549	// These have 2 pointer operands. In case if second pointer also needs
1550	// to be replaced we defer processing of these intrinsics until all
1551	// other values are processed.
1552	DeferredIntrs.push_back(Elt: Intr);
1553	continue;
1554	case Intrinsic::memset: {
1555	MemSetInst *MemSet = cast<MemSetInst>(Val: Intr);
1556	Builder.CreateMemSet(Ptr: MemSet->getRawDest(), Val: MemSet->getValue(),
1557	Size: MemSet->getLength(), Align: MemSet->getDestAlign(),
1558	isVolatile: MemSet->isVolatile());
1559	Intr->eraseFromParent();
1560	continue;
1561	}
1562	case Intrinsic::invariant_start:
1563	case Intrinsic::invariant_end:
1564	case Intrinsic::launder_invariant_group:
1565	case Intrinsic::strip_invariant_group:
1566	Intr->eraseFromParent();
1567	// FIXME: I think the invariant marker should still theoretically apply,
1568	// but the intrinsics need to be changed to accept pointers with any
1569	// address space.
1570	continue;
1571	case Intrinsic::objectsize: {
1572	Value *Src = Intr->getOperand(i_nocapture: `0`);
1573	Function *ObjectSize = Intrinsic::getDeclaration(
1574	M: Mod, id: Intrinsic::objectsize,
1575	Tys: {Intr->getType(),
1576	PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS)});
1577
1578	CallInst *NewCall = Builder.CreateCall(
1579	Callee: ObjectSize,
1580	Args: {Src, Intr->getOperand(i_nocapture: `1`), Intr->getOperand(i_nocapture: `2`), Intr->getOperand(i_nocapture: `3`)});
1581	Intr->replaceAllUsesWith(V: NewCall);
1582	Intr->eraseFromParent();
1583	continue;
1584	}
1585	default:
1586	Intr->print(O&: errs());
1587	llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1588	}
1589	}
1590
1591	for (IntrinsicInst *Intr : DeferredIntrs) {
1592	Builder.SetInsertPoint(Intr);
1593	Intrinsic::ID ID = Intr->getIntrinsicID();
1594	assert(ID == Intrinsic::memcpy \|\| ID == Intrinsic::memmove);
1595
1596	MemTransferInst *MI = cast<MemTransferInst>(Val: Intr);
1597	auto *B = Builder.CreateMemTransferInst(
1598	IntrID: ID, Dst: MI->getRawDest(), DstAlign: MI->getDestAlign(), Src: MI->getRawSource(),
1599	SrcAlign: MI->getSourceAlign(), Size: MI->getLength(), isVolatile: MI->isVolatile());
1600
1601	for (unsigned I = `0`; I != `2`; ++I) {
1602	if (uint64_t Bytes = Intr->getParamDereferenceableBytes(i: I)) {
1603	B->addDereferenceableParamAttr(i: I, Bytes);
1604	}
1605	}
1606
1607	Intr->eraseFromParent();
1608	}
1609
1610	return true;
1611	}
1612

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp