1 | //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Eliminates allocas by either converting them into vectors or by migrating |
10 | // them to local address space. |
11 | // |
12 | // Two passes are exposed by this file: |
13 | // - "promote-alloca-to-vector", which runs early in the pipeline and only |
14 | // promotes to vector. Promotion to vector is almost always profitable |
15 | // except when the alloca is too big and the promotion would result in |
16 | // very high register pressure. |
17 | // - "promote-alloca", which does both promotion to vector and LDS and runs |
18 | // much later in the pipeline. This runs after SROA because promoting to |
19 | // LDS is of course less profitable than getting rid of the alloca or |
20 | // vectorizing it, thus we only want to do it when the only alternative is |
21 | // lowering the alloca to stack. |
22 | // |
23 | // Note that both of them exist for the old and new PMs. The new PM passes are |
// declared in AMDGPU.h and the legacy PM ones are declared here.
25 | // |
26 | //===----------------------------------------------------------------------===// |
27 | |
28 | #include "AMDGPU.h" |
29 | #include "GCNSubtarget.h" |
30 | #include "Utils/AMDGPUBaseInfo.h" |
31 | #include "llvm/ADT/STLExtras.h" |
32 | #include "llvm/Analysis/CaptureTracking.h" |
33 | #include "llvm/Analysis/InstSimplifyFolder.h" |
34 | #include "llvm/Analysis/InstructionSimplify.h" |
35 | #include "llvm/Analysis/LoopInfo.h" |
36 | #include "llvm/Analysis/ValueTracking.h" |
37 | #include "llvm/CodeGen/TargetPassConfig.h" |
38 | #include "llvm/IR/IRBuilder.h" |
39 | #include "llvm/IR/IntrinsicInst.h" |
40 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
41 | #include "llvm/IR/IntrinsicsR600.h" |
42 | #include "llvm/IR/PatternMatch.h" |
43 | #include "llvm/InitializePasses.h" |
44 | #include "llvm/Pass.h" |
45 | #include "llvm/Target/TargetMachine.h" |
46 | #include "llvm/Transforms/Utils/SSAUpdater.h" |
47 | |
48 | #define DEBUG_TYPE "amdgpu-promote-alloca" |
49 | |
50 | using namespace llvm; |
51 | |
52 | namespace { |
53 | |
static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(false));

static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));

static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
    "amdgpu-promote-alloca-to-vector-max-regs",
    cl::desc(
        "Maximum vector size (in 32b registers) to use when promoting alloca"),
    cl::init(16));

// Use up to 1/4 of available register budget for vectorization.
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
    cl::init(4));

static cl::opt<unsigned>
    LoopUserWeight("promote-alloca-vector-loop-user-weight",
                   cl::desc("The bonus weight of users of allocas within loop "
                            "when sorting profitable allocas"),
                   cl::init(4));
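
// For example, with the default LoopUserWeight of 4, an alloca with two users
// outside any loop and one user at loop depth 1 scores
// (1 + 4*0) + (1 + 4*0) + (1 + 4*1) = 7 when sortAllocasToPromote runs below,
// and is visited before an alloca whose three users all sit outside loops
// (score 3).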
87 | |
88 | // Shared implementation which can do both promotion to vector and to LDS. |
89 | class AMDGPUPromoteAllocaImpl { |
90 | private: |
91 | const TargetMachine &TM; |
92 | LoopInfo &LI; |
93 | Module *Mod = nullptr; |
94 | const DataLayout *DL = nullptr; |
95 | |
96 | // FIXME: This should be per-kernel. |
97 | uint32_t LocalMemLimit = 0; |
98 | uint32_t CurrentLocalMemUsage = 0; |
99 | unsigned MaxVGPRs; |
100 | unsigned VGPRBudgetRatio; |
101 | unsigned MaxVectorRegs; |
102 | |
103 | bool IsAMDGCN = false; |
104 | bool IsAMDHSA = false; |
105 | |
106 | std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder); |
107 | Value *getWorkitemID(IRBuilder<> &Builder, unsigned N); |
108 | |
109 | /// BaseAlloca is the alloca root the search started from. |
110 | /// Val may be that alloca or a recursive user of it. |
111 | bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val, |
112 | std::vector<Value *> &WorkList) const; |
113 | |
114 | /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand |
115 | /// indices to an instruction with 2 pointer inputs (e.g. select, icmp). |
116 | /// Returns true if both operands are derived from the same alloca. Val should |
117 | /// be the same value as one of the input operands of UseInst. |
118 | bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val, |
119 | Instruction *UseInst, int OpIdx0, |
120 | int OpIdx1) const; |
121 | |
122 | /// Check whether we have enough local memory for promotion. |
123 | bool hasSufficientLocalMem(const Function &F); |
124 | |
125 | bool tryPromoteAllocaToVector(AllocaInst &I); |
126 | bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS); |
127 | |
128 | void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas); |
129 | |
130 | void setFunctionLimits(const Function &F); |
131 | |
132 | public: |
133 | AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) { |
134 | |
135 | const Triple &TT = TM.getTargetTriple(); |
136 | IsAMDGCN = TT.isAMDGCN(); |
137 | IsAMDHSA = TT.getOS() == Triple::AMDHSA; |
138 | } |
139 | |
140 | bool run(Function &F, bool PromoteToLDS); |
141 | }; |
142 | |
143 | // FIXME: This can create globals so should be a module pass. |
144 | class AMDGPUPromoteAlloca : public FunctionPass { |
145 | public: |
146 | static char ID; |
147 | |
148 | AMDGPUPromoteAlloca() : FunctionPass(ID) {} |
149 | |
150 | bool runOnFunction(Function &F) override { |
151 | if (skipFunction(F)) |
152 | return false; |
153 | if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) |
154 | return AMDGPUPromoteAllocaImpl( |
155 | TPC->getTM<TargetMachine>(), |
156 | getAnalysis<LoopInfoWrapperPass>().getLoopInfo()) |
157 | .run(F, /*PromoteToLDS*/ true); |
158 | return false; |
159 | } |
160 | |
  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
162 | |
163 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
164 | AU.setPreservesCFG(); |
165 | AU.addRequired<LoopInfoWrapperPass>(); |
166 | FunctionPass::getAnalysisUsage(AU); |
167 | } |
168 | }; |
169 | |
170 | static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM, |
171 | const Function &F) { |
172 | if (!TM.getTargetTriple().isAMDGCN()) |
173 | return 128; |
174 | |
175 | const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); |
176 | |
177 | unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F); |
178 | // Temporarily check both the attribute and the subtarget feature, until the |
179 | // latter is removed. |
180 | if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled()) |
181 | DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize(); |
182 | |
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
      DynamicVGPRBlockSize);

  // A non-entry function has only 32 caller-preserved registers.
  // Do not promote an alloca that would force spilling unless we know the
  // function will be inlined.
  if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
      !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    MaxVGPRs = std::min(MaxVGPRs, 32u);
193 | return MaxVGPRs; |
194 | } |
195 | |
196 | } // end anonymous namespace |
197 | |
198 | char AMDGPUPromoteAlloca::ID = 0; |
199 | |
INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promoting allocas, so the
// amount of LDS still available can be estimated accurately.
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)
208 | |
209 | char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; |
210 | |
211 | PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F, |
212 | FunctionAnalysisManager &AM) { |
  auto &LI = AM.getResult<LoopAnalysis>(F);
214 | bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true); |
215 | if (Changed) { |
216 | PreservedAnalyses PA; |
217 | PA.preserveSet<CFGAnalyses>(); |
218 | return PA; |
219 | } |
220 | return PreservedAnalyses::all(); |
221 | } |
222 | |
223 | PreservedAnalyses |
224 | AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) { |
  auto &LI = AM.getResult<LoopAnalysis>(F);
226 | bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false); |
227 | if (Changed) { |
228 | PreservedAnalyses PA; |
229 | PA.preserveSet<CFGAnalyses>(); |
230 | return PA; |
231 | } |
232 | return PreservedAnalyses::all(); |
233 | } |
234 | |
235 | FunctionPass *llvm::createAMDGPUPromoteAlloca() { |
236 | return new AMDGPUPromoteAlloca(); |
237 | } |
238 | |
239 | static void collectAllocaUses(AllocaInst &Alloca, |
240 | SmallVectorImpl<Use *> &Uses) { |
241 | SmallVector<Instruction *, 4> WorkList({&Alloca}); |
242 | while (!WorkList.empty()) { |
243 | auto *Cur = WorkList.pop_back_val(); |
244 | for (auto &U : Cur->uses()) { |
      Uses.push_back(&U);

      if (isa<GetElementPtrInst>(U.getUser()))
        WorkList.push_back(cast<Instruction>(U.getUser()));
249 | } |
250 | } |
251 | } |
252 | |
253 | void AMDGPUPromoteAllocaImpl::sortAllocasToPromote( |
254 | SmallVectorImpl<AllocaInst *> &Allocas) { |
255 | DenseMap<AllocaInst *, unsigned> Scores; |
256 | |
257 | for (auto *Alloca : Allocas) { |
258 | LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n" ); |
259 | unsigned &Score = Scores[Alloca]; |
260 | // Increment score by one for each user + a bonus for users within loops. |
261 | SmallVector<Use *, 8> Uses; |
262 | collectAllocaUses(Alloca&: *Alloca, Uses); |
263 | for (auto *U : Uses) { |
264 | Instruction *Inst = cast<Instruction>(Val: U->getUser()); |
265 | if (isa<GetElementPtrInst>(Val: Inst)) |
266 | continue; |
267 | unsigned UserScore = |
268 | 1 + (LoopUserWeight * LI.getLoopDepth(BB: Inst->getParent())); |
269 | LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n" ); |
270 | Score += UserScore; |
271 | } |
272 | LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n" ); |
273 | } |
274 | |
275 | stable_sort(Range&: Allocas, C: [&](AllocaInst *A, AllocaInst *B) { |
276 | return Scores.at(Val: A) > Scores.at(Val: B); |
277 | }); |
278 | |
279 | // clang-format off |
280 | LLVM_DEBUG( |
281 | dbgs() << "Sorted Worklist:\n" ; |
282 | for (auto *A: Allocas) |
283 | dbgs() << " " << *A << "\n" ; |
284 | ); |
285 | // clang-format on |
286 | } |
287 | |
288 | void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) { |
289 | // Load per function limits, overriding with global options where appropriate. |
  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
      PromoteAllocaToVectorVGPRRatio);
297 | if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences()) |
298 | VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio; |
299 | } |
300 | |
301 | bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { |
302 | Mod = F.getParent(); |
303 | DL = &Mod->getDataLayout(); |
304 | |
305 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); |
306 | if (!ST.isPromoteAllocaEnabled()) |
307 | return false; |
308 | |
309 | bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F); |
310 | MaxVGPRs = getMaxVGPRs(LDSBytes: CurrentLocalMemUsage, TM, F); |
311 | setFunctionLimits(F); |
312 | |
313 | unsigned VectorizationBudget = |
314 | (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8 |
315 | : (MaxVGPRs * 32)) / |
316 | VGPRBudgetRatio; |
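  // For example, with the default options (no byte-size limit and a VGPR
  // ratio of 4), a kernel that is granted MaxVGPRs = 256 starts with a budget
  // of 256 * 32 / 4 = 2048 bits, i.e. enough to vectorize sixteen 128-bit
  // (<4 x i32>) allocas.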
317 | |
318 | SmallVector<AllocaInst *, 16> Allocas; |
319 | for (Instruction &I : F.getEntryBlock()) { |
320 | if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: &I)) { |
321 | // Array allocations are probably not worth handling, since an allocation |
322 | // of the array type is the canonical form. |
323 | if (!AI->isStaticAlloca() || AI->isArrayAllocation()) |
324 | continue; |
325 | Allocas.push_back(Elt: AI); |
326 | } |
327 | } |
328 | |
329 | sortAllocasToPromote(Allocas); |
330 | |
331 | bool Changed = false; |
332 | for (AllocaInst *AI : Allocas) { |
333 | const unsigned AllocaCost = DL->getTypeSizeInBits(Ty: AI->getAllocatedType()); |
334 | // First, check if we have enough budget to vectorize this alloca. |
335 | if (AllocaCost <= VectorizationBudget) { |
      // If we do, attempt vectorization; otherwise, fall through and try
      // promoting to LDS instead.
338 | if (tryPromoteAllocaToVector(I&: *AI)) { |
339 | Changed = true; |
340 | assert((VectorizationBudget - AllocaCost) < VectorizationBudget && |
341 | "Underflow!" ); |
342 | VectorizationBudget -= AllocaCost; |
343 | LLVM_DEBUG(dbgs() << " Remaining vectorization budget:" |
344 | << VectorizationBudget << "\n" ); |
345 | continue; |
346 | } |
347 | } else { |
348 | LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" |
349 | << AllocaCost << ", budget:" << VectorizationBudget |
350 | << "): " << *AI << "\n" ); |
351 | } |
352 | |
353 | if (PromoteToLDS && tryPromoteAllocaToLDS(I&: *AI, SufficientLDS)) |
354 | Changed = true; |
355 | } |
356 | |
357 | // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains |
358 | // dangling pointers. If we want to reuse it past this point, the loop above |
359 | // would need to be updated to remove successfully promoted allocas. |
360 | |
361 | return Changed; |
362 | } |
363 | |
364 | struct MemTransferInfo { |
365 | ConstantInt *SrcIndex = nullptr; |
366 | ConstantInt *DestIndex = nullptr; |
367 | }; |
368 | |
369 | // Checks if the instruction I is a memset user of the alloca AI that we can |
370 | // deal with. Currently, only non-volatile memsets that affect the whole alloca |
371 | // are handled. |
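// For illustration, given "%a = alloca [8 x i16], addrspace(5)" (a 16-byte
// store size), a call along the lines of
//   call void @llvm.memset.p5.i32(ptr addrspace(5) %a, i8 0, i32 16, i1 false)
// would be accepted, while a memset covering only the first 8 bytes would not.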
372 | static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI, |
373 | const DataLayout &DL) { |
374 | using namespace PatternMatch; |
375 | // For now we only care about non-volatile memsets that affect the whole type |
376 | // (start at index 0 and fill the whole alloca). |
377 | // |
378 | // TODO: Now that we moved to PromoteAlloca we could handle any memsets |
379 | // (except maybe volatile ones?) - we just need to use shufflevector if it |
380 | // only affects a subset of the vector. |
381 | const unsigned Size = DL.getTypeStoreSize(Ty: AI->getAllocatedType()); |
382 | return I->getOperand(i_nocapture: 0) == AI && |
383 | match(V: I->getOperand(i_nocapture: 2), P: m_SpecificInt(V: Size)) && !I->isVolatile(); |
384 | } |
385 | |
386 | static Value *calculateVectorIndex( |
387 | Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) { |
388 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr->stripPointerCasts()); |
389 | if (!GEP) |
390 | return ConstantInt::getNullValue(Ty: Type::getInt32Ty(C&: Ptr->getContext())); |
391 | |
392 | auto I = GEPIdx.find(x: GEP); |
393 | assert(I != GEPIdx.end() && "Must have entry for GEP!" ); |
394 | |
395 | Value *IndexValue = I->second; |
396 | assert(IndexValue && "index value missing from GEP index map" ); |
397 | return IndexValue; |
398 | } |
399 | |
400 | static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, |
401 | Type *VecElemTy, const DataLayout &DL, |
402 | SmallVector<Instruction *> &NewInsts) { |
403 | // TODO: Extracting a "multiple of X" from a GEP might be a useful generic |
404 | // helper. |
405 | unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType()); |
406 | SmallMapVector<Value *, APInt, 4> VarOffsets; |
407 | APInt ConstOffset(BW, 0); |
408 | |
409 | // Walk backwards through nested GEPs to collect both constant and variable |
410 | // offsets, so that nested vector GEP chains can be lowered in one step. |
411 | // |
412 | // Given this IR fragment as input: |
413 | // |
414 | // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5) |
415 | // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j |
416 | // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4 |
417 | // %3 = load i32, ptr addrspace(5) %2, align 4 |
418 | // |
419 | // Combine both GEP operations in a single pass, producing: |
420 | // BasePtr = %0 |
421 | // ConstOffset = 4 |
422 | // VarOffsets = { %j -> element_size(<2 x i32>) } |
423 | // |
424 | // That lets us emit a single buffer_load directly into a VGPR, without ever |
425 | // allocating scratch memory for the intermediate pointer. |
426 | Value *CurPtr = GEP; |
427 | while (auto *CurGEP = dyn_cast<GetElementPtrInst>(Val: CurPtr)) { |
428 | if (!CurGEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ConstOffset)) |
429 | return nullptr; |
430 | |
431 | // Move to the next outer pointer. |
432 | CurPtr = CurGEP->getPointerOperand(); |
433 | } |
434 | |
435 | assert(CurPtr == Alloca && "GEP not based on alloca" ); |
436 | |
437 | unsigned VecElemSize = DL.getTypeAllocSize(Ty: VecElemTy); |
438 | if (VarOffsets.size() > 1) |
439 | return nullptr; |
440 | |
441 | APInt IndexQuot; |
442 | uint64_t Rem; |
443 | APInt::udivrem(LHS: ConstOffset, RHS: VecElemSize, Quotient&: IndexQuot, Remainder&: Rem); |
444 | if (Rem != 0) |
445 | return nullptr; |
446 | if (VarOffsets.size() == 0) |
447 | return ConstantInt::get(Context&: GEP->getContext(), V: IndexQuot); |
448 | |
449 | IRBuilder<> Builder(GEP); |
450 | |
451 | const auto &VarOffset = VarOffsets.front(); |
452 | APInt OffsetQuot; |
453 | APInt::udivrem(LHS: VarOffset.second, RHS: VecElemSize, Quotient&: OffsetQuot, Remainder&: Rem); |
454 | if (Rem != 0 || OffsetQuot.isZero()) |
455 | return nullptr; |
456 | |
457 | Value *Offset = VarOffset.first; |
458 | auto *OffsetType = dyn_cast<IntegerType>(Val: Offset->getType()); |
459 | if (!OffsetType) |
460 | return nullptr; |
461 | |
462 | if (!OffsetQuot.isOne()) { |
463 | ConstantInt *ConstMul = |
464 | ConstantInt::get(Ty: OffsetType, V: OffsetQuot.getZExtValue()); |
465 | Offset = Builder.CreateMul(LHS: Offset, RHS: ConstMul); |
466 | if (Instruction *NewInst = dyn_cast<Instruction>(Val: Offset)) |
467 | NewInsts.push_back(Elt: NewInst); |
468 | } |
469 | if (ConstOffset.isZero()) |
470 | return Offset; |
471 | |
472 | ConstantInt *ConstIndex = |
473 | ConstantInt::get(Ty: OffsetType, V: IndexQuot.getZExtValue()); |
474 | Value *IndexAdd = Builder.CreateAdd(LHS: ConstIndex, RHS: Offset); |
475 | if (Instruction *NewInst = dyn_cast<Instruction>(Val: IndexAdd)) |
476 | NewInsts.push_back(Elt: NewInst); |
477 | return IndexAdd; |
478 | } |
479 | |
480 | /// Promotes a single user of the alloca to a vector form. |
481 | /// |
482 | /// \param Inst Instruction to be promoted. |
483 | /// \param DL Module Data Layout. |
484 | /// \param VectorTy Vectorized Type. |
485 | /// \param VecStoreSize Size of \p VectorTy in bytes. |
486 | /// \param ElementSize Size of \p VectorTy element type in bytes. |
487 | /// \param TransferInfo MemTransferInst info map. |
488 | /// \param GEPVectorIdx GEP -> VectorIdx cache. |
489 | /// \param CurVal Current value of the vector (e.g. last stored value) |
490 | /// \param[out] DeferredLoads \p Inst is added to this vector if it can't |
491 | /// be promoted now. This happens when promoting requires \p |
492 | /// CurVal, but \p CurVal is nullptr. |
493 | /// \return the stored value if \p Inst would have written to the alloca, or |
494 | /// nullptr otherwise. |
495 | static Value *promoteAllocaUserToVector( |
496 | Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy, |
497 | unsigned VecStoreSize, unsigned ElementSize, |
498 | DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo, |
499 | std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal, |
500 | SmallVectorImpl<LoadInst *> &DeferredLoads) { |
501 | // Note: we use InstSimplifyFolder because it can leverage the DataLayout |
502 | // to do more folding, especially in the case of vector splats. |
503 | IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(), |
504 | InstSimplifyFolder(DL)); |
505 | Builder.SetInsertPoint(Inst); |
506 | |
507 | const auto GetOrLoadCurrentVectorValue = [&]() -> Value * { |
508 | if (CurVal) |
509 | return CurVal; |
510 | |
511 | // If the current value is not known, insert a dummy load and lower it on |
512 | // the second pass. |
513 | LoadInst *Dummy = |
514 | Builder.CreateLoad(Ty: VectorTy, Ptr: PoisonValue::get(T: Builder.getPtrTy()), |
515 | Name: "promotealloca.dummyload" ); |
516 | DeferredLoads.push_back(Elt: Dummy); |
517 | return Dummy; |
518 | }; |
519 | |
520 | const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val, |
521 | Type *PtrTy) -> Value * { |
522 | assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy)); |
523 | const unsigned Size = DL.getTypeStoreSizeInBits(Ty: PtrTy); |
524 | if (!PtrTy->isVectorTy()) |
525 | return Builder.CreateBitOrPointerCast(V: Val, DestTy: Builder.getIntNTy(N: Size)); |
526 | const unsigned NumPtrElts = cast<FixedVectorType>(Val: PtrTy)->getNumElements(); |
    // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to first
    // cast the ptr vector to <2 x i64>.
    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
530 | Type *EltTy = Builder.getIntNTy(N: Size / NumPtrElts); |
531 | return Builder.CreateBitOrPointerCast( |
532 | V: Val, DestTy: FixedVectorType::get(ElementType: EltTy, NumElts: NumPtrElts)); |
533 | }; |
534 | |
535 | Type *VecEltTy = VectorTy->getElementType(); |
536 | |
537 | switch (Inst->getOpcode()) { |
538 | case Instruction::Load: { |
539 | // Loads can only be lowered if the value is known. |
540 | if (!CurVal) { |
541 | DeferredLoads.push_back(Elt: cast<LoadInst>(Val: Inst)); |
542 | return nullptr; |
543 | } |
544 | |
545 | Value *Index = calculateVectorIndex( |
546 | Ptr: cast<LoadInst>(Val: Inst)->getPointerOperand(), GEPIdx: GEPVectorIdx); |
547 | |
548 | // We're loading the full vector. |
549 | Type *AccessTy = Inst->getType(); |
550 | TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy); |
551 | if (Constant *CI = dyn_cast<Constant>(Val: Index)) { |
552 | if (CI->isZeroValue() && AccessSize == VecStoreSize) { |
553 | if (AccessTy->isPtrOrPtrVectorTy()) |
554 | CurVal = CreateTempPtrIntCast(CurVal, AccessTy); |
555 | else if (CurVal->getType()->isPtrOrPtrVectorTy()) |
556 | CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType()); |
557 | Value *NewVal = Builder.CreateBitOrPointerCast(V: CurVal, DestTy: AccessTy); |
558 | Inst->replaceAllUsesWith(V: NewVal); |
559 | return nullptr; |
560 | } |
561 | } |
562 | |
563 | // Loading a subvector. |
564 | if (isa<FixedVectorType>(Val: AccessTy)) { |
565 | assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); |
566 | const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(Ty: VecEltTy); |
567 | auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumLoadedElts); |
568 | assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); |
569 | |
570 | Value *SubVec = PoisonValue::get(T: SubVecTy); |
571 | for (unsigned K = 0; K < NumLoadedElts; ++K) { |
572 | Value *CurIdx = |
573 | Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K)); |
574 | SubVec = Builder.CreateInsertElement( |
575 | Vec: SubVec, NewElt: Builder.CreateExtractElement(Vec: CurVal, Idx: CurIdx), Idx: K); |
576 | } |
577 | |
578 | if (AccessTy->isPtrOrPtrVectorTy()) |
579 | SubVec = CreateTempPtrIntCast(SubVec, AccessTy); |
580 | else if (SubVecTy->isPtrOrPtrVectorTy()) |
581 | SubVec = CreateTempPtrIntCast(SubVec, SubVecTy); |
582 | |
583 | SubVec = Builder.CreateBitOrPointerCast(V: SubVec, DestTy: AccessTy); |
584 | Inst->replaceAllUsesWith(V: SubVec); |
585 | return nullptr; |
586 | } |
587 | |
588 | // We're loading one element. |
    Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
    if (AccessTy != VecEltTy)
      ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);

    Inst->replaceAllUsesWith(ExtractElement);
594 | return nullptr; |
595 | } |
596 | case Instruction::Store: { |
597 | // For stores, it's a bit trickier and it depends on whether we're storing |
598 | // the full vector or not. If we're storing the full vector, we don't need |
599 | // to know the current value. If this is a store of a single element, we |
600 | // need to know the value. |
601 | StoreInst *SI = cast<StoreInst>(Val: Inst); |
602 | Value *Index = calculateVectorIndex(Ptr: SI->getPointerOperand(), GEPIdx: GEPVectorIdx); |
603 | Value *Val = SI->getValueOperand(); |
604 | |
605 | // We're storing the full vector, we can handle this without knowing CurVal. |
606 | Type *AccessTy = Val->getType(); |
607 | TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy); |
608 | if (Constant *CI = dyn_cast<Constant>(Val: Index)) { |
609 | if (CI->isZeroValue() && AccessSize == VecStoreSize) { |
610 | if (AccessTy->isPtrOrPtrVectorTy()) |
611 | Val = CreateTempPtrIntCast(Val, AccessTy); |
612 | else if (VectorTy->isPtrOrPtrVectorTy()) |
613 | Val = CreateTempPtrIntCast(Val, VectorTy); |
614 | return Builder.CreateBitOrPointerCast(V: Val, DestTy: VectorTy); |
615 | } |
616 | } |
617 | |
618 | // Storing a subvector. |
619 | if (isa<FixedVectorType>(Val: AccessTy)) { |
620 | assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); |
621 | const unsigned NumWrittenElts = |
622 | AccessSize / DL.getTypeStoreSize(Ty: VecEltTy); |
623 | const unsigned NumVecElts = VectorTy->getNumElements(); |
624 | auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumWrittenElts); |
625 | assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); |
626 | |
627 | if (SubVecTy->isPtrOrPtrVectorTy()) |
628 | Val = CreateTempPtrIntCast(Val, SubVecTy); |
629 | else if (AccessTy->isPtrOrPtrVectorTy()) |
630 | Val = CreateTempPtrIntCast(Val, AccessTy); |
631 | |
632 | Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: SubVecTy); |
633 | |
634 | Value *CurVec = GetOrLoadCurrentVectorValue(); |
635 | for (unsigned K = 0, NumElts = std::min(a: NumWrittenElts, b: NumVecElts); |
636 | K < NumElts; ++K) { |
637 | Value *CurIdx = |
638 | Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K)); |
639 | CurVec = Builder.CreateInsertElement( |
640 | Vec: CurVec, NewElt: Builder.CreateExtractElement(Vec: Val, Idx: K), Idx: CurIdx); |
641 | } |
642 | return CurVec; |
643 | } |
644 | |
645 | if (Val->getType() != VecEltTy) |
646 | Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: VecEltTy); |
647 | return Builder.CreateInsertElement(Vec: GetOrLoadCurrentVectorValue(), NewElt: Val, |
648 | Idx: Index); |
649 | } |
650 | case Instruction::Call: { |
651 | if (auto *MTI = dyn_cast<MemTransferInst>(Val: Inst)) { |
652 | // For memcpy, we need to know curval. |
653 | ConstantInt *Length = cast<ConstantInt>(Val: MTI->getLength()); |
654 | unsigned NumCopied = Length->getZExtValue() / ElementSize; |
655 | MemTransferInfo *TI = &TransferInfo[MTI]; |
656 | unsigned SrcBegin = TI->SrcIndex->getZExtValue(); |
657 | unsigned DestBegin = TI->DestIndex->getZExtValue(); |
658 | |
659 | SmallVector<int> Mask; |
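      // For example, for an <8 x i32> alloca vector, a copy of two elements
      // from source index 1 to dest index 4 yields the mask
      // {0, 1, 2, 3, 1, 2, 6, 7}: lanes outside the destination range keep
      // their own index, and destination lanes read from the source range.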
660 | for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { |
661 | if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { |
662 | Mask.push_back(Elt: SrcBegin < VectorTy->getNumElements() |
663 | ? SrcBegin++ |
664 | : PoisonMaskElem); |
665 | } else { |
666 | Mask.push_back(Elt: Idx); |
667 | } |
668 | } |
669 | |
670 | return Builder.CreateShuffleVector(V: GetOrLoadCurrentVectorValue(), Mask); |
671 | } |
672 | |
673 | if (auto *MSI = dyn_cast<MemSetInst>(Val: Inst)) { |
674 | // For memset, we don't need to know the previous value because we |
675 | // currently only allow memsets that cover the whole alloca. |
676 | Value *Elt = MSI->getOperand(i_nocapture: 1); |
677 | const unsigned BytesPerElt = DL.getTypeStoreSize(Ty: VecEltTy); |
678 | if (BytesPerElt > 1) { |
679 | Value *EltBytes = Builder.CreateVectorSplat(NumElts: BytesPerElt, V: Elt); |
680 | |
681 | // If the element type of the vector is a pointer, we need to first cast |
682 | // to an integer, then use a PtrCast. |
683 | if (VecEltTy->isPointerTy()) { |
684 | Type *PtrInt = Builder.getIntNTy(N: BytesPerElt * 8); |
685 | Elt = Builder.CreateBitCast(V: EltBytes, DestTy: PtrInt); |
686 | Elt = Builder.CreateIntToPtr(V: Elt, DestTy: VecEltTy); |
687 | } else |
688 | Elt = Builder.CreateBitCast(V: EltBytes, DestTy: VecEltTy); |
689 | } |
690 | |
691 | return Builder.CreateVectorSplat(EC: VectorTy->getElementCount(), V: Elt); |
692 | } |
693 | |
694 | if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) { |
695 | if (Intr->getIntrinsicID() == Intrinsic::objectsize) { |
696 | Intr->replaceAllUsesWith( |
697 | V: Builder.getIntN(N: Intr->getType()->getIntegerBitWidth(), |
698 | C: DL.getTypeAllocSize(Ty: VectorTy))); |
699 | return nullptr; |
700 | } |
701 | } |
702 | |
703 | llvm_unreachable("Unsupported call when promoting alloca to vector" ); |
704 | } |
705 | |
706 | default: |
707 | llvm_unreachable("Inconsistency in instructions promotable to vector" ); |
708 | } |
709 | |
710 | llvm_unreachable("Did not return after promoting instruction!" ); |
711 | } |
712 | |
713 | static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy, |
714 | const DataLayout &DL) { |
715 | // Access as a vector type can work if the size of the access vector is a |
716 | // multiple of the size of the alloca's vector element type. |
717 | // |
718 | // Examples: |
719 | // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK |
720 | // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK |
721 | // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK |
722 | // - 3*32 is not a multiple of 64 |
723 | // |
724 | // We could handle more complicated cases, but it'd make things a lot more |
725 | // complicated. |
726 | if (isa<FixedVectorType>(Val: AccessTy)) { |
727 | TypeSize AccTS = DL.getTypeStoreSize(Ty: AccessTy); |
    // If the type size and the store size don't match, we would need to do
    // more than just a bitcast to translate between extracted/insertable
    // subvectors and the accessed value.
    if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
732 | return false; |
733 | TypeSize VecTS = DL.getTypeStoreSize(Ty: VecTy->getElementType()); |
734 | return AccTS.isKnownMultipleOf(RHS: VecTS); |
735 | } |
736 | |
737 | return CastInst::isBitOrNoopPointerCastable(SrcTy: VecTy->getElementType(), DestTy: AccessTy, |
738 | DL); |
739 | } |
740 | |
741 | /// Iterates over an instruction worklist that may contain multiple instructions |
742 | /// from the same basic block, but in a different order. |
743 | template <typename InstContainer> |
744 | static void forEachWorkListItem(const InstContainer &WorkList, |
745 | std::function<void(Instruction *)> Fn) { |
746 | // Bucket up uses of the alloca by the block they occur in. |
747 | // This is important because we have to handle multiple defs/uses in a block |
748 | // ourselves: SSAUpdater is purely for cross-block references. |
749 | DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock; |
750 | for (Instruction *User : WorkList) |
751 | UsesByBlock[User->getParent()].insert(V: User); |
752 | |
753 | for (Instruction *User : WorkList) { |
754 | BasicBlock *BB = User->getParent(); |
755 | auto &BlockUses = UsesByBlock[BB]; |
756 | |
757 | // Already processed, skip. |
758 | if (BlockUses.empty()) |
759 | continue; |
760 | |
761 | // Only user in the block, directly process it. |
762 | if (BlockUses.size() == 1) { |
763 | Fn(User); |
764 | continue; |
765 | } |
766 | |
767 | // Multiple users in the block, do a linear scan to see users in order. |
768 | for (Instruction &Inst : *BB) { |
769 | if (!BlockUses.contains(V: &Inst)) |
770 | continue; |
771 | |
772 | Fn(&Inst); |
773 | } |
774 | |
775 | // Clear the block so we know it's been processed. |
776 | BlockUses.clear(); |
777 | } |
778 | } |
779 | |
780 | /// Find an insert point after an alloca, after all other allocas clustered at |
781 | /// the start of the block. |
782 | static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, |
783 | BasicBlock::iterator I) { |
784 | for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(Val: *I); ++I) |
785 | ; |
786 | return I; |
787 | } |
788 | |
789 | // FIXME: Should try to pick the most likely to be profitable allocas first. |
790 | bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { |
791 | LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); |
792 | |
793 | if (DisablePromoteAllocaToVector) { |
794 | LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n" ); |
795 | return false; |
796 | } |
797 | |
798 | Type *AllocaTy = Alloca.getAllocatedType(); |
799 | auto *VectorTy = dyn_cast<FixedVectorType>(Val: AllocaTy); |
800 | if (auto *ArrayTy = dyn_cast<ArrayType>(Val: AllocaTy)) { |
801 | uint64_t NumElems = 1; |
802 | Type *ElemTy; |
803 | do { |
804 | NumElems *= ArrayTy->getNumElements(); |
805 | ElemTy = ArrayTy->getElementType(); |
806 | } while ((ArrayTy = dyn_cast<ArrayType>(Val: ElemTy))); |
807 | |
808 | // Check for array of vectors |
809 | auto *InnerVectorTy = dyn_cast<FixedVectorType>(Val: ElemTy); |
810 | if (InnerVectorTy) { |
811 | NumElems *= InnerVectorTy->getNumElements(); |
812 | ElemTy = InnerVectorTy->getElementType(); |
813 | } |
814 | |
815 | if (VectorType::isValidElementType(ElemTy) && NumElems > 0) { |
816 | unsigned ElementSize = DL->getTypeSizeInBits(Ty: ElemTy) / 8; |
817 | if (ElementSize > 0) { |
818 | unsigned AllocaSize = DL->getTypeStoreSize(Ty: AllocaTy); |
819 | // Expand vector if required to match padding of inner type, |
820 | // i.e. odd size subvectors. |
821 | // Storage size of new vector must match that of alloca for correct |
822 | // behaviour of byte offsets and GEP computation. |
823 | if (NumElems * ElementSize != AllocaSize) |
824 | NumElems = AllocaSize / ElementSize; |
825 | if (NumElems > 0 && (AllocaSize % ElementSize) == 0) |
826 | VectorTy = FixedVectorType::get(ElementType: ElemTy, NumElts: NumElems); |
827 | } |
828 | } |
829 | } |
830 | |
831 | if (!VectorTy) { |
832 | LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n" ); |
833 | return false; |
834 | } |
835 | |
836 | const unsigned MaxElements = |
837 | (MaxVectorRegs * 32) / DL->getTypeSizeInBits(Ty: VectorTy->getElementType()); |
838 | |
839 | if (VectorTy->getNumElements() > MaxElements || |
840 | VectorTy->getNumElements() < 2) { |
841 | LLVM_DEBUG(dbgs() << " " << *VectorTy |
842 | << " has an unsupported number of elements\n" ); |
843 | return false; |
844 | } |
845 | |
846 | std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx; |
847 | SmallVector<Instruction *> WorkList; |
848 | SmallVector<Instruction *> UsersToRemove; |
849 | SmallVector<Instruction *> DeferredInsts; |
850 | SmallVector<Instruction *> NewGEPInsts; |
851 | DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo; |
852 | |
853 | const auto RejectUser = [&](Instruction *Inst, Twine Msg) { |
854 | LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" |
855 | << " " << *Inst << "\n" ); |
856 | for (auto *Inst : reverse(C&: NewGEPInsts)) |
857 | Inst->eraseFromParent(); |
858 | return false; |
859 | }; |
860 | |
861 | SmallVector<Use *, 8> Uses; |
862 | collectAllocaUses(Alloca, Uses); |
863 | |
864 | LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n" ); |
865 | |
866 | Type *VecEltTy = VectorTy->getElementType(); |
867 | unsigned ElementSizeInBits = DL->getTypeSizeInBits(Ty: VecEltTy); |
868 | if (ElementSizeInBits != DL->getTypeAllocSizeInBits(Ty: VecEltTy)) { |
869 | LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " |
870 | "does not match the type's size\n" ); |
871 | return false; |
872 | } |
873 | unsigned ElementSize = ElementSizeInBits / 8; |
874 | assert(ElementSize > 0); |
875 | for (auto *U : Uses) { |
876 | Instruction *Inst = cast<Instruction>(Val: U->getUser()); |
877 | |
878 | if (Value *Ptr = getLoadStorePointerOperand(V: Inst)) { |
879 | // This is a store of the pointer, not to the pointer. |
880 | if (isa<StoreInst>(Val: Inst) && |
881 | U->getOperandNo() != StoreInst::getPointerOperandIndex()) |
882 | return RejectUser(Inst, "pointer is being stored" ); |
883 | |
884 | Type *AccessTy = getLoadStoreType(I: Inst); |
885 | if (AccessTy->isAggregateType()) |
886 | return RejectUser(Inst, "unsupported load/store as aggregate" ); |
887 | assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy()); |
888 | |
889 | // Check that this is a simple access of a vector element. |
890 | bool IsSimple = isa<LoadInst>(Val: Inst) ? cast<LoadInst>(Val: Inst)->isSimple() |
891 | : cast<StoreInst>(Val: Inst)->isSimple(); |
892 | if (!IsSimple) |
893 | return RejectUser(Inst, "not a simple load or store" ); |
894 | |
895 | Ptr = Ptr->stripPointerCasts(); |
896 | |
897 | // Alloca already accessed as vector. |
898 | if (Ptr == &Alloca && DL->getTypeStoreSize(Ty: Alloca.getAllocatedType()) == |
899 | DL->getTypeStoreSize(Ty: AccessTy)) { |
900 | WorkList.push_back(Elt: Inst); |
901 | continue; |
902 | } |
903 | |
904 | if (!isSupportedAccessType(VecTy: VectorTy, AccessTy, DL: *DL)) |
905 | return RejectUser(Inst, "not a supported access type" ); |
906 | |
907 | WorkList.push_back(Elt: Inst); |
908 | continue; |
909 | } |
910 | |
911 | if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) { |
912 | // If we can't compute a vector index from this GEP, then we can't |
913 | // promote this alloca to vector. |
914 | Value *Index = GEPToVectorIndex(GEP, Alloca: &Alloca, VecElemTy: VecEltTy, DL: *DL, NewInsts&: NewGEPInsts); |
915 | if (!Index) |
916 | return RejectUser(Inst, "cannot compute vector index for GEP" ); |
917 | |
918 | GEPVectorIdx[GEP] = Index; |
919 | UsersToRemove.push_back(Elt: Inst); |
920 | continue; |
921 | } |
922 | |
923 | if (MemSetInst *MSI = dyn_cast<MemSetInst>(Val: Inst); |
924 | MSI && isSupportedMemset(I: MSI, AI: &Alloca, DL: *DL)) { |
925 | WorkList.push_back(Elt: Inst); |
926 | continue; |
927 | } |
928 | |
929 | if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Val: Inst)) { |
930 | if (TransferInst->isVolatile()) |
931 | return RejectUser(Inst, "mem transfer inst is volatile" ); |
932 | |
933 | ConstantInt *Len = dyn_cast<ConstantInt>(Val: TransferInst->getLength()); |
934 | if (!Len || (Len->getZExtValue() % ElementSize)) |
935 | return RejectUser(Inst, "mem transfer inst length is non-constant or " |
936 | "not a multiple of the vector element size" ); |
937 | |
938 | if (TransferInfo.try_emplace(Key: TransferInst).second) { |
939 | DeferredInsts.push_back(Elt: Inst); |
940 | WorkList.push_back(Elt: Inst); |
941 | } |
942 | |
943 | auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * { |
944 | GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
945 | if (Ptr != &Alloca && !GEPVectorIdx.count(x: GEP)) |
946 | return nullptr; |
947 | |
948 | return dyn_cast<ConstantInt>(Val: calculateVectorIndex(Ptr, GEPIdx: GEPVectorIdx)); |
949 | }; |
950 | |
951 | unsigned OpNum = U->getOperandNo(); |
952 | MemTransferInfo *TI = &TransferInfo[TransferInst]; |
953 | if (OpNum == 0) { |
954 | Value *Dest = TransferInst->getDest(); |
955 | ConstantInt *Index = getPointerIndexOfAlloca(Dest); |
956 | if (!Index) |
957 | return RejectUser(Inst, "could not calculate constant dest index" ); |
958 | TI->DestIndex = Index; |
959 | } else { |
960 | assert(OpNum == 1); |
961 | Value *Src = TransferInst->getSource(); |
962 | ConstantInt *Index = getPointerIndexOfAlloca(Src); |
963 | if (!Index) |
964 | return RejectUser(Inst, "could not calculate constant src index" ); |
965 | TI->SrcIndex = Index; |
966 | } |
967 | continue; |
968 | } |
969 | |
970 | if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) { |
971 | if (Intr->getIntrinsicID() == Intrinsic::objectsize) { |
972 | WorkList.push_back(Elt: Inst); |
973 | continue; |
974 | } |
975 | } |
976 | |
977 | // Ignore assume-like intrinsics and comparisons used in assumes. |
978 | if (isAssumeLikeIntrinsic(I: Inst)) { |
979 | if (!Inst->use_empty()) |
980 | return RejectUser(Inst, "assume-like intrinsic cannot have any users" ); |
981 | UsersToRemove.push_back(Elt: Inst); |
982 | continue; |
983 | } |
984 | |
985 | if (isa<ICmpInst>(Val: Inst) && all_of(Range: Inst->users(), P: [](User *U) { |
986 | return isAssumeLikeIntrinsic(I: cast<Instruction>(Val: U)); |
987 | })) { |
988 | UsersToRemove.push_back(Elt: Inst); |
989 | continue; |
990 | } |
991 | |
992 | return RejectUser(Inst, "unhandled alloca user" ); |
993 | } |
994 | |
995 | while (!DeferredInsts.empty()) { |
996 | Instruction *Inst = DeferredInsts.pop_back_val(); |
997 | MemTransferInst *TransferInst = cast<MemTransferInst>(Val: Inst); |
998 | // TODO: Support the case if the pointers are from different alloca or |
999 | // from different address spaces. |
1000 | MemTransferInfo &Info = TransferInfo[TransferInst]; |
1001 | if (!Info.SrcIndex || !Info.DestIndex) |
1002 | return RejectUser( |
1003 | Inst, "mem transfer inst is missing constant src and/or dst index" ); |
1004 | } |
1005 | |
1006 | LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " |
1007 | << *VectorTy << '\n'); |
1008 | const unsigned VecStoreSize = DL->getTypeStoreSize(Ty: VectorTy); |
1009 | |
1010 | // Alloca is uninitialized memory. Imitate that by making the first value |
1011 | // undef. |
1012 | SSAUpdater Updater; |
1013 | Updater.Initialize(Ty: VectorTy, Name: "promotealloca" ); |
1014 | |
1015 | BasicBlock *EntryBB = Alloca.getParent(); |
1016 | BasicBlock::iterator InitInsertPos = |
1017 | skipToNonAllocaInsertPt(BB&: *EntryBB, I: Alloca.getIterator()); |
  // Alloca memory is undefined to begin with, not poison.
1019 | Value *AllocaInitValue = |
1020 | new FreezeInst(PoisonValue::get(T: VectorTy), "" , InitInsertPos); |
1021 | AllocaInitValue->takeName(V: &Alloca); |
1022 | |
1023 | Updater.AddAvailableValue(BB: EntryBB, V: AllocaInitValue); |
1024 | |
1025 | // First handle the initial worklist. |
1026 | SmallVector<LoadInst *, 4> DeferredLoads; |
1027 | forEachWorkListItem(WorkList, Fn: [&](Instruction *I) { |
1028 | BasicBlock *BB = I->getParent(); |
1029 | // On the first pass, we only take values that are trivially known, i.e. |
1030 | // where AddAvailableValue was already called in this block. |
1031 | Value *Result = promoteAllocaUserToVector( |
1032 | Inst: I, DL: *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, |
1033 | CurVal: Updater.FindValueForBlock(BB), DeferredLoads); |
1034 | if (Result) |
1035 | Updater.AddAvailableValue(BB, V: Result); |
1036 | }); |
1037 | |
1038 | // Then handle deferred loads. |
1039 | forEachWorkListItem(WorkList: DeferredLoads, Fn: [&](Instruction *I) { |
1040 | SmallVector<LoadInst *, 0> NewDLs; |
1041 | BasicBlock *BB = I->getParent(); |
1042 | // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always |
1043 | // get a value, inserting PHIs as needed. |
1044 | Value *Result = promoteAllocaUserToVector( |
1045 | Inst: I, DL: *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, |
1046 | CurVal: Updater.GetValueInMiddleOfBlock(BB: I->getParent()), DeferredLoads&: NewDLs); |
1047 | if (Result) |
1048 | Updater.AddAvailableValue(BB, V: Result); |
1049 | assert(NewDLs.empty() && "No more deferred loads should be queued!" ); |
1050 | }); |
1051 | |
1052 | // Delete all instructions. On the first pass, new dummy loads may have been |
1053 | // added so we need to collect them too. |
1054 | DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end()); |
1055 | InstsToDelete.insert_range(R&: DeferredLoads); |
1056 | for (Instruction *I : InstsToDelete) { |
1057 | assert(I->use_empty()); |
1058 | I->eraseFromParent(); |
1059 | } |
1060 | |
  // Delete all the users that are known to be removable.
1062 | for (Instruction *I : reverse(C&: UsersToRemove)) { |
1063 | I->dropDroppableUses(); |
1064 | assert(I->use_empty()); |
1065 | I->eraseFromParent(); |
1066 | } |
1067 | |
1068 | // Alloca should now be dead too. |
1069 | assert(Alloca.use_empty()); |
1070 | Alloca.eraseFromParent(); |
1071 | return true; |
1072 | } |
1073 | |
1074 | std::pair<Value *, Value *> |
1075 | AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { |
1076 | Function &F = *Builder.GetInsertBlock()->getParent(); |
1077 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); |
1078 | |
1079 | if (!IsAMDHSA) { |
1080 | CallInst *LocalSizeY = |
1081 | Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_y, Args: {}); |
1082 | CallInst *LocalSizeZ = |
1083 | Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_z, Args: {}); |
1084 | |
1085 | ST.makeLIDRangeMetadata(I: LocalSizeY); |
1086 | ST.makeLIDRangeMetadata(I: LocalSizeZ); |
1087 | |
1088 | return std::pair(LocalSizeY, LocalSizeZ); |
1089 | } |
1090 | |
1091 | // We must read the size out of the dispatch pointer. |
1092 | assert(IsAMDGCN); |
1093 | |
1094 | // We are indexing into this struct, and want to extract the workgroup_size_* |
1095 | // fields. |
1096 | // |
1097 | // typedef struct hsa_kernel_dispatch_packet_s { |
1098 | // uint16_t header; |
1099 | // uint16_t setup; |
  // uint16_t workgroup_size_x;
  // uint16_t workgroup_size_y;
  // uint16_t workgroup_size_z;
  // uint16_t reserved0;
  // uint32_t grid_size_x;
  // uint32_t grid_size_y;
  // uint32_t grid_size_z;
1107 | // |
1108 | // uint32_t private_segment_size; |
1109 | // uint32_t group_segment_size; |
1110 | // uint64_t kernel_object; |
1111 | // |
1112 | // #ifdef HSA_LARGE_MODEL |
1113 | // void *kernarg_address; |
1114 | // #elif defined HSA_LITTLE_ENDIAN |
1115 | // void *kernarg_address; |
1116 | // uint32_t reserved1; |
1117 | // #else |
1118 | // uint32_t reserved1; |
1119 | // void *kernarg_address; |
1120 | // #endif |
1121 | // uint64_t reserved2; |
1122 | // hsa_signal_t completion_signal; // uint64_t wrapper |
1123 | // } hsa_kernel_dispatch_packet_t |
1124 | // |
1125 | CallInst *DispatchPtr = |
1126 | Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_dispatch_ptr, Args: {}); |
1127 | DispatchPtr->addRetAttr(Kind: Attribute::NoAlias); |
1128 | DispatchPtr->addRetAttr(Kind: Attribute::NonNull); |
1129 | F.removeFnAttr(Kind: "amdgpu-no-dispatch-ptr" ); |
1130 | |
1131 | // Size of the dispatch packet struct. |
1132 | DispatchPtr->addDereferenceableRetAttr(Bytes: 64); |
1133 | |
1134 | Type *I32Ty = Type::getInt32Ty(C&: Mod->getContext()); |
1135 | |
1136 | // We could do a single 64-bit load here, but it's likely that the basic |
1137 | // 32-bit and extract sequence is already present, and it is probably easier |
1138 | // to CSE this. The loads should be mergeable later anyway. |
1139 | Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 1); |
1140 | LoadInst *LoadXY = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPXY, Align: Align(4)); |
1141 | |
1142 | Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: 2); |
1143 | LoadInst *LoadZU = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPZU, Align: Align(4)); |
1144 | |
1145 | MDNode *MD = MDNode::get(Context&: Mod->getContext(), MDs: {}); |
1146 | LoadXY->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD); |
1147 | LoadZU->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD); |
1148 | ST.makeLIDRangeMetadata(I: LoadZU); |
1149 | |
1150 | // Extract y component. Upper half of LoadZU should be zero already. |
1151 | Value *Y = Builder.CreateLShr(LHS: LoadXY, RHS: 16); |
1152 | |
1153 | return std::pair(Y, LoadZU); |
1154 | } |
1155 | |
1156 | Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder, |
1157 | unsigned N) { |
1158 | Function *F = Builder.GetInsertBlock()->getParent(); |
1159 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: *F); |
1160 | Intrinsic::ID IntrID = Intrinsic::not_intrinsic; |
1161 | StringRef AttrName; |
1162 | |
1163 | switch (N) { |
1164 | case 0: |
1165 | IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x |
1166 | : (Intrinsic::ID)Intrinsic::r600_read_tidig_x; |
1167 | AttrName = "amdgpu-no-workitem-id-x" ; |
1168 | break; |
1169 | case 1: |
1170 | IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y |
1171 | : (Intrinsic::ID)Intrinsic::r600_read_tidig_y; |
1172 | AttrName = "amdgpu-no-workitem-id-y" ; |
1173 | break; |
1174 | |
1175 | case 2: |
1176 | IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z |
1177 | : (Intrinsic::ID)Intrinsic::r600_read_tidig_z; |
1178 | AttrName = "amdgpu-no-workitem-id-z" ; |
1179 | break; |
1180 | default: |
1181 | llvm_unreachable("invalid dimension" ); |
1182 | } |
1183 | |
1184 | Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(M: Mod, id: IntrID); |
1185 | CallInst *CI = Builder.CreateCall(Callee: WorkitemIdFn); |
1186 | ST.makeLIDRangeMetadata(I: CI); |
1187 | F->removeFnAttr(Kind: AttrName); |
1188 | |
1189 | return CI; |
1190 | } |
1191 | |
1192 | static bool isCallPromotable(CallInst *CI) { |
1193 | IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI); |
1194 | if (!II) |
1195 | return false; |
1196 | |
1197 | switch (II->getIntrinsicID()) { |
1198 | case Intrinsic::memcpy: |
1199 | case Intrinsic::memmove: |
1200 | case Intrinsic::memset: |
1201 | case Intrinsic::lifetime_start: |
1202 | case Intrinsic::lifetime_end: |
1203 | case Intrinsic::invariant_start: |
1204 | case Intrinsic::invariant_end: |
1205 | case Intrinsic::launder_invariant_group: |
1206 | case Intrinsic::strip_invariant_group: |
1207 | case Intrinsic::objectsize: |
1208 | return true; |
1209 | default: |
1210 | return false; |
1211 | } |
1212 | } |
1213 | |
1214 | bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca( |
1215 | Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0, |
1216 | int OpIdx1) const { |
1217 | // Figure out which operand is the one we might not be promoting. |
1218 | Value *OtherOp = Inst->getOperand(i: OpIdx0); |
1219 | if (Val == OtherOp) |
1220 | OtherOp = Inst->getOperand(i: OpIdx1); |
1221 | |
1222 | if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: OtherOp)) |
1223 | return true; |
1224 | |
1225 | // TODO: getUnderlyingObject will not work on a vector getelementptr |
1226 | Value *OtherObj = getUnderlyingObject(V: OtherOp); |
1227 | if (!isa<AllocaInst>(Val: OtherObj)) |
1228 | return false; |
1229 | |
1230 | // TODO: We should be able to replace undefs with the right pointer type. |
1231 | |
1232 | // TODO: If we know the other base object is another promotable |
1233 | // alloca, not necessarily this alloca, we can do this. The |
1234 | // important part is both must have the same address space at |
1235 | // the end. |
1236 | if (OtherObj != BaseAlloca) { |
1237 | LLVM_DEBUG( |
1238 | dbgs() << "Found a binary instruction with another alloca object\n" ); |
1239 | return false; |
1240 | } |
1241 | |
1242 | return true; |
1243 | } |
1244 | |
1245 | bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( |
1246 | Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const { |
1247 | |
1248 | for (User *User : Val->users()) { |
1249 | if (is_contained(Range&: WorkList, Element: User)) |
1250 | continue; |
1251 | |
1252 | if (CallInst *CI = dyn_cast<CallInst>(Val: User)) { |
1253 | if (!isCallPromotable(CI)) |
1254 | return false; |
1255 | |
1256 | WorkList.push_back(x: User); |
1257 | continue; |
1258 | } |
1259 | |
1260 | Instruction *UseInst = cast<Instruction>(Val: User); |
1261 | if (UseInst->getOpcode() == Instruction::PtrToInt) |
1262 | return false; |
1263 | |
1264 | if (LoadInst *LI = dyn_cast<LoadInst>(Val: UseInst)) { |
1265 | if (LI->isVolatile()) |
1266 | return false; |
1267 | continue; |
1268 | } |
1269 | |
1270 | if (StoreInst *SI = dyn_cast<StoreInst>(Val: UseInst)) { |
1271 | if (SI->isVolatile()) |
1272 | return false; |
1273 | |
1274 | // Reject if the stored value is not the pointer operand. |
1275 | if (SI->getPointerOperand() != Val) |
1276 | return false; |
1277 | continue; |
1278 | } |
1279 | |
1280 | if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UseInst)) { |
1281 | if (RMW->isVolatile()) |
1282 | return false; |
1283 | continue; |
1284 | } |
1285 | |
1286 | if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(Val: UseInst)) { |
1287 | if (CAS->isVolatile()) |
1288 | return false; |
1289 | continue; |
1290 | } |
1291 | |
    // Only promote an icmp if we know that the other operand is derived from
    // a pointer that will also be promoted.
1294 | if (ICmpInst *ICmp = dyn_cast<ICmpInst>(Val: UseInst)) { |
1295 | if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: ICmp, OpIdx0: 0, OpIdx1: 1)) |
1296 | return false; |
1297 | |
1298 | // May need to rewrite constant operands. |
1299 | WorkList.push_back(x: ICmp); |
1300 | continue; |
1301 | } |
1302 | |
1303 | if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: UseInst)) { |
1304 | // Be conservative if an address could be computed outside the bounds of |
1305 | // the alloca. |
1306 | if (!GEP->isInBounds()) |
1307 | return false; |
1308 | } else if (SelectInst *SI = dyn_cast<SelectInst>(Val: UseInst)) { |
1309 | // Only promote a select if we know that the other select operand is from |
1310 | // another pointer that will also be promoted. |
1311 | if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: SI, OpIdx0: 1, OpIdx1: 2)) |
1312 | return false; |
1313 | } else if (PHINode *Phi = dyn_cast<PHINode>(Val: UseInst)) { |
1314 | // Repeat for phis. |
1315 | |
1316 | // TODO: Handle more complex cases. We should be able to replace loops |
1317 | // over arrays. |
1318 | switch (Phi->getNumIncomingValues()) { |
1319 | case 1: |
1320 | break; |
1321 | case 2: |
1322 | if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: Phi, OpIdx0: 0, OpIdx1: 1)) |
1323 | return false; |
1324 | break; |
1325 | default: |
1326 | return false; |
1327 | } |
1328 | } else if (!isa<ExtractElementInst>(Val: User)) { |
1329 | // Do not promote vector/aggregate type instructions. It is hard to track |
1330 | // their users. |
1331 | |
1332 | // Do not promote addrspacecast. |
1333 | // |
1334 | // TODO: If we know the address is only observed through flat pointers, we |
1335 | // could still promote. |
1336 | return false; |
1337 | } |
1338 | |
1339 | WorkList.push_back(x: User); |
1340 | if (!collectUsesWithPtrTypes(BaseAlloca, Val: User, WorkList)) |
1341 | return false; |
1342 | } |
1343 | |
1344 | return true; |
1345 | } |
1346 | |
1347 | bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { |
1348 | |
1349 | FunctionType *FTy = F.getFunctionType(); |
1350 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); |
1351 | |
1352 | // If the function has any arguments in the local address space, then it's |
1353 | // possible these arguments require the entire local memory space, so |
1354 | // we cannot use local memory in the pass. |
1355 | for (Type *ParamTy : FTy->params()) { |
1356 | PointerType *PtrTy = dyn_cast<PointerType>(Val: ParamTy); |
1357 | if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { |
1358 | LocalMemLimit = 0; |
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
1361 | return false; |
1362 | } |
1363 | } |
1364 | |
1365 | LocalMemLimit = ST.getAddressableLocalMemorySize(); |
1366 | if (LocalMemLimit == 0) |
1367 | return false; |
1368 | |
1369 | SmallVector<const Constant *, 16> Stack; |
1370 | SmallPtrSet<const Constant *, 8> VisitedConstants; |
1371 | SmallPtrSet<const GlobalVariable *, 8> UsedLDS; |
1372 | |
1373 | auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool { |
1374 | for (const User *U : Val->users()) { |
      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
1376 | if (Use->getParent()->getParent() == &F) |
1377 | return true; |
1378 | } else { |
        const Constant *C = cast<Constant>(U);
        if (VisitedConstants.insert(C).second)
          Stack.push_back(C);
1382 | } |
1383 | } |
1384 | |
1385 | return false; |
1386 | }; |
1387 | |
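  // Conservatively record every LDS global that is referenced from this
  // function, either directly or transitively through ConstantExprs; its size
  // counts against the budget available for promotion.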
1388 | for (GlobalVariable &GV : Mod->globals()) { |
1389 | if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) |
1390 | continue; |
1391 | |
1392 | if (visitUsers(&GV, &GV)) { |
      UsedLDS.insert(&GV);
1394 | Stack.clear(); |
1395 | continue; |
1396 | } |
1397 | |
1398 | // For any ConstantExpr uses, we need to recursively search the users until |
1399 | // we see a function. |
1400 | while (!Stack.empty()) { |
1401 | const Constant *C = Stack.pop_back_val(); |
1402 | if (visitUsers(&GV, C)) { |
        UsedLDS.insert(&GV);
1404 | Stack.clear(); |
1405 | break; |
1406 | } |
1407 | } |
1408 | } |
1409 | |
1410 | const DataLayout &DL = Mod->getDataLayout(); |
1411 | SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes; |
  AllocatedSizes.reserve(UsedLDS.size());
1413 | |
1414 | for (const GlobalVariable *GV : UsedLDS) { |
    Align Alignment =
        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
1418 | |
1419 | // HIP uses an extern unsized array in local address space for dynamically |
1420 | // allocated shared memory. In that case, we have to disable the promotion. |
1421 | if (GV->hasExternalLinkage() && AllocSize == 0) { |
1422 | LocalMemLimit = 0; |
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
1426 | return false; |
1427 | } |
1428 | |
    AllocatedSizes.emplace_back(AllocSize, Alignment);
1430 | } |
1431 | |
1432 | // Sort to try to estimate the worst case alignment padding |
1433 | // |
1434 | // FIXME: We should really do something to fix the addresses to a more optimal |
1435 | // value instead |
  llvm::sort(AllocatedSizes, llvm::less_second());
1437 | |
1438 | // Check how much local memory is being used by global objects |
1439 | CurrentLocalMemUsage = 0; |
1440 | |
1441 | // FIXME: Try to account for padding here. The real padding and address is |
1442 | // currently determined from the inverse order of uses in the function when |
1443 | // legalizing, which could also potentially change. We try to estimate the |
1444 | // worst case here, but we probably should fix the addresses earlier. |
1445 | for (auto Alloc : AllocatedSizes) { |
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
1447 | CurrentLocalMemUsage += Alloc.first; |
1448 | } |
1449 | |
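  // Derive the LDS budget in two steps: first ask how many waves per EU the
  // flat workgroup size and the LDS already claimed by globals still permit,
  // then take the full LDS size usable at that occupancy as the limit for
  // promotion.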
  unsigned MaxOccupancy =
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
          .second;
1453 | |
1454 | // Round up to the next tier of usage. |
  unsigned MaxSizeWithWaveCount =
      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
1457 | |
1458 | // Program may already use more LDS than is usable at maximum occupancy. |
1459 | if (CurrentLocalMemUsage > MaxSizeWithWaveCount) |
1460 | return false; |
1461 | |
1462 | LocalMemLimit = MaxSizeWithWaveCount; |
1463 | |
  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");
1470 | |
1471 | return true; |
1472 | } |
1473 | |
1474 | // FIXME: Should try to pick the most likely to be profitable allocas first. |
1475 | bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, |
1476 | bool SufficientLDS) { |
1477 | LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n'); |
1478 | |
1479 | if (DisablePromoteAllocaToLDS) { |
    LLVM_DEBUG(dbgs() << "  Promote alloca to LDS is disabled\n");
1481 | return false; |
1482 | } |
1483 | |
1484 | const DataLayout &DL = Mod->getDataLayout(); |
1485 | IRBuilder<> Builder(&I); |
1486 | |
1487 | const Function &ContainingFunction = *I.getParent()->getParent(); |
1488 | CallingConv::ID CC = ContainingFunction.getCallingConv(); |
1489 | |
  // Don't promote the alloca to LDS for shader calling conventions, as the
  // work item ID intrinsics are not supported for them. Furthermore, not all
  // LDS is available for some of the stages.
1493 | switch (CC) { |
1494 | case CallingConv::AMDGPU_KERNEL: |
1495 | case CallingConv::SPIR_KERNEL: |
1496 | break; |
1497 | default: |
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
1501 | return false; |
1502 | } |
1503 | |
1504 | // Not likely to have sufficient local memory for promotion. |
1505 | if (!SufficientLDS) |
1506 | return false; |
1507 | |
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
1510 | |
  Align Alignment =
      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
1513 | |
1514 | // FIXME: This computed padding is likely wrong since it depends on inverse |
1515 | // usage order. |
1516 | // |
  // FIXME: It is also possible that if we're allowed to use all of the memory,
  // we could end up using more than the maximum due to alignment padding.
1519 | |
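  // Every work item needs its own copy of the alloca, so the extra LDS
  // footprint is the per-thread allocation size scaled by the maximum flat
  // workgroup size, laid out after the LDS already used by globals (plus
  // alignment padding).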
  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize =
      WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
1523 | NewSize += AllocSize; |
1524 | |
1525 | if (NewSize > LocalMemLimit) { |
    LLVM_DEBUG(dbgs() << "  " << AllocSize
                      << " bytes of local memory not available to promote\n");
1528 | return false; |
1529 | } |
1530 | |
1531 | CurrentLocalMemUsage = NewSize; |
1532 | |
1533 | std::vector<Value *> WorkList; |
1534 | |
  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
1537 | return false; |
1538 | } |
1539 | |
  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1541 | |
1542 | Function *F = I.getParent()->getParent(); |
1543 | |
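  // Replace the per-thread alloca with a per-workgroup LDS array holding one
  // element per work item. Roughly (illustrative IR, not emitted verbatim):
  //   %tmp = alloca [4 x i32], align 4, addrspace(5)
  // becomes
  //   @kernel.tmp = internal unnamed_addr addrspace(3)
  //       global [256 x [4 x i32]] poison, align 4
  // and each use of %tmp is rewritten to a GEP into @kernel.tmp indexed by
  // the linearized work item ID.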
  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
1547 | Twine(F->getName()) + Twine('.') + I.getName(), nullptr, |
1548 | GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); |
1549 | GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); |
1550 | GV->setAlignment(I.getAlign()); |
1551 | |
1552 | Value *TCntY, *TCntZ; |
1553 | |
  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);
1558 | |
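  // Linearize the 3D work item ID, X-major:
  //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ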
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);
1564 | |
1565 | LLVMContext &Context = Mod->getContext(); |
  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
1567 | |
  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
1571 | I.eraseFromParent(); |
1572 | |
1573 | SmallVector<IntrinsicInst *> DeferredIntrs; |
1574 | |
  PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
1576 | |
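  // Rewrite every collected pointer user: non-call users are retyped in place
  // to the LDS pointer type (fixing up any null-pointer constant operands),
  // while intrinsic calls are erased, recreated for the new address space, or
  // deferred until all operands have been rewritten.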
1577 | for (Value *V : WorkList) { |
    CallInst *Call = dyn_cast<CallInst>(V);
1579 | if (!Call) { |
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        Value *LHS = CI->getOperand(0);
        Value *RHS = CI->getOperand(1);

        Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
        if (isa<ConstantPointerNull, ConstantAggregateZero>(LHS))
          CI->setOperand(0, Constant::getNullValue(NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(RHS))
          CI->setOperand(1, Constant::getNullValue(NewTy));
1590 | |
1591 | continue; |
1592 | } |
1593 | |
1594 | // The operand's value should be corrected on its own and we don't want to |
1595 | // touch the users. |
      if (isa<AddrSpaceCastInst>(V))
1597 | continue; |
1598 | |
1599 | assert(V->getType()->isPtrOrPtrVectorTy()); |
1600 | |
      Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
      V->mutateType(NewTy);
1603 | |
1604 | // Adjust the types of any constant operands. |
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(1)))
          SI->setOperand(1, Constant::getNullValue(NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(2)))
          SI->setOperand(2, Constant::getNullValue(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull, ConstantAggregateZero>(
                  Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
1616 | } |
1617 | } |
1618 | |
1619 | continue; |
1620 | } |
1621 | |
    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
1623 | Builder.SetInsertPoint(Intr); |
1624 | switch (Intr->getIntrinsicID()) { |
1625 | case Intrinsic::lifetime_start: |
1626 | case Intrinsic::lifetime_end: |
1627 | // These intrinsics are for address space 0 only |
1628 | Intr->eraseFromParent(); |
1629 | continue; |
1630 | case Intrinsic::memcpy: |
1631 | case Intrinsic::memmove: |
      // These have two pointer operands. If the second pointer also needs to
      // be replaced, we defer processing of these intrinsics until all other
      // values have been processed.
      DeferredIntrs.push_back(Intr);
1636 | continue; |
1637 | case Intrinsic::memset: { |
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getDestAlign(),
                           MemSet->isVolatile());
1642 | Intr->eraseFromParent(); |
1643 | continue; |
1644 | } |
1645 | case Intrinsic::invariant_start: |
1646 | case Intrinsic::invariant_end: |
1647 | case Intrinsic::launder_invariant_group: |
1648 | case Intrinsic::strip_invariant_group: { |
      SmallVector<Value *> Args;
      if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
        Args.emplace_back(Intr->getArgOperand(0));
      } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
        Args.emplace_back(Intr->getArgOperand(0));
        Args.emplace_back(Intr->getArgOperand(1));
      }
      Args.emplace_back(Offset);
      Function *F = Intrinsic::getOrInsertDeclaration(
          Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
      CallInst *NewIntr =
          CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
      Intr->mutateType(NewIntr->getType());
      Intr->replaceAllUsesWith(NewIntr);
1663 | Intr->eraseFromParent(); |
1664 | continue; |
1665 | } |
1666 | case Intrinsic::objectsize: { |
      Value *Src = Intr->getOperand(0);
1668 | |
      CallInst *NewCall = Builder.CreateIntrinsic(
          Intrinsic::objectsize,
          {Intr->getType(), PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)},
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
1674 | Intr->eraseFromParent(); |
1675 | continue; |
1676 | } |
1677 | default: |
      Intr->print(errs());
1679 | llvm_unreachable("Don't know how to promote alloca intrinsic use." ); |
1680 | } |
1681 | } |
1682 | |
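  // With all pointer operands now retyped, recreate the deferred
  // memcpy/memmove calls so they are re-declared for the new address spaces,
  // carrying over the dereferenceable attributes on both pointer arguments.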
1683 | for (IntrinsicInst *Intr : DeferredIntrs) { |
1684 | Builder.SetInsertPoint(Intr); |
1685 | Intrinsic::ID ID = Intr->getIntrinsicID(); |
1686 | assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove); |
1687 | |
    MemTransferInst *MI = cast<MemTransferInst>(Intr);
    auto *B = Builder.CreateMemTransferInst(
        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1692 | |
1693 | for (unsigned I = 0; I != 2; ++I) { |
      if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
        B->addDereferenceableParamAttr(I, Bytes);
1696 | } |
1697 | } |
1698 | |
1699 | Intr->eraseFromParent(); |
1700 | } |
1701 | |
1702 | return true; |
1703 | } |
1704 | |