AMDGPUPromoteAlloca.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp]

1	//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// Eliminates allocas by either converting them into vectors or by migrating
10	// them to local address space.
11	//
12	// Two passes are exposed by this file:
13	// - "promote-alloca-to-vector", which runs early in the pipeline and only
14	// promotes to vector. Promotion to vector is almost always profitable
15	// except when the alloca is too big and the promotion would result in
16	// very high register pressure.
17	// - "promote-alloca", which does both promotion to vector and LDS and runs
18	// much later in the pipeline. This runs after SROA because promoting to
19	// LDS is of course less profitable than getting rid of the alloca or
20	// vectorizing it, thus we only want to do it when the only alternative is
21	// lowering the alloca to stack.
22	//
23	// Note that both of them exist for the old and new PMs. The new PM passes are
24	// declared in AMDGPU.h and the legacy PM ones are declared here.s
25	//
26	//===----------------------------------------------------------------------===//
27
28	#include "AMDGPU.h"
29	#include "GCNSubtarget.h"
30	#include "Utils/AMDGPUBaseInfo.h"
31	#include "llvm/ADT/STLExtras.h"
32	#include "llvm/Analysis/CaptureTracking.h"
33	#include "llvm/Analysis/InstSimplifyFolder.h"
34	#include "llvm/Analysis/InstructionSimplify.h"
35	#include "llvm/Analysis/LoopInfo.h"
36	#include "llvm/Analysis/ValueTracking.h"
37	#include "llvm/CodeGen/TargetPassConfig.h"
38	#include "llvm/IR/IRBuilder.h"
39	#include "llvm/IR/IntrinsicInst.h"
40	#include "llvm/IR/IntrinsicsAMDGPU.h"
41	#include "llvm/IR/IntrinsicsR600.h"
42	#include "llvm/IR/PatternMatch.h"
43	#include "llvm/InitializePasses.h"
44	#include "llvm/Pass.h"
45	#include "llvm/Support/MathExtras.h"
46	#include "llvm/Target/TargetMachine.h"
47	#include "llvm/Transforms/Utils/SSAUpdater.h"
48
49	#define DEBUG_TYPE "amdgpu-promote-alloca"
50
51	using namespace llvm;
52
53	namespace {
54
55	static cl::opt<bool>
56	DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
57	cl::desc ("Disable promote alloca to vector"),
58	cl::init(Val: false));
59
60	static cl::opt<bool>
61	DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
62	cl::desc ("Disable promote alloca to LDS"),
63	cl::init(Val: false));
64
65	static cl::opt<unsigned> PromoteAllocaToVectorLimit(
66	"amdgpu-promote-alloca-to-vector-limit",
67	cl::desc ("Maximum byte size to consider promote alloca to vector"),
68	cl::init(Val: `0`));
69
70	static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
71	"amdgpu-promote-alloca-to-vector-max-regs",
72	cl::desc (
73	"Maximum vector size (in 32b registers) to use when promoting alloca"),
74	cl::init(Val: `32`));
75
76	// Use up to 1/4 of available register budget for vectorization.
77	// FIXME: Increase the limit for whole function budgets? Perhaps x2?
78	static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
79	"amdgpu-promote-alloca-to-vector-vgpr-ratio",
80	cl::desc ("Ratio of VGPRs to budget for promoting alloca to vectors"),
81	cl::init(Val: `4`));
82
83	static cl::opt<unsigned>
84	LoopUserWeight("promote-alloca-vector-loop-user-weight",
85	cl::desc ("The bonus weight of users of allocas within loop "
86	"when sorting profitable allocas"),
87	cl::init(Val: `4`));
88
89	// We support vector indices of the form ((A stride) >> shift) + B*
90	// VarIndex is A, VarMul is stride, VarShift is shift and ConstIndex is B. All
91	// parts are optional.
92	struct GEPToVectorIndex {
93	Value VarIndex = nullptr; // defaults to 0*
94	ConstantInt VarMul = nullptr; // defaults to 1*
95	ConstantInt VarShift = nullptr; // defaults to 0*
96	ConstantInt ConstIndex = nullptr; // defaults to 0*
97	Value Full = nullptr*;
98	};
99
100	struct MemTransferInfo {
101	ConstantInt SrcIndex = nullptr*;
102	ConstantInt DestIndex = nullptr*;
103	};
104
105	// Analysis for planning the different strategies of alloca promotion.
106	struct AllocaAnalysis {
107	AllocaInst Alloca = nullptr*;
108	DenseSet<Value *> Pointers;
109	SmallVector<Use *> Uses;
110	unsigned Score = `0`;
111	bool HaveSelectOrPHI = false;
112	struct {
113	FixedVectorType Ty = nullptr*;
114	SmallVector<Instruction *> Worklist;
115	SmallVector<Instruction *> UsersToRemove;
116	MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
117	MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
118	} Vector;
119	struct {
120	bool Enable = false;
121	SmallVector<User *> Worklist;
122	} LDS;
123
124	explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
125	};
126
127	// Shared implementation which can do both promotion to vector and to LDS.
128	class AMDGPUPromoteAllocaImpl {
129	private:
130	const TargetMachine &TM;
131	LoopInfo &LI;
132	Module Mod = nullptr*;
133	const DataLayout DL = nullptr*;
134
135	// FIXME: This should be per-kernel.
136	uint32_t LocalMemLimit = `0`;
137	uint32_t CurrentLocalMemUsage = `0`;
138	unsigned MaxVGPRs;
139	unsigned VGPRBudgetRatio;
140	unsigned MaxVectorRegs;
141
142	bool IsAMDGCN = false;
143	bool IsAMDHSA = false;
144
145	std::pair<Value , Value > getLocalSizeYZ(IRBuilder<> &Builder);
146	Value getWorkitemID(IRBuilder<> &Builder, unsigned* N);
147
148	bool collectAllocaUses(AllocaAnalysis &AA) const;
149
150	/// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
151	/// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
152	/// Returns true if both operands are derived from the same alloca. Val should
153	/// be the same value as one of the input operands of UseInst.
154	bool binaryOpIsDerivedFromSameAlloca(Value Alloca, Value Val,
155	Instruction UseInst, int* OpIdx0,
156	int OpIdx1) const;
157
158	/// Check whether we have enough local memory for promotion.
159	bool hasSufficientLocalMem(const Function &F);
160
161	FixedVectorType getVectorTypeForAlloca(Type AllocaTy) const;
162	void analyzePromoteToVector(AllocaAnalysis &AA) const;
163	void promoteAllocaToVector(AllocaAnalysis &AA);
164	void analyzePromoteToLDS(AllocaAnalysis &AA) const;
165	bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
166	SetVector<IntrinsicInst *> &DeferredIntrs);
167	void
168	finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);
169
170	void scoreAlloca(AllocaAnalysis &AA) const;
171
172	void setFunctionLimits(const Function &F);
173
174	public:
175	AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
176
177	const Triple &TT = TM.getTargetTriple();
178	IsAMDGCN = TT.isAMDGCN();
179	IsAMDHSA = TT.getOS() == Triple::AMDHSA;
180	}
181
182	bool run(Function &F, bool PromoteToLDS);
183	};
184
185	// FIXME: This can create globals so should be a module pass.
186	class AMDGPUPromoteAlloca : public FunctionPass {
187	public:
188	static char ID;
189
190	AMDGPUPromoteAlloca() : FunctionPass (ID) {}
191
192	bool runOnFunction(Function &F) override {
193	if (skipFunction(F))
194	return false;
195	if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
196	return AMDGPUPromoteAllocaImpl (
197	TPC->getTM<TargetMachine>(),
198	getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
199	.run(F, /PromoteToLDS/ true);
200	return false;
201	}
202
203	StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
204
205	void getAnalysisUsage(AnalysisUsage &AU) const override {
206	AU.setPreservesCFG();
207	AU.addRequired<LoopInfoWrapperPass>();
208	FunctionPass::getAnalysisUsage(AU);
209	}
210	};
211
212	static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
213	const Function &F) {
214	if (!TM.getTargetTriple().isAMDGCN())
215	return `128`;
216
217	const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
218
219	unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
220	// Temporarily check both the attribute and the subtarget feature, until the
221	// latter is removed.
222	if (DynamicVGPRBlockSize == `0` && ST.isDynamicVGPREnabled())
223	DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
224
225	unsigned MaxVGPRs = ST.getMaxNumVGPRs(
226	WavesPerEU: ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
227	DynamicVGPRBlockSize);
228
229	// A non-entry function has only 32 caller preserved registers.
230	// Do not promote alloca which will force spilling unless we know the function
231	// will be inlined.
232	if (!F.hasFnAttribute(Kind: Attribute::AlwaysInline) &&
233	!AMDGPU::isEntryFunctionCC(CC: F.getCallingConv()))
234	MaxVGPRs = std::min(a: MaxVGPRs, b: `32u`);
235	return MaxVGPRs;
236	}
237
238	} // end anonymous namespace
239
240	char AMDGPUPromoteAlloca::ID = `0`;
241
242	INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
243	"AMDGPU promote alloca to vector or LDS", false, false)
244	// Move LDS uses from functions to kernels before promote alloca for accurate
245	// estimation of LDS available
246	INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
247	INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
248	INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
249	"AMDGPU promote alloca to vector or LDS", false, false)
250
251	char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
252
253	PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
254	FunctionAnalysisManager &AM) {
255	auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
256	bool Changed = AMDGPUPromoteAllocaImpl (TM, LI).run(F, /PromoteToLDS=/true);
257	if (Changed) {
258	PreservedAnalyses PA;
259	PA.preserveSet<CFGAnalyses>();
260	return PA;
261	}
262	return PreservedAnalyses::all();
263	}
264
265	PreservedAnalyses
266	AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
267	auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
268	bool Changed = AMDGPUPromoteAllocaImpl (TM, LI).run(F, /PromoteToLDS=/false);
269	if (Changed) {
270	PreservedAnalyses PA;
271	PA.preserveSet<CFGAnalyses>();
272	return PA;
273	}
274	return PreservedAnalyses::all();
275	}
276
277	FunctionPass *llvm::createAMDGPUPromoteAlloca() {
278	return new AMDGPUPromoteAlloca ();
279	}
280
281	bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
282	const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
283	LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
284	<< " " << *Inst << "\n");
285	return false;
286	};
287
288	SmallVector<Instruction *, `4`> WorkList({AA.Alloca});
289	while (!WorkList.empty()) {
290	auto *Cur = WorkList.pop_back_val();
291	if (find(Range&: AA.Pointers, Val: Cur) != AA.Pointers.end())
292	continue;
293	AA.Pointers.insert(V: Cur);
294	for (auto &U : Cur->uses()) {
295	auto *Inst = cast<Instruction>(Val: U.getUser());
296	if (isa<StoreInst>(Val: Inst)) {
297	if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
298	return RejectUser (Inst, "pointer escapes via store");
299	}
300	}
301	AA.Uses.push_back(Elt: &U);
302
303	if (isa<GetElementPtrInst>(Val: U.getUser())) {
304	WorkList.push_back(Elt: Inst);
305	} else if (auto *SI = dyn_cast<SelectInst>(Val: Inst)) {
306	// Only promote a select if we know that the other select operand is
307	// from another pointer that will also be promoted.
308	if (!binaryOpIsDerivedFromSameAlloca(Alloca: AA.Alloca, Val: Cur, UseInst: SI, OpIdx0: `1`, OpIdx1: `2`))
309	return RejectUser (Inst, "select from mixed objects");
310	WorkList.push_back(Elt: Inst);
311	AA.HaveSelectOrPHI = true;
312	} else if (auto *Phi = dyn_cast<PHINode>(Val: Inst)) {
313	// Repeat for phis.
314
315	// TODO: Handle more complex cases. We should be able to replace loops
316	// over arrays.
317	switch (Phi->getNumIncomingValues()) {
318	case `1`:
319	break;
320	case `2`:
321	if (!binaryOpIsDerivedFromSameAlloca(Alloca: AA.Alloca, Val: Cur, UseInst: Phi, OpIdx0: `0`, OpIdx1: `1`))
322	return RejectUser (Inst, "phi from mixed objects");
323	break;
324	default:
325	return RejectUser (Inst, "phi with too many operands");
326	}
327
328	WorkList.push_back(Elt: Inst);
329	AA.HaveSelectOrPHI = true;
330	}
331	}
332	}
333	return true;
334	}
335
336	void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
337	LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
338	unsigned Score = `0`;
339	// Increment score by one for each user + a bonus for users within loops.
340	for (auto *U : AA.Uses) {
341	Instruction *Inst = cast<Instruction>(Val: U->getUser());
342	if (isa<GetElementPtrInst>(Val: Inst) \|\| isa<SelectInst>(Val: Inst) \|\|
343	isa<PHINode>(Val: Inst))
344	continue;
345	unsigned UserScore =
346	`1` + (LoopUserWeight * LI.getLoopDepth(BB: Inst->getParent()));
347	LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
348	Score += UserScore;
349	}
350	LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
351	AA.Score = Score;
352	}
353
354	void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
355	// Load per function limits, overriding with global options where appropriate.
356	// R600 register tuples/aliasing are fragile with large vector promotions so
357	// apply architecture specific limit here.
358	const int R600MaxVectorRegs = `16`;
359	MaxVectorRegs = F.getFnAttributeAsParsedInteger(
360	Kind: "amdgpu-promote-alloca-to-vector-max-regs",
361	Default: IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
362	if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
363	MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
364	VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
365	Kind: "amdgpu-promote-alloca-to-vector-vgpr-ratio",
366	Default: PromoteAllocaToVectorVGPRRatio);
367	if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
368	VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
369	}
370
371	bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
372	if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
373	return false;
374
375	Mod = F.getParent();
376	DL = &Mod->getDataLayout();
377
378	bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
379	MaxVGPRs = getMaxVGPRs(LDSBytes: CurrentLocalMemUsage, TM, F);
380	setFunctionLimits(F);
381
382	unsigned VectorizationBudget =
383	(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * `8`
384	: (MaxVGPRs * `32`)) /
385	VGPRBudgetRatio;
386
387	std::vector<AllocaAnalysis> Allocas;
388	for (Instruction &I : F.getEntryBlock()) {
389	if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: &I)) {
390	// Array allocations are probably not worth handling, since an allocation
391	// of the array type is the canonical form.
392	if (!AI->isStaticAlloca() \|\| AI->isArrayAllocation())
393	continue;
394
395	LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << `'\n'`);
396
397	AllocaAnalysis AA{AI};
398	if (collectAllocaUses(AA)) {
399	analyzePromoteToVector(AA);
400	if (PromoteToLDS)
401	analyzePromoteToLDS(AA);
402	if (AA.Vector.Ty \|\| AA.LDS.Enable) {
403	scoreAlloca(AA);
404	Allocas.push_back(x: std::move(AA));
405	}
406	}
407	}
408	}
409
410	stable_sort(Range&: Allocas,
411	C: [](const auto &A, const auto &B) { return A.Score > B.Score; });
412
413	// clang-format off
414	LLVM_DEBUG(
415	dbgs() << "Sorted Worklist:\n";
416	for (const auto &AA : Allocas)
417	dbgs() << " " << *AA.Alloca << "\n";
418	);
419	// clang-format on
420
421	bool Changed = false;
422	SetVector<IntrinsicInst *> DeferredIntrs;
423	for (AllocaAnalysis &AA : Allocas) {
424	if (AA.Vector.Ty) {
425	std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(DL: *DL);
426	assert(Size); // Expected to succeed on non-array alloca.
427	const unsigned AllocaCost = Size ->getFixedValue() * `8`;
428	// First, check if we have enough budget to vectorize this alloca.
429	if (AllocaCost <= VectorizationBudget) {
430	promoteAllocaToVector(AA);
431	Changed = true;
432	assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
433	"Underflow!");
434	VectorizationBudget -= AllocaCost;
435	LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
436	<< VectorizationBudget << "\n");
437	continue;
438	} else {
439	LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
440	<< AllocaCost << ", budget:" << VectorizationBudget
441	<< "): " << *AA.Alloca << "\n");
442	}
443	}
444
445	if (AA.LDS.Enable &&
446	tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
447	Changed = true;
448	}
449	finishDeferredAllocaToLDSPromotion(DeferredIntrs);
450
451	// NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
452	// dangling pointers. If we want to reuse it past this point, the loop above
453	// would need to be updated to remove successfully promoted allocas.
454
455	return Changed;
456	}
457
458	// Checks if the instruction I is a memset user of the alloca AI that we can
459	// deal with. Currently, only non-volatile memsets that affect the whole alloca
460	// are handled.
461	static bool isSupportedMemset(MemSetInst I, AllocaInst AI,
462	const DataLayout &DL) {
463	using namespace PatternMatch;
464	// For now we only care about non-volatile memsets that affect the whole type
465	// (start at index 0 and fill the whole alloca).
466	//
467	// TODO: Now that we moved to PromoteAlloca we could handle any memsets
468	// (except maybe volatile ones?) - we just need to use shufflevector if it
469	// only affects a subset of the vector.
470	const unsigned Size = DL.getTypeStoreSize(Ty: AI->getAllocatedType());
471	return I->getOperand(i_nocapture: `0`) == AI &&
472	match(V: I->getOperand(i_nocapture: `2`), P: m_SpecificInt(V: Size)) && !I->isVolatile();
473	}
474
475	static Value calculateVectorIndex(Value Ptr, AllocaAnalysis &AA) {
476	IRBuilder<> B(Ptr->getContext());
477
478	Ptr = Ptr->stripPointerCasts();
479	if (Ptr == AA.Alloca)
480	return B.getInt32(C: `0`);
481
482	auto *GEP = cast<GetElementPtrInst>(Val: Ptr);
483	auto I = AA.Vector.GEPVectorIdx.find(Key: GEP);
484	assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
485
486	if (!I->second.Full) {
487	Value Result = nullptr*;
488	B.SetInsertPoint(GEP);
489
490	if (I->second.VarIndex) {
491	Result = I->second.VarIndex;
492	Result = B.CreateSExtOrTrunc(V: Result, DestTy: B.getInt32Ty());
493
494	if (I->second.VarMul)
495	Result = B.CreateMul(LHS: Result, RHS: I->second.VarMul);
496
497	if (I->second.VarShift)
498	Result = B.CreateAShr(LHS: Result, RHS: I->second.VarShift, Name: "", /isExact/ true);
499	}
500
501	if (I->second.ConstIndex) {
502	if (Result)
503	Result = B.CreateAdd(LHS: Result, RHS: I->second.ConstIndex);
504	else
505	Result = I->second.ConstIndex;
506	}
507
508	if (!Result)
509	Result = B.getInt32(C: `0`);
510
511	I->second.Full = Result;
512	}
513
514	return I->second.Full;
515	}
516
517	static std::optional<GEPToVectorIndex>
518	computeGEPToVectorIndex(GetElementPtrInst GEP, AllocaInst Alloca,
519	Type VecElemTy, const* DataLayout &DL) {
520	// TODO: Extracting a "multiple of X" from a GEP might be a useful generic
521	// helper.
522	LLVMContext &Ctx = GEP->getContext();
523	unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType());
524	SmallMapVector<Value *, APInt, `4`> VarOffsets;
525	APInt ConstOffset(BW, `0`);
526
527	// Walk backwards through nested GEPs to collect both constant and variable
528	// offsets, so that nested vector GEP chains can be lowered in one step.
529	//
530	// Given this IR fragment as input:
531	//
532	// %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
533	// %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
534	// %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
535	// %3 = load i32, ptr addrspace(5) %2, align 4
536	//
537	// Combine both GEP operations in a single pass, producing:
538	// BasePtr = %0
539	// ConstOffset = 4
540	// VarOffsets = { %j -> element_size(<2 x i32>) }
541	//
542	// That lets us emit a single buffer_load directly into a VGPR, without ever
543	// allocating scratch memory for the intermediate pointer.
544	Value *CurPtr = GEP;
545	while (auto *CurGEP = dyn_cast<GetElementPtrInst>(Val: CurPtr)) {
546	if (!CurGEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ConstOffset))
547	return {};
548
549	// Move to the next outer pointer.
550	CurPtr = CurGEP->getPointerOperand();
551	}
552
553	assert(CurPtr == Alloca && "GEP not based on alloca");
554
555	int64_t VecElemSize = DL.getTypeAllocSize(Ty: VecElemTy);
556	if (VarOffsets.size() > `1`)
557	return {};
558
559	// We support vector indices of the form ((VarIndex stride) >> shift) + B.*
560	// IndexQuot represents B. Check that the constant offset is a multiple
561	// of the vector element size.
562	if (ConstOffset.srem(RHS: VecElemSize) != `0`)
563	return {};
564	APInt IndexQuot = ConstOffset.sdiv(RHS: VecElemSize);
565
566	GEPToVectorIndex Result;
567
568	if (!ConstOffset.isZero())
569	Result.ConstIndex = ConstantInt::get(Context&: Ctx, V: IndexQuot.sextOrTrunc(width: BW));
570
571	// If there are no variable offsets, only a constant offset, then we're done.
572	if (VarOffsets.empty())
573	return Result;
574
575	// Scale is the stride in the (A stride) part. Check that there is only one*
576	// variable offset and extract the scale factor.
577	const auto &VarOffset = VarOffsets.front();
578	auto ScaleOpt = VarOffset.second.tryZExtValue();
579	if (!ScaleOpt \|\| *ScaleOpt == `0`)
580	return {};
581
582	uint64_t Scale = *ScaleOpt;
583	Result.VarIndex = VarOffset.first;
584	auto *OffsetType = dyn_cast<IntegerType>(Val: Result.VarIndex->getType());
585	if (!OffsetType)
586	return {};
587
588	// The vector index for the variable part is: VarIndex Scale / VecElemSize.*
589	if (Scale >= (uint64_t)VecElemSize) {
590	if (Scale % VecElemSize != `0`)
591	return {};
592
593	// Scale is a multiple of VecElemSize, so the index is just: VarIndex *
594	// (Scale / VecElemSize).
595	uint64_t VarMul = Scale / VecElemSize;
596	// Only the multiplier is needed.
597	if (VarMul != `1`)
598	Result.VarMul = ConstantInt::get(Context&: Ctx, V: APInt (BW, VarMul));
599	} else {
600	if ((uint64_t)VecElemSize % Scale != `0`)
601	return {};
602
603	// VecElemSize is a multiple of Scale, so the index is just: VarIndex /
604	// (VecElemSize / Scale).
605	uint64_t Divisor = VecElemSize / Scale;
606	// The divisor must be a power of 2 so we can use a right shift.
607	if (!isPowerOf2_64(Value: Divisor))
608	return {};
609
610	// VarIndex must be known to be divisible by that divisor.
611	KnownBits KB = computeKnownBits(V: VarOffset.first, DL);
612	if (KB.countMinTrailingZeros() < Log2_64(Value: Divisor))
613	return {};
614
615	Result.VarShift = ConstantInt::get(Context&: Ctx, V: APInt (BW, Log2_64(Value: Divisor)));
616	}
617
618	return Result;
619	}
620
621	/// Promotes a single user of the alloca to a vector form.
622	///
623	/// \param Inst Instruction to be promoted.
624	/// \param DL Module Data Layout.
625	/// \param AA Alloca Analysis.
626	/// \param VecStoreSize Size of \p VectorTy in bytes.
627	/// \param ElementSize Size of \p VectorTy element type in bytes.
628	/// \param CurVal Current value of the vector (e.g. last stored value)
629	/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
630	/// be promoted now. This happens when promoting requires \p
631	/// CurVal, but \p CurVal is nullptr.
632	/// \return the stored value if \p Inst would have written to the alloca, or
633	/// nullptr otherwise.
634	static Value promoteAllocaUserToVector(Instruction Inst, const DataLayout &DL,
635	AllocaAnalysis &AA,
636	unsigned VecStoreSize,
637	unsigned ElementSize,
638	function_ref<Value *()> GetCurVal) {
639	// Note: we use InstSimplifyFolder because it can leverage the DataLayout
640	// to do more folding, especially in the case of vector splats.
641	IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
642	InstSimplifyFolder (DL));
643	Builder.SetInsertPoint(Inst);
644
645	Type *VecEltTy = AA.Vector.Ty->getElementType();
646
647	switch (Inst->getOpcode()) {
648	case Instruction::Load: {
649	Value *CurVal = GetCurVal ();
650	Value *Index =
651	calculateVectorIndex(Ptr: cast<LoadInst>(Val: Inst)->getPointerOperand(), AA);
652
653	// We're loading the full vector.
654	Type *AccessTy = Inst->getType();
655	TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
656	if (Constant *CI = dyn_cast<Constant>(Val: Index)) {
657	if (CI->isNullValue() && AccessSize == VecStoreSize) {
658	Inst->replaceAllUsesWith(
659	V: Builder.CreateBitPreservingCastChain(DL, V: CurVal, NewTy: AccessTy));
660	return nullptr;
661	}
662	}
663
664	// Loading a subvector.
665	if (isa<FixedVectorType>(Val: AccessTy)) {
666	assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
667	const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
668	auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumLoadedElts);
669	assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
670
671	// If idx is dynamic, then sandwich load with bitcasts.
672	// ie. VectorTy SubVecTy AccessTy
673	// <64 x i8> -> <16 x i8> <8 x i16>
674	// <64 x i8> -> <4 x i128> -> i128 -> <8 x i16>
675	// Extracting subvector with dynamic index has very large expansion in
676	// the amdgpu backend. Limit to pow2.
677	FixedVectorType *VectorTy = AA.Vector.Ty;
678	TypeSize NumBits = DL.getTypeStoreSize(Ty: SubVecTy) * `8u`;
679	uint64_t LoadAlign = cast<LoadInst>(Val: Inst)->getAlign().value();
680	bool IsAlignedLoad = NumBits <= (LoadAlign * `8u`);
681	unsigned TotalNumElts = VectorTy->getNumElements();
682	bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == `0`;
683	if (!isa<ConstantInt>(Val: Index) &&
684	llvm::isPowerOf2_32(Value: SubVecTy->getNumElements()) &&
685	IsProperlyDivisible && IsAlignedLoad) {
686	IntegerType *NewElemTy = Builder.getIntNTy(N: NumBits);
687	const unsigned NewNumElts =
688	DL.getTypeStoreSize(Ty: VectorTy) * `8u` / NumBits;
689	const unsigned LShrAmt = llvm::Log2_32(Value: SubVecTy->getNumElements());
690	FixedVectorType *BitCastTy =
691	FixedVectorType::get(ElementType: NewElemTy, NumElts: NewNumElts);
692	Value *BCVal = Builder.CreateBitCast(V: CurVal, DestTy: BitCastTy);
693	Value *NewIdx = Builder.CreateLShr(
694	LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: LShrAmt));
695	Value *ExtVal = Builder.CreateExtractElement(Vec: BCVal, Idx: NewIdx);
696	Value *BCOut = Builder.CreateBitCast(V: ExtVal, DestTy: AccessTy);
697	Inst->replaceAllUsesWith(V: BCOut);
698	return nullptr;
699	}
700
701	Value *SubVec = PoisonValue::get(T: SubVecTy);
702	for (unsigned K = `0`; K < NumLoadedElts; ++K) {
703	Value *CurIdx =
704	Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
705	SubVec = Builder.CreateInsertElement(
706	Vec: SubVec, NewElt: Builder.CreateExtractElement(Vec: CurVal, Idx: CurIdx), Idx: K);
707	}
708
709	Inst->replaceAllUsesWith(
710	V: Builder.CreateBitPreservingCastChain(DL, V: SubVec, NewTy: AccessTy));
711	return nullptr;
712	}
713
714	// We're loading one element.
715	Value *ExtractElement = Builder.CreateExtractElement(Vec: CurVal, Idx: Index);
716	if (AccessTy != VecEltTy)
717	ExtractElement = Builder.CreateBitOrPointerCast(V: ExtractElement, DestTy: AccessTy);
718
719	Inst->replaceAllUsesWith(V: ExtractElement);
720	return nullptr;
721	}
722	case Instruction::Store: {
723	// For stores, it's a bit trickier and it depends on whether we're storing
724	// the full vector or not. If we're storing the full vector, we don't need
725	// to know the current value. If this is a store of a single element, we
726	// need to know the value.
727	StoreInst *SI = cast<StoreInst>(Val: Inst);
728	Value *Index = calculateVectorIndex(Ptr: SI->getPointerOperand(), AA);
729	Value *Val = SI->getValueOperand();
730
731	// We're storing the full vector, we can handle this without knowing CurVal.
732	Type *AccessTy = Val->getType();
733	TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
734	if (Constant *CI = dyn_cast<Constant>(Val: Index))
735	if (CI->isNullValue() && AccessSize == VecStoreSize)
736	return Builder.CreateBitPreservingCastChain(DL, V: Val, NewTy: AA.Vector.Ty);
737
738	// Storing a subvector.
739	if (isa<FixedVectorType>(Val: AccessTy)) {
740	assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
741	const unsigned NumWrittenElts =
742	AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
743	const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
744	auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumWrittenElts);
745	assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
746
747	Val = Builder.CreateBitPreservingCastChain(DL, V: Val, NewTy: SubVecTy);
748	Value *CurVec = GetCurVal ();
749	for (unsigned K = `0`, NumElts = std::min(a: NumWrittenElts, b: NumVecElts);
750	K < NumElts; ++K) {
751	Value *CurIdx =
752	Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
753	CurVec = Builder.CreateInsertElement(
754	Vec: CurVec, NewElt: Builder.CreateExtractElement(Vec: Val, Idx: K), Idx: CurIdx);
755	}
756	return CurVec;
757	}
758
759	if (Val->getType() != VecEltTy)
760	Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: VecEltTy);
761	return Builder.CreateInsertElement(Vec: GetCurVal (), NewElt: Val, Idx: Index);
762	}
763	case Instruction::Call: {
764	if (auto *MTI = dyn_cast<MemTransferInst>(Val: Inst)) {
765	// For memcpy, we need to know curval.
766	ConstantInt *Length = cast<ConstantInt>(Val: MTI->getLength());
767	unsigned NumCopied = Length->getZExtValue() / ElementSize;
768	MemTransferInfo *TI = &AA.Vector.TransferInfo [MTI];
769	unsigned SrcBegin = TI->SrcIndex->getZExtValue();
770	unsigned DestBegin = TI->DestIndex->getZExtValue();
771
772	SmallVector<int> Mask;
773	for (unsigned Idx = `0`; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
774	if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
775	Mask.push_back(Elt: SrcBegin < AA.Vector.Ty->getNumElements()
776	? SrcBegin++
777	: PoisonMaskElem);
778	} else {
779	Mask.push_back(Elt: Idx);
780	}
781	}
782
783	return Builder.CreateShuffleVector(V: GetCurVal (), Mask);
784	}
785
786	if (auto *MSI = dyn_cast<MemSetInst>(Val: Inst)) {
787	// For memset, we don't need to know the previous value because we
788	// currently only allow memsets that cover the whole alloca.
789	Value *Elt = MSI->getOperand(i_nocapture: `1`);
790	const unsigned BytesPerElt = DL.getTypeStoreSize(Ty: VecEltTy);
791	if (BytesPerElt > `1`) {
792	Value *EltBytes = Builder.CreateVectorSplat(NumElts: BytesPerElt, V: Elt);
793
794	// If the element type of the vector is a pointer, we need to first cast
795	// to an integer, then use a PtrCast.
796	if (VecEltTy->isPointerTy()) {
797	Type PtrInt = Builder.getIntNTy(N: BytesPerElt `8`);
798	Elt = Builder.CreateBitCast(V: EltBytes, DestTy: PtrInt);
799	Elt = Builder.CreateIntToPtr(V: Elt, DestTy: VecEltTy);
800	} else
801	Elt = Builder.CreateBitCast(V: EltBytes, DestTy: VecEltTy);
802	}
803
804	return Builder.CreateVectorSplat(EC: AA.Vector.Ty->getElementCount(), V: Elt);
805	}
806
807	if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
808	if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
809	Intr->replaceAllUsesWith(
810	V: Builder.getIntN(N: Intr->getType()->getIntegerBitWidth(),
811	C: DL.getTypeAllocSize(Ty: AA.Vector.Ty)));
812	return nullptr;
813	}
814	}
815
816	llvm_unreachable("Unsupported call when promoting alloca to vector");
817	}
818
819	default:
820	llvm_unreachable("Inconsistency in instructions promotable to vector");
821	}
822
823	llvm_unreachable("Did not return after promoting instruction!");
824	}
825
826	static bool isSupportedAccessType(FixedVectorType VecTy, Type AccessTy,
827	const DataLayout &DL) {
828	// Access as a vector type can work if the size of the access vector is a
829	// multiple of the size of the alloca's vector element type.
830	//
831	// Examples:
832	// - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
833	// - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
834	// - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
835	// - 332 is not a multiple of 64*
836	//
837	// We could handle more complicated cases, but it'd make things a lot more
838	// complicated.
839	if (isa<FixedVectorType>(Val: AccessTy)) {
840	TypeSize AccTS = DL.getTypeStoreSize(Ty: AccessTy);
841	// If the type size and the store size don't match, we would need to do more
842	// than just bitcast to translate between an extracted/insertable subvectors
843	// and the accessed value.
844	if (AccTS * `8` != DL.getTypeSizeInBits(Ty: AccessTy))
845	return false;
846	TypeSize VecTS = DL.getTypeStoreSize(Ty: VecTy->getElementType());
847	return AccTS.isKnownMultipleOf(RHS: VecTS);
848	}
849
850	return CastInst::isBitOrNoopPointerCastable(SrcTy: VecTy->getElementType(), DestTy: AccessTy,
851	DL);
852	}
853
854	/// Iterates over an instruction worklist that may contain multiple instructions
855	/// from the same basic block, but in a different order.
856	template <typename InstContainer>
857	static void forEachWorkListItem(const InstContainer &WorkList,
858	std::function<void(Instruction *)> Fn) {
859	// Bucket up uses of the alloca by the block they occur in.
860	// This is important because we have to handle multiple defs/uses in a block
861	// ourselves: SSAUpdater is purely for cross-block references.
862	DenseMap<BasicBlock , SmallDenseSet<Instruction >> UsesByBlock;
863	for (Instruction *User : WorkList)
864	UsesByBlock [User->getParent()].insert(V: User);
865
866	for (Instruction *User : WorkList) {
867	BasicBlock *BB = User->getParent();
868	auto &BlockUses = UsesByBlock [BB];
869
870	// Already processed, skip.
871	if (BlockUses.empty())
872	continue;
873
874	// Only user in the block, directly process it.
875	if (BlockUses.size() == `1`) {
876	Fn (User);
877	continue;
878	}
879
880	// Multiple users in the block, do a linear scan to see users in order.
881	for (Instruction &Inst : *BB) {
882	if (!BlockUses.contains(V: &Inst))
883	continue;
884
885	Fn (&Inst);
886	}
887
888	// Clear the block so we know it's been processed.
889	BlockUses.clear();
890	}
891	}
892
893	/// Find an insert point after an alloca, after all other allocas clustered at
894	/// the start of the block.
895	static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
896	BasicBlock::iterator I) {
897	for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(Val: *I); ++I)
898	;
899	return I;
900	}
901
902	FixedVectorType *
903	AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type AllocaTy) const* {
904	if (DisablePromoteAllocaToVector) {
905	LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
906	return nullptr;
907	}
908
909	auto *VectorTy = dyn_cast<FixedVectorType>(Val: AllocaTy);
910	if (auto *ArrayTy = dyn_cast<ArrayType>(Val: AllocaTy)) {
911	uint64_t NumElems = `1`;
912	Type *ElemTy;
913	do {
914	NumElems *= ArrayTy->getNumElements();
915	ElemTy = ArrayTy->getElementType();
916	} while ((ArrayTy = dyn_cast<ArrayType>(Val: ElemTy)));
917
918	// Check for array of vectors
919	auto *InnerVectorTy = dyn_cast<FixedVectorType>(Val: ElemTy);
920	if (InnerVectorTy) {
921	NumElems *= InnerVectorTy->getNumElements();
922	ElemTy = InnerVectorTy->getElementType();
923	}
924
925	if (VectorType::isValidElementType(ElemTy) && NumElems > `0`) {
926	unsigned ElementSize = DL->getTypeSizeInBits(Ty: ElemTy) / `8`;
927	if (ElementSize > `0`) {
928	unsigned AllocaSize = DL->getTypeStoreSize(Ty: AllocaTy);
929	// Expand vector if required to match padding of inner type,
930	// i.e. odd size subvectors.
931	// Storage size of new vector must match that of alloca for correct
932	// behaviour of byte offsets and GEP computation.
933	if (NumElems * ElementSize != AllocaSize)
934	NumElems = AllocaSize / ElementSize;
935	if (NumElems > `0` && (AllocaSize % ElementSize) == `0`)
936	VectorTy = FixedVectorType::get(ElementType: ElemTy, NumElts: NumElems);
937	}
938	}
939	}
940	if (!VectorTy) {
941	LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
942	return nullptr;
943	}
944
945	const unsigned MaxElements =
946	(MaxVectorRegs * `32`) / DL->getTypeSizeInBits(Ty: VectorTy->getElementType());
947
948	if (VectorTy->getNumElements() > MaxElements \|\|
949	VectorTy->getNumElements() < `2`) {
950	LLVM_DEBUG(dbgs() << " " << *VectorTy
951	<< " has an unsupported number of elements\n");
952	return nullptr;
953	}
954
955	Type *VecEltTy = VectorTy->getElementType();
956	unsigned ElementSizeInBits = DL->getTypeSizeInBits(Ty: VecEltTy);
957	if (ElementSizeInBits != DL->getTypeAllocSizeInBits(Ty: VecEltTy)) {
958	LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
959	"does not match the type's size\n");
960	return nullptr;
961	}
962
963	return VectorTy;
964	}
965
966	void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
967	if (AA.HaveSelectOrPHI) {
968	LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n");
969	return;
970	}
971
972	Type *AllocaTy = AA.Alloca->getAllocatedType();
973	AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
974	if (!AA.Vector.Ty)
975	return;
976
977	const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
978	LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
979	<< " " << *Inst << "\n");
980	AA.Vector.Ty = nullptr;
981	};
982
983	Type *VecEltTy = AA.Vector.Ty->getElementType();
984	unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / `8`;
985	assert(ElementSize > `0`);
986	for (auto *U : AA.Uses) {
987	Instruction *Inst = cast<Instruction>(Val: U->getUser());
988
989	if (Value *Ptr = getLoadStorePointerOperand(V: Inst)) {
990	assert(!isa<StoreInst>(Inst) \|\|
991	U->getOperandNo() == StoreInst::getPointerOperandIndex());
992
993	Type *AccessTy = getLoadStoreType(I: Inst);
994	if (AccessTy->isAggregateType())
995	return RejectUser (Inst, "unsupported load/store as aggregate");
996	assert(!AccessTy->isAggregateType() \|\| AccessTy->isArrayTy());
997
998	// Check that this is a simple access of a vector element.
999	bool IsSimple = isa<LoadInst>(Val: Inst) ? cast<LoadInst>(Val: Inst)->isSimple()
1000	: cast<StoreInst>(Val: Inst)->isSimple();
1001	if (!IsSimple)
1002	return RejectUser (Inst, "not a simple load or store");
1003
1004	Ptr = Ptr->stripPointerCasts();
1005
1006	// Alloca already accessed as vector.
1007	if (Ptr == AA.Alloca &&
1008	DL->getTypeStoreSize(Ty: AA.Alloca->getAllocatedType()) ==
1009	DL->getTypeStoreSize(Ty: AccessTy)) {
1010	AA.Vector.Worklist.push_back(Elt: Inst);
1011	continue;
1012	}
1013
1014	if (!isSupportedAccessType(VecTy: AA.Vector.Ty, AccessTy, DL: *DL))
1015	return RejectUser (Inst, "not a supported access type");
1016
1017	AA.Vector.Worklist.push_back(Elt: Inst);
1018	continue;
1019	}
1020
1021	if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
1022	// If we can't compute a vector index from this GEP, then we can't
1023	// promote this alloca to vector.
1024	auto Index = computeGEPToVectorIndex(GEP, Alloca: AA.Alloca, VecElemTy: VecEltTy, DL: *DL);
1025	if (!Index)
1026	return RejectUser (Inst, "cannot compute vector index for GEP");
1027
1028	AA.Vector.GEPVectorIdx [GEP] = std::move(Index.value());
1029	AA.Vector.UsersToRemove.push_back(Elt: Inst);
1030	continue;
1031	}
1032
1033	if (MemSetInst *MSI = dyn_cast<MemSetInst>(Val: Inst);
1034	MSI && isSupportedMemset(I: MSI, AI: AA.Alloca, DL: *DL)) {
1035	AA.Vector.Worklist.push_back(Elt: Inst);
1036	continue;
1037	}
1038
1039	if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Val: Inst)) {
1040	if (TransferInst->isVolatile())
1041	return RejectUser (Inst, "mem transfer inst is volatile");
1042
1043	ConstantInt *Len = dyn_cast<ConstantInt>(Val: TransferInst->getLength());
1044	if (!Len \|\| (Len->getZExtValue() % ElementSize))
1045	return RejectUser (Inst, "mem transfer inst length is non-constant or "
1046	"not a multiple of the vector element size");
1047
1048	auto getConstIndexIntoAlloca = [&](Value Ptr) -> ConstantInt {
1049	if (Ptr == AA.Alloca)
1050	return ConstantInt::get(Context&: Ptr->getContext(), V: APInt (`32`, `0`));
1051
1052	GetElementPtrInst *GEP = cast<GetElementPtrInst>(Val: Ptr);
1053	const auto &GEPI = AA.Vector.GEPVectorIdx.find(Key: GEP)->second;
1054	if (GEPI.VarIndex)
1055	return nullptr;
1056	if (GEPI.ConstIndex)
1057	return GEPI.ConstIndex;
1058	return ConstantInt::get(Context&: Ptr->getContext(), V: APInt (`32`, `0`));
1059	};
1060
1061	MemTransferInfo *TI =
1062	&AA.Vector.TransferInfo.try_emplace(Key: TransferInst).first->second;
1063	unsigned OpNum = U->getOperandNo();
1064	if (OpNum == `0`) {
1065	Value *Dest = TransferInst->getDest();
1066	ConstantInt *Index = getConstIndexIntoAlloca (Dest);
1067	if (!Index)
1068	return RejectUser (Inst, "could not calculate constant dest index");
1069	TI->DestIndex = Index;
1070	} else {
1071	assert(OpNum == `1`);
1072	Value *Src = TransferInst->getSource();
1073	ConstantInt *Index = getConstIndexIntoAlloca (Src);
1074	if (!Index)
1075	return RejectUser (Inst, "could not calculate constant src index");
1076	TI->SrcIndex = Index;
1077	}
1078	continue;
1079	}
1080
1081	if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
1082	if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
1083	AA.Vector.Worklist.push_back(Elt: Inst);
1084	continue;
1085	}
1086	}
1087
1088	// Ignore assume-like intrinsics and comparisons used in assumes.
1089	if (isAssumeLikeIntrinsic(I: Inst)) {
1090	if (!Inst->use_empty())
1091	return RejectUser (Inst, "assume-like intrinsic cannot have any users");
1092	AA.Vector.UsersToRemove.push_back(Elt: Inst);
1093	continue;
1094	}
1095
1096	if (isa<ICmpInst>(Val: Inst) && all_of(Range: Inst->users(), P: [](User *U) {
1097	return isAssumeLikeIntrinsic(I: cast<Instruction>(Val: U));
1098	})) {
1099	AA.Vector.UsersToRemove.push_back(Elt: Inst);
1100	continue;
1101	}
1102
1103	return RejectUser (Inst, "unhandled alloca user");
1104	}
1105
1106	// Follow-up check to ensure we've seen both sides of all transfer insts.
1107	for (const auto &Entry : AA.Vector.TransferInfo) {
1108	const MemTransferInfo &TI = Entry.second;
1109	if (!TI.SrcIndex \|\| !TI.DestIndex)
1110	return RejectUser (Entry.first,
1111	"mem transfer inst between different objects");
1112	AA.Vector.Worklist.push_back(Elt: Entry.first);
1113	}
1114	}
1115
1116	void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
1117	LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << `'\n'`);
1118	LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType()
1119	<< " -> " << *AA.Vector.Ty << `'\n'`);
1120	const unsigned VecStoreSize = DL->getTypeStoreSize(Ty: AA.Vector.Ty);
1121
1122	Type *VecEltTy = AA.Vector.Ty->getElementType();
1123	const unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / `8`;
1124
1125	// Alloca is uninitialized memory. Imitate that by making the first value
1126	// undef.
1127	SSAUpdater Updater;
1128	Updater.Initialize(Ty: AA.Vector.Ty, Name: "promotealloca");
1129
1130	BasicBlock *EntryBB = AA.Alloca->getParent();
1131	BasicBlock::iterator InitInsertPos =
1132	skipToNonAllocaInsertPt(BB&: *EntryBB, I: AA.Alloca->getIterator());
1133	IRBuilder<> Builder(&*InitInsertPos);
1134	Value *AllocaInitValue = Builder.CreateFreeze(V: PoisonValue::get(T: AA.Vector.Ty));
1135	AllocaInitValue->takeName(V: AA.Alloca);
1136
1137	Updater.AddAvailableValue(BB: AA.Alloca->getParent(), V: AllocaInitValue);
1138
1139	// First handle the initial worklist, in basic block order.
1140	//
1141	// Insert a placeholder whenever we need the vector value at the top of a
1142	// basic block.
1143	SmallVector<Instruction *> Placeholders;
1144	forEachWorkListItem(WorkList: AA.Vector.Worklist, Fn: [&](Instruction *I) {
1145	BasicBlock *BB = I->getParent();
1146	auto GetCurVal = [&]() -> Value * {
1147	if (Value *CurVal = Updater.FindValueForBlock(BB))
1148	return CurVal;
1149
1150	if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
1151	return Placeholders.back();
1152
1153	// If the current value in the basic block is not yet known, insert a
1154	// placeholder that we will replace later.
1155	IRBuilder<> Builder(I);
1156	auto *Placeholder = cast<Instruction>(Val: Builder.CreateFreeze(
1157	V: PoisonValue::get(T: AA.Vector.Ty), Name: "promotealloca.placeholder"));
1158	Placeholders.push_back(Elt: Placeholder);
1159	return Placeholders.back();
1160	};
1161
1162	Value Result = promoteAllocaUserToVector(Inst: I, DL: DL, AA, VecStoreSize,
1163	ElementSize, GetCurVal);
1164	if (Result)
1165	Updater.AddAvailableValue(BB, V: Result);
1166	});
1167
1168	// Now fixup the placeholders.
1169	SmallVector<Value *> PlaceholderToNewVal(Placeholders.size());
1170	for (auto [Index, Placeholder] : enumerate(First&: Placeholders)) {
1171	Value *NewVal = Updater.GetValueInMiddleOfBlock(BB: Placeholder->getParent());
1172	PlaceholderToNewVal [Index] = NewVal;
1173	Placeholder->replaceAllUsesWith(V: NewVal);
1174	}
1175	// Note: we cannot merge this loop with the previous one because it is
1176	// possible that the placeholder itself can be used in the SSAUpdater. The
1177	// replaceAllUsesWith doesn't replace those uses.
1178	for (auto [Index, Placeholder] : enumerate(First&: Placeholders)) {
1179	if (!Placeholder->use_empty())
1180	Placeholder->replaceAllUsesWith(V: PlaceholderToNewVal [Index]);
1181	Placeholder->eraseFromParent();
1182	}
1183
1184	// Delete all instructions.
1185	for (Instruction *I : AA.Vector.Worklist) {
1186	assert(I->use_empty());
1187	I->eraseFromParent();
1188	}
1189
1190	// Delete all the users that are known to be removeable.
1191	for (Instruction *I : reverse(C&: AA.Vector.UsersToRemove)) {
1192	I->dropDroppableUses();
1193	assert(I->use_empty());
1194	I->eraseFromParent();
1195	}
1196
1197	// Alloca should now be dead too.
1198	assert(AA.Alloca->use_empty());
1199	AA.Alloca->eraseFromParent();
1200	}
1201
1202	std::pair<Value , Value >
1203	AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1204	Function &F = *Builder.GetInsertBlock()->getParent();
1205	const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1206
1207	if (!IsAMDHSA) {
1208	CallInst *LocalSizeY =
1209	Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_y, Args: {});
1210	CallInst *LocalSizeZ =
1211	Builder.CreateIntrinsic(ID: Intrinsic::r600_read_local_size_z, Args: {});
1212
1213	ST.makeLIDRangeMetadata(I: LocalSizeY);
1214	ST.makeLIDRangeMetadata(I: LocalSizeZ);
1215
1216	return std::pair(LocalSizeY, LocalSizeZ);
1217	}
1218
1219	// We must read the size out of the dispatch pointer.
1220	assert(IsAMDGCN);
1221
1222	// We are indexing into this struct, and want to extract the workgroup_size_*
1223	// fields.
1224	//
1225	// typedef struct hsa_kernel_dispatch_packet_s {
1226	// uint16_t header;
1227	// uint16_t setup;
1228	// uint16_t workgroup_size_x ;
1229	// uint16_t workgroup_size_y;
1230	// uint16_t workgroup_size_z;
1231	// uint16_t reserved0;
1232	// uint32_t grid_size_x ;
1233	// uint32_t grid_size_y ;
1234	// uint32_t grid_size_z;
1235	//
1236	// uint32_t private_segment_size;
1237	// uint32_t group_segment_size;
1238	// uint64_t kernel_object;
1239	//
1240	// #ifdef HSA_LARGE_MODEL
1241	// void kernarg_address;*
1242	// #elif defined HSA_LITTLE_ENDIAN
1243	// void kernarg_address;*
1244	// uint32_t reserved1;
1245	// #else
1246	// uint32_t reserved1;
1247	// void kernarg_address;*
1248	// #endif
1249	// uint64_t reserved2;
1250	// hsa_signal_t completion_signal; // uint64_t wrapper
1251	// } hsa_kernel_dispatch_packet_t
1252	//
1253	CallInst *DispatchPtr =
1254	Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_dispatch_ptr, Args: {});
1255	DispatchPtr->addRetAttr(Kind: Attribute::NoAlias);
1256	DispatchPtr->addRetAttr(Kind: Attribute::NonNull);
1257	F.removeFnAttr(Kind: "amdgpu-no-dispatch-ptr");
1258
1259	// Size of the dispatch packet struct.
1260	DispatchPtr->addDereferenceableRetAttr(Bytes: `64`);
1261
1262	Type *I32Ty = Type::getInt32Ty(C&: Mod->getContext());
1263
1264	// We could do a single 64-bit load here, but it's likely that the basic
1265	// 32-bit and extract sequence is already present, and it is probably easier
1266	// to CSE this. The loads should be mergeable later anyway.
1267	Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: `1`);
1268	LoadInst *LoadXY = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPXY, Align: Align (`4`));
1269
1270	Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: DispatchPtr, Idx0: `2`);
1271	LoadInst *LoadZU = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPZU, Align: Align (`4`));
1272
1273	MDNode *MD = MDNode::get(Context&: Mod->getContext(), MDs: {});
1274	LoadXY->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
1275	LoadZU->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
1276	ST.makeLIDRangeMetadata(I: LoadZU);
1277
1278	// Extract y component. Upper half of LoadZU should be zero already.
1279	Value *Y = Builder.CreateLShr(LHS: LoadXY, RHS: `16`);
1280
1281	return std::pair(Y, LoadZU);
1282	}
1283
1284	Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1285	unsigned N) {
1286	Function *F = Builder.GetInsertBlock()->getParent();
1287	const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: *F);
1288	Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1289	StringRef AttrName;
1290
1291	switch (N) {
1292	case `0`:
1293	IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1294	: (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1295	AttrName = "amdgpu-no-workitem-id-x";
1296	break;
1297	case `1`:
1298	IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1299	: (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1300	AttrName = "amdgpu-no-workitem-id-y";
1301	break;
1302
1303	case `2`:
1304	IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1305	: (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1306	AttrName = "amdgpu-no-workitem-id-z";
1307	break;
1308	default:
1309	llvm_unreachable("invalid dimension");
1310	}
1311
1312	Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(M: Mod, id: IntrID);
1313	CallInst *CI = Builder.CreateCall(Callee: WorkitemIdFn);
1314	ST.makeLIDRangeMetadata(I: CI);
1315	F->removeFnAttr(Kind: AttrName);
1316
1317	return CI;
1318	}
1319
1320	static bool isCallPromotable(CallInst *CI) {
1321	IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI);
1322	if (!II)
1323	return false;
1324
1325	switch (II->getIntrinsicID()) {
1326	case Intrinsic::memcpy:
1327	case Intrinsic::memmove:
1328	case Intrinsic::memset:
1329	case Intrinsic::lifetime_start:
1330	case Intrinsic::lifetime_end:
1331	case Intrinsic::invariant_start:
1332	case Intrinsic::invariant_end:
1333	case Intrinsic::launder_invariant_group:
1334	case Intrinsic::strip_invariant_group:
1335	case Intrinsic::objectsize:
1336	return true;
1337	default:
1338	return false;
1339	}
1340	}
1341
1342	bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1343	Value BaseAlloca, Value Val, Instruction Inst, int* OpIdx0,
1344	int OpIdx1) const {
1345	// Figure out which operand is the one we might not be promoting.
1346	Value *OtherOp = Inst->getOperand(i: OpIdx0);
1347	if (Val == OtherOp)
1348	OtherOp = Inst->getOperand(i: OpIdx1);
1349
1350	if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: OtherOp))
1351	return true;
1352
1353	// TODO: getUnderlyingObject will not work on a vector getelementptr
1354	Value *OtherObj = getUnderlyingObject(V: OtherOp);
1355	if (!isa<AllocaInst>(Val: OtherObj))
1356	return false;
1357
1358	// TODO: We should be able to replace undefs with the right pointer type.
1359
1360	// TODO: If we know the other base object is another promotable
1361	// alloca, not necessarily this alloca, we can do this. The
1362	// important part is both must have the same address space at
1363	// the end.
1364	if (OtherObj != BaseAlloca) {
1365	LLVM_DEBUG(
1366	dbgs() << "Found a binary instruction with another alloca object\n");
1367	return false;
1368	}
1369
1370	return true;
1371	}
1372
1373	void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
1374	if (DisablePromoteAllocaToLDS) {
1375	LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1376	return;
1377	}
1378
1379	// Don't promote the alloca to LDS for shader calling conventions as the work
1380	// item ID intrinsics are not supported for these calling conventions.
1381	// Furthermore not all LDS is available for some of the stages.
1382	const Function &ContainingFunction = *AA.Alloca->getFunction();
1383	CallingConv::ID CC = ContainingFunction.getCallingConv();
1384
1385	switch (CC) {
1386	case CallingConv::AMDGPU_KERNEL:
1387	case CallingConv::SPIR_KERNEL:
1388	break;
1389	default:
1390	LLVM_DEBUG(
1391	dbgs()
1392	<< " promote alloca to LDS not supported with calling convention.\n");
1393	return;
1394	}
1395
1396	for (Use *Use : AA.Uses) {
1397	auto *User = Use->getUser();
1398
1399	if (CallInst *CI = dyn_cast<CallInst>(Val: User)) {
1400	if (!isCallPromotable(CI))
1401	return;
1402
1403	if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
1404	AA.LDS.Worklist.push_back(Elt: User);
1405	continue;
1406	}
1407
1408	Instruction *UseInst = cast<Instruction>(Val: User);
1409	if (UseInst->getOpcode() == Instruction::PtrToInt)
1410	return;
1411
1412	if (LoadInst *LI = dyn_cast<LoadInst>(Val: UseInst)) {
1413	if (LI->isVolatile())
1414	return;
1415	continue;
1416	}
1417
1418	if (StoreInst *SI = dyn_cast<StoreInst>(Val: UseInst)) {
1419	if (SI->isVolatile())
1420	return;
1421	continue;
1422	}
1423
1424	if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UseInst)) {
1425	if (RMW->isVolatile())
1426	return;
1427	continue;
1428	}
1429
1430	if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(Val: UseInst)) {
1431	if (CAS->isVolatile())
1432	return;
1433	continue;
1434	}
1435
1436	// Only promote a select if we know that the other select operand
1437	// is from another pointer that will also be promoted.
1438	if (ICmpInst *ICmp = dyn_cast<ICmpInst>(Val: UseInst)) {
1439	if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca: AA.Alloca, Val: Use->get(), Inst: ICmp, OpIdx0: `0`, OpIdx1: `1`))
1440	return;
1441
1442	// May need to rewrite constant operands.
1443	if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
1444	AA.LDS.Worklist.push_back(Elt: ICmp);
1445	continue;
1446	}
1447
1448	if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: UseInst)) {
1449	// Be conservative if an address could be computed outside the bounds of
1450	// the alloca.
1451	if (!GEP->isInBounds())
1452	return;
1453	} else if (!isa<ExtractElementInst, SelectInst, PHINode>(Val: User)) {
1454	// Do not promote vector/aggregate type instructions. It is hard to track
1455	// their users.
1456
1457	// Do not promote addrspacecast.
1458	//
1459	// TODO: If we know the address is only observed through flat pointers, we
1460	// could still promote.
1461	return;
1462	}
1463
1464	if (find(Range&: AA.LDS.Worklist, Val: User) == AA.LDS.Worklist.end())
1465	AA.LDS.Worklist.push_back(Elt: User);
1466	}
1467
1468	AA.LDS.Enable = true;
1469	}
1470
1471	bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1472
1473	FunctionType *FTy = F.getFunctionType();
1474	const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1475
1476	// If the function has any arguments in the local address space, then it's
1477	// possible these arguments require the entire local memory space, so
1478	// we cannot use local memory in the pass.
1479	for (Type *ParamTy : FTy->params()) {
1480	PointerType *PtrTy = dyn_cast<PointerType>(Val: ParamTy);
1481	if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1482	LocalMemLimit = `0`;
1483	LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1484	"local memory disabled.\n");
1485	return false;
1486	}
1487	}
1488
1489	LocalMemLimit = ST.getAddressableLocalMemorySize();
1490	if (LocalMemLimit == `0`)
1491	return false;
1492
1493	SmallVector<const Constant *, `16`> Stack;
1494	SmallPtrSet<const Constant *, `8`> VisitedConstants;
1495	SmallPtrSet<const GlobalVariable *, `8`> UsedLDS;
1496
1497	auto visitUsers = [&](const GlobalVariable GV, const* Constant Val) -> bool* {
1498	for (const User *U : Val->users()) {
1499	if (const Instruction *Use = dyn_cast<Instruction>(Val: U)) {
1500	if (Use->getFunction() == &F)
1501	return true;
1502	} else {
1503	const Constant *C = cast<Constant>(Val: U);
1504	if (VisitedConstants.insert(Ptr: C).second)
1505	Stack.push_back(Elt: C);
1506	}
1507	}
1508
1509	return false;
1510	};
1511
1512	for (GlobalVariable &GV : Mod->globals()) {
1513	if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
1514	continue;
1515
1516	if (visitUsers (&GV, &GV)) {
1517	UsedLDS.insert(Ptr: &GV);
1518	Stack.clear();
1519	continue;
1520	}
1521
1522	// For any ConstantExpr uses, we need to recursively search the users until
1523	// we see a function.
1524	while (!Stack.empty()) {
1525	const Constant *C = Stack.pop_back_val();
1526	if (visitUsers (&GV, C)) {
1527	UsedLDS.insert(Ptr: &GV);
1528	Stack.clear();
1529	break;
1530	}
1531	}
1532	}
1533
1534	const DataLayout &DL = Mod->getDataLayout();
1535	SmallVector<std::pair<uint64_t, Align>, `16`> AllocatedSizes;
1536	AllocatedSizes.reserve(N: UsedLDS.size());
1537
1538	for (const GlobalVariable *GV : UsedLDS) {
1539	Align Alignment =
1540	DL.getValueOrABITypeAlignment(Alignment: GV->getAlign(), Ty: GV->getValueType());
1541	uint64_t AllocSize = GV->getGlobalSize(DL);
1542
1543	// HIP uses an extern unsized array in local address space for dynamically
1544	// allocated shared memory. In that case, we have to disable the promotion.
1545	if (GV->hasExternalLinkage() && AllocSize == `0`) {
1546	LocalMemLimit = `0`;
1547	LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1548	"local memory. Promoting to local memory "
1549	"disabled.\n");
1550	return false;
1551	}
1552
1553	AllocatedSizes.emplace_back(Args&: AllocSize, Args&: Alignment);
1554	}
1555
1556	// Sort to try to estimate the worst case alignment padding
1557	//
1558	// FIXME: We should really do something to fix the addresses to a more optimal
1559	// value instead
1560	llvm::sort(C&: AllocatedSizes, Comp: llvm::less_second ());
1561
1562	// Check how much local memory is being used by global objects
1563	CurrentLocalMemUsage = `0`;
1564
1565	// FIXME: Try to account for padding here. The real padding and address is
1566	// currently determined from the inverse order of uses in the function when
1567	// legalizing, which could also potentially change. We try to estimate the
1568	// worst case here, but we probably should fix the addresses earlier.
1569	for (auto Alloc : AllocatedSizes) {
1570	CurrentLocalMemUsage = alignTo(Size: CurrentLocalMemUsage, A: Alloc.second);
1571	CurrentLocalMemUsage += Alloc.first;
1572	}
1573
1574	unsigned MaxOccupancy =
1575	ST.getWavesPerEU(FlatWorkGroupSizes: ST.getFlatWorkGroupSizes(F), LDSBytes: CurrentLocalMemUsage, F)
1576	.second;
1577
1578	// Round up to the next tier of usage.
1579	unsigned MaxSizeWithWaveCount =
1580	ST.getMaxLocalMemSizeWithWaveCount(WaveCount: MaxOccupancy, F);
1581
1582	// Program may already use more LDS than is usable at maximum occupancy.
1583	if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1584	return false;
1585
1586	LocalMemLimit = MaxSizeWithWaveCount;
1587
1588	LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1589	<< " bytes of LDS\n"
1590	<< " Rounding size to " << MaxSizeWithWaveCount
1591	<< " with a maximum occupancy of " << MaxOccupancy << `'\n'`
1592	<< " and " << (LocalMemLimit - CurrentLocalMemUsage)
1593	<< " available for promotion\n");
1594
1595	return true;
1596	}
1597
1598	// FIXME: Should try to pick the most likely to be profitable allocas first.
1599	bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
1600	AllocaAnalysis &AA, bool SufficientLDS,
1601	SetVector<IntrinsicInst *> &DeferredIntrs) {
1602	LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << `'\n'`);
1603
1604	// Not likely to have sufficient local memory for promotion.
1605	if (!SufficientLDS)
1606	return false;
1607
1608	const DataLayout &DL = Mod->getDataLayout();
1609	IRBuilder<> Builder(AA.Alloca);
1610
1611	const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
1612	const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: ContainingFunction);
1613	unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F: ContainingFunction).second;
1614
1615	Align Alignment = AA.Alloca->getAlign();
1616
1617	// FIXME: This computed padding is likely wrong since it depends on inverse
1618	// usage order.
1619	//
1620	// FIXME: It is also possible that if we're allowed to use all of the memory
1621	// could end up using more than the maximum due to alignment padding.
1622
1623	uint32_t NewSize = alignTo(Size: CurrentLocalMemUsage, A: Alignment);
1624	std::optional<TypeSize> ElemSize = AA.Alloca->getAllocationSize(DL);
1625	if (!ElemSize \|\| ElemSize ->isScalable())
1626	return false;
1627	TypeSize AllocSize = WorkGroupSize * *ElemSize;
1628	NewSize += AllocSize.getFixedValue();
1629
1630	if (NewSize > LocalMemLimit) {
1631	LLVM_DEBUG(dbgs() << " " << AllocSize
1632	<< " bytes of local memory not available to promote\n");
1633	return false;
1634	}
1635
1636	CurrentLocalMemUsage = NewSize;
1637
1638	LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1639
1640	Function *F = AA.Alloca->getFunction();
1641
1642	Type *GVTy = ArrayType::get(ElementType: AA.Alloca->getAllocatedType(), NumElements: WorkGroupSize);
1643	GlobalVariable GV = new* GlobalVariable (
1644	Mod, GVTy, false*, GlobalValue::InternalLinkage, PoisonValue::get(T: GVTy),
1645	Twine (F->getName()) + Twine (`'.'`) + AA.Alloca->getName(), nullptr,
1646	GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1647	GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1648	GV->setAlignment(AA.Alloca->getAlign());
1649
1650	Value TCntY, TCntZ;
1651
1652	std::tie(args&: TCntY, args&: TCntZ) = getLocalSizeYZ(Builder);
1653	Value *TIdX = getWorkitemID(Builder, N: `0`);
1654	Value *TIdY = getWorkitemID(Builder, N: `1`);
1655	Value *TIdZ = getWorkitemID(Builder, N: `2`);
1656
1657	Value Tmp0 = Builder.CreateMul(LHS: TCntY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true*);
1658	Tmp0 = Builder.CreateMul(LHS: Tmp0, RHS: TIdX);
1659	Value Tmp1 = Builder.CreateMul(LHS: TIdY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true*);
1660	Value *TID = Builder.CreateAdd(LHS: Tmp0, RHS: Tmp1);
1661	TID = Builder.CreateAdd(LHS: TID, RHS: TIdZ);
1662
1663	LLVMContext &Context = Mod->getContext();
1664	Value *Indices[] = {Constant::getNullValue(Ty: Type::getInt32Ty(C&: Context)), TID};
1665
1666	Value *Offset = Builder.CreateInBoundsGEP(Ty: GVTy, Ptr: GV, IdxList: Indices);
1667	AA.Alloca->mutateType(Ty: Offset->getType());
1668	AA.Alloca->replaceAllUsesWith(V: Offset);
1669	AA.Alloca->eraseFromParent();
1670
1671	PointerType *NewPtrTy = PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
1672
1673	for (Value *V : AA.LDS.Worklist) {
1674	CallInst *Call = dyn_cast<CallInst>(Val: V);
1675	if (!Call) {
1676	if (ICmpInst *CI = dyn_cast<ICmpInst>(Val: V)) {
1677	Value *LHS = CI->getOperand(i_nocapture: `0`);
1678	Value *RHS = CI->getOperand(i_nocapture: `1`);
1679
1680	Type *NewTy = LHS->getType()->getWithNewType(EltTy: NewPtrTy);
1681	if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: LHS))
1682	CI->setOperand(i_nocapture: `0`, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1683
1684	if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: RHS))
1685	CI->setOperand(i_nocapture: `1`, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1686
1687	continue;
1688	}
1689
1690	// The operand's value should be corrected on its own and we don't want to
1691	// touch the users.
1692	if (isa<AddrSpaceCastInst>(Val: V))
1693	continue;
1694
1695	assert(V->getType()->isPtrOrPtrVectorTy());
1696
1697	Type *NewTy = V->getType()->getWithNewType(EltTy: NewPtrTy);
1698	V->mutateType(Ty: NewTy);
1699
1700	// Adjust the types of any constant operands.
1701	if (SelectInst *SI = dyn_cast<SelectInst>(Val: V)) {
1702	if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: SI->getOperand(i_nocapture: `1`)))
1703	SI->setOperand(i_nocapture: `1`, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1704
1705	if (isa<ConstantPointerNull, ConstantAggregateZero>(Val: SI->getOperand(i_nocapture: `2`)))
1706	SI->setOperand(i_nocapture: `2`, Val_nocapture: Constant::getNullValue(Ty: NewTy));
1707	} else if (PHINode *Phi = dyn_cast<PHINode>(Val: V)) {
1708	for (unsigned I = `0`, E = Phi->getNumIncomingValues(); I != E; ++I) {
1709	if (isa<ConstantPointerNull, ConstantAggregateZero>(
1710	Val: Phi->getIncomingValue(i: I)))
1711	Phi->setIncomingValue(i: I, V: Constant::getNullValue(Ty: NewTy));
1712	}
1713	}
1714
1715	continue;
1716	}
1717
1718	IntrinsicInst *Intr = cast<IntrinsicInst>(Val: Call);
1719	Builder.SetInsertPoint(Intr);
1720	switch (Intr->getIntrinsicID()) {
1721	case Intrinsic::lifetime_start:
1722	case Intrinsic::lifetime_end:
1723	// These intrinsics are for address space 0 only
1724	Intr->eraseFromParent();
1725	continue;
1726	case Intrinsic::memcpy:
1727	case Intrinsic::memmove:
1728	// These have 2 pointer operands. In case if second pointer also needs
1729	// to be replaced we defer processing of these intrinsics until all
1730	// other values are processed.
1731	DeferredIntrs.insert(X: Intr);
1732	continue;
1733	case Intrinsic::memset: {
1734	MemSetInst *MemSet = cast<MemSetInst>(Val: Intr);
1735	Builder.CreateMemSet(Ptr: MemSet->getRawDest(), Val: MemSet->getValue(),
1736	Size: MemSet->getLength(), Align: MemSet->getDestAlign(),
1737	isVolatile: MemSet->isVolatile());
1738	Intr->eraseFromParent();
1739	continue;
1740	}
1741	case Intrinsic::invariant_start:
1742	case Intrinsic::invariant_end:
1743	case Intrinsic::launder_invariant_group:
1744	case Intrinsic::strip_invariant_group: {
1745	SmallVector<Value *> Args;
1746	if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1747	Args.emplace_back(Args: Intr->getArgOperand(i: `0`));
1748	} else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1749	Args.emplace_back(Args: Intr->getArgOperand(i: `0`));
1750	Args.emplace_back(Args: Intr->getArgOperand(i: `1`));
1751	}
1752	Args.emplace_back(Args&: Offset);
1753	Function *F = Intrinsic::getOrInsertDeclaration(
1754	M: Intr->getModule(), id: Intr->getIntrinsicID(), Tys: Offset->getType());
1755	CallInst *NewIntr =
1756	CallInst::Create(Func: F, Args, NameStr: Intr->getName(), InsertBefore: Intr->getIterator());
1757	Intr->mutateType(Ty: NewIntr->getType());
1758	Intr->replaceAllUsesWith(V: NewIntr);
1759	Intr->eraseFromParent();
1760	continue;
1761	}
1762	case Intrinsic::objectsize: {
1763	Value *Src = Intr->getOperand(i_nocapture: `0`);
1764
1765	CallInst *NewCall = Builder.CreateIntrinsic(
1766	ID: Intrinsic::objectsize,
1767	Types: {Intr->getType(), PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS)},
1768	Args: {Src, Intr->getOperand(i_nocapture: `1`), Intr->getOperand(i_nocapture: `2`), Intr->getOperand(i_nocapture: `3`)});
1769	Intr->replaceAllUsesWith(V: NewCall);
1770	Intr->eraseFromParent();
1771	continue;
1772	}
1773	default:
1774	Intr->print(O&: errs());
1775	llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1776	}
1777	}
1778
1779	return true;
1780	}
1781
1782	void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1783	SetVector<IntrinsicInst *> &DeferredIntrs) {
1784
1785	for (IntrinsicInst *Intr : DeferredIntrs) {
1786	IRBuilder<> Builder(Intr);
1787	Builder.SetInsertPoint(Intr);
1788	Intrinsic::ID ID = Intr->getIntrinsicID();
1789	assert(ID == Intrinsic::memcpy \|\| ID == Intrinsic::memmove);
1790
1791	MemTransferInst *MI = cast<MemTransferInst>(Val: Intr);
1792	auto *B = Builder.CreateMemTransferInst(
1793	IntrID: ID, Dst: MI->getRawDest(), DstAlign: MI->getDestAlign(), Src: MI->getRawSource(),
1794	SrcAlign: MI->getSourceAlign(), Size: MI->getLength(), isVolatile: MI->isVolatile());
1795
1796	for (unsigned I = `0`; I != `2`; ++I) {
1797	if (uint64_t Bytes = Intr->getParamDereferenceableBytes(i: I)) {
1798	B->addDereferenceableParamAttr(i: I, Bytes);
1799	}
1800	}
1801
1802	Intr->eraseFromParent();
1803	}
1804	}
1805

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp