//===-- AMDGPULowerKernelAttributes.cpp------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
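///
/// For example (an illustrative case, not the only supported pattern), a
/// kernel carrying !reqd_work_group_size !{i32 64, i32 1, i32 1} can have its
/// work-group-size loads folded to the constants 64, 1 and 1.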
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
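// The workgroup_size_{x,y,z} fields are 16-bit and the grid_size_{x,y,z}
// fields are 32-bit; the LoadSize checks in processUse rely on these widths.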
enum DispatchPackedOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

// Field offsets to implicit kernel argument pointer.
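// The hidden block counts are 32-bit fields; the hidden group sizes and
// remainders are 16-bit fields, as the 2-byte spacing of their offsets shows.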
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override { return "AMDGPU Kernel Attributes"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
}

} // end anonymous namespace

static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
                                            uint32_t MaxNumGroups) {
  if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
    return;

  if (!Load->getType()->isIntegerTy(32))
    return;

  // TODO: If there is existing range metadata, preserve it if it is stricter.
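  // The attached metadata describes the half-open interval
  // [1, MaxNumGroups + 1), e.g. (illustrative IR) !range !{i32 1, i32 17}
  // for MaxNumGroups == 16.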
  MDBuilder MDB(Load->getContext());
  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
  Load->setMetadata(LLVMContext::MD_range, Range);
}

static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getFunction();

  auto *MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  SmallVector<unsigned> MaxNumWorkgroups =
      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
                                     /*Size=*/3, /*DefaultVal=*/0);

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
      !Intrinsic::getDeclarationIfExists(CI->getModule(),
                                         Intrinsic::amdgcn_dispatch_ptr) &&
      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, cast to the appropriate type and
  // loaded.
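  // A typical chain for the implicit-argument case looks like this
  // (illustrative IR; the GEP may also be folded away, or a bitcast may
  // appear on older IR):
  //   %gep  = getelementptr i8, ptr %implicitarg.ptr, i64 12
  //   %size = load i16, ptr %gep    ; HIDDEN_GROUP_SIZE_X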
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4) {
          BlockCounts[0] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4) {
          BlockCounts[1] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4) {
          BlockCounts[2] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
        }
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under v5 __ockl_get_local_size returns the value computed by the
    // expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size :
    //   hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, and thus
    // hidden_group_size is returned for __ockl_get_local_size.
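    // Concretely (illustrative IR), a compare such as
    //   %cmp = icmp ult i32 %workgroup.id.x, %hidden.block.count.x
    // is replaced by 'true', so the select picks the hidden group size.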
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
                                       m_Specific(BlockCount)))) {
          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(
          Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in
    // the library implementation of get_local_size, so the entire function can
    // be constant folded with a known group size.
    //
    //   uint r = grid_size - group_id * group_size;
    //   get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // the grid_size is required to be a multiple of group_size. In this case:
    //
    //   grid_size - (group_id * group_size) < group_size
    //   ->
    //   grid_size < group_size + (group_id * group_size)
    //
    //   (grid_size / group_size) < 1 + group_id
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
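    //
    // The matched IR is, schematically (illustrative value names):
    //   %r   = sub %grid.size, (mul %group.id, %zext.group.size)
    //   %lsz = umin %r, %zext.group.size
    // which folds to the (zero-extended) group size, or to the constant from
    // reqd_work_group_size when that metadata is present.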
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize),
                                       m_Mul(GroupIDIntrin,
                                             m_Specific(ZextGroupSize))),
                                 m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // Upgrade the old method of calculating the block size using the grid size.
  // We pattern match any case where the implicit-argument group size is the
  // divisor of a dispatch packet grid-size read of the same dimension.
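  // Schematically (illustrative names), a computation such as
  //   %n = udiv i32 %grid.size.x, %zext.group.size.x
  // is rewritten to a zero-extended load of the hidden block count instead.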
  if (IsV5OrAbove) {
    for (int I = 0; I < 3; I++) {
      Value *GroupSize = GroupSizes[I];
      if (!GroupSize || !GroupSize->getType()->isIntegerTy(16))
        continue;

      for (User *U : GroupSize->users()) {
        Instruction *Inst = cast<Instruction>(U);
        if (isa<ZExtInst>(Inst) && !Inst->use_empty())
          Inst = cast<Instruction>(*Inst->user_begin());

        using namespace llvm::PatternMatch;
        if (!match(
                Inst,
                m_UDiv(m_ZExtOrSelf(m_Load(m_GEP(
                           m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
                           m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))),
                       m_Value())))
          continue;

        IRBuilder<> Builder(Inst);

        Value *GEP = Builder.CreateInBoundsGEP(
            Builder.getInt8Ty(), CI,
            {ConstantInt::get(Type::getInt64Ty(CI->getContext()),
                              HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))});
        Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
        BlockCount->setMetadata(LLVMContext::MD_invariant_load,
                                MDNode::get(CI->getContext(), {}));
        BlockCount->setMetadata(LLVMContext::MD_noundef,
                                MDNode::get(CI->getContext(), {}));

        Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType());
        Inst->replaceAllUsesWith(BlockCountExt);
        Inst->eraseFromParent();
        MadeChange = true;
      }
    }
  }

  // If reqd_work_group_size is set, we can replace work group size with it.
  if (!HasReqdWorkGroupSize)
    return MadeChange;

  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. We don't seem to get a
// TargetPassConfig for the subtarget at this point.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  bool Changed = false;
  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        Changed |= processUse(CI, IsV5OrAbove);
    }
  }

  return !Changed ? PreservedAnalyses::all()
                  : PreservedAnalyses::none().preserveSet<CFGAnalyses>();
}