//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
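///
/// A minimal illustrative sketch (not taken from a real test case): given a
/// kernel carrying !reqd_work_group_size !{i32 64, i32 1, i32 1} and a pre-V5
/// load of the workgroup_size_x field of the dispatch packet,
///
///   %dp = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
///   %gep = getelementptr i8, ptr addrspace(4) %dp, i64 4
///   %size.x = load i16, ptr addrspace(4) %gep, align 4
///
/// all uses of %size.x are replaced with the constant i16 64.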
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include <limits>

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
enum DispatchPacketOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

// Field offsets from the implicit kernel argument pointer.
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

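// Return the declaration of the intrinsic the lowered loads are based on: the
// implicit kernel argument pointer for code object V5 and above, the dispatch
// packet pointer otherwise. Returns null if the module does not reference the
// intrinsic.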
Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
}

} // end anonymous namespace

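// Attach !range metadata [1, MaxNumGroups] to a 32-bit load of a hidden
// block-count field, using the bound from "amdgpu-max-num-workgroups". A bound
// of 0 (unset) or UINT32_MAX conveys no information and is ignored.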
static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
                                            uint32_t MaxNumGroups) {
  if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
    return;

  if (!Load->getType()->isIntegerTy(32))
    return;

  // TODO: If there is existing range metadata, preserve it if it is stricter.
  MDBuilder MDB(Load->getContext());
  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
  Load->setMetadata(LLVMContext::MD_range, Range);
}

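// Fold loads reachable from one call to the implicitarg/dispatch pointer
// intrinsic using what the kernel's attributes guarantee: reqd_work_group_size
// constant-folds group-size loads, uniform-work-group-size simplifies the
// partial-workgroup handling in get_local_size, and amdgpu-max-num-workgroups
// adds !range metadata to block-count loads. Returns true if anything changed.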
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto *MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  SmallVector<unsigned> MaxNumWorkgroups =
      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
                                     /*Size=*/3, /*DefaultVal=*/0);

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, casted to the appropriate type and
  // loaded.
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4) {
          BlockCounts[0] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4) {
          BlockCounts[1] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4) {
          BlockCounts[2] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
        }
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under V5, __ockl_get_local_size returns the value computed by the
    // expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, and thus
    // __ockl_get_local_size returns hidden_group_size.
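    //
    // A small illustrative example (not from a real dispatch): with a uniform
    // launch where hidden_block_count.x is 4, workgroup_id_x only takes the
    // values 0..3, so workgroup_id_x < hidden_block_count.x always holds and
    // the select always produces hidden_group_size; hidden_remainder is dead
    // and is folded to 0 below.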
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
                                       m_Specific(BlockCount)))) {
          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(
          Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in
    // the library implementation of get_local_size, so the entire function can
    // be constant folded with a known group size.
    //
    //   uint r = grid_size - group_id * group_size;
    //   get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // the grid_size is required to be a multiple of group_size. In this case:
    //
    //   grid_size - (group_id * group_size) < group_size
    //     ->
    //   grid_size < group_size + (group_id * group_size)
    //     ->
    //   (grid_size / group_size) < 1 + group_id
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
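    //
    // A small illustrative example (not from a real dispatch): grid_size = 256
    // and group_size = 64 give group_id in [0, 3], so
    // r = 256 - 64 * group_id is one of {256, 192, 128, 64} and is never less
    // than 64; the select therefore always yields group_size.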
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin,
                                       m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // If reqd_work_group_size is set, we can replace the work-group size loads
  // with the known constant values.
  if (!HasReqdWorkGroupSize)
    return MadeChange;

  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. We do not seem to get
// TargetPassConfig for the subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }

  return PreservedAnalyses::all();
}