//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
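///
/// For example (illustrative), a kernel carrying the metadata
/// !reqd_work_group_size !{i32 64, i32 1, i32 1} loads workgroup_size_x from
/// the dispatch packet (or from the hidden kernel arguments on code object
/// v5); this pass can replace that load with the constant 64 so that the
/// surrounding get_local_size computation folds away.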
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
enum DispatchPackedOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

// Field offsets to implicit kernel argument pointer.
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  StringRef Name = Intrinsic::getName(IntrinsicId);
  return M.getFunction(Name);
}

} // end anonymous namespace

static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, cast to the appropriate type and
  // loaded.
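  //
  // For illustration only (assuming a pre-v5 dispatch-packet base; the value
  // names are hypothetical), a typical access chain recognized below looks
  // like:
  //
  //   %dp  = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
  //   %gep = getelementptr i8, ptr addrspace(4) %dp, i64 4
  //   %ld  = load i16, ptr addrspace(4) %gep   ; workgroup_size_x
  //
  // i.e. a constant-offset GEP from the base intrinsic call followed by a
  // simple load of the field.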
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4)
          BlockCounts[0] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4)
          BlockCounts[1] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4)
          BlockCounts[2] = Load;
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under v5, __ockl_get_local_size returns the value computed by the
    // expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, and thus
    // hidden_group_size is returned for __ockl_get_local_size.
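    //
    // Illustrative IR for the x dimension (value names are hypothetical):
    //
    //   %iap = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
    //   %id  = call i32 @llvm.amdgcn.workgroup.id.x()
    //   %bc  = load i32, ptr addrspace(4) %iap        ; hidden_block_count_x
    //   %in  = icmp ult i32 %id, %bc
    //   %sz  = select i1 %in, i16 %group_size_x, i16 %remainder_x
    //
    // The icmp's uses are replaced with true and the remainder load's uses
    // with 0, after which later passes can simplify the select down to the
    // group size.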
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        ICmpInst::Predicate Pred;
        if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
          if (Pred != ICmpInst::ICMP_ULT)
            continue;
          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in the
    // library implementation of get_local_size, so the entire function can be
    // constant folded with a known group size.
    //
    // uint r = grid_size - group_id * group_size;
    // get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // the grid_size is required to be a multiple of group_size. In this case:
    //
    // grid_size - (group_id * group_size) < group_size
    //   ->
    // grid_size < group_size + (group_id * group_size)
    //
    // (grid_size / group_size) < 1 + group_id
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
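    //
    // Illustrative IR for one dimension (value names are hypothetical; the
    // matcher also accepts the equivalent icmp/select form of the umin):
    //
    //   %zext = zext i16 %group_size_x to i32
    //   %mul  = mul i32 %group_id_x, %zext
    //   %sub  = sub i32 %grid_size_x, %mul
    //   %r    = call i32 @llvm.umin.i32(i32 %sub, i32 %zext)
    //
    // %r's uses are replaced with %zext (or with the known constant size when
    // reqd_work_group_size is present).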
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // If reqd_work_group_size is set, we can replace work group size with it.
  if (!HasReqdWorkGroupSize)
    return MadeChange;

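  // For example (illustrative), with metadata of the form
  //   !reqd_work_group_size !{i32 64, i32 1, i32 1}
  // each group-size load found above has its uses replaced by the constant 64
  // (for the x dimension) below.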
  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. Seems to not get
// TargetPassConfig for subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }

  return PreservedAnalyses::all();
}