AMDGPULowerExecSync.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp]

1	//===----------------------------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// Lower LDS global variables with target extension type "amdgpu.named.barrier"
10	// that require specialized address assignment. It assigns a unique
11	// barrier identifier to each named-barrier LDS variable and encodes
12	// this identifier within the !absolute_symbol metadata of that global.
13	// This encoding ensures that subsequent LDS lowering passes can process these
14	// barriers correctly without conflicts.
15	//
16	//===----------------------------------------------------------------------===//
17
#include "AMDGPU.h"
#include "AMDGPUMemoryUtils.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"

#include <algorithm>
32
33	#define DEBUG_TYPE "amdgpu-lower-exec-sync"
34
35	using namespace llvm;
36	using namespace AMDGPU;
37
38	namespace {
39
40	// Write the specified address into metadata where it can be retrieved by
41	// the assembler. Format is a half open range, [Address Address+1)
42	static void recordLDSAbsoluteAddress(Module M, GlobalVariable GV,
43	uint32_t Address) {
44	LLVMContext &Ctx = M->getContext();
45	auto *IntTy = M->getDataLayout().getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
46	auto *MinC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address));
47	auto *MaxC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address + `1`));
48	GV->setMetadata(KindID: LLVMContext::MD_absolute_symbol,
49	Node: MDNode::get(Context&: Ctx, MDs: {MinC, MaxC}));
50	}
51
52	/// Get next available ID for sync object. The ID allocation is tracked in \p
53	/// MaxNumGroup groups by \p NextAvailableIDTracker. Each call of the function
54	/// will ask for \p IDCnt against all the \p Kernels, it will return the
55	/// maximum of the available ones and update the ID tracker.
56	template <typename T>
57	unsigned allocateExecSyncID(T &NextAvailableIDTracker,
58	ArrayRef<Function > Kernels, unsigned* GroupID,
59	unsigned MaxNumGroup, unsigned IDCnt) {
60	constexpr unsigned InitialVal = `1`;
61	unsigned NextID = InitialVal;
62	for (Function *F : Kernels) {
63	const SmallVectorImpl<unsigned> &NextAvailableID =
64	NextAvailableIDTracker.lookup(F);
65	unsigned ID = InitialVal;
66	if (!NextAvailableID.empty())
67	ID = NextAvailableID [GroupID];
68
69	if (ID > NextID)
70	NextID = ID;
71	}
72
73	// Bump the next available id for the kernels.
74	for (Function *F : Kernels) {
75	auto Inserted = NextAvailableIDTracker.try_emplace(F);
76	// Initialize on first insertion.
77	if (Inserted.second)
78	Inserted.first->second.assign(MaxNumGroup, InitialVal);
79	// Update the available ID.
80	Inserted.first->second[GroupID] = NextID + IDCnt;
81	}
82	return NextID;
83	}
84
85	// Main utility function for special LDS variables lowering.
86	static bool lowerExecSyncGlobalVariables(Module &M, GVUsesInfoTy &GVUsesInfo) {
87	bool Changed = false;
88	const DataLayout &DL = M.getDataLayout();
89
90	constexpr unsigned NumBarScopes = `1`;
91	MapVector<GlobalVariable , SmallVector<Function >> AllocationQ;
92	DenseMap<Function , SmallVector<unsigned*, NumBarScopes>> KernelBarrierIDs;
93
94	for (auto &[F, GVs] : GVUsesInfo.IndirectAccess) {
95	for (auto *GV : GVs) {
96	if (!isNamedBarrier(GV: *GV) \|\| GV->isAbsoluteSymbolRef())
97	continue;
98	auto Iter = AllocationQ.find(Key: GV);
99	if (Iter == AllocationQ.end())
100	AllocationQ.insert(KV: {GV, {F}});
101	else
102	Iter->second.push_back(Elt: F);
103	}
104	}
105
106	for (auto &[F, GVs] : GVUsesInfo.DirectAccess) {
107	for (auto *GV : GVs) {
108	if (!isNamedBarrier(GV: *GV) \|\| GV->isAbsoluteSymbolRef())
109	continue;
110	auto Iter = AllocationQ.find(Key: GV);
111	if (Iter == AllocationQ.end())
112	AllocationQ.insert(KV: {GV, {F}});
113	else
114	Iter->second.push_back(Elt: F);
115	}
116	}
117
118	sort(C&: AllocationQ, Comp: [](std::pair<GlobalVariable , SmallVector<Function >> A,
119	std::pair<GlobalVariable , SmallVector<Function >> B) {
120	// First order by number of kernels that access the GlobalVariable.
121	if (A.second.size() != B.second.size())
122	return A.second.size() > B.second.size();
123
124	// Then order by their names so we always get a deterministic order.
125	return A.first->getName() < B.first->getName();
126	});
127
128	for (auto &[GV, Kernels] : AllocationQ) {
129	unsigned Offset;
130	if (TargetExtType ExtTy = isNamedBarrier(GV: GV)) {
131	unsigned BarrierScope = ExtTy->getIntParameter(i: `0`);
132	unsigned BarCnt = GV->getGlobalSize(DL) / `16`;
133
134	unsigned BarID = allocateExecSyncID(NextAvailableIDTracker&: KernelBarrierIDs, Kernels,
135	GroupID: BarrierScope, MaxNumGroup: NumBarScopes, IDCnt: BarCnt);
136
137	LLVM_DEBUG(GV->printAsOperand(dbgs(), false);
138	dbgs() << " was assigned barrier id: " << BarID
139	<< " id-count: " << BarCnt << "\n");
140	// 4 bits for alignment, 5 bits for the barrier num,
141	// 3 bits for the barrier scope
142	Offset = `0x802000u` \| BarrierScope << `9` \| BarID << `4`;
143	} else {
144	llvm_unreachable("Unhandled special variable type.");
145	}
146
147	recordLDSAbsoluteAddress(M: &M, GV, Address: Offset);
148	}
149
150	// Also erase those special LDS variables from indirect_access.
151	for (auto &K : GVUsesInfo.IndirectAccess) {
152	assert(isKernel(*K.first));
153	K.second.remove_if(Pred: [](GlobalVariable GV) { return* isNamedBarrier(GV: *GV); });
154	}
155	return Changed;
156	}
157
158	static bool hasBarrierToLower(const GVUsesInfoTy &GVUsesInfo) {
159	for (auto &Map : {GVUsesInfo.DirectAccess, GVUsesInfo.IndirectAccess}) {
160	for (auto &[Fn, GVs] : Map) {
161	for (auto &GV : GVs) {
162	if (AMDGPU::isNamedBarrier(GV: *GV))
163	return true;
164	}
165	}
166	}
167	return false;
168	}
169
170	// With object linking, barrier ID assignment is deferred to the linker.
171	// Externalize named barrier globals and emit self-contained metadata so the
172	// AsmPrinter can generate the callgraph entries the linker needs.
173	static bool handleNamedBarriersForObjectLinking(Module &M) {
174	DenseMap<GlobalVariable , DenseSet<Function >> BarrierToFuncs;
175	for (GlobalVariable &GV : M.globals()) {
176	if (!isNamedBarrier(GV) \|\| GV.use_empty())
177	continue;
178	for (User *U : GV.users()) {
179	if (auto *I = dyn_cast<Instruction>(Val: U))
180	BarrierToFuncs [&GV].insert(V: I->getFunction());
181	}
182	}
183	if (BarrierToFuncs.empty())
184	return false;
185
186	LLVMContext &Ctx = M.getContext();
187	NamedMDNode *BarMD = M.getOrInsertNamedMetadata(Name: "amdgpu.named_barrier.uses");
188
189	std::string ModuleId;
190	ModuleId = getUniqueModuleId(M: &M);
191	assert(!ModuleId.empty() &&
192	"modules with named barriers should have a unique ID");
193	for (auto &[V, Funcs] : BarrierToFuncs) {
194	if (V->hasLocalLinkage())
195	V->setName("__amdgpu_named_barrier." + V->getName() + ModuleId);
196	else if (!V->getName().starts_with(Prefix: "__amdgpu_named_barrier"))
197	V->setName("__amdgpu_named_barrier." + V->getName());
198	V->setInitializer(nullptr);
199	V->setLinkage(GlobalValue::ExternalLinkage);
200
201	SmallVector<Metadata *, `4`> Ops;
202	Ops.push_back(Elt: ValueAsMetadata::get(V));
203	for (Function *F : Funcs)
204	Ops.push_back(Elt: ValueAsMetadata::get(V: F));
205	BarMD->addOperand(M: MDNode::get(Context&: Ctx, MDs: Ops));
206	}
207	return true;
208	}
209
210	static bool runLowerExecSyncGlobals(Module &M) {
211	if (AMDGPUTargetMachine::EnableObjectLinking)
212	return handleNamedBarriersForObjectLinking(M);
213
214	CallGraph CG = CallGraph (M);
215	bool Changed = false;
216	Changed \|=
217	eliminateGVConstantExprUsesFromAllInstructions(M, Filter: isLDSVariableToLower);
218
219	// For each kernel, what variables does it access directly or through
220	// callees
221	GVUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDSForLowering(CG, M);
222
223	if (hasBarrierToLower(GVUsesInfo: LDSUsesInfo)) {
224	// Special LDS variables need special address assignment
225	Changed \|= lowerExecSyncGlobalVariables(M, GVUsesInfo&: LDSUsesInfo);
226	}
227
228	return Changed;
229	}
230
231	class AMDGPULowerExecSyncLegacy : public ModulePass {
232	public:
233	static char ID;
234	AMDGPULowerExecSyncLegacy() : ModulePass (ID) {}
235	bool runOnModule(Module &M) override;
236	};
237
238	} // namespace
239
240	char AMDGPULowerExecSyncLegacy::ID = `0`;
241	char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID;
242
243	INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
244	"AMDGPU lowering of execution synchronization", false,
245	false)
246	INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
247	INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
248	"AMDGPU lowering of execution synchronization", false,
249	false)
250
251	bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) {
252	return runLowerExecSyncGlobals(M);
253	}
254
255	ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() {
256	return new AMDGPULowerExecSyncLegacy ();
257	}
258
259	PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M,
260	ModuleAnalysisManager &AM) {
261	return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none()
262	: PreservedAnalyses::all();
263	}
264

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp