//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Lower LDS global variables with target extension type "amdgpu.named.barrier"
// that require specialized address assignment. This pass assigns a unique
// barrier identifier to each named-barrier LDS variable and encodes
// this identifier within the !absolute_symbol metadata of that global.
// This encoding ensures that subsequent LDS lowering passes can process these
// barriers correctly without conflicts.
//
//===----------------------------------------------------------------------===//
17
18#include "AMDGPU.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
21#include "llvm/ADT/DenseMap.h"
22#include "llvm/Analysis/CallGraph.h"
23#include "llvm/CodeGen/TargetPassConfig.h"
24#include "llvm/IR/Constants.h"
25#include "llvm/IR/Instructions.h"
26#include "llvm/IR/ReplaceConstant.h"
27#include "llvm/InitializePasses.h"
28#include "llvm/Pass.h"
29#include "llvm/Transforms/Utils/ModuleUtils.h"
30
31#include <algorithm>
32
33#define DEBUG_TYPE "amdgpu-lower-exec-sync"
34
35using namespace llvm;
36using namespace AMDGPU;
37
38namespace {
39
40// Write the specified address into metadata where it can be retrieved by
41// the assembler. Format is a half open range, [Address Address+1)
42static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
43 uint32_t Address) {
44 LLVMContext &Ctx = M->getContext();
45 auto *IntTy = M->getDataLayout().getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
46 auto *MinC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address));
47 auto *MaxC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address + 1));
48 GV->setMetadata(KindID: LLVMContext::MD_absolute_symbol,
49 Node: MDNode::get(Context&: Ctx, MDs: {MinC, MaxC}));
50}
51
52/// Get next available ID for sync object. The ID allocation is tracked in \p
53/// MaxNumGroup groups by \p NextAvailableIDTracker. Each call of the function
54/// will ask for \p IDCnt against all the \p Kernels, it will return the
55/// maximum of the available ones and update the ID tracker.
56template <typename T>
57unsigned allocateExecSyncID(T &NextAvailableIDTracker,
58 ArrayRef<Function *> Kernels, unsigned GroupID,
59 unsigned MaxNumGroup, unsigned IDCnt) {
60 constexpr unsigned InitialVal = 1;
61 unsigned NextID = InitialVal;
62 for (Function *F : Kernels) {
63 const SmallVectorImpl<unsigned> &NextAvailableID =
64 NextAvailableIDTracker.lookup(F);
65 unsigned ID = InitialVal;
66 if (!NextAvailableID.empty())
67 ID = NextAvailableID[GroupID];
68
69 if (ID > NextID)
70 NextID = ID;
71 }
72
73 // Bump the next available id for the kernels.
74 for (Function *F : Kernels) {
75 auto Inserted = NextAvailableIDTracker.try_emplace(F);
76 // Initialize on first insertion.
77 if (Inserted.second)
78 Inserted.first->second.assign(MaxNumGroup, InitialVal);
79 // Update the available ID.
80 Inserted.first->second[GroupID] = NextID + IDCnt;
81 }
82 return NextID;
83}
84
85// Main utility function for special LDS variables lowering.
86static bool lowerExecSyncGlobalVariables(Module &M, GVUsesInfoTy &GVUsesInfo) {
87 bool Changed = false;
88 const DataLayout &DL = M.getDataLayout();
89
90 constexpr unsigned NumBarScopes = 1;
91 MapVector<GlobalVariable *, SmallVector<Function *>> AllocationQ;
92 DenseMap<Function *, SmallVector<unsigned, NumBarScopes>> KernelBarrierIDs;
93
94 for (auto &[F, GVs] : GVUsesInfo.IndirectAccess) {
95 for (auto *GV : GVs) {
96 if (!isNamedBarrier(GV: *GV) || GV->isAbsoluteSymbolRef())
97 continue;
98 auto Iter = AllocationQ.find(Key: GV);
99 if (Iter == AllocationQ.end())
100 AllocationQ.insert(KV: {GV, {F}});
101 else
102 Iter->second.push_back(Elt: F);
103 }
104 }
105
106 for (auto &[F, GVs] : GVUsesInfo.DirectAccess) {
107 for (auto *GV : GVs) {
108 if (!isNamedBarrier(GV: *GV) || GV->isAbsoluteSymbolRef())
109 continue;
110 auto Iter = AllocationQ.find(Key: GV);
111 if (Iter == AllocationQ.end())
112 AllocationQ.insert(KV: {GV, {F}});
113 else
114 Iter->second.push_back(Elt: F);
115 }
116 }
117
118 sort(C&: AllocationQ, Comp: [](std::pair<GlobalVariable *, SmallVector<Function *>> A,
119 std::pair<GlobalVariable *, SmallVector<Function *>> B) {
120 // First order by number of kernels that access the GlobalVariable.
121 if (A.second.size() != B.second.size())
122 return A.second.size() > B.second.size();
123
124 // Then order by their names so we always get a deterministic order.
125 return A.first->getName() < B.first->getName();
126 });
127
128 for (auto &[GV, Kernels] : AllocationQ) {
129 unsigned Offset;
130 if (TargetExtType *ExtTy = isNamedBarrier(GV: *GV)) {
131 unsigned BarrierScope = ExtTy->getIntParameter(i: 0);
132 unsigned BarCnt = GV->getGlobalSize(DL) / 16;
133
134 unsigned BarID = allocateExecSyncID(NextAvailableIDTracker&: KernelBarrierIDs, Kernels,
135 GroupID: BarrierScope, MaxNumGroup: NumBarScopes, IDCnt: BarCnt);
136
137 LLVM_DEBUG(GV->printAsOperand(dbgs(), false);
138 dbgs() << " was assigned barrier id: " << BarID
139 << " id-count: " << BarCnt << "\n");
140 // 4 bits for alignment, 5 bits for the barrier num,
141 // 3 bits for the barrier scope
142 Offset = 0x802000u | BarrierScope << 9 | BarID << 4;
143 } else {
144 llvm_unreachable("Unhandled special variable type.");
145 }
146
147 recordLDSAbsoluteAddress(M: &M, GV, Address: Offset);
148 }
149
150 // Also erase those special LDS variables from indirect_access.
151 for (auto &K : GVUsesInfo.IndirectAccess) {
152 assert(isKernel(*K.first));
153 K.second.remove_if(Pred: [](GlobalVariable *GV) { return isNamedBarrier(GV: *GV); });
154 }
155 return Changed;
156}
157
158static bool hasBarrierToLower(const GVUsesInfoTy &GVUsesInfo) {
159 for (auto &Map : {GVUsesInfo.DirectAccess, GVUsesInfo.IndirectAccess}) {
160 for (auto &[Fn, GVs] : Map) {
161 for (auto &GV : GVs) {
162 if (AMDGPU::isNamedBarrier(GV: *GV))
163 return true;
164 }
165 }
166 }
167 return false;
168}
169
170// With object linking, barrier ID assignment is deferred to the linker.
171// Externalize named barrier globals and emit self-contained metadata so the
172// AsmPrinter can generate the callgraph entries the linker needs.
173static bool handleNamedBarriersForObjectLinking(Module &M) {
174 DenseMap<GlobalVariable *, DenseSet<Function *>> BarrierToFuncs;
175 for (GlobalVariable &GV : M.globals()) {
176 if (!isNamedBarrier(GV) || GV.use_empty())
177 continue;
178 for (User *U : GV.users()) {
179 if (auto *I = dyn_cast<Instruction>(Val: U))
180 BarrierToFuncs[&GV].insert(V: I->getFunction());
181 }
182 }
183 if (BarrierToFuncs.empty())
184 return false;
185
186 LLVMContext &Ctx = M.getContext();
187 NamedMDNode *BarMD = M.getOrInsertNamedMetadata(Name: "amdgpu.named_barrier.uses");
188
189 std::string ModuleId;
190 ModuleId = getUniqueModuleId(M: &M);
191 assert(!ModuleId.empty() &&
192 "modules with named barriers should have a unique ID");
193 for (auto &[V, Funcs] : BarrierToFuncs) {
194 if (V->hasLocalLinkage())
195 V->setName("__amdgpu_named_barrier." + V->getName() + ModuleId);
196 else if (!V->getName().starts_with(Prefix: "__amdgpu_named_barrier"))
197 V->setName("__amdgpu_named_barrier." + V->getName());
198 V->setInitializer(nullptr);
199 V->setLinkage(GlobalValue::ExternalLinkage);
200
201 SmallVector<Metadata *, 4> Ops;
202 Ops.push_back(Elt: ValueAsMetadata::get(V));
203 for (Function *F : Funcs)
204 Ops.push_back(Elt: ValueAsMetadata::get(V: F));
205 BarMD->addOperand(M: MDNode::get(Context&: Ctx, MDs: Ops));
206 }
207 return true;
208}
209
210static bool runLowerExecSyncGlobals(Module &M) {
211 if (AMDGPUTargetMachine::EnableObjectLinking)
212 return handleNamedBarriersForObjectLinking(M);
213
214 CallGraph CG = CallGraph(M);
215 bool Changed = false;
216 Changed |=
217 eliminateGVConstantExprUsesFromAllInstructions(M, Filter: isLDSVariableToLower);
218
219 // For each kernel, what variables does it access directly or through
220 // callees
221 GVUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDSForLowering(CG, M);
222
223 if (hasBarrierToLower(GVUsesInfo: LDSUsesInfo)) {
224 // Special LDS variables need special address assignment
225 Changed |= lowerExecSyncGlobalVariables(M, GVUsesInfo&: LDSUsesInfo);
226 }
227
228 return Changed;
229}
230
231class AMDGPULowerExecSyncLegacy : public ModulePass {
232public:
233 static char ID;
234 AMDGPULowerExecSyncLegacy() : ModulePass(ID) {}
235 bool runOnModule(Module &M) override;
236};
237
238} // namespace
239
240char AMDGPULowerExecSyncLegacy::ID = 0;
241char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID;
242
243INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
244 "AMDGPU lowering of execution synchronization", false,
245 false)
246INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
247INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
248 "AMDGPU lowering of execution synchronization", false,
249 false)
250
251bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) {
252 return runLowerExecSyncGlobals(M);
253}
254
255ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() {
256 return new AMDGPULowerExecSyncLegacy();
257}
258
259PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M,
260 ModuleAnalysisManager &AM) {
261 return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none()
262 : PreservedAnalyses::all();
263}
264