//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Lower LDS global variables with target extension type "amdgpu.named.barrier"
// that require specialized address assignment. It assigns a unique
// barrier identifier to each named-barrier LDS variable and encodes
// this identifier within the !absolute_symbol metadata of that global.
// This encoding ensures that subsequent LDS lowering passes can process these
// barriers correctly without conflicts.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMemoryUtils.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"

#include <algorithm>

#define DEBUG_TYPE "amdgpu-lower-exec-sync"

using namespace llvm;
using namespace AMDGPU;

namespace {

// If GV is also used directly by other kernels, create a new GV
// used only by this kernel and its callee functions.
static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
                                           Function *KF) {
  bool NeedsReplacement = false;
  for (Use &U : GV->uses()) {
    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
      Function *F = I->getFunction();
      if (isKernel(*F) && F != KF) {
        NeedsReplacement = true;
        break;
      }
    }
  }
  if (!NeedsReplacement)
    return GV;
  // Create a new GV used only by this kernel and its callee functions.
  GlobalVariable *NewGV = new GlobalVariable(
      M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
      GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
      GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
  NewGV->copyAttributesFrom(GV);
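  // Redirect uses inside KF itself and inside non-kernel (callee) functions to
  // the clone; uses in other kernels keep referring to the original GV.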
  for (Use &U : make_early_inc_range(GV->uses())) {
    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
      Function *F = I->getFunction();
      if (!isKernel(*F) || F == KF) {
        U.getUser()->replaceUsesOfWith(GV, NewGV);
      }
    }
  }
  return NewGV;
}

// Write the specified address into metadata where it can be retrieved by
// the assembler. Format is a half open range, [Address, Address + 1).
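// The emitted node is a pair of integers of the LDS pointer width, e.g.
// !absolute_symbol !{i32 Address, i32 Address + 1}.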
static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
                                     uint32_t Address) {
  LLVMContext &Ctx = M->getContext();
  auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
  auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
  auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
  GV->setMetadata(LLVMContext::MD_absolute_symbol,
                  MDNode::get(Ctx, {MinC, MaxC}));
}

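// Sort by name so that barrier identifiers are assigned in a deterministic
// order, independent of map iteration order.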
template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
  sort(V, [](const auto *L, const auto *R) {
    return L->getName() < R->getName();
  });
  return {std::move(V)};
}

// Main utility function for lowering the special LDS variables.
static bool lowerExecSyncGlobalVariables(
    Module &M, LDSUsesInfoTy &LDSUsesInfo,
    VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
  bool Changed = false;
  const DataLayout &DL = M.getDataLayout();
  // The first round: give module-absolute assignments.
  int NumAbsolutes = 0;
  SmallVector<GlobalVariable *> OrderedGVs;
  for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
    GlobalVariable *GV = K.first;
    if (!isNamedBarrier(*GV))
      continue;
    // Give a module-absolute assignment if the variable is indirectly accessed
    // by multiple kernels. This is not precise, but we don't want to duplicate
    // a function when it is called by multiple kernels.
    if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
      OrderedGVs.push_back(GV);
    } else {
      // Leave it to the second round, which will give a kernel-relative
      // assignment if it is only indirectly accessed by one kernel.
      LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
    }
    LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
  }
  OrderedGVs = sortByName(std::move(OrderedGVs));
  for (GlobalVariable *GV : OrderedGVs) {
    unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
    unsigned BarId = NumAbsolutes + 1;
    unsigned BarCnt = GV->getGlobalSize(DL) / 16;
    NumAbsolutes += BarCnt;

    // 4 bits for alignment, 5 bits for the barrier num,
    // 3 bits for the barrier scope.
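    // That is, bits [3:0] hold the alignment, bits [8:4] the barrier id, and
    // bits [11:9] the scope, combined with the fixed 0x802000 bits.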
    unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
    recordLDSAbsoluteAddress(&M, GV, Offset);
  }
  OrderedGVs.clear();

  // The second round: give a kernel-relative assignment to each GV that is
  // either only indirectly accessed by a single kernel or only directly
  // accessed by multiple kernels.
  SmallVector<Function *> OrderedKernels;
  for (auto &K : LDSUsesInfo.direct_access) {
    Function *F = K.first;
    assert(isKernel(*F));
    OrderedKernels.push_back(F);
  }
  OrderedKernels = sortByName(std::move(OrderedKernels));

  DenseMap<Function *, uint32_t> Kernel2BarId;
  for (Function *F : OrderedKernels) {
    for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
      if (!isNamedBarrier(*GV))
        continue;

      LDSUsesInfo.direct_access[F].erase(GV);
      if (GV->isAbsoluteSymbolRef()) {
        // Already assigned.
        continue;
      }
      OrderedGVs.push_back(GV);
    }
    OrderedGVs = sortByName(std::move(OrderedGVs));
    for (GlobalVariable *GV : OrderedGVs) {
      // GV could also be used directly by other kernels. If so, we need to
      // create a new GV used only by this kernel and its callee functions.
      auto NewGV = uniquifyGVPerKernel(M, GV, F);
      Changed |= (NewGV != GV);
      unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
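      // Per-kernel barrier IDs start after the module-absolute ones so the two
      // ranges never overlap.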
      unsigned BarId = Kernel2BarId[F];
      BarId += NumAbsolutes + 1;
      unsigned BarCnt = GV->getGlobalSize(DL) / 16;
      Kernel2BarId[F] += BarCnt;
      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
      recordLDSAbsoluteAddress(&M, NewGV, Offset);
    }
    OrderedGVs.clear();
  }
  // Also erase those special LDS variables from indirect_access.
  for (auto &K : LDSUsesInfo.indirect_access) {
    assert(isKernel(*K.first));
    for (GlobalVariable *GV : K.second) {
      if (isNamedBarrier(*GV))
        K.second.erase(GV);
    }
  }
  return Changed;
}

static bool runLowerExecSyncGlobals(Module &M) {
  CallGraph CG = CallGraph(M);
  bool Changed = false;
  Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);

  // For each kernel, what variables does it access directly or through
  // callees.
  LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);

  // For each variable accessed through callees, which kernels access it.
  VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
  for (auto &K : LDSUsesInfo.indirect_access) {
    Function *F = K.first;
    assert(isKernel(*F));
    for (GlobalVariable *GV : K.second) {
      LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
    }
  }

  if (LDSUsesInfo.HasSpecialGVs) {
    // Special LDS variables need special address assignment.
    Changed |= lowerExecSyncGlobalVariables(
        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
  }
  return Changed;
}

class AMDGPULowerExecSyncLegacy : public ModulePass {
public:
  static char ID;
  AMDGPULowerExecSyncLegacy() : ModulePass(ID) {}
  bool runOnModule(Module &M) override;
};

} // namespace

char AMDGPULowerExecSyncLegacy::ID = 0;
char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID;

INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
                      "AMDGPU lowering of execution synchronization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
                    "AMDGPU lowering of execution synchronization", false,
                    false)

bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) {
  return runLowerExecSyncGlobals(M);
}

ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() {
  return new AMDGPULowerExecSyncLegacy();
}

PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M,
                                               ModuleAnalysisManager &AM) {
  return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none()
                                    : PreservedAnalyses::all();
}