| 1 | //===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
// This pass eliminates local data store (LDS) uses from non-kernel functions.
// LDS is contiguous memory allocated per kernel execution.
| 11 | // |
| 12 | // Background. |
| 13 | // |
// The programming model is global variables, or equivalently function local
// static variables, accessible from kernels or other functions. For uses from
// kernels this is straightforward - assign the kernel an integer giving the
// memory required by all the variables combined, then allocate them within
// that block. For uses from functions there are performance tradeoffs to
// choose between.
| 19 | // |
| 20 | // This model means the GPU runtime can specify the amount of memory allocated. |
| 21 | // If this is more than the kernel assumed, the excess can be made available |
| 22 | // using a language specific feature, which IR represents as a variable with |
| 23 | // no initializer. This feature is referred to here as "Dynamic LDS" and is |
| 24 | // lowered slightly differently to the normal case. |
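//
// As an illustrative sketch (variable name hypothetical), HIP's
// "extern __shared__ float dynamic_lds[];" reaches this pass as an LDS global
// with no initializer, roughly:
//   @dynamic_lds = external addrspace(3) global [0 x float], align 4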
| 25 | // |
| 26 | // Consequences of this GPU feature: |
| 27 | // - memory is limited and exceeding it halts compilation |
| 28 | // - a global accessed by one kernel exists independent of other kernels |
| 29 | // - a global exists independent of simultaneous execution of the same kernel |
// - the address of the global may differ between kernels as the instances do
//   not alias, which permits allocating only the variables each kernel uses
| 32 | // - if the address is allowed to differ, functions need help to find it |
| 33 | // |
| 34 | // Uses from kernels are implemented here by grouping them in a per-kernel |
| 35 | // struct instance. This duplicates the variables, accurately modelling their |
| 36 | // aliasing properties relative to a single global representation. It also |
| 37 | // permits control over alignment via padding. |
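//
// A minimal sketch with hypothetical names: if kernel @k0 uses LDS globals
// @A (i32) and @B (double), the pass builds roughly
//   %llvm.amdgcn.kernel.k0.lds.t = type { double, i32 }
//   @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global
//       %llvm.amdgcn.kernel.k0.lds.t poison, align 8
// and rewrites uses of @A and @B within @k0 to constant GEPs into that
// instance. Field order, padding and the initializer shown here are only
// indicative; the real layout is computed later in this file.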
| 38 | // |
// Uses from functions are more complicated and are the primary purpose of this
// IR pass. Several different lowerings are chosen between to meet requirements
// to avoid allocating any LDS where it is not necessary, as that impacts
// occupancy and may cause compilation to fail, while not imposing overhead on
// a feature whose primary advantage over global memory is performance. The
// basic design goal is to avoid one kernel imposing overhead on another.
| 45 | // |
| 46 | // Implementation. |
| 47 | // |
| 48 | // LDS variables with constant annotation or non-undef initializer are passed |
| 49 | // through unchanged for simplification or error diagnostics in later passes. |
| 50 | // Non-undef initializers are not yet implemented for LDS. |
| 51 | // |
| 52 | // LDS variables that are always allocated at the same address can be found |
| 53 | // by lookup at that address. Otherwise runtime information/cost is required. |
| 54 | // |
| 55 | // The simplest strategy possible is to group all LDS variables in a single |
| 56 | // struct and allocate that struct in every kernel such that the original |
| 57 | // variables are always at the same address. LDS is however a limited resource |
| 58 | // so this strategy is unusable in practice. It is not implemented here. |
| 59 | // |
// Strategy | Precise allocation | Zero runtime cost | General purpose |
//  --------+--------------------+-------------------+-----------------+
//   Module |                 No |               Yes |             Yes |
//    Table |                Yes |                No |             Yes |
//   Kernel |                Yes |               Yes |              No |
//   Hybrid |                Yes |           Partial |             Yes |
| 66 | // |
| 67 | // "Module" spends LDS memory to save cycles. "Table" spends cycles and global |
| 68 | // memory to save LDS. "Kernel" is as fast as kernel allocation but only works |
| 69 | // for variables that are known reachable from a single kernel. "Hybrid" picks |
| 70 | // between all three. When forced to choose between LDS and cycles we minimise |
| 71 | // LDS use. |
| 72 | |
| 73 | // The "module" lowering implemented here finds LDS variables which are used by |
| 74 | // non-kernel functions and creates a new struct with a field for each of those |
| 75 | // LDS variables. Variables that are only used from kernels are excluded. |
| 76 | // |
| 77 | // The "table" lowering implemented here has three components. |
// First, each kernel is assigned a unique integer identifier which is
// available in functions it calls through the intrinsic amdgcn_lds_kernel_id.
// The integer is passed through a specific SGPR and thus works with indirect
// calls.
| 81 | // Second, each kernel allocates LDS variables independent of other kernels and |
| 82 | // writes the addresses it chose for each variable into an array in consistent |
| 83 | // order. If the kernel does not allocate a given variable, it writes undef to |
| 84 | // the corresponding array location. These arrays are written to a constant |
| 85 | // table in the order matching the kernel unique integer identifier. |
| 86 | // Third, uses from non-kernel functions are replaced with a table lookup using |
| 87 | // the intrinsic function to find the address of the variable. |
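//
// Schematically (kernel count N, variable count M and the value names are
// placeholders), the table and a rewritten use look like
//   @llvm.amdgcn.lds.offset.table = internal addrspace(4) constant
//       [N x [M x i32]] [...]
//   %id   = call i32 @llvm.amdgcn.lds.kernel.id()
//   %slot = getelementptr inbounds [N x [M x i32]],
//           ptr addrspace(4) @llvm.amdgcn.lds.offset.table,
//           i32 0, i32 %id, i32 <index of the variable>
//   %off  = load i32, ptr addrspace(4) %slot
//   %ptr  = inttoptr i32 %off to ptr addrspace(3)
// with %ptr then used in place of the original LDS global.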
| 88 | // |
| 89 | // "Kernel" lowering is only applicable for variables that are unambiguously |
| 90 | // reachable from exactly one kernel. For those cases, accesses to the variable |
| 91 | // can be lowered to ConstantExpr address of a struct instance specific to that |
| 92 | // one kernel. This is zero cost in space and in compute. It will raise a fatal |
| 93 | // error on any variable that might be reachable from multiple kernels and is |
| 94 | // thus most easily used as part of the hybrid lowering strategy. |
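//
// For example (names hypothetical), a use of @A reachable only from kernel
// @k0 is rewritten to a constant expression such as
//   getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t,
//       ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 <field>)
// with no runtime lookup involved.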
| 95 | // |
| 96 | // Hybrid lowering is a mixture of the above. It uses the zero cost kernel |
| 97 | // lowering where it can. It lowers the variable accessed by the greatest |
| 98 | // number of kernels using the module strategy as that is free for the first |
// variable. Any further variables that can be lowered with the module strategy
// without incurring LDS memory overhead are lowered that way as well. The
// remaining ones are lowered via table.
| 102 | // |
| 103 | // Consequences |
| 104 | // - No heuristics or user controlled magic numbers, hybrid is the right choice |
| 105 | // - Kernels that don't use functions (or have had them all inlined) are not |
| 106 | // affected by any lowering for kernels that do. |
| 107 | // - Kernels that don't make indirect function calls are not affected by those |
| 108 | // that do. |
| 109 | // - Variables which are used by lots of kernels, e.g. those injected by a |
| 110 | // language runtime in most kernels, are expected to have no overhead |
| 111 | // - Implementations that instantiate templates per-kernel where those templates |
| 112 | // use LDS are expected to hit the "Kernel" lowering strategy |
| 113 | // - The runtime properties impose a cost in compiler implementation complexity |
| 114 | // |
| 115 | // Dynamic LDS implementation |
| 116 | // Dynamic LDS is lowered similarly to the "table" strategy above and uses the |
| 117 | // same intrinsic to identify which kernel is at the root of the dynamic call |
| 118 | // graph. This relies on the specified behaviour that all dynamic LDS variables |
| 119 | // alias one another, i.e. are at the same address, with respect to a given |
| 120 | // kernel. Therefore this pass creates new dynamic LDS variables for each kernel |
| 121 | // that allocates any dynamic LDS and builds a table of addresses out of those. |
| 122 | // The AMDGPUPromoteAlloca pass skips kernels that use dynamic LDS. |
| 123 | // The corresponding optimisation for "kernel" lowering where the table lookup |
| 124 | // is elided is not implemented. |
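//
// Roughly (kernel name and table length are placeholders), each such kernel
// gets a representative variable and the addresses go into a second table:
//   @llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8]
//   @llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant
//       [N x i32] [...]
// Non-kernel uses of dynamic LDS variables are then rewritten to index this
// table via llvm.amdgcn.lds.kernel.id, as for the static table above.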
| 125 | // |
| 126 | // |
| 127 | // Implementation notes / limitations |
// A single LDS global variable represents an instance per kernel that can
// reach said variable. This pass essentially specialises said variable per
// kernel.
| 130 | // Handling ConstantExpr during the pass complicated this significantly so now |
| 131 | // all ConstantExpr uses of LDS variables are expanded to instructions. This |
| 132 | // may need amending when implementing non-undef initialisers. |
| 133 | // |
| 134 | // Lowering is split between this IR pass and the back end. This pass chooses |
| 135 | // where given variables should be allocated and marks them with metadata, |
| 136 | // MD_absolute_symbol. The backend places the variables in coincidentally the |
| 137 | // same location and raises a fatal error if something has gone awry. This works |
| 138 | // in practice because the only pass between this one and the backend that |
| 139 | // changes LDS is PromoteAlloca and the changes it makes do not conflict. |
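//
// For example, a variable this pass decides to allocate at LDS address zero
// is annotated with a half open range, which in IR looks roughly like
//   @llvm.amdgcn.module.lds = ... !absolute_symbol !0
//   !0 = !{i32 0, i32 1}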
| 140 | // |
| 141 | // Addresses are written to constant global arrays based on the same metadata. |
| 142 | // |
| 143 | // The backend lowers LDS variables in the order of traversal of the function. |
| 144 | // This is at odds with the deterministic layout required. The workaround is to |
| 145 | // allocate the fixed-address variables immediately upon starting the function |
| 146 | // where they can be placed as intended. This requires a means of mapping from |
| 147 | // the function to the variables that it allocates. For the module scope lds, |
| 148 | // this is via metadata indicating whether the variable is not required. If a |
| 149 | // pass deletes that metadata, a fatal error on disagreement with the absolute |
| 150 | // symbol metadata will occur. For kernel scope and dynamic, this is by _name_ |
| 151 | // correspondence between the function and the variable. It requires the |
| 152 | // kernel to have a name (which is only a limitation for tests in practice) and |
| 153 | // for nothing to rename the corresponding symbols. This is a hazard if the pass |
| 154 | // is run multiple times during debugging. Alternative schemes considered all |
| 155 | // involve bespoke metadata. |
| 156 | // |
| 157 | // If the name correspondence can be replaced, multiple distinct kernels that |
| 158 | // have the same memory layout can map to the same kernel id (as the address |
| 159 | // itself is handled by the absolute symbol metadata) and that will allow more |
| 160 | // uses of the "kernel" style faster lowering and reduce the size of the lookup |
| 161 | // tables. |
| 162 | // |
| 163 | // There is a test that checks this does not fire for a graphics shader. This |
| 164 | // lowering is expected to work for graphics if the isKernel test is changed. |
| 165 | // |
| 166 | // The current markUsedByKernel is sufficient for PromoteAlloca but is elided |
| 167 | // before codegen. Replacing this with an equivalent intrinsic which lasts until |
| 168 | // shortly after the machine function lowering of LDS would help break the name |
| 169 | // mapping. The other part needed is probably to amend PromoteAlloca to embed |
| 170 | // the LDS variables it creates in the same struct created here. That avoids the |
| 171 | // current hazard where a PromoteAlloca LDS variable might be allocated before |
| 172 | // the kernel scope (and thus error on the address check). Given a new invariant |
| 173 | // that no LDS variables exist outside of the structs managed here, and an |
| 174 | // intrinsic that lasts until after the LDS frame lowering, it should be |
| 175 | // possible to drop the name mapping and fold equivalent memory layouts. |
| 176 | // |
| 177 | //===----------------------------------------------------------------------===// |
| 178 | |
| 179 | #include "AMDGPU.h" |
| 180 | #include "AMDGPUMemoryUtils.h" |
| 181 | #include "AMDGPUTargetMachine.h" |
| 182 | #include "Utils/AMDGPUBaseInfo.h" |
| 183 | #include "llvm/ADT/BitVector.h" |
| 184 | #include "llvm/ADT/DenseMap.h" |
| 185 | #include "llvm/ADT/DenseSet.h" |
| 186 | #include "llvm/ADT/STLExtras.h" |
| 187 | #include "llvm/ADT/SetOperations.h" |
| 188 | #include "llvm/Analysis/CallGraph.h" |
| 189 | #include "llvm/Analysis/ScopedNoAliasAA.h" |
| 190 | #include "llvm/CodeGen/TargetPassConfig.h" |
| 191 | #include "llvm/IR/Constants.h" |
| 192 | #include "llvm/IR/DerivedTypes.h" |
| 193 | #include "llvm/IR/IRBuilder.h" |
| 194 | #include "llvm/IR/InlineAsm.h" |
| 195 | #include "llvm/IR/Instructions.h" |
| 196 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
| 197 | #include "llvm/IR/MDBuilder.h" |
| 198 | #include "llvm/IR/ReplaceConstant.h" |
| 199 | #include "llvm/Pass.h" |
| 200 | #include "llvm/Support/CommandLine.h" |
| 201 | #include "llvm/Support/Debug.h" |
| 202 | #include "llvm/Support/Format.h" |
| 203 | #include "llvm/Support/OptimizedStructLayout.h" |
| 204 | #include "llvm/Support/raw_ostream.h" |
| 205 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| 206 | #include "llvm/Transforms/Utils/ModuleUtils.h" |
| 207 | |
| 208 | #include <vector> |
| 209 | |
| 210 | #include <cstdio> |
| 211 | |
| 212 | #define DEBUG_TYPE "amdgpu-lower-module-lds" |
| 213 | |
| 214 | using namespace llvm; |
| 215 | using namespace AMDGPU; |
| 216 | |
| 217 | namespace { |
| 218 | |
cl::opt<bool> SuperAlignLDSGlobals(
    "amdgpu-super-align-lds-globals",
    cl::desc("Increase alignment of LDS if it is not on align boundary"),
    cl::init(true), cl::Hidden);

enum class LoweringKind { module, table, kernel, hybrid };
cl::opt<LoweringKind> LoweringKindLoc(
    "amdgpu-lower-module-lds-strategy",
    cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden,
    cl::init(LoweringKind::hybrid),
    cl::values(
        clEnumValN(LoweringKind::table, "table", "Lower via table lookup"),
        clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
        clEnumValN(
            LoweringKind::kernel, "kernel",
            "Lower variables reachable from one kernel, otherwise abort"),
        clEnumValN(LoweringKind::hybrid, "hybrid",
                   "Lower via mixture of above strategies")));
| 237 | |
| 238 | template <typename T> std::vector<T> sortByName(std::vector<T> &&V) { |
| 239 | llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) { |
| 240 | return L->getName() < R->getName(); |
| 241 | }); |
| 242 | return {std::move(V)}; |
| 243 | } |
| 244 | |
| 245 | class AMDGPULowerModuleLDS { |
| 246 | const AMDGPUTargetMachine &TM; |
| 247 | |
  static void
  removeLocalVarsFromUsedLists(Module &M,
                               const DenseSet<GlobalVariable *> &LocalVars) {
    // The verifier rejects used lists containing an inttoptr of a constant
    // so remove the variables from these lists before replaceAllUsesWith
    SmallPtrSet<Constant *, 8> LocalVarsSet;
    for (GlobalVariable *LocalVar : LocalVars)
      LocalVarsSet.insert(cast<Constant>(LocalVar->stripPointerCasts()));

    removeFromUsedLists(
        M, [&LocalVarsSet](Constant *C) { return LocalVarsSet.count(C); });

    for (GlobalVariable *LocalVar : LocalVars)
      LocalVar->removeDeadConstantUsers();
  }
| 263 | |
| 264 | static void markUsedByKernel(Function *Func, GlobalVariable *SGV) { |
| 265 | // The llvm.amdgcn.module.lds instance is implicitly used by all kernels |
| 266 | // that might call a function which accesses a field within it. This is |
| 267 | // presently approximated to 'all kernels' if there are any such functions |
| 268 | // in the module. This implicit use is redefined as an explicit use here so |
| 269 | // that later passes, specifically PromoteAlloca, account for the required |
| 270 | // memory without any knowledge of this transform. |
| 271 | |
| 272 | // An operand bundle on llvm.donothing works because the call instruction |
| 273 | // survives until after the last pass that needs to account for LDS. It is |
| 274 | // better than inline asm as the latter survives until the end of codegen. A |
| 275 | // totally robust solution would be a function with the same semantics as |
| 276 | // llvm.donothing that takes a pointer to the instance and is lowered to a |
| 277 | // no-op after LDS is allocated, but that is not presently necessary. |
| 278 | |
| 279 | // This intrinsic is eliminated shortly before instruction selection. It |
| 280 | // does not suffice to indicate to ISel that a given global which is not |
| 281 | // immediately used by the kernel must still be allocated by it. An |
| 282 | // equivalent target specific intrinsic which lasts until immediately after |
| 283 | // codegen would suffice for that, but one would still need to ensure that |
| 284 | // the variables are allocated in the anticipated order. |
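    // The marker emitted for a module scope struct is, schematically,
    //   call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3)
    //       @llvm.amdgcn.module.lds) ]
    // where the bundle operand is the constant GEP built below.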
| 285 | BasicBlock *Entry = &Func->getEntryBlock(); |
| 286 | IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); |
| 287 | |
| 288 | Function *Decl = Intrinsic::getOrInsertDeclaration( |
| 289 | M: Func->getParent(), id: Intrinsic::donothing, Tys: {}); |
| 290 | |
| 291 | Value *UseInstance[1] = { |
| 292 | Builder.CreateConstInBoundsGEP1_32(Ty: SGV->getValueType(), Ptr: SGV, Idx0: 0)}; |
| 293 | |
| 294 | Builder.CreateCall( |
| 295 | Callee: Decl, Args: {}, OpBundles: {OperandBundleDefT<Value *>("ExplicitUse" , UseInstance)}); |
| 296 | } |
| 297 | |
| 298 | public: |
| 299 | AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {} |
| 300 | |
| 301 | struct LDSVariableReplacement { |
| 302 | GlobalVariable *SGV = nullptr; |
| 303 | DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP; |
| 304 | }; |
| 305 | |
| 306 | // remap from lds global to a constantexpr gep to where it has been moved to |
| 307 | // for each kernel |
| 308 | // an array with an element for each kernel containing where the corresponding |
| 309 | // variable was remapped to |
| 310 | |
| 311 | static Constant *getAddressesOfVariablesInKernel( |
| 312 | LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables, |
| 313 | const DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) { |
| 314 | // Create a ConstantArray containing the address of each Variable within the |
| 315 | // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel |
| 316 | // does not allocate it |
| 317 | // TODO: Drop the ptrtoint conversion |
| 318 | |
| 319 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
| 320 | |
| 321 | ArrayType *KernelOffsetsType = ArrayType::get(ElementType: I32, NumElements: Variables.size()); |
| 322 | |
| 323 | SmallVector<Constant *> Elements; |
| 324 | for (GlobalVariable *GV : Variables) { |
| 325 | auto ConstantGepIt = LDSVarsToConstantGEP.find(Val: GV); |
| 326 | if (ConstantGepIt != LDSVarsToConstantGEP.end()) { |
| 327 | auto *elt = ConstantExpr::getPtrToInt(C: ConstantGepIt->second, Ty: I32); |
| 328 | Elements.push_back(Elt: elt); |
| 329 | } else { |
| 330 | Elements.push_back(Elt: PoisonValue::get(T: I32)); |
| 331 | } |
| 332 | } |
| 333 | return ConstantArray::get(T: KernelOffsetsType, V: Elements); |
| 334 | } |
| 335 | |
| 336 | static GlobalVariable *buildLookupTable( |
| 337 | Module &M, ArrayRef<GlobalVariable *> Variables, |
| 338 | ArrayRef<Function *> kernels, |
| 339 | DenseMap<Function *, LDSVariableReplacement> &KernelToReplacement) { |
| 340 | if (Variables.empty()) { |
| 341 | return nullptr; |
| 342 | } |
| 343 | LLVMContext &Ctx = M.getContext(); |
| 344 | |
| 345 | const size_t NumberVariables = Variables.size(); |
| 346 | const size_t NumberKernels = kernels.size(); |
| 347 | |
| 348 | ArrayType *KernelOffsetsType = |
| 349 | ArrayType::get(ElementType: Type::getInt32Ty(C&: Ctx), NumElements: NumberVariables); |
| 350 | |
| 351 | ArrayType *AllKernelsOffsetsType = |
| 352 | ArrayType::get(ElementType: KernelOffsetsType, NumElements: NumberKernels); |
| 353 | |
| 354 | Constant *Missing = PoisonValue::get(T: KernelOffsetsType); |
| 355 | std::vector<Constant *> overallConstantExprElts(NumberKernels); |
| 356 | for (size_t i = 0; i < NumberKernels; i++) { |
| 357 | auto Replacement = KernelToReplacement.find(Val: kernels[i]); |
| 358 | overallConstantExprElts[i] = |
| 359 | (Replacement == KernelToReplacement.end()) |
| 360 | ? Missing |
| 361 | : getAddressesOfVariablesInKernel( |
| 362 | Ctx, Variables, LDSVarsToConstantGEP: Replacement->second.LDSVarsToConstantGEP); |
| 363 | } |
| 364 | |
| 365 | Constant *init = |
| 366 | ConstantArray::get(T: AllKernelsOffsetsType, V: overallConstantExprElts); |
| 367 | |
| 368 | return new GlobalVariable( |
| 369 | M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init, |
| 370 | "llvm.amdgcn.lds.offset.table" , nullptr, GlobalValue::NotThreadLocal, |
| 371 | AMDGPUAS::CONSTANT_ADDRESS); |
| 372 | } |
| 373 | |
| 374 | void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder, |
| 375 | GlobalVariable *LookupTable, |
| 376 | GlobalVariable *GV, Use &U, |
| 377 | Value *OptionalIndex) { |
| 378 | // Table is a constant array of the same length as OrderedKernels |
| 379 | LLVMContext &Ctx = M.getContext(); |
| 380 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
| 381 | auto *I = cast<Instruction>(Val: U.getUser()); |
| 382 | |
| 383 | Value *tableKernelIndex = getTableLookupKernelIndex(M, F: I->getFunction()); |
| 384 | |
| 385 | if (auto *Phi = dyn_cast<PHINode>(Val: I)) { |
| 386 | BasicBlock *BB = Phi->getIncomingBlock(U); |
| 387 | Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt()))); |
| 388 | } else { |
| 389 | Builder.SetInsertPoint(I); |
| 390 | } |
| 391 | |
| 392 | SmallVector<Value *, 3> GEPIdx = { |
| 393 | ConstantInt::get(Ty: I32, V: 0), |
| 394 | tableKernelIndex, |
| 395 | }; |
| 396 | if (OptionalIndex) |
| 397 | GEPIdx.push_back(Elt: OptionalIndex); |
| 398 | |
| 399 | Value *Address = Builder.CreateInBoundsGEP( |
| 400 | Ty: LookupTable->getValueType(), Ptr: LookupTable, IdxList: GEPIdx, Name: GV->getName()); |
| 401 | |
| 402 | Value *loaded = Builder.CreateLoad(Ty: I32, Ptr: Address); |
| 403 | |
| 404 | Value *replacement = |
| 405 | Builder.CreateIntToPtr(V: loaded, DestTy: GV->getType(), Name: GV->getName()); |
| 406 | |
| 407 | U.set(replacement); |
| 408 | } |
| 409 | |
| 410 | void replaceUsesInInstructionsWithTableLookup( |
| 411 | Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables, |
| 412 | GlobalVariable *LookupTable) { |
| 413 | |
| 414 | LLVMContext &Ctx = M.getContext(); |
| 415 | IRBuilder<> Builder(Ctx); |
| 416 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
| 417 | |
| 418 | for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) { |
| 419 | auto *GV = ModuleScopeVariables[Index]; |
| 420 | |
| 421 | for (Use &U : make_early_inc_range(Range: GV->uses())) { |
| 422 | auto *I = dyn_cast<Instruction>(Val: U.getUser()); |
| 423 | if (!I) |
| 424 | continue; |
| 425 | |
| 426 | replaceUseWithTableLookup(M, Builder, LookupTable, GV, U, |
| 427 | OptionalIndex: ConstantInt::get(Ty: I32, V: Index)); |
| 428 | } |
| 429 | } |
| 430 | } |
| 431 | |
| 432 | static DenseSet<Function *> kernelsThatIndirectlyAccessAnyOfPassedVariables( |
| 433 | Module &M, LDSUsesInfoTy &LDSUsesInfo, |
| 434 | DenseSet<GlobalVariable *> const &VariableSet) { |
| 435 | |
| 436 | DenseSet<Function *> KernelSet; |
| 437 | |
| 438 | if (VariableSet.empty()) |
| 439 | return KernelSet; |
| 440 | |
| 441 | for (Function &Func : M.functions()) { |
| 442 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
| 443 | continue; |
| 444 | for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) { |
| 445 | if (VariableSet.contains(V: GV)) { |
| 446 | KernelSet.insert(V: &Func); |
| 447 | break; |
| 448 | } |
| 449 | } |
| 450 | } |
| 451 | |
| 452 | return KernelSet; |
| 453 | } |
| 454 | |
| 455 | static GlobalVariable * |
| 456 | chooseBestVariableForModuleStrategy(const DataLayout &DL, |
| 457 | VariableFunctionMap &LDSVars) { |
| 458 | // Find the global variable with the most indirect uses from kernels |
| 459 | |
| 460 | struct CandidateTy { |
| 461 | GlobalVariable *GV = nullptr; |
| 462 | size_t UserCount = 0; |
| 463 | size_t Size = 0; |
| 464 | |
| 465 | CandidateTy() = default; |
| 466 | |
| 467 | CandidateTy(GlobalVariable *GV, uint64_t UserCount, uint64_t AllocSize) |
| 468 | : GV(GV), UserCount(UserCount), Size(AllocSize) {} |
| 469 | |
| 470 | bool operator<(const CandidateTy &Other) const { |
| 471 | // Fewer users makes module scope variable less attractive |
| 472 | if (UserCount < Other.UserCount) { |
| 473 | return true; |
| 474 | } |
| 475 | if (UserCount > Other.UserCount) { |
| 476 | return false; |
| 477 | } |
| 478 | |
| 479 | // Bigger makes module scope variable less attractive |
| 480 | if (Size < Other.Size) { |
| 481 | return false; |
| 482 | } |
| 483 | |
| 484 | if (Size > Other.Size) { |
| 485 | return true; |
| 486 | } |
| 487 | |
| 488 | // Arbitrary but consistent |
| 489 | return GV->getName() < Other.GV->getName(); |
| 490 | } |
| 491 | }; |
| 492 | |
| 493 | CandidateTy MostUsed; |
| 494 | |
| 495 | for (auto &K : LDSVars) { |
| 496 | GlobalVariable *GV = K.first; |
| 497 | if (K.second.size() <= 1) { |
| 498 | // A variable reachable by only one kernel is best lowered with kernel |
| 499 | // strategy |
| 500 | continue; |
| 501 | } |
| 502 | CandidateTy Candidate( |
| 503 | GV, K.second.size(), |
| 504 | DL.getTypeAllocSize(Ty: GV->getValueType()).getFixedValue()); |
| 505 | if (MostUsed < Candidate) |
| 506 | MostUsed = Candidate; |
| 507 | } |
| 508 | |
| 509 | return MostUsed.GV; |
| 510 | } |
| 511 | |
| 512 | static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, |
| 513 | uint32_t Address) { |
| 514 | // Write the specified address into metadata where it can be retrieved by |
| 515 | // the assembler. Format is a half open range, [Address Address+1) |
| 516 | LLVMContext &Ctx = M->getContext(); |
| 517 | auto *IntTy = |
| 518 | M->getDataLayout().getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS); |
| 519 | auto *MinC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address)); |
| 520 | auto *MaxC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address + 1)); |
| 521 | GV->setMetadata(KindID: LLVMContext::MD_absolute_symbol, |
| 522 | Node: MDNode::get(Context&: Ctx, MDs: {MinC, MaxC})); |
| 523 | } |
| 524 | |
| 525 | DenseMap<Function *, Value *> tableKernelIndexCache; |
| 526 | Value *getTableLookupKernelIndex(Module &M, Function *F) { |
| 527 | // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which |
| 528 | // lowers to a read from a live in register. Emit it once in the entry |
| 529 | // block to spare deduplicating it later. |
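    // The cached value is the result of a call of the form
    //   %kernel.id = call i32 @llvm.amdgcn.lds.kernel.id()
    // (result name illustrative only).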
| 530 | auto [It, Inserted] = tableKernelIndexCache.try_emplace(Key: F); |
| 531 | if (Inserted) { |
| 532 | auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); |
| 533 | IRBuilder<> Builder(&*InsertAt); |
| 534 | |
| 535 | It->second = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_lds_kernel_id, Args: {}); |
| 536 | } |
| 537 | |
| 538 | return It->second; |
| 539 | } |
| 540 | |
| 541 | static std::vector<Function *> assignLDSKernelIDToEachKernel( |
| 542 | Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS, |
| 543 | DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) { |
| 544 | // Associate kernels in the set with an arbitrary but reproducible order and |
| 545 | // annotate them with that order in metadata. This metadata is recognised by |
| 546 | // the backend and lowered to a SGPR which can be read from using |
| 547 | // amdgcn_lds_kernel_id. |
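    // Sketch of the annotation this produces (kernel name hypothetical):
    //   define amdgpu_kernel void @k0() !llvm.amdgcn.lds.kernel.id !0 { ... }
    //   !0 = !{i32 0}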
| 548 | |
| 549 | std::vector<Function *> OrderedKernels; |
| 550 | if (!KernelsThatAllocateTableLDS.empty() || |
| 551 | !KernelsThatIndirectlyAllocateDynamicLDS.empty()) { |
| 552 | |
| 553 | for (Function &Func : M->functions()) { |
| 554 | if (Func.isDeclaration()) |
| 555 | continue; |
| 556 | if (!isKernelLDS(F: &Func)) |
| 557 | continue; |
| 558 | |
| 559 | if (KernelsThatAllocateTableLDS.contains(V: &Func) || |
| 560 | KernelsThatIndirectlyAllocateDynamicLDS.contains(V: &Func)) { |
| 561 | assert(Func.hasName()); // else fatal error earlier |
| 562 | OrderedKernels.push_back(x: &Func); |
| 563 | } |
| 564 | } |
| 565 | |
| 566 | // Put them in an arbitrary but reproducible order |
| 567 | OrderedKernels = sortByName(V: std::move(OrderedKernels)); |
| 568 | |
| 569 | // Annotate the kernels with their order in this vector |
| 570 | LLVMContext &Ctx = M->getContext(); |
| 571 | IRBuilder<> Builder(Ctx); |
| 572 | |
| 573 | if (OrderedKernels.size() > UINT32_MAX) { |
| 574 | // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU |
| 575 | reportFatalUsageError(reason: "unimplemented LDS lowering for > 2**32 kernels" ); |
| 576 | } |
| 577 | |
| 578 | for (size_t i = 0; i < OrderedKernels.size(); i++) { |
| 579 | Metadata *AttrMDArgs[1] = { |
| 580 | ConstantAsMetadata::get(C: Builder.getInt32(C: i)), |
| 581 | }; |
| 582 | OrderedKernels[i]->setMetadata(Kind: "llvm.amdgcn.lds.kernel.id" , |
| 583 | Node: MDNode::get(Context&: Ctx, MDs: AttrMDArgs)); |
| 584 | } |
| 585 | } |
| 586 | return OrderedKernels; |
| 587 | } |
| 588 | |
| 589 | static void partitionVariablesIntoIndirectStrategies( |
| 590 | Module &M, LDSUsesInfoTy const &LDSUsesInfo, |
| 591 | VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly, |
| 592 | DenseSet<GlobalVariable *> &ModuleScopeVariables, |
| 593 | DenseSet<GlobalVariable *> &TableLookupVariables, |
| 594 | DenseSet<GlobalVariable *> &KernelAccessVariables, |
| 595 | DenseSet<GlobalVariable *> &DynamicVariables) { |
| 596 | |
| 597 | GlobalVariable *HybridModuleRoot = |
| 598 | LoweringKindLoc != LoweringKind::hybrid |
| 599 | ? nullptr |
| 600 | : chooseBestVariableForModuleStrategy( |
| 601 | DL: M.getDataLayout(), LDSVars&: LDSToKernelsThatNeedToAccessItIndirectly); |
| 602 | |
| 603 | DenseSet<Function *> const EmptySet; |
| 604 | DenseSet<Function *> const &HybridModuleRootKernels = |
| 605 | HybridModuleRoot |
| 606 | ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot] |
| 607 | : EmptySet; |
| 608 | |
| 609 | for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { |
| 610 | // Each iteration of this loop assigns exactly one global variable to |
| 611 | // exactly one of the implementation strategies. |
| 612 | |
| 613 | GlobalVariable *GV = K.first; |
| 614 | assert(AMDGPU::isLDSVariableToLower(*GV)); |
| 615 | assert(K.second.size() != 0); |
| 616 | |
| 617 | if (AMDGPU::isDynamicLDS(GV: *GV)) { |
| 618 | DynamicVariables.insert(V: GV); |
| 619 | continue; |
| 620 | } |
| 621 | |
| 622 | switch (LoweringKindLoc) { |
| 623 | case LoweringKind::module: |
| 624 | ModuleScopeVariables.insert(V: GV); |
| 625 | break; |
| 626 | |
| 627 | case LoweringKind::table: |
| 628 | TableLookupVariables.insert(V: GV); |
| 629 | break; |
| 630 | |
| 631 | case LoweringKind::kernel: |
| 632 | if (K.second.size() == 1) { |
| 633 | KernelAccessVariables.insert(V: GV); |
| 634 | } else { |
| 635 | // FIXME: This should use DiagnosticInfo |
| 636 | reportFatalUsageError( |
| 637 | reason: "cannot lower LDS '" + GV->getName() + |
| 638 | "' to kernel access as it is reachable from multiple kernels" ); |
| 639 | } |
| 640 | break; |
| 641 | |
| 642 | case LoweringKind::hybrid: { |
| 643 | if (GV == HybridModuleRoot) { |
| 644 | assert(K.second.size() != 1); |
| 645 | ModuleScopeVariables.insert(V: GV); |
| 646 | } else if (K.second.size() == 1) { |
| 647 | KernelAccessVariables.insert(V: GV); |
| 648 | } else if (set_is_subset(S1: K.second, S2: HybridModuleRootKernels)) { |
| 649 | ModuleScopeVariables.insert(V: GV); |
| 650 | } else { |
| 651 | TableLookupVariables.insert(V: GV); |
| 652 | } |
| 653 | break; |
| 654 | } |
| 655 | } |
| 656 | } |
| 657 | |
| 658 | // All LDS variables accessed indirectly have now been partitioned into |
| 659 | // the distinct lowering strategies. |
| 660 | assert(ModuleScopeVariables.size() + TableLookupVariables.size() + |
| 661 | KernelAccessVariables.size() + DynamicVariables.size() == |
| 662 | LDSToKernelsThatNeedToAccessItIndirectly.size()); |
| 663 | } |
| 664 | |
| 665 | static GlobalVariable *lowerModuleScopeStructVariables( |
| 666 | Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables, |
| 667 | DenseSet<Function *> const &KernelsThatAllocateModuleLDS) { |
| 668 | // Create a struct to hold the ModuleScopeVariables |
| 669 | // Replace all uses of those variables from non-kernel functions with the |
| 670 | // new struct instance Replace only the uses from kernel functions that will |
| 671 | // allocate this instance. That is a space optimisation - kernels that use a |
| 672 | // subset of the module scope struct and do not need to allocate it for |
| 673 | // indirect calls will only allocate the subset they use (they do so as part |
| 674 | // of the per-kernel lowering). |
| 675 | if (ModuleScopeVariables.empty()) { |
| 676 | return nullptr; |
| 677 | } |
| 678 | |
| 679 | LLVMContext &Ctx = M.getContext(); |
| 680 | |
| 681 | LDSVariableReplacement ModuleScopeReplacement = |
| 682 | createLDSVariableReplacement(M, VarName: "llvm.amdgcn.module.lds" , |
| 683 | LDSVarsToTransform: ModuleScopeVariables); |
| 684 | |
| 685 | appendToCompilerUsed(M, Values: {static_cast<GlobalValue *>( |
| 686 | ConstantExpr::getPointerBitCastOrAddrSpaceCast( |
| 687 | C: cast<Constant>(Val: ModuleScopeReplacement.SGV), |
| 688 | Ty: PointerType::getUnqual(C&: Ctx)))}); |
| 689 | |
| 690 | // module.lds will be allocated at zero in any kernel that allocates it |
| 691 | recordLDSAbsoluteAddress(M: &M, GV: ModuleScopeReplacement.SGV, Address: 0); |
| 692 | |
| 693 | // historic |
| 694 | removeLocalVarsFromUsedLists(M, LocalVars: ModuleScopeVariables); |
| 695 | |
| 696 | // Replace all uses of module scope variable from non-kernel functions |
| 697 | replaceLDSVariablesWithStruct( |
| 698 | M, LDSVarsToTransformArg: ModuleScopeVariables, Replacement: ModuleScopeReplacement, Predicate: [&](Use &U) { |
| 699 | Instruction *I = dyn_cast<Instruction>(Val: U.getUser()); |
| 700 | if (!I) { |
| 701 | return false; |
| 702 | } |
| 703 | Function *F = I->getFunction(); |
| 704 | return !isKernelLDS(F); |
| 705 | }); |
| 706 | |
| 707 | // Replace uses of module scope variable from kernel functions that |
| 708 | // allocate the module scope variable, otherwise leave them unchanged |
| 709 | // Record on each kernel whether the module scope global is used by it |
| 710 | |
| 711 | for (Function &Func : M.functions()) { |
| 712 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
| 713 | continue; |
| 714 | |
| 715 | if (KernelsThatAllocateModuleLDS.contains(V: &Func)) { |
| 716 | replaceLDSVariablesWithStruct( |
| 717 | M, LDSVarsToTransformArg: ModuleScopeVariables, Replacement: ModuleScopeReplacement, Predicate: [&](Use &U) { |
| 718 | Instruction *I = dyn_cast<Instruction>(Val: U.getUser()); |
| 719 | if (!I) { |
| 720 | return false; |
| 721 | } |
| 722 | Function *F = I->getFunction(); |
| 723 | return F == &Func; |
| 724 | }); |
| 725 | |
| 726 | markUsedByKernel(Func: &Func, SGV: ModuleScopeReplacement.SGV); |
| 727 | } |
| 728 | } |
| 729 | |
| 730 | return ModuleScopeReplacement.SGV; |
| 731 | } |
| 732 | |
| 733 | static DenseMap<Function *, LDSVariableReplacement> |
| 734 | lowerKernelScopeStructVariables( |
| 735 | Module &M, LDSUsesInfoTy &LDSUsesInfo, |
| 736 | DenseSet<GlobalVariable *> const &ModuleScopeVariables, |
| 737 | DenseSet<Function *> const &KernelsThatAllocateModuleLDS, |
| 738 | GlobalVariable *MaybeModuleScopeStruct) { |
| 739 | |
| 740 | // Create a struct for each kernel for the non-module-scope variables. |
| 741 | |
| 742 | DenseMap<Function *, LDSVariableReplacement> KernelToReplacement; |
| 743 | for (Function &Func : M.functions()) { |
| 744 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
| 745 | continue; |
| 746 | |
| 747 | DenseSet<GlobalVariable *> KernelUsedVariables; |
| 748 | // Allocating variables that are used directly in this struct to get |
| 749 | // alignment aware allocation and predictable frame size. |
| 750 | for (auto &v : LDSUsesInfo.direct_access[&Func]) { |
| 751 | if (!AMDGPU::isDynamicLDS(GV: *v)) { |
| 752 | KernelUsedVariables.insert(V: v); |
| 753 | } |
| 754 | } |
| 755 | |
| 756 | // Allocating variables that are accessed indirectly so that a lookup of |
| 757 | // this struct instance can find them from nested functions. |
| 758 | for (auto &v : LDSUsesInfo.indirect_access[&Func]) { |
| 759 | if (!AMDGPU::isDynamicLDS(GV: *v)) { |
| 760 | KernelUsedVariables.insert(V: v); |
| 761 | } |
| 762 | } |
| 763 | |
| 764 | // Variables allocated in module lds must all resolve to that struct, |
| 765 | // not to the per-kernel instance. |
| 766 | if (KernelsThatAllocateModuleLDS.contains(V: &Func)) { |
| 767 | for (GlobalVariable *v : ModuleScopeVariables) { |
| 768 | KernelUsedVariables.erase(V: v); |
| 769 | } |
| 770 | } |
| 771 | |
| 772 | if (KernelUsedVariables.empty()) { |
| 773 | // Either used no LDS, or the LDS it used was all in the module struct |
| 774 | // or dynamically sized |
| 775 | continue; |
| 776 | } |
| 777 | |
| 778 | // The association between kernel function and LDS struct is done by |
| 779 | // symbol name, which only works if the function in question has a |
| 780 | // name This is not expected to be a problem in practice as kernels |
| 781 | // are called by name making anonymous ones (which are named by the |
| 782 | // backend) difficult to use. This does mean that llvm test cases need |
| 783 | // to name the kernels. |
| 784 | if (!Func.hasName()) { |
| 785 | reportFatalUsageError(reason: "anonymous kernels cannot use LDS variables" ); |
| 786 | } |
| 787 | |
| 788 | std::string VarName = |
| 789 | (Twine("llvm.amdgcn.kernel." ) + Func.getName() + ".lds" ).str(); |
| 790 | |
| 791 | auto Replacement = |
| 792 | createLDSVariableReplacement(M, VarName, LDSVarsToTransform: KernelUsedVariables); |
| 793 | |
| 794 | // If any indirect uses, create a direct use to ensure allocation |
| 795 | // TODO: Simpler to unconditionally mark used but that regresses |
| 796 | // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll |
| 797 | auto Accesses = LDSUsesInfo.indirect_access.find(Val: &Func); |
| 798 | if ((Accesses != LDSUsesInfo.indirect_access.end()) && |
| 799 | !Accesses->second.empty()) |
| 800 | markUsedByKernel(Func: &Func, SGV: Replacement.SGV); |
| 801 | |
| 802 | // remove preserves existing codegen |
| 803 | removeLocalVarsFromUsedLists(M, LocalVars: KernelUsedVariables); |
| 804 | KernelToReplacement[&Func] = Replacement; |
| 805 | |
| 806 | // Rewrite uses within kernel to the new struct |
| 807 | replaceLDSVariablesWithStruct( |
| 808 | M, LDSVarsToTransformArg: KernelUsedVariables, Replacement, Predicate: [&Func](Use &U) { |
| 809 | Instruction *I = dyn_cast<Instruction>(Val: U.getUser()); |
| 810 | return I && I->getFunction() == &Func; |
| 811 | }); |
| 812 | } |
| 813 | return KernelToReplacement; |
| 814 | } |
| 815 | |
| 816 | static GlobalVariable * |
| 817 | buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo, |
| 818 | Function *func) { |
| 819 | // Create a dynamic lds variable with a name associated with the passed |
| 820 | // function that has the maximum alignment of any dynamic lds variable |
| 821 | // reachable from this kernel. Dynamic LDS is allocated after the static LDS |
| 822 | // allocation, possibly after alignment padding. The representative variable |
| 823 | // created here has the maximum alignment of any other dynamic variable |
| 824 | // reachable by that kernel. All dynamic LDS variables are allocated at the |
| 825 | // same address in each kernel in order to provide the documented aliasing |
| 826 | // semantics. Setting the alignment here allows this IR pass to accurately |
| 827 | // predict the exact constant at which it will be allocated. |
| 828 | |
| 829 | assert(isKernelLDS(func)); |
| 830 | |
| 831 | LLVMContext &Ctx = M.getContext(); |
| 832 | const DataLayout &DL = M.getDataLayout(); |
| 833 | Align MaxDynamicAlignment(1); |
| 834 | |
| 835 | auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) { |
| 836 | if (AMDGPU::isDynamicLDS(GV: *GV)) { |
| 837 | MaxDynamicAlignment = |
| 838 | std::max(a: MaxDynamicAlignment, b: AMDGPU::getAlign(DL, GV)); |
| 839 | } |
| 840 | }; |
| 841 | |
| 842 | for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) { |
| 843 | UpdateMaxAlignment(GV); |
| 844 | } |
| 845 | |
| 846 | for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) { |
| 847 | UpdateMaxAlignment(GV); |
| 848 | } |
| 849 | |
| 850 | assert(func->hasName()); // Checked by caller |
| 851 | auto *emptyCharArray = ArrayType::get(ElementType: Type::getInt8Ty(C&: Ctx), NumElements: 0); |
| 852 | GlobalVariable *N = new GlobalVariable( |
| 853 | M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr, |
| 854 | Twine("llvm.amdgcn." + func->getName() + ".dynlds" ), nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, |
| 855 | false); |
| 856 | N->setAlignment(MaxDynamicAlignment); |
| 857 | |
| 858 | assert(AMDGPU::isDynamicLDS(*N)); |
| 859 | return N; |
| 860 | } |
| 861 | |
| 862 | DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables( |
| 863 | Module &M, LDSUsesInfoTy &LDSUsesInfo, |
| 864 | DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS, |
| 865 | DenseSet<GlobalVariable *> const &DynamicVariables, |
| 866 | std::vector<Function *> const &OrderedKernels) { |
| 867 | DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS; |
| 868 | if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) { |
| 869 | LLVMContext &Ctx = M.getContext(); |
| 870 | IRBuilder<> Builder(Ctx); |
| 871 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
| 872 | |
| 873 | std::vector<Constant *> newDynamicLDS; |
| 874 | |
| 875 | // Table is built in the same order as OrderedKernels |
| 876 | for (auto &func : OrderedKernels) { |
| 877 | |
| 878 | if (KernelsThatIndirectlyAllocateDynamicLDS.contains(V: func)) { |
| 879 | assert(isKernelLDS(func)); |
| 880 | if (!func->hasName()) { |
| 881 | reportFatalUsageError(reason: "anonymous kernels cannot use LDS variables" ); |
| 882 | } |
| 883 | |
| 884 | GlobalVariable *N = |
| 885 | buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func); |
| 886 | |
| 887 | KernelToCreatedDynamicLDS[func] = N; |
| 888 | |
| 889 | markUsedByKernel(Func: func, SGV: N); |
| 890 | |
| 891 | auto *emptyCharArray = ArrayType::get(ElementType: Type::getInt8Ty(C&: Ctx), NumElements: 0); |
| 892 | auto *GEP = ConstantExpr::getGetElementPtr( |
| 893 | Ty: emptyCharArray, C: N, Idx: ConstantInt::get(Ty: I32, V: 0), NW: true); |
| 894 | newDynamicLDS.push_back(x: ConstantExpr::getPtrToInt(C: GEP, Ty: I32)); |
| 895 | } else { |
| 896 | newDynamicLDS.push_back(x: PoisonValue::get(T: I32)); |
| 897 | } |
| 898 | } |
| 899 | assert(OrderedKernels.size() == newDynamicLDS.size()); |
| 900 | |
| 901 | ArrayType *t = ArrayType::get(ElementType: I32, NumElements: newDynamicLDS.size()); |
| 902 | Constant *init = ConstantArray::get(T: t, V: newDynamicLDS); |
| 903 | GlobalVariable *table = new GlobalVariable( |
| 904 | M, t, true, GlobalValue::InternalLinkage, init, |
| 905 | "llvm.amdgcn.dynlds.offset.table" , nullptr, |
| 906 | GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS); |
| 907 | |
| 908 | for (GlobalVariable *GV : DynamicVariables) { |
| 909 | for (Use &U : make_early_inc_range(Range: GV->uses())) { |
| 910 | auto *I = dyn_cast<Instruction>(Val: U.getUser()); |
| 911 | if (!I) |
| 912 | continue; |
| 913 | if (isKernelLDS(F: I->getFunction())) |
| 914 | continue; |
| 915 | |
| 916 | replaceUseWithTableLookup(M, Builder, LookupTable: table, GV, U, OptionalIndex: nullptr); |
| 917 | } |
| 918 | } |
| 919 | } |
| 920 | return KernelToCreatedDynamicLDS; |
| 921 | } |
| 922 | |
| 923 | static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, |
| 924 | Function *KF) { |
| 925 | bool NeedsReplacement = false; |
| 926 | for (Use &U : GV->uses()) { |
| 927 | if (auto *I = dyn_cast<Instruction>(Val: U.getUser())) { |
| 928 | Function *F = I->getFunction(); |
| 929 | if (isKernelLDS(F) && F != KF) { |
| 930 | NeedsReplacement = true; |
| 931 | break; |
| 932 | } |
| 933 | } |
| 934 | } |
| 935 | if (!NeedsReplacement) |
| 936 | return GV; |
| 937 | // Create a new GV used only by this kernel and its function |
| 938 | GlobalVariable *NewGV = new GlobalVariable( |
| 939 | M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), |
| 940 | GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, |
| 941 | GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); |
| 942 | NewGV->copyAttributesFrom(Src: GV); |
| 943 | for (Use &U : make_early_inc_range(Range: GV->uses())) { |
| 944 | if (auto *I = dyn_cast<Instruction>(Val: U.getUser())) { |
| 945 | Function *F = I->getFunction(); |
| 946 | if (!isKernelLDS(F) || F == KF) { |
| 947 | U.getUser()->replaceUsesOfWith(From: GV, To: NewGV); |
| 948 | } |
| 949 | } |
| 950 | } |
| 951 | return NewGV; |
| 952 | } |
| 953 | |
| 954 | bool lowerSpecialLDSVariables( |
| 955 | Module &M, LDSUsesInfoTy &LDSUsesInfo, |
| 956 | VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { |
| 957 | bool Changed = false; |
| 958 | // The 1st round: give module-absolute assignments |
| 959 | int NumAbsolutes = 0; |
| 960 | std::vector<GlobalVariable *> OrderedGVs; |
| 961 | for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { |
| 962 | GlobalVariable *GV = K.first; |
| 963 | if (!isNamedBarrier(GV: *GV)) |
| 964 | continue; |
| 965 | // give a module-absolute assignment if it is indirectly accessed by |
| 966 | // multiple kernels. This is not precise, but we don't want to duplicate |
| 967 | // a function when it is called by multiple kernels. |
| 968 | if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { |
| 969 | OrderedGVs.push_back(x: GV); |
| 970 | } else { |
| 971 | // leave it to the 2nd round, which will give a kernel-relative |
| 972 | // assignment if it is only indirectly accessed by one kernel |
| 973 | LDSUsesInfo.direct_access[*K.second.begin()].insert(V: GV); |
| 974 | } |
| 975 | LDSToKernelsThatNeedToAccessItIndirectly.erase(Val: GV); |
| 976 | } |
| 977 | OrderedGVs = sortByName(V: std::move(OrderedGVs)); |
| 978 | for (GlobalVariable *GV : OrderedGVs) { |
| 979 | int BarId = ++NumAbsolutes; |
| 980 | unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; |
| 981 | // 4 bits for alignment, 5 bits for the barrier num, |
| 982 | // 3 bits for the barrier scope |
| 983 | unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; |
| 984 | recordLDSAbsoluteAddress(M: &M, GV, Address: Offset); |
| 985 | } |
| 986 | OrderedGVs.clear(); |
| 987 | |
| 988 | // The 2nd round: give a kernel-relative assignment for GV that |
| 989 | // either only indirectly accessed by single kernel or only directly |
| 990 | // accessed by multiple kernels. |
| 991 | std::vector<Function *> OrderedKernels; |
| 992 | for (auto &K : LDSUsesInfo.direct_access) { |
| 993 | Function *F = K.first; |
| 994 | assert(isKernelLDS(F)); |
| 995 | OrderedKernels.push_back(x: F); |
| 996 | } |
| 997 | OrderedKernels = sortByName(V: std::move(OrderedKernels)); |
| 998 | |
| 999 | llvm::DenseMap<Function *, uint32_t> Kernel2BarId; |
| 1000 | for (Function *F : OrderedKernels) { |
| 1001 | for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { |
| 1002 | if (!isNamedBarrier(GV: *GV)) |
| 1003 | continue; |
| 1004 | |
| 1005 | LDSUsesInfo.direct_access[F].erase(V: GV); |
| 1006 | if (GV->isAbsoluteSymbolRef()) { |
| 1007 | // already assigned |
| 1008 | continue; |
| 1009 | } |
| 1010 | OrderedGVs.push_back(x: GV); |
| 1011 | } |
| 1012 | OrderedGVs = sortByName(V: std::move(OrderedGVs)); |
| 1013 | for (GlobalVariable *GV : OrderedGVs) { |
| 1014 | // GV could also be used directly by other kernels. If so, we need to |
| 1015 | // create a new GV used only by this kernel and its function. |
| 1016 | auto NewGV = uniquifyGVPerKernel(M, GV, KF: F); |
| 1017 | Changed |= (NewGV != GV); |
| 1018 | int BarId = (NumAbsolutes + 1); |
| 1019 | if (Kernel2BarId.contains(Val: F)) { |
| 1020 | BarId = (Kernel2BarId[F] + 1); |
| 1021 | } |
| 1022 | Kernel2BarId[F] = BarId; |
| 1023 | unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; |
| 1024 | unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; |
| 1025 | recordLDSAbsoluteAddress(M: &M, GV: NewGV, Address: Offset); |
| 1026 | } |
| 1027 | OrderedGVs.clear(); |
| 1028 | } |
| 1029 | // Also erase those special LDS variables from indirect_access. |
| 1030 | for (auto &K : LDSUsesInfo.indirect_access) { |
| 1031 | assert(isKernelLDS(K.first)); |
| 1032 | for (GlobalVariable *GV : K.second) { |
| 1033 | if (isNamedBarrier(GV: *GV)) |
| 1034 | K.second.erase(V: GV); |
| 1035 | } |
| 1036 | } |
| 1037 | return Changed; |
| 1038 | } |
| 1039 | |
| 1040 | bool runOnModule(Module &M) { |
| 1041 | CallGraph CG = CallGraph(M); |
| 1042 | bool Changed = superAlignLDSGlobals(M); |
| 1043 | |
| 1044 | Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); |
| 1045 | |
| 1046 | Changed = true; // todo: narrow this down |
| 1047 | |
| 1048 | // For each kernel, what variables does it access directly or through |
| 1049 | // callees |
| 1050 | LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); |
| 1051 | |
| 1052 | // For each variable accessed through callees, which kernels access it |
| 1053 | VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; |
| 1054 | for (auto &K : LDSUsesInfo.indirect_access) { |
| 1055 | Function *F = K.first; |
| 1056 | assert(isKernelLDS(F)); |
| 1057 | for (GlobalVariable *GV : K.second) { |
| 1058 | LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(V: F); |
| 1059 | } |
| 1060 | } |
| 1061 | |
| 1062 | if (LDSUsesInfo.HasSpecialGVs) { |
| 1063 | // Special LDS variables need special address assignment |
| 1064 | Changed |= lowerSpecialLDSVariables( |
| 1065 | M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); |
| 1066 | } |
| 1067 | |
| 1068 | // Partition variables accessed indirectly into the different strategies |
| 1069 | DenseSet<GlobalVariable *> ModuleScopeVariables; |
| 1070 | DenseSet<GlobalVariable *> TableLookupVariables; |
| 1071 | DenseSet<GlobalVariable *> KernelAccessVariables; |
| 1072 | DenseSet<GlobalVariable *> DynamicVariables; |
| 1073 | partitionVariablesIntoIndirectStrategies( |
| 1074 | M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly, |
| 1075 | ModuleScopeVariables, TableLookupVariables, KernelAccessVariables, |
| 1076 | DynamicVariables); |
| 1077 | |
| 1078 | // If the kernel accesses a variable that is going to be stored in the |
| 1079 | // module instance through a call then that kernel needs to allocate the |
| 1080 | // module instance |
| 1081 | const DenseSet<Function *> KernelsThatAllocateModuleLDS = |
| 1082 | kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, |
| 1083 | VariableSet: ModuleScopeVariables); |
| 1084 | const DenseSet<Function *> KernelsThatAllocateTableLDS = |
| 1085 | kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, |
| 1086 | VariableSet: TableLookupVariables); |
| 1087 | |
| 1088 | const DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS = |
| 1089 | kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, |
| 1090 | VariableSet: DynamicVariables); |
| 1091 | |
| 1092 | GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables( |
| 1093 | M, ModuleScopeVariables, KernelsThatAllocateModuleLDS); |
| 1094 | |
| 1095 | DenseMap<Function *, LDSVariableReplacement> KernelToReplacement = |
| 1096 | lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables, |
| 1097 | KernelsThatAllocateModuleLDS, |
| 1098 | MaybeModuleScopeStruct); |
| 1099 | |
| 1100 | // Lower zero cost accesses to the kernel instances just created |
| 1101 | for (auto &GV : KernelAccessVariables) { |
| 1102 | auto &funcs = LDSToKernelsThatNeedToAccessItIndirectly[GV]; |
| 1103 | assert(funcs.size() == 1); // Only one kernel can access it |
| 1104 | LDSVariableReplacement Replacement = |
| 1105 | KernelToReplacement[*(funcs.begin())]; |
| 1106 | |
| 1107 | DenseSet<GlobalVariable *> Vec; |
| 1108 | Vec.insert(V: GV); |
| 1109 | |
| 1110 | replaceLDSVariablesWithStruct(M, LDSVarsToTransformArg: Vec, Replacement, Predicate: [](Use &U) { |
| 1111 | return isa<Instruction>(Val: U.getUser()); |
| 1112 | }); |
| 1113 | } |
| 1114 | |
| 1115 | // The ith element of this vector is kernel id i |
| 1116 | std::vector<Function *> OrderedKernels = |
| 1117 | assignLDSKernelIDToEachKernel(M: &M, KernelsThatAllocateTableLDS, |
| 1118 | KernelsThatIndirectlyAllocateDynamicLDS); |
| 1119 | |
| 1120 | if (!KernelsThatAllocateTableLDS.empty()) { |
| 1121 | LLVMContext &Ctx = M.getContext(); |
| 1122 | IRBuilder<> Builder(Ctx); |
| 1123 | |
| 1124 | // The order must be consistent between lookup table and accesses to |
| 1125 | // lookup table |
| 1126 | auto TableLookupVariablesOrdered = |
| 1127 | sortByName(V: std::vector<GlobalVariable *>(TableLookupVariables.begin(), |
| 1128 | TableLookupVariables.end())); |
| 1129 | |
| 1130 | GlobalVariable *LookupTable = buildLookupTable( |
| 1131 | M, Variables: TableLookupVariablesOrdered, kernels: OrderedKernels, KernelToReplacement); |
| 1132 | replaceUsesInInstructionsWithTableLookup(M, ModuleScopeVariables: TableLookupVariablesOrdered, |
| 1133 | LookupTable); |
| 1134 | } |
| 1135 | |
| 1136 | DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS = |
| 1137 | lowerDynamicLDSVariables(M, LDSUsesInfo, |
| 1138 | KernelsThatIndirectlyAllocateDynamicLDS, |
| 1139 | DynamicVariables, OrderedKernels); |
| 1140 | |
| 1141 | // Strip amdgpu-no-lds-kernel-id from all functions reachable from the |
| 1142 | // kernel. We may have inferred this wasn't used prior to the pass. |
| 1143 | // TODO: We could filter out subgraphs that do not access LDS globals. |
| 1144 | for (auto *KernelSet : {&KernelsThatIndirectlyAllocateDynamicLDS, |
| 1145 | &KernelsThatAllocateTableLDS}) |
| 1146 | for (Function *F : *KernelSet) |
| 1147 | removeFnAttrFromReachable(CG, KernelRoot: F, FnAttrs: {"amdgpu-no-lds-kernel-id" }); |
| 1148 | |
| 1149 | // All kernel frames have been allocated. Calculate and record the |
| 1150 | // addresses. |
| 1151 | { |
| 1152 | const DataLayout &DL = M.getDataLayout(); |
| 1153 | |
| 1154 | for (Function &Func : M.functions()) { |
| 1155 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
| 1156 | continue; |
| 1157 | |
| 1158 | // All three of these are optional. The first variable is allocated at |
| 1159 | // zero. They are allocated by AMDGPUMachineFunction as one block. |
| 1160 | // Layout: |
| 1161 | //{ |
| 1162 | // module.lds |
| 1163 | // alignment padding |
| 1164 | // kernel instance |
| 1165 | // alignment padding |
| 1166 | // dynamic lds variables |
| 1167 | //} |
| 1168 | |
| 1169 | const bool AllocateModuleScopeStruct = |
| 1170 | MaybeModuleScopeStruct && |
| 1171 | KernelsThatAllocateModuleLDS.contains(V: &Func); |
| 1172 | |
| 1173 | auto Replacement = KernelToReplacement.find(Val: &Func); |
| 1174 | const bool AllocateKernelScopeStruct = |
| 1175 | Replacement != KernelToReplacement.end(); |
| 1176 | |
| 1177 | const bool AllocateDynamicVariable = |
| 1178 | KernelToCreatedDynamicLDS.contains(Val: &Func); |
| 1179 | |
| 1180 | uint32_t Offset = 0; |
| 1181 | |
| 1182 | if (AllocateModuleScopeStruct) { |
| 1183 | // Allocated at zero, recorded once on construction, not once per |
| 1184 | // kernel |
| 1185 | Offset += DL.getTypeAllocSize(Ty: MaybeModuleScopeStruct->getValueType()); |
| 1186 | } |
| 1187 | |
| 1188 | if (AllocateKernelScopeStruct) { |
| 1189 | GlobalVariable *KernelStruct = Replacement->second.SGV; |
| 1190 | Offset = alignTo(Size: Offset, A: AMDGPU::getAlign(DL, GV: KernelStruct)); |
| 1191 | recordLDSAbsoluteAddress(M: &M, GV: KernelStruct, Address: Offset); |
| 1192 | Offset += DL.getTypeAllocSize(Ty: KernelStruct->getValueType()); |
| 1193 | } |
| 1194 | |
| 1195 | // If there is dynamic allocation, the alignment needed is included in |
| 1196 | // the static frame size. There may be no reference to the dynamic |
| 1197 | // variable in the kernel itself, so without including it here, that |
| 1198 | // alignment padding could be missed. |
| 1199 | if (AllocateDynamicVariable) { |
| 1200 | GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func]; |
| 1201 | Offset = alignTo(Size: Offset, A: AMDGPU::getAlign(DL, GV: DynamicVariable)); |
| 1202 | recordLDSAbsoluteAddress(M: &M, GV: DynamicVariable, Address: Offset); |
| 1203 | } |
| 1204 | |
| 1205 | if (Offset != 0) { |
| 1206 | (void)TM; // TODO: Account for target maximum LDS |
| 1207 | std::string Buffer; |
| 1208 | raw_string_ostream SS{Buffer}; |
| 1209 | SS << format(Fmt: "%u" , Vals: Offset); |
| 1210 | |
// Instead of explicitly marking kernels that access dynamic variables
// with special case metadata, annotate them with min-lds == max-lds, i.e.
// there is no more space available for allocating static LDS variables.
// That is exactly the condition needed to prevent allocating further
// variables that would collide with the addresses assigned to the dynamic
// variables.
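// For example (illustrative values): a kernel with 1024 bytes of static
// LDS that also uses a dynamic variable would be annotated with
// "amdgpu-lds-size"="1024,1024", while one without dynamic LDS would get
// "amdgpu-lds-size"="1024".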
| 1217 | if (AllocateDynamicVariable) |
| 1218 | SS << format(Fmt: ",%u" , Vals: Offset); |
| 1219 | |
| 1220 | Func.addFnAttr(Kind: "amdgpu-lds-size" , Val: Buffer); |
| 1221 | } |
| 1222 | } |
| 1223 | } |
| 1224 | |
| 1225 | for (auto &GV : make_early_inc_range(Range: M.globals())) |
| 1226 | if (AMDGPU::isLDSVariableToLower(GV)) { |
// TODO: these dead globals should probably also be removed from the
// llvm.used / llvm.compiler.used lists.
| 1228 | GV.removeDeadConstantUsers(); |
| 1229 | if (GV.use_empty()) |
| 1230 | GV.eraseFromParent(); |
| 1231 | } |
| 1232 | |
| 1233 | return Changed; |
| 1234 | } |
| 1235 | |
| 1236 | private: |
| 1237 | // Increase the alignment of LDS globals if necessary to maximise the chance |
| 1238 | // that we can use aligned LDS instructions to access them. |
| 1239 | static bool superAlignLDSGlobals(Module &M) { |
| 1240 | const DataLayout &DL = M.getDataLayout(); |
| 1241 | bool Changed = false; |
| 1242 | if (!SuperAlignLDSGlobals) { |
| 1243 | return Changed; |
| 1244 | } |
| 1245 | |
| 1246 | for (auto &GV : M.globals()) { |
| 1247 | if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { |
| 1248 | // Only changing alignment of LDS variables |
| 1249 | continue; |
| 1250 | } |
| 1251 | if (!GV.hasInitializer()) { |
| 1252 | // cuda/hip extern __shared__ variable, leave alignment alone |
| 1253 | continue; |
| 1254 | } |
| 1255 | |
| 1256 | if (GV.isAbsoluteSymbolRef()) { |
| 1257 | // If the variable is already allocated, don't change the alignment |
| 1258 | continue; |
| 1259 | } |
| 1260 | |
| 1261 | Align Alignment = AMDGPU::getAlign(DL, GV: &GV); |
| 1262 | TypeSize GVSize = DL.getTypeAllocSize(Ty: GV.getValueType()); |
| 1263 | |
| 1264 | if (GVSize > 8) { |
| 1265 | // We might want to use a b96 or b128 load/store |
| 1266 | Alignment = std::max(a: Alignment, b: Align(16)); |
| 1267 | } else if (GVSize > 4) { |
| 1268 | // We might want to use a b64 load/store |
| 1269 | Alignment = std::max(a: Alignment, b: Align(8)); |
| 1270 | } else if (GVSize > 2) { |
| 1271 | // We might want to use a b32 load/store |
| 1272 | Alignment = std::max(a: Alignment, b: Align(4)); |
| 1273 | } else if (GVSize > 1) { |
| 1274 | // We might want to use a b16 load/store |
| 1275 | Alignment = std::max(a: Alignment, b: Align(2)); |
| 1276 | } |
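// For instance (illustrative), a [3 x i32] LDS variable of size 12 with
// the default ABI alignment of 4 is raised to align 16 here, allowing a
// single b96/b128 access rather than several b32 accesses.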
| 1277 | |
| 1278 | if (Alignment != AMDGPU::getAlign(DL, GV: &GV)) { |
| 1279 | Changed = true; |
| 1280 | GV.setAlignment(Alignment); |
| 1281 | } |
| 1282 | } |
| 1283 | return Changed; |
| 1284 | } |
| 1285 | |
| 1286 | static LDSVariableReplacement createLDSVariableReplacement( |
| 1287 | Module &M, std::string VarName, |
| 1288 | DenseSet<GlobalVariable *> const &LDSVarsToTransform) { |
// Create a struct type whose fields are the types of LDSVarsToTransform, a
// single global instance of it, and a map from each variable to the
// ConstantExpr GEP addressing its field.
// Padding variables may be introduced to meet alignment requirements. No
// aliasing metadata is useful for these as they have no uses; they are
// erased before returning.
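// For example (illustrative IR, not produced verbatim by this pass): given
//   @a = internal addrspace(3) global i64 poison
//   @b = internal addrspace(3) global i16 poison
// this creates roughly
//   %VarName.t = type { i64, i16 }
//   @VarName = internal addrspace(3) global %VarName.t poison, align 8
// and maps
//   @a -> getelementptr(%VarName.t, ptr addrspace(3) @VarName, i32 0, i32 0)
//   @b -> getelementptr(%VarName.t, ptr addrspace(3) @VarName, i32 0, i32 1)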
| 1293 | |
| 1294 | LLVMContext &Ctx = M.getContext(); |
| 1295 | const DataLayout &DL = M.getDataLayout(); |
| 1296 | assert(!LDSVarsToTransform.empty()); |
| 1297 | |
| 1298 | SmallVector<OptimizedStructLayoutField, 8> LayoutFields; |
| 1299 | LayoutFields.reserve(N: LDSVarsToTransform.size()); |
| 1300 | { |
// Sort by name so the field order is deterministic. The order of the
// incoming set depends on how the variables were identified, which can
// change and cause spurious test breakage.
| 1304 | auto Sorted = sortByName(V: std::vector<GlobalVariable *>( |
| 1305 | LDSVarsToTransform.begin(), LDSVarsToTransform.end())); |
| 1306 | |
| 1307 | for (GlobalVariable *GV : Sorted) { |
| 1308 | OptimizedStructLayoutField F(GV, |
| 1309 | DL.getTypeAllocSize(Ty: GV->getValueType()), |
| 1310 | AMDGPU::getAlign(DL, GV)); |
| 1311 | LayoutFields.emplace_back(Args&: F); |
| 1312 | } |
| 1313 | } |
| 1314 | |
| 1315 | performOptimizedStructLayout(Fields: LayoutFields); |
| 1316 | |
| 1317 | std::vector<GlobalVariable *> LocalVars; |
| 1318 | BitVector IsPaddingField; |
| 1319 | LocalVars.reserve(n: LDSVarsToTransform.size()); // will be at least this large |
| 1320 | IsPaddingField.reserve(N: LDSVarsToTransform.size()); |
| 1321 | { |
| 1322 | uint64_t CurrentOffset = 0; |
| 1323 | for (auto &F : LayoutFields) { |
| 1324 | GlobalVariable *FGV = |
| 1325 | static_cast<GlobalVariable *>(const_cast<void *>(F.Id)); |
| 1326 | Align DataAlign = F.Alignment; |
| 1327 | |
| 1328 | uint64_t DataAlignV = DataAlign.value(); |
| 1329 | if (uint64_t Rem = CurrentOffset % DataAlignV) { |
| 1330 | uint64_t Padding = DataAlignV - Rem; |
| 1331 | |
// Append an array of padding bytes to meet the requested alignment.
// Note: (o + (a - (o % a))) % a == 0, i.e.
//       (CurrentOffset + Padding) % DataAlignV == 0
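// For example, with CurrentOffset == 6 and DataAlignV == 8, Rem == 6 and
// Padding == 2, so the next field starts at offset 8.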
| 1335 | |
| 1336 | Type *ATy = ArrayType::get(ElementType: Type::getInt8Ty(C&: Ctx), NumElements: Padding); |
| 1337 | LocalVars.push_back(x: new GlobalVariable( |
| 1338 | M, ATy, false, GlobalValue::InternalLinkage, |
| 1339 | PoisonValue::get(T: ATy), "" , nullptr, GlobalValue::NotThreadLocal, |
| 1340 | AMDGPUAS::LOCAL_ADDRESS, false)); |
| 1341 | IsPaddingField.push_back(Val: true); |
| 1342 | CurrentOffset += Padding; |
| 1343 | } |
| 1344 | |
| 1345 | LocalVars.push_back(x: FGV); |
| 1346 | IsPaddingField.push_back(Val: false); |
| 1347 | CurrentOffset += F.Size; |
| 1348 | } |
| 1349 | } |
| 1350 | |
| 1351 | std::vector<Type *> LocalVarTypes; |
| 1352 | LocalVarTypes.reserve(n: LocalVars.size()); |
| 1353 | std::transform( |
| 1354 | first: LocalVars.cbegin(), last: LocalVars.cend(), result: std::back_inserter(x&: LocalVarTypes), |
| 1355 | unary_op: [](const GlobalVariable *V) -> Type * { return V->getValueType(); }); |
| 1356 | |
| 1357 | StructType *LDSTy = StructType::create(Context&: Ctx, Elements: LocalVarTypes, Name: VarName + ".t" ); |
| 1358 | |
| 1359 | Align StructAlign = AMDGPU::getAlign(DL, GV: LocalVars[0]); |
| 1360 | |
| 1361 | GlobalVariable *SGV = new GlobalVariable( |
| 1362 | M, LDSTy, false, GlobalValue::InternalLinkage, PoisonValue::get(T: LDSTy), |
| 1363 | VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, |
| 1364 | false); |
| 1365 | SGV->setAlignment(StructAlign); |
| 1366 | |
| 1367 | DenseMap<GlobalVariable *, Constant *> Map; |
| 1368 | Type *I32 = Type::getInt32Ty(C&: Ctx); |
| 1369 | for (size_t I = 0; I < LocalVars.size(); I++) { |
| 1370 | GlobalVariable *GV = LocalVars[I]; |
| 1371 | Constant *GEPIdx[] = {ConstantInt::get(Ty: I32, V: 0), ConstantInt::get(Ty: I32, V: I)}; |
| 1372 | Constant *GEP = ConstantExpr::getGetElementPtr(Ty: LDSTy, C: SGV, IdxList: GEPIdx, NW: true); |
| 1373 | if (IsPaddingField[I]) { |
| 1374 | assert(GV->use_empty()); |
| 1375 | GV->eraseFromParent(); |
| 1376 | } else { |
| 1377 | Map[GV] = GEP; |
| 1378 | } |
| 1379 | } |
| 1380 | assert(Map.size() == LDSVarsToTransform.size()); |
| 1381 | return {.SGV: SGV, .LDSVarsToConstantGEP: std::move(Map)}; |
| 1382 | } |
| 1383 | |
| 1384 | template <typename PredicateTy> |
| 1385 | static void replaceLDSVariablesWithStruct( |
| 1386 | Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg, |
| 1387 | const LDSVariableReplacement &Replacement, PredicateTy Predicate) { |
| 1388 | LLVMContext &Ctx = M.getContext(); |
| 1389 | const DataLayout &DL = M.getDataLayout(); |
| 1390 | |
// A hack: the aliasing info needs to be inserted in a predictable order for
// lit tests. It would be better to have the variables in a stable order
// already, ideally the same order in which they are allocated, which might
// mean using an ordered set container.
| 1394 | auto LDSVarsToTransform = sortByName(V: std::vector<GlobalVariable *>( |
| 1395 | LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end())); |
| 1396 | |
// Create the alias.scope metadata and the corresponding scope lists. Each
// field in the new structure does not alias any of the other fields.
| 1399 | SmallVector<MDNode *> AliasScopes; |
| 1400 | SmallVector<Metadata *> NoAliasList; |
| 1401 | const size_t NumberVars = LDSVarsToTransform.size(); |
| 1402 | if (NumberVars > 1) { |
| 1403 | MDBuilder MDB(Ctx); |
| 1404 | AliasScopes.reserve(N: NumberVars); |
| 1405 | MDNode *Domain = MDB.createAnonymousAliasScopeDomain(); |
| 1406 | for (size_t I = 0; I < NumberVars; I++) { |
| 1407 | MDNode *Scope = MDB.createAnonymousAliasScope(Domain); |
| 1408 | AliasScopes.push_back(Elt: Scope); |
| 1409 | } |
| 1410 | NoAliasList.append(in_start: &AliasScopes[1], in_end: AliasScopes.end()); |
| 1411 | } |
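// Sketch of the resulting metadata (illustrative; the actual node numbers
// will differ): with two variables this creates a domain !0 and the scopes
//   !1 = distinct !{!1, !0}
//   !2 = distinct !{!2, !0}
// and accesses through the first field are tagged with !alias.scope !{!1}
// and !noalias !{!2}, and vice versa for the second field.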
| 1412 | |
// Replace uses of the i-th variable with a ConstantExpr GEP to the
// corresponding field of the struct instance that AMDGPUMachineFunction
// will allocate.
| 1415 | for (size_t I = 0; I < NumberVars; I++) { |
| 1416 | GlobalVariable *GV = LDSVarsToTransform[I]; |
| 1417 | Constant *GEP = Replacement.LDSVarsToConstantGEP.at(Val: GV); |
| 1418 | |
| 1419 | GV->replaceUsesWithIf(New: GEP, ShouldReplace: Predicate); |
| 1420 | |
| 1421 | APInt APOff(DL.getIndexTypeSizeInBits(Ty: GEP->getType()), 0); |
| 1422 | GEP->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: APOff); |
| 1423 | uint64_t Offset = APOff.getZExtValue(); |
| 1424 | |
| 1425 | Align A = |
| 1426 | commonAlignment(A: Replacement.SGV->getAlign().valueOrOne(), Offset); |
| 1427 | |
| 1428 | if (I) |
| 1429 | NoAliasList[I - 1] = AliasScopes[I - 1]; |
| 1430 | MDNode *NoAlias = |
| 1431 | NoAliasList.empty() ? nullptr : MDNode::get(Context&: Ctx, MDs: NoAliasList); |
| 1432 | MDNode *AliasScope = |
| 1433 | AliasScopes.empty() ? nullptr : MDNode::get(Context&: Ctx, MDs: {AliasScopes[I]}); |
| 1434 | |
| 1435 | refineUsesAlignmentAndAA(Ptr: GEP, A, DL, AliasScope, NoAlias); |
| 1436 | } |
| 1437 | } |
| 1438 | |
| 1439 | static void refineUsesAlignmentAndAA(Value *Ptr, Align A, |
| 1440 | const DataLayout &DL, MDNode *AliasScope, |
| 1441 | MDNode *NoAlias, unsigned MaxDepth = 5) { |
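// Nothing useful to do once the recursion budget is exhausted, or if there
// is neither an alignment greater than one nor alias scope metadata to
// propagate to the users.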
| 1442 | if (!MaxDepth || (A == 1 && !AliasScope)) |
| 1443 | return; |
| 1444 | |
| 1445 | ScopedNoAliasAAResult ScopedNoAlias; |
| 1446 | |
| 1447 | for (User *U : Ptr->users()) { |
| 1448 | if (auto *I = dyn_cast<Instruction>(Val: U)) { |
| 1449 | if (AliasScope && I->mayReadOrWriteMemory()) { |
| 1450 | MDNode *AS = I->getMetadata(KindID: LLVMContext::MD_alias_scope); |
| 1451 | AS = (AS ? MDNode::getMostGenericAliasScope(A: AS, B: AliasScope) |
| 1452 | : AliasScope); |
| 1453 | I->setMetadata(KindID: LLVMContext::MD_alias_scope, Node: AS); |
| 1454 | |
| 1455 | MDNode *NA = I->getMetadata(KindID: LLVMContext::MD_noalias); |
| 1456 | |
// Scoped aliases can originate from two different domains.
// The first domain is the LDS domain created by this pass; all
// entries (LDS variables) in the LDS struct share that domain.

// The second domain can be pre-existing scoped aliases that are the
// result of noalias parameters and of subsequent optimizations that
// may alter these sets.

// We need to be careful how new alias sets are created so that
// loads/stores of the new LDS variables carry the right scopes and
// domains. The NoAlias sets are intersected if they belong to the
// same domain. This is the case when, for example, a memcpy uses LDS
// variables: both the source and destination belong to the LDS
// struct, so they do not alias.
// On the other hand, if one domain is the LDS domain and the other
// existed before this pass, a union (concatenation) of the sets is
// taken to preserve the existing aliasing information.
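// For instance (illustrative): when a memcpy between two fields of the LDS
// struct is processed for its second pointer operand, the instruction
// already carries noalias scopes from the LDS domain, so the sets are
// intersected; a store through a noalias parameter into an LDS field
// carries scopes from two different domains, so the sets are concatenated
// to keep both pieces of information.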
| 1474 | |
| 1475 | SmallPtrSet<const MDNode *, 16> ExistingDomains, LDSDomains; |
| 1476 | ScopedNoAlias.collectScopedDomains(NoAlias: NA, Domains&: ExistingDomains); |
| 1477 | ScopedNoAlias.collectScopedDomains(NoAlias, Domains&: LDSDomains); |
| 1478 | auto Intersection = set_intersection(S1: ExistingDomains, S2: LDSDomains); |
| 1479 | if (Intersection.empty()) { |
| 1480 | NA = NA ? MDNode::concatenate(A: NA, B: NoAlias) : NoAlias; |
| 1481 | } else { |
| 1482 | NA = NA ? MDNode::intersect(A: NA, B: NoAlias) : NoAlias; |
| 1483 | } |
| 1484 | I->setMetadata(KindID: LLVMContext::MD_noalias, Node: NA); |
| 1485 | } |
| 1486 | } |
| 1487 | |
| 1488 | if (auto *LI = dyn_cast<LoadInst>(Val: U)) { |
| 1489 | LI->setAlignment(std::max(a: A, b: LI->getAlign())); |
| 1490 | continue; |
| 1491 | } |
| 1492 | if (auto *SI = dyn_cast<StoreInst>(Val: U)) { |
| 1493 | if (SI->getPointerOperand() == Ptr) |
| 1494 | SI->setAlignment(std::max(a: A, b: SI->getAlign())); |
| 1495 | continue; |
| 1496 | } |
| 1497 | if (auto *AI = dyn_cast<AtomicRMWInst>(Val: U)) { |
// No atomicrmw operation can currently operate on pointers, but check the
// operand anyway in case that changes or a ConstantExpr is being
// processed.
| 1500 | if (AI->getPointerOperand() == Ptr) |
| 1501 | AI->setAlignment(std::max(a: A, b: AI->getAlign())); |
| 1502 | continue; |
| 1503 | } |
| 1504 | if (auto *AI = dyn_cast<AtomicCmpXchgInst>(Val: U)) { |
| 1505 | if (AI->getPointerOperand() == Ptr) |
| 1506 | AI->setAlignment(std::max(a: A, b: AI->getAlign())); |
| 1507 | continue; |
| 1508 | } |
| 1509 | if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: U)) { |
| 1510 | unsigned BitWidth = DL.getIndexTypeSizeInBits(Ty: GEP->getType()); |
| 1511 | APInt Off(BitWidth, 0); |
| 1512 | if (GEP->getPointerOperand() == Ptr) { |
| 1513 | Align GA; |
| 1514 | if (GEP->accumulateConstantOffset(DL, Offset&: Off)) |
| 1515 | GA = commonAlignment(A, Offset: Off.getLimitedValue()); |
| 1516 | refineUsesAlignmentAndAA(Ptr: GEP, A: GA, DL, AliasScope, NoAlias, |
| 1517 | MaxDepth: MaxDepth - 1); |
| 1518 | } |
| 1519 | continue; |
| 1520 | } |
| 1521 | if (auto *I = dyn_cast<Instruction>(Val: U)) { |
| 1522 | if (I->getOpcode() == Instruction::BitCast || |
| 1523 | I->getOpcode() == Instruction::AddrSpaceCast) |
| 1524 | refineUsesAlignmentAndAA(Ptr: I, A, DL, AliasScope, NoAlias, MaxDepth: MaxDepth - 1); |
| 1525 | } |
| 1526 | } |
| 1527 | } |
| 1528 | }; |
| 1529 | |
| 1530 | class AMDGPULowerModuleLDSLegacy : public ModulePass { |
| 1531 | public: |
| 1532 | const AMDGPUTargetMachine *TM; |
| 1533 | static char ID; |
| 1534 | |
| 1535 | AMDGPULowerModuleLDSLegacy(const AMDGPUTargetMachine *TM = nullptr) |
| 1536 | : ModulePass(ID), TM(TM) {} |
| 1537 | |
| 1538 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 1539 | if (!TM) |
| 1540 | AU.addRequired<TargetPassConfig>(); |
| 1541 | } |
| 1542 | |
| 1543 | bool runOnModule(Module &M) override { |
| 1544 | if (!TM) { |
| 1545 | auto &TPC = getAnalysis<TargetPassConfig>(); |
| 1546 | TM = &TPC.getTM<AMDGPUTargetMachine>(); |
| 1547 | } |
| 1548 | |
| 1549 | return AMDGPULowerModuleLDS(*TM).runOnModule(M); |
| 1550 | } |
| 1551 | }; |
| 1552 | |
| 1553 | } // namespace |
| 1554 | char AMDGPULowerModuleLDSLegacy::ID = 0; |
| 1555 | |
| 1556 | char &llvm::AMDGPULowerModuleLDSLegacyPassID = AMDGPULowerModuleLDSLegacy::ID; |
| 1557 | |
| 1558 | INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE, |
| 1559 | "Lower uses of LDS variables from non-kernel functions" , |
| 1560 | false, false) |
| 1561 | INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) |
| 1562 | INITIALIZE_PASS_END(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE, |
| 1563 | "Lower uses of LDS variables from non-kernel functions" , |
| 1564 | false, false) |
| 1565 | |
| 1566 | ModulePass * |
| 1567 | llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) { |
| 1568 | return new AMDGPULowerModuleLDSLegacy(TM); |
| 1569 | } |
| 1570 | |
| 1571 | PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M, |
| 1572 | ModuleAnalysisManager &) { |
| 1573 | return AMDGPULowerModuleLDS(TM).runOnModule(M) ? PreservedAnalyses::none() |
| 1574 | : PreservedAnalyses::all(); |
| 1575 | } |
| 1576 | |