//===-- AMDGPUPromoteKernelArguments.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass recursively promotes generic pointer arguments of a kernel
/// into the global address space.
///
/// The pass walks a kernel's pointer arguments and the loads from them. If a
/// loaded value is a pointer and the memory it was loaded from is not
/// modified in the kernel before the load, the loaded pointer is promoted to
/// the global address space, and the process continues recursively.
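///
/// For example (an illustrative sketch, not taken from an actual test): given
/// a kernel argument `ptr %arg` and an unclobbered load
/// `%p = load ptr, ptr %arg`, the load is tagged with `!amdgpu.noclobber`
/// and `%p` is cast to `ptr addrspace(1)` and back to flat, letting the
/// InferAddressSpaces pass rewrite the users of `%p` into global accesses.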
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMemoryUtils.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-promote-kernel-arguments"

using namespace llvm;

namespace {

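// Legacy pass wrapper. Its run() method holds the actual logic and is also
// invoked by the new pass manager wrapper at the end of this file.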
class AMDGPUPromoteKernelArguments : public FunctionPass {
  MemorySSA *MSSA;

  AliasAnalysis *AA;

  // Insertion point for the addrspacecasts of the kernel arguments themselves.
  Instruction *ArgCastInsertPt;

  // Worklist of pointers that are candidates for promotion.
  SmallVector<Value *> Ptrs;

  void enqueueUsers(Value *Ptr);

  bool promotePointer(Value *Ptr);

  bool promoteLoad(LoadInst *LI);

public:
  static char ID;

  AMDGPUPromoteKernelArguments() : FunctionPass(ID) {}

  bool run(Function &F, MemorySSA &MSSA, AliasAnalysis &AA);

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<MemorySSAWrapperPass>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

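// Collect loads through Ptr that are candidates for promotion: a load
// qualifies if its address is Ptr itself (possibly behind in-bounds GEPs and
// address space or bit casts) and MemorySSA proves the loaded memory is not
// clobbered in the kernel before the load.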
void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
  SmallVector<User *> PtrUsers(Ptr->users());

  while (!PtrUsers.empty()) {
    Instruction *U = dyn_cast<Instruction>(PtrUsers.pop_back_val());
    if (!U)
      continue;

    switch (U->getOpcode()) {
    default:
      break;
    case Instruction::Load: {
      LoadInst *LD = cast<LoadInst>(U);
      if (LD->getPointerOperand()->stripInBoundsOffsets() == Ptr &&
          !AMDGPU::isClobberedInFunction(LD, MSSA, AA))
        Ptrs.push_back(LD);

      break;
    }
    case Instruction::GetElementPtr:
    case Instruction::AddrSpaceCast:
    case Instruction::BitCast:
      if (U->getOperand(0)->stripInBoundsOffsets() == Ptr)
        PtrUsers.append(U->user_begin(), U->user_end());
      break;
    }
  }
}

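// Promote a single candidate pointer. If Ptr is itself a load, it is first
// marked as unclobbered; users of flat, global, or constant pointers are then
// scanned for further candidates. Finally, a flat pointer is cast to the
// global address space and back, leaving InferAddressSpaces to fold the round
// trip into Ptr's users.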
bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
  bool Changed = false;

  LoadInst *LI = dyn_cast<LoadInst>(Ptr);
  if (LI)
    Changed |= promoteLoad(LI);

  PointerType *PT = dyn_cast<PointerType>(Ptr->getType());
  if (!PT)
    return Changed;

  if (PT->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
      PT->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
      PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
    enqueueUsers(Ptr);

  if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
    return Changed;

  IRBuilder<> B(LI ? &*std::next(cast<Instruction>(Ptr)->getIterator())
                   : ArgCastInsertPt);

  // Cast the pointer to the global address space and back to flat, and let
  // the InferAddressSpaces pass do all the necessary rewriting.
  PointerType *NewPT =
      PointerType::get(PT->getContext(), AMDGPUAS::GLOBAL_ADDRESS);
  Value *Cast =
      B.CreateAddrSpaceCast(Ptr, NewPT, Twine(Ptr->getName(), ".global"));
  Value *CastBack =
      B.CreateAddrSpaceCast(Cast, PT, Twine(Ptr->getName(), ".flat"));
  Ptr->replaceUsesWithIf(CastBack,
                         [Cast](Use &U) { return U.getUser() != Cast; });

  return true;
}

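// Mark a simple (non-volatile, non-atomic) load as not clobbered by any store
// in the kernel before it executes; the backend uses this, e.g., to select
// scalar loads for uniform addresses.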
bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) {
  if (!LI->isSimple())
    return false;

  LI->setMetadata("amdgpu.noclobber", MDNode::get(LI->getContext(), {}));
  return true;
}

// Skip allocas when choosing the insertion point for the argument casts.
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);

    // If this is a dynamic alloca, the value may depend on the loaded
    // kernargs, so loads will need to be inserted before it.
    if (!AI || !AI->isStaticAlloca())
      break;
  }

  return InsPt;
}

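// Core of the pass: seed the worklist with pointer arguments in the flat,
// global, or constant address spaces and iterate until no candidates remain.
// Only entry functions (amdgpu_kernel) are processed.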
bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA,
                                       AliasAnalysis &AA) {
  if (skipFunction(F))
    return false;

  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  ArgCastInsertPt = &*getInsertPt(*F.begin());
  this->MSSA = &MSSA;
  this->AA = &AA;

  for (Argument &Arg : F.args()) {
    if (Arg.use_empty())
      continue;

    PointerType *PT = dyn_cast<PointerType>(Arg.getType());
    if (!PT || (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
                PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
                PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS))
      continue;

    Ptrs.push_back(&Arg);
  }

  bool Changed = false;
  while (!Ptrs.empty()) {
    Value *Ptr = Ptrs.pop_back_val();
    Changed |= promotePointer(Ptr);
  }

  return Changed;
}

bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) {
  MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  return run(F, MSSA, AA);
}

INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
                      "AMDGPU Promote Kernel Arguments", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
                    "AMDGPU Promote Kernel Arguments", false, false)

char AMDGPUPromoteKernelArguments::ID = 0;

FunctionPass *llvm::createAMDGPUPromoteKernelArgumentsPass() {
  return new AMDGPUPromoteKernelArguments();
}

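// With the new pass manager, this pass can be exercised in isolation via,
// e.g., `opt -passes=amdgpu-promote-kernel-arguments`, assuming the pass is
// registered under its DEBUG_TYPE name in the target's pass registry.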
PreservedAnalyses
AMDGPUPromoteKernelArgumentsPass::run(Function &F,
                                      FunctionAnalysisManager &AM) {
  MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  if (AMDGPUPromoteKernelArguments().run(F, MSSA, AA)) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    PA.preserve<MemorySSAAnalysis>();
    return PA;
  }
  return PreservedAnalyses::all();
}