//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
//
// - they refer to the same vaddr except for sample_id,
// - they use a constant sample_id and they fall into the same group,
// - they have the same dmask and the number of intrinsics and the number of
//   vaddr/vdata dword transfers is reduced by the combine.
//
// Examples for the tradeoff (all are assuming 2DMsaa for vaddr):
//
// +----------+-----+-----+-------+---------+------------+---------+----------+
// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
// | (dmask)  |     |     |       | vdata   |            | vdata   |          |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
//
// Some cases are of questionable benefit, like the one marked with "yes?"
// above: fewer intrinsics, fewer vaddr dwords and fewer total transfers
// between SP and TX, but more vdata dwords. We start by erring on the side of
// converting these to MSAA_LOAD.
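//
// For example, the "yes?" row trades two loads moving 2 * 3 = 6 vaddr dwords
// and 2 * 1 = 2 vdata dwords for a single msaa_load moving 3 vaddr dwords but
// a full 4 vdata dwords (one per sample of the group), i.e. fewer
// instructions and vaddr transfers at the cost of extra vdata bandwidth.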
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
//  call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
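// A sketch of the multi-channel case (here dmask=3 is assumed, i.e. two
// channels): one msaa_load is emitted per set dmask bit, e.g.
//  call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
// and each original <2 x float> result is rebuilt with extractelement /
// insertelement from lane (sample_id % 4) of each new call.
//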
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

namespace {
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;

}; // End of class AMDGPUImageIntrinsicOptimizer
} // End anonymous namespace

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;

void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim.
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check all arguments (DMask, VAddr, RSrc etc).
    bool AllEqual = true;
    assert(IIList.front()->arg_size() == II->arg_size());
    for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
      Value *ArgList = IIList.front()->getArgOperand(I);
      Value *Arg = II->getArgOperand(I);
      if (I == ImageDimIntr->VAddrEnd - 1) {
        // Check FragId group.
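        // Sample ids come in groups of four (0-3, 4-7, ...); a single
        // image_msaa_load returns all four samples of one such group for one
        // channel, so only loads whose fragids land in the same group can be
        // merged.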
        auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
        auto FragId = cast<ConstantInt>(II->getArgOperand(I));
        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
      } else {
        // Check all arguments except FragId.
        AllEqual = ArgList == Arg;
      }
    }
    if (!AllEqual)
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // Similar instruction not found, so add a new list.
  MergeableInsts.emplace_back(1, II);
  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
}

// Collect list of all instructions we know how to merge in a subset of the
// block. It returns an iterator to the instruction after the last one
// analyzed.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // Don't combine if there is a store in the middle or if there is a memory
    // barrier.
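    // End the current mergeable section at such an instruction so that no
    // image load is combined across it; scanning resumes with a fresh section
    // right after it.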
    if (I->mayHaveSideEffects()) {
      ++I;
      break;
    }

    // Ignore non-intrinsics.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();

      // Ignore other intrinsics.
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // Check for constant FragId.
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }

  return I;
}

bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // Number of instructions and the number of vaddr/vdata dword transfers
    // should be reduced.
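    // Cost model mirroring the table in the file header: each 2DMsaa load is
    // counted as 3 vaddr dwords, its vdata as the dmask channel count (halved
    // and rounded up for d16); each msaa_load returns all 4 samples of a
    // single channel.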
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;
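    // Round the fragid down to the base of its group of four; the msaa_load
    // emitted below then covers sample ids NewFragIdVal..NewFragIdVal+3.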

    // Create the new instructions.
    IRBuilder<> B(IIList.front());

    // Create the new image_msaa_load intrinsic.
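    // One msaa_load is emitted per set bit of the dmask; call I in NewCalls
    // holds the four samples of the I-th requested channel.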
    SmallVector<Instruction *, 4> NewCalls;
    while (DMaskVal != 0) {
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Function *NewIntrin = Intrinsic::getDeclaration(
          IIList.front()->getModule(), NewIntrinID, OverloadTys);
      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateCall(NewIntrin, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Create the new extractelement instructions.
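    // Rebuild each original result: channel I of a load with sample id S is
    // lane (S % 4) of NewCalls[I].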
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        VecOp = UndefValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (auto I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
    return false;

  Module *M = F.getParent();

  // Early test to determine if the intrinsics are used.
  if (llvm::none_of(*M, [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  bool Modified = false;
  for (auto &BB : F) {
    BasicBlock::iterator SectionEnd;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
         I = SectionEnd) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
      Modified |= optimizeSection(MergeableInsts);
    }
  }

  return Modified;
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}

PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}