1 | //===- SILoadStoreOptimizer.cpp -------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
// This pass tries to fuse DS instructions with nearby immediate offsets.
10 | // This will fuse operations such as |
11 | // ds_read_b32 v0, v2 offset:16 |
12 | // ds_read_b32 v1, v2 offset:32 |
13 | // ==> |
14 | // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 |
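// (The read2/write2 offsets are encoded in units of the element size, so the
// byte offsets 16 and 32 above become offset0:4 and offset1:8 for b32
// elements.)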
15 | // |
16 | // The same is done for certain SMEM and VMEM opcodes, e.g.: |
17 | // s_buffer_load_dword s4, s[0:3], 4 |
18 | // s_buffer_load_dword s5, s[0:3], 8 |
19 | // ==> |
20 | // s_buffer_load_dwordx2 s[4:5], s[0:3], 4 |
21 | // |
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from a nearby instruction that
// allows the access to have a 13-bit constant offset, which is then promoted
// to the immediate field.
26 | // E.g. |
27 | // s_movk_i32 s0, 0x1800 |
28 | // v_add_co_u32_e32 v0, vcc, s0, v2 |
29 | // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc |
30 | // |
31 | // s_movk_i32 s0, 0x1000 |
32 | // v_add_co_u32_e32 v5, vcc, s0, v2 |
33 | // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc |
34 | // global_load_dwordx2 v[5:6], v[5:6], off |
35 | // global_load_dwordx2 v[0:1], v[0:1], off |
36 | // => |
37 | // s_movk_i32 s0, 0x1000 |
38 | // v_add_co_u32_e32 v5, vcc, s0, v2 |
39 | // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc |
40 | // global_load_dwordx2 v[5:6], v[5:6], off |
41 | // global_load_dwordx2 v[0:1], v[5:6], off offset:2048 |
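//
// Here the second base (0x1800) is re-expressed in terms of the 0x1000 base
// that is already available, and the difference 0x1800 - 0x1000 = 0x800
// (2048) is folded into the immediate offset of the second load.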
42 | // |
43 | // Future improvements: |
44 | // |
// - This currently misses stores of constants because the load of the
//   constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
48 | // |
49 | // - Live interval recomputing seems inefficient. This currently only matches |
50 | // one pair, and recomputes live intervals and moves on to the next pair. It |
51 | // would be better to compute a list of all merges that need to occur. |
52 | // |
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset field, but are close enough together, we can add to the base
//   pointer and use the new, reduced offsets.
57 | // |
58 | //===----------------------------------------------------------------------===// |
59 | |
60 | #include "AMDGPU.h" |
61 | #include "GCNSubtarget.h" |
62 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
63 | #include "llvm/Analysis/AliasAnalysis.h" |
64 | #include "llvm/CodeGen/MachineFunctionPass.h" |
65 | #include "llvm/InitializePasses.h" |
66 | |
67 | using namespace llvm; |
68 | |
69 | #define DEBUG_TYPE "si-load-store-opt" |
70 | |
71 | namespace { |
72 | enum InstClassEnum { |
73 | UNKNOWN, |
74 | DS_READ, |
75 | DS_WRITE, |
76 | S_BUFFER_LOAD_IMM, |
77 | S_BUFFER_LOAD_SGPR_IMM, |
78 | S_LOAD_IMM, |
79 | BUFFER_LOAD, |
80 | BUFFER_STORE, |
81 | MIMG, |
82 | TBUFFER_LOAD, |
83 | TBUFFER_STORE, |
84 | GLOBAL_LOAD_SADDR, |
85 | GLOBAL_STORE_SADDR, |
86 | FLAT_LOAD, |
87 | FLAT_STORE, |
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo; they are only ever returned by
               // getCommonInstClass.
91 | }; |
92 | |
93 | struct AddressRegs { |
94 | unsigned char NumVAddrs = 0; |
95 | bool SBase = false; |
96 | bool SRsrc = false; |
97 | bool SOffset = false; |
98 | bool SAddr = false; |
99 | bool VAddr = false; |
100 | bool Addr = false; |
101 | bool SSamp = false; |
102 | }; |
103 | |
104 | // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. |
105 | const unsigned MaxAddressRegs = 12 + 1 + 1; |
106 | |
107 | class SILoadStoreOptimizer : public MachineFunctionPass { |
108 | struct CombineInfo { |
109 | MachineBasicBlock::iterator I; |
110 | unsigned EltSize; |
111 | unsigned Offset; |
112 | unsigned Width; |
113 | unsigned Format; |
114 | unsigned BaseOff; |
115 | unsigned DMask; |
116 | InstClassEnum InstClass; |
117 | unsigned CPol = 0; |
118 | bool IsAGPR; |
119 | bool UseST64; |
120 | int AddrIdx[MaxAddressRegs]; |
121 | const MachineOperand *AddrReg[MaxAddressRegs]; |
122 | unsigned NumAddresses; |
123 | unsigned Order; |
124 | |
125 | bool hasSameBaseAddress(const CombineInfo &CI) { |
126 | if (NumAddresses != CI.NumAddresses) |
127 | return false; |
128 | |
129 | const MachineInstr &MI = *CI.I; |
130 | for (unsigned i = 0; i < NumAddresses; i++) { |
131 | const MachineOperand &AddrRegNext = MI.getOperand(i: AddrIdx[i]); |
132 | |
133 | if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { |
134 | if (AddrReg[i]->isImm() != AddrRegNext.isImm() || |
135 | AddrReg[i]->getImm() != AddrRegNext.getImm()) { |
136 | return false; |
137 | } |
138 | continue; |
139 | } |
140 | |
141 | // Check same base pointer. Be careful of subregisters, which can occur |
142 | // with vectors of pointers. |
143 | if (AddrReg[i]->getReg() != AddrRegNext.getReg() || |
144 | AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { |
145 | return false; |
146 | } |
147 | } |
148 | return true; |
149 | } |
150 | |
151 | bool hasMergeableAddress(const MachineRegisterInfo &MRI) { |
152 | for (unsigned i = 0; i < NumAddresses; ++i) { |
153 | const MachineOperand *AddrOp = AddrReg[i]; |
154 | // Immediates are always OK. |
155 | if (AddrOp->isImm()) |
156 | continue; |
157 | |
        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
161 | if (!AddrOp->isReg()) |
162 | return false; |
163 | |
164 | // TODO: We should be able to merge instructions with other physical reg |
165 | // addresses too. |
166 | if (AddrOp->getReg().isPhysical() && |
167 | AddrOp->getReg() != AMDGPU::SGPR_NULL) |
168 | return false; |
169 | |
170 | // If an address has only one use then there will be no other |
171 | // instructions with the same address, so we can't merge this one. |
172 | if (MRI.hasOneNonDBGUse(RegNo: AddrOp->getReg())) |
173 | return false; |
174 | } |
175 | return true; |
176 | } |
177 | |
178 | void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); |
179 | |
180 | // Compare by pointer order. |
181 | bool operator<(const CombineInfo& Other) const { |
182 | return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset; |
183 | } |
184 | }; |
185 | |
186 | struct BaseRegisters { |
187 | Register LoReg; |
188 | Register HiReg; |
189 | |
190 | unsigned LoSubReg = 0; |
191 | unsigned HiSubReg = 0; |
192 | }; |
193 | |
194 | struct MemAddress { |
195 | BaseRegisters Base; |
196 | int64_t Offset = 0; |
197 | }; |
198 | |
199 | using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; |
200 | |
201 | private: |
202 | const GCNSubtarget *STM = nullptr; |
203 | const SIInstrInfo *TII = nullptr; |
204 | const SIRegisterInfo *TRI = nullptr; |
205 | MachineRegisterInfo *MRI = nullptr; |
206 | AliasAnalysis *AA = nullptr; |
207 | bool OptimizeAgain; |
208 | |
209 | bool canSwapInstructions(const DenseSet<Register> &ARegDefs, |
210 | const DenseSet<Register> &ARegUses, |
211 | const MachineInstr &A, const MachineInstr &B) const; |
212 | static bool dmasksCanBeCombined(const CombineInfo &CI, |
213 | const SIInstrInfo &TII, |
214 | const CombineInfo &Paired); |
215 | static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, |
216 | CombineInfo &Paired, bool Modify = false); |
217 | static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, |
218 | const CombineInfo &Paired); |
219 | unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); |
220 | static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, |
221 | const CombineInfo &Paired); |
222 | const TargetRegisterClass * |
223 | getTargetRegisterClass(const CombineInfo &CI, |
224 | const CombineInfo &Paired) const; |
225 | const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; |
226 | |
227 | CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); |
228 | |
229 | void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired, |
230 | MachineBasicBlock::iterator InsertBefore, int OpName, |
231 | Register DestReg) const; |
232 | Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, |
233 | MachineBasicBlock::iterator InsertBefore, |
234 | int OpName) const; |
235 | |
236 | unsigned read2Opcode(unsigned EltSize) const; |
237 | unsigned read2ST64Opcode(unsigned EltSize) const; |
238 | MachineBasicBlock::iterator |
239 | mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, |
240 | MachineBasicBlock::iterator InsertBefore); |
241 | |
242 | unsigned write2Opcode(unsigned EltSize) const; |
243 | unsigned write2ST64Opcode(unsigned EltSize) const; |
244 | MachineBasicBlock::iterator |
245 | mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, |
246 | MachineBasicBlock::iterator InsertBefore); |
247 | MachineBasicBlock::iterator |
248 | mergeImagePair(CombineInfo &CI, CombineInfo &Paired, |
249 | MachineBasicBlock::iterator InsertBefore); |
250 | MachineBasicBlock::iterator |
251 | mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired, |
252 | MachineBasicBlock::iterator InsertBefore); |
253 | MachineBasicBlock::iterator |
254 | mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, |
255 | MachineBasicBlock::iterator InsertBefore); |
256 | MachineBasicBlock::iterator |
257 | mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, |
258 | MachineBasicBlock::iterator InsertBefore); |
259 | MachineBasicBlock::iterator |
260 | mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, |
261 | MachineBasicBlock::iterator InsertBefore); |
262 | MachineBasicBlock::iterator |
263 | mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, |
264 | MachineBasicBlock::iterator InsertBefore); |
265 | MachineBasicBlock::iterator |
266 | mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, |
267 | MachineBasicBlock::iterator InsertBefore); |
268 | MachineBasicBlock::iterator |
269 | mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, |
270 | MachineBasicBlock::iterator InsertBefore); |
271 | |
272 | void updateBaseAndOffset(MachineInstr &I, Register NewBase, |
273 | int32_t NewOffset) const; |
274 | Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; |
275 | MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; |
276 | std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; |
277 | void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; |
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
281 | bool promoteConstantOffsetToImm(MachineInstr &CI, |
282 | MemInfoMap &Visited, |
283 | SmallPtrSet<MachineInstr *, 4> &Promoted) const; |
284 | void addInstToMergeableList(const CombineInfo &CI, |
285 | std::list<std::list<CombineInfo> > &MergeableInsts) const; |
286 | |
287 | std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( |
288 | MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, |
289 | MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, |
290 | std::list<std::list<CombineInfo>> &MergeableInsts) const; |
291 | |
292 | static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, |
293 | const CombineInfo &Paired); |
294 | |
295 | static InstClassEnum getCommonInstClass(const CombineInfo &CI, |
296 | const CombineInfo &Paired); |
297 | |
298 | public: |
299 | static char ID; |
300 | |
301 | SILoadStoreOptimizer() : MachineFunctionPass(ID) { |
302 | initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); |
303 | } |
304 | |
305 | bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, |
306 | bool &OptimizeListAgain); |
307 | bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); |
308 | |
309 | bool runOnMachineFunction(MachineFunction &MF) override; |
310 | |
  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
312 | |
313 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
314 | AU.setPreservesCFG(); |
315 | AU.addRequired<AAResultsWrapperPass>(); |
316 | |
317 | MachineFunctionPass::getAnalysisUsage(AU); |
318 | } |
319 | |
320 | MachineFunctionProperties getRequiredProperties() const override { |
321 | return MachineFunctionProperties() |
322 | .set(MachineFunctionProperties::Property::IsSSA); |
323 | } |
324 | }; |
325 | |
326 | static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { |
327 | const unsigned Opc = MI.getOpcode(); |
328 | |
329 | if (TII.isMUBUF(Opcode: Opc)) { |
330 | // FIXME: Handle d16 correctly |
331 | return AMDGPU::getMUBUFElements(Opc); |
332 | } |
333 | if (TII.isImage(MI)) { |
334 | uint64_t DMaskImm = |
335 | TII.getNamedOperand(MI, OpName: AMDGPU::OpName::dmask)->getImm(); |
336 | return llvm::popcount(Value: DMaskImm); |
337 | } |
338 | if (TII.isMTBUF(Opcode: Opc)) { |
339 | return AMDGPU::getMTBUFElements(Opc); |
340 | } |
341 | |
342 | switch (Opc) { |
343 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: |
344 | case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: |
345 | case AMDGPU::S_LOAD_DWORD_IMM: |
346 | case AMDGPU::GLOBAL_LOAD_DWORD: |
347 | case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: |
348 | case AMDGPU::GLOBAL_STORE_DWORD: |
349 | case AMDGPU::GLOBAL_STORE_DWORD_SADDR: |
350 | case AMDGPU::FLAT_LOAD_DWORD: |
351 | case AMDGPU::FLAT_STORE_DWORD: |
352 | return 1; |
353 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: |
354 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: |
355 | case AMDGPU::S_LOAD_DWORDX2_IMM: |
356 | case AMDGPU::S_LOAD_DWORDX2_IMM_ec: |
357 | case AMDGPU::GLOBAL_LOAD_DWORDX2: |
358 | case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: |
359 | case AMDGPU::GLOBAL_STORE_DWORDX2: |
360 | case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: |
361 | case AMDGPU::FLAT_LOAD_DWORDX2: |
362 | case AMDGPU::FLAT_STORE_DWORDX2: |
363 | return 2; |
364 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: |
365 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: |
366 | case AMDGPU::S_LOAD_DWORDX3_IMM: |
367 | case AMDGPU::S_LOAD_DWORDX3_IMM_ec: |
368 | case AMDGPU::GLOBAL_LOAD_DWORDX3: |
369 | case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: |
370 | case AMDGPU::GLOBAL_STORE_DWORDX3: |
371 | case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: |
372 | case AMDGPU::FLAT_LOAD_DWORDX3: |
373 | case AMDGPU::FLAT_STORE_DWORDX3: |
374 | return 3; |
375 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: |
376 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: |
377 | case AMDGPU::S_LOAD_DWORDX4_IMM: |
378 | case AMDGPU::S_LOAD_DWORDX4_IMM_ec: |
379 | case AMDGPU::GLOBAL_LOAD_DWORDX4: |
380 | case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: |
381 | case AMDGPU::GLOBAL_STORE_DWORDX4: |
382 | case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: |
383 | case AMDGPU::FLAT_LOAD_DWORDX4: |
384 | case AMDGPU::FLAT_STORE_DWORDX4: |
385 | return 4; |
386 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: |
387 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: |
388 | case AMDGPU::S_LOAD_DWORDX8_IMM: |
389 | case AMDGPU::S_LOAD_DWORDX8_IMM_ec: |
390 | return 8; |
391 | case AMDGPU::DS_READ_B32: |
392 | case AMDGPU::DS_READ_B32_gfx9: |
393 | case AMDGPU::DS_WRITE_B32: |
394 | case AMDGPU::DS_WRITE_B32_gfx9: |
395 | return 1; |
396 | case AMDGPU::DS_READ_B64: |
397 | case AMDGPU::DS_READ_B64_gfx9: |
398 | case AMDGPU::DS_WRITE_B64: |
399 | case AMDGPU::DS_WRITE_B64_gfx9: |
400 | return 2; |
401 | default: |
402 | return 0; |
403 | } |
404 | } |
405 | |
406 | /// Maps instruction opcode to enum InstClassEnum. |
407 | static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { |
408 | switch (Opc) { |
409 | default: |
410 | if (TII.isMUBUF(Opcode: Opc)) { |
411 | switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { |
412 | default: |
413 | return UNKNOWN; |
414 | case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN: |
415 | case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact: |
416 | case AMDGPU::BUFFER_LOAD_DWORD_IDXEN: |
417 | case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact: |
418 | case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: |
419 | case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: |
420 | case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: |
421 | case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: |
422 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN: |
423 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact: |
424 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN: |
425 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact: |
426 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN: |
427 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact: |
428 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET: |
429 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact: |
430 | return BUFFER_LOAD; |
431 | case AMDGPU::BUFFER_STORE_DWORD_BOTHEN: |
432 | case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact: |
433 | case AMDGPU::BUFFER_STORE_DWORD_IDXEN: |
434 | case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact: |
435 | case AMDGPU::BUFFER_STORE_DWORD_OFFEN: |
436 | case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: |
437 | case AMDGPU::BUFFER_STORE_DWORD_OFFSET: |
438 | case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: |
439 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN: |
440 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact: |
441 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN: |
442 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact: |
443 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN: |
444 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact: |
445 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET: |
446 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact: |
447 | return BUFFER_STORE; |
448 | } |
449 | } |
450 | if (TII.isImage(Opcode: Opc)) { |
451 | // Ignore instructions encoded without vaddr. |
452 | if (!AMDGPU::hasNamedOperand(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr) && |
453 | !AMDGPU::hasNamedOperand(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr0)) |
454 | return UNKNOWN; |
455 | // Ignore BVH instructions |
456 | if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) |
457 | return UNKNOWN; |
458 | // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. |
459 | if (TII.get(Opcode: Opc).mayStore() || !TII.get(Opcode: Opc).mayLoad() || |
460 | TII.isGather4(Opcode: Opc)) |
461 | return UNKNOWN; |
462 | return MIMG; |
463 | } |
464 | if (TII.isMTBUF(Opcode: Opc)) { |
465 | switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { |
466 | default: |
467 | return UNKNOWN; |
468 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: |
469 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: |
470 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: |
471 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: |
472 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: |
473 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: |
474 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: |
475 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: |
476 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN: |
477 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact: |
478 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN: |
479 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact: |
480 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN: |
481 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact: |
482 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET: |
483 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact: |
484 | return TBUFFER_LOAD; |
485 | case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: |
486 | case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: |
487 | case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: |
488 | case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: |
489 | case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN: |
490 | case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact: |
491 | case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET: |
492 | case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact: |
493 | return TBUFFER_STORE; |
494 | } |
495 | } |
496 | return UNKNOWN; |
497 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: |
498 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: |
499 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: |
500 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: |
501 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: |
502 | return S_BUFFER_LOAD_IMM; |
503 | case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: |
504 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: |
505 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: |
506 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: |
507 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: |
508 | return S_BUFFER_LOAD_SGPR_IMM; |
509 | case AMDGPU::S_LOAD_DWORD_IMM: |
510 | case AMDGPU::S_LOAD_DWORDX2_IMM: |
511 | case AMDGPU::S_LOAD_DWORDX3_IMM: |
512 | case AMDGPU::S_LOAD_DWORDX4_IMM: |
513 | case AMDGPU::S_LOAD_DWORDX8_IMM: |
514 | case AMDGPU::S_LOAD_DWORDX2_IMM_ec: |
515 | case AMDGPU::S_LOAD_DWORDX3_IMM_ec: |
516 | case AMDGPU::S_LOAD_DWORDX4_IMM_ec: |
517 | case AMDGPU::S_LOAD_DWORDX8_IMM_ec: |
518 | return S_LOAD_IMM; |
519 | case AMDGPU::DS_READ_B32: |
520 | case AMDGPU::DS_READ_B32_gfx9: |
521 | case AMDGPU::DS_READ_B64: |
522 | case AMDGPU::DS_READ_B64_gfx9: |
523 | return DS_READ; |
524 | case AMDGPU::DS_WRITE_B32: |
525 | case AMDGPU::DS_WRITE_B32_gfx9: |
526 | case AMDGPU::DS_WRITE_B64: |
527 | case AMDGPU::DS_WRITE_B64_gfx9: |
528 | return DS_WRITE; |
529 | case AMDGPU::GLOBAL_LOAD_DWORD: |
530 | case AMDGPU::GLOBAL_LOAD_DWORDX2: |
531 | case AMDGPU::GLOBAL_LOAD_DWORDX3: |
532 | case AMDGPU::GLOBAL_LOAD_DWORDX4: |
533 | case AMDGPU::FLAT_LOAD_DWORD: |
534 | case AMDGPU::FLAT_LOAD_DWORDX2: |
535 | case AMDGPU::FLAT_LOAD_DWORDX3: |
536 | case AMDGPU::FLAT_LOAD_DWORDX4: |
537 | return FLAT_LOAD; |
538 | case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: |
539 | case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: |
540 | case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: |
541 | case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: |
542 | return GLOBAL_LOAD_SADDR; |
543 | case AMDGPU::GLOBAL_STORE_DWORD: |
544 | case AMDGPU::GLOBAL_STORE_DWORDX2: |
545 | case AMDGPU::GLOBAL_STORE_DWORDX3: |
546 | case AMDGPU::GLOBAL_STORE_DWORDX4: |
547 | case AMDGPU::FLAT_STORE_DWORD: |
548 | case AMDGPU::FLAT_STORE_DWORDX2: |
549 | case AMDGPU::FLAT_STORE_DWORDX3: |
550 | case AMDGPU::FLAT_STORE_DWORDX4: |
551 | return FLAT_STORE; |
552 | case AMDGPU::GLOBAL_STORE_DWORD_SADDR: |
553 | case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: |
554 | case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: |
555 | case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: |
556 | return GLOBAL_STORE_SADDR; |
557 | } |
558 | } |
559 | |
560 | /// Determines instruction subclass from opcode. Only instructions |
561 | /// of the same subclass can be merged together. The merged instruction may have |
562 | /// a different subclass but must have the same class. |
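/// For example, every S_LOAD_DWORD*_IMM variant maps to the S_LOAD_DWORD_IMM
/// subclass, while MUBUF, MTBUF and image opcodes use their base opcode as the
/// subclass.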
563 | static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { |
564 | switch (Opc) { |
565 | default: |
566 | if (TII.isMUBUF(Opcode: Opc)) |
567 | return AMDGPU::getMUBUFBaseOpcode(Opc); |
568 | if (TII.isImage(Opcode: Opc)) { |
569 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); |
570 | assert(Info); |
571 | return Info->BaseOpcode; |
572 | } |
573 | if (TII.isMTBUF(Opcode: Opc)) |
574 | return AMDGPU::getMTBUFBaseOpcode(Opc); |
575 | return -1; |
576 | case AMDGPU::DS_READ_B32: |
577 | case AMDGPU::DS_READ_B32_gfx9: |
578 | case AMDGPU::DS_READ_B64: |
579 | case AMDGPU::DS_READ_B64_gfx9: |
580 | case AMDGPU::DS_WRITE_B32: |
581 | case AMDGPU::DS_WRITE_B32_gfx9: |
582 | case AMDGPU::DS_WRITE_B64: |
583 | case AMDGPU::DS_WRITE_B64_gfx9: |
584 | return Opc; |
585 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: |
586 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: |
587 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: |
588 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: |
589 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: |
590 | return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; |
591 | case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: |
592 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: |
593 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: |
594 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: |
595 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: |
596 | return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; |
597 | case AMDGPU::S_LOAD_DWORD_IMM: |
598 | case AMDGPU::S_LOAD_DWORDX2_IMM: |
599 | case AMDGPU::S_LOAD_DWORDX3_IMM: |
600 | case AMDGPU::S_LOAD_DWORDX4_IMM: |
601 | case AMDGPU::S_LOAD_DWORDX8_IMM: |
602 | case AMDGPU::S_LOAD_DWORDX2_IMM_ec: |
603 | case AMDGPU::S_LOAD_DWORDX3_IMM_ec: |
604 | case AMDGPU::S_LOAD_DWORDX4_IMM_ec: |
605 | case AMDGPU::S_LOAD_DWORDX8_IMM_ec: |
606 | return AMDGPU::S_LOAD_DWORD_IMM; |
607 | case AMDGPU::GLOBAL_LOAD_DWORD: |
608 | case AMDGPU::GLOBAL_LOAD_DWORDX2: |
609 | case AMDGPU::GLOBAL_LOAD_DWORDX3: |
610 | case AMDGPU::GLOBAL_LOAD_DWORDX4: |
611 | case AMDGPU::FLAT_LOAD_DWORD: |
612 | case AMDGPU::FLAT_LOAD_DWORDX2: |
613 | case AMDGPU::FLAT_LOAD_DWORDX3: |
614 | case AMDGPU::FLAT_LOAD_DWORDX4: |
615 | return AMDGPU::FLAT_LOAD_DWORD; |
616 | case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: |
617 | case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: |
618 | case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: |
619 | case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: |
620 | return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; |
621 | case AMDGPU::GLOBAL_STORE_DWORD: |
622 | case AMDGPU::GLOBAL_STORE_DWORDX2: |
623 | case AMDGPU::GLOBAL_STORE_DWORDX3: |
624 | case AMDGPU::GLOBAL_STORE_DWORDX4: |
625 | case AMDGPU::FLAT_STORE_DWORD: |
626 | case AMDGPU::FLAT_STORE_DWORDX2: |
627 | case AMDGPU::FLAT_STORE_DWORDX3: |
628 | case AMDGPU::FLAT_STORE_DWORDX4: |
629 | return AMDGPU::FLAT_STORE_DWORD; |
630 | case AMDGPU::GLOBAL_STORE_DWORD_SADDR: |
631 | case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: |
632 | case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: |
633 | case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: |
634 | return AMDGPU::GLOBAL_STORE_DWORD_SADDR; |
635 | } |
636 | } |
637 | |
// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT.
// For other instructions return the original unmodified class.
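// For example, merging two global flat loads yields GLOBAL_LOAD, while pairing
// a global flat load with a non-segment-specific flat load keeps the FLAT_LOAD
// class.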
644 | InstClassEnum |
645 | SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, |
646 | const CombineInfo &Paired) { |
647 | assert(CI.InstClass == Paired.InstClass); |
648 | |
649 | if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && |
650 | SIInstrInfo::isFLATGlobal(MI: *CI.I) && SIInstrInfo::isFLATGlobal(MI: *Paired.I)) |
651 | return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD; |
652 | |
653 | return CI.InstClass; |
654 | } |
655 | |
656 | static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { |
657 | AddressRegs Result; |
658 | |
659 | if (TII.isMUBUF(Opcode: Opc)) { |
660 | if (AMDGPU::getMUBUFHasVAddr(Opc)) |
661 | Result.VAddr = true; |
662 | if (AMDGPU::getMUBUFHasSrsrc(Opc)) |
663 | Result.SRsrc = true; |
664 | if (AMDGPU::getMUBUFHasSoffset(Opc)) |
665 | Result.SOffset = true; |
666 | |
667 | return Result; |
668 | } |
669 | |
670 | if (TII.isImage(Opcode: Opc)) { |
671 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr0); |
672 | if (VAddr0Idx >= 0) { |
673 | int RsrcName = |
674 | TII.isMIMG(Opcode: Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc; |
675 | int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: RsrcName); |
676 | Result.NumVAddrs = RsrcIdx - VAddr0Idx; |
677 | } else { |
678 | Result.VAddr = true; |
679 | } |
680 | Result.SRsrc = true; |
681 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); |
682 | if (Info && AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode)->Sampler) |
683 | Result.SSamp = true; |
684 | |
685 | return Result; |
686 | } |
687 | if (TII.isMTBUF(Opcode: Opc)) { |
688 | if (AMDGPU::getMTBUFHasVAddr(Opc)) |
689 | Result.VAddr = true; |
690 | if (AMDGPU::getMTBUFHasSrsrc(Opc)) |
691 | Result.SRsrc = true; |
692 | if (AMDGPU::getMTBUFHasSoffset(Opc)) |
693 | Result.SOffset = true; |
694 | |
695 | return Result; |
696 | } |
697 | |
698 | switch (Opc) { |
699 | default: |
700 | return Result; |
701 | case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: |
702 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: |
703 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: |
704 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: |
705 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: |
706 | Result.SOffset = true; |
707 | [[fallthrough]]; |
708 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: |
709 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: |
710 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: |
711 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: |
712 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: |
713 | case AMDGPU::S_LOAD_DWORD_IMM: |
714 | case AMDGPU::S_LOAD_DWORDX2_IMM: |
715 | case AMDGPU::S_LOAD_DWORDX3_IMM: |
716 | case AMDGPU::S_LOAD_DWORDX4_IMM: |
717 | case AMDGPU::S_LOAD_DWORDX8_IMM: |
718 | case AMDGPU::S_LOAD_DWORDX2_IMM_ec: |
719 | case AMDGPU::S_LOAD_DWORDX3_IMM_ec: |
720 | case AMDGPU::S_LOAD_DWORDX4_IMM_ec: |
721 | case AMDGPU::S_LOAD_DWORDX8_IMM_ec: |
722 | Result.SBase = true; |
723 | return Result; |
724 | case AMDGPU::DS_READ_B32: |
725 | case AMDGPU::DS_READ_B64: |
726 | case AMDGPU::DS_READ_B32_gfx9: |
727 | case AMDGPU::DS_READ_B64_gfx9: |
728 | case AMDGPU::DS_WRITE_B32: |
729 | case AMDGPU::DS_WRITE_B64: |
730 | case AMDGPU::DS_WRITE_B32_gfx9: |
731 | case AMDGPU::DS_WRITE_B64_gfx9: |
732 | Result.Addr = true; |
733 | return Result; |
734 | case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: |
735 | case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: |
736 | case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: |
737 | case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: |
738 | case AMDGPU::GLOBAL_STORE_DWORD_SADDR: |
739 | case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: |
740 | case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: |
741 | case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: |
742 | Result.SAddr = true; |
743 | [[fallthrough]]; |
744 | case AMDGPU::GLOBAL_LOAD_DWORD: |
745 | case AMDGPU::GLOBAL_LOAD_DWORDX2: |
746 | case AMDGPU::GLOBAL_LOAD_DWORDX3: |
747 | case AMDGPU::GLOBAL_LOAD_DWORDX4: |
748 | case AMDGPU::GLOBAL_STORE_DWORD: |
749 | case AMDGPU::GLOBAL_STORE_DWORDX2: |
750 | case AMDGPU::GLOBAL_STORE_DWORDX3: |
751 | case AMDGPU::GLOBAL_STORE_DWORDX4: |
752 | case AMDGPU::FLAT_LOAD_DWORD: |
753 | case AMDGPU::FLAT_LOAD_DWORDX2: |
754 | case AMDGPU::FLAT_LOAD_DWORDX3: |
755 | case AMDGPU::FLAT_LOAD_DWORDX4: |
756 | case AMDGPU::FLAT_STORE_DWORD: |
757 | case AMDGPU::FLAT_STORE_DWORDX2: |
758 | case AMDGPU::FLAT_STORE_DWORDX3: |
759 | case AMDGPU::FLAT_STORE_DWORDX4: |
760 | Result.VAddr = true; |
761 | return Result; |
762 | } |
763 | } |
764 | |
765 | void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, |
766 | const SILoadStoreOptimizer &LSO) { |
767 | I = MI; |
768 | unsigned Opc = MI->getOpcode(); |
769 | InstClass = getInstClass(Opc, TII: *LSO.TII); |
770 | |
771 | if (InstClass == UNKNOWN) |
772 | return; |
773 | |
774 | IsAGPR = LSO.TRI->hasAGPRs(RC: LSO.getDataRegClass(MI: *MI)); |
775 | |
776 | switch (InstClass) { |
777 | case DS_READ: |
778 | EltSize = |
779 | (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 |
780 | : 4; |
781 | break; |
782 | case DS_WRITE: |
783 | EltSize = |
784 | (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 |
785 | : 4; |
786 | break; |
787 | case S_BUFFER_LOAD_IMM: |
788 | case S_BUFFER_LOAD_SGPR_IMM: |
789 | case S_LOAD_IMM: |
790 | EltSize = AMDGPU::convertSMRDOffsetUnits(ST: *LSO.STM, ByteOffset: 4); |
791 | break; |
792 | default: |
793 | EltSize = 4; |
794 | break; |
795 | } |
796 | |
797 | if (InstClass == MIMG) { |
798 | DMask = LSO.TII->getNamedOperand(MI&: *I, OperandName: AMDGPU::OpName::dmask)->getImm(); |
799 | // Offset is not considered for MIMG instructions. |
800 | Offset = 0; |
801 | } else { |
802 | int OffsetIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::offset); |
803 | Offset = I->getOperand(i: OffsetIdx).getImm(); |
804 | } |
805 | |
806 | if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) |
807 | Format = LSO.TII->getNamedOperand(MI&: *I, OperandName: AMDGPU::OpName::format)->getImm(); |
808 | |
809 | Width = getOpcodeWidth(MI: *I, TII: *LSO.TII); |
810 | |
811 | if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { |
812 | Offset &= 0xffff; |
813 | } else if (InstClass != MIMG) { |
814 | CPol = LSO.TII->getNamedOperand(MI&: *I, OperandName: AMDGPU::OpName::cpol)->getImm(); |
815 | } |
816 | |
817 | AddressRegs Regs = getRegs(Opc, TII: *LSO.TII); |
818 | bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(MI: *I) || LSO.TII->isVSAMPLE(MI: *I); |
819 | |
820 | NumAddresses = 0; |
821 | for (unsigned J = 0; J < Regs.NumVAddrs; J++) |
822 | AddrIdx[NumAddresses++] = |
823 | AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr0) + J; |
824 | if (Regs.Addr) |
825 | AddrIdx[NumAddresses++] = |
826 | AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::addr); |
827 | if (Regs.SBase) |
828 | AddrIdx[NumAddresses++] = |
829 | AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::sbase); |
830 | if (Regs.SRsrc) |
831 | AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( |
832 | Opcode: Opc, NamedIdx: isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc); |
833 | if (Regs.SOffset) |
834 | AddrIdx[NumAddresses++] = |
835 | AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::soffset); |
836 | if (Regs.SAddr) |
837 | AddrIdx[NumAddresses++] = |
838 | AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::saddr); |
839 | if (Regs.VAddr) |
840 | AddrIdx[NumAddresses++] = |
841 | AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr); |
842 | if (Regs.SSamp) |
843 | AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( |
844 | Opcode: Opc, NamedIdx: isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp); |
845 | assert(NumAddresses <= MaxAddressRegs); |
846 | |
847 | for (unsigned J = 0; J < NumAddresses; J++) |
848 | AddrReg[J] = &I->getOperand(i: AddrIdx[J]); |
849 | } |
850 | |
851 | } // end anonymous namespace. |
852 | |
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)
858 | |
859 | char SILoadStoreOptimizer::ID = 0; |
860 | |
861 | char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; |
862 | |
863 | FunctionPass *llvm::createSILoadStoreOptimizerPass() { |
864 | return new SILoadStoreOptimizer(); |
865 | } |
866 | |
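// Collect the registers that \p MI defines into \p RegDefs and the registers
// it reads into \p RegUses.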
867 | static void addDefsUsesToList(const MachineInstr &MI, |
868 | DenseSet<Register> &RegDefs, |
869 | DenseSet<Register> &RegUses) { |
870 | for (const auto &Op : MI.operands()) { |
871 | if (!Op.isReg()) |
872 | continue; |
873 | if (Op.isDef()) |
874 | RegDefs.insert(V: Op.getReg()); |
875 | if (Op.readsReg()) |
876 | RegUses.insert(V: Op.getReg()); |
877 | } |
878 | } |
879 | |
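// Return true if it is safe to swap instruction \p A (whose defs and uses are
// given in \p ARegDefs and \p ARegUses) with instruction \p B: they must not
// be potentially aliasing memory accesses (unless neither stores), and \p B
// must neither access a register that \p A defines nor define a register that
// \p A uses.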
880 | bool SILoadStoreOptimizer::canSwapInstructions( |
881 | const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, |
882 | const MachineInstr &A, const MachineInstr &B) const { |
883 | if (A.mayLoadOrStore() && B.mayLoadOrStore() && |
884 | (A.mayStore() || B.mayStore()) && A.mayAlias(AA, Other: B, UseTBAA: true)) |
885 | return false; |
886 | for (const auto &BOp : B.operands()) { |
887 | if (!BOp.isReg()) |
888 | continue; |
889 | if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(V: BOp.getReg())) |
890 | return false; |
891 | if (BOp.isDef() && ARegUses.contains(V: BOp.getReg())) |
892 | return false; |
893 | } |
894 | return true; |
895 | } |
896 | |
// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
899 | MachineMemOperand * |
900 | SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, |
901 | const CombineInfo &Paired) { |
902 | const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); |
903 | const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); |
904 | |
905 | unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue(); |
906 | |
907 | // A base pointer for the combined operation is the same as the leading |
908 | // operation's pointer. |
909 | if (Paired < CI) |
910 | std::swap(a&: MMOa, b&: MMOb); |
911 | |
912 | MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); |
913 | // If merging FLAT and GLOBAL set address space to FLAT. |
914 | if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) |
915 | PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; |
916 | |
917 | MachineFunction *MF = CI.I->getMF(); |
918 | return MF->getMachineMemOperand(MMO: MMOa, PtrInfo, Size); |
919 | } |
920 | |
921 | bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, |
922 | const SIInstrInfo &TII, |
923 | const CombineInfo &Paired) { |
924 | assert(CI.InstClass == MIMG); |
925 | |
926 | // Ignore instructions with tfe/lwe set. |
927 | const auto *TFEOp = TII.getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::tfe); |
928 | const auto *LWEOp = TII.getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::lwe); |
929 | |
930 | if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) |
931 | return false; |
932 | |
933 | // Check other optional immediate operands for equality. |
934 | unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, |
935 | AMDGPU::OpName::unorm, AMDGPU::OpName::da, |
936 | AMDGPU::OpName::r128, AMDGPU::OpName::a16}; |
937 | |
938 | for (auto op : OperandsToMatch) { |
939 | int Idx = AMDGPU::getNamedOperandIdx(Opcode: CI.I->getOpcode(), NamedIdx: op); |
940 | if (AMDGPU::getNamedOperandIdx(Opcode: Paired.I->getOpcode(), NamedIdx: op) != Idx) |
941 | return false; |
942 | if (Idx != -1 && |
943 | CI.I->getOperand(i: Idx).getImm() != Paired.I->getOperand(i: Idx).getImm()) |
944 | return false; |
945 | } |
946 | |
947 | // Check DMask for overlaps. |
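  // The combine requires every component of the smaller mask to lie strictly
  // below the lowest component of the larger mask: e.g. DMask 0x3 pairs with
  // 0xc, but 0x5 and 0xa are rejected even though they do not overlap, because
  // the components interleave.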
948 | unsigned MaxMask = std::max(a: CI.DMask, b: Paired.DMask); |
949 | unsigned MinMask = std::min(a: CI.DMask, b: Paired.DMask); |
950 | |
951 | if (!MaxMask) |
952 | return false; |
953 | |
954 | unsigned AllowedBitsForMin = llvm::countr_zero(Val: MaxMask); |
955 | if ((1u << AllowedBitsForMin) <= MinMask) |
956 | return false; |
957 | |
958 | return true; |
959 | } |
960 | |
961 | static unsigned getBufferFormatWithCompCount(unsigned OldFormat, |
962 | unsigned ComponentCount, |
963 | const GCNSubtarget &STI) { |
964 | if (ComponentCount > 4) |
965 | return 0; |
966 | |
967 | const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = |
968 | llvm::AMDGPU::getGcnBufferFormatInfo(Format: OldFormat, STI); |
969 | if (!OldFormatInfo) |
970 | return 0; |
971 | |
972 | const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = |
973 | llvm::AMDGPU::getGcnBufferFormatInfo(BitsPerComp: OldFormatInfo->BitsPerComp, |
974 | NumComponents: ComponentCount, |
975 | NumFormat: OldFormatInfo->NumFormat, STI); |
976 | |
977 | if (!NewFormatInfo) |
978 | return 0; |
979 | |
980 | assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && |
981 | NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); |
982 | |
983 | return NewFormatInfo->Format; |
984 | } |
985 | |
// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
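// For example, mostAlignedValueInRange(3, 9) returns 8, the most aligned value
// in that range.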
992 | static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) { |
993 | return Hi & maskLeadingOnes<uint32_t>(N: llvm::countl_zero(Val: (Lo - 1) ^ Hi) + 1); |
994 | } |
995 | |
996 | bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, |
997 | const GCNSubtarget &STI, |
998 | CombineInfo &Paired, |
999 | bool Modify) { |
1000 | assert(CI.InstClass != MIMG); |
1001 | |
1002 | // XXX - Would the same offset be OK? Is there any reason this would happen or |
1003 | // be useful? |
1004 | if (CI.Offset == Paired.Offset) |
1005 | return false; |
1006 | |
1007 | // This won't be valid if the offset isn't aligned. |
1008 | if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0)) |
1009 | return false; |
1010 | |
1011 | if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) { |
1012 | |
1013 | const llvm::AMDGPU::GcnBufferFormatInfo *Info0 = |
1014 | llvm::AMDGPU::getGcnBufferFormatInfo(Format: CI.Format, STI); |
1015 | if (!Info0) |
1016 | return false; |
1017 | const llvm::AMDGPU::GcnBufferFormatInfo *Info1 = |
1018 | llvm::AMDGPU::getGcnBufferFormatInfo(Format: Paired.Format, STI); |
1019 | if (!Info1) |
1020 | return false; |
1021 | |
1022 | if (Info0->BitsPerComp != Info1->BitsPerComp || |
1023 | Info0->NumFormat != Info1->NumFormat) |
1024 | return false; |
1025 | |
1026 | // TODO: Should be possible to support more formats, but if format loads |
1027 | // are not dword-aligned, the merged load might not be valid. |
1028 | if (Info0->BitsPerComp != 32) |
1029 | return false; |
1030 | |
1031 | if (getBufferFormatWithCompCount(OldFormat: CI.Format, ComponentCount: CI.Width + Paired.Width, STI) == 0) |
1032 | return false; |
1033 | } |
1034 | |
1035 | uint32_t EltOffset0 = CI.Offset / CI.EltSize; |
1036 | uint32_t EltOffset1 = Paired.Offset / CI.EltSize; |
1037 | CI.UseST64 = false; |
1038 | CI.BaseOff = 0; |
1039 | |
1040 | // Handle all non-DS instructions. |
1041 | if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { |
1042 | if (EltOffset0 + CI.Width != EltOffset1 && |
1043 | EltOffset1 + Paired.Width != EltOffset0) |
1044 | return false; |
1045 | if (CI.CPol != Paired.CPol) |
1046 | return false; |
1047 | if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM || |
1048 | CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) { |
1049 | // Reject cases like: |
1050 | // dword + dwordx2 -> dwordx3 |
1051 | // dword + dwordx3 -> dwordx4 |
1052 | // If we tried to combine these cases, we would fail to extract a subreg |
1053 | // for the result of the second load due to SGPR alignment requirements. |
1054 | if (CI.Width != Paired.Width && |
1055 | (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset)) |
1056 | return false; |
1057 | } |
1058 | return true; |
1059 | } |
1060 | |
  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
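  // For example, two b32 accesses at byte offsets 0x4000 and 0x4100 have
  // element offsets 0x1000 and 0x1040, which do not fit in 8 bits but are both
  // multiples of 64, so they become ST64 offsets 0x40 and 0x41.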
1063 | if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && |
1064 | isUInt<8>(x: EltOffset0 / 64) && isUInt<8>(x: EltOffset1 / 64)) { |
1065 | if (Modify) { |
1066 | CI.Offset = EltOffset0 / 64; |
1067 | Paired.Offset = EltOffset1 / 64; |
1068 | CI.UseST64 = true; |
1069 | } |
1070 | return true; |
1071 | } |
1072 | |
1073 | // Check if the new offsets fit in the reduced 8-bit range. |
1074 | if (isUInt<8>(x: EltOffset0) && isUInt<8>(x: EltOffset1)) { |
1075 | if (Modify) { |
1076 | CI.Offset = EltOffset0; |
1077 | Paired.Offset = EltOffset1; |
1078 | } |
1079 | return true; |
1080 | } |
1081 | |
1082 | // Try to shift base address to decrease offsets. |
1083 | uint32_t Min = std::min(a: EltOffset0, b: EltOffset1); |
1084 | uint32_t Max = std::max(a: EltOffset0, b: EltOffset1); |
1085 | |
1086 | const uint32_t Mask = maskTrailingOnes<uint32_t>(N: 8) * 64; |
1087 | if (((Max - Min) & ~Mask) == 0) { |
1088 | if (Modify) { |
1089 | // From the range of values we could use for BaseOff, choose the one that |
1090 | // is aligned to the highest power of two, to maximise the chance that |
1091 | // the same offset can be reused for other load/store pairs. |
1092 | uint32_t BaseOff = mostAlignedValueInRange(Lo: Max - 0xff * 64, Hi: Min); |
1093 | // Copy the low bits of the offsets, so that when we adjust them by |
1094 | // subtracting BaseOff they will be multiples of 64. |
1095 | BaseOff |= Min & maskTrailingOnes<uint32_t>(N: 6); |
1096 | CI.BaseOff = BaseOff * CI.EltSize; |
1097 | CI.Offset = (EltOffset0 - BaseOff) / 64; |
1098 | Paired.Offset = (EltOffset1 - BaseOff) / 64; |
1099 | CI.UseST64 = true; |
1100 | } |
1101 | return true; |
1102 | } |
1103 | |
1104 | if (isUInt<8>(x: Max - Min)) { |
1105 | if (Modify) { |
1106 | // From the range of values we could use for BaseOff, choose the one that |
1107 | // is aligned to the highest power of two, to maximise the chance that |
1108 | // the same offset can be reused for other load/store pairs. |
1109 | uint32_t BaseOff = mostAlignedValueInRange(Lo: Max - 0xff, Hi: Min); |
1110 | CI.BaseOff = BaseOff * CI.EltSize; |
1111 | CI.Offset = EltOffset0 - BaseOff; |
1112 | Paired.Offset = EltOffset1 - BaseOff; |
1113 | } |
1114 | return true; |
1115 | } |
1116 | |
1117 | return false; |
1118 | } |
1119 | |
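// Check whether the combined width of \p CI and \p Paired is one the subtarget
// can encode, e.g. a 3-dword result is only allowed on subtargets that have
// dwordx3 load/store variants.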
1120 | bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, |
1121 | const CombineInfo &CI, |
1122 | const CombineInfo &Paired) { |
1123 | const unsigned Width = (CI.Width + Paired.Width); |
1124 | switch (CI.InstClass) { |
1125 | default: |
1126 | return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); |
1127 | case S_BUFFER_LOAD_IMM: |
1128 | case S_BUFFER_LOAD_SGPR_IMM: |
1129 | case S_LOAD_IMM: |
1130 | switch (Width) { |
1131 | default: |
1132 | return false; |
1133 | case 2: |
1134 | case 4: |
1135 | case 8: |
1136 | return true; |
1137 | case 3: |
1138 | return STM.hasScalarDwordx3Loads(); |
1139 | } |
1140 | } |
1141 | } |
1142 | |
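// Return the register class of \p MI's data operand (vdst, vdata, data0, sdst
// or sdata), or null if there is none.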
1143 | const TargetRegisterClass * |
1144 | SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { |
1145 | if (const auto *Dst = TII->getNamedOperand(MI, OpName: AMDGPU::OpName::vdst)) { |
1146 | return TRI->getRegClassForReg(MRI: *MRI, Reg: Dst->getReg()); |
1147 | } |
1148 | if (const auto *Src = TII->getNamedOperand(MI, OpName: AMDGPU::OpName::vdata)) { |
1149 | return TRI->getRegClassForReg(MRI: *MRI, Reg: Src->getReg()); |
1150 | } |
1151 | if (const auto *Src = TII->getNamedOperand(MI, OpName: AMDGPU::OpName::data0)) { |
1152 | return TRI->getRegClassForReg(MRI: *MRI, Reg: Src->getReg()); |
1153 | } |
1154 | if (const auto *Dst = TII->getNamedOperand(MI, OpName: AMDGPU::OpName::sdst)) { |
1155 | return TRI->getRegClassForReg(MRI: *MRI, Reg: Dst->getReg()); |
1156 | } |
1157 | if (const auto *Src = TII->getNamedOperand(MI, OpName: AMDGPU::OpName::sdata)) { |
1158 | return TRI->getRegClassForReg(MRI: *MRI, Reg: Src->getReg()); |
1159 | } |
1160 | return nullptr; |
1161 | } |
1162 | |
1163 | /// This function assumes that CI comes before Paired in a basic block. Return |
1164 | /// an insertion point for the merged instruction or nullptr on failure. |
1165 | SILoadStoreOptimizer::CombineInfo * |
1166 | SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, |
1167 | CombineInfo &Paired) { |
1168 | // If another instruction has already been merged into CI, it may now be a |
1169 | // type that we can't do any further merging into. |
1170 | if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) |
1171 | return nullptr; |
1172 | assert(CI.InstClass == Paired.InstClass); |
1173 | |
1174 | if (getInstSubclass(Opc: CI.I->getOpcode(), TII: *TII) != |
1175 | getInstSubclass(Opc: Paired.I->getOpcode(), TII: *TII)) |
1176 | return nullptr; |
1177 | |
1178 | // Check both offsets (or masks for MIMG) can be combined and fit in the |
1179 | // reduced range. |
1180 | if (CI.InstClass == MIMG) { |
1181 | if (!dmasksCanBeCombined(CI, TII: *TII, Paired)) |
1182 | return nullptr; |
1183 | } else { |
1184 | if (!widthsFit(STM: *STM, CI, Paired) || !offsetsCanBeCombined(CI, STI: *STM, Paired)) |
1185 | return nullptr; |
1186 | } |
1187 | |
1188 | DenseSet<Register> RegDefs; |
1189 | DenseSet<Register> RegUses; |
1190 | CombineInfo *Where; |
1191 | if (CI.I->mayLoad()) { |
1192 | // Try to hoist Paired up to CI. |
1193 | addDefsUsesToList(MI: *Paired.I, RegDefs, RegUses); |
1194 | for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { |
1195 | if (!canSwapInstructions(ARegDefs: RegDefs, ARegUses: RegUses, A: *Paired.I, B: *MBBI)) |
1196 | return nullptr; |
1197 | } |
1198 | Where = &CI; |
1199 | } else { |
1200 | // Try to sink CI down to Paired. |
1201 | addDefsUsesToList(MI: *CI.I, RegDefs, RegUses); |
1202 | for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { |
1203 | if (!canSwapInstructions(ARegDefs: RegDefs, ARegUses: RegUses, A: *CI.I, B: *MBBI)) |
1204 | return nullptr; |
1205 | } |
1206 | Where = &Paired; |
1207 | } |
1208 | |
1209 | // Call offsetsCanBeCombined with modify = true so that the offsets are |
1210 | // correct for the new instruction. This should return true, because |
1211 | // this function should only be called on CombineInfo objects that |
1212 | // have already been confirmed to be mergeable. |
1213 | if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) |
1214 | offsetsCanBeCombined(CI, STI: *STM, Paired, Modify: true); |
1215 | return Where; |
1216 | } |
1217 | |
1218 | // Copy the merged load result from DestReg to the original dest regs of CI and |
1219 | // Paired. |
1220 | void SILoadStoreOptimizer::copyToDestRegs( |
1221 | CombineInfo &CI, CombineInfo &Paired, |
1222 | MachineBasicBlock::iterator InsertBefore, int OpName, |
1223 | Register DestReg) const { |
1224 | MachineBasicBlock *MBB = CI.I->getParent(); |
1225 | DebugLoc DL = CI.I->getDebugLoc(); |
1226 | |
1227 | auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); |
1228 | |
1229 | // Copy to the old destination registers. |
1230 | const MCInstrDesc &CopyDesc = TII->get(Opcode: TargetOpcode::COPY); |
1231 | auto *Dest0 = TII->getNamedOperand(MI&: *CI.I, OperandName: OpName); |
1232 | auto *Dest1 = TII->getNamedOperand(MI&: *Paired.I, OperandName: OpName); |
1233 | |
  // The constrained sload instructions in the S_LOAD_IMM class will have an
  // early-clobber flag on the dst operand. Remove the flag before using the
  // MOs in copies.
1237 | Dest0->setIsEarlyClobber(false); |
1238 | Dest1->setIsEarlyClobber(false); |
1239 | |
1240 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: CopyDesc) |
1241 | .add(MO: *Dest0) // Copy to same destination including flags and sub reg. |
1242 | .addReg(RegNo: DestReg, flags: 0, SubReg: SubRegIdx0); |
1243 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: CopyDesc) |
1244 | .add(MO: *Dest1) |
1245 | .addReg(RegNo: DestReg, flags: RegState::Kill, SubReg: SubRegIdx1); |
1246 | } |
1247 | |
1248 | // Return a register for the source of the merged store after copying the |
1249 | // original source regs of CI and Paired into it. |
1250 | Register |
1251 | SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, |
1252 | MachineBasicBlock::iterator InsertBefore, |
1253 | int OpName) const { |
1254 | MachineBasicBlock *MBB = CI.I->getParent(); |
1255 | DebugLoc DL = CI.I->getDebugLoc(); |
1256 | |
1257 | auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); |
1258 | |
1259 | // Copy to the new source register. |
1260 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1261 | Register SrcReg = MRI->createVirtualRegister(RegClass: SuperRC); |
1262 | |
1263 | const auto *Src0 = TII->getNamedOperand(MI&: *CI.I, OperandName: OpName); |
1264 | const auto *Src1 = TII->getNamedOperand(MI&: *Paired.I, OperandName: OpName); |
1265 | |
1266 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: SrcReg) |
1267 | .add(MO: *Src0) |
1268 | .addImm(Val: SubRegIdx0) |
1269 | .add(MO: *Src1) |
1270 | .addImm(Val: SubRegIdx1); |
1271 | |
1272 | return SrcReg; |
1273 | } |
1274 | |
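// The *_gfx9 DS variants do not carry the implicit M0 use, so they are used on
// subtargets that no longer require M0 to be initialized for LDS access.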
1275 | unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { |
1276 | if (STM->ldsRequiresM0Init()) |
1277 | return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; |
1278 | return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; |
1279 | } |
1280 | |
1281 | unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { |
1282 | if (STM->ldsRequiresM0Init()) |
1283 | return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; |
1284 | |
1285 | return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 |
1286 | : AMDGPU::DS_READ2ST64_B64_gfx9; |
1287 | } |
1288 | |
1289 | MachineBasicBlock::iterator |
1290 | SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, |
1291 | MachineBasicBlock::iterator InsertBefore) { |
1292 | MachineBasicBlock *MBB = CI.I->getParent(); |
1293 | |
1294 | // Be careful, since the addresses could be subregisters themselves in weird |
1295 | // cases, like vectors of pointers. |
1296 | const auto *AddrReg = TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::addr); |
1297 | |
1298 | unsigned NewOffset0 = std::min(a: CI.Offset, b: Paired.Offset); |
1299 | unsigned NewOffset1 = std::max(a: CI.Offset, b: Paired.Offset); |
1300 | unsigned Opc = |
1301 | CI.UseST64 ? read2ST64Opcode(EltSize: CI.EltSize) : read2Opcode(EltSize: CI.EltSize); |
1302 | |
1303 | assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && |
1304 | (NewOffset0 != NewOffset1) && "Computed offset doesn't fit" ); |
1305 | |
1306 | const MCInstrDesc &Read2Desc = TII->get(Opcode: Opc); |
1307 | |
1308 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1309 | Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC); |
1310 | |
1311 | DebugLoc DL = CI.I->getDebugLoc(); |
1312 | |
1313 | Register BaseReg = AddrReg->getReg(); |
1314 | unsigned BaseSubReg = AddrReg->getSubReg(); |
1315 | unsigned BaseRegFlags = 0; |
1316 | if (CI.BaseOff) { |
1317 | Register ImmReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
1318 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: ImmReg) |
1319 | .addImm(Val: CI.BaseOff); |
1320 | |
1321 | BaseReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
1322 | BaseRegFlags = RegState::Kill; |
1323 | |
1324 | TII->getAddNoCarry(MBB&: *MBB, I: InsertBefore, DL, DestReg: BaseReg) |
1325 | .addReg(RegNo: ImmReg) |
1326 | .addReg(RegNo: AddrReg->getReg(), flags: 0, SubReg: BaseSubReg) |
1327 | .addImm(Val: 0); // clamp bit |
1328 | BaseSubReg = 0; |
1329 | } |
1330 | |
1331 | MachineInstrBuilder Read2 = |
1332 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: Read2Desc, DestReg) |
1333 | .addReg(RegNo: BaseReg, flags: BaseRegFlags, SubReg: BaseSubReg) // addr |
1334 | .addImm(Val: NewOffset0) // offset0 |
1335 | .addImm(Val: NewOffset1) // offset1 |
1336 | .addImm(Val: 0) // gds |
1337 | .cloneMergedMemRefs(OtherMIs: {&*CI.I, &*Paired.I}); |
1338 | |
1339 | copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdst, DestReg); |
1340 | |
1341 | CI.I->eraseFromParent(); |
1342 | Paired.I->eraseFromParent(); |
1343 | |
1344 | LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); |
1345 | return Read2; |
1346 | } |
1347 | |
1348 | unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { |
1349 | if (STM->ldsRequiresM0Init()) |
1350 | return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; |
1351 | return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 |
1352 | : AMDGPU::DS_WRITE2_B64_gfx9; |
1353 | } |
1354 | |
1355 | unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { |
1356 | if (STM->ldsRequiresM0Init()) |
1357 | return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 |
1358 | : AMDGPU::DS_WRITE2ST64_B64; |
1359 | |
1360 | return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 |
1361 | : AMDGPU::DS_WRITE2ST64_B64_gfx9; |
1362 | } |
1363 | |
1364 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( |
1365 | CombineInfo &CI, CombineInfo &Paired, |
1366 | MachineBasicBlock::iterator InsertBefore) { |
1367 | MachineBasicBlock *MBB = CI.I->getParent(); |
1368 | |
  // Be sure to use .add() with the original operands, and not .addReg(), so
  // that we preserve the subregister index and any register flags set on them.
1371 | const MachineOperand *AddrReg = |
1372 | TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::addr); |
1373 | const MachineOperand *Data0 = |
1374 | TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::data0); |
1375 | const MachineOperand *Data1 = |
1376 | TII->getNamedOperand(MI&: *Paired.I, OperandName: AMDGPU::OpName::data0); |
1377 | |
1378 | unsigned NewOffset0 = CI.Offset; |
1379 | unsigned NewOffset1 = Paired.Offset; |
1380 | unsigned Opc = |
1381 | CI.UseST64 ? write2ST64Opcode(EltSize: CI.EltSize) : write2Opcode(EltSize: CI.EltSize); |
1382 | |
1383 | if (NewOffset0 > NewOffset1) { |
1384 | // Canonicalize the merged instruction so the smaller offset comes first. |
1385 | std::swap(a&: NewOffset0, b&: NewOffset1); |
1386 | std::swap(a&: Data0, b&: Data1); |
1387 | } |
1388 | |
1389 | assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && |
1390 | (NewOffset0 != NewOffset1) && "Computed offset doesn't fit" ); |
1391 | |
1392 | const MCInstrDesc &Write2Desc = TII->get(Opcode: Opc); |
1393 | DebugLoc DL = CI.I->getDebugLoc(); |
1394 | |
1395 | Register BaseReg = AddrReg->getReg(); |
1396 | unsigned BaseSubReg = AddrReg->getSubReg(); |
1397 | unsigned BaseRegFlags = 0; |
1398 | if (CI.BaseOff) { |
1399 | Register ImmReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
1400 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: ImmReg) |
1401 | .addImm(Val: CI.BaseOff); |
1402 | |
1403 | BaseReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
1404 | BaseRegFlags = RegState::Kill; |
1405 | |
1406 | TII->getAddNoCarry(MBB&: *MBB, I: InsertBefore, DL, DestReg: BaseReg) |
1407 | .addReg(RegNo: ImmReg) |
1408 | .addReg(RegNo: AddrReg->getReg(), flags: 0, SubReg: BaseSubReg) |
1409 | .addImm(Val: 0); // clamp bit |
1410 | BaseSubReg = 0; |
1411 | } |
1412 | |
1413 | MachineInstrBuilder Write2 = |
1414 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: Write2Desc) |
1415 | .addReg(RegNo: BaseReg, flags: BaseRegFlags, SubReg: BaseSubReg) // addr |
1416 | .add(MO: *Data0) // data0 |
1417 | .add(MO: *Data1) // data1 |
1418 | .addImm(Val: NewOffset0) // offset0 |
1419 | .addImm(Val: NewOffset1) // offset1 |
1420 | .addImm(Val: 0) // gds |
1421 | .cloneMergedMemRefs(OtherMIs: {&*CI.I, &*Paired.I}); |
1422 | |
1423 | CI.I->eraseFromParent(); |
1424 | Paired.I->eraseFromParent(); |
1425 | |
1426 | LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); |
1427 | return Write2; |
1428 | } |
1429 | |
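// Merge two MIMG loads whose dmasks select disjoint channels. For example
// (illustrative): an image_load with dmask:0x1 and one with dmask:0x2 on the
// same address become a single image_load with dmask:0x3 that writes both
// channels into a wider destination register.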
1430 | MachineBasicBlock::iterator |
1431 | SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, |
1432 | MachineBasicBlock::iterator InsertBefore) { |
1433 | MachineBasicBlock *MBB = CI.I->getParent(); |
1434 | DebugLoc DL = CI.I->getDebugLoc(); |
1435 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1436 | |
1437 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1438 | |
1439 | Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC); |
1440 | unsigned MergedDMask = CI.DMask | Paired.DMask; |
1441 | unsigned DMaskIdx = |
1442 | AMDGPU::getNamedOperandIdx(Opcode: CI.I->getOpcode(), NamedIdx: AMDGPU::OpName::dmask); |
1443 | |
1444 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg); |
1445 | for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { |
1446 | if (I == DMaskIdx) |
1447 | MIB.addImm(Val: MergedDMask); |
1448 | else |
1449 | MIB.add(MO: (*CI.I).getOperand(i: I)); |
1450 | } |
1451 | |
1452 | // It shouldn't be possible to get this far if the two instructions |
1453 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1454 | // will return true if this is the case. |
1455 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1456 | |
1457 | MachineInstr *New = MIB.addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
1458 | |
1459 | copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata, DestReg); |
1460 | |
1461 | CI.I->eraseFromParent(); |
1462 | Paired.I->eraseFromParent(); |
1463 | return New; |
1464 | } |
1465 | |
1466 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( |
1467 | CombineInfo &CI, CombineInfo &Paired, |
1468 | MachineBasicBlock::iterator InsertBefore) { |
1469 | MachineBasicBlock *MBB = CI.I->getParent(); |
1470 | DebugLoc DL = CI.I->getDebugLoc(); |
1471 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1472 | |
1473 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1474 | |
1475 | Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC); |
1476 | unsigned MergedOffset = std::min(a: CI.Offset, b: Paired.Offset); |
1477 | |
1478 | // It shouldn't be possible to get this far if the two instructions |
1479 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1480 | // will return true if this is the case. |
1481 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1482 | |
1483 | MachineInstrBuilder New = |
1484 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg) |
1485 | .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::sbase)); |
1486 | if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) |
1487 | New.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset)); |
1488 | New.addImm(Val: MergedOffset); |
1489 | New.addImm(Val: CI.CPol).addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
1490 | |
1491 | copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::sdst, DestReg); |
1492 | |
1493 | CI.I->eraseFromParent(); |
1494 | Paired.I->eraseFromParent(); |
1495 | return New; |
1496 | } |
1497 | |
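// Merge two MUBUF loads into a wider variant, e.g. (a sketch):
//   buffer_load_dword v0, v2, s[0:3], 0 offen offset:4
//   buffer_load_dword v1, v2, s[0:3], 0 offen offset:8
//   ==>
//   buffer_load_dwordx2 v[0:1], v2, s[0:3], 0 offen offset:4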
1498 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( |
1499 | CombineInfo &CI, CombineInfo &Paired, |
1500 | MachineBasicBlock::iterator InsertBefore) { |
1501 | MachineBasicBlock *MBB = CI.I->getParent(); |
1502 | DebugLoc DL = CI.I->getDebugLoc(); |
1503 | |
1504 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1505 | |
1506 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1507 | |
  // Create the wider destination register for the merged load.
1509 | Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC); |
1510 | unsigned MergedOffset = std::min(a: CI.Offset, b: Paired.Offset); |
1511 | |
1512 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg); |
1513 | |
1514 | AddressRegs Regs = getRegs(Opc: Opcode, TII: *TII); |
1515 | |
1516 | if (Regs.VAddr) |
1517 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr)); |
1518 | |
1519 | // It shouldn't be possible to get this far if the two instructions |
1520 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1521 | // will return true if this is the case. |
1522 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1523 | |
1524 | MachineInstr *New = |
1525 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::srsrc)) |
1526 | .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset)) |
1527 | .addImm(Val: MergedOffset) // offset |
1528 | .addImm(Val: CI.CPol) // cpol |
1529 | .addImm(Val: 0) // swz |
1530 | .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
1531 | |
1532 | copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata, DestReg); |
1533 | |
1534 | CI.I->eraseFromParent(); |
1535 | Paired.I->eraseFromParent(); |
1536 | return New; |
1537 | } |
1538 | |
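// Same idea as the MUBUF load case above, but MTBUF instructions also carry a
// format operand: the merged instruction gets a format whose component count
// is the sum of the two original widths (getBufferFormatWithCompCount).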
1539 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( |
1540 | CombineInfo &CI, CombineInfo &Paired, |
1541 | MachineBasicBlock::iterator InsertBefore) { |
1542 | MachineBasicBlock *MBB = CI.I->getParent(); |
1543 | DebugLoc DL = CI.I->getDebugLoc(); |
1544 | |
1545 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1546 | |
1547 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1548 | |
  // Create the wider destination register for the merged load.
1550 | Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC); |
1551 | unsigned MergedOffset = std::min(a: CI.Offset, b: Paired.Offset); |
1552 | |
1553 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg); |
1554 | |
1555 | AddressRegs Regs = getRegs(Opc: Opcode, TII: *TII); |
1556 | |
1557 | if (Regs.VAddr) |
1558 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr)); |
1559 | |
1560 | unsigned JoinedFormat = |
1561 | getBufferFormatWithCompCount(OldFormat: CI.Format, ComponentCount: CI.Width + Paired.Width, STI: *STM); |
1562 | |
1563 | // It shouldn't be possible to get this far if the two instructions |
1564 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1565 | // will return true if this is the case. |
1566 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1567 | |
1568 | MachineInstr *New = |
1569 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::srsrc)) |
1570 | .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset)) |
1571 | .addImm(Val: MergedOffset) // offset |
1572 | .addImm(Val: JoinedFormat) // format |
1573 | .addImm(Val: CI.CPol) // cpol |
1574 | .addImm(Val: 0) // swz |
1575 | .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
1576 | |
1577 | copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata, DestReg); |
1578 | |
1579 | CI.I->eraseFromParent(); |
1580 | Paired.I->eraseFromParent(); |
1581 | return New; |
1582 | } |
1583 | |
1584 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( |
1585 | CombineInfo &CI, CombineInfo &Paired, |
1586 | MachineBasicBlock::iterator InsertBefore) { |
1587 | MachineBasicBlock *MBB = CI.I->getParent(); |
1588 | DebugLoc DL = CI.I->getDebugLoc(); |
1589 | |
1590 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1591 | |
1592 | Register SrcReg = |
1593 | copyFromSrcRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata); |
1594 | |
1595 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode)) |
1596 | .addReg(RegNo: SrcReg, flags: RegState::Kill); |
1597 | |
1598 | AddressRegs Regs = getRegs(Opc: Opcode, TII: *TII); |
1599 | |
1600 | if (Regs.VAddr) |
1601 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr)); |
1602 | |
1603 | unsigned JoinedFormat = |
1604 | getBufferFormatWithCompCount(OldFormat: CI.Format, ComponentCount: CI.Width + Paired.Width, STI: *STM); |
1605 | |
1606 | // It shouldn't be possible to get this far if the two instructions |
1607 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1608 | // will return true if this is the case. |
1609 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1610 | |
1611 | MachineInstr *New = |
1612 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::srsrc)) |
1613 | .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset)) |
1614 | .addImm(Val: std::min(a: CI.Offset, b: Paired.Offset)) // offset |
1615 | .addImm(Val: JoinedFormat) // format |
1616 | .addImm(Val: CI.CPol) // cpol |
1617 | .addImm(Val: 0) // swz |
1618 | .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
1619 | |
1620 | CI.I->eraseFromParent(); |
1621 | Paired.I->eraseFromParent(); |
1622 | return New; |
1623 | } |
1624 | |
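// Merge two FLAT or GLOBAL loads, e.g. (a sketch):
//   global_load_dword v0, v[2:3], off offset:16
//   global_load_dword v1, v[2:3], off offset:20
//   ==>
//   global_load_dwordx2 v[0:1], v[2:3], off offset:16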
1625 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( |
1626 | CombineInfo &CI, CombineInfo &Paired, |
1627 | MachineBasicBlock::iterator InsertBefore) { |
1628 | MachineBasicBlock *MBB = CI.I->getParent(); |
1629 | DebugLoc DL = CI.I->getDebugLoc(); |
1630 | |
1631 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1632 | |
1633 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1634 | Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC); |
1635 | |
1636 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg); |
1637 | |
1638 | if (auto *SAddr = TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::saddr)) |
1639 | MIB.add(MO: *SAddr); |
1640 | |
1641 | MachineInstr *New = |
1642 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr)) |
1643 | .addImm(Val: std::min(a: CI.Offset, b: Paired.Offset)) |
1644 | .addImm(Val: CI.CPol) |
1645 | .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
1646 | |
1647 | copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdst, DestReg); |
1648 | |
1649 | CI.I->eraseFromParent(); |
1650 | Paired.I->eraseFromParent(); |
1651 | return New; |
1652 | } |
1653 | |
1654 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( |
1655 | CombineInfo &CI, CombineInfo &Paired, |
1656 | MachineBasicBlock::iterator InsertBefore) { |
1657 | MachineBasicBlock *MBB = CI.I->getParent(); |
1658 | DebugLoc DL = CI.I->getDebugLoc(); |
1659 | |
1660 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1661 | |
1662 | Register SrcReg = |
1663 | copyFromSrcRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata); |
1664 | |
1665 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode)) |
1666 | .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr)) |
1667 | .addReg(RegNo: SrcReg, flags: RegState::Kill); |
1668 | |
1669 | if (auto *SAddr = TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::saddr)) |
1670 | MIB.add(MO: *SAddr); |
1671 | |
1672 | MachineInstr *New = |
1673 | MIB.addImm(Val: std::min(a: CI.Offset, b: Paired.Offset)) |
1674 | .addImm(Val: CI.CPol) |
1675 | .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
1676 | |
1677 | CI.I->eraseFromParent(); |
1678 | Paired.I->eraseFromParent(); |
1679 | return New; |
1680 | } |
1681 | |
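// Map the instruction class and combined width onto the opcode of the merged
// instruction. For the SMEM, FLAT and GLOBAL classes, a return value of 0
// means the combined width has no corresponding wide opcode.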
1682 | unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, |
1683 | const CombineInfo &Paired) { |
1684 | const unsigned Width = CI.Width + Paired.Width; |
1685 | |
1686 | switch (getCommonInstClass(CI, Paired)) { |
1687 | default: |
1688 | assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); |
1689 | // FIXME: Handle d16 correctly |
1690 | return AMDGPU::getMUBUFOpcode(BaseOpc: AMDGPU::getMUBUFBaseOpcode(Opc: CI.I->getOpcode()), |
1691 | Elements: Width); |
1692 | case TBUFFER_LOAD: |
1693 | case TBUFFER_STORE: |
1694 | return AMDGPU::getMTBUFOpcode(BaseOpc: AMDGPU::getMTBUFBaseOpcode(Opc: CI.I->getOpcode()), |
1695 | Elements: Width); |
1696 | |
1697 | case UNKNOWN: |
1698 | llvm_unreachable("Unknown instruction class" ); |
1699 | case S_BUFFER_LOAD_IMM: |
1700 | switch (Width) { |
1701 | default: |
1702 | return 0; |
1703 | case 2: |
1704 | return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; |
1705 | case 3: |
1706 | return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; |
1707 | case 4: |
1708 | return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; |
1709 | case 8: |
1710 | return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; |
1711 | } |
1712 | case S_BUFFER_LOAD_SGPR_IMM: |
1713 | switch (Width) { |
1714 | default: |
1715 | return 0; |
1716 | case 2: |
1717 | return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; |
1718 | case 3: |
1719 | return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; |
1720 | case 4: |
1721 | return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; |
1722 | case 8: |
1723 | return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; |
1724 | } |
1725 | case S_LOAD_IMM: { |
1726 | // If XNACK is enabled, use the constrained opcodes when the first load is |
1727 | // under-aligned. |
1728 | const MachineMemOperand *MMO = *CI.I->memoperands_begin(); |
1729 | bool NeedsConstrainedOpc = |
1730 | STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4; |
1731 | switch (Width) { |
1732 | default: |
1733 | return 0; |
1734 | case 2: |
1735 | return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec |
1736 | : AMDGPU::S_LOAD_DWORDX2_IMM; |
1737 | case 3: |
1738 | return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec |
1739 | : AMDGPU::S_LOAD_DWORDX3_IMM; |
1740 | case 4: |
1741 | return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec |
1742 | : AMDGPU::S_LOAD_DWORDX4_IMM; |
1743 | case 8: |
1744 | return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec |
1745 | : AMDGPU::S_LOAD_DWORDX8_IMM; |
1746 | } |
1747 | } |
1748 | case GLOBAL_LOAD: |
1749 | switch (Width) { |
1750 | default: |
1751 | return 0; |
1752 | case 2: |
1753 | return AMDGPU::GLOBAL_LOAD_DWORDX2; |
1754 | case 3: |
1755 | return AMDGPU::GLOBAL_LOAD_DWORDX3; |
1756 | case 4: |
1757 | return AMDGPU::GLOBAL_LOAD_DWORDX4; |
1758 | } |
1759 | case GLOBAL_LOAD_SADDR: |
1760 | switch (Width) { |
1761 | default: |
1762 | return 0; |
1763 | case 2: |
1764 | return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; |
1765 | case 3: |
1766 | return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; |
1767 | case 4: |
1768 | return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; |
1769 | } |
1770 | case GLOBAL_STORE: |
1771 | switch (Width) { |
1772 | default: |
1773 | return 0; |
1774 | case 2: |
1775 | return AMDGPU::GLOBAL_STORE_DWORDX2; |
1776 | case 3: |
1777 | return AMDGPU::GLOBAL_STORE_DWORDX3; |
1778 | case 4: |
1779 | return AMDGPU::GLOBAL_STORE_DWORDX4; |
1780 | } |
1781 | case GLOBAL_STORE_SADDR: |
1782 | switch (Width) { |
1783 | default: |
1784 | return 0; |
1785 | case 2: |
1786 | return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; |
1787 | case 3: |
1788 | return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; |
1789 | case 4: |
1790 | return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; |
1791 | } |
1792 | case FLAT_LOAD: |
1793 | switch (Width) { |
1794 | default: |
1795 | return 0; |
1796 | case 2: |
1797 | return AMDGPU::FLAT_LOAD_DWORDX2; |
1798 | case 3: |
1799 | return AMDGPU::FLAT_LOAD_DWORDX3; |
1800 | case 4: |
1801 | return AMDGPU::FLAT_LOAD_DWORDX4; |
1802 | } |
1803 | case FLAT_STORE: |
1804 | switch (Width) { |
1805 | default: |
1806 | return 0; |
1807 | case 2: |
1808 | return AMDGPU::FLAT_STORE_DWORDX2; |
1809 | case 3: |
1810 | return AMDGPU::FLAT_STORE_DWORDX3; |
1811 | case 4: |
1812 | return AMDGPU::FLAT_STORE_DWORDX4; |
1813 | } |
1814 | case MIMG: |
1815 | assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && |
1816 | "No overlaps" ); |
1817 | return AMDGPU::getMaskedMIMGOp(Opc: CI.I->getOpcode(), NewChannels: Width); |
1818 | } |
1819 | } |
1820 | |
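// Return the pair of subregister indices used to split the merged value back
// into its two original parts. For example (illustrative): if CI orders
// before Paired and has Width 1 while Paired has Width 2, CI is assigned
// sub0 and Paired sub1_sub2 of the merged register.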
1821 | std::pair<unsigned, unsigned> |
1822 | SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, |
1823 | const CombineInfo &Paired) { |
1824 | assert((CI.InstClass != MIMG || |
1825 | ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == |
1826 | CI.Width + Paired.Width)) && |
1827 | "No overlaps" ); |
1828 | |
1829 | unsigned Idx0; |
1830 | unsigned Idx1; |
1831 | |
1832 | static const unsigned Idxs[5][4] = { |
1833 | {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, |
1834 | {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, |
1835 | {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, |
1836 | {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, |
1837 | {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, |
1838 | }; |
1839 | |
1840 | assert(CI.Width >= 1 && CI.Width <= 4); |
1841 | assert(Paired.Width >= 1 && Paired.Width <= 4); |
1842 | |
1843 | if (Paired < CI) { |
1844 | Idx1 = Idxs[0][Paired.Width - 1]; |
1845 | Idx0 = Idxs[Paired.Width][CI.Width - 1]; |
1846 | } else { |
1847 | Idx0 = Idxs[0][CI.Width - 1]; |
1848 | Idx1 = Idxs[CI.Width][Paired.Width - 1]; |
1849 | } |
1850 | |
1851 | return {Idx0, Idx1}; |
1852 | } |
1853 | |
1854 | const TargetRegisterClass * |
1855 | SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, |
1856 | const CombineInfo &Paired) const { |
1857 | if (CI.InstClass == S_BUFFER_LOAD_IMM || |
1858 | CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { |
1859 | switch (CI.Width + Paired.Width) { |
1860 | default: |
1861 | return nullptr; |
1862 | case 2: |
1863 | return &AMDGPU::SReg_64_XEXECRegClass; |
1864 | case 3: |
1865 | return &AMDGPU::SGPR_96RegClass; |
1866 | case 4: |
1867 | return &AMDGPU::SGPR_128RegClass; |
1868 | case 8: |
1869 | return &AMDGPU::SGPR_256RegClass; |
1870 | case 16: |
1871 | return &AMDGPU::SGPR_512RegClass; |
1872 | } |
1873 | } |
1874 | |
1875 | unsigned BitWidth = 32 * (CI.Width + Paired.Width); |
1876 | return TRI->isAGPRClass(RC: getDataRegClass(MI: *CI.I)) |
1877 | ? TRI->getAGPRClassForBitWidth(BitWidth) |
1878 | : TRI->getVGPRClassForBitWidth(BitWidth); |
1879 | } |
1880 | |
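// Merge two MUBUF stores, mirroring mergeBufferLoadPair: the two data
// operands are first packed into one wider register (copyFromSrcRegs) and a
// single wider store is emitted at the smaller of the two offsets.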
1881 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( |
1882 | CombineInfo &CI, CombineInfo &Paired, |
1883 | MachineBasicBlock::iterator InsertBefore) { |
1884 | MachineBasicBlock *MBB = CI.I->getParent(); |
1885 | DebugLoc DL = CI.I->getDebugLoc(); |
1886 | |
1887 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1888 | |
1889 | Register SrcReg = |
1890 | copyFromSrcRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata); |
1891 | |
1892 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode)) |
1893 | .addReg(RegNo: SrcReg, flags: RegState::Kill); |
1894 | |
1895 | AddressRegs Regs = getRegs(Opc: Opcode, TII: *TII); |
1896 | |
1897 | if (Regs.VAddr) |
1898 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr)); |
1899 | |
1900 | |
1901 | // It shouldn't be possible to get this far if the two instructions |
1902 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1903 | // will return true if this is the case. |
1904 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1905 | |
1906 | MachineInstr *New = |
1907 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::srsrc)) |
1908 | .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset)) |
1909 | .addImm(Val: std::min(a: CI.Offset, b: Paired.Offset)) // offset |
1910 | .addImm(Val: CI.CPol) // cpol |
1911 | .addImm(Val: 0) // swz |
1912 | .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
1913 | |
1914 | CI.I->eraseFromParent(); |
1915 | Paired.I->eraseFromParent(); |
1916 | return New; |
1917 | } |
1918 | |
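// Wrap Val as a machine operand: keep it as an immediate if it is an inline
// constant, otherwise materialize it into an SGPR with S_MOV_B32. For example
// (illustrative), 8 stays an immediate while 0x1000 is loaded with an S_MOV.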
1919 | MachineOperand |
1920 | SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { |
1921 | APInt V(32, Val, true); |
1922 | if (TII->isInlineConstant(Imm: V)) |
1923 | return MachineOperand::CreateImm(Val); |
1924 | |
1925 | Register Reg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
1926 | MachineInstr *Mov = |
1927 | BuildMI(BB&: *MI.getParent(), I: MI.getIterator(), MIMD: MI.getDebugLoc(), |
1928 | MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: Reg) |
1929 | .addImm(Val); |
1930 | (void)Mov; |
1931 | LLVM_DEBUG(dbgs() << " " ; Mov->dump()); |
1932 | return MachineOperand::CreateReg(Reg, isDef: false); |
1933 | } |
1934 | |
1935 | // Compute base address using Addr and return the final register. |
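// The 64-bit add is expanded into a carry chain, roughly (illustrative):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 Addr.Base.LoReg, <offset lo>
//   %hi:vgpr_32         = V_ADDC_U32_e64 Addr.Base.HiReg, <offset hi>, %carry
//   %base:vreg_64       = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1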
1936 | Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, |
1937 | const MemAddress &Addr) const { |
1938 | MachineBasicBlock *MBB = MI.getParent(); |
1939 | MachineBasicBlock::iterator MBBI = MI.getIterator(); |
1940 | DebugLoc DL = MI.getDebugLoc(); |
1941 | |
1942 | assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || |
1943 | Addr.Base.LoSubReg) && |
1944 | "Expected 32-bit Base-Register-Low!!" ); |
1945 | |
1946 | assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || |
1947 | Addr.Base.HiSubReg) && |
1948 | "Expected 32-bit Base-Register-Hi!!" ); |
1949 | |
1950 | LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n" ); |
1951 | MachineOperand OffsetLo = createRegOrImm(Val: static_cast<int32_t>(Addr.Offset), MI); |
1952 | MachineOperand OffsetHi = |
1953 | createRegOrImm(Val: static_cast<int32_t>(Addr.Offset >> 32), MI); |
1954 | |
1955 | const auto *CarryRC = TRI->getRegClass(RCID: AMDGPU::SReg_1_XEXECRegClassID); |
1956 | Register CarryReg = MRI->createVirtualRegister(RegClass: CarryRC); |
1957 | Register DeadCarryReg = MRI->createVirtualRegister(RegClass: CarryRC); |
1958 | |
1959 | Register DestSub0 = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
1960 | Register DestSub1 = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
1961 | MachineInstr *LoHalf = |
1962 | BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg: DestSub0) |
1963 | .addReg(RegNo: CarryReg, flags: RegState::Define) |
1964 | .addReg(RegNo: Addr.Base.LoReg, flags: 0, SubReg: Addr.Base.LoSubReg) |
1965 | .add(MO: OffsetLo) |
1966 | .addImm(Val: 0); // clamp bit |
1967 | (void)LoHalf; |
1968 | LLVM_DEBUG(dbgs() << " " ; LoHalf->dump();); |
1969 | |
1970 | MachineInstr *HiHalf = |
1971 | BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_ADDC_U32_e64), DestReg: DestSub1) |
1972 | .addReg(RegNo: DeadCarryReg, flags: RegState::Define | RegState::Dead) |
1973 | .addReg(RegNo: Addr.Base.HiReg, flags: 0, SubReg: Addr.Base.HiSubReg) |
1974 | .add(MO: OffsetHi) |
1975 | .addReg(RegNo: CarryReg, flags: RegState::Kill) |
1976 | .addImm(Val: 0); // clamp bit |
1977 | (void)HiHalf; |
1978 | LLVM_DEBUG(dbgs() << " " ; HiHalf->dump();); |
1979 | |
1980 | Register FullDestReg = MRI->createVirtualRegister(RegClass: TRI->getVGPR64Class()); |
1981 | MachineInstr *FullBase = |
1982 | BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg) |
1983 | .addReg(RegNo: DestSub0) |
1984 | .addImm(Val: AMDGPU::sub0) |
1985 | .addReg(RegNo: DestSub1) |
1986 | .addImm(Val: AMDGPU::sub1); |
1987 | (void)FullBase; |
1988 | LLVM_DEBUG(dbgs() << " " ; FullBase->dump(); dbgs() << "\n" ;); |
1989 | |
1990 | return FullDestReg; |
1991 | } |
1992 | |
1993 | // Update base and offset with the NewBase and NewOffset in MI. |
1994 | void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, |
1995 | Register NewBase, |
1996 | int32_t NewOffset) const { |
1997 | auto Base = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr); |
1998 | Base->setReg(NewBase); |
1999 | Base->setIsKill(false); |
2000 | TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->setImm(NewOffset); |
2001 | } |
2002 | |
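// Return the constant behind Op: the immediate itself, or the immediate of a
// unique S_MOV_B32 def feeding it (e.g. %off = S_MOV_B32 8000); otherwise
// std::nullopt.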
2003 | std::optional<int32_t> |
2004 | SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { |
2005 | if (Op.isImm()) |
2006 | return Op.getImm(); |
2007 | |
2008 | if (!Op.isReg()) |
2009 | return std::nullopt; |
2010 | |
2011 | MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Op.getReg()); |
2012 | if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || |
2013 | !Def->getOperand(i: 1).isImm()) |
2014 | return std::nullopt; |
2015 | |
2016 | return Def->getOperand(i: 1).getImm(); |
2017 | } |
2018 | |
// Analyze Base and extract:
// - 32-bit base registers and subregisters
// - 64-bit constant offset
2022 | // Expecting base computation as: |
2023 | // %OFFSET0:sgpr_32 = S_MOV_B32 8000 |
2024 | // %LO:vgpr_32, %c:sreg_64_xexec = |
2025 | // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, |
2026 | // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec |
2027 | // %Base:vreg_64 = |
2028 | // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 |
2029 | void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, |
2030 | MemAddress &Addr) const { |
2031 | if (!Base.isReg()) |
2032 | return; |
2033 | |
2034 | MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Base.getReg()); |
2035 | if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE |
2036 | || Def->getNumOperands() != 5) |
2037 | return; |
2038 | |
2039 | MachineOperand BaseLo = Def->getOperand(i: 1); |
2040 | MachineOperand BaseHi = Def->getOperand(i: 3); |
2041 | if (!BaseLo.isReg() || !BaseHi.isReg()) |
2042 | return; |
2043 | |
2044 | MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(Reg: BaseLo.getReg()); |
2045 | MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(Reg: BaseHi.getReg()); |
2046 | |
2047 | if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 || |
2048 | !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) |
2049 | return; |
2050 | |
2051 | const auto *Src0 = TII->getNamedOperand(MI&: *BaseLoDef, OperandName: AMDGPU::OpName::src0); |
2052 | const auto *Src1 = TII->getNamedOperand(MI&: *BaseLoDef, OperandName: AMDGPU::OpName::src1); |
2053 | |
2054 | auto Offset0P = extractConstOffset(Op: *Src0); |
2055 | if (Offset0P) |
2056 | BaseLo = *Src1; |
2057 | else { |
2058 | if (!(Offset0P = extractConstOffset(Op: *Src1))) |
2059 | return; |
2060 | BaseLo = *Src0; |
2061 | } |
2062 | |
2063 | Src0 = TII->getNamedOperand(MI&: *BaseHiDef, OperandName: AMDGPU::OpName::src0); |
2064 | Src1 = TII->getNamedOperand(MI&: *BaseHiDef, OperandName: AMDGPU::OpName::src1); |
2065 | |
2066 | if (Src0->isImm()) |
2067 | std::swap(a&: Src0, b&: Src1); |
2068 | |
2069 | if (!Src1->isImm() || Src0->isImm()) |
2070 | return; |
2071 | |
2072 | uint64_t Offset1 = Src1->getImm(); |
2073 | BaseHi = *Src0; |
2074 | |
2075 | Addr.Base.LoReg = BaseLo.getReg(); |
2076 | Addr.Base.HiReg = BaseHi.getReg(); |
2077 | Addr.Base.LoSubReg = BaseLo.getSubReg(); |
2078 | Addr.Base.HiSubReg = BaseHi.getSubReg(); |
2079 | Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); |
2080 | } |
2081 | |
2082 | bool SILoadStoreOptimizer::promoteConstantOffsetToImm( |
2083 | MachineInstr &MI, |
2084 | MemInfoMap &Visited, |
2085 | SmallPtrSet<MachineInstr *, 4> &AnchorList) const { |
2086 | |
2087 | if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI)) |
2088 | return false; |
2089 | |
2090 | // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers. |
2091 | if (SIInstrInfo::isFLATScratch(MI)) |
2092 | return false; |
2093 | |
2094 | unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS |
2095 | : AMDGPUAS::FLAT_ADDRESS; |
2096 | |
2097 | if (AnchorList.count(Ptr: &MI)) |
2098 | return false; |
2099 | |
2100 | LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor " ; MI.dump()); |
2101 | |
2102 | if (TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm()) { |
2103 | LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n" ;); |
2104 | return false; |
2105 | } |
2106 | |
  // Step1: Find the base registers and a 64-bit constant offset.
2108 | MachineOperand &Base = *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr); |
2109 | MemAddress MAddr; |
2110 | if (!Visited.contains(Val: &MI)) { |
2111 | processBaseWithConstOffset(Base, Addr&: MAddr); |
2112 | Visited[&MI] = MAddr; |
2113 | } else |
2114 | MAddr = Visited[&MI]; |
2115 | |
2116 | if (MAddr.Offset == 0) { |
2117 | LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no" |
2118 | " constant offsets that can be promoted.\n" ;); |
2119 | return false; |
2120 | } |
2121 | |
2122 | LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", " |
2123 | << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n" ;); |
2124 | |
  // Step2: Traverse through MI's basic block and find an anchor (one that has
  // the same base registers) with the highest 13-bit distance from MI's offset.
2127 | // E.g. (64bit loads) |
2128 | // bb: |
2129 | // addr1 = &a + 4096; load1 = load(addr1, 0) |
2130 | // addr2 = &a + 6144; load2 = load(addr2, 0) |
2131 | // addr3 = &a + 8192; load3 = load(addr3, 0) |
2132 | // addr4 = &a + 10240; load4 = load(addr4, 0) |
2133 | // addr5 = &a + 12288; load5 = load(addr5, 0) |
2134 | // |
2135 | // Starting from the first load, the optimization will try to find a new base |
  // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
  // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
  // &a + 8192 as the new base (anchor) because its larger distance can
  // presumably accommodate more intermediate bases.
2140 | // |
2141 | // Step3: move (&a + 8192) above load1. Compute and promote offsets from |
2142 | // (&a + 8192) for load1, load2, load4. |
2143 | // addr = &a + 8192 |
2144 | // load1 = load(addr, -4096) |
2145 | // load2 = load(addr, -2048) |
2146 | // load3 = load(addr, 0) |
2147 | // load4 = load(addr, 2048) |
2148 | // addr5 = &a + 12288; load5 = load(addr5, 0) |
2149 | // |
2150 | MachineInstr *AnchorInst = nullptr; |
2151 | MemAddress AnchorAddr; |
2152 | uint32_t MaxDist = std::numeric_limits<uint32_t>::min(); |
2153 | SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase; |
2154 | |
2155 | MachineBasicBlock *MBB = MI.getParent(); |
2156 | MachineBasicBlock::iterator E = MBB->end(); |
2157 | MachineBasicBlock::iterator MBBI = MI.getIterator(); |
2158 | ++MBBI; |
2159 | const SITargetLowering *TLI = |
2160 | static_cast<const SITargetLowering *>(STM->getTargetLowering()); |
2161 | |
2162 | for ( ; MBBI != E; ++MBBI) { |
2163 | MachineInstr &MINext = *MBBI; |
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
2166 | if (MINext.getOpcode() != MI.getOpcode() || |
2167 | TII->getNamedOperand(MI&: MINext, OperandName: AMDGPU::OpName::offset)->getImm()) |
2168 | continue; |
2169 | |
2170 | const MachineOperand &BaseNext = |
2171 | *TII->getNamedOperand(MI&: MINext, OperandName: AMDGPU::OpName::vaddr); |
2172 | MemAddress MAddrNext; |
2173 | if (!Visited.contains(Val: &MINext)) { |
2174 | processBaseWithConstOffset(Base: BaseNext, Addr&: MAddrNext); |
2175 | Visited[&MINext] = MAddrNext; |
2176 | } else |
2177 | MAddrNext = Visited[&MINext]; |
2178 | |
2179 | if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || |
2180 | MAddrNext.Base.HiReg != MAddr.Base.HiReg || |
2181 | MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || |
2182 | MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) |
2183 | continue; |
2184 | |
2185 | InstsWCommonBase.emplace_back(Args: &MINext, Args&: MAddrNext.Offset); |
2186 | |
2187 | int64_t Dist = MAddr.Offset - MAddrNext.Offset; |
2188 | TargetLoweringBase::AddrMode AM; |
2189 | AM.HasBaseReg = true; |
2190 | AM.BaseOffs = Dist; |
2191 | if (TLI->isLegalFlatAddressingMode(AM, AddrSpace: AS) && |
2192 | (uint32_t)std::abs(i: Dist) > MaxDist) { |
2193 | MaxDist = std::abs(i: Dist); |
2194 | |
2195 | AnchorAddr = MAddrNext; |
2196 | AnchorInst = &MINext; |
2197 | } |
2198 | } |
2199 | |
2200 | if (AnchorInst) { |
2201 | LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): " ; |
2202 | AnchorInst->dump()); |
2203 | LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " |
2204 | << AnchorAddr.Offset << "\n\n" ); |
2205 | |
2206 | // Instead of moving up, just re-compute anchor-instruction's base address. |
2207 | Register Base = computeBase(MI, Addr: AnchorAddr); |
2208 | |
2209 | updateBaseAndOffset(MI, NewBase: Base, NewOffset: MAddr.Offset - AnchorAddr.Offset); |
2210 | LLVM_DEBUG(dbgs() << " After promotion: " ; MI.dump();); |
2211 | |
2212 | for (auto [OtherMI, OtherOffset] : InstsWCommonBase) { |
2213 | TargetLoweringBase::AddrMode AM; |
2214 | AM.HasBaseReg = true; |
2215 | AM.BaseOffs = OtherOffset - AnchorAddr.Offset; |
2216 | |
2217 | if (TLI->isLegalFlatAddressingMode(AM, AddrSpace: AS)) { |
2218 | LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")" ; |
2219 | OtherMI->dump()); |
2220 | updateBaseAndOffset(MI&: *OtherMI, NewBase: Base, NewOffset: OtherOffset - AnchorAddr.Offset); |
2221 | LLVM_DEBUG(dbgs() << " After promotion: " ; OtherMI->dump()); |
2222 | } |
2223 | } |
2224 | AnchorList.insert(Ptr: AnchorInst); |
2225 | return true; |
2226 | } |
2227 | |
2228 | return false; |
2229 | } |
2230 | |
2231 | void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, |
2232 | std::list<std::list<CombineInfo> > &MergeableInsts) const { |
2233 | for (std::list<CombineInfo> &AddrList : MergeableInsts) { |
2234 | if (AddrList.front().InstClass == CI.InstClass && |
2235 | AddrList.front().IsAGPR == CI.IsAGPR && |
2236 | AddrList.front().hasSameBaseAddress(CI)) { |
2237 | AddrList.emplace_back(args: CI); |
2238 | return; |
2239 | } |
2240 | } |
2241 | |
2242 | // Base address not found, so add a new list. |
2243 | MergeableInsts.emplace_back(args: 1, args: CI); |
2244 | } |
2245 | |
2246 | std::pair<MachineBasicBlock::iterator, bool> |
2247 | SILoadStoreOptimizer::collectMergeableInsts( |
2248 | MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, |
2249 | MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, |
2250 | std::list<std::list<CombineInfo>> &MergeableInsts) const { |
2251 | bool Modified = false; |
2252 | |
  // Sort potentially mergeable instructions into lists, one per base address.
2254 | unsigned Order = 0; |
2255 | MachineBasicBlock::iterator BlockI = Begin; |
2256 | for (; BlockI != End; ++BlockI) { |
2257 | MachineInstr &MI = *BlockI; |
2258 | |
2259 | // We run this before checking if an address is mergeable, because it can produce |
2260 | // better code even if the instructions aren't mergeable. |
2261 | if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) |
2262 | Modified = true; |
2263 | |
    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. Merging may continue past such a barrier, but only in a
    // separate merge list.
2266 | if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) { |
2267 | LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI); |
2268 | |
2269 | // Search will resume after this instruction in a separate merge list. |
2270 | ++BlockI; |
2271 | break; |
2272 | } |
2273 | |
2274 | const InstClassEnum InstClass = getInstClass(Opc: MI.getOpcode(), TII: *TII); |
2275 | if (InstClass == UNKNOWN) |
2276 | continue; |
2277 | |
2278 | // Do not merge VMEM buffer instructions with "swizzled" bit set. |
2279 | int Swizzled = |
2280 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::swz); |
2281 | if (Swizzled != -1 && MI.getOperand(i: Swizzled).getImm()) |
2282 | continue; |
2283 | |
2284 | CombineInfo CI; |
2285 | CI.setMI(MI, LSO: *this); |
2286 | CI.Order = Order++; |
2287 | |
2288 | if (!CI.hasMergeableAddress(MRI: *MRI)) |
2289 | continue; |
2290 | |
2291 | if (CI.InstClass == DS_WRITE && CI.IsAGPR) { |
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However, we report that ds_write2 takes only VGPR
      //        data so that machine copy propagation does not create an
      //        illegal instruction that mixes VGPR and AGPR sources.
      //        Consequently, if we created such an instruction the verifier
      //        would complain.
2298 | continue; |
2299 | } |
2300 | |
2301 | LLVM_DEBUG(dbgs() << "Mergeable: " << MI); |
2302 | |
2303 | addInstToMergeableList(CI, MergeableInsts); |
2304 | } |
2305 | |
2306 | // At this point we have lists of Mergeable instructions. |
2307 | // |
2308 | // Part 2: Sort lists by offset and then for each CombineInfo object in the |
2309 | // list try to find an instruction that can be merged with I. If an instruction |
2310 | // is found, it is stored in the Paired field. If no instructions are found, then |
2311 | // the CombineInfo object is deleted from the list. |
2312 | |
2313 | for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), |
2314 | E = MergeableInsts.end(); I != E;) { |
2315 | |
2316 | std::list<CombineInfo> &MergeList = *I; |
2317 | if (MergeList.size() <= 1) { |
2318 | // This means we have found only one instruction with a given address |
2319 | // that can be merged, and we need at least 2 instructions to do a merge, |
2320 | // so this list can be discarded. |
2321 | I = MergeableInsts.erase(position: I); |
2322 | continue; |
2323 | } |
2324 | |
2325 | // Sort the lists by offsets, this way mergeable instructions will be |
2326 | // adjacent to each other in the list, which will make it easier to find |
2327 | // matches. |
2328 | MergeList.sort( |
2329 | [] (const CombineInfo &A, const CombineInfo &B) { |
2330 | return A.Offset < B.Offset; |
2331 | }); |
2332 | ++I; |
2333 | } |
2334 | |
2335 | return {BlockI, Modified}; |
2336 | } |
2337 | |
2338 | // Scan through looking for adjacent LDS operations with constant offsets from |
2339 | // the same base register. We rely on the scheduler to do the hard work of |
2340 | // clustering nearby loads, and assume these are all adjacent. |
2341 | bool SILoadStoreOptimizer::optimizeBlock( |
2342 | std::list<std::list<CombineInfo> > &MergeableInsts) { |
2343 | bool Modified = false; |
2344 | |
2345 | for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), |
2346 | E = MergeableInsts.end(); I != E;) { |
2347 | std::list<CombineInfo> &MergeList = *I; |
2348 | |
2349 | bool OptimizeListAgain = false; |
2350 | if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { |
2351 | // We weren't able to make any changes, so delete the list so we don't |
2352 | // process the same instructions the next time we try to optimize this |
2353 | // block. |
2354 | I = MergeableInsts.erase(position: I); |
2355 | continue; |
2356 | } |
2357 | |
2358 | Modified = true; |
2359 | |
    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
2362 | if (!OptimizeListAgain) { |
2363 | I = MergeableInsts.erase(position: I); |
2364 | continue; |
2365 | } |
2366 | OptimizeAgain = true; |
2367 | } |
2368 | return Modified; |
2369 | } |
2370 | |
2371 | bool |
2372 | SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( |
2373 | std::list<CombineInfo> &MergeList, |
2374 | bool &OptimizeListAgain) { |
2375 | if (MergeList.empty()) |
2376 | return false; |
2377 | |
2378 | bool Modified = false; |
2379 | |
2380 | for (auto I = MergeList.begin(), Next = std::next(x: I); Next != MergeList.end(); |
2381 | Next = std::next(x: I)) { |
2382 | |
2383 | auto First = I; |
2384 | auto Second = Next; |
2385 | |
2386 | if ((*First).Order > (*Second).Order) |
2387 | std::swap(a&: First, b&: Second); |
2388 | CombineInfo &CI = *First; |
2389 | CombineInfo &Paired = *Second; |
2390 | |
2391 | CombineInfo *Where = checkAndPrepareMerge(CI, Paired); |
2392 | if (!Where) { |
2393 | ++I; |
2394 | continue; |
2395 | } |
2396 | |
2397 | Modified = true; |
2398 | |
2399 | LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); |
2400 | |
2401 | MachineBasicBlock::iterator NewMI; |
2402 | switch (CI.InstClass) { |
2403 | default: |
2404 | llvm_unreachable("unknown InstClass" ); |
2405 | break; |
2406 | case DS_READ: |
2407 | NewMI = mergeRead2Pair(CI, Paired, InsertBefore: Where->I); |
2408 | break; |
2409 | case DS_WRITE: |
2410 | NewMI = mergeWrite2Pair(CI, Paired, InsertBefore: Where->I); |
2411 | break; |
2412 | case S_BUFFER_LOAD_IMM: |
2413 | case S_BUFFER_LOAD_SGPR_IMM: |
2414 | case S_LOAD_IMM: |
2415 | NewMI = mergeSMemLoadImmPair(CI, Paired, InsertBefore: Where->I); |
2416 | OptimizeListAgain |= CI.Width + Paired.Width < 8; |
2417 | break; |
2418 | case BUFFER_LOAD: |
2419 | NewMI = mergeBufferLoadPair(CI, Paired, InsertBefore: Where->I); |
2420 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2421 | break; |
2422 | case BUFFER_STORE: |
2423 | NewMI = mergeBufferStorePair(CI, Paired, InsertBefore: Where->I); |
2424 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2425 | break; |
2426 | case MIMG: |
2427 | NewMI = mergeImagePair(CI, Paired, InsertBefore: Where->I); |
2428 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2429 | break; |
2430 | case TBUFFER_LOAD: |
2431 | NewMI = mergeTBufferLoadPair(CI, Paired, InsertBefore: Where->I); |
2432 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2433 | break; |
2434 | case TBUFFER_STORE: |
2435 | NewMI = mergeTBufferStorePair(CI, Paired, InsertBefore: Where->I); |
2436 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2437 | break; |
2438 | case FLAT_LOAD: |
2439 | case GLOBAL_LOAD: |
2440 | case GLOBAL_LOAD_SADDR: |
2441 | NewMI = mergeFlatLoadPair(CI, Paired, InsertBefore: Where->I); |
2442 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2443 | break; |
2444 | case FLAT_STORE: |
2445 | case GLOBAL_STORE: |
2446 | case GLOBAL_STORE_SADDR: |
2447 | NewMI = mergeFlatStorePair(CI, Paired, InsertBefore: Where->I); |
2448 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2449 | break; |
2450 | } |
2451 | CI.setMI(MI: NewMI, LSO: *this); |
2452 | CI.Order = Where->Order; |
2453 | if (I == Second) |
2454 | I = Next; |
2455 | |
2456 | MergeList.erase(position: Second); |
2457 | } |
2458 | |
2459 | return Modified; |
2460 | } |
2461 | |
2462 | bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { |
2463 | if (skipFunction(F: MF.getFunction())) |
2464 | return false; |
2465 | |
2466 | STM = &MF.getSubtarget<GCNSubtarget>(); |
2467 | if (!STM->loadStoreOptEnabled()) |
2468 | return false; |
2469 | |
2470 | TII = STM->getInstrInfo(); |
2471 | TRI = &TII->getRegisterInfo(); |
2472 | |
2473 | MRI = &MF.getRegInfo(); |
2474 | AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
2475 | |
2476 | LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n" ); |
2477 | |
2478 | bool Modified = false; |
2479 | |
  // Contains the list of instructions for which constant offsets are being
  // promoted to the immediate. This is tracked for an entire block at a time.
2482 | SmallPtrSet<MachineInstr *, 4> AnchorList; |
2483 | MemInfoMap Visited; |
2484 | |
2485 | for (MachineBasicBlock &MBB : MF) { |
2486 | MachineBasicBlock::iterator SectionEnd; |
2487 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
2488 | I = SectionEnd) { |
2489 | bool CollectModified; |
2490 | std::list<std::list<CombineInfo>> MergeableInsts; |
2491 | |
2492 | // First pass: Collect list of all instructions we know how to merge in a |
2493 | // subset of the block. |
2494 | std::tie(args&: SectionEnd, args&: CollectModified) = |
2495 | collectMergeableInsts(Begin: I, End: E, Visited, AnchorList, MergeableInsts); |
2496 | |
2497 | Modified |= CollectModified; |
2498 | |
2499 | do { |
2500 | OptimizeAgain = false; |
2501 | Modified |= optimizeBlock(MergeableInsts); |
2502 | } while (OptimizeAgain); |
2503 | } |
2504 | |
2505 | Visited.clear(); |
2506 | AnchorList.clear(); |
2507 | } |
2508 | |
2509 | return Modified; |
2510 | } |
2511 | |