//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
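// (The ds_read2 offsets are encoded in units of the element size, so the byte
// offsets 16 and 32 above become offset0:4 and offset1:8 for b32 elements.)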
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from a nearby instruction that
// allows it to have a 13-bit constant offset and then promotes that 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputation seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit offset
//   fields, but are close enough together, we can add to the base pointer and
//   use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//
59
60#include "SILoadStoreOptimizer.h"
61#include "AMDGPU.h"
62#include "GCNSubtarget.h"
63#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
64#include "llvm/Analysis/AliasAnalysis.h"
65#include "llvm/CodeGen/MachineFunctionPass.h"
66#include "llvm/InitializePasses.h"
67
68using namespace llvm;
69
70#define DEBUG_TYPE "si-load-store-opt"
71
72namespace {
73enum InstClassEnum {
74 UNKNOWN,
75 DS_READ,
76 DS_WRITE,
77 S_BUFFER_LOAD_IMM,
78 S_BUFFER_LOAD_SGPR_IMM,
79 S_LOAD_IMM,
80 BUFFER_LOAD,
81 BUFFER_STORE,
82 MIMG,
83 TBUFFER_LOAD,
84 TBUFFER_STORE,
85 GLOBAL_LOAD_SADDR,
86 GLOBAL_STORE_SADDR,
87 FLAT_LOAD,
88 FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo; they are only ever returned by
               // getCommonInstClass.
92};
93
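// Describes which address operands a memory instruction has. Filled in by
// getRegs() below and used when recording the address operands of a merge
// candidate.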
94struct AddressRegs {
95 unsigned char NumVAddrs = 0;
96 bool SBase = false;
97 bool SRsrc = false;
98 bool SOffset = false;
99 bool SAddr = false;
100 bool VAddr = false;
101 bool Addr = false;
102 bool SSamp = false;
103};
104
105// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
106const unsigned MaxAddressRegs = 12 + 1 + 1;
107
108class SILoadStoreOptimizer {
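  // Bookkeeping for one load/store that is a candidate for merging, gathered
  // by setMI() from the underlying MachineInstr.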
109 struct CombineInfo {
110 MachineBasicBlock::iterator I;
111 unsigned EltSize;
112 unsigned Offset;
113 unsigned Width;
114 unsigned Format;
115 unsigned BaseOff;
116 unsigned DMask;
117 InstClassEnum InstClass;
118 unsigned CPol = 0;
119 bool IsAGPR;
120 bool UseST64;
121 int AddrIdx[MaxAddressRegs];
122 const MachineOperand *AddrReg[MaxAddressRegs];
123 unsigned NumAddresses;
124 unsigned Order;
125
126 bool hasSameBaseAddress(const CombineInfo &CI) {
127 if (NumAddresses != CI.NumAddresses)
128 return false;
129
130 const MachineInstr &MI = *CI.I;
131 for (unsigned i = 0; i < NumAddresses; i++) {
132 const MachineOperand &AddrRegNext = MI.getOperand(i: AddrIdx[i]);
133
134 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
135 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
136 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
137 return false;
138 }
139 continue;
140 }
141
142 // Check same base pointer. Be careful of subregisters, which can occur
143 // with vectors of pointers.
144 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
145 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
146 return false;
147 }
148 }
149 return true;
150 }
151
152 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
153 for (unsigned i = 0; i < NumAddresses; ++i) {
154 const MachineOperand *AddrOp = AddrReg[i];
155 // Immediates are always OK.
156 if (AddrOp->isImm())
157 continue;
158
        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
162 if (!AddrOp->isReg())
163 return false;
164
165 // TODO: We should be able to merge instructions with other physical reg
166 // addresses too.
167 if (AddrOp->getReg().isPhysical() &&
168 AddrOp->getReg() != AMDGPU::SGPR_NULL)
169 return false;
170
171 // If an address has only one use then there will be no other
172 // instructions with the same address, so we can't merge this one.
173 if (MRI.hasOneNonDBGUse(RegNo: AddrOp->getReg()))
174 return false;
175 }
176 return true;
177 }
178
179 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
180
181 // Compare by pointer order.
182 bool operator<(const CombineInfo& Other) const {
183 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
184 }
185 };
186
187 struct BaseRegisters {
188 Register LoReg;
189 Register HiReg;
190
191 unsigned LoSubReg = 0;
192 unsigned HiSubReg = 0;
193 };
194
195 struct MemAddress {
196 BaseRegisters Base;
197 int64_t Offset = 0;
198 };
199
200 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
201
202private:
203 const GCNSubtarget *STM = nullptr;
204 const SIInstrInfo *TII = nullptr;
205 const SIRegisterInfo *TRI = nullptr;
206 MachineRegisterInfo *MRI = nullptr;
207 AliasAnalysis *AA = nullptr;
208 bool OptimizeAgain;
209
210 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
211 const DenseSet<Register> &ARegUses,
212 const MachineInstr &A, const MachineInstr &B) const;
213 static bool dmasksCanBeCombined(const CombineInfo &CI,
214 const SIInstrInfo &TII,
215 const CombineInfo &Paired);
216 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
217 CombineInfo &Paired, bool Modify = false);
218 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
219 const CombineInfo &Paired);
220 unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
221 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
222 const CombineInfo &Paired);
223 const TargetRegisterClass *
224 getTargetRegisterClass(const CombineInfo &CI,
225 const CombineInfo &Paired) const;
226 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
227
228 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
229
230 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
231 MachineBasicBlock::iterator InsertBefore,
232 AMDGPU::OpName OpName, Register DestReg) const;
233 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
234 MachineBasicBlock::iterator InsertBefore,
235 AMDGPU::OpName OpName) const;
236
237 unsigned read2Opcode(unsigned EltSize) const;
238 unsigned read2ST64Opcode(unsigned EltSize) const;
239 MachineBasicBlock::iterator
240 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
241 MachineBasicBlock::iterator InsertBefore);
242
243 unsigned write2Opcode(unsigned EltSize) const;
244 unsigned write2ST64Opcode(unsigned EltSize) const;
245 MachineBasicBlock::iterator
246 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
247 MachineBasicBlock::iterator InsertBefore);
248 MachineBasicBlock::iterator
249 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
250 MachineBasicBlock::iterator InsertBefore);
251 MachineBasicBlock::iterator
252 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
253 MachineBasicBlock::iterator InsertBefore);
254 MachineBasicBlock::iterator
255 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
256 MachineBasicBlock::iterator InsertBefore);
257 MachineBasicBlock::iterator
258 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
259 MachineBasicBlock::iterator InsertBefore);
260 MachineBasicBlock::iterator
261 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
262 MachineBasicBlock::iterator InsertBefore);
263 MachineBasicBlock::iterator
264 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
265 MachineBasicBlock::iterator InsertBefore);
266 MachineBasicBlock::iterator
267 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
268 MachineBasicBlock::iterator InsertBefore);
269 MachineBasicBlock::iterator
270 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
271 MachineBasicBlock::iterator InsertBefore);
272
273 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
274 int32_t NewOffset) const;
275 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
276 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
277 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
278 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
282 bool promoteConstantOffsetToImm(MachineInstr &CI,
283 MemInfoMap &Visited,
284 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
285 void addInstToMergeableList(const CombineInfo &CI,
286 std::list<std::list<CombineInfo> > &MergeableInsts) const;
287
288 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
289 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
290 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
291 std::list<std::list<CombineInfo>> &MergeableInsts) const;
292
293 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
294 const CombineInfo &Paired);
295
296 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
297 const CombineInfo &Paired);
298
299 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
300 bool &OptimizeListAgain);
301 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
302
303public:
304 SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
305 bool run(MachineFunction &MF);
306};
307
308class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
309public:
310 static char ID;
311
312 SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}
313
314 bool runOnMachineFunction(MachineFunction &MF) override;
315
316 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
317
318 void getAnalysisUsage(AnalysisUsage &AU) const override {
319 AU.setPreservesCFG();
320 AU.addRequired<AAResultsWrapperPass>();
321
322 MachineFunctionPass::getAnalysisUsage(AU);
323 }
324
325 MachineFunctionProperties getRequiredProperties() const override {
326 return MachineFunctionProperties().setIsSSA();
327 }
328};
329
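// Returns the width of \p MI's data in dwords (for image instructions, the
// number of enabled dmask channels), or 0 for opcodes this pass does not
// handle.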
330static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
331 const unsigned Opc = MI.getOpcode();
332
333 if (TII.isMUBUF(Opcode: Opc)) {
334 // FIXME: Handle d16 correctly
335 return AMDGPU::getMUBUFElements(Opc);
336 }
337 if (TII.isImage(MI)) {
338 uint64_t DMaskImm =
339 TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask)->getImm();
340 return llvm::popcount(Value: DMaskImm);
341 }
342 if (TII.isMTBUF(Opcode: Opc)) {
343 return AMDGPU::getMTBUFElements(Opc);
344 }
345
346 switch (Opc) {
347 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
348 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
349 case AMDGPU::S_LOAD_DWORD_IMM:
350 case AMDGPU::GLOBAL_LOAD_DWORD:
351 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
352 case AMDGPU::GLOBAL_STORE_DWORD:
353 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
354 case AMDGPU::FLAT_LOAD_DWORD:
355 case AMDGPU::FLAT_STORE_DWORD:
356 return 1;
357 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
358 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
359 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
360 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
361 case AMDGPU::S_LOAD_DWORDX2_IMM:
362 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
363 case AMDGPU::GLOBAL_LOAD_DWORDX2:
364 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
365 case AMDGPU::GLOBAL_STORE_DWORDX2:
366 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
367 case AMDGPU::FLAT_LOAD_DWORDX2:
368 case AMDGPU::FLAT_STORE_DWORDX2:
369 return 2;
370 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
371 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
372 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
373 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
374 case AMDGPU::S_LOAD_DWORDX3_IMM:
375 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
376 case AMDGPU::GLOBAL_LOAD_DWORDX3:
377 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
378 case AMDGPU::GLOBAL_STORE_DWORDX3:
379 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
380 case AMDGPU::FLAT_LOAD_DWORDX3:
381 case AMDGPU::FLAT_STORE_DWORDX3:
382 return 3;
383 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
384 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
385 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
386 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
387 case AMDGPU::S_LOAD_DWORDX4_IMM:
388 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
389 case AMDGPU::GLOBAL_LOAD_DWORDX4:
390 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
391 case AMDGPU::GLOBAL_STORE_DWORDX4:
392 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
393 case AMDGPU::FLAT_LOAD_DWORDX4:
394 case AMDGPU::FLAT_STORE_DWORDX4:
395 return 4;
396 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
397 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
398 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
399 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
400 case AMDGPU::S_LOAD_DWORDX8_IMM:
401 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
402 return 8;
403 case AMDGPU::DS_READ_B32:
404 case AMDGPU::DS_READ_B32_gfx9:
405 case AMDGPU::DS_WRITE_B32:
406 case AMDGPU::DS_WRITE_B32_gfx9:
407 return 1;
408 case AMDGPU::DS_READ_B64:
409 case AMDGPU::DS_READ_B64_gfx9:
410 case AMDGPU::DS_WRITE_B64:
411 case AMDGPU::DS_WRITE_B64_gfx9:
412 return 2;
413 default:
414 return 0;
415 }
416}
417
418/// Maps instruction opcode to enum InstClassEnum.
419static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
420 switch (Opc) {
421 default:
422 if (TII.isMUBUF(Opcode: Opc)) {
423 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
424 default:
425 return UNKNOWN;
426 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
427 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
428 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
429 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
430 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
431 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
432 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
433 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
434 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
435 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
436 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
437 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
438 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
439 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
440 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
441 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
442 return BUFFER_LOAD;
443 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
444 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
445 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
446 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
447 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
448 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
449 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
450 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
451 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
452 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
453 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
454 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
455 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
456 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
457 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
458 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
459 return BUFFER_STORE;
460 }
461 }
462 if (TII.isImage(Opcode: Opc)) {
463 // Ignore instructions encoded without vaddr.
464 if (!AMDGPU::hasNamedOperand(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr) &&
465 !AMDGPU::hasNamedOperand(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr0))
466 return UNKNOWN;
467 // Ignore BVH instructions
468 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
469 return UNKNOWN;
470 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
471 if (TII.get(Opcode: Opc).mayStore() || !TII.get(Opcode: Opc).mayLoad() ||
472 TII.isGather4(Opcode: Opc))
473 return UNKNOWN;
474 return MIMG;
475 }
476 if (TII.isMTBUF(Opcode: Opc)) {
477 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
478 default:
479 return UNKNOWN;
480 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
481 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
482 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
483 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
484 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
485 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
486 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
487 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
488 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
489 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
490 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
491 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
492 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
493 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
494 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
495 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
496 return TBUFFER_LOAD;
497 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
498 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
499 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
500 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
501 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
502 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
503 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
504 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
505 return TBUFFER_STORE;
506 }
507 }
508 return UNKNOWN;
509 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
510 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
511 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
512 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
513 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
514 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
515 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
516 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
517 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
518 return S_BUFFER_LOAD_IMM;
519 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
520 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
521 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
522 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
523 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
524 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
525 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
526 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
527 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
528 return S_BUFFER_LOAD_SGPR_IMM;
529 case AMDGPU::S_LOAD_DWORD_IMM:
530 case AMDGPU::S_LOAD_DWORDX2_IMM:
531 case AMDGPU::S_LOAD_DWORDX3_IMM:
532 case AMDGPU::S_LOAD_DWORDX4_IMM:
533 case AMDGPU::S_LOAD_DWORDX8_IMM:
534 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
535 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
536 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
537 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
538 return S_LOAD_IMM;
539 case AMDGPU::DS_READ_B32:
540 case AMDGPU::DS_READ_B32_gfx9:
541 case AMDGPU::DS_READ_B64:
542 case AMDGPU::DS_READ_B64_gfx9:
543 return DS_READ;
544 case AMDGPU::DS_WRITE_B32:
545 case AMDGPU::DS_WRITE_B32_gfx9:
546 case AMDGPU::DS_WRITE_B64:
547 case AMDGPU::DS_WRITE_B64_gfx9:
548 return DS_WRITE;
549 case AMDGPU::GLOBAL_LOAD_DWORD:
550 case AMDGPU::GLOBAL_LOAD_DWORDX2:
551 case AMDGPU::GLOBAL_LOAD_DWORDX3:
552 case AMDGPU::GLOBAL_LOAD_DWORDX4:
553 case AMDGPU::FLAT_LOAD_DWORD:
554 case AMDGPU::FLAT_LOAD_DWORDX2:
555 case AMDGPU::FLAT_LOAD_DWORDX3:
556 case AMDGPU::FLAT_LOAD_DWORDX4:
557 return FLAT_LOAD;
558 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
559 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
560 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
561 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
562 return GLOBAL_LOAD_SADDR;
563 case AMDGPU::GLOBAL_STORE_DWORD:
564 case AMDGPU::GLOBAL_STORE_DWORDX2:
565 case AMDGPU::GLOBAL_STORE_DWORDX3:
566 case AMDGPU::GLOBAL_STORE_DWORDX4:
567 case AMDGPU::FLAT_STORE_DWORD:
568 case AMDGPU::FLAT_STORE_DWORDX2:
569 case AMDGPU::FLAT_STORE_DWORDX3:
570 case AMDGPU::FLAT_STORE_DWORDX4:
571 return FLAT_STORE;
572 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
573 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
574 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
575 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
576 return GLOBAL_STORE_SADDR;
577 }
578}
579
/// Determines instruction subclass from opcode. Only instructions of the same
/// subclass can be merged together. The merged instruction may have a
/// different subclass but must have the same class.
583static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
584 switch (Opc) {
585 default:
586 if (TII.isMUBUF(Opcode: Opc))
587 return AMDGPU::getMUBUFBaseOpcode(Opc);
588 if (TII.isImage(Opcode: Opc)) {
589 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
590 assert(Info);
591 return Info->BaseOpcode;
592 }
593 if (TII.isMTBUF(Opcode: Opc))
594 return AMDGPU::getMTBUFBaseOpcode(Opc);
595 return -1;
596 case AMDGPU::DS_READ_B32:
597 case AMDGPU::DS_READ_B32_gfx9:
598 case AMDGPU::DS_READ_B64:
599 case AMDGPU::DS_READ_B64_gfx9:
600 case AMDGPU::DS_WRITE_B32:
601 case AMDGPU::DS_WRITE_B32_gfx9:
602 case AMDGPU::DS_WRITE_B64:
603 case AMDGPU::DS_WRITE_B64_gfx9:
604 return Opc;
605 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
606 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
607 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
608 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
609 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
610 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
611 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
612 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
613 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
614 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
615 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
616 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
617 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
618 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
619 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
620 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
621 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
622 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
623 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
624 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
625 case AMDGPU::S_LOAD_DWORD_IMM:
626 case AMDGPU::S_LOAD_DWORDX2_IMM:
627 case AMDGPU::S_LOAD_DWORDX3_IMM:
628 case AMDGPU::S_LOAD_DWORDX4_IMM:
629 case AMDGPU::S_LOAD_DWORDX8_IMM:
630 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
631 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
632 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
633 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
634 return AMDGPU::S_LOAD_DWORD_IMM;
635 case AMDGPU::GLOBAL_LOAD_DWORD:
636 case AMDGPU::GLOBAL_LOAD_DWORDX2:
637 case AMDGPU::GLOBAL_LOAD_DWORDX3:
638 case AMDGPU::GLOBAL_LOAD_DWORDX4:
639 case AMDGPU::FLAT_LOAD_DWORD:
640 case AMDGPU::FLAT_LOAD_DWORDX2:
641 case AMDGPU::FLAT_LOAD_DWORDX3:
642 case AMDGPU::FLAT_LOAD_DWORDX4:
643 return AMDGPU::FLAT_LOAD_DWORD;
644 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
645 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
646 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
647 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
648 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
649 case AMDGPU::GLOBAL_STORE_DWORD:
650 case AMDGPU::GLOBAL_STORE_DWORDX2:
651 case AMDGPU::GLOBAL_STORE_DWORDX3:
652 case AMDGPU::GLOBAL_STORE_DWORDX4:
653 case AMDGPU::FLAT_STORE_DWORD:
654 case AMDGPU::FLAT_STORE_DWORDX2:
655 case AMDGPU::FLAT_STORE_DWORDX3:
656 case AMDGPU::FLAT_STORE_DWORDX4:
657 return AMDGPU::FLAT_STORE_DWORD;
658 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
659 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
660 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
661 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
662 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
663 }
664}
665
// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT.
// For other instructions, return the original unmodified class.
672InstClassEnum
673SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
674 const CombineInfo &Paired) {
675 assert(CI.InstClass == Paired.InstClass);
676
677 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
678 SIInstrInfo::isFLATGlobal(MI: *CI.I) && SIInstrInfo::isFLATGlobal(MI: *Paired.I))
679 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
680
681 return CI.InstClass;
682}
683
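// Returns which address operands (vaddr, srsrc, soffset, saddr, ...) the
// opcode \p Opc uses.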
684static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
685 AddressRegs Result;
686
687 if (TII.isMUBUF(Opcode: Opc)) {
688 if (AMDGPU::getMUBUFHasVAddr(Opc))
689 Result.VAddr = true;
690 if (AMDGPU::getMUBUFHasSrsrc(Opc))
691 Result.SRsrc = true;
692 if (AMDGPU::getMUBUFHasSoffset(Opc))
693 Result.SOffset = true;
694
695 return Result;
696 }
697
698 if (TII.isImage(Opcode: Opc)) {
699 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0);
700 if (VAddr0Idx >= 0) {
701 AMDGPU::OpName RsrcName =
702 TII.isMIMG(Opcode: Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
703 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: RsrcName);
704 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
705 } else {
706 Result.VAddr = true;
707 }
708 Result.SRsrc = true;
709 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
710 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode)->Sampler)
711 Result.SSamp = true;
712
713 return Result;
714 }
715 if (TII.isMTBUF(Opcode: Opc)) {
716 if (AMDGPU::getMTBUFHasVAddr(Opc))
717 Result.VAddr = true;
718 if (AMDGPU::getMTBUFHasSrsrc(Opc))
719 Result.SRsrc = true;
720 if (AMDGPU::getMTBUFHasSoffset(Opc))
721 Result.SOffset = true;
722
723 return Result;
724 }
725
726 switch (Opc) {
727 default:
728 return Result;
729 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
730 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
731 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
732 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
733 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
734 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
735 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
736 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
737 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
738 Result.SOffset = true;
739 [[fallthrough]];
740 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
741 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
742 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
743 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
744 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
745 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
746 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
747 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
748 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
749 case AMDGPU::S_LOAD_DWORD_IMM:
750 case AMDGPU::S_LOAD_DWORDX2_IMM:
751 case AMDGPU::S_LOAD_DWORDX3_IMM:
752 case AMDGPU::S_LOAD_DWORDX4_IMM:
753 case AMDGPU::S_LOAD_DWORDX8_IMM:
754 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
755 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
756 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
757 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
758 Result.SBase = true;
759 return Result;
760 case AMDGPU::DS_READ_B32:
761 case AMDGPU::DS_READ_B64:
762 case AMDGPU::DS_READ_B32_gfx9:
763 case AMDGPU::DS_READ_B64_gfx9:
764 case AMDGPU::DS_WRITE_B32:
765 case AMDGPU::DS_WRITE_B64:
766 case AMDGPU::DS_WRITE_B32_gfx9:
767 case AMDGPU::DS_WRITE_B64_gfx9:
768 Result.Addr = true;
769 return Result;
770 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
771 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
772 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
773 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
774 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
775 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
776 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
777 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
778 Result.SAddr = true;
779 [[fallthrough]];
780 case AMDGPU::GLOBAL_LOAD_DWORD:
781 case AMDGPU::GLOBAL_LOAD_DWORDX2:
782 case AMDGPU::GLOBAL_LOAD_DWORDX3:
783 case AMDGPU::GLOBAL_LOAD_DWORDX4:
784 case AMDGPU::GLOBAL_STORE_DWORD:
785 case AMDGPU::GLOBAL_STORE_DWORDX2:
786 case AMDGPU::GLOBAL_STORE_DWORDX3:
787 case AMDGPU::GLOBAL_STORE_DWORDX4:
788 case AMDGPU::FLAT_LOAD_DWORD:
789 case AMDGPU::FLAT_LOAD_DWORDX2:
790 case AMDGPU::FLAT_LOAD_DWORDX3:
791 case AMDGPU::FLAT_LOAD_DWORDX4:
792 case AMDGPU::FLAT_STORE_DWORD:
793 case AMDGPU::FLAT_STORE_DWORDX2:
794 case AMDGPU::FLAT_STORE_DWORDX3:
795 case AMDGPU::FLAT_STORE_DWORDX4:
796 Result.VAddr = true;
797 return Result;
798 }
799}
800
801void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
802 const SILoadStoreOptimizer &LSO) {
803 I = MI;
804 unsigned Opc = MI->getOpcode();
805 InstClass = getInstClass(Opc, TII: *LSO.TII);
806
807 if (InstClass == UNKNOWN)
808 return;
809
810 IsAGPR = LSO.TRI->hasAGPRs(RC: LSO.getDataRegClass(MI: *MI));
811
812 switch (InstClass) {
813 case DS_READ:
814 EltSize =
815 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
816 : 4;
817 break;
818 case DS_WRITE:
819 EltSize =
820 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
821 : 4;
822 break;
823 case S_BUFFER_LOAD_IMM:
824 case S_BUFFER_LOAD_SGPR_IMM:
825 case S_LOAD_IMM:
826 EltSize = AMDGPU::convertSMRDOffsetUnits(ST: *LSO.STM, ByteOffset: 4);
827 break;
828 default:
829 EltSize = 4;
830 break;
831 }
832
833 if (InstClass == MIMG) {
834 DMask = LSO.TII->getNamedOperand(MI&: *I, OperandName: AMDGPU::OpName::dmask)->getImm();
835 // Offset is not considered for MIMG instructions.
836 Offset = 0;
837 } else {
838 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::offset);
839 Offset = I->getOperand(i: OffsetIdx).getImm();
840 }
841
842 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
843 Format = LSO.TII->getNamedOperand(MI&: *I, OperandName: AMDGPU::OpName::format)->getImm();
844
845 Width = getOpcodeWidth(MI: *I, TII: *LSO.TII);
846
847 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
848 Offset &= 0xffff;
849 } else if (InstClass != MIMG) {
850 CPol = LSO.TII->getNamedOperand(MI&: *I, OperandName: AMDGPU::OpName::cpol)->getImm();
851 }
852
853 AddressRegs Regs = getRegs(Opc, TII: *LSO.TII);
854 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(MI: *I) || LSO.TII->isVSAMPLE(MI: *I);
855
856 NumAddresses = 0;
857 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
858 AddrIdx[NumAddresses++] =
859 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0) + J;
860 if (Regs.Addr)
861 AddrIdx[NumAddresses++] =
862 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::addr);
863 if (Regs.SBase)
864 AddrIdx[NumAddresses++] =
865 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::sbase);
866 if (Regs.SRsrc)
867 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
868 Opcode: Opc, Name: isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
869 if (Regs.SOffset)
870 AddrIdx[NumAddresses++] =
871 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::soffset);
872 if (Regs.SAddr)
873 AddrIdx[NumAddresses++] =
874 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::saddr);
875 if (Regs.VAddr)
876 AddrIdx[NumAddresses++] =
877 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr);
878 if (Regs.SSamp)
879 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
880 Opcode: Opc, Name: isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
881 assert(NumAddresses <= MaxAddressRegs);
882
883 for (unsigned J = 0; J < NumAddresses; J++)
884 AddrReg[J] = &I->getOperand(i: AddrIdx[J]);
885}
886
887} // end anonymous namespace.
888
889INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
890 "SI Load Store Optimizer", false, false)
891INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
892INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
893 "SI Load Store Optimizer", false, false)
894
895char SILoadStoreOptimizerLegacy::ID = 0;
896
897char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;
898
899FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
900 return new SILoadStoreOptimizerLegacy();
901}
902
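// Collect the registers defined and the registers read by \p MI into
// \p RegDefs and \p RegUses respectively.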
903static void addDefsUsesToList(const MachineInstr &MI,
904 DenseSet<Register> &RegDefs,
905 DenseSet<Register> &RegUses) {
906 for (const auto &Op : MI.operands()) {
907 if (!Op.isReg())
908 continue;
909 if (Op.isDef())
910 RegDefs.insert(V: Op.getReg());
911 if (Op.readsReg())
912 RegUses.insert(V: Op.getReg());
913 }
914}
915
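// Check whether instruction \p A, whose defs and uses have been collected into
// \p ARegDefs and \p ARegUses, can be reordered past instruction \p B: there
// must be no aliasing memory access and no register dependence between them.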
916bool SILoadStoreOptimizer::canSwapInstructions(
917 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
918 const MachineInstr &A, const MachineInstr &B) const {
919 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
920 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, Other: B, UseTBAA: true))
921 return false;
922 for (const auto &BOp : B.operands()) {
923 if (!BOp.isReg())
924 continue;
925 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(V: BOp.getReg()))
926 return false;
927 if (BOp.isDef() && ARegUses.contains(V: BOp.getReg()))
928 return false;
929 }
930 return true;
931}
932
// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
935MachineMemOperand *
936SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
937 const CombineInfo &Paired) {
938 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
939 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
940
941 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
942
  // The base pointer for the combined operation is the same as the leading
  // operation's pointer.
945 if (Paired < CI)
946 std::swap(a&: MMOa, b&: MMOb);
947
948 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
949 // If merging FLAT and GLOBAL set address space to FLAT.
950 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
951 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
952
953 MachineFunction *MF = CI.I->getMF();
954 return MF->getMachineMemOperand(MMO: MMOa, PtrInfo, Size);
955}
956
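// Check whether two image (MIMG) instructions can be combined: tfe/lwe must be
// clear, the other optional immediate operands must match, and all bits of the
// smaller dmask must lie below the lowest set bit of the larger dmask.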
957bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
958 const SIInstrInfo &TII,
959 const CombineInfo &Paired) {
960 assert(CI.InstClass == MIMG);
961
962 // Ignore instructions with tfe/lwe set.
963 const auto *TFEOp = TII.getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::tfe);
964 const auto *LWEOp = TII.getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::lwe);
965
966 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
967 return false;
968
969 // Check other optional immediate operands for equality.
970 AMDGPU::OpName OperandsToMatch[] = {
971 AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
972 AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16};
973
974 for (AMDGPU::OpName op : OperandsToMatch) {
975 int Idx = AMDGPU::getNamedOperandIdx(Opcode: CI.I->getOpcode(), Name: op);
976 if (AMDGPU::getNamedOperandIdx(Opcode: Paired.I->getOpcode(), Name: op) != Idx)
977 return false;
978 if (Idx != -1 &&
979 CI.I->getOperand(i: Idx).getImm() != Paired.I->getOperand(i: Idx).getImm())
980 return false;
981 }
982
983 // Check DMask for overlaps.
984 unsigned MaxMask = std::max(a: CI.DMask, b: Paired.DMask);
985 unsigned MinMask = std::min(a: CI.DMask, b: Paired.DMask);
986
987 if (!MaxMask)
988 return false;
989
990 unsigned AllowedBitsForMin = llvm::countr_zero(Val: MaxMask);
991 if ((1u << AllowedBitsForMin) <= MinMask)
992 return false;
993
994 return true;
995}
996
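// Return the buffer format that has the same bits-per-component and numeric
// format as \p OldFormat but \p ComponentCount components, or 0 if no such
// format exists on this subtarget.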
997static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
998 unsigned ComponentCount,
999 const GCNSubtarget &STI) {
1000 if (ComponentCount > 4)
1001 return 0;
1002
1003 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
1004 llvm::AMDGPU::getGcnBufferFormatInfo(Format: OldFormat, STI);
1005 if (!OldFormatInfo)
1006 return 0;
1007
1008 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
1009 llvm::AMDGPU::getGcnBufferFormatInfo(BitsPerComp: OldFormatInfo->BitsPerComp,
1010 NumComponents: ComponentCount,
1011 NumFormat: OldFormatInfo->NumFormat, STI);
1012
1013 if (!NewFormatInfo)
1014 return 0;
1015
1016 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
1017 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
1018
1019 return NewFormatInfo->Format;
1020}
1021
// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
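// For example, mostAlignedValueInRange(37, 58) == 48: 48 is the only multiple
// of 16 in the range, and no multiple of a higher power of two fits.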
1028static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
1029 return Hi & maskLeadingOnes<uint32_t>(N: llvm::countl_zero(Val: (Lo - 1) ^ Hi) + 1);
1030}
1031
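// Check whether the offsets of \p CI and \p Paired allow the two accesses to
// be combined into one wider access. If \p Modify is true, also rewrite the
// DS offsets (and, if needed, a base offset and the ST64 flag) for use by the
// merged instruction.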
1032bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1033 const GCNSubtarget &STI,
1034 CombineInfo &Paired,
1035 bool Modify) {
1036 assert(CI.InstClass != MIMG);
1037
1038 // XXX - Would the same offset be OK? Is there any reason this would happen or
1039 // be useful?
1040 if (CI.Offset == Paired.Offset)
1041 return false;
1042
1043 // This won't be valid if the offset isn't aligned.
1044 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1045 return false;
1046
1047 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1048
1049 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1050 llvm::AMDGPU::getGcnBufferFormatInfo(Format: CI.Format, STI);
1051 if (!Info0)
1052 return false;
1053 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1054 llvm::AMDGPU::getGcnBufferFormatInfo(Format: Paired.Format, STI);
1055 if (!Info1)
1056 return false;
1057
1058 if (Info0->BitsPerComp != Info1->BitsPerComp ||
1059 Info0->NumFormat != Info1->NumFormat)
1060 return false;
1061
1062 // TODO: Should be possible to support more formats, but if format loads
1063 // are not dword-aligned, the merged load might not be valid.
1064 if (Info0->BitsPerComp != 32)
1065 return false;
1066
1067 if (getBufferFormatWithCompCount(OldFormat: CI.Format, ComponentCount: CI.Width + Paired.Width, STI) == 0)
1068 return false;
1069 }
1070
1071 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1072 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1073 CI.UseST64 = false;
1074 CI.BaseOff = 0;
1075
1076 // Handle all non-DS instructions.
1077 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1078 if (EltOffset0 + CI.Width != EltOffset1 &&
1079 EltOffset1 + Paired.Width != EltOffset0)
1080 return false;
1081 if (CI.CPol != Paired.CPol)
1082 return false;
1083 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1084 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1085 // Reject cases like:
1086 // dword + dwordx2 -> dwordx3
1087 // dword + dwordx3 -> dwordx4
1088 // If we tried to combine these cases, we would fail to extract a subreg
1089 // for the result of the second load due to SGPR alignment requirements.
1090 if (CI.Width != Paired.Width &&
1091 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1092 return false;
1093 }
1094 return true;
1095 }
1096
  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
1099 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1100 isUInt<8>(x: EltOffset0 / 64) && isUInt<8>(x: EltOffset1 / 64)) {
1101 if (Modify) {
1102 CI.Offset = EltOffset0 / 64;
1103 Paired.Offset = EltOffset1 / 64;
1104 CI.UseST64 = true;
1105 }
1106 return true;
1107 }
1108
1109 // Check if the new offsets fit in the reduced 8-bit range.
1110 if (isUInt<8>(x: EltOffset0) && isUInt<8>(x: EltOffset1)) {
1111 if (Modify) {
1112 CI.Offset = EltOffset0;
1113 Paired.Offset = EltOffset1;
1114 }
1115 return true;
1116 }
1117
1118 // Try to shift base address to decrease offsets.
1119 uint32_t Min = std::min(a: EltOffset0, b: EltOffset1);
1120 uint32_t Max = std::max(a: EltOffset0, b: EltOffset1);
1121
1122 const uint32_t Mask = maskTrailingOnes<uint32_t>(N: 8) * 64;
1123 if (((Max - Min) & ~Mask) == 0) {
1124 if (Modify) {
1125 // From the range of values we could use for BaseOff, choose the one that
1126 // is aligned to the highest power of two, to maximise the chance that
1127 // the same offset can be reused for other load/store pairs.
1128 uint32_t BaseOff = mostAlignedValueInRange(Lo: Max - 0xff * 64, Hi: Min);
1129 // Copy the low bits of the offsets, so that when we adjust them by
1130 // subtracting BaseOff they will be multiples of 64.
1131 BaseOff |= Min & maskTrailingOnes<uint32_t>(N: 6);
1132 CI.BaseOff = BaseOff * CI.EltSize;
1133 CI.Offset = (EltOffset0 - BaseOff) / 64;
1134 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1135 CI.UseST64 = true;
1136 }
1137 return true;
1138 }
1139
1140 if (isUInt<8>(x: Max - Min)) {
1141 if (Modify) {
1142 // From the range of values we could use for BaseOff, choose the one that
1143 // is aligned to the highest power of two, to maximise the chance that
1144 // the same offset can be reused for other load/store pairs.
1145 uint32_t BaseOff = mostAlignedValueInRange(Lo: Max - 0xff, Hi: Min);
1146 CI.BaseOff = BaseOff * CI.EltSize;
1147 CI.Offset = EltOffset0 - BaseOff;
1148 Paired.Offset = EltOffset1 - BaseOff;
1149 }
1150 return true;
1151 }
1152
1153 return false;
1154}
1155
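// Check whether the combined width of \p CI and \p Paired corresponds to a
// merged opcode that exists on subtarget \p STM.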
1156bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1157 const CombineInfo &CI,
1158 const CombineInfo &Paired) {
1159 const unsigned Width = (CI.Width + Paired.Width);
1160 switch (CI.InstClass) {
1161 default:
1162 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1163 case S_BUFFER_LOAD_IMM:
1164 case S_BUFFER_LOAD_SGPR_IMM:
1165 case S_LOAD_IMM:
1166 switch (Width) {
1167 default:
1168 return false;
1169 case 2:
1170 case 4:
1171 case 8:
1172 return true;
1173 case 3:
1174 return STM.hasScalarDwordx3Loads();
1175 }
1176 }
1177}
1178
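// Return the register class of \p MI's data operand (vdst, vdata, data0, sdst
// or sdata), or nullptr if none is present.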
1179const TargetRegisterClass *
1180SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1181 if (const auto *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)) {
1182 return TRI->getRegClassForReg(MRI: *MRI, Reg: Dst->getReg());
1183 }
1184 if (const auto *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata)) {
1185 return TRI->getRegClassForReg(MRI: *MRI, Reg: Src->getReg());
1186 }
1187 if (const auto *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data0)) {
1188 return TRI->getRegClassForReg(MRI: *MRI, Reg: Src->getReg());
1189 }
1190 if (const auto *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst)) {
1191 return TRI->getRegClassForReg(MRI: *MRI, Reg: Dst->getReg());
1192 }
1193 if (const auto *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdata)) {
1194 return TRI->getRegClassForReg(MRI: *MRI, Reg: Src->getReg());
1195 }
1196 return nullptr;
1197}
1198
1199/// This function assumes that CI comes before Paired in a basic block. Return
1200/// an insertion point for the merged instruction or nullptr on failure.
1201SILoadStoreOptimizer::CombineInfo *
1202SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1203 CombineInfo &Paired) {
1204 // If another instruction has already been merged into CI, it may now be a
1205 // type that we can't do any further merging into.
1206 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1207 return nullptr;
1208 assert(CI.InstClass == Paired.InstClass);
1209
1210 if (getInstSubclass(Opc: CI.I->getOpcode(), TII: *TII) !=
1211 getInstSubclass(Opc: Paired.I->getOpcode(), TII: *TII))
1212 return nullptr;
1213
1214 // Check both offsets (or masks for MIMG) can be combined and fit in the
1215 // reduced range.
1216 if (CI.InstClass == MIMG) {
1217 if (!dmasksCanBeCombined(CI, TII: *TII, Paired))
1218 return nullptr;
1219 } else {
1220 if (!widthsFit(STM: *STM, CI, Paired) || !offsetsCanBeCombined(CI, STI: *STM, Paired))
1221 return nullptr;
1222 }
1223
1224 DenseSet<Register> RegDefs;
1225 DenseSet<Register> RegUses;
1226 CombineInfo *Where;
1227 if (CI.I->mayLoad()) {
1228 // Try to hoist Paired up to CI.
1229 addDefsUsesToList(MI: *Paired.I, RegDefs, RegUses);
1230 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1231 if (!canSwapInstructions(ARegDefs: RegDefs, ARegUses: RegUses, A: *Paired.I, B: *MBBI))
1232 return nullptr;
1233 }
1234 Where = &CI;
1235 } else {
1236 // Try to sink CI down to Paired.
1237 addDefsUsesToList(MI: *CI.I, RegDefs, RegUses);
1238 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1239 if (!canSwapInstructions(ARegDefs: RegDefs, ARegUses: RegUses, A: *CI.I, B: *MBBI))
1240 return nullptr;
1241 }
1242 Where = &Paired;
1243 }
1244
1245 // Call offsetsCanBeCombined with modify = true so that the offsets are
1246 // correct for the new instruction. This should return true, because
1247 // this function should only be called on CombineInfo objects that
1248 // have already been confirmed to be mergeable.
1249 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1250 offsetsCanBeCombined(CI, STI: *STM, Paired, Modify: true);
1251 return Where;
1252}
1253
1254// Copy the merged load result from DestReg to the original dest regs of CI and
1255// Paired.
1256void SILoadStoreOptimizer::copyToDestRegs(
1257 CombineInfo &CI, CombineInfo &Paired,
1258 MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName,
1259 Register DestReg) const {
1260 MachineBasicBlock *MBB = CI.I->getParent();
1261 DebugLoc DL = CI.I->getDebugLoc();
1262
1263 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1264
1265 // Copy to the old destination registers.
1266 const MCInstrDesc &CopyDesc = TII->get(Opcode: TargetOpcode::COPY);
1267 auto *Dest0 = TII->getNamedOperand(MI&: *CI.I, OperandName: OpName);
1268 auto *Dest1 = TII->getNamedOperand(MI&: *Paired.I, OperandName: OpName);
1269
  // The constrained sload instructions in the S_LOAD_IMM class will have the
  // early-clobber flag set on the dst operand. Remove the flag before using
  // the MOs in copies.
1273 Dest0->setIsEarlyClobber(false);
1274 Dest1->setIsEarlyClobber(false);
1275
1276 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: CopyDesc)
1277 .add(MO: *Dest0) // Copy to same destination including flags and sub reg.
1278 .addReg(RegNo: DestReg, flags: 0, SubReg: SubRegIdx0);
1279 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: CopyDesc)
1280 .add(MO: *Dest1)
1281 .addReg(RegNo: DestReg, flags: RegState::Kill, SubReg: SubRegIdx1);
1282}
1283
1284// Return a register for the source of the merged store after copying the
1285// original source regs of CI and Paired into it.
1286Register
1287SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1288 MachineBasicBlock::iterator InsertBefore,
1289 AMDGPU::OpName OpName) const {
1290 MachineBasicBlock *MBB = CI.I->getParent();
1291 DebugLoc DL = CI.I->getDebugLoc();
1292
1293 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1294
1295 // Copy to the new source register.
1296 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1297 Register SrcReg = MRI->createVirtualRegister(RegClass: SuperRC);
1298
1299 const auto *Src0 = TII->getNamedOperand(MI&: *CI.I, OperandName: OpName);
1300 const auto *Src1 = TII->getNamedOperand(MI&: *Paired.I, OperandName: OpName);
1301
1302 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: SrcReg)
1303 .add(MO: *Src0)
1304 .addImm(Val: SubRegIdx0)
1305 .add(MO: *Src1)
1306 .addImm(Val: SubRegIdx1);
1307
1308 return SrcReg;
1309}
1310
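// Select the ds_read2 opcode for the given element size; the gfx9 variants are
// used once LDS access no longer requires M0 to be initialized.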
1311unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1312 if (STM->ldsRequiresM0Init())
1313 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1314 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1315}
1316
1317unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1318 if (STM->ldsRequiresM0Init())
1319 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1320
1321 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1322 : AMDGPU::DS_READ2ST64_B64_gfx9;
1323}
1324
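// Replace two DS reads with a single ds_read2 (or ds_read2st64), materializing
// a new base register first if a base offset was chosen, and copy the result
// into the original destination registers.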
1325MachineBasicBlock::iterator
1326SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1327 MachineBasicBlock::iterator InsertBefore) {
1328 MachineBasicBlock *MBB = CI.I->getParent();
1329
1330 // Be careful, since the addresses could be subregisters themselves in weird
1331 // cases, like vectors of pointers.
1332 const auto *AddrReg = TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::addr);
1333
1334 unsigned NewOffset0 = std::min(a: CI.Offset, b: Paired.Offset);
1335 unsigned NewOffset1 = std::max(a: CI.Offset, b: Paired.Offset);
1336 unsigned Opc =
1337 CI.UseST64 ? read2ST64Opcode(EltSize: CI.EltSize) : read2Opcode(EltSize: CI.EltSize);
1338
1339 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1340 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1341
1342 const MCInstrDesc &Read2Desc = TII->get(Opcode: Opc);
1343
1344 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1345 Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC);
1346
1347 DebugLoc DL = CI.I->getDebugLoc();
1348
1349 Register BaseReg = AddrReg->getReg();
1350 unsigned BaseSubReg = AddrReg->getSubReg();
1351 unsigned BaseRegFlags = 0;
1352 if (CI.BaseOff) {
1353 Register ImmReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1354 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: ImmReg)
1355 .addImm(Val: CI.BaseOff);
1356
1357 BaseReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
1358 BaseRegFlags = RegState::Kill;
1359
1360 TII->getAddNoCarry(MBB&: *MBB, I: InsertBefore, DL, DestReg: BaseReg)
1361 .addReg(RegNo: ImmReg)
1362 .addReg(RegNo: AddrReg->getReg(), flags: 0, SubReg: BaseSubReg)
1363 .addImm(Val: 0); // clamp bit
1364 BaseSubReg = 0;
1365 }
1366
1367 MachineInstrBuilder Read2 =
1368 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: Read2Desc, DestReg)
1369 .addReg(RegNo: BaseReg, flags: BaseRegFlags, SubReg: BaseSubReg) // addr
1370 .addImm(Val: NewOffset0) // offset0
1371 .addImm(Val: NewOffset1) // offset1
1372 .addImm(Val: 0) // gds
1373 .cloneMergedMemRefs(OtherMIs: {&*CI.I, &*Paired.I});
1374
1375 copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdst, DestReg);
1376
1377 CI.I->eraseFromParent();
1378 Paired.I->eraseFromParent();
1379
1380 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1381 return Read2;
1382}
1383
1384unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1385 if (STM->ldsRequiresM0Init())
1386 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1387 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1388 : AMDGPU::DS_WRITE2_B64_gfx9;
1389}
1390
1391unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1392 if (STM->ldsRequiresM0Init())
1393 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1394 : AMDGPU::DS_WRITE2ST64_B64;
1395
1396 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1397 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1398}
1399
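// Replace two DS writes with a single ds_write2 (or ds_write2st64), mirroring
// mergeRead2Pair for the store case.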
1400MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1401 CombineInfo &CI, CombineInfo &Paired,
1402 MachineBasicBlock::iterator InsertBefore) {
1403 MachineBasicBlock *MBB = CI.I->getParent();
1404
  // Be sure to use .add() rather than .addReg() with these operands, so that
  // the subregister index and any register flags set on them are preserved.
1407 const MachineOperand *AddrReg =
1408 TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::addr);
1409 const MachineOperand *Data0 =
1410 TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::data0);
1411 const MachineOperand *Data1 =
1412 TII->getNamedOperand(MI&: *Paired.I, OperandName: AMDGPU::OpName::data0);
1413
1414 unsigned NewOffset0 = CI.Offset;
1415 unsigned NewOffset1 = Paired.Offset;
1416 unsigned Opc =
1417 CI.UseST64 ? write2ST64Opcode(EltSize: CI.EltSize) : write2Opcode(EltSize: CI.EltSize);
1418
1419 if (NewOffset0 > NewOffset1) {
1420 // Canonicalize the merged instruction so the smaller offset comes first.
1421 std::swap(a&: NewOffset0, b&: NewOffset1);
1422 std::swap(a&: Data0, b&: Data1);
1423 }
1424
1425 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1426 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1427
1428 const MCInstrDesc &Write2Desc = TII->get(Opcode: Opc);
1429 DebugLoc DL = CI.I->getDebugLoc();
1430
1431 Register BaseReg = AddrReg->getReg();
1432 unsigned BaseSubReg = AddrReg->getSubReg();
1433 unsigned BaseRegFlags = 0;
1434 if (CI.BaseOff) {
1435 Register ImmReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1436 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: ImmReg)
1437 .addImm(Val: CI.BaseOff);
1438
1439 BaseReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
1440 BaseRegFlags = RegState::Kill;
1441
1442 TII->getAddNoCarry(MBB&: *MBB, I: InsertBefore, DL, DestReg: BaseReg)
1443 .addReg(RegNo: ImmReg)
1444 .addReg(RegNo: AddrReg->getReg(), flags: 0, SubReg: BaseSubReg)
1445 .addImm(Val: 0); // clamp bit
1446 BaseSubReg = 0;
1447 }
1448
1449 MachineInstrBuilder Write2 =
1450 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: Write2Desc)
1451 .addReg(RegNo: BaseReg, flags: BaseRegFlags, SubReg: BaseSubReg) // addr
1452 .add(MO: *Data0) // data0
1453 .add(MO: *Data1) // data1
1454 .addImm(Val: NewOffset0) // offset0
1455 .addImm(Val: NewOffset1) // offset1
1456 .addImm(Val: 0) // gds
1457 .cloneMergedMemRefs(OtherMIs: {&*CI.I, &*Paired.I});
1458
1459 CI.I->eraseFromParent();
1460 Paired.I->eraseFromParent();
1461
1462 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1463 return Write2;
1464}
1465
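// Replace two image loads with a single load whose dmask is the union of the
// original dmasks, then copy the merged result back to the original
// destination registers.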
1466MachineBasicBlock::iterator
1467SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1468 MachineBasicBlock::iterator InsertBefore) {
1469 MachineBasicBlock *MBB = CI.I->getParent();
1470 DebugLoc DL = CI.I->getDebugLoc();
1471 const unsigned Opcode = getNewOpcode(CI, Paired);
1472
1473 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1474
1475 Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC);
1476 unsigned MergedDMask = CI.DMask | Paired.DMask;
1477 unsigned DMaskIdx =
1478 AMDGPU::getNamedOperandIdx(Opcode: CI.I->getOpcode(), Name: AMDGPU::OpName::dmask);
1479
1480 auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg);
1481 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1482 if (I == DMaskIdx)
1483 MIB.addImm(Val: MergedDMask);
1484 else
1485 MIB.add(MO: (*CI.I).getOperand(i: I));
1486 }
1487
1488 // It shouldn't be possible to get this far if the two instructions
1489 // don't have a single memoperand, because MachineInstr::mayAlias()
1490 // will return true if this is the case.
1491 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1492
1493 MachineInstr *New = MIB.addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired));
1494
1495 copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata, DestReg);
1496
1497 CI.I->eraseFromParent();
1498 Paired.I->eraseFromParent();
1499 return New;
1500}
1501
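// Replace two scalar memory loads with one wider load at the smaller of the
// two offsets.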
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

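// Merge a pair of MUBUF loads into a single wider load, e.g. (illustrative
// only; the addressing operands are taken from the first instruction):
//   buffer_load_dword v0, v2, s[0:3], 0 offen offset:4
//   buffer_load_dword v1, v2, s[0:3], 0 offen offset:8
// ==>
//   buffer_load_dwordx2 v[0:1], v2, s[0:3], 0 offen offset:4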
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Create the destination register for the merged load.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Create the destination register for the merged load.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

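// Merge a pair of FLAT/GLOBAL loads. The optional saddr operand (present on
// the GLOBAL_*_SADDR forms) is copied from the first instruction when it
// exists.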
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

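// Decide whether a merged SMEM load must use one of the constrained ("_ec")
// opcode variants: with XNACK enabled this is required unless the single
// memory operand shows the access is aligned to the full merged width
// (Width * 4 bytes).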
static bool needsConstrainedOpcode(const GCNSubtarget &STM,
                                   ArrayRef<MachineMemOperand *> MMOs,
                                   unsigned Width) {
  // Conservatively returns true if the MMO is not found.
  return STM.isXNACKEnabled() &&
         (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
}

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM: {
    // If XNACK is enabled, use the constrained opcodes when the first load is
    // under-aligned.
    bool NeedsConstrainedOpc =
        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
    switch (Width) {
    default:
      return 0;
    case 2:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 3:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
    case 4:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  }
  case S_BUFFER_LOAD_SGPR_IMM: {
    // If XNACK is enabled, use the constrained opcodes when the first load is
    // under-aligned.
    bool NeedsConstrainedOpc =
        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
    switch (Width) {
    default:
      return 0;
    case 2:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
    case 3:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
    case 4:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    }
  }
  case S_LOAD_IMM: {
    // If XNACK is enabled, use the constrained opcodes when the first load is
    // under-aligned.
    bool NeedsConstrainedOpc =
        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
    switch (Width) {
    default:
      return 0;
    case 2:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX2_IMM;
    case 3:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX3_IMM;
    case 4:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  }
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3:
      return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4:
      return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_STORE_DWORDX2;
    case 3:
      return AMDGPU::FLAT_STORE_DWORDX3;
    case 4:
      return AMDGPU::FLAT_STORE_DWORDX4;
    }
  case MIMG:
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

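// Return the pair of subregister indices (for CI and Paired respectively) that
// select each original instruction's lanes out of the merged super-register.
// In the table below, the row is the starting 32-bit subregister and the
// column is the number of consecutive 32-bit subregisters covered.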
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
           CI.Width + Paired.Width)) &&
         "No overlaps");

  unsigned Idx0;
  unsigned Idx1;

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2,
       AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3,
       AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4,
       AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5,
       AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6,
       AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  if (Paired < CI) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return {Idx0, Idx1};
}

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) const {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 3:
      return &AMDGPU::SGPR_96RegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

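// Return Val as an immediate operand if it is a legal inline constant,
// otherwise materialize it into a fresh SGPR with S_MOV_B32 and return that
// register.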
MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getWaveMaskRegClass();
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - the 32-bit base registers and subregisters
// - the 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  if (!BaseLo.isReg())
    return;

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm() || Src0->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  if (!BaseHi.isReg())
    return;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI, MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
    return false;

  // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
  if (SIInstrInfo::isFLATScratch(MI))
    return false;

  unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
                                              : AMDGPUAS::FLAT_ADDRESS;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  auto [It, Inserted] = Visited.try_emplace(&MI);
  MemAddress MAddr;
  if (Inserted) {
    processBaseWithConstOffset(Base, MAddr);
    It->second = MAddr;
  } else
    MAddr = It->second;

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", "
                    << printReg(MAddr.Base.LoReg, TRI)
                    << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse MI's basic block and find an anchor (an instruction with
  // the same base registers) whose offset is at the largest legal 13-bit
  // distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;  load1 = load(addr1, 0)
  //   addr2 = &a + 6144;  load2 = load(addr2, 0)
  //   addr3 = &a + 8192;  load3 = load(addr3, 0)
  //   addr4 = &a + 10240; load4 = load(addr4, 0)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for (; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor(with same base) from store addresses or
    // any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    auto [It, Inserted] = Visited.try_emplace(&MINext);
    if (Inserted) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      It->second = MAddrNext;
    } else
      MAddrNext = It->second;

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalFlatAddressingMode(AM, AS) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " << AnchorAddr.Offset
                      << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = OtherOffset - AnchorAddr.Offset;

      if (TLI->isLegalFlatAddressingMode(AM, AS)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
                   OtherMI->dump());
        updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

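// Append CI to an existing mergeable list that has the same instruction class,
// AGPR data class and base address; otherwise start a new list for it.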
void SILoadStoreOptimizer::addInstToMergeableList(
    const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

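// Scan [Begin, End) and bucket every instruction we know how to merge into
// MergeableInsts, one list per base address. Scanning stops early at a memory
// barrier; the returned iterator is where the next scan should resume, and the
// returned flag reports whether constant-offset promotion already modified the
// code.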
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists. One list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can look after this barrier for separate merges.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However we are reporting that ds_write2 shall have
      // only VGPR data so that machine copy propagation does not
      // create an illegal instruction with a VGPR and AGPR sources.
      // Consequently, if we create such an instruction the verifier
      // will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets, this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort([](const CombineInfo &A, const CombineInfo &B) {
      return A.Offset < B.Offset;
    });
    ++I;
  }

  return {BlockI, Modified};
}


// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo>> &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

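// Try to merge adjacent pairs within one offset-sorted list. A successful
// merge replaces the earlier instruction's CombineInfo with the merged
// instruction and erases the later one, so the caller can rerun this to form
// even wider accesses while OptimizeListAgain is set.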
bool SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList, bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I);
       Next != MergeList.end(); Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}


bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  return SILoadStoreOptimizer(
             &getAnalysis<AAResultsWrapperPass>().getAAResults())
      .run(MF);
}

bool SILoadStoreOptimizer::run(MachineFunction &MF) {
  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}

PreservedAnalyses
SILoadStoreOptimizerPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &MFAM) {
  MFPropsModifier _(*this, MF);

  if (MF.getFunction().hasOptNone())
    return PreservedAnalyses::all();

  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());

  bool Changed = SILoadStoreOptimizer(&AA).run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}