1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
// This pass tries to fuse DS instructions with nearby immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from nearby instructions that
// leaves a 13-bit constant offset, and then promotes that offset to the
// immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
// - This currently misses stores of constants because the load of the constant
// into the data register is placed between the stores, although this is
// arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50// one pair, and recomputes live intervals and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
// - With a list of instructions to process, we can also merge more. If a
// cluster of loads has offsets that are too large to fit in the 8-bit offset
// fields, but are close enough together that their deltas do fit, we can add
// to the base pointer and use the new, reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
60#include "SILoadStoreOptimizer.h"
61#include "AMDGPU.h"
62#include "GCNSubtarget.h"
63#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
64#include "SIDefines.h"
65#include "llvm/Analysis/AliasAnalysis.h"
66#include "llvm/CodeGen/MachineFunctionPass.h"
67#include "llvm/InitializePasses.h"
68
69using namespace llvm;
70
71#define DEBUG_TYPE "si-load-store-opt"
72
73namespace {
74enum InstClassEnum {
75 UNKNOWN,
76 DS_READ,
77 DS_WRITE,
78 S_BUFFER_LOAD_IMM,
79 S_BUFFER_LOAD_SGPR_IMM,
80 S_LOAD_IMM,
81 BUFFER_LOAD,
82 BUFFER_STORE,
83 MIMG,
84 TBUFFER_LOAD,
85 TBUFFER_STORE,
86 GLOBAL_LOAD_SADDR,
87 GLOBAL_STORE_SADDR,
88 FLAT_LOAD,
89 FLAT_STORE,
90 FLAT_LOAD_SADDR,
91 FLAT_STORE_SADDR,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo; they are only ever returned by
               // getCommonInstClass.
95};
96
97struct AddressRegs {
98 unsigned char NumVAddrs = 0;
99 bool SBase = false;
100 bool SRsrc = false;
101 bool SOffset = false;
102 bool SAddr = false;
103 bool VAddr = false;
104 bool Addr = false;
105 bool SSamp = false;
106};
107
108// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
109const unsigned MaxAddressRegs = 12 + 1 + 1;
110
111class SILoadStoreOptimizer {
112 struct CombineInfo {
113 MachineBasicBlock::iterator I;
114 unsigned EltSize;
115 unsigned Offset;
116 unsigned Width;
117 unsigned Format;
118 unsigned BaseOff;
119 unsigned DMask;
120 InstClassEnum InstClass;
121 unsigned CPol = 0;
122 const TargetRegisterClass *DataRC;
123 bool UseST64;
124 int AddrIdx[MaxAddressRegs];
125 const MachineOperand *AddrReg[MaxAddressRegs];
126 unsigned NumAddresses;
127 unsigned Order;
128
129 bool hasSameBaseAddress(const CombineInfo &CI) {
130 if (NumAddresses != CI.NumAddresses)
131 return false;
132
133 const MachineInstr &MI = *CI.I;
134 for (unsigned i = 0; i < NumAddresses; i++) {
135 const MachineOperand &AddrRegNext = MI.getOperand(i: AddrIdx[i]);
136
137 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
138 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
139 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
140 return false;
141 }
142 continue;
143 }
144
145 // Check same base pointer. Be careful of subregisters, which can occur
146 // with vectors of pointers.
147 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
148 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
149 return false;
150 }
151 }
152 return true;
153 }
154
155 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
156 for (unsigned i = 0; i < NumAddresses; ++i) {
157 const MachineOperand *AddrOp = AddrReg[i];
158 // Immediates are always OK.
159 if (AddrOp->isImm())
160 continue;
161
        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
165 if (!AddrOp->isReg())
166 return false;
167
168 // TODO: We should be able to merge instructions with other physical reg
169 // addresses too.
170 if (AddrOp->getReg().isPhysical() &&
171 AddrOp->getReg() != AMDGPU::SGPR_NULL)
172 return false;
173
174 // If an address has only one use then there will be no other
175 // instructions with the same address, so we can't merge this one.
176 if (MRI.hasOneNonDBGUse(RegNo: AddrOp->getReg()))
177 return false;
178 }
179 return true;
180 }
181
182 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
183
    // Order by offset (or by DMask for MIMG instructions).
185 bool operator<(const CombineInfo& Other) const {
186 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
187 }
188 };
189
190 struct BaseRegisters {
191 Register LoReg;
192 Register HiReg;
193
194 unsigned LoSubReg = 0;
195 unsigned HiSubReg = 0;
196 // True when using V_ADD_U64_e64 pattern
197 bool UseV64Pattern = false;
198 };
199
200 struct MemAddress {
201 BaseRegisters Base;
202 int64_t Offset = 0;
203 };
204
205 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
206
207private:
208 MachineFunction *MF = nullptr;
209 const GCNSubtarget *STM = nullptr;
210 const SIInstrInfo *TII = nullptr;
211 const SIRegisterInfo *TRI = nullptr;
212 MachineRegisterInfo *MRI = nullptr;
213 AliasAnalysis *AA = nullptr;
214 bool OptimizeAgain;
215
216 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
217 const DenseSet<Register> &ARegUses,
218 const MachineInstr &A, const MachineInstr &B) const;
219 static bool dmasksCanBeCombined(const CombineInfo &CI,
220 const SIInstrInfo &TII,
221 const CombineInfo &Paired);
222 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
223 CombineInfo &Paired, bool Modify = false);
224 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
225 const CombineInfo &Paired);
226 unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
227 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
228 const CombineInfo &Paired);
229 const TargetRegisterClass *
230 getTargetRegisterClass(const CombineInfo &CI,
231 const CombineInfo &Paired) const;
232 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
233
234 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
235
236 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
237 MachineBasicBlock::iterator InsertBefore,
238 const DebugLoc &DL, AMDGPU::OpName OpName,
239 Register DestReg) const;
240 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
241 MachineBasicBlock::iterator InsertBefore,
242 const DebugLoc &DL, AMDGPU::OpName OpName) const;
243
244 unsigned read2Opcode(unsigned EltSize) const;
245 unsigned read2ST64Opcode(unsigned EltSize) const;
246 MachineBasicBlock::iterator
247 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
248 MachineBasicBlock::iterator InsertBefore);
249
250 unsigned write2Opcode(unsigned EltSize) const;
251 unsigned write2ST64Opcode(unsigned EltSize) const;
252 unsigned getWrite2Opcode(const CombineInfo &CI) const;
253
254 MachineBasicBlock::iterator
255 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
256 MachineBasicBlock::iterator InsertBefore);
257 MachineBasicBlock::iterator
258 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
259 MachineBasicBlock::iterator InsertBefore);
260 MachineBasicBlock::iterator
261 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
262 MachineBasicBlock::iterator InsertBefore);
263 MachineBasicBlock::iterator
264 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
265 MachineBasicBlock::iterator InsertBefore);
266 MachineBasicBlock::iterator
267 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
268 MachineBasicBlock::iterator InsertBefore);
269 MachineBasicBlock::iterator
270 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
271 MachineBasicBlock::iterator InsertBefore);
272 MachineBasicBlock::iterator
273 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
274 MachineBasicBlock::iterator InsertBefore);
275 MachineBasicBlock::iterator
276 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
277 MachineBasicBlock::iterator InsertBefore);
278 MachineBasicBlock::iterator
279 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
280 MachineBasicBlock::iterator InsertBefore);
281
282 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
283 int32_t NewOffset) const;
284 void updateAsyncLDSAddress(MachineInstr &MI, int32_t OffsetDiff) const;
285 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
286 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
287 bool processBaseWithConstOffset64(MachineInstr *AddDef,
288 const MachineOperand &Base,
289 MemAddress &Addr) const;
290 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes a constant offset to the immediate by adjusting the base. It
  /// tries to use a base from nearby instructions that leaves a 13-bit
  /// constant offset, which then gets promoted to the immediate.
294 bool promoteConstantOffsetToImm(MachineInstr &CI,
295 MemInfoMap &Visited,
296 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
297 void addInstToMergeableList(const CombineInfo &CI,
298 std::list<std::list<CombineInfo> > &MergeableInsts) const;
299
300 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
301 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
302 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
303 std::list<std::list<CombineInfo>> &MergeableInsts) const;
304
305 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
306 const CombineInfo &Paired);
307
308 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
309 const CombineInfo &Paired);
310
311 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
312 bool &OptimizeListAgain);
313 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
314
315public:
316 SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
317 bool run(MachineFunction &MF);
318};
319
320class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
321public:
322 static char ID;
323
324 SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}
325
326 bool runOnMachineFunction(MachineFunction &MF) override;
327
328 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
329
330 void getAnalysisUsage(AnalysisUsage &AU) const override {
331 AU.setPreservesCFG();
332 AU.addRequired<AAResultsWrapperPass>();
333
334 MachineFunctionPass::getAnalysisUsage(AU);
335 }
336
337 MachineFunctionProperties getRequiredProperties() const override {
338 return MachineFunctionProperties().setIsSSA();
339 }
340};
341
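// Return the number of elements (dwords, or format components for image and
// buffer-format instructions) accessed by \p MI, or 0 if the opcode is not
// handled by this pass.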
342static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
343 const unsigned Opc = MI.getOpcode();
344
345 if (TII.isMUBUF(Opcode: Opc)) {
346 // FIXME: Handle d16 correctly
347 return AMDGPU::getMUBUFElements(Opc);
348 }
349 if (TII.isImage(MI)) {
350 uint64_t DMaskImm =
351 TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask)->getImm();
352 return llvm::popcount(Value: DMaskImm);
353 }
354 if (TII.isMTBUF(Opcode: Opc)) {
355 return AMDGPU::getMTBUFElements(Opc);
356 }
357
358 switch (Opc) {
359 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
360 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
361 case AMDGPU::S_LOAD_DWORD_IMM:
362 case AMDGPU::GLOBAL_LOAD_DWORD:
363 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
364 case AMDGPU::GLOBAL_STORE_DWORD:
365 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
366 case AMDGPU::FLAT_LOAD_DWORD:
367 case AMDGPU::FLAT_STORE_DWORD:
368 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
369 case AMDGPU::FLAT_STORE_DWORD_SADDR:
370 return 1;
371 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
372 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
373 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
374 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
375 case AMDGPU::S_LOAD_DWORDX2_IMM:
376 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
377 case AMDGPU::GLOBAL_LOAD_DWORDX2:
378 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
379 case AMDGPU::GLOBAL_STORE_DWORDX2:
380 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
381 case AMDGPU::FLAT_LOAD_DWORDX2:
382 case AMDGPU::FLAT_STORE_DWORDX2:
383 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
384 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
385 return 2;
386 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
387 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
388 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
389 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
390 case AMDGPU::S_LOAD_DWORDX3_IMM:
391 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
392 case AMDGPU::GLOBAL_LOAD_DWORDX3:
393 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
394 case AMDGPU::GLOBAL_STORE_DWORDX3:
395 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
396 case AMDGPU::FLAT_LOAD_DWORDX3:
397 case AMDGPU::FLAT_STORE_DWORDX3:
398 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
399 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
400 return 3;
401 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
402 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
403 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
404 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
405 case AMDGPU::S_LOAD_DWORDX4_IMM:
406 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
407 case AMDGPU::GLOBAL_LOAD_DWORDX4:
408 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
409 case AMDGPU::GLOBAL_STORE_DWORDX4:
410 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
411 case AMDGPU::FLAT_LOAD_DWORDX4:
412 case AMDGPU::FLAT_STORE_DWORDX4:
413 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
414 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
415 return 4;
416 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
417 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
418 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
419 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
420 case AMDGPU::S_LOAD_DWORDX8_IMM:
421 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
422 return 8;
423 case AMDGPU::DS_READ_B32:
424 case AMDGPU::DS_READ_B32_gfx9:
425 case AMDGPU::DS_WRITE_B32:
426 case AMDGPU::DS_WRITE_B32_gfx9:
427 return 1;
428 case AMDGPU::DS_READ_B64:
429 case AMDGPU::DS_READ_B64_gfx9:
430 case AMDGPU::DS_WRITE_B64:
431 case AMDGPU::DS_WRITE_B64_gfx9:
432 return 2;
433 default:
434 return 0;
435 }
436}
437
438/// Maps instruction opcode to enum InstClassEnum.
439static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
440 switch (Opc) {
441 default:
442 if (TII.isMUBUF(Opcode: Opc)) {
443 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
444 default:
445 return UNKNOWN;
446 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
447 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
448 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
449 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
450 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
451 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
452 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
453 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
454 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
455 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
456 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
457 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
458 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
459 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
460 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
461 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
462 return BUFFER_LOAD;
463 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
464 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
465 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
466 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
467 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
468 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
469 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
470 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
471 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
472 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
473 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
474 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
475 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
476 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
477 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
478 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
479 return BUFFER_STORE;
480 }
481 }
482 if (TII.isImage(Opcode: Opc)) {
483 // Ignore instructions encoded without vaddr.
484 if (!AMDGPU::hasNamedOperand(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr) &&
485 !AMDGPU::hasNamedOperand(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr0))
486 return UNKNOWN;
487 // Ignore BVH instructions
488 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
489 return UNKNOWN;
490 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
491 if (TII.get(Opcode: Opc).mayStore() || !TII.get(Opcode: Opc).mayLoad() ||
492 TII.isGather4(Opcode: Opc))
493 return UNKNOWN;
494 return MIMG;
495 }
496 if (TII.isMTBUF(Opcode: Opc)) {
497 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
498 default:
499 return UNKNOWN;
500 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
501 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
502 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
503 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
504 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
505 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
506 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
507 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
508 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
509 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
510 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
511 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
512 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
513 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
514 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
515 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
516 return TBUFFER_LOAD;
517 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
518 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
519 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
520 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
521 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
522 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
523 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
524 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
525 return TBUFFER_STORE;
526 }
527 }
528 return UNKNOWN;
529 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
530 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
531 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
532 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
533 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
534 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
535 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
536 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
537 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
538 return S_BUFFER_LOAD_IMM;
539 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
540 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
541 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
542 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
543 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
544 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
545 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
546 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
547 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
548 return S_BUFFER_LOAD_SGPR_IMM;
549 case AMDGPU::S_LOAD_DWORD_IMM:
550 case AMDGPU::S_LOAD_DWORDX2_IMM:
551 case AMDGPU::S_LOAD_DWORDX3_IMM:
552 case AMDGPU::S_LOAD_DWORDX4_IMM:
553 case AMDGPU::S_LOAD_DWORDX8_IMM:
554 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
555 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
556 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
557 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
558 return S_LOAD_IMM;
559 case AMDGPU::DS_READ_B32:
560 case AMDGPU::DS_READ_B32_gfx9:
561 case AMDGPU::DS_READ_B64:
562 case AMDGPU::DS_READ_B64_gfx9:
563 return DS_READ;
564 case AMDGPU::DS_WRITE_B32:
565 case AMDGPU::DS_WRITE_B32_gfx9:
566 case AMDGPU::DS_WRITE_B64:
567 case AMDGPU::DS_WRITE_B64_gfx9:
568 return DS_WRITE;
569 case AMDGPU::GLOBAL_LOAD_DWORD:
570 case AMDGPU::GLOBAL_LOAD_DWORDX2:
571 case AMDGPU::GLOBAL_LOAD_DWORDX3:
572 case AMDGPU::GLOBAL_LOAD_DWORDX4:
573 case AMDGPU::FLAT_LOAD_DWORD:
574 case AMDGPU::FLAT_LOAD_DWORDX2:
575 case AMDGPU::FLAT_LOAD_DWORDX3:
576 case AMDGPU::FLAT_LOAD_DWORDX4:
577 return FLAT_LOAD;
578 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
579 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
580 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
581 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
582 return GLOBAL_LOAD_SADDR;
583 case AMDGPU::GLOBAL_STORE_DWORD:
584 case AMDGPU::GLOBAL_STORE_DWORDX2:
585 case AMDGPU::GLOBAL_STORE_DWORDX3:
586 case AMDGPU::GLOBAL_STORE_DWORDX4:
587 case AMDGPU::FLAT_STORE_DWORD:
588 case AMDGPU::FLAT_STORE_DWORDX2:
589 case AMDGPU::FLAT_STORE_DWORDX3:
590 case AMDGPU::FLAT_STORE_DWORDX4:
591 return FLAT_STORE;
592 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
593 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
594 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
595 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
596 return GLOBAL_STORE_SADDR;
597 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
598 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
599 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
600 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
601 return FLAT_LOAD_SADDR;
602 case AMDGPU::FLAT_STORE_DWORD_SADDR:
603 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
604 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
605 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
606 return FLAT_STORE_SADDR;
607 }
608}
609
610/// Determines instruction subclass from opcode. Only instructions
611/// of the same subclass can be merged together. The merged instruction may have
612/// a different subclass but must have the same class.
613static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
614 switch (Opc) {
615 default:
616 if (TII.isMUBUF(Opcode: Opc))
617 return AMDGPU::getMUBUFBaseOpcode(Opc);
618 if (TII.isImage(Opcode: Opc)) {
619 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
620 assert(Info);
621 return Info->BaseOpcode;
622 }
623 if (TII.isMTBUF(Opcode: Opc))
624 return AMDGPU::getMTBUFBaseOpcode(Opc);
625 return -1;
626 case AMDGPU::DS_READ_B32:
627 case AMDGPU::DS_READ_B32_gfx9:
628 case AMDGPU::DS_READ_B64:
629 case AMDGPU::DS_READ_B64_gfx9:
630 case AMDGPU::DS_WRITE_B32:
631 case AMDGPU::DS_WRITE_B32_gfx9:
632 case AMDGPU::DS_WRITE_B64:
633 case AMDGPU::DS_WRITE_B64_gfx9:
634 return Opc;
635 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
636 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
637 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
638 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
639 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
640 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
641 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
642 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
643 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
644 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
645 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
646 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
647 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
648 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
649 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
650 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
651 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
652 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
653 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
654 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
655 case AMDGPU::S_LOAD_DWORD_IMM:
656 case AMDGPU::S_LOAD_DWORDX2_IMM:
657 case AMDGPU::S_LOAD_DWORDX3_IMM:
658 case AMDGPU::S_LOAD_DWORDX4_IMM:
659 case AMDGPU::S_LOAD_DWORDX8_IMM:
660 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
661 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
662 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
663 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
664 return AMDGPU::S_LOAD_DWORD_IMM;
665 case AMDGPU::GLOBAL_LOAD_DWORD:
666 case AMDGPU::GLOBAL_LOAD_DWORDX2:
667 case AMDGPU::GLOBAL_LOAD_DWORDX3:
668 case AMDGPU::GLOBAL_LOAD_DWORDX4:
669 case AMDGPU::FLAT_LOAD_DWORD:
670 case AMDGPU::FLAT_LOAD_DWORDX2:
671 case AMDGPU::FLAT_LOAD_DWORDX3:
672 case AMDGPU::FLAT_LOAD_DWORDX4:
673 return AMDGPU::FLAT_LOAD_DWORD;
674 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
675 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
676 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
677 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
678 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
679 case AMDGPU::GLOBAL_STORE_DWORD:
680 case AMDGPU::GLOBAL_STORE_DWORDX2:
681 case AMDGPU::GLOBAL_STORE_DWORDX3:
682 case AMDGPU::GLOBAL_STORE_DWORDX4:
683 case AMDGPU::FLAT_STORE_DWORD:
684 case AMDGPU::FLAT_STORE_DWORDX2:
685 case AMDGPU::FLAT_STORE_DWORDX3:
686 case AMDGPU::FLAT_STORE_DWORDX4:
687 return AMDGPU::FLAT_STORE_DWORD;
688 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
689 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
690 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
691 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
692 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
693 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
694 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
695 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
696 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
697 return AMDGPU::FLAT_LOAD_DWORD_SADDR;
698 case AMDGPU::FLAT_STORE_DWORD_SADDR:
699 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
700 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
701 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
702 return AMDGPU::FLAT_STORE_DWORD_SADDR;
703 }
704}
705
// GLOBAL loads and stores are initially classified as FLAT. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT.
// For other instructions return the original class unmodified.
712InstClassEnum
713SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
714 const CombineInfo &Paired) {
715 assert(CI.InstClass == Paired.InstClass);
716
717 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
718 SIInstrInfo::isFLATGlobal(MI: *CI.I) && SIInstrInfo::isFLATGlobal(MI: *Paired.I))
719 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
720
721 return CI.InstClass;
722}
723
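// Determine which address operands (e.g. vaddr, srsrc, soffset, saddr) the
// instruction \p Opc uses.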
724static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
725 AddressRegs Result;
726
727 if (TII.isMUBUF(Opcode: Opc)) {
728 if (AMDGPU::getMUBUFHasVAddr(Opc))
729 Result.VAddr = true;
730 if (AMDGPU::getMUBUFHasSrsrc(Opc))
731 Result.SRsrc = true;
732 if (AMDGPU::getMUBUFHasSoffset(Opc))
733 Result.SOffset = true;
734
735 return Result;
736 }
737
738 if (TII.isImage(Opcode: Opc)) {
739 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0);
740 if (VAddr0Idx >= 0) {
741 AMDGPU::OpName RsrcName =
742 TII.isMIMG(Opcode: Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
743 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: RsrcName);
744 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
745 } else {
746 Result.VAddr = true;
747 }
748 Result.SRsrc = true;
749 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
750 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode)->Sampler)
751 Result.SSamp = true;
752
753 return Result;
754 }
755 if (TII.isMTBUF(Opcode: Opc)) {
756 if (AMDGPU::getMTBUFHasVAddr(Opc))
757 Result.VAddr = true;
758 if (AMDGPU::getMTBUFHasSrsrc(Opc))
759 Result.SRsrc = true;
760 if (AMDGPU::getMTBUFHasSoffset(Opc))
761 Result.SOffset = true;
762
763 return Result;
764 }
765
766 switch (Opc) {
767 default:
768 return Result;
769 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
770 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
771 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
772 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
773 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
774 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
775 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
776 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
777 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
778 Result.SOffset = true;
779 [[fallthrough]];
780 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
781 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
782 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
783 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
784 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
785 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
786 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
787 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
788 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
789 case AMDGPU::S_LOAD_DWORD_IMM:
790 case AMDGPU::S_LOAD_DWORDX2_IMM:
791 case AMDGPU::S_LOAD_DWORDX3_IMM:
792 case AMDGPU::S_LOAD_DWORDX4_IMM:
793 case AMDGPU::S_LOAD_DWORDX8_IMM:
794 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
795 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
796 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
797 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
798 Result.SBase = true;
799 return Result;
800 case AMDGPU::DS_READ_B32:
801 case AMDGPU::DS_READ_B64:
802 case AMDGPU::DS_READ_B32_gfx9:
803 case AMDGPU::DS_READ_B64_gfx9:
804 case AMDGPU::DS_WRITE_B32:
805 case AMDGPU::DS_WRITE_B64:
806 case AMDGPU::DS_WRITE_B32_gfx9:
807 case AMDGPU::DS_WRITE_B64_gfx9:
808 Result.Addr = true;
809 return Result;
810 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
811 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
812 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
813 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
814 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
815 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
816 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
817 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
818 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
819 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
820 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
821 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
822 case AMDGPU::FLAT_STORE_DWORD_SADDR:
823 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
824 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
825 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
826 Result.SAddr = true;
827 [[fallthrough]];
828 case AMDGPU::GLOBAL_LOAD_DWORD:
829 case AMDGPU::GLOBAL_LOAD_DWORDX2:
830 case AMDGPU::GLOBAL_LOAD_DWORDX3:
831 case AMDGPU::GLOBAL_LOAD_DWORDX4:
832 case AMDGPU::GLOBAL_STORE_DWORD:
833 case AMDGPU::GLOBAL_STORE_DWORDX2:
834 case AMDGPU::GLOBAL_STORE_DWORDX3:
835 case AMDGPU::GLOBAL_STORE_DWORDX4:
836 case AMDGPU::FLAT_LOAD_DWORD:
837 case AMDGPU::FLAT_LOAD_DWORDX2:
838 case AMDGPU::FLAT_LOAD_DWORDX3:
839 case AMDGPU::FLAT_LOAD_DWORDX4:
840 case AMDGPU::FLAT_STORE_DWORD:
841 case AMDGPU::FLAT_STORE_DWORDX2:
842 case AMDGPU::FLAT_STORE_DWORDX3:
843 case AMDGPU::FLAT_STORE_DWORDX4:
844 Result.VAddr = true;
845 return Result;
846 }
847}
848
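// Initialize this CombineInfo from \p MI: record its instruction class,
// element size, offset, width, and address operands.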
849void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
850 const SILoadStoreOptimizer &LSO) {
851 I = MI;
852 unsigned Opc = MI->getOpcode();
853 InstClass = getInstClass(Opc, TII: *LSO.TII);
854
855 if (InstClass == UNKNOWN)
856 return;
857
858 DataRC = LSO.getDataRegClass(MI: *MI);
859
860 switch (InstClass) {
861 case DS_READ:
862 EltSize =
863 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
864 : 4;
865 break;
866 case DS_WRITE:
867 EltSize =
868 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
869 : 4;
870 break;
871 case S_BUFFER_LOAD_IMM:
872 case S_BUFFER_LOAD_SGPR_IMM:
873 case S_LOAD_IMM:
874 EltSize = AMDGPU::convertSMRDOffsetUnits(ST: *LSO.STM, ByteOffset: 4);
875 break;
876 default:
877 EltSize = 4;
878 break;
879 }
880
881 if (InstClass == MIMG) {
882 DMask = LSO.TII->getNamedOperand(MI&: *I, OperandName: AMDGPU::OpName::dmask)->getImm();
883 // Offset is not considered for MIMG instructions.
884 Offset = 0;
885 } else {
886 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::offset);
887 Offset = I->getOperand(i: OffsetIdx).getImm();
888 }
889
890 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
891 Format = LSO.TII->getNamedOperand(MI&: *I, OperandName: AMDGPU::OpName::format)->getImm();
892 const AMDGPU::GcnBufferFormatInfo *Info =
893 AMDGPU::getGcnBufferFormatInfo(Format, STI: *LSO.STM);
894 EltSize = Info->BitsPerComp / 8;
895 }
896
897 Width = getOpcodeWidth(MI: *I, TII: *LSO.TII);
898
899 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
900 Offset &= 0xffff;
901 } else if (InstClass != MIMG) {
902 CPol = LSO.TII->getNamedOperand(MI&: *I, OperandName: AMDGPU::OpName::cpol)->getImm();
903 }
904
905 AddressRegs Regs = getRegs(Opc, TII: *LSO.TII);
906 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(MI: *I) || LSO.TII->isVSAMPLE(MI: *I);
907
908 NumAddresses = 0;
909 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
910 AddrIdx[NumAddresses++] =
911 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0) + J;
912 if (Regs.Addr)
913 AddrIdx[NumAddresses++] =
914 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::addr);
915 if (Regs.SBase)
916 AddrIdx[NumAddresses++] =
917 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::sbase);
918 if (Regs.SRsrc)
919 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
920 Opcode: Opc, Name: isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
921 if (Regs.SOffset)
922 AddrIdx[NumAddresses++] =
923 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::soffset);
924 if (Regs.SAddr)
925 AddrIdx[NumAddresses++] =
926 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::saddr);
927 if (Regs.VAddr)
928 AddrIdx[NumAddresses++] =
929 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr);
930 if (Regs.SSamp)
931 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
932 Opcode: Opc, Name: isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
933 assert(NumAddresses <= MaxAddressRegs);
934
935 for (unsigned J = 0; J < NumAddresses; J++)
936 AddrReg[J] = &I->getOperand(i: AddrIdx[J]);
937}
938
939} // end anonymous namespace.
940
941INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
942 "SI Load Store Optimizer", false, false)
943INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
944INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
945 "SI Load Store Optimizer", false, false)
946
947char SILoadStoreOptimizerLegacy::ID = 0;
948
949char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;
950
951FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
952 return new SILoadStoreOptimizerLegacy();
953}
954
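// Collect the registers defined by \p MI into \p RegDefs and the registers it
// reads into \p RegUses.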
955static void addDefsUsesToList(const MachineInstr &MI,
956 DenseSet<Register> &RegDefs,
957 DenseSet<Register> &RegUses) {
958 for (const auto &Op : MI.operands()) {
959 if (!Op.isReg())
960 continue;
961 if (Op.isDef())
962 RegDefs.insert(V: Op.getReg());
963 if (Op.readsReg())
964 RegUses.insert(V: Op.getReg());
965 }
966}
967
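// Return true if instruction \p A, whose defs and uses are given in
// \p ARegDefs and \p ARegUses, can safely be reordered past instruction \p B:
// the two must neither alias in memory (when one of them stores) nor have any
// register dependence.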
968bool SILoadStoreOptimizer::canSwapInstructions(
969 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
970 const MachineInstr &A, const MachineInstr &B) const {
971 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
972 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, Other: B, UseTBAA: true))
973 return false;
974 for (const auto &BOp : B.operands()) {
975 if (!BOp.isReg())
976 continue;
977 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(V: BOp.getReg()))
978 return false;
979 if (BOp.isDef() && ARegUses.contains(V: BOp.getReg()))
980 return false;
981 }
982 return true;
983}
984
// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
987MachineMemOperand *
988SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
989 const CombineInfo &Paired) {
990 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
991 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
992
993 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
994
995 // A base pointer for the combined operation is the same as the leading
996 // operation's pointer.
997 if (Paired < CI)
998 std::swap(a&: MMOa, b&: MMOb);
999
1000 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
1001 // If merging FLAT and GLOBAL set address space to FLAT.
1002 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
1003 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
1004
1005 MachineFunction *MF = CI.I->getMF();
1006 return MF->getMachineMemOperand(MMO: MMOa, PtrInfo, Size);
1007}
1008
1009bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
1010 const SIInstrInfo &TII,
1011 const CombineInfo &Paired) {
1012 assert(CI.InstClass == MIMG);
1013
1014 // Ignore instructions with tfe/lwe set.
1015 const auto *TFEOp = TII.getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::tfe);
1016 const auto *LWEOp = TII.getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::lwe);
1017
1018 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
1019 return false;
1020
1021 // Check other optional immediate operands for equality.
1022 AMDGPU::OpName OperandsToMatch[] = {
1023 AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
1024 AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16};
1025
1026 for (AMDGPU::OpName op : OperandsToMatch) {
1027 int Idx = AMDGPU::getNamedOperandIdx(Opcode: CI.I->getOpcode(), Name: op);
1028 if (AMDGPU::getNamedOperandIdx(Opcode: Paired.I->getOpcode(), Name: op) != Idx)
1029 return false;
1030 if (Idx != -1 &&
1031 CI.I->getOperand(i: Idx).getImm() != Paired.I->getOperand(i: Idx).getImm())
1032 return false;
1033 }
1034
1035 // Check DMask for overlaps.
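  // The two dmasks can be combined only if they do not overlap and every
  // component selected by the smaller mask lies below the lowest component
  // selected by the larger mask, e.g. dmask 0x3 pairs with 0x4 or 0xc, but
  // not with 0x1 or 0x6.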
1036 unsigned MaxMask = std::max(a: CI.DMask, b: Paired.DMask);
1037 unsigned MinMask = std::min(a: CI.DMask, b: Paired.DMask);
1038
1039 if (!MaxMask)
1040 return false;
1041
1042 unsigned AllowedBitsForMin = llvm::countr_zero(Val: MaxMask);
1043 if ((1u << AllowedBitsForMin) <= MinMask)
1044 return false;
1045
1046 return true;
1047}
1048
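// Return the buffer format equivalent to \p OldFormat but with
// \p ComponentCount components, or 0 if no such format exists.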
1049static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
1050 unsigned ComponentCount,
1051 const GCNSubtarget &STI) {
1052 if (ComponentCount > 4)
1053 return 0;
1054
1055 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
1056 llvm::AMDGPU::getGcnBufferFormatInfo(Format: OldFormat, STI);
1057 if (!OldFormatInfo)
1058 return 0;
1059
1060 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
1061 llvm::AMDGPU::getGcnBufferFormatInfo(BitsPerComp: OldFormatInfo->BitsPerComp,
1062 NumComponents: ComponentCount,
1063 NumFormat: OldFormatInfo->NumFormat, STI);
1064
1065 if (!NewFormatInfo)
1066 return 0;
1067
1068 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
1069 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
1070
1071 return NewFormatInfo->Format;
1072}
1073
1074// Return the value in the inclusive range [Lo,Hi] that is aligned to the
1075// highest power of two. Note that the result is well defined for all inputs
1076// including corner cases like:
1077// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
1079// - if Lo > Hi, return 0 (as if the range wrapped around)
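// For example, mostAlignedValueInRange(5, 11) == 8 and
// mostAlignedValueInRange(17, 31) == 24.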
1080static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
1081 return Hi & maskLeadingOnes<uint32_t>(N: llvm::countl_zero(Val: (Lo - 1) ^ Hi) + 1);
1082}
1083
1084bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1085 const GCNSubtarget &STI,
1086 CombineInfo &Paired,
1087 bool Modify) {
1088 assert(CI.InstClass != MIMG);
1089
1090 // XXX - Would the same offset be OK? Is there any reason this would happen or
1091 // be useful?
1092 if (CI.Offset == Paired.Offset)
1093 return false;
1094
1095 // This won't be valid if the offset isn't aligned.
1096 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1097 return false;
1098
1099 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1100
1101 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1102 llvm::AMDGPU::getGcnBufferFormatInfo(Format: CI.Format, STI);
1103 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1104 llvm::AMDGPU::getGcnBufferFormatInfo(Format: Paired.Format, STI);
1105
1106 if (Info0->BitsPerComp != Info1->BitsPerComp ||
1107 Info0->NumFormat != Info1->NumFormat)
1108 return false;
1109
1110 // For 8-bit or 16-bit formats there is no 3-component variant.
1111 // If NumCombinedComponents is 3, try the 4-component format and use XYZ.
1112 // Example:
1113 // tbuffer_load_format_x + tbuffer_load_format_x + tbuffer_load_format_x
1114 // ==> tbuffer_load_format_xyz with format:[BUF_FMT_16_16_16_16_SNORM]
1115 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1116 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1117 NumCombinedComponents = 4;
1118
1119 if (getBufferFormatWithCompCount(OldFormat: CI.Format, ComponentCount: NumCombinedComponents, STI) ==
1120 0)
1121 return false;
1122
    // Merge only when the two access ranges are strictly back-to-back;
    // any gap or overlap could overwrite data or leave holes.
1125 unsigned ElemIndex0 = CI.Offset / CI.EltSize;
1126 unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
1127 if (ElemIndex0 + CI.Width != ElemIndex1 &&
1128 ElemIndex1 + Paired.Width != ElemIndex0)
1129 return false;
1130
1131 // 1-byte formats require 1-byte alignment.
1132 // 2-byte formats require 2-byte alignment.
1133 // 4-byte and larger formats require 4-byte alignment.
1134 unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
1135 unsigned RequiredAlign = std::min(a: MergedBytes, b: 4u);
1136 unsigned MinOff = std::min(a: CI.Offset, b: Paired.Offset);
1137 if (MinOff % RequiredAlign != 0)
1138 return false;
1139
1140 return true;
1141 }
1142
1143 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1144 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1145 CI.UseST64 = false;
1146 CI.BaseOff = 0;
1147
1148 // Handle all non-DS instructions.
1149 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1150 if (EltOffset0 + CI.Width != EltOffset1 &&
1151 EltOffset1 + Paired.Width != EltOffset0)
1152 return false;
    // Instructions with the scale_offset modifier cannot be combined unless we
    // also generate code to scale the offset and reset that bit.
1155 if (CI.CPol != Paired.CPol || (CI.CPol & AMDGPU::CPol::SCAL))
1156 return false;
1157 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1158 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1159 // Reject cases like:
1160 // dword + dwordx2 -> dwordx3
1161 // dword + dwordx3 -> dwordx4
1162 // If we tried to combine these cases, we would fail to extract a subreg
1163 // for the result of the second load due to SGPR alignment requirements.
1164 if (CI.Width != Paired.Width &&
1165 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1166 return false;
1167 }
1168 return true;
1169 }
1170
  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
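  // The ST64 variants scale offset0/offset1 by 64 elements.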
1173 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1174 isUInt<8>(x: EltOffset0 / 64) && isUInt<8>(x: EltOffset1 / 64)) {
1175 if (Modify) {
1176 CI.Offset = EltOffset0 / 64;
1177 Paired.Offset = EltOffset1 / 64;
1178 CI.UseST64 = true;
1179 }
1180 return true;
1181 }
1182
1183 // Check if the new offsets fit in the reduced 8-bit range.
1184 if (isUInt<8>(x: EltOffset0) && isUInt<8>(x: EltOffset1)) {
1185 if (Modify) {
1186 CI.Offset = EltOffset0;
1187 Paired.Offset = EltOffset1;
1188 }
1189 return true;
1190 }
1191
1192 // Try to shift base address to decrease offsets.
1193 uint32_t Min = std::min(a: EltOffset0, b: EltOffset1);
1194 uint32_t Max = std::max(a: EltOffset0, b: EltOffset1);
1195
1196 const uint32_t Mask = maskTrailingOnes<uint32_t>(N: 8) * 64;
1197 if (((Max - Min) & ~Mask) == 0) {
1198 if (Modify) {
1199 // From the range of values we could use for BaseOff, choose the one that
1200 // is aligned to the highest power of two, to maximise the chance that
1201 // the same offset can be reused for other load/store pairs.
1202 uint32_t BaseOff = mostAlignedValueInRange(Lo: Max - 0xff * 64, Hi: Min);
1203 // Copy the low bits of the offsets, so that when we adjust them by
1204 // subtracting BaseOff they will be multiples of 64.
1205 BaseOff |= Min & maskTrailingOnes<uint32_t>(N: 6);
1206 CI.BaseOff = BaseOff * CI.EltSize;
1207 CI.Offset = (EltOffset0 - BaseOff) / 64;
1208 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1209 CI.UseST64 = true;
1210 }
1211 return true;
1212 }
1213
1214 if (isUInt<8>(x: Max - Min)) {
1215 if (Modify) {
1216 // From the range of values we could use for BaseOff, choose the one that
1217 // is aligned to the highest power of two, to maximise the chance that
1218 // the same offset can be reused for other load/store pairs.
1219 uint32_t BaseOff = mostAlignedValueInRange(Lo: Max - 0xff, Hi: Min);
1220 CI.BaseOff = BaseOff * CI.EltSize;
1221 CI.Offset = EltOffset0 - BaseOff;
1222 Paired.Offset = EltOffset1 - BaseOff;
1223 }
1224 return true;
1225 }
1226
1227 return false;
1228}
1229
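// Return true if the combined access width of \p CI and \p Paired is one that
// the merged instruction can support on this subtarget.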
1230bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1231 const CombineInfo &CI,
1232 const CombineInfo &Paired) {
1233 const unsigned Width = (CI.Width + Paired.Width);
1234 switch (CI.InstClass) {
1235 default:
1236 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1237 case S_BUFFER_LOAD_IMM:
1238 case S_BUFFER_LOAD_SGPR_IMM:
1239 case S_LOAD_IMM:
1240 switch (Width) {
1241 default:
1242 return false;
1243 case 2:
1244 case 4:
1245 case 8:
1246 return true;
1247 case 3:
1248 return STM.hasScalarDwordx3Loads();
1249 }
1250 }
1251}
1252
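// Return the register class of the data operand (vdst, vdata, data0, sdst or
// sdata) of \p MI, or nullptr if there is none.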
1253const TargetRegisterClass *
1254SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1255 if (const auto *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)) {
1256 return TRI->getRegClassForReg(MRI: *MRI, Reg: Dst->getReg());
1257 }
1258 if (const auto *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata)) {
1259 return TRI->getRegClassForReg(MRI: *MRI, Reg: Src->getReg());
1260 }
1261 if (const auto *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data0)) {
1262 return TRI->getRegClassForReg(MRI: *MRI, Reg: Src->getReg());
1263 }
1264 if (const auto *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst)) {
1265 return TRI->getRegClassForReg(MRI: *MRI, Reg: Dst->getReg());
1266 }
1267 if (const auto *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdata)) {
1268 return TRI->getRegClassForReg(MRI: *MRI, Reg: Src->getReg());
1269 }
1270 return nullptr;
1271}
1272
1273/// This function assumes that CI comes before Paired in a basic block. Return
1274/// an insertion point for the merged instruction or nullptr on failure.
1275SILoadStoreOptimizer::CombineInfo *
1276SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1277 CombineInfo &Paired) {
1278 // If another instruction has already been merged into CI, it may now be a
1279 // type that we can't do any further merging into.
1280 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1281 return nullptr;
1282 assert(CI.InstClass == Paired.InstClass);
1283
1284 if (getInstSubclass(Opc: CI.I->getOpcode(), TII: *TII) !=
1285 getInstSubclass(Opc: Paired.I->getOpcode(), TII: *TII))
1286 return nullptr;
1287
  // Check that both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
1290 if (CI.InstClass == MIMG) {
1291 if (!dmasksCanBeCombined(CI, TII: *TII, Paired))
1292 return nullptr;
1293 } else {
1294 if (!widthsFit(STM: *STM, CI, Paired) || !offsetsCanBeCombined(CI, STI: *STM, Paired))
1295 return nullptr;
1296 }
1297
1298 DenseSet<Register> RegDefs;
1299 DenseSet<Register> RegUses;
1300 CombineInfo *Where;
1301 if (CI.I->mayLoad()) {
1302 // Try to hoist Paired up to CI.
1303 addDefsUsesToList(MI: *Paired.I, RegDefs, RegUses);
1304 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1305 if (!canSwapInstructions(ARegDefs: RegDefs, ARegUses: RegUses, A: *Paired.I, B: *MBBI))
1306 return nullptr;
1307 }
1308 Where = &CI;
1309 } else {
1310 // Try to sink CI down to Paired.
1311 addDefsUsesToList(MI: *CI.I, RegDefs, RegUses);
1312 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1313 if (!canSwapInstructions(ARegDefs: RegDefs, ARegUses: RegUses, A: *CI.I, B: *MBBI))
1314 return nullptr;
1315 }
1316 Where = &Paired;
1317 }
1318
1319 // Call offsetsCanBeCombined with modify = true so that the offsets are
1320 // correct for the new instruction. This should return true, because
1321 // this function should only be called on CombineInfo objects that
1322 // have already been confirmed to be mergeable.
1323 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1324 offsetsCanBeCombined(CI, STI: *STM, Paired, Modify: true);
1325
1326 if (CI.InstClass == DS_WRITE) {
    // Both data operands must be AGPR or VGPR, so the data registers need to
    // be constrained to one or the other. We expect to only emit the VGPR form
    // here for now.
1330 //
1331 // FIXME: There is currently a hack in getRegClass to report that the write2
1332 // operands are VGPRs. In the future we should have separate agpr
1333 // instruction definitions.
1334 const MachineOperand *Data0 =
1335 TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::data0);
1336 const MachineOperand *Data1 =
1337 TII->getNamedOperand(MI&: *Paired.I, OperandName: AMDGPU::OpName::data0);
1338
1339 const MCInstrDesc &Write2Opc = TII->get(Opcode: getWrite2Opcode(CI));
1340 int Data0Idx = AMDGPU::getNamedOperandIdx(Opcode: Write2Opc.getOpcode(),
1341 Name: AMDGPU::OpName::data0);
1342 int Data1Idx = AMDGPU::getNamedOperandIdx(Opcode: Write2Opc.getOpcode(),
1343 Name: AMDGPU::OpName::data1);
1344
1345 const TargetRegisterClass *DataRC0 = TII->getRegClass(MCID: Write2Opc, OpNum: Data0Idx);
1346
1347 const TargetRegisterClass *DataRC1 = TII->getRegClass(MCID: Write2Opc, OpNum: Data1Idx);
1348
1349 if (unsigned SubReg = Data0->getSubReg()) {
1350 DataRC0 = TRI->getMatchingSuperRegClass(A: MRI->getRegClass(Reg: Data0->getReg()),
1351 B: DataRC0, Idx: SubReg);
1352 }
1353
1354 if (unsigned SubReg = Data1->getSubReg()) {
1355 DataRC1 = TRI->getMatchingSuperRegClass(A: MRI->getRegClass(Reg: Data1->getReg()),
1356 B: DataRC1, Idx: SubReg);
1357 }
1358
1359 if (!MRI->constrainRegClass(Reg: Data0->getReg(), RC: DataRC0) ||
1360 !MRI->constrainRegClass(Reg: Data1->getReg(), RC: DataRC1))
1361 return nullptr;
1362
1363 // TODO: If one register can be constrained, and not the other, insert a
1364 // copy.
1365 }
1366
1367 return Where;
1368}
1369
1370// Copy the merged load result from DestReg to the original dest regs of CI and
1371// Paired.
1372void SILoadStoreOptimizer::copyToDestRegs(
1373 CombineInfo &CI, CombineInfo &Paired,
1374 MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL,
1375 AMDGPU::OpName OpName, Register DestReg) const {
1376 MachineBasicBlock *MBB = CI.I->getParent();
1377
1378 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1379
1380 // Copy to the old destination registers.
1381 const MCInstrDesc &CopyDesc = TII->get(Opcode: TargetOpcode::COPY);
1382 auto *Dest0 = TII->getNamedOperand(MI&: *CI.I, OperandName: OpName);
1383 auto *Dest1 = TII->getNamedOperand(MI&: *Paired.I, OperandName: OpName);
1384
  // The constrained sload instructions in the S_LOAD_IMM class will have the
  // `early-clobber` flag on the dst operand. Remove the flag before using the
  // MOs in copies.
1388 Dest0->setIsEarlyClobber(false);
1389 Dest1->setIsEarlyClobber(false);
1390
1391 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: CopyDesc)
1392 .add(MO: *Dest0) // Copy to same destination including flags and sub reg.
1393 .addReg(RegNo: DestReg, Flags: {}, SubReg: SubRegIdx0);
1394 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: CopyDesc)
1395 .add(MO: *Dest1)
1396 .addReg(RegNo: DestReg, Flags: RegState::Kill, SubReg: SubRegIdx1);
1397}
1398
1399// Return a register for the source of the merged store after copying the
1400// original source regs of CI and Paired into it.
1401Register
1402SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1403 MachineBasicBlock::iterator InsertBefore,
1404 const DebugLoc &DL,
1405 AMDGPU::OpName OpName) const {
1406 MachineBasicBlock *MBB = CI.I->getParent();
1407
1408 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1409
1410 // Copy to the new source register.
1411 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1412 Register SrcReg = MRI->createVirtualRegister(RegClass: SuperRC);
1413
1414 const auto *Src0 = TII->getNamedOperand(MI&: *CI.I, OperandName: OpName);
1415 const auto *Src1 = TII->getNamedOperand(MI&: *Paired.I, OperandName: OpName);
1416
1417 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: SrcReg)
1418 .add(MO: *Src0)
1419 .addImm(Val: SubRegIdx0)
1420 .add(MO: *Src1)
1421 .addImm(Val: SubRegIdx1);
1422
1423 return SrcReg;
1424}
1425
1426unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1427 if (STM->ldsRequiresM0Init())
1428 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1429 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1430}
1431
1432unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1433 if (STM->ldsRequiresM0Init())
1434 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1435
1436 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1437 : AMDGPU::DS_READ2ST64_B64_gfx9;
1438}
1439
1440MachineBasicBlock::iterator
1441SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1442 MachineBasicBlock::iterator InsertBefore) {
1443 MachineBasicBlock *MBB = CI.I->getParent();
1444
1445 // Be careful, since the addresses could be subregisters themselves in weird
1446 // cases, like vectors of pointers.
1447 const auto *AddrReg = TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::addr);
1448
1449 unsigned NewOffset0 = std::min(a: CI.Offset, b: Paired.Offset);
1450 unsigned NewOffset1 = std::max(a: CI.Offset, b: Paired.Offset);
1451 unsigned Opc =
1452 CI.UseST64 ? read2ST64Opcode(EltSize: CI.EltSize) : read2Opcode(EltSize: CI.EltSize);
1453
1454 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1455 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1456
1457 const MCInstrDesc &Read2Desc = TII->get(Opcode: Opc);
1458
1459 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1460 Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC);
1461
1462 DebugLoc DL =
1463 DebugLoc::getMergedLocation(LocA: CI.I->getDebugLoc(), LocB: Paired.I->getDebugLoc());
1464
1465 Register BaseReg = AddrReg->getReg();
1466 unsigned BaseSubReg = AddrReg->getSubReg();
1467 RegState BaseRegFlags = {};
1468 if (CI.BaseOff) {
1469 Register ImmReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1470 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: ImmReg)
1471 .addImm(Val: CI.BaseOff);
1472
1473 BaseReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
1474 BaseRegFlags = RegState::Kill;
1475
1476 TII->getAddNoCarry(MBB&: *MBB, I: InsertBefore, DL, DestReg: BaseReg)
1477 .addReg(RegNo: ImmReg)
1478 .addReg(RegNo: AddrReg->getReg(), Flags: {}, SubReg: BaseSubReg)
1479 .addImm(Val: 0); // clamp bit
1480 BaseSubReg = 0;
1481 }
1482
1483 MachineInstrBuilder Read2 =
1484 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: Read2Desc, DestReg)
1485 .addReg(RegNo: BaseReg, Flags: BaseRegFlags, SubReg: BaseSubReg) // addr
1486 .addImm(Val: NewOffset0) // offset0
1487 .addImm(Val: NewOffset1) // offset1
1488 .addImm(Val: 0) // gds
1489 .cloneMergedMemRefs(OtherMIs: {&*CI.I, &*Paired.I});
1490
1491 copyToDestRegs(CI, Paired, InsertBefore, DL, OpName: AMDGPU::OpName::vdst, DestReg);
1492
1493 CI.I->eraseFromParent();
1494 Paired.I->eraseFromParent();
1495
1496 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1497 return Read2;
1498}
1499
1500unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1501 if (STM->ldsRequiresM0Init())
1502 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1503 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1504 : AMDGPU::DS_WRITE2_B64_gfx9;
1505}
1506
1507unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1508 if (STM->ldsRequiresM0Init())
1509 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1510 : AMDGPU::DS_WRITE2ST64_B64;
1511
1512 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1513 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1514}
1515
1516unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {
1517 return CI.UseST64 ? write2ST64Opcode(EltSize: CI.EltSize) : write2Opcode(EltSize: CI.EltSize);
1518}
1519
1520MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1521 CombineInfo &CI, CombineInfo &Paired,
1522 MachineBasicBlock::iterator InsertBefore) {
1523 MachineBasicBlock *MBB = CI.I->getParent();
1524
  // Be sure to use .add() with these operands rather than .addReg(), so that
  // the subregister index and any register flags set on them are preserved.
1527 const MachineOperand *AddrReg =
1528 TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::addr);
1529 const MachineOperand *Data0 =
1530 TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::data0);
1531 const MachineOperand *Data1 =
1532 TII->getNamedOperand(MI&: *Paired.I, OperandName: AMDGPU::OpName::data0);
1533
1534 unsigned NewOffset0 = CI.Offset;
1535 unsigned NewOffset1 = Paired.Offset;
1536 unsigned Opc = getWrite2Opcode(CI);
1537
1538 if (NewOffset0 > NewOffset1) {
1539 // Canonicalize the merged instruction so the smaller offset comes first.
1540 std::swap(a&: NewOffset0, b&: NewOffset1);
1541 std::swap(a&: Data0, b&: Data1);
1542 }
1543
1544 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1545 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1546
1547 const MCInstrDesc &Write2Desc = TII->get(Opcode: Opc);
1548 DebugLoc DL =
1549 DebugLoc::getMergedLocation(LocA: CI.I->getDebugLoc(), LocB: Paired.I->getDebugLoc());
1550
1551 Register BaseReg = AddrReg->getReg();
1552 unsigned BaseSubReg = AddrReg->getSubReg();
1553 RegState BaseRegFlags = {};
1554 if (CI.BaseOff) {
1555 Register ImmReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1556 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: ImmReg)
1557 .addImm(Val: CI.BaseOff);
1558
1559 BaseReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
1560 BaseRegFlags = RegState::Kill;
1561
1562 TII->getAddNoCarry(MBB&: *MBB, I: InsertBefore, DL, DestReg: BaseReg)
1563 .addReg(RegNo: ImmReg)
1564 .addReg(RegNo: AddrReg->getReg(), Flags: {}, SubReg: BaseSubReg)
1565 .addImm(Val: 0); // clamp bit
1566 BaseSubReg = 0;
1567 }
1568
1569 MachineInstrBuilder Write2 =
1570 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: Write2Desc)
1571 .addReg(RegNo: BaseReg, Flags: BaseRegFlags, SubReg: BaseSubReg) // addr
1572 .add(MO: *Data0) // data0
1573 .add(MO: *Data1) // data1
1574 .addImm(Val: NewOffset0) // offset0
1575 .addImm(Val: NewOffset1) // offset1
1576 .addImm(Val: 0) // gds
1577 .cloneMergedMemRefs(OtherMIs: {&*CI.I, &*Paired.I});
1578
1579 CI.I->eraseFromParent();
1580 Paired.I->eraseFromParent();
1581
1582 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1583 return Write2;
1584}
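// Illustrative example (hypothetical registers and offsets) of the write2
// merge and the canonicalization above; the data operands are swapped so that
// the smaller element offset is always encoded in offset0:
//   ds_write_b32 v1, v2 offset:24
//   ds_write_b32 v1, v3 offset:8
//   ==>
//   ds_write2_b32 v1, v3, v2 offset0:2 offset1:6  ; offsets in b32 elements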
1585
1586MachineBasicBlock::iterator
1587SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1588 MachineBasicBlock::iterator InsertBefore) {
1589 MachineBasicBlock *MBB = CI.I->getParent();
1590 DebugLoc DL =
1591 DebugLoc::getMergedLocation(LocA: CI.I->getDebugLoc(), LocB: Paired.I->getDebugLoc());
1592
1593 const unsigned Opcode = getNewOpcode(CI, Paired);
1594
1595 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1596
1597 Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC);
1598 unsigned MergedDMask = CI.DMask | Paired.DMask;
1599 unsigned DMaskIdx =
1600 AMDGPU::getNamedOperandIdx(Opcode: CI.I->getOpcode(), Name: AMDGPU::OpName::dmask);
1601
1602 auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg);
1603 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1604 if (I == DMaskIdx)
1605 MIB.addImm(Val: MergedDMask);
1606 else
1607 MIB.add(MO: (*CI.I).getOperand(i: I));
1608 }
1609
1610 // It shouldn't be possible to get this far if the two instructions
1611 // don't have a single memoperand, because MachineInstr::mayAlias()
1612 // will return true if this is the case.
1613 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1614
1615 MachineInstr *New = MIB.addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired));
1616
1617 copyToDestRegs(CI, Paired, InsertBefore, DL, OpName: AMDGPU::OpName::vdata, DestReg);
1618
1619 CI.I->eraseFromParent();
1620 Paired.I->eraseFromParent();
1621 return New;
1622}
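// Illustrative sketch (hypothetical dmask values): two image loads of disjoint
// channels are merged by OR'ing their dmasks; the merged width must equal the
// popcount of the combined mask (see the assert in getNewOpcode):
//   image_load v0, ... dmask:0x1  ; channel X
//   image_load v1, ... dmask:0x4  ; channel Z
//   ==>
//   image_load v[0:1], ... dmask:0x5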
1623
1624MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1625 CombineInfo &CI, CombineInfo &Paired,
1626 MachineBasicBlock::iterator InsertBefore) {
1627 MachineBasicBlock *MBB = CI.I->getParent();
1628 DebugLoc DL =
1629 DebugLoc::getMergedLocation(LocA: CI.I->getDebugLoc(), LocB: Paired.I->getDebugLoc());
1630
1631 const unsigned Opcode = getNewOpcode(CI, Paired);
1632
1633 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1634
1635 Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC);
1636 unsigned MergedOffset = std::min(a: CI.Offset, b: Paired.Offset);
1637
1638 // It shouldn't be possible to get this far if the two instructions
1639 // don't have a single memoperand, because MachineInstr::mayAlias()
1640 // will return true if this is the case.
1641 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1642
1643 MachineInstrBuilder New =
1644 BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg)
1645 .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::sbase));
1646 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1647 New.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset));
1648 New.addImm(Val: MergedOffset);
1649 New.addImm(Val: CI.CPol).addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired));
1650
1651 copyToDestRegs(CI, Paired, InsertBefore, DL, OpName: AMDGPU::OpName::sdst, DestReg);
1652
1653 CI.I->eraseFromParent();
1654 Paired.I->eraseFromParent();
1655 return New;
1656}
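// Illustrative example (hypothetical offsets): in the merge above, two
// adjacent scalar loads from the same base become one wider load that uses
// the smaller offset:
//   s_load_dwordx2 s[4:5], s[0:1], 0x10
//   s_load_dwordx2 s[6:7], s[0:1], 0x18
//   ==>
//   s_load_dwordx4 s[4:7], s[0:1], 0x10
// The original destination registers are then rebuilt from the wide result by
// copyToDestRegs using the subregister indices computed in getSubRegIdxs.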
1657
1658MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1659 CombineInfo &CI, CombineInfo &Paired,
1660 MachineBasicBlock::iterator InsertBefore) {
1661 MachineBasicBlock *MBB = CI.I->getParent();
1662
1663 DebugLoc DL =
1664 DebugLoc::getMergedLocation(LocA: CI.I->getDebugLoc(), LocB: Paired.I->getDebugLoc());
1665
1666 const unsigned Opcode = getNewOpcode(CI, Paired);
1667
1668 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1669
  // Create the destination register for the merged load.
1671 Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC);
1672 unsigned MergedOffset = std::min(a: CI.Offset, b: Paired.Offset);
1673
1674 auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg);
1675
1676 AddressRegs Regs = getRegs(Opc: Opcode, TII: *TII);
1677
1678 if (Regs.VAddr)
1679 MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr));
1680
1681 // It shouldn't be possible to get this far if the two instructions
1682 // don't have a single memoperand, because MachineInstr::mayAlias()
1683 // will return true if this is the case.
1684 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1685
1686 MachineInstr *New =
1687 MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::srsrc))
1688 .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset))
1689 .addImm(Val: MergedOffset) // offset
1690 .addImm(Val: CI.CPol) // cpol
1691 .addImm(Val: 0) // swz
1692 .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired));
1693
1694 copyToDestRegs(CI, Paired, InsertBefore, DL, OpName: AMDGPU::OpName::vdata, DestReg);
1695
1696 CI.I->eraseFromParent();
1697 Paired.I->eraseFromParent();
1698 return New;
1699}
1700
1701MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1702 CombineInfo &CI, CombineInfo &Paired,
1703 MachineBasicBlock::iterator InsertBefore) {
1704 MachineBasicBlock *MBB = CI.I->getParent();
1705
1706 DebugLoc DL =
1707 DebugLoc::getMergedLocation(LocA: CI.I->getDebugLoc(), LocB: Paired.I->getDebugLoc());
1708
1709 const unsigned Opcode = getNewOpcode(CI, Paired);
1710
1711 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1712
  // Create the destination register for the merged load.
1714 Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC);
1715 unsigned MergedOffset = std::min(a: CI.Offset, b: Paired.Offset);
1716
1717 auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg);
1718
1719 AddressRegs Regs = getRegs(Opc: Opcode, TII: *TII);
1720
1721 if (Regs.VAddr)
1722 MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr));
1723
1724 // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
1725 // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
1726 // and use XYZ of XYZW to enable the merge.
1727 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1728 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1729 NumCombinedComponents = 4;
1730 unsigned JoinedFormat =
1731 getBufferFormatWithCompCount(OldFormat: CI.Format, ComponentCount: NumCombinedComponents, STI: *STM);
1732
1733 // It shouldn't be possible to get this far if the two instructions
1734 // don't have a single memoperand, because MachineInstr::mayAlias()
1735 // will return true if this is the case.
1736 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1737
1738 MachineInstr *New =
1739 MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::srsrc))
1740 .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset))
1741 .addImm(Val: MergedOffset) // offset
1742 .addImm(Val: JoinedFormat) // format
1743 .addImm(Val: CI.CPol) // cpol
1744 .addImm(Val: 0) // swz
1745 .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired));
1746
1747 copyToDestRegs(CI, Paired, InsertBefore, DL, OpName: AMDGPU::OpName::vdata, DestReg);
1748
1749 CI.I->eraseFromParent();
1750 Paired.I->eraseFromParent();
1751 return New;
1752}
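// Illustrative sketch (format names are approximate): merging a one-component
// and a two-component 8-bit tbuffer load yields three components, which has
// no 8-bit buffer-format encoding, so the merged load keeps three result
// components but is given the four-component format:
//   tbuffer_load_format_x  v0,     ... format:[8,...]       offset:0
//   tbuffer_load_format_xy v[1:2], ... format:[8_8,...]     offset:1
//   ==>
//   tbuffer_load_format_xyz v[0:2], ... format:[8_8_8_8,...] offset:0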
1753
1754MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1755 CombineInfo &CI, CombineInfo &Paired,
1756 MachineBasicBlock::iterator InsertBefore) {
1757 MachineBasicBlock *MBB = CI.I->getParent();
1758 DebugLoc DL =
1759 DebugLoc::getMergedLocation(LocA: CI.I->getDebugLoc(), LocB: Paired.I->getDebugLoc());
1760
1761 const unsigned Opcode = getNewOpcode(CI, Paired);
1762
1763 Register SrcReg =
1764 copyFromSrcRegs(CI, Paired, InsertBefore, DL, OpName: AMDGPU::OpName::vdata);
1765
1766 auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode))
1767 .addReg(RegNo: SrcReg, Flags: RegState::Kill);
1768
1769 AddressRegs Regs = getRegs(Opc: Opcode, TII: *TII);
1770
1771 if (Regs.VAddr)
1772 MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr));
1773
1774 // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
1775 // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
1776 // and use XYZ of XYZW to enable the merge.
1777 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1778 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1779 NumCombinedComponents = 4;
1780 unsigned JoinedFormat =
1781 getBufferFormatWithCompCount(OldFormat: CI.Format, ComponentCount: NumCombinedComponents, STI: *STM);
1782
1783 // It shouldn't be possible to get this far if the two instructions
1784 // don't have a single memoperand, because MachineInstr::mayAlias()
1785 // will return true if this is the case.
1786 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1787
1788 MachineInstr *New =
1789 MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::srsrc))
1790 .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset))
1791 .addImm(Val: std::min(a: CI.Offset, b: Paired.Offset)) // offset
1792 .addImm(Val: JoinedFormat) // format
1793 .addImm(Val: CI.CPol) // cpol
1794 .addImm(Val: 0) // swz
1795 .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired));
1796
1797 CI.I->eraseFromParent();
1798 Paired.I->eraseFromParent();
1799 return New;
1800}
1801
1802MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1803 CombineInfo &CI, CombineInfo &Paired,
1804 MachineBasicBlock::iterator InsertBefore) {
1805 MachineBasicBlock *MBB = CI.I->getParent();
1806
1807 DebugLoc DL =
1808 DebugLoc::getMergedLocation(LocA: CI.I->getDebugLoc(), LocB: Paired.I->getDebugLoc());
1809
1810 const unsigned Opcode = getNewOpcode(CI, Paired);
1811
1812 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1813 Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC);
1814
1815 auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg);
1816
1817 if (auto *SAddr = TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::saddr))
1818 MIB.add(MO: *SAddr);
1819
1820 MachineInstr *New =
1821 MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr))
1822 .addImm(Val: std::min(a: CI.Offset, b: Paired.Offset))
1823 .addImm(Val: CI.CPol)
1824 .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired));
1825
1826 copyToDestRegs(CI, Paired, InsertBefore, DL, OpName: AMDGPU::OpName::vdst, DestReg);
1827
1828 CI.I->eraseFromParent();
1829 Paired.I->eraseFromParent();
1830 return New;
1831}
1832
1833MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1834 CombineInfo &CI, CombineInfo &Paired,
1835 MachineBasicBlock::iterator InsertBefore) {
1836 MachineBasicBlock *MBB = CI.I->getParent();
1837
1838 DebugLoc DL =
1839 DebugLoc::getMergedLocation(LocA: CI.I->getDebugLoc(), LocB: Paired.I->getDebugLoc());
1840
1841 const unsigned Opcode = getNewOpcode(CI, Paired);
1842
1843 Register SrcReg =
1844 copyFromSrcRegs(CI, Paired, InsertBefore, DL, OpName: AMDGPU::OpName::vdata);
1845
1846 auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode))
1847 .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr))
1848 .addReg(RegNo: SrcReg, Flags: RegState::Kill);
1849
1850 if (auto *SAddr = TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::saddr))
1851 MIB.add(MO: *SAddr);
1852
1853 MachineInstr *New =
1854 MIB.addImm(Val: std::min(a: CI.Offset, b: Paired.Offset))
1855 .addImm(Val: CI.CPol)
1856 .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired));
1857
1858 CI.I->eraseFromParent();
1859 Paired.I->eraseFromParent();
1860 return New;
1861}
1862
1863static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1864 ArrayRef<MachineMemOperand *> MMOs,
1865 unsigned Width) {
  // Conservatively return true if there is not exactly one MMO to inspect.
1867 return STM.isXNACKEnabled() &&
1868 (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
1869}
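// For example (hedged illustration of the check above): merging two dword
// loads gives Width == 2, i.e. an 8-byte result. With XNACK enabled, the
// merged load must either be backed by a single MMO whose alignment is at
// least Width * 4 bytes, or getNewOpcode below selects the constrained "_ec"
// opcode variant instead.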
1870
1871unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1872 const CombineInfo &Paired) {
1873 const unsigned Width = CI.Width + Paired.Width;
1874
1875 switch (getCommonInstClass(CI, Paired)) {
1876 default:
1877 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1878 // FIXME: Handle d16 correctly
1879 return AMDGPU::getMUBUFOpcode(BaseOpc: AMDGPU::getMUBUFBaseOpcode(Opc: CI.I->getOpcode()),
1880 Elements: Width);
1881 case TBUFFER_LOAD:
1882 case TBUFFER_STORE:
1883 return AMDGPU::getMTBUFOpcode(BaseOpc: AMDGPU::getMTBUFBaseOpcode(Opc: CI.I->getOpcode()),
1884 Elements: Width);
1885
1886 case UNKNOWN:
1887 llvm_unreachable("Unknown instruction class");
1888 case S_BUFFER_LOAD_IMM: {
1889 // If XNACK is enabled, use the constrained opcodes when the first load is
1890 // under-aligned.
1891 bool NeedsConstrainedOpc =
1892 needsConstrainedOpcode(STM: *STM, MMOs: CI.I->memoperands(), Width);
1893 switch (Width) {
1894 default:
1895 return 0;
1896 case 2:
1897 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1898 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1899 case 3:
1900 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1901 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1902 case 4:
1903 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1904 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1905 case 8:
1906 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1907 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1908 }
1909 }
1910 case S_BUFFER_LOAD_SGPR_IMM: {
1911 // If XNACK is enabled, use the constrained opcodes when the first load is
1912 // under-aligned.
1913 bool NeedsConstrainedOpc =
1914 needsConstrainedOpcode(STM: *STM, MMOs: CI.I->memoperands(), Width);
1915 switch (Width) {
1916 default:
1917 return 0;
1918 case 2:
1919 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1920 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1921 case 3:
1922 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1923 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1924 case 4:
1925 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1926 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1927 case 8:
1928 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1929 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1930 }
1931 }
1932 case S_LOAD_IMM: {
1933 // If XNACK is enabled, use the constrained opcodes when the first load is
1934 // under-aligned.
1935 bool NeedsConstrainedOpc =
1936 needsConstrainedOpcode(STM: *STM, MMOs: CI.I->memoperands(), Width);
1937 switch (Width) {
1938 default:
1939 return 0;
1940 case 2:
1941 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1942 : AMDGPU::S_LOAD_DWORDX2_IMM;
1943 case 3:
1944 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1945 : AMDGPU::S_LOAD_DWORDX3_IMM;
1946 case 4:
1947 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1948 : AMDGPU::S_LOAD_DWORDX4_IMM;
1949 case 8:
1950 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1951 : AMDGPU::S_LOAD_DWORDX8_IMM;
1952 }
1953 }
1954 case GLOBAL_LOAD:
1955 switch (Width) {
1956 default:
1957 return 0;
1958 case 2:
1959 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1960 case 3:
1961 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1962 case 4:
1963 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1964 }
1965 case GLOBAL_LOAD_SADDR:
1966 switch (Width) {
1967 default:
1968 return 0;
1969 case 2:
1970 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1971 case 3:
1972 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1973 case 4:
1974 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1975 }
1976 case GLOBAL_STORE:
1977 switch (Width) {
1978 default:
1979 return 0;
1980 case 2:
1981 return AMDGPU::GLOBAL_STORE_DWORDX2;
1982 case 3:
1983 return AMDGPU::GLOBAL_STORE_DWORDX3;
1984 case 4:
1985 return AMDGPU::GLOBAL_STORE_DWORDX4;
1986 }
1987 case GLOBAL_STORE_SADDR:
1988 switch (Width) {
1989 default:
1990 return 0;
1991 case 2:
1992 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1993 case 3:
1994 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1995 case 4:
1996 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1997 }
1998 case FLAT_LOAD:
1999 switch (Width) {
2000 default:
2001 return 0;
2002 case 2:
2003 return AMDGPU::FLAT_LOAD_DWORDX2;
2004 case 3:
2005 return AMDGPU::FLAT_LOAD_DWORDX3;
2006 case 4:
2007 return AMDGPU::FLAT_LOAD_DWORDX4;
2008 }
2009 case FLAT_STORE:
2010 switch (Width) {
2011 default:
2012 return 0;
2013 case 2:
2014 return AMDGPU::FLAT_STORE_DWORDX2;
2015 case 3:
2016 return AMDGPU::FLAT_STORE_DWORDX3;
2017 case 4:
2018 return AMDGPU::FLAT_STORE_DWORDX4;
2019 }
2020 case FLAT_LOAD_SADDR:
2021 switch (Width) {
2022 default:
2023 return 0;
2024 case 2:
2025 return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
2026 case 3:
2027 return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
2028 case 4:
2029 return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
2030 }
2031 case FLAT_STORE_SADDR:
2032 switch (Width) {
2033 default:
2034 return 0;
2035 case 2:
2036 return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
2037 case 3:
2038 return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
2039 case 4:
2040 return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
2041 }
2042 case MIMG:
2043 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
2044 "No overlaps");
2045 return AMDGPU::getMaskedMIMGOp(Opc: CI.I->getOpcode(), NewChannels: Width);
2046 }
2047}
2048
2049std::pair<unsigned, unsigned>
2050SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
2051 const CombineInfo &Paired) {
2052 assert((CI.InstClass != MIMG ||
2053 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
2054 CI.Width + Paired.Width)) &&
2055 "No overlaps");
2056
2057 unsigned Idx0;
2058 unsigned Idx1;
2059
2060 static const unsigned Idxs[5][4] = {
2061 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
2062 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
2063 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
2064 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
2065 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
2066 };
2067
2068 assert(CI.Width >= 1 && CI.Width <= 4);
2069 assert(Paired.Width >= 1 && Paired.Width <= 4);
2070
2071 if (Paired < CI) {
2072 Idx1 = Idxs[0][Paired.Width - 1];
2073 Idx0 = Idxs[Paired.Width][CI.Width - 1];
2074 } else {
2075 Idx0 = Idxs[0][CI.Width - 1];
2076 Idx1 = Idxs[CI.Width][Paired.Width - 1];
2077 }
2078
2079 return {Idx0, Idx1};
2080}
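// Worked example (hypothetical widths): with CI.Width == 2 and
// Paired.Width == 1, the else branch above yields
//   Idx0 = Idxs[0][1] == AMDGPU::sub0_sub1
//   Idx1 = Idxs[2][0] == AMDGPU::sub2
// so CI's original destination is copied out of sub0_sub1 of the merged
// register and Paired's out of sub2.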
2081
2082const TargetRegisterClass *
2083SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
2084 const CombineInfo &Paired) const {
2085 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
2086 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
2087 switch (CI.Width + Paired.Width) {
2088 default:
2089 return nullptr;
2090 case 2:
2091 return &AMDGPU::SReg_64_XEXECRegClass;
2092 case 3:
2093 return &AMDGPU::SGPR_96RegClass;
2094 case 4:
2095 return &AMDGPU::SGPR_128RegClass;
2096 case 8:
2097 return &AMDGPU::SGPR_256RegClass;
2098 case 16:
2099 return &AMDGPU::SGPR_512RegClass;
2100 }
2101 }
2102
2103 // FIXME: This should compute the instruction to use, and then use the result
2104 // of TII->getRegClass.
2105 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
2106 return TRI->isAGPRClass(RC: getDataRegClass(MI: *CI.I))
2107 ? TRI->getAGPRClassForBitWidth(BitWidth)
2108 : TRI->getVGPRClassForBitWidth(BitWidth);
2109}
2110
2111MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
2112 CombineInfo &CI, CombineInfo &Paired,
2113 MachineBasicBlock::iterator InsertBefore) {
2114 MachineBasicBlock *MBB = CI.I->getParent();
2115 DebugLoc DL =
2116 DebugLoc::getMergedLocation(LocA: CI.I->getDebugLoc(), LocB: Paired.I->getDebugLoc());
2117
2118 const unsigned Opcode = getNewOpcode(CI, Paired);
2119
2120 Register SrcReg =
2121 copyFromSrcRegs(CI, Paired, InsertBefore, DL, OpName: AMDGPU::OpName::vdata);
2122
2123 auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode))
2124 .addReg(RegNo: SrcReg, Flags: RegState::Kill);
2125
2126 AddressRegs Regs = getRegs(Opc: Opcode, TII: *TII);
2127
2128 if (Regs.VAddr)
2129 MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr));
2130
2132 // It shouldn't be possible to get this far if the two instructions
2133 // don't have a single memoperand, because MachineInstr::mayAlias()
2134 // will return true if this is the case.
2135 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
2136
2137 MachineInstr *New =
2138 MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::srsrc))
2139 .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset))
2140 .addImm(Val: std::min(a: CI.Offset, b: Paired.Offset)) // offset
2141 .addImm(Val: CI.CPol) // cpol
2142 .addImm(Val: 0) // swz
2143 .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired));
2144
2145 CI.I->eraseFromParent();
2146 Paired.I->eraseFromParent();
2147 return New;
2148}
2149
2150MachineOperand
2151SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
2152 APInt V(32, Val, true);
2153 if (TII->isInlineConstant(Imm: V))
2154 return MachineOperand::CreateImm(Val);
2155
2156 Register Reg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2157 MachineInstr *Mov =
2158 BuildMI(BB&: *MI.getParent(), I: MI.getIterator(), MIMD: MI.getDebugLoc(),
2159 MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: Reg)
2160 .addImm(Val);
2161 (void)Mov;
2162 LLVM_DEBUG(dbgs() << " "; Mov->dump());
2163 return MachineOperand::CreateReg(Reg, isDef: false);
2164}
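// For instance (assuming the usual AMDGPU inline-constant range of -16..64):
// an offset piece such as 16 is returned directly as an immediate operand,
// while a value like 0x12345 is first materialized into an SGPR with
// S_MOV_B32 so that computeBase below can consume it as a register operand.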
2165
2166// Compute base address using Addr and return the final register.
2167Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
2168 const MemAddress &Addr) const {
2169 MachineBasicBlock *MBB = MI.getParent();
2170 MachineBasicBlock::iterator MBBI = MI.getIterator();
2171 const DebugLoc &DL = MI.getDebugLoc();
2172
2173 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
2174
2175 // Use V_ADD_U64_e64 when the original pattern used it (gfx1250+)
2176 if (Addr.Base.UseV64Pattern) {
2177 Register FullDestReg = MRI->createVirtualRegister(
2178 RegClass: TII->getRegClass(MCID: TII->get(Opcode: AMDGPU::V_ADD_U64_e64), OpNum: 0));
2179
2180 // Load the 64-bit offset into an SGPR pair if needed
2181 Register OffsetReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
2182 MachineInstr *MovOffset =
2183 BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B64_IMM_PSEUDO),
2184 DestReg: OffsetReg)
2185 .addImm(Val: Addr.Offset);
2186 MachineInstr *Add64 =
2187 BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_ADD_U64_e64), DestReg: FullDestReg)
2188 .addReg(RegNo: Addr.Base.LoReg)
2189 .addReg(RegNo: OffsetReg, Flags: RegState::Kill)
2190 .addImm(Val: 0);
2191 (void)MovOffset;
2192 (void)Add64;
2193 LLVM_DEBUG(dbgs() << " " << *MovOffset << "\n";
2194 dbgs() << " " << *Add64 << "\n\n";);
2195
2196 return FullDestReg;
2197 }
2198
2199 // Original carry-chain pattern (V_ADD_CO_U32 + V_ADDC_U32)
2200 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2201 Addr.Base.LoSubReg) &&
2202 "Expected 32-bit Base-Register-Low!!");
2203
2204 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2205 Addr.Base.HiSubReg) &&
2206 "Expected 32-bit Base-Register-Hi!!");
2207
2208 MachineOperand OffsetLo = createRegOrImm(Val: static_cast<int32_t>(Addr.Offset), MI);
2209 MachineOperand OffsetHi =
2210 createRegOrImm(Val: static_cast<int32_t>(Addr.Offset >> 32), MI);
2211
2212 const auto *CarryRC = TRI->getWaveMaskRegClass();
2213 Register CarryReg = MRI->createVirtualRegister(RegClass: CarryRC);
2214 Register DeadCarryReg = MRI->createVirtualRegister(RegClass: CarryRC);
2215
2216 Register DestSub0 = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
2217 Register DestSub1 = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
2218 MachineInstr *LoHalf =
2219 BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg: DestSub0)
2220 .addReg(RegNo: CarryReg, Flags: RegState::Define)
2221 .addReg(RegNo: Addr.Base.LoReg, Flags: {}, SubReg: Addr.Base.LoSubReg)
2222 .add(MO: OffsetLo)
2223 .addImm(Val: 0); // clamp bit
2224
2225 MachineInstr *HiHalf =
2226 BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_ADDC_U32_e64), DestReg: DestSub1)
2227 .addReg(RegNo: DeadCarryReg, Flags: RegState::Define | RegState::Dead)
2228 .addReg(RegNo: Addr.Base.HiReg, Flags: {}, SubReg: Addr.Base.HiSubReg)
2229 .add(MO: OffsetHi)
2230 .addReg(RegNo: CarryReg, Flags: RegState::Kill)
2231 .addImm(Val: 0); // clamp bit
2232
2233 Register FullDestReg = MRI->createVirtualRegister(RegClass: TRI->getVGPR64Class());
2234 MachineInstr *FullBase =
2235 BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
2236 .addReg(RegNo: DestSub0)
2237 .addImm(Val: AMDGPU::sub0)
2238 .addReg(RegNo: DestSub1)
2239 .addImm(Val: AMDGPU::sub1);
2240
2241 (void)LoHalf;
2242 (void)HiHalf;
2243 (void)FullBase;
2244 LLVM_DEBUG(dbgs() << " " << *LoHalf << "\n";
2245 dbgs() << " " << *HiHalf << "\n";
2246 dbgs() << " " << *FullBase << "\n\n";);
2247
2248 return FullDestReg;
2249}
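// Illustrative MIR sketch (hypothetical vregs, wave64 carry class) of the
// carry-chain path above for Addr.Offset == 0x1000:
//   %off:sreg_32 = S_MOV_B32 4096
//   %lo:vgpr_32, %carry:sreg_64_xexec = V_ADD_CO_U32_e64 %base.sub0, %off, 0
//   %hi:vgpr_32, %deadc:sreg_64_xexec = V_ADDC_U32_e64 %base.sub1, 0, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// On targets with V_ADD_U64_e64 (gfx1250+), the same rebase is instead a
// single 64-bit add of the base and a materialized 64-bit immediate.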
2250
2251// Update base and offset with the NewBase and NewOffset in MI.
2252void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2253 Register NewBase,
2254 int32_t NewOffset) const {
2255 auto *Base = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
2256 Base->setReg(NewBase);
2257 Base->setIsKill(false);
2258 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->setImm(NewOffset);
2259}
2260
2261// Helper to extract a 64-bit constant offset from a V_ADD_U64_e64 instruction.
2262// Returns true if successful, populating Addr with base register info and
2263// offset.
2264bool SILoadStoreOptimizer::processBaseWithConstOffset64(
2265 MachineInstr *AddDef, const MachineOperand &Base, MemAddress &Addr) const {
2266 if (!Base.isReg())
2267 return false;
2268
2269 MachineOperand *Src0 = TII->getNamedOperand(MI&: *AddDef, OperandName: AMDGPU::OpName::src0);
2270 MachineOperand *Src1 = TII->getNamedOperand(MI&: *AddDef, OperandName: AMDGPU::OpName::src1);
2271
2272 const MachineOperand *BaseOp = nullptr;
2273
2274 auto Offset = TII->getImmOrMaterializedImm(Op&: *Src1);
2275
2276 if (Offset) {
2277 BaseOp = Src0;
2278 Addr.Offset = *Offset;
2279 } else {
    // The constant offset is not in src1, so we can't handle this pattern.
2281 return false;
2282 }
2283
2284 // Now extract the base register (which should be a 64-bit VGPR).
2285 Addr.Base.LoReg = BaseOp->getReg();
2286 Addr.Base.UseV64Pattern = true;
2287 return true;
2288}
2289
// Analyze Base and extract:
// - the 32-bit base registers and subregisters
// - the 64-bit constant offset
// Expecting a base computation of the form:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//     V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32
//   %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2298// %Base:vreg_64 =
2299// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2300//
2301// Also handles V_ADD_U64_e64 pattern (gfx1250+):
2302// %OFFSET:sreg_64 = S_MOV_B64_IMM_PSEUDO 256
2303// %Base:vreg_64 = V_ADD_U64_e64 %BASE:vreg_64, %OFFSET:sreg_64, 0
2304void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2305 MemAddress &Addr) const {
2306 if (!Base.isReg())
2307 return;
2308
2309 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Base.getReg());
2310 if (!Def)
2311 return;
2312
2313 // Try V_ADD_U64_e64 pattern first (simpler, used on gfx1250+)
2314 if (Def->getOpcode() == AMDGPU::V_ADD_U64_e64) {
2315 if (processBaseWithConstOffset64(AddDef: Def, Base, Addr))
2316 return;
2317 }
2318
2319 // Fall through to REG_SEQUENCE + V_ADD_CO_U32 + V_ADDC_U32 pattern
2320 if (Def->getOpcode() != AMDGPU::REG_SEQUENCE || Def->getNumOperands() != 5)
2321 return;
2322
2323 MachineOperand BaseLo = Def->getOperand(i: 1);
2324 MachineOperand BaseHi = Def->getOperand(i: 3);
2325 if (!BaseLo.isReg() || !BaseHi.isReg())
2326 return;
2327
2328 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(Reg: BaseLo.getReg());
2329 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(Reg: BaseHi.getReg());
2330
2331 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2332 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2333 return;
2334
2335 MachineOperand *Src0 = TII->getNamedOperand(MI&: *BaseLoDef, OperandName: AMDGPU::OpName::src0);
2336 MachineOperand *Src1 = TII->getNamedOperand(MI&: *BaseLoDef, OperandName: AMDGPU::OpName::src1);
2337
2338 auto Offset0P = TII->getImmOrMaterializedImm(Op&: *Src0);
2339 if (Offset0P)
2340 BaseLo = *Src1;
2341 else {
2342 if (!(Offset0P = TII->getImmOrMaterializedImm(Op&: *Src1)))
2343 return;
2344 BaseLo = *Src0;
2345 }
2346
2347 if (!BaseLo.isReg())
2348 return;
2349
2350 Src0 = TII->getNamedOperand(MI&: *BaseHiDef, OperandName: AMDGPU::OpName::src0);
2351 Src1 = TII->getNamedOperand(MI&: *BaseHiDef, OperandName: AMDGPU::OpName::src1);
2352
2353 if (Src0->isImm())
2354 std::swap(a&: Src0, b&: Src1);
2355
2356 if (!Src1->isImm() || Src0->isImm())
2357 return;
2358
2359 uint64_t Offset1 = Src1->getImm();
2360 BaseHi = *Src0;
2361
2362 if (!BaseHi.isReg())
2363 return;
2364
2365 Addr.Base.LoReg = BaseLo.getReg();
2366 Addr.Base.HiReg = BaseHi.getReg();
2367 Addr.Base.LoSubReg = BaseLo.getSubReg();
2368 Addr.Base.HiSubReg = BaseHi.getSubReg();
2369 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2370}
2371
// Maintain the correct LDS address for async loads. It would become incorrect
// when promoteConstantOffsetToImm adds an immediate offset that is only meant
// for the source address operand.
2375void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI,
2376 int32_t OffsetDiff) const {
2377 if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0)
2378 return;
2379
2380 Register OldVDst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)->getReg();
2381 Register NewVDst = MRI->createVirtualRegister(RegClass: MRI->getRegClass(Reg: OldVDst));
2382 MachineBasicBlock &MBB = *MI.getParent();
2383 const DebugLoc &DL = MI.getDebugLoc();
2384 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_ADD_U32_e64), DestReg: NewVDst)
2385 .addReg(RegNo: OldVDst)
2386 .addImm(Val: -OffsetDiff)
2387 .addImm(Val: 0);
2388
2389 MI.getOperand(i: 0).setReg(NewVDst);
2390}
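// For example (hedged): if the rebase in promoteConstantOffsetToImm changed
// MI's immediate offset by OffsetDiff == 2048, the LDS address held in vdst is
// compensated by -2048 by this helper so that the address ultimately used by
// the async load is unchanged.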
2391
2392bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2393 MachineInstr &MI,
2394 MemInfoMap &Visited,
2395 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2396
2397 if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2398 return false;
2399
2400 // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2401 if (SIInstrInfo::isFLATScratch(MI))
2402 return false;
2403
2404 unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2405 : AMDGPUAS::FLAT_ADDRESS;
2406
2407 if (AnchorList.count(Ptr: &MI))
2408 return false;
2409
2410 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2411
2412 if (TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm()) {
2413 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2414 return false;
2415 }
2416
2417 // Step1: Find the base-registers and a 64bit constant offset.
2418 MachineOperand &Base = *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
2419 auto [It, Inserted] = Visited.try_emplace(Key: &MI);
2420 MemAddress MAddr;
2421 if (Inserted) {
2422 processBaseWithConstOffset(Base, Addr&: MAddr);
2423 It->second = MAddr;
2424 } else
2425 MAddr = It->second;
2426
2427 if (MAddr.Offset == 0) {
2428 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2429 " constant offsets that can be promoted.\n";);
2430 return false;
2431 }
2432
2433 LLVM_DEBUG(dbgs() << " BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", "
2434 << printReg(MAddr.Base.LoReg, TRI)
2435 << "} Offset: " << MAddr.Offset << "\n\n";);
2436
  // Step2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the largest
  // 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
2440 // bb:
2441 // addr1 = &a + 4096; load1 = load(addr1, 0)
2442 // addr2 = &a + 6144; load2 = load(addr2, 0)
2443 // addr3 = &a + 8192; load3 = load(addr3, 0)
2444 // addr4 = &a + 10240; load4 = load(addr4, 0)
2445 // addr5 = &a + 12288; load5 = load(addr5, 0)
2446 //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) is within 13-bit reach. Both &a + 6144 and &a + 8192
  // are within 13-bit reach of &a + 4096. The heuristic picks &a + 8192 as the
  // new base (anchor) because the larger distance can presumably accommodate
  // more intermediate bases.
2452 //
2453 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2454 // (&a + 8192) for load1, load2, load4.
2455 // addr = &a + 8192
2456 // load1 = load(addr, -4096)
2457 // load2 = load(addr, -2048)
2458 // load3 = load(addr, 0)
2459 // load4 = load(addr, 2048)
2460 // addr5 = &a + 12288; load5 = load(addr5, 0)
2461 //
2462 MachineInstr *AnchorInst = nullptr;
2463 MemAddress AnchorAddr;
2464 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2465 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2466
2467 MachineBasicBlock *MBB = MI.getParent();
2468 MachineBasicBlock::iterator E = MBB->end();
2469 MachineBasicBlock::iterator MBBI = MI.getIterator();
2470 ++MBBI;
2471 const SITargetLowering *TLI = STM->getTargetLowering();
2472
2473 for ( ; MBBI != E; ++MBBI) {
2474 MachineInstr &MINext = *MBBI;
2475 // TODO: Support finding an anchor(with same base) from store addresses or
2476 // any other load addresses where the opcodes are different.
2477 if (MINext.getOpcode() != MI.getOpcode() ||
2478 TII->getNamedOperand(MI&: MINext, OperandName: AMDGPU::OpName::offset)->getImm())
2479 continue;
2480
2481 const MachineOperand &BaseNext =
2482 *TII->getNamedOperand(MI&: MINext, OperandName: AMDGPU::OpName::vaddr);
2483 MemAddress MAddrNext;
2484 auto [It, Inserted] = Visited.try_emplace(Key: &MINext);
2485 if (Inserted) {
2486 processBaseWithConstOffset(Base: BaseNext, Addr&: MAddrNext);
2487 It->second = MAddrNext;
2488 } else
2489 MAddrNext = It->second;
2490
2491 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2492 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2493 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2494 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2495 continue;
2496
2497 InstsWCommonBase.emplace_back(Args: &MINext, Args&: MAddrNext.Offset);
2498
2499 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2500 TargetLoweringBase::AddrMode AM;
2501 AM.HasBaseReg = true;
2502 AM.BaseOffs = Dist;
2503 if (TLI->isLegalFlatAddressingMode(AM, AddrSpace: AS) &&
2504 (uint32_t)std::abs(i: Dist) > MaxDist) {
2505 MaxDist = std::abs(i: Dist);
2506
2507 AnchorAddr = MAddrNext;
2508 AnchorInst = &MINext;
2509 }
2510 }
2511
2512 if (AnchorInst) {
2513 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2514 AnchorInst->dump());
2515 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2516 << AnchorAddr.Offset << "\n\n");
2517
2518 // Instead of moving up, just re-compute anchor-instruction's base address.
2519 Register Base = computeBase(MI, Addr: AnchorAddr);
2520
2521 int32_t OffsetDiff = MAddr.Offset - AnchorAddr.Offset;
2522 updateBaseAndOffset(MI, NewBase: Base, NewOffset: OffsetDiff);
2523 updateAsyncLDSAddress(MI, OffsetDiff);
2524 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2525
2526 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2527 TargetLoweringBase::AddrMode AM;
2528 AM.HasBaseReg = true;
2529 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2530
2531 if (TLI->isLegalFlatAddressingMode(AM, AddrSpace: AS)) {
2532 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
2533 OtherMI->dump());
2534 int32_t OtherOffsetDiff = OtherOffset - AnchorAddr.Offset;
2535 updateBaseAndOffset(MI&: *OtherMI, NewBase: Base, NewOffset: OtherOffsetDiff);
2536 updateAsyncLDSAddress(MI&: *OtherMI, OffsetDiff: OtherOffsetDiff);
2537 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
2538 }
2539 }
2540 AnchorList.insert(Ptr: AnchorInst);
2541 return true;
2542 }
2543
2544 return false;
2545}
2546
2547void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2548 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2549 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2550 if (AddrList.front().InstClass == CI.InstClass &&
2551 AddrList.front().hasSameBaseAddress(CI)) {
2552 AddrList.emplace_back(args: CI);
2553 return;
2554 }
2555 }
2556
2557 // Base address not found, so add a new list.
2558 MergeableInsts.emplace_back(args: 1, args: CI);
2559}
2560
2561std::pair<MachineBasicBlock::iterator, bool>
2562SILoadStoreOptimizer::collectMergeableInsts(
2563 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2564 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2565 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2566 bool Modified = false;
2567
2568 // Sort potential mergeable instructions into lists. One list per base address.
2569 unsigned Order = 0;
2570 MachineBasicBlock::iterator BlockI = Begin;
2571 for (; BlockI != End; ++BlockI) {
2572 MachineInstr &MI = *BlockI;
2573
2574 // We run this before checking if an address is mergeable, because it can produce
2575 // better code even if the instructions aren't mergeable.
2576 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2577 Modified = true;
2578
    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. Accesses after such a barrier can still be merged, but only
    // with each other, in a separate merge list.
2581 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2582 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2583
2584 // Search will resume after this instruction in a separate merge list.
2585 ++BlockI;
2586 break;
2587 }
2588
2589 const InstClassEnum InstClass = getInstClass(Opc: MI.getOpcode(), TII: *TII);
2590 if (InstClass == UNKNOWN)
2591 continue;
2592
2593 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2594 int Swizzled =
2595 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::swz);
2596 if (Swizzled != -1 && MI.getOperand(i: Swizzled).getImm())
2597 continue;
2598
2599 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
2600 const MachineOperand *Fmt =
2601 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::format);
2602 if (!AMDGPU::getGcnBufferFormatInfo(Format: Fmt->getImm(), STI: *STM)) {
2603 LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI);
2604 continue;
2605 }
2606 }
2607
2608 CombineInfo CI;
2609 CI.setMI(MI, LSO: *this);
2610 CI.Order = Order++;
2611
2612 if (!CI.hasMergeableAddress(MRI: *MRI))
2613 continue;
2614
2615 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2616
2617 addInstToMergeableList(CI, MergeableInsts);
2618 }
2619
  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Drop lists with fewer than two entries, since they cannot produce
  // a merge, and sort the remaining lists by offset so that candidate
  // instructions end up adjacent to each other. The actual pairing is done
  // later, per list, by checkAndPrepareMerge().
2626
2627 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2628 E = MergeableInsts.end(); I != E;) {
2629
2630 std::list<CombineInfo> &MergeList = *I;
2631 if (MergeList.size() <= 1) {
2632 // This means we have found only one instruction with a given address
2633 // that can be merged, and we need at least 2 instructions to do a merge,
2634 // so this list can be discarded.
2635 I = MergeableInsts.erase(position: I);
2636 continue;
2637 }
2638
2639 // Sort the lists by offsets, this way mergeable instructions will be
2640 // adjacent to each other in the list, which will make it easier to find
2641 // matches.
2642 MergeList.sort(
2643 comp: [] (const CombineInfo &A, const CombineInfo &B) {
2644 return A.Offset < B.Offset;
2645 });
2646 ++I;
2647 }
2648
2649 return {BlockI, Modified};
2650}
2651
2652// Scan through looking for adjacent LDS operations with constant offsets from
2653// the same base register. We rely on the scheduler to do the hard work of
2654// clustering nearby loads, and assume these are all adjacent.
2655bool SILoadStoreOptimizer::optimizeBlock(
2656 std::list<std::list<CombineInfo> > &MergeableInsts) {
2657 bool Modified = false;
2658
2659 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2660 E = MergeableInsts.end(); I != E;) {
2661 std::list<CombineInfo> &MergeList = *I;
2662
2663 bool OptimizeListAgain = false;
2664 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2665 // We weren't able to make any changes, so delete the list so we don't
2666 // process the same instructions the next time we try to optimize this
2667 // block.
2668 I = MergeableInsts.erase(position: I);
2669 continue;
2670 }
2671
2672 Modified = true;
2673
    // We made changes, but also determined that there were no more optimization
    // opportunities, so we don't need to reprocess the list.
2676 if (!OptimizeListAgain) {
2677 I = MergeableInsts.erase(position: I);
2678 continue;
2679 }
2680 OptimizeAgain = true;
2681 }
2682 return Modified;
2683}
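// Example of the re-optimization loop (hypothetical sequence): four adjacent
// s_load_dword from the same base first merge pairwise into two
// s_load_dwordx2 (each merged width is 2 < 8, so OptimizeListAgain and thus
// OptimizeAgain are set); the next iteration then merges the two dwordx2
// results into a single s_load_dwordx4, and so on until the width limit is
// reached or no mergeable pair remains.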
2684
2685bool
2686SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2687 std::list<CombineInfo> &MergeList,
2688 bool &OptimizeListAgain) {
2689 if (MergeList.empty())
2690 return false;
2691
2692 bool Modified = false;
2693
2694 for (auto I = MergeList.begin(), Next = std::next(x: I); Next != MergeList.end();
2695 Next = std::next(x: I)) {
2696
2697 auto First = I;
2698 auto Second = Next;
2699
2700 if ((*First).Order > (*Second).Order)
2701 std::swap(a&: First, b&: Second);
2702 CombineInfo &CI = *First;
2703 CombineInfo &Paired = *Second;
2704
2705 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2706 if (!Where) {
2707 ++I;
2708 continue;
2709 }
2710
2711 Modified = true;
2712
2713 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2714
2715 MachineBasicBlock::iterator NewMI;
2716 switch (CI.InstClass) {
2717 default:
2718 llvm_unreachable("unknown InstClass");
2719 break;
2720 case DS_READ:
2721 NewMI = mergeRead2Pair(CI, Paired, InsertBefore: Where->I);
2722 break;
2723 case DS_WRITE:
2724 NewMI = mergeWrite2Pair(CI, Paired, InsertBefore: Where->I);
2725 break;
2726 case S_BUFFER_LOAD_IMM:
2727 case S_BUFFER_LOAD_SGPR_IMM:
2728 case S_LOAD_IMM:
2729 NewMI = mergeSMemLoadImmPair(CI, Paired, InsertBefore: Where->I);
2730 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2731 break;
2732 case BUFFER_LOAD:
2733 NewMI = mergeBufferLoadPair(CI, Paired, InsertBefore: Where->I);
2734 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2735 break;
2736 case BUFFER_STORE:
2737 NewMI = mergeBufferStorePair(CI, Paired, InsertBefore: Where->I);
2738 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2739 break;
2740 case MIMG:
2741 NewMI = mergeImagePair(CI, Paired, InsertBefore: Where->I);
2742 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2743 break;
2744 case TBUFFER_LOAD:
2745 NewMI = mergeTBufferLoadPair(CI, Paired, InsertBefore: Where->I);
2746 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2747 break;
2748 case TBUFFER_STORE:
2749 NewMI = mergeTBufferStorePair(CI, Paired, InsertBefore: Where->I);
2750 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2751 break;
2752 case FLAT_LOAD:
2753 case FLAT_LOAD_SADDR:
2754 case GLOBAL_LOAD:
2755 case GLOBAL_LOAD_SADDR:
2756 NewMI = mergeFlatLoadPair(CI, Paired, InsertBefore: Where->I);
2757 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2758 break;
2759 case FLAT_STORE:
2760 case FLAT_STORE_SADDR:
2761 case GLOBAL_STORE:
2762 case GLOBAL_STORE_SADDR:
2763 NewMI = mergeFlatStorePair(CI, Paired, InsertBefore: Where->I);
2764 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2765 break;
2766 }
2767 CI.setMI(MI: NewMI, LSO: *this);
2768 CI.Order = Where->Order;
2769 if (I == Second)
2770 I = Next;
2771
2772 MergeList.erase(position: Second);
2773 }
2774
2775 return Modified;
2776}
2777
2778bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2779 if (skipFunction(F: MF.getFunction()))
2780 return false;
2781 return SILoadStoreOptimizer(
2782 &getAnalysis<AAResultsWrapperPass>().getAAResults())
2783 .run(MF);
2784}
2785
2786bool SILoadStoreOptimizer::run(MachineFunction &MF) {
2787 this->MF = &MF;
2788 STM = &MF.getSubtarget<GCNSubtarget>();
2789 if (!STM->loadStoreOptEnabled())
2790 return false;
2791
2792 TII = STM->getInstrInfo();
2793 TRI = &TII->getRegisterInfo();
2794
2795 MRI = &MF.getRegInfo();
2796
2797 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2798
2799 bool Modified = false;
2800
  // Contains the instructions whose constant offsets have been promoted to the
  // immediate operand. This is tracked for one basic block at a time.
2803 SmallPtrSet<MachineInstr *, 4> AnchorList;
2804 MemInfoMap Visited;
2805
2806 for (MachineBasicBlock &MBB : MF) {
2807 MachineBasicBlock::iterator SectionEnd;
2808 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2809 I = SectionEnd) {
2810 bool CollectModified;
2811 std::list<std::list<CombineInfo>> MergeableInsts;
2812
2813 // First pass: Collect list of all instructions we know how to merge in a
2814 // subset of the block.
2815 std::tie(args&: SectionEnd, args&: CollectModified) =
2816 collectMergeableInsts(Begin: I, End: E, Visited, AnchorList, MergeableInsts);
2817
2818 Modified |= CollectModified;
2819
2820 do {
2821 OptimizeAgain = false;
2822 Modified |= optimizeBlock(MergeableInsts);
2823 } while (OptimizeAgain);
2824 }
2825
2826 Visited.clear();
2827 AnchorList.clear();
2828 }
2829
2830 return Modified;
2831}
2832
2833PreservedAnalyses
2834SILoadStoreOptimizerPass::run(MachineFunction &MF,
2835 MachineFunctionAnalysisManager &MFAM) {
2836 MFPropsModifier _(*this, MF);
2837
2838 if (MF.getFunction().hasOptNone())
2839 return PreservedAnalyses::all();
2840
2841 auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(IR&: MF)
2842 .getManager();
2843 AAResults &AA = FAM.getResult<AAManager>(IR&: MF.getFunction());
2844
2845 bool Changed = SILoadStoreOptimizer(&AA).run(MF);
2846 if (!Changed)
2847 return PreservedAnalyses::all();
2848
2849 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2850 PA.preserveSet<CFGAnalyses>();
2851 return PA;
2852}
2853