//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset field, but are close enough together, we can add to the base
//   pointer and use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "SILoadStoreOptimizer.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical
        // reg addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Order by offset (or by dmask for image instructions).
    bool operator<(const CombineInfo &Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore,
                      AMDGPU::OpName OpName, Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           AMDGPU::OpName OpName) const;

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes a constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo>> &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

public:
  SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
  bool run(MachineFunction &MF);
};

class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().setIsSSA();
  }
};

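// Return the data width of \p MI in dwords (for image instructions, the number
// of enabled dmask channels), or 0 for opcodes this pass does not handle.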
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return 8;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isImage(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may
/// have a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT.
// For other instructions return the original unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}

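// Determine which address operands (vaddr, srsrc, soffset, saddr, etc.) are
// present for the given opcode.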
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      AMDGPU::OpName RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}

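// Initialize this CombineInfo from instruction \p MI: record its instruction
// class, element size, offset (or dmask for image instructions), width, cache
// policy, and the operand indices of all address components.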
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizerLegacy::ID = 0;

char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;

FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
  return new SILoadStoreOptimizerLegacy();
}

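// Collect the registers defined and read by \p MI into \p RegDefs and
// \p RegUses.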
static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

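// Return true if instructions \p A and \p B can safely change their relative
// order: they must not have an aliasing memory dependency, and \p B must not
// define or read a register defined by \p A (\p ARegDefs) nor define a
// register that \p A reads (\p ARegUses).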
bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();

  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

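// Return true if the two image instructions can be merged: neither sets
// tfe/lwe, their other optional modifiers match, and all channels selected by
// the smaller dmask lie below the lowest channel selected by the larger one.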
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  AMDGPU::OpName OperandsToMatch[] = {
      AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
      AMDGPU::OpName::da,   AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (AMDGPU::OpName op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

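// Return a buffer format that matches \p OldFormat's bits-per-component and
// numeric format but has \p ComponentCount components, or 0 if no such format
// exists.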
static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
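// For example, mostAlignedValueInRange(0x21, 0x3f) == 0x30: it is the value in
// [0x21, 0x3f] with the highest alignment (16).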
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

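// Check whether the offsets of \p CI and \p Paired allow the two accesses to
// be merged. If \p Modify is true (only meaningful for DS instructions), also
// rewrite the offsets, BaseOff and UseST64 to the values the merged
// instruction will use.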
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like:
      //   dword + dwordx2 -> dwordx3
      //   dword + dwordx3 -> dwordx4
      // If we tried to combine these cases, we would fail to extract a subreg
      // for the result of the second load due to SGPR alignment requirements.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

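// Return true if the combined width of \p CI and \p Paired (in dwords) can be
// encoded by the target for this instruction class.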
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    case 3:
      return STM.hasScalarDwordx3Loads();
    }
  }
}

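// Return the register class of the data operand of \p MI: the destination for
// loads or the source for stores.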
const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}

// Copy the merged load result from DestReg to the original dest regs of CI and
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName,
    Register DestReg) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  // The constrained sload instructions in S_LOAD_IMM class will have
  // `early-clobber` flag in the dst operand. Remove the flag before using the
  // MOs in copies.
  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
}

// Return a register for the source of the merged store after copying the
// original source regs of CI and Paired into it.
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      MachineBasicBlock::iterator InsertBefore,
                                      AMDGPU::OpName OpName) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  return SrcReg;
}

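// Select the ds_read2 (or ds_read2st64) opcode for the given element size,
// using the gfx9 variants when LDS accesses do not require M0 initialization.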
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

| 1325 | MachineBasicBlock::iterator |
| 1326 | SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, |
| 1327 | MachineBasicBlock::iterator InsertBefore) { |
| 1328 | MachineBasicBlock *MBB = CI.I->getParent(); |
| 1329 | |
| 1330 | // Be careful, since the addresses could be subregisters themselves in weird |
| 1331 | // cases, like vectors of pointers. |
| 1332 | const auto *AddrReg = TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::addr); |
| 1333 | |
| 1334 | unsigned NewOffset0 = std::min(a: CI.Offset, b: Paired.Offset); |
| 1335 | unsigned NewOffset1 = std::max(a: CI.Offset, b: Paired.Offset); |
| 1336 | unsigned Opc = |
| 1337 | CI.UseST64 ? read2ST64Opcode(EltSize: CI.EltSize) : read2Opcode(EltSize: CI.EltSize); |
| 1338 | |
  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
| 1341 | |
| 1342 | const MCInstrDesc &Read2Desc = TII->get(Opcode: Opc); |
| 1343 | |
| 1344 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
| 1345 | Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC); |
| 1346 | |
| 1347 | DebugLoc DL = CI.I->getDebugLoc(); |
| 1348 | |
| 1349 | Register BaseReg = AddrReg->getReg(); |
| 1350 | unsigned BaseSubReg = AddrReg->getSubReg(); |
| 1351 | unsigned BaseRegFlags = 0; |
| 1352 | if (CI.BaseOff) { |
| 1353 | Register ImmReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
| 1354 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: ImmReg) |
| 1355 | .addImm(Val: CI.BaseOff); |
| 1356 | |
| 1357 | BaseReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 1358 | BaseRegFlags = RegState::Kill; |
| 1359 | |
| 1360 | TII->getAddNoCarry(MBB&: *MBB, I: InsertBefore, DL, DestReg: BaseReg) |
| 1361 | .addReg(RegNo: ImmReg) |
| 1362 | .addReg(RegNo: AddrReg->getReg(), flags: 0, SubReg: BaseSubReg) |
| 1363 | .addImm(Val: 0); // clamp bit |
| 1364 | BaseSubReg = 0; |
| 1365 | } |
| 1366 | |
| 1367 | MachineInstrBuilder Read2 = |
| 1368 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: Read2Desc, DestReg) |
| 1369 | .addReg(RegNo: BaseReg, flags: BaseRegFlags, SubReg: BaseSubReg) // addr |
| 1370 | .addImm(Val: NewOffset0) // offset0 |
| 1371 | .addImm(Val: NewOffset1) // offset1 |
| 1372 | .addImm(Val: 0) // gds |
| 1373 | .cloneMergedMemRefs(OtherMIs: {&*CI.I, &*Paired.I}); |
| 1374 | |
| 1375 | copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdst, DestReg); |
| 1376 | |
| 1377 | CI.I->eraseFromParent(); |
| 1378 | Paired.I->eraseFromParent(); |
| 1379 | |
| 1380 | LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); |
| 1381 | return Read2; |
| 1382 | } |
| 1383 | |
| 1384 | unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { |
| 1385 | if (STM->ldsRequiresM0Init()) |
| 1386 | return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; |
| 1387 | return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 |
| 1388 | : AMDGPU::DS_WRITE2_B64_gfx9; |
| 1389 | } |
| 1390 | |
| 1391 | unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { |
| 1392 | if (STM->ldsRequiresM0Init()) |
| 1393 | return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 |
| 1394 | : AMDGPU::DS_WRITE2ST64_B64; |
| 1395 | |
| 1396 | return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 |
| 1397 | : AMDGPU::DS_WRITE2ST64_B64_gfx9; |
| 1398 | } |
| 1399 | |
| 1400 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( |
| 1401 | CombineInfo &CI, CombineInfo &Paired, |
| 1402 | MachineBasicBlock::iterator InsertBefore) { |
| 1403 | MachineBasicBlock *MBB = CI.I->getParent(); |
| 1404 | |
  // Be sure to use .add() with these operands rather than .addReg(), so that
  // the subregister index and any register flags set on them are preserved.
| 1407 | const MachineOperand *AddrReg = |
| 1408 | TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::addr); |
| 1409 | const MachineOperand *Data0 = |
| 1410 | TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::data0); |
| 1411 | const MachineOperand *Data1 = |
| 1412 | TII->getNamedOperand(MI&: *Paired.I, OperandName: AMDGPU::OpName::data0); |
| 1413 | |
| 1414 | unsigned NewOffset0 = CI.Offset; |
| 1415 | unsigned NewOffset1 = Paired.Offset; |
| 1416 | unsigned Opc = |
| 1417 | CI.UseST64 ? write2ST64Opcode(EltSize: CI.EltSize) : write2Opcode(EltSize: CI.EltSize); |
| 1418 | |
| 1419 | if (NewOffset0 > NewOffset1) { |
| 1420 | // Canonicalize the merged instruction so the smaller offset comes first. |
| 1421 | std::swap(a&: NewOffset0, b&: NewOffset1); |
| 1422 | std::swap(a&: Data0, b&: Data1); |
| 1423 | } |
| 1424 | |
  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
| 1427 | |
| 1428 | const MCInstrDesc &Write2Desc = TII->get(Opcode: Opc); |
| 1429 | DebugLoc DL = CI.I->getDebugLoc(); |
| 1430 | |
| 1431 | Register BaseReg = AddrReg->getReg(); |
| 1432 | unsigned BaseSubReg = AddrReg->getSubReg(); |
| 1433 | unsigned BaseRegFlags = 0; |
| 1434 | if (CI.BaseOff) { |
| 1435 | Register ImmReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
| 1436 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: ImmReg) |
| 1437 | .addImm(Val: CI.BaseOff); |
| 1438 | |
| 1439 | BaseReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 1440 | BaseRegFlags = RegState::Kill; |
| 1441 | |
| 1442 | TII->getAddNoCarry(MBB&: *MBB, I: InsertBefore, DL, DestReg: BaseReg) |
| 1443 | .addReg(RegNo: ImmReg) |
| 1444 | .addReg(RegNo: AddrReg->getReg(), flags: 0, SubReg: BaseSubReg) |
| 1445 | .addImm(Val: 0); // clamp bit |
| 1446 | BaseSubReg = 0; |
| 1447 | } |
| 1448 | |
| 1449 | MachineInstrBuilder Write2 = |
| 1450 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: Write2Desc) |
| 1451 | .addReg(RegNo: BaseReg, flags: BaseRegFlags, SubReg: BaseSubReg) // addr |
| 1452 | .add(MO: *Data0) // data0 |
| 1453 | .add(MO: *Data1) // data1 |
| 1454 | .addImm(Val: NewOffset0) // offset0 |
| 1455 | .addImm(Val: NewOffset1) // offset1 |
| 1456 | .addImm(Val: 0) // gds |
| 1457 | .cloneMergedMemRefs(OtherMIs: {&*CI.I, &*Paired.I}); |
| 1458 | |
| 1459 | CI.I->eraseFromParent(); |
| 1460 | Paired.I->eraseFromParent(); |
| 1461 | |
| 1462 | LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); |
| 1463 | return Write2; |
| 1464 | } |
| 1465 | |
| 1466 | MachineBasicBlock::iterator |
| 1467 | SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, |
| 1468 | MachineBasicBlock::iterator InsertBefore) { |
| 1469 | MachineBasicBlock *MBB = CI.I->getParent(); |
| 1470 | DebugLoc DL = CI.I->getDebugLoc(); |
| 1471 | const unsigned Opcode = getNewOpcode(CI, Paired); |
| 1472 | |
| 1473 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
| 1474 | |
| 1475 | Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC); |
| 1476 | unsigned MergedDMask = CI.DMask | Paired.DMask; |
| 1477 | unsigned DMaskIdx = |
| 1478 | AMDGPU::getNamedOperandIdx(Opcode: CI.I->getOpcode(), Name: AMDGPU::OpName::dmask); |
| 1479 | |
| 1480 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg); |
| 1481 | for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { |
| 1482 | if (I == DMaskIdx) |
| 1483 | MIB.addImm(Val: MergedDMask); |
| 1484 | else |
| 1485 | MIB.add(MO: (*CI.I).getOperand(i: I)); |
| 1486 | } |
| 1487 | |
| 1488 | // It shouldn't be possible to get this far if the two instructions |
| 1489 | // don't have a single memoperand, because MachineInstr::mayAlias() |
| 1490 | // will return true if this is the case. |
| 1491 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
| 1492 | |
| 1493 | MachineInstr *New = MIB.addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
| 1494 | |
| 1495 | copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata, DestReg); |
| 1496 | |
| 1497 | CI.I->eraseFromParent(); |
| 1498 | Paired.I->eraseFromParent(); |
| 1499 | return New; |
| 1500 | } |
| 1501 | |
| 1502 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( |
| 1503 | CombineInfo &CI, CombineInfo &Paired, |
| 1504 | MachineBasicBlock::iterator InsertBefore) { |
| 1505 | MachineBasicBlock *MBB = CI.I->getParent(); |
| 1506 | DebugLoc DL = CI.I->getDebugLoc(); |
| 1507 | const unsigned Opcode = getNewOpcode(CI, Paired); |
| 1508 | |
| 1509 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
| 1510 | |
| 1511 | Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC); |
| 1512 | unsigned MergedOffset = std::min(a: CI.Offset, b: Paired.Offset); |
| 1513 | |
| 1514 | // It shouldn't be possible to get this far if the two instructions |
| 1515 | // don't have a single memoperand, because MachineInstr::mayAlias() |
| 1516 | // will return true if this is the case. |
| 1517 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
| 1518 | |
| 1519 | MachineInstrBuilder New = |
| 1520 | BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg) |
| 1521 | .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::sbase)); |
| 1522 | if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) |
| 1523 | New.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset)); |
| 1524 | New.addImm(Val: MergedOffset); |
| 1525 | New.addImm(Val: CI.CPol).addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
| 1526 | |
| 1527 | copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::sdst, DestReg); |
| 1528 | |
| 1529 | CI.I->eraseFromParent(); |
| 1530 | Paired.I->eraseFromParent(); |
| 1531 | return New; |
| 1532 | } |
| 1533 | |
| 1534 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( |
| 1535 | CombineInfo &CI, CombineInfo &Paired, |
| 1536 | MachineBasicBlock::iterator InsertBefore) { |
| 1537 | MachineBasicBlock *MBB = CI.I->getParent(); |
| 1538 | DebugLoc DL = CI.I->getDebugLoc(); |
| 1539 | |
| 1540 | const unsigned Opcode = getNewOpcode(CI, Paired); |
| 1541 | |
| 1542 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
| 1543 | |
  // Create the destination register for the merged load.
| 1545 | Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC); |
| 1546 | unsigned MergedOffset = std::min(a: CI.Offset, b: Paired.Offset); |
| 1547 | |
| 1548 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg); |
| 1549 | |
| 1550 | AddressRegs Regs = getRegs(Opc: Opcode, TII: *TII); |
| 1551 | |
| 1552 | if (Regs.VAddr) |
| 1553 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr)); |
| 1554 | |
| 1555 | // It shouldn't be possible to get this far if the two instructions |
| 1556 | // don't have a single memoperand, because MachineInstr::mayAlias() |
| 1557 | // will return true if this is the case. |
| 1558 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
| 1559 | |
| 1560 | MachineInstr *New = |
| 1561 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::srsrc)) |
| 1562 | .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset)) |
| 1563 | .addImm(Val: MergedOffset) // offset |
| 1564 | .addImm(Val: CI.CPol) // cpol |
| 1565 | .addImm(Val: 0) // swz |
| 1566 | .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
| 1567 | |
| 1568 | copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata, DestReg); |
| 1569 | |
| 1570 | CI.I->eraseFromParent(); |
| 1571 | Paired.I->eraseFromParent(); |
| 1572 | return New; |
| 1573 | } |
| 1574 | |
| 1575 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( |
| 1576 | CombineInfo &CI, CombineInfo &Paired, |
| 1577 | MachineBasicBlock::iterator InsertBefore) { |
| 1578 | MachineBasicBlock *MBB = CI.I->getParent(); |
| 1579 | DebugLoc DL = CI.I->getDebugLoc(); |
| 1580 | |
| 1581 | const unsigned Opcode = getNewOpcode(CI, Paired); |
| 1582 | |
| 1583 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
| 1584 | |
  // Create the destination register for the merged load.
| 1586 | Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC); |
| 1587 | unsigned MergedOffset = std::min(a: CI.Offset, b: Paired.Offset); |
| 1588 | |
| 1589 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg); |
| 1590 | |
| 1591 | AddressRegs Regs = getRegs(Opc: Opcode, TII: *TII); |
| 1592 | |
| 1593 | if (Regs.VAddr) |
| 1594 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr)); |
| 1595 | |
| 1596 | unsigned JoinedFormat = |
| 1597 | getBufferFormatWithCompCount(OldFormat: CI.Format, ComponentCount: CI.Width + Paired.Width, STI: *STM); |
| 1598 | |
| 1599 | // It shouldn't be possible to get this far if the two instructions |
| 1600 | // don't have a single memoperand, because MachineInstr::mayAlias() |
| 1601 | // will return true if this is the case. |
| 1602 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
| 1603 | |
| 1604 | MachineInstr *New = |
| 1605 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::srsrc)) |
| 1606 | .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset)) |
| 1607 | .addImm(Val: MergedOffset) // offset |
| 1608 | .addImm(Val: JoinedFormat) // format |
| 1609 | .addImm(Val: CI.CPol) // cpol |
| 1610 | .addImm(Val: 0) // swz |
| 1611 | .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
| 1612 | |
| 1613 | copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata, DestReg); |
| 1614 | |
| 1615 | CI.I->eraseFromParent(); |
| 1616 | Paired.I->eraseFromParent(); |
| 1617 | return New; |
| 1618 | } |
| 1619 | |
| 1620 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( |
| 1621 | CombineInfo &CI, CombineInfo &Paired, |
| 1622 | MachineBasicBlock::iterator InsertBefore) { |
| 1623 | MachineBasicBlock *MBB = CI.I->getParent(); |
| 1624 | DebugLoc DL = CI.I->getDebugLoc(); |
| 1625 | |
| 1626 | const unsigned Opcode = getNewOpcode(CI, Paired); |
| 1627 | |
| 1628 | Register SrcReg = |
| 1629 | copyFromSrcRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata); |
| 1630 | |
| 1631 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode)) |
| 1632 | .addReg(RegNo: SrcReg, flags: RegState::Kill); |
| 1633 | |
| 1634 | AddressRegs Regs = getRegs(Opc: Opcode, TII: *TII); |
| 1635 | |
| 1636 | if (Regs.VAddr) |
| 1637 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr)); |
| 1638 | |
| 1639 | unsigned JoinedFormat = |
| 1640 | getBufferFormatWithCompCount(OldFormat: CI.Format, ComponentCount: CI.Width + Paired.Width, STI: *STM); |
| 1641 | |
| 1642 | // It shouldn't be possible to get this far if the two instructions |
| 1643 | // don't have a single memoperand, because MachineInstr::mayAlias() |
| 1644 | // will return true if this is the case. |
| 1645 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
| 1646 | |
| 1647 | MachineInstr *New = |
| 1648 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::srsrc)) |
| 1649 | .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset)) |
| 1650 | .addImm(Val: std::min(a: CI.Offset, b: Paired.Offset)) // offset |
| 1651 | .addImm(Val: JoinedFormat) // format |
| 1652 | .addImm(Val: CI.CPol) // cpol |
| 1653 | .addImm(Val: 0) // swz |
| 1654 | .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
| 1655 | |
| 1656 | CI.I->eraseFromParent(); |
| 1657 | Paired.I->eraseFromParent(); |
| 1658 | return New; |
| 1659 | } |
| 1660 | |
| 1661 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( |
| 1662 | CombineInfo &CI, CombineInfo &Paired, |
| 1663 | MachineBasicBlock::iterator InsertBefore) { |
| 1664 | MachineBasicBlock *MBB = CI.I->getParent(); |
| 1665 | DebugLoc DL = CI.I->getDebugLoc(); |
| 1666 | |
| 1667 | const unsigned Opcode = getNewOpcode(CI, Paired); |
| 1668 | |
| 1669 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
| 1670 | Register DestReg = MRI->createVirtualRegister(RegClass: SuperRC); |
| 1671 | |
| 1672 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode), DestReg); |
| 1673 | |
| 1674 | if (auto *SAddr = TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::saddr)) |
| 1675 | MIB.add(MO: *SAddr); |
| 1676 | |
| 1677 | MachineInstr *New = |
| 1678 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr)) |
| 1679 | .addImm(Val: std::min(a: CI.Offset, b: Paired.Offset)) |
| 1680 | .addImm(Val: CI.CPol) |
| 1681 | .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
| 1682 | |
| 1683 | copyToDestRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdst, DestReg); |
| 1684 | |
| 1685 | CI.I->eraseFromParent(); |
| 1686 | Paired.I->eraseFromParent(); |
| 1687 | return New; |
| 1688 | } |
| 1689 | |
| 1690 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( |
| 1691 | CombineInfo &CI, CombineInfo &Paired, |
| 1692 | MachineBasicBlock::iterator InsertBefore) { |
| 1693 | MachineBasicBlock *MBB = CI.I->getParent(); |
| 1694 | DebugLoc DL = CI.I->getDebugLoc(); |
| 1695 | |
| 1696 | const unsigned Opcode = getNewOpcode(CI, Paired); |
| 1697 | |
| 1698 | Register SrcReg = |
| 1699 | copyFromSrcRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata); |
| 1700 | |
| 1701 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode)) |
| 1702 | .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr)) |
| 1703 | .addReg(RegNo: SrcReg, flags: RegState::Kill); |
| 1704 | |
| 1705 | if (auto *SAddr = TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::saddr)) |
| 1706 | MIB.add(MO: *SAddr); |
| 1707 | |
| 1708 | MachineInstr *New = |
| 1709 | MIB.addImm(Val: std::min(a: CI.Offset, b: Paired.Offset)) |
| 1710 | .addImm(Val: CI.CPol) |
| 1711 | .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
| 1712 | |
| 1713 | CI.I->eraseFromParent(); |
| 1714 | Paired.I->eraseFromParent(); |
| 1715 | return New; |
| 1716 | } |
| 1717 | |
| 1718 | static bool needsConstrainedOpcode(const GCNSubtarget &STM, |
| 1719 | ArrayRef<MachineMemOperand *> MMOs, |
| 1720 | unsigned Width) { |
  // Conservatively return true if the MMO is not found.
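  // For example, when merging two dword loads (Width == 2) the combined load
  // accesses 8 bytes; on an XNACK-enabled target, if the single known MMO only
  // guarantees 4-byte alignment (or there is no single MMO at all), the
  // constrained "_ec" opcode variants are selected by the callers.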
| 1722 | return STM.isXNACKEnabled() && |
| 1723 | (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4); |
| 1724 | } |
| 1725 | |
| 1726 | unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, |
| 1727 | const CombineInfo &Paired) { |
| 1728 | const unsigned Width = CI.Width + Paired.Width; |
| 1729 | |
| 1730 | switch (getCommonInstClass(CI, Paired)) { |
| 1731 | default: |
| 1732 | assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); |
| 1733 | // FIXME: Handle d16 correctly |
| 1734 | return AMDGPU::getMUBUFOpcode(BaseOpc: AMDGPU::getMUBUFBaseOpcode(Opc: CI.I->getOpcode()), |
| 1735 | Elements: Width); |
| 1736 | case TBUFFER_LOAD: |
| 1737 | case TBUFFER_STORE: |
| 1738 | return AMDGPU::getMTBUFOpcode(BaseOpc: AMDGPU::getMTBUFBaseOpcode(Opc: CI.I->getOpcode()), |
| 1739 | Elements: Width); |
| 1740 | |
| 1741 | case UNKNOWN: |
    llvm_unreachable("Unknown instruction class");
| 1743 | case S_BUFFER_LOAD_IMM: { |
| 1744 | // If XNACK is enabled, use the constrained opcodes when the first load is |
| 1745 | // under-aligned. |
| 1746 | bool NeedsConstrainedOpc = |
| 1747 | needsConstrainedOpcode(STM: *STM, MMOs: CI.I->memoperands(), Width); |
| 1748 | switch (Width) { |
| 1749 | default: |
| 1750 | return 0; |
| 1751 | case 2: |
| 1752 | return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec |
| 1753 | : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; |
| 1754 | case 3: |
| 1755 | return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec |
| 1756 | : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; |
| 1757 | case 4: |
| 1758 | return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec |
| 1759 | : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; |
| 1760 | case 8: |
| 1761 | return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec |
| 1762 | : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; |
| 1763 | } |
| 1764 | } |
| 1765 | case S_BUFFER_LOAD_SGPR_IMM: { |
| 1766 | // If XNACK is enabled, use the constrained opcodes when the first load is |
| 1767 | // under-aligned. |
| 1768 | bool NeedsConstrainedOpc = |
| 1769 | needsConstrainedOpcode(STM: *STM, MMOs: CI.I->memoperands(), Width); |
| 1770 | switch (Width) { |
| 1771 | default: |
| 1772 | return 0; |
| 1773 | case 2: |
| 1774 | return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec |
| 1775 | : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; |
| 1776 | case 3: |
| 1777 | return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec |
| 1778 | : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; |
| 1779 | case 4: |
| 1780 | return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec |
| 1781 | : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; |
| 1782 | case 8: |
| 1783 | return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec |
| 1784 | : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; |
| 1785 | } |
| 1786 | } |
| 1787 | case S_LOAD_IMM: { |
| 1788 | // If XNACK is enabled, use the constrained opcodes when the first load is |
| 1789 | // under-aligned. |
| 1790 | bool NeedsConstrainedOpc = |
| 1791 | needsConstrainedOpcode(STM: *STM, MMOs: CI.I->memoperands(), Width); |
| 1792 | switch (Width) { |
| 1793 | default: |
| 1794 | return 0; |
| 1795 | case 2: |
| 1796 | return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec |
| 1797 | : AMDGPU::S_LOAD_DWORDX2_IMM; |
| 1798 | case 3: |
| 1799 | return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec |
| 1800 | : AMDGPU::S_LOAD_DWORDX3_IMM; |
| 1801 | case 4: |
| 1802 | return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec |
| 1803 | : AMDGPU::S_LOAD_DWORDX4_IMM; |
| 1804 | case 8: |
| 1805 | return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec |
| 1806 | : AMDGPU::S_LOAD_DWORDX8_IMM; |
| 1807 | } |
| 1808 | } |
| 1809 | case GLOBAL_LOAD: |
| 1810 | switch (Width) { |
| 1811 | default: |
| 1812 | return 0; |
| 1813 | case 2: |
| 1814 | return AMDGPU::GLOBAL_LOAD_DWORDX2; |
| 1815 | case 3: |
| 1816 | return AMDGPU::GLOBAL_LOAD_DWORDX3; |
| 1817 | case 4: |
| 1818 | return AMDGPU::GLOBAL_LOAD_DWORDX4; |
| 1819 | } |
| 1820 | case GLOBAL_LOAD_SADDR: |
| 1821 | switch (Width) { |
| 1822 | default: |
| 1823 | return 0; |
| 1824 | case 2: |
| 1825 | return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; |
| 1826 | case 3: |
| 1827 | return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; |
| 1828 | case 4: |
| 1829 | return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; |
| 1830 | } |
| 1831 | case GLOBAL_STORE: |
| 1832 | switch (Width) { |
| 1833 | default: |
| 1834 | return 0; |
| 1835 | case 2: |
| 1836 | return AMDGPU::GLOBAL_STORE_DWORDX2; |
| 1837 | case 3: |
| 1838 | return AMDGPU::GLOBAL_STORE_DWORDX3; |
| 1839 | case 4: |
| 1840 | return AMDGPU::GLOBAL_STORE_DWORDX4; |
| 1841 | } |
| 1842 | case GLOBAL_STORE_SADDR: |
| 1843 | switch (Width) { |
| 1844 | default: |
| 1845 | return 0; |
| 1846 | case 2: |
| 1847 | return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; |
| 1848 | case 3: |
| 1849 | return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; |
| 1850 | case 4: |
| 1851 | return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; |
| 1852 | } |
| 1853 | case FLAT_LOAD: |
| 1854 | switch (Width) { |
| 1855 | default: |
| 1856 | return 0; |
| 1857 | case 2: |
| 1858 | return AMDGPU::FLAT_LOAD_DWORDX2; |
| 1859 | case 3: |
| 1860 | return AMDGPU::FLAT_LOAD_DWORDX3; |
| 1861 | case 4: |
| 1862 | return AMDGPU::FLAT_LOAD_DWORDX4; |
| 1863 | } |
| 1864 | case FLAT_STORE: |
| 1865 | switch (Width) { |
| 1866 | default: |
| 1867 | return 0; |
| 1868 | case 2: |
| 1869 | return AMDGPU::FLAT_STORE_DWORDX2; |
| 1870 | case 3: |
| 1871 | return AMDGPU::FLAT_STORE_DWORDX3; |
| 1872 | case 4: |
| 1873 | return AMDGPU::FLAT_STORE_DWORDX4; |
| 1874 | } |
| 1875 | case MIMG: |
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
| 1878 | return AMDGPU::getMaskedMIMGOp(Opc: CI.I->getOpcode(), NewChannels: Width); |
| 1879 | } |
| 1880 | } |
| 1881 | |
| 1882 | std::pair<unsigned, unsigned> |
| 1883 | SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, |
| 1884 | const CombineInfo &Paired) { |
  assert((CI.InstClass != MIMG ||
          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
           CI.Width + Paired.Width)) &&
         "No overlaps");
| 1889 | |
| 1890 | unsigned Idx0; |
| 1891 | unsigned Idx1; |
| 1892 | |
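  // Idxs[Row][Col] is the sub-register index covering Col + 1 dwords starting
  // at dword Row of the merged value. For example, if CI comes first with
  // Width == 2 and Paired has Width == 1, the result is
  // {AMDGPU::sub0_sub1, AMDGPU::sub2}.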
| 1893 | static const unsigned Idxs[5][4] = { |
| 1894 | {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, |
| 1895 | {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, |
| 1896 | {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, |
| 1897 | {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, |
| 1898 | {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, |
| 1899 | }; |
| 1900 | |
| 1901 | assert(CI.Width >= 1 && CI.Width <= 4); |
| 1902 | assert(Paired.Width >= 1 && Paired.Width <= 4); |
| 1903 | |
| 1904 | if (Paired < CI) { |
| 1905 | Idx1 = Idxs[0][Paired.Width - 1]; |
| 1906 | Idx0 = Idxs[Paired.Width][CI.Width - 1]; |
| 1907 | } else { |
| 1908 | Idx0 = Idxs[0][CI.Width - 1]; |
| 1909 | Idx1 = Idxs[CI.Width][Paired.Width - 1]; |
| 1910 | } |
| 1911 | |
| 1912 | return {Idx0, Idx1}; |
| 1913 | } |
| 1914 | |
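// Pick a register class wide enough to hold the merged result. For example,
// merging two dword SMEM loads selects SReg_64_XEXEC, while two dword VMEM
// accesses get a 64-bit VGPR (or AGPR) class.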
| 1915 | const TargetRegisterClass * |
| 1916 | SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, |
| 1917 | const CombineInfo &Paired) const { |
| 1918 | if (CI.InstClass == S_BUFFER_LOAD_IMM || |
| 1919 | CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { |
| 1920 | switch (CI.Width + Paired.Width) { |
| 1921 | default: |
| 1922 | return nullptr; |
| 1923 | case 2: |
| 1924 | return &AMDGPU::SReg_64_XEXECRegClass; |
| 1925 | case 3: |
| 1926 | return &AMDGPU::SGPR_96RegClass; |
| 1927 | case 4: |
| 1928 | return &AMDGPU::SGPR_128RegClass; |
| 1929 | case 8: |
| 1930 | return &AMDGPU::SGPR_256RegClass; |
| 1931 | case 16: |
| 1932 | return &AMDGPU::SGPR_512RegClass; |
| 1933 | } |
| 1934 | } |
| 1935 | |
| 1936 | unsigned BitWidth = 32 * (CI.Width + Paired.Width); |
| 1937 | return TRI->isAGPRClass(RC: getDataRegClass(MI: *CI.I)) |
| 1938 | ? TRI->getAGPRClassForBitWidth(BitWidth) |
| 1939 | : TRI->getVGPRClassForBitWidth(BitWidth); |
| 1940 | } |
| 1941 | |
| 1942 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( |
| 1943 | CombineInfo &CI, CombineInfo &Paired, |
| 1944 | MachineBasicBlock::iterator InsertBefore) { |
| 1945 | MachineBasicBlock *MBB = CI.I->getParent(); |
| 1946 | DebugLoc DL = CI.I->getDebugLoc(); |
| 1947 | |
| 1948 | const unsigned Opcode = getNewOpcode(CI, Paired); |
| 1949 | |
| 1950 | Register SrcReg = |
| 1951 | copyFromSrcRegs(CI, Paired, InsertBefore, OpName: AMDGPU::OpName::vdata); |
| 1952 | |
| 1953 | auto MIB = BuildMI(BB&: *MBB, I: InsertBefore, MIMD: DL, MCID: TII->get(Opcode)) |
| 1954 | .addReg(RegNo: SrcReg, flags: RegState::Kill); |
| 1955 | |
| 1956 | AddressRegs Regs = getRegs(Opc: Opcode, TII: *TII); |
| 1957 | |
| 1958 | if (Regs.VAddr) |
| 1959 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::vaddr)); |
| 1960 | |
| 1961 | |
| 1962 | // It shouldn't be possible to get this far if the two instructions |
| 1963 | // don't have a single memoperand, because MachineInstr::mayAlias() |
| 1964 | // will return true if this is the case. |
| 1965 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
| 1966 | |
| 1967 | MachineInstr *New = |
| 1968 | MIB.add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::srsrc)) |
| 1969 | .add(MO: *TII->getNamedOperand(MI&: *CI.I, OperandName: AMDGPU::OpName::soffset)) |
| 1970 | .addImm(Val: std::min(a: CI.Offset, b: Paired.Offset)) // offset |
| 1971 | .addImm(Val: CI.CPol) // cpol |
| 1972 | .addImm(Val: 0) // swz |
| 1973 | .addMemOperand(MMO: combineKnownAdjacentMMOs(CI, Paired)); |
| 1974 | |
| 1975 | CI.I->eraseFromParent(); |
| 1976 | Paired.I->eraseFromParent(); |
| 1977 | return New; |
| 1978 | } |
| 1979 | |
| 1980 | MachineOperand |
| 1981 | SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { |
| 1982 | APInt V(32, Val, true); |
| 1983 | if (TII->isInlineConstant(Imm: V)) |
| 1984 | return MachineOperand::CreateImm(Val); |
| 1985 | |
| 1986 | Register Reg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
| 1987 | MachineInstr *Mov = |
| 1988 | BuildMI(BB&: *MI.getParent(), I: MI.getIterator(), MIMD: MI.getDebugLoc(), |
| 1989 | MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: Reg) |
| 1990 | .addImm(Val); |
| 1991 | (void)Mov; |
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
| 1993 | return MachineOperand::CreateReg(Reg, isDef: false); |
| 1994 | } |
| 1995 | |
| 1996 | // Compute base address using Addr and return the final register. |
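// The emitted sequence looks roughly like this (illustrative MIR; the offset
// operands may be immediates or S_MOV_B32 results from createRegOrImm):
//   %lo:vgpr_32, %carry:sreg_64_xexec = V_ADD_CO_U32_e64 %base_lo, %offset_lo, 0
//   %hi:vgpr_32, %dead:sreg_64_xexec = V_ADDC_U32_e64 %base_hi, %offset_hi, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1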
| 1997 | Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, |
| 1998 | const MemAddress &Addr) const { |
| 1999 | MachineBasicBlock *MBB = MI.getParent(); |
| 2000 | MachineBasicBlock::iterator MBBI = MI.getIterator(); |
| 2001 | DebugLoc DL = MI.getDebugLoc(); |
| 2002 | |
  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");
| 2010 | |
  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
| 2012 | MachineOperand OffsetLo = createRegOrImm(Val: static_cast<int32_t>(Addr.Offset), MI); |
| 2013 | MachineOperand OffsetHi = |
| 2014 | createRegOrImm(Val: static_cast<int32_t>(Addr.Offset >> 32), MI); |
| 2015 | |
| 2016 | const auto *CarryRC = TRI->getWaveMaskRegClass(); |
| 2017 | Register CarryReg = MRI->createVirtualRegister(RegClass: CarryRC); |
| 2018 | Register DeadCarryReg = MRI->createVirtualRegister(RegClass: CarryRC); |
| 2019 | |
| 2020 | Register DestSub0 = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 2021 | Register DestSub1 = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 2022 | MachineInstr *LoHalf = |
| 2023 | BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg: DestSub0) |
| 2024 | .addReg(RegNo: CarryReg, flags: RegState::Define) |
| 2025 | .addReg(RegNo: Addr.Base.LoReg, flags: 0, SubReg: Addr.Base.LoSubReg) |
| 2026 | .add(MO: OffsetLo) |
| 2027 | .addImm(Val: 0); // clamp bit |
| 2028 | (void)LoHalf; |
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
| 2030 | |
| 2031 | MachineInstr *HiHalf = |
| 2032 | BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_ADDC_U32_e64), DestReg: DestSub1) |
| 2033 | .addReg(RegNo: DeadCarryReg, flags: RegState::Define | RegState::Dead) |
| 2034 | .addReg(RegNo: Addr.Base.HiReg, flags: 0, SubReg: Addr.Base.HiSubReg) |
| 2035 | .add(MO: OffsetHi) |
| 2036 | .addReg(RegNo: CarryReg, flags: RegState::Kill) |
| 2037 | .addImm(Val: 0); // clamp bit |
| 2038 | (void)HiHalf; |
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
| 2040 | |
| 2041 | Register FullDestReg = MRI->createVirtualRegister(RegClass: TRI->getVGPR64Class()); |
| 2042 | MachineInstr *FullBase = |
| 2043 | BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg) |
| 2044 | .addReg(RegNo: DestSub0) |
| 2045 | .addImm(Val: AMDGPU::sub0) |
| 2046 | .addReg(RegNo: DestSub1) |
| 2047 | .addImm(Val: AMDGPU::sub1); |
| 2048 | (void)FullBase; |
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
| 2050 | |
| 2051 | return FullDestReg; |
| 2052 | } |
| 2053 | |
| 2054 | // Update base and offset with the NewBase and NewOffset in MI. |
| 2055 | void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, |
| 2056 | Register NewBase, |
| 2057 | int32_t NewOffset) const { |
| 2058 | auto *Base = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr); |
| 2059 | Base->setReg(NewBase); |
| 2060 | Base->setIsKill(false); |
| 2061 | TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->setImm(NewOffset); |
| 2062 | } |
| 2063 | |
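// Return the constant offset behind Op: either the immediate itself, or the
// immediate materialized by a single S_MOV_B32 defining the register, e.g.
//   %off:sgpr_32 = S_MOV_B32 8000
// yields 8000. Anything else returns std::nullopt.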
| 2064 | std::optional<int32_t> |
| 2065 | SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { |
| 2066 | if (Op.isImm()) |
| 2067 | return Op.getImm(); |
| 2068 | |
| 2069 | if (!Op.isReg()) |
| 2070 | return std::nullopt; |
| 2071 | |
| 2072 | MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Op.getReg()); |
| 2073 | if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || |
| 2074 | !Def->getOperand(i: 1).isImm()) |
| 2075 | return std::nullopt; |
| 2076 | |
| 2077 | return Def->getOperand(i: 1).getImm(); |
| 2078 | } |
| 2079 | |
// Analyze Base and extract:
//  - the 32-bit base registers and subregisters
//  - the 64-bit constant offset
// Expecting the base computation to look like:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
| 2090 | void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, |
| 2091 | MemAddress &Addr) const { |
| 2092 | if (!Base.isReg()) |
| 2093 | return; |
| 2094 | |
| 2095 | MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Base.getReg()); |
| 2096 | if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE |
| 2097 | || Def->getNumOperands() != 5) |
| 2098 | return; |
| 2099 | |
| 2100 | MachineOperand BaseLo = Def->getOperand(i: 1); |
| 2101 | MachineOperand BaseHi = Def->getOperand(i: 3); |
| 2102 | if (!BaseLo.isReg() || !BaseHi.isReg()) |
| 2103 | return; |
| 2104 | |
| 2105 | MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(Reg: BaseLo.getReg()); |
| 2106 | MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(Reg: BaseHi.getReg()); |
| 2107 | |
| 2108 | if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 || |
| 2109 | !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) |
| 2110 | return; |
| 2111 | |
| 2112 | const auto *Src0 = TII->getNamedOperand(MI&: *BaseLoDef, OperandName: AMDGPU::OpName::src0); |
| 2113 | const auto *Src1 = TII->getNamedOperand(MI&: *BaseLoDef, OperandName: AMDGPU::OpName::src1); |
| 2114 | |
| 2115 | auto Offset0P = extractConstOffset(Op: *Src0); |
| 2116 | if (Offset0P) |
| 2117 | BaseLo = *Src1; |
| 2118 | else { |
| 2119 | if (!(Offset0P = extractConstOffset(Op: *Src1))) |
| 2120 | return; |
| 2121 | BaseLo = *Src0; |
| 2122 | } |
| 2123 | |
| 2124 | if (!BaseLo.isReg()) |
| 2125 | return; |
| 2126 | |
| 2127 | Src0 = TII->getNamedOperand(MI&: *BaseHiDef, OperandName: AMDGPU::OpName::src0); |
| 2128 | Src1 = TII->getNamedOperand(MI&: *BaseHiDef, OperandName: AMDGPU::OpName::src1); |
| 2129 | |
| 2130 | if (Src0->isImm()) |
| 2131 | std::swap(a&: Src0, b&: Src1); |
| 2132 | |
| 2133 | if (!Src1->isImm() || Src0->isImm()) |
| 2134 | return; |
| 2135 | |
| 2136 | uint64_t Offset1 = Src1->getImm(); |
| 2137 | BaseHi = *Src0; |
| 2138 | |
| 2139 | if (!BaseHi.isReg()) |
| 2140 | return; |
| 2141 | |
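  // The low add supplies bits [31:0] of the constant offset and the high add's
  // immediate supplies bits [63:32].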
| 2142 | Addr.Base.LoReg = BaseLo.getReg(); |
| 2143 | Addr.Base.HiReg = BaseHi.getReg(); |
| 2144 | Addr.Base.LoSubReg = BaseLo.getSubReg(); |
| 2145 | Addr.Base.HiSubReg = BaseHi.getSubReg(); |
| 2146 | Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); |
| 2147 | } |
| 2148 | |
| 2149 | bool SILoadStoreOptimizer::promoteConstantOffsetToImm( |
| 2150 | MachineInstr &MI, |
| 2151 | MemInfoMap &Visited, |
| 2152 | SmallPtrSet<MachineInstr *, 4> &AnchorList) const { |
| 2153 | |
| 2154 | if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI)) |
| 2155 | return false; |
| 2156 | |
| 2157 | // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers. |
| 2158 | if (SIInstrInfo::isFLATScratch(MI)) |
| 2159 | return false; |
| 2160 | |
| 2161 | unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS |
| 2162 | : AMDGPUAS::FLAT_ADDRESS; |
| 2163 | |
| 2164 | if (AnchorList.count(Ptr: &MI)) |
| 2165 | return false; |
| 2166 | |
  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
| 2168 | |
| 2169 | if (TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm()) { |
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
| 2171 | return false; |
| 2172 | } |
| 2173 | |
  // Step1: Find the base registers and a 64-bit constant offset.
| 2175 | MachineOperand &Base = *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr); |
| 2176 | auto [It, Inserted] = Visited.try_emplace(Key: &MI); |
| 2177 | MemAddress MAddr; |
| 2178 | if (Inserted) { |
| 2179 | processBaseWithConstOffset(Base, Addr&: MAddr); |
| 2180 | It->second = MAddr; |
| 2181 | } else |
| 2182 | MAddr = It->second; |
| 2183 | |
| 2184 | if (MAddr.Offset == 0) { |
| 2185 | LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no" |
| 2186 | " constant offsets that can be promoted.\n" ;); |
| 2187 | return false; |
| 2188 | } |
| 2189 | |
| 2190 | LLVM_DEBUG(dbgs() << " BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", " |
| 2191 | << printReg(MAddr.Base.LoReg, TRI) |
| 2192 | << "} Offset: " << MAddr.Offset << "\n\n" ;); |
| 2193 | |
  // Step2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the largest
  // legal 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization tries to find a new base
  // from which (&a + 4096) is within a 13-bit offset. Both &a + 6144 and
  // &a + 8192 qualify. The heuristic picks &a + 8192 as the new base (anchor)
  // because the larger distance can presumably accommodate more intermediate
  // addresses.
| 2209 | // |
| 2210 | // Step3: move (&a + 8192) above load1. Compute and promote offsets from |
| 2211 | // (&a + 8192) for load1, load2, load4. |
| 2212 | // addr = &a + 8192 |
| 2213 | // load1 = load(addr, -4096) |
| 2214 | // load2 = load(addr, -2048) |
| 2215 | // load3 = load(addr, 0) |
| 2216 | // load4 = load(addr, 2048) |
| 2217 | // addr5 = &a + 12288; load5 = load(addr5, 0) |
| 2218 | // |
| 2219 | MachineInstr *AnchorInst = nullptr; |
| 2220 | MemAddress AnchorAddr; |
| 2221 | uint32_t MaxDist = std::numeric_limits<uint32_t>::min(); |
| 2222 | SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase; |
| 2223 | |
| 2224 | MachineBasicBlock *MBB = MI.getParent(); |
| 2225 | MachineBasicBlock::iterator E = MBB->end(); |
| 2226 | MachineBasicBlock::iterator MBBI = MI.getIterator(); |
| 2227 | ++MBBI; |
| 2228 | const SITargetLowering *TLI = |
| 2229 | static_cast<const SITargetLowering *>(STM->getTargetLowering()); |
| 2230 | |
| 2231 | for ( ; MBBI != E; ++MBBI) { |
| 2232 | MachineInstr &MINext = *MBBI; |
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
| 2235 | if (MINext.getOpcode() != MI.getOpcode() || |
| 2236 | TII->getNamedOperand(MI&: MINext, OperandName: AMDGPU::OpName::offset)->getImm()) |
| 2237 | continue; |
| 2238 | |
| 2239 | const MachineOperand &BaseNext = |
| 2240 | *TII->getNamedOperand(MI&: MINext, OperandName: AMDGPU::OpName::vaddr); |
| 2241 | MemAddress MAddrNext; |
| 2242 | auto [It, Inserted] = Visited.try_emplace(Key: &MINext); |
| 2243 | if (Inserted) { |
| 2244 | processBaseWithConstOffset(Base: BaseNext, Addr&: MAddrNext); |
| 2245 | It->second = MAddrNext; |
| 2246 | } else |
| 2247 | MAddrNext = It->second; |
| 2248 | |
| 2249 | if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || |
| 2250 | MAddrNext.Base.HiReg != MAddr.Base.HiReg || |
| 2251 | MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || |
| 2252 | MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) |
| 2253 | continue; |
| 2254 | |
| 2255 | InstsWCommonBase.emplace_back(Args: &MINext, Args&: MAddrNext.Offset); |
| 2256 | |
| 2257 | int64_t Dist = MAddr.Offset - MAddrNext.Offset; |
| 2258 | TargetLoweringBase::AddrMode AM; |
| 2259 | AM.HasBaseReg = true; |
| 2260 | AM.BaseOffs = Dist; |
| 2261 | if (TLI->isLegalFlatAddressingMode(AM, AddrSpace: AS) && |
| 2262 | (uint32_t)std::abs(i: Dist) > MaxDist) { |
| 2263 | MaxDist = std::abs(i: Dist); |
| 2264 | |
| 2265 | AnchorAddr = MAddrNext; |
| 2266 | AnchorInst = &MINext; |
| 2267 | } |
| 2268 | } |
| 2269 | |
| 2270 | if (AnchorInst) { |
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
| 2272 | AnchorInst->dump()); |
| 2273 | LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " |
                      << AnchorAddr.Offset << "\n\n");
| 2275 | |
| 2276 | // Instead of moving up, just re-compute anchor-instruction's base address. |
| 2277 | Register Base = computeBase(MI, Addr: AnchorAddr); |
| 2278 | |
| 2279 | updateBaseAndOffset(MI, NewBase: Base, NewOffset: MAddr.Offset - AnchorAddr.Offset); |
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
| 2281 | |
| 2282 | for (auto [OtherMI, OtherOffset] : InstsWCommonBase) { |
| 2283 | TargetLoweringBase::AddrMode AM; |
| 2284 | AM.HasBaseReg = true; |
| 2285 | AM.BaseOffs = OtherOffset - AnchorAddr.Offset; |
| 2286 | |
| 2287 | if (TLI->isLegalFlatAddressingMode(AM, AddrSpace: AS)) { |
        LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
| 2289 | OtherMI->dump()); |
| 2290 | updateBaseAndOffset(MI&: *OtherMI, NewBase: Base, NewOffset: OtherOffset - AnchorAddr.Offset); |
        LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
| 2292 | } |
| 2293 | } |
| 2294 | AnchorList.insert(Ptr: AnchorInst); |
| 2295 | return true; |
| 2296 | } |
| 2297 | |
| 2298 | return false; |
| 2299 | } |
| 2300 | |
| 2301 | void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, |
| 2302 | std::list<std::list<CombineInfo> > &MergeableInsts) const { |
| 2303 | for (std::list<CombineInfo> &AddrList : MergeableInsts) { |
| 2304 | if (AddrList.front().InstClass == CI.InstClass && |
| 2305 | AddrList.front().IsAGPR == CI.IsAGPR && |
| 2306 | AddrList.front().hasSameBaseAddress(CI)) { |
| 2307 | AddrList.emplace_back(args: CI); |
| 2308 | return; |
| 2309 | } |
| 2310 | } |
| 2311 | |
| 2312 | // Base address not found, so add a new list. |
| 2313 | MergeableInsts.emplace_back(args: 1, args: CI); |
| 2314 | } |
| 2315 | |
| 2316 | std::pair<MachineBasicBlock::iterator, bool> |
| 2317 | SILoadStoreOptimizer::collectMergeableInsts( |
| 2318 | MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, |
| 2319 | MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, |
| 2320 | std::list<std::list<CombineInfo>> &MergeableInsts) const { |
| 2321 | bool Modified = false; |
| 2322 | |
| 2323 | // Sort potential mergeable instructions into lists. One list per base address. |
| 2324 | unsigned Order = 0; |
| 2325 | MachineBasicBlock::iterator BlockI = Begin; |
| 2326 | for (; BlockI != End; ++BlockI) { |
| 2327 | MachineInstr &MI = *BlockI; |
| 2328 | |
| 2329 | // We run this before checking if an address is mergeable, because it can produce |
| 2330 | // better code even if the instructions aren't mergeable. |
| 2331 | if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) |
| 2332 | Modified = true; |
| 2333 | |
| 2334 | // Treat volatile accesses, ordered accesses and unmodeled side effects as |
| 2335 | // barriers. We can look after this barrier for separate merges. |
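    // Typical examples are ordered atomics, volatile accesses, and inline asm
    // that may touch memory.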
| 2336 | if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) { |
| 2337 | LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI); |
| 2338 | |
| 2339 | // Search will resume after this instruction in a separate merge list. |
| 2340 | ++BlockI; |
| 2341 | break; |
| 2342 | } |
| 2343 | |
| 2344 | const InstClassEnum InstClass = getInstClass(Opc: MI.getOpcode(), TII: *TII); |
| 2345 | if (InstClass == UNKNOWN) |
| 2346 | continue; |
| 2347 | |
| 2348 | // Do not merge VMEM buffer instructions with "swizzled" bit set. |
| 2349 | int Swizzled = |
| 2350 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::swz); |
| 2351 | if (Swizzled != -1 && MI.getOperand(i: Swizzled).getImm()) |
| 2352 | continue; |
| 2353 | |
| 2354 | CombineInfo CI; |
| 2355 | CI.setMI(MI, LSO: *this); |
| 2356 | CI.Order = Order++; |
| 2357 | |
| 2358 | if (!CI.hasMergeableAddress(MRI: *MRI)) |
| 2359 | continue; |
| 2360 | |
| 2361 | if (CI.InstClass == DS_WRITE && CI.IsAGPR) { |
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However we are reporting that ds_write2 shall have
      // only VGPR data so that machine copy propagation does not
      // create an illegal instruction with VGPR and AGPR sources.
      // Consequently, if we created such an instruction the verifier
      // would complain.
| 2368 | continue; |
| 2369 | } |
| 2370 | |
| 2371 | LLVM_DEBUG(dbgs() << "Mergeable: " << MI); |
| 2372 | |
| 2373 | addInstToMergeableList(CI, MergeableInsts); |
| 2374 | } |
| 2375 | |
  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort each list by offset so that mergeable candidates end up
  // adjacent to each other, and discard lists with fewer than two
  // instructions, since those cannot produce a merge. The actual pairing
  // happens later, in optimizeInstsWithSameBaseAddr().
| 2382 | |
| 2383 | for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), |
| 2384 | E = MergeableInsts.end(); I != E;) { |
| 2385 | |
| 2386 | std::list<CombineInfo> &MergeList = *I; |
| 2387 | if (MergeList.size() <= 1) { |
| 2388 | // This means we have found only one instruction with a given address |
| 2389 | // that can be merged, and we need at least 2 instructions to do a merge, |
| 2390 | // so this list can be discarded. |
| 2391 | I = MergeableInsts.erase(position: I); |
| 2392 | continue; |
| 2393 | } |
| 2394 | |
| 2395 | // Sort the lists by offsets, this way mergeable instructions will be |
| 2396 | // adjacent to each other in the list, which will make it easier to find |
| 2397 | // matches. |
| 2398 | MergeList.sort( |
| 2399 | comp: [] (const CombineInfo &A, const CombineInfo &B) { |
| 2400 | return A.Offset < B.Offset; |
| 2401 | }); |
| 2402 | ++I; |
| 2403 | } |
| 2404 | |
| 2405 | return {BlockI, Modified}; |
| 2406 | } |
| 2407 | |
| 2408 | // Scan through looking for adjacent LDS operations with constant offsets from |
| 2409 | // the same base register. We rely on the scheduler to do the hard work of |
| 2410 | // clustering nearby loads, and assume these are all adjacent. |
| 2411 | bool SILoadStoreOptimizer::optimizeBlock( |
| 2412 | std::list<std::list<CombineInfo> > &MergeableInsts) { |
| 2413 | bool Modified = false; |
| 2414 | |
| 2415 | for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), |
| 2416 | E = MergeableInsts.end(); I != E;) { |
| 2417 | std::list<CombineInfo> &MergeList = *I; |
| 2418 | |
| 2419 | bool OptimizeListAgain = false; |
| 2420 | if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { |
| 2421 | // We weren't able to make any changes, so delete the list so we don't |
| 2422 | // process the same instructions the next time we try to optimize this |
| 2423 | // block. |
| 2424 | I = MergeableInsts.erase(position: I); |
| 2425 | continue; |
| 2426 | } |
| 2427 | |
| 2428 | Modified = true; |
| 2429 | |
    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
| 2432 | if (!OptimizeListAgain) { |
| 2433 | I = MergeableInsts.erase(position: I); |
| 2434 | continue; |
| 2435 | } |
| 2436 | OptimizeAgain = true; |
| 2437 | } |
| 2438 | return Modified; |
| 2439 | } |
| 2440 | |
| 2441 | bool |
| 2442 | SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( |
| 2443 | std::list<CombineInfo> &MergeList, |
| 2444 | bool &OptimizeListAgain) { |
| 2445 | if (MergeList.empty()) |
| 2446 | return false; |
| 2447 | |
| 2448 | bool Modified = false; |
| 2449 | |
| 2450 | for (auto I = MergeList.begin(), Next = std::next(x: I); Next != MergeList.end(); |
| 2451 | Next = std::next(x: I)) { |
| 2452 | |
| 2453 | auto First = I; |
| 2454 | auto Second = Next; |
| 2455 | |
| 2456 | if ((*First).Order > (*Second).Order) |
| 2457 | std::swap(a&: First, b&: Second); |
| 2458 | CombineInfo &CI = *First; |
| 2459 | CombineInfo &Paired = *Second; |
| 2460 | |
| 2461 | CombineInfo *Where = checkAndPrepareMerge(CI, Paired); |
| 2462 | if (!Where) { |
| 2463 | ++I; |
| 2464 | continue; |
| 2465 | } |
| 2466 | |
| 2467 | Modified = true; |
| 2468 | |
| 2469 | LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); |
| 2470 | |
| 2471 | MachineBasicBlock::iterator NewMI; |
| 2472 | switch (CI.InstClass) { |
| 2473 | default: |
      llvm_unreachable("unknown InstClass");
| 2475 | break; |
| 2476 | case DS_READ: |
| 2477 | NewMI = mergeRead2Pair(CI, Paired, InsertBefore: Where->I); |
| 2478 | break; |
| 2479 | case DS_WRITE: |
| 2480 | NewMI = mergeWrite2Pair(CI, Paired, InsertBefore: Where->I); |
| 2481 | break; |
| 2482 | case S_BUFFER_LOAD_IMM: |
| 2483 | case S_BUFFER_LOAD_SGPR_IMM: |
| 2484 | case S_LOAD_IMM: |
| 2485 | NewMI = mergeSMemLoadImmPair(CI, Paired, InsertBefore: Where->I); |
| 2486 | OptimizeListAgain |= CI.Width + Paired.Width < 8; |
| 2487 | break; |
| 2488 | case BUFFER_LOAD: |
| 2489 | NewMI = mergeBufferLoadPair(CI, Paired, InsertBefore: Where->I); |
| 2490 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
| 2491 | break; |
| 2492 | case BUFFER_STORE: |
| 2493 | NewMI = mergeBufferStorePair(CI, Paired, InsertBefore: Where->I); |
| 2494 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
| 2495 | break; |
| 2496 | case MIMG: |
| 2497 | NewMI = mergeImagePair(CI, Paired, InsertBefore: Where->I); |
| 2498 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
| 2499 | break; |
| 2500 | case TBUFFER_LOAD: |
| 2501 | NewMI = mergeTBufferLoadPair(CI, Paired, InsertBefore: Where->I); |
| 2502 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
| 2503 | break; |
| 2504 | case TBUFFER_STORE: |
| 2505 | NewMI = mergeTBufferStorePair(CI, Paired, InsertBefore: Where->I); |
| 2506 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
| 2507 | break; |
| 2508 | case FLAT_LOAD: |
| 2509 | case GLOBAL_LOAD: |
| 2510 | case GLOBAL_LOAD_SADDR: |
| 2511 | NewMI = mergeFlatLoadPair(CI, Paired, InsertBefore: Where->I); |
| 2512 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
| 2513 | break; |
| 2514 | case FLAT_STORE: |
| 2515 | case GLOBAL_STORE: |
| 2516 | case GLOBAL_STORE_SADDR: |
| 2517 | NewMI = mergeFlatStorePair(CI, Paired, InsertBefore: Where->I); |
| 2518 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
| 2519 | break; |
| 2520 | } |
| 2521 | CI.setMI(MI: NewMI, LSO: *this); |
| 2522 | CI.Order = Where->Order; |
| 2523 | if (I == Second) |
| 2524 | I = Next; |
| 2525 | |
| 2526 | MergeList.erase(position: Second); |
| 2527 | } |
| 2528 | |
| 2529 | return Modified; |
| 2530 | } |
| 2531 | |
| 2532 | bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) { |
| 2533 | if (skipFunction(F: MF.getFunction())) |
| 2534 | return false; |
| 2535 | return SILoadStoreOptimizer( |
| 2536 | &getAnalysis<AAResultsWrapperPass>().getAAResults()) |
| 2537 | .run(MF); |
| 2538 | } |
| 2539 | |
| 2540 | bool SILoadStoreOptimizer::run(MachineFunction &MF) { |
| 2541 | STM = &MF.getSubtarget<GCNSubtarget>(); |
| 2542 | if (!STM->loadStoreOptEnabled()) |
| 2543 | return false; |
| 2544 | |
| 2545 | TII = STM->getInstrInfo(); |
| 2546 | TRI = &TII->getRegisterInfo(); |
| 2547 | |
| 2548 | MRI = &MF.getRegInfo(); |
| 2549 | |
  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
| 2551 | |
| 2552 | bool Modified = false; |
| 2553 | |
  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
| 2556 | SmallPtrSet<MachineInstr *, 4> AnchorList; |
| 2557 | MemInfoMap Visited; |
| 2558 | |
| 2559 | for (MachineBasicBlock &MBB : MF) { |
| 2560 | MachineBasicBlock::iterator SectionEnd; |
| 2561 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
| 2562 | I = SectionEnd) { |
| 2563 | bool CollectModified; |
| 2564 | std::list<std::list<CombineInfo>> MergeableInsts; |
| 2565 | |
| 2566 | // First pass: Collect list of all instructions we know how to merge in a |
| 2567 | // subset of the block. |
| 2568 | std::tie(args&: SectionEnd, args&: CollectModified) = |
| 2569 | collectMergeableInsts(Begin: I, End: E, Visited, AnchorList, MergeableInsts); |
| 2570 | |
| 2571 | Modified |= CollectModified; |
| 2572 | |
| 2573 | do { |
| 2574 | OptimizeAgain = false; |
| 2575 | Modified |= optimizeBlock(MergeableInsts); |
| 2576 | } while (OptimizeAgain); |
| 2577 | } |
| 2578 | |
| 2579 | Visited.clear(); |
| 2580 | AnchorList.clear(); |
| 2581 | } |
| 2582 | |
| 2583 | return Modified; |
| 2584 | } |
| 2585 | |
| 2586 | PreservedAnalyses |
| 2587 | SILoadStoreOptimizerPass::run(MachineFunction &MF, |
| 2588 | MachineFunctionAnalysisManager &MFAM) { |
| 2589 | MFPropsModifier _(*this, MF); |
| 2590 | |
| 2591 | if (MF.getFunction().hasOptNone()) |
| 2592 | return PreservedAnalyses::all(); |
| 2593 | |
| 2594 | auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(IR&: MF) |
| 2595 | .getManager(); |
| 2596 | AAResults &AA = FAM.getResult<AAManager>(IR&: MF.getFunction()); |
| 2597 | |
| 2598 | bool Changed = SILoadStoreOptimizer(&AA).run(MF); |
| 2599 | if (!Changed) |
| 2600 | return PreservedAnalyses::all(); |
| 2601 | |
| 2602 | PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses(); |
| 2603 | PA.preserveSet<CFGAnalyses>(); |
| 2604 | return PA; |
| 2605 | } |
| 2606 | |