//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "SIPeepholeSDWA.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;

using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;

class SIPeepholeSDWA {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  SDWAOperandsMap PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
  void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
  MachineInstr *createSDWAVersion(MachineInstr &MI);
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

public:
  bool run(MachineFunction &MF);
};

class SIPeepholeSDWALegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

using namespace AMDGPU::SDWA;

class SDWAOperand {
private:
  MachineOperand *Target;   // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

  /// Returns true iff the SDWA selection of this SDWAOperand can be combined
  /// with the SDWA selections of its uses in \p MI.
  virtual bool canCombineSelections(const MachineInstr &MI,
                                    const SIInstrInfo *TII) = 0;

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
                false)

char SIPeepholeSDWALegacy::ID = 0;

char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;

FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
  return new SIPeepholeSDWALegacy();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch (Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD";  break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch (Un) {
  case UNUSED_PAD:      OS << "UNUSED_PAD";      break;
  case UNUSED_SEXT:     OS << "UNUSED_SEXT";     break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr.
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg.
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

/// Combine an SDWA instruction's existing SDWA selection \p Sel with
/// the SDWA selection \p OperandSel of its operand. If the selections
/// are compatible, return the combined selection, otherwise return
/// std::nullopt.
/// For example, if we have Sel = BYTE_0 and OperandSel = WORD_1:
///   BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
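/// More examples, read off the cases handled below:
///   WORD_0 Sel (WORD_1 Sel (%X)) -> WORD_1 Sel (%X)
///   BYTE_1 Sel (WORD_1 Sel (%X)) -> BYTE_3 Sel (%X)
///   WORD_1 Sel (BYTE_0 Sel (%X)) -> std::nullopt, since the outer
///   selection would only pick the zero/sign-extension bits produced
///   by the inner byte selection.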
static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
  if (Sel == SdwaSel::DWORD)
    return OperandSel;

  if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
    return Sel;

  if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
      Sel == SdwaSel::BYTE_3)
    return {};

  if (OperandSel == SdwaSel::WORD_0)
    return Sel;

  if (OperandSel == SdwaSel::WORD_1) {
    if (Sel == SdwaSel::BYTE_0)
      return SdwaSel::BYTE_2;
    if (Sel == SdwaSel::BYTE_1)
      return SdwaSel::BYTE_3;
    if (Sel == SdwaSel::WORD_0)
      return SdwaSel::WORD_1;
  }

  return {};
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all of them can be converted.
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted.
      if (!isConvertibleToSDWA(UseMI, ST, TII) ||
          !canCombineSelections(UseMI, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here.
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand, the potential instruction is one that uses the
  // register defined by the parent instruction.
  MachineOperand *PotentialMO =
      findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  MachineInstr *Parent = PotentialMO->getParent();

  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  case AMDGPU::V_CNDMASK_B32_sdwa:
    // SISrcMods uses the same bitmask for SEXT and NEG modifiers and
    // hence the compiler can only support one type of modifier for
    // each SDWA instruction. For V_CNDMASK_B32_sdwa, this is NEG
    // since its operands get printed using
    // AMDGPUInstPrinter::printOperandAndFPInputMods which produces
    // the output intended for NEG if SEXT is set.
    //
    // The ISA does actually support both modifiers on most SDWA
    // instructions.
    //
    // FIXME: Accept SEXT here after fixing this issue.
    if (Sext)
      return false;
    break;
  }

  // Find the operand in the instruction that matches the replaced operand and
  // replace it with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1.
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src || !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src.
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
    SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel()));
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

/// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA
/// instruction \p MI can be combined with the selection \p OpSel.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
  assert(TII->isSDWA(MI.getOpcode()));

  const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName);
  SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());

  return combineSdwaSel(SrcSel, OpSel).has_value();
}

/// Verify that \p Op is the same register as the operand of the SDWA
/// instruction \p MI named by \p SrcOpName and that the SDWA
/// selection \p SrcSelOpName can be combined with \p OpSel.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcOpName,
                            AMDGPU::OpName SrcSelOpName, MachineOperand *Op,
                            SdwaSel OpSel) {
  assert(TII->isSDWA(MI.getOpcode()));

  const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName);
  if (!Src || !isSameReg(*Src, *Op))
    return true;

  return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
}

bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  using namespace AMDGPU;

  return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel,
                         getReplacedOperand(), getSrcSel()) &&
         canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel,
                         getReplacedOperand(), getSrcSel());
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst :
       MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  MachineInstr *Parent = PotentialMO->getParent();
  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD.
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);

  SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
  DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value());

  MachineOperand *DstUnused =
      TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI's src operands, or
  // else we can encounter a problem with a use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32.
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register.
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use.
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32.
  return SDWADstOperand::convertToSDWA(MI, TII);
}

bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
                                                  const SIInstrInfo *TII) {
  return SDWADstOperand::canCombineSelections(MI, TII);
}

std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B32_e64);
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_opsel_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_opsel_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 &&
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    //    0   |   8   | BYTE_0
    //    0   |  16   | WORD_0
    //    0   |  32   | DWORD ?
    //    8   |   8   | BYTE_1
    //   16   |   8   | BYTE_2
    //   16   |  16   | WORD_1
    //   24   |   8   | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
    // preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction.
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1,
            const MachineOperand *Op2) -> CheckRetType {
      if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
        return CheckRetType(std::nullopt);

      MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
      if (!Op1Def)
        return CheckRetType(std::nullopt);

      MachineInstr *Op1Inst = Op1Def->getParent();
      if (!TII->isSDWA(*Op1Inst))
        return CheckRetType(std::nullopt);

      MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
      if (!Op2Def)
        return CheckRetType(std::nullopt);

      return CheckRetType(std::pair(Op1Def, Op2Def));
    };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst,
    // i.e. that their destination patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible
    // bitness or an SDWA instruction with the correct dst_sel:
    // SDWAInst | OtherInst bitness     / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at least 32 bits
    // wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0:
      DstSelAgree = ((OtherDstSel == BYTE_2) ||
                     (OtherDstSel == BYTE_3) ||
                     (OtherDstSel == WORD_1));
      break;
    case WORD_1:
      DstSelAgree = ((OtherDstSel == BYTE_0) ||
                     (OtherDstSel == BYTE_1) ||
                     (OtherDstSel == WORD_0));
      break;
    case BYTE_0:
      DstSelAgree = ((OtherDstSel == BYTE_1) ||
                     (OtherDstSel == BYTE_2) ||
                     (OtherDstSel == BYTE_3) ||
                     (OtherDstSel == WORD_1));
      break;
    case BYTE_1:
      DstSelAgree = ((OtherDstSel == BYTE_0) ||
                     (OtherDstSel == BYTE_2) ||
                     (OtherDstSel == BYTE_3) ||
                     (OtherDstSel == WORD_1));
      break;
    case BYTE_2:
      DstSelAgree = ((OtherDstSel == BYTE_0) ||
                     (OtherDstSel == BYTE_1) ||
                     (OtherDstSel == BYTE_3) ||
                     (OtherDstSel == WORD_0));
      break;
    case BYTE_3:
      DstSelAgree = ((OtherDstSel == BYTE_0) ||
                     (OtherDstSel == BYTE_1) ||
                     (OtherDstSel == BYTE_2) ||
                     (OtherDstSel == WORD_0));
      break;
    default:
      DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD.
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create the DstPreserveOperand.
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) ||
      !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in the range (MI, MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32.
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.
  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

/// Try to convert an \p MI in VOP3 form which takes a src2 carry-in
/// operand into the corresponding VOP2 form which expects the
/// argument in VCC. To this end, add a copy from the carry-in to
/// VCC. The conversion will only be applied if \p MI can be shrunk
/// to VOP2 and if VCC can be proven to be dead before \p MI.
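///
/// A rough sketch of the transformation (MIR abbreviated):
///   %dst = V_CNDMASK_B32_e64 0, %src0, 0, %src1, %carry
/// becomes
///   $vcc = COPY %carry
///   %dst = V_CNDMASK_B32_e32 %src0, %src1, implicit $vcc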
void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);

  LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
  if (!TII->canShrink(MI, *MRI)) {
    LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
    return;
  }

  const MachineOperand &CarryIn =
      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  Register CarryReg = CarryIn.getReg();
  MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
  if (!CarryDef) {
    LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
    return;
  }

  // Make sure VCC or its subregs are dead before MI.
  MCRegister Vcc = TRI->getVCC();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, Vcc, MI);
  if (Liveness != MachineBasicBlock::LQR_Dead) {
    LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
    return;
  }

  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);

  auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
                           TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
                       .setMIFlags(MI.getFlags());
  TII->fixImplicitOperands(*Converted);
  LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
  (void)Converted;
  MI.eraseFromParent();
}

namespace {
bool isConvertibleToSDWA(MachineInstr &MI,
                         const GCNSubtarget &ST,
                         const SIInstrInfo *TII) {
  // Check if this is already an SDWA instruction.
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Can only be handled after earlier conversion to
  // AMDGPU::V_CNDMASK_B32_e32, which is not always possible.
  if (Opc == AMDGPU::V_CNDMASK_B32_e64)
    return false;

  // Check if this instruction has an opcode that supports SDWA.
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst =
          TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode.
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
} // namespace

MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  assert(!TII->isSDWA(Opcode));

  int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
  if (SDWAOpcode == -1)
    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands.
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst; if it is present in the original then it should also be present
  // in the SDWA version.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, but it can't get here).
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, and initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst.
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise.
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Initialize SDWA-specific operands.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused))
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);

  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Check for a preserved register that needs to be copied.
  MachineInstr *Ret = SDWAInst.getInstr();
  TII->fixImplicitOperands(*Ret);
  return Ret;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  MachineInstr *SDWAInst;
  if (TII->isSDWA(MI.getOpcode())) {
    // Clone the instruction to allow revoking changes
    // made to MI during the processing of the operands
    // if the conversion fails.
    SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
    MI.getParent()->insert(MI.getIterator(), SDWAInst);
  } else {
    SDWAInst = createSDWAVersion(MI);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs,
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is also
    // a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (!Converted) {
    SDWAInst->eraseFromParent();
    return false;
  }

  ConvertedInstructions.push_back(SDWAInst);
  for (MachineOperand &MO : SDWAInst->uses()) {
    if (!MO.isReg())
      continue;

    MRI->clearKillFlags(MO.getReg());
  }
  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR operand is allowed on GFX9). Copy its scalar
// operands into VGPRs.
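//
// An illustrative sketch of the rewrite (operands abbreviated): a scalar
// operand such as %sgpr in
//   %dst = V_ADD_F32_sdwa ..., %sgpr, ...
// is first copied into a fresh VGPR:
//   %tmp:vgpr_32 = V_MOV_B32_e32 %sgpr
//   %dst = V_ADD_F32_sdwa ..., %tmp, ...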
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIPeepholeSDWA().run(MF);
}

bool SIPeepholeSDWA::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (!PotentialMI)
          continue;

        switch (PotentialMI->getOpcode()) {
        case AMDGPU::V_ADD_CO_U32_e64:
        case AMDGPU::V_SUB_CO_U32_e64:
          pseudoOpConvertToVOP2(*PotentialMI, ST);
          break;
        case AMDGPU::V_CNDMASK_B32_e64:
          convertVcndmaskToVOP2(*PotentialMI, ST);
          break;
        }
      }
      SDWAOperands.clear();

      // Generate the potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);

        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
          PotentialMatches[PotentialMI].push_back(Operand.get());
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}

PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF,
                                          MachineFunctionAnalysisManager &) {
  if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF))
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}