//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "SIPeepholeSDWA.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;

using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;

class SIPeepholeSDWA {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  SDWAOperandsMap PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
  MachineInstr *createSDWAVersion(MachineInstr &MI);
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

public:
  bool run(MachineFunction &MF);
};

class SIPeepholeSDWALegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

using namespace AMDGPU::SDWA;

class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

  /// Returns true iff the SDWA selection of this SDWAOperand can be combined
  /// with the SDWA selections of its uses in \p MI.
  virtual bool canCombineSelections(const MachineInstr &MI,
                                    const SIInstrInfo *TII) = 0;

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getMF()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
                false)

char SIPeepholeSDWALegacy::ID = 0;

char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;

FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
  return new SIPeepholeSDWALegacy();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD";  break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

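/// Copy the register, subregister and undef flag from \p From to \p To, plus
/// the kill flag for a use or the dead flag for a def.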
static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

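/// Return true iff \p LHS and \p RHS are register operands referring to the
/// same register and subregister.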
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

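/// If \p Reg is a register def operand, return the single non-debug use of
/// the defined register, or nullptr if there is not exactly one such use.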
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  return MRI->getOneNonDBGUse(Reg->getReg());
}

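/// If \p Reg is a register operand, return the operand that solely defines
/// its register, or nullptr if the register does not have exactly one def.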
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  return MRI->getOneDef(Reg->getReg());
}

/// Combine an SDWA instruction's existing SDWA selection \p Sel with
/// the SDWA selection \p OperandSel of its operand. If the selections
/// are compatible, return the combined selection, otherwise return
/// std::nullopt.
/// For example, if we have Sel = BYTE_0 and OperandSel = WORD_1:
///   BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
  if (Sel == SdwaSel::DWORD)
    return OperandSel;

  if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
    return Sel;

  if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
      Sel == SdwaSel::BYTE_3)
    return {};

  if (OperandSel == SdwaSel::WORD_0)
    return Sel;

  if (OperandSel == SdwaSel::WORD_1) {
    if (Sel == SdwaSel::BYTE_0)
      return SdwaSel::BYTE_2;
    if (Sel == SdwaSel::BYTE_1)
      return SdwaSel::BYTE_3;
    if (Sel == SdwaSel::WORD_0)
      return SdwaSel::WORD_1;
  }

  return {};
}

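/// Compute the src_modifiers immediate for the use of this operand at
/// \p SrcOp's position: start from the modifiers already present on that
/// src0/src1 slot, then OR in ABS, toggle NEG, or OR in SEXT according to
/// this operand's own modifiers.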
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted
      if (!isConvertibleToSDWA(UseMI, ST, TII) ||
          !canCombineSelections(UseMI, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand, the potential instruction is one that uses the
  // register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  MachineInstr *Parent = PotentialMO->getParent();

  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  case AMDGPU::V_CNDMASK_B32_sdwa:
    // SISrcMods uses the same bitmask for SEXT and NEG modifiers and
    // hence the compiler can only support one type of modifier for
    // each SDWA instruction. For V_CNDMASK_B32_sdwa, this is NEG
    // since its operands get printed using
    // AMDGPUInstPrinter::printOperandAndFPInputMods, which produces
    // the output intended for NEG if SEXT is set.
    //
    // The ISA does actually support both modifiers on most SDWA
    // instructions.
    //
    // FIXME: Accept SEXT here after fixing this issue.
    if (Sext)
      return false;
    break;
  }

  // Find the operand in the instruction that matches the source operand and
  // replace it with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for UNUSED_PRESERVE, in
      // which case we can either abandon the peephole attempt, or, if legal,
      // copy the target operand into the tied slot when the preserve
      // operation effectively produces the same result by overwriting the
      // rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
    SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel()));
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

/// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA
/// instruction \p MI can be combined with the selection \p OpSel.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
  assert(TII->isSDWA(MI.getOpcode()));

  const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName);
  SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());

  return combineSdwaSel(SrcSel, OpSel).has_value();
}

/// If \p Op is the same register as the operand of the SDWA instruction
/// \p MI named by \p SrcOpName, verify that the SDWA selection
/// \p SrcSelOpName can be combined with \p OpSel.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcOpName,
                            AMDGPU::OpName SrcSelOpName, MachineOperand *Op,
                            SdwaSel OpSel) {
  assert(TII->isSDWA(MI.getOpcode()));

  const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName);
  if (!Src || !isSameReg(*Src, *Op))
    return true;

  return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
}

bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  using namespace AMDGPU;

  return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel,
                         getReplacedOperand(), getSrcSel()) &&
         canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel,
                         getReplacedOperand(), getSrcSel());
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  MachineInstr *Parent = PotentialMO->getParent();
  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);

  SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
  DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value());

  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove original instruction because it would conflict with our new
  // instruction by register definition
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter a problem with a use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add implicit use of preserved register
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}

bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
                                                  const SIInstrInfo *TII) {
  return SDWADstOperand::canCombineSelections(MI, TII);
}

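/// If \p Op is an immediate, return its value. If it is a register, try to
/// look through a foldable copy (e.g. an S_MOV_B32 of an immediate) to the
/// copied value; otherwise return std::nullopt.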
std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}

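/// Try to match \p MI against one of the peephole patterns handled below
/// (32/16-bit shifts, bitfield extracts, AND masks, and v_or_b32 preserve
/// combinations) and, on success, return an SDWAOperand describing the
/// byte/word access the instruction performs.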
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B32_e64);
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_opsel_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_opsel_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 &&
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    //    0   |   8   | BYTE_0
    //    0   |  16   | WORD_0
    //    0   |  32   | DWORD ?
    //    8   |   8   | BYTE_1
    //   16   |   8   | BYTE_2
    //   16   |  16   | WORD_1
    //   24   |   8   | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
      if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
        return CheckRetType(std::nullopt);

      MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
      if (!Op1Def)
        return CheckRetType(std::nullopt);

      MachineInstr *Op1Inst = Op1Def->getParent();
      if (!TII->isSDWA(*Op1Inst))
        return CheckRetType(std::nullopt);

      MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
      if (!Op2Def)
        return CheckRetType(std::nullopt);

      return CheckRetType(std::pair(Op1Def, Op2Def));
    };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
    // that their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible bitness
    // or an SDWA instruction with the correct dst_sel.
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                     / no
    // WORD_0   | no                     / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions  / BYTE_0/1, WORD_0
    // BYTE_0   | no                     / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                  / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit               / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit            / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at least 32 bits
    // wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

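/// Scan \p MBB and record an SDWAOperand for every instruction that matches
/// one of the peephole patterns.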
void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneNonDBGUse(CarryIn->getReg()) ||
      !MRI->use_nodbg_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in range of (MI,MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.
  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

/// Try to convert a \p MI in VOP3 form which takes a src2 carry-in
/// operand into the corresponding VOP2 form which expects the
/// argument in VCC. To this end, add a copy from the carry-in to
/// VCC. The conversion will only be applied if \p MI can be shrunk
/// to VOP2 and if VCC can be proven to be dead before \p MI.
void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);

  LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
  if (!TII->canShrink(MI, *MRI)) {
    LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
    return;
  }

  const MachineOperand &CarryIn =
      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  Register CarryReg = CarryIn.getReg();
  MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
  if (!CarryDef) {
    LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
    return;
  }

  // Make sure VCC or its subregs are dead before MI.
  MCRegister Vcc = TRI->getVCC();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, Vcc, MI);
  if (Liveness != MachineBasicBlock::LQR_Dead) {
    LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
    return;
  }

  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);

  auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
                           TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
                       .setMIFlags(MI.getFlags());
  TII->fixImplicitOperands(*Converted);
  LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
  (void)Converted;
  MI.eraseFromParent();
}

namespace {
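/// Check whether \p MI can be converted to an SDWA instruction on subtarget
/// \p ST: either it is already SDWA, or it maps (possibly via its VOP2 form)
/// to an SDWA opcode and its operands and modifiers are legal for SDWA.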
bool isConvertibleToSDWA(MachineInstr &MI,
                         const GCNSubtarget &ST,
                         const SIInstrInfo* TII) {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Can only be handled after earlier conversion to
  // AMDGPU::V_CNDMASK_B32_e32, which is not always possible.
  if (Opc == AMDGPU::V_CNDMASK_B32_e64)
    return false;

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
} // namespace

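/// Create the SDWA variant of the non-SDWA instruction \p MI right before it,
/// copying over its operands and initializing the SDWA-specific operands
/// (dst_sel, dst_unused, src0_sel, src1_sel) to neutral DWORD/UNUSED_PAD
/// values.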
MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  assert(!TII->isSDWA(Opcode));

  int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
  if (SDWAOpcode == -1)
    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst; if it is present in the original it should also be present in
  // the SDWA version.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, but it can't get here).
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Initialize SDWA-specific operands
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused))
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);

  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Check for a preserved register that needs to be copied.
  MachineInstr *Ret = SDWAInst.getInstr();
  TII->fixImplicitOperands(*Ret);
  return Ret;
}

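/// Build the SDWA form of \p MI (or clone \p MI if it already is SDWA) and
/// apply all matched \p SDWAOperands to it. On failure the new instruction is
/// erased and \p MI is left untouched; on success \p MI is replaced by it.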
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  MachineInstr *SDWAInst;
  if (TII->isSDWA(MI.getOpcode())) {
    // Clone the instruction to allow revoking changes
    // made to MI during the processing of the operands
    // if the conversion fails.
    SDWAInst = MI.getMF()->CloneMachineInstr(&MI);
    MI.getParent()->insert(MI.getIterator(), SDWAInst);
  } else {
    SDWAInst = createSDWAVersion(MI);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is
    // also a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (!Converted) {
    SDWAInst->eraseFromParent();
    return false;
  }

  ConvertedInstructions.push_back(SDWAInst);
  for (MachineOperand &MO : SDWAInst->uses()) {
    if (!MO.isReg())
      continue;

    MRI->clearKillFlags(MO.getReg());
  }
  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR operand is allowed on GFX9). Copy its scalar
// operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (Op.isReg()) {
      if (TRI->isVGPR(*MRI, Op.getReg()))
        continue;

      if (ST.hasSDWAScalar() && ConstantBusCount == 0) {
        ++ConstantBusCount;
        continue;
      }
    } else if (!Op.isImm())
      continue;

    unsigned I = Op.getOperandNo();
    const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I);
    if (!OpRC || !TRI->isVSSuperClass(OpRC))
      continue;

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), getKillRegState(Op.isKill()), Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIPeepholeSDWA().run(MF);
}

bool SIPeepholeSDWA::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (!PotentialMI)
          continue;

        switch (PotentialMI->getOpcode()) {
        case AMDGPU::V_ADD_CO_U32_e64:
        case AMDGPU::V_SUB_CO_U32_e64:
          pseudoOpConvertToVOP2(*PotentialMI, ST);
          break;
        case AMDGPU::V_CNDMASK_B32_e64:
          convertVcndmaskToVOP2(*PotentialMI, ST);
          break;
        }
      }
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);

        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
          PotentialMatches[PotentialMI].push_back(Operand.get());
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}

PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF,
                                          MachineFunctionAnalysisManager &) {
  if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF))
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}