| 1 | //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===// | 
|---|
| 2 | // | 
|---|
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|---|
| 4 | // See https://llvm.org/LICENSE.txt for license information. | 
|---|
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|---|
| 6 | // | 
|---|
| 7 | //===----------------------------------------------------------------------===// | 
|---|
| 8 | // | 
|---|
| 9 | /// \file This pass tries to apply several peephole SDWA patterns. | 
|---|
| 10 | /// | 
|---|
| 11 | /// E.g. original: | 
|---|
| 12 | ///   V_LSHRREV_B32_e32 %0, 16, %1 | 
|---|
| 13 | ///   V_ADD_CO_U32_e32 %2, %0, %3 | 
|---|
| 14 | ///   V_LSHLREV_B32_e32 %4, 16, %2 | 
|---|
| 15 | /// | 
|---|
| 16 | /// Replace: | 
|---|
| 17 | ///   V_ADD_CO_U32_sdwa %4, %1, %3 | 
|---|
| 18 | ///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
|---|
| 19 | /// | 
|---|
| 20 | //===----------------------------------------------------------------------===// | 
|---|
| 21 |  | 
|---|
| 22 | #include "SIPeepholeSDWA.h" | 
|---|
| 23 | #include "AMDGPU.h" | 
|---|
| 24 | #include "GCNSubtarget.h" | 
|---|
| 25 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | 
|---|
| 26 | #include "llvm/ADT/MapVector.h" | 
|---|
| 27 | #include "llvm/ADT/Statistic.h" | 
|---|
| 28 | #include "llvm/CodeGen/MachineFunctionPass.h" | 
|---|
| 29 | #include <optional> | 
|---|
| 30 |  | 
|---|
| 31 | using namespace llvm; | 
|---|
| 32 |  | 
|---|
| 33 | #define DEBUG_TYPE "si-peephole-sdwa" | 
|---|
| 34 |  | 
|---|
| 35 | STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found."); | 
|---|
| 36 | STATISTIC(NumSDWAInstructionsPeepholed, | 
|---|
| 37 | "Number of instruction converted to SDWA."); | 
|---|
| 38 |  | 
|---|
| 39 | namespace { | 
|---|
| 40 |  | 
|---|
| 41 | bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST, | 
|---|
| 42 | const SIInstrInfo *TII); | 
|---|
| 43 | class SDWAOperand; | 
|---|
| 44 | class SDWADstOperand; | 
|---|
| 45 |  | 
|---|
| 46 | using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; | 
|---|
| 47 | using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>; | 
|---|
| 48 |  | 
|---|
| 49 | class SIPeepholeSDWA { | 
|---|
| 50 | private: | 
|---|
| 51 | MachineRegisterInfo *MRI; | 
|---|
| 52 | const SIRegisterInfo *TRI; | 
|---|
| 53 | const SIInstrInfo *TII; | 
|---|
| 54 |  | 
|---|
| 55 | MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; | 
|---|
| 56 | SDWAOperandsMap PotentialMatches; | 
|---|
| 57 | SmallVector<MachineInstr *, 8> ConvertedInstructions; | 
|---|
| 58 |  | 
|---|
| 59 | std::optional<int64_t> foldToImm(const MachineOperand &Op) const; | 
|---|
| 60 |  | 
|---|
| 61 | void matchSDWAOperands(MachineBasicBlock &MBB); | 
|---|
| 62 | std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); | 
|---|
| 63 | void pseudoOpConvertToVOP2(MachineInstr &MI, | 
|---|
| 64 | const GCNSubtarget &ST) const; | 
|---|
| 65 | void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const; | 
|---|
| 66 | MachineInstr *createSDWAVersion(MachineInstr &MI); | 
|---|
| 67 | bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); | 
|---|
| 68 | void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; | 
|---|
| 69 |  | 
|---|
| 70 | public: | 
|---|
| 71 | bool run(MachineFunction &MF); | 
|---|
| 72 | }; | 
|---|
| 73 |  | 
|---|
| 74 | class SIPeepholeSDWALegacy : public MachineFunctionPass { | 
|---|
| 75 | public: | 
|---|
| 76 | static char ID; | 
|---|
| 77 |  | 
|---|
| 78 | SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {} | 
|---|
| 79 |  | 
|---|
| 80 | StringRef getPassName() const override { return "SI Peephole SDWA"; } | 
|---|
| 81 |  | 
|---|
| 82 | bool runOnMachineFunction(MachineFunction &MF) override; | 
|---|
| 83 |  | 
|---|
| 84 | void getAnalysisUsage(AnalysisUsage &AU) const override { | 
|---|
| 85 | AU.setPreservesCFG(); | 
|---|
| 86 | MachineFunctionPass::getAnalysisUsage(AU); | 
|---|
| 87 | } | 
|---|
| 88 | }; | 
|---|
| 89 |  | 
|---|
| 90 | using namespace AMDGPU::SDWA; | 
|---|
| 91 |  | 
|---|
| 92 | class SDWAOperand { | 
|---|
| 93 | private: | 
|---|
| 94 | MachineOperand *Target; // Operand that would be used in converted instruction | 
|---|
| 95 | MachineOperand *Replaced; // Operand that would be replace by Target | 
|---|
| 96 |  | 
|---|
| 97 | /// Returns true iff the SDWA selection of this SDWAOperand can be combined | 
|---|
| 98 | /// with the SDWA selections of its uses in \p MI. | 
|---|
| 99 | virtual bool canCombineSelections(const MachineInstr &MI, | 
|---|
| 100 | const SIInstrInfo *TII) = 0; | 
|---|
| 101 |  | 
|---|
| 102 | public: | 
|---|
| 103 | SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) | 
|---|
| 104 | : Target(TargetOp), Replaced(ReplacedOp) { | 
|---|
| 105 | assert(Target->isReg()); | 
|---|
| 106 | assert(Replaced->isReg()); | 
|---|
| 107 | } | 
|---|
| 108 |  | 
|---|
| 109 | virtual ~SDWAOperand() = default; | 
|---|
| 110 |  | 
|---|
| 111 | virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII, | 
|---|
| 112 | const GCNSubtarget &ST, | 
|---|
| 113 | SDWAOperandsMap *PotentialMatches = nullptr) = 0; | 
|---|
| 114 | virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; | 
|---|
| 115 |  | 
|---|
| 116 | MachineOperand *getTargetOperand() const { return Target; } | 
|---|
| 117 | MachineOperand *getReplacedOperand() const { return Replaced; } | 
|---|
| 118 | MachineInstr *getParentInst() const { return Target->getParent(); } | 
|---|
| 119 |  | 
|---|
| 120 | MachineRegisterInfo *getMRI() const { | 
|---|
| 121 | return &getParentInst()->getParent()->getParent()->getRegInfo(); | 
|---|
| 122 | } | 
|---|
| 123 |  | 
|---|
| 124 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | 
|---|
| 125 | virtual void print(raw_ostream& OS) const = 0; | 
|---|
| 126 | void dump() const { print(dbgs()); } | 
|---|
| 127 | #endif | 
|---|
| 128 | }; | 
|---|
| 129 |  | 
|---|
| 130 | class SDWASrcOperand : public SDWAOperand { | 
|---|
| 131 | private: | 
|---|
| 132 | SdwaSel SrcSel; | 
|---|
| 133 | bool Abs; | 
|---|
| 134 | bool Neg; | 
|---|
| 135 | bool Sext; | 
|---|
| 136 |  | 
|---|
| 137 | public: | 
|---|
| 138 | SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, | 
|---|
| 139 | SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, | 
|---|
| 140 | bool Sext_ = false) | 
|---|
| 141 | : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_), | 
|---|
| 142 | Neg(Neg_), Sext(Sext_) {} | 
|---|
| 143 |  | 
|---|
| 144 | MachineInstr *potentialToConvert(const SIInstrInfo *TII, | 
|---|
| 145 | const GCNSubtarget &ST, | 
|---|
| 146 | SDWAOperandsMap *PotentialMatches = nullptr) override; | 
|---|
| 147 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; | 
|---|
| 148 | bool canCombineSelections(const MachineInstr &MI, | 
|---|
| 149 | const SIInstrInfo *TII) override; | 
|---|
| 150 |  | 
|---|
| 151 | SdwaSel getSrcSel() const { return SrcSel; } | 
|---|
| 152 | bool getAbs() const { return Abs; } | 
|---|
| 153 | bool getNeg() const { return Neg; } | 
|---|
| 154 | bool getSext() const { return Sext; } | 
|---|
| 155 |  | 
|---|
| 156 | uint64_t getSrcMods(const SIInstrInfo *TII, | 
|---|
| 157 | const MachineOperand *SrcOp) const; | 
|---|
| 158 |  | 
|---|
| 159 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | 
|---|
| 160 | void print(raw_ostream& OS) const override; | 
|---|
| 161 | #endif | 
|---|
| 162 | }; | 
|---|
| 163 |  | 
|---|
| 164 | class SDWADstOperand : public SDWAOperand { | 
|---|
| 165 | private: | 
|---|
| 166 | SdwaSel DstSel; | 
|---|
| 167 | DstUnused DstUn; | 
|---|
| 168 |  | 
|---|
| 169 | public: | 
|---|
| 170 | SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, | 
|---|
| 171 | SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) | 
|---|
| 172 | : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} | 
|---|
| 173 |  | 
|---|
| 174 | MachineInstr *potentialToConvert(const SIInstrInfo *TII, | 
|---|
| 175 | const GCNSubtarget &ST, | 
|---|
| 176 | SDWAOperandsMap *PotentialMatches = nullptr) override; | 
|---|
| 177 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; | 
|---|
| 178 | bool canCombineSelections(const MachineInstr &MI, | 
|---|
| 179 | const SIInstrInfo *TII) override; | 
|---|
| 180 |  | 
|---|
| 181 | SdwaSel getDstSel() const { return DstSel; } | 
|---|
| 182 | DstUnused getDstUnused() const { return DstUn; } | 
|---|
| 183 |  | 
|---|
| 184 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | 
|---|
| 185 | void print(raw_ostream& OS) const override; | 
|---|
| 186 | #endif | 
|---|
| 187 | }; | 
|---|
| 188 |  | 
|---|
| 189 | class SDWADstPreserveOperand : public SDWADstOperand { | 
|---|
| 190 | private: | 
|---|
| 191 | MachineOperand *Preserve; | 
|---|
| 192 |  | 
|---|
| 193 | public: | 
|---|
| 194 | SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, | 
|---|
| 195 | MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD) | 
|---|
| 196 | : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE), | 
|---|
| 197 | Preserve(PreserveOp) {} | 
|---|
| 198 |  | 
|---|
| 199 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; | 
|---|
| 200 | bool canCombineSelections(const MachineInstr &MI, | 
|---|
| 201 | const SIInstrInfo *TII) override; | 
|---|
| 202 |  | 
|---|
| 203 | MachineOperand *getPreservedOperand() const { return Preserve; } | 
|---|
| 204 |  | 
|---|
| 205 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | 
|---|
| 206 | void print(raw_ostream& OS) const override; | 
|---|
| 207 | #endif | 
|---|
| 208 | }; | 
|---|
| 209 |  | 
|---|
| 210 | } // end anonymous namespace | 
|---|
| 211 |  | 
|---|
| 212 | INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false, | 
|---|
| 213 | false) | 
|---|
| 214 |  | 
|---|
| 215 | char SIPeepholeSDWALegacy::ID = 0; | 
|---|
| 216 |  | 
|---|
| 217 | char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID; | 
|---|
| 218 |  | 
|---|
| 219 | FunctionPass *llvm::createSIPeepholeSDWALegacyPass() { | 
|---|
| 220 | return new SIPeepholeSDWALegacy(); | 
|---|
| 221 | } | 
|---|
| 222 |  | 
|---|
| 223 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | 
|---|
| 224 | static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { | 
|---|
| 225 | switch(Sel) { | 
|---|
| 226 | case BYTE_0: OS << "BYTE_0"; break; | 
|---|
| 227 | case BYTE_1: OS << "BYTE_1"; break; | 
|---|
| 228 | case BYTE_2: OS << "BYTE_2"; break; | 
|---|
| 229 | case BYTE_3: OS << "BYTE_3"; break; | 
|---|
| 230 | case WORD_0: OS << "WORD_0"; break; | 
|---|
| 231 | case WORD_1: OS << "WORD_1"; break; | 
|---|
| 232 | case DWORD:  OS << "DWORD"; break; | 
|---|
| 233 | } | 
|---|
| 234 | return OS; | 
|---|
| 235 | } | 
|---|
| 236 |  | 
|---|
| 237 | static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { | 
|---|
| 238 | switch(Un) { | 
|---|
| 239 | case UNUSED_PAD: OS << "UNUSED_PAD"; break; | 
|---|
| 240 | case UNUSED_SEXT: OS << "UNUSED_SEXT"; break; | 
|---|
| 241 | case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break; | 
|---|
| 242 | } | 
|---|
| 243 | return OS; | 
|---|
| 244 | } | 
|---|
| 245 |  | 
|---|
| 246 | LLVM_DUMP_METHOD | 
|---|
| 247 | void SDWASrcOperand::print(raw_ostream& OS) const { | 
|---|
| 248 | OS << "SDWA src: "<< *getTargetOperand() | 
|---|
| 249 | << " src_sel:"<< getSrcSel() | 
|---|
| 250 | << " abs:"<< getAbs() << " neg:"<< getNeg() | 
|---|
| 251 | << " sext:"<< getSext() << '\n'; | 
|---|
| 252 | } | 
|---|
| 253 |  | 
|---|
| 254 | LLVM_DUMP_METHOD | 
|---|
| 255 | void SDWADstOperand::print(raw_ostream& OS) const { | 
|---|
| 256 | OS << "SDWA dst: "<< *getTargetOperand() | 
|---|
| 257 | << " dst_sel:"<< getDstSel() | 
|---|
| 258 | << " dst_unused:"<< getDstUnused() << '\n'; | 
|---|
| 259 | } | 
|---|
| 260 |  | 
|---|
| 261 | LLVM_DUMP_METHOD | 
|---|
| 262 | void SDWADstPreserveOperand::print(raw_ostream& OS) const { | 
|---|
| 263 | OS << "SDWA preserve dst: "<< *getTargetOperand() | 
|---|
| 264 | << " dst_sel:"<< getDstSel() | 
|---|
| 265 | << " preserve:"<< *getPreservedOperand() << '\n'; | 
|---|
| 266 | } | 
|---|
| 267 |  | 
|---|
| 268 | #endif | 
|---|
| 269 |  | 
|---|
| 270 | static void copyRegOperand(MachineOperand &To, const MachineOperand &From) { | 
|---|
| 271 | assert(To.isReg() && From.isReg()); | 
|---|
| 272 | To.setReg(From.getReg()); | 
|---|
| 273 | To.setSubReg(From.getSubReg()); | 
|---|
| 274 | To.setIsUndef(From.isUndef()); | 
|---|
| 275 | if (To.isUse()) { | 
|---|
| 276 | To.setIsKill(From.isKill()); | 
|---|
| 277 | } else { | 
|---|
| 278 | To.setIsDead(From.isDead()); | 
|---|
| 279 | } | 
|---|
| 280 | } | 
|---|
| 281 |  | 
|---|
| 282 | static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { | 
|---|
| 283 | return LHS.isReg() && | 
|---|
| 284 | RHS.isReg() && | 
|---|
| 285 | LHS.getReg() == RHS.getReg() && | 
|---|
| 286 | LHS.getSubReg() == RHS.getSubReg(); | 
|---|
| 287 | } | 
|---|
| 288 |  | 
|---|
| 289 | static MachineOperand *findSingleRegUse(const MachineOperand *Reg, | 
|---|
| 290 | const MachineRegisterInfo *MRI) { | 
|---|
| 291 | if (!Reg->isReg() || !Reg->isDef()) | 
|---|
| 292 | return nullptr; | 
|---|
| 293 |  | 
|---|
| 294 | MachineOperand *ResMO = nullptr; | 
|---|
| 295 | for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg: Reg->getReg())) { | 
|---|
| 296 | // If there exist use of subreg of Reg then return nullptr | 
|---|
| 297 | if (!isSameReg(LHS: UseMO, RHS: *Reg)) | 
|---|
| 298 | return nullptr; | 
|---|
| 299 |  | 
|---|
| 300 | // Check that there is only one instruction that uses Reg | 
|---|
| 301 | if (!ResMO) { | 
|---|
| 302 | ResMO = &UseMO; | 
|---|
| 303 | } else if (ResMO->getParent() != UseMO.getParent()) { | 
|---|
| 304 | return nullptr; | 
|---|
| 305 | } | 
|---|
| 306 | } | 
|---|
| 307 |  | 
|---|
| 308 | return ResMO; | 
|---|
| 309 | } | 
|---|
| 310 |  | 
|---|
| 311 | static MachineOperand *findSingleRegDef(const MachineOperand *Reg, | 
|---|
| 312 | const MachineRegisterInfo *MRI) { | 
|---|
| 313 | if (!Reg->isReg()) | 
|---|
| 314 | return nullptr; | 
|---|
| 315 |  | 
|---|
| 316 | MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg: Reg->getReg()); | 
|---|
| 317 | if (!DefInstr) | 
|---|
| 318 | return nullptr; | 
|---|
| 319 |  | 
|---|
| 320 | for (auto &DefMO : DefInstr->defs()) { | 
|---|
| 321 | if (DefMO.isReg() && DefMO.getReg() == Reg->getReg()) | 
|---|
| 322 | return &DefMO; | 
|---|
| 323 | } | 
|---|
| 324 |  | 
|---|
| 325 | // Ignore implicit defs. | 
|---|
| 326 | return nullptr; | 
|---|
| 327 | } | 
|---|
| 328 |  | 
|---|
| 329 | /// Combine an SDWA instruction's existing SDWA selection \p Sel with | 
|---|
| 330 | /// the SDWA selection \p OperandSel of its operand. If the selections | 
|---|
| 331 | /// are compatible, return the combined selection, otherwise return a | 
|---|
| 332 | /// nullopt. | 
|---|
| 333 | /// For example, if we have Sel = BYTE_0 Sel and OperandSel = WORD_1: | 
|---|
| 334 | ///     BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X) | 
|---|
| 335 | static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) { | 
|---|
| 336 | if (Sel == SdwaSel::DWORD) | 
|---|
| 337 | return OperandSel; | 
|---|
| 338 |  | 
|---|
| 339 | if (Sel == OperandSel || OperandSel == SdwaSel::DWORD) | 
|---|
| 340 | return Sel; | 
|---|
| 341 |  | 
|---|
| 342 | if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 || | 
|---|
| 343 | Sel == SdwaSel::BYTE_3) | 
|---|
| 344 | return {}; | 
|---|
| 345 |  | 
|---|
| 346 | if (OperandSel == SdwaSel::WORD_0) | 
|---|
| 347 | return Sel; | 
|---|
| 348 |  | 
|---|
| 349 | if (OperandSel == SdwaSel::WORD_1) { | 
|---|
| 350 | if (Sel == SdwaSel::BYTE_0) | 
|---|
| 351 | return SdwaSel::BYTE_2; | 
|---|
| 352 | if (Sel == SdwaSel::BYTE_1) | 
|---|
| 353 | return SdwaSel::BYTE_3; | 
|---|
| 354 | if (Sel == SdwaSel::WORD_0) | 
|---|
| 355 | return SdwaSel::WORD_1; | 
|---|
| 356 | } | 
|---|
| 357 |  | 
|---|
| 358 | return {}; | 
|---|
| 359 | } | 
|---|
| 360 |  | 
|---|
| 361 | uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, | 
|---|
| 362 | const MachineOperand *SrcOp) const { | 
|---|
| 363 | uint64_t Mods = 0; | 
|---|
| 364 | const auto *MI = SrcOp->getParent(); | 
|---|
| 365 | if (TII->getNamedOperand(MI: *MI, OperandName: AMDGPU::OpName::src0) == SrcOp) { | 
|---|
| 366 | if (auto *Mod = TII->getNamedOperand(MI: *MI, OperandName: AMDGPU::OpName::src0_modifiers)) { | 
|---|
| 367 | Mods = Mod->getImm(); | 
|---|
| 368 | } | 
|---|
| 369 | } else if (TII->getNamedOperand(MI: *MI, OperandName: AMDGPU::OpName::src1) == SrcOp) { | 
|---|
| 370 | if (auto *Mod = TII->getNamedOperand(MI: *MI, OperandName: AMDGPU::OpName::src1_modifiers)) { | 
|---|
| 371 | Mods = Mod->getImm(); | 
|---|
| 372 | } | 
|---|
| 373 | } | 
|---|
| 374 | if (Abs || Neg) { | 
|---|
| 375 | assert(!Sext && | 
|---|
| 376 | "Float and integer src modifiers can't be set simultaneously"); | 
|---|
| 377 | Mods |= Abs ? SISrcMods::ABS : 0u; | 
|---|
| 378 | Mods ^= Neg ? SISrcMods::NEG : 0u; | 
|---|
| 379 | } else if (Sext) { | 
|---|
| 380 | Mods |= SISrcMods::SEXT; | 
|---|
| 381 | } | 
|---|
| 382 |  | 
|---|
| 383 | return Mods; | 
|---|
| 384 | } | 
|---|
| 385 |  | 
|---|
| 386 | MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, | 
|---|
| 387 | const GCNSubtarget &ST, | 
|---|
| 388 | SDWAOperandsMap *PotentialMatches) { | 
|---|
| 389 | if (PotentialMatches != nullptr) { | 
|---|
| 390 | // Fill out the map for all uses if all can be converted | 
|---|
| 391 | MachineOperand *Reg = getReplacedOperand(); | 
|---|
| 392 | if (!Reg->isReg() || !Reg->isDef()) | 
|---|
| 393 | return nullptr; | 
|---|
| 394 |  | 
|---|
| 395 | for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg: Reg->getReg())) | 
|---|
| 396 | // Check that all instructions that use Reg can be converted | 
|---|
| 397 | if (!isConvertibleToSDWA(MI&: UseMI, ST, TII) || | 
|---|
| 398 | !canCombineSelections(MI: UseMI, TII)) | 
|---|
| 399 | return nullptr; | 
|---|
| 400 |  | 
|---|
| 401 | // Now that it's guaranteed all uses are legal, iterate over the uses again | 
|---|
| 402 | // to add them for later conversion. | 
|---|
| 403 | for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg: Reg->getReg())) { | 
|---|
| 404 | // Should not get a subregister here | 
|---|
| 405 | assert(isSameReg(UseMO, *Reg)); | 
|---|
| 406 |  | 
|---|
| 407 | SDWAOperandsMap &potentialMatchesMap = *PotentialMatches; | 
|---|
| 408 | MachineInstr *UseMI = UseMO.getParent(); | 
|---|
| 409 | potentialMatchesMap[UseMI].push_back(Elt: this); | 
|---|
| 410 | } | 
|---|
| 411 | return nullptr; | 
|---|
| 412 | } | 
|---|
| 413 |  | 
|---|
| 414 | // For SDWA src operand potential instruction is one that use register | 
|---|
| 415 | // defined by parent instruction | 
|---|
| 416 | MachineOperand *PotentialMO = findSingleRegUse(Reg: getReplacedOperand(), MRI: getMRI()); | 
|---|
| 417 | if (!PotentialMO) | 
|---|
| 418 | return nullptr; | 
|---|
| 419 |  | 
|---|
| 420 | MachineInstr *Parent = PotentialMO->getParent(); | 
|---|
| 421 |  | 
|---|
| 422 | return canCombineSelections(MI: *Parent, TII) ? Parent : nullptr; | 
|---|
| 423 | } | 
|---|
| 424 |  | 
|---|
| 425 | bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { | 
|---|
| 426 | switch (MI.getOpcode()) { | 
|---|
| 427 | case AMDGPU::V_CVT_F32_FP8_sdwa: | 
|---|
| 428 | case AMDGPU::V_CVT_F32_BF8_sdwa: | 
|---|
| 429 | case AMDGPU::V_CVT_PK_F32_FP8_sdwa: | 
|---|
| 430 | case AMDGPU::V_CVT_PK_F32_BF8_sdwa: | 
|---|
| 431 | // Does not support input modifiers: noabs, noneg, nosext. | 
|---|
| 432 | return false; | 
|---|
| 433 | case AMDGPU::V_CNDMASK_B32_sdwa: | 
|---|
| 434 | // SISrcMods uses the same bitmask for SEXT and NEG modifiers and | 
|---|
| 435 | // hence the compiler can only support one type of modifier for | 
|---|
| 436 | // each SDWA instruction.  For V_CNDMASK_B32_sdwa, this is NEG | 
|---|
| 437 | // since its operands get printed using | 
|---|
| 438 | // AMDGPUInstPrinter::printOperandAndFPInputMods which produces | 
|---|
| 439 | // the output intended for NEG if SEXT is set. | 
|---|
| 440 | // | 
|---|
| 441 | // The ISA does actually support both modifiers on most SDWA | 
|---|
| 442 | // instructions. | 
|---|
| 443 | // | 
|---|
| 444 | // FIXME Accept SEXT here after fixing this issue. | 
|---|
| 445 | if (Sext) | 
|---|
| 446 | return false; | 
|---|
| 447 | break; | 
|---|
| 448 | } | 
|---|
| 449 |  | 
|---|
| 450 | // Find operand in instruction that matches source operand and replace it with | 
|---|
| 451 | // target operand. Set corresponding src_sel | 
|---|
| 452 | bool IsPreserveSrc = false; | 
|---|
| 453 | MachineOperand *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0); | 
|---|
| 454 | MachineOperand *SrcSel = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_sel); | 
|---|
| 455 | MachineOperand *SrcMods = | 
|---|
| 456 | TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers); | 
|---|
| 457 | assert(Src && (Src->isReg() || Src->isImm())); | 
|---|
| 458 | if (!isSameReg(LHS: *Src, RHS: *getReplacedOperand())) { | 
|---|
| 459 | // If this is not src0 then it could be src1 | 
|---|
| 460 | Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1); | 
|---|
| 461 | SrcSel = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_sel); | 
|---|
| 462 | SrcMods = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers); | 
|---|
| 463 |  | 
|---|
| 464 | if (!Src || | 
|---|
| 465 | !isSameReg(LHS: *Src, RHS: *getReplacedOperand())) { | 
|---|
| 466 | // It's possible this Src is a tied operand for | 
|---|
| 467 | // UNUSED_PRESERVE, in which case we can either | 
|---|
| 468 | // abandon the peephole attempt, or if legal we can | 
|---|
| 469 | // copy the target operand into the tied slot | 
|---|
| 470 | // if the preserve operation will effectively cause the same | 
|---|
| 471 | // result by overwriting the rest of the dst. | 
|---|
| 472 | MachineOperand *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst); | 
|---|
| 473 | MachineOperand *DstUnused = | 
|---|
| 474 | TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_unused); | 
|---|
| 475 |  | 
|---|
| 476 | if (Dst && | 
|---|
| 477 | DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { | 
|---|
| 478 | // This will work if the tied src is accessing WORD_0, and the dst is | 
|---|
| 479 | // writing WORD_1. Modifiers don't matter because all the bits that | 
|---|
| 480 | // would be impacted are being overwritten by the dst. | 
|---|
| 481 | // Any other case will not work. | 
|---|
| 482 | SdwaSel DstSel = static_cast<SdwaSel>( | 
|---|
| 483 | TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::dst_sel)); | 
|---|
| 484 | if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && | 
|---|
| 485 | getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) { | 
|---|
| 486 | IsPreserveSrc = true; | 
|---|
| 487 | auto DstIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), | 
|---|
| 488 | Name: AMDGPU::OpName::vdst); | 
|---|
| 489 | auto TiedIdx = MI.findTiedOperandIdx(OpIdx: DstIdx); | 
|---|
| 490 | Src = &MI.getOperand(i: TiedIdx); | 
|---|
| 491 | SrcSel = nullptr; | 
|---|
| 492 | SrcMods = nullptr; | 
|---|
| 493 | } else { | 
|---|
| 494 | // Not legal to convert this src | 
|---|
| 495 | return false; | 
|---|
| 496 | } | 
|---|
| 497 | } | 
|---|
| 498 | } | 
|---|
| 499 | assert(Src && Src->isReg()); | 
|---|
| 500 |  | 
|---|
| 501 | if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || | 
|---|
| 502 | MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || | 
|---|
| 503 | MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || | 
|---|
| 504 | MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && | 
|---|
| 505 | !isSameReg(LHS: *Src, RHS: *getReplacedOperand())) { | 
|---|
| 506 | // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to | 
|---|
| 507 | // src2. This is not allowed. | 
|---|
| 508 | return false; | 
|---|
| 509 | } | 
|---|
| 510 |  | 
|---|
| 511 | assert(isSameReg(*Src, *getReplacedOperand()) && | 
|---|
| 512 | (IsPreserveSrc || (SrcSel && SrcMods))); | 
|---|
| 513 | } | 
|---|
| 514 | copyRegOperand(To&: *Src, From: *getTargetOperand()); | 
|---|
| 515 | if (!IsPreserveSrc) { | 
|---|
| 516 | SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm()); | 
|---|
| 517 | SrcSel->setImm(*combineSdwaSel(Sel: ExistingSel, OperandSel: getSrcSel())); | 
|---|
| 518 | SrcMods->setImm(getSrcMods(TII, SrcOp: Src)); | 
|---|
| 519 | } | 
|---|
| 520 | getTargetOperand()->setIsKill(false); | 
|---|
| 521 | return true; | 
|---|
| 522 | } | 
|---|
| 523 |  | 
|---|
| 524 | /// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA | 
|---|
| 525 | /// instruction \p MI can be combined with the selection \p OpSel. | 
|---|
| 526 | static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII, | 
|---|
| 527 | AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) { | 
|---|
| 528 | assert(TII->isSDWA(MI.getOpcode())); | 
|---|
| 529 |  | 
|---|
| 530 | const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, OperandName: SrcSelOpName); | 
|---|
| 531 | SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm()); | 
|---|
| 532 |  | 
|---|
| 533 | return combineSdwaSel(Sel: SrcSel, OperandSel: OpSel).has_value(); | 
|---|
| 534 | } | 
|---|
| 535 |  | 
|---|
| 536 | /// Verify that \p Op is the same register as the operand of the SDWA | 
|---|
| 537 | /// instruction \p MI named by \p SrcOpName and that the SDWA | 
|---|
| 538 | /// selection \p SrcSelOpName can be combined with the \p OpSel. | 
|---|
| 539 | static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII, | 
|---|
| 540 | AMDGPU::OpName SrcOpName, | 
|---|
| 541 | AMDGPU::OpName SrcSelOpName, MachineOperand *Op, | 
|---|
| 542 | SdwaSel OpSel) { | 
|---|
| 543 | assert(TII->isSDWA(MI.getOpcode())); | 
|---|
| 544 |  | 
|---|
| 545 | const MachineOperand *Src = TII->getNamedOperand(MI, OperandName: SrcOpName); | 
|---|
| 546 | if (!Src || !isSameReg(LHS: *Src, RHS: *Op)) | 
|---|
| 547 | return true; | 
|---|
| 548 |  | 
|---|
| 549 | return canCombineOpSel(MI, TII, SrcSelOpName, OpSel); | 
|---|
| 550 | } | 
|---|
| 551 |  | 
|---|
| 552 | bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI, | 
|---|
| 553 | const SIInstrInfo *TII) { | 
|---|
| 554 | if (!TII->isSDWA(Opcode: MI.getOpcode())) | 
|---|
| 555 | return true; | 
|---|
| 556 |  | 
|---|
| 557 | using namespace AMDGPU; | 
|---|
| 558 |  | 
|---|
| 559 | return canCombineOpSel(MI, TII, SrcOpName: OpName::src0, SrcSelOpName: OpName::src0_sel, | 
|---|
| 560 | Op: getReplacedOperand(), OpSel: getSrcSel()) && | 
|---|
| 561 | canCombineOpSel(MI, TII, SrcOpName: OpName::src1, SrcSelOpName: OpName::src1_sel, | 
|---|
| 562 | Op: getReplacedOperand(), OpSel: getSrcSel()); | 
|---|
| 563 | } | 
|---|
| 564 |  | 
|---|
| 565 | MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII, | 
|---|
| 566 | const GCNSubtarget &ST, | 
|---|
| 567 | SDWAOperandsMap *PotentialMatches) { | 
|---|
| 568 | // For SDWA dst operand potential instruction is one that defines register | 
|---|
| 569 | // that this operand uses | 
|---|
| 570 | MachineRegisterInfo *MRI = getMRI(); | 
|---|
| 571 | MachineInstr *ParentMI = getParentInst(); | 
|---|
| 572 |  | 
|---|
| 573 | MachineOperand *PotentialMO = findSingleRegDef(Reg: getReplacedOperand(), MRI); | 
|---|
| 574 | if (!PotentialMO) | 
|---|
| 575 | return nullptr; | 
|---|
| 576 |  | 
|---|
| 577 | // Check that ParentMI is the only instruction that uses replaced register | 
|---|
| 578 | for (MachineInstr &UseInst : MRI->use_nodbg_instructions(Reg: PotentialMO->getReg())) { | 
|---|
| 579 | if (&UseInst != ParentMI) | 
|---|
| 580 | return nullptr; | 
|---|
| 581 | } | 
|---|
| 582 |  | 
|---|
| 583 | MachineInstr *Parent = PotentialMO->getParent(); | 
|---|
| 584 | return canCombineSelections(MI: *Parent, TII) ? Parent : nullptr; | 
|---|
| 585 | } | 
|---|
| 586 |  | 
|---|
| 587 | bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { | 
|---|
| 588 | // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused | 
|---|
| 589 |  | 
|---|
| 590 | if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || | 
|---|
| 591 | MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || | 
|---|
| 592 | MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || | 
|---|
| 593 | MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && | 
|---|
| 594 | getDstSel() != AMDGPU::SDWA::DWORD) { | 
|---|
| 595 | // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD | 
|---|
| 596 | return false; | 
|---|
| 597 | } | 
|---|
| 598 |  | 
|---|
| 599 | MachineOperand *Operand = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst); | 
|---|
| 600 | assert(Operand && | 
|---|
| 601 | Operand->isReg() && | 
|---|
| 602 | isSameReg(*Operand, *getReplacedOperand())); | 
|---|
| 603 | copyRegOperand(To&: *Operand, From: *getTargetOperand()); | 
|---|
| 604 | MachineOperand *DstSel= TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_sel); | 
|---|
| 605 | assert(DstSel); | 
|---|
| 606 |  | 
|---|
| 607 | SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm()); | 
|---|
| 608 | DstSel->setImm(combineSdwaSel(Sel: ExistingSel, OperandSel: getDstSel()).value()); | 
|---|
| 609 |  | 
|---|
| 610 | MachineOperand *DstUnused= TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_unused); | 
|---|
| 611 | assert(DstUnused); | 
|---|
| 612 | DstUnused->setImm(getDstUnused()); | 
|---|
| 613 |  | 
|---|
| 614 | // Remove original instruction  because it would conflict with our new | 
|---|
| 615 | // instruction by register definition | 
|---|
| 616 | getParentInst()->eraseFromParent(); | 
|---|
| 617 | return true; | 
|---|
| 618 | } | 
|---|
| 619 |  | 
|---|
| 620 | bool SDWADstOperand::canCombineSelections(const MachineInstr &MI, | 
|---|
| 621 | const SIInstrInfo *TII) { | 
|---|
| 622 | if (!TII->isSDWA(Opcode: MI.getOpcode())) | 
|---|
| 623 | return true; | 
|---|
| 624 |  | 
|---|
| 625 | return canCombineOpSel(MI, TII, SrcSelOpName: AMDGPU::OpName::dst_sel, OpSel: getDstSel()); | 
|---|
| 626 | } | 
|---|
| 627 |  | 
|---|
| 628 | bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, | 
|---|
| 629 | const SIInstrInfo *TII) { | 
|---|
| 630 | // MI should be moved right before v_or_b32. | 
|---|
| 631 | // For this we should clear all kill flags on uses of MI src-operands or else | 
|---|
| 632 | // we can encounter problem with use of killed operand. | 
|---|
| 633 | for (MachineOperand &MO : MI.uses()) { | 
|---|
| 634 | if (!MO.isReg()) | 
|---|
| 635 | continue; | 
|---|
| 636 | getMRI()->clearKillFlags(Reg: MO.getReg()); | 
|---|
| 637 | } | 
|---|
| 638 |  | 
|---|
| 639 | // Move MI before v_or_b32 | 
|---|
| 640 | MI.getParent()->remove(I: &MI); | 
|---|
| 641 | getParentInst()->getParent()->insert(I: getParentInst(), MI: &MI); | 
|---|
| 642 |  | 
|---|
| 643 | // Add Implicit use of preserved register | 
|---|
| 644 | MachineInstrBuilder MIB(*MI.getMF(), MI); | 
|---|
| 645 | MIB.addReg(RegNo: getPreservedOperand()->getReg(), | 
|---|
| 646 | flags: RegState::ImplicitKill, | 
|---|
| 647 | SubReg: getPreservedOperand()->getSubReg()); | 
|---|
| 648 |  | 
|---|
| 649 | // Tie dst to implicit use | 
|---|
| 650 | MI.tieOperands(DefIdx: AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdst), | 
|---|
| 651 | UseIdx: MI.getNumOperands() - 1); | 
|---|
| 652 |  | 
|---|
| 653 | // Convert MI as any other SDWADstOperand and remove v_or_b32 | 
|---|
| 654 | return SDWADstOperand::convertToSDWA(MI, TII); | 
|---|
| 655 | } | 
|---|
| 656 |  | 
|---|
| 657 | bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI, | 
|---|
| 658 | const SIInstrInfo *TII) { | 
|---|
| 659 | return SDWADstOperand::canCombineSelections(MI, TII); | 
|---|
| 660 | } | 
|---|
| 661 |  | 
|---|
| 662 | std::optional<int64_t> | 
|---|
| 663 | SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { | 
|---|
| 664 | if (Op.isImm()) { | 
|---|
| 665 | return Op.getImm(); | 
|---|
| 666 | } | 
|---|
| 667 |  | 
|---|
| 668 | // If this is not immediate then it can be copy of immediate value, e.g.: | 
|---|
| 669 | // %1 = S_MOV_B32 255; | 
|---|
| 670 | if (Op.isReg()) { | 
|---|
| 671 | for (const MachineOperand &Def : MRI->def_operands(Reg: Op.getReg())) { | 
|---|
| 672 | if (!isSameReg(LHS: Op, RHS: Def)) | 
|---|
| 673 | continue; | 
|---|
| 674 |  | 
|---|
| 675 | const MachineInstr *DefInst = Def.getParent(); | 
|---|
| 676 | if (!TII->isFoldableCopy(MI: *DefInst)) | 
|---|
| 677 | return std::nullopt; | 
|---|
| 678 |  | 
|---|
| 679 | const MachineOperand &Copied = DefInst->getOperand(i: 1); | 
|---|
| 680 | if (!Copied.isImm()) | 
|---|
| 681 | return std::nullopt; | 
|---|
| 682 |  | 
|---|
| 683 | return Copied.getImm(); | 
|---|
| 684 | } | 
|---|
| 685 | } | 
|---|
| 686 |  | 
|---|
| 687 | return std::nullopt; | 
|---|
| 688 | } | 
|---|
| 689 |  | 
|---|
| 690 | std::unique_ptr<SDWAOperand> | 
|---|
| 691 | SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { | 
|---|
| 692 | unsigned Opcode = MI.getOpcode(); | 
|---|
| 693 | switch (Opcode) { | 
|---|
| 694 | case AMDGPU::V_LSHRREV_B32_e32: | 
|---|
| 695 | case AMDGPU::V_ASHRREV_I32_e32: | 
|---|
| 696 | case AMDGPU::V_LSHLREV_B32_e32: | 
|---|
| 697 | case AMDGPU::V_LSHRREV_B32_e64: | 
|---|
| 698 | case AMDGPU::V_ASHRREV_I32_e64: | 
|---|
| 699 | case AMDGPU::V_LSHLREV_B32_e64: { | 
|---|
| 700 | // from: v_lshrrev_b32_e32 v1, 16/24, v0 | 
|---|
| 701 | // to SDWA src:v0 src_sel:WORD_1/BYTE_3 | 
|---|
| 702 |  | 
|---|
| 703 | // from: v_ashrrev_i32_e32 v1, 16/24, v0 | 
|---|
| 704 | // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 | 
|---|
| 705 |  | 
|---|
| 706 | // from: v_lshlrev_b32_e32 v1, 16/24, v0 | 
|---|
| 707 | // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD | 
|---|
| 708 | MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0); | 
|---|
| 709 | auto Imm = foldToImm(Op: *Src0); | 
|---|
| 710 | if (!Imm) | 
|---|
| 711 | break; | 
|---|
| 712 |  | 
|---|
| 713 | if (*Imm != 16 && *Imm != 24) | 
|---|
| 714 | break; | 
|---|
| 715 |  | 
|---|
| 716 | MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1); | 
|---|
| 717 | MachineOperand *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst); | 
|---|
| 718 | if (!Src1->isReg() || Src1->getReg().isPhysical() || | 
|---|
| 719 | Dst->getReg().isPhysical()) | 
|---|
| 720 | break; | 
|---|
| 721 |  | 
|---|
| 722 | if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || | 
|---|
| 723 | Opcode == AMDGPU::V_LSHLREV_B32_e64) { | 
|---|
| 724 | return std::make_unique<SDWADstOperand>( | 
|---|
| 725 | args&: Dst, args&: Src1, args: *Imm == 16 ? WORD_1 : BYTE_3, args: UNUSED_PAD); | 
|---|
| 726 | } | 
|---|
| 727 | return std::make_unique<SDWASrcOperand>( | 
|---|
| 728 | args&: Src1, args&: Dst, args: *Imm == 16 ? WORD_1 : BYTE_3, args: false, args: false, | 
|---|
| 729 | args: Opcode != AMDGPU::V_LSHRREV_B32_e32 && | 
|---|
| 730 | Opcode != AMDGPU::V_LSHRREV_B32_e64); | 
|---|
| 731 | break; | 
|---|
| 732 | } | 
|---|
| 733 |  | 
|---|
| 734 | case AMDGPU::V_LSHRREV_B16_e32: | 
|---|
| 735 | case AMDGPU::V_ASHRREV_I16_e32: | 
|---|
| 736 | case AMDGPU::V_LSHLREV_B16_e32: | 
|---|
| 737 | case AMDGPU::V_LSHRREV_B16_e64: | 
|---|
| 738 | case AMDGPU::V_LSHRREV_B16_opsel_e64: | 
|---|
| 739 | case AMDGPU::V_ASHRREV_I16_e64: | 
|---|
| 740 | case AMDGPU::V_LSHLREV_B16_opsel_e64: | 
|---|
| 741 | case AMDGPU::V_LSHLREV_B16_e64: { | 
|---|
| 742 | // from: v_lshrrev_b16_e32 v1, 8, v0 | 
|---|
| 743 | // to SDWA src:v0 src_sel:BYTE_1 | 
|---|
| 744 |  | 
|---|
| 745 | // from: v_ashrrev_i16_e32 v1, 8, v0 | 
|---|
| 746 | // to SDWA src:v0 src_sel:BYTE_1 sext:1 | 
|---|
| 747 |  | 
|---|
| 748 | // from: v_lshlrev_b16_e32 v1, 8, v0 | 
|---|
| 749 | // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD | 
|---|
| 750 | MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0); | 
|---|
| 751 | auto Imm = foldToImm(Op: *Src0); | 
|---|
| 752 | if (!Imm || *Imm != 8) | 
|---|
| 753 | break; | 
|---|
| 754 |  | 
|---|
| 755 | MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1); | 
|---|
| 756 | MachineOperand *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst); | 
|---|
| 757 |  | 
|---|
| 758 | if (!Src1->isReg() || Src1->getReg().isPhysical() || | 
|---|
| 759 | Dst->getReg().isPhysical()) | 
|---|
| 760 | break; | 
|---|
| 761 |  | 
|---|
| 762 | if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || | 
|---|
| 763 | Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 || | 
|---|
| 764 | Opcode == AMDGPU::V_LSHLREV_B16_e64) | 
|---|
| 765 | return std::make_unique<SDWADstOperand>(args&: Dst, args&: Src1, args: BYTE_1, args: UNUSED_PAD); | 
|---|
| 766 | return std::make_unique<SDWASrcOperand>( | 
|---|
| 767 | args&: Src1, args&: Dst, args: BYTE_1, args: false, args: false, | 
|---|
| 768 | args: Opcode != AMDGPU::V_LSHRREV_B16_e32 && | 
|---|
| 769 | Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 && | 
|---|
| 770 | Opcode != AMDGPU::V_LSHRREV_B16_e64); | 
|---|
| 771 | break; | 
|---|
| 772 | } | 
|---|
| 773 |  | 
|---|
| 774 | case AMDGPU::V_BFE_I32_e64: | 
|---|
| 775 | case AMDGPU::V_BFE_U32_e64: { | 
|---|
| 776 | // e.g.: | 
|---|
| 777 | // from: v_bfe_u32 v1, v0, 8, 8 | 
|---|
| 778 | // to SDWA src:v0 src_sel:BYTE_1 | 
|---|
| 779 |  | 
|---|
| 780 | // offset | width | src_sel | 
|---|
| 781 | // ------------------------ | 
|---|
| 782 | // 0      | 8     | BYTE_0 | 
|---|
| 783 | // 0      | 16    | WORD_0 | 
|---|
| 784 | // 0      | 32    | DWORD ? | 
|---|
| 785 | // 8      | 8     | BYTE_1 | 
|---|
| 786 | // 16     | 8     | BYTE_2 | 
|---|
| 787 | // 16     | 16    | WORD_1 | 
|---|
| 788 | // 24     | 8     | BYTE_3 | 
|---|
| 789 |  | 
|---|
| 790 | MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1); | 
|---|
| 791 | auto Offset = foldToImm(Op: *Src1); | 
|---|
| 792 | if (!Offset) | 
|---|
| 793 | break; | 
|---|
| 794 |  | 
|---|
| 795 | MachineOperand *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2); | 
|---|
| 796 | auto Width = foldToImm(Op: *Src2); | 
|---|
| 797 | if (!Width) | 
|---|
| 798 | break; | 
|---|
| 799 |  | 
|---|
| 800 | SdwaSel SrcSel = DWORD; | 
|---|
| 801 |  | 
|---|
| 802 | if (*Offset == 0 && *Width == 8) | 
|---|
| 803 | SrcSel = BYTE_0; | 
|---|
| 804 | else if (*Offset == 0 && *Width == 16) | 
|---|
| 805 | SrcSel = WORD_0; | 
|---|
| 806 | else if (*Offset == 0 && *Width == 32) | 
|---|
| 807 | SrcSel = DWORD; | 
|---|
| 808 | else if (*Offset == 8 && *Width == 8) | 
|---|
| 809 | SrcSel = BYTE_1; | 
|---|
| 810 | else if (*Offset == 16 && *Width == 8) | 
|---|
| 811 | SrcSel = BYTE_2; | 
|---|
| 812 | else if (*Offset == 16 && *Width == 16) | 
|---|
| 813 | SrcSel = WORD_1; | 
|---|
| 814 | else if (*Offset == 24 && *Width == 8) | 
|---|
| 815 | SrcSel = BYTE_3; | 
|---|
| 816 | else | 
|---|
| 817 | break; | 
|---|
| 818 |  | 
|---|
| 819 | MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0); | 
|---|
| 820 | MachineOperand *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst); | 
|---|
| 821 |  | 
|---|
| 822 | if (!Src0->isReg() || Src0->getReg().isPhysical() || | 
|---|
| 823 | Dst->getReg().isPhysical()) | 
|---|
| 824 | break; | 
|---|
| 825 |  | 
|---|
| 826 | return std::make_unique<SDWASrcOperand>( | 
|---|
| 827 | args&: Src0, args&: Dst, args&: SrcSel, args: false, args: false, args: Opcode != AMDGPU::V_BFE_U32_e64); | 
|---|
| 828 | } | 
|---|
| 829 |  | 
|---|
| 830 | case AMDGPU::V_AND_B32_e32: | 
|---|
| 831 | case AMDGPU::V_AND_B32_e64: { | 
|---|
| 832 | // e.g.: | 
|---|
| 833 | // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 | 
|---|
| 834 | // to SDWA src:v0 src_sel:WORD_0/BYTE_0 | 
|---|
| 835 |  | 
|---|
| 836 | MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0); | 
|---|
| 837 | MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1); | 
|---|
| 838 | auto *ValSrc = Src1; | 
|---|
| 839 | auto Imm = foldToImm(Op: *Src0); | 
|---|
| 840 |  | 
|---|
| 841 | if (!Imm) { | 
|---|
| 842 | Imm = foldToImm(Op: *Src1); | 
|---|
| 843 | ValSrc = Src0; | 
|---|
| 844 | } | 
|---|
| 845 |  | 
|---|
| 846 | if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) | 
|---|
| 847 | break; | 
|---|
| 848 |  | 
|---|
| 849 | MachineOperand *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst); | 
|---|
| 850 |  | 
|---|
| 851 | if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() || | 
|---|
| 852 | Dst->getReg().isPhysical()) | 
|---|
| 853 | break; | 
|---|
| 854 |  | 
|---|
| 855 | return std::make_unique<SDWASrcOperand>( | 
|---|
| 856 | args&: ValSrc, args&: Dst, args: *Imm == 0x0000ffff ? WORD_0 : BYTE_0); | 
|---|
| 857 | } | 
|---|
| 858 |  | 
|---|
| 859 | case AMDGPU::V_OR_B32_e32: | 
|---|
| 860 | case AMDGPU::V_OR_B32_e64: { | 
|---|
| 861 | // Patterns for dst_unused:UNUSED_PRESERVE. | 
|---|
| 862 | // e.g., from: | 
|---|
| 863 | // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD | 
|---|
| 864 | //                           src1_sel:WORD_1 src2_sel:WORD1 | 
|---|
| 865 | // v_add_f16_e32 v3, v1, v2 | 
|---|
| 866 | // v_or_b32_e32 v4, v0, v3 | 
|---|
| 867 | // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3 | 
|---|
| 868 |  | 
|---|
| 869 | // Check if one of operands of v_or_b32 is SDWA instruction | 
|---|
| 870 | using CheckRetType = | 
|---|
| 871 | std::optional<std::pair<MachineOperand *, MachineOperand *>>; | 
|---|
| 872 | auto CheckOROperandsForSDWA = | 
|---|
| 873 | [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType { | 
|---|
| 874 | if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg()) | 
|---|
| 875 | return CheckRetType(std::nullopt); | 
|---|
| 876 |  | 
|---|
| 877 | MachineOperand *Op1Def = findSingleRegDef(Reg: Op1, MRI); | 
|---|
| 878 | if (!Op1Def) | 
|---|
| 879 | return CheckRetType(std::nullopt); | 
|---|
| 880 |  | 
|---|
| 881 | MachineInstr *Op1Inst = Op1Def->getParent(); | 
|---|
| 882 | if (!TII->isSDWA(MI: *Op1Inst)) | 
|---|
| 883 | return CheckRetType(std::nullopt); | 
|---|
| 884 |  | 
|---|
| 885 | MachineOperand *Op2Def = findSingleRegDef(Reg: Op2, MRI); | 
|---|
| 886 | if (!Op2Def) | 
|---|
| 887 | return CheckRetType(std::nullopt); | 
|---|
| 888 |  | 
|---|
| 889 | return CheckRetType(std::pair(Op1Def, Op2Def)); | 
|---|
| 890 | }; | 
|---|
| 891 |  | 
|---|
| 892 | MachineOperand *OrSDWA = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0); | 
|---|
| 893 | MachineOperand *OrOther = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1); | 
|---|
| 894 | assert(OrSDWA && OrOther); | 
|---|
| 895 | auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther); | 
|---|
| 896 | if (!Res) { | 
|---|
| 897 | OrSDWA = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1); | 
|---|
| 898 | OrOther = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0); | 
|---|
| 899 | assert(OrSDWA && OrOther); | 
|---|
| 900 | Res = CheckOROperandsForSDWA(OrSDWA, OrOther); | 
|---|
| 901 | if (!Res) | 
|---|
| 902 | break; | 
|---|
| 903 | } | 
|---|
| 904 |  | 
|---|
| 905 | MachineOperand *OrSDWADef = Res->first; | 
|---|
| 906 | MachineOperand *OrOtherDef = Res->second; | 
|---|
| 907 | assert(OrSDWADef && OrOtherDef); | 
|---|
| 908 |  | 
|---|
| 909 | MachineInstr *SDWAInst = OrSDWADef->getParent(); | 
|---|
| 910 | MachineInstr *OtherInst = OrOtherDef->getParent(); | 
|---|
| 911 |  | 
|---|
| 912 | // Check that OtherInstr is actually bitwise compatible with SDWAInst = their | 
|---|
| 913 | // destination patterns don't overlap. Compatible instruction can be either | 
|---|
| 914 | // regular instruction with compatible bitness or SDWA instruction with | 
|---|
| 915 | // correct dst_sel | 
|---|
| 916 | // SDWAInst | OtherInst bitness / OtherInst dst_sel | 
|---|
| 917 | // ----------------------------------------------------- | 
|---|
| 918 | // DWORD    | no                    / no | 
|---|
| 919 | // WORD_0   | no                    / BYTE_2/3, WORD_1 | 
|---|
| 920 | // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0 | 
|---|
| 921 | // BYTE_0   | no                    / BYTE_1/2/3, WORD_1 | 
|---|
| 922 | // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1 | 
|---|
| 923 | // BYTE_2   | 8/16-bit              / BYTE_0/1/3. WORD_0 | 
|---|
| 924 | // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0 | 
|---|
| 925 | // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK | 
|---|
| 926 | // but v_add_f32 is not. | 
|---|
| 927 |  | 
|---|
| 928 | // TODO: add support for non-SDWA instructions as OtherInst. | 
|---|
| 929 | // For now this only works with SDWA instructions. For regular instructions | 
|---|
| 930 | // there is no way to determine if the instruction writes only 8/16/24-bit | 
|---|
| 931 | // out of full register size and all registers are at min 32-bit wide. | 
|---|
| 932 | if (!TII->isSDWA(MI: *OtherInst)) | 
|---|
| 933 | break; | 
|---|
| 934 |  | 
|---|
| 935 | SdwaSel DstSel = static_cast<SdwaSel>( | 
|---|
| 936 | TII->getNamedImmOperand(MI: *SDWAInst, OperandName: AMDGPU::OpName::dst_sel)); | 
|---|
| 937 | SdwaSel OtherDstSel = static_cast<SdwaSel>( | 
|---|
| 938 | TII->getNamedImmOperand(MI: *OtherInst, OperandName: AMDGPU::OpName::dst_sel)); | 
|---|
| 939 |  | 
|---|
| 940 | bool DstSelAgree = false; | 
|---|
| 941 | switch (DstSel) { | 
|---|
| 942 | case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) || | 
|---|
| 943 | (OtherDstSel == BYTE_3) || | 
|---|
| 944 | (OtherDstSel == WORD_1)); | 
|---|
| 945 | break; | 
|---|
| 946 | case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) || | 
|---|
| 947 | (OtherDstSel == BYTE_1) || | 
|---|
| 948 | (OtherDstSel == WORD_0)); | 
|---|
| 949 | break; | 
|---|
| 950 | case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) || | 
|---|
| 951 | (OtherDstSel == BYTE_2) || | 
|---|
| 952 | (OtherDstSel == BYTE_3) || | 
|---|
| 953 | (OtherDstSel == WORD_1)); | 
|---|
| 954 | break; | 
|---|
| 955 | case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) || | 
|---|
| 956 | (OtherDstSel == BYTE_2) || | 
|---|
| 957 | (OtherDstSel == BYTE_3) || | 
|---|
| 958 | (OtherDstSel == WORD_1)); | 
|---|
| 959 | break; | 
|---|
| 960 | case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) || | 
|---|
| 961 | (OtherDstSel == BYTE_1) || | 
|---|
| 962 | (OtherDstSel == BYTE_3) || | 
|---|
| 963 | (OtherDstSel == WORD_0)); | 
|---|
| 964 | break; | 
|---|
| 965 | case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) || | 
|---|
| 966 | (OtherDstSel == BYTE_1) || | 
|---|
| 967 | (OtherDstSel == BYTE_2) || | 
|---|
| 968 | (OtherDstSel == WORD_0)); | 
|---|
| 969 | break; | 
|---|
| 970 | default: DstSelAgree = false; | 
|---|
| 971 | } | 
|---|
| 972 |  | 
|---|
| 973 | if (!DstSelAgree) | 
|---|
| 974 | break; | 
|---|
| 975 |  | 
|---|
| 976 | // Also OtherInst dst_unused should be UNUSED_PAD | 
|---|
| 977 | DstUnused OtherDstUnused = static_cast<DstUnused>( | 
|---|
| 978 | TII->getNamedImmOperand(MI: *OtherInst, OperandName: AMDGPU::OpName::dst_unused)); | 
|---|
| 979 | if (OtherDstUnused != DstUnused::UNUSED_PAD) | 
|---|
| 980 | break; | 
|---|
| 981 |  | 
|---|
| 982 | // Create DstPreserveOperand | 
|---|
| 983 | MachineOperand *OrDst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst); | 
|---|
| 984 | assert(OrDst && OrDst->isReg()); | 
|---|
| 985 |  | 
|---|
| 986 | return std::make_unique<SDWADstPreserveOperand>( | 
|---|
| 987 | args&: OrDst, args&: OrSDWADef, args&: OrOtherDef, args&: DstSel); | 
|---|
| 988 |  | 
|---|
| 989 | } | 
|---|
| 990 | } | 
|---|
| 991 |  | 
|---|
| 992 | return std::unique_ptr<SDWAOperand>(nullptr); | 
|---|
| 993 | } | 
|---|
| 994 |  | 
|---|
| 995 | #if !defined(NDEBUG) | 
|---|
| 996 | static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) { | 
|---|
| 997 | Operand.print(OS); | 
|---|
| 998 | return OS; | 
|---|
| 999 | } | 
|---|
| 1000 | #endif | 
|---|
| 1001 |  | 
|---|
| 1002 | void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { | 
|---|
| 1003 | for (MachineInstr &MI : MBB) { | 
|---|
| 1004 | if (auto Operand = matchSDWAOperand(MI)) { | 
|---|
| 1005 | LLVM_DEBUG(dbgs() << "Match: "<< MI << "To: "<< *Operand << '\n'); | 
|---|
| 1006 | SDWAOperands[&MI] = std::move(Operand); | 
|---|
| 1007 | ++NumSDWAPatternsFound; | 
|---|
| 1008 | } | 
|---|
| 1009 | } | 
|---|
| 1010 | } | 
|---|
| 1011 |  | 
|---|
| 1012 | // Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows | 
|---|
| 1013 | // isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into | 
|---|
| 1014 | // V_ADD_CO_U32_sdwa. | 
|---|
| 1015 | // | 
|---|
| 1016 | // We are transforming from a VOP3 into a VOP2 form of the instruction. | 
|---|
| 1017 | //   %19:vgpr_32 = V_AND_B32_e32 255, | 
|---|
| 1018 | //       killed %16:vgpr_32, implicit $exec | 
|---|
| 1019 | //   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64 | 
|---|
| 1020 | //       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec | 
|---|
| 1021 | //  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 | 
|---|
| 1022 | //       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec | 
|---|
| 1023 | // | 
|---|
| 1024 | // becomes | 
|---|
| 1025 | //   %47:vgpr_32 = V_ADD_CO_U32_sdwa | 
|---|
| 1026 | //       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0, | 
|---|
| 1027 | //       implicit-def $vcc, implicit $exec | 
|---|
| 1028 | //  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 | 
|---|
| 1029 | //       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec | 
|---|
| 1030 | void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, | 
|---|
| 1031 | const GCNSubtarget &ST) const { | 
|---|
| 1032 | int Opc = MI.getOpcode(); | 
|---|
| 1033 | assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) && | 
|---|
| 1034 | "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64"); | 
|---|
| 1035 |  | 
|---|
| 1036 | // Can the candidate MI be shrunk? | 
|---|
| 1037 | if (!TII->canShrink(MI, MRI: *MRI)) | 
|---|
| 1038 | return; | 
|---|
| 1039 | Opc = AMDGPU::getVOPe32(Opcode: Opc); | 
|---|
| 1040 | // Find the related ADD instruction. | 
|---|
| 1041 | const MachineOperand *Sdst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst); | 
|---|
| 1042 | if (!Sdst) | 
|---|
| 1043 | return; | 
|---|
| 1044 | MachineOperand *NextOp = findSingleRegUse(Reg: Sdst, MRI); | 
|---|
| 1045 | if (!NextOp) | 
|---|
| 1046 | return; | 
|---|
| 1047 | MachineInstr &MISucc = *NextOp->getParent(); | 
|---|
| 1048 |  | 
|---|
| 1049 | // Make sure the carry in/out are subsequently unused. | 
|---|
| 1050 | MachineOperand *CarryIn = TII->getNamedOperand(MI&: MISucc, OperandName: AMDGPU::OpName::src2); | 
|---|
| 1051 | if (!CarryIn) | 
|---|
| 1052 | return; | 
|---|
| 1053 | MachineOperand *CarryOut = TII->getNamedOperand(MI&: MISucc, OperandName: AMDGPU::OpName::sdst); | 
|---|
| 1054 | if (!CarryOut) | 
|---|
| 1055 | return; | 
|---|
| 1056 | if (!MRI->hasOneUse(RegNo: CarryIn->getReg()) || !MRI->use_empty(RegNo: CarryOut->getReg())) | 
|---|
| 1057 | return; | 
|---|
| 1058 | // Make sure VCC or its subregs are dead before MI. | 
|---|
| 1059 | MachineBasicBlock &MBB = *MI.getParent(); | 
|---|
| 1060 | MachineBasicBlock::LivenessQueryResult Liveness = | 
|---|
| 1061 | MBB.computeRegisterLiveness(TRI, Reg: AMDGPU::VCC, Before: MI, Neighborhood: 25); | 
|---|
| 1062 | if (Liveness != MachineBasicBlock::LQR_Dead) | 
|---|
| 1063 | return; | 
|---|
| 1064 | // Check if VCC is referenced in range of (MI,MISucc]. | 
|---|
| 1065 | for (auto I = std::next(x: MI.getIterator()), E = MISucc.getIterator(); | 
|---|
| 1066 | I != E; ++I) { | 
|---|
| 1067 | if (I->modifiesRegister(Reg: AMDGPU::VCC, TRI)) | 
|---|
| 1068 | return; | 
|---|
| 1069 | } | 
|---|
| 1070 |  | 
|---|
| 1071 | // Replace MI with V_{SUB|ADD}_I32_e32 | 
|---|
| 1072 | BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc)) | 
|---|
| 1073 | .add(MO: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)) | 
|---|
| 1074 | .add(MO: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0)) | 
|---|
| 1075 | .add(MO: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1)) | 
|---|
| 1076 | .setMIFlags(MI.getFlags()); | 
|---|
| 1077 |  | 
|---|
| 1078 | MI.eraseFromParent(); | 
|---|
| 1079 |  | 
|---|
| 1080 | // Since the carry output of MI is now VCC, update its use in MISucc. | 
|---|
| 1081 |  | 
|---|
| 1082 | MISucc.substituteRegister(FromReg: CarryIn->getReg(), ToReg: TRI->getVCC(), SubIdx: 0, RegInfo: *TRI); | 
|---|
| 1083 | } | 
|---|
| 1084 |  | 
|---|
| 1085 | /// Try to convert an \p MI in VOP3 which takes an src2 carry-in | 
|---|
| 1086 | /// operand into the corresponding VOP2 form which expects the | 
|---|
| 1087 | /// argument in VCC. To this end, add an copy from the carry-in to | 
|---|
| 1088 | /// VCC.  The conversion will only be applied if \p MI can be shrunk | 
|---|
| 1089 | /// to VOP2 and if VCC can be proven to be dead before \p MI. | 
|---|
| 1090 | void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI, | 
|---|
| 1091 | const GCNSubtarget &ST) const { | 
|---|
| 1092 | assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64); | 
|---|
| 1093 |  | 
|---|
| 1094 | LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: "<< MI); | 
|---|
| 1095 | if (!TII->canShrink(MI, MRI: *MRI)) { | 
|---|
| 1096 | LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n"); | 
|---|
| 1097 | return; | 
|---|
| 1098 | } | 
|---|
| 1099 |  | 
|---|
| 1100 | const MachineOperand &CarryIn = | 
|---|
| 1101 | *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2); | 
|---|
| 1102 | Register CarryReg = CarryIn.getReg(); | 
|---|
| 1103 | MachineInstr *CarryDef = MRI->getVRegDef(Reg: CarryReg); | 
|---|
| 1104 | if (!CarryDef) { | 
|---|
| 1105 | LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n"); | 
|---|
| 1106 | return; | 
|---|
| 1107 | } | 
|---|
| 1108 |  | 
|---|
| 1109 | // Make sure VCC or its subregs are dead before MI. | 
|---|
| 1110 | MCRegister Vcc = TRI->getVCC(); | 
|---|
| 1111 | MachineBasicBlock &MBB = *MI.getParent(); | 
|---|
| 1112 | MachineBasicBlock::LivenessQueryResult Liveness = | 
|---|
| 1113 | MBB.computeRegisterLiveness(TRI, Reg: Vcc, Before: MI); | 
|---|
| 1114 | if (Liveness != MachineBasicBlock::LQR_Dead) { | 
|---|
| 1115 | LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n"); | 
|---|
| 1116 | return; | 
|---|
| 1117 | } | 
|---|
| 1118 |  | 
|---|
| 1119 | BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: Vcc).add(MO: CarryIn); | 
|---|
| 1120 |  | 
|---|
| 1121 | auto Converted = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), | 
|---|
| 1122 | MCID: TII->get(Opcode: AMDGPU::getVOPe32(Opcode: MI.getOpcode()))) | 
|---|
| 1123 | .add(MO: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)) | 
|---|
| 1124 | .add(MO: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0)) | 
|---|
| 1125 | .add(MO: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1)) | 
|---|
| 1126 | .setMIFlags(MI.getFlags()); | 
|---|
| 1127 | TII->fixImplicitOperands(MI&: *Converted); | 
|---|
| 1128 | LLVM_DEBUG(dbgs() << "Converted to VOP2: "<< *Converted); | 
|---|
| 1129 | (void)Converted; | 
|---|
| 1130 | MI.eraseFromParent(); | 
|---|
| 1131 | } | 
|---|
| 1132 |  | 
|---|
| 1133 | namespace { | 
|---|
| 1134 | bool isConvertibleToSDWA(MachineInstr &MI, | 
|---|
| 1135 | const GCNSubtarget &ST, | 
|---|
| 1136 | const SIInstrInfo* TII) { | 
|---|
| 1137 | // Check if this is already an SDWA instruction | 
|---|
| 1138 | unsigned Opc = MI.getOpcode(); | 
|---|
| 1139 | if (TII->isSDWA(Opcode: Opc)) | 
|---|
| 1140 | return true; | 
|---|
| 1141 |  | 
|---|
| 1142 | // Can only be handled after ealier conversion to | 
|---|
| 1143 | // AMDGPU::V_CNDMASK_B32_e32 which is not always possible. | 
|---|
| 1144 | if (Opc == AMDGPU::V_CNDMASK_B32_e64) | 
|---|
| 1145 | return false; | 
|---|
| 1146 |  | 
|---|
| 1147 | // Check if this instruction has opcode that supports SDWA | 
|---|
| 1148 | if (AMDGPU::getSDWAOp(Opcode: Opc) == -1) | 
|---|
| 1149 | Opc = AMDGPU::getVOPe32(Opcode: Opc); | 
|---|
| 1150 |  | 
|---|
| 1151 | if (AMDGPU::getSDWAOp(Opcode: Opc) == -1) | 
|---|
| 1152 | return false; | 
|---|
| 1153 |  | 
|---|
| 1154 | if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, OpName: AMDGPU::OpName::omod)) | 
|---|
| 1155 | return false; | 
|---|
| 1156 |  | 
|---|
| 1157 | if (TII->isVOPC(Opcode: Opc)) { | 
|---|
| 1158 | if (!ST.hasSDWASdst()) { | 
|---|
| 1159 | const MachineOperand *SDst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst); | 
|---|
| 1160 | if (SDst && (SDst->getReg() != AMDGPU::VCC && | 
|---|
| 1161 | SDst->getReg() != AMDGPU::VCC_LO)) | 
|---|
| 1162 | return false; | 
|---|
| 1163 | } | 
|---|
| 1164 |  | 
|---|
| 1165 | if (!ST.hasSDWAOutModsVOPC() && | 
|---|
| 1166 | (TII->hasModifiersSet(MI, OpName: AMDGPU::OpName::clamp) || | 
|---|
| 1167 | TII->hasModifiersSet(MI, OpName: AMDGPU::OpName::omod))) | 
|---|
| 1168 | return false; | 
|---|
| 1169 |  | 
|---|
| 1170 | } else if (TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst) || | 
|---|
| 1171 | !TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)) { | 
|---|
| 1172 | return false; | 
|---|
| 1173 | } | 
|---|
| 1174 |  | 
|---|
| 1175 | if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 || | 
|---|
| 1176 | Opc == AMDGPU::V_FMAC_F32_e32 || | 
|---|
| 1177 | Opc == AMDGPU::V_MAC_F16_e32 || | 
|---|
| 1178 | Opc == AMDGPU::V_MAC_F32_e32)) | 
|---|
| 1179 | return false; | 
|---|
| 1180 |  | 
|---|
| 1181 | // Check if target supports this SDWA opcode | 
|---|
| 1182 | if (TII->pseudoToMCOpcode(Opcode: Opc) == -1) | 
|---|
| 1183 | return false; | 
|---|
| 1184 |  | 
|---|
| 1185 | if (MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0)) { | 
|---|
| 1186 | if (!Src0->isReg() && !Src0->isImm()) | 
|---|
| 1187 | return false; | 
|---|
| 1188 | } | 
|---|
| 1189 |  | 
|---|
| 1190 | if (MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1)) { | 
|---|
| 1191 | if (!Src1->isReg() && !Src1->isImm()) | 
|---|
| 1192 | return false; | 
|---|
| 1193 | } | 
|---|
| 1194 |  | 
|---|
| 1195 | return true; | 
|---|
| 1196 | } | 
|---|
| 1197 | } // namespace | 
|---|
| 1198 |  | 
|---|
| 1199 | MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) { | 
|---|
| 1200 | unsigned Opcode = MI.getOpcode(); | 
|---|
| 1201 | assert(!TII->isSDWA(Opcode)); | 
|---|
| 1202 |  | 
|---|
| 1203 | int SDWAOpcode = AMDGPU::getSDWAOp(Opcode); | 
|---|
| 1204 | if (SDWAOpcode == -1) | 
|---|
| 1205 | SDWAOpcode = AMDGPU::getSDWAOp(Opcode: AMDGPU::getVOPe32(Opcode)); | 
|---|
| 1206 | assert(SDWAOpcode != -1); | 
|---|
| 1207 |  | 
|---|
| 1208 | const MCInstrDesc &SDWADesc = TII->get(Opcode: SDWAOpcode); | 
|---|
| 1209 |  | 
|---|
| 1210 | // Create SDWA version of instruction MI and initialize its operands | 
|---|
| 1211 | MachineInstrBuilder SDWAInst = | 
|---|
| 1212 | BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), MCID: SDWADesc) | 
|---|
| 1213 | .setMIFlags(MI.getFlags()); | 
|---|
| 1214 |  | 
|---|
| 1215 | // Copy dst, if it is present in original then should also be present in SDWA | 
|---|
| 1216 | MachineOperand *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst); | 
|---|
| 1217 | if (Dst) { | 
|---|
| 1218 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst)); | 
|---|
| 1219 | SDWAInst.add(MO: *Dst); | 
|---|
| 1220 | } else if ((Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst))) { | 
|---|
| 1221 | assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst)); | 
|---|
| 1222 | SDWAInst.add(MO: *Dst); | 
|---|
| 1223 | } else { | 
|---|
| 1224 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst)); | 
|---|
| 1225 | SDWAInst.addReg(RegNo: TRI->getVCC(), flags: RegState::Define); | 
|---|
| 1226 | } | 
|---|
| 1227 |  | 
|---|
| 1228 | // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and | 
|---|
| 1229 | // src0_modifiers (except for v_nop_sdwa, but it can't get here) | 
|---|
| 1230 | MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0); | 
|---|
| 1231 | assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) && | 
|---|
| 1232 | AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers)); | 
|---|
| 1233 | if (auto *Mod = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers)) | 
|---|
| 1234 | SDWAInst.addImm(Val: Mod->getImm()); | 
|---|
| 1235 | else | 
|---|
| 1236 | SDWAInst.addImm(Val: 0); | 
|---|
| 1237 | SDWAInst.add(MO: *Src0); | 
|---|
| 1238 |  | 
|---|
| 1239 | // Copy src1 if present, initialize src1_modifiers. | 
|---|
| 1240 | MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1); | 
|---|
| 1241 | if (Src1) { | 
|---|
| 1242 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) && | 
|---|
| 1243 | AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers)); | 
|---|
| 1244 | if (auto *Mod = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers)) | 
|---|
| 1245 | SDWAInst.addImm(Val: Mod->getImm()); | 
|---|
| 1246 | else | 
|---|
| 1247 | SDWAInst.addImm(Val: 0); | 
|---|
| 1248 | SDWAInst.add(MO: *Src1); | 
|---|
| 1249 | } | 
|---|
| 1250 |  | 
|---|
| 1251 | if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa || | 
|---|
| 1252 | SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa || | 
|---|
| 1253 | SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || | 
|---|
| 1254 | SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { | 
|---|
| 1255 | // v_mac_f16/32 has additional src2 operand tied to vdst | 
|---|
| 1256 | MachineOperand *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2); | 
|---|
| 1257 | assert(Src2); | 
|---|
| 1258 | SDWAInst.add(MO: *Src2); | 
|---|
| 1259 | } | 
|---|
| 1260 |  | 
|---|
| 1261 | // Copy clamp if present, initialize otherwise | 
|---|
| 1262 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp)); | 
|---|
| 1263 | MachineOperand *Clamp = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp); | 
|---|
| 1264 | if (Clamp) { | 
|---|
| 1265 | SDWAInst.add(MO: *Clamp); | 
|---|
| 1266 | } else { | 
|---|
| 1267 | SDWAInst.addImm(Val: 0); | 
|---|
| 1268 | } | 
|---|
| 1269 |  | 
|---|
| 1270 | // Copy omod if present, initialize otherwise if needed | 
|---|
| 1271 | if (AMDGPU::hasNamedOperand(Opcode: SDWAOpcode, NamedIdx: AMDGPU::OpName::omod)) { | 
|---|
| 1272 | MachineOperand *OMod = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::omod); | 
|---|
| 1273 | if (OMod) { | 
|---|
| 1274 | SDWAInst.add(MO: *OMod); | 
|---|
| 1275 | } else { | 
|---|
| 1276 | SDWAInst.addImm(Val: 0); | 
|---|
| 1277 | } | 
|---|
| 1278 | } | 
|---|
| 1279 |  | 
|---|
| 1280 | // Initialize SDWA specific operands | 
|---|
| 1281 | if (AMDGPU::hasNamedOperand(Opcode: SDWAOpcode, NamedIdx: AMDGPU::OpName::dst_sel)) | 
|---|
| 1282 | SDWAInst.addImm(Val: AMDGPU::SDWA::SdwaSel::DWORD); | 
|---|
| 1283 |  | 
|---|
| 1284 | if (AMDGPU::hasNamedOperand(Opcode: SDWAOpcode, NamedIdx: AMDGPU::OpName::dst_unused)) | 
|---|
| 1285 | SDWAInst.addImm(Val: AMDGPU::SDWA::DstUnused::UNUSED_PAD); | 
|---|
| 1286 |  | 
|---|
| 1287 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel)); | 
|---|
| 1288 | SDWAInst.addImm(Val: AMDGPU::SDWA::SdwaSel::DWORD); | 
|---|
| 1289 |  | 
|---|
| 1290 | if (Src1) { | 
|---|
| 1291 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel)); | 
|---|
| 1292 | SDWAInst.addImm(Val: AMDGPU::SDWA::SdwaSel::DWORD); | 
|---|
| 1293 | } | 
|---|
| 1294 |  | 
|---|
| 1295 | // Check for a preserved register that needs to be copied. | 
|---|
| 1296 | MachineInstr *Ret = SDWAInst.getInstr(); | 
|---|
| 1297 | TII->fixImplicitOperands(MI&: *Ret); | 
|---|
| 1298 | return Ret; | 
|---|
| 1299 | } | 
|---|
| 1300 |  | 
|---|
| 1301 | bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, | 
|---|
| 1302 | const SDWAOperandsVector &SDWAOperands) { | 
|---|
| 1303 | LLVM_DEBUG(dbgs() << "Convert instruction:"<< MI); | 
|---|
| 1304 |  | 
|---|
| 1305 | MachineInstr *SDWAInst; | 
|---|
| 1306 | if (TII->isSDWA(Opcode: MI.getOpcode())) { | 
|---|
| 1307 | // Clone the instruction to allow revoking changes | 
|---|
| 1308 | // made to MI during the processing of the operands | 
|---|
| 1309 | // if the conversion fails. | 
|---|
| 1310 | SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(Orig: &MI); | 
|---|
| 1311 | MI.getParent()->insert(I: MI.getIterator(), M: SDWAInst); | 
|---|
| 1312 | } else { | 
|---|
| 1313 | SDWAInst = createSDWAVersion(MI); | 
|---|
| 1314 | } | 
|---|
| 1315 |  | 
|---|
| 1316 | // Apply all sdwa operand patterns. | 
|---|
| 1317 | bool Converted = false; | 
|---|
| 1318 | for (auto &Operand : SDWAOperands) { | 
|---|
| 1319 | LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: "<< *Operand); | 
|---|
| 1320 | // There should be no intersection between SDWA operands and potential MIs | 
|---|
| 1321 | // e.g.: | 
|---|
| 1322 | // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 | 
|---|
| 1323 | // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 | 
|---|
| 1324 | // v_add_u32 v3, v4, v2 | 
|---|
| 1325 | // | 
|---|
| 1326 | // In that example it is possible that we would fold 2nd instruction into | 
|---|
| 1327 | // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that | 
|---|
| 1328 | // was already destroyed). So if SDWAOperand is also a potential MI then do | 
|---|
| 1329 | // not apply it. | 
|---|
| 1330 | if (PotentialMatches.count(Key: Operand->getParentInst()) == 0) | 
|---|
| 1331 | Converted |= Operand->convertToSDWA(MI&: *SDWAInst, TII); | 
|---|
| 1332 | } | 
|---|
| 1333 |  | 
|---|
| 1334 | if (!Converted) { | 
|---|
| 1335 | SDWAInst->eraseFromParent(); | 
|---|
| 1336 | return false; | 
|---|
| 1337 | } | 
|---|
| 1338 |  | 
|---|
| 1339 | ConvertedInstructions.push_back(Elt: SDWAInst); | 
|---|
| 1340 | for (MachineOperand &MO : SDWAInst->uses()) { | 
|---|
| 1341 | if (!MO.isReg()) | 
|---|
| 1342 | continue; | 
|---|
| 1343 |  | 
|---|
| 1344 | MRI->clearKillFlags(Reg: MO.getReg()); | 
|---|
| 1345 | } | 
|---|
| 1346 | LLVM_DEBUG(dbgs() << "\nInto:"<< *SDWAInst << '\n'); | 
|---|
| 1347 | ++NumSDWAInstructionsPeepholed; | 
|---|
| 1348 |  | 
|---|
| 1349 | MI.eraseFromParent(); | 
|---|
| 1350 | return true; | 
|---|
| 1351 | } | 
|---|
| 1352 |  | 
|---|
| 1353 | // If an instruction was converted to SDWA it should not have immediates or SGPR | 
|---|
| 1354 | // operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. | 
|---|
| 1355 | void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, | 
|---|
| 1356 | const GCNSubtarget &ST) const { | 
|---|
| 1357 | const MCInstrDesc &Desc = TII->get(Opcode: MI.getOpcode()); | 
|---|
| 1358 | unsigned ConstantBusCount = 0; | 
|---|
| 1359 | for (MachineOperand &Op : MI.explicit_uses()) { | 
|---|
| 1360 | if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(MRI: *MRI, Reg: Op.getReg()))) | 
|---|
| 1361 | continue; | 
|---|
| 1362 |  | 
|---|
| 1363 | unsigned I = Op.getOperandNo(); | 
|---|
| 1364 | if (Desc.operands()[I].RegClass == -1 || | 
|---|
| 1365 | !TRI->isVSSuperClass(RC: TRI->getRegClass(RCID: Desc.operands()[I].RegClass))) | 
|---|
| 1366 | continue; | 
|---|
| 1367 |  | 
|---|
| 1368 | if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && | 
|---|
| 1369 | TRI->isSGPRReg(MRI: *MRI, Reg: Op.getReg())) { | 
|---|
| 1370 | ++ConstantBusCount; | 
|---|
| 1371 | continue; | 
|---|
| 1372 | } | 
|---|
| 1373 |  | 
|---|
| 1374 | Register VGPR = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); | 
|---|
| 1375 | auto Copy = BuildMI(BB&: *MI.getParent(), I: MI.getIterator(), MIMD: MI.getDebugLoc(), | 
|---|
| 1376 | MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: VGPR); | 
|---|
| 1377 | if (Op.isImm()) | 
|---|
| 1378 | Copy.addImm(Val: Op.getImm()); | 
|---|
| 1379 | else if (Op.isReg()) | 
|---|
| 1380 | Copy.addReg(RegNo: Op.getReg(), flags: Op.isKill() ? RegState::Kill : 0, | 
|---|
| 1381 | SubReg: Op.getSubReg()); | 
|---|
| 1382 | Op.ChangeToRegister(Reg: VGPR, isDef: false); | 
|---|
| 1383 | } | 
|---|
| 1384 | } | 
|---|
| 1385 |  | 
|---|
| 1386 | bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) { | 
|---|
| 1387 | if (skipFunction(F: MF.getFunction())) | 
|---|
| 1388 | return false; | 
|---|
| 1389 |  | 
|---|
| 1390 | return SIPeepholeSDWA().run(MF); | 
|---|
| 1391 | } | 
|---|
| 1392 |  | 
|---|
| 1393 | bool SIPeepholeSDWA::run(MachineFunction &MF) { | 
|---|
| 1394 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | 
|---|
| 1395 |  | 
|---|
| 1396 | if (!ST.hasSDWA()) | 
|---|
| 1397 | return false; | 
|---|
| 1398 |  | 
|---|
| 1399 | MRI = &MF.getRegInfo(); | 
|---|
| 1400 | TRI = ST.getRegisterInfo(); | 
|---|
| 1401 | TII = ST.getInstrInfo(); | 
|---|
| 1402 |  | 
|---|
| 1403 | // Find all SDWA operands in MF. | 
|---|
| 1404 | bool Ret = false; | 
|---|
| 1405 | for (MachineBasicBlock &MBB : MF) { | 
|---|
| 1406 | bool Changed = false; | 
|---|
| 1407 | do { | 
|---|
| 1408 | // Preprocess the ADD/SUB pairs so they could be SDWA'ed. | 
|---|
| 1409 | // Look for a possible ADD or SUB that resulted from a previously lowered | 
|---|
| 1410 | // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2 | 
|---|
| 1411 | // lowers the pair of instructions into e32 form. | 
|---|
| 1412 | matchSDWAOperands(MBB); | 
|---|
| 1413 | for (const auto &OperandPair : SDWAOperands) { | 
|---|
| 1414 | const auto &Operand = OperandPair.second; | 
|---|
| 1415 | MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST); | 
|---|
| 1416 | if (!PotentialMI) | 
|---|
| 1417 | continue; | 
|---|
| 1418 |  | 
|---|
| 1419 | switch (PotentialMI->getOpcode()) { | 
|---|
| 1420 | case AMDGPU::V_ADD_CO_U32_e64: | 
|---|
| 1421 | case AMDGPU::V_SUB_CO_U32_e64: | 
|---|
| 1422 | pseudoOpConvertToVOP2(MI&: *PotentialMI, ST); | 
|---|
| 1423 | break; | 
|---|
| 1424 | case AMDGPU::V_CNDMASK_B32_e64: | 
|---|
| 1425 | convertVcndmaskToVOP2(MI&: *PotentialMI, ST); | 
|---|
| 1426 | break; | 
|---|
| 1427 | }; | 
|---|
| 1428 | } | 
|---|
| 1429 | SDWAOperands.clear(); | 
|---|
| 1430 |  | 
|---|
| 1431 | // Generate potential match list. | 
|---|
| 1432 | matchSDWAOperands(MBB); | 
|---|
| 1433 |  | 
|---|
| 1434 | for (const auto &OperandPair : SDWAOperands) { | 
|---|
| 1435 | const auto &Operand = OperandPair.second; | 
|---|
| 1436 | MachineInstr *PotentialMI = | 
|---|
| 1437 | Operand->potentialToConvert(TII, ST, PotentialMatches: &PotentialMatches); | 
|---|
| 1438 |  | 
|---|
| 1439 | if (PotentialMI && isConvertibleToSDWA(MI&: *PotentialMI, ST, TII)) | 
|---|
| 1440 | PotentialMatches[PotentialMI].push_back(Elt: Operand.get()); | 
|---|
| 1441 | } | 
|---|
| 1442 |  | 
|---|
| 1443 | for (auto &PotentialPair : PotentialMatches) { | 
|---|
| 1444 | MachineInstr &PotentialMI = *PotentialPair.first; | 
|---|
| 1445 | convertToSDWA(MI&: PotentialMI, SDWAOperands: PotentialPair.second); | 
|---|
| 1446 | } | 
|---|
| 1447 |  | 
|---|
| 1448 | PotentialMatches.clear(); | 
|---|
| 1449 | SDWAOperands.clear(); | 
|---|
| 1450 |  | 
|---|
| 1451 | Changed = !ConvertedInstructions.empty(); | 
|---|
| 1452 |  | 
|---|
| 1453 | if (Changed) | 
|---|
| 1454 | Ret = true; | 
|---|
| 1455 | while (!ConvertedInstructions.empty()) | 
|---|
| 1456 | legalizeScalarOperands(MI&: *ConvertedInstructions.pop_back_val(), ST); | 
|---|
| 1457 | } while (Changed); | 
|---|
| 1458 | } | 
|---|
| 1459 |  | 
|---|
| 1460 | return Ret; | 
|---|
| 1461 | } | 
|---|
| 1462 |  | 
|---|
| 1463 | PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF, | 
|---|
| 1464 | MachineFunctionAnalysisManager &) { | 
|---|
| 1465 | if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF)) | 
|---|
| 1466 | return PreservedAnalyses::all(); | 
|---|
| 1467 |  | 
|---|
| 1468 | PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses(); | 
|---|
| 1469 | PA.preserveSet<CFGAnalyses>(); | 
|---|
| 1470 | return PA; | 
|---|
| 1471 | } | 
|---|
| 1472 |  | 
|---|