//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "SIPeepholeSDWA.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;

using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;

class SIPeepholeSDWA {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  SDWAOperandsMap PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
  MachineInstr *createSDWAVersion(MachineInstr &MI);
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

public:
  bool run(MachineFunction &MF);
};

class SIPeepholeSDWALegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

using namespace AMDGPU::SDWA;

class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

  /// Returns true iff the SDWA selection of this SDWAOperand can be combined
  /// with the SDWA selections of its uses in \p MI.
  virtual bool canCombineSelections(const MachineInstr &MI,
                                    const SIInstrInfo *TII) = 0;

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getMF()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
                false)

char SIPeepholeSDWALegacy::ID = 0;

char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;

FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
  return new SIPeepholeSDWALegacy();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

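// Copy register number, subregister index, and relevant flags from \p From
// to \p To: undef always, kill only for uses, dead only for defs.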
static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

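/// Return the lone non-debug use of the register defined by \p Reg, or
/// nullptr if \p Reg is not a register def or the use is not unique.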
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  return MRI->getOneNonDBGUse(Reg->getReg());
}

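/// Return the lone def of the register used by \p Reg, or nullptr if \p Reg
/// is not a register or the def is not unique.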
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  return MRI->getOneDef(Reg->getReg());
}

/// Combine an SDWA instruction's existing SDWA selection \p Sel with
/// the SDWA selection \p OperandSel of its operand. If the selections
/// are compatible, return the combined selection, otherwise return
/// std::nullopt.
/// For example, if Sel = BYTE_0 and OperandSel = WORD_1:
///     BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
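/// Combinations that cannot be expressed as a single selection, e.g.
/// WORD_1 Sel (BYTE_0 Sel (%X)), yield std::nullopt.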
static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
  if (Sel == SdwaSel::DWORD)
    return OperandSel;

  if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
    return Sel;

  if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
      Sel == SdwaSel::BYTE_3)
    return {};

  if (OperandSel == SdwaSel::WORD_0)
    return Sel;

  if (OperandSel == SdwaSel::WORD_1) {
    if (Sel == SdwaSel::BYTE_0)
      return SdwaSel::BYTE_2;
    if (Sel == SdwaSel::BYTE_1)
      return SdwaSel::BYTE_3;
    if (Sel == SdwaSel::WORD_0)
      return SdwaSel::WORD_1;
  }

  return {};
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
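    // NEG is XOR'ed rather than OR'ed so that a negation already present on
    // the operand and the negation matched by this pattern cancel out.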
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted
      if (!isConvertibleToSDWA(UseMI, ST, TII) ||
          !canCombineSelections(UseMI, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand, the potential instruction is one that uses the
  // register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  MachineInstr *Parent = PotentialMO->getParent();

  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  case AMDGPU::V_CNDMASK_B32_sdwa:
    // SISrcMods uses the same bitmask for SEXT and NEG modifiers and
    // hence the compiler can only support one type of modifier for
    // each SDWA instruction. For V_CNDMASK_B32_sdwa, this is NEG
    // since its operands get printed using
    // AMDGPUInstPrinter::printOperandAndFPInputMods which produces
    // the output intended for NEG if SEXT is set.
    //
    // The ISA does actually support both modifiers on most SDWA
    // instructions.
    //
    // FIXME Accept SEXT here after fixing this issue.
    if (Sext)
      return false;
    break;
  }

  // Find the operand in the instruction that matches the source operand and
  // replace it with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
      // src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
    SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel()));
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

/// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA
/// instruction \p MI can be combined with the selection \p OpSel.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
  assert(TII->isSDWA(MI.getOpcode()));

  const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName);
  SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());

  return combineSdwaSel(SrcSel, OpSel).has_value();
}

/// Verify that \p Op is the same register as the operand of the SDWA
/// instruction \p MI named by \p SrcOpName and that the SDWA
/// selection \p SrcSelOpName can be combined with \p OpSel.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcOpName,
                            AMDGPU::OpName SrcSelOpName, MachineOperand *Op,
                            SdwaSel OpSel) {
  assert(TII->isSDWA(MI.getOpcode()));

  const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName);
  if (!Src || !isSameReg(*Src, *Op))
    return true;

  return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
}

bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  using namespace AMDGPU;

  return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel,
                         getReplacedOperand(), getSrcSel()) &&
         canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel,
                         getReplacedOperand(), getSrcSel());
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  // For an SDWA dst operand, the potential instruction is one that defines
  // the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced register
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  MachineInstr *Parent = PotentialMO->getParent();
  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);

  SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
  DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value());

  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter a problem with the use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}

bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
                                                  const SIInstrInfo *TII) {
  return SDWADstOperand::canCombineSelections(MI, TII);
}

std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B32_e64);
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_opsel_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_opsel_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 &&
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src0_sel:WORD_1 src1_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
      if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
        return CheckRetType(std::nullopt);

      MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
      if (!Op1Def)
        return CheckRetType(std::nullopt);

      MachineInstr *Op1Inst = Op1Def->getParent();
      if (!TII->isSDWA(*Op1Inst))
        return CheckRetType(std::nullopt);

      MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
      if (!Op2Def)
        return CheckRetType(std::nullopt);

      return CheckRetType(std::pair(Op1Def, Op2Def));
    };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst,
    // i.e. their destination patterns don't overlap. A compatible instruction
    // can be either a regular instruction with compatible bitness or an SDWA
    // instruction with a correct dst_sel.
    // SDWAInst | OtherInst bitness     / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size and all registers are at min 32-bit wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneNonDBGUse(CarryIn->getReg()) ||
      !MRI->use_nodbg_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in the range (MI,MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.

  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

/// Try to convert \p MI, a VOP3 instruction that takes a src2 carry-in
/// operand, into the corresponding VOP2 form, which expects the
/// argument in VCC. To this end, add a copy from the carry-in to
/// VCC. The conversion will only be applied if \p MI can be shrunk
/// to VOP2 and if VCC can be proven to be dead before \p MI.
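///
/// Illustrative MIR sketch (register classes and immediate modifier
/// operands abbreviated):
///   %dst:vgpr_32 = V_CNDMASK_B32_e64 0, %a, 0, %b, %cc
/// becomes
///   $vcc = COPY %cc
///   %dst:vgpr_32 = V_CNDMASK_B32_e32 %a, %b, implicit $vcc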
void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);

  LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
  if (!TII->canShrink(MI, *MRI)) {
    LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
    return;
  }

  const MachineOperand &CarryIn =
      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  Register CarryReg = CarryIn.getReg();
  MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
  if (!CarryDef) {
    LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
    return;
  }

  // Make sure VCC or its subregs are dead before MI.
  MCRegister Vcc = TRI->getVCC();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, Vcc, MI);
  if (Liveness != MachineBasicBlock::LQR_Dead) {
    LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
    return;
  }

  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);

  auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
                           TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
                       .setMIFlags(MI.getFlags());
  TII->fixImplicitOperands(*Converted);
  LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
  (void)Converted;
  MI.eraseFromParent();
}

namespace {
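// Check whether \p MI, or its VOP2 shrink when it has one, has an SDWA
// counterpart and whether the operands and subtarget features permit the
// conversion.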
bool isConvertibleToSDWA(MachineInstr &MI,
                         const GCNSubtarget &ST,
                         const SIInstrInfo* TII) {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Can only be handled after earlier conversion to
  // AMDGPU::V_CNDMASK_B32_e32 which is not always possible.
  if (Opc == AMDGPU::V_CNDMASK_B32_e64)
    return false;

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
} // namespace

MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  assert(!TII->isSDWA(Opcode));

  int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
  if (SDWAOpcode == -1)
    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst; if it is present in the original then it should also be present
  // in the SDWA version
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All SDWA instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Initialize SDWA-specific operands
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused))
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);

  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Check for a preserved register that needs to be copied.
  MachineInstr *Ret = SDWAInst.getInstr();
  TII->fixImplicitOperands(*Ret);
  return Ret;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  MachineInstr *SDWAInst;
  if (TII->isSDWA(MI.getOpcode())) {
    // Clone the instruction to allow revoking changes
    // made to MI during the processing of the operands
    // if the conversion fails.
    SDWAInst = MI.getMF()->CloneMachineInstr(&MI);
    MI.getParent()->insert(MI.getIterator(), SDWAInst);
  } else {
    SDWAInst = createSDWAVersion(MI);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is
    // also a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (!Converted) {
    SDWAInst->eraseFromParent();
    return false;
  }

  ConvertedInstructions.push_back(SDWAInst);
  for (MachineOperand &MO : SDWAInst->uses()) {
    if (!MO.isReg())
      continue;

    MRI->clearKillFlags(MO.getReg());
  }
  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR operand is allowed on GFX9). Copy its scalar
// operands into VGPRs.
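// E.g. (illustrative sketch; %tmp is hypothetical) for an SGPR use %s of a
// converted instruction:
//   %tmp:vgpr_32 = V_MOV_B32_e32 %s
// is inserted before it, and the instruction then uses %tmp in place of %s.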
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (Op.isReg()) {
      if (TRI->isVGPR(*MRI, Op.getReg()))
        continue;

      if (ST.hasSDWAScalar() && ConstantBusCount == 0) {
        ++ConstantBusCount;
        continue;
      }
    } else if (!Op.isImm())
      continue;

    unsigned I = Op.getOperandNo();
    const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I);
    if (!OpRC || !TRI->isVSSuperClass(OpRC))
      continue;

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), getKillRegState(Op.isKill()), Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIPeepholeSDWA().run(MF);
}

bool SIPeepholeSDWA::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they can be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (!PotentialMI)
          continue;

        switch (PotentialMI->getOpcode()) {
        case AMDGPU::V_ADD_CO_U32_e64:
        case AMDGPU::V_SUB_CO_U32_e64:
          pseudoOpConvertToVOP2(*PotentialMI, ST);
          break;
        case AMDGPU::V_CNDMASK_B32_e64:
          convertVcndmaskToVOP2(*PotentialMI, ST);
          break;
        }
      }
      SDWAOperands.clear();

      // Generate the potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);

        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
          PotentialMatches[PotentialMI].push_back(Operand.get());
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}

PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF,
                                          MachineFunctionAnalysisManager &) {
  if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF))
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}