//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "SIPeepholeSDWA.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;

using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;

class SIPeepholeSDWA {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  SDWAOperandsMap PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
  void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
  MachineInstr *createSDWAVersion(MachineInstr &MI);
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

public:
  bool run(MachineFunction &MF);
};

class SIPeepholeSDWALegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

using namespace AMDGPU::SDWA;

class SDWAOperand {
private:
  MachineOperand *Target;   // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

  /// Returns true iff the SDWA selection of this SDWAOperand can be combined
  /// with the SDWA selections of its uses in \p MI.
  virtual bool canCombineSelections(const MachineInstr &MI,
                                    const SIInstrInfo *TII) = 0;

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *
  potentialToConvert(const SIInstrInfo *TII, const GCNSubtarget &ST,
                     SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream &OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  MachineInstr *
  potentialToConvert(const SIInstrInfo *TII, const GCNSubtarget &ST,
                     SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *
  potentialToConvert(const SIInstrInfo *TII, const GCNSubtarget &ST,
                     SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
                false)

char SIPeepholeSDWALegacy::ID = 0;

char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;

FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
  return new SIPeepholeSDWALegacy();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream &operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch (Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD";  break;
  }
  return OS;
}

static raw_ostream &operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch (Un) {
  case UNUSED_PAD:      OS << "UNUSED_PAD";      break;
  case UNUSED_SEXT:     OS << "UNUSED_SEXT";     break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream &OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream &OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream &OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

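/// Copy the register, subregister and the status flags that are relevant for
/// the operand kind (undef, plus kill for uses or dead for defs) from \p From
/// to \p To. Both operands must be registers.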
static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

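/// If the register defined by \p Reg is read by exactly one instruction and
/// never through a subregister, return the first such use operand; otherwise
/// return nullptr. \p Reg itself must be a register def operand.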
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr.
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg.
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

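/// Return the explicit def operand of the unique instruction that defines
/// the register read by \p Reg, or nullptr if there is no unique defining
/// instruction or the register is only defined implicitly there.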
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

/// Combine an SDWA instruction's existing SDWA selection \p Sel with
/// the SDWA selection \p OperandSel of its operand. If the selections
/// are compatible, return the combined selection, otherwise return
/// std::nullopt.
/// For example, if we have Sel = BYTE_0 and OperandSel = WORD_1:
///   BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
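/// Further examples, following the combination rules implemented below:
///   WORD_0 Sel (WORD_1 Sel (%X)) -> WORD_1 Sel (%X)
///   BYTE_1 Sel (WORD_1 Sel (%X)) -> BYTE_3 Sel (%X)
///   BYTE_0 Sel (BYTE_1 Sel (%X)) -> incompatible, std::nullopt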
static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
  if (Sel == SdwaSel::DWORD)
    return OperandSel;

  if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
    return Sel;

  if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
      Sel == SdwaSel::BYTE_3)
    return {};

  if (OperandSel == SdwaSel::WORD_0)
    return Sel;

  if (OperandSel == SdwaSel::WORD_1) {
    if (Sel == SdwaSel::BYTE_0)
      return SdwaSel::BYTE_2;
    if (Sel == SdwaSel::BYTE_1)
      return SdwaSel::BYTE_3;
    if (Sel == SdwaSel::WORD_0)
      return SdwaSel::WORD_1;
  }

  return {};
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *
SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted.
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted.
      if (!isConvertibleToSDWA(UseMI, ST, TII) ||
          !canCombineSelections(UseMI, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here.
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO =
      findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  MachineInstr *Parent = PotentialMO->getParent();

  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  case AMDGPU::V_CNDMASK_B32_sdwa:
    // SISrcMods uses the same bitmask for SEXT and NEG modifiers and
    // hence the compiler can only support one type of modifier for
    // each SDWA instruction. For V_CNDMASK_B32_sdwa, this is NEG
    // since its operands get printed using
    // AMDGPUInstPrinter::printOperandAndFPInputMods which produces
    // the output intended for NEG if SEXT is set.
    //
    // The ISA does actually support both modifiers on most SDWA
    // instructions.
    //
    // FIXME: Accept SEXT here after fixing this issue.
    if (Sext)
      return false;
    break;
  }

  // Find the operand in the instruction that matches the source operand,
  // replace it with the target operand, and set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1.
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src || !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for UNUSED_PRESERVE, in
      // which case we can either abandon the peephole attempt, or, if legal,
      // copy the target operand into the tied slot if the preserve operation
      // will effectively cause the same result by overwriting the rest of
      // the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src.
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
    SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel()));
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

/// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA
/// instruction \p MI can be combined with the selection \p OpSel.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
  assert(TII->isSDWA(MI.getOpcode()));

  const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName);
  SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());

  return combineSdwaSel(SrcSel, OpSel).has_value();
}

/// Verify that \p Op is the same register as the operand of the SDWA
/// instruction \p MI named by \p SrcOpName and that the SDWA
/// selection \p SrcSelOpName can be combined with \p OpSel.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcOpName,
                            AMDGPU::OpName SrcSelOpName, MachineOperand *Op,
                            SdwaSel OpSel) {
  assert(TII->isSDWA(MI.getOpcode()));

  const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName);
  if (!Src || !isSameReg(*Src, *Op))
    return true;

  return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
}

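// The replaced register may be read by MI as both src0 and src1, so this
// operand's selection has to be combinable with the existing selection of
// every source slot it occupies.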
bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  using namespace AMDGPU;

  return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel,
                         getReplacedOperand(), getSrcSel()) &&
         canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel,
                         getReplacedOperand(), getSrcSel());
}

MachineInstr *
SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst :
       MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  MachineInstr *Parent = PotentialMO->getParent();
  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD.
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);

  SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
  DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value());

  MachineOperand *DstUnused =
      TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter problems with uses of killed operands.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32.
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register.
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use.
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32.
  return SDWADstOperand::convertToSDWA(MI, TII);
}

bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
                                                  const SIInstrInfo *TII) {
  return SDWADstOperand::canCombineSelections(MI, TII);
}

std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B32_e64);
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_opsel_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_opsel_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 &&
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
    // preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction.
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1,
            const MachineOperand *Op2) -> CheckRetType {
      if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
        return CheckRetType(std::nullopt);

      MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
      if (!Op1Def)
        return CheckRetType(std::nullopt);

      MachineInstr *Op1Inst = Op1Def->getParent();
      if (!TII->isSDWA(*Op1Inst))
        return CheckRetType(std::nullopt);

      MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
      if (!Op2Def)
        return CheckRetType(std::nullopt);

      return CheckRetType(std::pair(Op1Def, Op2Def));
    };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst,
    // i.e. their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible
    // bitness or an SDWA instruction with the correct dst_sel.
    // SDWAInst | OtherInst bitness     / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular
    // instructions there is no way to determine if the instruction writes
    // only 8/16/24 bits out of the full register size, and all registers are
    // at least 32 bits wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0:
      DstSelAgree = ((OtherDstSel == BYTE_2) || (OtherDstSel == BYTE_3) ||
                     (OtherDstSel == WORD_1));
      break;
    case WORD_1:
      DstSelAgree = ((OtherDstSel == BYTE_0) || (OtherDstSel == BYTE_1) ||
                     (OtherDstSel == WORD_0));
      break;
    case BYTE_0:
      DstSelAgree = ((OtherDstSel == BYTE_1) || (OtherDstSel == BYTE_2) ||
                     (OtherDstSel == BYTE_3) || (OtherDstSel == WORD_1));
      break;
    case BYTE_1:
      DstSelAgree = ((OtherDstSel == BYTE_0) || (OtherDstSel == BYTE_2) ||
                     (OtherDstSel == BYTE_3) || (OtherDstSel == WORD_1));
      break;
    case BYTE_2:
      DstSelAgree = ((OtherDstSel == BYTE_0) || (OtherDstSel == BYTE_1) ||
                     (OtherDstSel == BYTE_3) || (OtherDstSel == WORD_0));
      break;
    case BYTE_3:
      DstSelAgree = ((OtherDstSel == BYTE_0) || (OtherDstSel == BYTE_1) ||
                     (OtherDstSel == BYTE_2) || (OtherDstSel == WORD_0));
      break;
    default:
      DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD.
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand.
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(OrDst, OrSDWADef,
                                                    OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream &operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) ||
      !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in the range of (MI, MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32.
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.

  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

/// Try to convert an instruction \p MI in VOP3 form which takes an src2
/// carry-in operand into the corresponding VOP2 form, which expects the
/// argument in VCC. To this end, add a copy from the carry-in to VCC. The
/// conversion is only applied if \p MI can be shrunk to VOP2 and if VCC can
/// be proven to be dead before \p MI.
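///
/// Schematically, with modifier and immediate operands elided:
///   %dst = V_CNDMASK_B32_e64 ..., %a, ..., %b, %cc
/// becomes
///   $vcc = COPY %cc
///   %dst = V_CNDMASK_B32_e32 %a, %b, implicit $vcc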
void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);

  LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
  if (!TII->canShrink(MI, *MRI)) {
    LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
    return;
  }

  const MachineOperand &CarryIn =
      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  Register CarryReg = CarryIn.getReg();
  MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
  if (!CarryDef) {
    LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
    return;
  }

  // Make sure VCC or its subregs are dead before MI.
  MCRegister Vcc = TRI->getVCC();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, Vcc, MI);
  if (Liveness != MachineBasicBlock::LQR_Dead) {
    LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
    return;
  }

  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);

  auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
                           TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
                       .setMIFlags(MI.getFlags());
  TII->fixImplicitOperands(*Converted);
  LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
  (void)Converted;
  MI.eraseFromParent();
}

namespace {
bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII) {
  // Check if this is already an SDWA instruction.
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Can only be handled after earlier conversion to
  // AMDGPU::V_CNDMASK_B32_e32, which is not always possible.
  if (Opc == AMDGPU::V_CNDMASK_B32_e64)
    return false;

  // Check if this instruction has an opcode that supports SDWA.
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst =
          TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode.
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
} // namespace

MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  assert(!TII->isSDWA(Opcode));

  int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
  if (SDWAOpcode == -1)
    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands.
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst; if it is present in the original then it should also be present
  // in the SDWA version.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, but it can't get here).
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present and initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst.
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise.
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Initialize SDWA-specific operands.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused))
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);

  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Check for a preserved register that needs to be copied.
  MachineInstr *Ret = SDWAInst.getInstr();
  TII->fixImplicitOperands(*Ret);
  return Ret;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  MachineInstr *SDWAInst;
  if (TII->isSDWA(MI.getOpcode())) {
    // Clone the instruction to allow revoking changes
    // made to MI during the processing of the operands
    // if the conversion fails.
    SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
    MI.getParent()->insert(MI.getIterator(), SDWAInst);
  } else {
    SDWAInst = createSDWAVersion(MI);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs,
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is also
    // a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (!Converted) {
    SDWAInst->eraseFromParent();
    return false;
  }

  ConvertedInstructions.push_back(SDWAInst);
  for (MachineOperand &MO : SDWAInst->uses()) {
    if (!MO.isReg())
      continue;

    MRI->clearKillFlags(MO.getReg());
  }
  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR operand is allowed on GFX9). Copy its scalar
// operands into VGPRs.
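//
// Schematically, for an offending operand:
//   ..._sdwa %dst, %sgpr_or_imm, ...
// becomes
//   %vgpr:vgpr_32 = V_MOV_B32_e32 %sgpr_or_imm
//   ..._sdwa %dst, %vgpr, ...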
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIPeepholeSDWA().run(MF);
}

bool SIPeepholeSDWA::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they can be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (!PotentialMI)
          continue;

        switch (PotentialMI->getOpcode()) {
        case AMDGPU::V_ADD_CO_U32_e64:
        case AMDGPU::V_SUB_CO_U32_e64:
          pseudoOpConvertToVOP2(*PotentialMI, ST);
          break;
        case AMDGPU::V_CNDMASK_B32_e64:
          convertVcndmaskToVOP2(*PotentialMI, ST);
          break;
        }
      }
      SDWAOperands.clear();

      // Generate the potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);

        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
          PotentialMatches[PotentialMI].push_back(Operand.get());
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}

PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF,
                                          MachineFunctionAnalysisManager &) {
  if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF))
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}