1//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass tries to apply several peephole SDWA patterns.
10///
11/// E.g. original:
12/// V_LSHRREV_B32_e32 %0, 16, %1
13/// V_ADD_CO_U32_e32 %2, %0, %3
14/// V_LSHLREV_B32_e32 %4, 16, %2
15///
16/// Replace:
17/// V_ADD_CO_U32_sdwa %4, %1, %3
18/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
19///
20//===----------------------------------------------------------------------===//
21
22#include "SIPeepholeSDWA.h"
23#include "AMDGPU.h"
24#include "GCNSubtarget.h"
25#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/ADT/Statistic.h"
28#include "llvm/CodeGen/MachineFunctionPass.h"
29#include <optional>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "si-peephole-sdwa"
34
35STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
36STATISTIC(NumSDWAInstructionsPeepholed,
37 "Number of instruction converted to SDWA.");
38
39namespace {
40
41bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
42 const SIInstrInfo *TII);
43class SDWAOperand;
44class SDWADstOperand;
45
46using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
47using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;
48
49class SIPeepholeSDWA {
50private:
51 MachineRegisterInfo *MRI;
52 const SIRegisterInfo *TRI;
53 const SIInstrInfo *TII;
54
55 MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
56 SDWAOperandsMap PotentialMatches;
57 SmallVector<MachineInstr *, 8> ConvertedInstructions;
58
59 std::optional<int64_t> foldToImm(const MachineOperand &Op) const;
60
61 void matchSDWAOperands(MachineBasicBlock &MBB);
62 std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
63 void pseudoOpConvertToVOP2(MachineInstr &MI,
64 const GCNSubtarget &ST) const;
65 void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
66 MachineInstr *createSDWAVersion(MachineInstr &MI);
67 bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
68 void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
69
70public:
71 bool run(MachineFunction &MF);
72};
73
74class SIPeepholeSDWALegacy : public MachineFunctionPass {
75public:
76 static char ID;
77
78 SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}
79
80 StringRef getPassName() const override { return "SI Peephole SDWA"; }
81
82 bool runOnMachineFunction(MachineFunction &MF) override;
83
84 void getAnalysisUsage(AnalysisUsage &AU) const override {
85 AU.setPreservesCFG();
86 MachineFunctionPass::getAnalysisUsage(AU);
87 }
88};
89
90using namespace AMDGPU::SDWA;
91
92class SDWAOperand {
93private:
94 MachineOperand *Target; // Operand that would be used in converted instruction
95 MachineOperand *Replaced; // Operand that would be replace by Target
96
97 /// Returns true iff the SDWA selection of this SDWAOperand can be combined
98 /// with the SDWA selections of its uses in \p MI.
99 virtual bool canCombineSelections(const MachineInstr &MI,
100 const SIInstrInfo *TII) = 0;
101
102public:
103 SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
104 : Target(TargetOp), Replaced(ReplacedOp) {
105 assert(Target->isReg());
106 assert(Replaced->isReg());
107 }
108
109 virtual ~SDWAOperand() = default;
110
111 virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
112 const GCNSubtarget &ST,
113 SDWAOperandsMap *PotentialMatches = nullptr) = 0;
114 virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
115
116 MachineOperand *getTargetOperand() const { return Target; }
117 MachineOperand *getReplacedOperand() const { return Replaced; }
118 MachineInstr *getParentInst() const { return Target->getParent(); }
119
120 MachineRegisterInfo *getMRI() const {
121 return &getParentInst()->getMF()->getRegInfo();
122 }
123
124#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
125 virtual void print(raw_ostream& OS) const = 0;
126 void dump() const { print(dbgs()); }
127#endif
128};
129
130class SDWASrcOperand : public SDWAOperand {
131private:
132 SdwaSel SrcSel;
133 bool Abs;
134 bool Neg;
135 bool Sext;
136
137public:
138 SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
139 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
140 bool Sext_ = false)
141 : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
142 Neg(Neg_), Sext(Sext_) {}
143
144 MachineInstr *potentialToConvert(const SIInstrInfo *TII,
145 const GCNSubtarget &ST,
146 SDWAOperandsMap *PotentialMatches = nullptr) override;
147 bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
148 bool canCombineSelections(const MachineInstr &MI,
149 const SIInstrInfo *TII) override;
150
151 SdwaSel getSrcSel() const { return SrcSel; }
152 bool getAbs() const { return Abs; }
153 bool getNeg() const { return Neg; }
154 bool getSext() const { return Sext; }
155
156 uint64_t getSrcMods(const SIInstrInfo *TII,
157 const MachineOperand *SrcOp) const;
158
159#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
160 void print(raw_ostream& OS) const override;
161#endif
162};
163
164class SDWADstOperand : public SDWAOperand {
165private:
166 SdwaSel DstSel;
167 DstUnused DstUn;
168
169public:
170 SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
171 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
172 : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
173
174 MachineInstr *potentialToConvert(const SIInstrInfo *TII,
175 const GCNSubtarget &ST,
176 SDWAOperandsMap *PotentialMatches = nullptr) override;
177 bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
178 bool canCombineSelections(const MachineInstr &MI,
179 const SIInstrInfo *TII) override;
180
181 SdwaSel getDstSel() const { return DstSel; }
182 DstUnused getDstUnused() const { return DstUn; }
183
184#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
185 void print(raw_ostream& OS) const override;
186#endif
187};
188
189class SDWADstPreserveOperand : public SDWADstOperand {
190private:
191 MachineOperand *Preserve;
192
193public:
194 SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
195 MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
196 : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
197 Preserve(PreserveOp) {}
198
199 bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
200 bool canCombineSelections(const MachineInstr &MI,
201 const SIInstrInfo *TII) override;
202
203 MachineOperand *getPreservedOperand() const { return Preserve; }
204
205#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
206 void print(raw_ostream& OS) const override;
207#endif
208};
209
210} // end anonymous namespace
211
212INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
213 false)
214
215char SIPeepholeSDWALegacy::ID = 0;
216
217char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;
218
219FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
220 return new SIPeepholeSDWALegacy();
221}
222
223#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
224static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
225 switch(Sel) {
226 case BYTE_0: OS << "BYTE_0"; break;
227 case BYTE_1: OS << "BYTE_1"; break;
228 case BYTE_2: OS << "BYTE_2"; break;
229 case BYTE_3: OS << "BYTE_3"; break;
230 case WORD_0: OS << "WORD_0"; break;
231 case WORD_1: OS << "WORD_1"; break;
232 case DWORD: OS << "DWORD"; break;
233 }
234 return OS;
235}
236
237static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
238 switch(Un) {
239 case UNUSED_PAD: OS << "UNUSED_PAD"; break;
240 case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
241 case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
242 }
243 return OS;
244}
245
246LLVM_DUMP_METHOD
247void SDWASrcOperand::print(raw_ostream& OS) const {
248 OS << "SDWA src: " << *getTargetOperand()
249 << " src_sel:" << getSrcSel()
250 << " abs:" << getAbs() << " neg:" << getNeg()
251 << " sext:" << getSext() << '\n';
252}
253
254LLVM_DUMP_METHOD
255void SDWADstOperand::print(raw_ostream& OS) const {
256 OS << "SDWA dst: " << *getTargetOperand()
257 << " dst_sel:" << getDstSel()
258 << " dst_unused:" << getDstUnused() << '\n';
259}
260
261LLVM_DUMP_METHOD
262void SDWADstPreserveOperand::print(raw_ostream& OS) const {
263 OS << "SDWA preserve dst: " << *getTargetOperand()
264 << " dst_sel:" << getDstSel()
265 << " preserve:" << *getPreservedOperand() << '\n';
266}
267
268#endif
269
270static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
271 assert(To.isReg() && From.isReg());
272 To.setReg(From.getReg());
273 To.setSubReg(From.getSubReg());
274 To.setIsUndef(From.isUndef());
275 if (To.isUse()) {
276 To.setIsKill(From.isKill());
277 } else {
278 To.setIsDead(From.isDead());
279 }
280}
281
282static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
283 return LHS.isReg() &&
284 RHS.isReg() &&
285 LHS.getReg() == RHS.getReg() &&
286 LHS.getSubReg() == RHS.getSubReg();
287}
288
289static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
290 const MachineRegisterInfo *MRI) {
291 if (!Reg->isReg() || !Reg->isDef())
292 return nullptr;
293
294 return MRI->getOneNonDBGUse(RegNo: Reg->getReg());
295}
296
297static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
298 const MachineRegisterInfo *MRI) {
299 if (!Reg->isReg())
300 return nullptr;
301
302 return MRI->getOneDef(Reg: Reg->getReg());
303}
304
305/// Combine an SDWA instruction's existing SDWA selection \p Sel with
306/// the SDWA selection \p OperandSel of its operand. If the selections
307/// are compatible, return the combined selection, otherwise return a
308/// nullopt.
309/// For example, if we have Sel = BYTE_0 Sel and OperandSel = WORD_1:
310/// BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
311static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
312 if (Sel == SdwaSel::DWORD)
313 return OperandSel;
314
315 if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
316 return Sel;
317
318 if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
319 Sel == SdwaSel::BYTE_3)
320 return {};
321
322 if (OperandSel == SdwaSel::WORD_0)
323 return Sel;
324
325 if (OperandSel == SdwaSel::WORD_1) {
326 if (Sel == SdwaSel::BYTE_0)
327 return SdwaSel::BYTE_2;
328 if (Sel == SdwaSel::BYTE_1)
329 return SdwaSel::BYTE_3;
330 if (Sel == SdwaSel::WORD_0)
331 return SdwaSel::WORD_1;
332 }
333
334 return {};
335}
336
337uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
338 const MachineOperand *SrcOp) const {
339 uint64_t Mods = 0;
340 const auto *MI = SrcOp->getParent();
341 if (TII->getNamedOperand(MI: *MI, OperandName: AMDGPU::OpName::src0) == SrcOp) {
342 if (auto *Mod = TII->getNamedOperand(MI: *MI, OperandName: AMDGPU::OpName::src0_modifiers)) {
343 Mods = Mod->getImm();
344 }
345 } else if (TII->getNamedOperand(MI: *MI, OperandName: AMDGPU::OpName::src1) == SrcOp) {
346 if (auto *Mod = TII->getNamedOperand(MI: *MI, OperandName: AMDGPU::OpName::src1_modifiers)) {
347 Mods = Mod->getImm();
348 }
349 }
350 if (Abs || Neg) {
351 assert(!Sext &&
352 "Float and integer src modifiers can't be set simultaneously");
353 Mods |= Abs ? SISrcMods::ABS : 0u;
354 Mods ^= Neg ? SISrcMods::NEG : 0u;
355 } else if (Sext) {
356 Mods |= SISrcMods::SEXT;
357 }
358
359 return Mods;
360}
361
362MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
363 const GCNSubtarget &ST,
364 SDWAOperandsMap *PotentialMatches) {
365 if (PotentialMatches != nullptr) {
366 // Fill out the map for all uses if all can be converted
367 MachineOperand *Reg = getReplacedOperand();
368 if (!Reg->isReg() || !Reg->isDef())
369 return nullptr;
370
371 for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg: Reg->getReg()))
372 // Check that all instructions that use Reg can be converted
373 if (!isConvertibleToSDWA(MI&: UseMI, ST, TII) ||
374 !canCombineSelections(MI: UseMI, TII))
375 return nullptr;
376
377 // Now that it's guaranteed all uses are legal, iterate over the uses again
378 // to add them for later conversion.
379 for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg: Reg->getReg())) {
380 // Should not get a subregister here
381 assert(isSameReg(UseMO, *Reg));
382
383 SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
384 MachineInstr *UseMI = UseMO.getParent();
385 potentialMatchesMap[UseMI].push_back(Elt: this);
386 }
387 return nullptr;
388 }
389
390 // For SDWA src operand potential instruction is one that use register
391 // defined by parent instruction
392 MachineOperand *PotentialMO = findSingleRegUse(Reg: getReplacedOperand(), MRI: getMRI());
393 if (!PotentialMO)
394 return nullptr;
395
396 MachineInstr *Parent = PotentialMO->getParent();
397
398 return canCombineSelections(MI: *Parent, TII) ? Parent : nullptr;
399}
400
401bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
402 assert((!Sext || !TII->getSubtarget().zeroesHigh16BitsOfDest(
403 getParentInst()->getOpcode())) &&
404 "Cannot use sign-extension with instruction that zeroes high bits");
405 switch (MI.getOpcode()) {
406 case AMDGPU::V_CVT_F32_FP8_sdwa:
407 case AMDGPU::V_CVT_F32_BF8_sdwa:
408 case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
409 case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
410 // Does not support input modifiers: noabs, noneg, nosext.
411 return false;
412 case AMDGPU::V_CNDMASK_B32_sdwa:
413 // SISrcMods uses the same bitmask for SEXT and NEG modifiers and
414 // hence the compiler can only support one type of modifier for
415 // each SDWA instruction. For V_CNDMASK_B32_sdwa, this is NEG
416 // since its operands get printed using
417 // AMDGPUInstPrinter::printOperandAndFPInputMods which produces
418 // the output intended for NEG if SEXT is set.
419 //
420 // The ISA does actually support both modifiers on most SDWA
421 // instructions.
422 //
423 // FIXME Accept SEXT here after fixing this issue.
424 if (Sext)
425 return false;
426 break;
427 }
428
429 // Find operand in instruction that matches source operand and replace it with
430 // target operand. Set corresponding src_sel
431 bool IsPreserveSrc = false;
432 MachineOperand *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
433 MachineOperand *SrcSel = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_sel);
434 MachineOperand *SrcMods =
435 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
436 assert(Src && (Src->isReg() || Src->isImm()));
437 if (!isSameReg(LHS: *Src, RHS: *getReplacedOperand())) {
438 // If this is not src0 then it could be src1
439 Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
440 SrcSel = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_sel);
441 SrcMods = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers);
442
443 if (!Src ||
444 !isSameReg(LHS: *Src, RHS: *getReplacedOperand())) {
445 // It's possible this Src is a tied operand for
446 // UNUSED_PRESERVE, in which case we can either
447 // abandon the peephole attempt, or if legal we can
448 // copy the target operand into the tied slot
449 // if the preserve operation will effectively cause the same
450 // result by overwriting the rest of the dst.
451 MachineOperand *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
452 MachineOperand *DstUnused =
453 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_unused);
454
455 if (Dst &&
456 DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
457 // This will work if the tied src is accessing WORD_0, and the dst is
458 // writing WORD_1. Modifiers don't matter because all the bits that
459 // would be impacted are being overwritten by the dst.
460 // Any other case will not work.
461 SdwaSel DstSel = static_cast<SdwaSel>(
462 TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::dst_sel));
463 if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
464 getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
465 IsPreserveSrc = true;
466 auto DstIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
467 Name: AMDGPU::OpName::vdst);
468 auto TiedIdx = MI.findTiedOperandIdx(OpIdx: DstIdx);
469 Src = &MI.getOperand(i: TiedIdx);
470 SrcSel = nullptr;
471 SrcMods = nullptr;
472 } else {
473 // Not legal to convert this src
474 return false;
475 }
476 }
477 }
478 assert(Src && Src->isReg());
479
480 if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
481 MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
482 MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
483 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
484 !isSameReg(LHS: *Src, RHS: *getReplacedOperand())) {
485 // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
486 // src2. This is not allowed.
487 return false;
488 }
489
490 assert(isSameReg(*Src, *getReplacedOperand()) &&
491 (IsPreserveSrc || (SrcSel && SrcMods)));
492 }
493 copyRegOperand(To&: *Src, From: *getTargetOperand());
494 if (!IsPreserveSrc) {
495 SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
496 SrcSel->setImm(*combineSdwaSel(Sel: ExistingSel, OperandSel: getSrcSel()));
497 SrcMods->setImm(getSrcMods(TII, SrcOp: Src));
498 }
499 getTargetOperand()->setIsKill(false);
500 return true;
501}
502
503/// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA
504/// instruction \p MI can be combined with the selection \p OpSel.
505static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
506 AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
507 assert(TII->isSDWA(MI.getOpcode()));
508
509 const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, OperandName: SrcSelOpName);
510 SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());
511
512 return combineSdwaSel(Sel: SrcSel, OperandSel: OpSel).has_value();
513}
514
515/// Verify that \p Op is the same register as the operand of the SDWA
516/// instruction \p MI named by \p SrcOpName and that the SDWA
517/// selection \p SrcSelOpName can be combined with the \p OpSel.
518static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
519 AMDGPU::OpName SrcOpName,
520 AMDGPU::OpName SrcSelOpName, MachineOperand *Op,
521 SdwaSel OpSel) {
522 assert(TII->isSDWA(MI.getOpcode()));
523
524 const MachineOperand *Src = TII->getNamedOperand(MI, OperandName: SrcOpName);
525 if (!Src || !isSameReg(LHS: *Src, RHS: *Op))
526 return true;
527
528 return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
529}
530
531bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
532 const SIInstrInfo *TII) {
533 if (!TII->isSDWA(Opcode: MI.getOpcode()))
534 return true;
535
536 using namespace AMDGPU;
537
538 return canCombineOpSel(MI, TII, SrcOpName: OpName::src0, SrcSelOpName: OpName::src0_sel,
539 Op: getReplacedOperand(), OpSel: getSrcSel()) &&
540 canCombineOpSel(MI, TII, SrcOpName: OpName::src1, SrcSelOpName: OpName::src1_sel,
541 Op: getReplacedOperand(), OpSel: getSrcSel());
542}
543
544MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
545 const GCNSubtarget &ST,
546 SDWAOperandsMap *PotentialMatches) {
547 // For SDWA dst operand potential instruction is one that defines register
548 // that this operand uses
549 MachineRegisterInfo *MRI = getMRI();
550 MachineInstr *ParentMI = getParentInst();
551
552 MachineOperand *PotentialMO = findSingleRegDef(Reg: getReplacedOperand(), MRI);
553 if (!PotentialMO)
554 return nullptr;
555
556 // Check that ParentMI is the only instruction that uses replaced register
557 for (MachineInstr &UseInst : MRI->use_nodbg_instructions(Reg: PotentialMO->getReg())) {
558 if (&UseInst != ParentMI)
559 return nullptr;
560 }
561
562 MachineInstr *Parent = PotentialMO->getParent();
563 return canCombineSelections(MI: *Parent, TII) ? Parent : nullptr;
564}
565
566bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
567 // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
568
569 if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
570 MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
571 MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
572 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
573 getDstSel() != AMDGPU::SDWA::DWORD) {
574 // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
575 return false;
576 }
577
578 MachineOperand *Operand = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
579 assert(Operand &&
580 Operand->isReg() &&
581 isSameReg(*Operand, *getReplacedOperand()));
582 copyRegOperand(To&: *Operand, From: *getTargetOperand());
583 MachineOperand *DstSel= TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_sel);
584 assert(DstSel);
585
586 SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
587 DstSel->setImm(combineSdwaSel(Sel: ExistingSel, OperandSel: getDstSel()).value());
588
589 MachineOperand *DstUnused= TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_unused);
590 assert(DstUnused);
591 DstUnused->setImm(getDstUnused());
592
593 // Remove original instruction because it would conflict with our new
594 // instruction by register definition
595 getParentInst()->eraseFromParent();
596 return true;
597}
598
599bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
600 const SIInstrInfo *TII) {
601 if (!TII->isSDWA(Opcode: MI.getOpcode()))
602 return true;
603
604 return canCombineOpSel(MI, TII, SrcSelOpName: AMDGPU::OpName::dst_sel, OpSel: getDstSel());
605}
606
607bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
608 const SIInstrInfo *TII) {
609 // MI should be moved right before v_or_b32.
610 // For this we should clear all kill flags on uses of MI src-operands or else
611 // we can encounter problem with use of killed operand.
612 for (MachineOperand &MO : MI.uses()) {
613 if (!MO.isReg())
614 continue;
615 getMRI()->clearKillFlags(Reg: MO.getReg());
616 }
617
618 // Move MI before v_or_b32
619 MI.getParent()->remove(I: &MI);
620 getParentInst()->getParent()->insert(I: getParentInst(), MI: &MI);
621
622 // Add Implicit use of preserved register
623 MachineInstrBuilder MIB(*MI.getMF(), MI);
624 MIB.addReg(RegNo: getPreservedOperand()->getReg(),
625 Flags: RegState::ImplicitKill,
626 SubReg: getPreservedOperand()->getSubReg());
627
628 // Tie dst to implicit use
629 MI.tieOperands(DefIdx: AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdst),
630 UseIdx: MI.getNumOperands() - 1);
631
632 // Convert MI as any other SDWADstOperand and remove v_or_b32
633 return SDWADstOperand::convertToSDWA(MI, TII);
634}
635
636bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
637 const SIInstrInfo *TII) {
638 return SDWADstOperand::canCombineSelections(MI, TII);
639}
640
641std::optional<int64_t>
642SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
643 if (Op.isImm()) {
644 return Op.getImm();
645 }
646
647 // If this is not immediate then it can be copy of immediate value, e.g.:
648 // %1 = S_MOV_B32 255;
649 if (Op.isReg()) {
650 for (const MachineOperand &Def : MRI->def_operands(Reg: Op.getReg())) {
651 if (!isSameReg(LHS: Op, RHS: Def))
652 continue;
653
654 const MachineInstr *DefInst = Def.getParent();
655 if (!TII->isFoldableCopy(MI: *DefInst))
656 return std::nullopt;
657
658 const MachineOperand &Copied = DefInst->getOperand(i: 1);
659 if (!Copied.isImm())
660 return std::nullopt;
661
662 return Copied.getImm();
663 }
664 }
665
666 return std::nullopt;
667}
668
669std::unique_ptr<SDWAOperand>
670SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
671 unsigned Opcode = MI.getOpcode();
672 switch (Opcode) {
673 case AMDGPU::V_LSHRREV_B32_e32:
674 case AMDGPU::V_ASHRREV_I32_e32:
675 case AMDGPU::V_LSHLREV_B32_e32:
676 case AMDGPU::V_LSHRREV_B32_e64:
677 case AMDGPU::V_ASHRREV_I32_e64:
678 case AMDGPU::V_LSHLREV_B32_e64: {
679 // from: v_lshrrev_b32_e32 v1, 16/24, v0
680 // to SDWA src:v0 src_sel:WORD_1/BYTE_3
681
682 // from: v_ashrrev_i32_e32 v1, 16/24, v0
683 // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
684
685 // from: v_lshlrev_b32_e32 v1, 16/24, v0
686 // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
687 MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
688 auto Imm = foldToImm(Op: *Src0);
689 if (!Imm)
690 break;
691
692 if (*Imm != 16 && *Imm != 24)
693 break;
694
695 MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
696 MachineOperand *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
697 if (!Src1->isReg() || Src1->getReg().isPhysical() ||
698 Dst->getReg().isPhysical())
699 break;
700
701 if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
702 Opcode == AMDGPU::V_LSHLREV_B32_e64) {
703 return std::make_unique<SDWADstOperand>(
704 args&: Dst, args&: Src1, args: *Imm == 16 ? WORD_1 : BYTE_3, args: UNUSED_PAD);
705 }
706 return std::make_unique<SDWASrcOperand>(
707 args&: Src1, args&: Dst, args: *Imm == 16 ? WORD_1 : BYTE_3, args: false, args: false,
708 args: Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
709 Opcode != AMDGPU::V_LSHRREV_B32_e64);
710 break;
711 }
712
713 case AMDGPU::V_LSHRREV_B16_e32:
714 case AMDGPU::V_LSHLREV_B16_e32:
715 case AMDGPU::V_LSHRREV_B16_e64:
716 case AMDGPU::V_LSHRREV_B16_opsel_e64:
717 case AMDGPU::V_LSHLREV_B16_opsel_e64:
718 case AMDGPU::V_LSHLREV_B16_e64: {
719 // V_ASHRREV_I16_e32 and V_ASHRREV_I16_e64 are
720 // not included here because they zero-fill the high 16-bits.
721
722 // from: v_lshrrev_b16_e32 v1, 8, v0
723 // to SDWA src:v0 src_sel:BYTE_1
724
725 // from: v_lshlrev_b16_e32 v1, 8, v0
726 // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
727 MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
728 auto Imm = foldToImm(Op: *Src0);
729 if (!Imm || *Imm != 8)
730 break;
731
732 MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
733 MachineOperand *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
734
735 if (!Src1->isReg() || Src1->getReg().isPhysical() ||
736 Dst->getReg().isPhysical())
737 break;
738
739 if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
740 Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
741 Opcode == AMDGPU::V_LSHLREV_B16_e64)
742 return std::make_unique<SDWADstOperand>(args&: Dst, args&: Src1, args: BYTE_1, args: UNUSED_PAD);
743 return std::make_unique<SDWASrcOperand>(args&: Src1, args&: Dst, args: BYTE_1, args: false, args: false,
744 args: false);
745 break;
746 }
747
748 case AMDGPU::V_BFE_I32_e64:
749 case AMDGPU::V_BFE_U32_e64: {
750 // e.g.:
751 // from: v_bfe_u32 v1, v0, 8, 8
752 // to SDWA src:v0 src_sel:BYTE_1
753
754 // offset | width | src_sel
755 // ------------------------
756 // 0 | 8 | BYTE_0
757 // 0 | 16 | WORD_0
758 // 0 | 32 | DWORD ?
759 // 8 | 8 | BYTE_1
760 // 16 | 8 | BYTE_2
761 // 16 | 16 | WORD_1
762 // 24 | 8 | BYTE_3
763
764 MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
765 auto Offset = foldToImm(Op: *Src1);
766 if (!Offset)
767 break;
768
769 MachineOperand *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
770 auto Width = foldToImm(Op: *Src2);
771 if (!Width)
772 break;
773
774 SdwaSel SrcSel = DWORD;
775
776 if (*Offset == 0 && *Width == 8)
777 SrcSel = BYTE_0;
778 else if (*Offset == 0 && *Width == 16)
779 SrcSel = WORD_0;
780 else if (*Offset == 0 && *Width == 32)
781 SrcSel = DWORD;
782 else if (*Offset == 8 && *Width == 8)
783 SrcSel = BYTE_1;
784 else if (*Offset == 16 && *Width == 8)
785 SrcSel = BYTE_2;
786 else if (*Offset == 16 && *Width == 16)
787 SrcSel = WORD_1;
788 else if (*Offset == 24 && *Width == 8)
789 SrcSel = BYTE_3;
790 else
791 break;
792
793 MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
794 MachineOperand *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
795
796 if (!Src0->isReg() || Src0->getReg().isPhysical() ||
797 Dst->getReg().isPhysical())
798 break;
799
800 return std::make_unique<SDWASrcOperand>(
801 args&: Src0, args&: Dst, args&: SrcSel, args: false, args: false, args: Opcode != AMDGPU::V_BFE_U32_e64);
802 }
803
804 case AMDGPU::V_AND_B32_e32:
805 case AMDGPU::V_AND_B32_e64: {
806 // e.g.:
807 // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
808 // to SDWA src:v0 src_sel:WORD_0/BYTE_0
809
810 MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
811 MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
812 auto *ValSrc = Src1;
813 auto Imm = foldToImm(Op: *Src0);
814
815 if (!Imm) {
816 Imm = foldToImm(Op: *Src1);
817 ValSrc = Src0;
818 }
819
820 if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
821 break;
822
823 MachineOperand *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
824
825 if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
826 Dst->getReg().isPhysical())
827 break;
828
829 return std::make_unique<SDWASrcOperand>(
830 args&: ValSrc, args&: Dst, args: *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
831 }
832
833 case AMDGPU::V_OR_B32_e32:
834 case AMDGPU::V_OR_B32_e64: {
835 // Patterns for dst_unused:UNUSED_PRESERVE.
836 // e.g., from:
837 // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
838 // src1_sel:WORD_1 src2_sel:WORD1
839 // v_add_f16_e32 v3, v1, v2
840 // v_or_b32_e32 v4, v0, v3
841 // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
842
843 // Check if one of operands of v_or_b32 is SDWA instruction
844 using CheckRetType =
845 std::optional<std::pair<MachineOperand *, MachineOperand *>>;
846 auto CheckOROperandsForSDWA =
847 [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
848 if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
849 return CheckRetType(std::nullopt);
850
851 MachineOperand *Op1Def = findSingleRegDef(Reg: Op1, MRI);
852 if (!Op1Def)
853 return CheckRetType(std::nullopt);
854
855 MachineInstr *Op1Inst = Op1Def->getParent();
856 if (!TII->isSDWA(MI: *Op1Inst))
857 return CheckRetType(std::nullopt);
858
859 MachineOperand *Op2Def = findSingleRegDef(Reg: Op2, MRI);
860 if (!Op2Def)
861 return CheckRetType(std::nullopt);
862
863 return CheckRetType(std::pair(Op1Def, Op2Def));
864 };
865
866 MachineOperand *OrSDWA = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
867 MachineOperand *OrOther = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
868 assert(OrSDWA && OrOther);
869 auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
870 if (!Res) {
871 OrSDWA = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
872 OrOther = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
873 assert(OrSDWA && OrOther);
874 Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
875 if (!Res)
876 break;
877 }
878
879 MachineOperand *OrSDWADef = Res->first;
880 MachineOperand *OrOtherDef = Res->second;
881 assert(OrSDWADef && OrOtherDef);
882
883 MachineInstr *SDWAInst = OrSDWADef->getParent();
884 MachineInstr *OtherInst = OrOtherDef->getParent();
885
886 // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
887 // destination patterns don't overlap. Compatible instruction can be either
888 // regular instruction with compatible bitness or SDWA instruction with
889 // correct dst_sel
890 // SDWAInst | OtherInst bitness / OtherInst dst_sel
891 // -----------------------------------------------------
892 // DWORD | no / no
893 // WORD_0 | no / BYTE_2/3, WORD_1
894 // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0
895 // BYTE_0 | no / BYTE_1/2/3, WORD_1
896 // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1
897 // BYTE_2 | 8/16-bit / BYTE_0/1/3. WORD_0
898 // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0
899 // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
900 // but v_add_f32 is not.
901
902 // TODO: add support for non-SDWA instructions as OtherInst.
903 // For now this only works with SDWA instructions. For regular instructions
904 // there is no way to determine if the instruction writes only 8/16/24-bit
905 // out of full register size and all registers are at min 32-bit wide.
906 if (!TII->isSDWA(MI: *OtherInst))
907 break;
908
909 SdwaSel DstSel = static_cast<SdwaSel>(
910 TII->getNamedImmOperand(MI: *SDWAInst, OperandName: AMDGPU::OpName::dst_sel));
911 SdwaSel OtherDstSel = static_cast<SdwaSel>(
912 TII->getNamedImmOperand(MI: *OtherInst, OperandName: AMDGPU::OpName::dst_sel));
913
914 bool DstSelAgree = false;
915 switch (DstSel) {
916 case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
917 (OtherDstSel == BYTE_3) ||
918 (OtherDstSel == WORD_1));
919 break;
920 case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
921 (OtherDstSel == BYTE_1) ||
922 (OtherDstSel == WORD_0));
923 break;
924 case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
925 (OtherDstSel == BYTE_2) ||
926 (OtherDstSel == BYTE_3) ||
927 (OtherDstSel == WORD_1));
928 break;
929 case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
930 (OtherDstSel == BYTE_2) ||
931 (OtherDstSel == BYTE_3) ||
932 (OtherDstSel == WORD_1));
933 break;
934 case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
935 (OtherDstSel == BYTE_1) ||
936 (OtherDstSel == BYTE_3) ||
937 (OtherDstSel == WORD_0));
938 break;
939 case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
940 (OtherDstSel == BYTE_1) ||
941 (OtherDstSel == BYTE_2) ||
942 (OtherDstSel == WORD_0));
943 break;
944 default: DstSelAgree = false;
945 }
946
947 if (!DstSelAgree)
948 break;
949
950 // Also OtherInst dst_unused should be UNUSED_PAD
951 DstUnused OtherDstUnused = static_cast<DstUnused>(
952 TII->getNamedImmOperand(MI: *OtherInst, OperandName: AMDGPU::OpName::dst_unused));
953 if (OtherDstUnused != DstUnused::UNUSED_PAD)
954 break;
955
956 // Create DstPreserveOperand
957 MachineOperand *OrDst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
958 assert(OrDst && OrDst->isReg());
959
960 return std::make_unique<SDWADstPreserveOperand>(
961 args&: OrDst, args&: OrSDWADef, args&: OrOtherDef, args&: DstSel);
962
963 }
964 }
965
966 return std::unique_ptr<SDWAOperand>(nullptr);
967}
968
969#if !defined(NDEBUG)
970static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
971 Operand.print(OS);
972 return OS;
973}
974#endif
975
976void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
977 for (MachineInstr &MI : MBB) {
978 if (auto Operand = matchSDWAOperand(MI)) {
979 LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
980 SDWAOperands[&MI] = std::move(Operand);
981 ++NumSDWAPatternsFound;
982 }
983 }
984}
985
986// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
987// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
988// V_ADD_CO_U32_sdwa.
989//
990// We are transforming from a VOP3 into a VOP2 form of the instruction.
991// %19:vgpr_32 = V_AND_B32_e32 255,
992// killed %16:vgpr_32, implicit $exec
993// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
994// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
995// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
996// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
997//
998// becomes
999// %47:vgpr_32 = V_ADD_CO_U32_sdwa
1000// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
1001// implicit-def $vcc, implicit $exec
1002// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
1003// %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
1004void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
1005 const GCNSubtarget &ST) const {
1006 int Opc = MI.getOpcode();
1007 assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
1008 "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");
1009
1010 // Can the candidate MI be shrunk?
1011 if (!TII->canShrink(MI, MRI: *MRI))
1012 return;
1013 Opc = AMDGPU::getVOPe32(Opcode: Opc);
1014 // Find the related ADD instruction.
1015 const MachineOperand *Sdst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst);
1016 if (!Sdst)
1017 return;
1018 MachineOperand *NextOp = findSingleRegUse(Reg: Sdst, MRI);
1019 if (!NextOp)
1020 return;
1021 MachineInstr &MISucc = *NextOp->getParent();
1022
1023 // Make sure the carry in/out are subsequently unused.
1024 MachineOperand *CarryIn = TII->getNamedOperand(MI&: MISucc, OperandName: AMDGPU::OpName::src2);
1025 if (!CarryIn)
1026 return;
1027 MachineOperand *CarryOut = TII->getNamedOperand(MI&: MISucc, OperandName: AMDGPU::OpName::sdst);
1028 if (!CarryOut)
1029 return;
1030 if (!MRI->hasOneNonDBGUse(RegNo: CarryIn->getReg()) ||
1031 !MRI->use_nodbg_empty(RegNo: CarryOut->getReg()))
1032 return;
1033 // Make sure VCC or its subregs are dead before MI.
1034 MachineBasicBlock &MBB = *MI.getParent();
1035 MachineBasicBlock::LivenessQueryResult Liveness =
1036 MBB.computeRegisterLiveness(TRI, Reg: AMDGPU::VCC, Before: MI, Neighborhood: 25);
1037 if (Liveness != MachineBasicBlock::LQR_Dead)
1038 return;
1039 // Check if VCC is referenced in range of (MI,MISucc].
1040 for (auto I = std::next(x: MI.getIterator()), E = MISucc.getIterator();
1041 I != E; ++I) {
1042 if (I->modifiesRegister(Reg: AMDGPU::VCC, TRI))
1043 return;
1044 }
1045
1046 // Replace MI with V_{SUB|ADD}_I32_e32
1047 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc))
1048 .add(MO: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst))
1049 .add(MO: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0))
1050 .add(MO: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1))
1051 .setMIFlags(MI.getFlags());
1052
1053 MI.eraseFromParent();
1054
1055 // Since the carry output of MI is now VCC, update its use in MISucc.
1056
1057 MISucc.substituteRegister(FromReg: CarryIn->getReg(), ToReg: TRI->getVCC(), SubIdx: 0, RegInfo: *TRI);
1058}
1059
1060/// Try to convert an \p MI in VOP3 which takes an src2 carry-in
1061/// operand into the corresponding VOP2 form which expects the
1062/// argument in VCC. To this end, add an copy from the carry-in to
1063/// VCC. The conversion will only be applied if \p MI can be shrunk
1064/// to VOP2 and if VCC can be proven to be dead before \p MI.
1065void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
1066 const GCNSubtarget &ST) const {
1067 assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
1068
1069 LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
1070 if (!TII->canShrink(MI, MRI: *MRI)) {
1071 LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
1072 return;
1073 }
1074
1075 const MachineOperand &CarryIn =
1076 *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
1077 Register CarryReg = CarryIn.getReg();
1078 MachineInstr *CarryDef = MRI->getVRegDef(Reg: CarryReg);
1079 if (!CarryDef) {
1080 LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
1081 return;
1082 }
1083
1084 // Make sure VCC or its subregs are dead before MI.
1085 MCRegister Vcc = TRI->getVCC();
1086 MachineBasicBlock &MBB = *MI.getParent();
1087 MachineBasicBlock::LivenessQueryResult Liveness =
1088 MBB.computeRegisterLiveness(TRI, Reg: Vcc, Before: MI);
1089 if (Liveness != MachineBasicBlock::LQR_Dead) {
1090 LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
1091 return;
1092 }
1093
1094 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: Vcc).add(MO: CarryIn);
1095
1096 auto Converted = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(),
1097 MCID: TII->get(Opcode: AMDGPU::getVOPe32(Opcode: MI.getOpcode())))
1098 .add(MO: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst))
1099 .add(MO: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0))
1100 .add(MO: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1))
1101 .setMIFlags(MI.getFlags());
1102 TII->fixImplicitOperands(MI&: *Converted);
1103 LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
1104 (void)Converted;
1105 MI.eraseFromParent();
1106}
1107
1108namespace {
1109bool isConvertibleToSDWA(MachineInstr &MI,
1110 const GCNSubtarget &ST,
1111 const SIInstrInfo* TII) {
1112 // Check if this is already an SDWA instruction
1113 unsigned Opc = MI.getOpcode();
1114 if (TII->isSDWA(Opcode: Opc))
1115 return true;
1116
1117 // Can only be handled after ealier conversion to
1118 // AMDGPU::V_CNDMASK_B32_e32 which is not always possible.
1119 if (Opc == AMDGPU::V_CNDMASK_B32_e64)
1120 return false;
1121
1122 // Check if this instruction has opcode that supports SDWA
1123 if (AMDGPU::getSDWAOp(Opcode: Opc) == -1)
1124 Opc = AMDGPU::getVOPe32(Opcode: Opc);
1125
1126 if (AMDGPU::getSDWAOp(Opcode: Opc) == -1)
1127 return false;
1128
1129 if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, OpName: AMDGPU::OpName::omod))
1130 return false;
1131
1132 if (TII->isVOPC(Opcode: Opc)) {
1133 if (!ST.hasSDWASdst()) {
1134 const MachineOperand *SDst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst);
1135 if (SDst && (SDst->getReg() != AMDGPU::VCC &&
1136 SDst->getReg() != AMDGPU::VCC_LO))
1137 return false;
1138 }
1139
1140 if (!ST.hasSDWAOutModsVOPC() &&
1141 (TII->hasModifiersSet(MI, OpName: AMDGPU::OpName::clamp) ||
1142 TII->hasModifiersSet(MI, OpName: AMDGPU::OpName::omod)))
1143 return false;
1144
1145 } else if (TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst) ||
1146 !TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)) {
1147 return false;
1148 }
1149
1150 if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
1151 Opc == AMDGPU::V_FMAC_F32_e32 ||
1152 Opc == AMDGPU::V_MAC_F16_e32 ||
1153 Opc == AMDGPU::V_MAC_F32_e32))
1154 return false;
1155
1156 // Check if target supports this SDWA opcode
1157 if (TII->pseudoToMCOpcode(Opcode: Opc) == -1)
1158 return false;
1159
1160 if (MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0)) {
1161 if (!Src0->isReg() && !Src0->isImm())
1162 return false;
1163 }
1164
1165 if (MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1)) {
1166 if (!Src1->isReg() && !Src1->isImm())
1167 return false;
1168 }
1169
1170 return true;
1171}
1172} // namespace
1173
1174MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
1175 unsigned Opcode = MI.getOpcode();
1176 assert(!TII->isSDWA(Opcode));
1177
1178 int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
1179 if (SDWAOpcode == -1)
1180 SDWAOpcode = AMDGPU::getSDWAOp(Opcode: AMDGPU::getVOPe32(Opcode));
1181 assert(SDWAOpcode != -1);
1182
1183 const MCInstrDesc &SDWADesc = TII->get(Opcode: SDWAOpcode);
1184
1185 // Create SDWA version of instruction MI and initialize its operands
1186 MachineInstrBuilder SDWAInst =
1187 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), MCID: SDWADesc)
1188 .setMIFlags(MI.getFlags());
1189
1190 // Copy dst, if it is present in original then should also be present in SDWA
1191 MachineOperand *Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
1192 if (Dst) {
1193 assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
1194 SDWAInst.add(MO: *Dst);
1195 } else if ((Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst))) {
1196 assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
1197 SDWAInst.add(MO: *Dst);
1198 } else {
1199 assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
1200 SDWAInst.addReg(RegNo: TRI->getVCC(), Flags: RegState::Define);
1201 }
1202
1203 // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
1204 // src0_modifiers (except for v_nop_sdwa, but it can't get here)
1205 MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
1206 assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
1207 AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
1208 if (auto *Mod = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers))
1209 SDWAInst.addImm(Val: Mod->getImm());
1210 else
1211 SDWAInst.addImm(Val: 0);
1212 SDWAInst.add(MO: *Src0);
1213
1214 // Copy src1 if present, initialize src1_modifiers.
1215 MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
1216 if (Src1) {
1217 assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
1218 AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
1219 if (auto *Mod = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers))
1220 SDWAInst.addImm(Val: Mod->getImm());
1221 else
1222 SDWAInst.addImm(Val: 0);
1223 SDWAInst.add(MO: *Src1);
1224 }
1225
1226 if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
1227 SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
1228 SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
1229 SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
1230 // v_mac_f16/32 has additional src2 operand tied to vdst
1231 MachineOperand *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
1232 assert(Src2);
1233 SDWAInst.add(MO: *Src2);
1234 }
1235
1236 // Copy clamp if present, initialize otherwise
1237 assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
1238 MachineOperand *Clamp = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
1239 if (Clamp) {
1240 SDWAInst.add(MO: *Clamp);
1241 } else {
1242 SDWAInst.addImm(Val: 0);
1243 }
1244
1245 // Copy omod if present, initialize otherwise if needed
1246 if (AMDGPU::hasNamedOperand(Opcode: SDWAOpcode, NamedIdx: AMDGPU::OpName::omod)) {
1247 MachineOperand *OMod = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
1248 if (OMod) {
1249 SDWAInst.add(MO: *OMod);
1250 } else {
1251 SDWAInst.addImm(Val: 0);
1252 }
1253 }
1254
1255 // Initialize SDWA specific operands
1256 if (AMDGPU::hasNamedOperand(Opcode: SDWAOpcode, NamedIdx: AMDGPU::OpName::dst_sel))
1257 SDWAInst.addImm(Val: AMDGPU::SDWA::SdwaSel::DWORD);
1258
1259 if (AMDGPU::hasNamedOperand(Opcode: SDWAOpcode, NamedIdx: AMDGPU::OpName::dst_unused))
1260 SDWAInst.addImm(Val: AMDGPU::SDWA::DstUnused::UNUSED_PAD);
1261
1262 assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
1263 SDWAInst.addImm(Val: AMDGPU::SDWA::SdwaSel::DWORD);
1264
1265 if (Src1) {
1266 assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
1267 SDWAInst.addImm(Val: AMDGPU::SDWA::SdwaSel::DWORD);
1268 }
1269
1270 // Check for a preserved register that needs to be copied.
1271 MachineInstr *Ret = SDWAInst.getInstr();
1272 TII->fixImplicitOperands(MI&: *Ret);
1273 return Ret;
1274}
1275
1276bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
1277 const SDWAOperandsVector &SDWAOperands) {
1278 LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
1279
1280 MachineInstr *SDWAInst;
1281 if (TII->isSDWA(Opcode: MI.getOpcode())) {
1282 // Clone the instruction to allow revoking changes
1283 // made to MI during the processing of the operands
1284 // if the conversion fails.
1285 SDWAInst = MI.getMF()->CloneMachineInstr(Orig: &MI);
1286 MI.getParent()->insert(I: MI.getIterator(), M: SDWAInst);
1287 } else {
1288 SDWAInst = createSDWAVersion(MI);
1289 }
1290
1291 // Apply all sdwa operand patterns.
1292 bool Converted = false;
1293 for (auto &Operand : SDWAOperands) {
1294 LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
1295 // There should be no intersection between SDWA operands and potential MIs
1296 // e.g.:
1297 // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
1298 // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
1299 // v_add_u32 v3, v4, v2
1300 //
1301 // In that example it is possible that we would fold 2nd instruction into
1302 // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that
1303 // was already destroyed). So if SDWAOperand is also a potential MI then do
1304 // not apply it.
1305 if (PotentialMatches.count(Key: Operand->getParentInst()) == 0)
1306 Converted |= Operand->convertToSDWA(MI&: *SDWAInst, TII);
1307 }
1308
1309 if (!Converted) {
1310 SDWAInst->eraseFromParent();
1311 return false;
1312 }
1313
1314 ConvertedInstructions.push_back(Elt: SDWAInst);
1315 for (MachineOperand &MO : SDWAInst->uses()) {
1316 if (!MO.isReg())
1317 continue;
1318
1319 MRI->clearKillFlags(Reg: MO.getReg());
1320 }
1321 LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
1322 ++NumSDWAInstructionsPeepholed;
1323
1324 MI.eraseFromParent();
1325 return true;
1326}
1327
1328// If an instruction was converted to SDWA it should not have immediates or SGPR
1329// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
1330void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
1331 const GCNSubtarget &ST) const {
1332 const MCInstrDesc &Desc = TII->get(Opcode: MI.getOpcode());
1333 unsigned ConstantBusCount = 0;
1334 for (MachineOperand &Op : MI.explicit_uses()) {
1335 if (Op.isReg()) {
1336 if (TRI->isVGPR(MRI: *MRI, Reg: Op.getReg()))
1337 continue;
1338
1339 if (ST.hasSDWAScalar() && ConstantBusCount == 0) {
1340 ++ConstantBusCount;
1341 continue;
1342 }
1343 } else if (!Op.isImm())
1344 continue;
1345
1346 unsigned I = Op.getOperandNo();
1347 const TargetRegisterClass *OpRC = TII->getRegClass(MCID: Desc, OpNum: I);
1348 if (!OpRC || !TRI->isVSSuperClass(RC: OpRC))
1349 continue;
1350
1351 Register VGPR = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
1352 auto Copy = BuildMI(BB&: *MI.getParent(), I: MI.getIterator(), MIMD: MI.getDebugLoc(),
1353 MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: VGPR);
1354 if (Op.isImm())
1355 Copy.addImm(Val: Op.getImm());
1356 else if (Op.isReg())
1357 Copy.addReg(RegNo: Op.getReg(), Flags: getKillRegState(B: Op.isKill()), SubReg: Op.getSubReg());
1358 Op.ChangeToRegister(Reg: VGPR, isDef: false);
1359 }
1360}
1361
1362bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
1363 if (skipFunction(F: MF.getFunction()))
1364 return false;
1365
1366 return SIPeepholeSDWA().run(MF);
1367}
1368
1369bool SIPeepholeSDWA::run(MachineFunction &MF) {
1370 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1371
1372 if (!ST.hasSDWA())
1373 return false;
1374
1375 MRI = &MF.getRegInfo();
1376 TRI = ST.getRegisterInfo();
1377 TII = ST.getInstrInfo();
1378
1379 // Find all SDWA operands in MF.
1380 bool Ret = false;
1381 for (MachineBasicBlock &MBB : MF) {
1382 bool Changed = false;
1383 do {
1384 // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
1385 // Look for a possible ADD or SUB that resulted from a previously lowered
1386 // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
1387 // lowers the pair of instructions into e32 form.
1388 matchSDWAOperands(MBB);
1389 for (const auto &OperandPair : SDWAOperands) {
1390 const auto &Operand = OperandPair.second;
1391 MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
1392 if (!PotentialMI)
1393 continue;
1394
1395 switch (PotentialMI->getOpcode()) {
1396 case AMDGPU::V_ADD_CO_U32_e64:
1397 case AMDGPU::V_SUB_CO_U32_e64:
1398 pseudoOpConvertToVOP2(MI&: *PotentialMI, ST);
1399 break;
1400 case AMDGPU::V_CNDMASK_B32_e64:
1401 convertVcndmaskToVOP2(MI&: *PotentialMI, ST);
1402 break;
1403 };
1404 }
1405 SDWAOperands.clear();
1406
1407 // Generate potential match list.
1408 matchSDWAOperands(MBB);
1409
1410 for (const auto &OperandPair : SDWAOperands) {
1411 const auto &Operand = OperandPair.second;
1412 MachineInstr *PotentialMI =
1413 Operand->potentialToConvert(TII, ST, PotentialMatches: &PotentialMatches);
1414
1415 if (PotentialMI && isConvertibleToSDWA(MI&: *PotentialMI, ST, TII))
1416 PotentialMatches[PotentialMI].push_back(Elt: Operand.get());
1417 }
1418
1419 for (auto &PotentialPair : PotentialMatches) {
1420 MachineInstr &PotentialMI = *PotentialPair.first;
1421 convertToSDWA(MI&: PotentialMI, SDWAOperands: PotentialPair.second);
1422 }
1423
1424 PotentialMatches.clear();
1425 SDWAOperands.clear();
1426
1427 Changed = !ConvertedInstructions.empty();
1428
1429 if (Changed)
1430 Ret = true;
1431 while (!ConvertedInstructions.empty())
1432 legalizeScalarOperands(MI&: *ConvertedInstructions.pop_back_val(), ST);
1433 } while (Changed);
1434 }
1435
1436 return Ret;
1437}
1438
1439PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF,
1440 MachineFunctionAnalysisManager &) {
1441 if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF))
1442 return PreservedAnalyses::all();
1443
1444 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
1445 PA.preserveSet<CFGAnalyses>();
1446 return PA;
1447}
1448