1 | //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file This pass tries to apply several peephole SDWA patterns. |
10 | /// |
11 | /// E.g. original: |
12 | /// V_LSHRREV_B32_e32 %0, 16, %1 |
13 | /// V_ADD_CO_U32_e32 %2, %0, %3 |
14 | /// V_LSHLREV_B32_e32 %4, 16, %2 |
15 | /// |
16 | /// Replace: |
17 | /// V_ADD_CO_U32_sdwa %4, %1, %3 |
18 | /// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD |
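///
/// Other patterns handled below (see matchSDWAOperand) include shifts by
/// 8/16/24, V_BFE_{I,U}32 with byte/word aligned offset and width, V_AND_B32
/// with a 0xff/0xffff mask, and V_OR_B32 dst_unused:UNUSED_PRESERVE patterns.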
19 | /// |
20 | //===----------------------------------------------------------------------===// |
21 | |
22 | #include "SIPeepholeSDWA.h" |
23 | #include "AMDGPU.h" |
24 | #include "GCNSubtarget.h" |
25 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
26 | #include "llvm/ADT/MapVector.h" |
27 | #include "llvm/ADT/Statistic.h" |
28 | #include "llvm/CodeGen/MachineFunctionPass.h" |
29 | #include <optional> |
30 | |
31 | using namespace llvm; |
32 | |
33 | #define DEBUG_TYPE "si-peephole-sdwa" |
34 | |
STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");
38 | |
39 | namespace { |
40 | |
41 | bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST, |
42 | const SIInstrInfo *TII); |
43 | class SDWAOperand; |
44 | class SDWADstOperand; |
45 | |
46 | using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; |
47 | using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>; |
48 | |
49 | class SIPeepholeSDWA { |
50 | private: |
51 | MachineRegisterInfo *MRI; |
52 | const SIRegisterInfo *TRI; |
53 | const SIInstrInfo *TII; |
54 | |
55 | MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; |
56 | SDWAOperandsMap PotentialMatches; |
57 | SmallVector<MachineInstr *, 8> ConvertedInstructions; |
58 | |
59 | std::optional<int64_t> foldToImm(const MachineOperand &Op) const; |
60 | |
61 | void matchSDWAOperands(MachineBasicBlock &MBB); |
62 | std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); |
63 | void pseudoOpConvertToVOP2(MachineInstr &MI, |
64 | const GCNSubtarget &ST) const; |
65 | void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const; |
66 | MachineInstr *createSDWAVersion(MachineInstr &MI); |
67 | bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); |
68 | void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; |
69 | |
70 | public: |
71 | bool run(MachineFunction &MF); |
72 | }; |
73 | |
74 | class SIPeepholeSDWALegacy : public MachineFunctionPass { |
75 | public: |
76 | static char ID; |
77 | |
78 | SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {} |
79 | |
  StringRef getPassName() const override { return "SI Peephole SDWA"; }
81 | |
82 | bool runOnMachineFunction(MachineFunction &MF) override; |
83 | |
84 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
85 | AU.setPreservesCFG(); |
86 | MachineFunctionPass::getAnalysisUsage(AU); |
87 | } |
88 | }; |
89 | |
90 | using namespace AMDGPU::SDWA; |
91 | |
92 | class SDWAOperand { |
93 | private: |
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target
96 | |
97 | /// Returns true iff the SDWA selection of this SDWAOperand can be combined |
98 | /// with the SDWA selections of its uses in \p MI. |
99 | virtual bool canCombineSelections(const MachineInstr &MI, |
100 | const SIInstrInfo *TII) = 0; |
101 | |
102 | public: |
103 | SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) |
104 | : Target(TargetOp), Replaced(ReplacedOp) { |
105 | assert(Target->isReg()); |
106 | assert(Replaced->isReg()); |
107 | } |
108 | |
109 | virtual ~SDWAOperand() = default; |
110 | |
111 | virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII, |
112 | const GCNSubtarget &ST, |
113 | SDWAOperandsMap *PotentialMatches = nullptr) = 0; |
114 | virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; |
115 | |
116 | MachineOperand *getTargetOperand() const { return Target; } |
117 | MachineOperand *getReplacedOperand() const { return Replaced; } |
118 | MachineInstr *getParentInst() const { return Target->getParent(); } |
119 | |
120 | MachineRegisterInfo *getMRI() const { |
121 | return &getParentInst()->getParent()->getParent()->getRegInfo(); |
122 | } |
123 | |
124 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
125 | virtual void print(raw_ostream& OS) const = 0; |
126 | void dump() const { print(dbgs()); } |
127 | #endif |
128 | }; |
129 | |
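/// An SDWA operand pattern matched from an extract-like instruction
/// (logical/arithmetic shift right, BFE, AND with a byte/word mask) that can
/// be folded into the source operand selection of the instruction that uses
/// its result.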
130 | class SDWASrcOperand : public SDWAOperand { |
131 | private: |
132 | SdwaSel SrcSel; |
133 | bool Abs; |
134 | bool Neg; |
135 | bool Sext; |
136 | |
137 | public: |
138 | SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, |
139 | SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, |
140 | bool Sext_ = false) |
141 | : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_), |
142 | Neg(Neg_), Sext(Sext_) {} |
143 | |
144 | MachineInstr *potentialToConvert(const SIInstrInfo *TII, |
145 | const GCNSubtarget &ST, |
146 | SDWAOperandsMap *PotentialMatches = nullptr) override; |
147 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; |
148 | bool canCombineSelections(const MachineInstr &MI, |
149 | const SIInstrInfo *TII) override; |
150 | |
151 | SdwaSel getSrcSel() const { return SrcSel; } |
152 | bool getAbs() const { return Abs; } |
153 | bool getNeg() const { return Neg; } |
154 | bool getSext() const { return Sext; } |
155 | |
156 | uint64_t getSrcMods(const SIInstrInfo *TII, |
157 | const MachineOperand *SrcOp) const; |
158 | |
159 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
160 | void print(raw_ostream& OS) const override; |
161 | #endif |
162 | }; |
163 | |
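/// An SDWA operand pattern matched from an insert-like instruction (shift
/// left by 8/16/24) that can be folded into the destination operand
/// selection of the instruction that defines its source.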
164 | class SDWADstOperand : public SDWAOperand { |
165 | private: |
166 | SdwaSel DstSel; |
167 | DstUnused DstUn; |
168 | |
169 | public: |
170 | SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, |
171 | SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) |
172 | : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} |
173 | |
174 | MachineInstr *potentialToConvert(const SIInstrInfo *TII, |
175 | const GCNSubtarget &ST, |
176 | SDWAOperandsMap *PotentialMatches = nullptr) override; |
177 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; |
178 | bool canCombineSelections(const MachineInstr &MI, |
179 | const SIInstrInfo *TII) override; |
180 | |
181 | SdwaSel getDstSel() const { return DstSel; } |
182 | DstUnused getDstUnused() const { return DstUn; } |
183 | |
184 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
185 | void print(raw_ostream& OS) const override; |
186 | #endif |
187 | }; |
188 | |
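/// An SDWADstOperand that keeps the bits not written by dst_sel
/// (dst_unused:UNUSED_PRESERVE), matched from a V_OR_B32 that merges an SDWA
/// result with another value.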
189 | class SDWADstPreserveOperand : public SDWADstOperand { |
190 | private: |
191 | MachineOperand *Preserve; |
192 | |
193 | public: |
194 | SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, |
195 | MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD) |
196 | : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE), |
197 | Preserve(PreserveOp) {} |
198 | |
199 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; |
200 | bool canCombineSelections(const MachineInstr &MI, |
201 | const SIInstrInfo *TII) override; |
202 | |
203 | MachineOperand *getPreservedOperand() const { return Preserve; } |
204 | |
205 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
206 | void print(raw_ostream& OS) const override; |
207 | #endif |
208 | }; |
209 | |
210 | } // end anonymous namespace |
211 | |
INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
                false)
214 | |
215 | char SIPeepholeSDWALegacy::ID = 0; |
216 | |
217 | char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID; |
218 | |
219 | FunctionPass *llvm::createSIPeepholeSDWALegacyPass() { |
220 | return new SIPeepholeSDWALegacy(); |
221 | } |
222 | |
223 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
224 | static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { |
  switch (Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
233 | } |
234 | return OS; |
235 | } |
236 | |
237 | static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { |
  switch (Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
242 | } |
243 | return OS; |
244 | } |
245 | |
246 | LLVM_DUMP_METHOD |
247 | void SDWASrcOperand::print(raw_ostream& OS) const { |
248 | OS << "SDWA src: " << *getTargetOperand() |
249 | << " src_sel:" << getSrcSel() |
250 | << " abs:" << getAbs() << " neg:" << getNeg() |
251 | << " sext:" << getSext() << '\n'; |
252 | } |
253 | |
254 | LLVM_DUMP_METHOD |
255 | void SDWADstOperand::print(raw_ostream& OS) const { |
256 | OS << "SDWA dst: " << *getTargetOperand() |
257 | << " dst_sel:" << getDstSel() |
258 | << " dst_unused:" << getDstUnused() << '\n'; |
259 | } |
260 | |
261 | LLVM_DUMP_METHOD |
262 | void SDWADstPreserveOperand::print(raw_ostream& OS) const { |
263 | OS << "SDWA preserve dst: " << *getTargetOperand() |
264 | << " dst_sel:" << getDstSel() |
265 | << " preserve:" << *getPreservedOperand() << '\n'; |
266 | } |
267 | |
268 | #endif |
269 | |
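/// Copy register number, subregister index, and undef flag from \p From to
/// \p To; copy the kill flag for uses and the dead flag for defs.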
270 | static void copyRegOperand(MachineOperand &To, const MachineOperand &From) { |
271 | assert(To.isReg() && From.isReg()); |
272 | To.setReg(From.getReg()); |
273 | To.setSubReg(From.getSubReg()); |
274 | To.setIsUndef(From.isUndef()); |
275 | if (To.isUse()) { |
276 | To.setIsKill(From.isKill()); |
277 | } else { |
278 | To.setIsDead(From.isDead()); |
279 | } |
280 | } |
281 | |
282 | static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { |
283 | return LHS.isReg() && |
284 | RHS.isReg() && |
285 | LHS.getReg() == RHS.getReg() && |
286 | LHS.getSubReg() == RHS.getSubReg(); |
287 | } |
288 | |
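/// If all non-debug uses of the register defined by \p Reg are in a single
/// instruction, return one of those use operands; otherwise return nullptr.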
289 | static MachineOperand *findSingleRegUse(const MachineOperand *Reg, |
290 | const MachineRegisterInfo *MRI) { |
291 | if (!Reg->isReg() || !Reg->isDef()) |
292 | return nullptr; |
293 | |
294 | MachineOperand *ResMO = nullptr; |
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr
    if (!isSameReg(UseMO, *Reg))
298 | return nullptr; |
299 | |
300 | // Check that there is only one instruction that uses Reg |
301 | if (!ResMO) { |
302 | ResMO = &UseMO; |
303 | } else if (ResMO->getParent() != UseMO.getParent()) { |
304 | return nullptr; |
305 | } |
306 | } |
307 | |
308 | return ResMO; |
309 | } |
310 | |
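/// Return the explicit def operand of \p Reg's register if it has a unique
/// non-debug def, or nullptr otherwise (implicit defs are ignored).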
311 | static MachineOperand *findSingleRegDef(const MachineOperand *Reg, |
312 | const MachineRegisterInfo *MRI) { |
313 | if (!Reg->isReg()) |
314 | return nullptr; |
315 | |
  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
317 | if (!DefInstr) |
318 | return nullptr; |
319 | |
320 | for (auto &DefMO : DefInstr->defs()) { |
321 | if (DefMO.isReg() && DefMO.getReg() == Reg->getReg()) |
322 | return &DefMO; |
323 | } |
324 | |
325 | // Ignore implicit defs. |
326 | return nullptr; |
327 | } |
328 | |
329 | /// Combine an SDWA instruction's existing SDWA selection \p Sel with |
330 | /// the SDWA selection \p OperandSel of its operand. If the selections |
/// are compatible, return the combined selection, otherwise return
/// nullopt.
/// For example, if Sel = BYTE_0 and OperandSel = WORD_1:
334 | /// BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X) |
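/// More worked examples derived from the rules below:
///   combineSdwaSel(DWORD, WORD_1)  == WORD_1
///   combineSdwaSel(WORD_0, WORD_1) == WORD_1
///   combineSdwaSel(BYTE_1, WORD_0) == BYTE_1
///   combineSdwaSel(WORD_1, WORD_0) == nullopt (incompatible)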
335 | static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) { |
336 | if (Sel == SdwaSel::DWORD) |
337 | return OperandSel; |
338 | |
339 | if (Sel == OperandSel || OperandSel == SdwaSel::DWORD) |
340 | return Sel; |
341 | |
342 | if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 || |
343 | Sel == SdwaSel::BYTE_3) |
344 | return {}; |
345 | |
346 | if (OperandSel == SdwaSel::WORD_0) |
347 | return Sel; |
348 | |
349 | if (OperandSel == SdwaSel::WORD_1) { |
350 | if (Sel == SdwaSel::BYTE_0) |
351 | return SdwaSel::BYTE_2; |
352 | if (Sel == SdwaSel::BYTE_1) |
353 | return SdwaSel::BYTE_3; |
354 | if (Sel == SdwaSel::WORD_0) |
355 | return SdwaSel::WORD_1; |
356 | } |
357 | |
358 | return {}; |
359 | } |
360 | |
361 | uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, |
362 | const MachineOperand *SrcOp) const { |
363 | uint64_t Mods = 0; |
364 | const auto *MI = SrcOp->getParent(); |
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
371 | Mods = Mod->getImm(); |
372 | } |
373 | } |
374 | if (Abs || Neg) { |
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
377 | Mods |= Abs ? SISrcMods::ABS : 0u; |
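    // NEG is toggled (XOR) rather than set: the instruction's operand may
    // already carry a NEG modifier, and folding in another negation flips it.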
378 | Mods ^= Neg ? SISrcMods::NEG : 0u; |
379 | } else if (Sext) { |
380 | Mods |= SISrcMods::SEXT; |
381 | } |
382 | |
383 | return Mods; |
384 | } |
385 | |
386 | MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, |
387 | const GCNSubtarget &ST, |
388 | SDWAOperandsMap *PotentialMatches) { |
389 | if (PotentialMatches != nullptr) { |
390 | // Fill out the map for all uses if all can be converted |
391 | MachineOperand *Reg = getReplacedOperand(); |
392 | if (!Reg->isReg() || !Reg->isDef()) |
393 | return nullptr; |
394 | |
    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted
      if (!isConvertibleToSDWA(UseMI, ST, TII) ||
          !canCombineSelections(UseMI, TII))
399 | return nullptr; |
400 | |
401 | // Now that it's guaranteed all uses are legal, iterate over the uses again |
402 | // to add them for later conversion. |
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
404 | // Should not get a subregister here |
405 | assert(isSameReg(UseMO, *Reg)); |
406 | |
407 | SDWAOperandsMap &potentialMatchesMap = *PotentialMatches; |
408 | MachineInstr *UseMI = UseMO.getParent(); |
      potentialMatchesMap[UseMI].push_back(this);
410 | } |
411 | return nullptr; |
412 | } |
413 | |
  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO =
      findSingleRegUse(getReplacedOperand(), getMRI());
417 | if (!PotentialMO) |
418 | return nullptr; |
419 | |
420 | MachineInstr *Parent = PotentialMO->getParent(); |
421 | |
  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
423 | } |
424 | |
425 | bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { |
426 | switch (MI.getOpcode()) { |
427 | case AMDGPU::V_CVT_F32_FP8_sdwa: |
428 | case AMDGPU::V_CVT_F32_BF8_sdwa: |
429 | case AMDGPU::V_CVT_PK_F32_FP8_sdwa: |
430 | case AMDGPU::V_CVT_PK_F32_BF8_sdwa: |
431 | // Does not support input modifiers: noabs, noneg, nosext. |
432 | return false; |
433 | case AMDGPU::V_CNDMASK_B32_sdwa: |
434 | // SISrcMods uses the same bitmask for SEXT and NEG modifiers and |
435 | // hence the compiler can only support one type of modifier for |
436 | // each SDWA instruction. For V_CNDMASK_B32_sdwa, this is NEG |
437 | // since its operands get printed using |
438 | // AMDGPUInstPrinter::printOperandAndFPInputMods which produces |
439 | // the output intended for NEG if SEXT is set. |
440 | // |
441 | // The ISA does actually support both modifiers on most SDWA |
442 | // instructions. |
443 | // |
444 | // FIXME Accept SEXT here after fixing this issue. |
445 | if (Sext) |
446 | return false; |
447 | break; |
448 | } |
449 | |
  // Find the operand in the instruction that matches the replaced operand and
  // replace it with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for UNUSED_PRESERVE, in
      // which case we can either abandon the peephole attempt or, if legal,
      // copy the target operand into the tied slot when the preserve
      // operation would produce the same result by overwriting the rest of
      // the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
475 | |
476 | if (Dst && |
477 | DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { |
478 | // This will work if the tied src is accessing WORD_0, and the dst is |
479 | // writing WORD_1. Modifiers don't matter because all the bits that |
480 | // would be impacted are being overwritten by the dst. |
481 | // Any other case will not work. |
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
484 | if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && |
485 | getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) { |
486 | IsPreserveSrc = true; |
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
491 | SrcSel = nullptr; |
492 | SrcMods = nullptr; |
493 | } else { |
494 | // Not legal to convert this src |
495 | return false; |
496 | } |
497 | } |
498 | } |
499 | assert(Src && Src->isReg()); |
500 | |
501 | if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || |
502 | MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || |
503 | MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || |
504 | MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && |
        !isSameReg(*Src, *getReplacedOperand())) {
506 | // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to |
507 | // src2. This is not allowed. |
508 | return false; |
509 | } |
510 | |
511 | assert(isSameReg(*Src, *getReplacedOperand()) && |
512 | (IsPreserveSrc || (SrcSel && SrcMods))); |
513 | } |
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
    SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel()));
    SrcMods->setImm(getSrcMods(TII, Src));
519 | } |
520 | getTargetOperand()->setIsKill(false); |
521 | return true; |
522 | } |
523 | |
524 | /// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA |
525 | /// instruction \p MI can be combined with the selection \p OpSel. |
526 | static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII, |
527 | AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) { |
528 | assert(TII->isSDWA(MI.getOpcode())); |
529 | |
  const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName);
  SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());

  return combineSdwaSel(SrcSel, OpSel).has_value();
534 | } |
535 | |
/// Verify that \p Op is the same register as the operand of the SDWA
/// instruction \p MI named by \p SrcOpName and that the SDWA
/// selection \p SrcSelOpName can be combined with \p OpSel.
539 | static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII, |
540 | AMDGPU::OpName SrcOpName, |
541 | AMDGPU::OpName SrcSelOpName, MachineOperand *Op, |
542 | SdwaSel OpSel) { |
543 | assert(TII->isSDWA(MI.getOpcode())); |
544 | |
  const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName);
  if (!Src || !isSameReg(*Src, *Op))
547 | return true; |
548 | |
549 | return canCombineOpSel(MI, TII, SrcSelOpName, OpSel); |
550 | } |
551 | |
552 | bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI, |
553 | const SIInstrInfo *TII) { |
  if (!TII->isSDWA(MI.getOpcode()))
555 | return true; |
556 | |
557 | using namespace AMDGPU; |
558 | |
  return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel,
                         getReplacedOperand(), getSrcSel()) &&
         canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel,
                         getReplacedOperand(), getSrcSel());
563 | } |
564 | |
565 | MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII, |
566 | const GCNSubtarget &ST, |
567 | SDWAOperandsMap *PotentialMatches) { |
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
574 | if (!PotentialMO) |
575 | return nullptr; |
576 | |
  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst :
       MRI->use_nodbg_instructions(PotentialMO->getReg())) {
579 | if (&UseInst != ParentMI) |
580 | return nullptr; |
581 | } |
582 | |
  MachineInstr *Parent = PotentialMO->getParent();
  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
585 | } |
586 | |
587 | bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { |
588 | // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused |
589 | |
590 | if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || |
591 | MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || |
592 | MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || |
593 | MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && |
594 | getDstSel() != AMDGPU::SDWA::DWORD) { |
595 | // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD |
596 | return false; |
597 | } |
598 | |
  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
600 | assert(Operand && |
601 | Operand->isReg() && |
602 | isSameReg(*Operand, *getReplacedOperand())); |
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);

  SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
  DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value());

  MachineOperand *DstUnused =
      TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());
613 | |
614 | // Remove original instruction because it would conflict with our new |
615 | // instruction by register definition |
616 | getParentInst()->eraseFromParent(); |
617 | return true; |
618 | } |
619 | |
620 | bool SDWADstOperand::canCombineSelections(const MachineInstr &MI, |
621 | const SIInstrInfo *TII) { |
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
626 | } |
627 | |
628 | bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, |
629 | const SIInstrInfo *TII) { |
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI's src operands, or
  // else we can encounter a problem with the use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
637 | } |
638 | |
  // Move MI before v_or_b32
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);
642 | |
  // Add implicit use of preserved register
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(), RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);
652 | |
653 | // Convert MI as any other SDWADstOperand and remove v_or_b32 |
654 | return SDWADstOperand::convertToSDWA(MI, TII); |
655 | } |
656 | |
657 | bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI, |
658 | const SIInstrInfo *TII) { |
659 | return SDWADstOperand::canCombineSelections(MI, TII); |
660 | } |
661 | |
662 | std::optional<int64_t> |
663 | SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { |
664 | if (Op.isImm()) { |
665 | return Op.getImm(); |
666 | } |
667 | |
  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
673 | continue; |
674 | |
675 | const MachineInstr *DefInst = Def.getParent(); |
      if (!TII->isFoldableCopy(*DefInst))
677 | return std::nullopt; |
678 | |
      const MachineOperand &Copied = DefInst->getOperand(1);
680 | if (!Copied.isImm()) |
681 | return std::nullopt; |
682 | |
683 | return Copied.getImm(); |
684 | } |
685 | } |
686 | |
687 | return std::nullopt; |
688 | } |
689 | |
690 | std::unique_ptr<SDWAOperand> |
691 | SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { |
692 | unsigned Opcode = MI.getOpcode(); |
693 | switch (Opcode) { |
694 | case AMDGPU::V_LSHRREV_B32_e32: |
695 | case AMDGPU::V_ASHRREV_I32_e32: |
696 | case AMDGPU::V_LSHLREV_B32_e32: |
697 | case AMDGPU::V_LSHRREV_B32_e64: |
698 | case AMDGPU::V_ASHRREV_I32_e64: |
699 | case AMDGPU::V_LSHLREV_B32_e64: { |
700 | // from: v_lshrrev_b32_e32 v1, 16/24, v0 |
701 | // to SDWA src:v0 src_sel:WORD_1/BYTE_3 |
702 | |
703 | // from: v_ashrrev_i32_e32 v1, 16/24, v0 |
704 | // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 |
705 | |
706 | // from: v_lshlrev_b32_e32 v1, 16/24, v0 |
707 | // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD |
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
710 | if (!Imm) |
711 | break; |
712 | |
713 | if (*Imm != 16 && *Imm != 24) |
714 | break; |
715 | |
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
718 | if (!Src1->isReg() || Src1->getReg().isPhysical() || |
719 | Dst->getReg().isPhysical()) |
720 | break; |
721 | |
722 | if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || |
723 | Opcode == AMDGPU::V_LSHLREV_B32_e64) { |
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B32_e64);
731 | break; |
732 | } |
733 | |
734 | case AMDGPU::V_LSHRREV_B16_e32: |
735 | case AMDGPU::V_ASHRREV_I16_e32: |
736 | case AMDGPU::V_LSHLREV_B16_e32: |
737 | case AMDGPU::V_LSHRREV_B16_e64: |
738 | case AMDGPU::V_LSHRREV_B16_opsel_e64: |
739 | case AMDGPU::V_ASHRREV_I16_e64: |
740 | case AMDGPU::V_LSHLREV_B16_opsel_e64: |
741 | case AMDGPU::V_LSHLREV_B16_e64: { |
742 | // from: v_lshrrev_b16_e32 v1, 8, v0 |
743 | // to SDWA src:v0 src_sel:BYTE_1 |
744 | |
745 | // from: v_ashrrev_i16_e32 v1, 8, v0 |
746 | // to SDWA src:v0 src_sel:BYTE_1 sext:1 |
747 | |
748 | // from: v_lshlrev_b16_e32 v1, 8, v0 |
749 | // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD |
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
752 | if (!Imm || *Imm != 8) |
753 | break; |
754 | |
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
757 | |
758 | if (!Src1->isReg() || Src1->getReg().isPhysical() || |
759 | Dst->getReg().isPhysical()) |
760 | break; |
761 | |
762 | if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || |
763 | Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 || |
764 | Opcode == AMDGPU::V_LSHLREV_B16_e64) |
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 &&
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
771 | break; |
772 | } |
773 | |
774 | case AMDGPU::V_BFE_I32_e64: |
775 | case AMDGPU::V_BFE_U32_e64: { |
776 | // e.g.: |
777 | // from: v_bfe_u32 v1, v0, 8, 8 |
778 | // to SDWA src:v0 src_sel:BYTE_1 |
779 | |
780 | // offset | width | src_sel |
781 | // ------------------------ |
782 | // 0 | 8 | BYTE_0 |
783 | // 0 | 16 | WORD_0 |
784 | // 0 | 32 | DWORD ? |
785 | // 8 | 8 | BYTE_1 |
786 | // 16 | 8 | BYTE_2 |
787 | // 16 | 16 | WORD_1 |
788 | // 24 | 8 | BYTE_3 |
789 | |
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
792 | if (!Offset) |
793 | break; |
794 | |
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
797 | if (!Width) |
798 | break; |
799 | |
800 | SdwaSel SrcSel = DWORD; |
801 | |
802 | if (*Offset == 0 && *Width == 8) |
803 | SrcSel = BYTE_0; |
804 | else if (*Offset == 0 && *Width == 16) |
805 | SrcSel = WORD_0; |
806 | else if (*Offset == 0 && *Width == 32) |
807 | SrcSel = DWORD; |
808 | else if (*Offset == 8 && *Width == 8) |
809 | SrcSel = BYTE_1; |
810 | else if (*Offset == 16 && *Width == 8) |
811 | SrcSel = BYTE_2; |
812 | else if (*Offset == 16 && *Width == 16) |
813 | SrcSel = WORD_1; |
814 | else if (*Offset == 24 && *Width == 8) |
815 | SrcSel = BYTE_3; |
816 | else |
817 | break; |
818 | |
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
821 | |
822 | if (!Src0->isReg() || Src0->getReg().isPhysical() || |
823 | Dst->getReg().isPhysical()) |
824 | break; |
825 | |
    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
828 | } |
829 | |
830 | case AMDGPU::V_AND_B32_e32: |
831 | case AMDGPU::V_AND_B32_e64: { |
832 | // e.g.: |
833 | // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 |
834 | // to SDWA src:v0 src_sel:WORD_0/BYTE_0 |
835 | |
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);
840 | |
841 | if (!Imm) { |
      Imm = foldToImm(*Src1);
843 | ValSrc = Src0; |
844 | } |
845 | |
846 | if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) |
847 | break; |
848 | |
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
850 | |
851 | if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() || |
852 | Dst->getReg().isPhysical()) |
853 | break; |
854 | |
    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
857 | } |
858 | |
859 | case AMDGPU::V_OR_B32_e32: |
860 | case AMDGPU::V_OR_B32_e64: { |
861 | // Patterns for dst_unused:UNUSED_PRESERVE. |
862 | // e.g., from: |
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src0_sel:WORD_1 src1_sel:WORD_1
865 | // v_add_f16_e32 v3, v1, v2 |
866 | // v_or_b32_e32 v4, v0, v3 |
867 | // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3 |
868 | |
869 | // Check if one of operands of v_or_b32 is SDWA instruction |
870 | using CheckRetType = |
871 | std::optional<std::pair<MachineOperand *, MachineOperand *>>; |
872 | auto CheckOROperandsForSDWA = |
873 | [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType { |
874 | if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg()) |
875 | return CheckRetType(std::nullopt); |
876 | |
        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
        if (!Op1Def)
          return CheckRetType(std::nullopt);

        MachineInstr *Op1Inst = Op1Def->getParent();
        if (!TII->isSDWA(*Op1Inst))
          return CheckRetType(std::nullopt);

        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
886 | if (!Op2Def) |
887 | return CheckRetType(std::nullopt); |
888 | |
889 | return CheckRetType(std::pair(Op1Def, Op2Def)); |
890 | }; |
891 | |
    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
894 | assert(OrSDWA && OrOther); |
895 | auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther); |
896 | if (!Res) { |
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
899 | assert(OrSDWA && OrOther); |
900 | Res = CheckOROperandsForSDWA(OrSDWA, OrOther); |
901 | if (!Res) |
902 | break; |
903 | } |
904 | |
905 | MachineOperand *OrSDWADef = Res->first; |
906 | MachineOperand *OrOtherDef = Res->second; |
907 | assert(OrSDWADef && OrOtherDef); |
908 | |
909 | MachineInstr *SDWAInst = OrSDWADef->getParent(); |
910 | MachineInstr *OtherInst = OrOtherDef->getParent(); |
911 | |
    // Check that OtherInst is actually bitwise compatible with SDWAInst,
    // i.e. their destination patterns don't overlap. A compatible instruction
    // can be either a regular instruction with compatible bitness or an SDWA
    // instruction with the correct dst_sel.
916 | // SDWAInst | OtherInst bitness / OtherInst dst_sel |
917 | // ----------------------------------------------------- |
918 | // DWORD | no / no |
919 | // WORD_0 | no / BYTE_2/3, WORD_1 |
920 | // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0 |
921 | // BYTE_0 | no / BYTE_1/2/3, WORD_1 |
922 | // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1 |
    // BYTE_2 | 8/16-bit / BYTE_0/1/3, WORD_0
924 | // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0 |
925 | // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK |
926 | // but v_add_f32 is not. |
927 | |
928 | // TODO: add support for non-SDWA instructions as OtherInst. |
929 | // For now this only works with SDWA instructions. For regular instructions |
930 | // there is no way to determine if the instruction writes only 8/16/24-bit |
931 | // out of full register size and all registers are at min 32-bit wide. |
    if (!TII->isSDWA(*OtherInst))
933 | break; |
934 | |
    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
939 | |
940 | bool DstSelAgree = false; |
941 | switch (DstSel) { |
942 | case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) || |
943 | (OtherDstSel == BYTE_3) || |
944 | (OtherDstSel == WORD_1)); |
945 | break; |
946 | case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) || |
947 | (OtherDstSel == BYTE_1) || |
948 | (OtherDstSel == WORD_0)); |
949 | break; |
950 | case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) || |
951 | (OtherDstSel == BYTE_2) || |
952 | (OtherDstSel == BYTE_3) || |
953 | (OtherDstSel == WORD_1)); |
954 | break; |
955 | case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) || |
956 | (OtherDstSel == BYTE_2) || |
957 | (OtherDstSel == BYTE_3) || |
958 | (OtherDstSel == WORD_1)); |
959 | break; |
960 | case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) || |
961 | (OtherDstSel == BYTE_1) || |
962 | (OtherDstSel == BYTE_3) || |
963 | (OtherDstSel == WORD_0)); |
964 | break; |
965 | case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) || |
966 | (OtherDstSel == BYTE_1) || |
967 | (OtherDstSel == BYTE_2) || |
968 | (OtherDstSel == WORD_0)); |
969 | break; |
970 | default: DstSelAgree = false; |
971 | } |
972 | |
973 | if (!DstSelAgree) |
974 | break; |
975 | |
976 | // Also OtherInst dst_unused should be UNUSED_PAD |
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
979 | if (OtherDstUnused != DstUnused::UNUSED_PAD) |
980 | break; |
981 | |
982 | // Create DstPreserveOperand |
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
984 | assert(OrDst && OrDst->isReg()); |
985 | |
    return std::make_unique<SDWADstPreserveOperand>(OrDst, OrSDWADef,
                                                    OrOtherDef, DstSel);
  }
990 | } |
991 | |
992 | return std::unique_ptr<SDWAOperand>(nullptr); |
993 | } |
994 | |
995 | #if !defined(NDEBUG) |
996 | static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) { |
997 | Operand.print(OS); |
998 | return OS; |
999 | } |
1000 | #endif |
1001 | |
1002 | void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { |
1003 | for (MachineInstr &MI : MBB) { |
1004 | if (auto Operand = matchSDWAOperand(MI)) { |
1005 | LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n'); |
1006 | SDWAOperands[&MI] = std::move(Operand); |
1007 | ++NumSDWAPatternsFound; |
1008 | } |
1009 | } |
1010 | } |
1011 | |
1012 | // Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows |
1013 | // isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into |
1014 | // V_ADD_CO_U32_sdwa. |
1015 | // |
1016 | // We are transforming from a VOP3 into a VOP2 form of the instruction. |
1017 | // %19:vgpr_32 = V_AND_B32_e32 255, |
1018 | // killed %16:vgpr_32, implicit $exec |
1019 | // %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64 |
1020 | // %26.sub0:vreg_64, %19:vgpr_32, implicit $exec |
1021 | // %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 |
1022 | // %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec |
1023 | // |
1024 | // becomes |
1025 | // %47:vgpr_32 = V_ADD_CO_U32_sdwa |
1026 | // 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0, |
1027 | // implicit-def $vcc, implicit $exec |
1028 | // %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 |
1029 | // %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec |
1030 | void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, |
1031 | const GCNSubtarget &ST) const { |
1032 | int Opc = MI.getOpcode(); |
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");
1035 | |
  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
1045 | if (!NextOp) |
1046 | return; |
1047 | MachineInstr &MISucc = *NextOp->getParent(); |
1048 | |
1049 | // Make sure the carry in/out are subsequently unused. |
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
1057 | return; |
1058 | // Make sure VCC or its subregs are dead before MI. |
1059 | MachineBasicBlock &MBB = *MI.getParent(); |
1060 | MachineBasicBlock::LivenessQueryResult Liveness = |
      MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
1062 | if (Liveness != MachineBasicBlock::LQR_Dead) |
1063 | return; |
1064 | // Check if VCC is referenced in range of (MI,MISucc]. |
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
1068 | return; |
1069 | } |
1070 | |
1071 | // Replace MI with V_{SUB|ADD}_I32_e32 |
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());
1077 | |
1078 | MI.eraseFromParent(); |
1079 | |
  // Since the carry output of MI is now VCC, update its use in MISucc.
  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
1083 | } |
1084 | |
/// Try to convert an \p MI in VOP3 form which takes a src2 carry-in
/// operand into the corresponding VOP2 form which expects the
/// argument in VCC. To this end, add a copy from the carry-in to
/// VCC. The conversion is only applied if \p MI can be shrunk
/// to VOP2 and if VCC can be proven to be dead before \p MI.
1090 | void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI, |
1091 | const GCNSubtarget &ST) const { |
1092 | assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64); |
1093 | |
1094 | LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI); |
  if (!TII->canShrink(MI, *MRI)) {
    LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
1097 | return; |
1098 | } |
1099 | |
  const MachineOperand &CarryIn =
      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  Register CarryReg = CarryIn.getReg();
  MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
1104 | if (!CarryDef) { |
1105 | LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n" ); |
1106 | return; |
1107 | } |
1108 | |
1109 | // Make sure VCC or its subregs are dead before MI. |
1110 | MCRegister Vcc = TRI->getVCC(); |
1111 | MachineBasicBlock &MBB = *MI.getParent(); |
1112 | MachineBasicBlock::LivenessQueryResult Liveness = |
      MBB.computeRegisterLiveness(TRI, Vcc, MI);
1114 | if (Liveness != MachineBasicBlock::LQR_Dead) { |
1115 | LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n" ); |
1116 | return; |
1117 | } |
1118 | |
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);
1120 | |
  auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
                           TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
                       .setMIFlags(MI.getFlags());
  TII->fixImplicitOperands(*Converted);
1128 | LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted); |
1129 | (void)Converted; |
1130 | MI.eraseFromParent(); |
1131 | } |
1132 | |
1133 | namespace { |
bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII) {
1137 | // Check if this is already an SDWA instruction |
1138 | unsigned Opc = MI.getOpcode(); |
  if (TII->isSDWA(Opc))
1140 | return true; |
1141 | |
  // This can only be handled after an earlier conversion to
  // AMDGPU::V_CNDMASK_B32_e32, which is not always possible.
1144 | if (Opc == AMDGPU::V_CNDMASK_B32_e64) |
1145 | return false; |
1146 | |
1147 | // Check if this instruction has opcode that supports SDWA |
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
1152 | return false; |
1153 | |
  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1155 | return false; |
1156 | |
  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst =
          TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
1162 | return false; |
1163 | } |
1164 | |
1165 | if (!ST.hasSDWAOutModsVOPC() && |
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
1168 | return false; |
1169 | |
  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1172 | return false; |
1173 | } |
1174 | |
1175 | if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 || |
1176 | Opc == AMDGPU::V_FMAC_F32_e32 || |
1177 | Opc == AMDGPU::V_MAC_F16_e32 || |
1178 | Opc == AMDGPU::V_MAC_F32_e32)) |
1179 | return false; |
1180 | |
1181 | // Check if target supports this SDWA opcode |
  if (TII->pseudoToMCOpcode(Opc) == -1)
1183 | return false; |
1184 | |
  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
1186 | if (!Src0->isReg() && !Src0->isImm()) |
1187 | return false; |
1188 | } |
1189 | |
  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
1191 | if (!Src1->isReg() && !Src1->isImm()) |
1192 | return false; |
1193 | } |
1194 | |
1195 | return true; |
1196 | } |
1197 | } // namespace |
1198 | |
1199 | MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) { |
1200 | unsigned Opcode = MI.getOpcode(); |
1201 | assert(!TII->isSDWA(Opcode)); |
1202 | |
1203 | int SDWAOpcode = AMDGPU::getSDWAOp(Opcode); |
1204 | if (SDWAOpcode == -1) |
    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
1206 | assert(SDWAOpcode != -1); |
1207 | |
  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
1209 | |
1210 | // Create SDWA version of instruction MI and initialize its operands |
1211 | MachineInstrBuilder SDWAInst = |
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
1213 | .setMIFlags(MI.getFlags()); |
1214 | |
  // Copy dst; if it is present in the original it should also be present in
  // the SDWA form.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1217 | if (Dst) { |
1218 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst)); |
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
1226 | } |
1227 | |
  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, but it can't get here).
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);
1238 | |
1239 | // Copy src1 if present, initialize src1_modifiers. |
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1241 | if (Src1) { |
1242 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) && |
1243 | AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers)); |
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
1249 | } |
1250 | |
1251 | if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa || |
1252 | SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa || |
1253 | SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || |
1254 | SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { |
1255 | // v_mac_f16/32 has additional src2 operand tied to vdst |
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
1259 | } |
1260 | |
1261 | // Copy clamp if present, initialize otherwise |
1262 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp)); |
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
1268 | } |
1269 | |
1270 | // Copy omod if present, initialize otherwise if needed |
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
1277 | } |
1278 | } |
1279 | |
1280 | // Initialize SDWA specific operands |
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused))
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
1286 | |
1287 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel)); |
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1289 | |
1290 | if (Src1) { |
1291 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel)); |
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1293 | } |
1294 | |
1295 | // Check for a preserved register that needs to be copied. |
1296 | MachineInstr *Ret = SDWAInst.getInstr(); |
  TII->fixImplicitOperands(*Ret);
1298 | return Ret; |
1299 | } |
1300 | |
1301 | bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, |
1302 | const SDWAOperandsVector &SDWAOperands) { |
1303 | LLVM_DEBUG(dbgs() << "Convert instruction:" << MI); |
1304 | |
1305 | MachineInstr *SDWAInst; |
  if (TII->isSDWA(MI.getOpcode())) {
1307 | // Clone the instruction to allow revoking changes |
1308 | // made to MI during the processing of the operands |
1309 | // if the conversion fails. |
    SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
    MI.getParent()->insert(MI.getIterator(), SDWAInst);
1312 | } else { |
1313 | SDWAInst = createSDWAVersion(MI); |
1314 | } |
1315 | |
1316 | // Apply all sdwa operand patterns. |
1317 | bool Converted = false; |
1318 | for (auto &Operand : SDWAOperands) { |
1319 | LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand); |
1320 | // There should be no intersection between SDWA operands and potential MIs |
1321 | // e.g.: |
1322 | // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 |
1323 | // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 |
1324 | // v_add_u32 v3, v4, v2 |
1325 | // |
1326 | // In that example it is possible that we would fold 2nd instruction into |
1327 | // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that |
1328 | // was already destroyed). So if SDWAOperand is also a potential MI then do |
1329 | // not apply it. |
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
1332 | } |
1333 | |
1334 | if (!Converted) { |
1335 | SDWAInst->eraseFromParent(); |
1336 | return false; |
1337 | } |
1338 | |
  ConvertedInstructions.push_back(SDWAInst);
1340 | for (MachineOperand &MO : SDWAInst->uses()) { |
1341 | if (!MO.isReg()) |
1342 | continue; |
1343 | |
    MRI->clearKillFlags(MO.getReg());
1345 | } |
1346 | LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n'); |
1347 | ++NumSDWAInstructionsPeepholed; |
1348 | |
1349 | MI.eraseFromParent(); |
1350 | return true; |
1351 | } |
1352 | |
// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR is allowed on GFX9). Copy its scalar operands into
// VGPRs.
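//
// Illustrative (simplified) MIR, assuming no SDWA SGPR support; register
// names are placeholders:
//   V_ADD_F16_sdwa %dst, 0, %sgpr_src, 0, %vgpr_src, ...
// becomes
//   %tmp:vgpr_32 = V_MOV_B32_e32 %sgpr_src
//   V_ADD_F16_sdwa %dst, 0, %tmp, 0, %vgpr_src, ...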
1355 | void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, |
1356 | const GCNSubtarget &ST) const { |
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
1358 | unsigned ConstantBusCount = 0; |
1359 | for (MachineOperand &Op : MI.explicit_uses()) { |
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
1361 | continue; |
1362 | |
1363 | unsigned I = Op.getOperandNo(); |
1364 | if (Desc.operands()[I].RegClass == -1 || |
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
1366 | continue; |
1367 | |
1368 | if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && |
        TRI->isSGPRReg(*MRI, Op.getReg())) {
1370 | ++ConstantBusCount; |
1371 | continue; |
1372 | } |
1373 | |
    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
1383 | } |
1384 | } |
1385 | |
1386 | bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) { |
  if (skipFunction(MF.getFunction()))
1388 | return false; |
1389 | |
1390 | return SIPeepholeSDWA().run(MF); |
1391 | } |
1392 | |
1393 | bool SIPeepholeSDWA::run(MachineFunction &MF) { |
1394 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1395 | |
1396 | if (!ST.hasSDWA()) |
1397 | return false; |
1398 | |
1399 | MRI = &MF.getRegInfo(); |
1400 | TRI = ST.getRegisterInfo(); |
1401 | TII = ST.getInstrInfo(); |
1402 | |
1403 | // Find all SDWA operands in MF. |
1404 | bool Ret = false; |
1405 | for (MachineBasicBlock &MBB : MF) { |
1406 | bool Changed = false; |
1407 | do { |
1408 | // Preprocess the ADD/SUB pairs so they could be SDWA'ed. |
1409 | // Look for a possible ADD or SUB that resulted from a previously lowered |
1410 | // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2 |
1411 | // lowers the pair of instructions into e32 form. |
1412 | matchSDWAOperands(MBB); |
1413 | for (const auto &OperandPair : SDWAOperands) { |
1414 | const auto &Operand = OperandPair.second; |
1415 | MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST); |
1416 | if (!PotentialMI) |
1417 | continue; |
1418 | |
1419 | switch (PotentialMI->getOpcode()) { |
1420 | case AMDGPU::V_ADD_CO_U32_e64: |
1421 | case AMDGPU::V_SUB_CO_U32_e64: |
        pseudoOpConvertToVOP2(*PotentialMI, ST);
1423 | break; |
1424 | case AMDGPU::V_CNDMASK_B32_e64: |
        convertVcndmaskToVOP2(*PotentialMI, ST);
1426 | break; |
1427 | }; |
1428 | } |
1429 | SDWAOperands.clear(); |
1430 | |
1431 | // Generate potential match list. |
1432 | matchSDWAOperands(MBB); |
1433 | |
1434 | for (const auto &OperandPair : SDWAOperands) { |
1435 | const auto &Operand = OperandPair.second; |
      MachineInstr *PotentialMI =
          Operand->potentialToConvert(TII, ST, &PotentialMatches);
1438 | |
      if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
        PotentialMatches[PotentialMI].push_back(Operand.get());
1441 | } |
1442 | |
1443 | for (auto &PotentialPair : PotentialMatches) { |
1444 | MachineInstr &PotentialMI = *PotentialPair.first; |
    convertToSDWA(PotentialMI, PotentialPair.second);
1446 | } |
1447 | |
1448 | PotentialMatches.clear(); |
1449 | SDWAOperands.clear(); |
1450 | |
1451 | Changed = !ConvertedInstructions.empty(); |
1452 | |
1453 | if (Changed) |
1454 | Ret = true; |
1455 | while (!ConvertedInstructions.empty()) |
      legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
1457 | } while (Changed); |
1458 | } |
1459 | |
1460 | return Ret; |
1461 | } |
1462 | |
1463 | PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF, |
1464 | MachineFunctionAnalysisManager &) { |
1465 | if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF)) |
1466 | return PreservedAnalyses::all(); |
1467 | |
1468 | PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses(); |
1469 | PA.preserveSet<CFGAnalyses>(); |
1470 | return PA; |
1471 | } |
1472 | |