//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;

using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;

class SIPeepholeSDWA : public MachineFunctionPass {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  SDWAOperandsMap PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:

  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

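// Copy the register number, subregister index, and undef/kill/dead flags
// from operand From to operand To. Both operands must be registers.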
static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

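// If the register defined by Reg has exactly one (non-debug) using
// instruction and no subregister uses, return one of that instruction's use
// operands; otherwise return nullptr.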
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

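// Return the explicit def operand of the unique instruction that defines
// Reg's register, ignoring implicit defs; nullptr if there is no unique
// definition.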
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

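// Combine the src_modifiers already present on SrcOp's slot in its parent
// instruction with the abs/neg (float) or sext (integer) modifiers recorded
// in this operand.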
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted
      if (!isConvertibleToSDWA(UseMI, ST, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  }

  // Find the operand in the instruction that matches the replaced operand and
  // replace it with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
      // src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced register
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with the new
  // instruction's register definition.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter problems with uses of killed operands.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add implicit use of preserved register
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}

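// Try to reduce Op to a compile-time constant: either a direct immediate or
// an immediate materialized by a foldable copy such as S_MOV_B32.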
std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}

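// Try to recognize one instruction that implements an SDWA-style sub-dword
// access (shift by 8/16/24, BFE with a byte/word offset and width, AND with
// a byte/word mask, or v_or_b32 combining two SDWA results) and describe it
// as an SDWAOperand for later conversion.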
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B32_e64);
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B16_e64);
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
      if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
        return CheckRetType(std::nullopt);

      MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
      if (!Op1Def)
        return CheckRetType(std::nullopt);

      MachineInstr *Op1Inst = Op1Def->getParent();
      if (!TII->isSDWA(*Op1Inst))
        return CheckRetType(std::nullopt);

      MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
      if (!Op2Def)
        return CheckRetType(std::nullopt);

      return CheckRetType(std::pair(Op1Def, Op2Def));
    };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
    // that their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible bitness
    // or an SDWA instruction with a correct dst_sel.
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24-bit
    // out of full register size and all registers are at min 32-bit wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

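// Match SDWA operands for every instruction in MBB and record them in
// SDWAOperands.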
void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check that VCC is not modified in the range (MI,MISucc).
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.
  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

namespace {
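// Check whether MI either already is an SDWA instruction or has an SDWA
// equivalent that the subtarget can encode with MI's operands.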
bool isConvertibleToSDWA(MachineInstr &MI,
                         const GCNSubtarget &ST,
                         const SIInstrInfo* TII) {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
} // namespace

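// Rewrite MI as the corresponding SDWA instruction and apply all matched
// SDWAOperands to it. Returns false, erasing the new instruction and leaving
// MI unchanged, if none of the operands could be applied.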
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst: if it is present in the original instruction then it should
  // also be present in the SDWA form
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All SDWA instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel)) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused)) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold 2nd instruction into
    // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that
    // was already destroyed). So if SDWAOperand is also a potential MI then do
    // not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
    for (MachineOperand &MO : SDWAInst->uses()) {
      if (!MO.isReg())
        continue;

      MRI->clearKillFlags(MO.getReg());
    }
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR is allowed on GFX9). Copy its scalar operands into
// VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

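// Pass entry point: for each block, repeatedly shrink candidate
// V_{ADD|SUB}_CO_U32_e64 pairs into VOP2 form, match SDWA operands, convert
// the matched instructions, and legalize their scalar operands, until a
// fixed point is reached.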
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (PotentialMI &&
            (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
             PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}