| 1 | //===- GCNVOPDUtils.cpp - GCN VOPD Utils ------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
/// \file This file contains the AMDGPU DAG scheduling
/// mutation to pair VOPD instructions back to back. It also contains
/// subroutines useful in the creation of VOPD instructions.
//
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #include "GCNVOPDUtils.h" |
| 16 | #include "AMDGPUSubtarget.h" |
| 17 | #include "GCNSubtarget.h" |
| 18 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 19 | #include "SIInstrInfo.h" |
| 20 | #include "Utils/AMDGPUBaseInfo.h" |
| 21 | #include "llvm/ADT/STLExtras.h" |
| 22 | #include "llvm/ADT/SmallVector.h" |
| 23 | #include "llvm/CodeGen/MachineBasicBlock.h" |
| 24 | #include "llvm/CodeGen/MachineInstr.h" |
| 25 | #include "llvm/CodeGen/MachineOperand.h" |
| 26 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
| 27 | #include "llvm/CodeGen/MacroFusion.h" |
| 28 | #include "llvm/CodeGen/ScheduleDAG.h" |
| 29 | #include "llvm/CodeGen/ScheduleDAGMutation.h" |
| 30 | #include "llvm/CodeGen/TargetInstrInfo.h" |
| 31 | #include "llvm/MC/MCInst.h" |
| 32 | |
| 33 | using namespace llvm; |
| 34 | |
| 35 | #define DEBUG_TYPE "gcn-vopd-utils" |
| 36 | |
| 37 | bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, |
| 38 | const MachineInstr &MIX, |
| 39 | const MachineInstr &MIY, bool IsVOPD3) { |
| 40 | namespace VOPD = AMDGPU::VOPD; |
| 41 | |
| 42 | const MachineFunction *MF = MIX.getMF(); |
| 43 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
| 44 | |
| 45 | if (IsVOPD3 && !ST.hasVOPD3()) |
| 46 | return false; |
| 47 | if (!IsVOPD3 && (TII.isVOP3(MI: MIX) || TII.isVOP3(MI: MIY))) |
| 48 | return false; |
| 49 | if (TII.isDPP(MI: MIX) || TII.isDPP(MI: MIY)) |
| 50 | return false; |
| 51 | |
| 52 | const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(Val: ST.getRegisterInfo()); |
| 53 | const MachineRegisterInfo &MRI = MF->getRegInfo(); |
| 54 | // Literals also count against scalar bus limit |
| 55 | SmallVector<const MachineOperand *> UniqueLiterals; |
| 56 | auto addLiteral = [&](const MachineOperand &Op) { |
| 57 | for (auto &Literal : UniqueLiterals) { |
| 58 | if (Literal->isIdenticalTo(Other: Op)) |
| 59 | return; |
| 60 | } |
| 61 | UniqueLiterals.push_back(Elt: &Op); |
| 62 | }; |
| 63 | SmallVector<Register> UniqueScalarRegs; |
| 64 | |
| 65 | // MIX must not modify any registers used by MIY. |
| 66 | for (const auto &Use : MIY.uses()) |
| 67 | if (Use.isReg() && MIX.modifiesRegister(Reg: Use.getReg(), TRI)) |
| 68 | return false; |
| 69 | |
| 70 | auto getVRegIdx = [&](unsigned OpcodeIdx, unsigned OperandIdx) { |
| 71 | const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? MIX : MIY; |
| 72 | const MachineOperand &Operand = MI.getOperand(i: OperandIdx); |
| 73 | if (Operand.isReg() && TRI->isVectorRegister(MRI, Reg: Operand.getReg())) |
| 74 | return Operand.getReg(); |
| 75 | return Register(); |
| 76 | }; |
| 77 | |
| 78 | auto InstInfo = AMDGPU::getVOPDInstInfo(OpX: MIX.getDesc(), OpY: MIY.getDesc()); |
| 79 | |
| 80 | for (auto CompIdx : VOPD::COMPONENTS) { |
| 81 | const MachineInstr &MI = (CompIdx == VOPD::X) ? MIX : MIY; |
| 82 | |
| 83 | const MachineOperand &Src0 = *TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src0); |
| 84 | if (Src0.isReg()) { |
| 85 | if (!TRI->isVectorRegister(MRI, Reg: Src0.getReg())) { |
| 86 | if (!is_contained(Range&: UniqueScalarRegs, Element: Src0.getReg())) |
| 87 | UniqueScalarRegs.push_back(Elt: Src0.getReg()); |
| 88 | } |
| 89 | } else if (!TII.isInlineConstant(MO: Src0)) { |
| 90 | if (IsVOPD3) |
| 91 | return false; |
| 92 | addLiteral(Src0); |
| 93 | } |
| 94 | |
| 95 | if (InstInfo[CompIdx].hasMandatoryLiteral()) { |
| 96 | if (IsVOPD3) |
| 97 | return false; |
| 98 | |
| 99 | auto CompOprIdx = InstInfo[CompIdx].getMandatoryLiteralCompOperandIndex(); |
| 100 | addLiteral(MI.getOperand(i: CompOprIdx)); |
| 101 | } |
| 102 | if (MI.getDesc().hasImplicitUseOfPhysReg(Reg: AMDGPU::VCC)) |
| 103 | UniqueScalarRegs.push_back(Elt: AMDGPU::VCC_LO); |
| 104 | |
| 105 | if (IsVOPD3) { |
| 106 | for (auto OpName : {AMDGPU::OpName::src1, AMDGPU::OpName::src2}) { |
| 107 | const MachineOperand *Src = TII.getNamedOperand(MI, OperandName: OpName); |
| 108 | if (!Src) |
| 109 | continue; |
| 110 | if (OpName == AMDGPU::OpName::src2) { |
| 111 | if (AMDGPU::hasNamedOperand(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::bitop3)) |
| 112 | continue; |
| 113 | if (MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) { |
| 114 | UniqueScalarRegs.push_back(Elt: Src->getReg()); |
| 115 | continue; |
| 116 | } |
| 117 | } |
| 118 | if (!Src->isReg() || !TRI->isVGPR(MRI, Reg: Src->getReg())) |
| 119 | return false; |
| 120 | } |
| 121 | |
| 122 | for (auto OpName : {AMDGPU::OpName::clamp, AMDGPU::OpName::omod, |
| 123 | AMDGPU::OpName::op_sel}) { |
| 124 | if (TII.hasModifiersSet(MI, OpName)) |
| 125 | return false; |
| 126 | } |
| 127 | |
| 128 | // Neg is allowed, other modifiers are not. NB: even though sext has the |
| 129 | // same value as neg, there are no combinable instructions with sext. |
| 130 | for (auto OpName : |
| 131 | {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, |
| 132 | AMDGPU::OpName::src2_modifiers}) { |
| 133 | const MachineOperand *Mods = TII.getNamedOperand(MI, OperandName: OpName); |
| 134 | if (Mods && (Mods->getImm() & ~SISrcMods::NEG)) |
| 135 | return false; |
| 136 | } |
| 137 | } |
| 138 | } |
| 139 | |
| 140 | if (UniqueLiterals.size() > 1) |
| 141 | return false; |
| 142 | if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2) |
| 143 | return false; |
| 144 | |
| 145 | // On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 |
| 146 | // source-cache. |
| 147 | bool SkipSrc = ST.getGeneration() >= AMDGPUSubtarget::GFX12 && |
| 148 | MIX.getOpcode() == AMDGPU::V_MOV_B32_e32 && |
| 149 | MIY.getOpcode() == AMDGPU::V_MOV_B32_e32; |
| 150 | bool AllowSameVGPR = ST.hasGFX1250Insts(); |
| 151 | |
| 152 | if (InstInfo.hasInvalidOperand(GetRegIdx: getVRegIdx, MRI: *TRI, SkipSrc, AllowSameVGPR, |
| 153 | VOPD3: IsVOPD3)) |
| 154 | return false; |
| 155 | |
| 156 | if (IsVOPD3) { |
| 157 | // BITOP3 can be converted to DUAL_BITOP2 only if src2 is zero. |
| 158 | // MIX check is only relevant to scheduling? |
| 159 | if (AMDGPU::hasNamedOperand(Opcode: MIX.getOpcode(), NamedIdx: AMDGPU::OpName::bitop3)) { |
| 160 | const MachineOperand &Src2 = |
| 161 | *TII.getNamedOperand(MI: MIX, OperandName: AMDGPU::OpName::src2); |
| 162 | if (!Src2.isImm() || Src2.getImm()) |
| 163 | return false; |
| 164 | } |
| 165 | if (AMDGPU::hasNamedOperand(Opcode: MIY.getOpcode(), NamedIdx: AMDGPU::OpName::bitop3)) { |
| 166 | const MachineOperand &Src2 = |
| 167 | *TII.getNamedOperand(MI: MIY, OperandName: AMDGPU::OpName::src2); |
| 168 | if (!Src2.isImm() || Src2.getImm()) |
| 169 | return false; |
| 170 | } |
| 171 | } |
| 172 | |
| 173 | LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << MIX |
| 174 | << "\n\tY: " << MIY << "\n" ); |
| 175 | return true; |
| 176 | } |
| 177 | |
| 178 | /// Check if the instr pair, FirstMI and SecondMI, should be scheduled |
| 179 | /// together. Given SecondMI, when FirstMI is unspecified, then check if |
| 180 | /// SecondMI may be part of a fused pair at all. |
| 181 | static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII, |
| 182 | const TargetSubtargetInfo &TSI, |
| 183 | const MachineInstr *FirstMI, |
| 184 | const MachineInstr &SecondMI) { |
| 185 | const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII); |
| 186 | const GCNSubtarget &ST = STII.getSubtarget(); |
| 187 | unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(ST); |
| 188 | unsigned Opc2 = SecondMI.getOpcode(); |
| 189 | |
| 190 | const auto checkVOPD = [&](bool VOPD3) -> bool { |
| 191 | auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc: Opc2, EncodingFamily, VOPD3); |
| 192 | |
| 193 | // One instruction case |
| 194 | if (!FirstMI) |
| 195 | return SecondCanBeVOPD.Y || SecondCanBeVOPD.X; |
| 196 | |
| 197 | unsigned Opc = FirstMI->getOpcode(); |
| 198 | auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc, EncodingFamily, VOPD3); |
| 199 | |
| 200 | if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) || |
| 201 | (FirstCanBeVOPD.Y && SecondCanBeVOPD.X))) |
| 202 | return false; |
| 203 | |
| 204 | assert([&]() -> bool { |
| 205 | for (auto MII = MachineBasicBlock::const_iterator(FirstMI); |
| 206 | MII != FirstMI->getParent()->instr_end(); ++MII) { |
| 207 | if (&*MII == &SecondMI) |
| 208 | return true; |
| 209 | } |
| 210 | return false; |
| 211 | }() && "Expected FirstMI to precede SecondMI" ); |
| 212 | |
| 213 | return checkVOPDRegConstraints(TII: STII, MIX: *FirstMI, MIY: SecondMI, IsVOPD3: VOPD3); |
| 214 | }; |
| 215 | |
| 216 | return checkVOPD(false) || (ST.hasVOPD3() && checkVOPD(true)); |
| 217 | } |
| 218 | |
| 219 | namespace { |
| 220 | /// Adapts design from MacroFusion |
| 221 | /// Puts valid candidate instructions back-to-back so they can easily |
| 222 | /// be turned into VOPD instructions |
| 223 | /// Greedily pairs instruction candidates. O(n^2) algorithm. |
| 224 | struct VOPDPairingMutation : ScheduleDAGMutation { |
| 225 | MacroFusionPredTy shouldScheduleAdjacent; // NOLINT: function pointer |
| 226 | |
| 227 | VOPDPairingMutation( |
| 228 | MacroFusionPredTy shouldScheduleAdjacent) // NOLINT: function pointer |
| 229 | : shouldScheduleAdjacent(shouldScheduleAdjacent) {} |
| 230 | |
| 231 | void apply(ScheduleDAGInstrs *DAG) override { |
| 232 | const TargetInstrInfo &TII = *DAG->TII; |
| 233 | const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); |
| 234 | if (!AMDGPU::hasVOPD(STI: ST) || !ST.isWave32()) { |
| 235 | LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n" ); |
| 236 | return; |
| 237 | } |
| 238 | |
| 239 | std::vector<SUnit>::iterator ISUI, JSUI; |
| 240 | for (ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) { |
| 241 | const MachineInstr *IMI = ISUI->getInstr(); |
| 242 | if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI)) |
| 243 | continue; |
| 244 | if (!hasLessThanNumFused(SU: *ISUI, FuseLimit: 2)) |
| 245 | continue; |
| 246 | |
| 247 | for (JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) { |
| 248 | if (JSUI->isBoundaryNode()) |
| 249 | continue; |
| 250 | const MachineInstr *JMI = JSUI->getInstr(); |
| 251 | if (!hasLessThanNumFused(SU: *JSUI, FuseLimit: 2) || |
| 252 | !shouldScheduleAdjacent(TII, ST, IMI, *JMI)) |
| 253 | continue; |
| 254 | if (fuseInstructionPair(DAG&: *DAG, FirstSU&: *ISUI, SecondSU&: *JSUI)) |
| 255 | break; |
| 256 | } |
| 257 | } |
| 258 | LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n" ); |
| 259 | } |
| 260 | }; |
| 261 | } // namespace |
| 262 | |
| 263 | std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() { |
| 264 | return std::make_unique<VOPDPairingMutation>(args&: shouldScheduleVOPDAdjacent); |
| 265 | } |
| 266 | |