| 1 | //===- GCNVOPDUtils.cpp - GCN VOPD Utils ------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
/// \file This file contains the AMDGPU DAG scheduling
/// mutation to pair VOPD instructions back to back. It also contains
/// subroutines useful in the creation of VOPD instructions.
| 12 | // |
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #include "GCNVOPDUtils.h" |
| 16 | #include "AMDGPUSubtarget.h" |
| 17 | #include "GCNSubtarget.h" |
| 18 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 19 | #include "SIInstrInfo.h" |
| 20 | #include "Utils/AMDGPUBaseInfo.h" |
| 21 | #include "llvm/ADT/STLExtras.h" |
| 22 | #include "llvm/ADT/SmallVector.h" |
| 23 | #include "llvm/CodeGen/MachineBasicBlock.h" |
| 24 | #include "llvm/CodeGen/MachineInstr.h" |
| 25 | #include "llvm/CodeGen/MachineOperand.h" |
| 26 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
| 27 | #include "llvm/CodeGen/MacroFusion.h" |
| 28 | #include "llvm/CodeGen/ScheduleDAG.h" |
| 29 | #include "llvm/CodeGen/ScheduleDAGMutation.h" |
| 30 | #include "llvm/CodeGen/TargetInstrInfo.h" |
| 31 | #include "llvm/MC/MCInst.h" |
| 32 | |
| 33 | using namespace llvm; |
| 34 | |
| 35 | #define DEBUG_TYPE "gcn-vopd-utils" |
| 36 | |
| 37 | bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, |
| 38 | const MachineInstr &MIX, |
| 39 | const MachineInstr &MIY, bool IsVOPD3, |
| 40 | bool AllowSameVGPR) { |
| 41 | namespace VOPD = AMDGPU::VOPD; |
| 42 | |
| 43 | const MachineFunction *MF = MIX.getMF(); |
| 44 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
| 45 | |
| 46 | if (IsVOPD3 && !ST.hasVOPD3()) |
| 47 | return false; |
| 48 | if (!IsVOPD3 && (TII.isVOP3(MI: MIX) || TII.isVOP3(MI: MIY))) |
| 49 | return false; |
| 50 | if (TII.isDPP(MI: MIX) || TII.isDPP(MI: MIY)) |
| 51 | return false; |
| 52 | |
| 53 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
| 54 | const MachineRegisterInfo &MRI = MF->getRegInfo(); |
| 55 | // Literals also count against scalar bus limit |
| 56 | SmallVector<const MachineOperand *> UniqueLiterals; |
| 57 | auto addLiteral = [&](const MachineOperand &Op) { |
| 58 | for (auto &Literal : UniqueLiterals) { |
| 59 | if (Literal->isIdenticalTo(Other: Op)) |
| 60 | return; |
| 61 | } |
| 62 | UniqueLiterals.push_back(Elt: &Op); |
| 63 | }; |
| 64 | SmallSet<Register, 4> UniqueScalarRegs; |
| 65 | |
| 66 | auto getVRegIdx = [&](unsigned OpcodeIdx, unsigned OperandIdx) { |
| 67 | const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? MIX : MIY; |
| 68 | const MachineOperand &Operand = MI.getOperand(i: OperandIdx); |
| 69 | if (Operand.isReg() && TRI->isVectorRegister(MRI, Reg: Operand.getReg())) |
| 70 | return Operand.getReg(); |
| 71 | return Register(); |
| 72 | }; |
| 73 | |
| 74 | auto InstInfo = AMDGPU::getVOPDInstInfo(OpX: MIX.getDesc(), OpY: MIY.getDesc()); |
| 75 | |
| 76 | for (auto CompIdx : VOPD::COMPONENTS) { |
| 77 | const MachineInstr &MI = (CompIdx == VOPD::X) ? MIX : MIY; |
| 78 | |
| 79 | const MachineOperand &Src0 = *TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src0); |
| 80 | if (Src0.isReg()) { |
| 81 | if (!TRI->isVectorRegister(MRI, Reg: Src0.getReg())) { |
| 82 | UniqueScalarRegs.insert(V: Src0.getReg()); |
| 83 | } |
| 84 | } else if (!TII.isInlineConstant(MO: Src0)) { |
| 85 | if (IsVOPD3) |
| 86 | return false; |
| 87 | addLiteral(Src0); |
| 88 | } |
| 89 | |
| 90 | if (InstInfo[CompIdx].hasMandatoryLiteral()) { |
| 91 | if (IsVOPD3) |
| 92 | return false; |
| 93 | |
| 94 | auto CompOprIdx = InstInfo[CompIdx].getMandatoryLiteralCompOperandIndex(); |
| 95 | addLiteral(MI.getOperand(i: CompOprIdx)); |
| 96 | } |
| 97 | if (MI.getDesc().hasImplicitUseOfPhysReg(Reg: AMDGPU::VCC)) |
| 98 | UniqueScalarRegs.insert(V: AMDGPU::VCC_LO); |
| 99 | |
| 100 | if (IsVOPD3) { |
| 101 | for (auto OpName : {AMDGPU::OpName::src1, AMDGPU::OpName::src2}) { |
| 102 | const MachineOperand *Src = TII.getNamedOperand(MI, OperandName: OpName); |
| 103 | if (!Src) |
| 104 | continue; |
| 105 | if (OpName == AMDGPU::OpName::src2) { |
| 106 | if (AMDGPU::hasNamedOperand(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::bitop3)) |
| 107 | continue; |
| 108 | if (MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) { |
| 109 | UniqueScalarRegs.insert(V: Src->getReg()); |
| 110 | continue; |
| 111 | } |
| 112 | } |
| 113 | if (!Src->isReg() || !TRI->isVGPR(MRI, Reg: Src->getReg())) |
| 114 | return false; |
| 115 | } |
| 116 | |
| 117 | for (auto OpName : {AMDGPU::OpName::clamp, AMDGPU::OpName::omod, |
| 118 | AMDGPU::OpName::op_sel}) { |
| 119 | if (TII.hasModifiersSet(MI, OpName)) |
| 120 | return false; |
| 121 | } |
| 122 | |
| 123 | // Neg is allowed, other modifiers are not. NB: even though sext has the |
| 124 | // same value as neg, there are no combinable instructions with sext. |
| 125 | for (auto OpName : |
| 126 | {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, |
| 127 | AMDGPU::OpName::src2_modifiers}) { |
| 128 | const MachineOperand *Mods = TII.getNamedOperand(MI, OperandName: OpName); |
| 129 | if (Mods && (Mods->getImm() & ~SISrcMods::NEG)) |
| 130 | return false; |
| 131 | } |
| 132 | } |
| 133 | } |
| 134 | |
| 135 | if (UniqueLiterals.size() > 1) |
| 136 | return false; |
| 137 | if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2) |
| 138 | return false; |
| 139 | |
| 140 | // On GFX1170+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 |
| 141 | // source-cache. |
| 142 | bool SkipSrc = (ST.hasGFX11_7Insts() || ST.hasGFX12Insts()) && |
| 143 | MIX.getOpcode() == AMDGPU::V_MOV_B32_e32 && |
| 144 | MIY.getOpcode() == AMDGPU::V_MOV_B32_e32; |
| 145 | |
| 146 | if (InstInfo.hasInvalidOperand(GetRegIdx: getVRegIdx, MRI: *TRI, SkipSrc, AllowSameVGPR, |
| 147 | VOPD3: IsVOPD3)) |
| 148 | return false; |
| 149 | |
| 150 | if (IsVOPD3) { |
| 151 | // BITOP3 can be converted to DUAL_BITOP2 only if src2 is zero. |
| 152 | // MIX check is only relevant to scheduling? |
| 153 | if (AMDGPU::hasNamedOperand(Opcode: MIX.getOpcode(), NamedIdx: AMDGPU::OpName::bitop3)) { |
| 154 | const MachineOperand &Src2 = |
| 155 | *TII.getNamedOperand(MI: MIX, OperandName: AMDGPU::OpName::src2); |
| 156 | if (!Src2.isImm() || Src2.getImm()) |
| 157 | return false; |
| 158 | } |
| 159 | if (AMDGPU::hasNamedOperand(Opcode: MIY.getOpcode(), NamedIdx: AMDGPU::OpName::bitop3)) { |
| 160 | const MachineOperand &Src2 = |
| 161 | *TII.getNamedOperand(MI: MIY, OperandName: AMDGPU::OpName::src2); |
| 162 | if (!Src2.isImm() || Src2.getImm()) |
| 163 | return false; |
| 164 | } |
| 165 | } |
| 166 | |
| 167 | LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << MIX |
| 168 | << "\n\tY: " << MIY << "\n" ); |
| 169 | return true; |
| 170 | } |
| 171 | |
| 172 | /// Core pair-eligibility check for a single VOPD encoding variant (VOPD or |
| 173 | /// VOPD3). Returns the X/Y assignment on success, or std::nullopt otherwise. |
| 174 | static std::optional<VOPDMatchInfo> |
| 175 | tryMatchVOPDPairVariant(const SIInstrInfo &TII, unsigned EncodingFamily, |
| 176 | MachineInstr &FirstMI, MachineInstr &SecondMI, |
| 177 | bool IsVOPD3) { |
| 178 | unsigned Opc = FirstMI.getOpcode(); |
| 179 | unsigned Opc2 = SecondMI.getOpcode(); |
| 180 | AMDGPU::CanBeVOPD FirstCanBeVOPD = |
| 181 | AMDGPU::getCanBeVOPD(Opc, EncodingFamily, VOPD3: IsVOPD3); |
| 182 | AMDGPU::CanBeVOPD SecondCanBeVOPD = |
| 183 | AMDGPU::getCanBeVOPD(Opc: Opc2, EncodingFamily, VOPD3: IsVOPD3); |
| 184 | |
| 185 | if (!(FirstCanBeVOPD.X && SecondCanBeVOPD.Y) && |
| 186 | !(FirstCanBeVOPD.Y && SecondCanBeVOPD.X)) |
| 187 | return std::nullopt; |
| 188 | |
| 189 | // If SecondMI depends on FirstMI they cannot execute at the same time. |
| 190 | if (TII.hasRAWDependency(FirstMI, SecondMI)) |
| 191 | return std::nullopt; |
| 192 | |
| 193 | const GCNSubtarget &ST = TII.getSubtarget(); |
| 194 | bool AllowSameVGPR = ST.hasGFX12Insts(); |
| 195 | |
| 196 | if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) { |
| 197 | if (checkVOPDRegConstraints(TII, MIX: FirstMI, MIY: SecondMI, IsVOPD3, AllowSameVGPR)) |
| 198 | return VOPDMatchInfo{.MIX: &FirstMI, .MIY: &SecondMI, .IsVOPD3: IsVOPD3}; |
| 199 | } |
| 200 | |
| 201 | if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X) { |
| 202 | // AllowSameVGPR relaxes the VGPR bank overlap check for source operands. |
| 203 | // Only enable it when there is no antidependency. |
| 204 | bool IsAntiDep = TII.hasRAWDependency(FirstMI: SecondMI, SecondMI: FirstMI); |
| 205 | AllowSameVGPR &= !IsAntiDep; |
| 206 | if (IsAntiDep && !TII.isVOPDAntidependencyAllowed(MI: SecondMI)) |
| 207 | return std::nullopt; |
| 208 | if (checkVOPDRegConstraints(TII, MIX: SecondMI, MIY: FirstMI, IsVOPD3, AllowSameVGPR)) |
| 209 | return VOPDMatchInfo{.MIX: &SecondMI, .MIY: &FirstMI, .IsVOPD3: IsVOPD3}; |
| 210 | } |
| 211 | |
| 212 | return std::nullopt; |
| 213 | } |
| 214 | |
| 215 | std::optional<VOPDMatchInfo> llvm::tryMatchVOPDPair(const SIInstrInfo &TII, |
| 216 | MachineInstr &FirstMI, |
| 217 | MachineInstr &SecondMI) { |
| 218 | const GCNSubtarget &ST = TII.getSubtarget(); |
| 219 | unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(ST); |
| 220 | if (auto Match = tryMatchVOPDPairVariant(TII, EncodingFamily, FirstMI, |
| 221 | SecondMI, /*IsVOPD3=*/false)) |
| 222 | return Match; |
| 223 | if (ST.hasVOPD3()) |
| 224 | return tryMatchVOPDPairVariant(TII, EncodingFamily, FirstMI, SecondMI, |
| 225 | /*IsVOPD3=*/true); |
| 226 | return std::nullopt; |
| 227 | } |
| 228 | |
| 229 | /// Check if the instr pair, FirstMI and SecondMI, should be scheduled |
| 230 | /// together. Given SecondMI, when FirstMI is unspecified, then check if |
| 231 | /// SecondMI may be part of a fused pair at all. |
| 232 | static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII, |
| 233 | const TargetSubtargetInfo &TSI, |
| 234 | const MachineInstr *FirstMI, |
| 235 | const MachineInstr &SecondMI) { |
| 236 | const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII); |
| 237 | const GCNSubtarget &ST = STII.getSubtarget(); |
| 238 | |
| 239 | // One instruction case: just check whether SecondMI is eligible at all. |
| 240 | if (!FirstMI) { |
| 241 | unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(ST); |
| 242 | unsigned Opc2 = SecondMI.getOpcode(); |
| 243 | auto checkCanBeVOPD = [&](bool VOPD3) { |
| 244 | AMDGPU::CanBeVOPD CanBeVOPD = |
| 245 | AMDGPU::getCanBeVOPD(Opc: Opc2, EncodingFamily, VOPD3); |
| 246 | return CanBeVOPD.Y || CanBeVOPD.X; |
| 247 | }; |
| 248 | return checkCanBeVOPD(false) || (ST.hasVOPD3() && checkCanBeVOPD(true)); |
| 249 | } |
| 250 | |
| 251 | #ifdef EXPENSIVE_CHECKS |
| 252 | assert([&]() -> bool { |
| 253 | for (auto MII = MachineBasicBlock::const_iterator(FirstMI); |
| 254 | MII != FirstMI->getParent()->instr_end(); ++MII) { |
| 255 | if (&*MII == &SecondMI) |
| 256 | return true; |
| 257 | } |
| 258 | return false; |
| 259 | }() && "Expected FirstMI to precede SecondMI" ); |
| 260 | #endif |
| 261 | |
| 262 | return tryMatchVOPDPair(TII: STII, FirstMI&: *const_cast<MachineInstr *>(FirstMI), |
| 263 | SecondMI&: const_cast<MachineInstr &>(SecondMI)) |
| 264 | .has_value(); |
| 265 | } |
| 266 | |
| 267 | /// Collect all load (dependents if \p Forward else dependencies) that connect |
| 268 | /// to the \p Head SU. |
| 269 | /// \p Visited should allocate enough bits for the number of SUnits, but its |
| 270 | /// value can otherwise be uninitialized. |
| 271 | static void collectLoads(SmallPtrSet<SUnit *, 8> &Loads, BitVector &Visited, |
| 272 | SUnit &Head, bool Forward, bool StopAtLoads) { |
| 273 | if (Head.isBoundaryNode()) |
| 274 | return; |
| 275 | |
| 276 | Visited.reset(); |
| 277 | |
| 278 | SmallVector<SUnit *> Stack; |
| 279 | Stack.push_back(Elt: &Head); |
| 280 | while (!Stack.empty()) { |
| 281 | SUnit *SU = Stack.pop_back_val(); |
| 282 | const SmallVector<SDep, 4> &Deps = Forward ? SU->Succs : SU->Preds; |
| 283 | for (const SDep &Edge : Deps) { |
| 284 | if (StopAtLoads && Edge.getKind() != SDep::Data) |
| 285 | continue; |
| 286 | SUnit *Dep = Edge.getSUnit(); |
| 287 | if (Dep->isBoundaryNode() || Visited.test(Idx: Dep->NodeNum)) |
| 288 | continue; |
| 289 | Visited.set(Dep->NodeNum); |
| 290 | |
| 291 | if (Dep->isInstr() && Dep->getInstr()->mayLoad()) { |
| 292 | Loads.insert(Ptr: Dep); |
| 293 | if (StopAtLoads) |
| 294 | continue; |
| 295 | } |
| 296 | Stack.push_back(Elt: Dep); |
| 297 | } |
| 298 | } |
| 299 | } |
| 300 | |
| 301 | /// Checks whether fusing SU \p I with SU \p J would force the loads preceding |
| 302 | /// \p J to complete before loads depending on \p I. |
| 303 | /// |
| 304 | /// \p ILoadSuccs should hold all first load successors of \p I (via |
| 305 | /// collectLoads with StopAtLoads=true). For set bits in \p LoadPredsComputed, |
| 306 | /// the corresponding set in \p LoadPredsCache should hold all transitive load |
| 307 | /// dependencies (via collectLoads with StopAtLoads=false). The \p Scratch |
| 308 | /// bitvector should allocate enough bits for the number of SUnits. |
| 309 | static bool loadsMayOverlap( |
| 310 | [[maybe_unused]] SUnit &I, const SmallPtrSet<SUnit *, 8> &ILoadSuccs, |
| 311 | SUnit &J, BitVector &LoadPredsComputed, |
| 312 | SmallVector<SmallPtrSet<SUnit *, 8>> &LoadPredsCache, BitVector &Scratch) { |
| 313 | |
| 314 | if (ILoadSuccs.empty()) |
| 315 | return false; |
| 316 | |
| 317 | SmallPtrSet<SUnit *, 8> &JLoadPreds = LoadPredsCache[J.NodeNum]; |
| 318 | if (!LoadPredsComputed.test(Idx: J.NodeNum)) { |
| 319 | collectLoads(Loads&: JLoadPreds, Visited&: Scratch, Head&: J, /*Forward=*/false, |
| 320 | /*StopAtLoads=*/true); |
| 321 | LoadPredsComputed.set(J.NodeNum); |
| 322 | } |
| 323 | if (JLoadPreds.empty()) |
| 324 | return false; |
| 325 | |
| 326 | for (SUnit *ILoad : ILoadSuccs) { |
| 327 | SmallPtrSet<SUnit *, 8> &ILoadDeps = LoadPredsCache[ILoad->NodeNum]; |
| 328 | if (!LoadPredsComputed.test(Idx: ILoad->NodeNum)) { |
| 329 | collectLoads(Loads&: ILoadDeps, Visited&: Scratch, Head&: *ILoad, /*Forward=*/false, |
| 330 | /*StopAtLoads=*/false); |
| 331 | LoadPredsComputed.set(ILoad->NodeNum); |
| 332 | } |
| 333 | |
| 334 | for (SUnit *JLoad : JLoadPreds) { |
| 335 | if (ILoad == JLoad) { |
| 336 | LLVM_DEBUG( |
| 337 | dbgs() << "Will not pair SU(" << I.NodeNum << ") with SU(" |
| 338 | << J.NodeNum << ")\n" |
| 339 | << " Fusion would introduce a cyclic dependency with SU(" |
| 340 | << ILoad->NodeNum << ")\n" ); |
| 341 | return true; |
| 342 | } |
| 343 | |
| 344 | if (!ILoadDeps.contains(Ptr: JLoad)) { |
| 345 | LLVM_DEBUG(dbgs() << "Will not pair SU(" << I.NodeNum << ") with SU(" |
| 346 | << J.NodeNum << ")\n" |
| 347 | << " Fusion may force SU(" << JLoad->NodeNum |
| 348 | << ") to complete its load before dispatching SU(" |
| 349 | << ILoad->NodeNum << ")\n" ); |
| 350 | return true; |
| 351 | } |
| 352 | } |
| 353 | } |
| 354 | return false; |
| 355 | } |
| 356 | |
| 357 | namespace { |
| 358 | /// Adapts design from MacroFusion |
| 359 | /// Puts valid candidate instructions back-to-back so they can easily |
| 360 | /// be turned into VOPD instructions |
| 361 | /// Greedily pairs instruction candidates. O(n^2) algorithm. |
| 362 | struct VOPDPairingMutation : ScheduleDAGMutation { |
| 363 | MacroFusionPredTy shouldScheduleAdjacent; // NOLINT: function pointer |
| 364 | |
| 365 | VOPDPairingMutation( |
| 366 | MacroFusionPredTy shouldScheduleAdjacent) // NOLINT: function pointer |
| 367 | : shouldScheduleAdjacent(shouldScheduleAdjacent) {} |
| 368 | |
| 369 | void apply(ScheduleDAGInstrs *DAG) override { |
| 370 | const TargetInstrInfo &TII = *DAG->TII; |
| 371 | const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); |
| 372 | if (!AMDGPU::hasVOPD(STI: ST) || !ST.isWave32()) { |
| 373 | LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n" ); |
| 374 | return; |
| 375 | } |
| 376 | |
| 377 | BitVector VOPDCapable(DAG->SUnits.size()); |
| 378 | unsigned IIdx = 0; |
| 379 | // Pre-compute whether each individual instruction can be VOPD |
| 380 | for (auto ISUI = DAG->SUnits.begin(), E = DAG->SUnits.end(); ISUI != E; |
| 381 | ++ISUI, ++IIdx) { |
| 382 | const MachineInstr *IMI = ISUI->getInstr(); |
| 383 | if (shouldScheduleAdjacent(TII, ST, nullptr, *IMI) && |
| 384 | hasLessThanNumFused(SU: *ISUI, FuseLimit: 2)) |
| 385 | VOPDCapable[IIdx] = true; |
| 386 | } |
| 387 | |
| 388 | IIdx = 0; |
| 389 | SmallPtrSet<SUnit *, 8> ILoadSuccs; |
| 390 | |
| 391 | // Cache collected load predecessors. |
| 392 | // For VOPDCapable nodes, this caches collectLoads with StopAtLoads=true |
| 393 | // For loads, this caches collectLoads with StopAtLoads=false |
| 394 | BitVector LoadPredsComputed(DAG->SUnits.size()); |
| 395 | SmallVector<SmallPtrSet<SUnit *, 8>> LoadPredsCache(DAG->SUnits.size()); |
| 396 | |
| 397 | BitVector Scratch(DAG->SUnits.size()); |
| 398 | for (auto ISUI = DAG->SUnits.begin(), E = DAG->SUnits.end(); ISUI != E; |
| 399 | ++ISUI, ++IIdx) { |
| 400 | if (!VOPDCapable[IIdx]) |
| 401 | continue; |
| 402 | const MachineInstr *IMI = ISUI->getInstr(); |
| 403 | |
| 404 | ILoadSuccs.clear(); |
| 405 | collectLoads(Loads&: ILoadSuccs, Visited&: Scratch, Head&: *ISUI, /*Forward=*/true, |
| 406 | /*StopAtLoads=*/true); |
| 407 | |
| 408 | unsigned JIdx = IIdx + 1; |
| 409 | for (auto JSUI = ISUI + 1; JSUI != E; ++JSUI, ++JIdx) { |
| 410 | if (!VOPDCapable[JIdx] || JSUI->isBoundaryNode()) |
| 411 | continue; |
| 412 | const MachineInstr *JMI = JSUI->getInstr(); |
| 413 | if (!hasLessThanNumFused(SU: *JSUI, FuseLimit: 2) || |
| 414 | !shouldScheduleAdjacent(TII, ST, IMI, *JMI)) |
| 415 | continue; |
| 416 | |
| 417 | if (loadsMayOverlap(I&: *ISUI, ILoadSuccs, J&: *JSUI, LoadPredsComputed, |
| 418 | LoadPredsCache, Scratch)) |
| 419 | continue; |
| 420 | |
| 421 | if (fuseInstructionPair(DAG&: *DAG, FirstSU&: *ISUI, SecondSU&: *JSUI)) { |
| 422 | // Clear to prevent future checks/fusing |
| 423 | VOPDCapable[JIdx] = false; |
| 424 | break; |
| 425 | } |
| 426 | } |
| 427 | } |
| 428 | LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n" ); |
| 429 | } |
| 430 | }; |
| 431 | } // namespace |
| 432 | |
| 433 | std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() { |
| 434 | return std::make_unique<VOPDPairingMutation>(args&: shouldScheduleVOPDAdjacent); |
| 435 | } |
| 436 | |