| 1 | //===- AArch64SRLTDefineSuperRegs.cpp -------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // When SubRegister Liveness Tracking (SRLT) is enabled, this pass adds |
| 10 | // extra implicit-def's to instructions that define the low N bits of |
| 11 | // a GPR/FPR register to also define the top bits, because all AArch64 |
| 12 | // instructions that write the low bits of a GPR/FPR also implicitly zero |
| 13 | // the top bits. For example, 'mov w0, w1' writes zeroes to the top 32-bits of |
| 14 | // x0, so this pass adds an `implicit-def $x0` after register allocation. |
| 15 | // |
| 16 | // These semantics are originally represented in the MIR using `SUBREG_TO_REG` |
| 17 | // which expresses that the top bits have been defined by the preceding |
| 18 | // instructions, but during register coalescing this information is lost and in |
| 19 | // contrast to when SRLT is disabled, when rewriting virtual -> physical |
| 20 | // registers the implicit-defs are not added to the instruction. |
| 21 | // |
| 22 | // There have been several attempts to fix this in the coalescer [1], but each |
| 23 | // iteration has exposed new bugs and the patch had to be reverted. |
| 24 | // Additionally, the concept of adding 'implicit-def' of a virtual register is |
| 25 | // particularly fragile and many places don't expect it (for example in |
| 26 | // `X86::commuteInstructionImpl` the code only looks at specific operands and |
| 27 | // does not consider implicit-defs. Similarly in `SplitEditor::addDeadDef` where |
| 28 | // it traverses operand 'defs' rather than 'all_defs'). |
| 29 | // |
| 30 | // We want a temporary solution that doesn't impact other targets and is simpler |
| 31 | // and less intrusive than the patch proposed for the register coalescer [1], so |
| 32 | // that we can enable SRLT for AArch64. |
| 33 | // |
| 34 | // The approach here is to just add the 'implicit-def' manually after rewriting |
| 35 | // virtual regs -> physical regs. This still means that during the register |
| 36 | // allocation process the dependences are not accurately represented in the MIR |
| 37 | // and LiveIntervals, but there are several reasons why we believe this isn't a |
| 38 | // problem in practice: |
| 39 | // (A) The register allocator only spills entire virtual registers. |
| 40 | // This is additionally guarded by code in |
| 41 | // AArch64InstrInfo::storeRegToStackSlot/loadRegFromStackSlot |
| 42 | // where it checks if a register matches the expected register class. |
| 43 | // (B) Rematerialization only happens when the instruction writes the full |
| 44 | // register. |
| 45 | // (C) The high bits of the AArch64 register cannot be written independently. |
| 46 | // (D) Instructions that write only part of a register always take that same |
| 47 | // register as a tied input operand, to indicate it's a merging operation. |
| 48 | // |
| 49 | // (A) means that for two virtual registers of regclass GPR32 and GPR64, if the |
| 50 | // GPR32 register is coalesced into the GPR64 vreg then the full GPR64 would |
| 51 | // be spilled/filled even if only the low 32-bits would be required for the |
| 52 | // given liverange. (B) means that the top bits of a GPR64 would never be |
| 53 | // overwritten by rematerialising a GPR32 sub-register for a given liverange. |
| 54 | // (C-D) means that we can assume that the MIR as input to the register |
| 55 | // allocator correctly expresses the instruction behaviour and dependences |
| 56 | // between values, so unless the register allocator would violate (A) or (B), |
| 57 | // the MIR is otherwise sound. |
| 58 | // |
| 59 | // Alternative approaches have also been considered, such as: |
| 60 | // (1) Changing the AArch64 instruction definitions to write all bits and |
| 61 | // extract the low N bits for the result. |
| 62 | // (2) Disabling coalescing of SUBREG_TO_REG and using regalloc hints to tell |
| 63 | // the register allocator to favour the same register for the input/output. |
| 64 | // (3) Adding a new coalescer guard node with a tied-operand constraint, such |
| 65 | // that when the SUBREG_TO_REG is removed, something still represents that |
| 66 | // the top bits are defined. The node would get removed before rewriting |
| 67 | // virtregs. |
| 68 | // (4) Using an explicit INSERT_SUBREG into a zero value and try to optimize |
| 69 | // away the INSERT_SUBREG (this is a more explicit variant of (2) and (3)) |
| 70 | // (5) Adding a new MachineOperand flag that represents the top bits would be |
| 71 | // defined, but are not read nor undef. |
| 72 | // |
| 73 | // (1) would be the best approach but would be a significant effort as it |
| 74 | // requires rewriting most/all instruction definitions and fixing MIR passes |
| 75 | // that rely on the current definitions, whereas (2-4) result in sub-optimal |
| 76 | // code that can't really be avoided because the explicit nodes would stop |
| 77 | // rematerialization. (5) might be a way to mitigate the |
| 78 | // fragility of implicit-def's of virtual registers if we want to pursue |
| 79 | // landing [1], but then we'd rather choose approach (1) to avoid using |
| 80 | // SUBREG_TO_REG entirely. |
| 81 | // |
| 82 | // [1] https://github.com/llvm/llvm-project/pull/168353 |
| 83 | //===----------------------------------------------------------------------===// |
| 84 | |
| 85 | #include "AArch64InstrInfo.h" |
| 86 | #include "AArch64MachineFunctionInfo.h" |
| 87 | #include "AArch64Subtarget.h" |
| 88 | #include "MCTargetDesc/AArch64AddressingModes.h" |
| 89 | #include "llvm/ADT/BitVector.h" |
| 90 | #include "llvm/ADT/SmallSet.h" |
| 91 | #include "llvm/CodeGen/MachineBasicBlock.h" |
| 92 | #include "llvm/CodeGen/MachineFunctionPass.h" |
| 93 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
| 94 | #include "llvm/CodeGen/TargetRegisterInfo.h" |
| 95 | #include "llvm/Support/Debug.h" |
| 96 | |
| 97 | using namespace llvm; |
| 98 | |
// Tag used by LLVM_DEBUG output in this file and as the pass argument given to
// INITIALIZE_PASS.
#define DEBUG_TYPE "aarch64-srlt-define-superregs"
// Human-readable pass name, returned by getPassName() and given to
// INITIALIZE_PASS.
#define PASS_NAME "AArch64 SRLT Define Super-Regs Pass"
| 101 | |
| 102 | namespace { |
| 103 | |
| 104 | struct AArch64SRLTDefineSuperRegs : public MachineFunctionPass { |
| 105 | inline static char ID = 0; |
| 106 | |
| 107 | AArch64SRLTDefineSuperRegs() : MachineFunctionPass(ID) {} |
| 108 | |
| 109 | bool runOnMachineFunction(MachineFunction &MF) override; |
| 110 | |
| 111 | Register getWidestSuperReg(Register R, const BitVector &RequiredBaseRegUnits, |
| 112 | const BitVector &QHiRegUnits); |
| 113 | |
| 114 | StringRef getPassName() const override { return PASS_NAME; } |
| 115 | |
| 116 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 117 | AU.setPreservesCFG(); |
| 118 | AU.addPreservedID(ID&: MachineLoopInfoID); |
| 119 | AU.addPreservedID(ID&: MachineDominatorsID); |
| 120 | MachineFunctionPass::getAnalysisUsage(AU); |
| 121 | } |
| 122 | |
| 123 | private: |
| 124 | MachineFunction *MF = nullptr; |
| 125 | const AArch64Subtarget *Subtarget = nullptr; |
| 126 | const AArch64RegisterInfo *TRI = nullptr; |
| 127 | }; |
| 128 | |
| 129 | } // end anonymous namespace |
| 130 | |
| 131 | INITIALIZE_PASS(AArch64SRLTDefineSuperRegs, DEBUG_TYPE, PASS_NAME, false, false) |
| 132 | |
| 133 | // Returns the widest super-reg for a given reg, or NoRegister if no suitable |
| 134 | // wider super-reg has been found. For example: |
| 135 | // W0 -> X0 |
| 136 | // B1 -> Q1 (without SVE) |
| 137 | // -> Z1 (with SVE) |
| 138 | // W1_W2 -> X1_X2 |
| 139 | // D0_D1 -> Q0_Q1 (without SVE) |
| 140 | // -> Z0_Z1 (with SVE) |
| 141 | Register AArch64SRLTDefineSuperRegs::getWidestSuperReg( |
| 142 | Register R, const BitVector &RequiredBaseRegUnits, |
| 143 | const BitVector &QHiRegUnits) { |
| 144 | assert(R.isPhysical() && |
| 145 | "Expected to be run straight after virtregrewriter!" ); |
| 146 | |
| 147 | BitVector Units(TRI->getNumRegUnits()); |
| 148 | for (MCRegUnit U : TRI->regunits(Reg: R)) |
| 149 | Units.set((unsigned)U); |
| 150 | |
| 151 | auto IsSuitableSuperReg = [&](Register SR) { |
| 152 | for (MCRegUnit U : TRI->regunits(Reg: SR)) { |
| 153 | // Avoid choosing z1 as super-reg of d1 if SVE is not available. |
| 154 | // Q*_HI registers are only set for SVE registers, as those consist |
| 155 | // of the Q* register for the low 128 bits and the Q*_HI (artificial) |
| 156 | // register for the top (vscale-1) * 128 bits. |
| 157 | if (QHiRegUnits.test(Idx: (unsigned)U) && |
| 158 | !Subtarget->isSVEorStreamingSVEAvailable()) |
| 159 | return false; |
| 160 | // We consider a super-reg as unsuitable if any of its reg units is not |
| 161 | // artificial and not shared, as that would imply that U is a unit for a |
| 162 | // different register, which means the candidate super-reg is likely |
| 163 | // a register tuple. |
| 164 | if (!TRI->isArtificialRegUnit(Unit: U) && |
| 165 | (!Units.test(Idx: (unsigned)U) || !RequiredBaseRegUnits.test(Idx: (unsigned)U))) |
| 166 | return false; |
| 167 | } |
| 168 | return true; |
| 169 | }; |
| 170 | |
| 171 | Register LargestSuperReg = AArch64::NoRegister; |
| 172 | for (Register SR : TRI->superregs(Reg: R)) |
| 173 | if (IsSuitableSuperReg(SR) && (LargestSuperReg == AArch64::NoRegister || |
| 174 | TRI->isSuperRegister(RegA: LargestSuperReg, RegB: SR))) |
| 175 | LargestSuperReg = SR; |
| 176 | |
| 177 | return LargestSuperReg; |
| 178 | } |
| 179 | |
| 180 | bool AArch64SRLTDefineSuperRegs::runOnMachineFunction(MachineFunction &MF) { |
| 181 | this->MF = &MF; |
| 182 | Subtarget = &MF.getSubtarget<AArch64Subtarget>(); |
| 183 | TRI = Subtarget->getRegisterInfo(); |
| 184 | const MachineRegisterInfo *MRI = &MF.getRegInfo(); |
| 185 | |
| 186 | if (!MRI->subRegLivenessEnabled()) |
| 187 | return false; |
| 188 | |
| 189 | assert(!MRI->isSSA() && "Expected to be run after breaking down SSA form!" ); |
| 190 | |
| 191 | auto XRegs = seq_inclusive<unsigned>(Begin: AArch64::X0, End: AArch64::X28); |
| 192 | auto ZRegs = seq_inclusive<unsigned>(Begin: AArch64::Z0, End: AArch64::Z31); |
| 193 | constexpr unsigned FixedRegs[] = {AArch64::FP, AArch64::LR, AArch64::SP}; |
| 194 | |
| 195 | BitVector RequiredBaseRegUnits(TRI->getNumRegUnits()); |
| 196 | for (Register R : concat<unsigned>(Ranges&: XRegs, Ranges&: ZRegs, Ranges: FixedRegs)) |
| 197 | for (MCRegUnit U : TRI->regunits(Reg: R)) |
| 198 | RequiredBaseRegUnits.set((unsigned)U); |
| 199 | |
| 200 | BitVector QHiRegUnits(TRI->getNumRegUnits()); |
| 201 | for (Register R : seq_inclusive<unsigned>(Begin: AArch64::Q0_HI, End: AArch64::Q31_HI)) |
| 202 | for (MCRegUnit U : TRI->regunits(Reg: R)) |
| 203 | QHiRegUnits.set((unsigned)U); |
| 204 | |
| 205 | bool Changed = false; |
| 206 | for (MachineBasicBlock &MBB : MF) { |
| 207 | for (MachineInstr &MI : MBB) { |
| 208 | // PATCHPOINT may have a 'def' that's not a register, avoid this. |
| 209 | if (MI.getOpcode() == TargetOpcode::PATCHPOINT) |
| 210 | continue; |
| 211 | // For each partial register write, also add an implicit-def for top bits |
| 212 | // of the register (e.g. for w0 add a def of x0). |
| 213 | SmallSet<Register, 8> SuperRegs; |
| 214 | for (const MachineOperand &DefOp : MI.defs()) |
| 215 | if (Register R = getWidestSuperReg(R: DefOp.getReg(), RequiredBaseRegUnits, |
| 216 | QHiRegUnits); |
| 217 | R != AArch64::NoRegister) |
| 218 | SuperRegs.insert(V: R); |
| 219 | |
| 220 | if (!SuperRegs.size()) |
| 221 | continue; |
| 222 | |
| 223 | LLVM_DEBUG(dbgs() << "Adding implicit-defs to: " << MI); |
| 224 | for (Register R : SuperRegs) { |
| 225 | LLVM_DEBUG(dbgs() << " " << printReg(R, TRI) << "\n" ); |
| 226 | bool IsRenamable = any_of(Range: MI.defs(), P: [&](const MachineOperand &MO) { |
| 227 | return MO.isRenamable() && TRI->regsOverlap(RegA: MO.getReg(), RegB: R); |
| 228 | }); |
| 229 | bool IsDead = any_of(Range: MI.defs(), P: [&](const MachineOperand &MO) { |
| 230 | return MO.isDead() && TRI->regsOverlap(RegA: MO.getReg(), RegB: R); |
| 231 | }); |
| 232 | MachineOperand DefOp = MachineOperand::CreateReg( |
| 233 | Reg: R, /*isDef=*/true, /*isImp=*/true, /*isKill=*/false, |
| 234 | /*isDead=*/IsDead, /*isUndef=*/false, /*isEarlyClobber=*/false, |
| 235 | /*SubReg=*/0, /*isDebug=*/false, /*isInternalRead=*/false, |
| 236 | /*isRenamable=*/IsRenamable); |
| 237 | MI.addOperand(Op: DefOp); |
| 238 | } |
| 239 | Changed = true; |
| 240 | } |
| 241 | } |
| 242 | |
| 243 | return Changed; |
| 244 | } |
| 245 | |
| 246 | FunctionPass *llvm::createAArch64SRLTDefineSuperRegsPass() { |
| 247 | return new AArch64SRLTDefineSuperRegs(); |
| 248 | } |
| 249 | |