| 1 | //===-- AArch64CodeLayoutOpt.cpp - Code Layout Optimizations --===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This pass runs after instruction scheduling and employs code layout |
| 10 | // optimizations for certain patterns. |
| 11 | // |
| 12 | // Option -aarch64-code-layout-opt-enable selects instruction pairs to optimize: |
| 13 | // cmp-csel: Enable CMP/CMN-CSEL code layout optimization |
| 14 | // fcmp-fcsel: Enable FCMP-FCSEL code layout optimization |
| 15 | // |
| 16 | // The initial implementation induces function alignment when a supported |
| 17 | // pattern is detected, and possibly instruction-alignment when a pair would |
| 18 | // straddle cache-lines. |
| 19 | //===----------------------------------------------------------------------===// |
| 20 | |
| 21 | #include "AArch64.h" |
| 22 | #include "AArch64InstrInfo.h" |
| 23 | #include "AArch64Subtarget.h" |
| 24 | #include "llvm/ADT/SmallVector.h" |
| 25 | #include "llvm/ADT/Statistic.h" |
| 26 | #include "llvm/CodeGen/MachineBasicBlock.h" |
| 27 | #include "llvm/CodeGen/MachineFunctionPass.h" |
| 28 | #include "llvm/Support/CommandLine.h" |
| 29 | #include "llvm/Support/Debug.h" |
| 30 | #include "llvm/Support/ErrorHandling.h" |
| 31 | #include "llvm/Support/MathExtras.h" |
| 32 | |
| 33 | using namespace llvm; |
| 34 | |
// Debug category of this pass; also the prefix of every DBG() message.
#define DEBUG_TYPE "aarch64-code-layout-opt"
// Debug-only print helper: streams "<DEBUG_TYPE>: " followed by the arguments
// to dbgs() under LLVM_DEBUG.
#define DBG(...) LLVM_DEBUG(dbgs() << DEBUG_TYPE ": " << __VA_ARGS__)
// Human-readable pass name, shared by pass registration and getPassName().
#define AARCH64_CODE_LAYOUT_OPT_NAME "AArch64 Code Layout Optimization"
| 38 | |
/// Instruction-pair kinds that can be selected for alignment. The enumerators
/// are the value type of the cl::bits<CodeLayoutOpt> command-line option.
enum CodeLayoutOpt {
  /// Align CMP/CMN-CSEL pairs.
  CmpCsel,
  /// Align FCMP-FCSEL pairs.
  FcmpFcsel,
};
| 43 | |
| 44 | static cl::bits<CodeLayoutOpt> EnableCodeAlignment( |
| 45 | "aarch64-code-layout-opt-enable" , cl::Hidden, cl::CommaSeparated, |
| 46 | cl::desc("Enable code alignment optimization for instruction pairs" ), |
| 47 | cl::values( |
| 48 | clEnumValN(CmpCsel, "cmp-csel" , "CMP/CMN-CSEL pair alignment (32-bit)" ), |
| 49 | clEnumValN(FcmpFcsel, "fcmp-fcsel" , "FCMP-FCSEL pair alignment" ))); |
| 50 | |
| 51 | static cl::opt<unsigned> FunctionAlignBytes( |
| 52 | "aarch64-code-layout-opt-align-functions" , cl::Hidden, |
| 53 | cl::desc("Function alignment in bytes for code layout optimization " |
| 54 | "(must be a power of 2)" ), |
| 55 | cl::init(Val: 64), cl::callback(CB: [](const unsigned &Val) { |
| 56 | if (!isPowerOf2_32(Value: Val)) |
| 57 | report_fatal_error( |
| 58 | reason: "aarch64-code-layout-opt-align must be a power of 2" ); |
| 59 | })); |
| 60 | |
| 61 | STATISTIC(NumFunctionsAligned, |
| 62 | "Number of functions with aligned (to 64-bytes by default)" ); |
| 63 | STATISTIC(NumCmpCselPairsDetected, |
| 64 | "Number of CMP/CMN-CSEL pairs detected for alignment" ); |
| 65 | STATISTIC(NumFcmpFcselPairsDetected, |
| 66 | "Number of FCMP-FCSEL pairs detected for alignment" ); |
| 67 | |
| 68 | namespace { |
| 69 | |
| 70 | class AArch64CodeLayoutOpt : public MachineFunctionPass { |
| 71 | public: |
| 72 | static char ID; |
| 73 | AArch64CodeLayoutOpt() : MachineFunctionPass(ID) {} |
| 74 | void getAnalysisUsage(AnalysisUsage &AU) const override; |
| 75 | bool runOnMachineFunction(MachineFunction &MF) override; |
| 76 | StringRef getPassName() const override { |
| 77 | return AARCH64_CODE_LAYOUT_OPT_NAME; |
| 78 | } |
| 79 | |
| 80 | private: |
| 81 | const AArch64InstrInfo *TII = nullptr; |
| 82 | |
| 83 | /// Align each fusible CMP/CMN-CSEL or FCMP-FCSEL pair in \p MBB by emitting |
| 84 | /// .p2align before the lead instruction (splitting the block if needed). |
| 85 | /// \returns true iff at least one pair was found and aligned. |
| 86 | bool alignLayoutSensitivePatterns(MachineBasicBlock *MBB); |
| 87 | |
| 88 | /// Emit .p2align before MI. Splits the block if MI is not at its start. |
| 89 | void emitP2Align(MachineInstr &MI, Align DesiredAlign, |
| 90 | unsigned MaxSkipBytes = 4); |
| 91 | |
| 92 | bool optimizeForCodeLayout(MachineFunction &MF); |
| 93 | }; |
| 94 | |
| 95 | } // end anonymous namespace |
| 96 | |
| 97 | char AArch64CodeLayoutOpt::ID = 0; |
| 98 | |
| 99 | INITIALIZE_PASS(AArch64CodeLayoutOpt, "aarch64-code-layout-opt" , |
| 100 | AARCH64_CODE_LAYOUT_OPT_NAME, false, false) |
| 101 | |
| 102 | void AArch64CodeLayoutOpt::getAnalysisUsage(AnalysisUsage &AU) const { |
| 103 | AU.setPreservesAll(); |
| 104 | MachineFunctionPass::getAnalysisUsage(AU); |
| 105 | } |
| 106 | |
| 107 | FunctionPass *llvm::createAArch64CodeLayoutOptPass() { |
| 108 | return new AArch64CodeLayoutOpt(); |
| 109 | } |
| 110 | |
| 111 | /// \returns true iff Opc is a floating-point comparison (FCMP/FCMPE). |
| 112 | static bool isFloatingPointCompare(unsigned Opc) { |
| 113 | switch (Opc) { |
| 114 | case AArch64::FCMPSrr: |
| 115 | case AArch64::FCMPDrr: |
| 116 | case AArch64::FCMPESrr: |
| 117 | case AArch64::FCMPEDrr: |
| 118 | case AArch64::FCMPHrr: |
| 119 | case AArch64::FCMPEHrr: |
| 120 | return true; |
| 121 | default: |
| 122 | return false; |
| 123 | } |
| 124 | } |
| 125 | |
| 126 | /// \returns true iff Opc is a floating-point conditional select (FCSEL). |
| 127 | static bool isFloatingPointConditionalSelect(unsigned Opc) { |
| 128 | switch (Opc) { |
| 129 | case AArch64::FCSELSrrr: |
| 130 | case AArch64::FCSELDrrr: |
| 131 | case AArch64::FCSELHrrr: |
| 132 | return true; |
| 133 | default: |
| 134 | return false; |
| 135 | } |
| 136 | } |
| 137 | |
| 138 | /// \returns true if MI is a qualifying 32-bit CMP or CMN instruction. |
| 139 | /// CMP is encoded as SUBS with WZR destination, CMN as ADDS with WZR. |
| 140 | /// Only simple variants (no shifted/extended reg) qualify, and immediate |
| 141 | /// variants require no LSL shift and small immediates (<=15). |
| 142 | static bool isQualifyingIntCompare(const MachineInstr &MI) { |
| 143 | switch (MI.getOpcode()) { |
| 144 | case AArch64::SUBSWrr: |
| 145 | case AArch64::ADDSWrr: |
| 146 | return MI.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr); |
| 147 | case AArch64::SUBSWri: |
| 148 | case AArch64::ADDSWri: |
| 149 | return MI.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) && |
| 150 | MI.getOperand(i: 3).getImm() == 0 && MI.getOperand(i: 2).getImm() <= 15; |
| 151 | case AArch64::SUBSWrs: |
| 152 | case AArch64::ADDSWrs: |
| 153 | return MI.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) && |
| 154 | !AArch64InstrInfo::hasShiftedReg(MI); |
| 155 | case AArch64::SUBSWrx: |
| 156 | return MI.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) && |
| 157 | !AArch64InstrInfo::hasExtendedReg(MI); |
| 158 | default: |
| 159 | return false; |
| 160 | } |
| 161 | } |
| 162 | |
| 163 | bool AArch64CodeLayoutOpt::runOnMachineFunction(MachineFunction &MF) { |
| 164 | const Function &F = MF.getFunction(); |
| 165 | // hasOptSize() returns true for both -Os and -Oz. |
| 166 | if (F.hasOptSize()) |
| 167 | return false; |
| 168 | |
| 169 | const auto *Subtarget = &MF.getSubtarget<AArch64Subtarget>(); |
| 170 | TII = Subtarget->getInstrInfo(); |
| 171 | |
| 172 | // Default: enable when the subtarget opts in via FeatureAlignCmpCSelPairs. |
| 173 | if (!EnableCodeAlignment.getBits() && Subtarget->hasAlignCmpCSelPairs()) { |
| 174 | if (Subtarget->hasFuseCmpCSel()) |
| 175 | EnableCodeAlignment.addValue(V: CmpCsel); |
| 176 | if (Subtarget->hasFuseFCmpFCSel()) |
| 177 | EnableCodeAlignment.addValue(V: FcmpFcsel); |
| 178 | } |
| 179 | |
| 180 | if (!(EnableCodeAlignment.isSet(V: CmpCsel) && Subtarget->hasFuseCmpCSel()) && |
| 181 | !(EnableCodeAlignment.isSet(V: FcmpFcsel) && Subtarget->hasFuseFCmpFCSel())) |
| 182 | return false; |
| 183 | |
| 184 | return optimizeForCodeLayout(MF); |
| 185 | } |
| 186 | |
| 187 | void AArch64CodeLayoutOpt::emitP2Align(MachineInstr &MI, Align DesiredAlign, |
| 188 | unsigned MaxSkipBytes) { |
| 189 | MachineBasicBlock *MBB = MI.getParent(); |
| 190 | |
| 191 | auto FirstReal = |
| 192 | skipDebugInstructionsForward(It: MBB->instr_begin(), End: MBB->instr_end()); |
| 193 | if (&*FirstReal != &MI) { |
| 194 | auto PrevIt = prev_nodbg(It: MI.getIterator(), Begin: MBB->instr_begin()); |
| 195 | MBB = MBB->splitAt(SplitInst&: *PrevIt, /*UpdateLiveIns=*/true); |
| 196 | } |
| 197 | |
| 198 | MBB->setAlignment(DesiredAlign); |
| 199 | MBB->setMaxBytesForAlignment(MaxSkipBytes); |
| 200 | } |
| 201 | |
| 202 | // Align each fusible CMP/CMN-CSEL or FCMP-FCSEL pair in MBB by emitting |
| 203 | // .p2align before the lead instruction (splitting the block if needed). |
| 204 | // A pair is: a qualifying lead instruction immediately followed by its |
| 205 | // consumer (CMP/CMN→CSEL or FCMP→FCSEL), with no intervening instructions. |
| 206 | // Returns true iff at least one pair was found and aligned. |
| 207 | bool AArch64CodeLayoutOpt::alignLayoutSensitivePatterns( |
| 208 | MachineBasicBlock *MBB) { |
| 209 | auto End = MBB->instr_end(); |
| 210 | SmallVector<std::pair<MachineInstr *, bool>, 4> Pairs; |
| 211 | |
| 212 | for (auto &MI : instructionsWithoutDebug(It: MBB->begin(), End: MBB->end())) { |
| 213 | auto NextIt = |
| 214 | skipDebugInstructionsForward(It: std::next(x: MI.getIterator()), End); |
| 215 | if (NextIt == End) |
| 216 | break; |
| 217 | |
| 218 | // --- CMP/CMN-CSEL detection --- |
| 219 | if (EnableCodeAlignment.isSet(V: CmpCsel) && isQualifyingIntCompare(MI) && |
| 220 | NextIt->getOpcode() == AArch64::CSELWr) { |
| 221 | Pairs.push_back(Elt: {&MI, true}); |
| 222 | continue; |
| 223 | } |
| 224 | |
| 225 | // --- FCMP-FCSEL detection --- |
| 226 | if (EnableCodeAlignment.isSet(V: FcmpFcsel) && |
| 227 | isFloatingPointCompare(Opc: MI.getOpcode()) && |
| 228 | isFloatingPointConditionalSelect(Opc: NextIt->getOpcode())) { |
| 229 | Pairs.push_back(Elt: {&MI, false}); |
| 230 | continue; |
| 231 | } |
| 232 | } |
| 233 | |
| 234 | for (auto &[MI, IsCmpCsel] : Pairs) { |
| 235 | emitP2Align(MI&: *MI, DesiredAlign: Align(64)); |
| 236 | DBG(".p2align 6, , 4 before " << *MI); |
| 237 | ++(IsCmpCsel ? NumCmpCselPairsDetected : NumFcmpFcselPairsDetected); |
| 238 | } |
| 239 | |
| 240 | return !Pairs.empty(); |
| 241 | } |
| 242 | |
| 243 | bool AArch64CodeLayoutOpt::optimizeForCodeLayout(MachineFunction &MF) { |
| 244 | DBG("optimizeForCodeLayout: " << MF.getName() << "\n" ); |
| 245 | |
| 246 | bool Changed = false; |
| 247 | for (auto &MBB : MF) |
| 248 | Changed |= alignLayoutSensitivePatterns(MBB: &MBB); |
| 249 | |
| 250 | if (!Changed) |
| 251 | return false; |
| 252 | |
| 253 | if (MF.getAlignment() < Align(FunctionAlignBytes)) { |
| 254 | MF.setAlignment(Align(FunctionAlignBytes)); |
| 255 | ++NumFunctionsAligned; |
| 256 | DBG("Set " << FunctionAlignBytes << "-byte alignment for function " |
| 257 | << MF.getName() << "\n" ); |
| 258 | } else { |
| 259 | DBG("Function " << MF.getName() << " already has sufficient alignment\n" ); |
| 260 | } |
| 261 | return true; |
| 262 | } |
| 263 | |