| 1 | //===-- X86FixupInstTunings.cpp - replace instructions -----------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file does a tuning pass replacing slower machine instructions |
| 10 | // with faster ones. We do this here, as opposed to during normal ISel, as |
| 11 | // attempting to get the "right" instruction can break patterns. This pass |
| 12 | // is not meant to search for special cases where an instruction can be transformed |
| 13 | // to another; it is only meant to do transformations where the old instruction |
| 14 | // is always replaceable with the new instruction. For example: |
| 15 | // |
| 16 | // `vpermq ymm` -> `vshufd ymm` |
| 17 | // -- BAD, not always valid (lane cross/non-repeated mask) |
| 18 | // |
| 19 | // `vpermilps ymm` -> `vshufd ymm` |
| 20 | // -- GOOD, always replaceable |
| 21 | // |
| 22 | //===----------------------------------------------------------------------===// |
| 23 | |
| 24 | #include "X86.h" |
| 25 | #include "X86InstrInfo.h" |
| 26 | #include "X86RegisterInfo.h" |
| 27 | #include "X86Subtarget.h" |
| 28 | #include "llvm/ADT/Statistic.h" |
| 29 | #include "llvm/CodeGen/MachineFunctionAnalysisManager.h" |
| 30 | #include "llvm/CodeGen/MachineFunctionPass.h" |
| 31 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
| 32 | #include "llvm/CodeGen/MachinePassManager.h" |
| 33 | #include "llvm/IR/Analysis.h" |
| 34 | |
| 35 | using namespace llvm; |
| 36 | |
| 37 | #define DEBUG_TYPE "x86-fixup-inst-tuning" |
| 38 | |
| 39 | STATISTIC(NumInstChanges, "Number of instructions changes" ); |
| 40 | |
| 41 | namespace { |
| 42 | class X86FixupInstTuningImpl { |
| 43 | public: |
| 44 | bool runOnMachineFunction(MachineFunction &MF); |
| 45 | |
| 46 | private: |
| 47 | bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB, |
| 48 | MachineBasicBlock::iterator &I); |
| 49 | |
| 50 | const X86InstrInfo *TII = nullptr; |
| 51 | const X86Subtarget *ST = nullptr; |
| 52 | const MCSchedModel *SM = nullptr; |
| 53 | const X86RegisterInfo *TRI = nullptr; |
| 54 | }; |
| 55 | |
| 56 | class X86FixupInstTuningLegacy : public MachineFunctionPass { |
| 57 | public: |
| 58 | static char ID; |
| 59 | |
| 60 | X86FixupInstTuningLegacy() : MachineFunctionPass(ID) {} |
| 61 | |
| 62 | StringRef getPassName() const override { return "X86 Fixup Inst Tuning" ; } |
| 63 | |
| 64 | bool runOnMachineFunction(MachineFunction &MF) override; |
| 65 | bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB, |
| 66 | MachineBasicBlock::iterator &I); |
| 67 | |
| 68 | // This pass runs after regalloc and doesn't support VReg operands. |
| 69 | MachineFunctionProperties getRequiredProperties() const override { |
| 70 | return MachineFunctionProperties().setNoVRegs(); |
| 71 | } |
| 72 | }; |
| 73 | } // end anonymous namespace |
| 74 | |
| 75 | char X86FixupInstTuningLegacy ::ID = 0; |
| 76 | |
| 77 | INITIALIZE_PASS(X86FixupInstTuningLegacy, DEBUG_TYPE, DEBUG_TYPE, false, false) |
| 78 | |
| 79 | FunctionPass *llvm::createX86FixupInstTuningLegacyPass() { |
| 80 | return new X86FixupInstTuningLegacy(); |
| 81 | } |
| 82 | |
/// Three-way "is the new value better" comparison on two optional metrics,
/// where smaller is better.
///
/// \returns true if \p NewVal is strictly smaller than \p CurVal, false if it
/// is different but not smaller, and std::nullopt if either value is missing
/// or the two compare equal (i.e. this metric cannot decide).
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  // A metric that is unknown for either side cannot be used to decide.
  if (!NewVal.has_value() || !CurVal.has_value())
    return std::nullopt;

  // Only a real difference produces a verdict; equality is reported as a tie.
  if (*NewVal != *CurVal)
    return *NewVal < *CurVal;

  return std::nullopt;
}
| 90 | |
| 91 | bool X86FixupInstTuningImpl::processInstruction( |
| 92 | MachineFunction &MF, MachineBasicBlock &MBB, |
| 93 | MachineBasicBlock::iterator &I) { |
| 94 | MachineInstr &MI = *I; |
| 95 | unsigned Opc = MI.getOpcode(); |
| 96 | unsigned NumOperands = MI.getDesc().getNumOperands(); |
| 97 | bool OptSize = MF.getFunction().hasOptSize(); |
| 98 | |
| 99 | auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> { |
| 100 | // We already checked that SchedModel exists in `NewOpcPreferable`. |
| 101 | return MCSchedModel::getReciprocalThroughput( |
| 102 | STI: *ST, SCDesc: *(SM->getSchedClassDesc(SchedClassIdx: TII->get(Opcode).getSchedClass()))); |
| 103 | }; |
| 104 | |
| 105 | auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> { |
| 106 | // We already checked that SchedModel exists in `NewOpcPreferable`. |
| 107 | return MCSchedModel::computeInstrLatency( |
| 108 | STI: *ST, SCDesc: *(SM->getSchedClassDesc(SchedClassIdx: TII->get(Opcode).getSchedClass()))); |
| 109 | }; |
| 110 | |
| 111 | auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> { |
| 112 | if (unsigned Size = TII->get(Opcode).getSize()) |
| 113 | return Size; |
| 114 | // Zero size means we where unable to compute it. |
| 115 | return std::nullopt; |
| 116 | }; |
| 117 | |
| 118 | auto NewOpcPreferable = [&](unsigned NewOpc, |
| 119 | bool ReplaceInTie = true) -> bool { |
| 120 | std::optional<bool> Res; |
| 121 | if (SM->hasInstrSchedModel()) { |
| 122 | // Compare tput -> lat -> code size. |
| 123 | Res = CmpOptionals(NewVal: GetInstTput(NewOpc), CurVal: GetInstTput(Opc)); |
| 124 | if (Res.has_value()) |
| 125 | return *Res; |
| 126 | |
| 127 | Res = CmpOptionals(NewVal: GetInstLat(NewOpc), CurVal: GetInstLat(Opc)); |
| 128 | if (Res.has_value()) |
| 129 | return *Res; |
| 130 | } |
| 131 | |
| 132 | Res = CmpOptionals(NewVal: GetInstSize(Opc), CurVal: GetInstSize(NewOpc)); |
| 133 | if (Res.has_value()) |
| 134 | return *Res; |
| 135 | |
| 136 | // We either have either were unable to get tput/lat/codesize or all values |
| 137 | // were equal. Return specified option for a tie. |
| 138 | return ReplaceInTie; |
| 139 | }; |
| 140 | |
| 141 | // `vpermilpd r, i` -> `vshufpd r, r, i` |
| 142 | // `vpermilpd r, i, k` -> `vshufpd r, r, i, k` |
| 143 | // `vshufpd` is always as fast or faster than `vpermilpd` and takes |
| 144 | // 1 less byte of code size for VEX and EVEX encoding. |
| 145 | auto ProcessVPERMILPDri = [&](unsigned NewOpc) -> bool { |
| 146 | if (!NewOpcPreferable(NewOpc)) |
| 147 | return false; |
| 148 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 149 | { |
| 150 | unsigned MaskImm = MI.getOperand(i: NumOperands - 1).getImm(); |
| 151 | MI.removeOperand(OpNo: NumOperands - 1); |
| 152 | MI.addOperand(Op: MI.getOperand(i: NumOperands - 2)); |
| 153 | MI.setDesc(TII->get(Opcode: NewOpc)); |
| 154 | MI.addOperand(Op: MachineOperand::CreateImm(Val: MaskImm)); |
| 155 | } |
| 156 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 157 | return true; |
| 158 | }; |
| 159 | |
| 160 | // `vpermilps r, i` -> `vshufps r, r, i` |
| 161 | // `vpermilps r, i, k` -> `vshufps r, r, i, k` |
| 162 | // `vshufps` is always as fast or faster than `vpermilps` and takes |
| 163 | // 1 less byte of code size for VEX and EVEX encoding. |
| 164 | auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool { |
| 165 | if (!NewOpcPreferable(NewOpc)) |
| 166 | return false; |
| 167 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 168 | { |
| 169 | unsigned MaskImm = MI.getOperand(i: NumOperands - 1).getImm(); |
| 170 | MI.removeOperand(OpNo: NumOperands - 1); |
| 171 | MI.addOperand(Op: MI.getOperand(i: NumOperands - 2)); |
| 172 | MI.setDesc(TII->get(Opcode: NewOpc)); |
| 173 | MI.addOperand(Op: MachineOperand::CreateImm(Val: MaskImm)); |
| 174 | } |
| 175 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 176 | return true; |
| 177 | }; |
| 178 | |
| 179 | // `vpermilps m, i` -> `vpshufd m, i` iff no domain delay penalty on shuffles. |
| 180 | // `vpshufd` is always as fast or faster than `vpermilps` and takes 1 less |
| 181 | // byte of code size. |
| 182 | auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool { |
| 183 | // TODO: Might be work adding bypass delay if -Os/-Oz is enabled as |
| 184 | // `vpshufd` saves a byte of code size. |
| 185 | if (!ST->hasNoDomainDelayShuffle() || |
| 186 | !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) |
| 187 | return false; |
| 188 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 189 | { |
| 190 | MI.setDesc(TII->get(Opcode: NewOpc)); |
| 191 | } |
| 192 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 193 | return true; |
| 194 | }; |
| 195 | |
| 196 | // `vunpcklpd/vmovlhps r, r` -> `vunpcklqdq r, r`/`vshufpd r, r, 0x00` |
| 197 | // `vunpckhpd/vmovlhps r, r` -> `vunpckhqdq r, r`/`vshufpd r, r, 0xff` |
| 198 | // `vunpcklpd r, r, k` -> `vunpcklqdq r, r, k`/`vshufpd r, r, k, 0x00` |
| 199 | // `vunpckhpd r, r, k` -> `vunpckhqdq r, r, k`/`vshufpd r, r, k, 0xff` |
| 200 | // `vunpcklpd r, m` -> `vunpcklqdq r, m, k` |
| 201 | // `vunpckhpd r, m` -> `vunpckhqdq r, m, k` |
| 202 | // `vunpcklpd r, m, k` -> `vunpcklqdq r, m, k` |
| 203 | // `vunpckhpd r, m, k` -> `vunpckhqdq r, m, k` |
| 204 | // 1) If no bypass delay and `vunpck{l|h}qdq` faster than `vunpck{l|h}pd` |
| 205 | // -> `vunpck{l|h}qdq` |
| 206 | // 2) If `vshufpd` faster than `vunpck{l|h}pd` |
| 207 | // -> `vshufpd` |
| 208 | // |
| 209 | // `vunpcklps` -> `vunpckldq` (for all operand types if no bypass delay) |
| 210 | auto ProcessUNPCK = [&](unsigned NewOpc, unsigned MaskImm) -> bool { |
| 211 | if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) |
| 212 | return false; |
| 213 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 214 | { |
| 215 | MI.setDesc(TII->get(Opcode: NewOpc)); |
| 216 | MI.addOperand(Op: MachineOperand::CreateImm(Val: MaskImm)); |
| 217 | } |
| 218 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 219 | return true; |
| 220 | }; |
| 221 | |
| 222 | auto ProcessUNPCKToIntDomain = [&](unsigned NewOpc) -> bool { |
| 223 | // TODO it may be worth it to set ReplaceInTie to `true` as there is no real |
| 224 | // downside to the integer unpck, but if someone doesn't specify exact |
| 225 | // target we won't find it faster. |
| 226 | if (!ST->hasNoDomainDelayShuffle() || |
| 227 | !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) |
| 228 | return false; |
| 229 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 230 | { |
| 231 | MI.setDesc(TII->get(Opcode: NewOpc)); |
| 232 | } |
| 233 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 234 | return true; |
| 235 | }; |
| 236 | |
| 237 | auto ProcessUNPCKLPDrr = [&](unsigned NewOpcIntDomain, |
| 238 | unsigned NewOpc) -> bool { |
| 239 | if (ProcessUNPCKToIntDomain(NewOpcIntDomain)) |
| 240 | return true; |
| 241 | return ProcessUNPCK(NewOpc, 0x00); |
| 242 | }; |
| 243 | auto ProcessUNPCKHPDrr = [&](unsigned NewOpcIntDomain, |
| 244 | unsigned NewOpc) -> bool { |
| 245 | if (ProcessUNPCKToIntDomain(NewOpcIntDomain)) |
| 246 | return true; |
| 247 | return ProcessUNPCK(NewOpc, 0xff); |
| 248 | }; |
| 249 | |
| 250 | auto ProcessUNPCKPDrm = [&](unsigned NewOpcIntDomain) -> bool { |
| 251 | return ProcessUNPCKToIntDomain(NewOpcIntDomain); |
| 252 | }; |
| 253 | |
| 254 | auto ProcessUNPCKPS = [&](unsigned NewOpc) -> bool { |
| 255 | return ProcessUNPCKToIntDomain(NewOpc); |
| 256 | }; |
| 257 | |
| 258 | auto ProcessBLENDWToBLENDD = [&](unsigned MovOpc, unsigned NumElts) -> bool { |
| 259 | if (!ST->hasAVX2() || !NewOpcPreferable(MovOpc)) |
| 260 | return false; |
| 261 | // Convert to VPBLENDD if scaling the VPBLENDW mask down/up loses no bits. |
| 262 | APInt MaskW = |
| 263 | APInt(8, MI.getOperand(i: NumOperands - 1).getImm(), /*IsSigned=*/false, |
| 264 | /*implicitTrunc=*/true); |
| 265 | APInt MaskD = APIntOps::ScaleBitMask(A: MaskW, NewBitWidth: 4, /*MatchAllBits=*/true); |
| 266 | if (MaskW != APIntOps::ScaleBitMask(A: MaskD, NewBitWidth: 8, /*MatchAllBits=*/true)) |
| 267 | return false; |
| 268 | APInt NewMaskD = APInt::getSplat(NewLen: NumElts, V: MaskD); |
| 269 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 270 | { |
| 271 | MI.setDesc(TII->get(Opcode: MovOpc)); |
| 272 | MI.removeOperand(OpNo: NumOperands - 1); |
| 273 | MI.addOperand(Op: MachineOperand::CreateImm(Val: NewMaskD.getZExtValue())); |
| 274 | } |
| 275 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 276 | return true; |
| 277 | }; |
| 278 | |
| 279 | auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask, |
| 280 | unsigned MovImm) -> bool { |
| 281 | if ((MI.getOperand(i: NumOperands - 1).getImm() & Mask) != MovImm) |
| 282 | return false; |
| 283 | if (!OptSize && !NewOpcPreferable(MovOpc)) |
| 284 | return false; |
| 285 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 286 | { |
| 287 | MI.setDesc(TII->get(Opcode: MovOpc)); |
| 288 | MI.removeOperand(OpNo: NumOperands - 1); |
| 289 | } |
| 290 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 291 | return true; |
| 292 | }; |
| 293 | |
| 294 | // Is ADD(X,X) more efficient than SHL(X,1)? |
| 295 | auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool { |
| 296 | if (MI.getOperand(i: NumOperands - 1).getImm() != 1) |
| 297 | return false; |
| 298 | if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true)) |
| 299 | return false; |
| 300 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 301 | { |
| 302 | MI.setDesc(TII->get(Opcode: AddOpc)); |
| 303 | MI.removeOperand(OpNo: NumOperands - 1); |
| 304 | MI.addOperand(Op: MI.getOperand(i: NumOperands - 2)); |
| 305 | } |
| 306 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 307 | return false; |
| 308 | }; |
| 309 | |
| 310 | // `vpermq ymm, ymm, 0x44` -> `vinserti128 ymm, ymm, xmm, 1` |
| 311 | // `vpermpd ymm, ymm, 0x44` -> `vinsertf128 ymm, ymm, xmm, 1` |
| 312 | // When the immediate is 0x44, VPERMQ/VPERMPD duplicates the lower 128-bit |
| 313 | // lane to both lanes. 0x44 = 0b01_00_01_00 means qwords[3:0] = {src[1], |
| 314 | // src[0], src[1], src[0]} This is equivalent to inserting the lower 128-bits |
| 315 | // into the upper 128-bit position. |
| 316 | auto ProcessVPERMQToVINSERT128 = [&](unsigned NewOpc) -> bool { |
| 317 | if (MI.getOperand(i: NumOperands - 1).getImm() != 0x44) |
| 318 | return false; |
| 319 | if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) |
| 320 | return false; |
| 321 | |
| 322 | // Get the XMM subregister of the source YMM register. |
| 323 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 324 | Register XmmReg = TRI->getSubReg(Reg: SrcReg, Idx: X86::sub_xmm); |
| 325 | |
| 326 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 327 | { |
| 328 | // Transform: VPERMQ $dst, $src, $0x44 |
| 329 | // Into: VINSERTI128 $dst, $src, $xmm_src, $1 |
| 330 | MI.setDesc(TII->get(Opcode: NewOpc)); |
| 331 | // Remove the immediate operand. |
| 332 | MI.removeOperand(OpNo: NumOperands - 1); |
| 333 | // Add the XMM subregister operand. |
| 334 | MI.addOperand(Op: MachineOperand::CreateReg(Reg: XmmReg, /*isDef=*/false, |
| 335 | /*isImp=*/false, |
| 336 | /*isKill=*/false)); |
| 337 | // Add the immediate (1 = insert into high 128-bits). |
| 338 | MI.addOperand(Op: MachineOperand::CreateImm(Val: 1)); |
| 339 | } |
| 340 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 341 | return true; |
| 342 | }; |
| 343 | |
| 344 | switch (Opc) { |
| 345 | case X86::BLENDPDrri: |
| 346 | return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1); |
| 347 | case X86::VBLENDPDrri: |
| 348 | return ProcessBLENDToMOV(X86::VMOVSDrr, 0x3, 0x1); |
| 349 | |
| 350 | case X86::BLENDPSrri: |
| 351 | return ProcessBLENDToMOV(X86::MOVSSrr, 0xF, 0x1) || |
| 352 | ProcessBLENDToMOV(X86::MOVSDrr, 0xF, 0x3); |
| 353 | case X86::VBLENDPSrri: |
| 354 | return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) || |
| 355 | ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3); |
| 356 | |
| 357 | case X86::VPBLENDWrri: |
| 358 | // TODO: Add X86::VPBLENDWrmi handling |
| 359 | // TODO: Add X86::VPBLENDWYrri handling |
| 360 | // TODO: Add X86::VPBLENDWYrmi handling |
| 361 | return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4); |
| 362 | |
| 363 | case X86::VPERMILPDri: |
| 364 | return ProcessVPERMILPDri(X86::VSHUFPDrri); |
| 365 | case X86::VPERMILPDYri: |
| 366 | return ProcessVPERMILPDri(X86::VSHUFPDYrri); |
| 367 | case X86::VPERMILPDZ128ri: |
| 368 | return ProcessVPERMILPDri(X86::VSHUFPDZ128rri); |
| 369 | case X86::VPERMILPDZ256ri: |
| 370 | return ProcessVPERMILPDri(X86::VSHUFPDZ256rri); |
| 371 | case X86::VPERMILPDZri: |
| 372 | return ProcessVPERMILPDri(X86::VSHUFPDZrri); |
| 373 | case X86::VPERMILPDZ128rikz: |
| 374 | return ProcessVPERMILPDri(X86::VSHUFPDZ128rrikz); |
| 375 | case X86::VPERMILPDZ256rikz: |
| 376 | return ProcessVPERMILPDri(X86::VSHUFPDZ256rrikz); |
| 377 | case X86::VPERMILPDZrikz: |
| 378 | return ProcessVPERMILPDri(X86::VSHUFPDZrrikz); |
| 379 | case X86::VPERMILPDZ128rik: |
| 380 | return ProcessVPERMILPDri(X86::VSHUFPDZ128rrik); |
| 381 | case X86::VPERMILPDZ256rik: |
| 382 | return ProcessVPERMILPDri(X86::VSHUFPDZ256rrik); |
| 383 | case X86::VPERMILPDZrik: |
| 384 | return ProcessVPERMILPDri(X86::VSHUFPDZrrik); |
| 385 | |
| 386 | case X86::VPERMILPSri: |
| 387 | return ProcessVPERMILPSri(X86::VSHUFPSrri); |
| 388 | case X86::VPERMILPSYri: |
| 389 | return ProcessVPERMILPSri(X86::VSHUFPSYrri); |
| 390 | case X86::VPERMILPSZ128ri: |
| 391 | return ProcessVPERMILPSri(X86::VSHUFPSZ128rri); |
| 392 | case X86::VPERMILPSZ256ri: |
| 393 | return ProcessVPERMILPSri(X86::VSHUFPSZ256rri); |
| 394 | case X86::VPERMILPSZri: |
| 395 | return ProcessVPERMILPSri(X86::VSHUFPSZrri); |
| 396 | case X86::VPERMILPSZ128rikz: |
| 397 | return ProcessVPERMILPSri(X86::VSHUFPSZ128rrikz); |
| 398 | case X86::VPERMILPSZ256rikz: |
| 399 | return ProcessVPERMILPSri(X86::VSHUFPSZ256rrikz); |
| 400 | case X86::VPERMILPSZrikz: |
| 401 | return ProcessVPERMILPSri(X86::VSHUFPSZrrikz); |
| 402 | case X86::VPERMILPSZ128rik: |
| 403 | return ProcessVPERMILPSri(X86::VSHUFPSZ128rrik); |
| 404 | case X86::VPERMILPSZ256rik: |
| 405 | return ProcessVPERMILPSri(X86::VSHUFPSZ256rrik); |
| 406 | case X86::VPERMILPSZrik: |
| 407 | return ProcessVPERMILPSri(X86::VSHUFPSZrrik); |
| 408 | case X86::VPERMILPSmi: |
| 409 | return ProcessVPERMILPSmi(X86::VPSHUFDmi); |
| 410 | case X86::VPERMILPSYmi: |
| 411 | // TODO: See if there is a more generic way we can test if the replacement |
| 412 | // instruction is supported. |
| 413 | return ST->hasAVX2() ? ProcessVPERMILPSmi(X86::VPSHUFDYmi) : false; |
| 414 | case X86::VPERMILPSZ128mi: |
| 415 | return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi); |
| 416 | case X86::VPERMILPSZ256mi: |
| 417 | return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi); |
| 418 | case X86::VPERMILPSZmi: |
| 419 | return ProcessVPERMILPSmi(X86::VPSHUFDZmi); |
| 420 | case X86::VPERMILPSZ128mikz: |
| 421 | return ProcessVPERMILPSmi(X86::VPSHUFDZ128mikz); |
| 422 | case X86::VPERMILPSZ256mikz: |
| 423 | return ProcessVPERMILPSmi(X86::VPSHUFDZ256mikz); |
| 424 | case X86::VPERMILPSZmikz: |
| 425 | return ProcessVPERMILPSmi(X86::VPSHUFDZmikz); |
| 426 | case X86::VPERMILPSZ128mik: |
| 427 | return ProcessVPERMILPSmi(X86::VPSHUFDZ128mik); |
| 428 | case X86::VPERMILPSZ256mik: |
| 429 | return ProcessVPERMILPSmi(X86::VPSHUFDZ256mik); |
| 430 | case X86::VPERMILPSZmik: |
| 431 | return ProcessVPERMILPSmi(X86::VPSHUFDZmik); |
| 432 | case X86::VPERMQYri: |
| 433 | return ProcessVPERMQToVINSERT128(X86::VINSERTI128rri); |
| 434 | case X86::VPERMPDYri: |
| 435 | return ProcessVPERMQToVINSERT128(X86::VINSERTF128rri); |
| 436 | case X86::MOVLHPSrr: |
| 437 | case X86::UNPCKLPDrr: |
| 438 | return ProcessUNPCKLPDrr(X86::PUNPCKLQDQrr, X86::SHUFPDrri); |
| 439 | case X86::VMOVLHPSrr: |
| 440 | case X86::VUNPCKLPDrr: |
| 441 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQrr, X86::VSHUFPDrri); |
| 442 | case X86::VUNPCKLPDYrr: |
| 443 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQYrr, X86::VSHUFPDYrri); |
| 444 | // VMOVLHPS is always 128 bits. |
| 445 | case X86::VMOVLHPSZrr: |
| 446 | case X86::VUNPCKLPDZ128rr: |
| 447 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rr, X86::VSHUFPDZ128rri); |
| 448 | case X86::VUNPCKLPDZ256rr: |
| 449 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rr, X86::VSHUFPDZ256rri); |
| 450 | case X86::VUNPCKLPDZrr: |
| 451 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrr, X86::VSHUFPDZrri); |
| 452 | case X86::VUNPCKLPDZ128rrk: |
| 453 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrk, X86::VSHUFPDZ128rrik); |
| 454 | case X86::VUNPCKLPDZ256rrk: |
| 455 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrk, X86::VSHUFPDZ256rrik); |
| 456 | case X86::VUNPCKLPDZrrk: |
| 457 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrk, X86::VSHUFPDZrrik); |
| 458 | case X86::VUNPCKLPDZ128rrkz: |
| 459 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrkz, X86::VSHUFPDZ128rrikz); |
| 460 | case X86::VUNPCKLPDZ256rrkz: |
| 461 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrkz, X86::VSHUFPDZ256rrikz); |
| 462 | case X86::VUNPCKLPDZrrkz: |
| 463 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrkz, X86::VSHUFPDZrrikz); |
| 464 | case X86::UNPCKHPDrr: |
| 465 | return ProcessUNPCKHPDrr(X86::PUNPCKHQDQrr, X86::SHUFPDrri); |
| 466 | case X86::VUNPCKHPDrr: |
| 467 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQrr, X86::VSHUFPDrri); |
| 468 | case X86::VUNPCKHPDYrr: |
| 469 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQYrr, X86::VSHUFPDYrri); |
| 470 | case X86::VUNPCKHPDZ128rr: |
| 471 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rr, X86::VSHUFPDZ128rri); |
| 472 | case X86::VUNPCKHPDZ256rr: |
| 473 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rr, X86::VSHUFPDZ256rri); |
| 474 | case X86::VUNPCKHPDZrr: |
| 475 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrr, X86::VSHUFPDZrri); |
| 476 | case X86::VUNPCKHPDZ128rrk: |
| 477 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrk, X86::VSHUFPDZ128rrik); |
| 478 | case X86::VUNPCKHPDZ256rrk: |
| 479 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrk, X86::VSHUFPDZ256rrik); |
| 480 | case X86::VUNPCKHPDZrrk: |
| 481 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrk, X86::VSHUFPDZrrik); |
| 482 | case X86::VUNPCKHPDZ128rrkz: |
| 483 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrkz, X86::VSHUFPDZ128rrikz); |
| 484 | case X86::VUNPCKHPDZ256rrkz: |
| 485 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrkz, X86::VSHUFPDZ256rrikz); |
| 486 | case X86::VUNPCKHPDZrrkz: |
| 487 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrkz, X86::VSHUFPDZrrikz); |
| 488 | case X86::UNPCKLPDrm: |
| 489 | return ProcessUNPCKPDrm(X86::PUNPCKLQDQrm); |
| 490 | case X86::VUNPCKLPDrm: |
| 491 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQrm); |
| 492 | case X86::VUNPCKLPDYrm: |
| 493 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQYrm); |
| 494 | case X86::VUNPCKLPDZ128rm: |
| 495 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rm); |
| 496 | case X86::VUNPCKLPDZ256rm: |
| 497 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rm); |
| 498 | case X86::VUNPCKLPDZrm: |
| 499 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrm); |
| 500 | case X86::VUNPCKLPDZ128rmk: |
| 501 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmk); |
| 502 | case X86::VUNPCKLPDZ256rmk: |
| 503 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmk); |
| 504 | case X86::VUNPCKLPDZrmk: |
| 505 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmk); |
| 506 | case X86::VUNPCKLPDZ128rmkz: |
| 507 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmkz); |
| 508 | case X86::VUNPCKLPDZ256rmkz: |
| 509 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmkz); |
| 510 | case X86::VUNPCKLPDZrmkz: |
| 511 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmkz); |
| 512 | case X86::UNPCKHPDrm: |
| 513 | return ProcessUNPCKPDrm(X86::PUNPCKHQDQrm); |
| 514 | case X86::VUNPCKHPDrm: |
| 515 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQrm); |
| 516 | case X86::VUNPCKHPDYrm: |
| 517 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQYrm); |
| 518 | case X86::VUNPCKHPDZ128rm: |
| 519 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rm); |
| 520 | case X86::VUNPCKHPDZ256rm: |
| 521 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rm); |
| 522 | case X86::VUNPCKHPDZrm: |
| 523 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrm); |
| 524 | case X86::VUNPCKHPDZ128rmk: |
| 525 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmk); |
| 526 | case X86::VUNPCKHPDZ256rmk: |
| 527 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmk); |
| 528 | case X86::VUNPCKHPDZrmk: |
| 529 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmk); |
| 530 | case X86::VUNPCKHPDZ128rmkz: |
| 531 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmkz); |
| 532 | case X86::VUNPCKHPDZ256rmkz: |
| 533 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmkz); |
| 534 | case X86::VUNPCKHPDZrmkz: |
| 535 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmkz); |
| 536 | |
| 537 | case X86::UNPCKLPSrr: |
| 538 | return ProcessUNPCKPS(X86::PUNPCKLDQrr); |
| 539 | case X86::VUNPCKLPSrr: |
| 540 | return ProcessUNPCKPS(X86::VPUNPCKLDQrr); |
| 541 | case X86::VUNPCKLPSYrr: |
| 542 | return ProcessUNPCKPS(X86::VPUNPCKLDQYrr); |
| 543 | case X86::VUNPCKLPSZ128rr: |
| 544 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rr); |
| 545 | case X86::VUNPCKLPSZ256rr: |
| 546 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rr); |
| 547 | case X86::VUNPCKLPSZrr: |
| 548 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrr); |
| 549 | case X86::VUNPCKLPSZ128rrk: |
| 550 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrk); |
| 551 | case X86::VUNPCKLPSZ256rrk: |
| 552 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrk); |
| 553 | case X86::VUNPCKLPSZrrk: |
| 554 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrrk); |
| 555 | case X86::VUNPCKLPSZ128rrkz: |
| 556 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrkz); |
| 557 | case X86::VUNPCKLPSZ256rrkz: |
| 558 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrkz); |
| 559 | case X86::VUNPCKLPSZrrkz: |
| 560 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrrkz); |
| 561 | case X86::UNPCKHPSrr: |
| 562 | return ProcessUNPCKPS(X86::PUNPCKHDQrr); |
| 563 | case X86::VUNPCKHPSrr: |
| 564 | return ProcessUNPCKPS(X86::VPUNPCKHDQrr); |
| 565 | case X86::VUNPCKHPSYrr: |
| 566 | return ProcessUNPCKPS(X86::VPUNPCKHDQYrr); |
| 567 | case X86::VUNPCKHPSZ128rr: |
| 568 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rr); |
| 569 | case X86::VUNPCKHPSZ256rr: |
| 570 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rr); |
| 571 | case X86::VUNPCKHPSZrr: |
| 572 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrr); |
| 573 | case X86::VUNPCKHPSZ128rrk: |
| 574 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrk); |
| 575 | case X86::VUNPCKHPSZ256rrk: |
| 576 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrk); |
| 577 | case X86::VUNPCKHPSZrrk: |
| 578 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrrk); |
| 579 | case X86::VUNPCKHPSZ128rrkz: |
| 580 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrkz); |
| 581 | case X86::VUNPCKHPSZ256rrkz: |
| 582 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrkz); |
| 583 | case X86::VUNPCKHPSZrrkz: |
| 584 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrrkz); |
| 585 | case X86::UNPCKLPSrm: |
| 586 | return ProcessUNPCKPS(X86::PUNPCKLDQrm); |
| 587 | case X86::VUNPCKLPSrm: |
| 588 | return ProcessUNPCKPS(X86::VPUNPCKLDQrm); |
| 589 | case X86::VUNPCKLPSYrm: |
| 590 | return ProcessUNPCKPS(X86::VPUNPCKLDQYrm); |
| 591 | case X86::VUNPCKLPSZ128rm: |
| 592 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rm); |
| 593 | case X86::VUNPCKLPSZ256rm: |
| 594 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rm); |
| 595 | case X86::VUNPCKLPSZrm: |
| 596 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrm); |
| 597 | case X86::VUNPCKLPSZ128rmk: |
| 598 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmk); |
| 599 | case X86::VUNPCKLPSZ256rmk: |
| 600 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmk); |
| 601 | case X86::VUNPCKLPSZrmk: |
| 602 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrmk); |
| 603 | case X86::VUNPCKLPSZ128rmkz: |
| 604 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmkz); |
| 605 | case X86::VUNPCKLPSZ256rmkz: |
| 606 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmkz); |
| 607 | case X86::VUNPCKLPSZrmkz: |
| 608 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrmkz); |
| 609 | case X86::UNPCKHPSrm: |
| 610 | return ProcessUNPCKPS(X86::PUNPCKHDQrm); |
| 611 | case X86::VUNPCKHPSrm: |
| 612 | return ProcessUNPCKPS(X86::VPUNPCKHDQrm); |
| 613 | case X86::VUNPCKHPSYrm: |
| 614 | return ProcessUNPCKPS(X86::VPUNPCKHDQYrm); |
| 615 | case X86::VUNPCKHPSZ128rm: |
| 616 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rm); |
| 617 | case X86::VUNPCKHPSZ256rm: |
| 618 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rm); |
| 619 | case X86::VUNPCKHPSZrm: |
| 620 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrm); |
| 621 | case X86::VUNPCKHPSZ128rmk: |
| 622 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmk); |
| 623 | case X86::VUNPCKHPSZ256rmk: |
| 624 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmk); |
| 625 | case X86::VUNPCKHPSZrmk: |
| 626 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrmk); |
| 627 | case X86::VUNPCKHPSZ128rmkz: |
| 628 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmkz); |
| 629 | case X86::VUNPCKHPSZ256rmkz: |
| 630 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz); |
| 631 | case X86::VUNPCKHPSZrmkz: |
| 632 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz); |
| 633 | |
| 634 | case X86::PSLLWri: |
| 635 | return ProcessShiftLeftToAdd(X86::PADDWrr); |
| 636 | case X86::VPSLLWri: |
| 637 | return ProcessShiftLeftToAdd(X86::VPADDWrr); |
| 638 | case X86::VPSLLWYri: |
| 639 | return ProcessShiftLeftToAdd(X86::VPADDWYrr); |
| 640 | case X86::VPSLLWZ128ri: |
| 641 | return ProcessShiftLeftToAdd(X86::VPADDWZ128rr); |
| 642 | case X86::VPSLLWZ256ri: |
| 643 | return ProcessShiftLeftToAdd(X86::VPADDWZ256rr); |
| 644 | case X86::VPSLLWZri: |
| 645 | return ProcessShiftLeftToAdd(X86::VPADDWZrr); |
| 646 | case X86::PSLLDri: |
| 647 | return ProcessShiftLeftToAdd(X86::PADDDrr); |
| 648 | case X86::VPSLLDri: |
| 649 | return ProcessShiftLeftToAdd(X86::VPADDDrr); |
| 650 | case X86::VPSLLDYri: |
| 651 | return ProcessShiftLeftToAdd(X86::VPADDDYrr); |
| 652 | case X86::VPSLLDZ128ri: |
| 653 | return ProcessShiftLeftToAdd(X86::VPADDDZ128rr); |
| 654 | case X86::VPSLLDZ256ri: |
| 655 | return ProcessShiftLeftToAdd(X86::VPADDDZ256rr); |
| 656 | case X86::VPSLLDZri: |
| 657 | return ProcessShiftLeftToAdd(X86::VPADDDZrr); |
| 658 | case X86::PSLLQri: |
| 659 | return ProcessShiftLeftToAdd(X86::PADDQrr); |
| 660 | case X86::VPSLLQri: |
| 661 | return ProcessShiftLeftToAdd(X86::VPADDQrr); |
| 662 | case X86::VPSLLQYri: |
| 663 | return ProcessShiftLeftToAdd(X86::VPADDQYrr); |
| 664 | case X86::VPSLLQZ128ri: |
| 665 | return ProcessShiftLeftToAdd(X86::VPADDQZ128rr); |
| 666 | case X86::VPSLLQZ256ri: |
| 667 | return ProcessShiftLeftToAdd(X86::VPADDQZ256rr); |
| 668 | case X86::VPSLLQZri: |
| 669 | return ProcessShiftLeftToAdd(X86::VPADDQZrr); |
| 670 | |
| 671 | default: |
| 672 | return false; |
| 673 | } |
| 674 | } |
| 675 | |
| 676 | bool X86FixupInstTuningImpl::runOnMachineFunction(MachineFunction &MF) { |
| 677 | LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n" ;); |
| 678 | bool Changed = false; |
| 679 | ST = &MF.getSubtarget<X86Subtarget>(); |
| 680 | TII = ST->getInstrInfo(); |
| 681 | TRI = ST->getRegisterInfo(); |
| 682 | SM = &ST->getSchedModel(); |
| 683 | |
| 684 | for (MachineBasicBlock &MBB : MF) { |
| 685 | for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { |
| 686 | if (processInstruction(MF, MBB, I)) { |
| 687 | ++NumInstChanges; |
| 688 | Changed = true; |
| 689 | } |
| 690 | } |
| 691 | } |
| 692 | LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n" ;); |
| 693 | return Changed; |
| 694 | } |
| 695 | |
| 696 | bool X86FixupInstTuningLegacy::runOnMachineFunction(MachineFunction &MF) { |
| 697 | X86FixupInstTuningImpl Impl; |
| 698 | return Impl.runOnMachineFunction(MF); |
| 699 | } |
| 700 | |
| 701 | PreservedAnalyses |
| 702 | X86FixupInstTuningPass::run(MachineFunction &MF, |
| 703 | MachineFunctionAnalysisManager &MFAM) { |
| 704 | X86FixupInstTuningImpl Impl; |
| 705 | return Impl.runOnMachineFunction(MF) |
| 706 | ? getMachineFunctionPassPreservedAnalyses() |
| 707 | .preserveSet<CFGAnalyses>() |
| 708 | : PreservedAnalyses::all(); |
| 709 | } |
| 710 | |