| 1 | //===-- X86FixupInstTunings.cpp - replace instructions -----------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file does a tuning pass replacing slower machine instructions |
| 10 | // with faster ones. We do this here, as opposed to during normal ISel, as |
| 11 | // attempting to get the "right" instruction can break patterns. This pass |
| 12 | // is not meant to search for special cases where an instruction can be transformed |
| 13 | // to another; it is only meant to do transformations where the old instruction |
| 14 | // is always replaceable with the new instruction. For example: |
| 15 | // |
| 16 | // `vpermq ymm` -> `vpshufd ymm` |
| 17 | // -- BAD, not always valid (lane cross/non-repeated mask) |
| 18 | // |
| 19 | // `vpermilps ymm` -> `vpshufd ymm` |
| 20 | // -- GOOD, always replaceable |
| 21 | // |
| 22 | //===----------------------------------------------------------------------===// |
| 23 | |
| 24 | #include "X86.h" |
| 25 | #include "X86InstrInfo.h" |
| 26 | #include "X86Subtarget.h" |
| 27 | #include "llvm/ADT/Statistic.h" |
| 28 | #include "llvm/CodeGen/MachineFunctionPass.h" |
| 29 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
| 30 | |
| 31 | using namespace llvm; |
| 32 | |
| 33 | #define DEBUG_TYPE "x86-fixup-inst-tuning" |
| 34 | |
| 35 | STATISTIC(NumInstChanges, "Number of instructions changes" ); |
| 36 | |
| 37 | namespace { |
| 38 | class X86FixupInstTuningPass : public MachineFunctionPass { |
| 39 | public: |
| 40 | static char ID; |
| 41 | |
| 42 | X86FixupInstTuningPass() : MachineFunctionPass(ID) {} |
| 43 | |
| 44 | StringRef getPassName() const override { return "X86 Fixup Inst Tuning" ; } |
| 45 | |
| 46 | bool runOnMachineFunction(MachineFunction &MF) override; |
| 47 | bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB, |
| 48 | MachineBasicBlock::iterator &I); |
| 49 | |
| 50 | // This pass runs after regalloc and doesn't support VReg operands. |
| 51 | MachineFunctionProperties getRequiredProperties() const override { |
| 52 | return MachineFunctionProperties().setNoVRegs(); |
| 53 | } |
| 54 | |
| 55 | private: |
| 56 | const X86InstrInfo *TII = nullptr; |
| 57 | const X86Subtarget *ST = nullptr; |
| 58 | const MCSchedModel *SM = nullptr; |
| 59 | }; |
| 60 | } // end anonymous namespace |
| 61 | |
| 62 | char X86FixupInstTuningPass::ID = 0; |
| 63 | |
| 64 | INITIALIZE_PASS(X86FixupInstTuningPass, DEBUG_TYPE, DEBUG_TYPE, false, false) |
| 65 | |
| 66 | FunctionPass *llvm::createX86FixupInstTuning() { |
| 67 | return new X86FixupInstTuningPass(); |
| 68 | } |
| 69 | |
/// Compare two optional metrics.
///
/// Returns std::nullopt when either side has no value or when both values are
/// not different (a tie); otherwise returns whether \p NewVal is strictly
/// less than \p CurVal.
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  // Nothing to compare unless both metrics are known.
  if (!NewVal.has_value() || !CurVal.has_value())
    return std::nullopt;

  // Equal metrics give no verdict; let the caller fall through to the next
  // criterion.
  const bool Differ = *NewVal != *CurVal;
  if (!Differ)
    return std::nullopt;

  return *NewVal < *CurVal;
}
| 77 | |
| 78 | bool X86FixupInstTuningPass::processInstruction( |
| 79 | MachineFunction &MF, MachineBasicBlock &MBB, |
| 80 | MachineBasicBlock::iterator &I) { |
| 81 | MachineInstr &MI = *I; |
| 82 | unsigned Opc = MI.getOpcode(); |
| 83 | unsigned NumOperands = MI.getDesc().getNumOperands(); |
| 84 | bool OptSize = MF.getFunction().hasOptSize(); |
| 85 | |
| 86 | auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> { |
| 87 | // We already checked that SchedModel exists in `NewOpcPreferable`. |
| 88 | return MCSchedModel::getReciprocalThroughput( |
| 89 | STI: *ST, SCDesc: *(SM->getSchedClassDesc(SchedClassIdx: TII->get(Opcode).getSchedClass()))); |
| 90 | }; |
| 91 | |
| 92 | auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> { |
| 93 | // We already checked that SchedModel exists in `NewOpcPreferable`. |
| 94 | return MCSchedModel::computeInstrLatency( |
| 95 | STI: *ST, SCDesc: *(SM->getSchedClassDesc(SchedClassIdx: TII->get(Opcode).getSchedClass()))); |
| 96 | }; |
| 97 | |
| 98 | auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> { |
| 99 | if (unsigned Size = TII->get(Opcode).getSize()) |
| 100 | return Size; |
| 101 | // Zero size means we where unable to compute it. |
| 102 | return std::nullopt; |
| 103 | }; |
| 104 | |
| 105 | auto NewOpcPreferable = [&](unsigned NewOpc, |
| 106 | bool ReplaceInTie = true) -> bool { |
| 107 | std::optional<bool> Res; |
| 108 | if (SM->hasInstrSchedModel()) { |
| 109 | // Compare tput -> lat -> code size. |
| 110 | Res = CmpOptionals(NewVal: GetInstTput(NewOpc), CurVal: GetInstTput(Opc)); |
| 111 | if (Res.has_value()) |
| 112 | return *Res; |
| 113 | |
| 114 | Res = CmpOptionals(NewVal: GetInstLat(NewOpc), CurVal: GetInstLat(Opc)); |
| 115 | if (Res.has_value()) |
| 116 | return *Res; |
| 117 | } |
| 118 | |
| 119 | Res = CmpOptionals(NewVal: GetInstSize(Opc), CurVal: GetInstSize(NewOpc)); |
| 120 | if (Res.has_value()) |
| 121 | return *Res; |
| 122 | |
| 123 | // We either have either were unable to get tput/lat/codesize or all values |
| 124 | // were equal. Return specified option for a tie. |
| 125 | return ReplaceInTie; |
| 126 | }; |
| 127 | |
| 128 | // `vpermilpd r, i` -> `vshufpd r, r, i` |
| 129 | // `vpermilpd r, i, k` -> `vshufpd r, r, i, k` |
| 130 | // `vshufpd` is always as fast or faster than `vpermilpd` and takes |
| 131 | // 1 less byte of code size for VEX and EVEX encoding. |
| 132 | auto ProcessVPERMILPDri = [&](unsigned NewOpc) -> bool { |
| 133 | if (!NewOpcPreferable(NewOpc)) |
| 134 | return false; |
| 135 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 136 | { |
| 137 | unsigned MaskImm = MI.getOperand(i: NumOperands - 1).getImm(); |
| 138 | MI.removeOperand(OpNo: NumOperands - 1); |
| 139 | MI.addOperand(Op: MI.getOperand(i: NumOperands - 2)); |
| 140 | MI.setDesc(TII->get(Opcode: NewOpc)); |
| 141 | MI.addOperand(Op: MachineOperand::CreateImm(Val: MaskImm)); |
| 142 | } |
| 143 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 144 | return true; |
| 145 | }; |
| 146 | |
| 147 | // `vpermilps r, i` -> `vshufps r, r, i` |
| 148 | // `vpermilps r, i, k` -> `vshufps r, r, i, k` |
| 149 | // `vshufps` is always as fast or faster than `vpermilps` and takes |
| 150 | // 1 less byte of code size for VEX and EVEX encoding. |
| 151 | auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool { |
| 152 | if (!NewOpcPreferable(NewOpc)) |
| 153 | return false; |
| 154 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 155 | { |
| 156 | unsigned MaskImm = MI.getOperand(i: NumOperands - 1).getImm(); |
| 157 | MI.removeOperand(OpNo: NumOperands - 1); |
| 158 | MI.addOperand(Op: MI.getOperand(i: NumOperands - 2)); |
| 159 | MI.setDesc(TII->get(Opcode: NewOpc)); |
| 160 | MI.addOperand(Op: MachineOperand::CreateImm(Val: MaskImm)); |
| 161 | } |
| 162 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 163 | return true; |
| 164 | }; |
| 165 | |
| 166 | // `vpermilps m, i` -> `vpshufd m, i` iff no domain delay penalty on shuffles. |
| 167 | // `vpshufd` is always as fast or faster than `vpermilps` and takes 1 less |
| 168 | // byte of code size. |
| 169 | auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool { |
| 170 | // TODO: Might be work adding bypass delay if -Os/-Oz is enabled as |
| 171 | // `vpshufd` saves a byte of code size. |
| 172 | if (!ST->hasNoDomainDelayShuffle() || |
| 173 | !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) |
| 174 | return false; |
| 175 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 176 | { |
| 177 | MI.setDesc(TII->get(Opcode: NewOpc)); |
| 178 | } |
| 179 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 180 | return true; |
| 181 | }; |
| 182 | |
| 183 | // `vunpcklpd/vmovlhps r, r` -> `vunpcklqdq r, r`/`vshufpd r, r, 0x00` |
| 184 | // `vunpckhpd/vmovlhps r, r` -> `vunpckhqdq r, r`/`vshufpd r, r, 0xff` |
| 185 | // `vunpcklpd r, r, k` -> `vunpcklqdq r, r, k`/`vshufpd r, r, k, 0x00` |
| 186 | // `vunpckhpd r, r, k` -> `vunpckhqdq r, r, k`/`vshufpd r, r, k, 0xff` |
| 187 | // `vunpcklpd r, m` -> `vunpcklqdq r, m, k` |
| 188 | // `vunpckhpd r, m` -> `vunpckhqdq r, m, k` |
| 189 | // `vunpcklpd r, m, k` -> `vunpcklqdq r, m, k` |
| 190 | // `vunpckhpd r, m, k` -> `vunpckhqdq r, m, k` |
| 191 | // 1) If no bypass delay and `vunpck{l|h}qdq` faster than `vunpck{l|h}pd` |
| 192 | // -> `vunpck{l|h}qdq` |
| 193 | // 2) If `vshufpd` faster than `vunpck{l|h}pd` |
| 194 | // -> `vshufpd` |
| 195 | // |
| 196 | // `vunpcklps` -> `vunpckldq` (for all operand types if no bypass delay) |
| 197 | auto ProcessUNPCK = [&](unsigned NewOpc, unsigned MaskImm) -> bool { |
| 198 | if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) |
| 199 | return false; |
| 200 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 201 | { |
| 202 | MI.setDesc(TII->get(Opcode: NewOpc)); |
| 203 | MI.addOperand(Op: MachineOperand::CreateImm(Val: MaskImm)); |
| 204 | } |
| 205 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 206 | return true; |
| 207 | }; |
| 208 | |
| 209 | auto ProcessUNPCKToIntDomain = [&](unsigned NewOpc) -> bool { |
| 210 | // TODO it may be worth it to set ReplaceInTie to `true` as there is no real |
| 211 | // downside to the integer unpck, but if someone doesn't specify exact |
| 212 | // target we won't find it faster. |
| 213 | if (!ST->hasNoDomainDelayShuffle() || |
| 214 | !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) |
| 215 | return false; |
| 216 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 217 | { |
| 218 | MI.setDesc(TII->get(Opcode: NewOpc)); |
| 219 | } |
| 220 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 221 | return true; |
| 222 | }; |
| 223 | |
| 224 | auto ProcessUNPCKLPDrr = [&](unsigned NewOpcIntDomain, |
| 225 | unsigned NewOpc) -> bool { |
| 226 | if (ProcessUNPCKToIntDomain(NewOpcIntDomain)) |
| 227 | return true; |
| 228 | return ProcessUNPCK(NewOpc, 0x00); |
| 229 | }; |
| 230 | auto ProcessUNPCKHPDrr = [&](unsigned NewOpcIntDomain, |
| 231 | unsigned NewOpc) -> bool { |
| 232 | if (ProcessUNPCKToIntDomain(NewOpcIntDomain)) |
| 233 | return true; |
| 234 | return ProcessUNPCK(NewOpc, 0xff); |
| 235 | }; |
| 236 | |
| 237 | auto ProcessUNPCKPDrm = [&](unsigned NewOpcIntDomain) -> bool { |
| 238 | return ProcessUNPCKToIntDomain(NewOpcIntDomain); |
| 239 | }; |
| 240 | |
| 241 | auto ProcessUNPCKPS = [&](unsigned NewOpc) -> bool { |
| 242 | return ProcessUNPCKToIntDomain(NewOpc); |
| 243 | }; |
| 244 | |
| 245 | auto ProcessBLENDWToBLENDD = [&](unsigned MovOpc, unsigned NumElts) -> bool { |
| 246 | if (!ST->hasAVX2() || !NewOpcPreferable(MovOpc)) |
| 247 | return false; |
| 248 | // Convert to VPBLENDD if scaling the VPBLENDW mask down/up loses no bits. |
| 249 | APInt MaskW = |
| 250 | APInt(8, MI.getOperand(i: NumOperands - 1).getImm(), /*IsSigned=*/false); |
| 251 | APInt MaskD = APIntOps::ScaleBitMask(A: MaskW, NewBitWidth: 4, /*MatchAllBits=*/true); |
| 252 | if (MaskW != APIntOps::ScaleBitMask(A: MaskD, NewBitWidth: 8, /*MatchAllBits=*/true)) |
| 253 | return false; |
| 254 | APInt NewMaskD = APInt::getSplat(NewLen: NumElts, V: MaskD); |
| 255 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 256 | { |
| 257 | MI.setDesc(TII->get(Opcode: MovOpc)); |
| 258 | MI.removeOperand(OpNo: NumOperands - 1); |
| 259 | MI.addOperand(Op: MachineOperand::CreateImm(Val: NewMaskD.getZExtValue())); |
| 260 | } |
| 261 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 262 | return true; |
| 263 | }; |
| 264 | |
| 265 | auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask, |
| 266 | unsigned MovImm) -> bool { |
| 267 | if ((MI.getOperand(i: NumOperands - 1).getImm() & Mask) != MovImm) |
| 268 | return false; |
| 269 | if (!OptSize && !NewOpcPreferable(MovOpc)) |
| 270 | return false; |
| 271 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
| 272 | { |
| 273 | MI.setDesc(TII->get(Opcode: MovOpc)); |
| 274 | MI.removeOperand(OpNo: NumOperands - 1); |
| 275 | } |
| 276 | LLVM_DEBUG(dbgs() << " With: " << MI); |
| 277 | return true; |
| 278 | }; |
| 279 | |
| 280 | switch (Opc) { |
| 281 | case X86::BLENDPDrri: |
| 282 | return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1); |
| 283 | case X86::VBLENDPDrri: |
| 284 | return ProcessBLENDToMOV(X86::VMOVSDrr, 0x3, 0x1); |
| 285 | |
| 286 | case X86::BLENDPSrri: |
| 287 | return ProcessBLENDToMOV(X86::MOVSSrr, 0xF, 0x1) || |
| 288 | ProcessBLENDToMOV(X86::MOVSDrr, 0xF, 0x3); |
| 289 | case X86::VBLENDPSrri: |
| 290 | return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) || |
| 291 | ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3); |
| 292 | |
| 293 | case X86::VPBLENDWrri: |
| 294 | // TODO: Add X86::VPBLENDWrmi handling |
| 295 | // TODO: Add X86::VPBLENDWYrri handling |
| 296 | // TODO: Add X86::VPBLENDWYrmi handling |
| 297 | return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4); |
| 298 | |
| 299 | case X86::VPERMILPDri: |
| 300 | return ProcessVPERMILPDri(X86::VSHUFPDrri); |
| 301 | case X86::VPERMILPDYri: |
| 302 | return ProcessVPERMILPDri(X86::VSHUFPDYrri); |
| 303 | case X86::VPERMILPDZ128ri: |
| 304 | return ProcessVPERMILPDri(X86::VSHUFPDZ128rri); |
| 305 | case X86::VPERMILPDZ256ri: |
| 306 | return ProcessVPERMILPDri(X86::VSHUFPDZ256rri); |
| 307 | case X86::VPERMILPDZri: |
| 308 | return ProcessVPERMILPDri(X86::VSHUFPDZrri); |
| 309 | case X86::VPERMILPDZ128rikz: |
| 310 | return ProcessVPERMILPDri(X86::VSHUFPDZ128rrikz); |
| 311 | case X86::VPERMILPDZ256rikz: |
| 312 | return ProcessVPERMILPDri(X86::VSHUFPDZ256rrikz); |
| 313 | case X86::VPERMILPDZrikz: |
| 314 | return ProcessVPERMILPDri(X86::VSHUFPDZrrikz); |
| 315 | case X86::VPERMILPDZ128rik: |
| 316 | return ProcessVPERMILPDri(X86::VSHUFPDZ128rrik); |
| 317 | case X86::VPERMILPDZ256rik: |
| 318 | return ProcessVPERMILPDri(X86::VSHUFPDZ256rrik); |
| 319 | case X86::VPERMILPDZrik: |
| 320 | return ProcessVPERMILPDri(X86::VSHUFPDZrrik); |
| 321 | |
| 322 | case X86::VPERMILPSri: |
| 323 | return ProcessVPERMILPSri(X86::VSHUFPSrri); |
| 324 | case X86::VPERMILPSYri: |
| 325 | return ProcessVPERMILPSri(X86::VSHUFPSYrri); |
| 326 | case X86::VPERMILPSZ128ri: |
| 327 | return ProcessVPERMILPSri(X86::VSHUFPSZ128rri); |
| 328 | case X86::VPERMILPSZ256ri: |
| 329 | return ProcessVPERMILPSri(X86::VSHUFPSZ256rri); |
| 330 | case X86::VPERMILPSZri: |
| 331 | return ProcessVPERMILPSri(X86::VSHUFPSZrri); |
| 332 | case X86::VPERMILPSZ128rikz: |
| 333 | return ProcessVPERMILPSri(X86::VSHUFPSZ128rrikz); |
| 334 | case X86::VPERMILPSZ256rikz: |
| 335 | return ProcessVPERMILPSri(X86::VSHUFPSZ256rrikz); |
| 336 | case X86::VPERMILPSZrikz: |
| 337 | return ProcessVPERMILPSri(X86::VSHUFPSZrrikz); |
| 338 | case X86::VPERMILPSZ128rik: |
| 339 | return ProcessVPERMILPSri(X86::VSHUFPSZ128rrik); |
| 340 | case X86::VPERMILPSZ256rik: |
| 341 | return ProcessVPERMILPSri(X86::VSHUFPSZ256rrik); |
| 342 | case X86::VPERMILPSZrik: |
| 343 | return ProcessVPERMILPSri(X86::VSHUFPSZrrik); |
| 344 | case X86::VPERMILPSmi: |
| 345 | return ProcessVPERMILPSmi(X86::VPSHUFDmi); |
| 346 | case X86::VPERMILPSYmi: |
| 347 | // TODO: See if there is a more generic way we can test if the replacement |
| 348 | // instruction is supported. |
| 349 | return ST->hasAVX2() ? ProcessVPERMILPSmi(X86::VPSHUFDYmi) : false; |
| 350 | case X86::VPERMILPSZ128mi: |
| 351 | return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi); |
| 352 | case X86::VPERMILPSZ256mi: |
| 353 | return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi); |
| 354 | case X86::VPERMILPSZmi: |
| 355 | return ProcessVPERMILPSmi(X86::VPSHUFDZmi); |
| 356 | case X86::VPERMILPSZ128mikz: |
| 357 | return ProcessVPERMILPSmi(X86::VPSHUFDZ128mikz); |
| 358 | case X86::VPERMILPSZ256mikz: |
| 359 | return ProcessVPERMILPSmi(X86::VPSHUFDZ256mikz); |
| 360 | case X86::VPERMILPSZmikz: |
| 361 | return ProcessVPERMILPSmi(X86::VPSHUFDZmikz); |
| 362 | case X86::VPERMILPSZ128mik: |
| 363 | return ProcessVPERMILPSmi(X86::VPSHUFDZ128mik); |
| 364 | case X86::VPERMILPSZ256mik: |
| 365 | return ProcessVPERMILPSmi(X86::VPSHUFDZ256mik); |
| 366 | case X86::VPERMILPSZmik: |
| 367 | return ProcessVPERMILPSmi(X86::VPSHUFDZmik); |
| 368 | |
| 369 | case X86::MOVLHPSrr: |
| 370 | case X86::UNPCKLPDrr: |
| 371 | return ProcessUNPCKLPDrr(X86::PUNPCKLQDQrr, X86::SHUFPDrri); |
| 372 | case X86::VMOVLHPSrr: |
| 373 | case X86::VUNPCKLPDrr: |
| 374 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQrr, X86::VSHUFPDrri); |
| 375 | case X86::VUNPCKLPDYrr: |
| 376 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQYrr, X86::VSHUFPDYrri); |
| 377 | // VMOVLHPS is always 128 bits. |
| 378 | case X86::VMOVLHPSZrr: |
| 379 | case X86::VUNPCKLPDZ128rr: |
| 380 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rr, X86::VSHUFPDZ128rri); |
| 381 | case X86::VUNPCKLPDZ256rr: |
| 382 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rr, X86::VSHUFPDZ256rri); |
| 383 | case X86::VUNPCKLPDZrr: |
| 384 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrr, X86::VSHUFPDZrri); |
| 385 | case X86::VUNPCKLPDZ128rrk: |
| 386 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrk, X86::VSHUFPDZ128rrik); |
| 387 | case X86::VUNPCKLPDZ256rrk: |
| 388 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrk, X86::VSHUFPDZ256rrik); |
| 389 | case X86::VUNPCKLPDZrrk: |
| 390 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrk, X86::VSHUFPDZrrik); |
| 391 | case X86::VUNPCKLPDZ128rrkz: |
| 392 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrkz, X86::VSHUFPDZ128rrikz); |
| 393 | case X86::VUNPCKLPDZ256rrkz: |
| 394 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrkz, X86::VSHUFPDZ256rrikz); |
| 395 | case X86::VUNPCKLPDZrrkz: |
| 396 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrkz, X86::VSHUFPDZrrikz); |
| 397 | case X86::UNPCKHPDrr: |
| 398 | return ProcessUNPCKHPDrr(X86::PUNPCKHQDQrr, X86::SHUFPDrri); |
| 399 | case X86::VUNPCKHPDrr: |
| 400 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQrr, X86::VSHUFPDrri); |
| 401 | case X86::VUNPCKHPDYrr: |
| 402 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQYrr, X86::VSHUFPDYrri); |
| 403 | case X86::VUNPCKHPDZ128rr: |
| 404 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rr, X86::VSHUFPDZ128rri); |
| 405 | case X86::VUNPCKHPDZ256rr: |
| 406 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rr, X86::VSHUFPDZ256rri); |
| 407 | case X86::VUNPCKHPDZrr: |
| 408 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrr, X86::VSHUFPDZrri); |
| 409 | case X86::VUNPCKHPDZ128rrk: |
| 410 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrk, X86::VSHUFPDZ128rrik); |
| 411 | case X86::VUNPCKHPDZ256rrk: |
| 412 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrk, X86::VSHUFPDZ256rrik); |
| 413 | case X86::VUNPCKHPDZrrk: |
| 414 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrk, X86::VSHUFPDZrrik); |
| 415 | case X86::VUNPCKHPDZ128rrkz: |
| 416 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrkz, X86::VSHUFPDZ128rrikz); |
| 417 | case X86::VUNPCKHPDZ256rrkz: |
| 418 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrkz, X86::VSHUFPDZ256rrikz); |
| 419 | case X86::VUNPCKHPDZrrkz: |
| 420 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrkz, X86::VSHUFPDZrrikz); |
| 421 | case X86::UNPCKLPDrm: |
| 422 | return ProcessUNPCKPDrm(X86::PUNPCKLQDQrm); |
| 423 | case X86::VUNPCKLPDrm: |
| 424 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQrm); |
| 425 | case X86::VUNPCKLPDYrm: |
| 426 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQYrm); |
| 427 | case X86::VUNPCKLPDZ128rm: |
| 428 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rm); |
| 429 | case X86::VUNPCKLPDZ256rm: |
| 430 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rm); |
| 431 | case X86::VUNPCKLPDZrm: |
| 432 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrm); |
| 433 | case X86::VUNPCKLPDZ128rmk: |
| 434 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmk); |
| 435 | case X86::VUNPCKLPDZ256rmk: |
| 436 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmk); |
| 437 | case X86::VUNPCKLPDZrmk: |
| 438 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmk); |
| 439 | case X86::VUNPCKLPDZ128rmkz: |
| 440 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmkz); |
| 441 | case X86::VUNPCKLPDZ256rmkz: |
| 442 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmkz); |
| 443 | case X86::VUNPCKLPDZrmkz: |
| 444 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmkz); |
| 445 | case X86::UNPCKHPDrm: |
| 446 | return ProcessUNPCKPDrm(X86::PUNPCKHQDQrm); |
| 447 | case X86::VUNPCKHPDrm: |
| 448 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQrm); |
| 449 | case X86::VUNPCKHPDYrm: |
| 450 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQYrm); |
| 451 | case X86::VUNPCKHPDZ128rm: |
| 452 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rm); |
| 453 | case X86::VUNPCKHPDZ256rm: |
| 454 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rm); |
| 455 | case X86::VUNPCKHPDZrm: |
| 456 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrm); |
| 457 | case X86::VUNPCKHPDZ128rmk: |
| 458 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmk); |
| 459 | case X86::VUNPCKHPDZ256rmk: |
| 460 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmk); |
| 461 | case X86::VUNPCKHPDZrmk: |
| 462 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmk); |
| 463 | case X86::VUNPCKHPDZ128rmkz: |
| 464 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmkz); |
| 465 | case X86::VUNPCKHPDZ256rmkz: |
| 466 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmkz); |
| 467 | case X86::VUNPCKHPDZrmkz: |
| 468 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmkz); |
| 469 | |
| 470 | case X86::UNPCKLPSrr: |
| 471 | return ProcessUNPCKPS(X86::PUNPCKLDQrr); |
| 472 | case X86::VUNPCKLPSrr: |
| 473 | return ProcessUNPCKPS(X86::VPUNPCKLDQrr); |
| 474 | case X86::VUNPCKLPSYrr: |
| 475 | return ProcessUNPCKPS(X86::VPUNPCKLDQYrr); |
| 476 | case X86::VUNPCKLPSZ128rr: |
| 477 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rr); |
| 478 | case X86::VUNPCKLPSZ256rr: |
| 479 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rr); |
| 480 | case X86::VUNPCKLPSZrr: |
| 481 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrr); |
| 482 | case X86::VUNPCKLPSZ128rrk: |
| 483 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrk); |
| 484 | case X86::VUNPCKLPSZ256rrk: |
| 485 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrk); |
| 486 | case X86::VUNPCKLPSZrrk: |
| 487 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrrk); |
| 488 | case X86::VUNPCKLPSZ128rrkz: |
| 489 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrkz); |
| 490 | case X86::VUNPCKLPSZ256rrkz: |
| 491 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrkz); |
| 492 | case X86::VUNPCKLPSZrrkz: |
| 493 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrrkz); |
| 494 | case X86::UNPCKHPSrr: |
| 495 | return ProcessUNPCKPS(X86::PUNPCKHDQrr); |
| 496 | case X86::VUNPCKHPSrr: |
| 497 | return ProcessUNPCKPS(X86::VPUNPCKHDQrr); |
| 498 | case X86::VUNPCKHPSYrr: |
| 499 | return ProcessUNPCKPS(X86::VPUNPCKHDQYrr); |
| 500 | case X86::VUNPCKHPSZ128rr: |
| 501 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rr); |
| 502 | case X86::VUNPCKHPSZ256rr: |
| 503 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rr); |
| 504 | case X86::VUNPCKHPSZrr: |
| 505 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrr); |
| 506 | case X86::VUNPCKHPSZ128rrk: |
| 507 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrk); |
| 508 | case X86::VUNPCKHPSZ256rrk: |
| 509 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrk); |
| 510 | case X86::VUNPCKHPSZrrk: |
| 511 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrrk); |
| 512 | case X86::VUNPCKHPSZ128rrkz: |
| 513 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrkz); |
| 514 | case X86::VUNPCKHPSZ256rrkz: |
| 515 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrkz); |
| 516 | case X86::VUNPCKHPSZrrkz: |
| 517 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrrkz); |
| 518 | case X86::UNPCKLPSrm: |
| 519 | return ProcessUNPCKPS(X86::PUNPCKLDQrm); |
| 520 | case X86::VUNPCKLPSrm: |
| 521 | return ProcessUNPCKPS(X86::VPUNPCKLDQrm); |
| 522 | case X86::VUNPCKLPSYrm: |
| 523 | return ProcessUNPCKPS(X86::VPUNPCKLDQYrm); |
| 524 | case X86::VUNPCKLPSZ128rm: |
| 525 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rm); |
| 526 | case X86::VUNPCKLPSZ256rm: |
| 527 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rm); |
| 528 | case X86::VUNPCKLPSZrm: |
| 529 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrm); |
| 530 | case X86::VUNPCKLPSZ128rmk: |
| 531 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmk); |
| 532 | case X86::VUNPCKLPSZ256rmk: |
| 533 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmk); |
| 534 | case X86::VUNPCKLPSZrmk: |
| 535 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrmk); |
| 536 | case X86::VUNPCKLPSZ128rmkz: |
| 537 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmkz); |
| 538 | case X86::VUNPCKLPSZ256rmkz: |
| 539 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmkz); |
| 540 | case X86::VUNPCKLPSZrmkz: |
| 541 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrmkz); |
| 542 | case X86::UNPCKHPSrm: |
| 543 | return ProcessUNPCKPS(X86::PUNPCKHDQrm); |
| 544 | case X86::VUNPCKHPSrm: |
| 545 | return ProcessUNPCKPS(X86::VPUNPCKHDQrm); |
| 546 | case X86::VUNPCKHPSYrm: |
| 547 | return ProcessUNPCKPS(X86::VPUNPCKHDQYrm); |
| 548 | case X86::VUNPCKHPSZ128rm: |
| 549 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rm); |
| 550 | case X86::VUNPCKHPSZ256rm: |
| 551 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rm); |
| 552 | case X86::VUNPCKHPSZrm: |
| 553 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrm); |
| 554 | case X86::VUNPCKHPSZ128rmk: |
| 555 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmk); |
| 556 | case X86::VUNPCKHPSZ256rmk: |
| 557 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmk); |
| 558 | case X86::VUNPCKHPSZrmk: |
| 559 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrmk); |
| 560 | case X86::VUNPCKHPSZ128rmkz: |
| 561 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmkz); |
| 562 | case X86::VUNPCKHPSZ256rmkz: |
| 563 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz); |
| 564 | case X86::VUNPCKHPSZrmkz: |
| 565 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz); |
| 566 | default: |
| 567 | return false; |
| 568 | } |
| 569 | } |
| 570 | |
| 571 | bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) { |
| 572 | LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n" ;); |
| 573 | bool Changed = false; |
| 574 | ST = &MF.getSubtarget<X86Subtarget>(); |
| 575 | TII = ST->getInstrInfo(); |
| 576 | SM = &ST->getSchedModel(); |
| 577 | |
| 578 | for (MachineBasicBlock &MBB : MF) { |
| 579 | for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { |
| 580 | if (processInstruction(MF, MBB, I)) { |
| 581 | ++NumInstChanges; |
| 582 | Changed = true; |
| 583 | } |
| 584 | } |
| 585 | } |
| 586 | LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n" ;); |
| 587 | return Changed; |
| 588 | } |
| 589 | |