1 | //===-- X86FixupInstTunings.cpp - replace instructions -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
// This file does a tuning pass replacing slower machine instructions
// with faster ones. We do this here, as opposed to during normal ISel, as
// attempting to get the "right" instruction can break patterns. This pass
// is not meant to search for special cases where an instruction can be
// transformed to another; it is only meant to do transformations where the old
// instruction is always replaceable with the new instruction. For example:
15 | // |
//    `vpermq ymm` -> `vpshufd ymm`
//          -- BAD, not always valid (lane cross/non-repeated mask)
//
//    `vpermilps ymm` -> `vpshufd ymm`
//          -- GOOD, always replaceable
21 | // |
22 | //===----------------------------------------------------------------------===// |
23 | |
24 | #include "X86.h" |
25 | #include "X86InstrInfo.h" |
26 | #include "X86Subtarget.h" |
27 | #include "llvm/ADT/Statistic.h" |
28 | #include "llvm/CodeGen/MachineFunctionPass.h" |
29 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
30 | |
31 | using namespace llvm; |
32 | |
33 | #define DEBUG_TYPE "x86-fixup-inst-tuning" |
34 | |
// Counts the instructions this pass rewrote; it is bumped once for every
// `processInstruction` call that reports a change.
STATISTIC(NumInstChanges, "Number of instructions changes" );
36 | |
37 | namespace { |
38 | class X86FixupInstTuningPass : public MachineFunctionPass { |
39 | public: |
40 | static char ID; |
41 | |
42 | X86FixupInstTuningPass() : MachineFunctionPass(ID) {} |
43 | |
44 | StringRef getPassName() const override { return "X86 Fixup Inst Tuning" ; } |
45 | |
46 | bool runOnMachineFunction(MachineFunction &MF) override; |
47 | bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB, |
48 | MachineBasicBlock::iterator &I); |
49 | |
50 | // This pass runs after regalloc and doesn't support VReg operands. |
51 | MachineFunctionProperties getRequiredProperties() const override { |
52 | return MachineFunctionProperties().setNoVRegs(); |
53 | } |
54 | |
55 | private: |
56 | const X86InstrInfo *TII = nullptr; |
57 | const X86Subtarget *ST = nullptr; |
58 | const MCSchedModel *SM = nullptr; |
59 | }; |
60 | } // end anonymous namespace |
61 | |
// Definition of the static pass identifier declared in the class; the
// constructor hands it to the MachineFunctionPass base.
char X86FixupInstTuningPass::ID = 0;

// Registers the pass, using DEBUG_TYPE for both the command-line argument and
// the descriptive name.
// NOTE(review): the two trailing `false` values are presumably the
// CFG-only / is-analysis flags of INITIALIZE_PASS -- confirm against
// PassSupport.h.
INITIALIZE_PASS(X86FixupInstTuningPass, DEBUG_TYPE, DEBUG_TYPE, false, false)
65 | |
66 | FunctionPass *llvm::createX86FixupInstTuning() { |
67 | return new X86FixupInstTuningPass(); |
68 | } |
69 | |
// Three-way "is the new value strictly better (smaller)?" comparison on two
// optionals of the same type.
//
// Returns:
//   - std::nullopt when either side has no value, or when both values are
//     equal (i.e. the comparison is inconclusive / a tie);
//   - true  when `*NewVal <  *CurVal`;
//   - false when `*NewVal >  *CurVal`.
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  // Nothing to compare unless both sides are known.
  if (!NewVal.has_value() || !CurVal.has_value())
    return std::nullopt;

  // Only a real difference yields a verdict.
  if (*NewVal != *CurVal)
    return *NewVal < *CurVal;

  return std::nullopt;
}
77 | |
78 | bool X86FixupInstTuningPass::processInstruction( |
79 | MachineFunction &MF, MachineBasicBlock &MBB, |
80 | MachineBasicBlock::iterator &I) { |
81 | MachineInstr &MI = *I; |
82 | unsigned Opc = MI.getOpcode(); |
83 | unsigned NumOperands = MI.getDesc().getNumOperands(); |
84 | bool OptSize = MF.getFunction().hasOptSize(); |
85 | |
86 | auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> { |
87 | // We already checked that SchedModel exists in `NewOpcPreferable`. |
88 | return MCSchedModel::getReciprocalThroughput( |
89 | STI: *ST, SCDesc: *(SM->getSchedClassDesc(SchedClassIdx: TII->get(Opcode).getSchedClass()))); |
90 | }; |
91 | |
92 | auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> { |
93 | // We already checked that SchedModel exists in `NewOpcPreferable`. |
94 | return MCSchedModel::computeInstrLatency( |
95 | STI: *ST, SCDesc: *(SM->getSchedClassDesc(SchedClassIdx: TII->get(Opcode).getSchedClass()))); |
96 | }; |
97 | |
98 | auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> { |
99 | if (unsigned Size = TII->get(Opcode).getSize()) |
100 | return Size; |
101 | // Zero size means we where unable to compute it. |
102 | return std::nullopt; |
103 | }; |
104 | |
105 | auto NewOpcPreferable = [&](unsigned NewOpc, |
106 | bool ReplaceInTie = true) -> bool { |
107 | std::optional<bool> Res; |
108 | if (SM->hasInstrSchedModel()) { |
109 | // Compare tput -> lat -> code size. |
110 | Res = CmpOptionals(NewVal: GetInstTput(NewOpc), CurVal: GetInstTput(Opc)); |
111 | if (Res.has_value()) |
112 | return *Res; |
113 | |
114 | Res = CmpOptionals(NewVal: GetInstLat(NewOpc), CurVal: GetInstLat(Opc)); |
115 | if (Res.has_value()) |
116 | return *Res; |
117 | } |
118 | |
119 | Res = CmpOptionals(NewVal: GetInstSize(Opc), CurVal: GetInstSize(NewOpc)); |
120 | if (Res.has_value()) |
121 | return *Res; |
122 | |
123 | // We either have either were unable to get tput/lat/codesize or all values |
124 | // were equal. Return specified option for a tie. |
125 | return ReplaceInTie; |
126 | }; |
127 | |
128 | // `vpermilpd r, i` -> `vshufpd r, r, i` |
129 | // `vpermilpd r, i, k` -> `vshufpd r, r, i, k` |
130 | // `vshufpd` is always as fast or faster than `vpermilpd` and takes |
131 | // 1 less byte of code size for VEX and EVEX encoding. |
132 | auto ProcessVPERMILPDri = [&](unsigned NewOpc) -> bool { |
133 | if (!NewOpcPreferable(NewOpc)) |
134 | return false; |
135 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
136 | { |
137 | unsigned MaskImm = MI.getOperand(i: NumOperands - 1).getImm(); |
138 | MI.removeOperand(OpNo: NumOperands - 1); |
139 | MI.addOperand(Op: MI.getOperand(i: NumOperands - 2)); |
140 | MI.setDesc(TII->get(Opcode: NewOpc)); |
141 | MI.addOperand(Op: MachineOperand::CreateImm(Val: MaskImm)); |
142 | } |
143 | LLVM_DEBUG(dbgs() << " With: " << MI); |
144 | return true; |
145 | }; |
146 | |
147 | // `vpermilps r, i` -> `vshufps r, r, i` |
148 | // `vpermilps r, i, k` -> `vshufps r, r, i, k` |
149 | // `vshufps` is always as fast or faster than `vpermilps` and takes |
150 | // 1 less byte of code size for VEX and EVEX encoding. |
151 | auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool { |
152 | if (!NewOpcPreferable(NewOpc)) |
153 | return false; |
154 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
155 | { |
156 | unsigned MaskImm = MI.getOperand(i: NumOperands - 1).getImm(); |
157 | MI.removeOperand(OpNo: NumOperands - 1); |
158 | MI.addOperand(Op: MI.getOperand(i: NumOperands - 2)); |
159 | MI.setDesc(TII->get(Opcode: NewOpc)); |
160 | MI.addOperand(Op: MachineOperand::CreateImm(Val: MaskImm)); |
161 | } |
162 | LLVM_DEBUG(dbgs() << " With: " << MI); |
163 | return true; |
164 | }; |
165 | |
166 | // `vpermilps m, i` -> `vpshufd m, i` iff no domain delay penalty on shuffles. |
167 | // `vpshufd` is always as fast or faster than `vpermilps` and takes 1 less |
168 | // byte of code size. |
169 | auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool { |
170 | // TODO: Might be work adding bypass delay if -Os/-Oz is enabled as |
171 | // `vpshufd` saves a byte of code size. |
172 | if (!ST->hasNoDomainDelayShuffle() || |
173 | !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) |
174 | return false; |
175 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
176 | { |
177 | MI.setDesc(TII->get(Opcode: NewOpc)); |
178 | } |
179 | LLVM_DEBUG(dbgs() << " With: " << MI); |
180 | return true; |
181 | }; |
182 | |
183 | // `vunpcklpd/vmovlhps r, r` -> `vunpcklqdq r, r`/`vshufpd r, r, 0x00` |
184 | // `vunpckhpd/vmovlhps r, r` -> `vunpckhqdq r, r`/`vshufpd r, r, 0xff` |
185 | // `vunpcklpd r, r, k` -> `vunpcklqdq r, r, k`/`vshufpd r, r, k, 0x00` |
186 | // `vunpckhpd r, r, k` -> `vunpckhqdq r, r, k`/`vshufpd r, r, k, 0xff` |
187 | // `vunpcklpd r, m` -> `vunpcklqdq r, m, k` |
188 | // `vunpckhpd r, m` -> `vunpckhqdq r, m, k` |
189 | // `vunpcklpd r, m, k` -> `vunpcklqdq r, m, k` |
190 | // `vunpckhpd r, m, k` -> `vunpckhqdq r, m, k` |
191 | // 1) If no bypass delay and `vunpck{l|h}qdq` faster than `vunpck{l|h}pd` |
192 | // -> `vunpck{l|h}qdq` |
193 | // 2) If `vshufpd` faster than `vunpck{l|h}pd` |
194 | // -> `vshufpd` |
195 | // |
196 | // `vunpcklps` -> `vunpckldq` (for all operand types if no bypass delay) |
197 | auto ProcessUNPCK = [&](unsigned NewOpc, unsigned MaskImm) -> bool { |
198 | if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) |
199 | return false; |
200 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
201 | { |
202 | MI.setDesc(TII->get(Opcode: NewOpc)); |
203 | MI.addOperand(Op: MachineOperand::CreateImm(Val: MaskImm)); |
204 | } |
205 | LLVM_DEBUG(dbgs() << " With: " << MI); |
206 | return true; |
207 | }; |
208 | |
209 | auto ProcessUNPCKToIntDomain = [&](unsigned NewOpc) -> bool { |
210 | // TODO it may be worth it to set ReplaceInTie to `true` as there is no real |
211 | // downside to the integer unpck, but if someone doesn't specify exact |
212 | // target we won't find it faster. |
213 | if (!ST->hasNoDomainDelayShuffle() || |
214 | !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) |
215 | return false; |
216 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
217 | { |
218 | MI.setDesc(TII->get(Opcode: NewOpc)); |
219 | } |
220 | LLVM_DEBUG(dbgs() << " With: " << MI); |
221 | return true; |
222 | }; |
223 | |
224 | auto ProcessUNPCKLPDrr = [&](unsigned NewOpcIntDomain, |
225 | unsigned NewOpc) -> bool { |
226 | if (ProcessUNPCKToIntDomain(NewOpcIntDomain)) |
227 | return true; |
228 | return ProcessUNPCK(NewOpc, 0x00); |
229 | }; |
230 | auto ProcessUNPCKHPDrr = [&](unsigned NewOpcIntDomain, |
231 | unsigned NewOpc) -> bool { |
232 | if (ProcessUNPCKToIntDomain(NewOpcIntDomain)) |
233 | return true; |
234 | return ProcessUNPCK(NewOpc, 0xff); |
235 | }; |
236 | |
237 | auto ProcessUNPCKPDrm = [&](unsigned NewOpcIntDomain) -> bool { |
238 | return ProcessUNPCKToIntDomain(NewOpcIntDomain); |
239 | }; |
240 | |
241 | auto ProcessUNPCKPS = [&](unsigned NewOpc) -> bool { |
242 | return ProcessUNPCKToIntDomain(NewOpc); |
243 | }; |
244 | |
245 | auto ProcessBLENDWToBLENDD = [&](unsigned MovOpc, unsigned NumElts) -> bool { |
246 | if (!ST->hasAVX2() || !NewOpcPreferable(MovOpc)) |
247 | return false; |
248 | // Convert to VPBLENDD if scaling the VPBLENDW mask down/up loses no bits. |
249 | APInt MaskW = |
250 | APInt(8, MI.getOperand(i: NumOperands - 1).getImm(), /*IsSigned=*/false); |
251 | APInt MaskD = APIntOps::ScaleBitMask(A: MaskW, NewBitWidth: 4, /*MatchAllBits=*/true); |
252 | if (MaskW != APIntOps::ScaleBitMask(A: MaskD, NewBitWidth: 8, /*MatchAllBits=*/true)) |
253 | return false; |
254 | APInt NewMaskD = APInt::getSplat(NewLen: NumElts, V: MaskD); |
255 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
256 | { |
257 | MI.setDesc(TII->get(Opcode: MovOpc)); |
258 | MI.removeOperand(OpNo: NumOperands - 1); |
259 | MI.addOperand(Op: MachineOperand::CreateImm(Val: NewMaskD.getZExtValue())); |
260 | } |
261 | LLVM_DEBUG(dbgs() << " With: " << MI); |
262 | return true; |
263 | }; |
264 | |
265 | auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask, |
266 | unsigned MovImm) -> bool { |
267 | if ((MI.getOperand(i: NumOperands - 1).getImm() & Mask) != MovImm) |
268 | return false; |
269 | if (!OptSize && !NewOpcPreferable(MovOpc)) |
270 | return false; |
271 | LLVM_DEBUG(dbgs() << "Replacing: " << MI); |
272 | { |
273 | MI.setDesc(TII->get(Opcode: MovOpc)); |
274 | MI.removeOperand(OpNo: NumOperands - 1); |
275 | } |
276 | LLVM_DEBUG(dbgs() << " With: " << MI); |
277 | return true; |
278 | }; |
279 | |
280 | switch (Opc) { |
281 | case X86::BLENDPDrri: |
282 | return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1); |
283 | case X86::VBLENDPDrri: |
284 | return ProcessBLENDToMOV(X86::VMOVSDrr, 0x3, 0x1); |
285 | |
286 | case X86::BLENDPSrri: |
287 | return ProcessBLENDToMOV(X86::MOVSSrr, 0xF, 0x1) || |
288 | ProcessBLENDToMOV(X86::MOVSDrr, 0xF, 0x3); |
289 | case X86::VBLENDPSrri: |
290 | return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) || |
291 | ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3); |
292 | |
293 | case X86::VPBLENDWrri: |
294 | // TODO: Add X86::VPBLENDWrmi handling |
295 | // TODO: Add X86::VPBLENDWYrri handling |
296 | // TODO: Add X86::VPBLENDWYrmi handling |
297 | return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4); |
298 | |
299 | case X86::VPERMILPDri: |
300 | return ProcessVPERMILPDri(X86::VSHUFPDrri); |
301 | case X86::VPERMILPDYri: |
302 | return ProcessVPERMILPDri(X86::VSHUFPDYrri); |
303 | case X86::VPERMILPDZ128ri: |
304 | return ProcessVPERMILPDri(X86::VSHUFPDZ128rri); |
305 | case X86::VPERMILPDZ256ri: |
306 | return ProcessVPERMILPDri(X86::VSHUFPDZ256rri); |
307 | case X86::VPERMILPDZri: |
308 | return ProcessVPERMILPDri(X86::VSHUFPDZrri); |
309 | case X86::VPERMILPDZ128rikz: |
310 | return ProcessVPERMILPDri(X86::VSHUFPDZ128rrikz); |
311 | case X86::VPERMILPDZ256rikz: |
312 | return ProcessVPERMILPDri(X86::VSHUFPDZ256rrikz); |
313 | case X86::VPERMILPDZrikz: |
314 | return ProcessVPERMILPDri(X86::VSHUFPDZrrikz); |
315 | case X86::VPERMILPDZ128rik: |
316 | return ProcessVPERMILPDri(X86::VSHUFPDZ128rrik); |
317 | case X86::VPERMILPDZ256rik: |
318 | return ProcessVPERMILPDri(X86::VSHUFPDZ256rrik); |
319 | case X86::VPERMILPDZrik: |
320 | return ProcessVPERMILPDri(X86::VSHUFPDZrrik); |
321 | |
322 | case X86::VPERMILPSri: |
323 | return ProcessVPERMILPSri(X86::VSHUFPSrri); |
324 | case X86::VPERMILPSYri: |
325 | return ProcessVPERMILPSri(X86::VSHUFPSYrri); |
326 | case X86::VPERMILPSZ128ri: |
327 | return ProcessVPERMILPSri(X86::VSHUFPSZ128rri); |
328 | case X86::VPERMILPSZ256ri: |
329 | return ProcessVPERMILPSri(X86::VSHUFPSZ256rri); |
330 | case X86::VPERMILPSZri: |
331 | return ProcessVPERMILPSri(X86::VSHUFPSZrri); |
332 | case X86::VPERMILPSZ128rikz: |
333 | return ProcessVPERMILPSri(X86::VSHUFPSZ128rrikz); |
334 | case X86::VPERMILPSZ256rikz: |
335 | return ProcessVPERMILPSri(X86::VSHUFPSZ256rrikz); |
336 | case X86::VPERMILPSZrikz: |
337 | return ProcessVPERMILPSri(X86::VSHUFPSZrrikz); |
338 | case X86::VPERMILPSZ128rik: |
339 | return ProcessVPERMILPSri(X86::VSHUFPSZ128rrik); |
340 | case X86::VPERMILPSZ256rik: |
341 | return ProcessVPERMILPSri(X86::VSHUFPSZ256rrik); |
342 | case X86::VPERMILPSZrik: |
343 | return ProcessVPERMILPSri(X86::VSHUFPSZrrik); |
344 | case X86::VPERMILPSmi: |
345 | return ProcessVPERMILPSmi(X86::VPSHUFDmi); |
346 | case X86::VPERMILPSYmi: |
347 | // TODO: See if there is a more generic way we can test if the replacement |
348 | // instruction is supported. |
349 | return ST->hasAVX2() ? ProcessVPERMILPSmi(X86::VPSHUFDYmi) : false; |
350 | case X86::VPERMILPSZ128mi: |
351 | return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi); |
352 | case X86::VPERMILPSZ256mi: |
353 | return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi); |
354 | case X86::VPERMILPSZmi: |
355 | return ProcessVPERMILPSmi(X86::VPSHUFDZmi); |
356 | case X86::VPERMILPSZ128mikz: |
357 | return ProcessVPERMILPSmi(X86::VPSHUFDZ128mikz); |
358 | case X86::VPERMILPSZ256mikz: |
359 | return ProcessVPERMILPSmi(X86::VPSHUFDZ256mikz); |
360 | case X86::VPERMILPSZmikz: |
361 | return ProcessVPERMILPSmi(X86::VPSHUFDZmikz); |
362 | case X86::VPERMILPSZ128mik: |
363 | return ProcessVPERMILPSmi(X86::VPSHUFDZ128mik); |
364 | case X86::VPERMILPSZ256mik: |
365 | return ProcessVPERMILPSmi(X86::VPSHUFDZ256mik); |
366 | case X86::VPERMILPSZmik: |
367 | return ProcessVPERMILPSmi(X86::VPSHUFDZmik); |
368 | |
369 | case X86::MOVLHPSrr: |
370 | case X86::UNPCKLPDrr: |
371 | return ProcessUNPCKLPDrr(X86::PUNPCKLQDQrr, X86::SHUFPDrri); |
372 | case X86::VMOVLHPSrr: |
373 | case X86::VUNPCKLPDrr: |
374 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQrr, X86::VSHUFPDrri); |
375 | case X86::VUNPCKLPDYrr: |
376 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQYrr, X86::VSHUFPDYrri); |
377 | // VMOVLHPS is always 128 bits. |
378 | case X86::VMOVLHPSZrr: |
379 | case X86::VUNPCKLPDZ128rr: |
380 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rr, X86::VSHUFPDZ128rri); |
381 | case X86::VUNPCKLPDZ256rr: |
382 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rr, X86::VSHUFPDZ256rri); |
383 | case X86::VUNPCKLPDZrr: |
384 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrr, X86::VSHUFPDZrri); |
385 | case X86::VUNPCKLPDZ128rrk: |
386 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrk, X86::VSHUFPDZ128rrik); |
387 | case X86::VUNPCKLPDZ256rrk: |
388 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrk, X86::VSHUFPDZ256rrik); |
389 | case X86::VUNPCKLPDZrrk: |
390 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrk, X86::VSHUFPDZrrik); |
391 | case X86::VUNPCKLPDZ128rrkz: |
392 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrkz, X86::VSHUFPDZ128rrikz); |
393 | case X86::VUNPCKLPDZ256rrkz: |
394 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrkz, X86::VSHUFPDZ256rrikz); |
395 | case X86::VUNPCKLPDZrrkz: |
396 | return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrkz, X86::VSHUFPDZrrikz); |
397 | case X86::UNPCKHPDrr: |
398 | return ProcessUNPCKHPDrr(X86::PUNPCKHQDQrr, X86::SHUFPDrri); |
399 | case X86::VUNPCKHPDrr: |
400 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQrr, X86::VSHUFPDrri); |
401 | case X86::VUNPCKHPDYrr: |
402 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQYrr, X86::VSHUFPDYrri); |
403 | case X86::VUNPCKHPDZ128rr: |
404 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rr, X86::VSHUFPDZ128rri); |
405 | case X86::VUNPCKHPDZ256rr: |
406 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rr, X86::VSHUFPDZ256rri); |
407 | case X86::VUNPCKHPDZrr: |
408 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrr, X86::VSHUFPDZrri); |
409 | case X86::VUNPCKHPDZ128rrk: |
410 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrk, X86::VSHUFPDZ128rrik); |
411 | case X86::VUNPCKHPDZ256rrk: |
412 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrk, X86::VSHUFPDZ256rrik); |
413 | case X86::VUNPCKHPDZrrk: |
414 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrk, X86::VSHUFPDZrrik); |
415 | case X86::VUNPCKHPDZ128rrkz: |
416 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrkz, X86::VSHUFPDZ128rrikz); |
417 | case X86::VUNPCKHPDZ256rrkz: |
418 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrkz, X86::VSHUFPDZ256rrikz); |
419 | case X86::VUNPCKHPDZrrkz: |
420 | return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrkz, X86::VSHUFPDZrrikz); |
421 | case X86::UNPCKLPDrm: |
422 | return ProcessUNPCKPDrm(X86::PUNPCKLQDQrm); |
423 | case X86::VUNPCKLPDrm: |
424 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQrm); |
425 | case X86::VUNPCKLPDYrm: |
426 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQYrm); |
427 | case X86::VUNPCKLPDZ128rm: |
428 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rm); |
429 | case X86::VUNPCKLPDZ256rm: |
430 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rm); |
431 | case X86::VUNPCKLPDZrm: |
432 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrm); |
433 | case X86::VUNPCKLPDZ128rmk: |
434 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmk); |
435 | case X86::VUNPCKLPDZ256rmk: |
436 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmk); |
437 | case X86::VUNPCKLPDZrmk: |
438 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmk); |
439 | case X86::VUNPCKLPDZ128rmkz: |
440 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmkz); |
441 | case X86::VUNPCKLPDZ256rmkz: |
442 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmkz); |
443 | case X86::VUNPCKLPDZrmkz: |
444 | return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmkz); |
445 | case X86::UNPCKHPDrm: |
446 | return ProcessUNPCKPDrm(X86::PUNPCKHQDQrm); |
447 | case X86::VUNPCKHPDrm: |
448 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQrm); |
449 | case X86::VUNPCKHPDYrm: |
450 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQYrm); |
451 | case X86::VUNPCKHPDZ128rm: |
452 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rm); |
453 | case X86::VUNPCKHPDZ256rm: |
454 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rm); |
455 | case X86::VUNPCKHPDZrm: |
456 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrm); |
457 | case X86::VUNPCKHPDZ128rmk: |
458 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmk); |
459 | case X86::VUNPCKHPDZ256rmk: |
460 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmk); |
461 | case X86::VUNPCKHPDZrmk: |
462 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmk); |
463 | case X86::VUNPCKHPDZ128rmkz: |
464 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmkz); |
465 | case X86::VUNPCKHPDZ256rmkz: |
466 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmkz); |
467 | case X86::VUNPCKHPDZrmkz: |
468 | return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmkz); |
469 | |
470 | case X86::UNPCKLPSrr: |
471 | return ProcessUNPCKPS(X86::PUNPCKLDQrr); |
472 | case X86::VUNPCKLPSrr: |
473 | return ProcessUNPCKPS(X86::VPUNPCKLDQrr); |
474 | case X86::VUNPCKLPSYrr: |
475 | return ProcessUNPCKPS(X86::VPUNPCKLDQYrr); |
476 | case X86::VUNPCKLPSZ128rr: |
477 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rr); |
478 | case X86::VUNPCKLPSZ256rr: |
479 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rr); |
480 | case X86::VUNPCKLPSZrr: |
481 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrr); |
482 | case X86::VUNPCKLPSZ128rrk: |
483 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrk); |
484 | case X86::VUNPCKLPSZ256rrk: |
485 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrk); |
486 | case X86::VUNPCKLPSZrrk: |
487 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrrk); |
488 | case X86::VUNPCKLPSZ128rrkz: |
489 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrkz); |
490 | case X86::VUNPCKLPSZ256rrkz: |
491 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrkz); |
492 | case X86::VUNPCKLPSZrrkz: |
493 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrrkz); |
494 | case X86::UNPCKHPSrr: |
495 | return ProcessUNPCKPS(X86::PUNPCKHDQrr); |
496 | case X86::VUNPCKHPSrr: |
497 | return ProcessUNPCKPS(X86::VPUNPCKHDQrr); |
498 | case X86::VUNPCKHPSYrr: |
499 | return ProcessUNPCKPS(X86::VPUNPCKHDQYrr); |
500 | case X86::VUNPCKHPSZ128rr: |
501 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rr); |
502 | case X86::VUNPCKHPSZ256rr: |
503 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rr); |
504 | case X86::VUNPCKHPSZrr: |
505 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrr); |
506 | case X86::VUNPCKHPSZ128rrk: |
507 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrk); |
508 | case X86::VUNPCKHPSZ256rrk: |
509 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrk); |
510 | case X86::VUNPCKHPSZrrk: |
511 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrrk); |
512 | case X86::VUNPCKHPSZ128rrkz: |
513 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrkz); |
514 | case X86::VUNPCKHPSZ256rrkz: |
515 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrkz); |
516 | case X86::VUNPCKHPSZrrkz: |
517 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrrkz); |
518 | case X86::UNPCKLPSrm: |
519 | return ProcessUNPCKPS(X86::PUNPCKLDQrm); |
520 | case X86::VUNPCKLPSrm: |
521 | return ProcessUNPCKPS(X86::VPUNPCKLDQrm); |
522 | case X86::VUNPCKLPSYrm: |
523 | return ProcessUNPCKPS(X86::VPUNPCKLDQYrm); |
524 | case X86::VUNPCKLPSZ128rm: |
525 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rm); |
526 | case X86::VUNPCKLPSZ256rm: |
527 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rm); |
528 | case X86::VUNPCKLPSZrm: |
529 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrm); |
530 | case X86::VUNPCKLPSZ128rmk: |
531 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmk); |
532 | case X86::VUNPCKLPSZ256rmk: |
533 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmk); |
534 | case X86::VUNPCKLPSZrmk: |
535 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrmk); |
536 | case X86::VUNPCKLPSZ128rmkz: |
537 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmkz); |
538 | case X86::VUNPCKLPSZ256rmkz: |
539 | return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmkz); |
540 | case X86::VUNPCKLPSZrmkz: |
541 | return ProcessUNPCKPS(X86::VPUNPCKLDQZrmkz); |
542 | case X86::UNPCKHPSrm: |
543 | return ProcessUNPCKPS(X86::PUNPCKHDQrm); |
544 | case X86::VUNPCKHPSrm: |
545 | return ProcessUNPCKPS(X86::VPUNPCKHDQrm); |
546 | case X86::VUNPCKHPSYrm: |
547 | return ProcessUNPCKPS(X86::VPUNPCKHDQYrm); |
548 | case X86::VUNPCKHPSZ128rm: |
549 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rm); |
550 | case X86::VUNPCKHPSZ256rm: |
551 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rm); |
552 | case X86::VUNPCKHPSZrm: |
553 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrm); |
554 | case X86::VUNPCKHPSZ128rmk: |
555 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmk); |
556 | case X86::VUNPCKHPSZ256rmk: |
557 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmk); |
558 | case X86::VUNPCKHPSZrmk: |
559 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrmk); |
560 | case X86::VUNPCKHPSZ128rmkz: |
561 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmkz); |
562 | case X86::VUNPCKHPSZ256rmkz: |
563 | return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz); |
564 | case X86::VUNPCKHPSZrmkz: |
565 | return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz); |
566 | default: |
567 | return false; |
568 | } |
569 | } |
570 | |
571 | bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) { |
572 | LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n" ;); |
573 | bool Changed = false; |
574 | ST = &MF.getSubtarget<X86Subtarget>(); |
575 | TII = ST->getInstrInfo(); |
576 | SM = &ST->getSchedModel(); |
577 | |
578 | for (MachineBasicBlock &MBB : MF) { |
579 | for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { |
580 | if (processInstruction(MF, MBB, I)) { |
581 | ++NumInstChanges; |
582 | Changed = true; |
583 | } |
584 | } |
585 | } |
586 | LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n" ;); |
587 | return Changed; |
588 | } |
589 | |