1 | //===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
// This pass performs the following peephole optimizations at the MIR level.
//
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
//    The mov pseudo instruction could be expanded to multiple mov instructions
//    later. In that case, we could try to split the constant operand of the
//    mov instruction into two immediates which can be directly encoded into
//    *Wri/*Xri instructions. That makes two AND/ADD/SUB instructions instead
//    of multiple `mov` + `and/add/sub` instructions.
25 | // |
// 4. Remove a redundant ORRWrs generated by a zero-extend.
27 | // |
28 | // %3:gpr32 = ORRWrs $wzr, %2, 0 |
29 | // %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32 |
30 | // |
31 | // If AArch64's 32-bit form of instruction defines the source operand of |
32 | // ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source |
33 | // operand are set to zero. |
34 | // |
35 | // 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx |
36 | // ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx |
37 | // |
38 | // 6. %intermediate:gpr32 = COPY %src:fpr128 |
39 | // %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32 |
40 | // ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0 |
41 | // |
42 | // In cases where a source FPR is copied to a GPR in order to be copied |
43 | // to a destination FPR, we can directly copy the values between the FPRs, |
44 | // eliminating the use of the Integer unit. When we match a pattern of |
45 | // INSvi[X]gpr that is preceded by a chain of COPY instructions from a FPR |
46 | // source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr |
47 | // instructions. |
48 | // |
// 7. If an MI implicitly sets the high 64 bits to zero, remove the `mov 0`
//    of the high 64 bits. For example,
51 | // |
52 | // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr |
53 | // %2:fpr64 = MOVID 0 |
54 | // %4:fpr128 = IMPLICIT_DEF |
55 | // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub |
56 | // %6:fpr128 = IMPLICIT_DEF |
57 | // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub |
58 | // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0 |
59 | // ==> |
60 | // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr |
61 | // %6:fpr128 = IMPLICIT_DEF |
62 | // %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub |
63 | // |
64 | //===----------------------------------------------------------------------===// |
65 | |
66 | #include "AArch64ExpandImm.h" |
67 | #include "AArch64InstrInfo.h" |
68 | #include "MCTargetDesc/AArch64AddressingModes.h" |
69 | #include "llvm/CodeGen/MachineDominators.h" |
70 | #include "llvm/CodeGen/MachineLoopInfo.h" |
71 | |
72 | using namespace llvm; |
73 | |
74 | #define DEBUG_TYPE "aarch64-mi-peephole-opt" |
75 | |
76 | namespace { |
77 | |
78 | struct AArch64MIPeepholeOpt : public MachineFunctionPass { |
79 | static char ID; |
80 | |
81 | AArch64MIPeepholeOpt() : MachineFunctionPass(ID) { |
82 | initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry()); |
83 | } |
84 | |
85 | const AArch64InstrInfo *TII; |
86 | const AArch64RegisterInfo *TRI; |
87 | MachineLoopInfo *MLI; |
88 | MachineRegisterInfo *MRI; |
89 | |
90 | using OpcodePair = std::pair<unsigned, unsigned>; |
91 | template <typename T> |
92 | using SplitAndOpcFunc = |
93 | std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>; |
94 | using BuildMIFunc = |
95 | std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned, |
96 | Register, Register, Register)>; |
97 | |
  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use splitTwoPartImm to handle the
  /// optimization.
101 | /// |
102 | /// To implement, the following function types must be passed to |
103 | /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if |
104 | /// splitting the immediate is valid and returns the associated new opcode. A |
105 | /// BuildMIFunc must be implemented to build the two immediate instructions. |
106 | /// |
107 | /// Example Pattern (where IMM would require 2+ MOV instructions): |
108 | /// %dst = <Instr>rr %src IMM [...] |
109 | /// becomes: |
110 | /// %tmp = <Instr>ri %src (encode half IMM) [...] |
111 | /// %dst = <Instr>ri %tmp (encode half IMM) [...] |
112 | template <typename T> |
113 | bool splitTwoPartImm(MachineInstr &MI, |
114 | SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr); |
115 | |
116 | bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI, |
117 | MachineInstr *&SubregToRegMI); |
118 | |
119 | template <typename T> |
120 | bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI); |
121 | template <typename T> |
122 | bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI); |
123 | |
124 | template <typename T> |
125 | bool visitAND(unsigned Opc, MachineInstr &MI); |
126 | bool visitORR(MachineInstr &MI); |
127 | bool visitINSERT(MachineInstr &MI); |
128 | bool visitINSviGPR(MachineInstr &MI, unsigned Opc); |
129 | bool visitINSvi64lane(MachineInstr &MI); |
130 | bool visitFMOVDr(MachineInstr &MI); |
131 | bool visitCopy(MachineInstr &MI); |
132 | bool runOnMachineFunction(MachineFunction &MF) override; |
133 | |
134 | StringRef getPassName() const override { |
135 | return "AArch64 MI Peephole Optimization pass" ; |
136 | } |
137 | |
138 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
139 | AU.setPreservesCFG(); |
140 | AU.addRequired<MachineLoopInfoWrapperPass>(); |
141 | MachineFunctionPass::getAnalysisUsage(AU); |
142 | } |
143 | }; |
144 | |
145 | char AArch64MIPeepholeOpt::ID = 0; |
146 | |
147 | } // end anonymous namespace |
148 | |
INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)
151 | |
152 | template <typename T> |
153 | static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { |
154 | T UImm = static_cast<T>(Imm); |
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
156 | return false; |
157 | |
158 | // If this immediate can be handled by one instruction, do not split it. |
159 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
161 | if (Insn.size() == 1) |
162 | return false; |
163 | |
  // The bitmask immediate consists of consecutive ones. Say there is a
  // constant 0b00000000001000000000010000000000, which does not consist of
  // consecutive ones. We can split it into two bitmask immediates like
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
  // ANDing with these two bitmask immediates recovers the original constant.
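  // As a concrete check (illustrative): for UImm = 0x00200400 the two masks
  // are NewImm1 = 0x003FFC00 and NewImm2 = 0xFFE007FF, and indeed
  // 0x003FFC00 & 0xFFE007FF == 0x00200400.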
169 | unsigned LowestBitSet = llvm::countr_zero(UImm); |
170 | unsigned HighestBitSet = Log2_64(UImm); |
171 | |
172 | // Create a mask which is filled with one from the position of lowest bit set |
173 | // to the position of highest bit set. |
174 | T NewImm1 = (static_cast<T>(2) << HighestBitSet) - |
175 | (static_cast<T>(1) << LowestBitSet); |
176 | // Create a mask which is filled with one outside the position of lowest bit |
177 | // set and the position of highest bit set. |
178 | T NewImm2 = UImm | ~NewImm1; |
179 | |
  // If the split value is not a valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
187 | return true; |
188 | } |
189 | |
190 | template <typename T> |
191 | bool AArch64MIPeepholeOpt::visitAND( |
192 | unsigned Opc, MachineInstr &MI) { |
  // Try the transformation below.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two bitmask immediates. That makes only two AND instructions instead of
  // multiple mov + and instructions.
202 | |
203 | return splitTwoPartImm<T>( |
204 | MI, |
205 | [Opc](T Imm, unsigned RegSize, T &Imm0, |
206 | T &Imm1) -> std::optional<OpcodePair> { |
207 | if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) |
          return std::make_pair(Opc, Opc);
209 | return std::nullopt; |
210 | }, |
211 | [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, |
212 | unsigned Imm1, Register SrcReg, Register NewTmpReg, |
213 | Register NewDstReg) { |
214 | DebugLoc DL = MI.getDebugLoc(); |
215 | MachineBasicBlock *MBB = MI.getParent(); |
216 | BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opcode.first), DestReg: NewTmpReg) |
217 | .addReg(RegNo: SrcReg) |
218 | .addImm(Val: Imm0); |
219 | BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opcode.second), DestReg: NewDstReg) |
220 | .addReg(RegNo: NewTmpReg) |
221 | .addImm(Val: Imm1); |
222 | }); |
223 | } |
224 | |
225 | bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) { |
  // Check whether this ORR comes from the zero-extend pattern below.
227 | // |
228 | // def : Pat<(i64 (zext GPR32:$src)), |
229 | // (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>; |
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
237 | if (!SrcMI) |
238 | return false; |
239 | |
240 | // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC |
241 | // |
242 | // When you use the 32-bit form of an instruction, the upper 32 bits of the |
243 | // source registers are ignored and the upper 32 bits of the destination |
244 | // register are set to zero. |
245 | // |
  // If the source operand of the zero-extend is defined by a 32-bit form of an
  // AArch64 instruction, we do not need the zero-extend. Check that SrcMI's
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become a FMOVSWr, so do so now so that we know
    // that the upper bits are zero.
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    Register CpySrc = SrcMI->getOperand(1).getReg();
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  } else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;
275 | |
  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
281 | MI.eraseFromParent(); |
282 | |
283 | return true; |
284 | } |
285 | |
286 | bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) { |
  // Check whether this INSERT_SUBREG comes from the zero-extend pattern below.
  //
  // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  // To   %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
  //
  // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
  // COPY would destroy the upper part of the register anyway.
  if (!MI.isRegTiedToDefOperand(1))
295 | return false; |
296 | |
  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
300 | if (!SrcMI) |
301 | return false; |
302 | |
303 | // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC |
304 | // |
305 | // When you use the 32-bit form of an instruction, the upper 32 bits of the |
306 | // source registers are ignored and the upper 32 bits of the destination |
307 | // register are set to zero. |
308 | // |
  // If the source operand of the zero-extend is defined by a 32-bit form of an
  // AArch64 instruction, we do not need the zero-extend. Check that SrcMI's
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
313 | if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) || |
314 | !AArch64::GPR64allRegClass.hasSubClassEq(RC)) |
315 | return false; |
316 | |
317 | // Build a SUBREG_TO_REG instruction |
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n");
325 | (void)SubregMI; |
326 | MI.eraseFromParent(); |
327 | |
328 | return true; |
329 | } |
330 | |
331 | template <typename T> |
332 | static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) { |
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned ints.
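  // For example (illustrative): Imm = 0x123456 splits into Imm0 = 0x123 and
  // Imm1 = 0x456, since (0x123 << 12) + 0x456 == 0x123456.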
335 | if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 || |
336 | (Imm & ~static_cast<T>(0xffffff)) != 0) |
337 | return false; |
338 | |
  // Do not split if the immediate can be materialized by a single MOV
  // instruction.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
342 | if (Insn.size() == 1) |
343 | return false; |
344 | |
345 | // Split Imm into (Imm0 << 12) + Imm1; |
346 | Imm0 = (Imm >> 12) & 0xfff; |
347 | Imm1 = Imm & 0xfff; |
348 | return true; |
349 | } |
350 | |
351 | template <typename T> |
352 | bool AArch64MIPeepholeOpt::visitADDSUB( |
353 | unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) { |
  // Try the transformation below.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two legal add/sub immediates. That makes only two ADD/SUB instructions
  // instead of multiple mov + add/sub instructions.
366 | |
  // We can sometimes have ADDWrr WZR, MOVi32imm that has not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;
373 | |
374 | return splitTwoPartImm<T>( |
375 | MI, |
376 | [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0, |
377 | T &Imm1) -> std::optional<OpcodePair> { |
378 | if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) |
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
382 | return std::nullopt; |
383 | }, |
384 | [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, |
385 | unsigned Imm1, Register SrcReg, Register NewTmpReg, |
386 | Register NewDstReg) { |
387 | DebugLoc DL = MI.getDebugLoc(); |
388 | MachineBasicBlock *MBB = MI.getParent(); |
389 | BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opcode.first), DestReg: NewTmpReg) |
390 | .addReg(RegNo: SrcReg) |
391 | .addImm(Val: Imm0) |
392 | .addImm(Val: 12); |
393 | BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opcode.second), DestReg: NewDstReg) |
394 | .addReg(RegNo: NewTmpReg) |
395 | .addImm(Val: Imm1) |
396 | .addImm(Val: 0); |
397 | }); |
398 | } |
399 | |
400 | template <typename T> |
401 | bool AArch64MIPeepholeOpt::visitADDSSUBS( |
402 | OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) { |
  // Try the same transformation as in visitADDSUB, but with the additional
  // requirement that the condition code is used only for Equal and Not Equal.
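  // (The restriction exists because splitting one flag-setting operation into
  // two preserves the final N and Z flags but may produce different C and V
  // flags, so only the N/Z uses checked below are safe.)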
405 | |
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
408 | return false; |
409 | |
410 | return splitTwoPartImm<T>( |
411 | MI, |
412 | [PosOpcs, NegOpcs, &MI, &TRI = TRI, |
413 | &MRI = MRI](T Imm, unsigned RegSize, T &Imm0, |
414 | T &Imm1) -> std::optional<OpcodePair> { |
415 | OpcodePair OP; |
416 | if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) |
417 | OP = PosOpcs; |
418 | else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) |
419 | OP = NegOpcs; |
420 | else |
421 | return std::nullopt; |
        // Check the conditional uses last, since scanning the instructions
        // that follow MI is expensive.
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
426 | if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V) |
427 | return std::nullopt; |
428 | return OP; |
429 | }, |
430 | [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, |
431 | unsigned Imm1, Register SrcReg, Register NewTmpReg, |
432 | Register NewDstReg) { |
433 | DebugLoc DL = MI.getDebugLoc(); |
434 | MachineBasicBlock *MBB = MI.getParent(); |
435 | BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opcode.first), DestReg: NewTmpReg) |
436 | .addReg(RegNo: SrcReg) |
437 | .addImm(Val: Imm0) |
438 | .addImm(Val: 12); |
439 | BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opcode.second), DestReg: NewDstReg) |
440 | .addReg(RegNo: NewTmpReg) |
441 | .addImm(Val: Imm1) |
442 | .addImm(Val: 0); |
443 | }); |
444 | } |
445 | |
446 | // Checks if the corresponding MOV immediate instruction is applicable for |
447 | // this peephole optimization. |
448 | bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI, |
449 | MachineInstr *&MovMI, |
450 | MachineInstr *&SubregToRegMI) { |
  // Check whether the current MBB is inside a loop and whether the
  // instruction is loop invariant.
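  // (Likely rationale: if the instruction is not loop invariant, splitting it
  // would trade a MOV that LICM can hoist out of the loop for an extra
  // instruction executed on every iteration.)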
452 | MachineBasicBlock *MBB = MI.getParent(); |
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
455 | return false; |
456 | |
457 | // Check whether current MI's operand is MOV with immediate. |
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
459 | if (!MovMI) |
460 | return false; |
461 | |
462 | // If it is SUBREG_TO_REG, check its operand. |
463 | SubregToRegMI = nullptr; |
464 | if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) { |
465 | SubregToRegMI = MovMI; |
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
467 | if (!MovMI) |
468 | return false; |
469 | } |
470 | |
471 | if (MovMI->getOpcode() != AArch64::MOVi32imm && |
472 | MovMI->getOpcode() != AArch64::MOVi64imm) |
473 | return false; |
474 | |
  // If the MOV has multiple uses, do not split the immediate because doing so
  // would create more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
480 | return false; |
481 | |
482 | // It is OK to perform this peephole optimization. |
483 | return true; |
484 | } |
485 | |
486 | template <typename T> |
487 | bool AArch64MIPeepholeOpt::splitTwoPartImm( |
488 | MachineInstr &MI, |
489 | SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) { |
490 | unsigned RegSize = sizeof(T) * 8; |
491 | assert((RegSize == 32 || RegSize == 64) && |
492 | "Invalid RegSize for legal immediate peephole optimization" ); |
493 | |
494 | // Perform several essential checks against current MI. |
495 | MachineInstr *MovMI, *SubregToRegMI; |
496 | if (!checkMovImmInstr(MI, MovMI, SubregToRegMI)) |
497 | return false; |
498 | |
499 | // Split the immediate to Imm0 and Imm1, and calculate the Opcode. |
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
501 | // For the 32 bit form of instruction, the upper 32 bits of the destination |
502 | // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits |
503 | // of Imm to zero. This is essential if the Immediate value was a negative |
504 | // number since it was sign extended when we assign to the 64-bit Imm. |
505 | if (SubregToRegMI) |
506 | Imm &= 0xFFFFFFFF; |
507 | OpcodePair Opcode; |
508 | if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1)) |
509 | Opcode = *R; |
510 | else |
511 | return false; |
512 | |
  // Create new MIs using the first and second opcodes. The opcodes might
  // differ for flag-setting operations that should set flags only on the
  // second instruction.
  //   NewTmpReg = Opcode.first SrcReg Imm0
  //   NewDstReg = Opcode.second NewTmpReg Imm1
517 | |
518 | // Determine register classes for destinations and register operands |
519 | MachineFunction *MF = MI.getMF(); |
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);
532 | |
  // Get the old register destinations and create the new ones
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
537 | // In the situation that DstReg is not Virtual (likely WZR or XZR), we want to |
538 | // reuse that same destination register. |
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;
542 | |
543 | // Constrain registers based on their new uses |
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
548 | |
549 | // Call the delegating operation to build the instruction |
550 | BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg); |
551 | |
  // replaceRegWith changes MI's definition register. Keep it in SSA form until
  // MI is deleted, but only if we created a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
557 | } |
558 | |
  // Remove the instructions that are now dead.
560 | MI.eraseFromParent(); |
561 | if (SubregToRegMI) |
562 | SubregToRegMI->eraseFromParent(); |
563 | MovMI->eraseFromParent(); |
564 | |
565 | return true; |
566 | } |
567 | |
568 | bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) { |
  // Check whether this INSvi[X]gpr comes from a COPY of a source FPR128.
570 | // |
571 | // From |
572 | // %intermediate1:gpr64 = COPY %src:fpr128 |
573 | // %intermediate2:gpr32 = COPY %intermediate1:gpr64 |
574 | // %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32 |
575 | // To |
576 | // %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128, |
577 | // src_index |
578 | // where src_index = 0, X = [8|16|32|64] |
579 | |
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
581 | |
582 | // For a chain of COPY instructions, find the initial source register |
583 | // and check if it's an FPR128 |
584 | while (true) { |
585 | if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY) |
586 | return false; |
587 | |
    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
596 | } |
597 | |
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n");
608 | (void)INSvilaneMI; |
609 | MI.eraseFromParent(); |
610 | return true; |
611 | } |
612 | |
// All instructions that set an FPR64 will implicitly zero the top bits of the
// register.
615 | static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI, |
616 | MachineRegisterInfo *MRI) { |
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
620 | if (RC != &AArch64::FPR64RegClass) |
621 | return false; |
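  // Treat any real (target-specific) opcode as a definition that zeroes the
  // high bits, and conservatively reject generic opcodes, which may not.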
622 | return MI->getOpcode() > TargetOpcode::GENERIC_OP_END; |
623 | } |
624 | |
625 | bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) { |
  // Check that the MI defining the low 64 bits implicitly sets the high
  // 64 bits to zero. We expect the case below.
628 | // |
629 | // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr |
630 | // %6:fpr128 = IMPLICIT_DEF |
631 | // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub |
632 | // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0 |
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
638 | return false; |
639 | |
  // Check that there is a `mov 0` MI for the high 64 bits.
  // We expect the cases below.
642 | // |
643 | // %2:fpr64 = MOVID 0 |
644 | // %4:fpr128 = IMPLICIT_DEF |
645 | // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub |
646 | // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0 |
647 | // or |
648 | // %5:fpr128 = MOVIv2d_ns 0 |
649 | // %6:fpr64 = COPY %5.dsub:fpr128 |
650 | // %8:fpr128 = IMPLICIT_DEF |
651 | // %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub |
652 | // %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0 |
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
663 | return false; |
664 | |
  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
670 | MI.eraseFromParent(); |
671 | |
672 | return true; |
673 | } |
674 | |
675 | bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) { |
  // An FMOVDr implicitly sets the high 64 bits to zero, similar to ORR for a
  // GPR.
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
679 | return false; |
680 | |
  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
689 | MI.eraseFromParent(); |
690 | |
691 | return true; |
692 | } |
693 | |
// Across a basic block we might have an i32 extract from a value that only
// operates on the upper bits (for example a sxtw). We can replace the COPY
// with a new version that skips the sxtw.
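// For example (an illustrative MIR sketch):
//   %1:gpr64 = SBFMXri %0, 0, 31      (sxtw)
//   %2:gpr32 = COPY %1.sub_32
// ==>
//   %2:gpr32 = COPY %0.sub_32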
697 | bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) { |
  Register InputReg = MI.getOperand(1).getReg();
  if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
      !MRI->hasOneNonDBGUse(InputReg))
701 | return false; |
702 | |
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
  SmallPtrSet<MachineInstr *, 4> DeadInstrs;
  DeadInstrs.insert(SrcMI);
  while (SrcMI && SrcMI->isFullCopy() &&
         MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
    DeadInstrs.insert(SrcMI);
  }

  if (!SrcMI || SrcMI->getOpcode() != AArch64::SBFMXri ||
      SrcMI->getOperand(2).getImm() != 0 ||
      SrcMI->getOperand(3).getImm() != 31)
    return false;

  Register SrcReg = SrcMI->getOperand(1).getReg();
  MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
  LLVM_DEBUG(dbgs() << "Optimizing: " << MI);
  MI.getOperand(1).setReg(SrcReg);
720 | LLVM_DEBUG(dbgs() << " to: " << MI); |
721 | for (auto *DeadMI : DeadInstrs) { |
722 | LLVM_DEBUG(dbgs() << " Removing: " << *DeadMI); |
723 | DeadMI->eraseFromParent(); |
724 | } |
725 | return true; |
726 | } |
727 | |
728 | bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { |
  if (skipFunction(MF.getFunction()))
730 | return false; |
731 | |
732 | TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); |
733 | TRI = static_cast<const AArch64RegisterInfo *>( |
734 | MF.getSubtarget().getRegisterInfo()); |
735 | MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); |
736 | MRI = &MF.getRegInfo(); |
737 | |
  assert(MRI->isSSA() && "Expected to be run on SSA form!");
739 | |
740 | bool Changed = false; |
741 | |
742 | for (MachineBasicBlock &MBB : MF) { |
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
744 | switch (MI.getOpcode()) { |
745 | default: |
746 | break; |
747 | case AArch64::INSERT_SUBREG: |
748 | Changed |= visitINSERT(MI); |
749 | break; |
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
803 | case AArch64::INSvi64lane: |
804 | Changed |= visitINSvi64lane(MI); |
805 | break; |
806 | case AArch64::FMOVDr: |
807 | Changed |= visitFMOVDr(MI); |
808 | break; |
809 | case AArch64::COPY: |
810 | Changed |= visitCopy(MI); |
811 | break; |
812 | } |
813 | } |
814 | } |
815 | |
816 | return Changed; |
817 | } |
818 | |
819 | FunctionPass *llvm::createAArch64MIPeepholeOptPass() { |
820 | return new AArch64MIPeepholeOpt(); |
821 | } |
822 | |