//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs the following peephole optimizations at the MIR level.
//
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
//    The mov pseudo instruction could be expanded to multiple mov instructions
//    later. In this case, we could try to split the constant operand of the
//    mov instruction into two immediates which can be directly encoded into
//    *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
//    multiple `mov` + `and/add/sub` instructions.
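//
//    For example (illustrative values), the 32-bit constant 0x00200400 is not
//    a valid logical immediate, but it is the intersection of two bitmask
//    immediates, so a `mov` + `and` pair becomes two `and` instructions.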
//
// 4. Remove redundant ORRWrs which is generated by zero-extend.
//
//    %3:gpr32 = ORRWrs $wzr, %2, 0
//    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//
//    If AArch64's 32-bit form of instruction defines the source operand of
//    ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
//    operand are set to zero.
//
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
//    ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
// 6. %intermediate:gpr32 = COPY %src:fpr128
//    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
//    ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
//
//    In cases where a source FPR is copied to a GPR in order to be copied
//    to a destination FPR, we can directly copy the values between the FPRs,
//    eliminating the use of the integer unit. When we match a pattern of
//    INSvi[X]gpr that is preceded by a chain of COPY instructions from an FPR
//    source, we use INSvi[X]lane to replace the COPY and INSvi[X]gpr
//    instructions.
//
// 7. If MI implicitly sets the high 64 bits to zero, remove the `mov 0` of
//    the high 64 bits. For example,
//
//      %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//      %2:fpr64 = MOVID 0
//      %4:fpr128 = IMPLICIT_DEF
//      %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), %2:fpr64, %subreg.dsub
//      %6:fpr128 = IMPLICIT_DEF
//      %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), %1:fpr64, %subreg.dsub
//      %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, %3:fpr128, 0
//    ==>
//      %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//      %6:fpr128 = IMPLICIT_DEF
//      %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), %1:fpr64, %subreg.dsub
//
// 8. Remove redundant CSELs that select between identical registers, by
//    replacing them with unconditional moves.
//
// 9. Replace UBFMXri with UBFMWri if the instruction is equivalent to a 32-bit
//    LSR or LSL alias of UBFM.
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-mi-peephole-opt"

namespace {

struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {}

  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  using OpcodePair = std::pair<unsigned, unsigned>;
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use splitTwoPartImm to handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///     %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitCSEL(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool visitUBFMXri(MachineInstr &MI);
  bool visitCopy(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)

template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
    return false;

  // If this immediate can be handled by one instruction, do not split it.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // A bitmask immediate consists of consecutive ones. Let's say there is
  // constant 0b00000000001000000000010000000000 which does not consist of
  // consecutive ones. We can split it into two bitmask immediates like
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
  // ANDing these two bitmask immediates reproduces the original constant.
  unsigned LowestBitSet = llvm::countr_zero(UImm);
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask which is filled with one from the position of lowest bit set
  // to the position of highest bit set.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask which is filled with one outside the position of lowest bit
  // set and the position of highest bit set.
  T NewImm2 = UImm | ~NewImm1;
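
  // For the example above (values illustrative):
  //   UImm    = 0b00000000001000000000010000000000 (bits 10 and 21 set)
  //   NewImm1 = 0b00000000001111111111110000000000 (ones from bit 10 to 21)
  //   NewImm2 = 0b11111111111000000000011111111111 (UImm | ~NewImm1)
  // and NewImm1 & NewImm2 == UImm, so ANDing with both masks is equivalent.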

  // If the split value is not a valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
    unsigned Opc, MachineInstr &MI) {
  // Try the transformation below.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two bitmask immediates. It makes only two AND instructions instead of
  // multiple mov + and instructions.

  return splitTwoPartImm<T>(
      MI,
      [Opc](T Imm, unsigned RegSize, T &Imm0,
            T &Imm1) -> std::optional<OpcodePair> {
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(Opc, Opc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1);
      });
}

bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check whether this ORR comes from the zero-extend pattern below.
  //
  //   def : Pat<(i64 (zext GPR32:$src)),
  //             (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of instruction defines the source operand of
  // the zero-extend, we do not need the zero-extend. Let's check whether the
  // MI's opcode is a real AArch64 instruction; if it is not, conservatively
  // do not process it.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become an FMOVSWr, so materialize it now so
    // that we know that the upper bits are zero.
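    // For example (illustrative MIR), %w:gpr32 = COPY %s.ssub:fpr128 becomes
    // an FMOVSWr from an FPR32 copy of %s.ssub; as a 32-bit instruction,
    // FMOVSWr zeroes the upper 32 bits of its W destination.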
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass &&
          RC != &AArch64::ZPRRegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    Register CpySrc;
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    } else {
      CpySrc = SrcMI->getOperand(1).getReg();
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  } else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitCSEL(MachineInstr &MI) {
  // Replace CSEL with MOV when both inputs are the same register.
  if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg())
    return false;

  auto ZeroReg =
      MI.getOpcode() == AArch64::CSELXr ? AArch64::XZR : AArch64::WZR;
  auto OrOpcode =
      MI.getOpcode() == AArch64::CSELXr ? AArch64::ORRXrs : AArch64::ORRWrs;

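  // MOV (register) is an alias of ORR with the zero register, so the
  // unconditional `orr Rd, Rzr, Rm, lsl #0` built below is `mov Rd, Rm`.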
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(OrOpcode))
      .addReg(MI.getOperand(0).getReg(), RegState::Define)
      .addReg(ZeroReg)
      .addReg(MI.getOperand(1).getReg())
      .addImm(0);

  MI.eraseFromParent();
  return true;
}

bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  // Check whether this INSERT_SUBREG comes from the zero-extend pattern below.
  //
  // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  // To   %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
  //
  // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
  // COPY would destroy the upper part of the register anyway.
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of instruction defines the source operand of
  // the zero-extend, we do not need the zero-extend. Let's check whether the
  // MI's opcode is a real AArch64 instruction; if it is not, conservatively
  // do not process it.
  if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
      !AArch64::GPR64allRegClass.hasSubClassEq(RC))
    return false;

  // Build a SUBREG_TO_REG instruction.
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n");
  (void)SubregMI;
  MI.eraseFromParent();

  return true;
}

template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned integers.
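  // For example (illustrative values), Imm = 0x123456 splits into
  // Imm0 = 0x123 and Imm1 = 0x456, which the callers materialize as
  //   add/sub Rd, Rn, #0x123, lsl #12
  //   add/sub Rd, Rd, #0x456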
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // The immediate cannot be composed via a single instruction.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1.
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
  // Try the transformation below.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two legal add/sub immediates. It makes only two ADD/SUB instructions
  // instead of multiple `mov` + `add/sub` instructions.

  // We can sometimes have ADDWrr WZR, MOVi32imm that has not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                       T &Imm1) -> std::optional<OpcodePair> {
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB, but with the additional requirement
  // that the condition code usages are only for Equal and Not Equal.

  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check conditional uses last, since scanning the succeeding
        // instructions is expensive.
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

// Checks if the corresponding MOV immediate instruction is applicable for
// this peephole optimization.
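// The expected input is MI's operand 2 defined by a single-use
// MOVi32imm/MOVi64imm, optionally forwarded through a single-use
// SUBREG_TO_REG, with MI either outside any loop or loop invariant.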
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                            MachineInstr *&MovMI,
                                            MachineInstr *&SubregToRegMI) {
  // Check whether the current MBB is inside a loop and whether the AND is
  // loop invariant.
  MachineBasicBlock *MBB = MI.getParent();
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
    return false;

  // Check whether the current MI's operand is a MOV with an immediate.
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!MovMI)
    return false;

  // If it is SUBREG_TO_REG, check its operand.
  SubregToRegMI = nullptr;
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
    SubregToRegMI = MovMI;
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
    if (!MovMI)
      return false;
  }

  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
      MovMI->getOpcode() != AArch64::MOVi64imm)
    return false;

  // If the MOV has multiple uses, do not split the immediate because it causes
  // more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
    return false;

  // It is OK to perform this peephole optimization.
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against the current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate into Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32-bit form of an instruction, the upper 32 bits of the
  // destination register are set to zero. If there is a SUBREG_TO_REG, set the
  // upper 32 bits of Imm to zero. This is essential if the immediate value was
  // a negative number, since it was sign-extended when assigned to the 64-bit
  // Imm.
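  // For example (illustrative values), a MOVi32imm of 0xFFFFF000 may read
  // back here as the sign-extended 64-bit value 0xFFFFFFFFFFFFF000; masking
  // restores the intended 32-bit constant.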
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. The opcodes might
  // differ for flag-setting operations that should only set flags on the
  // second instruction.
  // NewTmpReg = Opcode.first SrcReg Imm0
  // NewDstReg = Opcode.second NewTmpReg Imm1

  // Determine register classes for destinations and register operands.
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get the old register destinations and create new register destinations.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // In the situation that DstReg is not virtual (likely WZR or XZR), we want
  // to reuse that same destination register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain registers based on their new uses.
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instructions.
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form
  // until MI is deleted; do this only if we made a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Remove the MIs that are now dead.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Check if this INSvi[X]gpr comes from a COPY of a source FPR128.
  //
  // From
  //  %intermediate1:gpr64 = COPY %src:fpr128
  //  %intermediate2:gpr32 = COPY %intermediate1:gpr64
  //  %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
  // To
  //  %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
  //  src_index
  // where src_index = 0, X = [8|16|32|64]

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());

  // For a chain of COPY instructions, find the initial source register
  // and check if it's an FPR128.
  while (true) {
    if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
      return false;

    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
  }

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n");
  (void)INSvilaneMI;
  MI.eraseFromParent();
  return true;
}

// All real (non-generic) AArch64 instructions that define a full FPR64
// implicitly zero the top 64 bits of the containing 128-bit vector register.
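// For example, the FCVTNv4i16 in the pattern for optimization 7 in the file
// header writes a D register, so bits [127:64] of the corresponding Q
// register become zero.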
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
                                        MachineRegisterInfo *MRI) {
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}

bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check that the MI defining the low 64 bits implicitly zeroes the high
  // 64 bits. We expect the case below.
  //
  // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  // %6:fpr128 = IMPLICIT_DEF
  // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check that there is a `mov 0` MI for the high 64 bits.
  // We expect the cases below.
  //
  // %2:fpr64 = MOVID 0
  // %4:fpr128 = IMPLICIT_DEF
  // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  // %5:fpr128 = MOVIv2d_ns 0
  // %6:fpr64 = COPY %5.dsub:fpr128
  // %8:fpr128 = IMPLICIT_DEF
  // %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  // %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
  // An FMOVDr sets the high 64 bits to zero implicitly, similar to ORR for
  // GPRs.
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitUBFMXri(MachineInstr &MI) {
  // Check if the instruction is equivalent to a 32-bit LSR or LSL alias of
  // UBFM, and replace the UBFMXri instruction with its 32-bit variant,
  // UBFMWri.
  int64_t Immr = MI.getOperand(2).getImm();
  int64_t Imms = MI.getOperand(3).getImm();

  bool IsLSR = Imms == 31 && Immr <= Imms;
  bool IsLSL = Immr == Imms + 33;
  if (!IsLSR && !IsLSL)
    return false;

  if (IsLSL) {
    Immr -= 32;
  }
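
  // For example (illustrative encoding), UBFMXri with Immr = 60, Imms = 27
  // (Immr == Imms + 33) becomes UBFMWri with Immr = 28, Imms = 27, i.e. the
  // 32-bit alias `lsl w, w, #4`, whose result is zero-extended back to 64
  // bits via the SUBREG_TO_REG built below.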

  const TargetRegisterClass *DstRC64 =
      TII->getRegClass(TII->get(MI.getOpcode()), 0, TRI, *MI.getMF());
  const TargetRegisterClass *DstRC32 =
      TRI->getSubRegisterClass(DstRC64, AArch64::sub_32);
  assert(DstRC32 && "Destination register class of UBFMXri doesn't have a "
                    "sub_32 subregister class");

  const TargetRegisterClass *SrcRC64 =
      TII->getRegClass(TII->get(MI.getOpcode()), 1, TRI, *MI.getMF());
  const TargetRegisterClass *SrcRC32 =
      TRI->getSubRegisterClass(SrcRC64, AArch64::sub_32);
  assert(SrcRC32 && "Source register class of UBFMXri doesn't have a sub_32 "
                    "subregister class");

  Register DstReg64 = MI.getOperand(0).getReg();
  Register DstReg32 = MRI->createVirtualRegister(DstRC32);
  Register SrcReg64 = MI.getOperand(1).getReg();
  Register SrcReg32 = MRI->createVirtualRegister(SrcRC32);

  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::COPY),
          SrcReg32)
      .addReg(SrcReg64, 0, AArch64::sub_32);
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::UBFMWri),
          DstReg32)
      .addReg(SrcReg32)
      .addImm(Immr)
      .addImm(Imms);
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
          TII->get(AArch64::SUBREG_TO_REG), DstReg64)
      .addImm(0)
      .addReg(DstReg32)
      .addImm(AArch64::sub_32);
  MI.eraseFromParent();
  return true;
}

// Across a basic block we might have an i32 extract from a value that only
// operates on the upper bits (for example an sxtw). We can replace the COPY
// with a new version that skips the sxtw.
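// For example:
//   %x:gpr64 = SBFMXri %src:gpr64, 0, 31   ; sxtw
//   %w:gpr32 = COPY %x.sub_32
// The COPY reads only the low 32 bits, which the sxtw leaves unchanged, so it
// can read %src.sub_32 directly.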
bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {
  Register InputReg = MI.getOperand(1).getReg();
  if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
      !MRI->hasOneNonDBGUse(InputReg))
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
  SmallPtrSet<MachineInstr *, 4> DeadInstrs;
  DeadInstrs.insert(SrcMI);
  while (SrcMI && SrcMI->isFullCopy() &&
         MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
    DeadInstrs.insert(SrcMI);
  }

  if (!SrcMI)
    return false;

  // Look for SXTW(X) and return Reg.
  auto getSXTWSrcReg = [](MachineInstr *SrcMI) -> Register {
    if (SrcMI->getOpcode() != AArch64::SBFMXri ||
        SrcMI->getOperand(2).getImm() != 0 ||
        SrcMI->getOperand(3).getImm() != 31)
      return AArch64::NoRegister;
    return SrcMI->getOperand(1).getReg();
  };
  // Look for SUBREG_TO_REG(ORRWrr(WZR, COPY(X.sub_32))).
  auto getUXTWSrcReg = [&](MachineInstr *SrcMI) -> Register {
    if (SrcMI->getOpcode() != AArch64::SUBREG_TO_REG ||
        SrcMI->getOperand(3).getImm() != AArch64::sub_32 ||
        !MRI->hasOneNonDBGUse(SrcMI->getOperand(2).getReg()))
      return AArch64::NoRegister;
    MachineInstr *Orr = MRI->getUniqueVRegDef(SrcMI->getOperand(2).getReg());
    if (!Orr || Orr->getOpcode() != AArch64::ORRWrr ||
        Orr->getOperand(1).getReg() != AArch64::WZR ||
        !MRI->hasOneNonDBGUse(Orr->getOperand(2).getReg()))
      return AArch64::NoRegister;
    MachineInstr *Cpy = MRI->getUniqueVRegDef(Orr->getOperand(2).getReg());
    if (!Cpy || Cpy->getOpcode() != AArch64::COPY ||
        Cpy->getOperand(1).getSubReg() != AArch64::sub_32)
      return AArch64::NoRegister;
    DeadInstrs.insert(Orr);
    return Cpy->getOperand(1).getReg();
  };

  Register SrcReg = getSXTWSrcReg(SrcMI);
  if (!SrcReg)
    SrcReg = getUXTWSrcReg(SrcMI);
  if (!SrcReg)
    return false;

  MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
  LLVM_DEBUG(dbgs() << "Optimizing: " << MI);
  MI.getOperand(1).setReg(SrcReg);
  LLVM_DEBUG(dbgs() << " to: " << MI);
  for (auto *DeadMI : DeadInstrs) {
    LLVM_DEBUG(dbgs() << " Removing: " << *DeadMI);
    DeadMI->eraseFromParent();
  }
  return true;
}

bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::CSELWr:
      case AArch64::CSELXr:
        Changed |= visitCSEL(MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      case AArch64::FMOVDr:
        Changed |= visitFMOVDr(MI);
        break;
      case AArch64::UBFMXri:
        Changed |= visitUBFMXri(MI);
        break;
      case AArch64::COPY:
        Changed |= visitCopy(MI);
        break;
      }
    }
  }

  return Changed;
}

FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}