//=== AArch64PostSelectOptimize.cpp ---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does post-instruction-selection optimizations in the GlobalISel
// pipeline, before the rest of codegen runs.
//
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"

#define DEBUG_TYPE "aarch64-post-select-optimize"

using namespace llvm;

namespace {
class AArch64PostSelectOptimize : public MachineFunctionPass {
public:
  static char ID;

  AArch64PostSelectOptimize() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AArch64 Post Select Optimizer";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool optimizeNZCVDefs(MachineBasicBlock &MBB);
  bool doPeepholeOpts(MachineBasicBlock &MBB);
  /// Look for cross regclass copies that can be trivially eliminated.
  bool foldSimpleCrossClassCopies(MachineInstr &MI);
  bool foldCopyDup(MachineInstr &MI);
};
} // end anonymous namespace

void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  MachineFunctionPass::getAnalysisUsage(AU);
}

unsigned getNonFlagSettingVariant(unsigned Opc) {
  switch (Opc) {
  default:
    return 0;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSXrs:
    return AArch64::SUBXrs;
  case AArch64::SUBSWrs:
    return AArch64::SUBWrs;
  case AArch64::SUBSXri:
    return AArch64::SUBXri;
  case AArch64::SUBSWri:
    return AArch64::SUBWri;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSXrs:
    return AArch64::ADDXrs;
  case AArch64::ADDSWrs:
    return AArch64::ADDWrs;
  case AArch64::ADDSXri:
    return AArch64::ADDXri;
  case AArch64::ADDSWri:
    return AArch64::ADDWri;
  case AArch64::SBCSXr:
    return AArch64::SBCXr;
  case AArch64::SBCSWr:
    return AArch64::SBCWr;
  case AArch64::ADCSXr:
    return AArch64::ADCXr;
  case AArch64::ADCSWr:
    return AArch64::ADCWr;
  }
}

bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) {
  bool Changed = false;
  for (auto &MI : make_early_inc_range(MBB)) {
    bool CurrentIterChanged = foldSimpleCrossClassCopies(MI);
    if (!CurrentIterChanged)
      CurrentIterChanged |= foldCopyDup(MI);
    Changed |= CurrentIterChanged;
  }
  return Changed;
}

bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) {
  auto *MF = MI.getMF();
  auto &MRI = MF->getRegInfo();

  if (!MI.isCopy())
    return false;

  if (MI.getOperand(1).getSubReg())
    return false; // Don't deal with subreg copies.
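  // (An illustrative example: %dst:gpr32 = COPY %src.sub_32 reads only a
  // subregister, so it isn't a simple cross-class copy.)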

  Register Src = MI.getOperand(1).getReg();
  Register Dst = MI.getOperand(0).getReg();

  if (Src.isPhysical() || Dst.isPhysical())
    return false;

  const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
  const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);

  if (SrcRC == DstRC)
    return false;

  if (SrcRC->hasSubClass(DstRC)) {
    // This is the case where the source class is a superclass of the dest, so
    // if the copy is the only user of the source, we can just constrain the
    // source reg to the dest class.
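    // E.g. (illustrative MIR, using gpr64all as a superclass of gpr64):
    //   %src:gpr64all = ...
    //   %dst:gpr64 = COPY %src
    // Constraining %src to gpr64 lets %dst's users read %src directly.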

    if (!MRI.hasOneNonDBGUse(Src))
      return false; // Only constrain single uses of the source.

    // Constrain to dst reg class as long as it's not a weird class that only
    // has a few registers.
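    // (constrainRegClass only shrinks the class if the resulting common
    // subclass still has at least MinNumRegs registers, so this refuses to
    // constrain down to tiny classes.)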
    if (!MRI.constrainRegClass(Src, DstRC, /* MinNumRegs */ 25))
      return false;
  } else if (DstRC->hasSubClass(SrcRC)) {
    // This is the inverse case, where the destination class is a superclass
    // of the source. Every user of the copy can accept the smaller source
    // class, so we can simply rewrite them to use the source directly.
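    // E.g. (illustrative): for %dst:gpr64all = COPY %src:gpr64, every use of
    // %dst can take %src, since gpr64 is contained in gpr64all.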
  } else {
    return false;
  }

  MRI.replaceRegWith(Dst, Src);
  MI.eraseFromParent();
  return true;
}

bool AArch64PostSelectOptimize::foldCopyDup(MachineInstr &MI) {
  if (!MI.isCopy())
    return false;

  auto *MF = MI.getMF();
  auto &MRI = MF->getRegInfo();
  auto *TII = MF->getSubtarget().getInstrInfo();

  // Optimize COPY(y:GPR, DUP(x:FPR, i)) -> UMOV(y:GPR, x:FPR, i).
  // Here Dst is y and Src is the result of DUP.
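  // E.g. (illustrative MIR):
  //   %x:fpr32 = DUPi32 %vec:fpr128, 1
  //   %y:gpr32 = COPY %x
  // becomes:
  //   %y:gpr32 = UMOVvi32 %vec:fpr128, 1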
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  if (!Dst.isVirtual() || !Src.isVirtual())
    return false;

  auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass,
                         const TargetRegisterClass *FPRRegClass, unsigned DUP,
                         unsigned UMOV) {
    if (MRI.getRegClassOrNull(Dst) != GPRRegClass ||
        MRI.getRegClassOrNull(Src) != FPRRegClass)
      return false;

    // There is a special case when one of the uses is COPY(z:FPR, y:GPR).
    // In this case, we get COPY(z:FPR, COPY(y:GPR, DUP(x:FPR, i))), which can
    // be folded by peephole-opt into just DUP(z:FPR, i), so this transform is
    // not worthwhile in that case.
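    // E.g. (illustrative MIR):
    //   %x:fpr32 = DUPi32 %vec:fpr128, 1
    //   %y:gpr32 = COPY %x
    //   %z:fpr32 = COPY %y
    // Folding %y into a UMOV here would block the later COPY+COPY+DUP fold.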
    for (auto &Use : MRI.use_nodbg_instructions(Dst)) {
      if (!Use.isCopy())
        continue;

      Register UseOp0 = Use.getOperand(0).getReg();
      Register UseOp1 = Use.getOperand(1).getReg();
      if (UseOp0.isPhysical() || UseOp1.isPhysical())
        return false;

      if (MRI.getRegClassOrNull(UseOp0) == FPRRegClass &&
          MRI.getRegClassOrNull(UseOp1) == GPRRegClass)
        return false;
    }

    MachineInstr *SrcMI = MRI.getUniqueVRegDef(Src);
    if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI.hasOneNonDBGUse(Src))
      return false;

    Register DupSrc = SrcMI->getOperand(1).getReg();
    int64_t DupImm = SrcMI->getOperand(2).getImm();

    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(UMOV), Dst)
        .addReg(DupSrc)
        .addImm(DupImm);
    SrcMI->eraseFromParent();
    MI.eraseFromParent();
    return true;
  };

  return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass,
                     AArch64::DUPi32, AArch64::UMOVvi32) ||
         TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass,
                     AArch64::DUPi64, AArch64::UMOVvi64);
}

bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
  // If we find a dead NZCV implicit-def, we
  // - try to convert the operation to a non-flag-setting equivalent, or
  // - mark the def as dead to aid later peephole optimizations.

  // Use cases:
  // 1)
  // Consider the following code:
  //  FCMPSrr %0, %1, implicit-def $nzcv
  //  %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
  //  %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
  //  FCMPSrr %0, %1, implicit-def $nzcv
  //  %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
  // This kind of code, where two FCMPs each feed a CSEL, can happen when a
  // single IR fcmp is used by two selects. During selection, to ensure that
  // there can be no clobbering of nzcv between the fcmp and the csel, we have
  // to generate an fcmp immediately before each csel is selected.
  // However, often we can essentially CSE these together later in MachineCSE.
  // That doesn't work, though, if there are unrelated flag-setting
  // instructions between the two FCMPs. In the example above, the SUBS
  // defines NZCV, but the def has no users and is overwritten by the second
  // FCMP.
  //
  // 2)
  // The instruction selector always emits the flag-setting variant of ADC/SBC
  // while selecting G_UADDE/G_SADDE/G_USUBE/G_SSUBE. If the carry-out of these
  // instructions is never used, we can switch to the non-flag-setting variant.
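  //  E.g. (illustrative MIR, NZCV def unused):
  //   %2:gpr32 = ADCSWr %0, %1, implicit-def $nzcv, implicit $nzcv
  //  becomes:
  //   %2:gpr32 = ADCWr %0, %1, implicit $nzcv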

  bool Changed = false;
  auto &MF = *MBB.getParent();
  auto &Subtarget = MF.getSubtarget();
  const auto &TII = Subtarget.getInstrInfo();
  auto TRI = Subtarget.getRegisterInfo();
  auto RBI = Subtarget.getRegBankInfo();
  auto &MRI = MF.getRegInfo();

  LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo());
  LRU.addLiveOuts(MBB);

  for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
    bool NZCVDead = LRU.available(AArch64::NZCV);
    if (NZCVDead && II.definesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
      // The instruction defines NZCV, but NZCV is dead.
      unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode());
      int DeadNZCVIdx =
          II.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
      if (DeadNZCVIdx != -1) {
        if (NewOpc) {
          // If there is an equivalent non-flag-setting op, we convert.
          LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting "
                               "op: "
                            << II);
          II.setDesc(TII->get(NewOpc));
          II.removeOperand(DeadNZCVIdx);
          // Changing the opcode can result in differing regclass requirements,
          // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
          // Constrain the regclasses, possibly introducing a copy.
          constrainOperandRegClass(MF, *TRI, MRI, *TII, *RBI, II, II.getDesc(),
                                   II.getOperand(0), 0);
          Changed = true;
        } else {
          // Otherwise, we just set the nzcv imp-def operand to be dead, so the
          // peephole optimizations can optimize them further.
          II.getOperand(DeadNZCVIdx).setIsDead();
        }
      }
    }
    LRU.stepBackward(II);
  }
  return Changed;
}

bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasFailedISel())
    return false;
  assert(MF.getProperties().hasSelected() && "Expected a selected MF");

  bool Changed = false;
  for (auto &BB : MF) {
    Changed |= optimizeNZCVDefs(BB);
    Changed |= doPeepholeOpts(BB);
  }
  return Changed;
}

char AArch64PostSelectOptimize::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE,
                      "Optimize AArch64 selected instructions", false, false)
INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE,
                    "Optimize AArch64 selected instructions", false, false)

namespace llvm {
FunctionPass *createAArch64PostSelectOptimize() {
  return new AArch64PostSelectOptimize();
}
} // end namespace llvm