1//=== AArch64PostSelectOptimize.cpp ---------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass does post-instruction-selection optimizations in the GlobalISel
10// pipeline, before the rest of codegen runs.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AArch64.h"
15#include "AArch64TargetMachine.h"
16#include "MCTargetDesc/AArch64MCTargetDesc.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/CodeGen/GlobalISel/Utils.h"
19#include "llvm/CodeGen/MachineBasicBlock.h"
20#include "llvm/CodeGen/MachineFunctionPass.h"
21#include "llvm/CodeGen/MachineInstr.h"
22#include "llvm/CodeGen/MachineOperand.h"
23#include "llvm/Support/Debug.h"
24#include "llvm/Support/ErrorHandling.h"
25
26#define DEBUG_TYPE "aarch64-post-select-optimize"
27
28using namespace llvm;
29
30namespace {
31
32class AArch64PostSelectOptimizeLegacy : public MachineFunctionPass {
33public:
34 static char ID;
35
36 AArch64PostSelectOptimizeLegacy() : MachineFunctionPass(ID) {}
37
38 StringRef getPassName() const override {
39 return "AArch64 Post Select Optimizer";
40 }
41
42 bool runOnMachineFunction(MachineFunction &MF) override;
43
44 void getAnalysisUsage(AnalysisUsage &AU) const override;
45};
46} // end anonymous namespace
47
48void AArch64PostSelectOptimizeLegacy::getAnalysisUsage(
49 AnalysisUsage &AU) const {
50 AU.setPreservesCFG();
51 getSelectionDAGFallbackAnalysisUsage(AU);
52 MachineFunctionPass::getAnalysisUsage(AU);
53}
54
55unsigned getNonFlagSettingVariant(unsigned Opc) {
56 switch (Opc) {
57 default:
58 return 0;
59 case AArch64::SUBSXrr:
60 return AArch64::SUBXrr;
61 case AArch64::SUBSWrr:
62 return AArch64::SUBWrr;
63 case AArch64::SUBSXrs:
64 return AArch64::SUBXrs;
65 case AArch64::SUBSWrs:
66 return AArch64::SUBWrs;
67 case AArch64::SUBSXri:
68 return AArch64::SUBXri;
69 case AArch64::SUBSWri:
70 return AArch64::SUBWri;
71 case AArch64::ADDSXrr:
72 return AArch64::ADDXrr;
73 case AArch64::ADDSWrr:
74 return AArch64::ADDWrr;
75 case AArch64::ADDSXrs:
76 return AArch64::ADDXrs;
77 case AArch64::ADDSWrs:
78 return AArch64::ADDWrs;
79 case AArch64::ADDSXri:
80 return AArch64::ADDXri;
81 case AArch64::ADDSWri:
82 return AArch64::ADDWri;
83 case AArch64::SBCSXr:
84 return AArch64::SBCXr;
85 case AArch64::SBCSWr:
86 return AArch64::SBCWr;
87 case AArch64::ADCSXr:
88 return AArch64::ADCXr;
89 case AArch64::ADCSWr:
90 return AArch64::ADCWr;
91 }
92}
93
94/// Look for cross regclass copies that can be trivially eliminated.
95bool foldSimpleCrossClassCopies(MachineInstr &MI) {
96 auto *MF = MI.getMF();
97 auto &MRI = MF->getRegInfo();
98
99 if (!MI.isCopy())
100 return false;
101
102 if (MI.getOperand(i: 1).getSubReg())
103 return false; // Don't deal with subreg copies
104
105 Register Src = MI.getOperand(i: 1).getReg();
106 Register Dst = MI.getOperand(i: 0).getReg();
107
108 if (Src.isPhysical() || Dst.isPhysical())
109 return false;
110
111 const TargetRegisterClass *SrcRC = MRI.getRegClass(Reg: Src);
112 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: Dst);
113
114 if (SrcRC == DstRC)
115 return false;
116
117
118 if (SrcRC->hasSubClass(RC: DstRC)) {
119 // This is the case where the source class is a superclass of the dest, so
120 // if the copy is the only user of the source, we can just constrain the
121 // source reg to the dest class.
122
123 if (!MRI.hasOneNonDBGUse(RegNo: Src))
124 return false; // Only constrain single uses of the source.
125
126 // Constrain to dst reg class as long as it's not a weird class that only
127 // has a few registers.
128 if (!MRI.constrainRegClass(Reg: Src, RC: DstRC, /* MinNumRegs */ 25))
129 return false;
130 } else if (DstRC->hasSubClass(RC: SrcRC)) {
131 // This is the inverse case, where the destination class is a superclass of
132 // the source. Here, if the copy is the only user, we can just constrain
133 // the user of the copy to use the smaller class of the source.
134 } else {
135 return false;
136 }
137
138 MRI.replaceRegWith(FromReg: Dst, ToReg: Src);
139 MI.eraseFromParent();
140 return true;
141}
142
143bool foldCopyDup(MachineInstr &MI) {
144 if (!MI.isCopy())
145 return false;
146
147 auto *MF = MI.getMF();
148 auto &MRI = MF->getRegInfo();
149 auto *TII = MF->getSubtarget().getInstrInfo();
150
151 // Optimize COPY(y:GPR, DUP(x:FPR, i)) -> UMOV(y:GPR, x:FPR, i).
152 // Here Dst is y and Src is the result of DUP.
153 Register Dst = MI.getOperand(i: 0).getReg();
154 Register Src = MI.getOperand(i: 1).getReg();
155
156 if (!Dst.isVirtual() || !Src.isVirtual())
157 return false;
158
159 auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass,
160 const TargetRegisterClass *FPRRegClass, unsigned DUP,
161 unsigned UMOV) {
162 if (MRI.getRegClassOrNull(Reg: Dst) != GPRRegClass ||
163 MRI.getRegClassOrNull(Reg: Src) != FPRRegClass)
164 return false;
165
166 // There is a special case when one of the uses is COPY(z:FPR, y:GPR).
167 // In this case, we get COPY(z:FPR, COPY(y:GPR, DUP(x:FPR, i))), which can
168 // be folded by peephole-opt into just DUP(z:FPR, i), so this transform is
169 // not worthwhile in that case.
170 for (auto &Use : MRI.use_nodbg_instructions(Reg: Dst)) {
171 if (!Use.isCopy())
172 continue;
173
174 Register UseOp0 = Use.getOperand(i: 0).getReg();
175 Register UseOp1 = Use.getOperand(i: 1).getReg();
176 if (UseOp0.isPhysical() || UseOp1.isPhysical())
177 return false;
178
179 if (MRI.getRegClassOrNull(Reg: UseOp0) == FPRRegClass &&
180 MRI.getRegClassOrNull(Reg: UseOp1) == GPRRegClass)
181 return false;
182 }
183
184 MachineInstr *SrcMI = MRI.getUniqueVRegDef(Reg: Src);
185 if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI.hasOneNonDBGUse(RegNo: Src))
186 return false;
187
188 Register DupSrc = SrcMI->getOperand(i: 1).getReg();
189 int64_t DupImm = SrcMI->getOperand(i: 2).getImm();
190
191 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: UMOV), DestReg: Dst)
192 .addReg(RegNo: DupSrc)
193 .addImm(Val: DupImm);
194 SrcMI->eraseFromParent();
195 MI.eraseFromParent();
196 return true;
197 };
198
199 return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass,
200 AArch64::DUPi32, AArch64::UMOVvi32) ||
201 TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass,
202 AArch64::DUPi64, AArch64::UMOVvi64);
203}
204
205bool doPeepholeOpts(MachineBasicBlock &MBB) {
206 bool Changed = false;
207 for (auto &MI : make_early_inc_range(Range&: MBB)) {
208 bool CurrentIterChanged = foldSimpleCrossClassCopies(MI);
209 if (!CurrentIterChanged)
210 CurrentIterChanged |= foldCopyDup(MI);
211 Changed |= CurrentIterChanged;
212 }
213 return Changed;
214}
215
216bool optimizeNZCVDefs(MachineBasicBlock &MBB) {
217 // If we find a dead NZCV implicit-def, we
218 // - try to convert the operation to a non-flag-setting equivalent
219 // - or mark the def as dead to aid later peephole optimizations.
220
221 // Use cases:
222 // 1)
223 // Consider the following code:
224 // FCMPSrr %0, %1, implicit-def $nzcv
225 // %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
226 // %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
227 // FCMPSrr %0, %1, implicit-def $nzcv
228 // %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
229 // This kind of code where we have 2 FCMPs each feeding a CSEL can happen
230 // when we have a single IR fcmp being used by two selects. During selection,
231 // to ensure that there can be no clobbering of nzcv between the fcmp and the
232 // csel, we have to generate an fcmp immediately before each csel is
233 // selected.
234 // However, often we can essentially CSE these together later in MachineCSE.
235 // This doesn't work though if there are unrelated flag-setting instructions
236 // in between the two FCMPs. In this case, the SUBS defines NZCV
237 // but it doesn't have any users, being overwritten by the second FCMP.
238 //
239 // 2)
240 // The instruction selector always emits the flag-setting variant of ADC/SBC
241 // while selecting G_UADDE/G_SADDE/G_USUBE/G_SSUBE. If the carry-out of these
242 // instructions is never used, we can switch to the non-flag-setting variant.
243
244 bool Changed = false;
245 auto &MF = *MBB.getParent();
246 auto &Subtarget = MF.getSubtarget();
247 const auto &TII = Subtarget.getInstrInfo();
248 auto TRI = Subtarget.getRegisterInfo();
249 auto RBI = Subtarget.getRegBankInfo();
250 auto &MRI = MF.getRegInfo();
251
252 LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo());
253 LRU.addLiveOuts(MBB);
254
255 for (auto &II : instructionsWithoutDebug(It: MBB.rbegin(), End: MBB.rend())) {
256 bool NZCVDead = LRU.available(Reg: AArch64::NZCV);
257 if (NZCVDead && II.definesRegister(Reg: AArch64::NZCV, /*TRI=*/nullptr)) {
258 // The instruction defines NZCV, but NZCV is dead.
259 unsigned NewOpc = getNonFlagSettingVariant(Opc: II.getOpcode());
260 int DeadNZCVIdx =
261 II.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr);
262 if (DeadNZCVIdx != -1) {
263 if (NewOpc) {
264 // If there is an equivalent non-flag-setting op, we convert.
265 LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting "
266 "op: "
267 << II);
268 II.setDesc(TII->get(Opcode: NewOpc));
269 II.removeOperand(OpNo: DeadNZCVIdx);
270 // Changing the opcode can result in differing regclass requirements,
271 // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
272 // Constrain the regclasses, possibly introducing a copy.
273 constrainOperandRegClass(MF, TRI: *TRI, MRI, TII: *TII, RBI: *RBI, InsertPt&: II, II: II.getDesc(),
274 RegMO&: II.getOperand(i: 0), OpIdx: 0);
275 Changed |= true;
276 } else {
277 // Otherwise, we just set the nzcv imp-def operand to be dead, so the
278 // peephole optimizations can optimize them further.
279 II.getOperand(i: DeadNZCVIdx).setIsDead();
280 }
281 }
282 }
283 LRU.stepBackward(MI: II);
284 }
285 return Changed;
286}
287
288bool runAArch64PostSelectOptimize(MachineFunction &MF) {
289 if (MF.getProperties().hasFailedISel())
290 return false;
291 assert(MF.getProperties().hasSelected() && "Expected a selected MF");
292
293 bool Changed = false;
294 for (auto &BB : MF) {
295 Changed |= optimizeNZCVDefs(MBB&: BB);
296 Changed |= doPeepholeOpts(MBB&: BB);
297 }
298 return Changed;
299}
300
301bool AArch64PostSelectOptimizeLegacy::runOnMachineFunction(
302 MachineFunction &MF) {
303 return runAArch64PostSelectOptimize(MF);
304}
305
306char AArch64PostSelectOptimizeLegacy::ID = 0;
307INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimizeLegacy, DEBUG_TYPE,
308 "Optimize AArch64 selected instructions", false, false)
309INITIALIZE_PASS_END(AArch64PostSelectOptimizeLegacy, DEBUG_TYPE,
310 "Optimize AArch64 selected instructions", false, false)
311
312namespace llvm {
313FunctionPass *createAArch64PostSelectOptimize() {
314 return new AArch64PostSelectOptimizeLegacy();
315}
316
317PreservedAnalyses
318AArch64PostSelectOptimizePass::run(MachineFunction &MF,
319 MachineFunctionAnalysisManager &MFAM) {
320 const bool Changed = runAArch64PostSelectOptimize(MF);
321 if (!Changed)
322 return PreservedAnalyses::all();
323 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
324 PA.preserveSet<CFGAnalyses>();
325 return PA;
326}
327} // end namespace llvm
328