1 | //===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the AArch64ConditionalCompares pass which reduces |
10 | // branching and code size by using the conditional compare instructions CCMP, |
11 | // CCMN, and FCMP. |
12 | // |
13 | // The CFG transformations for forming conditional compares are very similar to |
14 | // if-conversion, and this pass should run immediately before the early |
15 | // if-conversion pass. |
16 | // |
17 | //===----------------------------------------------------------------------===// |
18 | |
19 | #include "AArch64.h" |
20 | #include "llvm/ADT/DepthFirstIterator.h" |
21 | #include "llvm/ADT/Statistic.h" |
22 | #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" |
23 | #include "llvm/CodeGen/MachineDominators.h" |
24 | #include "llvm/CodeGen/MachineFunction.h" |
25 | #include "llvm/CodeGen/MachineFunctionPass.h" |
26 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
27 | #include "llvm/CodeGen/MachineLoopInfo.h" |
28 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
29 | #include "llvm/CodeGen/MachineTraceMetrics.h" |
30 | #include "llvm/CodeGen/Passes.h" |
31 | #include "llvm/CodeGen/TargetInstrInfo.h" |
32 | #include "llvm/CodeGen/TargetRegisterInfo.h" |
33 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
34 | #include "llvm/InitializePasses.h" |
35 | #include "llvm/Support/CommandLine.h" |
36 | #include "llvm/Support/Debug.h" |
37 | #include "llvm/Support/raw_ostream.h" |
38 | |
39 | using namespace llvm; |
40 | |
41 | #define DEBUG_TYPE "aarch64-ccmp" |
42 | |
43 | // Absolute maximum number of instructions allowed per speculated block. |
44 | // This bypasses all other heuristics, so it should be set fairly high. |
45 | static cl::opt<unsigned> BlockInstrLimit( |
46 | "aarch64-ccmp-limit" , cl::init(Val: 30), cl::Hidden, |
47 | cl::desc("Maximum number of instructions per speculated block." )); |
48 | |
49 | // Stress testing mode - disable heuristics. |
50 | static cl::opt<bool> Stress("aarch64-stress-ccmp" , cl::Hidden, |
51 | cl::desc("Turn all knobs to 11" )); |
52 | |
53 | STATISTIC(NumConsidered, "Number of ccmps considered" ); |
54 | STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)" ); |
55 | STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)" ); |
56 | STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)" ); |
57 | STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)" ); |
58 | STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)" ); |
59 | STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)" ); |
60 | STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)" ); |
61 | STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)" ); |
62 | STATISTIC(NumMultNZCVUses, "Number of ccmps rejected (NZCV used)" ); |
63 | STATISTIC(NumUnknNZCVDefs, "Number of ccmps rejected (NZCV def unknown)" ); |
64 | |
65 | STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)" ); |
66 | |
67 | STATISTIC(NumConverted, "Number of ccmp instructions created" ); |
68 | STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted" ); |
69 | |
70 | //===----------------------------------------------------------------------===// |
71 | // SSACCmpConv |
72 | //===----------------------------------------------------------------------===// |
73 | // |
74 | // The SSACCmpConv class performs ccmp-conversion on SSA form machine code |
75 | // after determining if it is possible. The class contains no heuristics; |
76 | // external code should be used to determine when ccmp-conversion is a good |
77 | // idea. |
78 | // |
79 | // CCmp-formation works on a CFG representing chained conditions, typically |
80 | // from C's short-circuit || and && operators: |
81 | // |
82 | // From: Head To: Head |
83 | // / | CmpBB |
84 | // / | / | |
85 | // | CmpBB / | |
86 | // | / | Tail | |
87 | // | / | | | |
88 | // Tail | | | |
89 | // | | | | |
90 | // ... ... ... ... |
91 | // |
92 | // The Head block is terminated by a br.cond instruction, and the CmpBB block |
93 | // contains compare + br.cond. Tail must be a successor of both. |
94 | // |
95 | // The cmp-conversion turns the compare instruction in CmpBB into a conditional |
96 | // compare, and merges CmpBB into Head, speculatively executing its |
97 | // instructions. The AArch64 conditional compare instructions have an immediate |
98 | // operand that specifies the NZCV flag values when the condition is false and |
99 | // the compare isn't executed. This makes it possible to chain compares with |
100 | // different condition codes. |
101 | // |
102 | // Example: |
103 | // |
104 | // if (a == 5 || b == 17) |
105 | // foo(); |
106 | // |
107 | // Head: |
108 | // cmp w0, #5 |
109 | // b.eq Tail |
110 | // CmpBB: |
111 | // cmp w1, #17 |
112 | // b.eq Tail |
113 | // ... |
114 | // Tail: |
115 | // bl _foo |
116 | // |
117 | // Becomes: |
118 | // |
119 | // Head: |
120 | // cmp w0, #5 |
121 | // ccmp w1, #17, 4, ne ; 4 = nZcv |
122 | // b.eq Tail |
123 | // ... |
124 | // Tail: |
125 | // bl _foo |
126 | // |
127 | // The ccmp condition code is the one that would cause the Head terminator to |
128 | // branch to CmpBB. |
129 | // |
130 | // FIXME: It should also be possible to speculate a block on the critical edge |
131 | // between Head and Tail, just like if-converting a diamond. |
132 | // |
133 | // FIXME: Handle PHIs in Tail by turning them into selects (if-conversion). |
134 | |
135 | namespace { |
136 | class SSACCmpConv { |
137 | MachineFunction *MF; |
138 | const TargetInstrInfo *TII; |
139 | const TargetRegisterInfo *TRI; |
140 | MachineRegisterInfo *MRI; |
141 | const MachineBranchProbabilityInfo *MBPI; |
142 | |
143 | public: |
144 | /// The first block containing a conditional branch, dominating everything |
145 | /// else. |
146 | MachineBasicBlock *Head; |
147 | |
148 | /// The block containing cmp+br.cond with a successor shared with Head. |
149 | MachineBasicBlock *CmpBB; |
150 | |
151 | /// The common successor for Head and CmpBB. |
152 | MachineBasicBlock *Tail; |
153 | |
154 | /// The compare instruction in CmpBB that can be converted to a ccmp. |
155 | MachineInstr *CmpMI; |
156 | |
157 | private: |
158 | /// The branch condition in Head as determined by analyzeBranch. |
159 | SmallVector<MachineOperand, 4> HeadCond; |
160 | |
161 | /// The condition code that makes Head branch to CmpBB. |
162 | AArch64CC::CondCode HeadCmpBBCC; |
163 | |
164 | /// The branch condition in CmpBB. |
165 | SmallVector<MachineOperand, 4> CmpBBCond; |
166 | |
167 | /// The condition code that makes CmpBB branch to Tail. |
168 | AArch64CC::CondCode CmpBBTailCC; |
169 | |
170 | /// Check if the Tail PHIs are trivially convertible. |
171 | bool trivialTailPHIs(); |
172 | |
173 | /// Remove CmpBB from the Tail PHIs. |
174 | void updateTailPHIs(); |
175 | |
176 | /// Check if an operand defining DstReg is dead. |
177 | bool isDeadDef(unsigned DstReg); |
178 | |
179 | /// Find the compare instruction in MBB that controls the conditional branch. |
180 | /// Return NULL if a convertible instruction can't be found. |
181 | MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB); |
182 | |
183 | /// Return true if all non-terminator instructions in MBB can be safely |
184 | /// speculated. |
185 | bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI); |
186 | |
187 | public: |
188 | /// runOnMachineFunction - Initialize per-function data structures. |
189 | void runOnMachineFunction(MachineFunction &MF, |
190 | const MachineBranchProbabilityInfo *MBPI) { |
191 | this->MF = &MF; |
192 | this->MBPI = MBPI; |
193 | TII = MF.getSubtarget().getInstrInfo(); |
194 | TRI = MF.getSubtarget().getRegisterInfo(); |
195 | MRI = &MF.getRegInfo(); |
196 | } |
197 | |
198 | /// If the sub-CFG headed by MBB can be cmp-converted, initialize the |
199 | /// internal state, and return true. |
200 | bool canConvert(MachineBasicBlock *MBB); |
201 | |
202 | /// Cmo-convert the last block passed to canConvertCmp(), assuming |
203 | /// it is possible. Add any erased blocks to RemovedBlocks. |
204 | void convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks); |
205 | |
206 | /// Return the expected code size delta if the conversion into a |
207 | /// conditional compare is performed. |
208 | int expectedCodeSizeDelta() const; |
209 | }; |
210 | } // end anonymous namespace |
211 | |
212 | // Check that all PHIs in Tail are selecting the same value from Head and CmpBB. |
213 | // This means that no if-conversion is required when merging CmpBB into Head. |
214 | bool SSACCmpConv::trivialTailPHIs() { |
215 | for (auto &I : *Tail) { |
216 | if (!I.isPHI()) |
217 | break; |
218 | unsigned HeadReg = 0, CmpBBReg = 0; |
219 | // PHI operands come in (VReg, MBB) pairs. |
220 | for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) { |
221 | MachineBasicBlock *MBB = I.getOperand(i: oi + 1).getMBB(); |
222 | Register Reg = I.getOperand(i: oi).getReg(); |
223 | if (MBB == Head) { |
224 | assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands" ); |
225 | HeadReg = Reg; |
226 | } |
227 | if (MBB == CmpBB) { |
228 | assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands" ); |
229 | CmpBBReg = Reg; |
230 | } |
231 | } |
232 | if (HeadReg != CmpBBReg) |
233 | return false; |
234 | } |
235 | return true; |
236 | } |
237 | |
238 | // Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply |
239 | // removing the CmpBB operands. The Head operands will be identical. |
240 | void SSACCmpConv::updateTailPHIs() { |
241 | for (auto &I : *Tail) { |
242 | if (!I.isPHI()) |
243 | break; |
244 | // I is a PHI. It can have multiple entries for CmpBB. |
245 | for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) { |
246 | // PHI operands are (Reg, MBB) at (oi-2, oi-1). |
247 | if (I.getOperand(i: oi - 1).getMBB() == CmpBB) { |
248 | I.removeOperand(OpNo: oi - 1); |
249 | I.removeOperand(OpNo: oi - 2); |
250 | } |
251 | } |
252 | } |
253 | } |
254 | |
255 | // This pass runs before the AArch64DeadRegisterDefinitions pass, so compares |
256 | // are still writing virtual registers without any uses. |
257 | bool SSACCmpConv::isDeadDef(unsigned DstReg) { |
258 | // Writes to the zero register are dead. |
259 | if (DstReg == AArch64::WZR || DstReg == AArch64::XZR) |
260 | return true; |
261 | if (!Register::isVirtualRegister(Reg: DstReg)) |
262 | return false; |
263 | // A virtual register def without any uses will be marked dead later, and |
264 | // eventually replaced by the zero register. |
265 | return MRI->use_nodbg_empty(RegNo: DstReg); |
266 | } |
267 | |
268 | // Parse a condition code returned by analyzeBranch, and compute the CondCode |
269 | // corresponding to TBB. |
270 | // Return |
271 | static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) { |
272 | // A normal br.cond simply has the condition code. |
273 | if (Cond[0].getImm() != -1) { |
274 | assert(Cond.size() == 1 && "Unknown Cond array format" ); |
275 | CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); |
276 | return true; |
277 | } |
278 | // For tbz and cbz instruction, the opcode is next. |
279 | switch (Cond[1].getImm()) { |
280 | default: |
281 | // This includes tbz / tbnz branches which can't be converted to |
282 | // ccmp + br.cond. |
283 | return false; |
284 | case AArch64::CBZW: |
285 | case AArch64::CBZX: |
286 | assert(Cond.size() == 3 && "Unknown Cond array format" ); |
287 | CC = AArch64CC::EQ; |
288 | return true; |
289 | case AArch64::CBNZW: |
290 | case AArch64::CBNZX: |
291 | assert(Cond.size() == 3 && "Unknown Cond array format" ); |
292 | CC = AArch64CC::NE; |
293 | return true; |
294 | } |
295 | } |
296 | |
297 | MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { |
298 | MachineBasicBlock::iterator I = MBB->getFirstTerminator(); |
299 | if (I == MBB->end()) |
300 | return nullptr; |
301 | // The terminator must be controlled by the flags. |
302 | if (!I->readsRegister(Reg: AArch64::NZCV, /*TRI=*/nullptr)) { |
303 | switch (I->getOpcode()) { |
304 | case AArch64::CBZW: |
305 | case AArch64::CBZX: |
306 | case AArch64::CBNZW: |
307 | case AArch64::CBNZX: |
308 | // These can be converted into a ccmp against #0. |
309 | return &*I; |
310 | } |
311 | ++NumCmpTermRejs; |
312 | LLVM_DEBUG(dbgs() << "Flags not used by terminator: " << *I); |
313 | return nullptr; |
314 | } |
315 | |
316 | // Now find the instruction controlling the terminator. |
317 | for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { |
318 | I = prev_nodbg(It: I, Begin: MBB->begin()); |
319 | assert(!I->isTerminator() && "Spurious terminator" ); |
320 | switch (I->getOpcode()) { |
321 | // cmp is an alias for subs with a dead destination register. |
322 | case AArch64::SUBSWri: |
323 | case AArch64::SUBSXri: |
324 | // cmn is an alias for adds with a dead destination register. |
325 | case AArch64::ADDSWri: |
326 | case AArch64::ADDSXri: |
327 | // Check that the immediate operand is within range, ccmp wants a uimm5. |
328 | // Rd = SUBSri Rn, imm, shift |
329 | if (I->getOperand(i: 3).getImm() || !isUInt<5>(x: I->getOperand(i: 2).getImm())) { |
330 | LLVM_DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I); |
331 | ++NumImmRangeRejs; |
332 | return nullptr; |
333 | } |
334 | [[fallthrough]]; |
335 | case AArch64::SUBSWrr: |
336 | case AArch64::SUBSXrr: |
337 | case AArch64::ADDSWrr: |
338 | case AArch64::ADDSXrr: |
339 | if (isDeadDef(DstReg: I->getOperand(i: 0).getReg())) |
340 | return &*I; |
341 | LLVM_DEBUG(dbgs() << "Can't convert compare with live destination: " |
342 | << *I); |
343 | ++NumLiveDstRejs; |
344 | return nullptr; |
345 | case AArch64::FCMPSrr: |
346 | case AArch64::FCMPDrr: |
347 | case AArch64::FCMPESrr: |
348 | case AArch64::FCMPEDrr: |
349 | return &*I; |
350 | } |
351 | |
352 | // Check for flag reads and clobbers. |
353 | PhysRegInfo PRI = AnalyzePhysRegInBundle(MI: *I, Reg: AArch64::NZCV, TRI); |
354 | |
355 | if (PRI.Read) { |
356 | // The ccmp doesn't produce exactly the same flags as the original |
357 | // compare, so reject the transform if there are uses of the flags |
358 | // besides the terminators. |
359 | LLVM_DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I); |
360 | ++NumMultNZCVUses; |
361 | return nullptr; |
362 | } |
363 | |
364 | if (PRI.Defined || PRI.Clobbered) { |
365 | LLVM_DEBUG(dbgs() << "Not convertible compare: " << *I); |
366 | ++NumUnknNZCVDefs; |
367 | return nullptr; |
368 | } |
369 | } |
370 | LLVM_DEBUG(dbgs() << "Flags not defined in " << printMBBReference(*MBB) |
371 | << '\n'); |
372 | return nullptr; |
373 | } |
374 | |
375 | /// Determine if all the instructions in MBB can safely |
376 | /// be speculated. The terminators are not considered. |
377 | /// |
378 | /// Only CmpMI is allowed to clobber the flags. |
379 | /// |
380 | bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB, |
381 | const MachineInstr *CmpMI) { |
382 | // Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to |
383 | // get right. |
384 | if (!MBB->livein_empty()) { |
385 | LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n" ); |
386 | return false; |
387 | } |
388 | |
389 | unsigned InstrCount = 0; |
390 | |
391 | // Check all instructions, except the terminators. It is assumed that |
392 | // terminators never have side effects or define any used register values. |
393 | for (auto &I : make_range(x: MBB->begin(), y: MBB->getFirstTerminator())) { |
394 | if (I.isDebugInstr()) |
395 | continue; |
396 | |
397 | if (++InstrCount > BlockInstrLimit && !Stress) { |
398 | LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has more than " |
399 | << BlockInstrLimit << " instructions.\n" ); |
400 | return false; |
401 | } |
402 | |
403 | // There shouldn't normally be any phis in a single-predecessor block. |
404 | if (I.isPHI()) { |
405 | LLVM_DEBUG(dbgs() << "Can't hoist: " << I); |
406 | return false; |
407 | } |
408 | |
409 | // Don't speculate loads. Note that it may be possible and desirable to |
410 | // speculate GOT or constant pool loads that are guaranteed not to trap, |
411 | // but we don't support that for now. |
412 | if (I.mayLoad()) { |
413 | LLVM_DEBUG(dbgs() << "Won't speculate load: " << I); |
414 | return false; |
415 | } |
416 | |
417 | // We never speculate stores, so an AA pointer isn't necessary. |
418 | bool DontMoveAcrossStore = true; |
419 | if (!I.isSafeToMove(AA: nullptr, SawStore&: DontMoveAcrossStore)) { |
420 | LLVM_DEBUG(dbgs() << "Can't speculate: " << I); |
421 | return false; |
422 | } |
423 | |
424 | // Only CmpMI is allowed to clobber the flags. |
425 | if (&I != CmpMI && I.modifiesRegister(Reg: AArch64::NZCV, TRI)) { |
426 | LLVM_DEBUG(dbgs() << "Clobbers flags: " << I); |
427 | return false; |
428 | } |
429 | } |
430 | return true; |
431 | } |
432 | |
433 | /// Analyze the sub-cfg rooted in MBB, and return true if it is a potential |
434 | /// candidate for cmp-conversion. Fill out the internal state. |
435 | /// |
436 | bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { |
437 | Head = MBB; |
438 | Tail = CmpBB = nullptr; |
439 | |
440 | if (Head->succ_size() != 2) |
441 | return false; |
442 | MachineBasicBlock *Succ0 = Head->succ_begin()[0]; |
443 | MachineBasicBlock *Succ1 = Head->succ_begin()[1]; |
444 | |
445 | // CmpBB can only have a single predecessor. Tail is allowed many. |
446 | if (Succ0->pred_size() != 1) |
447 | std::swap(a&: Succ0, b&: Succ1); |
448 | |
449 | // Succ0 is our candidate for CmpBB. |
450 | if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2) |
451 | return false; |
452 | |
453 | CmpBB = Succ0; |
454 | Tail = Succ1; |
455 | |
456 | if (!CmpBB->isSuccessor(MBB: Tail)) |
457 | return false; |
458 | |
459 | // The CFG topology checks out. |
460 | LLVM_DEBUG(dbgs() << "\nTriangle: " << printMBBReference(*Head) << " -> " |
461 | << printMBBReference(*CmpBB) << " -> " |
462 | << printMBBReference(*Tail) << '\n'); |
463 | ++NumConsidered; |
464 | |
465 | // Tail is allowed to have many predecessors, but we can't handle PHIs yet. |
466 | // |
467 | // FIXME: Real PHIs could be if-converted as long as the CmpBB values are |
468 | // defined before The CmpBB cmp clobbers the flags. Alternatively, it should |
469 | // always be safe to sink the ccmp down to immediately before the CmpBB |
470 | // terminators. |
471 | if (!trivialTailPHIs()) { |
472 | LLVM_DEBUG(dbgs() << "Can't handle phis in Tail.\n" ); |
473 | ++NumPhiRejs; |
474 | return false; |
475 | } |
476 | |
477 | if (!Tail->livein_empty()) { |
478 | LLVM_DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n" ); |
479 | ++NumPhysRejs; |
480 | return false; |
481 | } |
482 | |
483 | // CmpBB should never have PHIs since Head is its only predecessor. |
484 | // FIXME: Clean them up if it happens. |
485 | if (!CmpBB->empty() && CmpBB->front().isPHI()) { |
486 | LLVM_DEBUG(dbgs() << "Can't handle phis in CmpBB.\n" ); |
487 | ++NumPhi2Rejs; |
488 | return false; |
489 | } |
490 | |
491 | if (!CmpBB->livein_empty()) { |
492 | LLVM_DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n" ); |
493 | ++NumPhysRejs; |
494 | return false; |
495 | } |
496 | |
497 | // The branch we're looking to eliminate must be analyzable. |
498 | HeadCond.clear(); |
499 | MachineBasicBlock *TBB = nullptr, *FBB = nullptr; |
500 | if (TII->analyzeBranch(MBB&: *Head, TBB, FBB, Cond&: HeadCond)) { |
501 | LLVM_DEBUG(dbgs() << "Head branch not analyzable.\n" ); |
502 | ++NumHeadBranchRejs; |
503 | return false; |
504 | } |
505 | |
506 | // This is weird, probably some sort of degenerate CFG, or an edge to a |
507 | // landing pad. |
508 | if (!TBB || HeadCond.empty()) { |
509 | LLVM_DEBUG( |
510 | dbgs() << "analyzeBranch didn't find conditional branch in Head.\n" ); |
511 | ++NumHeadBranchRejs; |
512 | return false; |
513 | } |
514 | |
515 | if (!parseCond(Cond: HeadCond, CC&: HeadCmpBBCC)) { |
516 | LLVM_DEBUG(dbgs() << "Unsupported branch type on Head\n" ); |
517 | ++NumHeadBranchRejs; |
518 | return false; |
519 | } |
520 | |
521 | // Make sure the branch direction is right. |
522 | if (TBB != CmpBB) { |
523 | assert(TBB == Tail && "Unexpected TBB" ); |
524 | HeadCmpBBCC = AArch64CC::getInvertedCondCode(Code: HeadCmpBBCC); |
525 | } |
526 | |
527 | CmpBBCond.clear(); |
528 | TBB = FBB = nullptr; |
529 | if (TII->analyzeBranch(MBB&: *CmpBB, TBB, FBB, Cond&: CmpBBCond)) { |
530 | LLVM_DEBUG(dbgs() << "CmpBB branch not analyzable.\n" ); |
531 | ++NumCmpBranchRejs; |
532 | return false; |
533 | } |
534 | |
535 | if (!TBB || CmpBBCond.empty()) { |
536 | LLVM_DEBUG( |
537 | dbgs() << "analyzeBranch didn't find conditional branch in CmpBB.\n" ); |
538 | ++NumCmpBranchRejs; |
539 | return false; |
540 | } |
541 | |
542 | if (!parseCond(Cond: CmpBBCond, CC&: CmpBBTailCC)) { |
543 | LLVM_DEBUG(dbgs() << "Unsupported branch type on CmpBB\n" ); |
544 | ++NumCmpBranchRejs; |
545 | return false; |
546 | } |
547 | |
548 | if (TBB != Tail) |
549 | CmpBBTailCC = AArch64CC::getInvertedCondCode(Code: CmpBBTailCC); |
550 | |
551 | LLVM_DEBUG(dbgs() << "Head->CmpBB on " |
552 | << AArch64CC::getCondCodeName(HeadCmpBBCC) |
553 | << ", CmpBB->Tail on " |
554 | << AArch64CC::getCondCodeName(CmpBBTailCC) << '\n'); |
555 | |
556 | CmpMI = findConvertibleCompare(MBB: CmpBB); |
557 | if (!CmpMI) |
558 | return false; |
559 | |
560 | if (!canSpeculateInstrs(MBB: CmpBB, CmpMI)) { |
561 | ++NumSpeculateRejs; |
562 | return false; |
563 | } |
564 | return true; |
565 | } |
566 | |
567 | void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { |
568 | LLVM_DEBUG(dbgs() << "Merging " << printMBBReference(*CmpBB) << " into " |
569 | << printMBBReference(*Head) << ":\n" |
570 | << *CmpBB); |
571 | |
572 | // All CmpBB instructions are moved into Head, and CmpBB is deleted. |
573 | // Update the CFG first. |
574 | updateTailPHIs(); |
575 | |
576 | // Save successor probabilties before removing CmpBB and Tail from their |
577 | // parents. |
578 | BranchProbability Head2CmpBB = MBPI->getEdgeProbability(Src: Head, Dst: CmpBB); |
579 | BranchProbability CmpBB2Tail = MBPI->getEdgeProbability(Src: CmpBB, Dst: Tail); |
580 | |
581 | Head->removeSuccessor(Succ: CmpBB); |
582 | CmpBB->removeSuccessor(Succ: Tail); |
583 | |
584 | // If Head and CmpBB had successor probabilties, udpate the probabilities to |
585 | // reflect the ccmp-conversion. |
586 | if (Head->hasSuccessorProbabilities() && CmpBB->hasSuccessorProbabilities()) { |
587 | |
588 | // Head is allowed two successors. We've removed CmpBB, so the remaining |
589 | // successor is Tail. We need to increase the successor probability for |
590 | // Tail to account for the CmpBB path we removed. |
591 | // |
592 | // Pr(Tail|Head) += Pr(CmpBB|Head) * Pr(Tail|CmpBB). |
593 | assert(*Head->succ_begin() == Tail && "Head successor is not Tail" ); |
594 | BranchProbability Head2Tail = MBPI->getEdgeProbability(Src: Head, Dst: Tail); |
595 | Head->setSuccProbability(I: Head->succ_begin(), |
596 | Prob: Head2Tail + Head2CmpBB * CmpBB2Tail); |
597 | |
598 | // We will transfer successors of CmpBB to Head in a moment without |
599 | // normalizing the successor probabilities. Set the successor probabilites |
600 | // before doing so. |
601 | // |
602 | // Pr(I|Head) = Pr(CmpBB|Head) * Pr(I|CmpBB). |
603 | for (auto I = CmpBB->succ_begin(), E = CmpBB->succ_end(); I != E; ++I) { |
604 | BranchProbability CmpBB2I = MBPI->getEdgeProbability(Src: CmpBB, Dst: *I); |
605 | CmpBB->setSuccProbability(I, Prob: Head2CmpBB * CmpBB2I); |
606 | } |
607 | } |
608 | |
609 | Head->transferSuccessorsAndUpdatePHIs(FromMBB: CmpBB); |
610 | DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); |
611 | TII->removeBranch(MBB&: *Head); |
612 | |
613 | // If the Head terminator was one of the cbz / tbz branches with built-in |
614 | // compare, we need to insert an explicit compare instruction in its place. |
615 | if (HeadCond[0].getImm() == -1) { |
616 | ++NumCompBranches; |
617 | unsigned Opc = 0; |
618 | switch (HeadCond[1].getImm()) { |
619 | case AArch64::CBZW: |
620 | case AArch64::CBNZW: |
621 | Opc = AArch64::SUBSWri; |
622 | break; |
623 | case AArch64::CBZX: |
624 | case AArch64::CBNZX: |
625 | Opc = AArch64::SUBSXri; |
626 | break; |
627 | default: |
628 | llvm_unreachable("Cannot convert Head branch" ); |
629 | } |
630 | const MCInstrDesc &MCID = TII->get(Opcode: Opc); |
631 | // Create a dummy virtual register for the SUBS def. |
632 | Register DestReg = |
633 | MRI->createVirtualRegister(RegClass: TII->getRegClass(MCID, OpNum: 0, TRI, MF: *MF)); |
634 | // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. |
635 | BuildMI(BB&: *Head, I: Head->end(), MIMD: TermDL, MCID) |
636 | .addReg(RegNo: DestReg, flags: RegState::Define | RegState::Dead) |
637 | .add(MO: HeadCond[2]) |
638 | .addImm(Val: 0) |
639 | .addImm(Val: 0); |
640 | // SUBS uses the GPR*sp register classes. |
641 | MRI->constrainRegClass(Reg: HeadCond[2].getReg(), |
642 | RC: TII->getRegClass(MCID, OpNum: 1, TRI, MF: *MF)); |
643 | } |
644 | |
645 | Head->splice(Where: Head->end(), Other: CmpBB, From: CmpBB->begin(), To: CmpBB->end()); |
646 | |
647 | // Now replace CmpMI with a ccmp instruction that also considers the incoming |
648 | // flags. |
649 | unsigned Opc = 0; |
650 | unsigned FirstOp = 1; // First CmpMI operand to copy. |
651 | bool isZBranch = false; // CmpMI is a cbz/cbnz instruction. |
652 | switch (CmpMI->getOpcode()) { |
653 | default: |
654 | llvm_unreachable("Unknown compare opcode" ); |
655 | case AArch64::SUBSWri: Opc = AArch64::CCMPWi; break; |
656 | case AArch64::SUBSWrr: Opc = AArch64::CCMPWr; break; |
657 | case AArch64::SUBSXri: Opc = AArch64::CCMPXi; break; |
658 | case AArch64::SUBSXrr: Opc = AArch64::CCMPXr; break; |
659 | case AArch64::ADDSWri: Opc = AArch64::CCMNWi; break; |
660 | case AArch64::ADDSWrr: Opc = AArch64::CCMNWr; break; |
661 | case AArch64::ADDSXri: Opc = AArch64::CCMNXi; break; |
662 | case AArch64::ADDSXrr: Opc = AArch64::CCMNXr; break; |
663 | case AArch64::FCMPSrr: Opc = AArch64::FCCMPSrr; FirstOp = 0; break; |
664 | case AArch64::FCMPDrr: Opc = AArch64::FCCMPDrr; FirstOp = 0; break; |
665 | case AArch64::FCMPESrr: Opc = AArch64::FCCMPESrr; FirstOp = 0; break; |
666 | case AArch64::FCMPEDrr: Opc = AArch64::FCCMPEDrr; FirstOp = 0; break; |
667 | case AArch64::CBZW: |
668 | case AArch64::CBNZW: |
669 | Opc = AArch64::CCMPWi; |
670 | FirstOp = 0; |
671 | isZBranch = true; |
672 | break; |
673 | case AArch64::CBZX: |
674 | case AArch64::CBNZX: |
675 | Opc = AArch64::CCMPXi; |
676 | FirstOp = 0; |
677 | isZBranch = true; |
678 | break; |
679 | } |
680 | |
681 | // The ccmp instruction should set the flags according to the comparison when |
682 | // Head would have branched to CmpBB. |
683 | // The NZCV immediate operand should provide flags for the case where Head |
684 | // would have branched to Tail. These flags should cause the new Head |
685 | // terminator to branch to tail. |
686 | unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: CmpBBTailCC); |
687 | const MCInstrDesc &MCID = TII->get(Opcode: Opc); |
688 | MRI->constrainRegClass(Reg: CmpMI->getOperand(i: FirstOp).getReg(), |
689 | RC: TII->getRegClass(MCID, OpNum: 0, TRI, MF: *MF)); |
690 | if (CmpMI->getOperand(i: FirstOp + 1).isReg()) |
691 | MRI->constrainRegClass(Reg: CmpMI->getOperand(i: FirstOp + 1).getReg(), |
692 | RC: TII->getRegClass(MCID, OpNum: 1, TRI, MF: *MF)); |
693 | MachineInstrBuilder MIB = BuildMI(BB&: *Head, I: CmpMI, MIMD: CmpMI->getDebugLoc(), MCID) |
694 | .add(MO: CmpMI->getOperand(i: FirstOp)); // Register Rn |
695 | if (isZBranch) |
696 | MIB.addImm(Val: 0); // cbz/cbnz Rn -> ccmp Rn, #0 |
697 | else |
698 | MIB.add(MO: CmpMI->getOperand(i: FirstOp + 1)); // Register Rm / Immediate |
699 | MIB.addImm(Val: NZCV).addImm(Val: HeadCmpBBCC); |
700 | |
701 | // If CmpMI was a terminator, we need a new conditional branch to replace it. |
702 | // This now becomes a Head terminator. |
703 | if (isZBranch) { |
704 | bool isNZ = CmpMI->getOpcode() == AArch64::CBNZW || |
705 | CmpMI->getOpcode() == AArch64::CBNZX; |
706 | BuildMI(BB&: *Head, I: CmpMI, MIMD: CmpMI->getDebugLoc(), MCID: TII->get(Opcode: AArch64::Bcc)) |
707 | .addImm(Val: isNZ ? AArch64CC::NE : AArch64CC::EQ) |
708 | .add(MO: CmpMI->getOperand(i: 1)); // Branch target. |
709 | } |
710 | CmpMI->eraseFromParent(); |
711 | Head->updateTerminator(PreviousLayoutSuccessor: CmpBB->getNextNode()); |
712 | |
713 | RemovedBlocks.push_back(Elt: CmpBB); |
714 | CmpBB->eraseFromParent(); |
715 | LLVM_DEBUG(dbgs() << "Result:\n" << *Head); |
716 | ++NumConverted; |
717 | } |
718 | |
719 | int SSACCmpConv::expectedCodeSizeDelta() const { |
720 | int delta = 0; |
721 | // If the Head terminator was one of the cbz / tbz branches with built-in |
722 | // compare, we need to insert an explicit compare instruction in its place |
723 | // plus a branch instruction. |
724 | if (HeadCond[0].getImm() == -1) { |
725 | switch (HeadCond[1].getImm()) { |
726 | case AArch64::CBZW: |
727 | case AArch64::CBNZW: |
728 | case AArch64::CBZX: |
729 | case AArch64::CBNZX: |
730 | // Therefore delta += 1 |
731 | delta = 1; |
732 | break; |
733 | default: |
734 | llvm_unreachable("Cannot convert Head branch" ); |
735 | } |
736 | } |
737 | // If the Cmp terminator was one of the cbz / tbz branches with |
738 | // built-in compare, it will be turned into a compare instruction |
739 | // into Head, but we do not save any instruction. |
740 | // Otherwise, we save the branch instruction. |
741 | switch (CmpMI->getOpcode()) { |
742 | default: |
743 | --delta; |
744 | break; |
745 | case AArch64::CBZW: |
746 | case AArch64::CBNZW: |
747 | case AArch64::CBZX: |
748 | case AArch64::CBNZX: |
749 | break; |
750 | } |
751 | return delta; |
752 | } |
753 | |
754 | //===----------------------------------------------------------------------===// |
755 | // AArch64ConditionalCompares Pass |
756 | //===----------------------------------------------------------------------===// |
757 | |
758 | namespace { |
759 | class AArch64ConditionalCompares : public MachineFunctionPass { |
760 | const MachineBranchProbabilityInfo *MBPI; |
761 | const TargetInstrInfo *TII; |
762 | const TargetRegisterInfo *TRI; |
763 | MCSchedModel SchedModel; |
764 | // Does the proceeded function has Oz attribute. |
765 | bool MinSize; |
766 | MachineRegisterInfo *MRI; |
767 | MachineDominatorTree *DomTree; |
768 | MachineLoopInfo *Loops; |
769 | MachineTraceMetrics *Traces; |
770 | MachineTraceMetrics::Ensemble *MinInstr; |
771 | SSACCmpConv CmpConv; |
772 | |
773 | public: |
774 | static char ID; |
775 | AArch64ConditionalCompares() : MachineFunctionPass(ID) { |
776 | initializeAArch64ConditionalComparesPass(*PassRegistry::getPassRegistry()); |
777 | } |
778 | void getAnalysisUsage(AnalysisUsage &AU) const override; |
779 | bool runOnMachineFunction(MachineFunction &MF) override; |
780 | StringRef getPassName() const override { |
781 | return "AArch64 Conditional Compares" ; |
782 | } |
783 | |
784 | private: |
785 | bool tryConvert(MachineBasicBlock *); |
786 | void updateDomTree(ArrayRef<MachineBasicBlock *> Removed); |
787 | void updateLoops(ArrayRef<MachineBasicBlock *> Removed); |
788 | void invalidateTraces(); |
789 | bool shouldConvert(); |
790 | }; |
791 | } // end anonymous namespace |
792 | |
793 | char AArch64ConditionalCompares::ID = 0; |
794 | |
795 | INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp" , |
796 | "AArch64 CCMP Pass" , false, false) |
797 | INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) |
798 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) |
799 | INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) |
800 | INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp" , |
801 | "AArch64 CCMP Pass" , false, false) |
802 | |
803 | FunctionPass *llvm::createAArch64ConditionalCompares() { |
804 | return new AArch64ConditionalCompares(); |
805 | } |
806 | |
807 | void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { |
808 | AU.addRequired<MachineBranchProbabilityInfoWrapperPass>(); |
809 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
810 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
811 | AU.addRequired<MachineLoopInfoWrapperPass>(); |
812 | AU.addPreserved<MachineLoopInfoWrapperPass>(); |
813 | AU.addRequired<MachineTraceMetrics>(); |
814 | AU.addPreserved<MachineTraceMetrics>(); |
815 | MachineFunctionPass::getAnalysisUsage(AU); |
816 | } |
817 | |
818 | /// Update the dominator tree after if-conversion erased some blocks. |
819 | void AArch64ConditionalCompares::updateDomTree( |
820 | ArrayRef<MachineBasicBlock *> Removed) { |
821 | // convert() removes CmpBB which was previously dominated by Head. |
822 | // CmpBB children should be transferred to Head. |
823 | MachineDomTreeNode *HeadNode = DomTree->getNode(BB: CmpConv.Head); |
824 | for (MachineBasicBlock *RemovedMBB : Removed) { |
825 | MachineDomTreeNode *Node = DomTree->getNode(BB: RemovedMBB); |
826 | assert(Node != HeadNode && "Cannot erase the head node" ); |
827 | assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head" ); |
828 | while (Node->getNumChildren()) |
829 | DomTree->changeImmediateDominator(N: Node->back(), NewIDom: HeadNode); |
830 | DomTree->eraseNode(BB: RemovedMBB); |
831 | } |
832 | } |
833 | |
834 | /// Update LoopInfo after if-conversion. |
835 | void |
836 | AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) { |
837 | if (!Loops) |
838 | return; |
839 | for (MachineBasicBlock *RemovedMBB : Removed) |
840 | Loops->removeBlock(BB: RemovedMBB); |
841 | } |
842 | |
843 | /// Invalidate MachineTraceMetrics before if-conversion. |
844 | void AArch64ConditionalCompares::invalidateTraces() { |
845 | Traces->invalidate(MBB: CmpConv.Head); |
846 | Traces->invalidate(MBB: CmpConv.CmpBB); |
847 | } |
848 | |
849 | /// Apply cost model and heuristics to the if-conversion in IfConv. |
850 | /// Return true if the conversion is a good idea. |
851 | /// |
852 | bool AArch64ConditionalCompares::shouldConvert() { |
853 | // Stress testing mode disables all cost considerations. |
854 | if (Stress) |
855 | return true; |
856 | if (!MinInstr) |
857 | MinInstr = Traces->getEnsemble(MachineTraceStrategy::TS_MinInstrCount); |
858 | |
859 | // Head dominates CmpBB, so it is always included in its trace. |
860 | MachineTraceMetrics::Trace Trace = MinInstr->getTrace(MBB: CmpConv.CmpBB); |
861 | |
862 | // If code size is the main concern |
863 | if (MinSize) { |
864 | int CodeSizeDelta = CmpConv.expectedCodeSizeDelta(); |
865 | LLVM_DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n'); |
866 | // If we are minimizing the code size, do the conversion whatever |
867 | // the cost is. |
868 | if (CodeSizeDelta < 0) |
869 | return true; |
870 | if (CodeSizeDelta > 0) { |
871 | LLVM_DEBUG(dbgs() << "Code size is increasing, give up on this one.\n" ); |
872 | return false; |
873 | } |
874 | // CodeSizeDelta == 0, continue with the regular heuristics |
875 | } |
876 | |
877 | // Heuristic: The compare conversion delays the execution of the branch |
878 | // instruction because we must wait for the inputs to the second compare as |
879 | // well. The branch has no dependent instructions, but delaying it increases |
880 | // the cost of a misprediction. |
881 | // |
882 | // Set a limit on the delay we will accept. |
883 | unsigned DelayLimit = SchedModel.MispredictPenalty * 3 / 4; |
884 | |
885 | // Instruction depths can be computed for all trace instructions above CmpBB. |
886 | unsigned HeadDepth = |
887 | Trace.getInstrCycles(MI: *CmpConv.Head->getFirstTerminator()).Depth; |
888 | unsigned CmpBBDepth = |
889 | Trace.getInstrCycles(MI: *CmpConv.CmpBB->getFirstTerminator()).Depth; |
890 | LLVM_DEBUG(dbgs() << "Head depth: " << HeadDepth |
891 | << "\nCmpBB depth: " << CmpBBDepth << '\n'); |
892 | if (CmpBBDepth > HeadDepth + DelayLimit) { |
893 | LLVM_DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit |
894 | << " cycles.\n" ); |
895 | return false; |
896 | } |
897 | |
898 | // Check the resource depth at the bottom of CmpBB - these instructions will |
899 | // be speculated. |
900 | unsigned ResDepth = Trace.getResourceDepth(Bottom: true); |
901 | LLVM_DEBUG(dbgs() << "Resources: " << ResDepth << '\n'); |
902 | |
903 | // Heuristic: The speculatively executed instructions must all be able to |
904 | // merge into the Head block. The Head critical path should dominate the |
905 | // resource cost of the speculated instructions. |
906 | if (ResDepth > HeadDepth) { |
907 | LLVM_DEBUG(dbgs() << "Too many instructions to speculate.\n" ); |
908 | return false; |
909 | } |
910 | return true; |
911 | } |
912 | |
913 | bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { |
914 | bool Changed = false; |
915 | while (CmpConv.canConvert(MBB) && shouldConvert()) { |
916 | invalidateTraces(); |
917 | SmallVector<MachineBasicBlock *, 4> RemovedBlocks; |
918 | CmpConv.convert(RemovedBlocks); |
919 | Changed = true; |
920 | updateDomTree(Removed: RemovedBlocks); |
921 | updateLoops(Removed: RemovedBlocks); |
922 | } |
923 | return Changed; |
924 | } |
925 | |
926 | bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { |
927 | LLVM_DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" |
928 | << "********** Function: " << MF.getName() << '\n'); |
929 | if (skipFunction(F: MF.getFunction())) |
930 | return false; |
931 | |
932 | TII = MF.getSubtarget().getInstrInfo(); |
933 | TRI = MF.getSubtarget().getRegisterInfo(); |
934 | SchedModel = MF.getSubtarget().getSchedModel(); |
935 | MRI = &MF.getRegInfo(); |
936 | DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
937 | Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); |
938 | MBPI = &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI(); |
939 | Traces = &getAnalysis<MachineTraceMetrics>(); |
940 | MinInstr = nullptr; |
941 | MinSize = MF.getFunction().hasMinSize(); |
942 | |
943 | bool Changed = false; |
944 | CmpConv.runOnMachineFunction(MF, MBPI); |
945 | |
946 | // Visit blocks in dominator tree pre-order. The pre-order enables multiple |
947 | // cmp-conversions from the same head block. |
948 | // Note that updateDomTree() modifies the children of the DomTree node |
949 | // currently being visited. The df_iterator supports that; it doesn't look at |
950 | // child_begin() / child_end() until after a node has been visited. |
951 | for (auto *I : depth_first(G: DomTree)) |
952 | if (tryConvert(MBB: I->getBlock())) |
953 | Changed = true; |
954 | |
955 | return Changed; |
956 | } |
957 | |