1 | //===- MachineCSE.cpp - Machine Common Subexpression Elimination Pass -----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass performs global common subexpression elimination on machine |
10 | // instructions using a scoped hash table based value numbering scheme. It |
11 | // must be run while the machine function is still in SSA form. |
12 | // |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "llvm/ADT/DenseMap.h" |
16 | #include "llvm/ADT/ScopedHashTable.h" |
17 | #include "llvm/ADT/SmallPtrSet.h" |
18 | #include "llvm/ADT/SmallSet.h" |
19 | #include "llvm/ADT/SmallVector.h" |
20 | #include "llvm/ADT/Statistic.h" |
21 | #include "llvm/Analysis/AliasAnalysis.h" |
22 | #include "llvm/Analysis/CFG.h" |
23 | #include "llvm/CodeGen/MachineBasicBlock.h" |
24 | #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" |
25 | #include "llvm/CodeGen/MachineDominators.h" |
26 | #include "llvm/CodeGen/MachineFunction.h" |
27 | #include "llvm/CodeGen/MachineFunctionPass.h" |
28 | #include "llvm/CodeGen/MachineInstr.h" |
29 | #include "llvm/CodeGen/MachineOperand.h" |
30 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
31 | #include "llvm/CodeGen/Passes.h" |
32 | #include "llvm/CodeGen/TargetInstrInfo.h" |
33 | #include "llvm/CodeGen/TargetOpcodes.h" |
34 | #include "llvm/CodeGen/TargetRegisterInfo.h" |
35 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
36 | #include "llvm/InitializePasses.h" |
37 | #include "llvm/MC/MCRegister.h" |
38 | #include "llvm/MC/MCRegisterInfo.h" |
39 | #include "llvm/Pass.h" |
40 | #include "llvm/Support/Allocator.h" |
41 | #include "llvm/Support/Debug.h" |
42 | #include "llvm/Support/RecyclingAllocator.h" |
43 | #include "llvm/Support/raw_ostream.h" |
44 | #include <cassert> |
45 | #include <iterator> |
46 | #include <utility> |
47 | |
48 | using namespace llvm; |
49 | |
50 | #define DEBUG_TYPE "machine-cse" |
51 | |
52 | STATISTIC(NumCoalesces, "Number of copies coalesced" ); |
53 | STATISTIC(NumCSEs, "Number of common subexpression eliminated" ); |
54 | STATISTIC(NumPREs, "Number of partial redundant expression" |
55 | " transformed to fully redundant" ); |
56 | STATISTIC(NumPhysCSEs, |
57 | "Number of physreg referencing common subexpr eliminated" ); |
58 | STATISTIC(NumCrossBBCSEs, |
59 | "Number of cross-MBB physreg referencing CS eliminated" ); |
60 | STATISTIC(NumCommutes, "Number of copies coalesced after commuting" ); |
61 | |
62 | // Threshold to avoid excessive cost to compute isProfitableToCSE. |
63 | static cl::opt<int> |
64 | CSUsesThreshold("csuses-threshold" , cl::Hidden, cl::init(Val: 1024), |
65 | cl::desc("Threshold for the size of CSUses" )); |
66 | |
67 | static cl::opt<bool> AggressiveMachineCSE( |
68 | "aggressive-machine-cse" , cl::Hidden, cl::init(Val: false), |
69 | cl::desc("Override the profitability heuristics for Machine CSE" )); |
70 | |
71 | namespace { |
72 | |
73 | class MachineCSE : public MachineFunctionPass { |
74 | const TargetInstrInfo *TII = nullptr; |
75 | const TargetRegisterInfo *TRI = nullptr; |
76 | AliasAnalysis *AA = nullptr; |
77 | MachineDominatorTree *DT = nullptr; |
78 | MachineRegisterInfo *MRI = nullptr; |
79 | MachineBlockFrequencyInfo *MBFI = nullptr; |
80 | |
81 | public: |
82 | static char ID; // Pass identification |
83 | |
84 | MachineCSE() : MachineFunctionPass(ID) { |
85 | initializeMachineCSEPass(*PassRegistry::getPassRegistry()); |
86 | } |
87 | |
88 | bool runOnMachineFunction(MachineFunction &MF) override; |
89 | |
90 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
91 | AU.setPreservesCFG(); |
92 | MachineFunctionPass::getAnalysisUsage(AU); |
93 | AU.addRequired<AAResultsWrapperPass>(); |
94 | AU.addPreservedID(ID&: MachineLoopInfoID); |
95 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
96 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
97 | AU.addRequired<MachineBlockFrequencyInfoWrapperPass>(); |
98 | AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>(); |
99 | } |
100 | |
101 | MachineFunctionProperties getRequiredProperties() const override { |
102 | return MachineFunctionProperties() |
103 | .set(MachineFunctionProperties::Property::IsSSA); |
104 | } |
105 | |
106 | void releaseMemory() override { |
107 | ScopeMap.clear(); |
108 | PREMap.clear(); |
109 | Exps.clear(); |
110 | } |
111 | |
112 | private: |
113 | using AllocatorTy = RecyclingAllocator<BumpPtrAllocator, |
114 | ScopedHashTableVal<MachineInstr *, unsigned>>; |
115 | using ScopedHTType = |
116 | ScopedHashTable<MachineInstr *, unsigned, MachineInstrExpressionTrait, |
117 | AllocatorTy>; |
118 | using ScopeType = ScopedHTType::ScopeTy; |
119 | using PhysDefVector = SmallVector<std::pair<unsigned, unsigned>, 2>; |
120 | |
121 | unsigned LookAheadLimit = 0; |
122 | DenseMap<MachineBasicBlock *, ScopeType *> ScopeMap; |
123 | DenseMap<MachineInstr *, MachineBasicBlock *, MachineInstrExpressionTrait> |
124 | PREMap; |
125 | ScopedHTType VNT; |
126 | SmallVector<MachineInstr *, 64> Exps; |
127 | unsigned CurrVN = 0; |
128 | |
129 | bool PerformTrivialCopyPropagation(MachineInstr *MI, |
130 | MachineBasicBlock *MBB); |
131 | bool isPhysDefTriviallyDead(MCRegister Reg, |
132 | MachineBasicBlock::const_iterator I, |
133 | MachineBasicBlock::const_iterator E) const; |
134 | bool hasLivePhysRegDefUses(const MachineInstr *MI, |
135 | const MachineBasicBlock *MBB, |
136 | SmallSet<MCRegister, 8> &PhysRefs, |
137 | PhysDefVector &PhysDefs, bool &PhysUseDef) const; |
138 | bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, |
139 | SmallSet<MCRegister, 8> &PhysRefs, |
140 | PhysDefVector &PhysDefs, bool &NonLocal) const; |
141 | bool isCSECandidate(MachineInstr *MI); |
142 | bool isProfitableToCSE(Register CSReg, Register Reg, |
143 | MachineBasicBlock *CSBB, MachineInstr *MI); |
144 | void EnterScope(MachineBasicBlock *MBB); |
145 | void ExitScope(MachineBasicBlock *MBB); |
146 | bool ProcessBlockCSE(MachineBasicBlock *MBB); |
147 | void ExitScopeIfDone(MachineDomTreeNode *Node, |
148 | DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren); |
149 | bool PerformCSE(MachineDomTreeNode *Node); |
150 | |
151 | bool isPRECandidate(MachineInstr *MI, SmallSet<MCRegister, 8> &PhysRefs); |
152 | bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB); |
153 | bool PerformSimplePRE(MachineDominatorTree *DT); |
154 | /// Heuristics to see if it's profitable to move common computations of MBB |
155 | /// and MBB1 to CandidateBB. |
156 | bool isProfitableToHoistInto(MachineBasicBlock *CandidateBB, |
157 | MachineBasicBlock *MBB, |
158 | MachineBasicBlock *MBB1); |
159 | }; |
160 | |
161 | } // end anonymous namespace |
162 | |
163 | char MachineCSE::ID = 0; |
164 | |
165 | char &llvm::MachineCSEID = MachineCSE::ID; |
166 | |
167 | INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE, |
168 | "Machine Common Subexpression Elimination" , false, false) |
169 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) |
170 | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) |
171 | INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE, |
172 | "Machine Common Subexpression Elimination" , false, false) |
173 | |
174 | /// The source register of a COPY machine instruction can be propagated to all |
175 | /// its users, and this propagation could increase the probability of finding |
176 | /// common subexpressions. If the COPY has only one user, the COPY itself can |
177 | /// be removed. |
178 | bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI, |
179 | MachineBasicBlock *MBB) { |
180 | bool Changed = false; |
181 | for (MachineOperand &MO : MI->all_uses()) { |
182 | Register Reg = MO.getReg(); |
183 | if (!Reg.isVirtual()) |
184 | continue; |
185 | bool OnlyOneUse = MRI->hasOneNonDBGUse(RegNo: Reg); |
186 | MachineInstr *DefMI = MRI->getVRegDef(Reg); |
187 | if (!DefMI || !DefMI->isCopy()) |
188 | continue; |
189 | Register SrcReg = DefMI->getOperand(i: 1).getReg(); |
190 | if (!SrcReg.isVirtual()) |
191 | continue; |
192 | if (DefMI->getOperand(i: 0).getSubReg()) |
193 | continue; |
194 | // FIXME: We should trivially coalesce subregister copies to expose CSE |
195 | // opportunities on instructions with truncated operands (see |
196 | // cse-add-with-overflow.ll). This can be done here as follows: |
197 | // if (SrcSubReg) |
198 | // RC = TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), RC, |
199 | // SrcSubReg); |
200 | // MO.substVirtReg(SrcReg, SrcSubReg, *TRI); |
201 | // |
202 | // The 2-addr pass has been updated to handle coalesced subregs. However, |
203 | // some machine-specific code still can't handle it. |
204 | // To handle it properly we also need a way find a constrained subregister |
205 | // class given a super-reg class and subreg index. |
206 | if (DefMI->getOperand(i: 1).getSubReg()) |
207 | continue; |
208 | if (!MRI->constrainRegAttrs(Reg: SrcReg, ConstrainingReg: Reg)) |
209 | continue; |
210 | LLVM_DEBUG(dbgs() << "Coalescing: " << *DefMI); |
211 | LLVM_DEBUG(dbgs() << "*** to: " << *MI); |
212 | |
213 | // Propagate SrcReg of copies to MI. |
214 | MO.setReg(SrcReg); |
215 | MRI->clearKillFlags(Reg: SrcReg); |
216 | // Coalesce single use copies. |
217 | if (OnlyOneUse) { |
218 | // If (and only if) we've eliminated all uses of the copy, also |
219 | // copy-propagate to any debug-users of MI, or they'll be left using |
220 | // an undefined value. |
221 | DefMI->changeDebugValuesDefReg(Reg: SrcReg); |
222 | |
223 | DefMI->eraseFromParent(); |
224 | ++NumCoalesces; |
225 | } |
226 | Changed = true; |
227 | } |
228 | |
229 | return Changed; |
230 | } |
231 | |
232 | bool MachineCSE::isPhysDefTriviallyDead( |
233 | MCRegister Reg, MachineBasicBlock::const_iterator I, |
234 | MachineBasicBlock::const_iterator E) const { |
235 | unsigned LookAheadLeft = LookAheadLimit; |
236 | while (LookAheadLeft) { |
237 | // Skip over dbg_value's. |
238 | I = skipDebugInstructionsForward(It: I, End: E); |
239 | |
240 | if (I == E) |
241 | // Reached end of block, we don't know if register is dead or not. |
242 | return false; |
243 | |
244 | bool SeenDef = false; |
245 | for (const MachineOperand &MO : I->operands()) { |
246 | if (MO.isRegMask() && MO.clobbersPhysReg(PhysReg: Reg)) |
247 | SeenDef = true; |
248 | if (!MO.isReg() || !MO.getReg()) |
249 | continue; |
250 | if (!TRI->regsOverlap(RegA: MO.getReg(), RegB: Reg)) |
251 | continue; |
252 | if (MO.isUse()) |
253 | // Found a use! |
254 | return false; |
255 | SeenDef = true; |
256 | } |
257 | if (SeenDef) |
258 | // See a def of Reg (or an alias) before encountering any use, it's |
259 | // trivially dead. |
260 | return true; |
261 | |
262 | --LookAheadLeft; |
263 | ++I; |
264 | } |
265 | return false; |
266 | } |
267 | |
268 | static bool isCallerPreservedOrConstPhysReg(MCRegister Reg, |
269 | const MachineOperand &MO, |
270 | const MachineFunction &MF, |
271 | const TargetRegisterInfo &TRI, |
272 | const TargetInstrInfo &TII) { |
273 | // MachineRegisterInfo::isConstantPhysReg directly called by |
274 | // MachineRegisterInfo::isCallerPreservedOrConstPhysReg expects the |
275 | // reserved registers to be frozen. That doesn't cause a problem post-ISel as |
276 | // most (if not all) targets freeze reserved registers right after ISel. |
277 | // |
278 | // It does cause issues mid-GlobalISel, however, hence the additional |
279 | // reservedRegsFrozen check. |
280 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
281 | return TRI.isCallerPreservedPhysReg(PhysReg: Reg, MF) || TII.isIgnorableUse(MO) || |
282 | (MRI.reservedRegsFrozen() && MRI.isConstantPhysReg(PhysReg: Reg)); |
283 | } |
284 | |
285 | /// hasLivePhysRegDefUses - Return true if the specified instruction read/write |
286 | /// physical registers (except for dead defs of physical registers). It also |
287 | /// returns the physical register def by reference if it's the only one and the |
288 | /// instruction does not uses a physical register. |
289 | bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, |
290 | const MachineBasicBlock *MBB, |
291 | SmallSet<MCRegister, 8> &PhysRefs, |
292 | PhysDefVector &PhysDefs, |
293 | bool &PhysUseDef) const { |
294 | // First, add all uses to PhysRefs. |
295 | for (const MachineOperand &MO : MI->all_uses()) { |
296 | Register Reg = MO.getReg(); |
297 | if (!Reg) |
298 | continue; |
299 | if (Reg.isVirtual()) |
300 | continue; |
301 | // Reading either caller preserved or constant physregs is ok. |
302 | if (!isCallerPreservedOrConstPhysReg(Reg: Reg.asMCReg(), MO, MF: *MI->getMF(), TRI: *TRI, |
303 | TII: *TII)) |
304 | for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) |
305 | PhysRefs.insert(V: *AI); |
306 | } |
307 | |
308 | // Next, collect all defs into PhysDefs. If any is already in PhysRefs |
309 | // (which currently contains only uses), set the PhysUseDef flag. |
310 | PhysUseDef = false; |
311 | MachineBasicBlock::const_iterator I = MI; I = std::next(x: I); |
312 | for (const auto &MOP : llvm::enumerate(First: MI->operands())) { |
313 | const MachineOperand &MO = MOP.value(); |
314 | if (!MO.isReg() || !MO.isDef()) |
315 | continue; |
316 | Register Reg = MO.getReg(); |
317 | if (!Reg) |
318 | continue; |
319 | if (Reg.isVirtual()) |
320 | continue; |
321 | // Check against PhysRefs even if the def is "dead". |
322 | if (PhysRefs.count(V: Reg.asMCReg())) |
323 | PhysUseDef = true; |
324 | // If the def is dead, it's ok. But the def may not marked "dead". That's |
325 | // common since this pass is run before livevariables. We can scan |
326 | // forward a few instructions and check if it is obviously dead. |
327 | if (!MO.isDead() && !isPhysDefTriviallyDead(Reg: Reg.asMCReg(), I, E: MBB->end())) |
328 | PhysDefs.push_back(Elt: std::make_pair(x: MOP.index(), y&: Reg)); |
329 | } |
330 | |
331 | // Finally, add all defs to PhysRefs as well. |
332 | for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i) |
333 | for (MCRegAliasIterator AI(PhysDefs[i].second, TRI, true); AI.isValid(); |
334 | ++AI) |
335 | PhysRefs.insert(V: *AI); |
336 | |
337 | return !PhysRefs.empty(); |
338 | } |
339 | |
340 | bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, |
341 | SmallSet<MCRegister, 8> &PhysRefs, |
342 | PhysDefVector &PhysDefs, |
343 | bool &NonLocal) const { |
344 | // For now conservatively returns false if the common subexpression is |
345 | // not in the same basic block as the given instruction. The only exception |
346 | // is if the common subexpression is in the sole predecessor block. |
347 | const MachineBasicBlock *MBB = MI->getParent(); |
348 | const MachineBasicBlock *CSMBB = CSMI->getParent(); |
349 | |
350 | bool CrossMBB = false; |
351 | if (CSMBB != MBB) { |
352 | if (MBB->pred_size() != 1 || *MBB->pred_begin() != CSMBB) |
353 | return false; |
354 | |
355 | for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i) { |
356 | if (MRI->isAllocatable(PhysReg: PhysDefs[i].second) || |
357 | MRI->isReserved(PhysReg: PhysDefs[i].second)) |
358 | // Avoid extending live range of physical registers if they are |
359 | //allocatable or reserved. |
360 | return false; |
361 | } |
362 | CrossMBB = true; |
363 | } |
364 | MachineBasicBlock::const_iterator I = CSMI; I = std::next(x: I); |
365 | MachineBasicBlock::const_iterator E = MI; |
366 | MachineBasicBlock::const_iterator EE = CSMBB->end(); |
367 | unsigned LookAheadLeft = LookAheadLimit; |
368 | while (LookAheadLeft) { |
369 | // Skip over dbg_value's. |
370 | while (I != E && I != EE && I->isDebugInstr()) |
371 | ++I; |
372 | |
373 | if (I == EE) { |
374 | assert(CrossMBB && "Reaching end-of-MBB without finding MI?" ); |
375 | (void)CrossMBB; |
376 | CrossMBB = false; |
377 | NonLocal = true; |
378 | I = MBB->begin(); |
379 | EE = MBB->end(); |
380 | continue; |
381 | } |
382 | |
383 | if (I == E) |
384 | return true; |
385 | |
386 | for (const MachineOperand &MO : I->operands()) { |
387 | // RegMasks go on instructions like calls that clobber lots of physregs. |
388 | // Don't attempt to CSE across such an instruction. |
389 | if (MO.isRegMask()) |
390 | return false; |
391 | if (!MO.isReg() || !MO.isDef()) |
392 | continue; |
393 | Register MOReg = MO.getReg(); |
394 | if (MOReg.isVirtual()) |
395 | continue; |
396 | if (PhysRefs.count(V: MOReg.asMCReg())) |
397 | return false; |
398 | } |
399 | |
400 | --LookAheadLeft; |
401 | ++I; |
402 | } |
403 | |
404 | return false; |
405 | } |
406 | |
407 | bool MachineCSE::isCSECandidate(MachineInstr *MI) { |
408 | if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() || |
409 | MI->isInlineAsm() || MI->isDebugInstr() || MI->isJumpTableDebugInfo()) |
410 | return false; |
411 | |
412 | // Ignore copies. |
413 | if (MI->isCopyLike()) |
414 | return false; |
415 | |
416 | // Ignore stuff that we obviously can't move. |
417 | if (MI->mayStore() || MI->isCall() || MI->isTerminator() || |
418 | MI->mayRaiseFPException() || MI->hasUnmodeledSideEffects()) |
419 | return false; |
420 | |
421 | if (MI->mayLoad()) { |
422 | // Okay, this instruction does a load. As a refinement, we allow the target |
423 | // to decide whether the loaded value is actually a constant. If so, we can |
424 | // actually use it as a load. |
425 | if (!MI->isDereferenceableInvariantLoad()) |
426 | // FIXME: we should be able to hoist loads with no other side effects if |
427 | // there are no other instructions which can change memory in this loop. |
428 | // This is a trivial form of alias analysis. |
429 | return false; |
430 | } |
431 | |
432 | // Ignore stack guard loads, otherwise the register that holds CSEed value may |
433 | // be spilled and get loaded back with corrupted data. |
434 | if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD) |
435 | return false; |
436 | |
437 | return true; |
438 | } |
439 | |
440 | /// isProfitableToCSE - Return true if it's profitable to eliminate MI with a |
441 | /// common expression that defines Reg. CSBB is basic block where CSReg is |
442 | /// defined. |
443 | bool MachineCSE::isProfitableToCSE(Register CSReg, Register Reg, |
444 | MachineBasicBlock *CSBB, MachineInstr *MI) { |
445 | if (AggressiveMachineCSE) |
446 | return true; |
447 | |
448 | // FIXME: Heuristics that works around the lack the live range splitting. |
449 | |
450 | // If CSReg is used at all uses of Reg, CSE should not increase register |
451 | // pressure of CSReg. |
452 | bool MayIncreasePressure = true; |
453 | if (CSReg.isVirtual() && Reg.isVirtual()) { |
454 | MayIncreasePressure = false; |
455 | SmallPtrSet<MachineInstr*, 8> CSUses; |
456 | int NumOfUses = 0; |
457 | for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg: CSReg)) { |
458 | CSUses.insert(Ptr: &MI); |
459 | // Too costly to compute if NumOfUses is very large. Conservatively assume |
460 | // MayIncreasePressure to avoid spending too much time here. |
461 | if (++NumOfUses > CSUsesThreshold) { |
462 | MayIncreasePressure = true; |
463 | break; |
464 | } |
465 | } |
466 | if (!MayIncreasePressure) |
467 | for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) { |
468 | if (!CSUses.count(Ptr: &MI)) { |
469 | MayIncreasePressure = true; |
470 | break; |
471 | } |
472 | } |
473 | } |
474 | if (!MayIncreasePressure) return true; |
475 | |
476 | // Heuristics #1: Don't CSE "cheap" computation if the def is not local or in |
477 | // an immediate predecessor. We don't want to increase register pressure and |
478 | // end up causing other computation to be spilled. |
479 | if (TII->isAsCheapAsAMove(MI: *MI)) { |
480 | MachineBasicBlock *BB = MI->getParent(); |
481 | if (CSBB != BB && !CSBB->isSuccessor(MBB: BB)) |
482 | return false; |
483 | } |
484 | |
485 | // Heuristics #2: If the expression doesn't not use a vr and the only use |
486 | // of the redundant computation are copies, do not cse. |
487 | bool HasVRegUse = false; |
488 | for (const MachineOperand &MO : MI->all_uses()) { |
489 | if (MO.getReg().isVirtual()) { |
490 | HasVRegUse = true; |
491 | break; |
492 | } |
493 | } |
494 | if (!HasVRegUse) { |
495 | bool HasNonCopyUse = false; |
496 | for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) { |
497 | // Ignore copies. |
498 | if (!MI.isCopyLike()) { |
499 | HasNonCopyUse = true; |
500 | break; |
501 | } |
502 | } |
503 | if (!HasNonCopyUse) |
504 | return false; |
505 | } |
506 | |
507 | // Heuristics #3: If the common subexpression is used by PHIs, do not reuse |
508 | // it unless the defined value is already used in the BB of the new use. |
509 | bool HasPHI = false; |
510 | for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg: CSReg)) { |
511 | HasPHI |= UseMI.isPHI(); |
512 | if (UseMI.getParent() == MI->getParent()) |
513 | return true; |
514 | } |
515 | |
516 | return !HasPHI; |
517 | } |
518 | |
519 | void MachineCSE::EnterScope(MachineBasicBlock *MBB) { |
520 | LLVM_DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n'); |
521 | ScopeType *Scope = new ScopeType(VNT); |
522 | ScopeMap[MBB] = Scope; |
523 | } |
524 | |
525 | void MachineCSE::ExitScope(MachineBasicBlock *MBB) { |
526 | LLVM_DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n'); |
527 | DenseMap<MachineBasicBlock*, ScopeType*>::iterator SI = ScopeMap.find(Val: MBB); |
528 | assert(SI != ScopeMap.end()); |
529 | delete SI->second; |
530 | ScopeMap.erase(I: SI); |
531 | } |
532 | |
533 | bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) { |
534 | bool Changed = false; |
535 | |
536 | SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs; |
537 | SmallVector<unsigned, 2> ImplicitDefsToUpdate; |
538 | SmallVector<unsigned, 2> ImplicitDefs; |
539 | for (MachineInstr &MI : llvm::make_early_inc_range(Range&: *MBB)) { |
540 | if (!isCSECandidate(MI: &MI)) |
541 | continue; |
542 | |
543 | bool FoundCSE = VNT.count(Key: &MI); |
544 | if (!FoundCSE) { |
545 | // Using trivial copy propagation to find more CSE opportunities. |
546 | if (PerformTrivialCopyPropagation(MI: &MI, MBB)) { |
547 | Changed = true; |
548 | |
549 | // After coalescing MI itself may become a copy. |
550 | if (MI.isCopyLike()) |
551 | continue; |
552 | |
553 | // Try again to see if CSE is possible. |
554 | FoundCSE = VNT.count(Key: &MI); |
555 | } |
556 | } |
557 | |
558 | // Commute commutable instructions. |
559 | bool Commuted = false; |
560 | if (!FoundCSE && MI.isCommutable()) { |
561 | if (MachineInstr *NewMI = TII->commuteInstruction(MI)) { |
562 | Commuted = true; |
563 | FoundCSE = VNT.count(Key: NewMI); |
564 | if (NewMI != &MI) { |
565 | // New instruction. It doesn't need to be kept. |
566 | NewMI->eraseFromParent(); |
567 | Changed = true; |
568 | } else if (!FoundCSE) |
569 | // MI was changed but it didn't help, commute it back! |
570 | (void)TII->commuteInstruction(MI); |
571 | } |
572 | } |
573 | |
574 | // If the instruction defines physical registers and the values *may* be |
575 | // used, then it's not safe to replace it with a common subexpression. |
576 | // It's also not safe if the instruction uses physical registers. |
577 | bool CrossMBBPhysDef = false; |
578 | SmallSet<MCRegister, 8> PhysRefs; |
579 | PhysDefVector PhysDefs; |
580 | bool PhysUseDef = false; |
581 | if (FoundCSE && |
582 | hasLivePhysRegDefUses(MI: &MI, MBB, PhysRefs, PhysDefs, PhysUseDef)) { |
583 | FoundCSE = false; |
584 | |
585 | // ... Unless the CS is local or is in the sole predecessor block |
586 | // and it also defines the physical register which is not clobbered |
587 | // in between and the physical register uses were not clobbered. |
588 | // This can never be the case if the instruction both uses and |
589 | // defines the same physical register, which was detected above. |
590 | if (!PhysUseDef) { |
591 | unsigned CSVN = VNT.lookup(Key: &MI); |
592 | MachineInstr *CSMI = Exps[CSVN]; |
593 | if (PhysRegDefsReach(CSMI, MI: &MI, PhysRefs, PhysDefs, NonLocal&: CrossMBBPhysDef)) |
594 | FoundCSE = true; |
595 | } |
596 | } |
597 | |
598 | if (!FoundCSE) { |
599 | VNT.insert(Key: &MI, Val: CurrVN++); |
600 | Exps.push_back(Elt: &MI); |
601 | continue; |
602 | } |
603 | |
604 | // Found a common subexpression, eliminate it. |
605 | unsigned CSVN = VNT.lookup(Key: &MI); |
606 | MachineInstr *CSMI = Exps[CSVN]; |
607 | LLVM_DEBUG(dbgs() << "Examining: " << MI); |
608 | LLVM_DEBUG(dbgs() << "*** Found a common subexpression: " << *CSMI); |
609 | |
610 | // Prevent CSE-ing non-local convergent instructions. |
611 | // LLVM's current definition of `isConvergent` does not necessarily prove |
612 | // that non-local CSE is illegal. The following check extends the definition |
613 | // of `isConvergent` to assume a convergent instruction is dependent not |
614 | // only on additional conditions, but also on fewer conditions. LLVM does |
615 | // not have a MachineInstr attribute which expresses this extended |
616 | // definition, so it's necessary to use `isConvergent` to prevent illegally |
617 | // CSE-ing the subset of `isConvergent` instructions which do fall into this |
618 | // extended definition. |
619 | if (MI.isConvergent() && MI.getParent() != CSMI->getParent()) { |
620 | LLVM_DEBUG(dbgs() << "*** Convergent MI and subexpression exist in " |
621 | "different BBs, avoid CSE!\n" ); |
622 | VNT.insert(Key: &MI, Val: CurrVN++); |
623 | Exps.push_back(Elt: &MI); |
624 | continue; |
625 | } |
626 | |
627 | // Check if it's profitable to perform this CSE. |
628 | bool DoCSE = true; |
629 | unsigned NumDefs = MI.getNumDefs(); |
630 | |
631 | for (unsigned i = 0, e = MI.getNumOperands(); NumDefs && i != e; ++i) { |
632 | MachineOperand &MO = MI.getOperand(i); |
633 | if (!MO.isReg() || !MO.isDef()) |
634 | continue; |
635 | Register OldReg = MO.getReg(); |
636 | Register NewReg = CSMI->getOperand(i).getReg(); |
637 | |
638 | // Go through implicit defs of CSMI and MI, if a def is not dead at MI, |
639 | // we should make sure it is not dead at CSMI. |
640 | if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead()) |
641 | ImplicitDefsToUpdate.push_back(Elt: i); |
642 | |
643 | // Keep track of implicit defs of CSMI and MI, to clear possibly |
644 | // made-redundant kill flags. |
645 | if (MO.isImplicit() && !MO.isDead() && OldReg == NewReg) |
646 | ImplicitDefs.push_back(Elt: OldReg); |
647 | |
648 | if (OldReg == NewReg) { |
649 | --NumDefs; |
650 | continue; |
651 | } |
652 | |
653 | assert(OldReg.isVirtual() && NewReg.isVirtual() && |
654 | "Do not CSE physical register defs!" ); |
655 | |
656 | if (!isProfitableToCSE(CSReg: NewReg, Reg: OldReg, CSBB: CSMI->getParent(), MI: &MI)) { |
657 | LLVM_DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n" ); |
658 | DoCSE = false; |
659 | break; |
660 | } |
661 | |
662 | // Don't perform CSE if the result of the new instruction cannot exist |
663 | // within the constraints (register class, bank, or low-level type) of |
664 | // the old instruction. |
665 | if (!MRI->constrainRegAttrs(Reg: NewReg, ConstrainingReg: OldReg)) { |
666 | LLVM_DEBUG( |
667 | dbgs() << "*** Not the same register constraints, avoid CSE!\n" ); |
668 | DoCSE = false; |
669 | break; |
670 | } |
671 | |
672 | CSEPairs.push_back(Elt: std::make_pair(x&: OldReg, y&: NewReg)); |
673 | --NumDefs; |
674 | } |
675 | |
676 | // Actually perform the elimination. |
677 | if (DoCSE) { |
678 | for (const std::pair<unsigned, unsigned> &CSEPair : CSEPairs) { |
679 | unsigned OldReg = CSEPair.first; |
680 | unsigned NewReg = CSEPair.second; |
681 | // OldReg may have been unused but is used now, clear the Dead flag |
682 | MachineInstr *Def = MRI->getUniqueVRegDef(Reg: NewReg); |
683 | assert(Def != nullptr && "CSEd register has no unique definition?" ); |
684 | Def->clearRegisterDeads(Reg: NewReg); |
685 | // Replace with NewReg and clear kill flags which may be wrong now. |
686 | MRI->replaceRegWith(FromReg: OldReg, ToReg: NewReg); |
687 | MRI->clearKillFlags(Reg: NewReg); |
688 | } |
689 | |
690 | // Go through implicit defs of CSMI and MI, if a def is not dead at MI, |
691 | // we should make sure it is not dead at CSMI. |
692 | for (unsigned ImplicitDefToUpdate : ImplicitDefsToUpdate) |
693 | CSMI->getOperand(i: ImplicitDefToUpdate).setIsDead(false); |
694 | for (const auto &PhysDef : PhysDefs) |
695 | if (!MI.getOperand(i: PhysDef.first).isDead()) |
696 | CSMI->getOperand(i: PhysDef.first).setIsDead(false); |
697 | |
698 | // Go through implicit defs of CSMI and MI, and clear the kill flags on |
699 | // their uses in all the instructions between CSMI and MI. |
700 | // We might have made some of the kill flags redundant, consider: |
701 | // subs ... implicit-def %nzcv <- CSMI |
702 | // csinc ... implicit killed %nzcv <- this kill flag isn't valid anymore |
703 | // subs ... implicit-def %nzcv <- MI, to be eliminated |
704 | // csinc ... implicit killed %nzcv |
705 | // Since we eliminated MI, and reused a register imp-def'd by CSMI |
706 | // (here %nzcv), that register, if it was killed before MI, should have |
707 | // that kill flag removed, because it's lifetime was extended. |
708 | if (CSMI->getParent() == MI.getParent()) { |
709 | for (MachineBasicBlock::iterator II = CSMI, IE = &MI; II != IE; ++II) |
710 | for (auto ImplicitDef : ImplicitDefs) |
711 | if (MachineOperand *MO = II->findRegisterUseOperand( |
712 | Reg: ImplicitDef, TRI, /*isKill=*/true)) |
713 | MO->setIsKill(false); |
714 | } else { |
715 | // If the instructions aren't in the same BB, bail out and clear the |
716 | // kill flag on all uses of the imp-def'd register. |
717 | for (auto ImplicitDef : ImplicitDefs) |
718 | MRI->clearKillFlags(Reg: ImplicitDef); |
719 | } |
720 | |
721 | if (CrossMBBPhysDef) { |
722 | // Add physical register defs now coming in from a predecessor to MBB |
723 | // livein list. |
724 | while (!PhysDefs.empty()) { |
725 | auto LiveIn = PhysDefs.pop_back_val(); |
726 | if (!MBB->isLiveIn(Reg: LiveIn.second)) |
727 | MBB->addLiveIn(PhysReg: LiveIn.second); |
728 | } |
729 | ++NumCrossBBCSEs; |
730 | } |
731 | |
732 | MI.eraseFromParent(); |
733 | ++NumCSEs; |
734 | if (!PhysRefs.empty()) |
735 | ++NumPhysCSEs; |
736 | if (Commuted) |
737 | ++NumCommutes; |
738 | Changed = true; |
739 | } else { |
740 | VNT.insert(Key: &MI, Val: CurrVN++); |
741 | Exps.push_back(Elt: &MI); |
742 | } |
743 | CSEPairs.clear(); |
744 | ImplicitDefsToUpdate.clear(); |
745 | ImplicitDefs.clear(); |
746 | } |
747 | |
748 | return Changed; |
749 | } |
750 | |
751 | /// ExitScopeIfDone - Destroy scope for the MBB that corresponds to the given |
752 | /// dominator tree node if its a leaf or all of its children are done. Walk |
753 | /// up the dominator tree to destroy ancestors which are now done. |
754 | void |
755 | MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, |
756 | DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren) { |
757 | if (OpenChildren[Node]) |
758 | return; |
759 | |
760 | // Pop scope. |
761 | ExitScope(MBB: Node->getBlock()); |
762 | |
763 | // Now traverse upwards to pop ancestors whose offsprings are all done. |
764 | while (MachineDomTreeNode *Parent = Node->getIDom()) { |
765 | unsigned Left = --OpenChildren[Parent]; |
766 | if (Left != 0) |
767 | break; |
768 | ExitScope(MBB: Parent->getBlock()); |
769 | Node = Parent; |
770 | } |
771 | } |
772 | |
773 | bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { |
774 | SmallVector<MachineDomTreeNode*, 32> Scopes; |
775 | SmallVector<MachineDomTreeNode*, 8> WorkList; |
776 | DenseMap<MachineDomTreeNode*, unsigned> OpenChildren; |
777 | |
778 | CurrVN = 0; |
779 | |
780 | // Perform a DFS walk to determine the order of visit. |
781 | WorkList.push_back(Elt: Node); |
782 | do { |
783 | Node = WorkList.pop_back_val(); |
784 | Scopes.push_back(Elt: Node); |
785 | OpenChildren[Node] = Node->getNumChildren(); |
786 | append_range(C&: WorkList, R: Node->children()); |
787 | } while (!WorkList.empty()); |
788 | |
789 | // Now perform CSE. |
790 | bool Changed = false; |
791 | for (MachineDomTreeNode *Node : Scopes) { |
792 | MachineBasicBlock *MBB = Node->getBlock(); |
793 | EnterScope(MBB); |
794 | Changed |= ProcessBlockCSE(MBB); |
795 | // If it's a leaf node, it's done. Traverse upwards to pop ancestors. |
796 | ExitScopeIfDone(Node, OpenChildren); |
797 | } |
798 | |
799 | return Changed; |
800 | } |
801 | |
802 | // We use stronger checks for PRE candidate rather than for CSE ones to embrace |
803 | // checks inside ProcessBlockCSE(), not only inside isCSECandidate(). This helps |
804 | // to exclude instrs created by PRE that won't be CSEed later. |
805 | bool MachineCSE::isPRECandidate(MachineInstr *MI, |
806 | SmallSet<MCRegister, 8> &PhysRefs) { |
807 | if (!isCSECandidate(MI) || |
808 | MI->isNotDuplicable() || |
809 | MI->mayLoad() || |
810 | TII->isAsCheapAsAMove(MI: *MI) || |
811 | MI->getNumDefs() != 1 || |
812 | MI->getNumExplicitDefs() != 1) |
813 | return false; |
814 | |
815 | for (const MachineOperand &MO : MI->operands()) { |
816 | if (MO.isReg() && !MO.getReg().isVirtual()) { |
817 | if (MO.isDef()) |
818 | return false; |
819 | else |
820 | PhysRefs.insert(V: MO.getReg()); |
821 | } |
822 | } |
823 | |
824 | return true; |
825 | } |
826 | |
827 | bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT, |
828 | MachineBasicBlock *MBB) { |
829 | bool Changed = false; |
830 | for (MachineInstr &MI : llvm::make_early_inc_range(Range&: *MBB)) { |
831 | SmallSet<MCRegister, 8> PhysRefs; |
832 | if (!isPRECandidate(MI: &MI, PhysRefs)) |
833 | continue; |
834 | |
835 | if (!PREMap.count(Val: &MI)) { |
836 | PREMap[&MI] = MBB; |
837 | continue; |
838 | } |
839 | |
840 | auto MBB1 = PREMap[&MI]; |
841 | assert( |
842 | !DT->properlyDominates(MBB, MBB1) && |
843 | "MBB cannot properly dominate MBB1 while DFS through dominators tree!" ); |
844 | auto CMBB = DT->findNearestCommonDominator(A: MBB, B: MBB1); |
845 | if (!CMBB->isLegalToHoistInto()) |
846 | continue; |
847 | |
848 | if (!isProfitableToHoistInto(CandidateBB: CMBB, MBB, MBB1)) |
849 | continue; |
850 | |
851 | // Two instrs are partial redundant if their basic blocks are reachable |
852 | // from one to another but one doesn't dominate another. |
853 | if (CMBB != MBB1) { |
854 | auto BB = MBB->getBasicBlock(), BB1 = MBB1->getBasicBlock(); |
855 | if (BB != nullptr && BB1 != nullptr && |
856 | (isPotentiallyReachable(From: BB1, To: BB) || |
857 | isPotentiallyReachable(From: BB, To: BB1))) { |
858 | // The following check extends the definition of `isConvergent` to |
859 | // assume a convergent instruction is dependent not only on additional |
860 | // conditions, but also on fewer conditions. LLVM does not have a |
861 | // MachineInstr attribute which expresses this extended definition, so |
862 | // it's necessary to use `isConvergent` to prevent illegally PRE-ing the |
863 | // subset of `isConvergent` instructions which do fall into this |
864 | // extended definition. |
865 | if (MI.isConvergent() && CMBB != MBB) |
866 | continue; |
867 | |
868 | // If this instruction uses physical registers then we can only do PRE |
869 | // if it's using the value that is live at the place we're hoisting to. |
870 | bool NonLocal; |
871 | PhysDefVector PhysDefs; |
872 | if (!PhysRefs.empty() && |
873 | !PhysRegDefsReach(CSMI: &*(CMBB->getFirstTerminator()), MI: &MI, PhysRefs, |
874 | PhysDefs, NonLocal)) |
875 | continue; |
876 | |
877 | assert(MI.getOperand(0).isDef() && |
878 | "First operand of instr with one explicit def must be this def" ); |
879 | Register VReg = MI.getOperand(i: 0).getReg(); |
880 | Register NewReg = MRI->cloneVirtualRegister(VReg); |
881 | if (!isProfitableToCSE(CSReg: NewReg, Reg: VReg, CSBB: CMBB, MI: &MI)) |
882 | continue; |
883 | MachineInstr &NewMI = |
884 | TII->duplicate(MBB&: *CMBB, InsertBefore: CMBB->getFirstTerminator(), Orig: MI); |
885 | |
886 | // When hoisting, make sure we don't carry the debug location of |
887 | // the original instruction, as that's not correct and can cause |
888 | // unexpected jumps when debugging optimized code. |
889 | auto EmptyDL = DebugLoc(); |
890 | NewMI.setDebugLoc(EmptyDL); |
891 | |
892 | NewMI.getOperand(i: 0).setReg(NewReg); |
893 | |
894 | PREMap[&MI] = CMBB; |
895 | ++NumPREs; |
896 | Changed = true; |
897 | } |
898 | } |
899 | } |
900 | return Changed; |
901 | } |
902 | |
903 | // This simple PRE (partial redundancy elimination) pass doesn't actually |
904 | // eliminate partial redundancy but transforms it to full redundancy, |
905 | // anticipating that the next CSE step will eliminate this created redundancy. |
906 | // If CSE doesn't eliminate this, than created instruction will remain dead |
907 | // and eliminated later by Remove Dead Machine Instructions pass. |
908 | bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) { |
909 | SmallVector<MachineDomTreeNode *, 32> BBs; |
910 | |
911 | PREMap.clear(); |
912 | bool Changed = false; |
913 | BBs.push_back(Elt: DT->getRootNode()); |
914 | do { |
915 | auto Node = BBs.pop_back_val(); |
916 | append_range(C&: BBs, R: Node->children()); |
917 | |
918 | MachineBasicBlock *MBB = Node->getBlock(); |
919 | Changed |= ProcessBlockPRE(DT, MBB); |
920 | |
921 | } while (!BBs.empty()); |
922 | |
923 | return Changed; |
924 | } |
925 | |
926 | bool MachineCSE::isProfitableToHoistInto(MachineBasicBlock *CandidateBB, |
927 | MachineBasicBlock *MBB, |
928 | MachineBasicBlock *MBB1) { |
929 | if (CandidateBB->getParent()->getFunction().hasMinSize()) |
930 | return true; |
931 | assert(DT->dominates(CandidateBB, MBB) && "CandidateBB should dominate MBB" ); |
932 | assert(DT->dominates(CandidateBB, MBB1) && |
933 | "CandidateBB should dominate MBB1" ); |
934 | return MBFI->getBlockFreq(MBB: CandidateBB) <= |
935 | MBFI->getBlockFreq(MBB) + MBFI->getBlockFreq(MBB: MBB1); |
936 | } |
937 | |
938 | bool MachineCSE::runOnMachineFunction(MachineFunction &MF) { |
939 | if (skipFunction(F: MF.getFunction())) |
940 | return false; |
941 | |
942 | TII = MF.getSubtarget().getInstrInfo(); |
943 | TRI = MF.getSubtarget().getRegisterInfo(); |
944 | MRI = &MF.getRegInfo(); |
945 | AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
946 | DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
947 | MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI(); |
948 | LookAheadLimit = TII->getMachineCSELookAheadLimit(); |
949 | bool ChangedPRE, ChangedCSE; |
950 | ChangedPRE = PerformSimplePRE(DT); |
951 | ChangedCSE = PerformCSE(Node: DT->getRootNode()); |
952 | return ChangedPRE || ChangedCSE; |
953 | } |
954 | |