1 | //===- MachineCSE.cpp - Machine Common Subexpression Elimination Pass -----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass performs global common subexpression elimination on machine |
10 | // instructions using a scoped hash table based value numbering scheme. It |
11 | // must be run while the machine function is still in SSA form. |
12 | // |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "llvm/CodeGen/MachineCSE.h" |
16 | #include "llvm/ADT/DenseMap.h" |
17 | #include "llvm/ADT/ScopedHashTable.h" |
18 | #include "llvm/ADT/SmallPtrSet.h" |
19 | #include "llvm/ADT/SmallSet.h" |
20 | #include "llvm/ADT/SmallVector.h" |
21 | #include "llvm/ADT/Statistic.h" |
22 | #include "llvm/Analysis/CFG.h" |
23 | #include "llvm/CodeGen/MachineBasicBlock.h" |
24 | #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" |
25 | #include "llvm/CodeGen/MachineDominators.h" |
26 | #include "llvm/CodeGen/MachineFunction.h" |
27 | #include "llvm/CodeGen/MachineFunctionPass.h" |
28 | #include "llvm/CodeGen/MachineInstr.h" |
29 | #include "llvm/CodeGen/MachineLoopInfo.h" |
30 | #include "llvm/CodeGen/MachineOperand.h" |
31 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
32 | #include "llvm/CodeGen/Passes.h" |
33 | #include "llvm/CodeGen/TargetInstrInfo.h" |
34 | #include "llvm/CodeGen/TargetOpcodes.h" |
35 | #include "llvm/CodeGen/TargetRegisterInfo.h" |
36 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
37 | #include "llvm/InitializePasses.h" |
38 | #include "llvm/MC/MCRegister.h" |
39 | #include "llvm/MC/MCRegisterInfo.h" |
40 | #include "llvm/Pass.h" |
41 | #include "llvm/Support/Allocator.h" |
42 | #include "llvm/Support/Debug.h" |
43 | #include "llvm/Support/RecyclingAllocator.h" |
44 | #include "llvm/Support/raw_ostream.h" |
45 | #include <cassert> |
46 | #include <iterator> |
47 | #include <utility> |
48 | |
49 | using namespace llvm; |
50 | |
51 | #define DEBUG_TYPE "machine-cse" |
52 | |
STATISTIC(NumCoalesces, "Number of copies coalesced");
STATISTIC(NumCSEs, "Number of common subexpressions eliminated");
STATISTIC(NumPREs, "Number of partially redundant expressions"
                   " transformed to fully redundant ones");
STATISTIC(NumPhysCSEs,
          "Number of physreg-referencing common subexprs eliminated");
STATISTIC(NumCrossBBCSEs,
          "Number of cross-MBB physreg-referencing CSes eliminated");
STATISTIC(NumCommutes, "Number of copies coalesced after commuting");
62 | |
63 | // Threshold to avoid excessive cost to compute isProfitableToCSE. |
static cl::opt<int>
    CSUsesThreshold("csuses-threshold", cl::Hidden, cl::init(1024),
                    cl::desc("Threshold for the size of CSUses"));

static cl::opt<bool> AggressiveMachineCSE(
    "aggressive-machine-cse", cl::Hidden, cl::init(false),
    cl::desc("Override the profitability heuristics for Machine CSE"));
71 | |
72 | namespace { |
73 | |
74 | class MachineCSEImpl { |
75 | const TargetInstrInfo *TII = nullptr; |
76 | const TargetRegisterInfo *TRI = nullptr; |
77 | MachineDominatorTree *DT = nullptr; |
78 | MachineRegisterInfo *MRI = nullptr; |
79 | MachineBlockFrequencyInfo *MBFI = nullptr; |
80 | |
81 | public: |
82 | MachineCSEImpl(MachineDominatorTree *DT, MachineBlockFrequencyInfo *MBFI) |
83 | : DT(DT), MBFI(MBFI) {} |
84 | bool run(MachineFunction &MF); |
85 | |
86 | private: |
87 | using AllocatorTy = |
88 | RecyclingAllocator<BumpPtrAllocator, |
89 | ScopedHashTableVal<MachineInstr *, unsigned>>; |
90 | using ScopedHTType = |
91 | ScopedHashTable<MachineInstr *, unsigned, MachineInstrExpressionTrait, |
92 | AllocatorTy>; |
93 | using ScopeType = ScopedHTType::ScopeTy; |
94 | using PhysDefVector = SmallVector<std::pair<unsigned, Register>, 2>; |
95 | |
96 | unsigned LookAheadLimit = 0; |
97 | DenseMap<MachineBasicBlock *, ScopeType *> ScopeMap; |
98 | DenseMap<MachineInstr *, MachineBasicBlock *, MachineInstrExpressionTrait> |
99 | PREMap; |
100 | ScopedHTType VNT; |
101 | SmallVector<MachineInstr *, 64> Exps; |
102 | unsigned CurrVN = 0; |
103 | |
104 | bool PerformTrivialCopyPropagation(MachineInstr *MI, MachineBasicBlock *MBB); |
105 | bool isPhysDefTriviallyDead(MCRegister Reg, |
106 | MachineBasicBlock::const_iterator I, |
107 | MachineBasicBlock::const_iterator E) const; |
108 | bool hasLivePhysRegDefUses(const MachineInstr *MI, |
109 | const MachineBasicBlock *MBB, |
110 | SmallSet<MCRegister, 8> &PhysRefs, |
111 | PhysDefVector &PhysDefs, bool &PhysUseDef) const; |
112 | bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, |
113 | const SmallSet<MCRegister, 8> &PhysRefs, |
114 | const PhysDefVector &PhysDefs, bool &NonLocal) const; |
115 | bool isCSECandidate(MachineInstr *MI); |
116 | bool isProfitableToCSE(Register CSReg, Register Reg, MachineBasicBlock *CSBB, |
117 | MachineInstr *MI); |
118 | void EnterScope(MachineBasicBlock *MBB); |
119 | void ExitScope(MachineBasicBlock *MBB); |
120 | bool ProcessBlockCSE(MachineBasicBlock *MBB); |
121 | void ExitScopeIfDone(MachineDomTreeNode *Node, |
122 | DenseMap<MachineDomTreeNode *, unsigned> &OpenChildren); |
123 | bool PerformCSE(MachineDomTreeNode *Node); |
124 | |
125 | bool isPRECandidate(MachineInstr *MI, SmallSet<MCRegister, 8> &PhysRefs); |
126 | bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB); |
127 | bool PerformSimplePRE(MachineDominatorTree *DT); |
128 | /// Heuristics to see if it's profitable to move common computations of MBB |
129 | /// and MBB1 to CandidateBB. |
130 | bool isProfitableToHoistInto(MachineBasicBlock *CandidateBB, |
131 | MachineBasicBlock *MBB, MachineBasicBlock *MBB1); |
132 | void releaseMemory(); |
133 | }; |
134 | |
135 | class MachineCSELegacy : public MachineFunctionPass { |
136 | public: |
137 | static char ID; // Pass identification |
138 | |
139 | MachineCSELegacy() : MachineFunctionPass(ID) { |
140 | initializeMachineCSELegacyPass(*PassRegistry::getPassRegistry()); |
141 | } |
142 | |
143 | bool runOnMachineFunction(MachineFunction &MF) override; |
144 | |
145 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
146 | AU.setPreservesCFG(); |
147 | MachineFunctionPass::getAnalysisUsage(AU); |
148 | AU.addPreservedID(ID&: MachineLoopInfoID); |
149 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
150 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
151 | AU.addRequired<MachineBlockFrequencyInfoWrapperPass>(); |
152 | AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>(); |
153 | } |
154 | |
155 | MachineFunctionProperties getRequiredProperties() const override { |
156 | return MachineFunctionProperties().setIsSSA(); |
157 | } |
158 | }; |
159 | } // end anonymous namespace |
160 | |
161 | char MachineCSELegacy::ID = 0; |
162 | |
163 | char &llvm::MachineCSELegacyID = MachineCSELegacy::ID; |
164 | |
INITIALIZE_PASS_BEGIN(MachineCSELegacy, DEBUG_TYPE,
                      "Machine Common Subexpression Elimination", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(MachineCSELegacy, DEBUG_TYPE,
                    "Machine Common Subexpression Elimination", false, false)
170 | |
171 | /// The source register of a COPY machine instruction can be propagated to all |
172 | /// its users, and this propagation could increase the probability of finding |
173 | /// common subexpressions. If the COPY has only one user, the COPY itself can |
174 | /// be removed. |
175 | bool MachineCSEImpl::PerformTrivialCopyPropagation(MachineInstr *MI, |
176 | MachineBasicBlock *MBB) { |
177 | bool Changed = false; |
178 | for (MachineOperand &MO : MI->all_uses()) { |
179 | Register Reg = MO.getReg(); |
180 | if (!Reg.isVirtual()) |
181 | continue; |
182 | bool OnlyOneUse = MRI->hasOneNonDBGUse(RegNo: Reg); |
183 | MachineInstr *DefMI = MRI->getVRegDef(Reg); |
184 | if (!DefMI || !DefMI->isCopy()) |
185 | continue; |
186 | Register SrcReg = DefMI->getOperand(i: 1).getReg(); |
187 | if (!SrcReg.isVirtual()) |
188 | continue; |
189 | // FIXME: We should trivially coalesce subregister copies to expose CSE |
190 | // opportunities on instructions with truncated operands (see |
191 | // cse-add-with-overflow.ll). This can be done here as follows: |
192 | // if (SrcSubReg) |
193 | // RC = TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), RC, |
194 | // SrcSubReg); |
195 | // MO.substVirtReg(SrcReg, SrcSubReg, *TRI); |
196 | // |
197 | // The 2-addr pass has been updated to handle coalesced subregs. However, |
198 | // some machine-specific code still can't handle it. |
    // To handle it properly we also need a way to find a constrained
    // subregister class given a super-reg class and a subreg index.
    if (DefMI->getOperand(1).getSubReg())
      continue;
    if (!MRI->constrainRegAttrs(SrcReg, Reg))
      continue;
205 | LLVM_DEBUG(dbgs() << "Coalescing: " << *DefMI); |
206 | LLVM_DEBUG(dbgs() << "*** to: " << *MI); |
207 | |
208 | // Propagate SrcReg of copies to MI. |
209 | MO.setReg(SrcReg); |
210 | MRI->clearKillFlags(Reg: SrcReg); |
211 | // Coalesce single use copies. |
212 | if (OnlyOneUse) { |
213 | // If (and only if) we've eliminated all uses of the copy, also |
214 | // copy-propagate to any debug-users of MI, or they'll be left using |
215 | // an undefined value. |
216 | DefMI->changeDebugValuesDefReg(Reg: SrcReg); |
217 | |
218 | DefMI->eraseFromParent(); |
219 | ++NumCoalesces; |
220 | } |
221 | Changed = true; |
222 | } |
223 | |
224 | return Changed; |
225 | } |
226 | |
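/// Return true if physical register Reg is known to be trivially dead at
/// instruction I: a def of Reg (or an overlapping register) is seen before any
/// use within the next LookAheadLimit instructions of the block.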
227 | bool MachineCSEImpl::isPhysDefTriviallyDead( |
228 | MCRegister Reg, MachineBasicBlock::const_iterator I, |
229 | MachineBasicBlock::const_iterator E) const { |
230 | unsigned LookAheadLeft = LookAheadLimit; |
231 | while (LookAheadLeft) { |
232 | // Skip over dbg_value's. |
233 | I = skipDebugInstructionsForward(It: I, End: E); |
234 | |
235 | if (I == E) |
236 | // Reached end of block, we don't know if register is dead or not. |
237 | return false; |
238 | |
239 | bool SeenDef = false; |
240 | for (const MachineOperand &MO : I->operands()) { |
241 | if (MO.isRegMask() && MO.clobbersPhysReg(PhysReg: Reg)) |
242 | SeenDef = true; |
243 | if (!MO.isReg() || !MO.getReg()) |
244 | continue; |
245 | if (!TRI->regsOverlap(RegA: MO.getReg(), RegB: Reg)) |
246 | continue; |
247 | if (MO.isUse()) |
248 | // Found a use! |
249 | return false; |
250 | SeenDef = true; |
251 | } |
252 | if (SeenDef) |
253 | // See a def of Reg (or an alias) before encountering any use, it's |
254 | // trivially dead. |
255 | return true; |
256 | |
257 | --LookAheadLeft; |
258 | ++I; |
259 | } |
260 | return false; |
261 | } |
262 | |
263 | static bool isCallerPreservedOrConstPhysReg(MCRegister Reg, |
264 | const MachineOperand &MO, |
265 | const MachineFunction &MF, |
266 | const TargetRegisterInfo &TRI, |
267 | const TargetInstrInfo &TII) { |
268 | // MachineRegisterInfo::isConstantPhysReg directly called by |
269 | // MachineRegisterInfo::isCallerPreservedOrConstPhysReg expects the |
270 | // reserved registers to be frozen. That doesn't cause a problem post-ISel as |
271 | // most (if not all) targets freeze reserved registers right after ISel. |
272 | // |
273 | // It does cause issues mid-GlobalISel, however, hence the additional |
274 | // reservedRegsFrozen check. |
275 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
276 | return TRI.isCallerPreservedPhysReg(PhysReg: Reg, MF) || TII.isIgnorableUse(MO) || |
277 | (MRI.reservedRegsFrozen() && MRI.isConstantPhysReg(PhysReg: Reg)); |
278 | } |
279 | |
/// hasLivePhysRegDefUses - Return true if the specified instruction reads or
/// writes physical registers (except for dead defs of physical registers). It
/// also returns the physical register def by reference if it's the only one
/// and the instruction does not use a physical register.
284 | bool MachineCSEImpl::hasLivePhysRegDefUses(const MachineInstr *MI, |
285 | const MachineBasicBlock *MBB, |
286 | SmallSet<MCRegister, 8> &PhysRefs, |
287 | PhysDefVector &PhysDefs, |
288 | bool &PhysUseDef) const { |
289 | // First, add all uses to PhysRefs. |
290 | for (const MachineOperand &MO : MI->all_uses()) { |
291 | Register Reg = MO.getReg(); |
292 | if (!Reg) |
293 | continue; |
294 | if (Reg.isVirtual()) |
295 | continue; |
296 | // Reading either caller preserved or constant physregs is ok. |
297 | if (!isCallerPreservedOrConstPhysReg(Reg: Reg.asMCReg(), MO, MF: *MI->getMF(), TRI: *TRI, |
298 | TII: *TII)) |
299 | for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) |
300 | PhysRefs.insert(V: *AI); |
301 | } |
302 | |
303 | // Next, collect all defs into PhysDefs. If any is already in PhysRefs |
304 | // (which currently contains only uses), set the PhysUseDef flag. |
305 | PhysUseDef = false; |
306 | MachineBasicBlock::const_iterator I = MI; I = std::next(x: I); |
307 | for (const auto &MOP : llvm::enumerate(First: MI->operands())) { |
308 | const MachineOperand &MO = MOP.value(); |
309 | if (!MO.isReg() || !MO.isDef()) |
310 | continue; |
311 | Register Reg = MO.getReg(); |
312 | if (!Reg) |
313 | continue; |
314 | if (Reg.isVirtual()) |
315 | continue; |
316 | // Check against PhysRefs even if the def is "dead". |
317 | if (PhysRefs.count(V: Reg.asMCReg())) |
318 | PhysUseDef = true; |
    // If the def is dead, it's ok. But the def may not be marked "dead". That
    // is common since this pass is run before live variable analysis. We can
    // scan forward a few instructions and check if it is obviously dead.
    if (!MO.isDead() && !isPhysDefTriviallyDead(Reg.asMCReg(), I, MBB->end()))
      PhysDefs.emplace_back(MOP.index(), Reg);
324 | } |
325 | |
326 | // Finally, add all defs to PhysRefs as well. |
327 | for (const auto &Def : PhysDefs) |
328 | for (MCRegAliasIterator AI(Def.second, TRI, true); AI.isValid(); ++AI) |
329 | PhysRefs.insert(V: *AI); |
330 | |
331 | return !PhysRefs.empty(); |
332 | } |
333 | |
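/// Return true if the physical registers in PhysRefs are not clobbered between
/// CSMI and MI (scanning at most LookAheadLimit instructions, possibly crossing
/// from the sole-predecessor block into MI's block), so the values produced by
/// CSMI still reach MI. NonLocal is set when the scan crosses a block boundary.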
334 | bool MachineCSEImpl::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, |
335 | const SmallSet<MCRegister, 8> &PhysRefs, |
336 | const PhysDefVector &PhysDefs, |
337 | bool &NonLocal) const { |
338 | // For now conservatively returns false if the common subexpression is |
339 | // not in the same basic block as the given instruction. The only exception |
340 | // is if the common subexpression is in the sole predecessor block. |
341 | const MachineBasicBlock *MBB = MI->getParent(); |
342 | const MachineBasicBlock *CSMBB = CSMI->getParent(); |
343 | |
344 | bool CrossMBB = false; |
345 | if (CSMBB != MBB) { |
346 | if (MBB->pred_size() != 1 || *MBB->pred_begin() != CSMBB) |
347 | return false; |
348 | |
349 | for (const auto &PhysDef : PhysDefs) { |
      // Avoid extending the live range of physical registers if they are
      // allocatable or reserved.
      if (MRI->isAllocatable(PhysDef.second) ||
          MRI->isReserved(PhysDef.second))
        return false;
354 | } |
355 | CrossMBB = true; |
356 | } |
357 | MachineBasicBlock::const_iterator I = CSMI; I = std::next(x: I); |
358 | MachineBasicBlock::const_iterator E = MI; |
359 | MachineBasicBlock::const_iterator EE = CSMBB->end(); |
360 | unsigned LookAheadLeft = LookAheadLimit; |
361 | while (LookAheadLeft) { |
362 | // Skip over dbg_value's. |
363 | while (I != E && I != EE && I->isDebugInstr()) |
364 | ++I; |
365 | |
366 | if (I == EE) { |
367 | assert(CrossMBB && "Reaching end-of-MBB without finding MI?" ); |
368 | (void)CrossMBB; |
369 | CrossMBB = false; |
370 | NonLocal = true; |
371 | I = MBB->begin(); |
372 | EE = MBB->end(); |
373 | continue; |
374 | } |
375 | |
376 | if (I == E) |
377 | return true; |
378 | |
379 | for (const MachineOperand &MO : I->operands()) { |
380 | // RegMasks go on instructions like calls that clobber lots of physregs. |
381 | // Don't attempt to CSE across such an instruction. |
382 | if (MO.isRegMask()) |
383 | return false; |
384 | if (!MO.isReg() || !MO.isDef()) |
385 | continue; |
386 | Register MOReg = MO.getReg(); |
387 | if (MOReg.isVirtual()) |
388 | continue; |
389 | if (PhysRefs.count(V: MOReg.asMCReg())) |
390 | return false; |
391 | } |
392 | |
393 | --LookAheadLeft; |
394 | ++I; |
395 | } |
396 | |
397 | return false; |
398 | } |
399 | |
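/// Return true if MI is a candidate for common subexpression elimination:
/// reject labels/positions, PHIs, copies, inline asm, calls, stores,
/// terminators, non-invariant loads, and anything else with unmodeled side
/// effects.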
400 | bool MachineCSEImpl::isCSECandidate(MachineInstr *MI) { |
401 | if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() || |
402 | MI->isInlineAsm() || MI->isDebugInstr() || MI->isJumpTableDebugInfo() || |
403 | MI->isFakeUse()) |
404 | return false; |
405 | |
406 | // Ignore copies. |
407 | if (MI->isCopyLike()) |
408 | return false; |
409 | |
410 | // Ignore stuff that we obviously can't move. |
411 | if (MI->mayStore() || MI->isCall() || MI->isTerminator() || |
412 | MI->mayRaiseFPException() || MI->hasUnmodeledSideEffects()) |
413 | return false; |
414 | |
415 | if (MI->mayLoad()) { |
416 | // Okay, this instruction does a load. As a refinement, we allow the target |
417 | // to decide whether the loaded value is actually a constant. If so, we can |
418 | // actually use it as a load. |
419 | if (!MI->isDereferenceableInvariantLoad()) |
420 | // FIXME: we should be able to hoist loads with no other side effects if |
421 | // there are no other instructions which can change memory in this loop. |
422 | // This is a trivial form of alias analysis. |
423 | return false; |
424 | } |
425 | |
  // Ignore stack guard loads, otherwise the register that holds the CSEed
  // value may be spilled and get loaded back with corrupted data.
428 | if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD) |
429 | return false; |
430 | |
431 | return true; |
432 | } |
433 | |
/// isProfitableToCSE - Return true if it's profitable to eliminate MI with a
/// common expression that defines Reg. CSBB is the basic block where CSReg is
/// defined.
437 | bool MachineCSEImpl::isProfitableToCSE(Register CSReg, Register Reg, |
438 | MachineBasicBlock *CSBB, |
439 | MachineInstr *MI) { |
440 | if (AggressiveMachineCSE) |
441 | return true; |
442 | |
  // FIXME: Heuristic that works around the lack of live range splitting.
444 | |
445 | // If CSReg is used at all uses of Reg, CSE should not increase register |
446 | // pressure of CSReg. |
447 | bool MayIncreasePressure = true; |
448 | if (CSReg.isVirtual() && Reg.isVirtual()) { |
449 | MayIncreasePressure = false; |
450 | SmallPtrSet<MachineInstr*, 8> CSUses; |
451 | int NumOfUses = 0; |
452 | for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg: CSReg)) { |
453 | CSUses.insert(Ptr: &MI); |
454 | // Too costly to compute if NumOfUses is very large. Conservatively assume |
455 | // MayIncreasePressure to avoid spending too much time here. |
456 | if (++NumOfUses > CSUsesThreshold) { |
457 | MayIncreasePressure = true; |
458 | break; |
459 | } |
460 | } |
461 | if (!MayIncreasePressure) |
462 | for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) { |
463 | if (!CSUses.count(Ptr: &MI)) { |
464 | MayIncreasePressure = true; |
465 | break; |
466 | } |
467 | } |
468 | } |
469 | if (!MayIncreasePressure) return true; |
470 | |
471 | // Heuristics #1: Don't CSE "cheap" computation if the def is not local or in |
472 | // an immediate predecessor. We don't want to increase register pressure and |
473 | // end up causing other computation to be spilled. |
474 | if (TII->isAsCheapAsAMove(MI: *MI)) { |
475 | MachineBasicBlock *BB = MI->getParent(); |
476 | if (CSBB != BB && !CSBB->isSuccessor(MBB: BB)) |
477 | return false; |
478 | } |
479 | |
  // Heuristics #2: If the expression doesn't use a virtual register and the
  // only uses of the redundant computation are copies, do not CSE.
482 | bool HasVRegUse = false; |
483 | for (const MachineOperand &MO : MI->all_uses()) { |
484 | if (MO.getReg().isVirtual()) { |
485 | HasVRegUse = true; |
486 | break; |
487 | } |
488 | } |
489 | if (!HasVRegUse) { |
490 | bool HasNonCopyUse = false; |
491 | for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) { |
492 | // Ignore copies. |
493 | if (!MI.isCopyLike()) { |
494 | HasNonCopyUse = true; |
495 | break; |
496 | } |
497 | } |
498 | if (!HasNonCopyUse) |
499 | return false; |
500 | } |
501 | |
502 | // Heuristics #3: If the common subexpression is used by PHIs, do not reuse |
503 | // it unless the defined value is already used in the BB of the new use. |
504 | bool HasPHI = false; |
505 | for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg: CSReg)) { |
506 | HasPHI |= UseMI.isPHI(); |
507 | if (UseMI.getParent() == MI->getParent()) |
508 | return true; |
509 | } |
510 | |
511 | return !HasPHI; |
512 | } |
513 | |
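/// Open a new value-numbering scope for MBB. Expressions inserted into VNT
/// while this scope is active are popped again when the scope is destroyed in
/// ExitScope(), so lookups only ever match dominating definitions.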
514 | void MachineCSEImpl::EnterScope(MachineBasicBlock *MBB) { |
515 | LLVM_DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n'); |
516 | ScopeType *Scope = new ScopeType(VNT); |
517 | ScopeMap[MBB] = Scope; |
518 | } |
519 | |
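/// Close the value-numbering scope opened for MBB by EnterScope(), removing
/// the expressions it contributed from VNT.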
520 | void MachineCSEImpl::ExitScope(MachineBasicBlock *MBB) { |
521 | LLVM_DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n'); |
522 | DenseMap<MachineBasicBlock*, ScopeType*>::iterator SI = ScopeMap.find(Val: MBB); |
523 | assert(SI != ScopeMap.end()); |
524 | delete SI->second; |
525 | ScopeMap.erase(I: SI); |
526 | } |
527 | |
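/// Value-number every CSE candidate in MBB against the scoped table VNT and
/// eliminate instructions whose value is already available from an earlier,
/// dominating computation, subject to the physical-register and profitability
/// checks below.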
528 | bool MachineCSEImpl::ProcessBlockCSE(MachineBasicBlock *MBB) { |
529 | bool Changed = false; |
530 | |
531 | SmallVector<std::pair<Register, Register>, 8> CSEPairs; |
532 | SmallVector<unsigned, 2> ImplicitDefsToUpdate; |
533 | SmallVector<Register, 2> ImplicitDefs; |
534 | for (MachineInstr &MI : llvm::make_early_inc_range(Range&: *MBB)) { |
535 | if (!isCSECandidate(MI: &MI)) |
536 | continue; |
537 | |
538 | bool FoundCSE = VNT.count(Key: &MI); |
539 | if (!FoundCSE) { |
540 | // Using trivial copy propagation to find more CSE opportunities. |
541 | if (PerformTrivialCopyPropagation(MI: &MI, MBB)) { |
542 | Changed = true; |
543 | |
544 | // After coalescing MI itself may become a copy. |
545 | if (MI.isCopyLike()) |
546 | continue; |
547 | |
548 | // Try again to see if CSE is possible. |
549 | FoundCSE = VNT.count(Key: &MI); |
550 | } |
551 | } |
552 | |
553 | // Commute commutable instructions. |
554 | bool Commuted = false; |
555 | if (!FoundCSE && MI.isCommutable()) { |
556 | if (MachineInstr *NewMI = TII->commuteInstruction(MI)) { |
557 | Commuted = true; |
558 | FoundCSE = VNT.count(Key: NewMI); |
559 | if (NewMI != &MI) { |
560 | // New instruction. It doesn't need to be kept. |
561 | NewMI->eraseFromParent(); |
562 | Changed = true; |
563 | } else if (!FoundCSE) |
564 | // MI was changed but it didn't help, commute it back! |
565 | (void)TII->commuteInstruction(MI); |
566 | } |
567 | } |
568 | |
569 | // If the instruction defines physical registers and the values *may* be |
570 | // used, then it's not safe to replace it with a common subexpression. |
571 | // It's also not safe if the instruction uses physical registers. |
572 | bool CrossMBBPhysDef = false; |
573 | SmallSet<MCRegister, 8> PhysRefs; |
574 | PhysDefVector PhysDefs; |
575 | bool PhysUseDef = false; |
576 | if (FoundCSE && |
577 | hasLivePhysRegDefUses(MI: &MI, MBB, PhysRefs, PhysDefs, PhysUseDef)) { |
578 | FoundCSE = false; |
579 | |
580 | // ... Unless the CS is local or is in the sole predecessor block |
581 | // and it also defines the physical register which is not clobbered |
582 | // in between and the physical register uses were not clobbered. |
583 | // This can never be the case if the instruction both uses and |
584 | // defines the same physical register, which was detected above. |
585 | if (!PhysUseDef) { |
586 | unsigned CSVN = VNT.lookup(Key: &MI); |
587 | MachineInstr *CSMI = Exps[CSVN]; |
588 | if (PhysRegDefsReach(CSMI, MI: &MI, PhysRefs, PhysDefs, NonLocal&: CrossMBBPhysDef)) |
589 | FoundCSE = true; |
590 | } |
591 | } |
592 | |
593 | if (!FoundCSE) { |
594 | VNT.insert(Key: &MI, Val: CurrVN++); |
595 | Exps.push_back(Elt: &MI); |
596 | continue; |
597 | } |
598 | |
599 | // Found a common subexpression, eliminate it. |
600 | unsigned CSVN = VNT.lookup(Key: &MI); |
601 | MachineInstr *CSMI = Exps[CSVN]; |
602 | LLVM_DEBUG(dbgs() << "Examining: " << MI); |
603 | LLVM_DEBUG(dbgs() << "*** Found a common subexpression: " << *CSMI); |
604 | |
605 | // Prevent CSE-ing non-local convergent instructions. |
606 | // LLVM's current definition of `isConvergent` does not necessarily prove |
607 | // that non-local CSE is illegal. The following check extends the definition |
608 | // of `isConvergent` to assume a convergent instruction is dependent not |
609 | // only on additional conditions, but also on fewer conditions. LLVM does |
610 | // not have a MachineInstr attribute which expresses this extended |
611 | // definition, so it's necessary to use `isConvergent` to prevent illegally |
612 | // CSE-ing the subset of `isConvergent` instructions which do fall into this |
613 | // extended definition. |
614 | if (MI.isConvergent() && MI.getParent() != CSMI->getParent()) { |
615 | LLVM_DEBUG(dbgs() << "*** Convergent MI and subexpression exist in " |
616 | "different BBs, avoid CSE!\n" ); |
617 | VNT.insert(Key: &MI, Val: CurrVN++); |
618 | Exps.push_back(Elt: &MI); |
619 | continue; |
620 | } |
621 | |
622 | // Check if it's profitable to perform this CSE. |
623 | bool DoCSE = true; |
624 | unsigned NumDefs = MI.getNumDefs(); |
625 | |
626 | for (unsigned i = 0, e = MI.getNumOperands(); NumDefs && i != e; ++i) { |
627 | MachineOperand &MO = MI.getOperand(i); |
628 | if (!MO.isReg() || !MO.isDef()) |
629 | continue; |
630 | Register OldReg = MO.getReg(); |
631 | Register NewReg = CSMI->getOperand(i).getReg(); |
632 | |
      // Go through implicit defs of CSMI and MI; if a def is not dead at MI,
      // we should make sure it is not dead at CSMI either.
635 | if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead()) |
636 | ImplicitDefsToUpdate.push_back(Elt: i); |
637 | |
638 | // Keep track of implicit defs of CSMI and MI, to clear possibly |
639 | // made-redundant kill flags. |
640 | if (MO.isImplicit() && !MO.isDead() && OldReg == NewReg) |
641 | ImplicitDefs.push_back(Elt: OldReg); |
642 | |
643 | if (OldReg == NewReg) { |
644 | --NumDefs; |
645 | continue; |
646 | } |
647 | |
648 | assert(OldReg.isVirtual() && NewReg.isVirtual() && |
649 | "Do not CSE physical register defs!" ); |
650 | |
651 | if (!isProfitableToCSE(CSReg: NewReg, Reg: OldReg, CSBB: CSMI->getParent(), MI: &MI)) { |
652 | LLVM_DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n" ); |
653 | DoCSE = false; |
654 | break; |
655 | } |
656 | |
657 | // Don't perform CSE if the result of the new instruction cannot exist |
658 | // within the constraints (register class, bank, or low-level type) of |
659 | // the old instruction. |
660 | if (!MRI->constrainRegAttrs(Reg: NewReg, ConstrainingReg: OldReg)) { |
661 | LLVM_DEBUG( |
662 | dbgs() << "*** Not the same register constraints, avoid CSE!\n" ); |
663 | DoCSE = false; |
664 | break; |
665 | } |
666 | |
667 | CSEPairs.emplace_back(Args&: OldReg, Args&: NewReg); |
668 | --NumDefs; |
669 | } |
670 | |
671 | // Actually perform the elimination. |
672 | if (DoCSE) { |
673 | for (const std::pair<Register, Register> &CSEPair : CSEPairs) { |
674 | Register OldReg = CSEPair.first; |
675 | Register NewReg = CSEPair.second; |
676 | // OldReg may have been unused but is used now, clear the Dead flag |
677 | MachineInstr *Def = MRI->getUniqueVRegDef(Reg: NewReg); |
678 | assert(Def != nullptr && "CSEd register has no unique definition?" ); |
679 | Def->clearRegisterDeads(Reg: NewReg); |
680 | // Replace with NewReg and clear kill flags which may be wrong now. |
681 | MRI->replaceRegWith(FromReg: OldReg, ToReg: NewReg); |
682 | MRI->clearKillFlags(Reg: NewReg); |
683 | } |
684 | |
      // Go through implicit defs of CSMI and MI; if a def is not dead at MI,
      // we should make sure it is not dead at CSMI either.
687 | for (unsigned ImplicitDefToUpdate : ImplicitDefsToUpdate) |
688 | CSMI->getOperand(i: ImplicitDefToUpdate).setIsDead(false); |
689 | for (const auto &PhysDef : PhysDefs) |
690 | if (!MI.getOperand(i: PhysDef.first).isDead()) |
691 | CSMI->getOperand(i: PhysDef.first).setIsDead(false); |
692 | |
693 | // Go through implicit defs of CSMI and MI, and clear the kill flags on |
694 | // their uses in all the instructions between CSMI and MI. |
695 | // We might have made some of the kill flags redundant, consider: |
696 | // subs ... implicit-def %nzcv <- CSMI |
697 | // csinc ... implicit killed %nzcv <- this kill flag isn't valid anymore |
698 | // subs ... implicit-def %nzcv <- MI, to be eliminated |
699 | // csinc ... implicit killed %nzcv |
      // Since we eliminated MI and reused a register imp-def'd by CSMI
      // (here %nzcv), that register, if it was killed before MI, should have
      // that kill flag removed, because its lifetime was extended.
703 | if (CSMI->getParent() == MI.getParent()) { |
704 | for (MachineBasicBlock::iterator II = CSMI, IE = &MI; II != IE; ++II) |
705 | for (auto ImplicitDef : ImplicitDefs) |
706 | if (MachineOperand *MO = II->findRegisterUseOperand( |
707 | Reg: ImplicitDef, TRI, /*isKill=*/true)) |
708 | MO->setIsKill(false); |
709 | } else { |
710 | // If the instructions aren't in the same BB, bail out and clear the |
711 | // kill flag on all uses of the imp-def'd register. |
712 | for (auto ImplicitDef : ImplicitDefs) |
713 | MRI->clearKillFlags(Reg: ImplicitDef); |
714 | } |
715 | |
716 | if (CrossMBBPhysDef) { |
717 | // Add physical register defs now coming in from a predecessor to MBB |
718 | // livein list. |
719 | while (!PhysDefs.empty()) { |
720 | auto LiveIn = PhysDefs.pop_back_val(); |
721 | if (!MBB->isLiveIn(Reg: LiveIn.second)) |
722 | MBB->addLiveIn(PhysReg: LiveIn.second); |
723 | } |
724 | ++NumCrossBBCSEs; |
725 | } |
726 | |
727 | MI.eraseFromParent(); |
728 | ++NumCSEs; |
729 | if (!PhysRefs.empty()) |
730 | ++NumPhysCSEs; |
731 | if (Commuted) |
732 | ++NumCommutes; |
733 | Changed = true; |
734 | } else { |
735 | VNT.insert(Key: &MI, Val: CurrVN++); |
736 | Exps.push_back(Elt: &MI); |
737 | } |
738 | CSEPairs.clear(); |
739 | ImplicitDefsToUpdate.clear(); |
740 | ImplicitDefs.clear(); |
741 | } |
742 | |
743 | return Changed; |
744 | } |
745 | |
/// ExitScopeIfDone - Destroy the scope for the MBB that corresponds to the
/// given dominator tree node if it's a leaf or all of its children are done.
/// Walk up the dominator tree to destroy ancestors which are now done.
749 | void MachineCSEImpl::ExitScopeIfDone( |
750 | MachineDomTreeNode *Node, |
751 | DenseMap<MachineDomTreeNode *, unsigned> &OpenChildren) { |
752 | if (OpenChildren[Node]) |
753 | return; |
754 | |
755 | // Pop scope. |
756 | ExitScope(MBB: Node->getBlock()); |
757 | |
  // Now traverse upwards to pop ancestors whose children are all done.
759 | while (MachineDomTreeNode *Parent = Node->getIDom()) { |
760 | unsigned Left = --OpenChildren[Parent]; |
761 | if (Left != 0) |
762 | break; |
763 | ExitScope(MBB: Parent->getBlock()); |
764 | Node = Parent; |
765 | } |
766 | } |
767 | |
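/// Drive CSE over the dominator subtree rooted at Node: visit blocks in a DFS
/// order of the dominator tree, entering a scope for each block and exiting
/// scopes as soon as all of a node's children have been processed.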
768 | bool MachineCSEImpl::PerformCSE(MachineDomTreeNode *Node) { |
769 | SmallVector<MachineDomTreeNode*, 32> Scopes; |
770 | SmallVector<MachineDomTreeNode*, 8> WorkList; |
771 | DenseMap<MachineDomTreeNode*, unsigned> OpenChildren; |
772 | |
773 | CurrVN = 0; |
774 | |
775 | // Perform a DFS walk to determine the order of visit. |
776 | WorkList.push_back(Elt: Node); |
777 | do { |
778 | Node = WorkList.pop_back_val(); |
779 | Scopes.push_back(Elt: Node); |
780 | OpenChildren[Node] = Node->getNumChildren(); |
781 | append_range(C&: WorkList, R: Node->children()); |
782 | } while (!WorkList.empty()); |
783 | |
784 | // Now perform CSE. |
785 | bool Changed = false; |
786 | for (MachineDomTreeNode *Node : Scopes) { |
787 | MachineBasicBlock *MBB = Node->getBlock(); |
788 | EnterScope(MBB); |
789 | Changed |= ProcessBlockCSE(MBB); |
790 | // If it's a leaf node, it's done. Traverse upwards to pop ancestors. |
791 | ExitScopeIfDone(Node, OpenChildren); |
792 | } |
793 | |
794 | return Changed; |
795 | } |
796 | |
// We use stronger checks for PRE candidates than for CSE ones so that they
// cover the checks inside ProcessBlockCSE(), not only those inside
// isCSECandidate(). This helps to exclude instructions created by PRE that
// won't be CSEed later.
800 | bool MachineCSEImpl::isPRECandidate(MachineInstr *MI, |
801 | SmallSet<MCRegister, 8> &PhysRefs) { |
802 | if (!isCSECandidate(MI) || |
803 | MI->isNotDuplicable() || |
804 | MI->mayLoad() || |
805 | TII->isAsCheapAsAMove(MI: *MI) || |
806 | MI->getNumDefs() != 1 || |
807 | MI->getNumExplicitDefs() != 1) |
808 | return false; |
809 | |
810 | for (const MachineOperand &MO : MI->operands()) { |
811 | if (MO.isReg() && !MO.getReg().isVirtual()) { |
812 | if (MO.isDef()) |
813 | return false; |
814 | else |
815 | PhysRefs.insert(V: MO.getReg()); |
816 | } |
817 | } |
818 | |
819 | return true; |
820 | } |
821 | |
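/// Look for PRE candidates in MBB that were already seen in another block
/// (recorded in PREMap) and hoist a copy of the computation into the nearest
/// common dominator, turning the partial redundancy into a full one for the
/// subsequent CSE walk.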
822 | bool MachineCSEImpl::ProcessBlockPRE(MachineDominatorTree *DT, |
823 | MachineBasicBlock *MBB) { |
824 | bool Changed = false; |
825 | for (MachineInstr &MI : llvm::make_early_inc_range(Range&: *MBB)) { |
826 | SmallSet<MCRegister, 8> PhysRefs; |
827 | if (!isPRECandidate(MI: &MI, PhysRefs)) |
828 | continue; |
829 | |
830 | auto [It, Inserted] = PREMap.try_emplace(Key: &MI, Args&: MBB); |
831 | if (Inserted) |
832 | continue; |
833 | |
834 | auto *MBB1 = It->second; |
835 | assert( |
836 | !DT->properlyDominates(MBB, MBB1) && |
837 | "MBB cannot properly dominate MBB1 while DFS through dominators tree!" ); |
838 | auto CMBB = DT->findNearestCommonDominator(A: MBB, B: MBB1); |
839 | if (!CMBB->isLegalToHoistInto()) |
840 | continue; |
841 | |
842 | if (!isProfitableToHoistInto(CandidateBB: CMBB, MBB, MBB1)) |
843 | continue; |
844 | |
    // Two instructions are partially redundant if their basic blocks are
    // reachable from one another but neither dominates the other.
847 | if (CMBB != MBB1) { |
848 | auto BB = MBB->getBasicBlock(), BB1 = MBB1->getBasicBlock(); |
849 | if (BB != nullptr && BB1 != nullptr && |
850 | (isPotentiallyReachable(From: BB1, To: BB) || |
851 | isPotentiallyReachable(From: BB, To: BB1))) { |
852 | // The following check extends the definition of `isConvergent` to |
853 | // assume a convergent instruction is dependent not only on additional |
854 | // conditions, but also on fewer conditions. LLVM does not have a |
855 | // MachineInstr attribute which expresses this extended definition, so |
856 | // it's necessary to use `isConvergent` to prevent illegally PRE-ing the |
857 | // subset of `isConvergent` instructions which do fall into this |
858 | // extended definition. |
859 | if (MI.isConvergent() && CMBB != MBB) |
860 | continue; |
861 | |
862 | // If this instruction uses physical registers then we can only do PRE |
863 | // if it's using the value that is live at the place we're hoisting to. |
864 | bool NonLocal; |
865 | PhysDefVector PhysDefs; |
866 | if (!PhysRefs.empty() && |
867 | !PhysRegDefsReach(CSMI: &*(CMBB->getFirstTerminator()), MI: &MI, PhysRefs, |
868 | PhysDefs, NonLocal)) |
869 | continue; |
870 | |
871 | assert(MI.getOperand(0).isDef() && |
872 | "First operand of instr with one explicit def must be this def" ); |
873 | Register VReg = MI.getOperand(i: 0).getReg(); |
874 | Register NewReg = MRI->cloneVirtualRegister(VReg); |
875 | if (!isProfitableToCSE(CSReg: NewReg, Reg: VReg, CSBB: CMBB, MI: &MI)) |
876 | continue; |
877 | MachineInstr &NewMI = |
878 | TII->duplicate(MBB&: *CMBB, InsertBefore: CMBB->getFirstTerminator(), Orig: MI); |
879 | |
880 | // When hoisting, make sure we don't carry the debug location of |
881 | // the original instruction, as that's not correct and can cause |
882 | // unexpected jumps when debugging optimized code. |
883 | auto EmptyDL = DebugLoc(); |
884 | NewMI.setDebugLoc(EmptyDL); |
885 | |
886 | NewMI.getOperand(i: 0).setReg(NewReg); |
887 | |
888 | PREMap[&MI] = CMBB; |
889 | ++NumPREs; |
890 | Changed = true; |
891 | } |
892 | } |
893 | } |
894 | return Changed; |
895 | } |
896 | |
// This simple PRE (partial redundancy elimination) pass doesn't actually
// eliminate partial redundancy but transforms it to full redundancy,
// anticipating that the next CSE step will eliminate this created redundancy.
// If CSE doesn't eliminate the new instruction, it will remain dead and be
// removed later by the Remove Dead Machine Instructions pass.
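//
// A rough sketch of the idea (hypothetical MIR; register and block names are
// made up for illustration):
//
//   bb.1:                      bb.2:
//     %a = ADD %x, %y            %b = ADD %x, %y   ; same expression, but
//                                                  ; neither block dominates
//                                                  ; the other
//
// PRE duplicates the ADD into the nearest common dominator (say bb.0) as
// %c = ADD %x, %y, so the following CSE walk can rewrite %a and %b to reuse
// %c; if it can't, the hoisted copy is simply left dead for later cleanup.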
902 | bool MachineCSEImpl::PerformSimplePRE(MachineDominatorTree *DT) { |
903 | SmallVector<MachineDomTreeNode *, 32> BBs; |
904 | |
905 | PREMap.clear(); |
906 | bool Changed = false; |
907 | BBs.push_back(Elt: DT->getRootNode()); |
908 | do { |
909 | auto Node = BBs.pop_back_val(); |
910 | append_range(C&: BBs, R: Node->children()); |
911 | |
912 | MachineBasicBlock *MBB = Node->getBlock(); |
913 | Changed |= ProcessBlockPRE(DT, MBB); |
914 | |
915 | } while (!BBs.empty()); |
916 | |
917 | return Changed; |
918 | } |
919 | |
920 | bool MachineCSEImpl::isProfitableToHoistInto(MachineBasicBlock *CandidateBB, |
921 | MachineBasicBlock *MBB, |
922 | MachineBasicBlock *MBB1) { |
923 | if (CandidateBB->getParent()->getFunction().hasMinSize()) |
924 | return true; |
925 | assert(DT->dominates(CandidateBB, MBB) && "CandidateBB should dominate MBB" ); |
926 | assert(DT->dominates(CandidateBB, MBB1) && |
927 | "CandidateBB should dominate MBB1" ); |
928 | return MBFI->getBlockFreq(MBB: CandidateBB) <= |
929 | MBFI->getBlockFreq(MBB) + MBFI->getBlockFreq(MBB: MBB1); |
930 | } |
931 | |
932 | void MachineCSEImpl::releaseMemory() { |
933 | ScopeMap.clear(); |
934 | PREMap.clear(); |
935 | Exps.clear(); |
936 | } |
937 | |
938 | bool MachineCSEImpl::run(MachineFunction &MF) { |
939 | TII = MF.getSubtarget().getInstrInfo(); |
940 | TRI = MF.getSubtarget().getRegisterInfo(); |
941 | MRI = &MF.getRegInfo(); |
942 | LookAheadLimit = TII->getMachineCSELookAheadLimit(); |
943 | bool ChangedPRE, ChangedCSE; |
944 | ChangedPRE = PerformSimplePRE(DT); |
945 | ChangedCSE = PerformCSE(Node: DT->getRootNode()); |
946 | releaseMemory(); |
947 | return ChangedPRE || ChangedCSE; |
948 | } |
949 | |
950 | PreservedAnalyses MachineCSEPass::run(MachineFunction &MF, |
951 | MachineFunctionAnalysisManager &MFAM) { |
952 | MFPropsModifier _(*this, MF); |
953 | |
954 | MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(IR&: MF); |
955 | MachineBlockFrequencyInfo &MBFI = |
956 | MFAM.getResult<MachineBlockFrequencyAnalysis>(IR&: MF); |
957 | MachineCSEImpl Impl(&MDT, &MBFI); |
958 | bool Changed = Impl.run(MF); |
959 | if (!Changed) |
960 | return PreservedAnalyses::all(); |
961 | |
962 | auto PA = getMachineFunctionPassPreservedAnalyses(); |
963 | PA.preserve<MachineLoopAnalysis>(); |
964 | PA.preserve<MachineDominatorTreeAnalysis>(); |
965 | PA.preserve<MachineBlockFrequencyAnalysis>(); |
966 | PA.preserveSet<CFGAnalyses>(); |
967 | return PA; |
968 | } |
969 | |
970 | bool MachineCSELegacy::runOnMachineFunction(MachineFunction &MF) { |
971 | if (skipFunction(F: MF.getFunction())) |
972 | return false; |
973 | |
974 | MachineDominatorTree &MDT = |
975 | getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
976 | MachineBlockFrequencyInfo &MBFI = |
977 | getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI(); |
978 | MachineCSEImpl Impl(&MDT, &MBFI); |
979 | return Impl.run(MF); |
980 | } |
981 | |