1 | //===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // The machine combiner pass uses machine trace metrics to ensure the combined |
10 | // instructions do not lengthen the critical path or the resource depth. |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/ADT/DenseMap.h" |
14 | #include "llvm/ADT/Statistic.h" |
15 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
16 | #include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" |
17 | #include "llvm/CodeGen/MachineCombinerPattern.h" |
18 | #include "llvm/CodeGen/MachineDominators.h" |
19 | #include "llvm/CodeGen/MachineFunction.h" |
20 | #include "llvm/CodeGen/MachineFunctionPass.h" |
21 | #include "llvm/CodeGen/MachineLoopInfo.h" |
22 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
23 | #include "llvm/CodeGen/MachineSizeOpts.h" |
24 | #include "llvm/CodeGen/MachineTraceMetrics.h" |
25 | #include "llvm/CodeGen/RegisterClassInfo.h" |
26 | #include "llvm/CodeGen/TargetInstrInfo.h" |
27 | #include "llvm/CodeGen/TargetRegisterInfo.h" |
28 | #include "llvm/CodeGen/TargetSchedule.h" |
29 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
30 | #include "llvm/InitializePasses.h" |
31 | #include "llvm/Support/CommandLine.h" |
32 | #include "llvm/Support/Debug.h" |
33 | #include "llvm/Support/raw_ostream.h" |
34 | |
35 | using namespace llvm; |
36 | |
37 | #define DEBUG_TYPE "machine-combiner" |
38 | |
39 | STATISTIC(NumInstCombined, "Number of machineinst combined" ); |
40 | |
41 | static cl::opt<unsigned> |
42 | inc_threshold("machine-combiner-inc-threshold" , cl::Hidden, |
43 | cl::desc("Incremental depth computation will be used for basic " |
44 | "blocks with more instructions." ), cl::init(Val: 500)); |
45 | |
46 | static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs" , cl::Hidden, |
47 | cl::desc("Dump all substituted intrs" ), |
48 | cl::init(Val: false)); |
49 | |
50 | #ifdef EXPENSIVE_CHECKS |
51 | static cl::opt<bool> VerifyPatternOrder( |
52 | "machine-combiner-verify-pattern-order" , cl::Hidden, |
53 | cl::desc( |
54 | "Verify that the generated patterns are ordered by increasing latency" ), |
55 | cl::init(true)); |
56 | #else |
57 | static cl::opt<bool> VerifyPatternOrder( |
58 | "machine-combiner-verify-pattern-order" , cl::Hidden, |
59 | cl::desc( |
60 | "Verify that the generated patterns are ordered by increasing latency" ), |
61 | cl::init(Val: false)); |
62 | #endif |
63 | |
64 | namespace { |
65 | class MachineCombiner : public MachineFunctionPass { |
66 | const TargetSubtargetInfo *STI = nullptr; |
67 | const TargetInstrInfo *TII = nullptr; |
68 | const TargetRegisterInfo *TRI = nullptr; |
69 | MCSchedModel SchedModel; |
70 | MachineRegisterInfo *MRI = nullptr; |
71 | MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo |
72 | MachineTraceMetrics *Traces = nullptr; |
73 | MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr; |
74 | MachineBlockFrequencyInfo *MBFI = nullptr; |
75 | ProfileSummaryInfo *PSI = nullptr; |
76 | RegisterClassInfo RegClassInfo; |
77 | |
78 | TargetSchedModel TSchedModel; |
79 | |
80 | /// True if optimizing for code size. |
81 | bool OptSize = false; |
82 | |
83 | public: |
84 | static char ID; |
85 | MachineCombiner() : MachineFunctionPass(ID) { |
86 | initializeMachineCombinerPass(*PassRegistry::getPassRegistry()); |
87 | } |
88 | void getAnalysisUsage(AnalysisUsage &AU) const override; |
89 | bool runOnMachineFunction(MachineFunction &MF) override; |
90 | StringRef getPassName() const override { return "Machine InstCombiner" ; } |
91 | |
92 | private: |
93 | bool combineInstructions(MachineBasicBlock *); |
94 | MachineInstr *getOperandDef(const MachineOperand &MO); |
95 | bool isTransientMI(const MachineInstr *MI); |
96 | unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, |
97 | DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, |
98 | MachineTraceMetrics::Trace BlockTrace, |
99 | const MachineBasicBlock &MBB); |
100 | unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot, |
101 | MachineTraceMetrics::Trace BlockTrace); |
102 | bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root, |
103 | MachineTraceMetrics::Trace BlockTrace, |
104 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
105 | SmallVectorImpl<MachineInstr *> &DelInstrs, |
106 | DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, |
107 | unsigned Pattern, bool SlackIsAccurate); |
108 | bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB, |
109 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
110 | SmallVectorImpl<MachineInstr *> &DelInstrs, |
111 | unsigned Pattern); |
112 | bool preservesResourceLen(MachineBasicBlock *MBB, |
113 | MachineTraceMetrics::Trace BlockTrace, |
114 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
115 | SmallVectorImpl<MachineInstr *> &DelInstrs); |
116 | void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs, |
117 | SmallVectorImpl<const MCSchedClassDesc *> &); |
118 | std::pair<unsigned, unsigned> |
119 | getLatenciesForInstrSequences(MachineInstr &MI, |
120 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
121 | SmallVectorImpl<MachineInstr *> &DelInstrs, |
122 | MachineTraceMetrics::Trace BlockTrace); |
123 | |
124 | void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root, |
125 | SmallVector<unsigned, 16> &Patterns); |
126 | CombinerObjective getCombinerObjective(unsigned Pattern); |
127 | }; |
128 | } |
129 | |
130 | char MachineCombiner::ID = 0; |
131 | char &llvm::MachineCombinerID = MachineCombiner::ID; |
132 | |
133 | INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE, |
134 | "Machine InstCombiner" , false, false) |
135 | INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) |
136 | INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) |
137 | INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner" , |
138 | false, false) |
139 | |
140 | void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { |
141 | AU.setPreservesCFG(); |
142 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
143 | AU.addRequired<MachineLoopInfoWrapperPass>(); |
144 | AU.addPreserved<MachineLoopInfoWrapperPass>(); |
145 | AU.addRequired<MachineTraceMetrics>(); |
146 | AU.addPreserved<MachineTraceMetrics>(); |
147 | AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); |
148 | AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
149 | MachineFunctionPass::getAnalysisUsage(AU); |
150 | } |
151 | |
152 | MachineInstr * |
153 | MachineCombiner::getOperandDef(const MachineOperand &MO) { |
154 | MachineInstr *DefInstr = nullptr; |
155 | // We need a virtual register definition. |
156 | if (MO.isReg() && MO.getReg().isVirtual()) |
157 | DefInstr = MRI->getUniqueVRegDef(Reg: MO.getReg()); |
158 | return DefInstr; |
159 | } |
160 | |
161 | /// Return true if MI is unlikely to generate an actual target instruction. |
162 | bool MachineCombiner::isTransientMI(const MachineInstr *MI) { |
163 | if (!MI->isCopy()) |
164 | return MI->isTransient(); |
165 | |
166 | // If MI is a COPY, check if its src and dst registers can be coalesced. |
167 | Register Dst = MI->getOperand(i: 0).getReg(); |
168 | Register Src = MI->getOperand(i: 1).getReg(); |
169 | |
170 | if (!MI->isFullCopy()) { |
171 | // If src RC contains super registers of dst RC, it can also be coalesced. |
172 | if (MI->getOperand(i: 0).getSubReg() || Src.isPhysical() || Dst.isPhysical()) |
173 | return false; |
174 | |
175 | auto SrcSub = MI->getOperand(i: 1).getSubReg(); |
176 | auto SrcRC = MRI->getRegClass(Reg: Src); |
177 | auto DstRC = MRI->getRegClass(Reg: Dst); |
178 | return TRI->getMatchingSuperRegClass(A: SrcRC, B: DstRC, Idx: SrcSub) != nullptr; |
179 | } |
180 | |
181 | if (Src.isPhysical() && Dst.isPhysical()) |
182 | return Src == Dst; |
183 | |
184 | if (Src.isVirtual() && Dst.isVirtual()) { |
185 | auto SrcRC = MRI->getRegClass(Reg: Src); |
186 | auto DstRC = MRI->getRegClass(Reg: Dst); |
187 | return SrcRC->hasSuperClassEq(RC: DstRC) || SrcRC->hasSubClassEq(RC: DstRC); |
188 | } |
189 | |
190 | if (Src.isVirtual()) |
191 | std::swap(a&: Src, b&: Dst); |
192 | |
193 | // Now Src is physical register, Dst is virtual register. |
194 | auto DstRC = MRI->getRegClass(Reg: Dst); |
195 | return DstRC->contains(Reg: Src); |
196 | } |
197 | |
198 | /// Computes depth of instructions in vector \InsInstr. |
199 | /// |
200 | /// \param InsInstrs is a vector of machine instructions |
201 | /// \param InstrIdxForVirtReg is a dense map of virtual register to index |
202 | /// of defining machine instruction in \p InsInstrs |
203 | /// \param BlockTrace is a trace of machine instructions |
204 | /// |
205 | /// \returns Depth of last instruction in \InsInstrs ("NewRoot") |
206 | unsigned |
207 | MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, |
208 | DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, |
209 | MachineTraceMetrics::Trace BlockTrace, |
210 | const MachineBasicBlock &MBB) { |
211 | SmallVector<unsigned, 16> InstrDepth; |
212 | // For each instruction in the new sequence compute the depth based on the |
213 | // operands. Use the trace information when possible. For new operands which |
214 | // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth |
215 | for (auto *InstrPtr : InsInstrs) { // for each Use |
216 | unsigned IDepth = 0; |
217 | for (const MachineOperand &MO : InstrPtr->all_uses()) { |
218 | // Check for virtual register operand. |
219 | if (!MO.getReg().isVirtual()) |
220 | continue; |
221 | unsigned DepthOp = 0; |
222 | unsigned LatencyOp = 0; |
223 | DenseMap<unsigned, unsigned>::iterator II = |
224 | InstrIdxForVirtReg.find(Val: MO.getReg()); |
225 | if (II != InstrIdxForVirtReg.end()) { |
226 | // Operand is new virtual register not in trace |
227 | assert(II->second < InstrDepth.size() && "Bad Index" ); |
228 | MachineInstr *DefInstr = InsInstrs[II->second]; |
229 | assert(DefInstr && |
230 | "There must be a definition for a new virtual register" ); |
231 | DepthOp = InstrDepth[II->second]; |
232 | int DefIdx = |
233 | DefInstr->findRegisterDefOperandIdx(Reg: MO.getReg(), /*TRI=*/nullptr); |
234 | int UseIdx = |
235 | InstrPtr->findRegisterUseOperandIdx(Reg: MO.getReg(), /*TRI=*/nullptr); |
236 | LatencyOp = TSchedModel.computeOperandLatency(DefMI: DefInstr, DefOperIdx: DefIdx, |
237 | UseMI: InstrPtr, UseOperIdx: UseIdx); |
238 | } else { |
239 | MachineInstr *DefInstr = getOperandDef(MO); |
240 | if (DefInstr && (TII->getMachineCombinerTraceStrategy() != |
241 | MachineTraceStrategy::TS_Local || |
242 | DefInstr->getParent() == &MBB)) { |
243 | DepthOp = BlockTrace.getInstrCycles(MI: *DefInstr).Depth; |
244 | if (!isTransientMI(MI: DefInstr)) |
245 | LatencyOp = TSchedModel.computeOperandLatency( |
246 | DefMI: DefInstr, |
247 | DefOperIdx: DefInstr->findRegisterDefOperandIdx(Reg: MO.getReg(), |
248 | /*TRI=*/nullptr), |
249 | UseMI: InstrPtr, |
250 | UseOperIdx: InstrPtr->findRegisterUseOperandIdx(Reg: MO.getReg(), |
251 | /*TRI=*/nullptr)); |
252 | } |
253 | } |
254 | IDepth = std::max(a: IDepth, b: DepthOp + LatencyOp); |
255 | } |
256 | InstrDepth.push_back(Elt: IDepth); |
257 | } |
258 | unsigned NewRootIdx = InsInstrs.size() - 1; |
259 | return InstrDepth[NewRootIdx]; |
260 | } |
261 | |
262 | /// Computes instruction latency as max of latency of defined operands. |
263 | /// |
264 | /// \param Root is a machine instruction that could be replaced by NewRoot. |
265 | /// It is used to compute a more accurate latency information for NewRoot in |
266 | /// case there is a dependent instruction in the same trace (\p BlockTrace) |
267 | /// \param NewRoot is the instruction for which the latency is computed |
268 | /// \param BlockTrace is a trace of machine instructions |
269 | /// |
270 | /// \returns Latency of \p NewRoot |
271 | unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, |
272 | MachineTraceMetrics::Trace BlockTrace) { |
273 | // Check each definition in NewRoot and compute the latency |
274 | unsigned NewRootLatency = 0; |
275 | |
276 | for (const MachineOperand &MO : NewRoot->all_defs()) { |
277 | // Check for virtual register operand. |
278 | if (!MO.getReg().isVirtual()) |
279 | continue; |
280 | // Get the first instruction that uses MO |
281 | MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(RegNo: MO.getReg()); |
282 | RI++; |
283 | if (RI == MRI->reg_end()) |
284 | continue; |
285 | MachineInstr *UseMO = RI->getParent(); |
286 | unsigned LatencyOp = 0; |
287 | if (UseMO && BlockTrace.isDepInTrace(DefMI: *Root, UseMI: *UseMO)) { |
288 | LatencyOp = TSchedModel.computeOperandLatency( |
289 | DefMI: NewRoot, |
290 | DefOperIdx: NewRoot->findRegisterDefOperandIdx(Reg: MO.getReg(), /*TRI=*/nullptr), |
291 | UseMI: UseMO, |
292 | UseOperIdx: UseMO->findRegisterUseOperandIdx(Reg: MO.getReg(), /*TRI=*/nullptr)); |
293 | } else { |
294 | LatencyOp = TSchedModel.computeInstrLatency(MI: NewRoot); |
295 | } |
296 | NewRootLatency = std::max(a: NewRootLatency, b: LatencyOp); |
297 | } |
298 | return NewRootLatency; |
299 | } |
300 | |
301 | CombinerObjective MachineCombiner::getCombinerObjective(unsigned Pattern) { |
302 | // TODO: If C++ ever gets a real enum class, make this part of the |
303 | // MachineCombinerPattern class. |
304 | switch (Pattern) { |
305 | case MachineCombinerPattern::REASSOC_AX_BY: |
306 | case MachineCombinerPattern::REASSOC_AX_YB: |
307 | case MachineCombinerPattern::REASSOC_XA_BY: |
308 | case MachineCombinerPattern::REASSOC_XA_YB: |
309 | return CombinerObjective::MustReduceDepth; |
310 | default: |
311 | return TII->getCombinerObjective(Pattern); |
312 | } |
313 | } |
314 | |
315 | /// Estimate the latency of the new and original instruction sequence by summing |
316 | /// up the latencies of the inserted and deleted instructions. This assumes |
317 | /// that the inserted and deleted instructions are dependent instruction chains, |
318 | /// which might not hold in all cases. |
319 | std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences( |
320 | MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs, |
321 | SmallVectorImpl<MachineInstr *> &DelInstrs, |
322 | MachineTraceMetrics::Trace BlockTrace) { |
323 | assert(!InsInstrs.empty() && "Only support sequences that insert instrs." ); |
324 | unsigned NewRootLatency = 0; |
325 | // NewRoot is the last instruction in the \p InsInstrs vector. |
326 | MachineInstr *NewRoot = InsInstrs.back(); |
327 | for (unsigned i = 0; i < InsInstrs.size() - 1; i++) |
328 | NewRootLatency += TSchedModel.computeInstrLatency(MI: InsInstrs[i]); |
329 | NewRootLatency += getLatency(Root: &MI, NewRoot, BlockTrace); |
330 | |
331 | unsigned RootLatency = 0; |
332 | for (auto *I : DelInstrs) |
333 | RootLatency += TSchedModel.computeInstrLatency(MI: I); |
334 | |
335 | return {NewRootLatency, RootLatency}; |
336 | } |
337 | |
338 | bool MachineCombiner::reduceRegisterPressure( |
339 | MachineInstr &Root, MachineBasicBlock *MBB, |
340 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
341 | SmallVectorImpl<MachineInstr *> &DelInstrs, unsigned Pattern) { |
342 | // FIXME: for now, we don't do any check for the register pressure patterns. |
343 | // We treat them as always profitable. But we can do better if we make |
344 | // RegPressureTracker class be aware of TIE attribute. Then we can get an |
345 | // accurate compare of register pressure with DelInstrs or InsInstrs. |
346 | return true; |
347 | } |
348 | |
349 | /// The DAGCombine code sequence ends in MI (Machine Instruction) Root. |
350 | /// The new code sequence ends in MI NewRoot. A necessary condition for the new |
351 | /// sequence to replace the old sequence is that it cannot lengthen the critical |
352 | /// path. The definition of "improve" may be restricted by specifying that the |
353 | /// new path improves the data dependency chain (MustReduceDepth). |
354 | bool MachineCombiner::improvesCriticalPathLen( |
355 | MachineBasicBlock *MBB, MachineInstr *Root, |
356 | MachineTraceMetrics::Trace BlockTrace, |
357 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
358 | SmallVectorImpl<MachineInstr *> &DelInstrs, |
359 | DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned Pattern, |
360 | bool SlackIsAccurate) { |
361 | // Get depth and latency of NewRoot and Root. |
362 | unsigned NewRootDepth = |
363 | getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, MBB: *MBB); |
364 | unsigned RootDepth = BlockTrace.getInstrCycles(MI: *Root).Depth; |
365 | |
366 | LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: " |
367 | << NewRootDepth << "\tRootDepth: " << RootDepth); |
368 | |
369 | // For a transform such as reassociation, the cost equation is |
370 | // conservatively calculated so that we must improve the depth (data |
371 | // dependency cycles) in the critical path to proceed with the transform. |
372 | // Being conservative also protects against inaccuracies in the underlying |
373 | // machine trace metrics and CPU models. |
374 | if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) { |
375 | LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth " ); |
376 | LLVM_DEBUG(NewRootDepth < RootDepth |
377 | ? dbgs() << "\t and it does it\n" |
378 | : dbgs() << "\t but it does NOT do it\n" ); |
379 | return NewRootDepth < RootDepth; |
380 | } |
381 | |
382 | // A more flexible cost calculation for the critical path includes the slack |
383 | // of the original code sequence. This may allow the transform to proceed |
384 | // even if the instruction depths (data dependency cycles) become worse. |
385 | |
386 | // Account for the latency of the inserted and deleted instructions by |
387 | unsigned NewRootLatency, RootLatency; |
388 | if (TII->accumulateInstrSeqToRootLatency(Root&: *Root)) { |
389 | std::tie(args&: NewRootLatency, args&: RootLatency) = |
390 | getLatenciesForInstrSequences(MI&: *Root, InsInstrs, DelInstrs, BlockTrace); |
391 | } else { |
392 | NewRootLatency = TSchedModel.computeInstrLatency(MI: InsInstrs.back()); |
393 | RootLatency = TSchedModel.computeInstrLatency(MI: Root); |
394 | } |
395 | |
396 | unsigned RootSlack = BlockTrace.getInstrSlack(MI: *Root); |
397 | unsigned NewCycleCount = NewRootDepth + NewRootLatency; |
398 | unsigned OldCycleCount = |
399 | RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0); |
400 | LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency |
401 | << "\tRootLatency: " << RootLatency << "\n\tRootSlack: " |
402 | << RootSlack << " SlackIsAccurate=" << SlackIsAccurate |
403 | << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount |
404 | << "\n\tRootDepth + RootLatency + RootSlack = " |
405 | << OldCycleCount;); |
406 | LLVM_DEBUG(NewCycleCount <= OldCycleCount |
407 | ? dbgs() << "\n\t It IMPROVES PathLen because" |
408 | : dbgs() << "\n\t It DOES NOT improve PathLen because" ); |
409 | LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount |
410 | << ", OldCycleCount = " << OldCycleCount << "\n" ); |
411 | |
412 | return NewCycleCount <= OldCycleCount; |
413 | } |
414 | |
415 | /// helper routine to convert instructions into SC |
416 | void MachineCombiner::instr2instrSC( |
417 | SmallVectorImpl<MachineInstr *> &Instrs, |
418 | SmallVectorImpl<const MCSchedClassDesc *> &) { |
419 | for (auto *InstrPtr : Instrs) { |
420 | unsigned Opc = InstrPtr->getOpcode(); |
421 | unsigned Idx = TII->get(Opcode: Opc).getSchedClass(); |
422 | const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(SchedClassIdx: Idx); |
423 | InstrsSC.push_back(Elt: SC); |
424 | } |
425 | } |
426 | |
427 | /// True when the new instructions do not increase resource length |
428 | bool MachineCombiner::preservesResourceLen( |
429 | MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, |
430 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
431 | SmallVectorImpl<MachineInstr *> &DelInstrs) { |
432 | if (!TSchedModel.hasInstrSchedModel()) |
433 | return true; |
434 | |
435 | // Compute current resource length |
436 | |
437 | //ArrayRef<const MachineBasicBlock *> MBBarr(MBB); |
438 | SmallVector <const MachineBasicBlock *, 1> MBBarr; |
439 | MBBarr.push_back(Elt: MBB); |
440 | unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(Extrablocks: MBBarr); |
441 | |
442 | // Deal with SC rather than Instructions. |
443 | SmallVector<const MCSchedClassDesc *, 16> ; |
444 | SmallVector<const MCSchedClassDesc *, 16> ; |
445 | |
446 | instr2instrSC(Instrs&: InsInstrs, InstrsSC&: InsInstrsSC); |
447 | instr2instrSC(Instrs&: DelInstrs, InstrsSC&: DelInstrsSC); |
448 | |
449 | ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC}; |
450 | ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC}; |
451 | |
452 | // Compute new resource length. |
453 | unsigned ResLenAfterCombine = |
454 | BlockTrace.getResourceLength(Extrablocks: MBBarr, ExtraInstrs: MSCInsArr, RemoveInstrs: MSCDelArr); |
455 | |
456 | LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: " |
457 | << ResLenBeforeCombine |
458 | << " and after: " << ResLenAfterCombine << "\n" ;); |
459 | LLVM_DEBUG( |
460 | ResLenAfterCombine <= |
461 | ResLenBeforeCombine + TII->getExtendResourceLenLimit() |
462 | ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n" |
463 | : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource " |
464 | "Length\n" ); |
465 | |
466 | return ResLenAfterCombine <= |
467 | ResLenBeforeCombine + TII->getExtendResourceLenLimit(); |
468 | } |
469 | |
470 | /// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction |
471 | /// depths if requested. |
472 | /// |
473 | /// \param MBB basic block to insert instructions in |
474 | /// \param MI current machine instruction |
475 | /// \param InsInstrs new instructions to insert in \p MBB |
476 | /// \param DelInstrs instruction to delete from \p MBB |
477 | /// \param TraceEnsemble is a pointer to the machine trace information |
478 | /// \param RegUnits set of live registers, needed to compute instruction depths |
479 | /// \param TII is target instruction info, used to call target hook |
480 | /// \param Pattern is used to call target hook finalizeInsInstrs |
481 | /// \param IncrementalUpdate if true, compute instruction depths incrementally, |
482 | /// otherwise invalidate the trace |
483 | static void |
484 | insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, |
485 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
486 | SmallVectorImpl<MachineInstr *> &DelInstrs, |
487 | MachineTraceMetrics::Ensemble *TraceEnsemble, |
488 | SparseSet<LiveRegUnit> &RegUnits, |
489 | const TargetInstrInfo *TII, unsigned Pattern, |
490 | bool IncrementalUpdate) { |
491 | // If we want to fix up some placeholder for some target, do it now. |
492 | // We need this because in genAlternativeCodeSequence, we have not decided the |
493 | // better pattern InsInstrs or DelInstrs, so we don't want generate some |
494 | // sideeffect to the function. For example we need to delay the constant pool |
495 | // entry creation here after InsInstrs is selected as better pattern. |
496 | // Otherwise the constant pool entry created for InsInstrs will not be deleted |
497 | // even if InsInstrs is not the better pattern. |
498 | TII->finalizeInsInstrs(Root&: MI, Pattern, InsInstrs); |
499 | |
500 | for (auto *InstrPtr : InsInstrs) |
501 | MBB->insert(I: (MachineBasicBlock::iterator)&MI, MI: InstrPtr); |
502 | |
503 | for (auto *InstrPtr : DelInstrs) { |
504 | InstrPtr->eraseFromParent(); |
505 | // Erase all LiveRegs defined by the removed instruction |
506 | for (auto *I = RegUnits.begin(); I != RegUnits.end();) { |
507 | if (I->MI == InstrPtr) |
508 | I = RegUnits.erase(I); |
509 | else |
510 | I++; |
511 | } |
512 | } |
513 | |
514 | if (IncrementalUpdate) |
515 | for (auto *InstrPtr : InsInstrs) |
516 | TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits); |
517 | else |
518 | TraceEnsemble->invalidate(MBB); |
519 | |
520 | NumInstCombined++; |
521 | } |
522 | |
523 | // Check that the difference between original and new latency is decreasing for |
524 | // later patterns. This helps to discover sub-optimal pattern orderings. |
525 | void MachineCombiner::verifyPatternOrder(MachineBasicBlock *MBB, |
526 | MachineInstr &Root, |
527 | SmallVector<unsigned, 16> &Patterns) { |
528 | long PrevLatencyDiff = std::numeric_limits<long>::max(); |
529 | (void)PrevLatencyDiff; // Variable is used in assert only. |
530 | for (auto P : Patterns) { |
531 | SmallVector<MachineInstr *, 16> InsInstrs; |
532 | SmallVector<MachineInstr *, 16> DelInstrs; |
533 | DenseMap<unsigned, unsigned> InstrIdxForVirtReg; |
534 | TII->genAlternativeCodeSequence(Root, Pattern: P, InsInstrs, DelInstrs, |
535 | InstIdxForVirtReg&: InstrIdxForVirtReg); |
536 | // Found pattern, but did not generate alternative sequence. |
537 | // This can happen e.g. when an immediate could not be materialized |
538 | // in a single instruction. |
539 | if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries()) |
540 | continue; |
541 | |
542 | unsigned NewRootLatency, RootLatency; |
543 | std::tie(args&: NewRootLatency, args&: RootLatency) = getLatenciesForInstrSequences( |
544 | MI&: Root, InsInstrs, DelInstrs, BlockTrace: TraceEnsemble->getTrace(MBB)); |
545 | long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency); |
546 | assert(CurrentLatencyDiff <= PrevLatencyDiff && |
547 | "Current pattern is better than previous pattern." ); |
548 | PrevLatencyDiff = CurrentLatencyDiff; |
549 | } |
550 | } |
551 | |
552 | /// Substitute a slow code sequence with a faster one by |
553 | /// evaluating instruction combining pattern. |
554 | /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction |
555 | /// combining based on machine trace metrics. Only combine a sequence of |
556 | /// instructions when this neither lengthens the critical path nor increases |
557 | /// resource pressure. When optimizing for codesize always combine when the new |
558 | /// sequence is shorter. |
559 | bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { |
560 | bool Changed = false; |
561 | LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n" ); |
562 | |
563 | bool IncrementalUpdate = false; |
564 | auto BlockIter = MBB->begin(); |
565 | decltype(BlockIter) LastUpdate; |
566 | // Check if the block is in a loop. |
567 | const MachineLoop *ML = MLI->getLoopFor(BB: MBB); |
568 | if (!TraceEnsemble) |
569 | TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy()); |
570 | |
571 | SparseSet<LiveRegUnit> RegUnits; |
572 | RegUnits.setUniverse(TRI->getNumRegUnits()); |
573 | |
574 | bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI); |
575 | |
576 | bool DoRegPressureReduce = |
577 | TII->shouldReduceRegisterPressure(MBB, RegClassInfo: &RegClassInfo); |
578 | |
579 | while (BlockIter != MBB->end()) { |
580 | auto &MI = *BlockIter++; |
581 | SmallVector<unsigned, 16> Patterns; |
582 | // The motivating example is: |
583 | // |
584 | // MUL Other MUL_op1 MUL_op2 Other |
585 | // \ / \ | / |
586 | // ADD/SUB => MADD/MSUB |
587 | // (=Root) (=NewRoot) |
588 | |
589 | // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is |
590 | // usually beneficial for code size it unfortunately can hurt performance |
591 | // when the ADD is on the critical path, but the MUL is not. With the |
592 | // substitution the MUL becomes part of the critical path (in form of the |
593 | // MADD) and can lengthen it on architectures where the MADD latency is |
594 | // longer than the ADD latency. |
595 | // |
596 | // For each instruction we check if it can be the root of a combiner |
597 | // pattern. Then for each pattern the new code sequence in form of MI is |
598 | // generated and evaluated. When the efficiency criteria (don't lengthen |
599 | // critical path, don't use more resources) is met the new sequence gets |
600 | // hooked up into the basic block before the old sequence is removed. |
601 | // |
602 | // The algorithm does not try to evaluate all patterns and pick the best. |
603 | // This is only an artificial restriction though. In practice there is |
604 | // mostly one pattern, and getMachineCombinerPatterns() can order patterns |
605 | // based on an internal cost heuristic. If |
606 | // machine-combiner-verify-pattern-order is enabled, all patterns are |
607 | // checked to ensure later patterns do not provide better latency savings. |
608 | |
609 | if (!TII->getMachineCombinerPatterns(Root&: MI, Patterns, DoRegPressureReduce)) |
610 | continue; |
611 | |
612 | if (VerifyPatternOrder) |
613 | verifyPatternOrder(MBB, Root&: MI, Patterns); |
614 | |
615 | for (const auto P : Patterns) { |
616 | SmallVector<MachineInstr *, 16> InsInstrs; |
617 | SmallVector<MachineInstr *, 16> DelInstrs; |
618 | DenseMap<unsigned, unsigned> InstrIdxForVirtReg; |
619 | TII->genAlternativeCodeSequence(Root&: MI, Pattern: P, InsInstrs, DelInstrs, |
620 | InstIdxForVirtReg&: InstrIdxForVirtReg); |
621 | // Found pattern, but did not generate alternative sequence. |
622 | // This can happen e.g. when an immediate could not be materialized |
623 | // in a single instruction. |
624 | if (InsInstrs.empty()) |
625 | continue; |
626 | |
627 | LLVM_DEBUG(if (dump_intrs) { |
628 | dbgs() << "\tFor the Pattern (" << (int)P |
629 | << ") these instructions could be removed\n" ; |
630 | for (auto const *InstrPtr : DelInstrs) |
631 | InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false, |
632 | /*SkipDebugLoc*/false, /*AddNewLine*/true, TII); |
633 | dbgs() << "\tThese instructions could replace the removed ones\n" ; |
634 | for (auto const *InstrPtr : InsInstrs) |
635 | InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false, |
636 | /*SkipDebugLoc*/false, /*AddNewLine*/true, TII); |
637 | }); |
638 | |
639 | if (IncrementalUpdate && LastUpdate != BlockIter) { |
640 | // Update depths since the last incremental update. |
641 | TraceEnsemble->updateDepths(Start: LastUpdate, End: BlockIter, RegUnits); |
642 | LastUpdate = BlockIter; |
643 | } |
644 | |
645 | if (DoRegPressureReduce && |
646 | getCombinerObjective(Pattern: P) == |
647 | CombinerObjective::MustReduceRegisterPressure) { |
648 | if (MBB->size() > inc_threshold) { |
649 | // Use incremental depth updates for basic blocks above threshold |
650 | IncrementalUpdate = true; |
651 | LastUpdate = BlockIter; |
652 | } |
653 | if (reduceRegisterPressure(Root&: MI, MBB, InsInstrs, DelInstrs, Pattern: P)) { |
654 | // Replace DelInstrs with InsInstrs. |
655 | insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble, |
656 | RegUnits, TII, Pattern: P, IncrementalUpdate); |
657 | Changed |= true; |
658 | |
659 | // Go back to previous instruction as it may have ILP reassociation |
660 | // opportunity. |
661 | BlockIter--; |
662 | break; |
663 | } |
664 | } |
665 | |
666 | if (ML && TII->isThroughputPattern(Pattern: P)) { |
667 | LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n" ); |
668 | insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble, |
669 | RegUnits, TII, Pattern: P, IncrementalUpdate); |
670 | // Eagerly stop after the first pattern fires. |
671 | Changed = true; |
672 | break; |
673 | } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) { |
674 | LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize (" |
675 | << InsInstrs.size() << " < " |
676 | << DelInstrs.size() << ")\n" ); |
677 | insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble, |
678 | RegUnits, TII, Pattern: P, IncrementalUpdate); |
679 | // Eagerly stop after the first pattern fires. |
680 | Changed = true; |
681 | break; |
682 | } else { |
683 | // For big basic blocks, we only compute the full trace the first time |
684 | // we hit this. We do not invalidate the trace, but instead update the |
685 | // instruction depths incrementally. |
686 | // NOTE: Only the instruction depths up to MI are accurate. All other |
687 | // trace information is not updated. |
688 | MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB); |
689 | Traces->verifyAnalysis(); |
690 | if (improvesCriticalPathLen(MBB, Root: &MI, BlockTrace, InsInstrs, DelInstrs, |
691 | InstrIdxForVirtReg, Pattern: P, |
692 | SlackIsAccurate: !IncrementalUpdate) && |
693 | preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) { |
694 | if (MBB->size() > inc_threshold) { |
695 | // Use incremental depth updates for basic blocks above treshold |
696 | IncrementalUpdate = true; |
697 | LastUpdate = BlockIter; |
698 | } |
699 | |
700 | insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble, |
701 | RegUnits, TII, Pattern: P, IncrementalUpdate); |
702 | |
703 | // Eagerly stop after the first pattern fires. |
704 | Changed = true; |
705 | break; |
706 | } |
707 | // Cleanup instructions of the alternative code sequence. There is no |
708 | // use for them. |
709 | MachineFunction *MF = MBB->getParent(); |
710 | for (auto *InstrPtr : InsInstrs) |
711 | MF->deleteMachineInstr(MI: InstrPtr); |
712 | } |
713 | InstrIdxForVirtReg.clear(); |
714 | } |
715 | } |
716 | |
717 | if (Changed && IncrementalUpdate) |
718 | Traces->invalidate(MBB); |
719 | return Changed; |
720 | } |
721 | |
722 | bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { |
723 | STI = &MF.getSubtarget(); |
724 | TII = STI->getInstrInfo(); |
725 | TRI = STI->getRegisterInfo(); |
726 | SchedModel = STI->getSchedModel(); |
727 | TSchedModel.init(TSInfo: STI); |
728 | MRI = &MF.getRegInfo(); |
729 | MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); |
730 | Traces = &getAnalysis<MachineTraceMetrics>(); |
731 | PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
732 | MBFI = (PSI && PSI->hasProfileSummary()) ? |
733 | &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() : |
734 | nullptr; |
735 | TraceEnsemble = nullptr; |
736 | OptSize = MF.getFunction().hasOptSize(); |
737 | RegClassInfo.runOnMachineFunction(MF); |
738 | |
739 | LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n'); |
740 | if (!TII->useMachineCombiner()) { |
741 | LLVM_DEBUG( |
742 | dbgs() |
743 | << " Skipping pass: Target does not support machine combiner\n" ); |
744 | return false; |
745 | } |
746 | |
747 | bool Changed = false; |
748 | |
749 | // Try to combine instructions. |
750 | for (auto &MBB : MF) |
751 | Changed |= combineInstructions(MBB: &MBB); |
752 | |
753 | return Changed; |
754 | } |
755 | |