1 | //===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // The machine combiner pass uses machine trace metrics to ensure the combined |
10 | // instructions do not lengthen the critical path or the resource depth. |
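//
// A motivating example is combining a multiply and an add into a fused
// multiply-add (MUL + ADD -> MADD): the substitution is applied only when it
// does not make the dependent instruction chain longer.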
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/ADT/DenseMap.h" |
14 | #include "llvm/ADT/Statistic.h" |
15 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
16 | #include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" |
17 | #include "llvm/CodeGen/MachineCombinerPattern.h" |
18 | #include "llvm/CodeGen/MachineDominators.h" |
19 | #include "llvm/CodeGen/MachineFunction.h" |
20 | #include "llvm/CodeGen/MachineFunctionPass.h" |
21 | #include "llvm/CodeGen/MachineLoopInfo.h" |
22 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
23 | #include "llvm/CodeGen/MachineSizeOpts.h" |
24 | #include "llvm/CodeGen/MachineTraceMetrics.h" |
25 | #include "llvm/CodeGen/RegisterClassInfo.h" |
26 | #include "llvm/CodeGen/TargetInstrInfo.h" |
27 | #include "llvm/CodeGen/TargetRegisterInfo.h" |
28 | #include "llvm/CodeGen/TargetSchedule.h" |
29 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
30 | #include "llvm/InitializePasses.h" |
31 | #include "llvm/Support/CommandLine.h" |
32 | #include "llvm/Support/Debug.h" |
33 | #include "llvm/Support/raw_ostream.h" |
34 | |
35 | using namespace llvm; |
36 | |
37 | #define DEBUG_TYPE "machine-combiner" |
38 | |
STATISTIC(NumInstCombined, "Number of machine insts combined");
40 | |
static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions than this threshold."),
              cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instrs"),
                                cl::init(false));
49 | |
50 | #ifdef EXPENSIVE_CHECKS |
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
62 | #endif |
63 | |
64 | namespace { |
65 | class MachineCombiner : public MachineFunctionPass { |
66 | const TargetSubtargetInfo *STI = nullptr; |
67 | const TargetInstrInfo *TII = nullptr; |
68 | const TargetRegisterInfo *TRI = nullptr; |
69 | MCSchedModel SchedModel; |
70 | MachineRegisterInfo *MRI = nullptr; |
71 | MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo |
72 | MachineTraceMetrics *Traces = nullptr; |
73 | MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr; |
74 | MachineBlockFrequencyInfo *MBFI = nullptr; |
75 | ProfileSummaryInfo *PSI = nullptr; |
76 | RegisterClassInfo RegClassInfo; |
77 | |
78 | TargetSchedModel TSchedModel; |
79 | |
80 | public: |
81 | static char ID; |
82 | MachineCombiner() : MachineFunctionPass(ID) { |
83 | initializeMachineCombinerPass(*PassRegistry::getPassRegistry()); |
84 | } |
85 | void getAnalysisUsage(AnalysisUsage &AU) const override; |
86 | bool runOnMachineFunction(MachineFunction &MF) override; |
  StringRef getPassName() const override { return "Machine InstCombiner"; }
88 | |
89 | private: |
90 | bool combineInstructions(MachineBasicBlock *); |
91 | MachineInstr *getOperandDef(const MachineOperand &MO); |
92 | bool isTransientMI(const MachineInstr *MI); |
93 | unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, |
94 | DenseMap<Register, unsigned> &InstrIdxForVirtReg, |
95 | MachineTraceMetrics::Trace BlockTrace, |
96 | const MachineBasicBlock &MBB); |
97 | unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot, |
98 | MachineTraceMetrics::Trace BlockTrace); |
99 | bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root, |
100 | MachineTraceMetrics::Trace BlockTrace, |
101 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
102 | SmallVectorImpl<MachineInstr *> &DelInstrs, |
103 | DenseMap<Register, unsigned> &InstrIdxForVirtReg, |
104 | unsigned Pattern, bool SlackIsAccurate); |
105 | bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB, |
106 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
107 | SmallVectorImpl<MachineInstr *> &DelInstrs, |
108 | unsigned Pattern); |
109 | bool preservesResourceLen(MachineBasicBlock *MBB, |
110 | MachineTraceMetrics::Trace BlockTrace, |
111 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
112 | SmallVectorImpl<MachineInstr *> &DelInstrs); |
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
115 | std::pair<unsigned, unsigned> |
116 | getLatenciesForInstrSequences(MachineInstr &MI, |
117 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
118 | SmallVectorImpl<MachineInstr *> &DelInstrs, |
119 | MachineTraceMetrics::Trace BlockTrace); |
120 | |
121 | void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root, |
122 | SmallVector<unsigned, 16> &Patterns); |
123 | CombinerObjective getCombinerObjective(unsigned Pattern); |
124 | }; |
125 | } |
126 | |
127 | char MachineCombiner::ID = 0; |
128 | char &llvm::MachineCombinerID = MachineCombiner::ID; |
129 | |
130 | INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE, |
131 | "Machine InstCombiner" , false, false) |
132 | INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) |
133 | INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass) |
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
135 | false, false) |
136 | |
137 | void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { |
138 | AU.setPreservesCFG(); |
139 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
140 | AU.addRequired<MachineLoopInfoWrapperPass>(); |
141 | AU.addPreserved<MachineLoopInfoWrapperPass>(); |
142 | AU.addRequired<MachineTraceMetricsWrapperPass>(); |
143 | AU.addPreserved<MachineTraceMetricsWrapperPass>(); |
144 | AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); |
145 | AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
146 | MachineFunctionPass::getAnalysisUsage(AU); |
147 | } |
148 | |
149 | MachineInstr * |
150 | MachineCombiner::getOperandDef(const MachineOperand &MO) { |
151 | MachineInstr *DefInstr = nullptr; |
152 | // We need a virtual register definition. |
153 | if (MO.isReg() && MO.getReg().isVirtual()) |
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
155 | return DefInstr; |
156 | } |
157 | |
158 | /// Return true if MI is unlikely to generate an actual target instruction. |
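/// For example, a full COPY between two virtual registers of compatible
/// register classes is likely to be coalesced away and so costs no cycles.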
159 | bool MachineCombiner::isTransientMI(const MachineInstr *MI) { |
160 | if (!MI->isCopy()) |
161 | return MI->isTransient(); |
162 | |
163 | // If MI is a COPY, check if its src and dst registers can be coalesced. |
  Register Dst = MI->getOperand(0).getReg();
  Register Src = MI->getOperand(1).getReg();
166 | |
167 | if (!MI->isFullCopy()) { |
168 | // If src RC contains super registers of dst RC, it can also be coalesced. |
    if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
170 | return false; |
171 | |
    auto SrcSub = MI->getOperand(1).getSubReg();
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
176 | } |
177 | |
178 | if (Src.isPhysical() && Dst.isPhysical()) |
179 | return Src == Dst; |
180 | |
181 | if (Src.isVirtual() && Dst.isVirtual()) { |
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
185 | } |
186 | |
187 | if (Src.isVirtual()) |
    std::swap(Src, Dst);
189 | |
  // Now Src is a physical register and Dst is a virtual register.
  auto DstRC = MRI->getRegClass(Dst);
  return DstRC->contains(Src);
193 | } |
194 | |
/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
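///
/// Conceptually, for each new instruction I this computes
///   Depth(I) = max over used virtual registers R of
///              Depth(Def(R)) + Latency(Def(R) -> I)
/// where the depth of a definition comes from \p InstrIdxForVirtReg for
/// registers defined by the new sequence, and from \p BlockTrace otherwise.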
203 | unsigned |
204 | MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, |
205 | DenseMap<Register, unsigned> &InstrIdxForVirtReg, |
206 | MachineTraceMetrics::Trace BlockTrace, |
207 | const MachineBasicBlock &MBB) { |
208 | SmallVector<unsigned, 16> InstrDepth; |
209 | // For each instruction in the new sequence compute the depth based on the |
  // operands. Use the trace information when possible. For new operands that
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
212 | for (auto *InstrPtr : InsInstrs) { // for each Use |
213 | unsigned IDepth = 0; |
214 | for (const MachineOperand &MO : InstrPtr->all_uses()) { |
215 | // Check for virtual register operand. |
216 | if (!MO.getReg().isVirtual()) |
217 | continue; |
218 | unsigned DepthOp = 0; |
219 | unsigned LatencyOp = 0; |
      DenseMap<Register, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register that is not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx =
            DefInstr->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr);
        int UseIdx =
            InstrPtr->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr);
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
235 | } else { |
236 | MachineInstr *DefInstr = getOperandDef(MO); |
237 | if (DefInstr && (TII->getMachineCombinerTraceStrategy() != |
238 | MachineTraceStrategy::TS_Local || |
239 | DefInstr->getParent() == &MBB)) { |
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          if (!isTransientMI(DefInstr))
            LatencyOp = TSchedModel.computeOperandLatency(
                DefInstr,
                DefInstr->findRegisterDefOperandIdx(MO.getReg(),
                                                    /*TRI=*/nullptr),
                InstrPtr,
                InstrPtr->findRegisterUseOperandIdx(MO.getReg(),
                                                    /*TRI=*/nullptr));
249 | } |
250 | } |
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
254 | } |
255 | unsigned NewRootIdx = InsInstrs.size() - 1; |
256 | return InstrDepth[NewRootIdx]; |
257 | } |
258 | |
259 | /// Computes instruction latency as max of latency of defined operands. |
260 | /// |
261 | /// \param Root is a machine instruction that could be replaced by NewRoot. |
262 | /// It is used to compute a more accurate latency information for NewRoot in |
263 | /// case there is a dependent instruction in the same trace (\p BlockTrace) |
264 | /// \param NewRoot is the instruction for which the latency is computed |
265 | /// \param BlockTrace is a trace of machine instructions |
266 | /// |
267 | /// \returns Latency of \p NewRoot |
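///
/// Conceptually this is
///   Latency(NewRoot) = max over defined registers R of the operand latency
/// from \p NewRoot to the first user of R found in \p BlockTrace, falling
/// back to the worst-case instruction latency when no in-trace dependence is
/// found.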
268 | unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, |
269 | MachineTraceMetrics::Trace BlockTrace) { |
270 | // Check each definition in NewRoot and compute the latency |
271 | unsigned NewRootLatency = 0; |
272 | |
273 | for (const MachineOperand &MO : NewRoot->all_defs()) { |
274 | // Check for virtual register operand. |
275 | if (!MO.getReg().isVirtual()) |
276 | continue; |
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
279 | RI++; |
280 | if (RI == MRI->reg_end()) |
281 | continue; |
282 | MachineInstr *UseMO = RI->getParent(); |
283 | unsigned LatencyOp = 0; |
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot,
          NewRoot->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr),
          UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
294 | } |
295 | return NewRootLatency; |
296 | } |
297 | |
298 | CombinerObjective MachineCombiner::getCombinerObjective(unsigned Pattern) { |
299 | // TODO: If C++ ever gets a real enum class, make this part of the |
300 | // MachineCombinerPattern class. |
301 | switch (Pattern) { |
302 | case MachineCombinerPattern::REASSOC_AX_BY: |
303 | case MachineCombinerPattern::REASSOC_AX_YB: |
304 | case MachineCombinerPattern::REASSOC_XA_BY: |
305 | case MachineCombinerPattern::REASSOC_XA_YB: |
306 | return CombinerObjective::MustReduceDepth; |
307 | default: |
308 | return TII->getCombinerObjective(Pattern); |
309 | } |
310 | } |
311 | |
/// Estimate the latency of the new and original instruction sequences by
/// summing up the latencies of the inserted and deleted instructions. This
/// assumes that the inserted and deleted instructions are dependent
/// instruction chains, which might not hold in all cases.
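///
/// For example, for a new sequence {I1, I2, NewRoot} this returns
///   NewRootLatency = latency(I1) + latency(I2) + getLatency(NewRoot)
/// paired with the summed latencies of all deleted instructions.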
316 | std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences( |
317 | MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs, |
318 | SmallVectorImpl<MachineInstr *> &DelInstrs, |
319 | MachineTraceMetrics::Trace BlockTrace) { |
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
321 | unsigned NewRootLatency = 0; |
322 | // NewRoot is the last instruction in the \p InsInstrs vector. |
323 | MachineInstr *NewRoot = InsInstrs.back(); |
324 | for (unsigned i = 0; i < InsInstrs.size() - 1; i++) |
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
327 | |
328 | unsigned RootLatency = 0; |
329 | for (auto *I : DelInstrs) |
    RootLatency += TSchedModel.computeInstrLatency(I);
331 | |
332 | return {NewRootLatency, RootLatency}; |
333 | } |
334 | |
335 | bool MachineCombiner::reduceRegisterPressure( |
336 | MachineInstr &Root, MachineBasicBlock *MBB, |
337 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
338 | SmallVectorImpl<MachineInstr *> &DelInstrs, unsigned Pattern) { |
339 | // FIXME: for now, we don't do any check for the register pressure patterns. |
340 | // We treat them as always profitable. But we can do better if we make |
341 | // RegPressureTracker class be aware of TIE attribute. Then we can get an |
342 | // accurate compare of register pressure with DelInstrs or InsInstrs. |
343 | return true; |
344 | } |
345 | |
346 | /// The DAGCombine code sequence ends in MI (Machine Instruction) Root. |
347 | /// The new code sequence ends in MI NewRoot. A necessary condition for the new |
348 | /// sequence to replace the old sequence is that it cannot lengthen the critical |
349 | /// path. The definition of "improve" may be restricted by specifying that the |
350 | /// new path improves the data dependency chain (MustReduceDepth). |
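///
/// Unless the objective is MustReduceDepth, the transform is accepted when
///   NewRootDepth + NewRootLatency <= RootDepth + RootLatency + RootSlack
/// where the slack term is only counted when it is known to be accurate.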
351 | bool MachineCombiner::improvesCriticalPathLen( |
352 | MachineBasicBlock *MBB, MachineInstr *Root, |
353 | MachineTraceMetrics::Trace BlockTrace, |
354 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
355 | SmallVectorImpl<MachineInstr *> &DelInstrs, |
356 | DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern, |
357 | bool SlackIsAccurate) { |
358 | // Get depth and latency of NewRoot and Root. |
  unsigned NewRootDepth =
      getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
362 | |
363 | LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: " |
364 | << NewRootDepth << "\tRootDepth: " << RootDepth); |
365 | |
366 | // For a transform such as reassociation, the cost equation is |
367 | // conservatively calculated so that we must improve the depth (data |
368 | // dependency cycles) in the critical path to proceed with the transform. |
369 | // Being conservative also protects against inaccuracies in the underlying |
370 | // machine trace metrics and CPU models. |
371 | if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) { |
372 | LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth " ); |
373 | LLVM_DEBUG(NewRootDepth < RootDepth |
374 | ? dbgs() << "\t and it does it\n" |
375 | : dbgs() << "\t but it does NOT do it\n" ); |
376 | return NewRootDepth < RootDepth; |
377 | } |
378 | |
379 | // A more flexible cost calculation for the critical path includes the slack |
380 | // of the original code sequence. This may allow the transform to proceed |
381 | // even if the instruction depths (data dependency cycles) become worse. |
382 | |
  // Account for the latency of the inserted and deleted instruction
  // sequences.
  unsigned NewRootLatency, RootLatency;
  if (TII->accumulateInstrSeqToRootLatency(*Root)) {
    std::tie(NewRootLatency, RootLatency) =
        getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
  } else {
    NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
    RootLatency = TSchedModel.computeInstrLatency(Root);
391 | } |
392 | |
  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
394 | unsigned NewCycleCount = NewRootDepth + NewRootLatency; |
395 | unsigned OldCycleCount = |
396 | RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0); |
397 | LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency |
398 | << "\tRootLatency: " << RootLatency << "\n\tRootSlack: " |
399 | << RootSlack << " SlackIsAccurate=" << SlackIsAccurate |
400 | << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount |
401 | << "\n\tRootDepth + RootLatency + RootSlack = " |
402 | << OldCycleCount); |
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t It IMPROVES PathLen because"
                 : dbgs() << "\n\t It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");
408 | |
409 | return NewCycleCount <= OldCycleCount; |
410 | } |
411 | |
/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
421 | } |
422 | } |
423 | |
424 | /// True when the new instructions do not increase resource length |
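/// (more precisely, the new resource length may exceed the original by at
/// most TII->getExtendResourceLenLimit()).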
425 | bool MachineCombiner::preservesResourceLen( |
426 | MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, |
427 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
428 | SmallVectorImpl<MachineInstr *> &DelInstrs) { |
429 | if (!TSchedModel.hasInstrSchedModel()) |
430 | return true; |
431 | |
432 | // Compute current resource length |
433 | |
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
438 | |
439 | // Deal with SC rather than Instructions. |
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
  ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};
448 | |
449 | // Compute new resource length. |
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
452 | |
453 | LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: " |
454 | << ResLenBeforeCombine |
455 | << " and after: " << ResLenAfterCombine << "\n" ); |
456 | LLVM_DEBUG( |
457 | ResLenAfterCombine <= |
458 | ResLenBeforeCombine + TII->getExtendResourceLenLimit() |
459 | ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n" |
460 | : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource " |
461 | "Length\n" ); |
462 | |
463 | return ResLenAfterCombine <= |
464 | ResLenBeforeCombine + TII->getExtendResourceLenLimit(); |
465 | } |
466 | |
467 | /// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction |
468 | /// depths if requested. |
469 | /// |
470 | /// \param MBB basic block to insert instructions in |
471 | /// \param MI current machine instruction |
472 | /// \param InsInstrs new instructions to insert in \p MBB |
/// \param DelInstrs instructions to delete from \p MBB
474 | /// \param TraceEnsemble is a pointer to the machine trace information |
475 | /// \param RegUnits set of live registers, needed to compute instruction depths |
476 | /// \param TII is target instruction info, used to call target hook |
477 | /// \param Pattern is used to call target hook finalizeInsInstrs |
478 | /// \param IncrementalUpdate if true, compute instruction depths incrementally, |
479 | /// otherwise invalidate the trace |
480 | static void |
481 | insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, |
482 | SmallVectorImpl<MachineInstr *> &InsInstrs, |
483 | SmallVectorImpl<MachineInstr *> &DelInstrs, |
484 | MachineTraceMetrics::Ensemble *TraceEnsemble, |
485 | SparseSet<LiveRegUnit> &RegUnits, |
486 | const TargetInstrInfo *TII, unsigned Pattern, |
487 | bool IncrementalUpdate) { |
  // If we want to fix up some placeholder for some target, do it now.
  // We need this because in genAlternativeCodeSequence, we have not decided
  // which of InsInstrs or DelInstrs is the better pattern yet, so we don't
  // want to generate any side effect in the function. For example, we need to
  // delay the constant pool entry creation here until InsInstrs is selected
  // as the better pattern. Otherwise the constant pool entry created for
  // InsInstrs would not be deleted even if InsInstrs is not the better
  // pattern.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);
496 | |
497 | for (auto *InstrPtr : InsInstrs) |
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
499 | |
500 | for (auto *InstrPtr : DelInstrs) { |
501 | InstrPtr->eraseFromParent(); |
502 | // Erase all LiveRegs defined by the removed instruction |
503 | for (auto *I = RegUnits.begin(); I != RegUnits.end();) { |
504 | if (I->MI == InstrPtr) |
505 | I = RegUnits.erase(I); |
506 | else |
507 | I++; |
508 | } |
509 | } |
510 | |
511 | if (IncrementalUpdate) |
512 | for (auto *InstrPtr : InsInstrs) |
513 | TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits); |
514 | else |
515 | TraceEnsemble->invalidate(MBB); |
516 | |
517 | NumInstCombined++; |
518 | } |
519 | |
520 | // Check that the difference between original and new latency is decreasing for |
521 | // later patterns. This helps to discover sub-optimal pattern orderings. |
522 | void MachineCombiner::verifyPatternOrder(MachineBasicBlock *MBB, |
523 | MachineInstr &Root, |
524 | SmallVector<unsigned, 16> &Patterns) { |
525 | long PrevLatencyDiff = std::numeric_limits<long>::max(); |
526 | (void)PrevLatencyDiff; // Variable is used in assert only. |
527 | for (auto P : Patterns) { |
528 | SmallVector<MachineInstr *, 16> InsInstrs; |
529 | SmallVector<MachineInstr *, 16> DelInstrs; |
530 | DenseMap<Register, unsigned> InstrIdxForVirtReg; |
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
533 | // Found pattern, but did not generate alternative sequence. |
534 | // This can happen e.g. when an immediate could not be materialized |
535 | // in a single instruction. |
536 | if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries()) |
537 | continue; |
538 | |
539 | unsigned NewRootLatency, RootLatency; |
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
545 | PrevLatencyDiff = CurrentLatencyDiff; |
546 | } |
547 | } |
548 | |
/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining pattern.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size, always combine when the
/// new sequence is shorter.
556 | bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { |
557 | bool Changed = false; |
558 | LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n" ); |
559 | |
560 | bool IncrementalUpdate = false; |
561 | auto BlockIter = MBB->begin(); |
562 | decltype(BlockIter) LastUpdate; |
563 | // Check if the block is in a loop. |
  const MachineLoop *ML = MLI->getLoopFor(MBB);
565 | if (!TraceEnsemble) |
566 | TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy()); |
567 | |
568 | SparseSet<LiveRegUnit> RegUnits; |
569 | RegUnits.setUniverse(TRI->getNumRegUnits()); |
570 | |
571 | bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI); |
572 | |
573 | bool DoRegPressureReduce = |
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);
575 | |
576 | while (BlockIter != MBB->end()) { |
577 | auto &MI = *BlockIter++; |
578 | SmallVector<unsigned, 16> Patterns; |
579 | // The motivating example is: |
580 | // |
581 | // MUL Other MUL_op1 MUL_op2 Other |
582 | // \ / \ | / |
583 | // ADD/SUB => MADD/MSUB |
584 | // (=Root) (=NewRoot) |
585 | |
586 | // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is |
587 | // usually beneficial for code size it unfortunately can hurt performance |
588 | // when the ADD is on the critical path, but the MUL is not. With the |
589 | // substitution the MUL becomes part of the critical path (in form of the |
590 | // MADD) and can lengthen it on architectures where the MADD latency is |
591 | // longer than the ADD latency. |
592 | // |
593 | // For each instruction we check if it can be the root of a combiner |
594 | // pattern. Then for each pattern the new code sequence in form of MI is |
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // the critical path, don't use more resources) are met, the new sequence
    // gets hooked up into the basic block before the old sequence is removed.
598 | // |
599 | // The algorithm does not try to evaluate all patterns and pick the best. |
600 | // This is only an artificial restriction though. In practice there is |
601 | // mostly one pattern, and getMachineCombinerPatterns() can order patterns |
602 | // based on an internal cost heuristic. If |
603 | // machine-combiner-verify-pattern-order is enabled, all patterns are |
604 | // checked to ensure later patterns do not provide better latency savings. |
605 | |
    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
607 | continue; |
608 | |
609 | if (VerifyPatternOrder) |
      verifyPatternOrder(MBB, MI, Patterns);
611 | |
612 | for (const auto P : Patterns) { |
613 | SmallVector<MachineInstr *, 16> InsInstrs; |
614 | SmallVector<MachineInstr *, 16> DelInstrs; |
615 | DenseMap<Register, unsigned> InstrIdxForVirtReg; |
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
618 | // Found pattern, but did not generate alternative sequence. |
619 | // This can happen e.g. when an immediate could not be materialized |
620 | // in a single instruction. |
621 | if (InsInstrs.empty()) |
622 | continue; |
623 | |
624 | LLVM_DEBUG(if (dump_intrs) { |
625 | dbgs() << "\tFor the Pattern (" << (int)P |
626 | << ") these instructions could be removed\n" ; |
627 | for (auto const *InstrPtr : DelInstrs) |
628 | InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false, |
629 | /*SkipDebugLoc*/false, /*AddNewLine*/true, TII); |
630 | dbgs() << "\tThese instructions could replace the removed ones\n" ; |
631 | for (auto const *InstrPtr : InsInstrs) |
632 | InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false, |
633 | /*SkipDebugLoc*/false, /*AddNewLine*/true, TII); |
634 | }); |
635 | |
636 | if (IncrementalUpdate && LastUpdate != BlockIter) { |
637 | // Update depths since the last incremental update. |
        TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
639 | LastUpdate = BlockIter; |
640 | } |
641 | |
642 | if (DoRegPressureReduce && |
          getCombinerObjective(P) ==
644 | CombinerObjective::MustReduceRegisterPressure) { |
645 | if (MBB->size() > inc_threshold) { |
646 | // Use incremental depth updates for basic blocks above threshold |
647 | IncrementalUpdate = true; |
648 | LastUpdate = BlockIter; |
649 | } |
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);
654 | Changed |= true; |
655 | |
656 | // Go back to previous instruction as it may have ILP reassociation |
657 | // opportunity. |
658 | BlockIter--; |
659 | break; |
660 | } |
661 | } |
662 | |
      if (ML && TII->isThroughputPattern(P)) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
667 | // Eagerly stop after the first pattern fires. |
668 | Changed = true; |
669 | break; |
670 | } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) { |
671 | LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize (" |
672 | << InsInstrs.size() << " < " |
673 | << DelInstrs.size() << ")\n" ); |
674 | insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble, |
675 | RegUnits, TII, Pattern: P, IncrementalUpdate); |
676 | // Eagerly stop after the first pattern fires. |
677 | Changed = true; |
678 | break; |
679 | } else { |
680 | // For big basic blocks, we only compute the full trace the first time |
681 | // we hit this. We do not invalidate the trace, but instead update the |
682 | // instruction depths incrementally. |
683 | // NOTE: Only the instruction depths up to MI are accurate. All other |
684 | // trace information is not updated. |
685 | MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB); |
686 | Traces->verifyAnalysis(); |
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    !IncrementalUpdate) &&
690 | preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) { |
691 | if (MBB->size() > inc_threshold) { |
            // Use incremental depth updates for basic blocks above threshold.
693 | IncrementalUpdate = true; |
694 | LastUpdate = BlockIter; |
695 | } |
696 | |
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);
699 | |
700 | // Eagerly stop after the first pattern fires. |
701 | Changed = true; |
702 | break; |
703 | } |
704 | // Cleanup instructions of the alternative code sequence. There is no |
705 | // use for them. |
706 | MachineFunction *MF = MBB->getParent(); |
707 | for (auto *InstrPtr : InsInstrs) |
          MF->deleteMachineInstr(InstrPtr);
709 | } |
710 | InstrIdxForVirtReg.clear(); |
711 | } |
712 | } |
713 | |
714 | if (Changed && IncrementalUpdate) |
715 | Traces->invalidate(MBB); |
716 | return Changed; |
717 | } |
718 | |
719 | bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { |
720 | STI = &MF.getSubtarget(); |
721 | TII = STI->getInstrInfo(); |
722 | TRI = STI->getRegisterInfo(); |
723 | SchedModel = STI->getSchedModel(); |
  TSchedModel.init(STI);
725 | MRI = &MF.getRegInfo(); |
726 | MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); |
727 | Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM(); |
728 | PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
729 | MBFI = (PSI && PSI->hasProfileSummary()) ? |
730 | &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() : |
731 | nullptr; |
732 | TraceEnsemble = nullptr; |
733 | RegClassInfo.runOnMachineFunction(MF); |
734 | |
735 | LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n'); |
736 | if (!TII->useMachineCombiner()) { |
737 | LLVM_DEBUG( |
738 | dbgs() |
739 | << " Skipping pass: Target does not support machine combiner\n" ); |
740 | return false; |
741 | } |
742 | |
743 | bool Changed = false; |
744 | |
745 | // Try to combine instructions. |
746 | for (auto &MBB : MF) |
    Changed |= combineInstructions(&MBB);
748 | |
749 | return Changed; |
750 | } |
751 | |