//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"
STATISTIC(NumInstCombined, "Number of machine instructions combined");
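
// For blocks above this size the pass updates instruction depths incrementally
// after each combine instead of invalidating and recomputing the trace every
// time, deferring the full invalidation to the end of the block.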
static cl::opt<unsigned>
    inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
                  cl::desc("Incremental depth computation will be used for "
                           "basic blocks with more instructions than this "
                           "threshold."),
                  cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instructions"),
                                cl::init(false));

#ifdef EXPENSIVE_CHECKS
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
#endif

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI = nullptr;
  MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
  MachineTraceMetrics *Traces = nullptr;
  MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
  MachineBlockFrequencyInfo *MBFI = nullptr;
  ProfileSummaryInfo *PSI = nullptr;
  RegisterClassInfo RegClassInfo;

  TargetSchedModel TSchedModel;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  bool isTransientMI(const MachineInstr *MI);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<Register, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace,
                    const MachineBasicBlock &MBB);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                               MachineTraceMetrics::Trace BlockTrace,
                               SmallVectorImpl<MachineInstr *> &InsInstrs,
                               SmallVectorImpl<MachineInstr *> &DelInstrs,
                               DenseMap<Register, unsigned> &InstrIdxForVirtReg,
                               unsigned Pattern, bool SlackIsAccurate);
  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              SmallVectorImpl<MachineInstr *> &DelInstrs,
                              unsigned Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);

  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<unsigned, 16> &Patterns);
  CombinerObjective getCombinerObjective(unsigned Pattern);
};
} // end anonymous namespace

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTreeWrapperPass>();
  AU.addRequired<MachineLoopInfoWrapperPass>();
  AU.addPreserved<MachineLoopInfoWrapperPass>();
  AU.addRequired<MachineTraceMetricsWrapperPass>();
  AU.addPreserved<MachineTraceMetricsWrapperPass>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *
MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && MO.getReg().isVirtual())
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  return DefInstr;
}

/// Return true if MI is unlikely to generate an actual target instruction.
bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
  if (!MI->isCopy())
    return MI->isTransient();

  // If MI is a COPY, check if its src and dst registers can be coalesced.
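  // For example, a full COPY between virtual registers of compatible classes
  // is normally folded away by the register coalescer and therefore occupies
  // no execution resource at run time.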
  Register Dst = MI->getOperand(0).getReg();
  Register Src = MI->getOperand(1).getReg();

  if (!MI->isFullCopy()) {
    // If src RC contains super registers of dst RC, it can also be coalesced.
    if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
      return false;

    auto SrcSub = MI->getOperand(1).getSubReg();
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
  }

  if (Src.isPhysical() && Dst.isPhysical())
    return Src == Dst;

  if (Src.isVirtual() && Dst.isVirtual()) {
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
  }

  if (Src.isVirtual())
    std::swap(Src, Dst);

  // Now Src is a physical register and Dst is a virtual register.
  auto DstRC = MRI->getRegClass(Dst);
  return DstRC->contains(Src);
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<Register, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace,
                          const MachineBasicBlock &MBB) {
  SmallVector<unsigned, 16> InstrDepth;
  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map the depth is looked up in
  // InstrDepth.
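  //
  // The depth of an inserted instruction is the earliest cycle at which all
  // of its source operands are available:
  //
  //   depth(I) = max over used vregs R of (depth(def(R)) + latency(def(R), I))
  //
  // where def(R) is either another inserted instruction (found through
  // InstrIdxForVirtReg) or an instruction already in the trace.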
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->all_uses()) {
      // Check for virtual register operand.
      if (!MO.getReg().isVirtual())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<Register, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is new virtual register not in trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx =
            DefInstr->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr);
        int UseIdx =
            InstrPtr->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr);
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
                             MachineTraceStrategy::TS_Local ||
                         DefInstr->getParent() == &MBB)) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          if (!isTransientMI(DefInstr))
            LatencyOp = TSchedModel.computeOperandLatency(
                DefInstr,
                DefInstr->findRegisterDefOperandIdx(MO.getReg(),
                                                    /*TRI=*/nullptr),
                InstrPtr,
                InstrPtr->findRegisterUseOperandIdx(MO.getReg(),
                                                    /*TRI=*/nullptr));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;
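
  // The result is the maximum, over all virtual registers defined by NewRoot,
  // of the latency from NewRoot to the first user of that register. When the
  // first user is not part of the trace, the worst-case instruction latency
  // from the scheduling model is used instead.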
  for (const MachineOperand &MO : NewRoot->all_defs()) {
    // Check for virtual register operand.
    if (!MO.getReg().isVirtual())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot,
          NewRoot->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr),
          UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

CombinerObjective MachineCombiner::getCombinerObjective(unsigned Pattern) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
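  //
  // Reassociation patterns must strictly reduce the depth of the critical
  // path; all other patterns defer to the target's getCombinerObjective().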
  switch (Pattern) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return TII->getCombinerObjective(Pattern);
  }
}

/// Estimate the latency of the new and original instruction sequence by
/// summing up the latencies of the inserted and deleted instructions. This
/// assumes that the inserted and deleted instructions are dependent
/// instruction chains, which might not hold in all cases.
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
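  // NewRootLatency = sum of the latencies of all inserted instructions except
  // NewRoot, plus NewRoot's latency to its first in-trace user;
  // RootLatency = sum of the latencies of all deleted instructions.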
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto *I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}

bool MachineCombiner::reduceRegisterPressure(
    MachineInstr &Root, MachineBasicBlock *MBB,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs, unsigned Pattern) {
  // FIXME: For now we don't do any checking for the register pressure
  // patterns and treat them as always profitable. We could do better if we
  // made the RegPressureTracker class aware of the TIE attribute; then we
  // could accurately compare the register pressure of DelInstrs and
  // InsInstrs.
  return true;
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the
/// new sequence to replace the old sequence is that it cannot lengthen the
/// critical path. The definition of "improve" may be restricted by specifying
/// that the new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
    bool SlackIsAccurate) {
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth =
      getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << "  Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t  and it does it\n"
                   : dbgs() << "\t  but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.
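  //
  // The substitution is accepted when
  //
  //   NewRootDepth + NewRootLatency <= RootDepth + RootLatency + RootSlack
  //
  // i.e. when the new sequence finishes no later than the old one once the
  // old sequence's slack is taken into account (the slack is only added when
  // it is known to be accurate).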

  // Account for the latency of the inserted and deleted instructions.
  unsigned NewRootLatency, RootLatency;
  if (TII->accumulateInstrSeqToRootLatency(*Root)) {
    std::tie(NewRootLatency, RootLatency) =
        getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
  } else {
    NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
    RootLatency = TSchedModel.computeInstrLatency(Root);
  }

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t  It IMPROVES PathLen because"
                 : dbgs() << "\n\t  It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;
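
  // The resource length of a trace is the number of cycles needed by its most
  // heavily used processor resource; the substitution must not extend it by
  // more than the target-specific limit.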
  // Compute current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
  ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n");
  LLVM_DEBUG(
      ResLenAfterCombine <=
              ResLenBeforeCombine + TII->getExtendResourceLenLimit()
          ? dbgs() << "\t\t  As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t  As result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <=
         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}

/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param TraceEnsemble is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param TII is target instruction info, used to call target hook
/// \param Pattern is used to call target hook finalizeInsInstrs
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
///        otherwise invalidate the trace
static void
insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                         SmallVectorImpl<MachineInstr *> &InsInstrs,
                         SmallVectorImpl<MachineInstr *> &DelInstrs,
                         MachineTraceMetrics::Ensemble *TraceEnsemble,
                         SparseSet<LiveRegUnit> &RegUnits,
                         const TargetInstrInfo *TII, unsigned Pattern,
                         bool IncrementalUpdate) {
  // If we need to fix up some placeholder for some target, do it now. This is
  // needed because genAlternativeCodeSequence cannot yet know whether the
  // InsInstrs or the DelInstrs sequence will be chosen, so it must not create
  // side effects in the function. For example, creating a constant pool entry
  // has to be delayed to this point, after InsInstrs has been selected as the
  // better sequence; otherwise the entry created for InsInstrs could never be
  // deleted if InsInstrs turns out not to be the better pattern.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);

  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParent();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    TraceEnsemble->invalidate(MBB);

  NumInstCombined++;
}

// Check that the difference between original and new latency is decreasing for
// later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(MachineBasicBlock *MBB,
                                         MachineInstr &Root,
                                         SmallVector<unsigned, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.
  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<Register, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
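  // With incremental updates enabled, LastUpdate marks the first instruction
  // whose depth information has not been refreshed yet; depths in the range
  // [LastUpdate, BlockIter) are brought up to date before they are used.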
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!TraceEnsemble)
    TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());

  SparseSet<LiveRegUnit> RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  bool DoRegPressureReduce =
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<unsigned, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)
    //
    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.

    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);

    for (const auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<Register, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (InsInstrs.empty())
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
      });

      if (IncrementalUpdate && LastUpdate != BlockIter) {
        // Update depths since the last incremental update.
        TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      if (DoRegPressureReduce &&
          getCombinerObjective(P) ==
              CombinerObjective::MustReduceRegisterPressure) {
        if (MBB->size() > inc_threshold) {
          // Use incremental depth updates for basic blocks above threshold.
          IncrementalUpdate = true;
          LastUpdate = BlockIter;
        }
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs,
                                   TraceEnsemble, RegUnits, TII, P,
                                   IncrementalUpdate);
          Changed |= true;

          // Go back to the previous instruction as it may have an ILP
          // reassociation opportunity.
          BlockIter--;
          break;
        }
      }

      if (ML && TII->isThroughputPattern(P)) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
                          << InsInstrs.size() << " < "
                          << DelInstrs.size() << ")\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
        MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
        Traces->verifyAnalysis();
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    /*SlackIsAccurate=*/!IncrementalUpdate) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          if (MBB->size() > inc_threshold) {
            // Use incremental depth updates for basic blocks above threshold.
            IncrementalUpdate = true;
            LastUpdate = BlockIter;
          }

          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs,
                                   TraceEnsemble, RegUnits, TII, P,
                                   IncrementalUpdate);

          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Clean up the instructions of the alternative code sequence. There
        // is no use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->deleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  if (Changed && IncrementalUpdate)
    Traces->invalidate(MBB);
  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  MBFI = (PSI && PSI->hasProfileSummary())
             ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
             : nullptr;
  TraceEnsemble = nullptr;
  RegClassInfo.runOnMachineFunction(MF);

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}