//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instructions combined");

static cl::opt<unsigned>
    inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
                  cl::desc("Incremental depth computation will be used for "
                           "basic blocks with more instructions than this "
                           "threshold."),
                  cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instrs"),
                                cl::init(false));

#ifdef EXPENSIVE_CHECKS
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
#endif

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI = nullptr;
  MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
  MachineTraceMetrics *Traces = nullptr;
  MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
  MachineBlockFrequencyInfo *MBFI = nullptr;
  ProfileSummaryInfo *PSI = nullptr;
  RegisterClassInfo RegClassInfo;

  TargetSchedModel TSchedModel;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {}
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  bool isTransientMI(const MachineInstr *MI);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<Register, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace,
                    const MachineBasicBlock &MBB);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                               MachineTraceMetrics::Trace BlockTrace,
                               SmallVectorImpl<MachineInstr *> &InsInstrs,
                               SmallVectorImpl<MachineInstr *> &DelInstrs,
                               DenseMap<Register, unsigned> &InstrIdxForVirtReg,
                               unsigned Pattern, bool SlackIsAccurate);
  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              SmallVectorImpl<MachineInstr *> &DelInstrs,
                              unsigned Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);

  CombinerObjective getCombinerObjective(unsigned Pattern);
};
} // end anonymous namespace

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTreeWrapperPass>();
  AU.addRequired<MachineLoopInfoWrapperPass>();
  AU.addPreserved<MachineLoopInfoWrapperPass>();
  AU.addRequired<MachineTraceMetricsWrapperPass>();
  AU.addPreserved<MachineTraceMetricsWrapperPass>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *
MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && MO.getReg().isVirtual())
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  return DefInstr;
}

/// Return true if MI is unlikely to generate an actual target instruction.
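///
/// For illustration only (the virtual registers and register class are made
/// up): a full copy such as "%1:gpr = COPY %0:gpr" between equal or nested
/// register classes is expected to be coalesced away and is therefore treated
/// as transient, whereas a copy involving a physical register outside the
/// destination's register class is not.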
bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
  if (!MI->isCopy())
    return MI->isTransient();

  // If MI is a COPY, check if its src and dst registers can be coalesced.
  Register Dst = MI->getOperand(0).getReg();
  Register Src = MI->getOperand(1).getReg();

  if (!MI->isFullCopy()) {
    // If src RC contains super registers of dst RC, it can also be coalesced.
    if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
      return false;

    auto SrcSub = MI->getOperand(1).getSubReg();
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
  }

  if (Src.isPhysical() && Dst.isPhysical())
    return Src == Dst;

  if (Src.isVirtual() && Dst.isVirtual()) {
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
  }

  if (Src.isVirtual())
    std::swap(Src, Dst);

  // Now Src is a physical register and Dst is a virtual register.
  auto DstRC = MRI->getRegClass(Dst);
  return DstRC->contains(Src);
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
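///
/// A worked example with made-up numbers: if an instruction's only virtual
/// register operand is defined by an in-trace instruction at depth 3 and the
/// def-to-use operand latency is 2, that operand contributes a candidate
/// depth of 3 + 2 = 5; the instruction's depth is the maximum such candidate
/// over all of its virtual register uses.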
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<Register, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace,
                          const MachineBasicBlock &MBB) {
  SmallVector<unsigned, 16> InstrDepth;
  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->all_uses()) {
      // Check for virtual register operand.
      if (!MO.getReg().isVirtual())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<Register, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx =
            DefInstr->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr);
        int UseIdx =
            InstrPtr->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr);
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
                             MachineTraceStrategy::TS_Local ||
                         DefInstr->getParent() == &MBB)) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          if (!isTransientMI(DefInstr))
            LatencyOp = TSchedModel.computeOperandLatency(
                DefInstr,
                DefInstr->findRegisterDefOperandIdx(MO.getReg(),
                                                    /*TRI=*/nullptr),
                InstrPtr,
                InstrPtr->findRegisterUseOperandIdx(MO.getReg(),
                                                    /*TRI=*/nullptr));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
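///
/// For example (made-up numbers): if NewRoot defines a virtual register whose
/// user is a dependence inside the trace with an operand latency of 4, then
/// NewRootLatency is at least 4; for a def with no such in-trace dependence,
/// the instruction's own latency is used instead.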
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  // Check each definition in NewRoot and compute the latency
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->all_defs()) {
    // Check for virtual register operand.
    if (!MO.getReg().isVirtual())
      continue;
    // Get the first instruction that uses MO
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot,
          NewRoot->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr),
          UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

CombinerObjective MachineCombiner::getCombinerObjective(unsigned Pattern) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (Pattern) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return TII->getCombinerObjective(Pattern);
  }
}

/// Estimate the latency of the new and original instruction sequence by
/// summing up the latencies of the inserted and deleted instructions. This
/// assumes that the inserted and deleted instructions are dependent
/// instruction chains, which might not hold in all cases.
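///
/// Concretely, as computed below: NewRootLatency is the sum of the latencies
/// of all inserted instructions except the last, plus the latency of the new
/// root as seen by its users; RootLatency is the sum of the latencies of the
/// deleted instructions.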
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto *I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}

bool MachineCombiner::reduceRegisterPressure(
    MachineInstr &Root, MachineBasicBlock *MBB,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs, unsigned Pattern) {
  // FIXME: For now, we don't do any check for the register pressure patterns.
  // We treat them as always profitable. But we can do better if we make the
  // RegPressureTracker class aware of the TIE attribute. Then we can get an
  // accurate comparison of the register pressure with DelInstrs and InsInstrs.
  return true;
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the
/// critical path. The definition of "improve" may be restricted by specifying
/// that the new path improves the data dependency chain (MustReduceDepth).
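///
/// Concretely, matching the computation below: with the default objective the
/// transform is accepted when
///   NewRootDepth + NewRootLatency <= RootDepth + RootLatency + RootSlack
/// (the slack term is only added when \p SlackIsAccurate), while under
/// MustReduceDepth it is accepted only when NewRootDepth < RootDepth.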
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
    bool SlackIsAccurate) {
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth =
      getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t and it does it\n"
                   : dbgs() << "\t but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.

  // Account for the latency of the inserted and deleted instructions.
  unsigned NewRootLatency, RootLatency;
  if (TII->accumulateInstrSeqToRootLatency(*Root)) {
    std::tie(NewRootLatency, RootLatency) =
        getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
  } else {
    NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
    RootLatency = TSchedModel.computeInstrLatency(Root);
  }

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t  It IMPROVES PathLen because"
                 : dbgs() << "\n\t  It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length
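///
/// Matching the computation below, the condition checked is
///   ResLenAfterCombine <= ResLenBeforeCombine + TII->getExtendResourceLenLimit()
/// where the lengths are computed over the trace with the inserted (resp.
/// deleted) instructions' scheduling classes added (resp. removed).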
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
  ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n");
  LLVM_DEBUG(
      ResLenAfterCombine <=
              ResLenBeforeCombine + TII->getExtendResourceLenLimit()
          ? dbgs() << "\t\t  As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t  As result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <=
         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}

/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param TraceEnsemble is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param TII is target instruction info, used to call target hook
/// \param Pattern is used to call target hook finalizeInsInstrs
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
///        otherwise invalidate the trace
static void
insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                         SmallVectorImpl<MachineInstr *> &InsInstrs,
                         SmallVectorImpl<MachineInstr *> &DelInstrs,
                         MachineTraceMetrics::Ensemble *TraceEnsemble,
                         LiveRegUnitSet &RegUnits, const TargetInstrInfo *TII,
                         unsigned Pattern, bool IncrementalUpdate) {
  // If we want to fix up some placeholder for some target, do it now.
  // We need this because in genAlternativeCodeSequence, we have not yet
  // decided whether the better pattern is InsInstrs or DelInstrs, so we do
  // not want to generate any side effects in the function. For example, the
  // creation of a constant pool entry must be delayed until after InsInstrs
  // is selected as the better pattern; otherwise the entry created for
  // InsInstrs would not be deleted even if InsInstrs is not selected.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);

  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParent();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    TraceEnsemble->invalidate(MBB);

  NumInstCombined++;
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining pattern.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
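///
/// As an illustration, the prototype transform in schematic (not
/// target-specific) machine IR, with made-up virtual register numbers:
///   %2 = MUL %0, %1
///   %4 = ADD %2, %3        (Root)
///    =>
///   %4 = MADD %0, %1, %3   (NewRoot)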
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!TraceEnsemble)
    TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());

  LiveRegUnitSet RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  bool DoRegPressureReduce =
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<unsigned, 16> Patterns;
    // The motivating example is:
    //
    //     MUL      Other        MUL_op1  MUL_op2   Other
    //      \        /              \        |        /
    //       ADD/SUB        =>         MADD/MSUB
    //       (=Root)                   (=NewRoot)

    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // the critical path, don't use more resources) are met, the new sequence
    // gets hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.

    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
      continue;

    // Only used when VerifyPatternOrder is enabled.
    [[maybe_unused]] long PrevLatencyDiff = std::numeric_limits<long>::max();

    for (const auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<Register, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (InsInstrs.empty())
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
      });

      // Check that the difference between original and new latency is
      // decreasing for later patterns. This helps to discover sub-optimal
      // pattern orderings.
      if (VerifyPatternOrder && TSchedModel.hasInstrSchedModelOrItineraries()) {
        auto [NewRootLatency, RootLatency] = getLatenciesForInstrSequences(
            MI, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
        long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
        assert(CurrentLatencyDiff <= PrevLatencyDiff &&
               "Current pattern is expected to be better than the previous "
               "pattern.");
        PrevLatencyDiff = CurrentLatencyDiff;
      }

      if (IncrementalUpdate && LastUpdate != BlockIter) {
        // Update depths since the last incremental update.
        TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      if (DoRegPressureReduce &&
          getCombinerObjective(P) ==
              CombinerObjective::MustReduceRegisterPressure) {
        if (MBB->size() > inc_threshold) {
          // Use incremental depth updates for basic blocks above threshold
          IncrementalUpdate = true;
          LastUpdate = BlockIter;
        }
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs,
                                   TraceEnsemble, RegUnits, TII, P,
                                   IncrementalUpdate);
          Changed |= true;

          // Go back to previous instruction as it may have ILP reassociation
          // opportunity.
          BlockIter--;
          break;
        }
      }

      if (ML && TII->isThroughputPattern(P)) {
        LLVM_DEBUG(
            dbgs() << "\t Replacing due to throughput pattern in loop\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
                          << InsInstrs.size() << " < " << DelInstrs.size()
                          << ")\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
        MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
        Traces->verifyAnalysis();
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    !IncrementalUpdate) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          if (MBB->size() > inc_threshold) {
            // Use incremental depth updates for basic blocks above threshold
            IncrementalUpdate = true;
            LastUpdate = BlockIter;
          }

          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs,
                                   TraceEnsemble, RegUnits, TII, P,
                                   IncrementalUpdate);

          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Cleanup instructions of the alternative code sequence. There is no
        // use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->deleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  if (Changed && IncrementalUpdate)
    Traces->invalidate(MBB);
  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  MBFI = (PSI && PSI->hasProfileSummary())
             ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
             : nullptr;
  TraceEnsemble = nullptr;
  RegClassInfo.runOnMachineFunction(MF);

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}