1//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The machine combiner pass uses machine trace metrics to ensure the combined
10// instructions do not lengthen the critical path or the resource depth.
11//===----------------------------------------------------------------------===//
12
13#include "llvm/ADT/DenseMap.h"
14#include "llvm/ADT/Statistic.h"
15#include "llvm/Analysis/ProfileSummaryInfo.h"
16#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
17#include "llvm/CodeGen/MachineCombinerPattern.h"
18#include "llvm/CodeGen/MachineDominators.h"
19#include "llvm/CodeGen/MachineFunction.h"
20#include "llvm/CodeGen/MachineFunctionPass.h"
21#include "llvm/CodeGen/MachineLoopInfo.h"
22#include "llvm/CodeGen/MachineRegisterInfo.h"
23#include "llvm/CodeGen/MachineSizeOpts.h"
24#include "llvm/CodeGen/MachineTraceMetrics.h"
25#include "llvm/CodeGen/RegisterClassInfo.h"
26#include "llvm/CodeGen/TargetInstrInfo.h"
27#include "llvm/CodeGen/TargetRegisterInfo.h"
28#include "llvm/CodeGen/TargetSchedule.h"
29#include "llvm/CodeGen/TargetSubtargetInfo.h"
30#include "llvm/InitializePasses.h"
31#include "llvm/Support/CommandLine.h"
32#include "llvm/Support/Debug.h"
33#include "llvm/Support/raw_ostream.h"
34
35using namespace llvm;
36
37#define DEBUG_TYPE "machine-combiner"
38
39STATISTIC(NumInstCombined, "Number of machineinst combined");
40
41static cl::opt<unsigned>
42inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
43 cl::desc("Incremental depth computation will be used for basic "
44 "blocks with more instructions."), cl::init(Val: 500));
45
46static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
47 cl::desc("Dump all substituted intrs"),
48 cl::init(Val: false));
49
50#ifdef EXPENSIVE_CHECKS
51static cl::opt<bool> VerifyPatternOrder(
52 "machine-combiner-verify-pattern-order", cl::Hidden,
53 cl::desc(
54 "Verify that the generated patterns are ordered by increasing latency"),
55 cl::init(true));
56#else
57static cl::opt<bool> VerifyPatternOrder(
58 "machine-combiner-verify-pattern-order", cl::Hidden,
59 cl::desc(
60 "Verify that the generated patterns are ordered by increasing latency"),
61 cl::init(Val: false));
62#endif
63
64namespace {
65class MachineCombiner : public MachineFunctionPass {
66 const TargetSubtargetInfo *STI = nullptr;
67 const TargetInstrInfo *TII = nullptr;
68 const TargetRegisterInfo *TRI = nullptr;
69 MCSchedModel SchedModel;
70 MachineRegisterInfo *MRI = nullptr;
71 MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
72 MachineTraceMetrics *Traces = nullptr;
73 MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
74 MachineBlockFrequencyInfo *MBFI = nullptr;
75 ProfileSummaryInfo *PSI = nullptr;
76 RegisterClassInfo RegClassInfo;
77
78 TargetSchedModel TSchedModel;
79
80public:
81 static char ID;
82 MachineCombiner() : MachineFunctionPass(ID) {}
83 void getAnalysisUsage(AnalysisUsage &AU) const override;
84 bool runOnMachineFunction(MachineFunction &MF) override;
85 StringRef getPassName() const override { return "Machine InstCombiner"; }
86
87private:
88 bool combineInstructions(MachineBasicBlock *);
89 MachineInstr *getOperandDef(const MachineOperand &MO);
90 bool isTransientMI(const MachineInstr *MI);
91 unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
92 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
93 MachineTraceMetrics::Trace BlockTrace,
94 const MachineBasicBlock &MBB);
95 unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
96 MachineTraceMetrics::Trace BlockTrace);
97 bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
98 MachineTraceMetrics::Trace BlockTrace,
99 SmallVectorImpl<MachineInstr *> &InsInstrs,
100 SmallVectorImpl<MachineInstr *> &DelInstrs,
101 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
102 unsigned Pattern, bool SlackIsAccurate);
103 bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
104 SmallVectorImpl<MachineInstr *> &InsInstrs,
105 SmallVectorImpl<MachineInstr *> &DelInstrs,
106 unsigned Pattern);
107 bool preservesResourceLen(MachineBasicBlock *MBB,
108 MachineTraceMetrics::Trace BlockTrace,
109 SmallVectorImpl<MachineInstr *> &InsInstrs,
110 SmallVectorImpl<MachineInstr *> &DelInstrs);
111 void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
112 SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
113 std::pair<unsigned, unsigned>
114 getLatenciesForInstrSequences(MachineInstr &MI,
115 SmallVectorImpl<MachineInstr *> &InsInstrs,
116 SmallVectorImpl<MachineInstr *> &DelInstrs,
117 MachineTraceMetrics::Trace BlockTrace);
118
119 CombinerObjective getCombinerObjective(unsigned Pattern);
120};
121}
122
123char MachineCombiner::ID = 0;
124char &llvm::MachineCombinerID = MachineCombiner::ID;
125
126INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
127 "Machine InstCombiner", false, false)
128INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
129INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
130INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
131 false, false)
132
133void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
134 AU.setPreservesCFG();
135 AU.addPreserved<MachineDominatorTreeWrapperPass>();
136 AU.addRequired<MachineLoopInfoWrapperPass>();
137 AU.addPreserved<MachineLoopInfoWrapperPass>();
138 AU.addRequired<MachineTraceMetricsWrapperPass>();
139 AU.addPreserved<MachineTraceMetricsWrapperPass>();
140 AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
141 AU.addRequired<ProfileSummaryInfoWrapperPass>();
142 MachineFunctionPass::getAnalysisUsage(AU);
143}
144
145MachineInstr *
146MachineCombiner::getOperandDef(const MachineOperand &MO) {
147 MachineInstr *DefInstr = nullptr;
148 // We need a virtual register definition.
149 if (MO.isReg() && MO.getReg().isVirtual())
150 DefInstr = MRI->getUniqueVRegDef(Reg: MO.getReg());
151 return DefInstr;
152}
153
154/// Return true if MI is unlikely to generate an actual target instruction.
155bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
156 if (!MI->isCopy())
157 return MI->isTransient();
158
159 // If MI is a COPY, check if its src and dst registers can be coalesced.
160 Register Dst = MI->getOperand(i: 0).getReg();
161 Register Src = MI->getOperand(i: 1).getReg();
162
163 if (!MI->isFullCopy()) {
164 // If src RC contains super registers of dst RC, it can also be coalesced.
165 if (MI->getOperand(i: 0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
166 return false;
167
168 auto SrcSub = MI->getOperand(i: 1).getSubReg();
169 auto SrcRC = MRI->getRegClass(Reg: Src);
170 auto DstRC = MRI->getRegClass(Reg: Dst);
171 return TRI->getMatchingSuperRegClass(A: SrcRC, B: DstRC, Idx: SrcSub) != nullptr;
172 }
173
174 if (Src.isPhysical() && Dst.isPhysical())
175 return Src == Dst;
176
177 if (Src.isVirtual() && Dst.isVirtual()) {
178 auto SrcRC = MRI->getRegClass(Reg: Src);
179 auto DstRC = MRI->getRegClass(Reg: Dst);
180 return SrcRC->hasSuperClassEq(RC: DstRC) || SrcRC->hasSubClassEq(RC: DstRC);
181 }
182
183 if (Src.isVirtual())
184 std::swap(a&: Src, b&: Dst);
185
186 // Now Src is physical register, Dst is virtual register.
187 auto DstRC = MRI->getRegClass(Reg: Dst);
188 return DstRC->contains(Reg: Src);
189}
190
191/// Computes depth of instructions in vector \InsInstr.
192///
193/// \param InsInstrs is a vector of machine instructions
194/// \param InstrIdxForVirtReg is a dense map of virtual register to index
195/// of defining machine instruction in \p InsInstrs
196/// \param BlockTrace is a trace of machine instructions
197///
198/// \returns Depth of last instruction in \InsInstrs ("NewRoot")
199unsigned
200MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
201 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
202 MachineTraceMetrics::Trace BlockTrace,
203 const MachineBasicBlock &MBB) {
204 SmallVector<unsigned, 16> InstrDepth;
205 // For each instruction in the new sequence compute the depth based on the
206 // operands. Use the trace information when possible. For new operands which
207 // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
208 for (auto *InstrPtr : InsInstrs) { // for each Use
209 unsigned IDepth = 0;
210 for (const MachineOperand &MO : InstrPtr->all_uses()) {
211 // Check for virtual register operand.
212 if (!MO.getReg().isVirtual())
213 continue;
214 unsigned DepthOp = 0;
215 unsigned LatencyOp = 0;
216 auto II = InstrIdxForVirtReg.find(Val: MO.getReg());
217 if (II != InstrIdxForVirtReg.end()) {
218 // Operand is new virtual register not in trace
219 assert(II->second < InstrDepth.size() && "Bad Index");
220 MachineInstr *DefInstr = InsInstrs[II->second];
221 assert(DefInstr &&
222 "There must be a definition for a new virtual register");
223 DepthOp = InstrDepth[II->second];
224 int DefIdx =
225 DefInstr->findRegisterDefOperandIdx(Reg: MO.getReg(), /*TRI=*/nullptr);
226 int UseIdx =
227 InstrPtr->findRegisterUseOperandIdx(Reg: MO.getReg(), /*TRI=*/nullptr);
228 LatencyOp = TSchedModel.computeOperandLatency(DefMI: DefInstr, DefOperIdx: DefIdx,
229 UseMI: InstrPtr, UseOperIdx: UseIdx);
230 } else {
231 MachineInstr *DefInstr = getOperandDef(MO);
232 if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
233 MachineTraceStrategy::TS_Local ||
234 DefInstr->getParent() == &MBB)) {
235 DepthOp = BlockTrace.getInstrCycles(MI: *DefInstr).Depth;
236 if (!isTransientMI(MI: DefInstr))
237 LatencyOp = TSchedModel.computeOperandLatency(
238 DefMI: DefInstr,
239 DefOperIdx: DefInstr->findRegisterDefOperandIdx(Reg: MO.getReg(),
240 /*TRI=*/nullptr),
241 UseMI: InstrPtr,
242 UseOperIdx: InstrPtr->findRegisterUseOperandIdx(Reg: MO.getReg(),
243 /*TRI=*/nullptr));
244 }
245 }
246 IDepth = std::max(a: IDepth, b: DepthOp + LatencyOp);
247 }
248 InstrDepth.push_back(Elt: IDepth);
249 }
250 unsigned NewRootIdx = InsInstrs.size() - 1;
251 return InstrDepth[NewRootIdx];
252}
253
254/// Computes instruction latency as max of latency of defined operands.
255///
256/// \param Root is a machine instruction that could be replaced by NewRoot.
257/// It is used to compute a more accurate latency information for NewRoot in
258/// case there is a dependent instruction in the same trace (\p BlockTrace)
259/// \param NewRoot is the instruction for which the latency is computed
260/// \param BlockTrace is a trace of machine instructions
261///
262/// \returns Latency of \p NewRoot
263unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
264 MachineTraceMetrics::Trace BlockTrace) {
265 // Check each definition in NewRoot and compute the latency
266 unsigned NewRootLatency = 0;
267
268 for (const MachineOperand &MO : NewRoot->all_defs()) {
269 // Check for virtual register operand.
270 if (!MO.getReg().isVirtual())
271 continue;
272 // Get the first instruction that uses MO
273 MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(RegNo: MO.getReg());
274 RI++;
275 if (RI == MRI->reg_end())
276 continue;
277 MachineInstr *UseMO = RI->getParent();
278 unsigned LatencyOp = 0;
279 if (UseMO && BlockTrace.isDepInTrace(DefMI: *Root, UseMI: *UseMO)) {
280 LatencyOp = TSchedModel.computeOperandLatency(
281 DefMI: NewRoot,
282 DefOperIdx: NewRoot->findRegisterDefOperandIdx(Reg: MO.getReg(), /*TRI=*/nullptr),
283 UseMI: UseMO,
284 UseOperIdx: UseMO->findRegisterUseOperandIdx(Reg: MO.getReg(), /*TRI=*/nullptr));
285 } else {
286 LatencyOp = TSchedModel.computeInstrLatency(MI: NewRoot);
287 }
288 NewRootLatency = std::max(a: NewRootLatency, b: LatencyOp);
289 }
290 return NewRootLatency;
291}
292
293CombinerObjective MachineCombiner::getCombinerObjective(unsigned Pattern) {
294 // TODO: If C++ ever gets a real enum class, make this part of the
295 // MachineCombinerPattern class.
296 switch (Pattern) {
297 case MachineCombinerPattern::REASSOC_AX_BY:
298 case MachineCombinerPattern::REASSOC_AX_YB:
299 case MachineCombinerPattern::REASSOC_XA_BY:
300 case MachineCombinerPattern::REASSOC_XA_YB:
301 return CombinerObjective::MustReduceDepth;
302 default:
303 return TII->getCombinerObjective(Pattern);
304 }
305}
306
307/// Estimate the latency of the new and original instruction sequence by summing
308/// up the latencies of the inserted and deleted instructions. This assumes
309/// that the inserted and deleted instructions are dependent instruction chains,
310/// which might not hold in all cases.
311std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
312 MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
313 SmallVectorImpl<MachineInstr *> &DelInstrs,
314 MachineTraceMetrics::Trace BlockTrace) {
315 assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
316 unsigned NewRootLatency = 0;
317 // NewRoot is the last instruction in the \p InsInstrs vector.
318 MachineInstr *NewRoot = InsInstrs.back();
319 for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
320 NewRootLatency += TSchedModel.computeInstrLatency(MI: InsInstrs[i]);
321 NewRootLatency += getLatency(Root: &MI, NewRoot, BlockTrace);
322
323 unsigned RootLatency = 0;
324 for (auto *I : DelInstrs)
325 RootLatency += TSchedModel.computeInstrLatency(MI: I);
326
327 return {NewRootLatency, RootLatency};
328}
329
330bool MachineCombiner::reduceRegisterPressure(
331 MachineInstr &Root, MachineBasicBlock *MBB,
332 SmallVectorImpl<MachineInstr *> &InsInstrs,
333 SmallVectorImpl<MachineInstr *> &DelInstrs, unsigned Pattern) {
334 // FIXME: for now, we don't do any check for the register pressure patterns.
335 // We treat them as always profitable. But we can do better if we make
336 // RegPressureTracker class be aware of TIE attribute. Then we can get an
337 // accurate compare of register pressure with DelInstrs or InsInstrs.
338 return true;
339}
340
341/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
342/// The new code sequence ends in MI NewRoot. A necessary condition for the new
343/// sequence to replace the old sequence is that it cannot lengthen the critical
344/// path. The definition of "improve" may be restricted by specifying that the
345/// new path improves the data dependency chain (MustReduceDepth).
346bool MachineCombiner::improvesCriticalPathLen(
347 MachineBasicBlock *MBB, MachineInstr *Root,
348 MachineTraceMetrics::Trace BlockTrace,
349 SmallVectorImpl<MachineInstr *> &InsInstrs,
350 SmallVectorImpl<MachineInstr *> &DelInstrs,
351 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
352 bool SlackIsAccurate) {
353 // Get depth and latency of NewRoot and Root.
354 unsigned NewRootDepth =
355 getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, MBB: *MBB);
356 unsigned RootDepth = BlockTrace.getInstrCycles(MI: *Root).Depth;
357
358 LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
359 << NewRootDepth << "\tRootDepth: " << RootDepth);
360
361 // For a transform such as reassociation, the cost equation is
362 // conservatively calculated so that we must improve the depth (data
363 // dependency cycles) in the critical path to proceed with the transform.
364 // Being conservative also protects against inaccuracies in the underlying
365 // machine trace metrics and CPU models.
366 if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
367 LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
368 LLVM_DEBUG(NewRootDepth < RootDepth
369 ? dbgs() << "\t and it does it\n"
370 : dbgs() << "\t but it does NOT do it\n");
371 return NewRootDepth < RootDepth;
372 }
373
374 // A more flexible cost calculation for the critical path includes the slack
375 // of the original code sequence. This may allow the transform to proceed
376 // even if the instruction depths (data dependency cycles) become worse.
377
378 // Account for the latency of the inserted and deleted instructions by
379 unsigned NewRootLatency, RootLatency;
380 if (TII->accumulateInstrSeqToRootLatency(Root&: *Root)) {
381 std::tie(args&: NewRootLatency, args&: RootLatency) =
382 getLatenciesForInstrSequences(MI&: *Root, InsInstrs, DelInstrs, BlockTrace);
383 } else {
384 NewRootLatency = TSchedModel.computeInstrLatency(MI: InsInstrs.back());
385 RootLatency = TSchedModel.computeInstrLatency(MI: Root);
386 }
387
388 unsigned RootSlack = BlockTrace.getInstrSlack(MI: *Root);
389 unsigned NewCycleCount = NewRootDepth + NewRootLatency;
390 unsigned OldCycleCount =
391 RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
392 LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
393 << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
394 << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
395 << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
396 << "\n\tRootDepth + RootLatency + RootSlack = "
397 << OldCycleCount);
398 LLVM_DEBUG(NewCycleCount <= OldCycleCount
399 ? dbgs() << "\n\t It IMPROVES PathLen because"
400 : dbgs() << "\n\t It DOES NOT improve PathLen because");
401 LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
402 << ", OldCycleCount = " << OldCycleCount << "\n");
403
404 return NewCycleCount <= OldCycleCount;
405}
406
407/// helper routine to convert instructions into SC
408void MachineCombiner::instr2instrSC(
409 SmallVectorImpl<MachineInstr *> &Instrs,
410 SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
411 for (auto *InstrPtr : Instrs) {
412 unsigned Opc = InstrPtr->getOpcode();
413 unsigned Idx = TII->get(Opcode: Opc).getSchedClass();
414 const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(SchedClassIdx: Idx);
415 InstrsSC.push_back(Elt: SC);
416 }
417}
418
419/// True when the new instructions do not increase resource length
420bool MachineCombiner::preservesResourceLen(
421 MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
422 SmallVectorImpl<MachineInstr *> &InsInstrs,
423 SmallVectorImpl<MachineInstr *> &DelInstrs) {
424 if (!TSchedModel.hasInstrSchedModel())
425 return true;
426
427 // Compute current resource length
428
429 //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
430 SmallVector <const MachineBasicBlock *, 1> MBBarr;
431 MBBarr.push_back(Elt: MBB);
432 unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(Extrablocks: MBBarr);
433
434 // Deal with SC rather than Instructions.
435 SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
436 SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;
437
438 instr2instrSC(Instrs&: InsInstrs, InstrsSC&: InsInstrsSC);
439 instr2instrSC(Instrs&: DelInstrs, InstrsSC&: DelInstrsSC);
440
441 ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
442 ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};
443
444 // Compute new resource length.
445 unsigned ResLenAfterCombine =
446 BlockTrace.getResourceLength(Extrablocks: MBBarr, ExtraInstrs: MSCInsArr, RemoveInstrs: MSCDelArr);
447
448 LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
449 << ResLenBeforeCombine
450 << " and after: " << ResLenAfterCombine << "\n");
451 LLVM_DEBUG(
452 ResLenAfterCombine <=
453 ResLenBeforeCombine + TII->getExtendResourceLenLimit()
454 ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
455 : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
456 "Length\n");
457
458 return ResLenAfterCombine <=
459 ResLenBeforeCombine + TII->getExtendResourceLenLimit();
460}
461
462/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
463/// depths if requested.
464///
465/// \param MBB basic block to insert instructions in
466/// \param MI current machine instruction
467/// \param InsInstrs new instructions to insert in \p MBB
468/// \param DelInstrs instruction to delete from \p MBB
469/// \param TraceEnsemble is a pointer to the machine trace information
470/// \param RegUnits set of live registers, needed to compute instruction depths
471/// \param TII is target instruction info, used to call target hook
472/// \param Pattern is used to call target hook finalizeInsInstrs
473/// \param IncrementalUpdate if true, compute instruction depths incrementally,
474/// otherwise invalidate the trace
475static void
476insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
477 SmallVectorImpl<MachineInstr *> &InsInstrs,
478 SmallVectorImpl<MachineInstr *> &DelInstrs,
479 MachineTraceMetrics::Ensemble *TraceEnsemble,
480 LiveRegUnitSet &RegUnits, const TargetInstrInfo *TII,
481 unsigned Pattern, bool IncrementalUpdate) {
482 // If we want to fix up some placeholder for some target, do it now.
483 // We need this because in genAlternativeCodeSequence, we have not decided the
484 // better pattern InsInstrs or DelInstrs, so we don't want generate some
485 // sideeffect to the function. For example we need to delay the constant pool
486 // entry creation here after InsInstrs is selected as better pattern.
487 // Otherwise the constant pool entry created for InsInstrs will not be deleted
488 // even if InsInstrs is not the better pattern.
489 TII->finalizeInsInstrs(Root&: MI, Pattern, InsInstrs);
490
491 for (auto *InstrPtr : InsInstrs)
492 MBB->insert(I: (MachineBasicBlock::iterator)&MI, MI: InstrPtr);
493
494 for (auto *InstrPtr : DelInstrs) {
495 InstrPtr->eraseFromParent();
496 // Erase all LiveRegs defined by the removed instruction
497 for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
498 if (I->MI == InstrPtr)
499 I = RegUnits.erase(I);
500 else
501 I++;
502 }
503 }
504
505 if (IncrementalUpdate)
506 for (auto *InstrPtr : InsInstrs)
507 TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
508 else
509 TraceEnsemble->invalidate(MBB);
510
511 NumInstCombined++;
512}
513
514/// Substitute a slow code sequence with a faster one by
515/// evaluating instruction combining pattern.
516/// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
517/// combining based on machine trace metrics. Only combine a sequence of
518/// instructions when this neither lengthens the critical path nor increases
519/// resource pressure. When optimizing for codesize always combine when the new
520/// sequence is shorter.
521bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
522 bool Changed = false;
523 LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
524
525 bool IncrementalUpdate = false;
526 auto BlockIter = MBB->begin();
527 decltype(BlockIter) LastUpdate;
528 // Check if the block is in a loop.
529 const MachineLoop *ML = MLI->getLoopFor(BB: MBB);
530 if (!TraceEnsemble)
531 TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());
532
533 LiveRegUnitSet RegUnits;
534 RegUnits.setUniverse(TRI->getNumRegUnits());
535
536 bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
537
538 bool DoRegPressureReduce =
539 TII->shouldReduceRegisterPressure(MBB, RegClassInfo: &RegClassInfo);
540
541 while (BlockIter != MBB->end()) {
542 auto &MI = *BlockIter++;
543 SmallVector<unsigned, 16> Patterns;
544 // The motivating example is:
545 //
546 // MUL Other MUL_op1 MUL_op2 Other
547 // \ / \ | /
548 // ADD/SUB => MADD/MSUB
549 // (=Root) (=NewRoot)
550
551 // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
552 // usually beneficial for code size it unfortunately can hurt performance
553 // when the ADD is on the critical path, but the MUL is not. With the
554 // substitution the MUL becomes part of the critical path (in form of the
555 // MADD) and can lengthen it on architectures where the MADD latency is
556 // longer than the ADD latency.
557 //
558 // For each instruction we check if it can be the root of a combiner
559 // pattern. Then for each pattern the new code sequence in form of MI is
560 // generated and evaluated. When the efficiency criteria (don't lengthen
561 // critical path, don't use more resources) is met the new sequence gets
562 // hooked up into the basic block before the old sequence is removed.
563 //
564 // The algorithm does not try to evaluate all patterns and pick the best.
565 // This is only an artificial restriction though. In practice there is
566 // mostly one pattern, and getMachineCombinerPatterns() can order patterns
567 // based on an internal cost heuristic. If
568 // machine-combiner-verify-pattern-order is enabled, all patterns are
569 // checked to ensure later patterns do not provide better latency savings.
570
571 if (!TII->getMachineCombinerPatterns(Root&: MI, Patterns, DoRegPressureReduce))
572 continue;
573
574 // Only used when VerifyPatternOrder is enabled.
575 [[maybe_unused]] long PrevLatencyDiff = std::numeric_limits<long>::max();
576
577 for (const auto P : Patterns) {
578 SmallVector<MachineInstr *, 16> InsInstrs;
579 SmallVector<MachineInstr *, 16> DelInstrs;
580 DenseMap<Register, unsigned> InstrIdxForVirtReg;
581 TII->genAlternativeCodeSequence(Root&: MI, Pattern: P, InsInstrs, DelInstrs,
582 InstIdxForVirtReg&: InstrIdxForVirtReg);
583 // Found pattern, but did not generate alternative sequence.
584 // This can happen e.g. when an immediate could not be materialized
585 // in a single instruction.
586 if (InsInstrs.empty())
587 continue;
588
589 LLVM_DEBUG(if (dump_intrs) {
590 dbgs() << "\tFor the Pattern (" << (int)P
591 << ") these instructions could be removed\n";
592 for (auto const *InstrPtr : DelInstrs)
593 InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
594 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
595 dbgs() << "\tThese instructions could replace the removed ones\n";
596 for (auto const *InstrPtr : InsInstrs)
597 InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
598 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
599 });
600
601 // Check that the difference between original and new latency is
602 // decreasing for later patterns. This helps to discover sub-optimal
603 // pattern orderings.
604 if (VerifyPatternOrder && TSchedModel.hasInstrSchedModelOrItineraries()) {
605 auto [NewRootLatency, RootLatency] = getLatenciesForInstrSequences(
606 MI, InsInstrs, DelInstrs, BlockTrace: TraceEnsemble->getTrace(MBB));
607 long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
608 assert(CurrentLatencyDiff <= PrevLatencyDiff &&
609 "Current pattern is expected to be better than the previous "
610 "pattern.");
611 PrevLatencyDiff = CurrentLatencyDiff;
612 }
613
614 if (IncrementalUpdate && LastUpdate != BlockIter) {
615 // Update depths since the last incremental update.
616 TraceEnsemble->updateDepths(Start: LastUpdate, End: BlockIter, RegUnits);
617 LastUpdate = BlockIter;
618 }
619
620 if (DoRegPressureReduce &&
621 getCombinerObjective(Pattern: P) ==
622 CombinerObjective::MustReduceRegisterPressure) {
623 if (MBB->size() > inc_threshold) {
624 // Use incremental depth updates for basic blocks above threshold
625 IncrementalUpdate = true;
626 LastUpdate = BlockIter;
627 }
628 if (reduceRegisterPressure(Root&: MI, MBB, InsInstrs, DelInstrs, Pattern: P)) {
629 // Replace DelInstrs with InsInstrs.
630 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
631 RegUnits, TII, Pattern: P, IncrementalUpdate);
632 Changed |= true;
633
634 // Go back to previous instruction as it may have ILP reassociation
635 // opportunity.
636 BlockIter--;
637 break;
638 }
639 }
640
641 if (ML && TII->isThroughputPattern(Pattern: P)) {
642 LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
643 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
644 RegUnits, TII, Pattern: P, IncrementalUpdate);
645 // Eagerly stop after the first pattern fires.
646 Changed = true;
647 break;
648 } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
649 LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
650 << InsInstrs.size() << " < "
651 << DelInstrs.size() << ")\n");
652 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
653 RegUnits, TII, Pattern: P, IncrementalUpdate);
654 // Eagerly stop after the first pattern fires.
655 Changed = true;
656 break;
657 } else {
658 // For big basic blocks, we only compute the full trace the first time
659 // we hit this. We do not invalidate the trace, but instead update the
660 // instruction depths incrementally.
661 // NOTE: Only the instruction depths up to MI are accurate. All other
662 // trace information is not updated.
663 MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
664 Traces->verifyAnalysis();
665 if (improvesCriticalPathLen(MBB, Root: &MI, BlockTrace, InsInstrs, DelInstrs,
666 InstrIdxForVirtReg, Pattern: P,
667 SlackIsAccurate: !IncrementalUpdate) &&
668 preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
669 if (MBB->size() > inc_threshold) {
670 // Use incremental depth updates for basic blocks above treshold
671 IncrementalUpdate = true;
672 LastUpdate = BlockIter;
673 }
674
675 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
676 RegUnits, TII, Pattern: P, IncrementalUpdate);
677
678 // Eagerly stop after the first pattern fires.
679 Changed = true;
680 break;
681 }
682 // Cleanup instructions of the alternative code sequence. There is no
683 // use for them.
684 MachineFunction *MF = MBB->getParent();
685 for (auto *InstrPtr : InsInstrs)
686 MF->deleteMachineInstr(MI: InstrPtr);
687 }
688 InstrIdxForVirtReg.clear();
689 }
690 }
691
692 if (Changed && IncrementalUpdate)
693 Traces->invalidate(MBB);
694 return Changed;
695}
696
697bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
698 STI = &MF.getSubtarget();
699 TII = STI->getInstrInfo();
700 TRI = STI->getRegisterInfo();
701 SchedModel = STI->getSchedModel();
702 TSchedModel.init(TSInfo: STI);
703 MRI = &MF.getRegInfo();
704 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
705 Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
706 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
707 MBFI = (PSI && PSI->hasProfileSummary()) ?
708 &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
709 nullptr;
710 TraceEnsemble = nullptr;
711 RegClassInfo.runOnMachineFunction(MF);
712
713 LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
714 if (!TII->useMachineCombiner()) {
715 LLVM_DEBUG(
716 dbgs()
717 << " Skipping pass: Target does not support machine combiner\n");
718 return false;
719 }
720
721 bool Changed = false;
722
723 // Try to combine instructions.
724 for (auto &MBB : MF)
725 Changed |= combineInstructions(MBB: &MBB);
726
727 return Changed;
728}
729