1//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This implements the ScheduleDAGInstrs class, which implements
10/// re-scheduling of MachineInstrs.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/CodeGen/ScheduleDAGInstrs.h"
15
16#include "llvm/ADT/IntEqClasses.h"
17#include "llvm/ADT/MapVector.h"
18#include "llvm/ADT/SmallVector.h"
19#include "llvm/ADT/SparseSet.h"
20#include "llvm/ADT/iterator_range.h"
21#include "llvm/Analysis/AliasAnalysis.h"
22#include "llvm/Analysis/ValueTracking.h"
23#include "llvm/CodeGen/LiveIntervals.h"
24#include "llvm/CodeGen/LivePhysRegs.h"
25#include "llvm/CodeGen/MachineBasicBlock.h"
26#include "llvm/CodeGen/MachineFrameInfo.h"
27#include "llvm/CodeGen/MachineFunction.h"
28#include "llvm/CodeGen/MachineInstr.h"
29#include "llvm/CodeGen/MachineInstrBundle.h"
30#include "llvm/CodeGen/MachineMemOperand.h"
31#include "llvm/CodeGen/MachineOperand.h"
32#include "llvm/CodeGen/MachineRegisterInfo.h"
33#include "llvm/CodeGen/PseudoSourceValue.h"
34#include "llvm/CodeGen/RegisterPressure.h"
35#include "llvm/CodeGen/ScheduleDAG.h"
36#include "llvm/CodeGen/ScheduleDFS.h"
37#include "llvm/CodeGen/SlotIndexes.h"
38#include "llvm/CodeGen/TargetInstrInfo.h"
39#include "llvm/CodeGen/TargetRegisterInfo.h"
40#include "llvm/CodeGen/TargetSubtargetInfo.h"
41#include "llvm/Config/llvm-config.h"
42#include "llvm/IR/Constants.h"
43#include "llvm/IR/Function.h"
44#include "llvm/IR/Type.h"
45#include "llvm/IR/Value.h"
46#include "llvm/MC/LaneBitmask.h"
47#include "llvm/MC/MCRegisterInfo.h"
48#include "llvm/Support/Casting.h"
49#include "llvm/Support/CommandLine.h"
50#include "llvm/Support/Compiler.h"
51#include "llvm/Support/Debug.h"
52#include "llvm/Support/ErrorHandling.h"
53#include "llvm/Support/Format.h"
54#include "llvm/Support/raw_ostream.h"
55#include <algorithm>
56#include <cassert>
57#include <iterator>
58#include <utility>
59#include <vector>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "machine-scheduler"
64
65static cl::opt<bool>
66 EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
67 cl::desc("Enable use of AA during MI DAG construction"));
68
69static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
70 cl::init(Val: true), cl::desc("Enable use of TBAA during MI DAG construction"));
71
72static cl::opt<bool>
73 EnableSchedModel("schedmodel", cl::Hidden, cl::init(Val: true),
74 cl::desc("Use TargetSchedModel for latency lookup"));
75
76static cl::opt<bool>
77 EnableSchedItins("scheditins", cl::Hidden, cl::init(Val: true),
78 cl::desc("Use InstrItineraryData for latency lookup"));
79
80// Note: the two options below might be used in tuning compile time vs
81// output quality. Setting HugeRegion so large that it will never be
82// reached means best-effort, but may be slow.
83
84// When the Stores and Loads maps (or NonAliasStores and NonAliasLoads)
85// together hold this many SUs, the maps will be reduced.
86static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
87 cl::init(Val: 1000), cl::desc("The limit to use while constructing the DAG "
88 "prior to scheduling, at which point a trade-off "
89 "is made to avoid excessive compile time."));
90
91static cl::opt<unsigned> ReductionSize(
92 "dag-maps-reduction-size", cl::Hidden,
93 cl::desc("A huge scheduling region will have maps reduced by this many "
94 "nodes at a time. Defaults to HugeRegion / 2."));
95
96#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
97static cl::opt<bool> SchedPrintCycles(
98 "sched-print-cycles", cl::Hidden, cl::init(false),
99 cl::desc("Report top/bottom cycles when dumping SUnit instances"));
100#endif
101
102static unsigned getReductionSize() {
103 // Always reduce a huge region by half of its elements, except
104 // when the user sets this number explicitly.
105 if (ReductionSize.getNumOccurrences() == 0)
106 return HugeRegion / 2;
107 return ReductionSize;
108}
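// For illustration: with the default dag-maps-huge-region of 1000 and no
// explicit dag-maps-reduction-size, each reduction removes 1000 / 2 = 500
// nodes from the maps at a time.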
109
110static void dumpSUList(const ScheduleDAGInstrs::SUList &L) {
111#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
112 dbgs() << "{ ";
113 for (const SUnit *SU : L) {
114 dbgs() << "SU(" << SU->NodeNum << ")";
115 if (SU != L.back())
116 dbgs() << ", ";
117 }
118 dbgs() << "}\n";
119#endif
120}
121
122ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
123 const MachineLoopInfo *mli,
124 bool RemoveKillFlags)
125 : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
126 RemoveKillFlags(RemoveKillFlags),
127 UnknownValue(UndefValue::get(
128 T: Type::getVoidTy(C&: mf.getFunction().getContext()))), Topo(SUnits, &ExitSU) {
129 DbgValues.clear();
130
131 const TargetSubtargetInfo &ST = mf.getSubtarget();
132 SchedModel.init(TSInfo: &ST, EnableSModel: EnableSchedModel, EnableSItins: EnableSchedItins);
133}
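// Illustrative driver sequence (a sketch, not the authoritative API contract):
// a client such as the machine scheduler typically calls
//   startBlock(MBB); enterRegion(MBB, Begin, End, NumInstrs);
//   buildSchedGraph(AA); /* schedule the region */ exitRegion(); finishBlock();
// for each scheduling region in a block.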
134
135/// If this machine instr has memory reference information and it can be
136/// tracked to a normal reference to a known object, return the Value
137/// for that object. This function returns false if the memory location is
138/// unknown or may alias anything.
139static bool getUnderlyingObjectsForInstr(const MachineInstr *MI,
140 const MachineFrameInfo &MFI,
141 UnderlyingObjectsVector &Objects,
142 const DataLayout &DL) {
143 auto AllMMOsOkay = [&]() {
144 for (const MachineMemOperand *MMO : MI->memoperands()) {
145 // TODO: Figure out whether isAtomic is really necessary (see D57601).
146 if (MMO->isVolatile() || MMO->isAtomic())
147 return false;
148
149 if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
150 // Functions that contain tail calls don't have unique PseudoSourceValue
151 // objects. Two PseudoSourceValues might refer to the same or
152 // overlapping locations. The client code calling this function assumes
153 // this is not the case. So return a conservative answer of no known
154 // object.
155 if (MFI.hasTailCall())
156 return false;
157
158 // For now, ignore PseudoSourceValues which may alias LLVM IR values
159 // because the code that uses this function has no way to cope with
160 // such aliases.
161 if (PSV->isAliased(&MFI))
162 return false;
163
164 bool MayAlias = PSV->mayAlias(&MFI);
165 Objects.emplace_back(Args&: PSV, Args&: MayAlias);
166 } else if (const Value *V = MMO->getValue()) {
167 SmallVector<Value *, 4> Objs;
168 if (!getUnderlyingObjectsForCodeGen(V, Objects&: Objs))
169 return false;
170
171 for (Value *V : Objs) {
172 assert(isIdentifiedObject(V));
173 Objects.emplace_back(Args&: V, Args: true);
174 }
175 } else
176 return false;
177 }
178 return true;
179 };
180
181 if (!AllMMOsOkay()) {
182 Objects.clear();
183 return false;
184 }
185
186 return true;
187}
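// For example, a load whose memory operand refers to a fixed stack slot or an
// identified IR object (e.g. an alloca) yields that object here; a load
// through an arbitrary pointer has no identified underlying object, so this
// returns false and the caller falls back to conservative chain dependencies.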
188
189void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
190 BB = bb;
191}
192
193void ScheduleDAGInstrs::finishBlock() {
194 // Subclasses should no longer refer to the old block.
195 BB = nullptr;
196}
197
198void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
199 MachineBasicBlock::iterator begin,
200 MachineBasicBlock::iterator end,
201 unsigned regioninstrs) {
202 assert(bb == BB && "startBlock should set BB");
203 RegionBegin = begin;
204 RegionEnd = end;
205 NumRegionInstrs = regioninstrs;
206}
207
208void ScheduleDAGInstrs::exitRegion() {
209 // Nothing to do.
210}
211
212void ScheduleDAGInstrs::addSchedBarrierDeps() {
213 MachineInstr *ExitMI =
214 RegionEnd != BB->end()
215 ? &*skipDebugInstructionsBackward(It: RegionEnd, Begin: RegionBegin)
216 : nullptr;
217 ExitSU.setInstr(ExitMI);
218 // Add dependencies on the defs and uses of the instruction.
219 if (ExitMI) {
220 const MCInstrDesc &MIDesc = ExitMI->getDesc();
221 for (const MachineOperand &MO : ExitMI->all_uses()) {
222 unsigned OpIdx = MO.getOperandNo();
223 Register Reg = MO.getReg();
224 if (Reg.isPhysical()) {
225 // addPhysRegDataDeps uses the provided operand index to retrieve
226 // the operand use cycle from the scheduling model. If the operand
227 // is "fake" (e.g., an operand of a call instruction used to pass
228 // an argument to the called function), the scheduling model may not
229 // have an entry for it. If this is the case, pass -1 as operand index,
230 // which will cause addPhysRegDataDeps to add an artificial dependency.
231 // FIXME: Using hasImplicitUseOfPhysReg here is inaccurate as it misses
232 // aliases. When fixing, make sure to update addPhysRegDataDeps, too.
233 bool IsRealUse = OpIdx < MIDesc.getNumOperands() ||
234 MIDesc.hasImplicitUseOfPhysReg(Reg);
235 for (MCRegUnit Unit : TRI->regunits(Reg))
236 Uses.insert(Val: PhysRegSUOper(&ExitSU, IsRealUse ? OpIdx : -1, Unit));
237 } else if (Reg.isVirtual() && MO.readsReg()) {
238 addVRegUseDeps(SU: &ExitSU, OperIdx: OpIdx);
239 }
240 }
241 }
242 if (!ExitMI || (!ExitMI->isCall() && !ExitMI->isBarrier())) {
243 // For others, e.g. fallthrough, conditional branch, assume the exit
244 // uses all the registers that are live-in to the successor blocks.
245 for (const MachineBasicBlock *Succ : BB->successors()) {
246 for (const auto &LI : Succ->liveins()) {
247 for (MCRegUnitMaskIterator U(LI.PhysReg, TRI); U.isValid(); ++U) {
248 auto [Unit, Mask] = *U;
249 if ((Mask & LI.LaneMask).any() && !Uses.contains(Key: Unit))
250 Uses.insert(Val: PhysRegSUOper(&ExitSU, -1, Unit));
251 }
252 }
253 }
254 }
255}
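// Net effect (summary): the register uses of the instruction ending the
// region, and, for non-call/non-barrier exits, the live-ins of the successor
// blocks, are recorded as uses of ExitSU. When the defs inside the region are
// processed later, they therefore get data edges to ExitSU rather than looking
// dead within the region.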
256
257/// MO is an operand of SU's instruction that defines a physical register. Adds
258/// data dependencies from SU to any uses of the physical register.
259void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
260 const MachineOperand &MO = SU->getInstr()->getOperand(i: OperIdx);
261 assert(MO.isDef() && "expect physreg def");
262 Register Reg = MO.getReg();
263
264 // Ask the target if address-backscheduling is desirable, and if so how much.
265 const TargetSubtargetInfo &ST = MF.getSubtarget();
266
267 // Only use a non-zero latency for real defs/uses, in contrast to
268 // "fake" operands added by regalloc.
269 const MCInstrDesc &DefMIDesc = SU->getInstr()->getDesc();
270 bool ImplicitPseudoDef = (OperIdx >= DefMIDesc.getNumOperands() &&
271 !DefMIDesc.hasImplicitDefOfPhysReg(Reg));
272 for (MCRegUnit Unit : TRI->regunits(Reg)) {
273 for (RegUnit2SUnitsMap::iterator I = Uses.find(Key: Unit); I != Uses.end();
274 ++I) {
275 SUnit *UseSU = I->SU;
276 if (UseSU == SU)
277 continue;
278
279 // Adjust the dependence latency using operand def/use information,
280 // then allow the target to perform its own adjustments.
281 MachineInstr *UseInstr = nullptr;
282 int UseOpIdx = I->OpIdx;
283 bool ImplicitPseudoUse = false;
284 SDep Dep;
285 if (UseOpIdx < 0) {
286 Dep = SDep(SU, SDep::Artificial);
287 } else {
288 // Set the hasPhysRegDefs only for physreg defs that have a use within
289 // the scheduling region.
290 SU->hasPhysRegDefs = true;
291
292 UseInstr = UseSU->getInstr();
293 Register UseReg = UseInstr->getOperand(i: UseOpIdx).getReg();
294 const MCInstrDesc &UseMIDesc = UseInstr->getDesc();
295 ImplicitPseudoUse = UseOpIdx >= ((int)UseMIDesc.getNumOperands()) &&
296 !UseMIDesc.hasImplicitUseOfPhysReg(Reg: UseReg);
297
298 Dep = SDep(SU, SDep::Data, UseReg);
299 }
300 if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
301 Dep.setLatency(SchedModel.computeOperandLatency(DefMI: SU->getInstr(), DefOperIdx: OperIdx,
302 UseMI: UseInstr, UseOperIdx: UseOpIdx));
303 } else {
304 Dep.setLatency(0);
305 }
306 ST.adjustSchedDependency(Def: SU, DefOpIdx: OperIdx, Use: UseSU, UseOpIdx, Dep, SchedModel: &SchedModel);
307 UseSU->addPred(D: Dep);
308 }
309 }
310}
311
312/// Adds register dependencies (data, anti, and output) from this SUnit
313/// to following instructions in the same scheduling region that depend on the
314/// physical register referenced at OperIdx.
315void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
316 MachineInstr *MI = SU->getInstr();
317 MachineOperand &MO = MI->getOperand(i: OperIdx);
318 Register Reg = MO.getReg();
319 // We do not need to track any dependencies for constant registers.
320 if (MRI.isConstantPhysReg(PhysReg: Reg))
321 return;
322
323 const TargetSubtargetInfo &ST = MF.getSubtarget();
324
325 // Optionally add output and anti dependencies. For anti
326 // dependencies we use a latency of 0 because for a multi-issue
327 // target we want to allow the defining instruction to issue
328 // in the same cycle as the using instruction.
329 // TODO: Using a latency of 1 here for output dependencies assumes
330 // there's no cost for reusing registers.
331 SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
332 for (MCRegUnit Unit : TRI->regunits(Reg)) {
333 for (RegUnit2SUnitsMap::iterator I = Defs.find(Key: Unit); I != Defs.end();
334 ++I) {
335 SUnit *DefSU = I->SU;
336 if (DefSU == &ExitSU)
337 continue;
338 MachineInstr *DefInstr = DefSU->getInstr();
339 MachineOperand &DefMO = DefInstr->getOperand(i: I->OpIdx);
340 if (DefSU != SU &&
341 (Kind != SDep::Output || !MO.isDead() || !DefMO.isDead())) {
342 SDep Dep(SU, Kind, DefMO.getReg());
343 if (Kind != SDep::Anti) {
344 Dep.setLatency(
345 SchedModel.computeOutputLatency(DefMI: MI, DefOperIdx: OperIdx, DepMI: DefInstr));
346 }
347 ST.adjustSchedDependency(Def: SU, DefOpIdx: OperIdx, Use: DefSU, UseOpIdx: I->OpIdx, Dep,
348 SchedModel: &SchedModel);
349 DefSU->addPred(D: Dep);
350 }
351 }
352 }
353
354 if (MO.isUse()) {
355 SU->hasPhysRegUses = true;
356 // Either insert a new Reg2SUnits entry with an empty SUnits list, or
357 // retrieve the existing SUnits list for this register's uses.
358 // Push this SUnit on the use list.
359 for (MCRegUnit Unit : TRI->regunits(Reg))
360 Uses.insert(Val: PhysRegSUOper(SU, OperIdx, Unit));
361 if (RemoveKillFlags)
362 MO.setIsKill(false);
363 } else {
364 addPhysRegDataDeps(SU, OperIdx);
365
366 // Clear previous uses and defs of this register and its subregisters.
367 for (MCRegUnit Unit : TRI->regunits(Reg)) {
368 Uses.eraseAll(K: Unit);
369 if (!MO.isDead())
370 Defs.eraseAll(K: Unit);
371 }
372
373 if (MO.isDead() && SU->isCall) {
374 // Calls will not be reordered because of chain dependencies (see
375 // below). Since call operands are dead, calls may continue to be added
376 // to the DefList making dependence checking quadratic in the size of
377 // the block. Instead, we leave only one call at the back of the
378 // DefList.
379 for (MCRegUnit Unit : TRI->regunits(Reg)) {
380 RegUnit2SUnitsMap::RangePair P = Defs.equal_range(K: Unit);
381 RegUnit2SUnitsMap::iterator B = P.first;
382 RegUnit2SUnitsMap::iterator I = P.second;
383 for (bool isBegin = I == B; !isBegin; /* empty */) {
384 isBegin = (--I) == B;
385 if (!I->SU->isCall)
386 break;
387 I = Defs.erase(I);
388 }
389 }
390 }
391
392 // Defs are pushed in the order they are visited and never reordered.
393 for (MCRegUnit Unit : TRI->regunits(Reg))
394 Defs.insert(Val: PhysRegSUOper(SU, OperIdx, Unit));
395 }
396}
397
398LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
399{
400 Register Reg = MO.getReg();
401 // No point in tracking lanemasks if we don't have interesting subregisters.
402 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
403 if (!RC.HasDisjunctSubRegs)
404 return LaneBitmask::getAll();
405
406 unsigned SubReg = MO.getSubReg();
407 if (SubReg == 0)
408 return RC.getLaneMask();
409 return TRI->getSubRegIndexLaneMask(SubIdx: SubReg);
410}
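// Illustrative example (target-dependent, not from this file): on a target
// where a 64-bit register is built from two 32-bit subregisters, an operand
// writing only the low subregister gets a lane mask covering just the low
// lanes, while an operand with no subregister index gets the register class's
// full lane mask.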
411
412bool ScheduleDAGInstrs::deadDefHasNoUse(const MachineOperand &MO) {
413 auto RegUse = CurrentVRegUses.find(Key: MO.getReg());
414 if (RegUse == CurrentVRegUses.end())
415 return true;
416 return (RegUse->LaneMask & getLaneMaskForMO(MO)).none();
417}
418
419/// Adds register output and data dependencies from this SUnit to instructions
420/// that occur later in the same scheduling region if they read from or write to
421/// the virtual register defined at OperIdx.
422///
423/// TODO: Hoist loop induction variable increments. This has to be
424/// reevaluated. Generally, IV scheduling should be done before coalescing.
425void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
426 MachineInstr *MI = SU->getInstr();
427 MachineOperand &MO = MI->getOperand(i: OperIdx);
428 Register Reg = MO.getReg();
429
430 LaneBitmask DefLaneMask;
431 LaneBitmask KillLaneMask;
432 if (TrackLaneMasks) {
433 bool IsKill = MO.getSubReg() == 0 || MO.isUndef();
434 DefLaneMask = getLaneMaskForMO(MO);
435 // If we have a <read-undef> flag, none of the lane values comes from an
436 // earlier instruction.
437 KillLaneMask = IsKill ? LaneBitmask::getAll() : DefLaneMask;
438
439 if (MO.getSubReg() != 0 && MO.isUndef()) {
440 // There may be other subregister defs of the same register on the same
441 // instruction in later operands. The lanes of other defs will now be live
442 // after this instruction, so these should not be treated as killed by the
443 // instruction even though they appear to be killed in this one operand.
444 for (const MachineOperand &OtherMO :
445 llvm::drop_begin(RangeOrContainer: MI->operands(), N: OperIdx + 1))
446 if (OtherMO.isReg() && OtherMO.isDef() && OtherMO.getReg() == Reg)
447 KillLaneMask &= ~getLaneMaskForMO(MO: OtherMO);
448 }
449
450 // Clear undef flag, we'll re-add it later once we know which subregister
451 // Def is first.
452 MO.setIsUndef(false);
453 } else {
454 DefLaneMask = LaneBitmask::getAll();
455 KillLaneMask = LaneBitmask::getAll();
456 }
457
458 if (MO.isDead()) {
459 assert(deadDefHasNoUse(MO) && "Dead defs should have no uses");
460 } else {
461 // Add data dependence to all uses we found so far.
462 const TargetSubtargetInfo &ST = MF.getSubtarget();
463 for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Key: Reg),
464 E = CurrentVRegUses.end(); I != E; /*empty*/) {
465 LaneBitmask LaneMask = I->LaneMask;
466 // Ignore uses of other lanes.
467 if ((LaneMask & KillLaneMask).none()) {
468 ++I;
469 continue;
470 }
471
472 if ((LaneMask & DefLaneMask).any()) {
473 SUnit *UseSU = I->SU;
474 MachineInstr *Use = UseSU->getInstr();
475 SDep Dep(SU, SDep::Data, Reg);
476 Dep.setLatency(SchedModel.computeOperandLatency(DefMI: MI, DefOperIdx: OperIdx, UseMI: Use,
477 UseOperIdx: I->OperandIndex));
478 ST.adjustSchedDependency(Def: SU, DefOpIdx: OperIdx, Use: UseSU, UseOpIdx: I->OperandIndex, Dep,
479 SchedModel: &SchedModel);
480 UseSU->addPred(D: Dep);
481 }
482
483 LaneMask &= ~KillLaneMask;
484 // If we found a Def for all lanes of this use, remove it from the list.
485 if (LaneMask.any()) {
486 I->LaneMask = LaneMask;
487 ++I;
488 } else
489 I = CurrentVRegUses.erase(I);
490 }
491 }
492
493 // Shortcut: Singly defined vregs do not have output/anti dependencies.
494 if (MRI.hasOneDef(RegNo: Reg))
495 return;
496
497 // Add output dependence to the next nearest defs of this vreg.
498 //
499 // Unless this definition is dead, the output dependence should be
500 // transitively redundant with antidependencies from this definition's
501 // uses. We're conservative for now until we have a way to guarantee the uses
502 // are not eliminated sometime during scheduling. The output dependence edge
503 // is also useful if output latency exceeds def-use latency.
504 LaneBitmask LaneMask = DefLaneMask;
505 for (VReg2SUnit &V2SU : make_range(x: CurrentVRegDefs.find(Key: Reg),
506 y: CurrentVRegDefs.end())) {
507 // Ignore defs for other lanes.
508 if ((V2SU.LaneMask & LaneMask).none())
509 continue;
510 // Add an output dependence.
511 SUnit *DefSU = V2SU.SU;
512 // Ignore additional defs of the same lanes in one instruction. This can
513 // happen because lanemasks are shared for targets with too many
514 // subregisters. We also use some representation tricks/hacks where we
515 // add super-register defs/uses, to imply that although we only access parts
516 // of the reg we care about the full one.
517 if (DefSU == SU)
518 continue;
519 SDep Dep(SU, SDep::Output, Reg);
520 Dep.setLatency(
521 SchedModel.computeOutputLatency(DefMI: MI, DefOperIdx: OperIdx, DepMI: DefSU->getInstr()));
522 DefSU->addPred(D: Dep);
523
524 // Update current definition. This can get tricky if the def was about a
525 // bigger lanemask before. We then have to shrink it and create a new
526 // VReg2SUnit for the non-overlapping part.
527 LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
528 LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
529 V2SU.SU = SU;
530 V2SU.LaneMask = OverlapMask;
531 if (NonOverlapMask.any())
532 CurrentVRegDefs.insert(Val: VReg2SUnit(Reg, NonOverlapMask, DefSU));
533 }
534 // If there was no CurrentVRegDefs entry for some lanes yet, create one.
535 if (LaneMask.any())
536 CurrentVRegDefs.insert(Val: VReg2SUnit(Reg, LaneMask, SU));
537}
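// Worked example (illustrative): suppose an earlier def DefSU covered lanes
// {lo,hi} and this def covers only {lo}. The existing entry is taken over by
// this SU for the overlapping lane {lo}, and a new VReg2SUnit with lane {hi}
// is inserted for DefSU, so the earlier def remains current for the lanes this
// instruction does not touch.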
538
539/// Adds a register data dependency if the instruction that defines the
540/// virtual register used at OperIdx is mapped to an SUnit. Add a register
541/// antidependency from this SUnit to instructions that occur later in the same
542/// scheduling region if they write the virtual register.
543///
544/// TODO: Handle ExitSU "uses" properly.
545void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
546 const MachineInstr *MI = SU->getInstr();
547 assert(!MI->isDebugOrPseudoInstr());
548
549 const MachineOperand &MO = MI->getOperand(i: OperIdx);
550 Register Reg = MO.getReg();
551
552 // Remember the use. Data dependencies will be added when we find the def.
553 LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO)
554 : LaneBitmask::getAll();
555 CurrentVRegUses.insert(Val: VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU));
556
557 // Add antidependences to the following defs of the vreg.
558 for (VReg2SUnit &V2SU : make_range(x: CurrentVRegDefs.find(Key: Reg),
559 y: CurrentVRegDefs.end())) {
560 // Ignore defs for unrelated lanes.
561 LaneBitmask PrevDefLaneMask = V2SU.LaneMask;
562 if ((PrevDefLaneMask & LaneMask).none())
563 continue;
564 if (V2SU.SU == SU)
565 continue;
566
567 V2SU.SU->addPred(D: SDep(SU, SDep::Anti, Reg));
568 }
569}
570
571
572void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
573 unsigned Latency) {
574 if (SUa->getInstr()->mayAlias(AA: getAAForDep(), Other: *SUb->getInstr(), UseTBAA)) {
575 SDep Dep(SUa, SDep::MayAliasMem);
576 Dep.setLatency(Latency);
577 SUb->addPred(D: Dep);
578 }
579}
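// In other words, a chain edge is only added when the two instructions'
// memory accesses may alias according to the (optional) AA query above;
// accesses proven disjoint (e.g. distinct fixed stack slots) remain free to
// be reordered relative to each other.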
580
581/// Creates an SUnit for each real instruction, numbered in top-down
582/// topological order. The instruction order A < B implies that no edge exists
583/// from B to A.
584///
585/// Map each real instruction to its SUnit.
586///
587/// After initSUnits, the SUnits vector cannot be resized and the scheduler may
588/// hang onto SUnit pointers. We may relax this in the future by using SUnit IDs
589/// instead of pointers.
590///
591/// MachineScheduler relies on initSUnits numbering the nodes by their order in
592/// the original instruction list.
593void ScheduleDAGInstrs::initSUnits() {
594 // We'll be allocating one SUnit for each real instruction in the region,
595 // which is contained within a basic block.
596 SUnits.reserve(n: NumRegionInstrs);
597
598 for (MachineInstr &MI : make_range(x: RegionBegin, y: RegionEnd)) {
599 if (MI.isDebugOrPseudoInstr())
600 continue;
601
602 SUnit *SU = newSUnit(MI: &MI);
603 MISUnitMap[&MI] = SU;
604
605 SU->isCall = MI.isCall();
606 SU->isCommutable = MI.isCommutable();
607
608 // Assign the Latency field of SU using target-provided information.
609 SU->Latency = SchedModel.computeInstrLatency(MI: SU->getInstr());
610
611 // If this SUnit uses a reserved or unbuffered resource, mark it as such.
612 //
613 // Reserved resources block an instruction from issuing and stall the
614 // entire pipeline. These are identified by BufferSize=0.
615 //
616 // Unbuffered resources prevent execution of subsequent instructions that
617 // require the same resources. This is used for in-order execution pipelines
618 // within an out-of-order core. These are identified by BufferSize=1.
619 if (SchedModel.hasInstrSchedModel()) {
620 const MCSchedClassDesc *SC = getSchedClass(SU);
621 for (const MCWriteProcResEntry &PRE :
622 make_range(x: SchedModel.getWriteProcResBegin(SC),
623 y: SchedModel.getWriteProcResEnd(SC))) {
624 switch (SchedModel.getProcResource(PIdx: PRE.ProcResourceIdx)->BufferSize) {
625 case 0:
626 SU->hasReservedResource = true;
627 break;
628 case 1:
629 SU->isUnbuffered = true;
630 break;
631 default:
632 break;
633 }
634 }
635 }
636 }
637}
638
639class ScheduleDAGInstrs::Value2SUsMap
640 : public SmallMapVector<ValueType, SUList, 4> {
641 /// Current total number of SUs in map.
642 unsigned NumNodes = 0;
643
644 /// 1 for loads, 0 for stores. (see comment in SUList)
645 unsigned TrueMemOrderLatency;
646
647public:
648 Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {}
649
650 /// To keep NumNodes up to date, insert() is used instead of
651 /// this operator w/ push_back().
652 ValueType &operator[](const SUList &Key) {
653 llvm_unreachable("Don't use. Use insert() instead."); }
654
655 /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling
656 /// reduce().
657 void inline insert(SUnit *SU, ValueType V) {
658 MapVector::operator[](Key: V).push_back(x: SU);
659 NumNodes++;
660 }
661
662 /// Clears the list of SUs mapped to V.
663 void inline clearList(ValueType V) {
664 iterator Itr = find(Key: V);
665 if (Itr != end()) {
666 assert(NumNodes >= Itr->second.size());
667 NumNodes -= Itr->second.size();
668
669 Itr->second.clear();
670 }
671 }
672
673 /// Clears map from all contents.
674 void clear() {
675 SmallMapVector<ValueType, SUList, 4>::clear();
676 NumNodes = 0;
677 }
678
679 unsigned inline size() const { return NumNodes; }
680
681 /// Counts the number of SUs in this map after a reduction.
682 void reComputeSize() {
683 NumNodes = 0;
684 for (auto &I : *this)
685 NumNodes += I.second.size();
686 }
687
688 unsigned inline getTrueMemOrderLatency() const {
689 return TrueMemOrderLatency;
690 }
691
692 void dump();
693};
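// Usage sketch (illustrative): buildSchedGraph() below keeps four such maps
// (Stores, Loads, NonAliasStores, NonAliasLoads). A store to object V is
// recorded with insert(SU, V); a later access to V looks up that SUList via
// addChainDependencies(SU, Map, V) and adds the necessary chain edges.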
694
695void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
696 Value2SUsMap &Val2SUsMap) {
697 for (auto &I : Val2SUsMap)
698 addChainDependencies(SU, SUs&: I.second,
699 Latency: Val2SUsMap.getTrueMemOrderLatency());
700}
701
702void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
703 Value2SUsMap &Val2SUsMap,
704 ValueType V) {
705 Value2SUsMap::iterator Itr = Val2SUsMap.find(Key: V);
706 if (Itr != Val2SUsMap.end())
707 addChainDependencies(SU, SUs&: Itr->second,
708 Latency: Val2SUsMap.getTrueMemOrderLatency());
709}
710
711void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
712 assert(BarrierChain != nullptr);
713
714 for (auto &[V, SUs] : map) {
715 (void)V;
716 for (auto *SU : SUs)
717 SU->addPredBarrier(SU: BarrierChain);
718 }
719 map.clear();
720}
721
722void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
723 assert(BarrierChain != nullptr);
724
725 // Go through all lists of SUs.
726 for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
727 Value2SUsMap::iterator CurrItr = I++;
728 SUList &sus = CurrItr->second;
729 SUList::iterator SUItr = sus.begin(), SUEE = sus.end();
730 for (; SUItr != SUEE; ++SUItr) {
731 // Stop on BarrierChain or any instruction above it.
732 if ((*SUItr)->NodeNum <= BarrierChain->NodeNum)
733 break;
734
735 (*SUItr)->addPredBarrier(SU: BarrierChain);
736 }
737
738 // Also remove the BarrierChain from the list if present.
739 if (SUItr != SUEE && *SUItr == BarrierChain)
740 SUItr++;
741
742 // Remove all SUs that are now successors of BarrierChain.
743 if (SUItr != sus.begin())
744 sus.erase(first: sus.begin(), last: SUItr);
745 }
746
747 // Remove all entries with empty su lists.
748 map.remove_if(Pred: [&](std::pair<ValueType, SUList> &mapEntry) {
749 return (mapEntry.second.empty()); });
750
751 // Recompute the size of the map (NumNodes).
752 map.reComputeSize();
753}
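// Note: NodeNums follow the top-down instruction order (see initSUnits), so
// the check "NodeNum <= BarrierChain->NodeNum" matches SUs at or above the
// barrier; everything below it has just been chained to the barrier and can
// safely be dropped from the map.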
754
755void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA,
756 RegPressureTracker *RPTracker,
757 PressureDiffs *PDiffs,
758 LiveIntervals *LIS,
759 bool TrackLaneMasks) {
760 const TargetSubtargetInfo &ST = MF.getSubtarget();
761 bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
762 : ST.useAA();
763 if (UseAA && AA)
764 AAForDep.emplace(args&: *AA);
765
766 BarrierChain = nullptr;
767
768 this->TrackLaneMasks = TrackLaneMasks;
769 MISUnitMap.clear();
770 ScheduleDAG::clearDAG();
771
772 // Create an SUnit for each real instruction.
773 initSUnits();
774
775 if (PDiffs)
776 PDiffs->init(N: SUnits.size());
777
778 // We build scheduling units by walking a block's instruction list
779 // from bottom to top.
780
781 // Each MI's memory operand(s) is analyzed to find a list of underlying
782 // objects. The SU is then inserted in the SUList(s) mapped from the
783 // Value(s). Each Value thus gets mapped to lists of SUs depending
784 // on it, stores and loads kept separately. Two SUs are trivially
785 // non-aliasing if they both depend on only identified Values and do
786 // not share any common Value.
787 Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);
788
789 // Certain memory accesses are known to not alias any SU in Stores
790 // or Loads, and therefore have their own 'NonAlias'
791 // domain. E.g. spill / reload instructions never alias LLVM IR
792 // values. It would be nice to assume that this type of memory
793 // access always has proper memory operand modelling, and is
794 // therefore never unanalyzable, but this is conservatively not
795 // done.
796 Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);
797
798 // Track all instructions that may raise floating-point exceptions.
799 // These do not depend on one another (or normal loads or stores), but
800 // must not be rescheduled across global barriers. Note that we don't
801 // really need a "map" here since we don't track those MIs by value;
802 // using the same Value2SUsMap data type here is simply a matter of
803 // convenience.
804 Value2SUsMap FPExceptions;
805
806 // Remove any stale debug info; sometimes BuildSchedGraph is called again
807 // without emitting the info from the previous call.
808 DbgValues.clear();
809 FirstDbgValue = nullptr;
810
811 assert(Defs.empty() && Uses.empty() &&
812 "Only BuildGraph should update Defs/Uses");
813 Defs.setUniverse(TRI->getNumRegs());
814 Uses.setUniverse(TRI->getNumRegs());
815
816 assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs");
817 assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses");
818 unsigned NumVirtRegs = MRI.getNumVirtRegs();
819 CurrentVRegDefs.setUniverse(NumVirtRegs);
820 CurrentVRegUses.setUniverse(NumVirtRegs);
821
822 // Model data dependencies between instructions being scheduled and the
823 // ExitSU.
824 addSchedBarrierDeps();
825
826 // Walk the list of instructions, from bottom moving up.
827 MachineInstr *DbgMI = nullptr;
828 for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
829 MII != MIE; --MII) {
830 MachineInstr &MI = *std::prev(x: MII);
831 if (DbgMI) {
832 DbgValues.emplace_back(args&: DbgMI, args: &MI);
833 DbgMI = nullptr;
834 }
835
836 if (MI.isDebugValue() || MI.isDebugPHI()) {
837 DbgMI = &MI;
838 continue;
839 }
840
841 if (MI.isDebugLabel() || MI.isDebugRef() || MI.isPseudoProbe())
842 continue;
843
844 SUnit *SU = MISUnitMap[&MI];
845 assert(SU && "No SUnit mapped to this MI");
846
847 if (RPTracker) {
848 RegisterOperands RegOpers;
849 RegOpers.collect(MI, TRI: *TRI, MRI, TrackLaneMasks, IgnoreDead: false);
850 if (TrackLaneMasks) {
851 SlotIndex SlotIdx = LIS->getInstructionIndex(Instr: MI);
852 RegOpers.adjustLaneLiveness(LIS: *LIS, MRI, Pos: SlotIdx);
853 }
854 if (PDiffs != nullptr)
855 PDiffs->addInstruction(Idx: SU->NodeNum, RegOpers, MRI);
856
857 if (RPTracker->getPos() == RegionEnd || &*RPTracker->getPos() != &MI)
858 RPTracker->recedeSkipDebugValues();
859 assert(&*RPTracker->getPos() == &MI && "RPTracker in sync");
860 RPTracker->recede(RegOpers);
861 }
862
863 assert(
864 (CanHandleTerminators || (!MI.isTerminator() && !MI.isPosition())) &&
865 "Cannot schedule terminators or labels!");
866
867 // Add register-based dependencies (data, anti, and output).
868 // For some instructions (calls, returns, inline-asm, etc.) there can
869 // be explicit uses and implicit defs, in which case the use will appear
870 // on the operand list before the def. Do two passes over the operand
871 // list to make sure that defs are processed before any uses.
872 bool HasVRegDef = false;
873 for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
874 const MachineOperand &MO = MI.getOperand(i: j);
875 if (!MO.isReg() || !MO.isDef())
876 continue;
877 Register Reg = MO.getReg();
878 if (Reg.isPhysical()) {
879 addPhysRegDeps(SU, OperIdx: j);
880 } else if (Reg.isVirtual()) {
881 HasVRegDef = true;
882 addVRegDefDeps(SU, OperIdx: j);
883 }
884 }
885 // Now process all uses.
886 for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
887 const MachineOperand &MO = MI.getOperand(i: j);
888 // Only look at use operands.
889 // We do not need to check for MO.readsReg() here because subsequent
890 // subregister defs will get output dependence edges and need no
891 // additional use dependencies.
892 if (!MO.isReg() || !MO.isUse())
893 continue;
894 Register Reg = MO.getReg();
895 if (Reg.isPhysical()) {
896 addPhysRegDeps(SU, OperIdx: j);
897 } else if (Reg.isVirtual() && MO.readsReg()) {
898 addVRegUseDeps(SU, OperIdx: j);
899 }
900 }
901
902 // If we haven't seen any uses in this scheduling region, create a
903 // dependence edge to ExitSU to model the live-out latency. This is required
904 // for vreg defs with no in-region use, and prefetches with no vreg def.
905 //
906 // FIXME: NumDataSuccs would be more precise than NumSuccs here. This
907 // check currently relies on being called before adding chain deps.
908 if (SU->NumSuccs == 0 && SU->Latency > 1 && (HasVRegDef || MI.mayLoad())) {
909 SDep Dep(SU, SDep::Artificial);
910 Dep.setLatency(SU->Latency - 1);
911 ExitSU.addPred(D: Dep);
912 }
913
914 // Add memory dependencies (Note: isStoreToStackSlot and
915 // isLoadFromStackSlot are not usable after stack slots are lowered to
916 // actual addresses).
917
918 const TargetInstrInfo *TII = ST.getInstrInfo();
919 // This is a barrier event that acts as a pivotal node in the DAG.
920 if (TII->isGlobalMemoryObject(MI: &MI)) {
921
922 // Become the barrier chain.
923 if (BarrierChain)
924 BarrierChain->addPredBarrier(SU);
925 BarrierChain = SU;
926
927 LLVM_DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
928 << BarrierChain->NodeNum << ").\n");
929
930 // Add dependencies against everything below it and clear maps.
931 addBarrierChain(map&: Stores);
932 addBarrierChain(map&: Loads);
933 addBarrierChain(map&: NonAliasStores);
934 addBarrierChain(map&: NonAliasLoads);
935 addBarrierChain(map&: FPExceptions);
936
937 continue;
938 }
939
940 // Instructions that may raise FP exceptions may not be moved
941 // across any global barriers.
942 if (MI.mayRaiseFPException()) {
943 if (BarrierChain)
944 BarrierChain->addPredBarrier(SU);
945
946 FPExceptions.insert(SU, V: UnknownValue);
947
948 if (FPExceptions.size() >= HugeRegion) {
949 LLVM_DEBUG(dbgs() << "Reducing FPExceptions map.\n");
950 Value2SUsMap empty;
951 reduceHugeMemNodeMaps(stores&: FPExceptions, loads&: empty, N: getReductionSize());
952 }
953 }
954
955 // If it's not a store or a variant load, we're done.
956 if (!MI.mayStore() &&
957 !(MI.mayLoad() && !MI.isDereferenceableInvariantLoad()))
958 continue;
959
960 // Always add a dependency edge to BarrierChain if present.
961 if (BarrierChain)
962 BarrierChain->addPredBarrier(SU);
963
964 // Find the underlying objects for MI. The Objs vector is either
965 // empty, or filled with the Values of memory locations which this
966 // SU depends on.
967 UnderlyingObjectsVector Objs;
968 bool ObjsFound = getUnderlyingObjectsForInstr(MI: &MI, MFI, Objects&: Objs,
969 DL: MF.getDataLayout());
970
971 if (MI.mayStore()) {
972 if (!ObjsFound) {
973 // An unknown store depends on all stores and loads.
974 addChainDependencies(SU, Val2SUsMap&: Stores);
975 addChainDependencies(SU, Val2SUsMap&: NonAliasStores);
976 addChainDependencies(SU, Val2SUsMap&: Loads);
977 addChainDependencies(SU, Val2SUsMap&: NonAliasLoads);
978
979 // Map this store to 'UnknownValue'.
980 Stores.insert(SU, V: UnknownValue);
981 } else {
982 // Add precise dependencies against all previously seen memory
983 // accesses mapped to the same Value(s).
984 for (const UnderlyingObject &UnderlObj : Objs) {
985 ValueType V = UnderlObj.getValue();
986 bool ThisMayAlias = UnderlObj.mayAlias();
987
988 // Add dependencies to previous stores and loads mapped to V.
989 addChainDependencies(SU, Val2SUsMap&: (ThisMayAlias ? Stores : NonAliasStores), V);
990 addChainDependencies(SU, Val2SUsMap&: (ThisMayAlias ? Loads : NonAliasLoads), V);
991 }
992 // Update the store map after all chains have been added to avoid adding
993 // a self-loop edge if multiple underlying objects are present.
994 for (const UnderlyingObject &UnderlObj : Objs) {
995 ValueType V = UnderlObj.getValue();
996 bool ThisMayAlias = UnderlObj.mayAlias();
997
998 // Map this store to V.
999 (ThisMayAlias ? Stores : NonAliasStores).insert(SU, V);
1000 }
1001 // The store may have dependencies to unanalyzable loads and
1002 // stores.
1003 addChainDependencies(SU, Val2SUsMap&: Loads, V: UnknownValue);
1004 addChainDependencies(SU, Val2SUsMap&: Stores, V: UnknownValue);
1005 }
1006 } else { // SU is a load.
1007 if (!ObjsFound) {
1008 // An unknown load depends on all stores.
1009 addChainDependencies(SU, Val2SUsMap&: Stores);
1010 addChainDependencies(SU, Val2SUsMap&: NonAliasStores);
1011
1012 Loads.insert(SU, V: UnknownValue);
1013 } else {
1014 for (const UnderlyingObject &UnderlObj : Objs) {
1015 ValueType V = UnderlObj.getValue();
1016 bool ThisMayAlias = UnderlObj.mayAlias();
1017
1018 // Add precise dependencies against all previously seen stores
1019 // mapping to the same Value(s).
1020 addChainDependencies(SU, Val2SUsMap&: (ThisMayAlias ? Stores : NonAliasStores), V);
1021
1022 // Map this load to V.
1023 (ThisMayAlias ? Loads : NonAliasLoads).insert(SU, V);
1024 }
1025 // The load may have dependencies to unanalyzable stores.
1026 addChainDependencies(SU, Val2SUsMap&: Stores, V: UnknownValue);
1027 }
1028 }
1029
1030 // Reduce maps if they grow huge.
1031 if (Stores.size() + Loads.size() >= HugeRegion) {
1032 LLVM_DEBUG(dbgs() << "Reducing Stores and Loads maps.\n");
1033 reduceHugeMemNodeMaps(stores&: Stores, loads&: Loads, N: getReductionSize());
1034 }
1035 if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
1036 LLVM_DEBUG(dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n");
1037 reduceHugeMemNodeMaps(stores&: NonAliasStores, loads&: NonAliasLoads, N: getReductionSize());
1038 }
1039 }
1040
1041 if (DbgMI)
1042 FirstDbgValue = DbgMI;
1043
1044 Defs.clear();
1045 Uses.clear();
1046 CurrentVRegDefs.clear();
1047 CurrentVRegUses.clear();
1048
1049 Topo.MarkDirty();
1050}
1051
1052raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) {
1053 PSV->printCustom(O&: OS);
1054 return OS;
1055}
1056
1057void ScheduleDAGInstrs::Value2SUsMap::dump() {
1058 for (const auto &[ValType, SUs] : *this) {
1059 if (isa<const Value *>(Val: ValType)) {
1060 const Value *V = cast<const Value *>(Val: ValType);
1061 if (isa<UndefValue>(Val: V))
1062 dbgs() << "Unknown";
1063 else
1064 V->printAsOperand(O&: dbgs());
1065 } else if (isa<const PseudoSourceValue *>(Val: ValType))
1066 dbgs() << cast<const PseudoSourceValue *>(Val: ValType);
1067 else
1068 llvm_unreachable("Unknown Value type.");
1069
1070 dbgs() << " : ";
1071 dumpSUList(L: SUs);
1072 }
1073}
1074
1075void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
1076 Value2SUsMap &loads, unsigned N) {
1077 LLVM_DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n"; stores.dump();
1078 dbgs() << "Loading SUnits:\n"; loads.dump());
1079
1080 // Insert all SUs' NodeNums into a vector and sort it.
1081 std::vector<unsigned> NodeNums;
1082 NodeNums.reserve(n: stores.size() + loads.size());
1083 for (const auto &[V, SUs] : stores) {
1084 (void)V;
1085 for (const auto *SU : SUs)
1086 NodeNums.push_back(x: SU->NodeNum);
1087 }
1088 for (const auto &[V, SUs] : loads) {
1089 (void)V;
1090 for (const auto *SU : SUs)
1091 NodeNums.push_back(x: SU->NodeNum);
1092 }
1093 llvm::sort(C&: NodeNums);
1094
1095 // The last N elements in NodeNums will be removed, and the SU with
1096 // the lowest NodeNum among them will become the new BarrierChain so
1097 // that the not yet seen SUs have a dependency on the removed SUs.
1098 assert(N <= NodeNums.size());
1099 SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
1100 if (BarrierChain) {
1101 // The aliasing and non-aliasing maps reduce independently of each
1102 // other, but share a common BarrierChain. Check if the
1103 // newBarrierChain is above the former one. If it is not, using
1104 // newBarrierChain may introduce a cycle, so keep the old one.
1105 if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
1106 BarrierChain->addPredBarrier(SU: newBarrierChain);
1107 BarrierChain = newBarrierChain;
1108 LLVM_DEBUG(dbgs() << "Inserting new barrier chain: SU("
1109 << BarrierChain->NodeNum << ").\n");
1110 }
1111 else
1112 LLVM_DEBUG(dbgs() << "Keeping old barrier chain: SU("
1113 << BarrierChain->NodeNum << ").\n");
1114 }
1115 else
1116 BarrierChain = newBarrierChain;
1117
1118 insertBarrierChain(map&: stores);
1119 insertBarrierChain(map&: loads);
1120
1121 LLVM_DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n"; stores.dump();
1122 dbgs() << "Loading SUnits:\n"; loads.dump());
1123}
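// Worked example (illustrative): with 1000 SUs spread over the two maps and
// N = 500, the 500 largest NodeNums are dropped from the maps; the smallest of
// those becomes (or competes to become) the BarrierChain, so SUs processed
// later still get ordered against the removed ones through that single node.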
1124
1125static void toggleKills(const MachineRegisterInfo &MRI, LiveRegUnits &LiveRegs,
1126 MachineInstr &MI, bool addToLiveRegs) {
1127 for (MachineOperand &MO : MI.operands()) {
1128 if (!MO.isReg() || !MO.readsReg())
1129 continue;
1130 Register Reg = MO.getReg();
1131 if (!Reg)
1132 continue;
1133
1134 // Things that are available after the instruction are killed by it.
1135 bool IsKill = LiveRegs.available(Reg);
1136
1137 // Exception: Do not kill reserved registers
1138 MO.setIsKill(IsKill && !MRI.isReserved(PhysReg: Reg));
1139 if (addToLiveRegs)
1140 LiveRegs.addReg(Reg);
1141 }
1142}
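// "Available" here means no unit of the register is live after MI in the
// bottom-up scan, i.e. this read is the last use of the register, so it
// receives the kill flag (unless the register is reserved).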
1143
1144void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
1145 LLVM_DEBUG(dbgs() << "Fixup kills for " << printMBBReference(MBB) << '\n');
1146
1147 LiveRegs.init(TRI: *TRI);
1148 LiveRegs.addLiveOuts(MBB);
1149
1150 // Examine block from end to start...
1151 for (MachineInstr &MI : llvm::reverse(C&: MBB)) {
1152 if (MI.isDebugOrPseudoInstr())
1153 continue;
1154
1155 // Update liveness. Registers that are defined but not used in this
1156 // instruction are now dead. Mark the register and all of its subregisters
1157 // as no longer live, since they are completely defined here.
1158 for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
1159 const MachineOperand &MO = *O;
1160 if (MO.isReg()) {
1161 if (!MO.isDef())
1162 continue;
1163 Register Reg = MO.getReg();
1164 if (!Reg)
1165 continue;
1166 LiveRegs.removeReg(Reg);
1167 } else if (MO.isRegMask()) {
1168 LiveRegs.removeRegsNotPreserved(RegMask: MO.getRegMask());
1169 }
1170 }
1171
1172 // If there is a bundle header, fix it up first.
1173 if (!MI.isBundled()) {
1174 toggleKills(MRI, LiveRegs, MI, addToLiveRegs: true);
1175 } else {
1176 MachineBasicBlock::instr_iterator Bundle = MI.getIterator();
1177 if (MI.isBundle())
1178 toggleKills(MRI, LiveRegs, MI, addToLiveRegs: false);
1179
1180 // Some targets make the (questionable) assumption that the instructions
1181 // inside the bundle are ordered and consequently only the last use of
1182 // a register inside the bundle can kill it.
1183 MachineBasicBlock::instr_iterator I = std::next(x: Bundle);
1184 while (I->isBundledWithSucc())
1185 ++I;
1186 do {
1187 if (!I->isDebugOrPseudoInstr())
1188 toggleKills(MRI, LiveRegs, MI&: *I, addToLiveRegs: true);
1189 --I;
1190 } while (I != Bundle);
1191 }
1192 }
1193}
1194
1195void ScheduleDAGInstrs::dumpNode(const SUnit &SU) const {
1196#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1197 dumpNodeName(SU);
1198 if (SchedPrintCycles)
1199 dbgs() << " [TopReadyCycle = " << SU.TopReadyCycle
1200 << ", BottomReadyCycle = " << SU.BotReadyCycle << "]";
1201 dbgs() << ": ";
1202 SU.getInstr()->dump();
1203#endif
1204}
1205
1206void ScheduleDAGInstrs::dump() const {
1207#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1208 if (EntrySU.getInstr() != nullptr)
1209 dumpNodeAll(EntrySU);
1210 for (const SUnit &SU : SUnits)
1211 dumpNodeAll(SU);
1212 if (ExitSU.getInstr() != nullptr)
1213 dumpNodeAll(ExitSU);
1214#endif
1215}
1216
1217std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
1218 std::string s;
1219 raw_string_ostream oss(s);
1220 if (SU == &EntrySU)
1221 oss << "<entry>";
1222 else if (SU == &ExitSU)
1223 oss << "<exit>";
1224 else
1225 SU->getInstr()->print(OS&: oss, /*IsStandalone=*/true);
1226 return s;
1227}
1228
1229/// Return the basic block label. It is not necessarily unique because a block
1230/// may contain multiple scheduling regions, but that is fine for visualization.
1231std::string ScheduleDAGInstrs::getDAGName() const {
1232 return "dag." + BB->getFullName();
1233}
1234
1235bool ScheduleDAGInstrs::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
1236 return SuccSU == &ExitSU || !Topo.IsReachable(SU: PredSU, TargetSU: SuccSU);
1237}
1238
1239bool ScheduleDAGInstrs::addEdge(SUnit *SuccSU, const SDep &PredDep) {
1240 if (SuccSU != &ExitSU) {
1241 // Do not use WillCreateCycle, it assumes SD scheduling.
1242 // If Pred is reachable from Succ, then the edge creates a cycle.
1243 if (Topo.IsReachable(SU: PredDep.getSUnit(), TargetSU: SuccSU))
1244 return false;
1245 Topo.AddPredQueued(Y: SuccSU, X: PredDep.getSUnit());
1246 }
1247 SuccSU->addPred(D: PredDep, /*Required=*/!PredDep.isArtificial());
1248 // Return true regardless of whether a new edge needed to be inserted.
1249 return true;
1250}
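// Usage sketch (illustrative): a DAG mutation that wants SU(A) to be scheduled
// before SU(B) first checks canAddEdge(B, A) and then calls
// addEdge(B, SDep(A, SDep::Artificial)). The topological order is updated
// lazily (AddPredQueued above, MarkDirty in buildSchedGraph).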
1251
1252//===----------------------------------------------------------------------===//
1253// SchedDFSResult Implementation
1254//===----------------------------------------------------------------------===//
1255
1256namespace llvm {
1257
1258/// Internal state used to compute SchedDFSResult.
1259class SchedDFSImpl {
1260 SchedDFSResult &R;
1261
1262 /// Join DAG nodes into equivalence classes by their subtree.
1263 IntEqClasses SubtreeClasses;
1264 /// List PredSU, SuccSU pairs that represent data edges between subtrees.
1265 std::vector<std::pair<const SUnit *, const SUnit*>> ConnectionPairs;
1266
1267 struct RootData {
1268 unsigned NodeID;
1269 unsigned ParentNodeID; ///< Parent node (member of the parent subtree).
1270 unsigned SubInstrCount = 0; ///< Instr count in this tree only, not
1271 /// children.
1272
1273 RootData(unsigned id): NodeID(id),
1274 ParentNodeID(SchedDFSResult::InvalidSubtreeID) {}
1275
1276 unsigned getSparseSetIndex() const { return NodeID; }
1277 };
1278
1279 SparseSet<RootData> RootSet;
1280
1281public:
1282 SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) {
1283 RootSet.setUniverse(R.DFSNodeData.size());
1284 }
1285
1286 /// Returns true if this node has been visited by the DFS traversal.
1287 ///
1288 /// During visitPostorderNode the Node's SubtreeID is assigned to the Node
1289 /// ID. Later, SubtreeID is updated but remains valid.
1290 bool isVisited(const SUnit *SU) const {
1291 return R.DFSNodeData[SU->NodeNum].SubtreeID
1292 != SchedDFSResult::InvalidSubtreeID;
1293 }
1294
1295 /// Initializes this node's instruction count. We don't need to flag the node
1296 /// visited until visitPostorder because the DAG cannot have cycles.
1297 void visitPreorder(const SUnit *SU) {
1298 R.DFSNodeData[SU->NodeNum].InstrCount =
1299 SU->getInstr()->isTransient() ? 0 : 1;
1300 }
1301
1302 /// Called once for each node after all predecessors are visited. Revisit this
1303 /// node's predecessors and potentially join them now that we know the ILP of
1304 /// the other predecessors.
1305 void visitPostorderNode(const SUnit *SU) {
1306 // Mark this node as the root of a subtree. It may be joined with its
1307 // successors later.
1308 R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum;
1309 RootData RData(SU->NodeNum);
1310 RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1;
1311
1312 // If any predecessors are still in their own subtree, they either cannot be
1313 // joined or are large enough to remain separate. If this parent node's
1314 // total instruction count is not greater than a child subtree's by at least
1315 // the subtree limit, then try to join it now since splitting subtrees is
1316 // only useful if multiple high-pressure paths are possible.
1317 unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount;
1318 for (const SDep &PredDep : SU->Preds) {
1319 if (PredDep.getKind() != SDep::Data)
1320 continue;
1321 unsigned PredNum = PredDep.getSUnit()->NodeNum;
1322 if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit)
1323 joinPredSubtree(PredDep, Succ: SU, /*CheckLimit=*/false);
1324
1325 // Either link or merge the TreeData entry from the child to the parent.
1326 if (R.DFSNodeData[PredNum].SubtreeID == PredNum) {
1327 // If the predecessor's parent is invalid, this is a tree edge and the
1328 // current node is the parent.
1329 if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID)
1330 RootSet[PredNum].ParentNodeID = SU->NodeNum;
1331 }
1332 else if (RootSet.count(Key: PredNum)) {
1333 // The predecessor is not a root, but is still in the root set. This
1334 // must be the new parent that it was just joined to. Note that
1335 // RootSet[PredNum].ParentNodeID may either be invalid or may still be
1336 // set to the original parent.
1337 RData.SubInstrCount += RootSet[PredNum].SubInstrCount;
1338 RootSet.erase(Key: PredNum);
1339 }
1340 }
1341 RootSet[SU->NodeNum] = RData;
1342 }
1343
1344 /// Called once for each tree edge after calling visitPostorderNode on
1345 /// the predecessor. Increment the parent node's instruction count and
1346 /// preemptively join this subtree to its parent's if it is small enough.
1347 void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
1348 R.DFSNodeData[Succ->NodeNum].InstrCount
1349 += R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount;
1350 joinPredSubtree(PredDep, Succ);
1351 }
1352
1353 /// Adds a connection for cross edges.
1354 void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
1355 ConnectionPairs.emplace_back(args: PredDep.getSUnit(), args&: Succ);
1356 }
1357
1358 /// Sets each node's subtree ID to the representative ID and records
1359 /// connections between trees.
1360 void finalize() {
1361 SubtreeClasses.compress();
1362 R.DFSTreeData.resize(N: SubtreeClasses.getNumClasses());
1363 assert(SubtreeClasses.getNumClasses() == RootSet.size()
1364 && "number of roots should match trees");
1365 for (const RootData &Root : RootSet) {
1366 unsigned TreeID = SubtreeClasses[Root.NodeID];
1367 if (Root.ParentNodeID != SchedDFSResult::InvalidSubtreeID)
1368 R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[Root.ParentNodeID];
1369 R.DFSTreeData[TreeID].SubInstrCount = Root.SubInstrCount;
1370 // Note that SubInstrCount may be greater than InstrCount if we joined
1371 // subtrees across a cross edge. InstrCount will be attributed to the
1372 // original parent, while SubInstrCount will be attributed to the joined
1373 // parent.
1374 }
1375 R.SubtreeConnections.resize(new_size: SubtreeClasses.getNumClasses());
1376 R.SubtreeConnectLevels.resize(new_size: SubtreeClasses.getNumClasses());
1377 LLVM_DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
1378 for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) {
1379 R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx];
1380 LLVM_DEBUG(dbgs() << " SU(" << Idx << ") in tree "
1381 << R.DFSNodeData[Idx].SubtreeID << '\n');
1382 }
1383 for (const auto &[Pred, Succ] : ConnectionPairs) {
1384 unsigned PredTree = SubtreeClasses[Pred->NodeNum];
1385 unsigned SuccTree = SubtreeClasses[Succ->NodeNum];
1386 if (PredTree == SuccTree)
1387 continue;
1388 unsigned Depth = Pred->getDepth();
1389 addConnection(FromTree: PredTree, ToTree: SuccTree, Depth);
1390 addConnection(FromTree: SuccTree, ToTree: PredTree, Depth);
1391 }
1392 }
1393
1394protected:
1395 /// Joins the predecessor subtree with the successor that is its DFS parent.
1396 /// Applies some heuristics before joining.
1397 bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
1398 bool CheckLimit = true) {
1399 assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");
1400
1401 // Check if the predecessor is already joined.
1402 const SUnit *PredSU = PredDep.getSUnit();
1403 unsigned PredNum = PredSU->NodeNum;
1404 if (R.DFSNodeData[PredNum].SubtreeID != PredNum)
1405 return false;
1406
1407 // Four is the magic number of successors before a node is considered a
1408 // pinch point.
1409 unsigned NumDataSucs = 0;
1410 for (const SDep &SuccDep : PredSU->Succs) {
1411 if (SuccDep.getKind() == SDep::Data) {
1412 if (++NumDataSucs >= 4)
1413 return false;
1414 }
1415 }
1416 if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit)
1417 return false;
1418 R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum;
1419 SubtreeClasses.join(a: Succ->NodeNum, b: PredNum);
1420 return true;
1421 }
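// Illustrative consequence of the heuristics above: a value with four or more
// data successors is treated as a pinch point and is kept as its own subtree
// root, so a widely shared computation does not pull otherwise independent
// users into a single subtree.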
1422
1423 /// Called by finalize() to record a connection between trees.
1424 void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) {
1425 if (!Depth)
1426 return;
1427
1428 do {
1429 SmallVectorImpl<SchedDFSResult::Connection> &Connections =
1430 R.SubtreeConnections[FromTree];
1431 for (SchedDFSResult::Connection &C : Connections) {
1432 if (C.TreeID == ToTree) {
1433 C.Level = std::max(a: C.Level, b: Depth);
1434 return;
1435 }
1436 }
1437 Connections.push_back(Elt: SchedDFSResult::Connection(ToTree, Depth));
1438 FromTree = R.DFSTreeData[FromTree].ParentTreeID;
1439 } while (FromTree != SchedDFSResult::InvalidSubtreeID);
1440 }
1441};
1442
1443} // end namespace llvm
1444
1445namespace {
1446
1447/// Manage the stack used by a reverse depth-first search over the DAG.
1448class SchedDAGReverseDFS {
1449 std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack;
1450
1451public:
1452 bool isComplete() const { return DFSStack.empty(); }
1453
1454 void follow(const SUnit *SU) {
1455 DFSStack.emplace_back(args&: SU, args: SU->Preds.begin());
1456 }
1457 void advance() { ++DFSStack.back().second; }
1458
1459 const SDep *backtrack() {
1460 DFSStack.pop_back();
1461 return DFSStack.empty() ? nullptr : std::prev(x: DFSStack.back().second);
1462 }
1463
1464 const SUnit *getCurr() const { return DFSStack.back().first; }
1465
1466 SUnit::const_pred_iterator getPred() const { return DFSStack.back().second; }
1467
1468 SUnit::const_pred_iterator getPredEnd() const {
1469 return getCurr()->Preds.end();
1470 }
1471};
1472
1473} // end anonymous namespace
1474
1475static bool hasDataSucc(const SUnit *SU) {
1476 for (const SDep &SuccDep : SU->Succs) {
1477 if (SuccDep.getKind() == SDep::Data &&
1478 !SuccDep.getSUnit()->isBoundaryNode())
1479 return true;
1480 }
1481 return false;
1482}
1483
1484/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first
1485/// search from this root.
1486void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
1487 if (!IsBottomUp)
1488 llvm_unreachable("Top-down ILP metric is unimplemented");
1489
1490 SchedDFSImpl Impl(*this);
1491 for (const SUnit &SU : SUnits) {
1492 if (Impl.isVisited(SU: &SU) || hasDataSucc(SU: &SU))
1493 continue;
1494
1495 SchedDAGReverseDFS DFS;
1496 Impl.visitPreorder(SU: &SU);
1497 DFS.follow(SU: &SU);
1498 while (true) {
1499 // Traverse the leftmost path as far as possible.
1500 while (DFS.getPred() != DFS.getPredEnd()) {
1501 const SDep &PredDep = *DFS.getPred();
1502 DFS.advance();
1503 // Ignore non-data edges.
1504 if (PredDep.getKind() != SDep::Data
1505 || PredDep.getSUnit()->isBoundaryNode()) {
1506 continue;
1507 }
1508 // An already visited edge is a cross edge, assuming an acyclic DAG.
1509 if (Impl.isVisited(SU: PredDep.getSUnit())) {
1510 Impl.visitCrossEdge(PredDep, Succ: DFS.getCurr());
1511 continue;
1512 }
1513 Impl.visitPreorder(SU: PredDep.getSUnit());
1514 DFS.follow(SU: PredDep.getSUnit());
1515 }
1516 // Visit the top of the stack in postorder and backtrack.
1517 const SUnit *Child = DFS.getCurr();
1518 const SDep *PredDep = DFS.backtrack();
1519 Impl.visitPostorderNode(SU: Child);
1520 if (PredDep)
1521 Impl.visitPostorderEdge(PredDep: *PredDep, Succ: DFS.getCurr());
1522 if (DFS.isComplete())
1523 break;
1524 }
1525 }
1526 Impl.finalize();
1527}
1528
1529/// The root of the given SubtreeID was just scheduled. For all subtrees
1530/// connected to this tree, record the depth of the connection so that the
1531/// nearest connected subtrees can be prioritized.
1532void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
1533 for (const Connection &C : SubtreeConnections[SubtreeID]) {
1534 SubtreeConnectLevels[C.TreeID] =
1535 std::max(a: SubtreeConnectLevels[C.TreeID], b: C.Level);
1536 LLVM_DEBUG(dbgs() << " Tree: " << C.TreeID << " @"
1537 << SubtreeConnectLevels[C.TreeID] << '\n');
1538 }
1539}
1540
1541#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1542LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const {
1543 OS << InstrCount << " / " << Length << " = ";
1544 if (!Length)
1545 OS << "BADILP";
1546 else
1547 OS << format("%g", ((double)InstrCount / Length));
1548}
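// For example, an ILPValue with InstrCount = 12 and Length = 4 prints as
// "12 / 4 = 3": on average, three instructions per step along the critical
// path could issue in parallel if resources allowed.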
1549
1550LLVM_DUMP_METHOD void ILPValue::dump() const {
1551 dbgs() << *this << '\n';
1552}
1553
1554namespace llvm {
1555
1556LLVM_ATTRIBUTE_UNUSED
1557raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
1558 Val.print(OS);
1559 return OS;
1560}
1561
1562} // end namespace llvm
1563
1564#endif
1565