SIInsertWaitcnts.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp]

1	//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Insert wait instructions for memory reads and writes.
11	///
12	/// Memory reads and writes are issued asynchronously, so we need to insert
13	/// S_WAITCNT instructions when we want to access any of their results or
14	/// overwrite any register that's used asynchronously.
15	///
16	/// TODO: This pass currently keeps one timeline per hardware counter. A more
17	/// finely-grained approach that keeps one timeline per event type could
18	/// sometimes get away with generating weaker s_waitcnt instructions. For
19	/// example, when both SMEM and LDS are in flight and we need to wait for
20	/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21	/// but the pass will currently generate a conservative lgkmcnt(0) because
22	/// multiple event types are in flight.
23	//
24	//===----------------------------------------------------------------------===//
25
26	#include "AMDGPU.h"
27	#include "GCNSubtarget.h"
28	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29	#include "SIMachineFunctionInfo.h"
30	#include "Utils/AMDGPUBaseInfo.h"
31	#include "llvm/ADT/MapVector.h"
32	#include "llvm/ADT/PostOrderIterator.h"
33	#include "llvm/ADT/Sequence.h"
34	#include "llvm/Analysis/AliasAnalysis.h"
35	#include "llvm/CodeGen/MachineFrameInfo.h"
36	#include "llvm/CodeGen/MachineLoopInfo.h"
37	#include "llvm/CodeGen/MachinePassManager.h"
38	#include "llvm/CodeGen/MachinePostDominators.h"
39	#include "llvm/IR/Dominators.h"
40	#include "llvm/InitializePasses.h"
41	#include "llvm/Support/DebugCounter.h"
42	#include "llvm/TargetParser/TargetParser.h"
43
44	using namespace llvm;
45	using namespace llvm::AMDGPU;
46
47	#define DEBUG_TYPE "si-insert-waitcnts"
48
49	DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
50	"Force emit s_waitcnt expcnt(0) instrs");
51	DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
52	"Force emit s_waitcnt lgkmcnt(0) instrs");
53	DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
54	"Force emit s_waitcnt vmcnt(0) instrs");
55
56	static cl::opt<bool>
57	ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
58	cl::desc ("Force all waitcnt instrs to be emitted as "
59	"s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
60	cl::init(Val: false), cl::Hidden);
61
62	static cl::opt<bool> ForceEmitZeroLoadFlag(
63	"amdgpu-waitcnt-load-forcezero",
64	cl::desc ("Force all waitcnt load counters to wait until 0"),
65	cl::init(Val: false), cl::Hidden);
66
67	static cl::opt<bool> ExpertSchedulingModeFlag(
68	"amdgpu-expert-scheduling-mode",
69	cl::desc ("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
70	cl::init(Val: false), cl::Hidden);
71
72	namespace {
73	// Get the maximum wait count value for a given counter type.
74	static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
75	InstCounterType T) {
76	switch (T) {
77	case LOAD_CNT:
78	return Limits.LoadcntMax;
79	case DS_CNT:
80	return Limits.DscntMax;
81	case EXP_CNT:
82	return Limits.ExpcntMax;
83	case STORE_CNT:
84	return Limits.StorecntMax;
85	case SAMPLE_CNT:
86	return Limits.SamplecntMax;
87	case BVH_CNT:
88	return Limits.BvhcntMax;
89	case KM_CNT:
90	return Limits.KmcntMax;
91	case X_CNT:
92	return Limits.XcntMax;
93	case VA_VDST:
94	return Limits.VaVdstMax;
95	case VM_VSRC:
96	return Limits.VmVsrcMax;
97	default:
98	return `0`;
99	}
100	}
101
102	/// Integer IDs used to track vector memory locations we may have to wait on.
103	/// Encoded as u16 chunks:
104	///
105	/// [0, REGUNITS_END ): MCRegUnit
106	/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
107	///
108	/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
109	/// It gives (1 << 16) entries per category which is more than enough
110	/// for all register units. MCPhysReg is u16 so we don't even support >u16
111	/// physical register numbers at this time, let alone >u16 register units.
112	/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
113	/// is enough for all register units.
using VMEMID = uint32_t;

enum : VMEMID {
  // Number of IDs available to each tracking category.
  TRACKINGID_RANGE_LEN = (1 << 16),

  // Register units. Important: this range must start at 0, because a
  // MCRegUnit and a VMEMID are converted into each other without any offset.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // LDS DMA. LDSDMA_BEGIN itself is the "common" entry, which is updated for
  // all LDS DMA operations encountered; specific LDS DMA IDs start at
  // LDSDMA_BEGIN + 1.
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};
131
132	/// Convert a MCRegUnit to a VMEMID.
133	static constexpr VMEMID toVMEMID(MCRegUnit RU) {
134	return static_cast<unsigned>(RU);
135	}
136
// X-macro with one DECL(...) entry per wait event. It is expanded below to
// build both the WaitEventType enumeration and the table of event names, so
// the two always list the same events in the same order.
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
  DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */    \
  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */         \
  DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
  DECL(GLOBAL_INV_ACCESS)        /* GLOBAL_INV (gfx12+ only) */                \
  DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
  DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
  DECL(VMEM_GROUP)               /* vmem group */                              \
  DECL(LDS_ACCESS)               /* lds read & write */                        \
  DECL(GDS_ACCESS)               /* gds read & write */                        \
  DECL(SQ_MESSAGE)               /* send message */                            \
  DECL(SCC_WRITE)                /* write to SCC from barrier */               \
  DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
  DECL(SMEM_GROUP)               /* scalar-memory group */                     \
  DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
  DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
  DECL(EXP_POS_ACCESS)           /* write to export position */                \
  DECL(EXP_PARAM_ACCESS)         /* write to export parameter */               \
  DECL(VMW_GPR_LOCK)             /* vmem write holding on its data src */      \
  DECL(EXP_LDS_ACCESS)           /* read by ldsdir counting as export */       \
  DECL(VGPR_CSMACC_WRITE)        /* write VGPR dest in Core/Side-MACC VALU */  \
  DECL(VGPR_DPMACC_WRITE)        /* write VGPR dest in DPMACC VALU */          \
  DECL(VGPR_TRANS_WRITE)         /* write VGPR dest in TRANS VALU */           \
  DECL(VGPR_XDL_WRITE)           /* write VGPR dest in XDL VALU */             \
  DECL(VGPR_LDS_READ)            /* read VGPR source in LDS */                 \
  DECL(VGPR_FLAT_READ)           /* read VGPR source in FLAT */                \
  DECL(VGPR_VMEM_READ)           /* read VGPR source in other VMEM */

// clang-format off
// Each event becomes one enumerator; NUM_WAIT_EVENTS follows the last event
// and therefore equals the number of events.
#define AMDGPU_EVENT_ENUM(Name) Name,
enum WaitEventType {
  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
  NUM_WAIT_EVENTS
};
#undef AMDGPU_EVENT_ENUM
172	} // namespace
173
174	namespace llvm {
175	template <> struct enum_iteration_traits<WaitEventType> {
176	static constexpr bool is_iterable = true;
177	};
178	} // namespace llvm
179
180	namespace {
181
182	/// Return an iterator over all events between VMEM_ACCESS (the first event)
183	/// and \c MaxEvent (exclusive, default value yields an enumeration over
184	/// all counters).
185	auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
186	return enum_seq(Begin: VMEM_ACCESS, End: MaxEvent);
187	}
188
189	#define AMDGPU_EVENT_NAME(Name) #Name,
190	static constexpr StringLiteral WaitEventTypeName[] = {
191	AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
192	};
193	#undef AMDGPU_EVENT_NAME
194	static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
195	return WaitEventTypeName[Event];
196	}
197	// clang-format on
198
// Kinds of result-returning VMEM operations. s_waitcnt orders all of them
// with a single vmcnt counter, but without an s_waitcnt only instructions of
// the same VmemType are guaranteed to write their results in order. Hence no
// s_waitcnt is needed between two instructions of the same type that write
// the same vgpr.
enum VmemType {
  VMEM_NOSAMPLER, // BUF instructions and MIMG instructions without a sampler.
  VMEM_SAMPLER,   // MIMG instructions with a sampler.
  VMEM_BVH,       // BVH instructions.
  NUM_VMEM_TYPES
};
213
214	// Maps values of InstCounterType to the instruction that waits on that
215	// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
216	// returns true, and does not cover VA_VDST or VM_VSRC.
217	static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
218	AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
219	AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
220	AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
221
222	static bool updateVMCntOnly(const MachineInstr &Inst) {
223	return (SIInstrInfo::isVMEM(MI: Inst) && !SIInstrInfo::isFLAT(MI: Inst)) \|\|
224	SIInstrInfo::isFLATGlobal(MI: Inst) \|\| SIInstrInfo::isFLATScratch(MI: Inst);
225	}
226
227	#ifndef NDEBUG
228	static bool isNormalMode(InstCounterType MaxCounter) {
229	return MaxCounter == NUM_NORMAL_INST_CNTS;
230	}
231	#endif // NDEBUG
232
233	VmemType getVmemType(const MachineInstr &Inst) {
234	assert(updateVMCntOnly(Inst));
235	if (!SIInstrInfo::isImage(MI: Inst))
236	return VMEM_NOSAMPLER;
237	const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Inst.getOpcode());
238	const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
239	AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
240
241	if (BaseInfo->BVH)
242	return VMEM_BVH;
243
244	// We have to make an additional check for isVSAMPLE here since some
245	// instructions don't have a sampler, but are still classified as sampler
246	// instructions for the purposes of e.g. waitcnt.
247	if (BaseInfo->Sampler \|\| BaseInfo->MSAA \|\| SIInstrInfo::isVSAMPLE(MI: Inst))
248	return VMEM_SAMPLER;
249
250	return VMEM_NOSAMPLER;
251	}
252
253	void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
254	Wait.set(T, Val: std::min(a: Wait.get(T), b: Count));
255	}
256
257	void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { Wait.set(T, Val: ~`0u`); }
258
259	/// A small set of events.
260	class WaitEventSet {
261	unsigned Mask = `0`;
262
263	public:
264	WaitEventSet() = default;
265	explicit constexpr WaitEventSet(WaitEventType Event) {
266	static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * `8`,
267	"Not enough bits in Mask for all the events");
268	Mask \|= `1` << Event;
269	}
270	constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
271	for (auto &E : Events) {
272	Mask \|= `1` << E;
273	}
274	}
275	void insert(const WaitEventType &Event) { Mask \|= `1` << Event; }
276	void remove(const WaitEventType &Event) { Mask &= ~(`1` << Event); }
277	void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
278	bool contains(const WaitEventType &Event) const {
279	return Mask & (`1` << Event);
280	}
281	/// \Returns true if this set contains all elements of \p Other.
282	bool contains(const WaitEventSet &Other) const {
283	return (~Mask & Other.Mask) == `0`;
284	}
285	/// \Returns the intersection of this and \p Other.
286	WaitEventSet operator&(const WaitEventSet &Other) const {
287	auto Copy = *this;
288	Copy.Mask &= Other.Mask;
289	return Copy;
290	}
291	/// \Returns the union of this and \p Other.
292	WaitEventSet operator\|(const WaitEventSet &Other) const {
293	auto Copy = *this;
294	Copy.Mask \|= Other.Mask;
295	return Copy;
296	}
297	/// This set becomes the union of this and \p Other.
298	WaitEventSet &operator\|=(const WaitEventSet &Other) {
299	Mask \|= Other.Mask;
300	return *this;
301	}
302	/// This set becomes the intersection of this and \p Other.
303	WaitEventSet &operator&=(const WaitEventSet &Other) {
304	Mask &= Other.Mask;
305	return *this;
306	}
307	bool operator==(const WaitEventSet &Other) const {
308	return Mask == Other.Mask;
309	}
310	bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
311	bool empty() const { return Mask == `0`; }
312	/// \Returns true if the set contains more than one element.
313	bool twoOrMore() const { return Mask & (Mask - `1`); }
314	operator bool() const { return !empty(); }
315	void print(raw_ostream &OS) const {
316	ListSeparator LS(", ");
317	for (WaitEventType Event : wait_events()) {
318	if (contains(Event))
319	OS << LS << getWaitEventTypeName(Event);
320	}
321	}
322	LLVM_DUMP_METHOD void dump() const;
323	};
324
325	void WaitEventSet::dump() const {
326	print(OS&: dbgs());
327	dbgs() << "\n";
328	}
329
330	class WaitcntBrackets;
331
332	// This abstracts the logic for generating and updating S_WAIT instructions*
333	// away from the analysis that determines where they are needed. This was
334	// done because the set of counters and instructions for waiting on them
335	// underwent a major shift with gfx12, sufficiently so that having this
336	// abstraction allows the main analysis logic to be simpler than it would
337	// otherwise have had to become.
338	class WaitcntGenerator {
339	protected:
340	const GCNSubtarget &ST;
341	const SIInstrInfo &TII;
342	AMDGPU::IsaVersion IV;
343	InstCounterType MaxCounter;
344	bool OptNone;
345	bool ExpandWaitcntProfiling = false;
346	const AMDGPU::HardwareLimits &Limits;
347
348	public:
349	WaitcntGenerator() = delete;
350	WaitcntGenerator(const WaitcntGenerator &) = delete;
351	WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
352	const AMDGPU::HardwareLimits &Limits)
353	: ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
354	IV(AMDGPU::getIsaVersion(GPU: ST.getCPU())), MaxCounter(MaxCounter),
355	OptNone(MF.getFunction().hasOptNone() \|\|
356	MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
357	ExpandWaitcntProfiling(
358	MF.getFunction().hasFnAttribute(Kind: "amdgpu-expand-waitcnt-profiling")),
359	Limits(Limits) {}
360
361	// Return true if the current function should be compiled with no
362	// optimization.
363	bool isOptNone() const { return OptNone; }
364
365	const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
366
367	// Edits an existing sequence of wait count instructions according
368	// to an incoming Waitcnt value, which is itself updated to reflect
369	// any new wait count instructions which may need to be generated by
370	// WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
371	// were made.
372	//
373	// This editing will usually be merely updated operands, but it may also
374	// delete instructions if the incoming Wait value indicates they are not
375	// needed. It may also remove existing instructions for which a wait
376	// is needed if it can be determined that it is better to generate new
377	// instructions later, as can happen on gfx12.
378	virtual bool
379	applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
380	MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
381	MachineBasicBlock::instr_iterator It) const = `0`;
382
383	// Transform a soft waitcnt into a normal one.
384	bool promoteSoftWaitCnt(MachineInstr Waitcnt) const*;
385
386	// Generates new wait count instructions according to the value of
387	// Wait, returning true if any new instructions were created.
388	// ScoreBrackets is used for profiling expansion.
389	virtual bool createNewWaitcnt(MachineBasicBlock &Block,
390	MachineBasicBlock::instr_iterator It,
391	AMDGPU::Waitcnt Wait,
392	const WaitcntBrackets &ScoreBrackets) = `0`;
393
394	// Returns the WaitEventSet that corresponds to counter \p T.
395	virtual const WaitEventSet &getWaitEvents(InstCounterType T) const = `0`;
396
397	/// \returns the counter that corresponds to event \p E.
398	InstCounterType getCounterFromEvent(WaitEventType E) const {
399	for (auto T : inst_counter_types()) {
400	if (getWaitEvents(T).contains(Event: E))
401	return T;
402	}
403	llvm_unreachable("event type has no associated counter");
404	}
405
406	// Returns a new waitcnt with all counters except VScnt set to 0. If
407	// IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
408	virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = `0`;
409
410	virtual ~WaitcntGenerator() = default;
411	};
412
413	class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
414	static constexpr const WaitEventSet
415	WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
416	WaitEventSet (
417	{VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
418	WaitEventSet ({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
419	WaitEventSet ({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
420	EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
421	WaitEventSet ({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
422	WaitEventSet (),
423	WaitEventSet (),
424	WaitEventSet (),
425	WaitEventSet (),
426	WaitEventSet (),
427	WaitEventSet ()};
428
429	public:
430	using WaitcntGenerator::WaitcntGenerator;
431	bool
432	applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
433	MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
434	MachineBasicBlock::instr_iterator It) const override;
435
436	bool createNewWaitcnt(MachineBasicBlock &Block,
437	MachineBasicBlock::instr_iterator It,
438	AMDGPU::Waitcnt Wait,
439	const WaitcntBrackets &ScoreBrackets) override;
440
441	const WaitEventSet &getWaitEvents(InstCounterType T) const override {
442	return WaitEventMaskForInstPreGFX12[T];
443	}
444
445	AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
446	};
447
448	class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
449	protected:
450	bool IsExpertMode;
451	static constexpr const WaitEventSet
452	WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
453	WaitEventSet ({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
454	WaitEventSet ({LDS_ACCESS, GDS_ACCESS}),
455	WaitEventSet ({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
456	EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
457	WaitEventSet ({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
458	WaitEventSet ({VMEM_SAMPLER_READ_ACCESS}),
459	WaitEventSet ({VMEM_BVH_READ_ACCESS}),
460	WaitEventSet ({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
461	WaitEventSet ({VMEM_GROUP, SMEM_GROUP}),
462	WaitEventSet ({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
463	VGPR_XDL_WRITE}),
464	WaitEventSet ({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
465
466	public:
467	WaitcntGeneratorGFX12Plus() = delete;
468	WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
469	InstCounterType MaxCounter,
470	const AMDGPU::HardwareLimits &Limits,
471	bool IsExpertMode)
472	: WaitcntGenerator (MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
473
474	bool
475	applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
476	MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
477	MachineBasicBlock::instr_iterator It) const override;
478
479	bool createNewWaitcnt(MachineBasicBlock &Block,
480	MachineBasicBlock::instr_iterator It,
481	AMDGPU::Waitcnt Wait,
482	const WaitcntBrackets &ScoreBrackets) override;
483
484	const WaitEventSet &getWaitEvents(InstCounterType T) const override {
485	return WaitEventMaskForInstGFX12Plus[T];
486	}
487
488	AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
489	};
490
// Which counters should be flushed in a loop preheader. Both flags start out
// false, i.e. nothing is flushed.
struct PreheaderFlushFlags {
  bool FlushVmCnt{false};
  bool FlushDsCnt{false};
};
496
497	class SIInsertWaitcnts {
498	DenseMap<const Value , MachineBasicBlock > SLoadAddresses;
499	DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
500	MachineLoopInfo &MLI;
501	MachinePostDominatorTree &PDT;
502	AliasAnalysis AA = nullptr*;
503	MachineFunction &MF;
504
505	struct BlockInfo {
506	std::unique_ptr<WaitcntBrackets> Incoming;
507	bool Dirty = true;
508	};
509
510	MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
511
512	bool ForceEmitWaitcnt[NUM_INST_CNTS] = {};
513
514	std::unique_ptr<WaitcntGenerator> WCG;
515
516	// Remember call and return instructions in the function.
517	DenseSet<MachineInstr *> CallInsts;
518	DenseSet<MachineInstr *> ReturnInsts;
519
520	// Remember all S_ENDPGM instructions. The boolean flag is true if there might
521	// be outstanding stores but definitely no outstanding scratch stores, to help
522	// with insertion of DEALLOC_VGPRS messages.
523	DenseMap<MachineInstr , bool*> EndPgmInsts;
524
525	AMDGPU::HardwareLimits Limits;
526
527	public:
528	const GCNSubtarget &ST;
529	const SIInstrInfo &TII;
530	const SIRegisterInfo &TRI;
531	const MachineRegisterInfo &MRI;
532	InstCounterType SmemAccessCounter;
533	InstCounterType MaxCounter;
534	bool IsExpertMode = false;
535
536	SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
537	AliasAnalysis *AA, MachineFunction &MF)
538	: MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
539	TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
540	MRI(MF.getRegInfo()) {
541	(void)ForceExpCounter;
542	(void)ForceLgkmCounter;
543	(void)ForceVMCounter;
544	}
545
546	const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
547
548	PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
549	const WaitcntBrackets &Brackets);
550	PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
551	const WaitcntBrackets &ScoreBrackets);
552	bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
553	bool isDSRead(const MachineInstr &MI) const;
554	bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
555	bool run();
556
557	void setForceEmitWaitcnt() {
558	// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
559	// For debug builds, get the debug counter info and adjust if need be
560	#ifndef NDEBUG
561	if (DebugCounter::isCounterSet(ForceExpCounter) &&
562	DebugCounter::shouldExecute(ForceExpCounter)) {
563	ForceEmitWaitcnt[EXP_CNT] = true;
564	} else {
565	ForceEmitWaitcnt[EXP_CNT] = false;
566	}
567
568	if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
569	DebugCounter::shouldExecute(ForceLgkmCounter)) {
570	ForceEmitWaitcnt[DS_CNT] = true;
571	ForceEmitWaitcnt[KM_CNT] = true;
572	} else {
573	ForceEmitWaitcnt[DS_CNT] = false;
574	ForceEmitWaitcnt[KM_CNT] = false;
575	}
576
577	if (DebugCounter::isCounterSet(ForceVMCounter) &&
578	DebugCounter::shouldExecute(ForceVMCounter)) {
579	ForceEmitWaitcnt[LOAD_CNT] = true;
580	ForceEmitWaitcnt[SAMPLE_CNT] = true;
581	ForceEmitWaitcnt[BVH_CNT] = true;
582	} else {
583	ForceEmitWaitcnt[LOAD_CNT] = false;
584	ForceEmitWaitcnt[SAMPLE_CNT] = false;
585	ForceEmitWaitcnt[BVH_CNT] = false;
586	}
587
588	ForceEmitWaitcnt[VA_VDST] = false;
589	ForceEmitWaitcnt[VM_VSRC] = false;
590	#endif // NDEBUG
591	}
592
593	// Return the appropriate VMEM__ACCESS type for Inst, which must be a VMEM*
594	// instruction.
595	WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
596	switch (Inst.getOpcode()) {
597	// FIXME: GLOBAL_INV needs to be tracked with xcnt too.
598	case AMDGPU::GLOBAL_INV:
599	return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
600	// VGPRs
601	case AMDGPU::GLOBAL_WB:
602	case AMDGPU::GLOBAL_WBINV:
603	return VMEM_WRITE_ACCESS; // tracked using storecnt
604	default:
605	break;
606	}
607
608	// Maps VMEM access types to their corresponding WaitEventType.
609	static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
610	VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
611
612	assert(SIInstrInfo::isVMEM(Inst));
613	// LDS DMA loads are also stores, but on the LDS side. On the VMEM side
614	// these should use VM_CNT.
615	if (!ST.hasVscnt() \|\| SIInstrInfo::mayWriteLDSThroughDMA(MI: Inst))
616	return VMEM_ACCESS;
617	if (Inst.mayStore() &&
618	(!Inst.mayLoad() \|\| SIInstrInfo::isAtomicNoRet(MI: Inst))) {
619	if (TII.mayAccessScratch(MI: Inst))
620	return SCRATCH_WRITE_ACCESS;
621	return VMEM_WRITE_ACCESS;
622	}
623	if (!ST.hasExtendedWaitCounts() \|\| SIInstrInfo::isFLAT(MI: Inst))
624	return VMEM_ACCESS;
625	return VmemReadMapping[getVmemType(Inst)];
626	}
627
628	std::optional<WaitEventType>
629	getExpertSchedulingEventType(const MachineInstr &Inst) const;
630
631	bool isAsync(const MachineInstr &MI) const {
632	if (!SIInstrInfo::isLDSDMA(MI))
633	return false;
634	if (SIInstrInfo::usesASYNC_CNT(MI))
635	return true;
636	const MachineOperand *Async =
637	TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::IsAsync);
638	return Async && (Async->getImm());
639	}
640
641	bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
642	return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
643	}
644
645	bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
646	return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
647	}
648
649	bool isVmemAccess(const MachineInstr &MI) const;
650	bool generateWaitcntInstBefore(MachineInstr &MI,
651	WaitcntBrackets &ScoreBrackets,
652	MachineInstr *OldWaitcntInstr,
653	PreheaderFlushFlags FlushFlags);
654	bool generateWaitcnt(AMDGPU::Waitcnt Wait,
655	MachineBasicBlock::instr_iterator It,
656	MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
657	MachineInstr *OldWaitcntInstr);
658	/// \returns all events that correspond to \p Inst.
659	WaitEventSet getEventsFor(const MachineInstr &Inst) const;
660	void updateEventWaitcntAfter(MachineInstr &Inst,
661	WaitcntBrackets *ScoreBrackets);
662	bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
663	MachineBasicBlock Block) const*;
664	bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
665	WaitcntBrackets &ScoreBrackets);
666	bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
667	WaitcntBrackets &ScoreBrackets);
668	/// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
669	/// Legalizer. Returns true if block was modified.
670	bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
671	void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
672	bool ExpertMode) const;
673	const WaitEventSet &getWaitEvents(InstCounterType T) const {
674	return WCG ->getWaitEvents(T);
675	}
676	InstCounterType getCounterFromEvent(WaitEventType E) const {
677	return WCG ->getCounterFromEvent(E);
678	}
679	};
680
681	// This object maintains the current score brackets of each wait counter, and
682	// a per-register scoreboard for each wait counter.
683	//
684	// We also maintain the latest score for every event type that can change the
685	// waitcnt in order to know if there are multiple types of events within
686	// the brackets. When multiple types of event happen in the bracket,
687	// wait count may get decreased out of order, therefore we need to put in
688	// "s_waitcnt 0" before use.
689	class WaitcntBrackets {
690	public:
691	WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
692	assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
693	}
694
695	#ifndef NDEBUG
696	~WaitcntBrackets() {
697	unsigned NumUnusedVmem = `0`, NumUnusedSGPRs = `0`;
698	for (auto &[ID, Val] : VMem) {
699	if (Val.empty())
700	++NumUnusedVmem;
701	}
702	for (auto &[ID, Val] : SGPRs) {
703	if (Val.empty())
704	++NumUnusedSGPRs;
705	}
706
707	if (NumUnusedVmem \|\| NumUnusedSGPRs) {
708	errs() << "WaitcntBracket had unused entries at destruction time: "
709	<< NumUnusedVmem << " VMem and " << NumUnusedSGPRs
710	<< " SGPR unused entries\n";
711	std::abort();
712	}
713	}
714	#endif
715
716	bool isSmemCounter(InstCounterType T) const {
717	return T == Context->SmemAccessCounter \|\| T == X_CNT;
718	}
719
720	unsigned getSgprScoresIdx(InstCounterType T) const {
721	assert(isSmemCounter(T) && "Invalid SMEM counter");
722	return T == X_CNT ? `1` : `0`;
723	}
724
725	unsigned getOutstanding(InstCounterType T) const {
726	return ScoreUBs[T] - ScoreLBs[T];
727	}
728
729	bool hasPendingVMEM(VMEMID ID, InstCounterType T) const {
730	return getVMemScore(TID: ID, T) > getScoreLB(T);
731	}
732
733	/// \Return true if we have no score entries for counter \p T.
734	bool empty(InstCounterType T) const { return getScoreRange(T) == `0`; }
735
736	private:
737	unsigned getScoreLB(InstCounterType T) const {
738	assert(T < NUM_INST_CNTS);
739	return ScoreLBs[T];
740	}
741
742	unsigned getScoreUB(InstCounterType T) const {
743	assert(T < NUM_INST_CNTS);
744	return ScoreUBs[T];
745	}
746
747	unsigned getScoreRange(InstCounterType T) const {
748	return getScoreUB(T) - getScoreLB(T);
749	}
750
751	unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
752	auto It = SGPRs.find(Val: RU);
753	return It != SGPRs.end() ? It ->second.Scores [getSgprScoresIdx(T)] : `0`;
754	}
755
756	unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
757	auto It = VMem.find(Val: TID);
758	return It != VMem.end() ? It ->second.Scores [T] : `0`;
759	}
760
761	public:
762	bool merge(const WaitcntBrackets &Other);
763
764	bool counterOutOfOrder(InstCounterType T) const;
765	void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
766	simplifyWaitcnt(CheckWait: Wait, UpdateWait&: Wait);
767	}
768	void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
769	AMDGPU::Waitcnt &UpdateWait) const;
770	void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
771	void simplifyWaitcnt(Waitcnt &Wait, InstCounterType T) const;
772	void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
773	AMDGPU::Waitcnt &UpdateWait) const;
774	void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
775	AMDGPU::Waitcnt &UpdateWait) const;
776
777	void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
778	AMDGPU::Waitcnt &Wait) const;
779	void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
780	AMDGPU::Waitcnt &Wait) const;
781	AMDGPU::Waitcnt determineAsyncWait(unsigned N);
782	void tryClearSCCWriteEvent(MachineInstr *Inst);
783
784	void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
785	void applyWaitcnt(InstCounterType T, unsigned Count);
786	void applyWaitcnt(const AMDGPU::Waitcnt &Wait, InstCounterType T);
787	void updateByEvent(WaitEventType E, MachineInstr &MI);
788	void recordAsyncMark(MachineInstr &MI);
789
790	bool hasPendingEvent() const { return !PendingEvents.empty(); }
791	bool hasPendingEvent(WaitEventType E) const {
792	return PendingEvents.contains(Event: E);
793	}
794	bool hasPendingEvent(InstCounterType T) const {
795	bool HasPending = PendingEvents & Context->getWaitEvents(T);
796	assert(HasPending == !empty(T) &&
797	"Expected pending events iff scoreboard is not empty");
798	return HasPending;
799	}
800
801	bool hasMixedPendingEvents(InstCounterType T) const {
802	WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
803	// Return true if more than one bit is set in Events.
804	return Events.twoOrMore();
805	}
806
807	bool hasPendingFlat() const {
808	return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
809	LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) \|\|
810	(LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
811	LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
812	}
813
814	void setPendingFlat() {
815	LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
816	LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
817	}
818
819	bool hasPendingGDS() const {
820	return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
821	}
822
823	unsigned getPendingGDSWait() const {
824	return std::min(a: getScoreUB(T: DS_CNT) - LastGDS,
825	b: getWaitCountMax(Limits: Context->getLimits(), T: DS_CNT) - `1`);
826	}
827
828	void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
829
830	// Return true if there might be pending writes to the vgpr-interval by VMEM
831	// instructions with types different from V.
832	bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
833	for (MCRegUnit RU : regunits(Reg)) {
834	auto It = VMem.find(Val: toVMEMID(RU));
835	if (It != VMem.end() && (It ->second.VMEMTypes & ~(`1` << V)))
836	return true;
837	}
838	return false;
839	}
840
841	void clearVgprVmemTypes(MCPhysReg Reg) {
842	for (MCRegUnit RU : regunits(Reg)) {
843	if (auto It = VMem.find(Val: toVMEMID(RU)); It != VMem.end()) {
844	It ->second.VMEMTypes = `0`;
845	if (It ->second.empty())
846	VMem.erase(I: It);
847	}
848	}
849	}
850
851	void setStateOnFunctionEntryOrReturn() {
852	setScoreUB(T: STORE_CNT, Val: getScoreUB(T: STORE_CNT) +
853	getWaitCountMax(Limits: Context->getLimits(), T: STORE_CNT));
854	PendingEvents \|= Context->getWaitEvents(T: STORE_CNT);
855	}
856
857	ArrayRef<const MachineInstr > getLDSDMAStores() const* {
858	return LDSDMAStores;
859	}
860
861	bool hasPointSampleAccel(const MachineInstr &MI) const;
862	bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
863	MCPhysReg RU) const;
864
865	void print(raw_ostream &) const;
866	void dump() const { print(dbgs()); }
867
868	// Free up memory by removing empty entries from the DenseMap that track event
869	// scores.
870	void purgeEmptyTrackingData();
871
872	private:
873	struct MergeInfo {
874	unsigned OldLB;
875	unsigned OtherLB;
876	unsigned MyShift;
877	unsigned OtherShift;
878	};
879
880	using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>;
881
882	void determineWaitForScore(InstCounterType T, unsigned Score,
883	AMDGPU::Waitcnt &Wait) const;
884
885	static bool mergeScore(const MergeInfo &M, unsigned &Score,
886	unsigned OtherScore);
887	bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
888	ArrayRef<CounterValueArray> OtherMarks);
889
890	iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
891	assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
892	if (!Context->TRI.isInAllocatableClass(RegNo: Reg))
893	return {{}, {}};
894	const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);
895	unsigned Size = Context->TRI.getRegSizeInBits(RC: *RC);
896	if (Size == `16` && Context->ST.hasD16Writes32BitVgpr())
897	Reg = Context->TRI.get32BitRegister(Reg);
898	return Context->TRI.regunits(Reg);
899	}
900
901	void setScoreLB(InstCounterType T, unsigned Val) {
902	assert(T < NUM_INST_CNTS);
903	ScoreLBs[T] = Val;
904	}
905
906	void setScoreUB(InstCounterType T, unsigned Val) {
907	assert(T < NUM_INST_CNTS);
908	ScoreUBs[T] = Val;
909
910	if (T != EXP_CNT)
911	return;
912
913	if (getScoreRange(T: EXP_CNT) > getWaitCountMax(Limits: Context->getLimits(), T: EXP_CNT))
914	ScoreLBs[EXP_CNT] =
915	ScoreUBs[EXP_CNT] - getWaitCountMax(Limits: Context->getLimits(), T: EXP_CNT);
916	}
917
918	void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
919	const SIRegisterInfo &TRI = Context->TRI;
920	if (Reg == AMDGPU::SCC) {
921	SCCScore = Val;
922	} else if (TRI.isVectorRegister(MRI: Context->MRI, Reg)) {
923	for (MCRegUnit RU : regunits(Reg))
924	VMem [toVMEMID(RU)].Scores [T] = Val;
925	} else if (TRI.isSGPRReg(MRI: Context->MRI, Reg)) {
926	auto STy = getSgprScoresIdx(T);
927	for (MCRegUnit RU : regunits(Reg))
928	SGPRs [RU].Scores [STy] = Val;
929	} else {
930	llvm_unreachable("Register cannot be tracked/unknown register!");
931	}
932	}
933
934	void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
935	VMem [TID].Scores [T] = Val;
936	}
937
938	void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
939	unsigned Val);
940
941	const SIInsertWaitcnts *Context;
942
943	unsigned ScoreLBs[NUM_INST_CNTS] = {`0`};
944	unsigned ScoreUBs[NUM_INST_CNTS] = {`0`};
945	WaitEventSet PendingEvents;
946	// Remember the last flat memory operation.
947	unsigned LastFlat[NUM_INST_CNTS] = {`0`};
948	// Remember the last GDS operation.
949	unsigned LastGDS = `0`;
950
951	// The score tracking logic is fragmented as follows:
952	// - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
953	// - SGPRs: SGPR RegUnits
954	// - SCC: Non-allocatable and not general purpose: not a SGPR.
955	//
956	// For the VMem case, if the key is within the range of LDS DMA IDs,
957	// then the corresponding index into the `LDSDMAStores` vector below is:
958	// Key - LDSDMA_BEGIN - 1
959	// This is because LDSDMA_BEGIN is a generic entry and does not have an
960	// associated MachineInstr.
961	//
  // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
963
964	struct VMEMInfo {
965	// Scores for all instruction counters. Zero-initialized.
966	CounterValueArray Scores{};
967	// Bitmask of the VmemTypes of VMEM instructions for this VGPR.
968	unsigned VMEMTypes = `0`;
969
970	bool empty() const { return all_of(Range: Scores, P: equal_to(Arg: `0`)) && !VMEMTypes; }
971	};
972
973	struct SGPRInfo {
974	// Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
975	// pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
976	// Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps
977	// the X_CNT score.
978	std::array<unsigned, `2`> Scores = {`0`};
979
980	bool empty() const { return !Scores [`0`] && !Scores [`1`]; }
981	};
982
983	DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
984	DenseMap<MCRegUnit, SGPRInfo> SGPRs;
985
986	// Reg score for SCC.
987	unsigned SCCScore = `0`;
988	// The unique instruction that has an SCC write pending, if there is one.
989	const MachineInstr PendingSCCWrite = nullptr*;
990
991	// Store representative LDS DMA operations. The only useful info here is
992	// alias info. One store is kept per unique AAInfo.
993	SmallVector<const MachineInstr *> LDSDMAStores;
994
995	// State of all counters at each async mark encountered so far.
996	SmallVector<CounterValueArray> AsyncMarks;
997
998	// But in the rare pathological case, a nest of loops that pushes marks
999	// without waiting on any mark can cause AsyncMarks to grow very large. We cap
1000	// it to a reasonable limit. We can tune this later or potentially introduce a
1001	// user option to control the value.
1002	static constexpr unsigned MaxAsyncMarks = `16`;
1003
1004	// Track the upper bound score for async operations that are not part of a
1005	// mark yet. Initialized to all zeros.
1006	CounterValueArray AsyncScore{};
1007	};
1008
1009	class SIInsertWaitcntsLegacy : public MachineFunctionPass {
1010	public:
1011	static char ID;
1012	SIInsertWaitcntsLegacy() : MachineFunctionPass (ID) {}
1013
1014	bool runOnMachineFunction(MachineFunction &MF) override;
1015
1016	StringRef getPassName() const override {
1017	return "SI insert wait instructions";
1018	}
1019
1020	void getAnalysisUsage(AnalysisUsage &AU) const override {
1021	AU.setPreservesCFG();
1022	AU.addRequired<MachineLoopInfoWrapperPass>();
1023	AU.addRequired<MachinePostDominatorTreeWrapperPass>();
1024	AU.addUsedIfAvailable<AAResultsWrapperPass>();
1025	AU.addPreserved<AAResultsWrapperPass>();
1026	MachineFunctionPass::getAnalysisUsage(AU);
1027	}
1028	};
1029
1030	} // end anonymous namespace
1031
1032	void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
1033	InstCounterType CntTy, unsigned Score) {
1034	setRegScore(Reg: Op.getReg().asMCReg(), T: CntTy, Val: Score);
1035	}
1036
1037	// Return true if the subtarget is one that enables Point Sample Acceleration
1038	// and the MachineInstr passed in is one to which it might be applied (the
1039	// hardware makes this decision based on several factors, but we can't determine
1040	// this at compile time, so we have to assume it might be applied if the
1041	// instruction supports it).
1042	bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
1043	if (!Context->ST.hasPointSampleAccel() \|\| !SIInstrInfo::isMIMG(MI))
1044	return false;
1045
1046	const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode());
1047	const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1048	AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
1049	return BaseInfo->PointSampleAccel;
1050	}
1051
1052	// Return true if the subtarget enables Point Sample Acceleration, the supplied
1053	// MachineInstr is one to which it might be applied and the supplied interval is
1054	// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
1055	// (this is the type that a point sample accelerated instruction effectively
1056	// becomes)
1057	bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
1058	MCPhysReg Reg) const {
1059	if (!hasPointSampleAccel(MI))
1060	return false;
1061
1062	return hasOtherPendingVmemTypes(Reg, V: VMEM_NOSAMPLER);
1063	}
1064
1065	void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
1066	InstCounterType T = Context->getCounterFromEvent(E);
1067	assert(T < Context->MaxCounter);
1068
1069	unsigned UB = getScoreUB(T);
1070	unsigned CurrScore = UB + `1`;
1071	if (CurrScore == `0`)
1072	report_fatal_error(reason: "InsertWaitcnt score wraparound");
1073	// PendingEvents and ScoreUB need to be update regardless if this event
1074	// changes the score of a register or not.
1075	// Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
1076	PendingEvents.insert(Event: E);
1077	setScoreUB(T, Val: CurrScore);
1078
1079	const SIRegisterInfo &TRI = Context->TRI;
1080	const MachineRegisterInfo &MRI = Context->MRI;
1081	const SIInstrInfo &TII = Context->TII;
1082
1083	if (T == EXP_CNT) {
1084	// Put score on the source vgprs. If this is a store, just use those
1085	// specific register(s).
1086	if (TII.isDS(MI: Inst) && Inst.mayLoadOrStore()) {
1087	// All GDS operations must protect their address register (same as
1088	// export.)
1089	if (const auto *AddrOp = TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::addr))
1090	setScoreByOperand(Op: *AddrOp, CntTy: EXP_CNT, Score: CurrScore);
1091
1092	if (Inst.mayStore()) {
1093	if (const auto *Data0 =
1094	TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data0))
1095	setScoreByOperand(Op: *Data0, CntTy: EXP_CNT, Score: CurrScore);
1096	if (const auto *Data1 =
1097	TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data1))
1098	setScoreByOperand(Op: *Data1, CntTy: EXP_CNT, Score: CurrScore);
1099	} else if (SIInstrInfo::isAtomicRet(MI: Inst) && !SIInstrInfo::isGWS(MI: Inst) &&
1100	Inst.getOpcode() != AMDGPU::DS_APPEND &&
1101	Inst.getOpcode() != AMDGPU::DS_CONSUME &&
1102	Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1103	for (const MachineOperand &Op : Inst.all_uses()) {
1104	if (TRI.isVectorRegister(MRI, Reg: Op.getReg()))
1105	setScoreByOperand(Op, CntTy: EXP_CNT, Score: CurrScore);
1106	}
1107	}
1108	} else if (TII.isFLAT(MI: Inst)) {
1109	if (Inst.mayStore()) {
1110	setScoreByOperand(Op: *TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
1111	CntTy: EXP_CNT, Score: CurrScore);
1112	} else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
1113	setScoreByOperand(Op: *TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
1114	CntTy: EXP_CNT, Score: CurrScore);
1115	}
1116	} else if (TII.isMIMG(MI: Inst)) {
1117	if (Inst.mayStore()) {
1118	setScoreByOperand(Op: Inst.getOperand(i: `0`), CntTy: EXP_CNT, Score: CurrScore);
1119	} else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
1120	setScoreByOperand(Op: *TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
1121	CntTy: EXP_CNT, Score: CurrScore);
1122	}
1123	} else if (TII.isMTBUF(MI: Inst)) {
1124	if (Inst.mayStore())
1125	setScoreByOperand(Op: Inst.getOperand(i: `0`), CntTy: EXP_CNT, Score: CurrScore);
1126	} else if (TII.isMUBUF(MI: Inst)) {
1127	if (Inst.mayStore()) {
1128	setScoreByOperand(Op: Inst.getOperand(i: `0`), CntTy: EXP_CNT, Score: CurrScore);
1129	} else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
1130	setScoreByOperand(Op: *TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
1131	CntTy: EXP_CNT, Score: CurrScore);
1132	}
1133	} else if (TII.isLDSDIR(MI: Inst)) {
1134	// LDSDIR instructions attach the score to the destination.
1135	setScoreByOperand(Op: *TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::vdst),
1136	CntTy: EXP_CNT, Score: CurrScore);
1137	} else {
1138	if (TII.isEXP(MI: Inst)) {
1139	// For export the destination registers are really temps that
1140	// can be used as the actual source after export patching, so
1141	// we need to treat them like sources and set the EXP_CNT
1142	// score.
1143	for (MachineOperand &DefMO : Inst.all_defs()) {
1144	if (TRI.isVGPR(MRI, Reg: DefMO.getReg())) {
1145	setScoreByOperand(Op: DefMO, CntTy: EXP_CNT, Score: CurrScore);
1146	}
1147	}
1148	}
1149	for (const MachineOperand &Op : Inst.all_uses()) {
1150	if (TRI.isVectorRegister(MRI, Reg: Op.getReg()))
1151	setScoreByOperand(Op, CntTy: EXP_CNT, Score: CurrScore);
1152	}
1153	}
1154	} else if (T == X_CNT) {
1155	WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1156	if (PendingEvents.contains(Event: OtherEvent)) {
1157	// Hardware inserts an implicit xcnt between interleaved
1158	// SMEM and VMEM operations. So there will never be
1159	// outstanding address translations for both SMEM and
1160	// VMEM at the same time.
1161	setScoreLB(T, Val: getScoreUB(T) - `1`);
1162	PendingEvents.remove(Event: OtherEvent);
1163	}
1164	for (const MachineOperand &Op : Inst.all_uses())
1165	setScoreByOperand(Op, CntTy: T, Score: CurrScore);
1166	} else if (T == VA_VDST \|\| T == VM_VSRC) {
1167	// Match the score to the VGPR destination or source registers as
1168	// appropriate
1169	for (const MachineOperand &Op : Inst.operands()) {
1170	if (!Op.isReg() \|\| (T == VA_VDST && Op.isUse()) \|\|
1171	(T == VM_VSRC && Op.isDef()))
1172	continue;
1173	if (TRI.isVectorRegister(MRI: Context->MRI, Reg: Op.getReg()))
1174	setScoreByOperand(Op, CntTy: T, Score: CurrScore);
1175	}
1176	} else / LGKM_CNT \|\| EXP_CNT \|\| VS_CNT \|\| NUM_INST_CNTS / {
1177	// Match the score to the destination registers.
1178	//
1179	// Check only explicit operands. Stores, especially spill stores, include
1180	// implicit uses and defs of their super registers which would create an
1181	// artificial dependency, while these are there only for register liveness
1182	// accounting purposes.
1183	//
1184	// Special cases where implicit register defs exists, such as M0 or VCC,
1185	// but none with memory instructions.
1186	for (const MachineOperand &Op : Inst.defs()) {
1187	if (T == LOAD_CNT \|\| T == SAMPLE_CNT \|\| T == BVH_CNT) {
1188	if (!TRI.isVectorRegister(MRI, Reg: Op.getReg())) // TODO: add wrapper
1189	continue;
1190	if (updateVMCntOnly(Inst)) {
1191	// updateVMCntOnly should only leave us with VGPRs
1192	// MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1193	// defs. That's required for a sane index into `VgprMemTypes` below
1194	assert(TRI.isVectorRegister(MRI, Op.getReg()));
1195	VmemType V = getVmemType(Inst);
1196	unsigned char TypesMask = `1` << V;
1197	// If instruction can have Point Sample Accel applied, we have to flag
1198	// this with another potential dependency
1199	if (hasPointSampleAccel(MI: Inst))
1200	TypesMask \|= `1` << VMEM_NOSAMPLER;
1201	for (MCRegUnit RU : regunits(Reg: Op.getReg().asMCReg()))
1202	VMem [toVMEMID(RU)].VMEMTypes \|= TypesMask;
1203	}
1204	}
1205	setScoreByOperand(Op, CntTy: T, Score: CurrScore);
1206	}
1207	if (Inst.mayStore() &&
1208	(TII.isDS(MI: Inst) \|\| Context->isNonAsyncLdsDmaWrite(MI: Inst))) {
1209	// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1210	// written can be accessed. A load from LDS to VMEM does not need a wait.
1211	//
1212	// The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1213	// there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1214	// store. The "Slot" is the index into LDSDMAStores + 1.
1215	unsigned Slot = `0`;
1216	for (const auto *MemOp : Inst.memoperands()) {
1217	if (!MemOp->isStore() \|\|
1218	MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1219	continue;
1220	// Comparing just AA info does not guarantee memoperands are equal
1221	// in general, but this is so for LDS DMA in practice.
1222	auto AAI = MemOp->getAAInfo();
1223	// Alias scope information gives a way to definitely identify an
1224	// original memory object and practically produced in the module LDS
1225	// lowering pass. If there is no scope available we will not be able
1226	// to disambiguate LDS aliasing as after the module lowering all LDS
1227	// is squashed into a single big object.
1228	if (!AAI \|\| !AAI.Scope)
1229	break;
1230	for (unsigned I = `0`, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1231	for (const auto *MemOp : LDSDMAStores [I]->memoperands()) {
1232	if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1233	Slot = I + `1`;
1234	break;
1235	}
1236	}
1237	}
1238	if (Slot)
1239	break;
1240	// The slot may not be valid because it can be >= NUM_LDSDMA which
1241	// means the scoreboard cannot track it. We still want to preserve the
1242	// MI in order to check alias information, though.
1243	LDSDMAStores.push_back(Elt: &Inst);
1244	Slot = LDSDMAStores.size();
1245	break;
1246	}
1247	setVMemScore(TID: LDSDMA_BEGIN, T, Val: CurrScore);
1248	if (Slot && Slot < NUM_LDSDMA)
1249	setVMemScore(TID: LDSDMA_BEGIN + Slot, T, Val: CurrScore);
1250	}
1251
1252	// FIXME: Not supported on GFX12 yet. Newer async operations use other
1253	// counters too, so will need a map from instruction or event types to
1254	// counter types.
1255	if (Context->isAsyncLdsDmaWrite(MI: Inst) && T == LOAD_CNT) {
1256	assert(!SIInstrInfo::usesASYNC_CNT(Inst) &&
1257	"unexpected GFX1250 instruction");
1258	AsyncScore [T] = CurrScore;
1259	}
1260
1261	if (SIInstrInfo::isSBarrierSCCWrite(Opcode: Inst.getOpcode())) {
1262	setRegScore(Reg: AMDGPU::SCC, T, Val: CurrScore);
1263	PendingSCCWrite = &Inst;
1264	}
1265	}
1266	}
1267
1268	void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1269	// In the absence of loops, AsyncMarks can grow linearly with the program
1270	// until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
1271	// limit every time we push a new mark, but that seems like unnecessary work
1272	// in practical cases. We do separately truncate the array when processing a
1273	// loop, which should be sufficient.
1274	AsyncMarks.push_back(Elt: AsyncScore);
1275	AsyncScore = {};
1276	LLVM_DEBUG({
1277	dbgs() << "recordAsyncMark:\n" << Inst;
1278	for (const auto &Mark : AsyncMarks) {
1279	llvm::interleaveComma(Mark, dbgs());
1280	dbgs() << `'\n'`;
1281	}
1282	});
1283	}
1284
1285	void WaitcntBrackets::print(raw_ostream &OS) const {
1286	const GCNSubtarget &ST = Context->ST;
1287
1288	for (auto T : inst_counter_types(MaxCounter: Context->MaxCounter)) {
1289	unsigned SR = getScoreRange(T);
1290	switch (T) {
1291	case LOAD_CNT:
1292	OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1293	<< SR << "):";
1294	break;
1295	case DS_CNT:
1296	OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1297	<< SR << "):";
1298	break;
1299	case EXP_CNT:
1300	OS << " EXP_CNT(" << SR << "):";
1301	break;
1302	case STORE_CNT:
1303	OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1304	<< SR << "):";
1305	break;
1306	case SAMPLE_CNT:
1307	OS << " SAMPLE_CNT(" << SR << "):";
1308	break;
1309	case BVH_CNT:
1310	OS << " BVH_CNT(" << SR << "):";
1311	break;
1312	case KM_CNT:
1313	OS << " KM_CNT(" << SR << "):";
1314	break;
1315	case X_CNT:
1316	OS << " X_CNT(" << SR << "):";
1317	break;
1318	case VA_VDST:
1319	OS << " VA_VDST(" << SR << "): ";
1320	break;
1321	case VM_VSRC:
1322	OS << " VM_VSRC(" << SR << "): ";
1323	break;
1324	default:
1325	OS << " UNKNOWN(" << SR << "):";
1326	break;
1327	}
1328
1329	if (SR != `0`) {
1330	// Print vgpr scores.
1331	unsigned LB = getScoreLB(T);
1332
1333	SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1334	sort(C&: SortedVMEMIDs);
1335
1336	for (auto ID : SortedVMEMIDs) {
1337	unsigned RegScore = VMem.at(Val: ID).Scores [T];
1338	if (RegScore <= LB)
1339	continue;
1340	unsigned RelScore = RegScore - LB - `1`;
1341	if (ID < REGUNITS_END) {
1342	OS << `' '` << RelScore << ":vRU" << ID;
1343	} else {
1344	assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1345	"Unhandled/unexpected ID value!");
1346	OS << `' '` << RelScore << ":LDSDMA" << ID;
1347	}
1348	}
1349
1350	// Also need to print sgpr scores for lgkm_cnt or xcnt.
1351	if (isSmemCounter(T)) {
1352	SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1353	sort(C&: SortedSMEMIDs);
1354	for (auto ID : SortedSMEMIDs) {
1355	unsigned RegScore = SGPRs.at(Val: ID).Scores [getSgprScoresIdx(T)];
1356	if (RegScore <= LB)
1357	continue;
1358	unsigned RelScore = RegScore - LB - `1`;
1359	OS << `' '` << RelScore << ":sRU" << static_cast<unsigned>(ID);
1360	}
1361	}
1362
1363	if (T == KM_CNT && SCCScore > `0`)
1364	OS << `' '` << SCCScore << ":scc";
1365	}
1366	OS << `'\n'`;
1367	}
1368
1369	OS << "Pending Events: ";
1370	if (hasPendingEvent()) {
1371	ListSeparator LS;
1372	for (unsigned I = `0`; I != NUM_WAIT_EVENTS; ++I) {
1373	if (hasPendingEvent(E: (WaitEventType)I)) {
1374	OS << LS << WaitEventTypeName[I];
1375	}
1376	}
1377	} else {
1378	OS << "none";
1379	}
1380	OS << `'\n'`;
1381
1382	OS << "Async score: ";
1383	if (AsyncScore.empty())
1384	OS << "none";
1385	else
1386	llvm::interleaveComma(c: AsyncScore, os&: OS);
1387	OS << `'\n'`;
1388
1389	OS << "Async marks: " << AsyncMarks.size() << `'\n'`;
1390
1391	for (const auto &Mark : AsyncMarks) {
1392	for (auto T : inst_counter_types()) {
1393	unsigned MarkedScore = Mark [T];
1394	switch (T) {
1395	case LOAD_CNT:
1396	OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
1397	<< "_CNT: " << MarkedScore;
1398	break;
1399	case DS_CNT:
1400	OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
1401	<< "_CNT: " << MarkedScore;
1402	break;
1403	case EXP_CNT:
1404	OS << " EXP_CNT: " << MarkedScore;
1405	break;
1406	case STORE_CNT:
1407	OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
1408	<< "_CNT: " << MarkedScore;
1409	break;
1410	case SAMPLE_CNT:
1411	OS << " SAMPLE_CNT: " << MarkedScore;
1412	break;
1413	case BVH_CNT:
1414	OS << " BVH_CNT: " << MarkedScore;
1415	break;
1416	case KM_CNT:
1417	OS << " KM_CNT: " << MarkedScore;
1418	break;
1419	case X_CNT:
1420	OS << " X_CNT: " << MarkedScore;
1421	break;
1422	default:
1423	OS << " UNKNOWN: " << MarkedScore;
1424	break;
1425	}
1426	}
1427	OS << `'\n'`;
1428	}
1429	OS << `'\n'`;
1430	}
1431
1432	/// Simplify \p UpdateWait by removing waits that are redundant based on the
1433	/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1434	void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1435	AMDGPU::Waitcnt &UpdateWait) const {
1436	simplifyWaitcnt(Wait&: UpdateWait, T: LOAD_CNT);
1437	simplifyWaitcnt(Wait&: UpdateWait, T: EXP_CNT);
1438	simplifyWaitcnt(Wait&: UpdateWait, T: DS_CNT);
1439	simplifyWaitcnt(Wait&: UpdateWait, T: STORE_CNT);
1440	simplifyWaitcnt(Wait&: UpdateWait, T: SAMPLE_CNT);
1441	simplifyWaitcnt(Wait&: UpdateWait, T: BVH_CNT);
1442	simplifyWaitcnt(Wait&: UpdateWait, T: KM_CNT);
1443	simplifyXcnt(CheckWait, UpdateWait);
1444	simplifyWaitcnt(Wait&: UpdateWait, T: VA_VDST);
1445	simplifyVmVsrc(CheckWait, UpdateWait);
1446	}
1447
1448	void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1449	unsigned &Count) const {
1450	// The number of outstanding events for this type, T, can be calculated
1451	// as (UB - LB). If the current Count is greater than or equal to the number
1452	// of outstanding events, then the wait for this counter is redundant.
1453	if (Count >= getScoreRange(T))
1454	Count = ~`0u`;
1455	}
1456
1457	void WaitcntBrackets::simplifyWaitcnt(Waitcnt &Wait, InstCounterType T) const {
1458	unsigned Cnt = Wait.get(T);
1459	simplifyWaitcnt(T, Count&: Cnt);
1460	Wait.set(T, Val: Cnt);
1461	}
1462
1463	void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1464	AMDGPU::Waitcnt &UpdateWait) const {
1465	// Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1466	// optimizations. On entry to a block with multiple predescessors, there may
1467	// be pending SMEM and VMEM events active at the same time.
1468	// In such cases, only clear one active event at a time.
1469	// TODO: Revisit xcnt optimizations for gfx1250.
1470	// Wait on XCNT is redundant if we are already waiting for a load to complete.
1471	// SMEM can return out of order, so only omit XCNT wait if we are waiting till
1472	// zero.
1473	if (CheckWait.get(T: KM_CNT) == `0` && hasPendingEvent(E: SMEM_GROUP))
1474	UpdateWait.set(T: X_CNT, Val: ~`0u`);
1475	// If we have pending store we cannot optimize XCnt because we do not wait for
1476	// stores. VMEM loads retun in order, so if we only have loads XCnt is
1477	// decremented to the same number as LOADCnt.
1478	if (CheckWait.get(T: LOAD_CNT) != ~`0u` && hasPendingEvent(E: VMEM_GROUP) &&
1479	!hasPendingEvent(T: STORE_CNT) &&
1480	CheckWait.get(T: X_CNT) >= CheckWait.get(T: LOAD_CNT))
1481	UpdateWait.set(T: X_CNT, Val: ~`0u`);
1482	simplifyWaitcnt(Wait&: UpdateWait, T: X_CNT);
1483	}
1484
1485	void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1486	AMDGPU::Waitcnt &UpdateWait) const {
1487	// Waiting for some counters implies waiting for VM_VSRC, since an
1488	// instruction that decrements a counter on completion would have
1489	// decremented VM_VSRC once its VGPR operands had been read.
1490	if (CheckWait.get(T: VM_VSRC) >=
1491	std::min(l: {CheckWait.get(T: LOAD_CNT), CheckWait.get(T: STORE_CNT),
1492	CheckWait.get(T: SAMPLE_CNT), CheckWait.get(T: BVH_CNT),
1493	CheckWait.get(T: DS_CNT)}))
1494	UpdateWait.set(T: VM_VSRC, Val: ~`0u`);
1495	simplifyWaitcnt(Wait&: UpdateWait, T: VM_VSRC);
1496	}
1497
1498	void WaitcntBrackets::purgeEmptyTrackingData() {
1499	for (auto &[K, V] : make_early_inc_range(Range&: VMem)) {
1500	if (V.empty())
1501	VMem.erase(Val: K);
1502	}
1503	for (auto &[K, V] : make_early_inc_range(Range&: SGPRs)) {
1504	if (V.empty())
1505	SGPRs.erase(Val: K);
1506	}
1507	}
1508
1509	void WaitcntBrackets::determineWaitForScore(InstCounterType T,
1510	unsigned ScoreToWait,
1511	AMDGPU::Waitcnt &Wait) const {
1512	const unsigned LB = getScoreLB(T);
1513	const unsigned UB = getScoreUB(T);
1514
1515	// If the score falls within the bracket, we need a waitcnt.
1516	if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1517	if ((T == LOAD_CNT \|\| T == DS_CNT) && hasPendingFlat() &&
1518	!Context->ST.hasFlatLgkmVMemCountInOrder()) {
1519	// If there is a pending FLAT operation, and this is a VMem or LGKM
1520	// waitcnt and the target can report early completion, then we need
1521	// to force a waitcnt 0.
1522	addWait(Wait, T, Count: `0`);
1523	} else if (counterOutOfOrder(T)) {
1524	// Counter can get decremented out-of-order when there
1525	// are multiple types event in the bracket. Also emit an s_wait counter
1526	// with a conservative value of 0 for the counter.
1527	addWait(Wait, T, Count: `0`);
1528	} else {
1529	// If a counter has been maxed out avoid overflow by waiting for
1530	// MAX(CounterType) - 1 instead.
1531	unsigned NeededWait = std::min(
1532	a: UB - ScoreToWait, b: getWaitCountMax(Limits: Context->getLimits(), T) - `1`);
1533	addWait(Wait, T, Count: NeededWait);
1534	}
1535	}
1536	}
1537
1538	AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
1539	LLVM_DEBUG({
1540	dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
1541	<< ":\n";
1542	for (const auto &Mark : AsyncMarks) {
1543	llvm::interleaveComma(Mark, dbgs());
1544	dbgs() << `'\n'`;
1545	}
1546	});
1547
1548	if (AsyncMarks.size() == MaxAsyncMarks) {
1549	// Enforcing MaxAsyncMarks here is unnecessary work because the size of
1550	// MaxAsyncMarks is linear when traversing straightline code. But we do
1551	// need to check if truncation may have occured at a merge, and adjust N
1552	// to ensure that a wait is generated.
1553	LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
1554	N = std::min(a: N, b: (unsigned)MaxAsyncMarks - `1`);
1555	}
1556
1557	AMDGPU::Waitcnt Wait;
1558	if (AsyncMarks.size() <= N) {
1559	LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
1560	return Wait;
1561	}
1562
1563	size_t MarkIndex = AsyncMarks.size() - N - `1`;
1564	const auto &RequiredMark = AsyncMarks [MarkIndex];
1565	for (InstCounterType T : inst_counter_types())
1566	determineWaitForScore(T, ScoreToWait: RequiredMark [T], Wait);
1567
1568	// Immediately remove the waited mark and all older ones
1569	// This happens BEFORE the wait is actually inserted, which is fine
1570	// because we've already extracted the wait requirements
1571	LLVM_DEBUG({
1572	dbgs() << "Removing " << (MarkIndex + `1`)
1573	<< " async marks after determining wait\n";
1574	});
1575	AsyncMarks.erase(CS: AsyncMarks.begin(), CE: AsyncMarks.begin() + MarkIndex + `1`);
1576
1577	LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
1578	return Wait;
1579	}
1580
1581	void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
1582	AMDGPU::Waitcnt &Wait) const {
1583	if (Reg == AMDGPU::SCC) {
1584	determineWaitForScore(T, ScoreToWait: SCCScore, Wait);
1585	} else {
1586	bool IsVGPR = Context->TRI.isVectorRegister(MRI: Context->MRI, Reg);
1587	for (MCRegUnit RU : regunits(Reg))
1588	determineWaitForScore(
1589	T, ScoreToWait: IsVGPR ? getVMemScore(TID: toVMEMID(RU), T) : getSGPRScore(RU, T),
1590	Wait);
1591	}
1592	}
1593
1594	void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
1595	AMDGPU::Waitcnt &Wait) const {
1596	assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1597	determineWaitForScore(T, ScoreToWait: getVMemScore(TID, T), Wait);
1598	}
1599
1600	void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1601	// S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1602	// SCC has landed
1603	if (PendingSCCWrite &&
1604	PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1605	PendingSCCWrite->getOperand(i: `0`).getImm() == Inst->getOperand(i: `0`).getImm()) {
1606	WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1607	// If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1608	if ((PendingEvents & Context->getWaitEvents(T: KM_CNT)) ==
1609	SCC_WRITE_PendingEvent) {
1610	setScoreLB(T: KM_CNT, Val: getScoreUB(T: KM_CNT));
1611	}
1612
1613	PendingEvents.remove(Other: SCC_WRITE_PendingEvent);
1614	PendingSCCWrite = nullptr;
1615	}
1616	}
1617
1618	void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1619	for (InstCounterType T : inst_counter_types())
1620	applyWaitcnt(Wait, T);
1621	}
1622
1623	void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1624	const unsigned UB = getScoreUB(T);
1625	if (Count >= UB)
1626	return;
1627	if (Count != `0`) {
1628	if (counterOutOfOrder(T))
1629	return;
1630	setScoreLB(T, Val: std::max(a: getScoreLB(T), b: UB - Count));
1631	} else {
1632	setScoreLB(T, Val: UB);
1633	PendingEvents.remove(Other: Context->getWaitEvents(T));
1634	}
1635
1636	if (T == KM_CNT && Count == `0` && hasPendingEvent(E: SMEM_GROUP)) {
1637	if (!hasMixedPendingEvents(T: X_CNT))
1638	applyWaitcnt(T: X_CNT, Count: `0`);
1639	else
1640	PendingEvents.remove(Event: SMEM_GROUP);
1641	}
1642	if (T == LOAD_CNT && hasPendingEvent(E: VMEM_GROUP) &&
1643	!hasPendingEvent(T: STORE_CNT)) {
1644	if (!hasMixedPendingEvents(T: X_CNT))
1645	applyWaitcnt(T: X_CNT, Count);
1646	else if (Count == `0`)
1647	PendingEvents.remove(Event: VMEM_GROUP);
1648	}
1649	}
1650
1651	void WaitcntBrackets::applyWaitcnt(const Waitcnt &Wait, InstCounterType T) {
1652	unsigned Cnt = Wait.get(T);
1653	applyWaitcnt(T, Count: Cnt);
1654	}
1655
1656	// Where there are multiple types of event in the bracket of a counter,
1657	// the decrement may go out of order.
1658	bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1659	// Scalar memory read always can go out of order.
1660	if ((T == Context->SmemAccessCounter && hasPendingEvent(E: SMEM_ACCESS)) \|\|
1661	(T == X_CNT && hasPendingEvent(E: SMEM_GROUP)))
1662	return true;
1663
1664	// GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1665	// so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1666	// out-of-order completion.
1667	if (T == LOAD_CNT) {
1668	unsigned Events = hasPendingEvent(T);
1669	// Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1670	// events
1671	Events &= ~(`1` << GLOBAL_INV_ACCESS);
1672	// Return true only if there are still multiple event types after removing
1673	// GLOBAL_INV
1674	return Events & (Events - `1`);
1675	}
1676
1677	return hasMixedPendingEvents(T);
1678	}
1679
1680	INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1681	false, false)
1682	INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1683	INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1684	INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1685	false, false)
1686
1687	char SIInsertWaitcntsLegacy::ID = `0`;
1688
1689	char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1690
1691	FunctionPass *llvm::createSIInsertWaitcntsPass() {
1692	return new SIInsertWaitcntsLegacy ();
1693	}
1694
1695	static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1696	unsigned NewEnc) {
1697	int OpIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpName);
1698	assert(OpIdx >= `0`);
1699
1700	MachineOperand &MO = MI.getOperand(i: OpIdx);
1701
1702	if (NewEnc == MO.getImm())
1703	return false;
1704
1705	MO.setImm(NewEnc);
1706	return true;
1707	}
1708
1709	/// Determine if \p MI is a gfx12+ single-counter S_WAIT_CNT instruction,*
1710	/// and if so, which counter it is waiting on.
1711	static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1712	switch (Opcode) {
1713	case AMDGPU::S_WAIT_LOADCNT:
1714	return LOAD_CNT;
1715	case AMDGPU::S_WAIT_EXPCNT:
1716	return EXP_CNT;
1717	case AMDGPU::S_WAIT_STORECNT:
1718	return STORE_CNT;
1719	case AMDGPU::S_WAIT_SAMPLECNT:
1720	return SAMPLE_CNT;
1721	case AMDGPU::S_WAIT_BVHCNT:
1722	return BVH_CNT;
1723	case AMDGPU::S_WAIT_DSCNT:
1724	return DS_CNT;
1725	case AMDGPU::S_WAIT_KMCNT:
1726	return KM_CNT;
1727	case AMDGPU::S_WAIT_XCNT:
1728	return X_CNT;
1729	default:
1730	return {};
1731	}
1732	}
1733
1734	bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr Waitcnt) const* {
1735	unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Waitcnt->getOpcode());
1736	if (Opcode == Waitcnt->getOpcode())
1737	return false;
1738
1739	Waitcnt->setDesc(TII.get(Opcode));
1740	return true;
1741	}
1742
1743	/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1744	/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1745	/// from \p Wait that were added by previous passes. Currently this pass
1746	/// conservatively assumes that these preexisting waits are required for
1747	/// correctness.
1748	bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1749	WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1750	AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1751	assert(isNormalMode(MaxCounter));
1752
1753	bool Modified = false;
1754	MachineInstr WaitcntInstr = nullptr*;
1755	MachineInstr WaitcntVsCntInstr = nullptr*;
1756
1757	LLVM_DEBUG({
1758	dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1759	if (It.isEnd())
1760	dbgs() << "end of block\n";
1761	else
1762	dbgs() << *It;
1763	});
1764
1765	for (auto &II :
1766	make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
1767	LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1768	if (II.isMetaInstruction()) {
1769	LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1770	continue;
1771	}
1772
1773	unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
1774	bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1775
1776	// Update required wait count. If this is a soft waitcnt (= it was added
1777	// by an earlier pass), it may be entirely removed.
1778	if (Opcode == AMDGPU::S_WAITCNT) {
1779	unsigned IEnc = II.getOperand(i: `0`).getImm();
1780	AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(Version: IV, Encoded: IEnc);
1781	if (TrySimplify)
1782	ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1783	Wait = Wait.combined(Other: OldWait);
1784
1785	// Merge consecutive waitcnt of the same type by erasing multiples.
1786	if (WaitcntInstr \|\| (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1787	II.eraseFromParent();
1788	Modified = true;
1789	} else
1790	WaitcntInstr = &II;
1791	} else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1792	assert(ST.hasVMemToLDSLoad());
1793	LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1794	<< "Before: " << Wait << `'\n'`;);
1795	ScoreBrackets.determineWaitForLDSDMA(T: LOAD_CNT, TID: LDSDMA_BEGIN, Wait);
1796	LLVM_DEBUG(dbgs() << "After: " << Wait << `'\n'`;);
1797
1798	// It is possible (but unlikely) that this is the only wait instruction,
1799	// in which case, we exit this loop without a WaitcntInstr to consume
1800	// `Wait`. But that works because `Wait` was passed in by reference, and
1801	// the callee eventually calls createNewWaitcnt on it. We test this
1802	// possibility in an articial MIR test since such a situation cannot be
1803	// recreated by running the memory legalizer.
1804	II.eraseFromParent();
1805	} else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1806	unsigned N = II.getOperand(i: `0`).getImm();
1807	LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << `'\n'`;);
1808	AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1809	Wait = Wait.combined(Other: OldWait);
1810	} else {
1811	assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1812	assert(II.getOperand(`0`).getReg() == AMDGPU::SGPR_NULL);
1813
1814	unsigned OldVSCnt =
1815	TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1816	if (TrySimplify)
1817	ScoreBrackets.simplifyWaitcnt(T: InstCounterType::STORE_CNT, Count&: OldVSCnt);
1818	Wait.set(T: STORE_CNT, Val: std::min(a: Wait.get(T: STORE_CNT), b: OldVSCnt));
1819
1820	if (WaitcntVsCntInstr \|\| (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1821	II.eraseFromParent();
1822	Modified = true;
1823	} else
1824	WaitcntVsCntInstr = &II;
1825	}
1826	}
1827
1828	if (WaitcntInstr) {
1829	Modified \|= updateOperandIfDifferent(MI&: *WaitcntInstr, OpName: AMDGPU::OpName::simm16,
1830	NewEnc: AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait));
1831	Modified \|= promoteSoftWaitCnt(Waitcnt: WaitcntInstr);
1832
1833	ScoreBrackets.applyWaitcnt(Wait, T: LOAD_CNT);
1834	ScoreBrackets.applyWaitcnt(Wait, T: EXP_CNT);
1835	ScoreBrackets.applyWaitcnt(Wait, T: DS_CNT);
1836	Wait.set(T: LOAD_CNT, Val: ~`0u`);
1837	Wait.set(T: EXP_CNT, Val: ~`0u`);
1838	Wait.set(T: DS_CNT, Val: ~`0u`);
1839
1840	LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1841	<< "New Instr at block end: "
1842	<< *WaitcntInstr << `'\n'`
1843	: dbgs() << "applied pre-existing waitcnt\n"
1844	<< "Old Instr: " << *It
1845	<< "New Instr: " << *WaitcntInstr << `'\n'`);
1846	}
1847
1848	if (WaitcntVsCntInstr) {
1849	Modified \|= updateOperandIfDifferent(
1850	MI&: *WaitcntVsCntInstr, OpName: AMDGPU::OpName::simm16, NewEnc: Wait.get(T: STORE_CNT));
1851	Modified \|= promoteSoftWaitCnt(Waitcnt: WaitcntVsCntInstr);
1852
1853	ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.get(T: STORE_CNT));
1854	Wait.set(T: STORE_CNT, Val: ~`0u`);
1855
1856	LLVM_DEBUG(It.isEnd()
1857	? dbgs() << "applied pre-existing waitcnt\n"
1858	<< "New Instr at block end: " << *WaitcntVsCntInstr
1859	<< `'\n'`
1860	: dbgs() << "applied pre-existing waitcnt\n"
1861	<< "Old Instr: " << *It
1862	<< "New Instr: " << *WaitcntVsCntInstr << `'\n'`);
1863	}
1864
1865	return Modified;
1866	}
1867
1868	/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1869	/// required counters in \p Wait
1870	bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1871	MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1872	AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
1873	assert(isNormalMode(MaxCounter));
1874
1875	bool Modified = false;
1876	const DebugLoc &DL = Block.findDebugLoc(MBBI: It);
1877
1878	// Helper to emit expanded waitcnt sequence for profiling.
1879	// Emits waitcnts from (Outstanding-1) down to Target.
1880	// The EmitWaitcnt callback emits a single waitcnt.
1881	auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
1882	auto EmitWaitcnt) {
1883	do {
1884	EmitWaitcnt(--Outstanding);
1885	} while (Outstanding > Target);
1886	Modified = true;
1887	};
1888
1889	// Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1890	// single instruction while VScnt has its own instruction.
1891	if (Wait.hasWaitExceptStoreCnt()) {
1892	// If profiling expansion is enabled, emit an expanded sequence
1893	if (ExpandWaitcntProfiling) {
1894	// Check if any of the counters to be waited on are out-of-order.
1895	// If so, fall back to normal (non-expanded) behavior since expansion
1896	// would provide misleading profiling information.
1897	bool AnyOutOfOrder = false;
1898	for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
1899	unsigned WaitCnt = Wait.get(T: CT);
1900	if (WaitCnt != ~`0u` && ScoreBrackets.counterOutOfOrder(T: CT)) {
1901	AnyOutOfOrder = true;
1902	break;
1903	}
1904	}
1905
1906	if (AnyOutOfOrder) {
1907	// Fall back to non-expanded wait
1908	unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait);
1909	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: Enc);
1910	Modified = true;
1911	} else {
1912	// All counters are in-order, safe to expand
1913	for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
1914	unsigned WaitCnt = Wait.get(T: CT);
1915	if (WaitCnt == ~`0u`)
1916	continue;
1917
1918	unsigned Outstanding = std::min(a: ScoreBrackets.getOutstanding(T: CT),
1919	b: getWaitCountMax(Limits: getLimits(), T: CT) - `1`);
1920	EmitExpandedWaitcnt (Outstanding, WaitCnt, [&](unsigned Count) {
1921	AMDGPU::Waitcnt W;
1922	W.set(T: CT, Val: Count);
1923	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT))
1924	.addImm(Val: AMDGPU::encodeWaitcnt(Version: IV, Decoded: W));
1925	});
1926	}
1927	}
1928	} else {
1929	// Normal behavior: emit single combined waitcnt
1930	unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait);
1931	[[maybe_unused]] auto SWaitInst =
1932	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: Enc);
1933	Modified = true;
1934
1935	LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1936	if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1937	dbgs() << "New Instr: " << *SWaitInst << `'\n'`);
1938	}
1939	}
1940
1941	if (Wait.hasWaitStoreCnt()) {
1942	assert(ST.hasVscnt());
1943
1944	if (ExpandWaitcntProfiling && Wait.get(T: STORE_CNT) != ~`0u` &&
1945	!ScoreBrackets.counterOutOfOrder(T: STORE_CNT)) {
1946	// Only expand if counter is not out-of-order
1947	unsigned Outstanding =
1948	std::min(a: ScoreBrackets.getOutstanding(T: STORE_CNT),
1949	b: getWaitCountMax(Limits: getLimits(), T: STORE_CNT) - `1`);
1950	EmitExpandedWaitcnt (
1951	Outstanding, Wait.get(T: STORE_CNT), [&](unsigned Count) {
1952	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
1953	.addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
1954	.addImm(Val: Count);
1955	});
1956	} else {
1957	[[maybe_unused]] auto SWaitInst =
1958	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
1959	.addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
1960	.addImm(Val: Wait.get(T: STORE_CNT));
1961	Modified = true;
1962
1963	LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1964	if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1965	dbgs() << "New Instr: " << *SWaitInst << `'\n'`);
1966	}
1967	}
1968
1969	return Modified;
1970	}
1971
1972	AMDGPU::Waitcnt
1973	WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1974	return AMDGPU::Waitcnt (`0`, `0`, `0`, IncludeVSCnt && ST.hasVscnt() ? `0` : ~`0u`);
1975	}
1976
1977	AMDGPU::Waitcnt
1978	WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1979	unsigned ExpertVal = IsExpertMode ? `0` : ~`0u`;
1980	return AMDGPU::Waitcnt (`0`, `0`, `0`, IncludeVSCnt ? `0` : ~`0u`, `0`, `0`, `0`,
1981	~`0u` / XCNT /, ExpertVal, ExpertVal);
1982	}
1983
1984	/// Combine consecutive S_WAIT_CNT instructions that precede \p It and*
1985	/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1986	/// were added by previous passes. Currently this pass conservatively
1987	/// assumes that these preexisting waits are required for correctness.
1988	bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1989	WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1990	AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1991	assert(!isNormalMode(MaxCounter));
1992
1993	bool Modified = false;
1994	MachineInstr CombinedLoadDsCntInstr = nullptr*;
1995	MachineInstr CombinedStoreDsCntInstr = nullptr*;
1996	MachineInstr WaitcntDepctrInstr = nullptr*;
1997	MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1998
1999	LLVM_DEBUG({
2000	dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
2001	if (It.isEnd())
2002	dbgs() << "end of block\n";
2003	else
2004	dbgs() << *It;
2005	});
2006
2007	// Accumulate waits that should not be simplified.
2008	AMDGPU::Waitcnt RequiredWait;
2009
2010	for (auto &II :
2011	make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
2012	LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
2013	if (II.isMetaInstruction()) {
2014	LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
2015	continue;
2016	}
2017
2018	// Update required wait count. If this is a soft waitcnt (= it was added
2019	// by an earlier pass), it may be entirely removed.
2020
2021	unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
2022	bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
2023
2024	// Don't crash if the programmer used legacy waitcnt intrinsics, but don't
2025	// attempt to do more than that either.
2026	if (Opcode == AMDGPU::S_WAITCNT)
2027	continue;
2028
2029	if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
2030	unsigned OldEnc =
2031	TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
2032	AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(Version: IV, LoadcntDscnt: OldEnc);
2033	if (TrySimplify)
2034	Wait = Wait.combined(Other: OldWait);
2035	else
2036	RequiredWait = RequiredWait.combined(Other: OldWait);
2037	// Keep the first wait_loadcnt, erase the rest.
2038	if (CombinedLoadDsCntInstr == nullptr) {
2039	CombinedLoadDsCntInstr = &II;
2040	} else {
2041	II.eraseFromParent();
2042	Modified = true;
2043	}
2044	} else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
2045	unsigned OldEnc =
2046	TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
2047	AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(Version: IV, StorecntDscnt: OldEnc);
2048	if (TrySimplify)
2049	Wait = Wait.combined(Other: OldWait);
2050	else
2051	RequiredWait = RequiredWait.combined(Other: OldWait);
2052	// Keep the first wait_storecnt, erase the rest.
2053	if (CombinedStoreDsCntInstr == nullptr) {
2054	CombinedStoreDsCntInstr = &II;
2055	} else {
2056	II.eraseFromParent();
2057	Modified = true;
2058	}
2059	} else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2060	unsigned OldEnc =
2061	TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
2062	AMDGPU::Waitcnt OldWait;
2063	OldWait.set(T: VA_VDST, Val: AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: OldEnc));
2064	OldWait.set(T: VM_VSRC, Val: AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: OldEnc));
2065	if (TrySimplify)
2066	ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
2067	Wait = Wait.combined(Other: OldWait);
2068	if (WaitcntDepctrInstr == nullptr) {
2069	WaitcntDepctrInstr = &II;
2070	} else {
2071	// S_WAITCNT_DEPCTR requires special care. Don't remove a
2072	// duplicate if it is waiting on things other than VA_VDST or
2073	// VM_VSRC. If that is the case, just make sure the VA_VDST and
2074	// VM_VSRC subfields of the operand are set to the "no wait"
2075	// values.
2076
2077	unsigned Enc =
2078	TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
2079	Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Encoded: Enc, VmVsrc: ~`0u`);
2080	Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Encoded: Enc, VaVdst: ~`0u`);
2081
2082	if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(STI: ST)) {
2083	Modified \|= updateOperandIfDifferent(MI&: II, OpName: AMDGPU::OpName::simm16, NewEnc: Enc);
2084	Modified \|= promoteSoftWaitCnt(Waitcnt: &II);
2085	} else {
2086	II.eraseFromParent();
2087	Modified = true;
2088	}
2089	}
2090	} else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
2091	// Architectures higher than GFX10 do not have direct loads to
2092	// LDS, so no work required here yet.
2093	II.eraseFromParent();
2094	Modified = true;
2095	} else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
2096	reportFatalUsageError(reason: "WAIT_ASYNCMARK is not ready for GFX12 yet");
2097	} else {
2098	std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
2099	assert(CT.has_value());
2100	unsigned OldCnt =
2101	TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
2102	if (TrySimplify)
2103	addWait(Wait, T: CT.value(), Count: OldCnt);
2104	else
2105	addWait(Wait&: RequiredWait, T: CT.value(), Count: OldCnt);
2106	// Keep the first wait of its kind, erase the rest.
2107	if (WaitInstrs[CT.value()] == nullptr) {
2108	WaitInstrs[CT.value()] = &II;
2109	} else {
2110	II.eraseFromParent();
2111	Modified = true;
2112	}
2113	}
2114	}
2115
2116	ScoreBrackets.simplifyWaitcnt(CheckWait: Wait.combined(Other: RequiredWait), UpdateWait&: Wait);
2117	Wait = Wait.combined(Other: RequiredWait);
2118
2119	if (CombinedLoadDsCntInstr) {
2120	// Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
2121	// to be waited for. Otherwise, let the instruction be deleted so
2122	// the appropriate single counter wait instruction can be inserted
2123	// instead, when new S_WAIT_CNT instructions are inserted by*
2124	// createNewWaitcnt(). As a side effect, resetting the wait counts will
2125	// cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
2126	// the loop below that deals with single counter instructions.
2127	//
2128	// A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
2129	// instructions that have decremented LOAD_CNT or DS_CNT on completion
2130	// will have needed to wait for their register sources to be available
2131	// first.
2132	if (Wait.get(T: LOAD_CNT) != ~`0u` && Wait.get(T: DS_CNT) != ~`0u`) {
2133	unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
2134	Modified \|= updateOperandIfDifferent(MI&: *CombinedLoadDsCntInstr,
2135	OpName: AMDGPU::OpName::simm16, NewEnc);
2136	Modified \|= promoteSoftWaitCnt(Waitcnt: CombinedLoadDsCntInstr);
2137	ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.get(T: LOAD_CNT));
2138	ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.get(T: DS_CNT));
2139	Wait.set(T: LOAD_CNT, Val: ~`0u`);
2140	Wait.set(T: DS_CNT, Val: ~`0u`);
2141
2142	LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2143	<< "New Instr at block end: "
2144	<< *CombinedLoadDsCntInstr << `'\n'`
2145	: dbgs() << "applied pre-existing waitcnt\n"
2146	<< "Old Instr: " << *It << "New Instr: "
2147	<< *CombinedLoadDsCntInstr << `'\n'`);
2148	} else {
2149	CombinedLoadDsCntInstr->eraseFromParent();
2150	Modified = true;
2151	}
2152	}
2153
2154	if (CombinedStoreDsCntInstr) {
2155	// Similarly for S_WAIT_STORECNT_DSCNT.
2156	if (Wait.get(T: STORE_CNT) != ~`0u` && Wait.get(T: DS_CNT) != ~`0u`) {
2157	unsigned NewEnc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
2158	Modified \|= updateOperandIfDifferent(MI&: *CombinedStoreDsCntInstr,
2159	OpName: AMDGPU::OpName::simm16, NewEnc);
2160	Modified \|= promoteSoftWaitCnt(Waitcnt: CombinedStoreDsCntInstr);
2161	ScoreBrackets.applyWaitcnt(Wait, T: STORE_CNT);
2162	ScoreBrackets.applyWaitcnt(Wait, T: DS_CNT);
2163	Wait.set(T: STORE_CNT, Val: ~`0u`);
2164	Wait.set(T: DS_CNT, Val: ~`0u`);
2165
2166	LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2167	<< "New Instr at block end: "
2168	<< *CombinedStoreDsCntInstr << `'\n'`
2169	: dbgs() << "applied pre-existing waitcnt\n"
2170	<< "Old Instr: " << *It << "New Instr: "
2171	<< *CombinedStoreDsCntInstr << `'\n'`);
2172	} else {
2173	CombinedStoreDsCntInstr->eraseFromParent();
2174	Modified = true;
2175	}
2176	}
2177
2178	// Look for an opportunity to convert existing S_WAIT_LOADCNT,
2179	// S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
2180	// or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
2181	// instructions so that createNewWaitcnt() will create new combined
2182	// instructions to replace them.
2183
2184	if (Wait.get(T: DS_CNT) != ~`0u`) {
2185	// This is a vector of addresses in WaitInstrs pointing to instructions
2186	// that should be removed if they are present.
2187	SmallVector<MachineInstr **, `2`> WaitsToErase;
2188
2189	// If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2190	// both) need to be waited for, ensure that there are no existing
2191	// individual wait count instructions for these.
2192
2193	if (Wait.get(T: LOAD_CNT) != ~`0u`) {
2194	WaitsToErase.push_back(Elt: &WaitInstrs[LOAD_CNT]);
2195	WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
2196	} else if (Wait.get(T: STORE_CNT) != ~`0u`) {
2197	WaitsToErase.push_back(Elt: &WaitInstrs[STORE_CNT]);
2198	WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
2199	}
2200
2201	for (MachineInstr **WI : WaitsToErase) {
2202	if (!*WI)
2203	continue;
2204
2205	(*WI)->eraseFromParent();
2206	WI = nullptr*;
2207	Modified = true;
2208	}
2209	}
2210
2211	for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
2212	if (!WaitInstrs[CT])
2213	continue;
2214
2215	unsigned NewCnt = Wait.get(T: CT);
2216	if (NewCnt != ~`0u`) {
2217	Modified \|= updateOperandIfDifferent(MI&: *WaitInstrs[CT],
2218	OpName: AMDGPU::OpName::simm16, NewEnc: NewCnt);
2219	Modified \|= promoteSoftWaitCnt(Waitcnt: WaitInstrs[CT]);
2220
2221	ScoreBrackets.applyWaitcnt(T: CT, Count: NewCnt);
2222	setNoWait(Wait, T: CT);
2223
2224	LLVM_DEBUG(It.isEnd()
2225	? dbgs() << "applied pre-existing waitcnt\n"
2226	<< "New Instr at block end: " << *WaitInstrs[CT]
2227	<< `'\n'`
2228	: dbgs() << "applied pre-existing waitcnt\n"
2229	<< "Old Instr: " << *It
2230	<< "New Instr: " << *WaitInstrs[CT] << `'\n'`);
2231	} else {
2232	WaitInstrs[CT]->eraseFromParent();
2233	Modified = true;
2234	}
2235	}
2236
2237	if (WaitcntDepctrInstr) {
2238	// Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2239	// subfields with the new required values.
2240	unsigned Enc =
2241	TII.getNamedOperand(MI&: *WaitcntDepctrInstr, OperandName: AMDGPU::OpName::simm16)
2242	->getImm();
2243	Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Encoded: Enc, VmVsrc: Wait.get(T: VM_VSRC));
2244	Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Encoded: Enc, VaVdst: Wait.get(T: VA_VDST));
2245
2246	ScoreBrackets.applyWaitcnt(T: VA_VDST, Count: Wait.get(T: VA_VDST));
2247	ScoreBrackets.applyWaitcnt(T: VM_VSRC, Count: Wait.get(T: VM_VSRC));
2248	Wait.set(T: VA_VDST, Val: ~`0u`);
2249	Wait.set(T: VM_VSRC, Val: ~`0u`);
2250
2251	// If that new encoded Depctr immediate would actually still wait
2252	// for anything, update the instruction's operand. Otherwise it can
2253	// just be deleted.
2254	if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(STI: ST)) {
2255	Modified \|= updateOperandIfDifferent(MI&: *WaitcntDepctrInstr,
2256	OpName: AMDGPU::OpName::simm16, NewEnc: Enc);
2257	LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2258	<< "New Instr at block end: "
2259	<< *WaitcntDepctrInstr << `'\n'`
2260	: dbgs() << "applyPreexistingWaitcnt\n"
2261	<< "Old Instr: " << *It << "New Instr: "
2262	<< *WaitcntDepctrInstr << `'\n'`);
2263	} else {
2264	WaitcntDepctrInstr->eraseFromParent();
2265	Modified = true;
2266	}
2267	}
2268
2269	return Modified;
2270	}
2271
2272	/// Generate S_WAIT_CNT instructions for any required counters in \p Wait*
2273	bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2274	MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2275	AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2276	assert(!isNormalMode(MaxCounter));
2277
2278	bool Modified = false;
2279	const DebugLoc &DL = Block.findDebugLoc(MBBI: It);
2280
2281	// Helper to emit expanded waitcnt sequence for profiling.
2282	auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
2283	auto EmitWaitcnt) {
2284	for (unsigned I = Outstanding - `1`; I > Target && I != ~`0u`; --I)
2285	EmitWaitcnt(I);
2286	EmitWaitcnt(Target);
2287	Modified = true;
2288	};
2289
2290	// For GFX12+, we use separate wait instructions, which makes expansion
2291	// simpler
2292	if (ExpandWaitcntProfiling) {
2293	for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
2294	unsigned Count = Wait.get(T: CT);
2295	if (Count == ~`0u`)
2296	continue;
2297
2298	// Skip expansion for out-of-order counters - emit normal wait instead
2299	if (ScoreBrackets.counterOutOfOrder(T: CT)) {
2300	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: instrsForExtendedCounterTypes[CT]))
2301	.addImm(Val: Count);
2302	Modified = true;
2303	continue;
2304	}
2305
2306	unsigned Outstanding = std::min(a: ScoreBrackets.getOutstanding(T: CT),
2307	b: getWaitCountMax(Limits: getLimits(), T: CT) - `1`);
2308	EmitExpandedWaitcnt (Outstanding, Count, [&](unsigned Val) {
2309	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: instrsForExtendedCounterTypes[CT]))
2310	.addImm(Val);
2311	});
2312	}
2313	return Modified;
2314	}
2315
2316	// Normal behavior (no expansion)
2317	// Check for opportunities to use combined wait instructions.
2318	if (Wait.get(T: DS_CNT) != ~`0u`) {
2319	MachineInstr SWaitInst = nullptr*;
2320
2321	if (Wait.get(T: LOAD_CNT) != ~`0u`) {
2322	unsigned Enc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
2323
2324	SWaitInst = BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT))
2325	.addImm(Val: Enc);
2326
2327	Wait.set(T: LOAD_CNT, Val: ~`0u`);
2328	Wait.set(T: DS_CNT, Val: ~`0u`);
2329	} else if (Wait.get(T: STORE_CNT) != ~`0u`) {
2330	unsigned Enc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
2331
2332	SWaitInst = BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAIT_STORECNT_DSCNT))
2333	.addImm(Val: Enc);
2334
2335	Wait.set(T: STORE_CNT, Val: ~`0u`);
2336	Wait.set(T: DS_CNT, Val: ~`0u`);
2337	}
2338
2339	if (SWaitInst) {
2340	Modified = true;
2341
2342	LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2343	if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2344	dbgs() << "New Instr: " << *SWaitInst << `'\n'`);
2345	}
2346	}
2347
2348	// Generate an instruction for any remaining counter that needs
2349	// waiting for.
2350
2351	for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
2352	unsigned Count = Wait.get(T: CT);
2353	if (Count == ~`0u`)
2354	continue;
2355
2356	[[maybe_unused]] auto SWaitInst =
2357	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: instrsForExtendedCounterTypes[CT]))
2358	.addImm(Val: Count);
2359
2360	Modified = true;
2361
2362	LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2363	if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2364	dbgs() << "New Instr: " << *SWaitInst << `'\n'`);
2365	}
2366
2367	if (Wait.hasWaitDepctr()) {
2368	assert(IsExpertMode);
2369	unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: Wait.get(T: VM_VSRC), STI: ST);
2370	Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Encoded: Enc, VaVdst: Wait.get(T: VA_VDST));
2371
2372	[[maybe_unused]] auto SWaitInst =
2373	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)).addImm(Val: Enc);
2374
2375	Modified = true;
2376
2377	LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2378	if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2379	dbgs() << "New Instr: " << *SWaitInst << `'\n'`);
2380	}
2381
2382	return Modified;
2383	}
2384
2385	/// Generate s_waitcnt instruction to be placed before cur_Inst.
2386	/// Instructions of a given type are returned in order,
2387	/// but instructions of different types can complete out of order.
2388	/// We rely on this in-order completion
2389	/// and simply assign a score to the memory access instructions.
2390	/// We keep track of the active "score bracket" to determine
2391	/// if an access of a memory read requires an s_waitcnt
2392	/// and if so what the value of each counter is.
2393	/// The "score bracket" is bound by the lower bound and upper bound
2394	/// scores (_score_LB and _score_ub respectively).
2395	/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2396	/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2397	/// (GFX12+ only, where DS_CNT is a separate counter).
2398	bool SIInsertWaitcnts::generateWaitcntInstBefore(
2399	MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2400	MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2401	LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2402	setForceEmitWaitcnt();
2403
2404	assert(!MI.isMetaInstruction());
2405
2406	AMDGPU::Waitcnt Wait;
2407	const unsigned Opc = MI.getOpcode();
2408
2409	switch (Opc) {
2410	case AMDGPU::BUFFER_WBINVL1:
2411	case AMDGPU::BUFFER_WBINVL1_SC:
2412	case AMDGPU::BUFFER_WBINVL1_VOL:
2413	case AMDGPU::BUFFER_GL0_INV:
2414	case AMDGPU::BUFFER_GL1_INV: {
2415	// FIXME: This should have already been handled by the memory legalizer.
2416	// Removing this currently doesn't affect any lit tests, but we need to
2417	// verify that nothing was relying on this. The number of buffer invalidates
2418	// being handled here should not be expanded.
2419	Wait.set(T: LOAD_CNT, Val: `0`);
2420	break;
2421	}
2422	case AMDGPU::SI_RETURN_TO_EPILOG:
2423	case AMDGPU::SI_RETURN:
2424	case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2425	case AMDGPU::S_SETPC_B64_return: {
2426	// All waits must be resolved at call return.
2427	// NOTE: this could be improved with knowledge of all call sites or
2428	// with knowledge of the called routines.
2429	ReturnInsts.insert(V: &MI);
2430	AMDGPU::Waitcnt AllZeroWait =
2431	WCG ->getAllZeroWaitcnt(/IncludeVSCnt=/false);
2432	// On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2433	// (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2434	// GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2435	// no need to wait for it at function boundaries.
2436	if (ST.hasExtendedWaitCounts() &&
2437	!ScoreBrackets.hasPendingEvent(E: VMEM_ACCESS))
2438	AllZeroWait.set(T: LOAD_CNT, Val: ~`0u`);
2439	Wait = AllZeroWait;
2440	break;
2441	}
2442	case AMDGPU::S_ENDPGM:
2443	case AMDGPU::S_ENDPGM_SAVED: {
2444	// In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2445	// Technically the hardware will do this on its own if we don't, but that
2446	// might cost extra cycles compared to doing it explicitly.
2447	// When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2448	// have to wait for outstanding VMEM stores. In this case it can be useful
2449	// to send a message to explicitly release all VGPRs before the stores have
2450	// completed, but it is only safe to do this if there are no outstanding
2451	// scratch stores.
2452	EndPgmInsts [&MI] = !ScoreBrackets.empty(T: STORE_CNT) &&
2453	!ScoreBrackets.hasPendingEvent(E: SCRATCH_WRITE_ACCESS);
2454	break;
2455	}
2456	case AMDGPU::S_SENDMSG:
2457	case AMDGPU::S_SENDMSGHALT: {
2458	if (ST.hasLegacyGeometry() &&
2459	((MI.getOperand(i: `0`).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2460	AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
2461	// Resolve vm waits before gs-done.
2462	Wait.set(T: LOAD_CNT, Val: `0`);
2463	break;
2464	}
2465	[[fallthrough]];
2466	}
2467	default: {
2468
2469	// Export & GDS instructions do not read the EXEC mask until after the
2470	// export is granted (which can occur well after the instruction is issued).
2471	// The shader program must flush all EXP operations on the export-count
2472	// before overwriting the EXEC mask.
2473	if (MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI)) {
2474	// Export and GDS are tracked individually, either may trigger a waitcnt
2475	// for EXEC.
2476	if (ScoreBrackets.hasPendingEvent(E: EXP_GPR_LOCK) \|\|
2477	ScoreBrackets.hasPendingEvent(E: EXP_PARAM_ACCESS) \|\|
2478	ScoreBrackets.hasPendingEvent(E: EXP_POS_ACCESS) \|\|
2479	ScoreBrackets.hasPendingEvent(E: GDS_GPR_LOCK)) {
2480	Wait.set(T: EXP_CNT, Val: `0`);
2481	}
2482	}
2483
2484	// Wait for any pending GDS instruction to complete before any
2485	// "Always GDS" instruction.
2486	if (TII.isAlwaysGDS(Opcode: Opc) && ScoreBrackets.hasPendingGDS())
2487	addWait(Wait, T: DS_CNT, Count: ScoreBrackets.getPendingGDSWait());
2488
2489	if (MI.isCall()) {
2490	// The function is going to insert a wait on everything in its prolog.
2491	// This still needs to be careful if the call target is a load (e.g. a GOT
2492	// load). We also need to check WAW dependency with saved PC.
2493	CallInsts.insert(V: &MI);
2494	Wait = AMDGPU::Waitcnt ();
2495
2496	const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
2497	if (CallAddrOp.isReg()) {
2498	ScoreBrackets.determineWaitForPhysReg(
2499	T: SmemAccessCounter, Reg: CallAddrOp.getReg().asMCReg(), Wait);
2500
2501	if (const auto *RtnAddrOp =
2502	TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::dst)) {
2503	ScoreBrackets.determineWaitForPhysReg(
2504	T: SmemAccessCounter, Reg: RtnAddrOp->getReg().asMCReg(), Wait);
2505	}
2506	}
2507	} else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2508	ScoreBrackets.tryClearSCCWriteEvent(Inst: &MI);
2509	} else {
2510	// FIXME: Should not be relying on memoperands.
2511	// Look at the source operands of every instruction to see if
2512	// any of them results from a previous memory operation that affects
2513	// its current usage. If so, an s_waitcnt instruction needs to be
2514	// emitted.
2515	// If the source operand was defined by a load, add the s_waitcnt
2516	// instruction.
2517	//
2518	// Two cases are handled for destination operands:
2519	// 1) If the destination operand was defined by a load, add the s_waitcnt
2520	// instruction to guarantee the right WAW order.
2521	// 2) If a destination operand that was used by a recent export/store ins,
2522	// add s_waitcnt on exp_cnt to guarantee the WAR order.
2523
2524	for (const MachineMemOperand *Memop : MI.memoperands()) {
2525	const Value *Ptr = Memop->getValue();
2526	if (Memop->isStore()) {
2527	if (auto It = SLoadAddresses.find(Val: Ptr); It != SLoadAddresses.end()) {
2528	addWait(Wait, T: SmemAccessCounter, Count: `0`);
2529	if (PDT.dominates(A: MI.getParent(), B: It ->second))
2530	SLoadAddresses.erase(I: It);
2531	}
2532	}
2533	unsigned AS = Memop->getAddrSpace();
2534	if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
2535	continue;
2536	// No need to wait before load from VMEM to LDS.
2537	if (TII.mayWriteLDSThroughDMA(MI))
2538	continue;
2539
2540	// LOAD_CNT is only relevant to vgpr or LDS.
2541	unsigned TID = LDSDMA_BEGIN;
2542	if (Ptr && Memop->getAAInfo()) {
2543	const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2544	for (unsigned I = `0`, E = LDSDMAStores.size(); I != E; ++I) {
2545	if (MI.mayAlias(AA, Other: LDSDMAStores [I], UseTBAA: true*)) {
2546	if ((I + `1`) >= NUM_LDSDMA) {
2547	// We didn't have enough slot to track this LDS DMA store, it
2548	// has been tracked using the common RegNo (FIRST_LDS_VGPR).
2549	ScoreBrackets.determineWaitForLDSDMA(T: LOAD_CNT, TID, Wait);
2550	break;
2551	}
2552
2553	ScoreBrackets.determineWaitForLDSDMA(T: LOAD_CNT, TID: TID + I + `1`, Wait);
2554	}
2555	}
2556	} else {
2557	ScoreBrackets.determineWaitForLDSDMA(T: LOAD_CNT, TID, Wait);
2558	}
2559	if (Memop->isStore()) {
2560	ScoreBrackets.determineWaitForLDSDMA(T: EXP_CNT, TID, Wait);
2561	}
2562	}
2563
2564	// Loop over use and def operands.
2565	for (const MachineOperand &Op : MI.operands()) {
2566	if (!Op.isReg())
2567	continue;
2568
2569	// If the instruction does not read tied source, skip the operand.
2570	if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
2571	continue;
2572
2573	MCPhysReg Reg = Op.getReg().asMCReg();
2574
2575	const bool IsVGPR = TRI.isVectorRegister(MRI, Reg: Op.getReg());
2576	if (IsVGPR) {
2577	// Implicit VGPR defs and uses are never a part of the memory
2578	// instructions description and usually present to account for
2579	// super-register liveness.
2580	// TODO: Most of the other instructions also have implicit uses
2581	// for the liveness accounting only.
2582	if (Op.isImplicit() && MI.mayLoadOrStore())
2583	continue;
2584
2585	ScoreBrackets.determineWaitForPhysReg(T: VA_VDST, Reg, Wait);
2586	if (Op.isDef())
2587	ScoreBrackets.determineWaitForPhysReg(T: VM_VSRC, Reg, Wait);
2588	// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2589	// previous write and this write are the same type of VMEM
2590	// instruction, in which case they are (in some architectures)
2591	// guaranteed to write their results in order anyway.
2592	// Additionally check instructions where Point Sample Acceleration
2593	// might be applied.
2594	if (Op.isUse() \|\| !updateVMCntOnly(Inst: MI) \|\|
2595	ScoreBrackets.hasOtherPendingVmemTypes(Reg, V: getVmemType(Inst: MI)) \|\|
2596	ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) \|\|
2597	!ST.hasVmemWriteVgprInOrder()) {
2598	ScoreBrackets.determineWaitForPhysReg(T: LOAD_CNT, Reg, Wait);
2599	ScoreBrackets.determineWaitForPhysReg(T: SAMPLE_CNT, Reg, Wait);
2600	ScoreBrackets.determineWaitForPhysReg(T: BVH_CNT, Reg, Wait);
2601	ScoreBrackets.clearVgprVmemTypes(Reg);
2602	}
2603
2604	if (Op.isDef() \|\| ScoreBrackets.hasPendingEvent(E: EXP_LDS_ACCESS)) {
2605	ScoreBrackets.determineWaitForPhysReg(T: EXP_CNT, Reg, Wait);
2606	}
2607	ScoreBrackets.determineWaitForPhysReg(T: DS_CNT, Reg, Wait);
2608	} else if (Op.getReg() == AMDGPU::SCC) {
2609	ScoreBrackets.determineWaitForPhysReg(T: KM_CNT, Reg, Wait);
2610	} else {
2611	ScoreBrackets.determineWaitForPhysReg(T: SmemAccessCounter, Reg, Wait);
2612	}
2613
2614	if (ST.hasWaitXcnt() && Op.isDef())
2615	ScoreBrackets.determineWaitForPhysReg(T: X_CNT, Reg, Wait);
2616	}
2617	}
2618	}
2619	}
2620
2621	// Ensure safety against exceptions from outstanding memory operations while
2622	// waiting for a barrier:
2623	//
2624	// Some subtargets safely handle backing off the barrier in hardware*
2625	// when an exception occurs.
2626	// Some subtargets have an implicit S_WAITCNT 0 before barriers, so that*
2627	// there can be no outstanding memory operations during the wait.
2628	// Subtargets with split barriers don't need to back off the barrier; it*
2629	// is up to the trap handler to preserve the user barrier state correctly.
2630	//
2631	// In all other cases, ensure safety by ensuring that there are no outstanding
2632	// memory operations.
2633	if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
2634	!ST.hasBackOffBarrier()) {
2635	Wait = Wait.combined(Other: WCG ->getAllZeroWaitcnt(/IncludeVSCnt=/true));
2636	}
2637
2638	// TODO: Remove this work-around, enable the assert for Bug 457939
2639	// after fixing the scheduler. Also, the Shader Compiler code is
2640	// independent of target.
2641	if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
2642	ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
2643	Wait.set(T: DS_CNT, Val: `0`);
2644	}
2645
2646	// Verify that the wait is actually needed.
2647	ScoreBrackets.simplifyWaitcnt(Wait);
2648
2649	// It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2650	// waits on VA_VDST if the instruction it would precede is not a VALU
2651	// instruction, since hardware handles VALU->VGPR->VALU hazards in
2652	// expert scheduling mode.
2653	if (TII.isVALU(MI))
2654	Wait.set(T: VA_VDST, Val: ~`0u`);
2655
2656	// Since the translation for VMEM addresses occur in-order, we can apply the
2657	// XCnt if the current instruction is of VMEM type and has a memory
2658	// dependency with another VMEM instruction in flight.
2659	if (Wait.get(T: X_CNT) != ~`0u` && isVmemAccess(MI)) {
2660	ScoreBrackets.applyWaitcnt(Wait, T: X_CNT);
2661	Wait.set(T: X_CNT, Val: ~`0u`);
2662	}
2663
2664	// When forcing emit, we need to skip terminators because that would break the
2665	// terminators of the MBB if we emit a waitcnt between terminators.
2666	if (ForceEmitZeroFlag && !MI.isTerminator())
2667	Wait = WCG ->getAllZeroWaitcnt(/IncludeVSCnt=/false);
2668
2669	// If we force waitcnt then update Wait accordingly.
2670	for (InstCounterType T : inst_counter_types()) {
2671	if (!ForceEmitWaitcnt[T])
2672	continue;
2673	Wait.set(T, Val: `0`);
2674	}
2675
2676	if (FlushFlags.FlushVmCnt) {
2677	for (InstCounterType T : {LOAD_CNT, SAMPLE_CNT, BVH_CNT})
2678	Wait.set(T, Val: `0`);
2679	}
2680
2681	if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(T: DS_CNT))
2682	Wait.set(T: DS_CNT, Val: `0`);
2683
2684	if (ForceEmitZeroLoadFlag && Wait.get(T: LOAD_CNT) != ~`0u`)
2685	Wait.set(T: LOAD_CNT, Val: `0`);
2686
2687	return generateWaitcnt(Wait, It: MI.getIterator(), Block&: *MI.getParent(), ScoreBrackets,
2688	OldWaitcntInstr);
2689	}
2690
2691	bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2692	MachineBasicBlock::instr_iterator It,
2693	MachineBasicBlock &Block,
2694	WaitcntBrackets &ScoreBrackets,
2695	MachineInstr *OldWaitcntInstr) {
2696	bool Modified = false;
2697
2698	if (OldWaitcntInstr)
2699	// Try to merge the required wait with preexisting waitcnt instructions.
2700	// Also erase redundant waitcnt.
2701	Modified =
2702	WCG ->applyPreexistingWaitcnt(ScoreBrackets, OldWaitcntInstr&: *OldWaitcntInstr, Wait, It);
2703
2704	// ExpCnt can be merged into VINTERP.
2705	if (Wait.get(T: EXP_CNT) != ~`0u` && It != Block.instr_end() &&
2706	SIInstrInfo::isVINTERP(MI: *It)) {
2707	MachineOperand WaitExp = TII.getNamedOperand(MI&: It, OperandName: AMDGPU::OpName::waitexp);
2708	if (Wait.get(T: EXP_CNT) < WaitExp->getImm()) {
2709	WaitExp->setImm(Wait.get(T: EXP_CNT));
2710	Modified = true;
2711	}
2712	// Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
2713	ScoreBrackets.applyWaitcnt(Wait, T: EXP_CNT);
2714	Wait.set(T: EXP_CNT, Val: ~`0u`);
2715
2716	LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2717	<< "Update Instr: " << *It);
2718	}
2719
2720	if (WCG ->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
2721	Modified = true;
2722
2723	// Any counts that could have been applied to any existing waitcnt
2724	// instructions will have been done so, now deal with any remaining.
2725	ScoreBrackets.applyWaitcnt(Wait);
2726
2727	return Modified;
2728	}
2729
2730	std::optional<WaitEventType>
2731	SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
2732	if (TII.isVALU(MI: Inst)) {
2733	// Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
2734	// out-of-order with respect to each other, so each of these classes
2735	// has its own event.
2736
2737	if (TII.isXDL(MI: Inst))
2738	return VGPR_XDL_WRITE;
2739
2740	if (TII.isTRANS(MI: Inst))
2741	return VGPR_TRANS_WRITE;
2742
2743	if (AMDGPU::isDPMACCInstruction(Opc: Inst.getOpcode()))
2744	return VGPR_DPMACC_WRITE;
2745
2746	return VGPR_CSMACC_WRITE;
2747	}
2748
2749	// FLAT and LDS instructions may read their VGPR sources out-of-order
2750	// with respect to each other and all other VMEM instructions, so
2751	// each of these also has a separate event.
2752
2753	if (TII.isFLAT(MI: Inst))
2754	return VGPR_FLAT_READ;
2755
2756	if (TII.isDS(MI: Inst))
2757	return VGPR_LDS_READ;
2758
2759	if (TII.isVMEM(MI: Inst) \|\| TII.isVIMAGE(MI: Inst) \|\| TII.isVSAMPLE(MI: Inst))
2760	return VGPR_VMEM_READ;
2761
2762	// Otherwise, no hazard.
2763
2764	return {};
2765	}
2766
2767	bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2768	return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) \|\|
2769	(TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(Opc: MI.getOpcode()));
2770	}
2771
2772	// Return true if the next instruction is S_ENDPGM, following fallthrough
2773	// blocks if necessary.
2774	bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2775	MachineBasicBlock Block) const* {
2776	auto BlockEnd = Block->getParent()->end();
2777	auto BlockIter = Block->getIterator();
2778
2779	while (true) {
2780	if (It.isEnd()) {
2781	if (++BlockIter != BlockEnd) {
2782	It = BlockIter ->instr_begin();
2783	continue;
2784	}
2785
2786	return false;
2787	}
2788
2789	if (!It ->isMetaInstruction())
2790	break;
2791
2792	It ++;
2793	}
2794
2795	assert(!It.isEnd());
2796
2797	return It ->getOpcode() == AMDGPU::S_ENDPGM;
2798	}
2799
2800	// Add a wait after an instruction if architecture requirements mandate one.
2801	bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2802	MachineBasicBlock &Block,
2803	WaitcntBrackets &ScoreBrackets) {
2804	AMDGPU::Waitcnt Wait;
2805	bool NeedsEndPGMCheck = false;
2806
2807	if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2808	Wait = WCG ->getAllZeroWaitcnt(IncludeVSCnt: Inst.mayStore() &&
2809	!SIInstrInfo::isAtomicRet(MI: Inst));
2810
2811	if (TII.isAlwaysGDS(Opcode: Inst.getOpcode())) {
2812	Wait.set(T: DS_CNT, Val: `0`);
2813	NeedsEndPGMCheck = true;
2814	}
2815
2816	ScoreBrackets.simplifyWaitcnt(Wait);
2817
2818	auto SuccessorIt = std::next(x: Inst.getIterator());
2819	bool Result = generateWaitcnt(Wait, It: SuccessorIt, Block, ScoreBrackets,
2820	/OldWaitcntInstr=/nullptr);
2821
2822	if (Result && NeedsEndPGMCheck && isNextENDPGM(It: SuccessorIt, Block: &Block)) {
2823	BuildMI(BB&: Block, I: SuccessorIt, MIMD: Inst.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_NOP))
2824	.addImm(Val: `0`);
2825	}
2826
2827	return Result;
2828	}
2829
2830	WaitEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
2831	WaitEventSet Events;
2832	if (IsExpertMode) {
2833	if (const auto ET = getExpertSchedulingEventType(Inst))
2834	Events.insert(Event: *ET);
2835	}
2836
2837	if (TII.isDS(MI: Inst) && TII.usesLGKM_CNT(MI: Inst)) {
2838	if (TII.isAlwaysGDS(Opcode: Inst.getOpcode()) \|\|
2839	TII.hasModifiersSet(MI: Inst, OpName: AMDGPU::OpName::gds)) {
2840	Events.insert(Event: GDS_ACCESS);
2841	Events.insert(Event: GDS_GPR_LOCK);
2842	} else {
2843	Events.insert(Event: LDS_ACCESS);
2844	}
2845	} else if (TII.isFLAT(MI: Inst)) {
2846	if (SIInstrInfo::isGFX12CacheInvOrWBInst(Opc: Inst.getOpcode())) {
2847	Events.insert(Event: getVmemWaitEventType(Inst));
2848	} else {
2849	assert(Inst.mayLoadOrStore());
2850	if (TII.mayAccessVMEMThroughFlat(MI: Inst)) {
2851	if (ST.hasWaitXcnt())
2852	Events.insert(Event: VMEM_GROUP);
2853	Events.insert(Event: getVmemWaitEventType(Inst));
2854	}
2855	if (TII.mayAccessLDSThroughFlat(MI: Inst))
2856	Events.insert(Event: LDS_ACCESS);
2857	}
2858	} else if (SIInstrInfo::isVMEM(MI: Inst) &&
2859	(!AMDGPU::getMUBUFIsBufferInv(Opc: Inst.getOpcode()) \|\|
2860	Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
2861	// BUFFER_WBL2 is included here because unlike invalidates, has to be
2862	// followed "S_WAITCNT vmcnt(0)" is needed after to ensure the writeback has
2863	// completed.
2864	if (ST.hasWaitXcnt())
2865	Events.insert(Event: VMEM_GROUP);
2866	Events.insert(Event: getVmemWaitEventType(Inst));
2867	if (ST.vmemWriteNeedsExpWaitcnt() &&
2868	(Inst.mayStore() \|\| SIInstrInfo::isAtomicRet(MI: Inst))) {
2869	Events.insert(Event: VMW_GPR_LOCK);
2870	}
2871	} else if (TII.isSMRD(MI: Inst)) {
2872	if (ST.hasWaitXcnt())
2873	Events.insert(Event: SMEM_GROUP);
2874	Events.insert(Event: SMEM_ACCESS);
2875	} else if (SIInstrInfo::isLDSDIR(MI: Inst)) {
2876	Events.insert(Event: EXP_LDS_ACCESS);
2877	} else if (SIInstrInfo::isEXP(MI: Inst)) {
2878	unsigned Imm = TII.getNamedOperand(MI: Inst, OperandName: AMDGPU::OpName::tgt)->getImm();
2879	if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2880	Events.insert(Event: EXP_PARAM_ACCESS);
2881	else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2882	Events.insert(Event: EXP_POS_ACCESS);
2883	else
2884	Events.insert(Event: EXP_GPR_LOCK);
2885	} else if (SIInstrInfo::isSBarrierSCCWrite(Opcode: Inst.getOpcode())) {
2886	Events.insert(Event: SCC_WRITE);
2887	} else {
2888	switch (Inst.getOpcode()) {
2889	case AMDGPU::S_SENDMSG:
2890	case AMDGPU::S_SENDMSG_RTN_B32:
2891	case AMDGPU::S_SENDMSG_RTN_B64:
2892	case AMDGPU::S_SENDMSGHALT:
2893	Events.insert(Event: SQ_MESSAGE);
2894	break;
2895	case AMDGPU::S_MEMTIME:
2896	case AMDGPU::S_MEMREALTIME:
2897	case AMDGPU::S_GET_BARRIER_STATE_M0:
2898	case AMDGPU::S_GET_BARRIER_STATE_IMM:
2899	Events.insert(Event: SMEM_ACCESS);
2900	break;
2901	}
2902	}
2903	return Events;
2904	}
2905
2906	void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2907	WaitcntBrackets *ScoreBrackets) {
2908
2909	WaitEventSet InstEvents = getEventsFor(Inst);
2910	for (WaitEventType E : wait_events()) {
2911	if (InstEvents.contains(Event: E))
2912	ScoreBrackets->updateByEvent(E, Inst);
2913	}
2914
2915	if (TII.isDS(MI: Inst) && TII.usesLGKM_CNT(MI: Inst)) {
2916	if (TII.isAlwaysGDS(Opcode: Inst.getOpcode()) \|\|
2917	TII.hasModifiersSet(MI: Inst, OpName: AMDGPU::OpName::gds)) {
2918	ScoreBrackets->setPendingGDS();
2919	}
2920	} else if (TII.isFLAT(MI: Inst)) {
2921	if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(MI: Inst) &&
2922	TII.mayAccessLDSThroughFlat(MI: Inst) && !SIInstrInfo::isLDSDMA(MI: Inst))
2923	// Async/LDSDMA operations have FLAT encoding but do not actually use flat
2924	// pointers. They do have two operands that each access global and LDS,
2925	// thus making it appear at this point that they are using a flat pointer.
2926	// Filter them out, and for the rest, generate a dependency on flat
2927	// pointers so that both VM and LGKM counters are flushed.
2928	ScoreBrackets->setPendingFlat();
2929	} else if (Inst.isCall()) {
2930	// Act as a wait on everything
2931	ScoreBrackets->applyWaitcnt(Wait: WCG ->getAllZeroWaitcnt(/IncludeVSCnt=/false));
2932	ScoreBrackets->setStateOnFunctionEntryOrReturn();
2933	} else if (TII.isVINTERP(MI: Inst)) {
2934	int64_t Imm = TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::waitexp)->getImm();
2935	ScoreBrackets->applyWaitcnt(T: EXP_CNT, Count: Imm);
2936	}
2937	}
2938
2939	bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2940	unsigned OtherScore) {
2941	unsigned MyShifted = Score <= M.OldLB ? `0` : Score + M.MyShift;
2942	unsigned OtherShifted =
2943	OtherScore <= M.OtherLB ? `0` : OtherScore + M.OtherShift;
2944	Score = std::max(a: MyShifted, b: OtherShifted);
2945	return OtherShifted > MyShifted;
2946	}
2947
2948	bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
2949	ArrayRef<CounterValueArray> OtherMarks) {
2950	bool StrictDom = false;
2951
2952	LLVM_DEBUG(dbgs() << "Merging async marks ...");
2953	// Early exit: both empty
2954	if (AsyncMarks.empty() && OtherMarks.empty()) {
2955	LLVM_DEBUG(dbgs() << " nothing to merge\n");
2956	return false;
2957	}
2958	LLVM_DEBUG(dbgs() << `'\n'`);
2959
2960	// Determine maximum length needed after merging
2961	auto MaxSize = (unsigned)std::max(a: AsyncMarks.size(), b: OtherMarks.size());
2962	MaxSize = std::min(a: MaxSize, b: MaxAsyncMarks);
2963
2964	// Keep only the most recent marks within our limit.
2965	if (AsyncMarks.size() > MaxSize)
2966	AsyncMarks.erase(CS: AsyncMarks.begin(),
2967	CE: AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2968
2969	// Pad with zero-filled marks if our list is shorter. Zero represents "no
2970	// pending async operations at this checkpoint" and acts as the identity
2971	// element for max() during merging. We pad at the beginning since the marks
2972	// need to be aligned in most-recent order.
2973	constexpr CounterValueArray ZeroMark{};
2974	AsyncMarks.insert(I: AsyncMarks.begin(), NumToInsert: MaxSize - AsyncMarks.size(), Elt: ZeroMark);
2975
2976	LLVM_DEBUG({
2977	dbgs() << "Before merge:\n";
2978	for (const auto &Mark : AsyncMarks) {
2979	llvm::interleaveComma(Mark, dbgs());
2980	dbgs() << `'\n'`;
2981	}
2982	dbgs() << "Other marks:\n";
2983	for (const auto &Mark : OtherMarks) {
2984	llvm::interleaveComma(Mark, dbgs());
2985	dbgs() << `'\n'`;
2986	}
2987	});
2988
2989	// Merge element-wise using the existing mergeScore function and the
2990	// appropriate MergeInfo for each counter type. Iterate only while we have
2991	// elements in both vectors.
2992	unsigned OtherSize = OtherMarks.size();
2993	unsigned OurSize = AsyncMarks.size();
2994	unsigned MergeCount = std::min(a: OtherSize, b: OurSize);
2995	for (auto Idx : seq_inclusive<unsigned>(Begin: `1`, End: MergeCount)) {
2996	for (auto T : inst_counter_types(MaxCounter: Context->MaxCounter)) {
2997	StrictDom \|= mergeScore(M: MergeInfos [T], Score&: AsyncMarks [OurSize - Idx][T],
2998	OtherScore: OtherMarks [OtherSize - Idx][T]);
2999	}
3000	}
3001
3002	LLVM_DEBUG({
3003	dbgs() << "After merge:\n";
3004	for (const auto &Mark : AsyncMarks) {
3005	llvm::interleaveComma(Mark, dbgs());
3006	dbgs() << `'\n'`;
3007	}
3008	});
3009
3010	return StrictDom;
3011	}
3012
3013	/// Merge the pending events and associater score brackets of \p Other into
3014	/// this brackets status.
3015	///
3016	/// Returns whether the merge resulted in a change that requires tighter waits
3017	/// (i.e. the merged brackets strictly dominate the original brackets).
3018	bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
3019	bool StrictDom = false;
3020
3021	// Check if "other" has keys we don't have, and create default entries for
3022	// those. If they remain empty after merging, we will clean it up after.
3023	for (auto K : Other.VMem.keys())
3024	VMem.try_emplace(Key: K);
3025	for (auto K : Other.SGPRs.keys())
3026	SGPRs.try_emplace(Key: K);
3027
3028	// Array to store MergeInfo for each counter type
3029	MergeInfo MergeInfos[NUM_INST_CNTS];
3030
3031	for (auto T : inst_counter_types(MaxCounter: Context->MaxCounter)) {
3032	// Merge event flags for this counter
3033	const WaitEventSet &EventsForT = Context->getWaitEvents(T);
3034	const WaitEventSet OldEvents = PendingEvents & EventsForT;
3035	const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
3036	if (!OldEvents.contains(Other: OtherEvents))
3037	StrictDom = true;
3038	PendingEvents \|= OtherEvents;
3039
3040	// Merge scores for this counter
3041	const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
3042	const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
3043	const unsigned NewUB = ScoreLBs[T] + std::max(a: MyPending, b: OtherPending);
3044	if (NewUB < ScoreLBs[T])
3045	report_fatal_error(reason: "waitcnt score overflow");
3046
3047	MergeInfo &M = MergeInfos[T];
3048	M.OldLB = ScoreLBs[T];
3049	M.OtherLB = Other.ScoreLBs[T];
3050	M.MyShift = NewUB - ScoreUBs[T];
3051	M.OtherShift = NewUB - Other.ScoreUBs[T];
3052
3053	ScoreUBs[T] = NewUB;
3054
3055	StrictDom \|= mergeScore(M, Score&: LastFlat[T], OtherScore: Other.LastFlat[T]);
3056
3057	if (T == DS_CNT)
3058	StrictDom \|= mergeScore(M, Score&: LastGDS, OtherScore: Other.LastGDS);
3059
3060	if (T == KM_CNT) {
3061	StrictDom \|= mergeScore(M, Score&: SCCScore, OtherScore: Other.SCCScore);
3062	if (Other.hasPendingEvent(E: SCC_WRITE)) {
3063	if (!OldEvents.contains(Event: SCC_WRITE)) {
3064	PendingSCCWrite = Other.PendingSCCWrite;
3065	} else if (PendingSCCWrite != Other.PendingSCCWrite) {
3066	PendingSCCWrite = nullptr;
3067	}
3068	}
3069	}
3070
3071	for (auto &[RegID, Info] : VMem)
3072	StrictDom \|= mergeScore(M, Score&: Info.Scores [T], OtherScore: Other.getVMemScore(TID: RegID, T));
3073
3074	if (isSmemCounter(T)) {
3075	unsigned Idx = getSgprScoresIdx(T);
3076	for (auto &[RegID, Info] : SGPRs) {
3077	auto It = Other.SGPRs.find(Val: RegID);
3078	unsigned OtherScore =
3079	(It != Other.SGPRs.end()) ? It ->second.Scores [Idx] : `0`;
3080	StrictDom \|= mergeScore(M, Score&: Info.Scores [Idx], OtherScore);
3081	}
3082	}
3083	}
3084
3085	for (auto &[TID, Info] : VMem) {
3086	if (auto It = Other.VMem.find(Val: TID); It != Other.VMem.end()) {
3087	unsigned char NewVmemTypes = Info.VMEMTypes \| It ->second.VMEMTypes;
3088	StrictDom \|= NewVmemTypes != Info.VMEMTypes;
3089	Info.VMEMTypes = NewVmemTypes;
3090	}
3091	}
3092
3093	StrictDom \|= mergeAsyncMarks(MergeInfos, OtherMarks: Other.AsyncMarks);
3094	for (auto T : inst_counter_types(MaxCounter: Context->MaxCounter))
3095	StrictDom \|= mergeScore(M: MergeInfos[T], Score&: AsyncScore [T], OtherScore: Other.AsyncScore [T]);
3096
3097	purgeEmptyTrackingData();
3098	return StrictDom;
3099	}
3100
3101	static bool isWaitInstr(MachineInstr &Inst) {
3102	unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Inst.getOpcode());
3103	return Opcode == AMDGPU::S_WAITCNT \|\|
3104	(Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(i: `0`).isReg() &&
3105	Inst.getOperand(i: `0`).getReg() == AMDGPU::SGPR_NULL) \|\|
3106	Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT \|\|
3107	Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT \|\|
3108	Opcode == AMDGPU::S_WAITCNT_lds_direct \|\|
3109	Opcode == AMDGPU::WAIT_ASYNCMARK \|\|
3110	counterTypeForInstr(Opcode).has_value();
3111	}
3112
3113	void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
3114	MachineBasicBlock::iterator I,
3115	bool ExpertMode) const {
3116	const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
3117	Values: AMDGPU::Hwreg::ID_SCHED_MODE, Values: AMDGPU::Hwreg::HwregOffset::Default, Values: `2`);
3118	BuildMI(BB&: MBB, I, MIMD: DebugLoc (), MCID: TII.get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
3119	.addImm(Val: ExpertMode ? `2` : `0`)
3120	.addImm(Val: EncodedReg);
3121	}
3122
3123	namespace {
3124	// TODO: Remove this work-around after fixing the scheduler.
3125	// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()
3126	// and ST.partialVCCWritesUpdateVCCZ().
3127	// i. VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may
3128	// corrupt vccz bit, so when we detect that an instruction may read from
3129	// a corrupt vccz bit, we need to:
3130	// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
3131	// operations to complete.
3132	// 2. Recompute the correct value of vccz by writing the current value
3133	// of vcc back to vcc.
3134	// ii. Partial writes to vcc don't update vccz, so we need to recompute the
3135	// correct value of vccz by reading vcc and writing it back to vcc.
3136	// No waitcnt is needed in this case.
3137	class VCCZWorkaround {
3138	const WaitcntBrackets &ScoreBrackets;
3139	const GCNSubtarget &ST;
3140	const SIInstrInfo &TII;
3141	const SIRegisterInfo &TRI;
3142	bool VCCZCorruptionBug = false;
3143	bool VCCZNotUpdatedByPartialWrites = false;
3144	/// vccz could be incorrect at a basic block boundary if a predecessor wrote
3145	/// to vcc and then issued an smem load, so initialize to true.
3146	bool MustRecomputeVCCZ = true;
3147
3148	public:
3149	VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,
3150	const SIInstrInfo &TII, const SIRegisterInfo &TRI)
3151	: ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {
3152	VCCZCorruptionBug = ST.hasReadVCCZBug();
3153	VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();
3154	}
3155	/// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,
3156	/// then emit a vccz recompute instruction before \p MI. This needs to be
3157	/// called on every instruction in the basic block because it also tracks the
3158	/// state and updates MustRecomputeVCCZ accordingly. Returns true if it
3159	/// modified the IR.
3160	bool tryRecomputeVCCZ(MachineInstr &MI) {
3161	// No need to run this if neither bug is present.
3162	if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
3163	return false;
3164
3165	// If MI is an SMEM and it can corrupt vccz on this target, then we need
3166	// both to emit a waitcnt and to recompute vccz.
3167	// But we don't actually emit a waitcnt here. This is done in
3168	// generateWaitcntInstBefore() because it tracks all the necessary waitcnt
3169	// state, and can either skip emitting a waitcnt if there is already one in
3170	// the IR, or emit an "optimized" combined waitcnt.
3171	// If this is an smem read, it could complete and clobber vccz at any time.
3172	MustRecomputeVCCZ \|= VCCZCorruptionBug && TII.isSMRD(MI);
3173
3174	// If the target partial vcc writes don't update vccz, and MI is such an
3175	// instruction then we must recompute vccz.
3176	// Note: We are using PartiallyWritesToVCCOpt optional to avoid calling
3177	// `definesRegister()` more than needed, because it's not very cheap.
3178	std::optional<bool> PartiallyWritesToVCCOpt;
3179	auto PartiallyWritesToVCC = [](MachineInstr &MI) {
3180	return MI.definesRegister(Reg: AMDGPU::VCC_LO, /TRI=/nullptr) \|\|
3181	MI.definesRegister(Reg: AMDGPU::VCC_HI, /TRI=/nullptr);
3182	};
3183	if (VCCZNotUpdatedByPartialWrites) {
3184	PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
3185	// If this is a partial VCC write but won't update vccz, then we must
3186	// recompute vccz.
3187	MustRecomputeVCCZ \|= *PartiallyWritesToVCCOpt;
3188	}
3189
3190	// If MI is a vcc write with no pending smem, or there is a pending smem
3191	// but the target does not suffer from the vccz corruption bug, then we
3192	// don't need to recompute vccz as this write will recompute it anyway.
3193	if (!ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS) \|\| !VCCZCorruptionBug) {
3194	// Compute PartiallyWritesToVCCOpt if we haven't done so already.
3195	if (!PartiallyWritesToVCCOpt)
3196	PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
3197	bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
3198	MI.definesRegister(Reg: AMDGPU::VCC, /TRI=/nullptr);
3199	// If we write to the full vcc or we write partially and the target
3200	// updates vccz on partial writes, then vccz will be updated correctly.
3201	bool UpdatesVCCZ = FullyWritesToVCC \|\| (!VCCZNotUpdatedByPartialWrites &&
3202	*PartiallyWritesToVCCOpt);
3203	if (UpdatesVCCZ)
3204	MustRecomputeVCCZ = false;
3205	}
3206
3207	// If MI is a branch that reads VCCZ then emit a waitcnt and a vccz
3208	// restore instruction if either is needed.
3209	if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {
3210	// Recompute the vccz bit. Any time a value is written to vcc, the vccz
3211	// bit is updated, so we can restore the bit by reading the value of vcc
3212	// and then writing it back to the register.
3213	BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(),
3214	MCID: TII.get(Opcode: ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
3215	DestReg: TRI.getVCC())
3216	.addReg(RegNo: TRI.getVCC());
3217	MustRecomputeVCCZ = false;
3218	return true;
3219	}
3220	return false;
3221	}
3222	};
3223
3224	} // namespace
3225
3226	// Generate s_waitcnt instructions where needed.
3227	bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
3228	MachineBasicBlock &Block,
3229	WaitcntBrackets &ScoreBrackets) {
3230	bool Modified = false;
3231
3232	LLVM_DEBUG({
3233	dbgs() << "*** Begin Block: ";
3234	Block.printName(dbgs());
3235	ScoreBrackets.dump();
3236	});
3237	VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);
3238
3239	// Walk over the instructions.
3240	MachineInstr OldWaitcntInstr = nullptr*;
3241
3242	// NOTE: We may append instrs after Inst while iterating.
3243	for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
3244	E = Block.instr_end();
3245	Iter != E; ++Iter) {
3246	MachineInstr &Inst = *Iter;
3247	if (Inst.isMetaInstruction())
3248	continue;
3249	// Track pre-existing waitcnts that were added in earlier iterations or by
3250	// the memory legalizer.
3251	if (isWaitInstr(Inst) \|\|
3252	(IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3253	if (!OldWaitcntInstr)
3254	OldWaitcntInstr = &Inst;
3255	continue;
3256	}
3257
3258	PreheaderFlushFlags FlushFlags;
3259	if (Block.getFirstTerminator() == Inst)
3260	FlushFlags = isPreheaderToFlush(MBB&: Block, ScoreBrackets);
3261
3262	// Generate an s_waitcnt instruction to be placed before Inst, if needed.
3263	Modified \|= generateWaitcntInstBefore(MI&: Inst, ScoreBrackets, OldWaitcntInstr,
3264	FlushFlags);
3265	OldWaitcntInstr = nullptr;
3266
3267	if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
3268	// FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
3269	//
3270	// Asyncmarks record the current wait state and so should not allow
3271	// waitcnts that occur after them to be merged into waitcnts that occur
3272	// before.
3273	assert(ST.getGeneration() < AMDGPUSubtarget::GFX12);
3274	ScoreBrackets.recordAsyncMark(Inst);
3275	continue;
3276	}
3277
3278	if (TII.isSMRD(MI: Inst)) {
3279	for (const MachineMemOperand *Memop : Inst.memoperands()) {
3280	// No need to handle invariant loads when avoiding WAR conflicts, as
3281	// there cannot be a vector store to the same memory location.
3282	if (!Memop->isInvariant()) {
3283	const Value *Ptr = Memop->getValue();
3284	SLoadAddresses.insert(KV: std::pair(Ptr, Inst.getParent()));
3285	}
3286	}
3287	}
3288
3289	updateEventWaitcntAfter(Inst, ScoreBrackets: &ScoreBrackets);
3290
3291	// Note: insertForcedWaitAfter() may add instrs after Iter that need to be
3292	// visited by the loop.
3293	Modified \|= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
3294
3295	LLVM_DEBUG({
3296	Inst.print(dbgs());
3297	ScoreBrackets.dump();
3298	});
3299
3300	// If the target suffers from the vccz bugs, this may emit the necessary
3301	// vccz recompute instruction before \p Inst if needed.
3302	Modified \|= VCCZW.tryRecomputeVCCZ(MI&: Inst);
3303	}
3304
3305	// Flush counters at the end of the block if needed (for preheaders with no
3306	// terminator).
3307	AMDGPU::Waitcnt Wait;
3308	if (Block.getFirstTerminator() == Block.end()) {
3309	PreheaderFlushFlags FlushFlags = isPreheaderToFlush(MBB&: Block, ScoreBrackets);
3310	if (FlushFlags.FlushVmCnt) {
3311	if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT))
3312	Wait.set(T: LOAD_CNT, Val: `0`);
3313	if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT))
3314	Wait.set(T: SAMPLE_CNT, Val: `0`);
3315	if (ScoreBrackets.hasPendingEvent(T: BVH_CNT))
3316	Wait.set(T: BVH_CNT, Val: `0`);
3317	}
3318	if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(T: DS_CNT))
3319	Wait.set(T: DS_CNT, Val: `0`);
3320	}
3321
3322	// Combine or remove any redundant waitcnts at the end of the block.
3323	Modified \|= generateWaitcnt(Wait, It: Block.instr_end(), Block, ScoreBrackets,
3324	OldWaitcntInstr);
3325
3326	LLVM_DEBUG({
3327	dbgs() << "*** End Block: ";
3328	Block.printName(dbgs());
3329	ScoreBrackets.dump();
3330	});
3331
3332	return Modified;
3333	}
3334
3335	bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
3336	if (Block.size() <= `1`)
3337	return false;
3338	// The Memory Legalizer conservatively inserts a soft xcnt before each
3339	// atomic RMW operation. However, for sequences of back-to-back atomic
3340	// RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3341	// the redundant soft xcnts.
3342	bool Modified = false;
3343	// Remember the last atomic with a soft xcnt right before it.
3344	MachineInstr LastAtomicWithSoftXcnt = nullptr*;
3345
3346	for (MachineInstr &MI : drop_begin(RangeOrContainer&: Block)) {
3347	// Ignore last atomic if non-LDS VMEM and SMEM.
3348	bool IsLDS =
3349	TII.isDS(MI) \|\| (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
3350	if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
3351	LastAtomicWithSoftXcnt = nullptr;
3352
3353	bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
3354	MI.mayLoad() && MI.mayStore();
3355	MachineInstr &PrevMI = *MI.getPrevNode();
3356	// This is an atomic with a soft xcnt.
3357	if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3358	// If we have already found an atomic with a soft xcnt, remove this soft
3359	// xcnt as it's redundant.
3360	if (LastAtomicWithSoftXcnt) {
3361	PrevMI.eraseFromParent();
3362	Modified = true;
3363	}
3364	LastAtomicWithSoftXcnt = &MI;
3365	}
3366	}
3367	return Modified;
3368	}
3369
3370	// Return flags indicating which counters should be flushed in the preheader.
3371	PreheaderFlushFlags
3372	SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3373	const WaitcntBrackets &ScoreBrackets) {
3374	auto [Iterator, IsInserted] =
3375	PreheadersToFlush.try_emplace(Key: &MBB, Args: PreheaderFlushFlags ());
3376	if (!IsInserted)
3377	return Iterator ->second;
3378
3379	MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3380	if (!Succ)
3381	return PreheaderFlushFlags ();
3382
3383	MachineLoop *Loop = MLI.getLoopFor(BB: Succ);
3384	if (!Loop)
3385	return PreheaderFlushFlags ();
3386
3387	if (Loop->getLoopPreheader() == &MBB) {
3388	Iterator ->second = getPreheaderFlushFlags(ML: Loop, Brackets: ScoreBrackets);
3389	return Iterator ->second;
3390	}
3391
3392	return PreheaderFlushFlags ();
3393	}
3394
3395	bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3396	if (SIInstrInfo::isFLAT(MI))
3397	return TII.mayAccessVMEMThroughFlat(MI);
3398	return SIInstrInfo::isVMEM(MI);
3399	}
3400
3401	bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3402	return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3403	}
3404
3405	// Check if instruction is a store to LDS that is counted via DSCNT
3406	// (where that counter exists).
3407	bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3408	return MI.mayStore() && SIInstrInfo::isDS(MI);
3409	}
3410
3411	// Return flags indicating which counters should be flushed in the preheader of
3412	// the given loop. We currently decide to flush in the following situations:
3413	// For VMEM (FlushVmCnt):
3414	// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3415	// vgpr containing a value that is loaded outside of the loop. (Only on
3416	// targets with no vscnt counter).
3417	// 2. The loop contains vmem load(s), but the loaded values are not used in the
3418	// loop, and at least one use of a vgpr containing a value that is loaded
3419	// outside of the loop.
3420	// For DS (FlushDsCnt, GFX12+ only):
3421	// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3422	// a value that is DS read outside of the loop.
3423	// 4. The loop contains DS read(s), loaded values are not used in the same
3424	// iteration but in the next iteration (prefetch pattern), and at least one
3425	// use of a vgpr containing a value that is DS read outside of the loop.
3426	// Flushing in preheader reduces wait overhead if the wait requirement in
3427	// iteration 1 would otherwise be more strict (but unfortunately preheader
3428	// flush decision is taken before knowing that).
3429	// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
3430	// tracking. Some DS reads may be used in the same iteration (creating
3431	// "flush points"), but others remain unflushed at the backedge. When a DS
3432	// read is consumed in the same iteration, it and all prior reads are
3433	// "flushed" (FIFO order). No DS writes are allowed in the loop.
3434	// TODO: Find a way to extend to multi-block loops.
3435	PreheaderFlushFlags
3436	SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
3437	const WaitcntBrackets &Brackets) {
3438	PreheaderFlushFlags Flags;
3439	bool HasVMemLoad = false;
3440	bool HasVMemStore = false;
3441	bool UsesVgprVMEMLoadedOutside = false;
3442	bool UsesVgprDSReadOutside = false;
3443	bool VMemInvalidated = false;
3444	// DS optimization only applies to GFX12+ where DS_CNT is separate.
3445	// Tracking status for "no DS read in loop" or "pure DS prefetch
3446	// (use only in next iteration)".
3447	bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();
3448	DenseSet<MCRegUnit> VgprUse;
3449	DenseSet<MCRegUnit> VgprDefVMEM;
3450	DenseSet<MCRegUnit> VgprDefDS;
3451
3452	// Track DS reads for prefetch pattern with flush points (single-block only).
3453	// Keeps track of the last DS read (position counted from the top of the loop)
3454	// to each VGPR. Read is considered consumed (and thus needs flushing) if
3455	// the dest register has a use or is overwritten (by any later opertions).
3456	DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3457	unsigned DSReadPosition = `0`;
3458	bool IsSingleBlock = ML->getNumBlocks() == `1`;
3459	bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;
3460	unsigned LastDSFlushPosition = `0`;
3461
3462	for (MachineBasicBlock *MBB : ML->blocks()) {
3463	for (MachineInstr &MI : *MBB) {
3464	if (isVMEMOrFlatVMEM(MI)) {
3465	HasVMemLoad \|= MI.mayLoad();
3466	HasVMemStore \|= MI.mayStore();
3467	}
3468	// TODO: Can we relax DSStore check? There may be cases where
3469	// these DS stores are drained prior to the end of MBB (or loop).
3470	if (mayStoreIncrementingDSCNT(MI)) {
3471	// Early exit if none of the optimizations are feasible.
3472	// Otherwise, set tracking status appropriately and continue.
3473	if (VMemInvalidated)
3474	return Flags;
3475	TrackSimpleDSOpt = false;
3476	TrackDSFlushPoint = false;
3477	}
3478	bool IsDSRead = isDSRead(MI);
3479	if (IsDSRead)
3480	++DSReadPosition;
3481
3482	// Helper: if RU has a pending DS read, update LastDSFlushPosition
3483	auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3484	if (!TrackDSFlushPoint)
3485	return;
3486	if (auto It = LastDSReadPositionMap.find(Val: RU);
3487	It != LastDSReadPositionMap.end()) {
3488	// RU defined by DSRead is used or overwritten. Need to complete
3489	// the read, if not already implied by a later DSRead (to any RU)
3490	// needing to complete in FIFO order.
3491	LastDSFlushPosition = std::max(a: LastDSFlushPosition, b: It ->second);
3492	}
3493	};
3494
3495	for (const MachineOperand &Op : MI.all_uses()) {
3496	if (Op.isDebug() \|\| !TRI.isVectorRegister(MRI, Reg: Op.getReg()))
3497	continue;
3498	// Vgpr use
3499	for (MCRegUnit RU : TRI.regunits(Reg: Op.getReg().asMCReg())) {
3500	// If we find a register that is loaded inside the loop, 1. and 2.
3501	// are invalidated.
3502	if (VgprDefVMEM.contains(V: RU))
3503	VMemInvalidated = true;
3504
3505	// Check for DS reads used inside the loop
3506	if (VgprDefDS.contains(V: RU))
3507	TrackSimpleDSOpt = false;
3508
3509	// Early exit if all optimizations are invalidated
3510	if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3511	return Flags;
3512
3513	// Check for flush points (DS read used in same iteration)
3514	updateDSReadFlushTracking (RU);
3515
3516	VgprUse.insert(V: RU);
3517	// Check if this register has a pending VMEM load from outside the
3518	// loop (value loaded outside and used inside).
3519	VMEMID ID = toVMEMID(RU);
3520	if (Brackets.hasPendingVMEM(ID, T: LOAD_CNT) \|\|
3521	Brackets.hasPendingVMEM(ID, T: SAMPLE_CNT) \|\|
3522	Brackets.hasPendingVMEM(ID, T: BVH_CNT))
3523	UsesVgprVMEMLoadedOutside = true;
3524	// Check if loaded outside the loop via DS (not VMEM/FLAT).
3525	// Only consider it a DS read if there's no pending VMEM load for
3526	// this register, since FLAT can set both counters.
3527	else if (Brackets.hasPendingVMEM(ID, T: DS_CNT))
3528	UsesVgprDSReadOutside = true;
3529	}
3530	}
3531
3532	// VMem load vgpr def
3533	if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3534	for (const MachineOperand &Op : MI.all_defs()) {
3535	for (MCRegUnit RU : TRI.regunits(Reg: Op.getReg().asMCReg())) {
3536	// If we find a register that is loaded inside the loop, 1. and 2.
3537	// are invalidated.
3538	if (VgprUse.contains(V: RU))
3539	VMemInvalidated = true;
3540	VgprDefVMEM.insert(V: RU);
3541	}
3542	}
3543	// Early exit if all optimizations are invalidated
3544	if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3545	return Flags;
3546	}
3547
3548	// DS read vgpr def
3549	// Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo).
3550	// If USE comes before DEF, it's the prefetch pattern (use value from
3551	// previous iteration, read for next iteration). We should still flush
3552	// in preheader so iteration 1 doesn't need to wait inside the loop.
3553	// Only invalidate when DEF comes before USE (same-iteration consumption,
3554	// checked above when processing uses).
3555	if (IsDSRead \|\| TrackDSFlushPoint) {
3556	for (const MachineOperand &Op : MI.all_defs()) {
3557	if (!TRI.isVectorRegister(MRI, Reg: Op.getReg()))
3558	continue;
3559	for (MCRegUnit RU : TRI.regunits(Reg: Op.getReg().asMCReg())) {
3560	// Check for overwrite of pending DS read (flush point) by any
3561	// instruction
3562	updateDSReadFlushTracking (RU);
3563	if (IsDSRead) {
3564	VgprDefDS.insert(V: RU);
3565	if (TrackDSFlushPoint)
3566	LastDSReadPositionMap [RU] = DSReadPosition;
3567	}
3568	}
3569	}
3570	}
3571	}
3572	}
3573
3574	// VMEM flush decision
3575	if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3576	((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) \|\|
3577	(HasVMemLoad && ST.hasVmemWriteVgprInOrder())))
3578	Flags.FlushVmCnt = true;
3579
3580	// DS flush decision:
3581	// Simple DS Opt: flush if loop uses DS read values from outside
3582	// and either has no DS reads in the loop, or DS reads whose results
3583	// are not used in the loop.
3584	bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3585	// Prefetch with flush points: some DS reads used in same iteration,
3586	// but unflushed reads remain at backedge
3587	bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3588	bool DSFlushPointPrefetch =
3589	TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3590
3591	if (SimpleDSOpt \|\| DSFlushPointPrefetch)
3592	Flags.FlushDsCnt = true;
3593
3594	return Flags;
3595	}
3596
3597	bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3598	auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3599	auto &PDT =
3600	getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3601	AliasAnalysis AA = nullptr*;
3602	if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3603	AA = &AAR->getAAResults();
3604
3605	return SIInsertWaitcnts (MLI, PDT, AA, MF).run();
3606	}
3607
3608	PreservedAnalyses
3609	SIInsertWaitcntsPass::run(MachineFunction &MF,
3610	MachineFunctionAnalysisManager &MFAM) {
3611	auto &MLI = MFAM.getResult<MachineLoopAnalysis>(IR&: MF);
3612	auto &PDT = MFAM.getResult<MachinePostDominatorTreeAnalysis>(IR&: MF);
3613	auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(IR&: MF)
3614	.getManager()
3615	.getCachedResult<AAManager>(IR&: MF.getFunction());
3616
3617	if (!SIInsertWaitcnts (MLI, PDT, AA, MF).run())
3618	return PreservedAnalyses::all();
3619
3620	return getMachineFunctionPassPreservedAnalyses()
3621	.preserveSet<CFGAnalyses>()
3622	.preserve<AAManager>();
3623	}
3624
3625	bool SIInsertWaitcnts::run() {
3626	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3627
3628	AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU());
3629
3630	// Initialize hardware limits first, as they're needed by the generators.
3631	Limits = AMDGPU::HardwareLimits (IV);
3632
3633	if (ST.hasExtendedWaitCounts()) {
3634	IsExpertMode = ST.hasExpertSchedulingMode() &&
3635	(ExpertSchedulingModeFlag.getNumOccurrences()
3636	? ExpertSchedulingModeFlag
3637	: MF.getFunction()
3638	.getFnAttribute(Kind: "amdgpu-expert-scheduling-mode")
3639	.getValueAsBool());
3640	MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
3641	// Initialize WCG per MF. It contains state that depends on MF attributes.
3642	WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(args&: MF, args&: MaxCounter, args&: Limits,
3643	args&: IsExpertMode);
3644	} else {
3645	MaxCounter = NUM_NORMAL_INST_CNTS;
3646	// Initialize WCG per MF. It contains state that depends on MF attributes.
3647	WCG = std::make_unique<WaitcntGeneratorPreGFX12>(args&: MF, args: NUM_NORMAL_INST_CNTS,
3648	args&: Limits);
3649	}
3650
3651	SmemAccessCounter = getCounterFromEvent(E: SMEM_ACCESS);
3652
3653	bool Modified = false;
3654
3655	MachineBasicBlock &EntryBB = MF.front();
3656
3657	if (!MFI->isEntryFunction()) {
3658	// Wait for any outstanding memory operations that the input registers may
3659	// depend on. We can't track them and it's better to do the wait after the
3660	// costly call sequence.
3661
3662	// TODO: Could insert earlier and schedule more liberally with operations
3663	// that only use caller preserved registers.
3664	MachineBasicBlock::iterator I = EntryBB.begin();
3665	while (I != EntryBB.end() && I ->isMetaInstruction())
3666	++I;
3667
3668	if (ST.hasExtendedWaitCounts()) {
3669	BuildMI(BB&: EntryBB, I, MIMD: DebugLoc (), MCID: TII.get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT))
3670	.addImm(Val: `0`);
3671	for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
3672	if (CT == LOAD_CNT \|\| CT == DS_CNT \|\| CT == STORE_CNT \|\| CT == X_CNT)
3673	continue;
3674
3675	if (!ST.hasImageInsts() &&
3676	(CT == EXP_CNT \|\| CT == SAMPLE_CNT \|\| CT == BVH_CNT))
3677	continue;
3678
3679	BuildMI(BB&: EntryBB, I, MIMD: DebugLoc (),
3680	MCID: TII.get(Opcode: instrsForExtendedCounterTypes[CT]))
3681	.addImm(Val: `0`);
3682	}
3683	if (IsExpertMode) {
3684	unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: `0`, STI: ST);
3685	Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Encoded: Enc, VmVsrc: `0`);
3686	BuildMI(BB&: EntryBB, I, MIMD: DebugLoc (), MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3687	.addImm(Val: Enc);
3688	}
3689	} else {
3690	BuildMI(BB&: EntryBB, I, MIMD: DebugLoc (), MCID: TII.get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: `0`);
3691	}
3692
3693	auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(args: this);
3694	NonKernelInitialState ->setStateOnFunctionEntryOrReturn();
3695	BlockInfos [&EntryBB].Incoming = std::move(NonKernelInitialState);
3696
3697	Modified = true;
3698	}
3699
3700	// Keep iterating over the blocks in reverse post order, inserting and
3701	// updating s_waitcnt where needed, until a fix point is reached.
3702	for (auto MBB : ReversePostOrderTraversal<MachineFunction >(&MF))
3703	BlockInfos.try_emplace(Key: MBB);
3704
3705	std::unique_ptr<WaitcntBrackets> Brackets;
3706	bool Repeat;
3707	do {
3708	Repeat = false;
3709
3710	for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3711	++BII) {
3712	MachineBasicBlock *MBB = BII->first;
3713	BlockInfo &BI = BII->second;
3714	if (!BI.Dirty)
3715	continue;
3716
3717	if (BI.Incoming) {
3718	if (!Brackets)
3719	Brackets = std::make_unique<WaitcntBrackets>(args&: *BI.Incoming);
3720	else
3721	Brackets = BI.Incoming;
3722	} else {
3723	if (!Brackets) {
3724	Brackets = std::make_unique<WaitcntBrackets>(args: this);
3725	} else {
3726	// Reinitialize in-place. N.B. do not do this by assigning from a
3727	// temporary because the WaitcntBrackets class is large and it could
3728	// cause this function to use an unreasonable amount of stack space.
3729	Brackets ->~WaitcntBrackets();
3730	new (Brackets.get()) WaitcntBrackets (this);
3731	}
3732	}
3733
3734	if (ST.hasWaitXcnt())
3735	Modified \|= removeRedundantSoftXcnts(Block&: *MBB);
3736	Modified \|= insertWaitcntInBlock(MF, Block&: MBB, ScoreBrackets&: Brackets);
3737	BI.Dirty = false;
3738
3739	if (Brackets ->hasPendingEvent()) {
3740	BlockInfo MoveBracketsToSucc = nullptr*;
3741	for (MachineBasicBlock *Succ : MBB->successors()) {
3742	auto *SuccBII = BlockInfos.find(Key: Succ);
3743	BlockInfo &SuccBI = SuccBII->second;
3744	if (!SuccBI.Incoming) {
3745	SuccBI.Dirty = true;
3746	if (SuccBII <= BII) {
3747	LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
3748	Repeat = true;
3749	}
3750	if (!MoveBracketsToSucc) {
3751	MoveBracketsToSucc = &SuccBI;
3752	} else {
3753	SuccBI.Incoming = std::make_unique<WaitcntBrackets>(args&: *Brackets);
3754	}
3755	} else {
3756	LLVM_DEBUG({
3757	dbgs() << "Try to merge ";
3758	MBB->printName(dbgs());
3759	dbgs() << " into ";
3760	Succ->printName(dbgs());
3761	dbgs() << `'\n'`;
3762	});
3763	if (SuccBI.Incoming ->merge(Other: *Brackets)) {
3764	SuccBI.Dirty = true;
3765	if (SuccBII <= BII) {
3766	LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
3767	Repeat = true;
3768	}
3769	}
3770	}
3771	}
3772	if (MoveBracketsToSucc)
3773	MoveBracketsToSucc->Incoming = std::move(Brackets);
3774	}
3775	}
3776	} while (Repeat);
3777
3778	if (ST.hasScalarStores()) {
3779	SmallVector<MachineBasicBlock *, `4`> EndPgmBlocks;
3780	bool HaveScalarStores = false;
3781
3782	for (MachineBasicBlock &MBB : MF) {
3783	for (MachineInstr &MI : MBB) {
3784	if (!HaveScalarStores && TII.isScalarStore(MI))
3785	HaveScalarStores = true;
3786
3787	if (MI.getOpcode() == AMDGPU::S_ENDPGM \|\|
3788	MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3789	EndPgmBlocks.push_back(Elt: &MBB);
3790	}
3791	}
3792
3793	if (HaveScalarStores) {
3794	// If scalar writes are used, the cache must be flushed or else the next
3795	// wave to reuse the same scratch memory can be clobbered.
3796	//
3797	// Insert s_dcache_wb at wave termination points if there were any scalar
3798	// stores, and only if the cache hasn't already been flushed. This could
3799	// be improved by looking across blocks for flushes in postdominating
3800	// blocks from the stores but an explicitly requested flush is probably
3801	// very rare.
3802	for (MachineBasicBlock *MBB : EndPgmBlocks) {
3803	bool SeenDCacheWB = false;
3804
3805	for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
3806	I != E; ++I) {
3807	if (I ->getOpcode() == AMDGPU::S_DCACHE_WB)
3808	SeenDCacheWB = true;
3809	else if (TII.isScalarStore(MI: *I))
3810	SeenDCacheWB = false;
3811
3812	// FIXME: It would be better to insert this before a waitcnt if any.
3813	if ((I ->getOpcode() == AMDGPU::S_ENDPGM \|\|
3814	I ->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3815	!SeenDCacheWB) {
3816	Modified = true;
3817	BuildMI(BB&: *MBB, I, MIMD: I ->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_DCACHE_WB));
3818	}
3819	}
3820	}
3821	}
3822	}
3823
3824	if (IsExpertMode) {
3825	// Enable expert scheduling on function entry. To satisfy ABI requirements
3826	// and to allow calls between function with different expert scheduling
3827	// settings, disable it around calls and before returns.
3828
3829	MachineBasicBlock::iterator I = EntryBB.begin();
3830	while (I != EntryBB.end() && I ->isMetaInstruction())
3831	++I;
3832	setSchedulingMode(MBB&: EntryBB, I, ExpertMode: true);
3833
3834	for (MachineInstr *MI : CallInsts) {
3835	MachineBasicBlock &MBB = *MI->getParent();
3836	setSchedulingMode(MBB, I: MI, ExpertMode: false);
3837	setSchedulingMode(MBB, I: std::next(x: MI->getIterator()), ExpertMode: true);
3838	}
3839
3840	for (MachineInstr *MI : ReturnInsts)
3841	setSchedulingMode(MBB&: MI->getParent(), I: MI, ExpertMode: false*);
3842
3843	Modified = true;
3844	}
3845
3846	// Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3847	// This is done in different ways depending on how the VGPRs were allocated
3848	// (i.e. whether we're in dynamic VGPR mode or not).
3849	// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
3850	// waveslot limited kernel runs slower with the deallocation.
3851	if (!WCG ->isOptNone() && MFI->isDynamicVGPREnabled()) {
3852	for (auto [MI, _] : EndPgmInsts) {
3853	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3854	MCID: TII.get(Opcode: AMDGPU::S_ALLOC_VGPR))
3855	.addImm(Val: `0`);
3856	Modified = true;
3857	}
3858	} else if (!WCG ->isOptNone() &&
3859	ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
3860	(MF.getFrameInfo().hasCalls() \|\|
3861	ST.getOccupancyWithNumVGPRs(
3862	VGPRs: TRI.getNumUsedPhysRegs(MRI, RC: AMDGPU::VGPR_32RegClass),
3863	/IsDynamicVGPR=/DynamicVGPRBlockSize: false) <
3864	AMDGPU::IsaInfo::getMaxWavesPerEU(STI: &ST))) {
3865	for (auto [MI, Flag] : EndPgmInsts) {
3866	if (Flag) {
3867	if (ST.requiresNopBeforeDeallocVGPRs()) {
3868	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3869	MCID: TII.get(Opcode: AMDGPU::S_NOP))
3870	.addImm(Val: `0`);
3871	}
3872	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3873	MCID: TII.get(Opcode: AMDGPU::S_SENDMSG))
3874	.addImm(Val: AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
3875	Modified = true;
3876	}
3877	}
3878	}
3879
3880	return Modified;
3881	}
3882

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp