1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
28#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29#include "SIMachineFunctionInfo.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/PostOrderIterator.h"
33#include "llvm/ADT/Sequence.h"
34#include "llvm/Analysis/AliasAnalysis.h"
35#include "llvm/CodeGen/MachineLoopInfo.h"
36#include "llvm/CodeGen/MachinePassManager.h"
37#include "llvm/CodeGen/MachinePostDominators.h"
38#include "llvm/Support/DebugCounter.h"
39#include "llvm/TargetParser/TargetParser.h"
40using namespace llvm;
41
42#define DEBUG_TYPE "si-insert-waitcnts"
43
44DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
45 "Force emit s_waitcnt expcnt(0) instrs");
46DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
47 "Force emit s_waitcnt lgkmcnt(0) instrs");
48DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
49 "Force emit s_waitcnt vmcnt(0) instrs");
50
51static cl::opt<bool>
52 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
53 cl::desc("Force all waitcnt instrs to be emitted as "
54 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
55 cl::init(Val: false), cl::Hidden);
56
57static cl::opt<bool> ForceEmitZeroLoadFlag(
58 "amdgpu-waitcnt-load-forcezero",
59 cl::desc("Force all waitcnt load counters to wait until 0"),
60 cl::init(Val: false), cl::Hidden);
61
62namespace {
63// Class of object that encapsulates latest instruction counter score
64// associated with the operand. Used for determining whether
65// s_waitcnt instruction needs to be emitted.
66
67enum InstCounterType {
68 LOAD_CNT = 0, // VMcnt prior to gfx12.
69 DS_CNT, // LKGMcnt prior to gfx12.
70 EXP_CNT, //
71 STORE_CNT, // VScnt in gfx10/gfx11.
72 NUM_NORMAL_INST_CNTS,
73 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
74 BVH_CNT, // gfx12+ only.
75 KM_CNT, // gfx12+ only.
76 X_CNT, // gfx1250.
77 NUM_EXTENDED_INST_CNTS,
78 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
79};
80} // namespace
81
82namespace llvm {
83template <> struct enum_iteration_traits<InstCounterType> {
84 static constexpr bool is_iterable = true;
85};
86} // namespace llvm
87
88namespace {
89// Return an iterator over all counters between LOAD_CNT (the first counter)
90// and \c MaxCounter (exclusive, default value yields an enumeration over
91// all counters).
92auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
93 return enum_seq(Begin: LOAD_CNT, End: MaxCounter);
94}
95
96using RegInterval = std::pair<int, int>;
97
98struct HardwareLimits {
99 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
100 unsigned ExpcntMax;
101 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
102 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
103 unsigned SamplecntMax; // gfx12+ only.
104 unsigned BvhcntMax; // gfx12+ only.
105 unsigned KmcntMax; // gfx12+ only.
106 unsigned XcntMax; // gfx1250.
107};
108
109#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
110 DECL(VMEM_ACCESS) /* vmem read & write */ \
111 DECL(VMEM_READ_ACCESS) /* vmem read */ \
112 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
113 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
114 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
115 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
116 DECL(VMEM_GROUP) /* vmem group */ \
117 DECL(LDS_ACCESS) /* lds read & write */ \
118 DECL(GDS_ACCESS) /* gds read & write */ \
119 DECL(SQ_MESSAGE) /* send message */ \
120 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
121 DECL(SMEM_GROUP) /* scalar-memory group */ \
122 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
123 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
124 DECL(EXP_POS_ACCESS) /* write to export position */ \
125 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
126 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
127 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */
128
129// clang-format off
130#define AMDGPU_EVENT_ENUM(Name) Name,
131enum WaitEventType {
132 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
133 NUM_WAIT_EVENTS
134};
135#undef AMDGPU_EVENT_ENUM
136
137#define AMDGPU_EVENT_NAME(Name) #Name,
138static constexpr StringLiteral WaitEventTypeName[] = {
139 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
140};
141#undef AMDGPU_EVENT_NAME
142// clang-format on
143
144// The mapping is:
145// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
146// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
147// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
148// We reserve a fixed number of VGPR slots in the scoring tables for
149// special tokens like SCMEM_LDS (needed for buffer load to LDS).
150enum RegisterMapping {
151 SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets.
152 AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets.
153 SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets.
154 // Artificial register slots to track LDS writes into specific LDS locations
155 // if a location is known. When slots are exhausted or location is
156 // unknown use the first slot. The first slot is also always updated in
157 // addition to known location's slot to properly generate waits if dependent
158 // instruction's location is unknown.
159 FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
160 NUM_LDS_VGPRS = 9, // One more than the stores we track.
161 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
162};
163
164// Enumerate different types of result-returning VMEM operations. Although
165// s_waitcnt orders them all with a single vmcnt counter, in the absence of
166// s_waitcnt only instructions of the same VmemType are guaranteed to write
167// their results in order -- so there is no need to insert an s_waitcnt between
168// two instructions of the same type that write the same vgpr.
169enum VmemType {
170 // BUF instructions and MIMG instructions without a sampler.
171 VMEM_NOSAMPLER,
172 // MIMG instructions with a sampler.
173 VMEM_SAMPLER,
174 // BVH instructions
175 VMEM_BVH,
176 NUM_VMEM_TYPES
177};
178
179// Maps values of InstCounterType to the instruction that waits on that
180// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
181// returns true.
182static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
183 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
184 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
185 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
186
187static bool updateVMCntOnly(const MachineInstr &Inst) {
188 return (SIInstrInfo::isVMEM(MI: Inst) && !SIInstrInfo::isFLAT(MI: Inst)) ||
189 SIInstrInfo::isFLATGlobal(MI: Inst) || SIInstrInfo::isFLATScratch(MI: Inst);
190}
191
192#ifndef NDEBUG
193static bool isNormalMode(InstCounterType MaxCounter) {
194 return MaxCounter == NUM_NORMAL_INST_CNTS;
195}
196#endif // NDEBUG
197
198VmemType getVmemType(const MachineInstr &Inst) {
199 assert(updateVMCntOnly(Inst));
200 if (!SIInstrInfo::isImage(MI: Inst))
201 return VMEM_NOSAMPLER;
202 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Inst.getOpcode());
203 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
204 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
205
206 if (BaseInfo->BVH)
207 return VMEM_BVH;
208
209 // We have to make an additional check for isVSAMPLE here since some
210 // instructions don't have a sampler, but are still classified as sampler
211 // instructions for the purposes of e.g. waitcnt.
212 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(MI: Inst))
213 return VMEM_SAMPLER;
214
215 return VMEM_NOSAMPLER;
216}
217
218unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
219 switch (T) {
220 case LOAD_CNT:
221 return Wait.LoadCnt;
222 case EXP_CNT:
223 return Wait.ExpCnt;
224 case DS_CNT:
225 return Wait.DsCnt;
226 case STORE_CNT:
227 return Wait.StoreCnt;
228 case SAMPLE_CNT:
229 return Wait.SampleCnt;
230 case BVH_CNT:
231 return Wait.BvhCnt;
232 case KM_CNT:
233 return Wait.KmCnt;
234 case X_CNT:
235 return Wait.XCnt;
236 default:
237 llvm_unreachable("bad InstCounterType");
238 }
239}
240
241void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
242 unsigned &WC = getCounterRef(Wait, T);
243 WC = std::min(a: WC, b: Count);
244}
245
246void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
247 getCounterRef(Wait, T) = ~0u;
248}
249
250unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
251 return getCounterRef(Wait, T);
252}
253
254// Mapping from event to counter according to the table masks.
255InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
256 for (auto T : inst_counter_types()) {
257 if (masks[T] & (1 << E))
258 return T;
259 }
260 llvm_unreachable("event type has no associated counter");
261}
262
263// This objects maintains the current score brackets of each wait counter, and
264// a per-register scoreboard for each wait counter.
265//
266// We also maintain the latest score for every event type that can change the
267// waitcnt in order to know if there are multiple types of events within
268// the brackets. When multiple types of event happen in the bracket,
269// wait count may get decreased out of order, therefore we need to put in
270// "s_waitcnt 0" before use.
271class WaitcntBrackets {
272public:
273 WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
274 HardwareLimits Limits, const unsigned *WaitEventMaskForInst,
275 InstCounterType SmemAccessCounter)
276 : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
277 WaitEventMaskForInst(WaitEventMaskForInst),
278 SmemAccessCounter(SmemAccessCounter) {}
279
280 unsigned getWaitCountMax(InstCounterType T) const {
281 switch (T) {
282 case LOAD_CNT:
283 return Limits.LoadcntMax;
284 case DS_CNT:
285 return Limits.DscntMax;
286 case EXP_CNT:
287 return Limits.ExpcntMax;
288 case STORE_CNT:
289 return Limits.StorecntMax;
290 case SAMPLE_CNT:
291 return Limits.SamplecntMax;
292 case BVH_CNT:
293 return Limits.BvhcntMax;
294 case KM_CNT:
295 return Limits.KmcntMax;
296 case X_CNT:
297 return Limits.XcntMax;
298 default:
299 break;
300 }
301 return 0;
302 }
303
304 bool isSmemCounter(InstCounterType T) const {
305 return T == SmemAccessCounter || T == X_CNT;
306 }
307
308 unsigned getSgprScoresIdx(InstCounterType T) const {
309 assert(isSmemCounter(T) && "Invalid SMEM counter");
310 return T == X_CNT ? 1 : 0;
311 }
312
313 unsigned getScoreLB(InstCounterType T) const {
314 assert(T < NUM_INST_CNTS);
315 return ScoreLBs[T];
316 }
317
318 unsigned getScoreUB(InstCounterType T) const {
319 assert(T < NUM_INST_CNTS);
320 return ScoreUBs[T];
321 }
322
323 unsigned getScoreRange(InstCounterType T) const {
324 return getScoreUB(T) - getScoreLB(T);
325 }
326
327 unsigned getRegScore(int GprNo, InstCounterType T) const {
328 if (GprNo < NUM_ALL_VGPRS)
329 return VgprScores[T][GprNo];
330 return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
331 }
332
333 bool merge(const WaitcntBrackets &Other);
334
335 RegInterval getRegInterval(const MachineInstr *MI,
336 const MachineRegisterInfo *MRI,
337 const SIRegisterInfo *TRI,
338 const MachineOperand &Op) const;
339
340 bool counterOutOfOrder(InstCounterType T) const;
341 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
342 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
343
344 void determineWait(InstCounterType T, RegInterval Interval,
345 AMDGPU::Waitcnt &Wait) const;
346 void determineWait(InstCounterType T, int RegNo,
347 AMDGPU::Waitcnt &Wait) const {
348 determineWait(T, Interval: {RegNo, RegNo + 1}, Wait);
349 }
350
351 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
352 void applyWaitcnt(InstCounterType T, unsigned Count);
353 void applyXcnt(const AMDGPU::Waitcnt &Wait);
354 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
355 const MachineRegisterInfo *MRI, WaitEventType E,
356 MachineInstr &MI);
357
358 unsigned hasPendingEvent() const { return PendingEvents; }
359 unsigned hasPendingEvent(WaitEventType E) const {
360 return PendingEvents & (1 << E);
361 }
362 unsigned hasPendingEvent(InstCounterType T) const {
363 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
364 assert((HasPending != 0) == (getScoreRange(T) != 0));
365 return HasPending;
366 }
367
368 bool hasMixedPendingEvents(InstCounterType T) const {
369 unsigned Events = hasPendingEvent(T);
370 // Return true if more than one bit is set in Events.
371 return Events & (Events - 1);
372 }
373
374 bool hasPendingFlat() const {
375 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
376 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
377 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
378 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
379 }
380
381 void setPendingFlat() {
382 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
383 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
384 }
385
386 bool hasPendingGDS() const {
387 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
388 }
389
390 unsigned getPendingGDSWait() const {
391 return std::min(a: getScoreUB(T: DS_CNT) - LastGDS, b: getWaitCountMax(T: DS_CNT) - 1);
392 }
393
394 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
395
396 // Return true if there might be pending writes to the vgpr-interval by VMEM
397 // instructions with types different from V.
398 bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
399 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
400 assert(RegNo < NUM_ALL_VGPRS);
401 if (VgprVmemTypes[RegNo] & ~(1 << V))
402 return true;
403 }
404 return false;
405 }
406
407 void clearVgprVmemTypes(RegInterval Interval) {
408 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
409 assert(RegNo < NUM_ALL_VGPRS);
410 VgprVmemTypes[RegNo] = 0;
411 }
412 }
413
414 void setStateOnFunctionEntryOrReturn() {
415 setScoreUB(T: STORE_CNT, Val: getScoreUB(T: STORE_CNT) + getWaitCountMax(T: STORE_CNT));
416 PendingEvents |= WaitEventMaskForInst[STORE_CNT];
417 }
418
419 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
420 return LDSDMAStores;
421 }
422
423 bool hasPointSampleAccel(const MachineInstr &MI) const;
424 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
425 RegInterval Interval) const;
426
427 void print(raw_ostream &) const;
428 void dump() const { print(dbgs()); }
429
430private:
431 struct MergeInfo {
432 unsigned OldLB;
433 unsigned OtherLB;
434 unsigned MyShift;
435 unsigned OtherShift;
436 };
437 static bool mergeScore(const MergeInfo &M, unsigned &Score,
438 unsigned OtherScore);
439
440 void setScoreLB(InstCounterType T, unsigned Val) {
441 assert(T < NUM_INST_CNTS);
442 ScoreLBs[T] = Val;
443 }
444
445 void setScoreUB(InstCounterType T, unsigned Val) {
446 assert(T < NUM_INST_CNTS);
447 ScoreUBs[T] = Val;
448
449 if (T != EXP_CNT)
450 return;
451
452 if (getScoreRange(T: EXP_CNT) > getWaitCountMax(T: EXP_CNT))
453 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(T: EXP_CNT);
454 }
455
456 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
457 setScoreByInterval(Interval: {GprNo, GprNo + 1}, CntTy: T, Score: Val);
458 }
459
460 void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
461 unsigned Score);
462
463 void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
464 const MachineRegisterInfo *MRI,
465 const MachineOperand &Op, InstCounterType CntTy,
466 unsigned Val);
467
468 const GCNSubtarget *ST = nullptr;
469 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
470 HardwareLimits Limits = {};
471 const unsigned *WaitEventMaskForInst;
472 InstCounterType SmemAccessCounter;
473 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
474 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
475 unsigned PendingEvents = 0;
476 // Remember the last flat memory operation.
477 unsigned LastFlat[NUM_INST_CNTS] = {0};
478 // Remember the last GDS operation.
479 unsigned LastGDS = 0;
480 // wait_cnt scores for every vgpr.
481 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
482 int VgprUB = -1;
483 int SgprUB = -1;
484 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
485 // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
486 // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
487 // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
488 // X_CNT score.
489 unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
490 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
491 // write to each vgpr.
492 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
493 // Store representative LDS DMA operations. The only useful info here is
494 // alias info. One store is kept per unique AAInfo.
495 SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
496};
497
498// This abstracts the logic for generating and updating S_WAIT* instructions
499// away from the analysis that determines where they are needed. This was
500// done because the set of counters and instructions for waiting on them
501// underwent a major shift with gfx12, sufficiently so that having this
502// abstraction allows the main analysis logic to be simpler than it would
503// otherwise have had to become.
504class WaitcntGenerator {
505protected:
506 const GCNSubtarget *ST = nullptr;
507 const SIInstrInfo *TII = nullptr;
508 AMDGPU::IsaVersion IV;
509 InstCounterType MaxCounter;
510 bool OptNone;
511
512public:
513 WaitcntGenerator() = default;
514 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
515 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
516 IV(AMDGPU::getIsaVersion(GPU: ST->getCPU())), MaxCounter(MaxCounter),
517 OptNone(MF.getFunction().hasOptNone() ||
518 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
519
520 // Return true if the current function should be compiled with no
521 // optimization.
522 bool isOptNone() const { return OptNone; }
523
524 // Edits an existing sequence of wait count instructions according
525 // to an incoming Waitcnt value, which is itself updated to reflect
526 // any new wait count instructions which may need to be generated by
527 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
528 // were made.
529 //
530 // This editing will usually be merely updated operands, but it may also
531 // delete instructions if the incoming Wait value indicates they are not
532 // needed. It may also remove existing instructions for which a wait
533 // is needed if it can be determined that it is better to generate new
534 // instructions later, as can happen on gfx12.
535 virtual bool
536 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
537 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
538 MachineBasicBlock::instr_iterator It) const = 0;
539
540 // Transform a soft waitcnt into a normal one.
541 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
542
543 // Generates new wait count instructions according to the value of
544 // Wait, returning true if any new instructions were created.
545 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
546 MachineBasicBlock::instr_iterator It,
547 AMDGPU::Waitcnt Wait) = 0;
548
549 // Returns an array of bit masks which can be used to map values in
550 // WaitEventType to corresponding counter values in InstCounterType.
551 virtual const unsigned *getWaitEventMask() const = 0;
552
553 // Returns a new waitcnt with all counters except VScnt set to 0. If
554 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
555 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
556
557 virtual ~WaitcntGenerator() = default;
558
559 // Create a mask value from the initializer list of wait event types.
560 static constexpr unsigned
561 eventMask(std::initializer_list<WaitEventType> Events) {
562 unsigned Mask = 0;
563 for (auto &E : Events)
564 Mask |= 1 << E;
565
566 return Mask;
567 }
568};
569
570class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
571public:
572 WaitcntGeneratorPreGFX12() = default;
573 WaitcntGeneratorPreGFX12(const MachineFunction &MF)
574 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
575
576 bool
577 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
578 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
579 MachineBasicBlock::instr_iterator It) const override;
580
581 bool createNewWaitcnt(MachineBasicBlock &Block,
582 MachineBasicBlock::instr_iterator It,
583 AMDGPU::Waitcnt Wait) override;
584
585 const unsigned *getWaitEventMask() const override {
586 assert(ST);
587
588 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
589 eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
590 VMEM_BVH_READ_ACCESS}),
591 eventMask(Events: {SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
592 eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
593 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
594 eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
595 0,
596 0,
597 0,
598 0};
599
600 return WaitEventMaskForInstPreGFX12;
601 }
602
603 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
604};
605
606class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
607public:
608 WaitcntGeneratorGFX12Plus() = default;
609 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
610 InstCounterType MaxCounter)
611 : WaitcntGenerator(MF, MaxCounter) {}
612
613 bool
614 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
615 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
616 MachineBasicBlock::instr_iterator It) const override;
617
618 bool createNewWaitcnt(MachineBasicBlock &Block,
619 MachineBasicBlock::instr_iterator It,
620 AMDGPU::Waitcnt Wait) override;
621
622 const unsigned *getWaitEventMask() const override {
623 assert(ST);
624
625 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
626 eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS}),
627 eventMask(Events: {LDS_ACCESS, GDS_ACCESS}),
628 eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
629 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
630 eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
631 eventMask(Events: {VMEM_SAMPLER_READ_ACCESS}),
632 eventMask(Events: {VMEM_BVH_READ_ACCESS}),
633 eventMask(Events: {SMEM_ACCESS, SQ_MESSAGE}),
634 eventMask(Events: {VMEM_GROUP, SMEM_GROUP})};
635
636 return WaitEventMaskForInstGFX12Plus;
637 }
638
639 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
640};
641
642class SIInsertWaitcnts {
643private:
644 const GCNSubtarget *ST = nullptr;
645 const SIInstrInfo *TII = nullptr;
646 const SIRegisterInfo *TRI = nullptr;
647 const MachineRegisterInfo *MRI = nullptr;
648
649 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
650 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
651 MachineLoopInfo *MLI;
652 MachinePostDominatorTree *PDT;
653 AliasAnalysis *AA = nullptr;
654
655 struct BlockInfo {
656 std::unique_ptr<WaitcntBrackets> Incoming;
657 bool Dirty = true;
658 };
659
660 InstCounterType SmemAccessCounter;
661
662 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
663
664 bool ForceEmitWaitcnt[NUM_INST_CNTS];
665
666 // In any given run of this pass, WCG will point to one of these two
667 // generator objects, which must have been re-initialised before use
668 // from a value made using a subtarget constructor.
669 WaitcntGeneratorPreGFX12 WCGPreGFX12;
670 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
671
672 WaitcntGenerator *WCG = nullptr;
673
674 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
675 // message.
676 DenseSet<MachineInstr *> ReleaseVGPRInsts;
677
678 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
679
680public:
681 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
682 AliasAnalysis *AA)
683 : MLI(MLI), PDT(PDT), AA(AA) {
684 (void)ForceExpCounter;
685 (void)ForceLgkmCounter;
686 (void)ForceVMCounter;
687 }
688
689 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
690 bool isPreheaderToFlush(MachineBasicBlock &MBB,
691 const WaitcntBrackets &ScoreBrackets);
692 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
693 bool run(MachineFunction &MF);
694
695 bool isForceEmitWaitcnt() const {
696 for (auto T : inst_counter_types())
697 if (ForceEmitWaitcnt[T])
698 return true;
699 return false;
700 }
701
702 void setForceEmitWaitcnt() {
703// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
704// For debug builds, get the debug counter info and adjust if need be
705#ifndef NDEBUG
706 if (DebugCounter::isCounterSet(ForceExpCounter) &&
707 DebugCounter::shouldExecute(ForceExpCounter)) {
708 ForceEmitWaitcnt[EXP_CNT] = true;
709 } else {
710 ForceEmitWaitcnt[EXP_CNT] = false;
711 }
712
713 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
714 DebugCounter::shouldExecute(ForceLgkmCounter)) {
715 ForceEmitWaitcnt[DS_CNT] = true;
716 ForceEmitWaitcnt[KM_CNT] = true;
717 } else {
718 ForceEmitWaitcnt[DS_CNT] = false;
719 ForceEmitWaitcnt[KM_CNT] = false;
720 }
721
722 if (DebugCounter::isCounterSet(ForceVMCounter) &&
723 DebugCounter::shouldExecute(ForceVMCounter)) {
724 ForceEmitWaitcnt[LOAD_CNT] = true;
725 ForceEmitWaitcnt[SAMPLE_CNT] = true;
726 ForceEmitWaitcnt[BVH_CNT] = true;
727 } else {
728 ForceEmitWaitcnt[LOAD_CNT] = false;
729 ForceEmitWaitcnt[SAMPLE_CNT] = false;
730 ForceEmitWaitcnt[BVH_CNT] = false;
731 }
732#endif // NDEBUG
733 }
734
735 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
736 // instruction.
737 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
738 switch (Inst.getOpcode()) {
739 case AMDGPU::GLOBAL_INV:
740 return VMEM_READ_ACCESS; // tracked using loadcnt
741 case AMDGPU::GLOBAL_WB:
742 case AMDGPU::GLOBAL_WBINV:
743 return VMEM_WRITE_ACCESS; // tracked using storecnt
744 default:
745 break;
746 }
747
748 // Maps VMEM access types to their corresponding WaitEventType.
749 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
750 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
751
752 assert(SIInstrInfo::isVMEM(Inst));
753 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
754 // these should use VM_CNT.
755 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(MI: Inst))
756 return VMEM_ACCESS;
757 if (Inst.mayStore() &&
758 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(MI: Inst))) {
759 // FLAT and SCRATCH instructions may access scratch. Other VMEM
760 // instructions do not.
761 if (SIInstrInfo::isFLAT(MI: Inst) && mayAccessScratchThroughFlat(MI: Inst))
762 return SCRATCH_WRITE_ACCESS;
763 return VMEM_WRITE_ACCESS;
764 }
765 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(MI: Inst))
766 return VMEM_READ_ACCESS;
767 return VmemReadMapping[getVmemType(Inst)];
768 }
769
770 bool hasXcnt() const { return ST->hasWaitXCnt(); }
771
772 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
773 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
774 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
775 bool isVmemAccess(const MachineInstr &MI) const;
776 bool generateWaitcntInstBefore(MachineInstr &MI,
777 WaitcntBrackets &ScoreBrackets,
778 MachineInstr *OldWaitcntInstr,
779 bool FlushVmCnt);
780 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
781 MachineBasicBlock::instr_iterator It,
782 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
783 MachineInstr *OldWaitcntInstr);
784 void updateEventWaitcntAfter(MachineInstr &Inst,
785 WaitcntBrackets *ScoreBrackets);
786 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
787 MachineBasicBlock *Block) const;
788 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
789 WaitcntBrackets &ScoreBrackets);
790 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
791 WaitcntBrackets &ScoreBrackets);
792};
793
794class SIInsertWaitcntsLegacy : public MachineFunctionPass {
795public:
796 static char ID;
797 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
798
799 bool runOnMachineFunction(MachineFunction &MF) override;
800
801 StringRef getPassName() const override {
802 return "SI insert wait instructions";
803 }
804
805 void getAnalysisUsage(AnalysisUsage &AU) const override {
806 AU.setPreservesCFG();
807 AU.addRequired<MachineLoopInfoWrapperPass>();
808 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
809 AU.addUsedIfAvailable<AAResultsWrapperPass>();
810 AU.addPreserved<AAResultsWrapperPass>();
811 MachineFunctionPass::getAnalysisUsage(AU);
812 }
813};
814
815} // end anonymous namespace
816
817RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
818 const MachineRegisterInfo *MRI,
819 const SIRegisterInfo *TRI,
820 const MachineOperand &Op) const {
821 if (!TRI->isInAllocatableClass(RegNo: Op.getReg()))
822 return {-1, -1};
823
824 // A use via a PW operand does not need a waitcnt.
825 // A partial write is not a WAW.
826 assert(!Op.getSubReg() || !Op.isUndef());
827
828 RegInterval Result;
829
830 MCRegister MCReg = AMDGPU::getMCReg(Reg: Op.getReg(), STI: *ST);
831 unsigned RegIdx = TRI->getHWRegIndex(Reg: MCReg);
832 assert(isUInt<8>(RegIdx));
833
834 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg: Op.getReg());
835 unsigned Size = TRI->getRegSizeInBits(RC: *RC);
836
837 // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits
838 if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) {
839 unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(Reg: MCReg, MRI: *TRI) ? 1 : 0);
840 assert(Reg < AGPR_OFFSET);
841 Result.first = Reg;
842 if (TRI->isAGPR(MRI: *MRI, Reg: Op.getReg()))
843 Result.first += AGPR_OFFSET;
844 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
845 assert(Size % 16 == 0);
846 Result.second = Result.first + (Size / 16);
847 } else if (TRI->isSGPRReg(MRI: *MRI, Reg: Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
848 // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
849 // sources like SRC_PRIVATE_BASE.
850 Result.first = RegIdx + NUM_ALL_VGPRS;
851 Result.second = Result.first + divideCeil(Numerator: Size, Denominator: 32);
852 } else {
853 return {-1, -1};
854 }
855
856 return Result;
857}
858
859void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
860 InstCounterType CntTy,
861 unsigned Score) {
862 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
863 if (RegNo < NUM_ALL_VGPRS) {
864 VgprUB = std::max(a: VgprUB, b: RegNo);
865 VgprScores[CntTy][RegNo] = Score;
866 } else {
867 SgprUB = std::max(a: SgprUB, b: RegNo - NUM_ALL_VGPRS);
868 SgprScores[getSgprScoresIdx(T: CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
869 }
870 }
871}
872
873void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
874 const SIRegisterInfo *TRI,
875 const MachineRegisterInfo *MRI,
876 const MachineOperand &Op,
877 InstCounterType CntTy, unsigned Score) {
878 RegInterval Interval = getRegInterval(MI, MRI, TRI, Op);
879 setScoreByInterval(Interval, CntTy, Score);
880}
881
882// Return true if the subtarget is one that enables Point Sample Acceleration
883// and the MachineInstr passed in is one to which it might be applied (the
884// hardware makes this decision based on several factors, but we can't determine
885// this at compile time, so we have to assume it might be applied if the
886// instruction supports it).
887bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
888 if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
889 return false;
890
891 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode());
892 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
893 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
894 return BaseInfo->PointSampleAccel;
895}
896
897// Return true if the subtarget enables Point Sample Acceleration, the supplied
898// MachineInstr is one to which it might be applied and the supplied interval is
899// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
900// (this is the type that a point sample accelerated instruction effectively
901// becomes)
902bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
903 const MachineInstr &MI, RegInterval Interval) const {
904 if (!hasPointSampleAccel(MI))
905 return false;
906
907 return hasOtherPendingVmemTypes(Interval, V: VMEM_NOSAMPLER);
908}
909
910void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
911 const SIRegisterInfo *TRI,
912 const MachineRegisterInfo *MRI,
913 WaitEventType E, MachineInstr &Inst) {
914 InstCounterType T = eventCounter(masks: WaitEventMaskForInst, E);
915
916 unsigned UB = getScoreUB(T);
917 unsigned CurrScore = UB + 1;
918 if (CurrScore == 0)
919 report_fatal_error(reason: "InsertWaitcnt score wraparound");
920 // PendingEvents and ScoreUB need to be update regardless if this event
921 // changes the score of a register or not.
922 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
923 PendingEvents |= 1 << E;
924 setScoreUB(T, Val: CurrScore);
925
926 if (T == EXP_CNT) {
927 // Put score on the source vgprs. If this is a store, just use those
928 // specific register(s).
929 if (TII->isDS(MI: Inst) && Inst.mayLoadOrStore()) {
930 // All GDS operations must protect their address register (same as
931 // export.)
932 if (const auto *AddrOp = TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::addr))
933 setScoreByOperand(MI: &Inst, TRI, MRI, Op: *AddrOp, CntTy: EXP_CNT, Score: CurrScore);
934
935 if (Inst.mayStore()) {
936 if (const auto *Data0 =
937 TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data0))
938 setScoreByOperand(MI: &Inst, TRI, MRI, Op: *Data0, CntTy: EXP_CNT, Score: CurrScore);
939 if (const auto *Data1 =
940 TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data1))
941 setScoreByOperand(MI: &Inst, TRI, MRI, Op: *Data1, CntTy: EXP_CNT, Score: CurrScore);
942 } else if (SIInstrInfo::isAtomicRet(MI: Inst) && !SIInstrInfo::isGWS(MI: Inst) &&
943 Inst.getOpcode() != AMDGPU::DS_APPEND &&
944 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
945 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
946 for (const MachineOperand &Op : Inst.all_uses()) {
947 if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
948 setScoreByOperand(MI: &Inst, TRI, MRI, Op, CntTy: EXP_CNT, Score: CurrScore);
949 }
950 }
951 } else if (TII->isFLAT(MI: Inst)) {
952 if (Inst.mayStore()) {
953 setScoreByOperand(MI: &Inst, TRI, MRI,
954 Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
955 CntTy: EXP_CNT, Score: CurrScore);
956 } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
957 setScoreByOperand(MI: &Inst, TRI, MRI,
958 Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
959 CntTy: EXP_CNT, Score: CurrScore);
960 }
961 } else if (TII->isMIMG(MI: Inst)) {
962 if (Inst.mayStore()) {
963 setScoreByOperand(MI: &Inst, TRI, MRI, Op: Inst.getOperand(i: 0), CntTy: EXP_CNT,
964 Score: CurrScore);
965 } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
966 setScoreByOperand(MI: &Inst, TRI, MRI,
967 Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
968 CntTy: EXP_CNT, Score: CurrScore);
969 }
970 } else if (TII->isMTBUF(MI: Inst)) {
971 if (Inst.mayStore())
972 setScoreByOperand(MI: &Inst, TRI, MRI, Op: Inst.getOperand(i: 0), CntTy: EXP_CNT,
973 Score: CurrScore);
974 } else if (TII->isMUBUF(MI: Inst)) {
975 if (Inst.mayStore()) {
976 setScoreByOperand(MI: &Inst, TRI, MRI, Op: Inst.getOperand(i: 0), CntTy: EXP_CNT,
977 Score: CurrScore);
978 } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
979 setScoreByOperand(MI: &Inst, TRI, MRI,
980 Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
981 CntTy: EXP_CNT, Score: CurrScore);
982 }
983 } else if (TII->isLDSDIR(MI: Inst)) {
984 // LDSDIR instructions attach the score to the destination.
985 setScoreByOperand(MI: &Inst, TRI, MRI,
986 Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::vdst),
987 CntTy: EXP_CNT, Score: CurrScore);
988 } else {
989 if (TII->isEXP(MI: Inst)) {
990 // For export the destination registers are really temps that
991 // can be used as the actual source after export patching, so
992 // we need to treat them like sources and set the EXP_CNT
993 // score.
994 for (MachineOperand &DefMO : Inst.all_defs()) {
995 if (TRI->isVGPR(MRI: *MRI, Reg: DefMO.getReg())) {
996 setScoreByOperand(MI: &Inst, TRI, MRI, Op: DefMO, CntTy: EXP_CNT, Score: CurrScore);
997 }
998 }
999 }
1000 for (const MachineOperand &Op : Inst.all_uses()) {
1001 if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
1002 setScoreByOperand(MI: &Inst, TRI, MRI, Op, CntTy: EXP_CNT, Score: CurrScore);
1003 }
1004 }
1005 } else if (T == X_CNT) {
1006 for (const MachineOperand &Op : Inst.all_uses())
1007 setScoreByOperand(MI: &Inst, TRI, MRI, Op, CntTy: T, Score: CurrScore);
1008 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1009 // Match the score to the destination registers.
1010 //
1011 // Check only explicit operands. Stores, especially spill stores, include
1012 // implicit uses and defs of their super registers which would create an
1013 // artificial dependency, while these are there only for register liveness
1014 // accounting purposes.
1015 //
1016 // Special cases where implicit register defs exists, such as M0 or VCC,
1017 // but none with memory instructions.
1018 for (const MachineOperand &Op : Inst.defs()) {
1019 RegInterval Interval = getRegInterval(MI: &Inst, MRI, TRI, Op);
1020 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1021 if (Interval.first >= NUM_ALL_VGPRS)
1022 continue;
1023 if (updateVMCntOnly(Inst)) {
1024 // updateVMCntOnly should only leave us with VGPRs
1025 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1026 // defs. That's required for a sane index into `VgprMemTypes` below
1027 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1028 VmemType V = getVmemType(Inst);
1029 unsigned char TypesMask = 1 << V;
1030 // If instruction can have Point Sample Accel applied, we have to flag
1031 // this with another potential dependency
1032 if (hasPointSampleAccel(MI: Inst))
1033 TypesMask |= 1 << VMEM_NOSAMPLER;
1034 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
1035 VgprVmemTypes[RegNo] |= TypesMask;
1036 }
1037 }
1038 setScoreByInterval(Interval, CntTy: T, Score: CurrScore);
1039 }
1040 if (Inst.mayStore() &&
1041 (TII->isDS(MI: Inst) || TII->mayWriteLDSThroughDMA(MI: Inst))) {
1042 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1043 // written can be accessed. A load from LDS to VMEM does not need a wait.
1044 unsigned Slot = 0;
1045 for (const auto *MemOp : Inst.memoperands()) {
1046 if (!MemOp->isStore() ||
1047 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1048 continue;
1049 // Comparing just AA info does not guarantee memoperands are equal
1050 // in general, but this is so for LDS DMA in practice.
1051 auto AAI = MemOp->getAAInfo();
1052 // Alias scope information gives a way to definitely identify an
1053 // original memory object and practically produced in the module LDS
1054 // lowering pass. If there is no scope available we will not be able
1055 // to disambiguate LDS aliasing as after the module lowering all LDS
1056 // is squashed into a single big object. Do not attempt to use one of
1057 // the limited LDSDMAStores for something we will not be able to use
1058 // anyway.
1059 if (!AAI || !AAI.Scope)
1060 break;
1061 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1062 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1063 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1064 Slot = I + 1;
1065 break;
1066 }
1067 }
1068 }
1069 if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
1070 break;
1071 LDSDMAStores.push_back(Elt: &Inst);
1072 Slot = LDSDMAStores.size();
1073 break;
1074 }
1075 setRegScore(GprNo: FIRST_LDS_VGPR + Slot, T, Val: CurrScore);
1076 if (Slot)
1077 setRegScore(GprNo: FIRST_LDS_VGPR, T, Val: CurrScore);
1078 }
1079 }
1080}
1081
1082void WaitcntBrackets::print(raw_ostream &OS) const {
1083 OS << '\n';
1084 for (auto T : inst_counter_types(MaxCounter)) {
1085 unsigned SR = getScoreRange(T);
1086
1087 switch (T) {
1088 case LOAD_CNT:
1089 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1090 << SR << "): ";
1091 break;
1092 case DS_CNT:
1093 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1094 << SR << "): ";
1095 break;
1096 case EXP_CNT:
1097 OS << " EXP_CNT(" << SR << "): ";
1098 break;
1099 case STORE_CNT:
1100 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1101 << SR << "): ";
1102 break;
1103 case SAMPLE_CNT:
1104 OS << " SAMPLE_CNT(" << SR << "): ";
1105 break;
1106 case BVH_CNT:
1107 OS << " BVH_CNT(" << SR << "): ";
1108 break;
1109 case KM_CNT:
1110 OS << " KM_CNT(" << SR << "): ";
1111 break;
1112 case X_CNT:
1113 OS << " X_CNT(" << SR << "): ";
1114 break;
1115 default:
1116 OS << " UNKNOWN(" << SR << "): ";
1117 break;
1118 }
1119
1120 if (SR != 0) {
1121 // Print vgpr scores.
1122 unsigned LB = getScoreLB(T);
1123
1124 for (int J = 0; J <= VgprUB; J++) {
1125 unsigned RegScore = getRegScore(GprNo: J, T);
1126 if (RegScore <= LB)
1127 continue;
1128 unsigned RelScore = RegScore - LB - 1;
1129 if (J < FIRST_LDS_VGPR) {
1130 OS << RelScore << ":v" << J << " ";
1131 } else {
1132 OS << RelScore << ":ds ";
1133 }
1134 }
1135 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1136 if (isSmemCounter(T)) {
1137 for (int J = 0; J <= SgprUB; J++) {
1138 unsigned RegScore = getRegScore(GprNo: J + NUM_ALL_VGPRS, T);
1139 if (RegScore <= LB)
1140 continue;
1141 unsigned RelScore = RegScore - LB - 1;
1142 OS << RelScore << ":s" << J << " ";
1143 }
1144 }
1145 }
1146 OS << '\n';
1147 }
1148
1149 OS << "Pending Events: ";
1150 if (hasPendingEvent()) {
1151 ListSeparator LS;
1152 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1153 if (hasPendingEvent(E: (WaitEventType)I)) {
1154 OS << LS << WaitEventTypeName[I];
1155 }
1156 }
1157 } else {
1158 OS << "none";
1159 }
1160 OS << '\n';
1161
1162 OS << '\n';
1163}
1164
1165/// Simplify the waitcnt, in the sense of removing redundant counts, and return
1166/// whether a waitcnt instruction is needed at all.
1167void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1168 simplifyWaitcnt(T: LOAD_CNT, Count&: Wait.LoadCnt);
1169 simplifyWaitcnt(T: EXP_CNT, Count&: Wait.ExpCnt);
1170 simplifyWaitcnt(T: DS_CNT, Count&: Wait.DsCnt);
1171 simplifyWaitcnt(T: STORE_CNT, Count&: Wait.StoreCnt);
1172 simplifyWaitcnt(T: SAMPLE_CNT, Count&: Wait.SampleCnt);
1173 simplifyWaitcnt(T: BVH_CNT, Count&: Wait.BvhCnt);
1174 simplifyWaitcnt(T: KM_CNT, Count&: Wait.KmCnt);
1175 simplifyWaitcnt(T: X_CNT, Count&: Wait.XCnt);
1176}
1177
1178void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1179 unsigned &Count) const {
1180 // The number of outstanding events for this type, T, can be calculated
1181 // as (UB - LB). If the current Count is greater than or equal to the number
1182 // of outstanding events, then the wait for this counter is redundant.
1183 if (Count >= getScoreRange(T))
1184 Count = ~0u;
1185}
1186
1187void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
1188 AMDGPU::Waitcnt &Wait) const {
1189 const unsigned LB = getScoreLB(T);
1190 const unsigned UB = getScoreUB(T);
1191 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1192 unsigned ScoreToWait = getRegScore(GprNo: RegNo, T);
1193
1194 // If the score of src_operand falls within the bracket, we need an
1195 // s_waitcnt instruction.
1196 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1197 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1198 !ST->hasFlatLgkmVMemCountInOrder()) {
1199 // If there is a pending FLAT operation, and this is a VMem or LGKM
1200 // waitcnt and the target can report early completion, then we need
1201 // to force a waitcnt 0.
1202 addWait(Wait, T, Count: 0);
1203 } else if (counterOutOfOrder(T)) {
1204 // Counter can get decremented out-of-order when there
1205 // are multiple types event in the bracket. Also emit an s_wait counter
1206 // with a conservative value of 0 for the counter.
1207 addWait(Wait, T, Count: 0);
1208 } else {
1209 // If a counter has been maxed out avoid overflow by waiting for
1210 // MAX(CounterType) - 1 instead.
1211 unsigned NeededWait =
1212 std::min(a: UB - ScoreToWait, b: getWaitCountMax(T) - 1);
1213 addWait(Wait, T, Count: NeededWait);
1214 }
1215 }
1216 }
1217}
1218
1219void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1220 applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
1221 applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt);
1222 applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1223 applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
1224 applyWaitcnt(T: SAMPLE_CNT, Count: Wait.SampleCnt);
1225 applyWaitcnt(T: BVH_CNT, Count: Wait.BvhCnt);
1226 applyWaitcnt(T: KM_CNT, Count: Wait.KmCnt);
1227 applyXcnt(Wait);
1228}
1229
1230void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1231 const unsigned UB = getScoreUB(T);
1232 if (Count >= UB)
1233 return;
1234 if (Count != 0) {
1235 if (counterOutOfOrder(T))
1236 return;
1237 setScoreLB(T, Val: std::max(a: getScoreLB(T), b: UB - Count));
1238 } else {
1239 setScoreLB(T, Val: UB);
1240 PendingEvents &= ~WaitEventMaskForInst[T];
1241 }
1242}
1243
1244void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
1245 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1246 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1247 // zero.
1248 if (Wait.KmCnt == 0 && hasPendingEvent(E: SMEM_GROUP))
1249 return applyWaitcnt(T: X_CNT, Count: 0);
1250
1251 // If we have pending store we cannot optimize XCnt because we do not wait for
1252 // stores. VMEM loads retun in order, so if we only have loads XCnt is
1253 // decremented to the same number as LOADCnt.
1254 if (Wait.LoadCnt != ~0u && hasPendingEvent(E: VMEM_GROUP) &&
1255 !hasPendingEvent(T: STORE_CNT))
1256 return applyWaitcnt(T: X_CNT, Count: std::min(a: Wait.XCnt, b: Wait.LoadCnt));
1257
1258 applyWaitcnt(T: X_CNT, Count: Wait.XCnt);
1259}
1260
1261// Where there are multiple types of event in the bracket of a counter,
1262// the decrement may go out of order.
1263bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1264 // Scalar memory read always can go out of order.
1265 if ((T == SmemAccessCounter && hasPendingEvent(E: SMEM_ACCESS)) ||
1266 (T == X_CNT && hasPendingEvent(E: SMEM_GROUP)))
1267 return true;
1268 return hasMixedPendingEvents(T);
1269}
1270
1271INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1272 false, false)
1273INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1274INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1275INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1276 false, false)
1277
1278char SIInsertWaitcntsLegacy::ID = 0;
1279
1280char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1281
1282FunctionPass *llvm::createSIInsertWaitcntsPass() {
1283 return new SIInsertWaitcntsLegacy();
1284}
1285
1286static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1287 unsigned NewEnc) {
1288 int OpIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpName);
1289 assert(OpIdx >= 0);
1290
1291 MachineOperand &MO = MI.getOperand(i: OpIdx);
1292
1293 if (NewEnc == MO.getImm())
1294 return false;
1295
1296 MO.setImm(NewEnc);
1297 return true;
1298}
1299
1300/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1301/// and if so, which counter it is waiting on.
1302static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1303 switch (Opcode) {
1304 case AMDGPU::S_WAIT_LOADCNT:
1305 return LOAD_CNT;
1306 case AMDGPU::S_WAIT_EXPCNT:
1307 return EXP_CNT;
1308 case AMDGPU::S_WAIT_STORECNT:
1309 return STORE_CNT;
1310 case AMDGPU::S_WAIT_SAMPLECNT:
1311 return SAMPLE_CNT;
1312 case AMDGPU::S_WAIT_BVHCNT:
1313 return BVH_CNT;
1314 case AMDGPU::S_WAIT_DSCNT:
1315 return DS_CNT;
1316 case AMDGPU::S_WAIT_KMCNT:
1317 return KM_CNT;
1318 case AMDGPU::S_WAIT_XCNT:
1319 return X_CNT;
1320 default:
1321 return {};
1322 }
1323}
1324
1325bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1326 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Waitcnt->getOpcode());
1327 if (Opcode == Waitcnt->getOpcode())
1328 return false;
1329
1330 Waitcnt->setDesc(TII->get(Opcode));
1331 return true;
1332}
1333
1334/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1335/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1336/// from \p Wait that were added by previous passes. Currently this pass
1337/// conservatively assumes that these preexisting waits are required for
1338/// correctness.
1339bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1340 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1341 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1342 assert(ST);
1343 assert(isNormalMode(MaxCounter));
1344
1345 bool Modified = false;
1346 MachineInstr *WaitcntInstr = nullptr;
1347 MachineInstr *WaitcntVsCntInstr = nullptr;
1348
1349 LLVM_DEBUG({
1350 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1351 if (It == OldWaitcntInstr.getParent()->instr_end())
1352 dbgs() << "end of block\n";
1353 else
1354 dbgs() << *It;
1355 });
1356
1357 for (auto &II :
1358 make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
1359 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1360 if (II.isMetaInstruction()) {
1361 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1362 continue;
1363 }
1364
1365 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
1366 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1367
1368 // Update required wait count. If this is a soft waitcnt (= it was added
1369 // by an earlier pass), it may be entirely removed.
1370 if (Opcode == AMDGPU::S_WAITCNT) {
1371 unsigned IEnc = II.getOperand(i: 0).getImm();
1372 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(Version: IV, Encoded: IEnc);
1373 if (TrySimplify)
1374 ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1375 Wait = Wait.combined(Other: OldWait);
1376
1377 // Merge consecutive waitcnt of the same type by erasing multiples.
1378 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1379 II.eraseFromParent();
1380 Modified = true;
1381 } else
1382 WaitcntInstr = &II;
1383 } else {
1384 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1385 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1386
1387 unsigned OldVSCnt =
1388 TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1389 if (TrySimplify)
1390 ScoreBrackets.simplifyWaitcnt(T: InstCounterType::STORE_CNT, Count&: OldVSCnt);
1391 Wait.StoreCnt = std::min(a: Wait.StoreCnt, b: OldVSCnt);
1392
1393 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1394 II.eraseFromParent();
1395 Modified = true;
1396 } else
1397 WaitcntVsCntInstr = &II;
1398 }
1399 }
1400
1401 if (WaitcntInstr) {
1402 Modified |= updateOperandIfDifferent(MI&: *WaitcntInstr, OpName: AMDGPU::OpName::simm16,
1403 NewEnc: AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait));
1404 Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntInstr);
1405
1406 ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
1407 ScoreBrackets.applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt);
1408 ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1409 Wait.LoadCnt = ~0u;
1410 Wait.ExpCnt = ~0u;
1411 Wait.DsCnt = ~0u;
1412
1413 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1414 ? dbgs()
1415 << "applied pre-existing waitcnt\n"
1416 << "New Instr at block end: " << *WaitcntInstr << '\n'
1417 : dbgs() << "applied pre-existing waitcnt\n"
1418 << "Old Instr: " << *It
1419 << "New Instr: " << *WaitcntInstr << '\n');
1420 }
1421
1422 if (WaitcntVsCntInstr) {
1423 Modified |= updateOperandIfDifferent(MI&: *WaitcntVsCntInstr,
1424 OpName: AMDGPU::OpName::simm16, NewEnc: Wait.StoreCnt);
1425 Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntVsCntInstr);
1426
1427 ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
1428 Wait.StoreCnt = ~0u;
1429
1430 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1431 ? dbgs() << "applied pre-existing waitcnt\n"
1432 << "New Instr at block end: " << *WaitcntVsCntInstr
1433 << '\n'
1434 : dbgs() << "applied pre-existing waitcnt\n"
1435 << "Old Instr: " << *It
1436 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1437 }
1438
1439 return Modified;
1440}
1441
1442/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1443/// required counters in \p Wait
1444bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1445 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1446 AMDGPU::Waitcnt Wait) {
1447 assert(ST);
1448 assert(isNormalMode(MaxCounter));
1449
1450 bool Modified = false;
1451 const DebugLoc &DL = Block.findDebugLoc(MBBI: It);
1452
1453 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1454 // single instruction while VScnt has its own instruction.
1455 if (Wait.hasWaitExceptStoreCnt()) {
1456 unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait);
1457 [[maybe_unused]] auto SWaitInst =
1458 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: Enc);
1459 Modified = true;
1460
1461 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1462 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1463 dbgs() << "New Instr: " << *SWaitInst << '\n');
1464 }
1465
1466 if (Wait.hasWaitStoreCnt()) {
1467 assert(ST->hasVscnt());
1468
1469 [[maybe_unused]] auto SWaitInst =
1470 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
1471 .addReg(RegNo: AMDGPU::SGPR_NULL, flags: RegState::Undef)
1472 .addImm(Val: Wait.StoreCnt);
1473 Modified = true;
1474
1475 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1476 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1477 dbgs() << "New Instr: " << *SWaitInst << '\n');
1478 }
1479
1480 return Modified;
1481}
1482
1483AMDGPU::Waitcnt
1484WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1485 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1486}
1487
1488AMDGPU::Waitcnt
1489WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1490 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1491 ~0u /* XCNT */);
1492}
1493
1494/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1495/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1496/// were added by previous passes. Currently this pass conservatively
1497/// assumes that these preexisting waits are required for correctness.
1498bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1499 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1500 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1501 assert(ST);
1502 assert(!isNormalMode(MaxCounter));
1503
1504 bool Modified = false;
1505 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1506 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1507 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1508
1509 LLVM_DEBUG({
1510 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1511 if (It == OldWaitcntInstr.getParent()->instr_end())
1512 dbgs() << "end of block\n";
1513 else
1514 dbgs() << *It;
1515 });
1516
1517 for (auto &II :
1518 make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
1519 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1520 if (II.isMetaInstruction()) {
1521 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1522 continue;
1523 }
1524
1525 MachineInstr **UpdatableInstr;
1526
1527 // Update required wait count. If this is a soft waitcnt (= it was added
1528 // by an earlier pass), it may be entirely removed.
1529
1530 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
1531 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1532
1533 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1534 // attempt to do more than that either.
1535 if (Opcode == AMDGPU::S_WAITCNT)
1536 continue;
1537
1538 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1539 unsigned OldEnc =
1540 TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1541 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(Version: IV, LoadcntDscnt: OldEnc);
1542 if (TrySimplify)
1543 ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1544 Wait = Wait.combined(Other: OldWait);
1545 UpdatableInstr = &CombinedLoadDsCntInstr;
1546 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1547 unsigned OldEnc =
1548 TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1549 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(Version: IV, StorecntDscnt: OldEnc);
1550 if (TrySimplify)
1551 ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1552 Wait = Wait.combined(Other: OldWait);
1553 UpdatableInstr = &CombinedStoreDsCntInstr;
1554 } else {
1555 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1556 assert(CT.has_value());
1557 unsigned OldCnt =
1558 TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1559 if (TrySimplify)
1560 ScoreBrackets.simplifyWaitcnt(T: CT.value(), Count&: OldCnt);
1561 addWait(Wait, T: CT.value(), Count: OldCnt);
1562 UpdatableInstr = &WaitInstrs[CT.value()];
1563 }
1564
1565 // Merge consecutive waitcnt of the same type by erasing multiples.
1566 if (!*UpdatableInstr) {
1567 *UpdatableInstr = &II;
1568 } else {
1569 II.eraseFromParent();
1570 Modified = true;
1571 }
1572 }
1573
1574 if (CombinedLoadDsCntInstr) {
1575 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1576 // to be waited for. Otherwise, let the instruction be deleted so
1577 // the appropriate single counter wait instruction can be inserted
1578 // instead, when new S_WAIT_*CNT instructions are inserted by
1579 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1580 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1581 // the loop below that deals with single counter instructions.
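 // For example, if only Wait.DsCnt is still required at this point, the
 // combined instruction is erased here so that a plain S_WAIT_DSCNT (existing
 // or newly created) is used instead.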
1582 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1583 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
1584 Modified |= updateOperandIfDifferent(MI&: *CombinedLoadDsCntInstr,
1585 OpName: AMDGPU::OpName::simm16, NewEnc);
1586 Modified |= promoteSoftWaitCnt(Waitcnt: CombinedLoadDsCntInstr);
1587 ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
1588 ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1589 Wait.LoadCnt = ~0u;
1590 Wait.DsCnt = ~0u;
1591
1592 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1593 ? dbgs() << "applied pre-existing waitcnt\n"
1594 << "New Instr at block end: "
1595 << *CombinedLoadDsCntInstr << '\n'
1596 : dbgs() << "applied pre-existing waitcnt\n"
1597 << "Old Instr: " << *It << "New Instr: "
1598 << *CombinedLoadDsCntInstr << '\n');
1599 } else {
1600 CombinedLoadDsCntInstr->eraseFromParent();
1601 Modified = true;
1602 }
1603 }
1604
1605 if (CombinedStoreDsCntInstr) {
1606 // Similarly for S_WAIT_STORECNT_DSCNT.
1607 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1608 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
1609 Modified |= updateOperandIfDifferent(MI&: *CombinedStoreDsCntInstr,
1610 OpName: AMDGPU::OpName::simm16, NewEnc);
1611 Modified |= promoteSoftWaitCnt(Waitcnt: CombinedStoreDsCntInstr);
1612 ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
1613 ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1614 Wait.StoreCnt = ~0u;
1615 Wait.DsCnt = ~0u;
1616
1617 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1618 ? dbgs() << "applied pre-existing waitcnt\n"
1619 << "New Instr at block end: "
1620 << *CombinedStoreDsCntInstr << '\n'
1621 : dbgs() << "applied pre-existing waitcnt\n"
1622 << "Old Instr: " << *It << "New Instr: "
1623 << *CombinedStoreDsCntInstr << '\n');
1624 } else {
1625 CombinedStoreDsCntInstr->eraseFromParent();
1626 Modified = true;
1627 }
1628 }
1629
1630 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1631 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1632 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1633 // instructions so that createNewWaitcnt() will create new combined
1634 // instructions to replace them.
1635
1636 if (Wait.DsCnt != ~0u) {
1637 // This is a vector of addresses in WaitInstrs pointing to instructions
1638 // that should be removed if they are present.
1639 SmallVector<MachineInstr **, 2> WaitsToErase;
1640
1641 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1642 // both) need to be waited for, ensure that there are no existing
1643 // individual wait count instructions for these.
1644
1645 if (Wait.LoadCnt != ~0u) {
1646 WaitsToErase.push_back(Elt: &WaitInstrs[LOAD_CNT]);
1647 WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
1648 } else if (Wait.StoreCnt != ~0u) {
1649 WaitsToErase.push_back(Elt: &WaitInstrs[STORE_CNT]);
1650 WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
1651 }
1652
1653 for (MachineInstr **WI : WaitsToErase) {
1654 if (!*WI)
1655 continue;
1656
1657 (*WI)->eraseFromParent();
1658 *WI = nullptr;
1659 Modified = true;
1660 }
1661 }
1662
1663 for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
1664 if (!WaitInstrs[CT])
1665 continue;
1666
1667 unsigned NewCnt = getWait(Wait, T: CT);
1668 if (NewCnt != ~0u) {
1669 Modified |= updateOperandIfDifferent(MI&: *WaitInstrs[CT],
1670 OpName: AMDGPU::OpName::simm16, NewEnc: NewCnt);
1671 Modified |= promoteSoftWaitCnt(Waitcnt: WaitInstrs[CT]);
1672
1673 ScoreBrackets.applyWaitcnt(T: CT, Count: NewCnt);
1674 setNoWait(Wait, T: CT);
1675
1676 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1677 ? dbgs() << "applied pre-existing waitcnt\n"
1678 << "New Instr at block end: " << *WaitInstrs[CT]
1679 << '\n'
1680 : dbgs() << "applied pre-existing waitcnt\n"
1681 << "Old Instr: " << *It
1682 << "New Instr: " << *WaitInstrs[CT] << '\n');
1683 } else {
1684 WaitInstrs[CT]->eraseFromParent();
1685 Modified = true;
1686 }
1687 }
1688
1689 return Modified;
1690}
1691
1692/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
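/// For example, LoadCnt=0 together with DsCnt=0 is emitted as a single
/// combined S_WAIT_LOADCNT_DSCNT, while any remaining counter (e.g. KmCnt=0)
/// gets its own instruction from instrsForExtendedCounterTypes.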
1693bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1694 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1695 AMDGPU::Waitcnt Wait) {
1696 assert(ST);
1697 assert(!isNormalMode(MaxCounter));
1698
1699 bool Modified = false;
1700 const DebugLoc &DL = Block.findDebugLoc(MBBI: It);
1701
1702 // Check for opportunities to use combined wait instructions.
1703 if (Wait.DsCnt != ~0u) {
1704 MachineInstr *SWaitInst = nullptr;
1705
1706 if (Wait.LoadCnt != ~0u) {
1707 unsigned Enc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
1708
1709 SWaitInst = BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT))
1710 .addImm(Val: Enc);
1711
1712 Wait.LoadCnt = ~0u;
1713 Wait.DsCnt = ~0u;
1714 } else if (Wait.StoreCnt != ~0u) {
1715 unsigned Enc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
1716
1717 SWaitInst =
1718 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_STORECNT_DSCNT))
1719 .addImm(Val: Enc);
1720
1721 Wait.StoreCnt = ~0u;
1722 Wait.DsCnt = ~0u;
1723 }
1724
1725 if (SWaitInst) {
1726 Modified = true;
1727
1728 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1729 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1730 dbgs() << "New Instr: " << *SWaitInst << '\n');
1731 }
1732 }
1733
1734 // Generate an instruction for any remaining counter that needs
1735 // waiting for.
1736
1737 for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
1738 unsigned Count = getWait(Wait, T: CT);
1739 if (Count == ~0u)
1740 continue;
1741
1742 [[maybe_unused]] auto SWaitInst =
1743 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: instrsForExtendedCounterTypes[CT]))
1744 .addImm(Val: Count);
1745
1746 Modified = true;
1747
1748 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1749 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1750 dbgs() << "New Instr: " << *SWaitInst << '\n');
1751 }
1752
1753 return Modified;
1754}
1755
1756static bool readsVCCZ(const MachineInstr &MI) {
1757 unsigned Opc = MI.getOpcode();
1758 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1759 !MI.getOperand(i: 1).isUndef();
1760}
1761
1762/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1763static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1764 // Currently all conventions wait, but this may not always be the case.
1765 //
1766 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1767 // sense to omit the wait and do it in the caller.
1768 return true;
1769}
1770
1771/// \returns true if the callee is expected to wait for any outstanding waits
1772/// before returning.
1773static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
1774
1775/// Generate the s_waitcnt instruction(s) to be placed before \p MI.
1776/// Instructions of a given type complete in order,
1777/// but instructions of different types can complete out of order.
1778/// We rely on this in-order completion
1779/// and simply assign a score to each memory access instruction.
1780/// We keep track of the active "score bracket" to determine
1781/// whether a memory access requires an s_waitcnt
1782/// and, if so, what the value of each counter must be.
1783/// The "score bracket" is bounded by the lower bound and upper bound
1784/// scores (*_score_LB and *_score_ub respectively).
1785/// If FlushVmCnt is true, we also generate an s_waitcnt to flush the vmcnt
1786/// counter here.
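/// Roughly: an operand whose score for a counter lies above that counter's
/// current lower bound still depends on an outstanding event, and
/// determineWait() then asks for a count just large enough to cover it, so
/// only the necessary part of that counter's queue is drained.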
1787bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1788 WaitcntBrackets &ScoreBrackets,
1789 MachineInstr *OldWaitcntInstr,
1790 bool FlushVmCnt) {
1791 setForceEmitWaitcnt();
1792
1793 assert(!MI.isMetaInstruction());
1794
1795 AMDGPU::Waitcnt Wait;
1796
1797 // FIXME: This should have already been handled by the memory legalizer.
1798 // Removing this currently doesn't affect any lit tests, but we need to
1799 // verify that nothing was relying on this. The number of buffer invalidates
1800 // being handled here should not be expanded.
1801 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1802 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1803 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1804 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1805 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1806 Wait.LoadCnt = 0;
1807 }
1808
1809 // All waits must be resolved at call return.
1810 // NOTE: this could be improved with knowledge of all call sites or
1811 // with knowledge of the called routines.
1812 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1813 MI.getOpcode() == AMDGPU::SI_RETURN ||
1814 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1815 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1816 Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1817 }
1818 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1819 // Technically the hardware will do this on its own if we don't, but that
1820 // might cost extra cycles compared to doing it explicitly.
1821 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1822 // have to wait for outstanding VMEM stores. In this case it can be useful to
1823 // send a message to explicitly release all VGPRs before the stores have
1824 // completed, but it is only safe to do this if there are no outstanding
1825 // scratch stores.
1826 else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1827 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1828 if (!WCG->isOptNone() &&
1829 (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
1830 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1831 ScoreBrackets.getScoreRange(T: STORE_CNT) != 0 &&
1832 !ScoreBrackets.hasPendingEvent(E: SCRATCH_WRITE_ACCESS))))
1833 ReleaseVGPRInsts.insert(V: &MI);
1834 }
1835 // Resolve vm waits before gs-done.
1836 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1837 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1838 ST->hasLegacyGeometry() &&
1839 ((MI.getOperand(i: 0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1840 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1841 Wait.LoadCnt = 0;
1842 }
1843
1844 // Export & GDS instructions do not read the EXEC mask until after the export
1845 // is granted (which can occur well after the instruction is issued).
1846 // The shader program must flush all EXP operations on the export-count
1847 // before overwriting the EXEC mask.
1848 else {
1849 if (MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI)) {
1850 // Export and GDS are tracked individually, either may trigger a waitcnt
1851 // for EXEC.
1852 if (ScoreBrackets.hasPendingEvent(E: EXP_GPR_LOCK) ||
1853 ScoreBrackets.hasPendingEvent(E: EXP_PARAM_ACCESS) ||
1854 ScoreBrackets.hasPendingEvent(E: EXP_POS_ACCESS) ||
1855 ScoreBrackets.hasPendingEvent(E: GDS_GPR_LOCK)) {
1856 Wait.ExpCnt = 0;
1857 }
1858 }
1859
1860 // Wait for any pending GDS instruction to complete before any
1861 // "Always GDS" instruction.
1862 if (TII->isAlwaysGDS(Opcode: MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
1863 addWait(Wait, T: DS_CNT, Count: ScoreBrackets.getPendingGDSWait());
1864
1865 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1866 // The called function is going to insert a wait on everything in its
1867 // prolog. We still need to be careful if the call target is a load (e.g. a
1868 // GOT load). We also need to check the WAW dependency with the saved PC.
1869 Wait = AMDGPU::Waitcnt();
1870
1871 const auto &CallAddrOp = *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
1872 if (CallAddrOp.isReg()) {
1873 RegInterval CallAddrOpInterval =
1874 ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, Op: CallAddrOp);
1875
1876 ScoreBrackets.determineWait(T: SmemAccessCounter, Interval: CallAddrOpInterval,
1877 Wait);
1878
1879 if (const auto *RtnAddrOp =
1880 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst)) {
1881 RegInterval RtnAddrOpInterval =
1882 ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, Op: *RtnAddrOp);
1883
1884 ScoreBrackets.determineWait(T: SmemAccessCounter, Interval: RtnAddrOpInterval,
1885 Wait);
1886 }
1887 }
1888 } else {
1889 // FIXME: Should not be relying on memoperands.
1890 // Look at the source operands of every instruction to see if
1891 // any of them results from a previous memory operation that affects
1892 // its current usage. If so, an s_waitcnt instruction needs to be
1893 // emitted.
1894 // If the source operand was defined by a load, add the s_waitcnt
1895 // instruction.
1896 //
1897 // Two cases are handled for destination operands:
1898 // 1) If the destination operand was defined by a load, add the s_waitcnt
1899 // instruction to guarantee the right WAW order.
1900 // 2) If a destination operand was used by a recent export/store
1901 // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
1902
1903 for (const MachineMemOperand *Memop : MI.memoperands()) {
1904 const Value *Ptr = Memop->getValue();
1905 if (Memop->isStore()) {
1906 if (auto It = SLoadAddresses.find(Val: Ptr); It != SLoadAddresses.end()) {
1907 addWait(Wait, T: SmemAccessCounter, Count: 0);
1908 if (PDT->dominates(A: MI.getParent(), B: It->second))
1909 SLoadAddresses.erase(I: It);
1910 }
1911 }
1912 unsigned AS = Memop->getAddrSpace();
1913 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1914 continue;
1915 // No need to wait before load from VMEM to LDS.
1916 if (TII->mayWriteLDSThroughDMA(MI))
1917 continue;
1918
1919 // LOAD_CNT is only relevant to vgpr or LDS.
1920 unsigned RegNo = FIRST_LDS_VGPR;
1921 // Only objects with alias scope info were added to LDSDMAScopes array.
1922 // In the absence of the scope info we will not be able to disambiguate
1923 // aliasing here. There is no need to try searching for a corresponding
1924 // store slot. This is conservatively correct because in that case we
1925 // will produce a wait using the first (general) LDS DMA wait slot which
1926 // will wait on all of them anyway.
1927 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1928 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1929 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1930 if (MI.mayAlias(AA, Other: *LDSDMAStores[I], UseTBAA: true))
1931 ScoreBrackets.determineWait(T: LOAD_CNT, RegNo: RegNo + I + 1, Wait);
1932 }
1933 } else {
1934 ScoreBrackets.determineWait(T: LOAD_CNT, RegNo, Wait);
1935 }
1936 if (Memop->isStore()) {
1937 ScoreBrackets.determineWait(T: EXP_CNT, RegNo, Wait);
1938 }
1939 }
1940
1941 // Loop over use and def operands.
1942 for (const MachineOperand &Op : MI.operands()) {
1943 if (!Op.isReg())
1944 continue;
1945
1946 // If the instruction does not read tied source, skip the operand.
1947 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1948 continue;
1949
1950 RegInterval Interval = ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, Op);
1951
1952 const bool IsVGPR = TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg());
1953 if (IsVGPR) {
1954 // Implicit VGPR defs and uses are never a part of the memory
1955 // instruction's description and are usually present only to account
1956 // for super-register liveness.
1957 // TODO: Most of the other instructions also have implicit uses
1958 // for the liveness accounting only.
1959 if (Op.isImplicit() && MI.mayLoadOrStore())
1960 continue;
1961
1962 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1963 // previous write and this write are the same type of VMEM
1964 // instruction, in which case they are (in some architectures)
1965 // guaranteed to write their results in order anyway.
1966 // Additionally check instructions where Point Sample Acceleration
1967 // might be applied.
1968 if (Op.isUse() || !updateVMCntOnly(Inst: MI) ||
1969 ScoreBrackets.hasOtherPendingVmemTypes(Interval,
1970 V: getVmemType(Inst: MI)) ||
1971 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||
1972 !ST->hasVmemWriteVgprInOrder()) {
1973 ScoreBrackets.determineWait(T: LOAD_CNT, Interval, Wait);
1974 ScoreBrackets.determineWait(T: SAMPLE_CNT, Interval, Wait);
1975 ScoreBrackets.determineWait(T: BVH_CNT, Interval, Wait);
1976 ScoreBrackets.clearVgprVmemTypes(Interval);
1977 }
1978
1979 if (Op.isDef() || ScoreBrackets.hasPendingEvent(E: EXP_LDS_ACCESS)) {
1980 ScoreBrackets.determineWait(T: EXP_CNT, Interval, Wait);
1981 }
1982 ScoreBrackets.determineWait(T: DS_CNT, Interval, Wait);
1983 } else {
1984 ScoreBrackets.determineWait(T: SmemAccessCounter, Interval, Wait);
1985 }
1986
1987 if (hasXcnt() && Op.isDef())
1988 ScoreBrackets.determineWait(T: X_CNT, Interval, Wait);
1989 }
1990 }
1991 }
1992
1993 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1994 // not, we need to ensure the subtarget is capable of backing off barrier
1995 // instructions in case there are any outstanding memory operations that may
1996 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1997 if (TII->isBarrierStart(Opcode: MI.getOpcode()) &&
1998 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1999 Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2000 }
2001
2002 // TODO: Remove this work-around, enable the assert for Bug 457939
2003 // after fixing the scheduler. Also, the Shader Compiler code is
2004 // independent of target.
2005 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
2006 if (ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
2007 Wait.DsCnt = 0;
2008 }
2009 }
2010
2011 // Verify that the wait is actually needed.
2012 ScoreBrackets.simplifyWaitcnt(Wait);
2013
2014 // When forcing emit, we need to skip terminators because emitting a waitcnt
2015 // between terminators would break the MBB's terminator sequence.
2016 if (ForceEmitZeroFlag && !MI.isTerminator())
2017 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2018
2019 if (ForceEmitWaitcnt[LOAD_CNT])
2020 Wait.LoadCnt = 0;
2021 if (ForceEmitWaitcnt[EXP_CNT])
2022 Wait.ExpCnt = 0;
2023 if (ForceEmitWaitcnt[DS_CNT])
2024 Wait.DsCnt = 0;
2025 if (ForceEmitWaitcnt[SAMPLE_CNT])
2026 Wait.SampleCnt = 0;
2027 if (ForceEmitWaitcnt[BVH_CNT])
2028 Wait.BvhCnt = 0;
2029 if (ForceEmitWaitcnt[KM_CNT])
2030 Wait.KmCnt = 0;
2031 if (ForceEmitWaitcnt[X_CNT])
2032 Wait.XCnt = 0;
2033
2034 if (FlushVmCnt) {
2035 if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT))
2036 Wait.LoadCnt = 0;
2037 if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT))
2038 Wait.SampleCnt = 0;
2039 if (ScoreBrackets.hasPendingEvent(T: BVH_CNT))
2040 Wait.BvhCnt = 0;
2041 }
2042
2043 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
2044 Wait.LoadCnt = 0;
2045
2046 return generateWaitcnt(Wait, It: MI.getIterator(), Block&: *MI.getParent(), ScoreBrackets,
2047 OldWaitcntInstr);
2048}
2049
2050bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2051 MachineBasicBlock::instr_iterator It,
2052 MachineBasicBlock &Block,
2053 WaitcntBrackets &ScoreBrackets,
2054 MachineInstr *OldWaitcntInstr) {
2055 bool Modified = false;
2056
2057 if (OldWaitcntInstr)
2058 // Try to merge the required wait with preexisting waitcnt instructions.
2059 // Also erase redundant waitcnt.
2060 Modified =
2061 WCG->applyPreexistingWaitcnt(ScoreBrackets, OldWaitcntInstr&: *OldWaitcntInstr, Wait, It);
2062
2063 // Any counts that could have been applied to existing waitcnt
2064 // instructions have been applied by now; deal with any that remain.
2065 ScoreBrackets.applyWaitcnt(Wait);
2066
2067 // ExpCnt can be merged into VINTERP.
2068 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2069 SIInstrInfo::isVINTERP(MI: *It)) {
2070 MachineOperand *WaitExp =
2071 TII->getNamedOperand(MI&: *It, OperandName: AMDGPU::OpName::waitexp);
2072 if (Wait.ExpCnt < WaitExp->getImm()) {
2073 WaitExp->setImm(Wait.ExpCnt);
2074 Modified = true;
2075 }
2076 Wait.ExpCnt = ~0u;
2077
2078 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2079 << "Update Instr: " << *It);
2080 }
2081
2082 // XCnt may already have been consumed by a load wait.
2083 if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
2084 !ScoreBrackets.hasPendingEvent(E: SMEM_GROUP))
2085 Wait.XCnt = ~0u;
2086
2087 if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
2088 !ScoreBrackets.hasPendingEvent(E: VMEM_GROUP))
2089 Wait.XCnt = ~0u;
2090
2091 // Since the translation of VMEM addresses occurs in order, we can skip the
2092 // XCnt if the current instruction is of VMEM type and has a memory dependency
2093 // on another VMEM instruction in flight.
2094 if (Wait.XCnt != ~0u && isVmemAccess(MI: *It))
2095 Wait.XCnt = ~0u;
2096
2097 if (WCG->createNewWaitcnt(Block, It, Wait))
2098 Modified = true;
2099
2100 return Modified;
2101}
2102
2103// This is a flat memory operation. Check to see if it has memory tokens other
2104// than LDS. Other address spaces supported by flat memory operations involve
2105// global memory.
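// For example, a FLAT access whose only memory operand is in the LDS address
// space is treated as LDS-only and returns false here, while a FLAT access
// with no memory operands at all is conservatively assumed to access VMEM.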
2106bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
2107 assert(TII->isFLAT(MI));
2108
2109 // All flat instructions use the VMEM counter.
2110 assert(TII->usesVM_CNT(MI));
2111
2112 // If there are no memory operands then conservatively assume the flat
2113 // operation may access VMEM.
2114 if (MI.memoperands_empty())
2115 return true;
2116
2117 // See if any memory operand specifies an address space that involves VMEM.
2118 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
2119 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
2120 // (GDS) address space is not supported by flat operations. Therefore, simply
2121 // return true unless only the LDS address space is found.
2122 for (const MachineMemOperand *Memop : MI.memoperands()) {
2123 unsigned AS = Memop->getAddrSpace();
2124 assert(AS != AMDGPUAS::REGION_ADDRESS);
2125 if (AS != AMDGPUAS::LOCAL_ADDRESS)
2126 return true;
2127 }
2128
2129 return false;
2130}
2131
2132// This is a flat memory operation. Check to see if it has memory tokens for
2133// either LDS or FLAT.
2134bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
2135 assert(TII->isFLAT(MI));
2136
2137 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
2138 if (!TII->usesLGKM_CNT(MI))
2139 return false;
2140
2141 // If in tgsplit mode then there can be no use of LDS.
2142 if (ST->isTgSplitEnabled())
2143 return false;
2144
2145 // If there are no memory operands then conservatively assume the flat
2146 // operation may access LDS.
2147 if (MI.memoperands_empty())
2148 return true;
2149
2150 // See if any memory operand specifies an address space that involves LDS.
2151 for (const MachineMemOperand *Memop : MI.memoperands()) {
2152 unsigned AS = Memop->getAddrSpace();
2153 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
2154 return true;
2155 }
2156
2157 return false;
2158}
2159
2160// This is a flat memory operation. Check to see if it has memory tokens for
2161// either scratch or FLAT.
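// For example, a FLAT access with a PRIVATE or FLAT address-space memory
// operand is treated as possibly accessing scratch, while a GLOBAL-only
// access is not.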
2162bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
2163 const MachineInstr &MI) const {
2164 assert(TII->isFLAT(MI));
2165
2166 // SCRATCH instructions always access scratch.
2167 if (TII->isFLATScratch(MI))
2168 return true;
2169
2170 // GLOBAL instructions never access scratch.
2171 if (TII->isFLATGlobal(MI))
2172 return false;
2173
2174 // If there are no memory operands then conservatively assume the flat
2175 // operation may access scratch.
2176 if (MI.memoperands_empty())
2177 return true;
2178
2179 // See if any memory operand specifies an address space that involves scratch.
2180 return any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *Memop) {
2181 unsigned AS = Memop->getAddrSpace();
2182 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
2183 });
2184}
2185
2186bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2187 return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
2188 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(Opc: MI.getOpcode()));
2189}
2190
2191static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
2192 auto Opc = Inst.getOpcode();
2193 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
2194 Opc == AMDGPU::GLOBAL_WBINV;
2195}
2196
2197// Return true if the next instruction is S_ENDPGM, following fallthrough
2198// blocks if necessary.
2199bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2200 MachineBasicBlock *Block) const {
2201 auto BlockEnd = Block->getParent()->end();
2202 auto BlockIter = Block->getIterator();
2203
2204 while (true) {
2205 if (It.isEnd()) {
2206 if (++BlockIter != BlockEnd) {
2207 It = BlockIter->instr_begin();
2208 continue;
2209 }
2210
2211 return false;
2212 }
2213
2214 if (!It->isMetaInstruction())
2215 break;
2216
2217 It++;
2218 }
2219
2220 assert(!It.isEnd());
2221
2222 return It->getOpcode() == AMDGPU::S_ENDPGM;
2223}
2224
2225// Add a wait after an instruction if architecture requirements mandate one.
2226bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2227 MachineBasicBlock &Block,
2228 WaitcntBrackets &ScoreBrackets) {
2229 AMDGPU::Waitcnt Wait;
2230 bool NeedsEndPGMCheck = false;
2231
2232 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2233 Wait = WCG->getAllZeroWaitcnt(IncludeVSCnt: Inst.mayStore() &&
2234 !SIInstrInfo::isAtomicRet(MI: Inst));
2235
2236 if (TII->isAlwaysGDS(Opcode: Inst.getOpcode())) {
2237 Wait.DsCnt = 0;
2238 NeedsEndPGMCheck = true;
2239 }
2240
2241 ScoreBrackets.simplifyWaitcnt(Wait);
2242
2243 auto SuccessorIt = std::next(x: Inst.getIterator());
2244 bool Result = generateWaitcnt(Wait, It: SuccessorIt, Block, ScoreBrackets,
2245 /*OldWaitcntInstr=*/nullptr);
2246
2247 if (Result && NeedsEndPGMCheck && isNextENDPGM(It: SuccessorIt, Block: &Block)) {
2248 BuildMI(BB&: Block, I: SuccessorIt, MIMD: Inst.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_NOP))
2249 .addImm(Val: 0);
2250 }
2251
2252 return Result;
2253}
2254
2255void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2256 WaitcntBrackets *ScoreBrackets) {
2257 // Now look at the instruction opcode. If it is a memory access
2258 // instruction, update the upper-bound of the appropriate counter's
2259 // bracket and the destination operand scores.
2260 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
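 // Roughly: VMEM loads and stores bump LOAD_CNT/STORE_CNT (plus the VMEM or
 // SMEM transaction group when XCnt is available), DS operations bump DS_CNT,
 // and SMEM accesses bump the counter tracked by SmemAccessCounter.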
2261
2262 bool IsVMEMAccess = false;
2263 bool IsSMEMAccess = false;
2264 if (TII->isDS(MI: Inst) && TII->usesLGKM_CNT(MI: Inst)) {
2265 if (TII->isAlwaysGDS(Opcode: Inst.getOpcode()) ||
2266 TII->hasModifiersSet(MI: Inst, OpName: AMDGPU::OpName::gds)) {
2267 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_ACCESS, Inst);
2268 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_GPR_LOCK, Inst);
2269 ScoreBrackets->setPendingGDS();
2270 } else {
2271 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst);
2272 }
2273 } else if (TII->isFLAT(MI: Inst)) {
2274 if (isGFX12CacheInvOrWBInst(Inst)) {
2275 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
2276 Inst);
2277 return;
2278 }
2279
2280 assert(Inst.mayLoadOrStore());
2281
2282 int FlatASCount = 0;
2283
2284 if (mayAccessVMEMThroughFlat(MI: Inst)) {
2285 ++FlatASCount;
2286 IsVMEMAccess = true;
2287 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
2288 Inst);
2289 }
2290
2291 if (mayAccessLDSThroughFlat(MI: Inst)) {
2292 ++FlatASCount;
2293 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst);
2294 }
2295
2296 // A Flat memory operation must access at least one address space.
2297 assert(FlatASCount);
2298
2299 // This is a flat memory operation that accesses both VMEM and LDS, so note
2300 // it - it will require that both the VM and LGKM counters be flushed to zero
2301 // if it is pending when a VM or LGKM dependency occurs.
2302 if (FlatASCount > 1)
2303 ScoreBrackets->setPendingFlat();
2304 } else if (SIInstrInfo::isVMEM(MI: Inst) &&
2305 !llvm::AMDGPU::getMUBUFIsBufferInv(Opc: Inst.getOpcode())) {
2306 IsVMEMAccess = true;
2307 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
2308 Inst);
2309
2310 if (ST->vmemWriteNeedsExpWaitcnt() &&
2311 (Inst.mayStore() || SIInstrInfo::isAtomicRet(MI: Inst))) {
2312 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: VMW_GPR_LOCK, Inst);
2313 }
2314 } else if (TII->isSMRD(MI: Inst)) {
2315 IsSMEMAccess = true;
2316 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst);
2317 } else if (Inst.isCall()) {
2318 if (callWaitsOnFunctionReturn(MI: Inst)) {
2319 // Act as a wait on everything
2320 ScoreBrackets->applyWaitcnt(
2321 Wait: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2322 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2323 } else {
2324 // May need to wait for anything.
2325 ScoreBrackets->applyWaitcnt(Wait: AMDGPU::Waitcnt());
2326 }
2327 } else if (SIInstrInfo::isLDSDIR(MI: Inst)) {
2328 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_LDS_ACCESS, Inst);
2329 } else if (TII->isVINTERP(MI: Inst)) {
2330 int64_t Imm = TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::waitexp)->getImm();
2331 ScoreBrackets->applyWaitcnt(T: EXP_CNT, Count: Imm);
2332 } else if (SIInstrInfo::isEXP(MI: Inst)) {
2333 unsigned Imm = TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::tgt)->getImm();
2334 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2335 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_PARAM_ACCESS, Inst);
2336 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2337 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_POS_ACCESS, Inst);
2338 else
2339 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_GPR_LOCK, Inst);
2340 } else {
2341 switch (Inst.getOpcode()) {
2342 case AMDGPU::S_SENDMSG:
2343 case AMDGPU::S_SENDMSG_RTN_B32:
2344 case AMDGPU::S_SENDMSG_RTN_B64:
2345 case AMDGPU::S_SENDMSGHALT:
2346 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SQ_MESSAGE, Inst);
2347 break;
2348 case AMDGPU::S_MEMTIME:
2349 case AMDGPU::S_MEMREALTIME:
2350 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2351 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2352 case AMDGPU::S_GET_BARRIER_STATE_M0:
2353 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2354 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst);
2355 break;
2356 }
2357 }
2358
2359 if (!hasXcnt())
2360 return;
2361
2362 if (IsVMEMAccess)
2363 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: VMEM_GROUP, Inst);
2364
2365 if (IsSMEMAccess)
2366 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_GROUP, Inst);
2367}
2368
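// Rebase a score from this bracket and the corresponding score from the other
// bracket into the common frame computed by merge(), then keep the larger one.
// For example, with pending ranges [LB=4, UB=6] here and [LB=0, UB=5] in the
// other bracket, merge() picks NewUB = 4 + max(2, 5) = 9, so MyShift = 3 and
// OtherShift = 4; a score of 5 here becomes 8, a score of 3 in the other
// bracket becomes 7, and scores at or below their lower bound become 0.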
2369bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2370 unsigned OtherScore) {
2371 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2372 unsigned OtherShifted =
2373 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2374 Score = std::max(a: MyShifted, b: OtherShifted);
2375 return OtherShifted > MyShifted;
2376}
2377
2378/// Merge the pending events and associated score brackets of \p Other into
2379/// this bracket's status.
2380///
2381/// Returns whether the merge resulted in a change that requires tighter waits
2382/// (i.e. the merged brackets strictly dominate the original brackets).
2383bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2384 bool StrictDom = false;
2385
2386 VgprUB = std::max(a: VgprUB, b: Other.VgprUB);
2387 SgprUB = std::max(a: SgprUB, b: Other.SgprUB);
2388
2389 for (auto T : inst_counter_types(MaxCounter)) {
2390 // Merge event flags for this counter
2391 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2392 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2393 if (OtherEvents & ~OldEvents)
2394 StrictDom = true;
2395 PendingEvents |= OtherEvents;
2396
2397 // Merge scores for this counter
2398 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2399 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2400 const unsigned NewUB = ScoreLBs[T] + std::max(a: MyPending, b: OtherPending);
2401 if (NewUB < ScoreLBs[T])
2402 report_fatal_error(reason: "waitcnt score overflow");
2403
2404 MergeInfo M;
2405 M.OldLB = ScoreLBs[T];
2406 M.OtherLB = Other.ScoreLBs[T];
2407 M.MyShift = NewUB - ScoreUBs[T];
2408 M.OtherShift = NewUB - Other.ScoreUBs[T];
2409
2410 ScoreUBs[T] = NewUB;
2411
2412 StrictDom |= mergeScore(M, Score&: LastFlat[T], OtherScore: Other.LastFlat[T]);
2413
2414 if (T == DS_CNT)
2415 StrictDom |= mergeScore(M, Score&: LastGDS, OtherScore: Other.LastGDS);
2416
2417 for (int J = 0; J <= VgprUB; J++)
2418 StrictDom |= mergeScore(M, Score&: VgprScores[T][J], OtherScore: Other.VgprScores[T][J]);
2419
2420 if (isSmemCounter(T)) {
2421 unsigned Idx = getSgprScoresIdx(T);
2422 for (int J = 0; J <= SgprUB; J++)
2423 StrictDom |=
2424 mergeScore(M, Score&: SgprScores[Idx][J], OtherScore: Other.SgprScores[Idx][J]);
2425 }
2426 }
2427
2428 for (int J = 0; J <= VgprUB; J++) {
2429 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2430 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2431 VgprVmemTypes[J] = NewVmemTypes;
2432 }
2433
2434 return StrictDom;
2435}
2436
2437static bool isWaitInstr(MachineInstr &Inst) {
2438 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Inst.getOpcode());
2439 return Opcode == AMDGPU::S_WAITCNT ||
2440 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(i: 0).isReg() &&
2441 Inst.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL) ||
2442 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2443 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2444 counterTypeForInstr(Opcode).has_value();
2445}
2446
2447// Generate s_waitcnt instructions where needed.
2448bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2449 MachineBasicBlock &Block,
2450 WaitcntBrackets &ScoreBrackets) {
2451 bool Modified = false;
2452
2453 LLVM_DEBUG({
2454 dbgs() << "*** Begin Block: ";
2455 Block.printName(dbgs());
2456 ScoreBrackets.dump();
2457 });
2458
2459 // Track the correctness of vccz through this basic block. There are two
2460 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2461 // ST->partialVCCWritesUpdateVCCZ().
2462 bool VCCZCorrect = true;
2463 if (ST->hasReadVCCZBug()) {
2464 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2465 // to vcc and then issued an smem load.
2466 VCCZCorrect = false;
2467 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2468 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2469 // to vcc_lo or vcc_hi.
2470 VCCZCorrect = false;
2471 }
2472
2473 // Walk over the instructions.
2474 MachineInstr *OldWaitcntInstr = nullptr;
2475
2476 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2477 E = Block.instr_end();
2478 Iter != E;) {
2479 MachineInstr &Inst = *Iter;
2480 if (Inst.isMetaInstruction()) {
2481 ++Iter;
2482 continue;
2483 }
2484
2485 // Track pre-existing waitcnts that were added in earlier iterations or by
2486 // the memory legalizer.
2487 if (isWaitInstr(Inst)) {
2488 if (!OldWaitcntInstr)
2489 OldWaitcntInstr = &Inst;
2490 ++Iter;
2491 continue;
2492 }
2493
2494 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2495 isPreheaderToFlush(MBB&: Block, ScoreBrackets);
2496
2497 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2498 Modified |= generateWaitcntInstBefore(MI&: Inst, ScoreBrackets, OldWaitcntInstr,
2499 FlushVmCnt);
2500 OldWaitcntInstr = nullptr;
2501
2502 // Restore vccz if it's not known to be correct already.
2503 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(MI: Inst);
2504
2505 // Don't examine operands unless we need to track vccz correctness.
2506 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2507 if (Inst.definesRegister(Reg: AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2508 Inst.definesRegister(Reg: AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2509 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2510 if (!ST->partialVCCWritesUpdateVCCZ())
2511 VCCZCorrect = false;
2512 } else if (Inst.definesRegister(Reg: AMDGPU::VCC, /*TRI=*/nullptr)) {
2513 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2514 // vccz bit, so when we detect that an instruction may read from a
2515 // corrupt vccz bit, we need to:
2516 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2517 // operations to complete.
2518 // 2. Restore the correct value of vccz by writing the current value
2519 // of vcc back to vcc.
2520 if (ST->hasReadVCCZBug() &&
2521 ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
2522 // Writes to vcc while there's an outstanding smem read may get
2523 // clobbered as soon as any read completes.
2524 VCCZCorrect = false;
2525 } else {
2526 // Writes to vcc will fix any incorrect value in vccz.
2527 VCCZCorrect = true;
2528 }
2529 }
2530 }
2531
2532 if (TII->isSMRD(MI: Inst)) {
2533 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2534 // No need to handle invariant loads when avoiding WAR conflicts, as
2535 // there cannot be a vector store to the same memory location.
2536 if (!Memop->isInvariant()) {
2537 const Value *Ptr = Memop->getValue();
2538 SLoadAddresses.insert(KV: std::pair(Ptr, Inst.getParent()));
2539 }
2540 }
2541 if (ST->hasReadVCCZBug()) {
2542 // This smem read could complete and clobber vccz at any time.
2543 VCCZCorrect = false;
2544 }
2545 }
2546
2547 updateEventWaitcntAfter(Inst, ScoreBrackets: &ScoreBrackets);
2548
2549 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
2550
2551 LLVM_DEBUG({
2552 Inst.print(dbgs());
2553 ScoreBrackets.dump();
2554 });
2555
2556 // TODO: Remove this work-around after fixing the scheduler and enable the
2557 // assert above.
2558 if (RestoreVCCZ) {
2559 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2560 // bit is updated, so we can restore the bit by reading the value of
2561 // vcc and then writing it back to the register.
2562 BuildMI(BB&: Block, I&: Inst, MIMD: Inst.getDebugLoc(),
2563 MCID: TII->get(Opcode: ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2564 DestReg: TRI->getVCC())
2565 .addReg(RegNo: TRI->getVCC());
2566 VCCZCorrect = true;
2567 Modified = true;
2568 }
2569
2570 ++Iter;
2571 }
2572
2573 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2574 // needed.
2575 AMDGPU::Waitcnt Wait;
2576 if (Block.getFirstTerminator() == Block.end() &&
2577 isPreheaderToFlush(MBB&: Block, ScoreBrackets)) {
2578 if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT))
2579 Wait.LoadCnt = 0;
2580 if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT))
2581 Wait.SampleCnt = 0;
2582 if (ScoreBrackets.hasPendingEvent(T: BVH_CNT))
2583 Wait.BvhCnt = 0;
2584 }
2585
2586 // Combine or remove any redundant waitcnts at the end of the block.
2587 Modified |= generateWaitcnt(Wait, It: Block.instr_end(), Block, ScoreBrackets,
2588 OldWaitcntInstr);
2589
2590 LLVM_DEBUG({
2591 dbgs() << "*** End Block: ";
2592 Block.printName(dbgs());
2593 ScoreBrackets.dump();
2594 });
2595
2596 return Modified;
2597}
2598
2599// Return true if the given machine basic block is a preheader of a loop in
2600// which we want to flush the vmcnt counter, and false otherwise.
2601bool SIInsertWaitcnts::isPreheaderToFlush(
2602 MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
2603 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(Key: &MBB, Args: false);
2604 if (!IsInserted)
2605 return Iterator->second;
2606
2607 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2608 if (!Succ)
2609 return false;
2610
2611 MachineLoop *Loop = MLI->getLoopFor(BB: Succ);
2612 if (!Loop)
2613 return false;
2614
2615 if (Loop->getLoopPreheader() == &MBB &&
2616 shouldFlushVmCnt(ML: Loop, Brackets: ScoreBrackets)) {
2617 Iterator->second = true;
2618 return true;
2619 }
2620
2621 return false;
2622}
2623
2624bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2625 if (SIInstrInfo::isFLAT(MI))
2626 return mayAccessVMEMThroughFlat(MI);
2627 return SIInstrInfo::isVMEM(MI);
2628}
2629
2630// Return true if it is better to flush the vmcnt counter in the preheader of
2631// the given loop. We currently decide to flush in two situations:
2632// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2633// vgpr containing a value that is loaded outside of the loop. (Only on
2634// targets with no vscnt counter).
2635// 2. The loop contains vmem load(s), but the loaded values are not used in the
2636// loop, and at least one use of a vgpr containing a value that is loaded
2637// outside of the loop.
2638bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2639 const WaitcntBrackets &Brackets) {
2640 bool HasVMemLoad = false;
2641 bool HasVMemStore = false;
2642 bool UsesVgprLoadedOutside = false;
2643 DenseSet<Register> VgprUse;
2644 DenseSet<Register> VgprDef;
2645
2646 for (MachineBasicBlock *MBB : ML->blocks()) {
2647 for (MachineInstr &MI : *MBB) {
2648 if (isVMEMOrFlatVMEM(MI)) {
2649 if (MI.mayLoad())
2650 HasVMemLoad = true;
2651 if (MI.mayStore())
2652 HasVMemStore = true;
2653 }
2654 for (const MachineOperand &Op : MI.all_uses()) {
2655 if (!TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
2656 continue;
2657 RegInterval Interval = Brackets.getRegInterval(MI: &MI, MRI, TRI, Op);
2658 // Vgpr use
2659 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2660 // If we find a register that is loaded inside the loop, 1. and 2.
2661 // are invalidated and we can exit.
2662 if (VgprDef.contains(V: RegNo))
2663 return false;
2664 VgprUse.insert(V: RegNo);
2665 // If at least one of Op's registers is in the score brackets, the
2666 // value is likely loaded outside of the loop.
2667 if (Brackets.getRegScore(GprNo: RegNo, T: LOAD_CNT) >
2668 Brackets.getScoreLB(T: LOAD_CNT) ||
2669 Brackets.getRegScore(GprNo: RegNo, T: SAMPLE_CNT) >
2670 Brackets.getScoreLB(T: SAMPLE_CNT) ||
2671 Brackets.getRegScore(GprNo: RegNo, T: BVH_CNT) >
2672 Brackets.getScoreLB(T: BVH_CNT)) {
2673 UsesVgprLoadedOutside = true;
2674 break;
2675 }
2676 }
2677 }
2678
2679 // VMem load vgpr def
2680 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
2681 for (const MachineOperand &Op : MI.all_defs()) {
2682 RegInterval Interval = Brackets.getRegInterval(MI: &MI, MRI, TRI, Op);
2683 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2684 // If we find a register that is loaded inside the loop, 1. and 2.
2685 // are invalidated and we can exit.
2686 if (VgprUse.contains(V: RegNo))
2687 return false;
2688 VgprDef.insert(V: RegNo);
2689 }
2690 }
2691 }
2692 }
2693 }
2694 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2695 return true;
2696 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2697}
2698
2699bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
2700 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2701 auto *PDT =
2702 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2703 AliasAnalysis *AA = nullptr;
2704 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2705 AA = &AAR->getAAResults();
2706
2707 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2708}
2709
2710PreservedAnalyses
2711SIInsertWaitcntsPass::run(MachineFunction &MF,
2712 MachineFunctionAnalysisManager &MFAM) {
2713 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(IR&: MF);
2714 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(IR&: MF);
2715 auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(IR&: MF)
2716 .getManager()
2717 .getCachedResult<AAManager>(IR&: MF.getFunction());
2718
2719 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2720 return PreservedAnalyses::all();
2721
2722 return getMachineFunctionPassPreservedAnalyses()
2723 .preserveSet<CFGAnalyses>()
2724 .preserve<AAManager>();
2725}
2726
2727bool SIInsertWaitcnts::run(MachineFunction &MF) {
2728 ST = &MF.getSubtarget<GCNSubtarget>();
2729 TII = ST->getInstrInfo();
2730 TRI = &TII->getRegisterInfo();
2731 MRI = &MF.getRegInfo();
2732 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2733
2734 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST->getCPU());
2735
2736 if (ST->hasExtendedWaitCounts()) {
2737 MaxCounter = NUM_EXTENDED_INST_CNTS;
2738 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2739 WCG = &WCGGFX12Plus;
2740 } else {
2741 MaxCounter = NUM_NORMAL_INST_CNTS;
2742 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2743 WCG = &WCGPreGFX12;
2744 }
2745
2746 for (auto T : inst_counter_types())
2747 ForceEmitWaitcnt[T] = false;
2748
2749 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2750
2751 SmemAccessCounter = eventCounter(masks: WaitEventMaskForInst, E: SMEM_ACCESS);
2752
2753 HardwareLimits Limits = {};
2754 if (ST->hasExtendedWaitCounts()) {
2755 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(Version: IV);
2756 Limits.DscntMax = AMDGPU::getDscntBitMask(Version: IV);
2757 } else {
2758 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(Version: IV);
2759 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(Version: IV);
2760 }
2761 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(Version: IV);
2762 Limits.StorecntMax = AMDGPU::getStorecntBitMask(Version: IV);
2763 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(Version: IV);
2764 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(Version: IV);
2765 Limits.KmcntMax = AMDGPU::getKmcntBitMask(Version: IV);
2766 Limits.XcntMax = AMDGPU::getXcntBitMask(Version: IV);
2767
2768 [[maybe_unused]] unsigned NumVGPRsMax =
2769 ST->getAddressableNumVGPRs(DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize());
2770 [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2771 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2772 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2773
2774 BlockInfos.clear();
2775 bool Modified = false;
2776
2777 MachineBasicBlock &EntryBB = MF.front();
2778 MachineBasicBlock::iterator I = EntryBB.begin();
2779
2780 if (!MFI->isEntryFunction()) {
2781 // Wait for any outstanding memory operations that the input registers may
2782 // depend on. We can't track them and it's better to do the wait after the
2783 // costly call sequence.
2784
2785 // TODO: Could insert earlier and schedule more liberally with operations
2786 // that only use caller preserved registers.
2787 for (MachineBasicBlock::iterator E = EntryBB.end();
2788 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2789 ;
2790
2791 if (ST->hasExtendedWaitCounts()) {
2792 BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT))
2793 .addImm(Val: 0);
2794 for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
2795 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
2796 continue;
2797
2798 if (!ST->hasImageInsts() &&
2799 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
2800 continue;
2801
2802 BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(),
2803 MCID: TII->get(Opcode: instrsForExtendedCounterTypes[CT]))
2804 .addImm(Val: 0);
2805 }
2806 } else {
2807 BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: 0);
2808 }
2809
2810 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2811 args&: ST, args&: MaxCounter, args&: Limits, args&: WaitEventMaskForInst, args&: SmemAccessCounter);
2812 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2813 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2814
2815 Modified = true;
2816 }
2817
2818 // Keep iterating over the blocks in reverse post order, inserting and
2819 // updating s_waitcnt where needed, until a fix point is reached.
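 // A block's outgoing bracket state is propagated to each successor; if the
 // merge changes a successor's incoming state (or the successor has none yet),
 // that successor is marked dirty and revisited, repeating on backedges until
 // nothing changes.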
2820 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2821 BlockInfos.try_emplace(Key: MBB);
2822
2823 std::unique_ptr<WaitcntBrackets> Brackets;
2824 bool Repeat;
2825 do {
2826 Repeat = false;
2827
2828 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2829 ++BII) {
2830 MachineBasicBlock *MBB = BII->first;
2831 BlockInfo &BI = BII->second;
2832 if (!BI.Dirty)
2833 continue;
2834
2835 if (BI.Incoming) {
2836 if (!Brackets)
2837 Brackets = std::make_unique<WaitcntBrackets>(args&: *BI.Incoming);
2838 else
2839 *Brackets = *BI.Incoming;
2840 } else {
2841 if (!Brackets) {
2842 Brackets = std::make_unique<WaitcntBrackets>(
2843 args&: ST, args&: MaxCounter, args&: Limits, args&: WaitEventMaskForInst, args&: SmemAccessCounter);
2844 } else {
2845 // Reinitialize in-place. N.B. do not do this by assigning from a
2846 // temporary because the WaitcntBrackets class is large and it could
2847 // cause this function to use an unreasonable amount of stack space.
2848 Brackets->~WaitcntBrackets();
2849 new (Brackets.get()) WaitcntBrackets(
2850 ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
2851 }
2852 }
2853
2854 Modified |= insertWaitcntInBlock(MF, Block&: *MBB, ScoreBrackets&: *Brackets);
2855 BI.Dirty = false;
2856
2857 if (Brackets->hasPendingEvent()) {
2858 BlockInfo *MoveBracketsToSucc = nullptr;
2859 for (MachineBasicBlock *Succ : MBB->successors()) {
2860 auto *SuccBII = BlockInfos.find(Key: Succ);
2861 BlockInfo &SuccBI = SuccBII->second;
2862 if (!SuccBI.Incoming) {
2863 SuccBI.Dirty = true;
2864 if (SuccBII <= BII) {
2865 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2866 Repeat = true;
2867 }
2868 if (!MoveBracketsToSucc) {
2869 MoveBracketsToSucc = &SuccBI;
2870 } else {
2871 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(args&: *Brackets);
2872 }
2873 } else if (SuccBI.Incoming->merge(Other: *Brackets)) {
2874 SuccBI.Dirty = true;
2875 if (SuccBII <= BII) {
2876 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2877 Repeat = true;
2878 }
2879 }
2880 }
2881 if (MoveBracketsToSucc)
2882 MoveBracketsToSucc->Incoming = std::move(Brackets);
2883 }
2884 }
2885 } while (Repeat);
2886
2887 if (ST->hasScalarStores()) {
2888 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2889 bool HaveScalarStores = false;
2890
2891 for (MachineBasicBlock &MBB : MF) {
2892 for (MachineInstr &MI : MBB) {
2893 if (!HaveScalarStores && TII->isScalarStore(MI))
2894 HaveScalarStores = true;
2895
2896 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2897 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2898 EndPgmBlocks.push_back(Elt: &MBB);
2899 }
2900 }
2901
2902 if (HaveScalarStores) {
2903 // If scalar writes are used, the cache must be flushed or else the next
2904 // wave to reuse the same scratch memory can be clobbered.
2905 //
2906 // Insert s_dcache_wb at wave termination points if there were any scalar
2907 // stores, and only if the cache hasn't already been flushed. This could
2908 // be improved by looking across blocks for flushes in postdominating
2909 // blocks from the stores but an explicitly requested flush is probably
2910 // very rare.
2911 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2912 bool SeenDCacheWB = false;
2913
2914 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2915 I != E; ++I) {
2916 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2917 SeenDCacheWB = true;
2918 else if (TII->isScalarStore(MI: *I))
2919 SeenDCacheWB = false;
2920
2921 // FIXME: It would be better to insert this before a waitcnt if any.
2922 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2923 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2924 !SeenDCacheWB) {
2925 Modified = true;
2926 BuildMI(BB&: *MBB, I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_DCACHE_WB));
2927 }
2928 }
2929 }
2930 }
2931 }
2932
2933 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2934 // This is done in different ways depending on how the VGPRs were allocated
2935 // (i.e. whether we're in dynamic VGPR mode or not).
2936 // Skip deallocation if the kernel is waveslot limited rather than VGPR
2937 // limited. A short waveslot-limited kernel runs slower with the deallocation.
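 // In dynamic VGPR mode the release is done with S_ALLOC_VGPR 0; otherwise a
 // DEALLOC_VGPRS message is sent, preceded by an S_NOP on subtargets that
 // require one.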
2938 if (MFI->isDynamicVGPREnabled()) {
2939 for (MachineInstr *MI : ReleaseVGPRInsts) {
2940 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
2941 MCID: TII->get(Opcode: AMDGPU::S_ALLOC_VGPR))
2942 .addImm(Val: 0);
2943 Modified = true;
2944 }
2945 } else {
2946 if (!ReleaseVGPRInsts.empty() &&
2947 (MF.getFrameInfo().hasCalls() ||
2948 ST->getOccupancyWithNumVGPRs(
2949 VGPRs: TRI->getNumUsedPhysRegs(MRI: *MRI, RC: AMDGPU::VGPR_32RegClass),
2950 /*IsDynamicVGPR=*/DynamicVGPRBlockSize: false) <
2951 AMDGPU::IsaInfo::getMaxWavesPerEU(STI: ST))) {
2952 for (MachineInstr *MI : ReleaseVGPRInsts) {
2953 if (ST->requiresNopBeforeDeallocVGPRs()) {
2954 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
2955 MCID: TII->get(Opcode: AMDGPU::S_NOP))
2956 .addImm(Val: 0);
2957 }
2958 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
2959 MCID: TII->get(Opcode: AMDGPU::S_SENDMSG))
2960 .addImm(Val: AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2961 Modified = true;
2962 }
2963 }
2964 }
2965 ReleaseVGPRInsts.clear();
2966 PreheadersToFlush.clear();
2967 SLoadAddresses.clear();
2968
2969 return Modified;
2970}
2971