SIInsertWaitcnts.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp]

1	//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Insert wait instructions for memory reads and writes.
11	///
12	/// Memory reads and writes are issued asynchronously, so we need to insert
13	/// S_WAITCNT instructions when we want to access any of their results or
14	/// overwrite any register that's used asynchronously.
15	///
16	/// TODO: This pass currently keeps one timeline per hardware counter. A more
17	/// finely-grained approach that keeps one timeline per event type could
18	/// sometimes get away with generating weaker s_waitcnt instructions. For
19	/// example, when both SMEM and LDS are in flight and we need to wait for
20	/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21	/// but the pass will currently generate a conservative lgkmcnt(0) because
22	/// multiple event types are in flight.
23	//
24	//===----------------------------------------------------------------------===//
25
26	#include "AMDGPU.h"
27	#include "GCNSubtarget.h"
28	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29	#include "SIMachineFunctionInfo.h"
30	#include "Utils/AMDGPUBaseInfo.h"
31	#include "llvm/ADT/MapVector.h"
32	#include "llvm/ADT/PostOrderIterator.h"
33	#include "llvm/ADT/Sequence.h"
34	#include "llvm/Analysis/AliasAnalysis.h"
35	#include "llvm/CodeGen/MachineLoopInfo.h"
36	#include "llvm/CodeGen/MachinePostDominators.h"
37	#include "llvm/InitializePasses.h"
38	#include "llvm/Support/DebugCounter.h"
39	#include "llvm/TargetParser/TargetParser.h"
40	using namespace llvm;
41
42	#define DEBUG_TYPE "si-insert-waitcnts"
43
44	DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
45	"Force emit s_waitcnt expcnt(0) instrs");
46	DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
47	"Force emit s_waitcnt lgkmcnt(0) instrs");
48	DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
49	"Force emit s_waitcnt vmcnt(0) instrs");
50
51	static cl::opt<bool> ForceEmitZeroFlag(
52	"amdgpu-waitcnt-forcezero",
53	cl::desc ("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
54	cl::init(Val: false), cl::Hidden);
55
56	namespace {
57	// Class of object that encapsulates latest instruction counter score
58	// associated with the operand. Used for determining whether
59	// s_waitcnt instruction needs to be emitted.
60
// Identifies a hardware wait counter. The first NUM_NORMAL_INST_CNTS values
// are the "normal" counters; the remaining ones are the extended (gfx12+)
// counters. Values are used directly as array indices.
enum InstCounterType {
  LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LGKMcnt prior to gfx12.
  EXP_CNT,      //
  STORE_CNT,    // VScnt in gfx10/gfx11.
  NUM_NORMAL_INST_CNTS,
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
  BVH_CNT,                           // gfx12+ only.
  KM_CNT,                            // gfx12+ only.
  NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};
73	} // namespace
74
75	namespace llvm {
76	template <> struct enum_iteration_traits<InstCounterType> {
77	static constexpr bool is_iterable = true;
78	};
79	} // namespace llvm
80
81	namespace {
82	// Return an iterator over all counters between LOAD_CNT (the first counter)
83	// and \c MaxCounter (exclusive, default value yields an enumeration over
84	// all counters).
85	auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
86	return enum_seq(Begin: LOAD_CNT, End: MaxCounter);
87	}
88
// A [first, second) range of scoreboard slot indices; {-1, -1} is returned by
// WaitcntBrackets::getRegInterval() for operands that are not tracked.
using RegInterval = std::pair<int, int>;
90
// Maximum value of each hardware wait counter; read back through
// WaitcntBrackets::getWaitCountMax().
struct HardwareLimits {
  unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
  unsigned ExpcntMax;
  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
  unsigned SamplecntMax; // gfx12+ only.
  unsigned BvhcntMax;    // gfx12+ only.
  unsigned KmcntMax;     // gfx12+ only.
};
100
// Hardware encoding values that bound the VGPR and SGPR ranges. VGPR0 and
// SGPR0 are subtracted from an operand's encoding in
// WaitcntBrackets::getRegInterval() to obtain a zero-based index.
struct RegisterEncoding {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
};
107
// Kinds of event that bump a wait counter. Each value is used as a bit
// position (1 << E) in the pending-event and per-counter event masks.
enum WaitEventType {
  VMEM_ACCESS,              // vector-memory read & write
  VMEM_READ_ACCESS,         // vector-memory read
  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch
  LDS_ACCESS,               // lds read & write
  GDS_ACCESS,               // gds read & write
  SQ_MESSAGE,               // send message
  SMEM_ACCESS,              // scalar-memory read & write
  EXP_GPR_LOCK,             // export holding on its data src
  GDS_GPR_LOCK,             // GDS holding on its data and addr src
  EXP_POS_ACCESS,           // write to export position
  EXP_PARAM_ACCESS,         // write to export parameter
  VMW_GPR_LOCK,             // vector-memory write holding on its data src
  EXP_LDS_ACCESS,           // read by ldsdir counting as export
  NUM_WAIT_EVENTS,
};
127
// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
  AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 9,    // Reserved slots for DS.
  // Artificial register slots to track LDS writes into specific LDS locations
  // if a location is known. When slots are exhausted or location is
  // unknown use the first slot. The first slot is also always updated in
  // addition to known location's slot to properly generate waits if dependent
  // instruction's location is unknown.
  EXTRA_VGPR_LDS = 0,
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
147
// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  NUM_VMEM_TYPES
};
162
163	// Maps values of InstCounterType to the instruction that waits on that
164	// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
165	// returns true.
166	static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
167	AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
168	AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
169	AMDGPU::S_WAIT_KMCNT};
170
171	static bool updateVMCntOnly(const MachineInstr &Inst) {
172	return SIInstrInfo::isVMEM(MI: Inst) \|\| SIInstrInfo::isFLATGlobal(MI: Inst) \|\|
173	SIInstrInfo::isFLATScratch(MI: Inst);
174	}
175
176	#ifndef NDEBUG
177	static bool isNormalMode(InstCounterType MaxCounter) {
178	return MaxCounter == NUM_NORMAL_INST_CNTS;
179	}
180	#endif // NDEBUG
181
182	VmemType getVmemType(const MachineInstr &Inst) {
183	assert(updateVMCntOnly(Inst));
184	if (!SIInstrInfo::isMIMG(MI: Inst) && !SIInstrInfo::isVIMAGE(MI: Inst) &&
185	!SIInstrInfo::isVSAMPLE(MI: Inst))
186	return VMEM_NOSAMPLER;
187	const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Inst.getOpcode());
188	const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
189	AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
190	// We have to make an additional check for isVSAMPLE here since some
191	// instructions don't have a sampler, but are still classified as sampler
192	// instructions for the purposes of e.g. waitcnt.
193	return BaseInfo->BVH ? VMEM_BVH
194	: (BaseInfo->Sampler \|\| SIInstrInfo::isVSAMPLE(MI: Inst)) ? VMEM_SAMPLER
195	: VMEM_NOSAMPLER;
196	}
197
198	unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
199	switch (T) {
200	case LOAD_CNT:
201	return Wait.LoadCnt;
202	case EXP_CNT:
203	return Wait.ExpCnt;
204	case DS_CNT:
205	return Wait.DsCnt;
206	case STORE_CNT:
207	return Wait.StoreCnt;
208	case SAMPLE_CNT:
209	return Wait.SampleCnt;
210	case BVH_CNT:
211	return Wait.BvhCnt;
212	case KM_CNT:
213	return Wait.KmCnt;
214	default:
215	llvm_unreachable("bad InstCounterType");
216	}
217	}
218
219	void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
220	unsigned &WC = getCounterRef(Wait, T);
221	WC = std::min(a: WC, b: Count);
222	}
223
224	void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
225	getCounterRef(Wait, T) = ~`0u`;
226	}
227
228	unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
229	return getCounterRef(Wait, T);
230	}
231
232	// Mapping from event to counter according to the table masks.
233	InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
234	for (auto T : inst_counter_types()) {
235	if (masks[T] & (`1` << E))
236	return T;
237	}
238	llvm_unreachable("event type has no associated counter");
239	}
240
241	// This objects maintains the current score brackets of each wait counter, and
242	// a per-register scoreboard for each wait counter.
243	//
244	// We also maintain the latest score for every event type that can change the
245	// waitcnt in order to know if there are multiple types of events within
246	// the brackets. When multiple types of event happen in the bracket,
247	// wait count may get decreased out of order, therefore we need to put in
248	// "s_waitcnt 0" before use.
249	class WaitcntBrackets {
250	public:
251	WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
252	HardwareLimits Limits, RegisterEncoding Encoding,
253	const unsigned *WaitEventMaskForInst,
254	InstCounterType SmemAccessCounter)
255	: ST(SubTarget), MaxCounter(MaxCounter), Limits (Limits),
256	Encoding (Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
257	SmemAccessCounter(SmemAccessCounter) {}
258
259	unsigned getWaitCountMax(InstCounterType T) const {
260	switch (T) {
261	case LOAD_CNT:
262	return Limits.LoadcntMax;
263	case DS_CNT:
264	return Limits.DscntMax;
265	case EXP_CNT:
266	return Limits.ExpcntMax;
267	case STORE_CNT:
268	return Limits.StorecntMax;
269	case SAMPLE_CNT:
270	return Limits.SamplecntMax;
271	case BVH_CNT:
272	return Limits.BvhcntMax;
273	case KM_CNT:
274	return Limits.KmcntMax;
275	default:
276	break;
277	}
278	return `0`;
279	}
280
281	unsigned getScoreLB(InstCounterType T) const {
282	assert(T < NUM_INST_CNTS);
283	return ScoreLBs[T];
284	}
285
286	unsigned getScoreUB(InstCounterType T) const {
287	assert(T < NUM_INST_CNTS);
288	return ScoreUBs[T];
289	}
290
291	unsigned getScoreRange(InstCounterType T) const {
292	return getScoreUB(T) - getScoreLB(T);
293	}
294
295	unsigned getRegScore(int GprNo, InstCounterType T) const {
296	if (GprNo < NUM_ALL_VGPRS) {
297	return VgprScores[T][GprNo];
298	}
299	assert(T == SmemAccessCounter);
300	return SgprScores[GprNo - NUM_ALL_VGPRS];
301	}
302
303	bool merge(const WaitcntBrackets &Other);
304
305	RegInterval getRegInterval(const MachineInstr *MI,
306	const MachineRegisterInfo *MRI,
307	const SIRegisterInfo TRI, unsigned* OpNo) const;
308
309	bool counterOutOfOrder(InstCounterType T) const;
310	void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
311	void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
312	void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
313	void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
314	void applyWaitcnt(InstCounterType T, unsigned Count);
315	void updateByEvent(const SIInstrInfo TII, const* SIRegisterInfo *TRI,
316	const MachineRegisterInfo *MRI, WaitEventType E,
317	MachineInstr &MI);
318
319	unsigned hasPendingEvent() const { return PendingEvents; }
320	unsigned hasPendingEvent(WaitEventType E) const {
321	return PendingEvents & (`1` << E);
322	}
323	unsigned hasPendingEvent(InstCounterType T) const {
324	unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
325	assert((HasPending != `0`) == (getScoreRange(T) != `0`));
326	return HasPending;
327	}
328
329	bool hasMixedPendingEvents(InstCounterType T) const {
330	unsigned Events = hasPendingEvent(T);
331	// Return true if more than one bit is set in Events.
332	return Events & (Events - `1`);
333	}
334
335	bool hasPendingFlat() const {
336	return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
337	LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) \|\|
338	(LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
339	LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
340	}
341
342	void setPendingFlat() {
343	LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
344	LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
345	}
346
347	// Return true if there might be pending writes to the specified vgpr by VMEM
348	// instructions with types different from V.
349	bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
350	assert(GprNo < NUM_ALL_VGPRS);
351	return VgprVmemTypes[GprNo] & ~(`1` << V);
352	}
353
354	void clearVgprVmemTypes(int GprNo) {
355	assert(GprNo < NUM_ALL_VGPRS);
356	VgprVmemTypes[GprNo] = `0`;
357	}
358
359	void setStateOnFunctionEntryOrReturn() {
360	setScoreUB(T: STORE_CNT, Val: getScoreUB(T: STORE_CNT) + getWaitCountMax(T: STORE_CNT));
361	PendingEvents \|= WaitEventMaskForInst[STORE_CNT];
362	}
363
364	ArrayRef<const MachineInstr > getLDSDMAStores() const* {
365	return LDSDMAStores;
366	}
367
368	void print(raw_ostream &);
369	void dump() { print(dbgs()); }
370
371	private:
372	struct MergeInfo {
373	unsigned OldLB;
374	unsigned OtherLB;
375	unsigned MyShift;
376	unsigned OtherShift;
377	};
378	static bool mergeScore(const MergeInfo &M, unsigned &Score,
379	unsigned OtherScore);
380
381	void setScoreLB(InstCounterType T, unsigned Val) {
382	assert(T < NUM_INST_CNTS);
383	ScoreLBs[T] = Val;
384	}
385
386	void setScoreUB(InstCounterType T, unsigned Val) {
387	assert(T < NUM_INST_CNTS);
388	ScoreUBs[T] = Val;
389
390	if (T != EXP_CNT)
391	return;
392
393	if (getScoreRange(T: EXP_CNT) > getWaitCountMax(T: EXP_CNT))
394	ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(T: EXP_CNT);
395	}
396
397	void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
398	if (GprNo < NUM_ALL_VGPRS) {
399	VgprUB = std::max(a: VgprUB, b: GprNo);
400	VgprScores[T][GprNo] = Val;
401	} else {
402	assert(T == SmemAccessCounter);
403	SgprUB = std::max(a: SgprUB, b: GprNo - NUM_ALL_VGPRS);
404	SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
405	}
406	}
407
408	void setExpScore(const MachineInstr MI, const* SIInstrInfo *TII,
409	const SIRegisterInfo TRI, const* MachineRegisterInfo *MRI,
410	unsigned OpNo, unsigned Val);
411
412	const GCNSubtarget ST = nullptr*;
413	InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
414	HardwareLimits Limits = {};
415	RegisterEncoding Encoding = {};
416	const unsigned *WaitEventMaskForInst;
417	InstCounterType SmemAccessCounter;
418	unsigned ScoreLBs[NUM_INST_CNTS] = {`0`};
419	unsigned ScoreUBs[NUM_INST_CNTS] = {`0`};
420	unsigned PendingEvents = `0`;
421	// Remember the last flat memory operation.
422	unsigned LastFlat[NUM_INST_CNTS] = {`0`};
423	// wait_cnt scores for every vgpr.
424	// Keep track of the VgprUB and SgprUB to make merge at join efficient.
425	int VgprUB = -`1`;
426	int SgprUB = -`1`;
427	unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{`0`}};
428	// Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
429	// pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
430	unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {`0`};
431	// Bitmask of the VmemTypes of VMEM instructions that might have a pending
432	// write to each vgpr.
433	unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {`0`};
434	// Store representative LDS DMA operations. The only useful info here is
435	// alias info. One store is kept per unique AAInfo.
436	SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - `1`> LDSDMAStores;
437	};
438
439	// This abstracts the logic for generating and updating S_WAIT instructions*
440	// away from the analysis that determines where they are needed. This was
441	// done because the set of counters and instructions for waiting on them
442	// underwent a major shift with gfx12, sufficiently so that having this
443	// abstraction allows the main analysis logic to be simpler than it would
444	// otherwise have had to become.
445	class WaitcntGenerator {
446	protected:
447	const GCNSubtarget ST = nullptr*;
448	const SIInstrInfo TII = nullptr*;
449	AMDGPU::IsaVersion IV;
450	InstCounterType MaxCounter;
451	bool OptNone;
452
453	public:
454	WaitcntGenerator() = default;
455	WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
456	: ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
457	IV(AMDGPU::getIsaVersion(GPU: ST->getCPU())), MaxCounter(MaxCounter),
458	OptNone(MF.getFunction().hasOptNone() \|\|
459	MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
460
461	// Return true if the current function should be compiled with no
462	// optimization.
463	bool isOptNone() const { return OptNone; }
464
465	// Edits an existing sequence of wait count instructions according
466	// to an incoming Waitcnt value, which is itself updated to reflect
467	// any new wait count instructions which may need to be generated by
468	// WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
469	// were made.
470	//
471	// This editing will usually be merely updated operands, but it may also
472	// delete instructions if the incoming Wait value indicates they are not
473	// needed. It may also remove existing instructions for which a wait
474	// is needed if it can be determined that it is better to generate new
475	// instructions later, as can happen on gfx12.
476	virtual bool
477	applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
478	MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
479	MachineBasicBlock::instr_iterator It) const = `0`;
480
481	// Transform a soft waitcnt into a normal one.
482	bool promoteSoftWaitCnt(MachineInstr Waitcnt) const*;
483
484	// Generates new wait count instructions according to the value of
485	// Wait, returning true if any new instructions were created.
486	virtual bool createNewWaitcnt(MachineBasicBlock &Block,
487	MachineBasicBlock::instr_iterator It,
488	AMDGPU::Waitcnt Wait) = `0`;
489
490	// Returns an array of bit masks which can be used to map values in
491	// WaitEventType to corresponding counter values in InstCounterType.
492	virtual const unsigned getWaitEventMask() const* = `0`;
493
494	// Returns a new waitcnt with all counters except VScnt set to 0. If
495	// IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
496	virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = `0`;
497
498	virtual ~WaitcntGenerator() = default;
499
500	// Create a mask value from the initializer list of wait event types.
501	static constexpr unsigned
502	eventMask(std::initializer_list<WaitEventType> Events) {
503	unsigned Mask = `0`;
504	for (auto &E : Events)
505	Mask \|= `1` << E;
506
507	return Mask;
508	}
509	};
510
511	class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
512	public:
513	WaitcntGeneratorPreGFX12() = default;
514	WaitcntGeneratorPreGFX12(const MachineFunction &MF)
515	: WaitcntGenerator (MF, NUM_NORMAL_INST_CNTS) {}
516
517	bool
518	applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
519	MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
520	MachineBasicBlock::instr_iterator It) const override;
521
522	bool createNewWaitcnt(MachineBasicBlock &Block,
523	MachineBasicBlock::instr_iterator It,
524	AMDGPU::Waitcnt Wait) override;
525
526	const unsigned getWaitEventMask() const* override {
527	assert(ST);
528
529	static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
530	eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
531	VMEM_BVH_READ_ACCESS}),
532	eventMask(Events: {SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
533	eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
534	EXP_POS_ACCESS, EXP_LDS_ACCESS}),
535	eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
536	`0`,
537	`0`,
538	`0`};
539
540	return WaitEventMaskForInstPreGFX12;
541	}
542
543	AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
544	};
545
546	class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
547	public:
548	WaitcntGeneratorGFX12Plus() = default;
549	WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
550	InstCounterType MaxCounter)
551	: WaitcntGenerator (MF, MaxCounter) {}
552
553	bool
554	applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
555	MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
556	MachineBasicBlock::instr_iterator It) const override;
557
558	bool createNewWaitcnt(MachineBasicBlock &Block,
559	MachineBasicBlock::instr_iterator It,
560	AMDGPU::Waitcnt Wait) override;
561
562	const unsigned getWaitEventMask() const* override {
563	assert(ST);
564
565	static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
566	eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS}),
567	eventMask(Events: {LDS_ACCESS, GDS_ACCESS}),
568	eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
569	EXP_POS_ACCESS, EXP_LDS_ACCESS}),
570	eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
571	eventMask(Events: {VMEM_SAMPLER_READ_ACCESS}),
572	eventMask(Events: {VMEM_BVH_READ_ACCESS}),
573	eventMask(Events: {SMEM_ACCESS, SQ_MESSAGE})};
574
575	return WaitEventMaskForInstGFX12Plus;
576	}
577
578	AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
579	};
580
581	class SIInsertWaitcnts : public MachineFunctionPass {
582	private:
583	const GCNSubtarget ST = nullptr*;
584	const SIInstrInfo TII = nullptr*;
585	const SIRegisterInfo TRI = nullptr*;
586	const MachineRegisterInfo MRI = nullptr*;
587
588	DenseMap<const Value , MachineBasicBlock > SLoadAddresses;
589	DenseMap<MachineBasicBlock , bool*> PreheadersToFlush;
590	MachineLoopInfo *MLI;
591	MachinePostDominatorTree *PDT;
592	AliasAnalysis AA = nullptr*;
593
594	struct BlockInfo {
595	std::unique_ptr<WaitcntBrackets> Incoming;
596	bool Dirty = true;
597	};
598
599	InstCounterType SmemAccessCounter;
600
601	MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
602
603	// ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
604	// because of amdgpu-waitcnt-forcezero flag
605	bool ForceEmitZeroWaitcnts;
606	bool ForceEmitWaitcnt[NUM_INST_CNTS];
607
608	// In any given run of this pass, WCG will point to one of these two
609	// generator objects, which must have been re-initialised before use
610	// from a value made using a subtarget constructor.
611	WaitcntGeneratorPreGFX12 WCGPreGFX12;
612	WaitcntGeneratorGFX12Plus WCGGFX12Plus;
613
614	WaitcntGenerator WCG = nullptr*;
615
616	// S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
617	// message.
618	DenseSet<MachineInstr *> ReleaseVGPRInsts;
619
620	InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
621
622	public:
623	static char ID;
624
625	SIInsertWaitcnts() : MachineFunctionPass (ID) {
626	(void)ForceExpCounter;
627	(void)ForceLgkmCounter;
628	(void)ForceVMCounter;
629	}
630
631	bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
632	bool isPreheaderToFlush(MachineBasicBlock &MBB,
633	WaitcntBrackets &ScoreBrackets);
634	bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
635	bool runOnMachineFunction(MachineFunction &MF) override;
636
637	StringRef getPassName() const override {
638	return "SI insert wait instructions";
639	}
640
641	void getAnalysisUsage(AnalysisUsage &AU) const override {
642	AU.setPreservesCFG();
643	AU.addRequired<MachineLoopInfoWrapperPass>();
644	AU.addRequired<MachinePostDominatorTreeWrapperPass>();
645	AU.addUsedIfAvailable<AAResultsWrapperPass>();
646	AU.addPreserved<AAResultsWrapperPass>();
647	MachineFunctionPass::getAnalysisUsage(AU);
648	}
649
650	bool isForceEmitWaitcnt() const {
651	for (auto T : inst_counter_types())
652	if (ForceEmitWaitcnt[T])
653	return true;
654	return false;
655	}
656
657	void setForceEmitWaitcnt() {
658	// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
659	// For debug builds, get the debug counter info and adjust if need be
660	#ifndef NDEBUG
661	if (DebugCounter::isCounterSet(ForceExpCounter) &&
662	DebugCounter::shouldExecute(ForceExpCounter)) {
663	ForceEmitWaitcnt[EXP_CNT] = true;
664	} else {
665	ForceEmitWaitcnt[EXP_CNT] = false;
666	}
667
668	if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
669	DebugCounter::shouldExecute(ForceLgkmCounter)) {
670	ForceEmitWaitcnt[DS_CNT] = true;
671	ForceEmitWaitcnt[KM_CNT] = true;
672	} else {
673	ForceEmitWaitcnt[DS_CNT] = false;
674	ForceEmitWaitcnt[KM_CNT] = false;
675	}
676
677	if (DebugCounter::isCounterSet(ForceVMCounter) &&
678	DebugCounter::shouldExecute(ForceVMCounter)) {
679	ForceEmitWaitcnt[LOAD_CNT] = true;
680	ForceEmitWaitcnt[SAMPLE_CNT] = true;
681	ForceEmitWaitcnt[BVH_CNT] = true;
682	} else {
683	ForceEmitWaitcnt[LOAD_CNT] = false;
684	ForceEmitWaitcnt[SAMPLE_CNT] = false;
685	ForceEmitWaitcnt[BVH_CNT] = false;
686	}
687	#endif // NDEBUG
688	}
689
690	// Return the appropriate VMEM__ACCESS type for Inst, which must be a VMEM or*
691	// FLAT instruction.
692	WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
693	// Maps VMEM access types to their corresponding WaitEventType.
694	static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
695	VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
696
697	assert(SIInstrInfo::isVMEM(Inst) \|\| SIInstrInfo::isFLAT(Inst));
698	// LDS DMA loads are also stores, but on the LDS side. On the VMEM side
699	// these should use VM_CNT.
700	if (!ST->hasVscnt() \|\| SIInstrInfo::mayWriteLDSThroughDMA(MI: Inst))
701	return VMEM_ACCESS;
702	if (Inst.mayStore() &&
703	(!Inst.mayLoad() \|\| SIInstrInfo::isAtomicNoRet(MI: Inst))) {
704	// FLAT and SCRATCH instructions may access scratch. Other VMEM
705	// instructions do not.
706	if (SIInstrInfo::isFLAT(MI: Inst) && mayAccessScratchThroughFlat(MI: Inst))
707	return SCRATCH_WRITE_ACCESS;
708	return VMEM_WRITE_ACCESS;
709	}
710	if (!ST->hasExtendedWaitCounts() \|\| SIInstrInfo::isFLAT(MI: Inst))
711	return VMEM_READ_ACCESS;
712	return VmemReadMapping[getVmemType(Inst)];
713	}
714
715	bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
716	bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
717	bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
718	bool generateWaitcntInstBefore(MachineInstr &MI,
719	WaitcntBrackets &ScoreBrackets,
720	MachineInstr *OldWaitcntInstr,
721	bool FlushVmCnt);
722	bool generateWaitcnt(AMDGPU::Waitcnt Wait,
723	MachineBasicBlock::instr_iterator It,
724	MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
725	MachineInstr *OldWaitcntInstr);
726	void updateEventWaitcntAfter(MachineInstr &Inst,
727	WaitcntBrackets *ScoreBrackets);
728	bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
729	WaitcntBrackets &ScoreBrackets);
730	};
731
732	} // end anonymous namespace
733
734	RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
735	const MachineRegisterInfo *MRI,
736	const SIRegisterInfo *TRI,
737	unsigned OpNo) const {
738	const MachineOperand &Op = MI->getOperand(i: OpNo);
739	if (!TRI->isInAllocatableClass(RegNo: Op.getReg()))
740	return {-`1`, -`1`};
741
742	// A use via a PW operand does not need a waitcnt.
743	// A partial write is not a WAW.
744	assert(!Op.getSubReg() \|\| !Op.isUndef());
745
746	RegInterval Result;
747
748	unsigned Reg = TRI->getEncodingValue(RegNo: AMDGPU::getMCReg(Reg: Op.getReg(), STI: *ST)) &
749	AMDGPU::HWEncoding::REG_IDX_MASK;
750
751	if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) {
752	assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
753	Result.first = Reg - Encoding.VGPR0;
754	if (TRI->isAGPR(MRI: *MRI, Reg: Op.getReg()))
755	Result.first += AGPR_OFFSET;
756	assert(Result.first >= `0` && Result.first < SQ_MAX_PGM_VGPRS);
757	} else if (TRI->isSGPRReg(MRI: *MRI, Reg: Op.getReg())) {
758	assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
759	Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
760	assert(Result.first >= NUM_ALL_VGPRS &&
761	Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
762	}
763	// TODO: Handle TTMP
764	// else if (TRI->isTTMP(MRI, Reg.getReg())) ...*
765	else
766	return {-`1`, -`1`};
767
768	const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg: Op.getReg());
769	unsigned Size = TRI->getRegSizeInBits(RC: *RC);
770	Result.second = Result.first + ((Size + `16`) / `32`);
771
772	return Result;
773	}
774
775	void WaitcntBrackets::setExpScore(const MachineInstr *MI,
776	const SIInstrInfo *TII,
777	const SIRegisterInfo *TRI,
778	const MachineRegisterInfo MRI, unsigned* OpNo,
779	unsigned Val) {
780	RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
781	assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
782	for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
783	setRegScore(GprNo: RegNo, T: EXP_CNT, Val);
784	}
785	}
786
787	void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
788	const SIRegisterInfo *TRI,
789	const MachineRegisterInfo *MRI,
790	WaitEventType E, MachineInstr &Inst) {
791	InstCounterType T = eventCounter(masks: WaitEventMaskForInst, E);
792
793	unsigned UB = getScoreUB(T);
794	unsigned CurrScore = UB + `1`;
795	if (CurrScore == `0`)
796	report_fatal_error(reason: "InsertWaitcnt score wraparound");
797	// PendingEvents and ScoreUB need to be update regardless if this event
798	// changes the score of a register or not.
799	// Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
800	PendingEvents \|= `1` << E;
801	setScoreUB(T, Val: CurrScore);
802
803	if (T == EXP_CNT) {
804	// Put score on the source vgprs. If this is a store, just use those
805	// specific register(s).
806	if (TII->isDS(MI: Inst) && (Inst.mayStore() \|\| Inst.mayLoad())) {
807	int AddrOpIdx =
808	AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::addr);
809	// All GDS operations must protect their address register (same as
810	// export.)
811	if (AddrOpIdx != -`1`) {
812	setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: AddrOpIdx, Val: CurrScore);
813	}
814
815	if (Inst.mayStore()) {
816	if (AMDGPU::hasNamedOperand(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data0)) {
817	setExpScore(
818	MI: &Inst, TII, TRI, MRI,
819	OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data0),
820	Val: CurrScore);
821	}
822	if (AMDGPU::hasNamedOperand(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data1)) {
823	setExpScore(MI: &Inst, TII, TRI, MRI,
824	OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(),
825	NamedIdx: AMDGPU::OpName::data1),
826	Val: CurrScore);
827	}
828	} else if (SIInstrInfo::isAtomicRet(MI: Inst) && !SIInstrInfo::isGWS(MI: Inst) &&
829	Inst.getOpcode() != AMDGPU::DS_APPEND &&
830	Inst.getOpcode() != AMDGPU::DS_CONSUME &&
831	Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
832	for (unsigned I = `0`, E = Inst.getNumOperands(); I != E; ++I) {
833	const MachineOperand &Op = Inst.getOperand(i: I);
834	if (Op.isReg() && !Op.isDef() &&
835	TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) {
836	setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: I, Val: CurrScore);
837	}
838	}
839	}
840	} else if (TII->isFLAT(MI: Inst)) {
841	if (Inst.mayStore()) {
842	setExpScore(
843	MI: &Inst, TII, TRI, MRI,
844	OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data),
845	Val: CurrScore);
846	} else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
847	setExpScore(
848	MI: &Inst, TII, TRI, MRI,
849	OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data),
850	Val: CurrScore);
851	}
852	} else if (TII->isMIMG(MI: Inst)) {
853	if (Inst.mayStore()) {
854	setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: `0`, Val: CurrScore);
855	} else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
856	setExpScore(
857	MI: &Inst, TII, TRI, MRI,
858	OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data),
859	Val: CurrScore);
860	}
861	} else if (TII->isMTBUF(MI: Inst)) {
862	if (Inst.mayStore()) {
863	setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: `0`, Val: CurrScore);
864	}
865	} else if (TII->isMUBUF(MI: Inst)) {
866	if (Inst.mayStore()) {
867	setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: `0`, Val: CurrScore);
868	} else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
869	setExpScore(
870	MI: &Inst, TII, TRI, MRI,
871	OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data),
872	Val: CurrScore);
873	}
874	} else if (TII->isLDSDIR(MI: Inst)) {
875	// LDSDIR instructions attach the score to the destination.
876	setExpScore(
877	MI: &Inst, TII, TRI, MRI,
878	OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::vdst),
879	Val: CurrScore);
880	} else {
881	if (TII->isEXP(MI: Inst)) {
882	// For export the destination registers are really temps that
883	// can be used as the actual source after export patching, so
884	// we need to treat them like sources and set the EXP_CNT
885	// score.
886	for (unsigned I = `0`, E = Inst.getNumOperands(); I != E; ++I) {
887	MachineOperand &DefMO = Inst.getOperand(i: I);
888	if (DefMO.isReg() && DefMO.isDef() &&
889	TRI->isVGPR(MRI: *MRI, Reg: DefMO.getReg())) {
890	setRegScore(
891	GprNo: TRI->getEncodingValue(RegNo: AMDGPU::getMCReg(Reg: DefMO.getReg(), STI: *ST)),
892	T: EXP_CNT, Val: CurrScore);
893	}
894	}
895	}
896	for (unsigned I = `0`, E = Inst.getNumOperands(); I != E; ++I) {
897	MachineOperand &MO = Inst.getOperand(i: I);
898	if (MO.isReg() && !MO.isDef() &&
899	TRI->isVectorRegister(MRI: *MRI, Reg: MO.getReg())) {
900	setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: I, Val: CurrScore);
901	}
902	}
903	}
904	} else / LGKM_CNT \|\| EXP_CNT \|\| VS_CNT \|\| NUM_INST_CNTS / {
905	// Match the score to the destination registers.
906	for (unsigned I = `0`, E = Inst.getNumOperands(); I != E; ++I) {
907	auto &Op = Inst.getOperand(i: I);
908	if (!Op.isReg() \|\| !Op.isDef())
909	continue;
910	RegInterval Interval = getRegInterval(MI: &Inst, MRI, TRI, OpNo: I);
911	if (T == LOAD_CNT \|\| T == SAMPLE_CNT \|\| T == BVH_CNT) {
912	if (Interval.first >= NUM_ALL_VGPRS)
913	continue;
914	if (updateVMCntOnly(Inst)) {
915	// updateVMCntOnly should only leave us with VGPRs
916	// MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
917	// defs. That's required for a sane index into `VgprMemTypes` below
918	assert(TRI->isVectorRegister(*MRI, Op.getReg()));
919	VmemType V = getVmemType(Inst);
920	for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
921	VgprVmemTypes[RegNo] \|= `1` << V;
922	}
923	}
924	for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
925	setRegScore(GprNo: RegNo, T, Val: CurrScore);
926	}
927	}
928	if (Inst.mayStore() &&
929	(TII->isDS(MI: Inst) \|\| TII->mayWriteLDSThroughDMA(MI: Inst))) {
930	// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
931	// written can be accessed. A load from LDS to VMEM does not need a wait.
932	unsigned Slot = `0`;
933	for (const auto *MemOp : Inst.memoperands()) {
934	if (!MemOp->isStore() \|\|
935	MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
936	continue;
937	// Comparing just AA info does not guarantee memoperands are equal
938	// in general, but this is so for LDS DMA in practice.
939	auto AAI = MemOp->getAAInfo();
940	// Alias scope information gives a way to definitely identify an
941	// original memory object and practically produced in the module LDS
942	// lowering pass. If there is no scope available we will not be able
943	// to disambiguate LDS aliasing as after the module lowering all LDS
944	// is squashed into a single big object. Do not attempt to use one of
945	// the limited LDSDMAStores for something we will not be able to use
946	// anyway.
947	if (!AAI \|\| !AAI.Scope)
948	break;
949	for (unsigned I = `0`, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
950	for (const auto *MemOp : LDSDMAStores [I]->memoperands()) {
951	if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
952	Slot = I + `1`;
953	break;
954	}
955	}
956	}
957	if (Slot \|\| LDSDMAStores.size() == NUM_EXTRA_VGPRS - `1`)
958	break;
959	LDSDMAStores.push_back(Elt: &Inst);
960	Slot = LDSDMAStores.size();
961	break;
962	}
963	setRegScore(GprNo: SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, Val: CurrScore);
964	if (Slot)
965	setRegScore(GprNo: SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, Val: CurrScore);
966	}
967	}
968	}
969
970	void WaitcntBrackets::print(raw_ostream &OS) {
971	OS << `'\n'`;
972	for (auto T : inst_counter_types(MaxCounter)) {
973	unsigned SR = getScoreRange(T);
974
975	switch (T) {
976	case LOAD_CNT:
977	OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
978	<< SR << "): ";
979	break;
980	case DS_CNT:
981	OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
982	<< SR << "): ";
983	break;
984	case EXP_CNT:
985	OS << " EXP_CNT(" << SR << "): ";
986	break;
987	case STORE_CNT:
988	OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
989	<< SR << "): ";
990	break;
991	case SAMPLE_CNT:
992	OS << " SAMPLE_CNT(" << SR << "): ";
993	break;
994	case BVH_CNT:
995	OS << " BVH_CNT(" << SR << "): ";
996	break;
997	case KM_CNT:
998	OS << " KM_CNT(" << SR << "): ";
999	break;
1000	default:
1001	OS << " UNKNOWN(" << SR << "): ";
1002	break;
1003	}
1004
1005	if (SR != `0`) {
1006	// Print vgpr scores.
1007	unsigned LB = getScoreLB(T);
1008
1009	for (int J = `0`; J <= VgprUB; J++) {
1010	unsigned RegScore = getRegScore(GprNo: J, T);
1011	if (RegScore <= LB)
1012	continue;
1013	unsigned RelScore = RegScore - LB - `1`;
1014	if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
1015	OS << RelScore << ":v" << J << " ";
1016	} else {
1017	OS << RelScore << ":ds ";
1018	}
1019	}
1020	// Also need to print sgpr scores for lgkm_cnt.
1021	if (T == SmemAccessCounter) {
1022	for (int J = `0`; J <= SgprUB; J++) {
1023	unsigned RegScore = getRegScore(GprNo: J + NUM_ALL_VGPRS, T);
1024	if (RegScore <= LB)
1025	continue;
1026	unsigned RelScore = RegScore - LB - `1`;
1027	OS << RelScore << ":s" << J << " ";
1028	}
1029	}
1030	}
1031	OS << `'\n'`;
1032	}
1033	OS << `'\n'`;
1034	}
1035
1036	/// Simplify the waitcnt, in the sense of removing redundant counts, and return
1037	/// whether a waitcnt instruction is needed at all.
1038	void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1039	simplifyWaitcnt(T: LOAD_CNT, Count&: Wait.LoadCnt);
1040	simplifyWaitcnt(T: EXP_CNT, Count&: Wait.ExpCnt);
1041	simplifyWaitcnt(T: DS_CNT, Count&: Wait.DsCnt);
1042	simplifyWaitcnt(T: STORE_CNT, Count&: Wait.StoreCnt);
1043	simplifyWaitcnt(T: SAMPLE_CNT, Count&: Wait.SampleCnt);
1044	simplifyWaitcnt(T: BVH_CNT, Count&: Wait.BvhCnt);
1045	simplifyWaitcnt(T: KM_CNT, Count&: Wait.KmCnt);
1046	}
1047
1048	void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1049	unsigned &Count) const {
1050	// The number of outstanding events for this type, T, can be calculated
1051	// as (UB - LB). If the current Count is greater than or equal to the number
1052	// of outstanding events, then the wait for this counter is redundant.
1053	if (Count >= getScoreRange(T))
1054	Count = ~`0u`;
1055	}
1056
1057	void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
1058	AMDGPU::Waitcnt &Wait) const {
1059	unsigned ScoreToWait = getRegScore(GprNo: RegNo, T);
1060
1061	// If the score of src_operand falls within the bracket, we need an
1062	// s_waitcnt instruction.
1063	const unsigned LB = getScoreLB(T);
1064	const unsigned UB = getScoreUB(T);
1065	if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1066	if ((T == LOAD_CNT \|\| T == DS_CNT) && hasPendingFlat() &&
1067	!ST->hasFlatLgkmVMemCountInOrder()) {
1068	// If there is a pending FLAT operation, and this is a VMem or LGKM
1069	// waitcnt and the target can report early completion, then we need
1070	// to force a waitcnt 0.
1071	addWait(Wait, T, Count: `0`);
1072	} else if (counterOutOfOrder(T)) {
1073	// Counter can get decremented out-of-order when there
1074	// are multiple types event in the bracket. Also emit an s_wait counter
1075	// with a conservative value of 0 for the counter.
1076	addWait(Wait, T, Count: `0`);
1077	} else {
1078	// If a counter has been maxed out avoid overflow by waiting for
1079	// MAX(CounterType) - 1 instead.
1080	unsigned NeededWait = std::min(a: UB - ScoreToWait, b: getWaitCountMax(T) - `1`);
1081	addWait(Wait, T, Count: NeededWait);
1082	}
1083	}
1084	}
1085
1086	void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1087	applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
1088	applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt);
1089	applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1090	applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
1091	applyWaitcnt(T: SAMPLE_CNT, Count: Wait.SampleCnt);
1092	applyWaitcnt(T: BVH_CNT, Count: Wait.BvhCnt);
1093	applyWaitcnt(T: KM_CNT, Count: Wait.KmCnt);
1094	}
1095
1096	void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1097	const unsigned UB = getScoreUB(T);
1098	if (Count >= UB)
1099	return;
1100	if (Count != `0`) {
1101	if (counterOutOfOrder(T))
1102	return;
1103	setScoreLB(T, Val: std::max(a: getScoreLB(T), b: UB - Count));
1104	} else {
1105	setScoreLB(T, Val: UB);
1106	PendingEvents &= ~WaitEventMaskForInst[T];
1107	}
1108	}
1109
1110	// Where there are multiple types of event in the bracket of a counter,
1111	// the decrement may go out of order.
1112	bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1113	// Scalar memory read always can go out of order.
1114	if (T == SmemAccessCounter && hasPendingEvent(E: SMEM_ACCESS))
1115	return true;
1116	return hasMixedPendingEvents(T);
1117	}
1118
1119	INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1120	false)
1121	INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1122	INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1123	INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1124	false)
1125
1126	char SIInsertWaitcnts::ID = `0`;
1127
1128	char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1129
1130	FunctionPass *llvm::createSIInsertWaitcntsPass() {
1131	return new SIInsertWaitcnts ();
1132	}
1133
1134	static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
1135	unsigned NewEnc) {
1136	int OpIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: OpName);
1137	assert(OpIdx >= `0`);
1138
1139	MachineOperand &MO = MI.getOperand(i: OpIdx);
1140
1141	if (NewEnc == MO.getImm())
1142	return false;
1143
1144	MO.setImm(NewEnc);
1145	return true;
1146	}
1147
1148	/// Determine if \p MI is a gfx12+ single-counter S_WAIT_CNT instruction,*
1149	/// and if so, which counter it is waiting on.
1150	static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1151	switch (Opcode) {
1152	case AMDGPU::S_WAIT_LOADCNT:
1153	return LOAD_CNT;
1154	case AMDGPU::S_WAIT_EXPCNT:
1155	return EXP_CNT;
1156	case AMDGPU::S_WAIT_STORECNT:
1157	return STORE_CNT;
1158	case AMDGPU::S_WAIT_SAMPLECNT:
1159	return SAMPLE_CNT;
1160	case AMDGPU::S_WAIT_BVHCNT:
1161	return BVH_CNT;
1162	case AMDGPU::S_WAIT_DSCNT:
1163	return DS_CNT;
1164	case AMDGPU::S_WAIT_KMCNT:
1165	return KM_CNT;
1166	default:
1167	return {};
1168	}
1169	}
1170
1171	bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr Waitcnt) const* {
1172	unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Waitcnt->getOpcode());
1173	if (Opcode == Waitcnt->getOpcode())
1174	return false;
1175
1176	Waitcnt->setDesc(TII->get(Opcode));
1177	return true;
1178	}
1179
1180	/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1181	/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1182	/// from \p Wait that were added by previous passes. Currently this pass
1183	/// conservatively assumes that these preexisting waits are required for
1184	/// correctness.
1185	bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1186	WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1187	AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1188	assert(ST);
1189	assert(isNormalMode(MaxCounter));
1190
1191	bool Modified = false;
1192	MachineInstr WaitcntInstr = nullptr*;
1193	MachineInstr WaitcntVsCntInstr = nullptr*;
1194
1195	for (auto &II :
1196	make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
1197	if (II.isMetaInstruction())
1198	continue;
1199
1200	unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
1201	bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1202
1203	// Update required wait count. If this is a soft waitcnt (= it was added
1204	// by an earlier pass), it may be entirely removed.
1205	if (Opcode == AMDGPU::S_WAITCNT) {
1206	unsigned IEnc = II.getOperand(i: `0`).getImm();
1207	AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(Version: IV, Encoded: IEnc);
1208	if (TrySimplify)
1209	ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1210	Wait = Wait.combined(Other: OldWait);
1211
1212	// Merge consecutive waitcnt of the same type by erasing multiples.
1213	if (WaitcntInstr \|\| (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1214	II.eraseFromParent();
1215	Modified = true;
1216	} else
1217	WaitcntInstr = &II;
1218	} else {
1219	assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1220	assert(II.getOperand(`0`).getReg() == AMDGPU::SGPR_NULL);
1221
1222	unsigned OldVSCnt =
1223	TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1224	if (TrySimplify)
1225	ScoreBrackets.simplifyWaitcnt(T: InstCounterType::STORE_CNT, Count&: OldVSCnt);
1226	Wait.StoreCnt = std::min(a: Wait.StoreCnt, b: OldVSCnt);
1227
1228	if (WaitcntVsCntInstr \|\| (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1229	II.eraseFromParent();
1230	Modified = true;
1231	} else
1232	WaitcntVsCntInstr = &II;
1233	}
1234	}
1235
1236	if (WaitcntInstr) {
1237	Modified \|= updateOperandIfDifferent(MI&: *WaitcntInstr, OpName: AMDGPU::OpName::simm16,
1238	NewEnc: AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait));
1239	Modified \|= promoteSoftWaitCnt(Waitcnt: WaitcntInstr);
1240
1241	ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
1242	ScoreBrackets.applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt);
1243	ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1244	Wait.LoadCnt = ~`0u`;
1245	Wait.ExpCnt = ~`0u`;
1246	Wait.DsCnt = ~`0u`;
1247
1248	LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1249	? dbgs()
1250	<< "applyPreexistingWaitcnt\n"
1251	<< "New Instr at block end: " << *WaitcntInstr << `'\n'`
1252	: dbgs() << "applyPreexistingWaitcnt\n"
1253	<< "Old Instr: " << *It
1254	<< "New Instr: " << *WaitcntInstr << `'\n'`);
1255	}
1256
1257	if (WaitcntVsCntInstr) {
1258	Modified \|= updateOperandIfDifferent(MI&: *WaitcntVsCntInstr,
1259	OpName: AMDGPU::OpName::simm16, NewEnc: Wait.StoreCnt);
1260	Modified \|= promoteSoftWaitCnt(Waitcnt: WaitcntVsCntInstr);
1261
1262	ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
1263	Wait.StoreCnt = ~`0u`;
1264
1265	LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1266	? dbgs() << "applyPreexistingWaitcnt\n"
1267	<< "New Instr at block end: " << *WaitcntVsCntInstr
1268	<< `'\n'`
1269	: dbgs() << "applyPreexistingWaitcnt\n"
1270	<< "Old Instr: " << *It
1271	<< "New Instr: " << *WaitcntVsCntInstr << `'\n'`);
1272	}
1273
1274	return Modified;
1275	}
1276
1277	/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1278	/// required counters in \p Wait
1279	bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1280	MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1281	AMDGPU::Waitcnt Wait) {
1282	assert(ST);
1283	assert(isNormalMode(MaxCounter));
1284
1285	bool Modified = false;
1286	const DebugLoc &DL = Block.findDebugLoc(MBBI: It);
1287
1288	// Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1289	// single instruction while VScnt has its own instruction.
1290	if (Wait.hasWaitExceptStoreCnt()) {
1291	unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait);
1292	[[maybe_unused]] auto SWaitInst =
1293	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: Enc);
1294	Modified = true;
1295
1296	LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1297	if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1298	dbgs() << "New Instr: " << *SWaitInst << `'\n'`);
1299	}
1300
1301	if (Wait.hasWaitStoreCnt()) {
1302	assert(ST->hasVscnt());
1303
1304	[[maybe_unused]] auto SWaitInst =
1305	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
1306	.addReg(RegNo: AMDGPU::SGPR_NULL, flags: RegState::Undef)
1307	.addImm(Val: Wait.StoreCnt);
1308	Modified = true;
1309
1310	LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1311	if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1312	dbgs() << "New Instr: " << *SWaitInst << `'\n'`);
1313	}
1314
1315	return Modified;
1316	}
1317
1318	AMDGPU::Waitcnt
1319	WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1320	return AMDGPU::Waitcnt (`0`, `0`, `0`, IncludeVSCnt && ST->hasVscnt() ? `0` : ~`0u`);
1321	}
1322
1323	AMDGPU::Waitcnt
1324	WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1325	return AMDGPU::Waitcnt (`0`, `0`, `0`, IncludeVSCnt ? `0` : ~`0u`, `0`, `0`, `0`);
1326	}
1327
1328	/// Combine consecutive S_WAIT_CNT instructions that precede \p It and*
1329	/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1330	/// were added by previous passes. Currently this pass conservatively
1331	/// assumes that these preexisting waits are required for correctness.
1332	bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1333	WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1334	AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1335	assert(ST);
1336	assert(!isNormalMode(MaxCounter));
1337
1338	bool Modified = false;
1339	MachineInstr CombinedLoadDsCntInstr = nullptr*;
1340	MachineInstr CombinedStoreDsCntInstr = nullptr*;
1341	MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1342
1343	for (auto &II :
1344	make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
1345	if (II.isMetaInstruction())
1346	continue;
1347
1348	MachineInstr **UpdatableInstr;
1349
1350	// Update required wait count. If this is a soft waitcnt (= it was added
1351	// by an earlier pass), it may be entirely removed.
1352
1353	unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
1354	bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1355
1356	// Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1357	// attempt to do more than that either.
1358	if (Opcode == AMDGPU::S_WAITCNT)
1359	continue;
1360
1361	if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1362	unsigned OldEnc =
1363	TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1364	AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(Version: IV, LoadcntDscnt: OldEnc);
1365	if (TrySimplify)
1366	ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1367	Wait = Wait.combined(Other: OldWait);
1368	UpdatableInstr = &CombinedLoadDsCntInstr;
1369	} else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1370	unsigned OldEnc =
1371	TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1372	AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(Version: IV, StorecntDscnt: OldEnc);
1373	if (TrySimplify)
1374	ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1375	Wait = Wait.combined(Other: OldWait);
1376	UpdatableInstr = &CombinedStoreDsCntInstr;
1377	} else {
1378	std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1379	assert(CT.has_value());
1380	unsigned OldCnt =
1381	TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1382	if (TrySimplify)
1383	ScoreBrackets.simplifyWaitcnt(T: CT.value(), Count&: OldCnt);
1384	addWait(Wait, T: CT.value(), Count: OldCnt);
1385	UpdatableInstr = &WaitInstrs[CT.value()];
1386	}
1387
1388	// Merge consecutive waitcnt of the same type by erasing multiples.
1389	if (!*UpdatableInstr) {
1390	*UpdatableInstr = &II;
1391	} else {
1392	II.eraseFromParent();
1393	Modified = true;
1394	}
1395	}
1396
1397	if (CombinedLoadDsCntInstr) {
1398	// Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1399	// to be waited for. Otherwise, let the instruction be deleted so
1400	// the appropriate single counter wait instruction can be inserted
1401	// instead, when new S_WAIT_CNT instructions are inserted by*
1402	// createNewWaitcnt(). As a side effect, resetting the wait counts will
1403	// cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1404	// the loop below that deals with single counter instructions.
1405	if (Wait.LoadCnt != ~`0u` && Wait.DsCnt != ~`0u`) {
1406	unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
1407	Modified \|= updateOperandIfDifferent(MI&: *CombinedLoadDsCntInstr,
1408	OpName: AMDGPU::OpName::simm16, NewEnc);
1409	Modified \|= promoteSoftWaitCnt(Waitcnt: CombinedLoadDsCntInstr);
1410	ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
1411	ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1412	Wait.LoadCnt = ~`0u`;
1413	Wait.DsCnt = ~`0u`;
1414
1415	LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1416	? dbgs() << "applyPreexistingWaitcnt\n"
1417	<< "New Instr at block end: "
1418	<< *CombinedLoadDsCntInstr << `'\n'`
1419	: dbgs() << "applyPreexistingWaitcnt\n"
1420	<< "Old Instr: " << *It << "New Instr: "
1421	<< *CombinedLoadDsCntInstr << `'\n'`);
1422	} else {
1423	CombinedLoadDsCntInstr->eraseFromParent();
1424	Modified = true;
1425	}
1426	}
1427
1428	if (CombinedStoreDsCntInstr) {
1429	// Similarly for S_WAIT_STORECNT_DSCNT.
1430	if (Wait.StoreCnt != ~`0u` && Wait.DsCnt != ~`0u`) {
1431	unsigned NewEnc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
1432	Modified \|= updateOperandIfDifferent(MI&: *CombinedStoreDsCntInstr,
1433	OpName: AMDGPU::OpName::simm16, NewEnc);
1434	Modified \|= promoteSoftWaitCnt(Waitcnt: CombinedStoreDsCntInstr);
1435	ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
1436	ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1437	Wait.StoreCnt = ~`0u`;
1438	Wait.DsCnt = ~`0u`;
1439
1440	LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1441	? dbgs() << "applyPreexistingWaitcnt\n"
1442	<< "New Instr at block end: "
1443	<< *CombinedStoreDsCntInstr << `'\n'`
1444	: dbgs() << "applyPreexistingWaitcnt\n"
1445	<< "Old Instr: " << *It << "New Instr: "
1446	<< *CombinedStoreDsCntInstr << `'\n'`);
1447	} else {
1448	CombinedStoreDsCntInstr->eraseFromParent();
1449	Modified = true;
1450	}
1451	}
1452
1453	// Look for an opportunity to convert existing S_WAIT_LOADCNT,
1454	// S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1455	// or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1456	// instructions so that createNewWaitcnt() will create new combined
1457	// instructions to replace them.
1458
1459	if (Wait.DsCnt != ~`0u`) {
1460	// This is a vector of addresses in WaitInstrs pointing to instructions
1461	// that should be removed if they are present.
1462	SmallVector<MachineInstr **, `2`> WaitsToErase;
1463
1464	// If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1465	// both) need to be waited for, ensure that there are no existing
1466	// individual wait count instructions for these.
1467
1468	if (Wait.LoadCnt != ~`0u`) {
1469	WaitsToErase.push_back(Elt: &WaitInstrs[LOAD_CNT]);
1470	WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
1471	} else if (Wait.StoreCnt != ~`0u`) {
1472	WaitsToErase.push_back(Elt: &WaitInstrs[STORE_CNT]);
1473	WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
1474	}
1475
1476	for (MachineInstr **WI : WaitsToErase) {
1477	if (!*WI)
1478	continue;
1479
1480	(*WI)->eraseFromParent();
1481	WI = nullptr*;
1482	Modified = true;
1483	}
1484	}
1485
1486	for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
1487	if (!WaitInstrs[CT])
1488	continue;
1489
1490	unsigned NewCnt = getWait(Wait, T: CT);
1491	if (NewCnt != ~`0u`) {
1492	Modified \|= updateOperandIfDifferent(MI&: *WaitInstrs[CT],
1493	OpName: AMDGPU::OpName::simm16, NewEnc: NewCnt);
1494	Modified \|= promoteSoftWaitCnt(Waitcnt: WaitInstrs[CT]);
1495
1496	ScoreBrackets.applyWaitcnt(T: CT, Count: NewCnt);
1497	setNoWait(Wait, T: CT);
1498
1499	LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1500	? dbgs() << "applyPreexistingWaitcnt\n"
1501	<< "New Instr at block end: " << *WaitInstrs[CT]
1502	<< `'\n'`
1503	: dbgs() << "applyPreexistingWaitcnt\n"
1504	<< "Old Instr: " << *It
1505	<< "New Instr: " << *WaitInstrs[CT] << `'\n'`);
1506	} else {
1507	WaitInstrs[CT]->eraseFromParent();
1508	Modified = true;
1509	}
1510	}
1511
1512	return Modified;
1513	}
1514
1515	/// Generate S_WAIT_CNT instructions for any required counters in \p Wait*
1516	bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1517	MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1518	AMDGPU::Waitcnt Wait) {
1519	assert(ST);
1520	assert(!isNormalMode(MaxCounter));
1521
1522	bool Modified = false;
1523	const DebugLoc &DL = Block.findDebugLoc(MBBI: It);
1524
1525	// Check for opportunities to use combined wait instructions.
1526	if (Wait.DsCnt != ~`0u`) {
1527	MachineInstr SWaitInst = nullptr*;
1528
1529	if (Wait.LoadCnt != ~`0u`) {
1530	unsigned Enc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
1531
1532	SWaitInst = BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT))
1533	.addImm(Val: Enc);
1534
1535	Wait.LoadCnt = ~`0u`;
1536	Wait.DsCnt = ~`0u`;
1537	} else if (Wait.StoreCnt != ~`0u`) {
1538	unsigned Enc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
1539
1540	SWaitInst =
1541	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_STORECNT_DSCNT))
1542	.addImm(Val: Enc);
1543
1544	Wait.StoreCnt = ~`0u`;
1545	Wait.DsCnt = ~`0u`;
1546	}
1547
1548	if (SWaitInst) {
1549	Modified = true;
1550
1551	LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1552	if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1553	dbgs() << "New Instr: " << *SWaitInst << `'\n'`);
1554	}
1555	}
1556
1557	// Generate an instruction for any remaining counter that needs
1558	// waiting for.
1559
1560	for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
1561	unsigned Count = getWait(Wait, T: CT);
1562	if (Count == ~`0u`)
1563	continue;
1564
1565	[[maybe_unused]] auto SWaitInst =
1566	BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: instrsForExtendedCounterTypes[CT]))
1567	.addImm(Val: Count);
1568
1569	Modified = true;
1570
1571	LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1572	if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1573	dbgs() << "New Instr: " << *SWaitInst << `'\n'`);
1574	}
1575
1576	return Modified;
1577	}
1578
1579	static bool readsVCCZ(const MachineInstr &MI) {
1580	unsigned Opc = MI.getOpcode();
1581	return (Opc == AMDGPU::S_CBRANCH_VCCNZ \|\| Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1582	!MI.getOperand(i: `1`).isUndef();
1583	}
1584
1585	/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1586	static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1587	// Currently all conventions wait, but this may not always be the case.
1588	//
1589	// TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1590	// senses to omit the wait and do it in the caller.
1591	return true;
1592	}
1593
1594	/// \returns true if the callee is expected to wait for any outstanding waits
1595	/// before returning.
1596	static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
1597	return true;
1598	}
1599
1600	/// Generate s_waitcnt instruction to be placed before cur_Inst.
1601	/// Instructions of a given type are returned in order,
1602	/// but instructions of different types can complete out of order.
1603	/// We rely on this in-order completion
1604	/// and simply assign a score to the memory access instructions.
1605	/// We keep track of the active "score bracket" to determine
1606	/// if an access of a memory read requires an s_waitcnt
1607	/// and if so what the value of each counter is.
1608	/// The "score bracket" is bound by the lower bound and upper bound
1609	/// scores (_score_LB and _score_ub respectively).
1610	/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
1611	/// flush the vmcnt counter here.
1612	bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1613	WaitcntBrackets &ScoreBrackets,
1614	MachineInstr *OldWaitcntInstr,
1615	bool FlushVmCnt) {
1616	setForceEmitWaitcnt();
1617
1618	if (MI.isMetaInstruction())
1619	return false;
1620
1621	AMDGPU::Waitcnt Wait;
1622
1623	// FIXME: This should have already been handled by the memory legalizer.
1624	// Removing this currently doesn't affect any lit tests, but we need to
1625	// verify that nothing was relying on this. The number of buffer invalidates
1626	// being handled here should not be expanded.
1627	if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 \|\|
1628	MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC \|\|
1629	MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL \|\|
1630	MI.getOpcode() == AMDGPU::BUFFER_GL0_INV \|\|
1631	MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1632	Wait.LoadCnt = `0`;
1633	}
1634
1635	// All waits must be resolved at call return.
1636	// NOTE: this could be improved with knowledge of all call sites or
1637	// with knowledge of the called routines.
1638	if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG \|\|
1639	MI.getOpcode() == AMDGPU::SI_RETURN \|\|
1640	MI.getOpcode() == AMDGPU::S_SETPC_B64_return \|\|
1641	(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1642	Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/IncludeVSCnt=/false));
1643	}
1644	// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1645	// stores. In this case it can be useful to send a message to explicitly
1646	// release all VGPRs before the stores have completed, but it is only safe to
1647	// do this if:
1648	// there are no outstanding scratch stores*
1649	// we are not in Dynamic VGPR mode*
1650	else if (MI.getOpcode() == AMDGPU::S_ENDPGM \|\|
1651	MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1652	if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
1653	ScoreBrackets.getScoreRange(T: STORE_CNT) != `0` &&
1654	!ScoreBrackets.hasPendingEvent(E: SCRATCH_WRITE_ACCESS))
1655	ReleaseVGPRInsts.insert(V: &MI);
1656	}
1657	// Resolve vm waits before gs-done.
1658	else if ((MI.getOpcode() == AMDGPU::S_SENDMSG \|\|
1659	MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1660	ST->hasLegacyGeometry() &&
1661	((MI.getOperand(i: `0`).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1662	AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1663	Wait.LoadCnt = `0`;
1664	}
1665
1666	// Export & GDS instructions do not read the EXEC mask until after the export
1667	// is granted (which can occur well after the instruction is issued).
1668	// The shader program must flush all EXP operations on the export-count
1669	// before overwriting the EXEC mask.
1670	else {
1671	if (MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI)) {
1672	// Export and GDS are tracked individually, either may trigger a waitcnt
1673	// for EXEC.
1674	if (ScoreBrackets.hasPendingEvent(E: EXP_GPR_LOCK) \|\|
1675	ScoreBrackets.hasPendingEvent(E: EXP_PARAM_ACCESS) \|\|
1676	ScoreBrackets.hasPendingEvent(E: EXP_POS_ACCESS) \|\|
1677	ScoreBrackets.hasPendingEvent(E: GDS_GPR_LOCK)) {
1678	Wait.ExpCnt = `0`;
1679	}
1680	}
1681
1682	if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1683	// The function is going to insert a wait on everything in its prolog.
1684	// This still needs to be careful if the call target is a load (e.g. a GOT
1685	// load). We also need to check WAW dependency with saved PC.
1686	Wait = AMDGPU::Waitcnt ();
1687
1688	int CallAddrOpIdx =
1689	AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::src0);
1690
1691	if (MI.getOperand(i: CallAddrOpIdx).isReg()) {
1692	RegInterval CallAddrOpInterval =
1693	ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: CallAddrOpIdx);
1694
1695	for (int RegNo = CallAddrOpInterval.first;
1696	RegNo < CallAddrOpInterval.second; ++RegNo)
1697	ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait);
1698
1699	int RtnAddrOpIdx =
1700	AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::dst);
1701	if (RtnAddrOpIdx != -`1`) {
1702	RegInterval RtnAddrOpInterval =
1703	ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: RtnAddrOpIdx);
1704
1705	for (int RegNo = RtnAddrOpInterval.first;
1706	RegNo < RtnAddrOpInterval.second; ++RegNo)
1707	ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait);
1708	}
1709	}
1710	} else {
1711	// FIXME: Should not be relying on memoperands.
1712	// Look at the source operands of every instruction to see if
1713	// any of them results from a previous memory operation that affects
1714	// its current usage. If so, an s_waitcnt instruction needs to be
1715	// emitted.
1716	// If the source operand was defined by a load, add the s_waitcnt
1717	// instruction.
1718	//
1719	// Two cases are handled for destination operands:
1720	// 1) If the destination operand was defined by a load, add the s_waitcnt
1721	// instruction to guarantee the right WAW order.
1722	// 2) If a destination operand that was used by a recent export/store ins,
1723	// add s_waitcnt on exp_cnt to guarantee the WAR order.
1724
1725	for (const MachineMemOperand *Memop : MI.memoperands()) {
1726	const Value *Ptr = Memop->getValue();
1727	if (Memop->isStore() && SLoadAddresses.count(Val: Ptr)) {
1728	addWait(Wait, T: SmemAccessCounter, Count: `0`);
1729	if (PDT->dominates(A: MI.getParent(), B: SLoadAddresses.find(Val: Ptr)->second))
1730	SLoadAddresses.erase(Val: Ptr);
1731	}
1732	unsigned AS = Memop->getAddrSpace();
1733	if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1734	continue;
1735	// No need to wait before load from VMEM to LDS.
1736	if (TII->mayWriteLDSThroughDMA(MI))
1737	continue;
1738
1739	// LOAD_CNT is only relevant to vgpr or LDS.
1740	unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1741	bool FoundAliasingStore = false;
1742	// Only objects with alias scope info were added to LDSDMAScopes array.
1743	// In the absense of the scope info we will not be able to disambiguate
1744	// aliasing here. There is no need to try searching for a corresponding
1745	// store slot. This is conservatively correct because in that case we
1746	// will produce a wait using the first (general) LDS DMA wait slot which
1747	// will wait on all of them anyway.
1748	if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1749	const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1750	for (unsigned I = `0`, E = LDSDMAStores.size(); I != E; ++I) {
1751	if (MI.mayAlias(AA, Other: LDSDMAStores [I], UseTBAA: true*)) {
1752	FoundAliasingStore = true;
1753	ScoreBrackets.determineWait(T: LOAD_CNT, RegNo: RegNo + I + `1`, Wait);
1754	}
1755	}
1756	}
1757	if (!FoundAliasingStore)
1758	ScoreBrackets.determineWait(T: LOAD_CNT, RegNo, Wait);
1759	if (Memop->isStore()) {
1760	ScoreBrackets.determineWait(T: EXP_CNT, RegNo, Wait);
1761	}
1762	}
1763
1764	// Loop over use and def operands.
1765	for (unsigned I = `0`, E = MI.getNumOperands(); I != E; ++I) {
1766	MachineOperand &Op = MI.getOperand(i: I);
1767	if (!Op.isReg())
1768	continue;
1769
1770	// If the instruction does not read tied source, skip the operand.
1771	if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1772	continue;
1773
1774	RegInterval Interval = ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: I);
1775
1776	const bool IsVGPR = TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg());
1777	for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1778	if (IsVGPR) {
1779	// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1780	// previous write and this write are the same type of VMEM
1781	// instruction, in which case they are (in some architectures)
1782	// guaranteed to write their results in order anyway.
1783	if (Op.isUse() \|\| !updateVMCntOnly(Inst: MI) \|\|
1784	ScoreBrackets.hasOtherPendingVmemTypes(GprNo: RegNo,
1785	V: getVmemType(Inst: MI)) \|\|
1786	!ST->hasVmemWriteVgprInOrder()) {
1787	ScoreBrackets.determineWait(T: LOAD_CNT, RegNo, Wait);
1788	ScoreBrackets.determineWait(T: SAMPLE_CNT, RegNo, Wait);
1789	ScoreBrackets.determineWait(T: BVH_CNT, RegNo, Wait);
1790	ScoreBrackets.clearVgprVmemTypes(GprNo: RegNo);
1791	}
1792	if (Op.isDef() \|\| ScoreBrackets.hasPendingEvent(E: EXP_LDS_ACCESS)) {
1793	ScoreBrackets.determineWait(T: EXP_CNT, RegNo, Wait);
1794	}
1795	ScoreBrackets.determineWait(T: DS_CNT, RegNo, Wait);
1796	} else {
1797	ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait);
1798	}
1799	}
1800	}
1801	}
1802	}
1803
1804	// The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1805	// not, we need to ensure the subtarget is capable of backing off barrier
1806	// instructions in case there are any outstanding memory operations that may
1807	// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1808	if (TII->isBarrierStart(Opcode: MI.getOpcode()) &&
1809	!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1810	Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/IncludeVSCnt=/true));
1811	}
1812
1813	// TODO: Remove this work-around, enable the assert for Bug 457939
1814	// after fixing the scheduler. Also, the Shader Compiler code is
1815	// independent of target.
1816	if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1817	if (ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
1818	Wait.DsCnt = `0`;
1819	}
1820	}
1821
1822	// Verify that the wait is actually needed.
1823	ScoreBrackets.simplifyWaitcnt(Wait);
1824
1825	if (ForceEmitZeroWaitcnts)
1826	Wait = WCG->getAllZeroWaitcnt(/IncludeVSCnt=/false);
1827
1828	if (ForceEmitWaitcnt[LOAD_CNT])
1829	Wait.LoadCnt = `0`;
1830	if (ForceEmitWaitcnt[EXP_CNT])
1831	Wait.ExpCnt = `0`;
1832	if (ForceEmitWaitcnt[DS_CNT])
1833	Wait.DsCnt = `0`;
1834	if (ForceEmitWaitcnt[SAMPLE_CNT])
1835	Wait.SampleCnt = `0`;
1836	if (ForceEmitWaitcnt[BVH_CNT])
1837	Wait.BvhCnt = `0`;
1838	if (ForceEmitWaitcnt[KM_CNT])
1839	Wait.KmCnt = `0`;
1840
1841	if (FlushVmCnt) {
1842	if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT))
1843	Wait.LoadCnt = `0`;
1844	if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT))
1845	Wait.SampleCnt = `0`;
1846	if (ScoreBrackets.hasPendingEvent(T: BVH_CNT))
1847	Wait.BvhCnt = `0`;
1848	}
1849
1850	return generateWaitcnt(Wait, It: MI.getIterator(), Block&: *MI.getParent(), ScoreBrackets,
1851	OldWaitcntInstr);
1852	}
1853
1854	bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1855	MachineBasicBlock::instr_iterator It,
1856	MachineBasicBlock &Block,
1857	WaitcntBrackets &ScoreBrackets,
1858	MachineInstr *OldWaitcntInstr) {
1859	bool Modified = false;
1860
1861	if (OldWaitcntInstr)
1862	// Try to merge the required wait with preexisting waitcnt instructions.
1863	// Also erase redundant waitcnt.
1864	Modified =
1865	WCG->applyPreexistingWaitcnt(ScoreBrackets, OldWaitcntInstr&: *OldWaitcntInstr, Wait, It);
1866
1867	// Any counts that could have been applied to any existing waitcnt
1868	// instructions will have been done so, now deal with any remaining.
1869	ScoreBrackets.applyWaitcnt(Wait);
1870
1871	// ExpCnt can be merged into VINTERP.
1872	if (Wait.ExpCnt != ~`0u` && It != Block.instr_end() &&
1873	SIInstrInfo::isVINTERP(MI: *It)) {
1874	MachineOperand *WaitExp =
1875	TII->getNamedOperand(MI&: *It, OperandName: AMDGPU::OpName::waitexp);
1876	if (Wait.ExpCnt < WaitExp->getImm()) {
1877	WaitExp->setImm(Wait.ExpCnt);
1878	Modified = true;
1879	}
1880	Wait.ExpCnt = ~`0u`;
1881
1882	LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
1883	<< "Update Instr: " << *It);
1884	}
1885
1886	if (WCG->createNewWaitcnt(Block, It, Wait))
1887	Modified = true;
1888
1889	return Modified;
1890	}
1891
1892	// This is a flat memory operation. Check to see if it has memory tokens other
1893	// than LDS. Other address spaces supported by flat memory operations involve
1894	// global memory.
1895	bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1896	assert(TII->isFLAT(MI));
1897
1898	// All flat instructions use the VMEM counter.
1899	assert(TII->usesVM_CNT(MI));
1900
1901	// If there are no memory operands then conservatively assume the flat
1902	// operation may access VMEM.
1903	if (MI.memoperands_empty())
1904	return true;
1905
1906	// See if any memory operand specifies an address space that involves VMEM.
1907	// Flat operations only supported FLAT, LOCAL (LDS), or address spaces
1908	// involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1909	// (GDS) address space is not supported by flat operations. Therefore, simply
1910	// return true unless only the LDS address space is found.
1911	for (const MachineMemOperand *Memop : MI.memoperands()) {
1912	unsigned AS = Memop->getAddrSpace();
1913	assert(AS != AMDGPUAS::REGION_ADDRESS);
1914	if (AS != AMDGPUAS::LOCAL_ADDRESS)
1915	return true;
1916	}
1917
1918	return false;
1919	}
1920
1921	// This is a flat memory operation. Check to see if it has memory tokens for
1922	// either LDS or FLAT.
1923	bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1924	assert(TII->isFLAT(MI));
1925
1926	// Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
1927	if (!TII->usesLGKM_CNT(MI))
1928	return false;
1929
1930	// If in tgsplit mode then there can be no use of LDS.
1931	if (ST->isTgSplitEnabled())
1932	return false;
1933
1934	// If there are no memory operands then conservatively assume the flat
1935	// operation may access LDS.
1936	if (MI.memoperands_empty())
1937	return true;
1938
1939	// See if any memory operand specifies an address space that involves LDS.
1940	for (const MachineMemOperand *Memop : MI.memoperands()) {
1941	unsigned AS = Memop->getAddrSpace();
1942	if (AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::FLAT_ADDRESS)
1943	return true;
1944	}
1945
1946	return false;
1947	}
1948
1949	// This is a flat memory operation. Check to see if it has memory tokens for
1950	// either scratch or FLAT.
1951	bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
1952	const MachineInstr &MI) const {
1953	assert(TII->isFLAT(MI));
1954
1955	// SCRATCH instructions always access scratch.
1956	if (TII->isFLATScratch(MI))
1957	return true;
1958
1959	// GLOBAL instructions never access scratch.
1960	if (TII->isFLATGlobal(MI))
1961	return false;
1962
1963	// If there are no memory operands then conservatively assume the flat
1964	// operation may access scratch.
1965	if (MI.memoperands_empty())
1966	return true;
1967
1968	// See if any memory operand specifies an address space that involves scratch.
1969	return any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *Memop) {
1970	unsigned AS = Memop->getAddrSpace();
1971	return AS == AMDGPUAS::PRIVATE_ADDRESS \|\| AS == AMDGPUAS::FLAT_ADDRESS;
1972	});
1973	}
1974
1975	static bool isCacheInvOrWBInst(MachineInstr &Inst) {
1976	auto Opc = Inst.getOpcode();
1977	return Opc == AMDGPU::GLOBAL_INV \|\| Opc == AMDGPU::GLOBAL_WB \|\|
1978	Opc == AMDGPU::GLOBAL_WBINV;
1979	}
1980
1981	void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1982	WaitcntBrackets *ScoreBrackets) {
1983	// Now look at the instruction opcode. If it is a memory access
1984	// instruction, update the upper-bound of the appropriate counter's
1985	// bracket and the destination operand scores.
1986	// TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
1987
1988	if (TII->isDS(MI: Inst) && TII->usesLGKM_CNT(MI: Inst)) {
1989	if (TII->isAlwaysGDS(Opcode: Inst.getOpcode()) \|\|
1990	TII->hasModifiersSet(MI: Inst, OpName: AMDGPU::OpName::gds)) {
1991	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_ACCESS, Inst);
1992	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_GPR_LOCK, Inst);
1993	} else {
1994	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst);
1995	}
1996	} else if (TII->isFLAT(MI: Inst)) {
1997	// TODO: Track this properly.
1998	if (isCacheInvOrWBInst(Inst))
1999	return;
2000
2001	assert(Inst.mayLoadOrStore());
2002
2003	int FlatASCount = `0`;
2004
2005	if (mayAccessVMEMThroughFlat(MI: Inst)) {
2006	++FlatASCount;
2007	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
2008	Inst);
2009	}
2010
2011	if (mayAccessLDSThroughFlat(MI: Inst)) {
2012	++FlatASCount;
2013	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst);
2014	}
2015
2016	// A Flat memory operation must access at least one address space.
2017	assert(FlatASCount);
2018
2019	// This is a flat memory operation that access both VMEM and LDS, so note it
2020	// - it will require that both the VM and LGKM be flushed to zero if it is
2021	// pending when a VM or LGKM dependency occurs.
2022	if (FlatASCount > `1`)
2023	ScoreBrackets->setPendingFlat();
2024	} else if (SIInstrInfo::isVMEM(MI: Inst) &&
2025	!llvm::AMDGPU::getMUBUFIsBufferInv(Opc: Inst.getOpcode())) {
2026	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
2027	Inst);
2028
2029	if (ST->vmemWriteNeedsExpWaitcnt() &&
2030	(Inst.mayStore() \|\| SIInstrInfo::isAtomicRet(MI: Inst))) {
2031	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: VMW_GPR_LOCK, Inst);
2032	}
2033	} else if (TII->isSMRD(MI: Inst)) {
2034	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst);
2035	} else if (Inst.isCall()) {
2036	if (callWaitsOnFunctionReturn(MI: Inst)) {
2037	// Act as a wait on everything
2038	ScoreBrackets->applyWaitcnt(
2039	Wait: WCG->getAllZeroWaitcnt(/IncludeVSCnt=/false));
2040	ScoreBrackets->setStateOnFunctionEntryOrReturn();
2041	} else {
2042	// May need to way wait for anything.
2043	ScoreBrackets->applyWaitcnt(Wait: AMDGPU::Waitcnt ());
2044	}
2045	} else if (SIInstrInfo::isLDSDIR(MI: Inst)) {
2046	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_LDS_ACCESS, Inst);
2047	} else if (TII->isVINTERP(MI: Inst)) {
2048	int64_t Imm = TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::waitexp)->getImm();
2049	ScoreBrackets->applyWaitcnt(T: EXP_CNT, Count: Imm);
2050	} else if (SIInstrInfo::isEXP(MI: Inst)) {
2051	unsigned Imm = TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::tgt)->getImm();
2052	if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2053	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_PARAM_ACCESS, Inst);
2054	else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2055	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_POS_ACCESS, Inst);
2056	else
2057	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_GPR_LOCK, Inst);
2058	} else {
2059	switch (Inst.getOpcode()) {
2060	case AMDGPU::S_SENDMSG:
2061	case AMDGPU::S_SENDMSG_RTN_B32:
2062	case AMDGPU::S_SENDMSG_RTN_B64:
2063	case AMDGPU::S_SENDMSGHALT:
2064	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SQ_MESSAGE, Inst);
2065	break;
2066	case AMDGPU::S_MEMTIME:
2067	case AMDGPU::S_MEMREALTIME:
2068	case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2069	case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2070	case AMDGPU::S_BARRIER_LEAVE:
2071	case AMDGPU::S_GET_BARRIER_STATE_M0:
2072	case AMDGPU::S_GET_BARRIER_STATE_IMM:
2073	ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst);
2074	break;
2075	}
2076	}
2077	}
2078
2079	bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2080	unsigned OtherScore) {
2081	unsigned MyShifted = Score <= M.OldLB ? `0` : Score + M.MyShift;
2082	unsigned OtherShifted =
2083	OtherScore <= M.OtherLB ? `0` : OtherScore + M.OtherShift;
2084	Score = std::max(a: MyShifted, b: OtherShifted);
2085	return OtherShifted > MyShifted;
2086	}
2087
2088	/// Merge the pending events and associater score brackets of \p Other into
2089	/// this brackets status.
2090	///
2091	/// Returns whether the merge resulted in a change that requires tighter waits
2092	/// (i.e. the merged brackets strictly dominate the original brackets).
2093	bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2094	bool StrictDom = false;
2095
2096	VgprUB = std::max(a: VgprUB, b: Other.VgprUB);
2097	SgprUB = std::max(a: SgprUB, b: Other.SgprUB);
2098
2099	for (auto T : inst_counter_types(MaxCounter)) {
2100	// Merge event flags for this counter
2101	const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2102	const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2103	if (OtherEvents & ~OldEvents)
2104	StrictDom = true;
2105	PendingEvents \|= OtherEvents;
2106
2107	// Merge scores for this counter
2108	const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2109	const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2110	const unsigned NewUB = ScoreLBs[T] + std::max(a: MyPending, b: OtherPending);
2111	if (NewUB < ScoreLBs[T])
2112	report_fatal_error(reason: "waitcnt score overflow");
2113
2114	MergeInfo M;
2115	M.OldLB = ScoreLBs[T];
2116	M.OtherLB = Other.ScoreLBs[T];
2117	M.MyShift = NewUB - ScoreUBs[T];
2118	M.OtherShift = NewUB - Other.ScoreUBs[T];
2119
2120	ScoreUBs[T] = NewUB;
2121
2122	StrictDom \|= mergeScore(M, Score&: LastFlat[T], OtherScore: Other.LastFlat[T]);
2123
2124	for (int J = `0`; J <= VgprUB; J++)
2125	StrictDom \|= mergeScore(M, Score&: VgprScores[T][J], OtherScore: Other.VgprScores[T][J]);
2126
2127	if (T == SmemAccessCounter) {
2128	for (int J = `0`; J <= SgprUB; J++)
2129	StrictDom \|= mergeScore(M, Score&: SgprScores[J], OtherScore: Other.SgprScores[J]);
2130	}
2131	}
2132
2133	for (int J = `0`; J <= VgprUB; J++) {
2134	unsigned char NewVmemTypes = VgprVmemTypes[J] \| Other.VgprVmemTypes[J];
2135	StrictDom \|= NewVmemTypes != VgprVmemTypes[J];
2136	VgprVmemTypes[J] = NewVmemTypes;
2137	}
2138
2139	return StrictDom;
2140	}
2141
2142	static bool isWaitInstr(MachineInstr &Inst) {
2143	unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Inst.getOpcode());
2144	return Opcode == AMDGPU::S_WAITCNT \|\|
2145	(Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(i: `0`).isReg() &&
2146	Inst.getOperand(i: `0`).getReg() == AMDGPU::SGPR_NULL) \|\|
2147	Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT \|\|
2148	Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT \|\|
2149	counterTypeForInstr(Opcode).has_value();
2150	}
2151
2152	// Generate s_waitcnt instructions where needed.
2153	bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2154	MachineBasicBlock &Block,
2155	WaitcntBrackets &ScoreBrackets) {
2156	bool Modified = false;
2157
2158	LLVM_DEBUG({
2159	dbgs() << "* Block" << Block.getNumber() << " *";
2160	ScoreBrackets.dump();
2161	});
2162
2163	// Track the correctness of vccz through this basic block. There are two
2164	// reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2165	// ST->partialVCCWritesUpdateVCCZ().
2166	bool VCCZCorrect = true;
2167	if (ST->hasReadVCCZBug()) {
2168	// vccz could be incorrect at a basic block boundary if a predecessor wrote
2169	// to vcc and then issued an smem load.
2170	VCCZCorrect = false;
2171	} else if (!ST->partialVCCWritesUpdateVCCZ()) {
2172	// vccz could be incorrect at a basic block boundary if a predecessor wrote
2173	// to vcc_lo or vcc_hi.
2174	VCCZCorrect = false;
2175	}
2176
2177	// Walk over the instructions.
2178	MachineInstr OldWaitcntInstr = nullptr*;
2179
2180	for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2181	E = Block.instr_end();
2182	Iter != E;) {
2183	MachineInstr &Inst = *Iter;
2184
2185	// Track pre-existing waitcnts that were added in earlier iterations or by
2186	// the memory legalizer.
2187	if (isWaitInstr(Inst)) {
2188	if (!OldWaitcntInstr)
2189	OldWaitcntInstr = &Inst;
2190	++Iter;
2191	continue;
2192	}
2193
2194	bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2195	isPreheaderToFlush(MBB&: Block, ScoreBrackets);
2196
2197	// Generate an s_waitcnt instruction to be placed before Inst, if needed.
2198	Modified \|= generateWaitcntInstBefore(MI&: Inst, ScoreBrackets, OldWaitcntInstr,
2199	FlushVmCnt);
2200	OldWaitcntInstr = nullptr;
2201
2202	// Restore vccz if it's not known to be correct already.
2203	bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(MI: Inst);
2204
2205	// Don't examine operands unless we need to track vccz correctness.
2206	if (ST->hasReadVCCZBug() \|\| !ST->partialVCCWritesUpdateVCCZ()) {
2207	if (Inst.definesRegister(Reg: AMDGPU::VCC_LO, /TRI=/nullptr) \|\|
2208	Inst.definesRegister(Reg: AMDGPU::VCC_HI, /TRI=/nullptr)) {
2209	// Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2210	if (!ST->partialVCCWritesUpdateVCCZ())
2211	VCCZCorrect = false;
2212	} else if (Inst.definesRegister(Reg: AMDGPU::VCC, /TRI=/nullptr)) {
2213	// There is a hardware bug on CI/SI where SMRD instruction may corrupt
2214	// vccz bit, so when we detect that an instruction may read from a
2215	// corrupt vccz bit, we need to:
2216	// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2217	// operations to complete.
2218	// 2. Restore the correct value of vccz by writing the current value
2219	// of vcc back to vcc.
2220	if (ST->hasReadVCCZBug() &&
2221	ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
2222	// Writes to vcc while there's an outstanding smem read may get
2223	// clobbered as soon as any read completes.
2224	VCCZCorrect = false;
2225	} else {
2226	// Writes to vcc will fix any incorrect value in vccz.
2227	VCCZCorrect = true;
2228	}
2229	}
2230	}
2231
2232	if (TII->isSMRD(MI: Inst)) {
2233	for (const MachineMemOperand *Memop : Inst.memoperands()) {
2234	// No need to handle invariant loads when avoiding WAR conflicts, as
2235	// there cannot be a vector store to the same memory location.
2236	if (!Memop->isInvariant()) {
2237	const Value *Ptr = Memop->getValue();
2238	SLoadAddresses.insert(KV: std::pair(Ptr, Inst.getParent()));
2239	}
2240	}
2241	if (ST->hasReadVCCZBug()) {
2242	// This smem read could complete and clobber vccz at any time.
2243	VCCZCorrect = false;
2244	}
2245	}
2246
2247	updateEventWaitcntAfter(Inst, ScoreBrackets: &ScoreBrackets);
2248
2249	if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
2250	AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
2251	IncludeVSCnt: Inst.mayStore() && !SIInstrInfo::isAtomicRet(MI: Inst));
2252	ScoreBrackets.simplifyWaitcnt(Wait);
2253	Modified \|= generateWaitcnt(Wait, It: std::next(x: Inst.getIterator()), Block,
2254	ScoreBrackets, /OldWaitcntInstr=/nullptr);
2255	}
2256
2257	LLVM_DEBUG({
2258	Inst.print(dbgs());
2259	ScoreBrackets.dump();
2260	});
2261
2262	// TODO: Remove this work-around after fixing the scheduler and enable the
2263	// assert above.
2264	if (RestoreVCCZ) {
2265	// Restore the vccz bit. Any time a value is written to vcc, the vcc
2266	// bit is updated, so we can restore the bit by reading the value of
2267	// vcc and then writing it back to the register.
2268	BuildMI(BB&: Block, I&: Inst, MIMD: Inst.getDebugLoc(),
2269	MCID: TII->get(Opcode: ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2270	DestReg: TRI->getVCC())
2271	.addReg(RegNo: TRI->getVCC());
2272	VCCZCorrect = true;
2273	Modified = true;
2274	}
2275
2276	++Iter;
2277	}
2278
2279	// Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2280	// needed.
2281	AMDGPU::Waitcnt Wait;
2282	if (Block.getFirstTerminator() == Block.end() &&
2283	isPreheaderToFlush(MBB&: Block, ScoreBrackets)) {
2284	if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT))
2285	Wait.LoadCnt = `0`;
2286	if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT))
2287	Wait.SampleCnt = `0`;
2288	if (ScoreBrackets.hasPendingEvent(T: BVH_CNT))
2289	Wait.BvhCnt = `0`;
2290	}
2291
2292	// Combine or remove any redundant waitcnts at the end of the block.
2293	Modified \|= generateWaitcnt(Wait, It: Block.instr_end(), Block, ScoreBrackets,
2294	OldWaitcntInstr);
2295
2296	return Modified;
2297	}
2298
2299	// Return true if the given machine basic block is a preheader of a loop in
2300	// which we want to flush the vmcnt counter, and false otherwise.
2301	bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
2302	WaitcntBrackets &ScoreBrackets) {
2303	auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(Key: &MBB, Args: false);
2304	if (!IsInserted)
2305	return Iterator ->second;
2306
2307	MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2308	if (!Succ)
2309	return false;
2310
2311	MachineLoop *Loop = MLI->getLoopFor(BB: Succ);
2312	if (!Loop)
2313	return false;
2314
2315	if (Loop->getLoopPreheader() == &MBB &&
2316	shouldFlushVmCnt(ML: Loop, Brackets&: ScoreBrackets)) {
2317	Iterator ->second = true;
2318	return true;
2319	}
2320
2321	return false;
2322	}
2323
2324	bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2325	return SIInstrInfo::isVMEM(MI) \|\|
2326	(SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
2327	}
2328
2329	// Return true if it is better to flush the vmcnt counter in the preheader of
2330	// the given loop. We currently decide to flush in two situations:
2331	// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2332	// vgpr containing a value that is loaded outside of the loop. (Only on
2333	// targets with no vscnt counter).
2334	// 2. The loop contains vmem load(s), but the loaded values are not used in the
2335	// loop, and at least one use of a vgpr containing a value that is loaded
2336	// outside of the loop.
2337	bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2338	WaitcntBrackets &Brackets) {
2339	bool HasVMemLoad = false;
2340	bool HasVMemStore = false;
2341	bool UsesVgprLoadedOutside = false;
2342	DenseSet<Register> VgprUse;
2343	DenseSet<Register> VgprDef;
2344
2345	for (MachineBasicBlock *MBB : ML->blocks()) {
2346	for (MachineInstr &MI : *MBB) {
2347	if (isVMEMOrFlatVMEM(MI)) {
2348	if (MI.mayLoad())
2349	HasVMemLoad = true;
2350	if (MI.mayStore())
2351	HasVMemStore = true;
2352	}
2353	for (unsigned I = `0`; I < MI.getNumOperands(); I++) {
2354	MachineOperand &Op = MI.getOperand(i: I);
2355	if (!Op.isReg() \|\| !TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
2356	continue;
2357	RegInterval Interval = Brackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: I);
2358	// Vgpr use
2359	if (Op.isUse()) {
2360	for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2361	// If we find a register that is loaded inside the loop, 1. and 2.
2362	// are invalidated and we can exit.
2363	if (VgprDef.contains(V: RegNo))
2364	return false;
2365	VgprUse.insert(V: RegNo);
2366	// If at least one of Op's registers is in the score brackets, the
2367	// value is likely loaded outside of the loop.
2368	if (Brackets.getRegScore(GprNo: RegNo, T: LOAD_CNT) >
2369	Brackets.getScoreLB(T: LOAD_CNT) \|\|
2370	Brackets.getRegScore(GprNo: RegNo, T: SAMPLE_CNT) >
2371	Brackets.getScoreLB(T: SAMPLE_CNT) \|\|
2372	Brackets.getRegScore(GprNo: RegNo, T: BVH_CNT) >
2373	Brackets.getScoreLB(T: BVH_CNT)) {
2374	UsesVgprLoadedOutside = true;
2375	break;
2376	}
2377	}
2378	}
2379	// VMem load vgpr def
2380	else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
2381	for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2382	// If we find a register that is loaded inside the loop, 1. and 2.
2383	// are invalidated and we can exit.
2384	if (VgprUse.contains(V: RegNo))
2385	return false;
2386	VgprDef.insert(V: RegNo);
2387	}
2388	}
2389	}
2390	}
2391	if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2392	return true;
2393	return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2394	}
2395
2396	bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
2397	ST = &MF.getSubtarget<GCNSubtarget>();
2398	TII = ST->getInstrInfo();
2399	TRI = &TII->getRegisterInfo();
2400	MRI = &MF.getRegInfo();
2401	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2402	MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2403	PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2404	if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2405	AA = &AAR->getAAResults();
2406
2407	AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST->getCPU());
2408
2409	if (ST->hasExtendedWaitCounts()) {
2410	MaxCounter = NUM_EXTENDED_INST_CNTS;
2411	WCGGFX12Plus = WaitcntGeneratorGFX12Plus (MF, MaxCounter);
2412	WCG = &WCGGFX12Plus;
2413	} else {
2414	MaxCounter = NUM_NORMAL_INST_CNTS;
2415	WCGPreGFX12 = WaitcntGeneratorPreGFX12 (MF);
2416	WCG = &WCGPreGFX12;
2417	}
2418
2419	ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
2420	for (auto T : inst_counter_types())
2421	ForceEmitWaitcnt[T] = false;
2422
2423	const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2424
2425	SmemAccessCounter = eventCounter(masks: WaitEventMaskForInst, E: SMEM_ACCESS);
2426
2427	HardwareLimits Limits = {};
2428	if (ST->hasExtendedWaitCounts()) {
2429	Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(Version: IV);
2430	Limits.DscntMax = AMDGPU::getDscntBitMask(Version: IV);
2431	} else {
2432	Limits.LoadcntMax = AMDGPU::getVmcntBitMask(Version: IV);
2433	Limits.DscntMax = AMDGPU::getLgkmcntBitMask(Version: IV);
2434	}
2435	Limits.ExpcntMax = AMDGPU::getExpcntBitMask(Version: IV);
2436	Limits.StorecntMax = AMDGPU::getStorecntBitMask(Version: IV);
2437	Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(Version: IV);
2438	Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(Version: IV);
2439	Limits.KmcntMax = AMDGPU::getKmcntBitMask(Version: IV);
2440
2441	unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
2442	unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2443	assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2444	assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2445
2446	RegisterEncoding Encoding = {};
2447	Encoding.VGPR0 =
2448	TRI->getEncodingValue(RegNo: AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2449	Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - `1`;
2450	Encoding.SGPR0 =
2451	TRI->getEncodingValue(RegNo: AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2452	Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - `1`;
2453
2454	BlockInfos.clear();
2455	bool Modified = false;
2456
2457	MachineBasicBlock &EntryBB = MF.front();
2458	MachineBasicBlock::iterator I = EntryBB.begin();
2459
2460	if (!MFI->isEntryFunction()) {
2461	// Wait for any outstanding memory operations that the input registers may
2462	// depend on. We can't track them and it's better to do the wait after the
2463	// costly call sequence.
2464
2465	// TODO: Could insert earlier and schedule more liberally with operations
2466	// that only use caller preserved registers.
2467	for (MachineBasicBlock::iterator E = EntryBB.end();
2468	I != E && (I ->isPHI() \|\| I ->isMetaInstruction()); ++I)
2469	;
2470
2471	if (ST->hasExtendedWaitCounts()) {
2472	BuildMI(BB&: EntryBB, I, MIMD: DebugLoc (), MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT))
2473	.addImm(Val: `0`);
2474	for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
2475	if (CT == LOAD_CNT \|\| CT == DS_CNT \|\| CT == STORE_CNT)
2476	continue;
2477
2478	BuildMI(BB&: EntryBB, I, MIMD: DebugLoc (),
2479	MCID: TII->get(Opcode: instrsForExtendedCounterTypes[CT]))
2480	.addImm(Val: `0`);
2481	}
2482	} else {
2483	BuildMI(BB&: EntryBB, I, MIMD: DebugLoc (), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: `0`);
2484	}
2485
2486	auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2487	args&: ST, args&: MaxCounter, args&: Limits, args&: Encoding, args&: WaitEventMaskForInst,
2488	args&: SmemAccessCounter);
2489	NonKernelInitialState ->setStateOnFunctionEntryOrReturn();
2490	BlockInfos [&EntryBB].Incoming = std::move(NonKernelInitialState);
2491
2492	Modified = true;
2493	}
2494
2495	// Keep iterating over the blocks in reverse post order, inserting and
2496	// updating s_waitcnt where needed, until a fix point is reached.
2497	for (auto MBB : ReversePostOrderTraversal<MachineFunction >(&MF))
2498	BlockInfos.insert(KV: {MBB, BlockInfo ()});
2499
2500	std::unique_ptr<WaitcntBrackets> Brackets;
2501	bool Repeat;
2502	do {
2503	Repeat = false;
2504
2505	for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2506	++BII) {
2507	MachineBasicBlock *MBB = BII->first;
2508	BlockInfo &BI = BII->second;
2509	if (!BI.Dirty)
2510	continue;
2511
2512	if (BI.Incoming) {
2513	if (!Brackets)
2514	Brackets = std::make_unique<WaitcntBrackets>(args&: *BI.Incoming);
2515	else
2516	Brackets = BI.Incoming;
2517	} else {
2518	if (!Brackets)
2519	Brackets = std::make_unique<WaitcntBrackets>(
2520	args&: ST, args&: MaxCounter, args&: Limits, args&: Encoding, args&: WaitEventMaskForInst,
2521	args&: SmemAccessCounter);
2522	else
2523	*Brackets = WaitcntBrackets (ST, MaxCounter, Limits, Encoding,
2524	WaitEventMaskForInst, SmemAccessCounter);
2525	}
2526
2527	Modified \|= insertWaitcntInBlock(MF, Block&: MBB, ScoreBrackets&: Brackets);
2528	BI.Dirty = false;
2529
2530	if (Brackets ->hasPendingEvent()) {
2531	BlockInfo MoveBracketsToSucc = nullptr*;
2532	for (MachineBasicBlock *Succ : MBB->successors()) {
2533	auto SuccBII = BlockInfos.find(Key: Succ);
2534	BlockInfo &SuccBI = SuccBII->second;
2535	if (!SuccBI.Incoming) {
2536	SuccBI.Dirty = true;
2537	if (SuccBII <= BII)
2538	Repeat = true;
2539	if (!MoveBracketsToSucc) {
2540	MoveBracketsToSucc = &SuccBI;
2541	} else {
2542	SuccBI.Incoming = std::make_unique<WaitcntBrackets>(args&: *Brackets);
2543	}
2544	} else if (SuccBI.Incoming ->merge(Other: *Brackets)) {
2545	SuccBI.Dirty = true;
2546	if (SuccBII <= BII)
2547	Repeat = true;
2548	}
2549	}
2550	if (MoveBracketsToSucc)
2551	MoveBracketsToSucc->Incoming = std::move(Brackets);
2552	}
2553	}
2554	} while (Repeat);
2555
2556	if (ST->hasScalarStores()) {
2557	SmallVector<MachineBasicBlock *, `4`> EndPgmBlocks;
2558	bool HaveScalarStores = false;
2559
2560	for (MachineBasicBlock &MBB : MF) {
2561	for (MachineInstr &MI : MBB) {
2562	if (!HaveScalarStores && TII->isScalarStore(MI))
2563	HaveScalarStores = true;
2564
2565	if (MI.getOpcode() == AMDGPU::S_ENDPGM \|\|
2566	MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2567	EndPgmBlocks.push_back(Elt: &MBB);
2568	}
2569	}
2570
2571	if (HaveScalarStores) {
2572	// If scalar writes are used, the cache must be flushed or else the next
2573	// wave to reuse the same scratch memory can be clobbered.
2574	//
2575	// Insert s_dcache_wb at wave termination points if there were any scalar
2576	// stores, and only if the cache hasn't already been flushed. This could
2577	// be improved by looking across blocks for flushes in postdominating
2578	// blocks from the stores but an explicitly requested flush is probably
2579	// very rare.
2580	for (MachineBasicBlock *MBB : EndPgmBlocks) {
2581	bool SeenDCacheWB = false;
2582
2583	for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2584	I != E; ++I) {
2585	if (I ->getOpcode() == AMDGPU::S_DCACHE_WB)
2586	SeenDCacheWB = true;
2587	else if (TII->isScalarStore(MI: *I))
2588	SeenDCacheWB = false;
2589
2590	// FIXME: It would be better to insert this before a waitcnt if any.
2591	if ((I ->getOpcode() == AMDGPU::S_ENDPGM \|\|
2592	I ->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2593	!SeenDCacheWB) {
2594	Modified = true;
2595	BuildMI(BB&: *MBB, I, MIMD: I ->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_DCACHE_WB));
2596	}
2597	}
2598	}
2599	}
2600	}
2601
2602	// Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2603	// instructions.
2604	for (MachineInstr *MI : ReleaseVGPRInsts) {
2605	if (ST->requiresNopBeforeDeallocVGPRs()) {
2606	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_NOP))
2607	.addImm(Val: `0`);
2608	}
2609	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
2610	MCID: TII->get(Opcode: AMDGPU::S_SENDMSG))
2611	.addImm(Val: AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2612	Modified = true;
2613	}
2614	ReleaseVGPRInsts.clear();
2615
2616	return Modified;
2617	}
2618

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp