1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
28#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29#include "SIMachineFunctionInfo.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/PostOrderIterator.h"
33#include "llvm/ADT/Sequence.h"
34#include "llvm/Analysis/AliasAnalysis.h"
35#include "llvm/CodeGen/MachineLoopInfo.h"
36#include "llvm/CodeGen/MachinePassManager.h"
37#include "llvm/CodeGen/MachinePostDominators.h"
38#include "llvm/Support/DebugCounter.h"
39#include "llvm/TargetParser/TargetParser.h"
40using namespace llvm;
41
42#define DEBUG_TYPE "si-insert-waitcnts"
43
44DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
45 "Force emit s_waitcnt expcnt(0) instrs");
46DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
47 "Force emit s_waitcnt lgkmcnt(0) instrs");
48DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
49 "Force emit s_waitcnt vmcnt(0) instrs");
50
51static cl::opt<bool>
52 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
53 cl::desc("Force all waitcnt instrs to be emitted as "
54 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
55 cl::init(Val: false), cl::Hidden);
56
57static cl::opt<bool> ForceEmitZeroLoadFlag(
58 "amdgpu-waitcnt-load-forcezero",
59 cl::desc("Force all waitcnt load counters to wait until 0"),
60 cl::init(Val: false), cl::Hidden);
61
62namespace {
63// Class of object that encapsulates latest instruction counter score
64// associated with the operand. Used for determining whether
65// s_waitcnt instruction needs to be emitted.
66
67enum InstCounterType {
68 LOAD_CNT = 0, // VMcnt prior to gfx12.
69 DS_CNT, // LKGMcnt prior to gfx12.
70 EXP_CNT, //
71 STORE_CNT, // VScnt in gfx10/gfx11.
72 NUM_NORMAL_INST_CNTS,
73 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
74 BVH_CNT, // gfx12+ only.
75 KM_CNT, // gfx12+ only.
76 X_CNT, // gfx1250.
77 NUM_EXTENDED_INST_CNTS,
78 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
79};
80} // namespace
81
82namespace llvm {
83template <> struct enum_iteration_traits<InstCounterType> {
84 static constexpr bool is_iterable = true;
85};
86} // namespace llvm
87
88namespace {
89// Return an iterator over all counters between LOAD_CNT (the first counter)
90// and \c MaxCounter (exclusive, default value yields an enumeration over
91// all counters).
92auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
93 return enum_seq(Begin: LOAD_CNT, End: MaxCounter);
94}
95
96using RegInterval = std::pair<int, int>;
97
98struct HardwareLimits {
99 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
100 unsigned ExpcntMax;
101 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
102 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
103 unsigned SamplecntMax; // gfx12+ only.
104 unsigned BvhcntMax; // gfx12+ only.
105 unsigned KmcntMax; // gfx12+ only.
106 unsigned XcntMax; // gfx1250.
107};
108
109#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
110 DECL(VMEM_ACCESS) /* vmem read & write */ \
111 DECL(VMEM_READ_ACCESS) /* vmem read */ \
112 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
113 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
114 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
115 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
116 DECL(VMEM_GROUP) /* vmem group */ \
117 DECL(LDS_ACCESS) /* lds read & write */ \
118 DECL(GDS_ACCESS) /* gds read & write */ \
119 DECL(SQ_MESSAGE) /* send message */ \
120 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
121 DECL(SMEM_GROUP) /* scalar-memory group */ \
122 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
123 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
124 DECL(EXP_POS_ACCESS) /* write to export position */ \
125 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
126 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
127 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */
128
129// clang-format off
130#define AMDGPU_EVENT_ENUM(Name) Name,
131enum WaitEventType {
132 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
133 NUM_WAIT_EVENTS
134};
135#undef AMDGPU_EVENT_ENUM
136
137#define AMDGPU_EVENT_NAME(Name) #Name,
138static constexpr StringLiteral WaitEventTypeName[] = {
139 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
140};
141#undef AMDGPU_EVENT_NAME
142// clang-format on
143
144// The mapping is:
145// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
146// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
147// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
148// We reserve a fixed number of VGPR slots in the scoring tables for
149// special tokens like SCMEM_LDS (needed for buffer load to LDS).
150enum RegisterMapping {
151 SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets.
152 AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets.
153 SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets.
154 // Artificial register slots to track LDS writes into specific LDS locations
155 // if a location is known. When slots are exhausted or location is
156 // unknown use the first slot. The first slot is also always updated in
157 // addition to known location's slot to properly generate waits if dependent
158 // instruction's location is unknown.
159 FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
160 NUM_LDS_VGPRS = 9, // One more than the stores we track.
161 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
162};
163
164// Enumerate different types of result-returning VMEM operations. Although
165// s_waitcnt orders them all with a single vmcnt counter, in the absence of
166// s_waitcnt only instructions of the same VmemType are guaranteed to write
167// their results in order -- so there is no need to insert an s_waitcnt between
168// two instructions of the same type that write the same vgpr.
169enum VmemType {
170 // BUF instructions and MIMG instructions without a sampler.
171 VMEM_NOSAMPLER,
172 // MIMG instructions with a sampler.
173 VMEM_SAMPLER,
174 // BVH instructions
175 VMEM_BVH,
176 NUM_VMEM_TYPES
177};
178
179// Maps values of InstCounterType to the instruction that waits on that
180// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
181// returns true.
182static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
183 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
184 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
185 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
186
187static bool updateVMCntOnly(const MachineInstr &Inst) {
188 return (SIInstrInfo::isVMEM(MI: Inst) && !SIInstrInfo::isFLAT(MI: Inst)) ||
189 SIInstrInfo::isFLATGlobal(MI: Inst) || SIInstrInfo::isFLATScratch(MI: Inst);
190}
191
192#ifndef NDEBUG
193static bool isNormalMode(InstCounterType MaxCounter) {
194 return MaxCounter == NUM_NORMAL_INST_CNTS;
195}
196#endif // NDEBUG
197
198VmemType getVmemType(const MachineInstr &Inst) {
199 assert(updateVMCntOnly(Inst));
200 if (!SIInstrInfo::isImage(MI: Inst))
201 return VMEM_NOSAMPLER;
202 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Inst.getOpcode());
203 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
204 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
205
206 if (BaseInfo->BVH)
207 return VMEM_BVH;
208
209 // We have to make an additional check for isVSAMPLE here since some
210 // instructions don't have a sampler, but are still classified as sampler
211 // instructions for the purposes of e.g. waitcnt.
212 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(MI: Inst))
213 return VMEM_SAMPLER;
214
215 return VMEM_NOSAMPLER;
216}
217
218unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
219 switch (T) {
220 case LOAD_CNT:
221 return Wait.LoadCnt;
222 case EXP_CNT:
223 return Wait.ExpCnt;
224 case DS_CNT:
225 return Wait.DsCnt;
226 case STORE_CNT:
227 return Wait.StoreCnt;
228 case SAMPLE_CNT:
229 return Wait.SampleCnt;
230 case BVH_CNT:
231 return Wait.BvhCnt;
232 case KM_CNT:
233 return Wait.KmCnt;
234 case X_CNT:
235 return Wait.XCnt;
236 default:
237 llvm_unreachable("bad InstCounterType");
238 }
239}
240
241void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
242 unsigned &WC = getCounterRef(Wait, T);
243 WC = std::min(a: WC, b: Count);
244}
245
246void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
247 getCounterRef(Wait, T) = ~0u;
248}
249
250unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
251 return getCounterRef(Wait, T);
252}
253
254// Mapping from event to counter according to the table masks.
255InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
256 for (auto T : inst_counter_types()) {
257 if (masks[T] & (1 << E))
258 return T;
259 }
260 llvm_unreachable("event type has no associated counter");
261}
262
263// This objects maintains the current score brackets of each wait counter, and
264// a per-register scoreboard for each wait counter.
265//
266// We also maintain the latest score for every event type that can change the
267// waitcnt in order to know if there are multiple types of events within
268// the brackets. When multiple types of event happen in the bracket,
269// wait count may get decreased out of order, therefore we need to put in
270// "s_waitcnt 0" before use.
271class WaitcntBrackets {
272public:
273 WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
274 HardwareLimits Limits, const unsigned *WaitEventMaskForInst,
275 InstCounterType SmemAccessCounter)
276 : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
277 WaitEventMaskForInst(WaitEventMaskForInst),
278 SmemAccessCounter(SmemAccessCounter) {}
279
280 unsigned getWaitCountMax(InstCounterType T) const {
281 switch (T) {
282 case LOAD_CNT:
283 return Limits.LoadcntMax;
284 case DS_CNT:
285 return Limits.DscntMax;
286 case EXP_CNT:
287 return Limits.ExpcntMax;
288 case STORE_CNT:
289 return Limits.StorecntMax;
290 case SAMPLE_CNT:
291 return Limits.SamplecntMax;
292 case BVH_CNT:
293 return Limits.BvhcntMax;
294 case KM_CNT:
295 return Limits.KmcntMax;
296 case X_CNT:
297 return Limits.XcntMax;
298 default:
299 break;
300 }
301 return 0;
302 }
303
304 bool isSmemCounter(InstCounterType T) const {
305 return T == SmemAccessCounter || T == X_CNT;
306 }
307
308 unsigned getSgprScoresIdx(InstCounterType T) const {
309 assert(isSmemCounter(T) && "Invalid SMEM counter");
310 return T == X_CNT ? 1 : 0;
311 }
312
313 unsigned getScoreLB(InstCounterType T) const {
314 assert(T < NUM_INST_CNTS);
315 return ScoreLBs[T];
316 }
317
318 unsigned getScoreUB(InstCounterType T) const {
319 assert(T < NUM_INST_CNTS);
320 return ScoreUBs[T];
321 }
322
323 unsigned getScoreRange(InstCounterType T) const {
324 return getScoreUB(T) - getScoreLB(T);
325 }
326
327 unsigned getRegScore(int GprNo, InstCounterType T) const {
328 if (GprNo < NUM_ALL_VGPRS)
329 return VgprScores[T][GprNo];
330 return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
331 }
332
333 bool merge(const WaitcntBrackets &Other);
334
335 RegInterval getRegInterval(const MachineInstr *MI,
336 const MachineRegisterInfo *MRI,
337 const SIRegisterInfo *TRI,
338 const MachineOperand &Op) const;
339
340 bool counterOutOfOrder(InstCounterType T) const;
341 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
342 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
343
344 void determineWait(InstCounterType T, RegInterval Interval,
345 AMDGPU::Waitcnt &Wait) const;
346 void determineWait(InstCounterType T, int RegNo,
347 AMDGPU::Waitcnt &Wait) const {
348 determineWait(T, Interval: {RegNo, RegNo + 1}, Wait);
349 }
350
351 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
352 void applyWaitcnt(InstCounterType T, unsigned Count);
353 void applyXcnt(const AMDGPU::Waitcnt &Wait);
354 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
355 const MachineRegisterInfo *MRI, WaitEventType E,
356 MachineInstr &MI);
357
358 unsigned hasPendingEvent() const { return PendingEvents; }
359 unsigned hasPendingEvent(WaitEventType E) const {
360 return PendingEvents & (1 << E);
361 }
362 unsigned hasPendingEvent(InstCounterType T) const {
363 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
364 assert((HasPending != 0) == (getScoreRange(T) != 0));
365 return HasPending;
366 }
367
368 bool hasMixedPendingEvents(InstCounterType T) const {
369 unsigned Events = hasPendingEvent(T);
370 // Return true if more than one bit is set in Events.
371 return Events & (Events - 1);
372 }
373
374 bool hasPendingFlat() const {
375 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
376 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
377 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
378 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
379 }
380
381 void setPendingFlat() {
382 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
383 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
384 }
385
386 bool hasPendingGDS() const {
387 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
388 }
389
390 unsigned getPendingGDSWait() const {
391 return std::min(a: getScoreUB(T: DS_CNT) - LastGDS, b: getWaitCountMax(T: DS_CNT) - 1);
392 }
393
394 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
395
396 // Return true if there might be pending writes to the vgpr-interval by VMEM
397 // instructions with types different from V.
398 bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
399 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
400 assert(RegNo < NUM_ALL_VGPRS);
401 if (VgprVmemTypes[RegNo] & ~(1 << V))
402 return true;
403 }
404 return false;
405 }
406
407 void clearVgprVmemTypes(RegInterval Interval) {
408 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
409 assert(RegNo < NUM_ALL_VGPRS);
410 VgprVmemTypes[RegNo] = 0;
411 }
412 }
413
414 void setStateOnFunctionEntryOrReturn() {
415 setScoreUB(T: STORE_CNT, Val: getScoreUB(T: STORE_CNT) + getWaitCountMax(T: STORE_CNT));
416 PendingEvents |= WaitEventMaskForInst[STORE_CNT];
417 }
418
419 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
420 return LDSDMAStores;
421 }
422
423 bool hasPointSampleAccel(const MachineInstr &MI) const;
424 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
425 RegInterval Interval) const;
426
427 void print(raw_ostream &) const;
428 void dump() const { print(dbgs()); }
429
430private:
431 struct MergeInfo {
432 unsigned OldLB;
433 unsigned OtherLB;
434 unsigned MyShift;
435 unsigned OtherShift;
436 };
437 static bool mergeScore(const MergeInfo &M, unsigned &Score,
438 unsigned OtherScore);
439
440 void setScoreLB(InstCounterType T, unsigned Val) {
441 assert(T < NUM_INST_CNTS);
442 ScoreLBs[T] = Val;
443 }
444
445 void setScoreUB(InstCounterType T, unsigned Val) {
446 assert(T < NUM_INST_CNTS);
447 ScoreUBs[T] = Val;
448
449 if (T != EXP_CNT)
450 return;
451
452 if (getScoreRange(T: EXP_CNT) > getWaitCountMax(T: EXP_CNT))
453 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(T: EXP_CNT);
454 }
455
456 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
457 setScoreByInterval(Interval: {GprNo, GprNo + 1}, CntTy: T, Score: Val);
458 }
459
460 void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
461 unsigned Score);
462
463 void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
464 const MachineRegisterInfo *MRI,
465 const MachineOperand &Op, InstCounterType CntTy,
466 unsigned Val);
467
468 const GCNSubtarget *ST = nullptr;
469 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
470 HardwareLimits Limits = {};
471 const unsigned *WaitEventMaskForInst;
472 InstCounterType SmemAccessCounter;
473 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
474 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
475 unsigned PendingEvents = 0;
476 // Remember the last flat memory operation.
477 unsigned LastFlat[NUM_INST_CNTS] = {0};
478 // Remember the last GDS operation.
479 unsigned LastGDS = 0;
480 // wait_cnt scores for every vgpr.
481 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
482 int VgprUB = -1;
483 int SgprUB = -1;
484 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
485 // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
486 // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
487 // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
488 // X_CNT score.
489 unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
490 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
491 // write to each vgpr.
492 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
493 // Store representative LDS DMA operations. The only useful info here is
494 // alias info. One store is kept per unique AAInfo.
495 SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
496};
497
498// This abstracts the logic for generating and updating S_WAIT* instructions
499// away from the analysis that determines where they are needed. This was
500// done because the set of counters and instructions for waiting on them
501// underwent a major shift with gfx12, sufficiently so that having this
502// abstraction allows the main analysis logic to be simpler than it would
503// otherwise have had to become.
504class WaitcntGenerator {
505protected:
506 const GCNSubtarget *ST = nullptr;
507 const SIInstrInfo *TII = nullptr;
508 AMDGPU::IsaVersion IV;
509 InstCounterType MaxCounter;
510 bool OptNone;
511
512public:
513 WaitcntGenerator() = default;
514 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
515 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
516 IV(AMDGPU::getIsaVersion(GPU: ST->getCPU())), MaxCounter(MaxCounter),
517 OptNone(MF.getFunction().hasOptNone() ||
518 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
519
520 // Return true if the current function should be compiled with no
521 // optimization.
522 bool isOptNone() const { return OptNone; }
523
524 // Edits an existing sequence of wait count instructions according
525 // to an incoming Waitcnt value, which is itself updated to reflect
526 // any new wait count instructions which may need to be generated by
527 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
528 // were made.
529 //
530 // This editing will usually be merely updated operands, but it may also
531 // delete instructions if the incoming Wait value indicates they are not
532 // needed. It may also remove existing instructions for which a wait
533 // is needed if it can be determined that it is better to generate new
534 // instructions later, as can happen on gfx12.
535 virtual bool
536 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
537 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
538 MachineBasicBlock::instr_iterator It) const = 0;
539
540 // Transform a soft waitcnt into a normal one.
541 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
542
543 // Generates new wait count instructions according to the value of
544 // Wait, returning true if any new instructions were created.
545 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
546 MachineBasicBlock::instr_iterator It,
547 AMDGPU::Waitcnt Wait) = 0;
548
549 // Returns an array of bit masks which can be used to map values in
550 // WaitEventType to corresponding counter values in InstCounterType.
551 virtual const unsigned *getWaitEventMask() const = 0;
552
553 // Returns a new waitcnt with all counters except VScnt set to 0. If
554 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
555 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
556
557 virtual ~WaitcntGenerator() = default;
558
559 // Create a mask value from the initializer list of wait event types.
560 static constexpr unsigned
561 eventMask(std::initializer_list<WaitEventType> Events) {
562 unsigned Mask = 0;
563 for (auto &E : Events)
564 Mask |= 1 << E;
565
566 return Mask;
567 }
568};
569
570class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
571public:
572 WaitcntGeneratorPreGFX12() = default;
573 WaitcntGeneratorPreGFX12(const MachineFunction &MF)
574 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
575
576 bool
577 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
578 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
579 MachineBasicBlock::instr_iterator It) const override;
580
581 bool createNewWaitcnt(MachineBasicBlock &Block,
582 MachineBasicBlock::instr_iterator It,
583 AMDGPU::Waitcnt Wait) override;
584
585 const unsigned *getWaitEventMask() const override {
586 assert(ST);
587
588 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
589 eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
590 VMEM_BVH_READ_ACCESS}),
591 eventMask(Events: {SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
592 eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
593 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
594 eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
595 0,
596 0,
597 0,
598 0};
599
600 return WaitEventMaskForInstPreGFX12;
601 }
602
603 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
604};
605
606class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
607public:
608 WaitcntGeneratorGFX12Plus() = default;
609 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
610 InstCounterType MaxCounter)
611 : WaitcntGenerator(MF, MaxCounter) {}
612
613 bool
614 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
615 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
616 MachineBasicBlock::instr_iterator It) const override;
617
618 bool createNewWaitcnt(MachineBasicBlock &Block,
619 MachineBasicBlock::instr_iterator It,
620 AMDGPU::Waitcnt Wait) override;
621
622 const unsigned *getWaitEventMask() const override {
623 assert(ST);
624
625 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
626 eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS}),
627 eventMask(Events: {LDS_ACCESS, GDS_ACCESS}),
628 eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
629 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
630 eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
631 eventMask(Events: {VMEM_SAMPLER_READ_ACCESS}),
632 eventMask(Events: {VMEM_BVH_READ_ACCESS}),
633 eventMask(Events: {SMEM_ACCESS, SQ_MESSAGE}),
634 eventMask(Events: {VMEM_GROUP, SMEM_GROUP})};
635
636 return WaitEventMaskForInstGFX12Plus;
637 }
638
639 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
640};
641
642class SIInsertWaitcnts {
643private:
644 const GCNSubtarget *ST = nullptr;
645 const SIInstrInfo *TII = nullptr;
646 const SIRegisterInfo *TRI = nullptr;
647 const MachineRegisterInfo *MRI = nullptr;
648
649 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
650 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
651 MachineLoopInfo *MLI;
652 MachinePostDominatorTree *PDT;
653 AliasAnalysis *AA = nullptr;
654
655 struct BlockInfo {
656 std::unique_ptr<WaitcntBrackets> Incoming;
657 bool Dirty = true;
658 };
659
660 InstCounterType SmemAccessCounter;
661
662 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
663
664 bool ForceEmitWaitcnt[NUM_INST_CNTS];
665
666 // In any given run of this pass, WCG will point to one of these two
667 // generator objects, which must have been re-initialised before use
668 // from a value made using a subtarget constructor.
669 WaitcntGeneratorPreGFX12 WCGPreGFX12;
670 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
671
672 WaitcntGenerator *WCG = nullptr;
673
674 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
675 // message.
676 DenseSet<MachineInstr *> ReleaseVGPRInsts;
677
678 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
679
680public:
681 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
682 AliasAnalysis *AA)
683 : MLI(MLI), PDT(PDT), AA(AA) {
684 (void)ForceExpCounter;
685 (void)ForceLgkmCounter;
686 (void)ForceVMCounter;
687 }
688
689 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
690 bool isPreheaderToFlush(MachineBasicBlock &MBB,
691 const WaitcntBrackets &ScoreBrackets);
692 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
693 bool run(MachineFunction &MF);
694
695 bool isForceEmitWaitcnt() const {
696 for (auto T : inst_counter_types())
697 if (ForceEmitWaitcnt[T])
698 return true;
699 return false;
700 }
701
702 void setForceEmitWaitcnt() {
703// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
704// For debug builds, get the debug counter info and adjust if need be
705#ifndef NDEBUG
706 if (DebugCounter::isCounterSet(ForceExpCounter) &&
707 DebugCounter::shouldExecute(ForceExpCounter)) {
708 ForceEmitWaitcnt[EXP_CNT] = true;
709 } else {
710 ForceEmitWaitcnt[EXP_CNT] = false;
711 }
712
713 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
714 DebugCounter::shouldExecute(ForceLgkmCounter)) {
715 ForceEmitWaitcnt[DS_CNT] = true;
716 ForceEmitWaitcnt[KM_CNT] = true;
717 } else {
718 ForceEmitWaitcnt[DS_CNT] = false;
719 ForceEmitWaitcnt[KM_CNT] = false;
720 }
721
722 if (DebugCounter::isCounterSet(ForceVMCounter) &&
723 DebugCounter::shouldExecute(ForceVMCounter)) {
724 ForceEmitWaitcnt[LOAD_CNT] = true;
725 ForceEmitWaitcnt[SAMPLE_CNT] = true;
726 ForceEmitWaitcnt[BVH_CNT] = true;
727 } else {
728 ForceEmitWaitcnt[LOAD_CNT] = false;
729 ForceEmitWaitcnt[SAMPLE_CNT] = false;
730 ForceEmitWaitcnt[BVH_CNT] = false;
731 }
732#endif // NDEBUG
733 }
734
735 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
736 // instruction.
737 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
738 switch (Inst.getOpcode()) {
739 case AMDGPU::GLOBAL_INV:
740 return VMEM_READ_ACCESS; // tracked using loadcnt
741 case AMDGPU::GLOBAL_WB:
742 case AMDGPU::GLOBAL_WBINV:
743 return VMEM_WRITE_ACCESS; // tracked using storecnt
744 default:
745 break;
746 }
747
748 // Maps VMEM access types to their corresponding WaitEventType.
749 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
750 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
751
752 assert(SIInstrInfo::isVMEM(Inst));
753 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
754 // these should use VM_CNT.
755 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(MI: Inst))
756 return VMEM_ACCESS;
757 if (Inst.mayStore() &&
758 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(MI: Inst))) {
759 // FLAT and SCRATCH instructions may access scratch. Other VMEM
760 // instructions do not.
761 if (SIInstrInfo::isFLAT(MI: Inst) && mayAccessScratchThroughFlat(MI: Inst))
762 return SCRATCH_WRITE_ACCESS;
763 return VMEM_WRITE_ACCESS;
764 }
765 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(MI: Inst))
766 return VMEM_READ_ACCESS;
767 return VmemReadMapping[getVmemType(Inst)];
768 }
769
770 bool hasXcnt() const { return ST->hasWaitXCnt(); }
771
772 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
773 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
774 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
775 bool isVmemAccess(const MachineInstr &MI) const;
776 bool generateWaitcntInstBefore(MachineInstr &MI,
777 WaitcntBrackets &ScoreBrackets,
778 MachineInstr *OldWaitcntInstr,
779 bool FlushVmCnt);
780 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
781 MachineBasicBlock::instr_iterator It,
782 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
783 MachineInstr *OldWaitcntInstr);
784 void updateEventWaitcntAfter(MachineInstr &Inst,
785 WaitcntBrackets *ScoreBrackets);
786 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
787 MachineBasicBlock *Block) const;
788 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
789 WaitcntBrackets &ScoreBrackets);
790 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
791 WaitcntBrackets &ScoreBrackets);
792};
793
794class SIInsertWaitcntsLegacy : public MachineFunctionPass {
795public:
796 static char ID;
797 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
798
799 bool runOnMachineFunction(MachineFunction &MF) override;
800
801 StringRef getPassName() const override {
802 return "SI insert wait instructions";
803 }
804
805 void getAnalysisUsage(AnalysisUsage &AU) const override {
806 AU.setPreservesCFG();
807 AU.addRequired<MachineLoopInfoWrapperPass>();
808 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
809 AU.addUsedIfAvailable<AAResultsWrapperPass>();
810 AU.addPreserved<AAResultsWrapperPass>();
811 MachineFunctionPass::getAnalysisUsage(AU);
812 }
813};
814
815} // end anonymous namespace
816
817RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
818 const MachineRegisterInfo *MRI,
819 const SIRegisterInfo *TRI,
820 const MachineOperand &Op) const {
821 if (!TRI->isInAllocatableClass(RegNo: Op.getReg()))
822 return {-1, -1};
823
824 // A use via a PW operand does not need a waitcnt.
825 // A partial write is not a WAW.
826 assert(!Op.getSubReg() || !Op.isUndef());
827
828 RegInterval Result;
829
830 MCRegister MCReg = AMDGPU::getMCReg(Reg: Op.getReg(), STI: *ST);
831 unsigned RegIdx = TRI->getHWRegIndex(Reg: MCReg);
832 assert(isUInt<8>(RegIdx));
833
834 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg: Op.getReg());
835 unsigned Size = TRI->getRegSizeInBits(RC: *RC);
836
837 // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits
838 if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) {
839 unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(Reg: MCReg, MRI: *TRI) ? 1 : 0);
840 assert(Reg < AGPR_OFFSET);
841 Result.first = Reg;
842 if (TRI->isAGPR(MRI: *MRI, Reg: Op.getReg()))
843 Result.first += AGPR_OFFSET;
844 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
845 assert(Size % 16 == 0);
846 Result.second = Result.first + (Size / 16);
847 } else if (TRI->isSGPRReg(MRI: *MRI, Reg: Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
848 // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
849 // sources like SRC_PRIVATE_BASE.
850 Result.first = RegIdx + NUM_ALL_VGPRS;
851 Result.second = Result.first + divideCeil(Numerator: Size, Denominator: 32);
852 } else {
853 return {-1, -1};
854 }
855
856 return Result;
857}
858
859void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
860 InstCounterType CntTy,
861 unsigned Score) {
862 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
863 if (RegNo < NUM_ALL_VGPRS) {
864 VgprUB = std::max(a: VgprUB, b: RegNo);
865 VgprScores[CntTy][RegNo] = Score;
866 } else {
867 SgprUB = std::max(a: SgprUB, b: RegNo - NUM_ALL_VGPRS);
868 SgprScores[getSgprScoresIdx(T: CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
869 }
870 }
871}
872
873void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
874 const SIRegisterInfo *TRI,
875 const MachineRegisterInfo *MRI,
876 const MachineOperand &Op,
877 InstCounterType CntTy, unsigned Score) {
878 RegInterval Interval = getRegInterval(MI, MRI, TRI, Op);
879 setScoreByInterval(Interval, CntTy, Score);
880}
881
882// Return true if the subtarget is one that enables Point Sample Acceleration
883// and the MachineInstr passed in is one to which it might be applied (the
884// hardware makes this decision based on several factors, but we can't determine
885// this at compile time, so we have to assume it might be applied if the
886// instruction supports it).
887bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
888 if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
889 return false;
890
891 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode());
892 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
893 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
894 return BaseInfo->PointSampleAccel;
895}
896
897// Return true if the subtarget enables Point Sample Acceleration, the supplied
898// MachineInstr is one to which it might be applied and the supplied interval is
899// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
900// (this is the type that a point sample accelerated instruction effectively
901// becomes)
902bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
903 const MachineInstr &MI, RegInterval Interval) const {
904 if (!hasPointSampleAccel(MI))
905 return false;
906
907 return hasOtherPendingVmemTypes(Interval, V: VMEM_NOSAMPLER);
908}
909
910void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
911 const SIRegisterInfo *TRI,
912 const MachineRegisterInfo *MRI,
913 WaitEventType E, MachineInstr &Inst) {
914 InstCounterType T = eventCounter(masks: WaitEventMaskForInst, E);
915
916 unsigned UB = getScoreUB(T);
917 unsigned CurrScore = UB + 1;
918 if (CurrScore == 0)
919 report_fatal_error(reason: "InsertWaitcnt score wraparound");
920 // PendingEvents and ScoreUB need to be update regardless if this event
921 // changes the score of a register or not.
922 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
923 PendingEvents |= 1 << E;
924 setScoreUB(T, Val: CurrScore);
925
926 if (T == EXP_CNT) {
927 // Put score on the source vgprs. If this is a store, just use those
928 // specific register(s).
929 if (TII->isDS(MI: Inst) && Inst.mayLoadOrStore()) {
930 // All GDS operations must protect their address register (same as
931 // export.)
932 if (const auto *AddrOp = TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::addr))
933 setScoreByOperand(MI: &Inst, TRI, MRI, Op: *AddrOp, CntTy: EXP_CNT, Score: CurrScore);
934
935 if (Inst.mayStore()) {
936 if (const auto *Data0 =
937 TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data0))
938 setScoreByOperand(MI: &Inst, TRI, MRI, Op: *Data0, CntTy: EXP_CNT, Score: CurrScore);
939 if (const auto *Data1 =
940 TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data1))
941 setScoreByOperand(MI: &Inst, TRI, MRI, Op: *Data1, CntTy: EXP_CNT, Score: CurrScore);
942 } else if (SIInstrInfo::isAtomicRet(MI: Inst) && !SIInstrInfo::isGWS(MI: Inst) &&
943 Inst.getOpcode() != AMDGPU::DS_APPEND &&
944 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
945 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
946 for (const MachineOperand &Op : Inst.all_uses()) {
947 if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
948 setScoreByOperand(MI: &Inst, TRI, MRI, Op, CntTy: EXP_CNT, Score: CurrScore);
949 }
950 }
951 } else if (TII->isFLAT(MI: Inst)) {
952 if (Inst.mayStore()) {
953 setScoreByOperand(MI: &Inst, TRI, MRI,
954 Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
955 CntTy: EXP_CNT, Score: CurrScore);
956 } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
957 setScoreByOperand(MI: &Inst, TRI, MRI,
958 Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
959 CntTy: EXP_CNT, Score: CurrScore);
960 }
961 } else if (TII->isMIMG(MI: Inst)) {
962 if (Inst.mayStore()) {
963 setScoreByOperand(MI: &Inst, TRI, MRI, Op: Inst.getOperand(i: 0), CntTy: EXP_CNT,
964 Score: CurrScore);
965 } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
966 setScoreByOperand(MI: &Inst, TRI, MRI,
967 Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
968 CntTy: EXP_CNT, Score: CurrScore);
969 }
970 } else if (TII->isMTBUF(MI: Inst)) {
971 if (Inst.mayStore())
972 setScoreByOperand(MI: &Inst, TRI, MRI, Op: Inst.getOperand(i: 0), CntTy: EXP_CNT,
973 Score: CurrScore);
974 } else if (TII->isMUBUF(MI: Inst)) {
975 if (Inst.mayStore()) {
976 setScoreByOperand(MI: &Inst, TRI, MRI, Op: Inst.getOperand(i: 0), CntTy: EXP_CNT,
977 Score: CurrScore);
978 } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
979 setScoreByOperand(MI: &Inst, TRI, MRI,
980 Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
981 CntTy: EXP_CNT, Score: CurrScore);
982 }
983 } else if (TII->isLDSDIR(MI: Inst)) {
984 // LDSDIR instructions attach the score to the destination.
985 setScoreByOperand(MI: &Inst, TRI, MRI,
986 Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::vdst),
987 CntTy: EXP_CNT, Score: CurrScore);
988 } else {
989 if (TII->isEXP(MI: Inst)) {
990 // For export the destination registers are really temps that
991 // can be used as the actual source after export patching, so
992 // we need to treat them like sources and set the EXP_CNT
993 // score.
994 for (MachineOperand &DefMO : Inst.all_defs()) {
995 if (TRI->isVGPR(MRI: *MRI, Reg: DefMO.getReg())) {
996 setScoreByOperand(MI: &Inst, TRI, MRI, Op: DefMO, CntTy: EXP_CNT, Score: CurrScore);
997 }
998 }
999 }
1000 for (const MachineOperand &Op : Inst.all_uses()) {
1001 if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
1002 setScoreByOperand(MI: &Inst, TRI, MRI, Op, CntTy: EXP_CNT, Score: CurrScore);
1003 }
1004 }
1005 } else if (T == X_CNT) {
1006 for (const MachineOperand &Op : Inst.all_uses())
1007 setScoreByOperand(MI: &Inst, TRI, MRI, Op, CntTy: T, Score: CurrScore);
1008 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1009 // Match the score to the destination registers.
1010 //
1011 // Check only explicit operands. Stores, especially spill stores, include
1012 // implicit uses and defs of their super registers which would create an
1013 // artificial dependency, while these are there only for register liveness
1014 // accounting purposes.
1015 //
1016 // Special cases where implicit register defs exists, such as M0 or VCC,
1017 // but none with memory instructions.
1018 for (const MachineOperand &Op : Inst.defs()) {
1019 RegInterval Interval = getRegInterval(MI: &Inst, MRI, TRI, Op);
1020 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1021 if (Interval.first >= NUM_ALL_VGPRS)
1022 continue;
1023 if (updateVMCntOnly(Inst)) {
1024 // updateVMCntOnly should only leave us with VGPRs
1025 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1026 // defs. That's required for a sane index into `VgprMemTypes` below
1027 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1028 VmemType V = getVmemType(Inst);
1029 unsigned char TypesMask = 1 << V;
1030 // If instruction can have Point Sample Accel applied, we have to flag
1031 // this with another potential dependency
1032 if (hasPointSampleAccel(MI: Inst))
1033 TypesMask |= 1 << VMEM_NOSAMPLER;
1034 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
1035 VgprVmemTypes[RegNo] |= TypesMask;
1036 }
1037 }
1038 setScoreByInterval(Interval, CntTy: T, Score: CurrScore);
1039 }
1040 if (Inst.mayStore() &&
1041 (TII->isDS(MI: Inst) || TII->mayWriteLDSThroughDMA(MI: Inst))) {
1042 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1043 // written can be accessed. A load from LDS to VMEM does not need a wait.
1044 unsigned Slot = 0;
1045 for (const auto *MemOp : Inst.memoperands()) {
1046 if (!MemOp->isStore() ||
1047 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1048 continue;
1049 // Comparing just AA info does not guarantee memoperands are equal
1050 // in general, but this is so for LDS DMA in practice.
1051 auto AAI = MemOp->getAAInfo();
1052 // Alias scope information gives a way to definitely identify an
1053 // original memory object and practically produced in the module LDS
1054 // lowering pass. If there is no scope available we will not be able
1055 // to disambiguate LDS aliasing as after the module lowering all LDS
1056 // is squashed into a single big object. Do not attempt to use one of
1057 // the limited LDSDMAStores for something we will not be able to use
1058 // anyway.
1059 if (!AAI || !AAI.Scope)
1060 break;
1061 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1062 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1063 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1064 Slot = I + 1;
1065 break;
1066 }
1067 }
1068 }
1069 if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
1070 break;
1071 LDSDMAStores.push_back(Elt: &Inst);
1072 Slot = LDSDMAStores.size();
1073 break;
1074 }
1075 setRegScore(GprNo: FIRST_LDS_VGPR + Slot, T, Val: CurrScore);
1076 if (Slot)
1077 setRegScore(GprNo: FIRST_LDS_VGPR, T, Val: CurrScore);
1078 }
1079 }
1080}
1081
1082void WaitcntBrackets::print(raw_ostream &OS) const {
1083 OS << '\n';
1084 for (auto T : inst_counter_types(MaxCounter)) {
1085 unsigned SR = getScoreRange(T);
1086
1087 switch (T) {
1088 case LOAD_CNT:
1089 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1090 << SR << "): ";
1091 break;
1092 case DS_CNT:
1093 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1094 << SR << "): ";
1095 break;
1096 case EXP_CNT:
1097 OS << " EXP_CNT(" << SR << "): ";
1098 break;
1099 case STORE_CNT:
1100 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1101 << SR << "): ";
1102 break;
1103 case SAMPLE_CNT:
1104 OS << " SAMPLE_CNT(" << SR << "): ";
1105 break;
1106 case BVH_CNT:
1107 OS << " BVH_CNT(" << SR << "): ";
1108 break;
1109 case KM_CNT:
1110 OS << " KM_CNT(" << SR << "): ";
1111 break;
1112 case X_CNT:
1113 OS << " X_CNT(" << SR << "): ";
1114 break;
1115 default:
1116 OS << " UNKNOWN(" << SR << "): ";
1117 break;
1118 }
1119
1120 if (SR != 0) {
1121 // Print vgpr scores.
1122 unsigned LB = getScoreLB(T);
1123
1124 for (int J = 0; J <= VgprUB; J++) {
1125 unsigned RegScore = getRegScore(GprNo: J, T);
1126 if (RegScore <= LB)
1127 continue;
1128 unsigned RelScore = RegScore - LB - 1;
1129 if (J < FIRST_LDS_VGPR) {
1130 OS << RelScore << ":v" << J << " ";
1131 } else {
1132 OS << RelScore << ":ds ";
1133 }
1134 }
1135 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1136 if (isSmemCounter(T)) {
1137 for (int J = 0; J <= SgprUB; J++) {
1138 unsigned RegScore = getRegScore(GprNo: J + NUM_ALL_VGPRS, T);
1139 if (RegScore <= LB)
1140 continue;
1141 unsigned RelScore = RegScore - LB - 1;
1142 OS << RelScore << ":s" << J << " ";
1143 }
1144 }
1145 }
1146 OS << '\n';
1147 }
1148
1149 OS << "Pending Events: ";
1150 if (hasPendingEvent()) {
1151 ListSeparator LS;
1152 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1153 if (hasPendingEvent(E: (WaitEventType)I)) {
1154 OS << LS << WaitEventTypeName[I];
1155 }
1156 }
1157 } else {
1158 OS << "none";
1159 }
1160 OS << '\n';
1161
1162 OS << '\n';
1163}
1164
1165/// Simplify the waitcnt, in the sense of removing redundant counts, and return
1166/// whether a waitcnt instruction is needed at all.
1167void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1168 simplifyWaitcnt(T: LOAD_CNT, Count&: Wait.LoadCnt);
1169 simplifyWaitcnt(T: EXP_CNT, Count&: Wait.ExpCnt);
1170 simplifyWaitcnt(T: DS_CNT, Count&: Wait.DsCnt);
1171 simplifyWaitcnt(T: STORE_CNT, Count&: Wait.StoreCnt);
1172 simplifyWaitcnt(T: SAMPLE_CNT, Count&: Wait.SampleCnt);
1173 simplifyWaitcnt(T: BVH_CNT, Count&: Wait.BvhCnt);
1174 simplifyWaitcnt(T: KM_CNT, Count&: Wait.KmCnt);
1175 simplifyWaitcnt(T: X_CNT, Count&: Wait.XCnt);
1176}
1177
1178void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1179 unsigned &Count) const {
1180 // The number of outstanding events for this type, T, can be calculated
1181 // as (UB - LB). If the current Count is greater than or equal to the number
1182 // of outstanding events, then the wait for this counter is redundant.
1183 if (Count >= getScoreRange(T))
1184 Count = ~0u;
1185}
1186
1187void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
1188 AMDGPU::Waitcnt &Wait) const {
1189 const unsigned LB = getScoreLB(T);
1190 const unsigned UB = getScoreUB(T);
1191 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1192 unsigned ScoreToWait = getRegScore(GprNo: RegNo, T);
1193
1194 // If the score of src_operand falls within the bracket, we need an
1195 // s_waitcnt instruction.
1196 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1197 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1198 !ST->hasFlatLgkmVMemCountInOrder()) {
1199 // If there is a pending FLAT operation, and this is a VMem or LGKM
1200 // waitcnt and the target can report early completion, then we need
1201 // to force a waitcnt 0.
1202 addWait(Wait, T, Count: 0);
1203 } else if (counterOutOfOrder(T)) {
1204 // Counter can get decremented out-of-order when there
1205 // are multiple types event in the bracket. Also emit an s_wait counter
1206 // with a conservative value of 0 for the counter.
1207 addWait(Wait, T, Count: 0);
1208 } else {
1209 // If a counter has been maxed out avoid overflow by waiting for
1210 // MAX(CounterType) - 1 instead.
1211 unsigned NeededWait =
1212 std::min(a: UB - ScoreToWait, b: getWaitCountMax(T) - 1);
1213 addWait(Wait, T, Count: NeededWait);
1214 }
1215 }
1216 }
1217}
1218
1219void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1220 applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
1221 applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt);
1222 applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1223 applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
1224 applyWaitcnt(T: SAMPLE_CNT, Count: Wait.SampleCnt);
1225 applyWaitcnt(T: BVH_CNT, Count: Wait.BvhCnt);
1226 applyWaitcnt(T: KM_CNT, Count: Wait.KmCnt);
1227 applyXcnt(Wait);
1228}
1229
1230void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1231 const unsigned UB = getScoreUB(T);
1232 if (Count >= UB)
1233 return;
1234 if (Count != 0) {
1235 if (counterOutOfOrder(T))
1236 return;
1237 setScoreLB(T, Val: std::max(a: getScoreLB(T), b: UB - Count));
1238 } else {
1239 setScoreLB(T, Val: UB);
1240 PendingEvents &= ~WaitEventMaskForInst[T];
1241 }
1242}
1243
1244void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
1245 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1246 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1247 // zero.
1248 if (Wait.KmCnt == 0 && hasPendingEvent(E: SMEM_GROUP))
1249 return applyWaitcnt(T: X_CNT, Count: 0);
1250
1251 // If we have pending store we cannot optimize XCnt because we do not wait for
1252 // stores. VMEM loads retun in order, so if we only have loads XCnt is
1253 // decremented to the same number as LOADCnt.
1254 if (Wait.LoadCnt != ~0u && hasPendingEvent(E: VMEM_GROUP) &&
1255 !hasPendingEvent(T: STORE_CNT))
1256 return applyWaitcnt(T: X_CNT, Count: std::min(a: Wait.XCnt, b: Wait.LoadCnt));
1257
1258 applyWaitcnt(T: X_CNT, Count: Wait.XCnt);
1259}
1260
1261// Where there are multiple types of event in the bracket of a counter,
1262// the decrement may go out of order.
1263bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1264 // Scalar memory read always can go out of order.
1265 if ((T == SmemAccessCounter && hasPendingEvent(E: SMEM_ACCESS)) ||
1266 (T == X_CNT && hasPendingEvent(E: SMEM_GROUP)))
1267 return true;
1268 return hasMixedPendingEvents(T);
1269}
1270
1271INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1272 false, false)
1273INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1274INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1275INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1276 false, false)
1277
1278char SIInsertWaitcntsLegacy::ID = 0;
1279
1280char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1281
1282FunctionPass *llvm::createSIInsertWaitcntsPass() {
1283 return new SIInsertWaitcntsLegacy();
1284}
1285
1286static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1287 unsigned NewEnc) {
1288 int OpIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpName);
1289 assert(OpIdx >= 0);
1290
1291 MachineOperand &MO = MI.getOperand(i: OpIdx);
1292
1293 if (NewEnc == MO.getImm())
1294 return false;
1295
1296 MO.setImm(NewEnc);
1297 return true;
1298}
1299
1300/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1301/// and if so, which counter it is waiting on.
1302static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1303 switch (Opcode) {
1304 case AMDGPU::S_WAIT_LOADCNT:
1305 return LOAD_CNT;
1306 case AMDGPU::S_WAIT_EXPCNT:
1307 return EXP_CNT;
1308 case AMDGPU::S_WAIT_STORECNT:
1309 return STORE_CNT;
1310 case AMDGPU::S_WAIT_SAMPLECNT:
1311 return SAMPLE_CNT;
1312 case AMDGPU::S_WAIT_BVHCNT:
1313 return BVH_CNT;
1314 case AMDGPU::S_WAIT_DSCNT:
1315 return DS_CNT;
1316 case AMDGPU::S_WAIT_KMCNT:
1317 return KM_CNT;
1318 case AMDGPU::S_WAIT_XCNT:
1319 return X_CNT;
1320 default:
1321 return {};
1322 }
1323}
1324
1325bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1326 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Waitcnt->getOpcode());
1327 if (Opcode == Waitcnt->getOpcode())
1328 return false;
1329
1330 Waitcnt->setDesc(TII->get(Opcode));
1331 return true;
1332}
1333
1334/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1335/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1336/// from \p Wait that were added by previous passes. Currently this pass
1337/// conservatively assumes that these preexisting waits are required for
1338/// correctness.
1339bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1340 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1341 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1342 assert(ST);
1343 assert(isNormalMode(MaxCounter));
1344
1345 bool Modified = false;
1346 MachineInstr *WaitcntInstr = nullptr;
1347 MachineInstr *WaitcntVsCntInstr = nullptr;
1348
1349 LLVM_DEBUG({
1350 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1351 if (It == OldWaitcntInstr.getParent()->instr_end())
1352 dbgs() << "end of block\n";
1353 else
1354 dbgs() << *It;
1355 });
1356
1357 for (auto &II :
1358 make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
1359 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1360 if (II.isMetaInstruction()) {
1361 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1362 continue;
1363 }
1364
1365 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
1366 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1367
1368 // Update required wait count. If this is a soft waitcnt (= it was added
1369 // by an earlier pass), it may be entirely removed.
1370 if (Opcode == AMDGPU::S_WAITCNT) {
1371 unsigned IEnc = II.getOperand(i: 0).getImm();
1372 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(Version: IV, Encoded: IEnc);
1373 if (TrySimplify)
1374 ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1375 Wait = Wait.combined(Other: OldWait);
1376
1377 // Merge consecutive waitcnt of the same type by erasing multiples.
1378 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1379 II.eraseFromParent();
1380 Modified = true;
1381 } else
1382 WaitcntInstr = &II;
1383 } else {
1384 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1385 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1386
1387 unsigned OldVSCnt =
1388 TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1389 if (TrySimplify)
1390 ScoreBrackets.simplifyWaitcnt(T: InstCounterType::STORE_CNT, Count&: OldVSCnt);
1391 Wait.StoreCnt = std::min(a: Wait.StoreCnt, b: OldVSCnt);
1392
1393 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1394 II.eraseFromParent();
1395 Modified = true;
1396 } else
1397 WaitcntVsCntInstr = &II;
1398 }
1399 }
1400
1401 if (WaitcntInstr) {
1402 Modified |= updateOperandIfDifferent(MI&: *WaitcntInstr, OpName: AMDGPU::OpName::simm16,
1403 NewEnc: AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait));
1404 Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntInstr);
1405
1406 ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
1407 ScoreBrackets.applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt);
1408 ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1409 Wait.LoadCnt = ~0u;
1410 Wait.ExpCnt = ~0u;
1411 Wait.DsCnt = ~0u;
1412
1413 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1414 ? dbgs()
1415 << "applied pre-existing waitcnt\n"
1416 << "New Instr at block end: " << *WaitcntInstr << '\n'
1417 : dbgs() << "applied pre-existing waitcnt\n"
1418 << "Old Instr: " << *It
1419 << "New Instr: " << *WaitcntInstr << '\n');
1420 }
1421
1422 if (WaitcntVsCntInstr) {
1423 Modified |= updateOperandIfDifferent(MI&: *WaitcntVsCntInstr,
1424 OpName: AMDGPU::OpName::simm16, NewEnc: Wait.StoreCnt);
1425 Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntVsCntInstr);
1426
1427 ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
1428 Wait.StoreCnt = ~0u;
1429
1430 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1431 ? dbgs() << "applied pre-existing waitcnt\n"
1432 << "New Instr at block end: " << *WaitcntVsCntInstr
1433 << '\n'
1434 : dbgs() << "applied pre-existing waitcnt\n"
1435 << "Old Instr: " << *It
1436 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1437 }
1438
1439 return Modified;
1440}
1441
1442/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1443/// required counters in \p Wait
1444bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1445 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1446 AMDGPU::Waitcnt Wait) {
1447 assert(ST);
1448 assert(isNormalMode(MaxCounter));
1449
1450 bool Modified = false;
1451 const DebugLoc &DL = Block.findDebugLoc(MBBI: It);
1452
1453 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1454 // single instruction while VScnt has its own instruction.
1455 if (Wait.hasWaitExceptStoreCnt()) {
1456 unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait);
1457 [[maybe_unused]] auto SWaitInst =
1458 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: Enc);
1459 Modified = true;
1460
1461 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1462 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1463 dbgs() << "New Instr: " << *SWaitInst << '\n');
1464 }
1465
1466 if (Wait.hasWaitStoreCnt()) {
1467 assert(ST->hasVscnt());
1468
1469 [[maybe_unused]] auto SWaitInst =
1470 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
1471 .addReg(RegNo: AMDGPU::SGPR_NULL, flags: RegState::Undef)
1472 .addImm(Val: Wait.StoreCnt);
1473 Modified = true;
1474
1475 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1476 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1477 dbgs() << "New Instr: " << *SWaitInst << '\n');
1478 }
1479
1480 return Modified;
1481}
1482
1483AMDGPU::Waitcnt
1484WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1485 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1486}
1487
1488AMDGPU::Waitcnt
1489WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1490 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1491 ~0u /* XCNT */);
1492}
1493
1494/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1495/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1496/// were added by previous passes. Currently this pass conservatively
1497/// assumes that these preexisting waits are required for correctness.
1498bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1499 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1500 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1501 assert(ST);
1502 assert(!isNormalMode(MaxCounter));
1503
1504 bool Modified = false;
1505 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1506 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1507 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1508
1509 LLVM_DEBUG({
1510 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1511 if (It == OldWaitcntInstr.getParent()->instr_end())
1512 dbgs() << "end of block\n";
1513 else
1514 dbgs() << *It;
1515 });
1516
1517 for (auto &II :
1518 make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
1519 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1520 if (II.isMetaInstruction()) {
1521 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1522 continue;
1523 }
1524
1525 MachineInstr **UpdatableInstr;
1526
1527 // Update required wait count. If this is a soft waitcnt (= it was added
1528 // by an earlier pass), it may be entirely removed.
1529
1530 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
1531 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1532
1533 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1534 // attempt to do more than that either.
1535 if (Opcode == AMDGPU::S_WAITCNT)
1536 continue;
1537
1538 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1539 unsigned OldEnc =
1540 TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1541 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(Version: IV, LoadcntDscnt: OldEnc);
1542 if (TrySimplify)
1543 ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1544 Wait = Wait.combined(Other: OldWait);
1545 UpdatableInstr = &CombinedLoadDsCntInstr;
1546 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1547 unsigned OldEnc =
1548 TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1549 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(Version: IV, StorecntDscnt: OldEnc);
1550 if (TrySimplify)
1551 ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1552 Wait = Wait.combined(Other: OldWait);
1553 UpdatableInstr = &CombinedStoreDsCntInstr;
1554 } else {
1555 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1556 assert(CT.has_value());
1557 unsigned OldCnt =
1558 TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1559 if (TrySimplify)
1560 ScoreBrackets.simplifyWaitcnt(T: CT.value(), Count&: OldCnt);
1561 addWait(Wait, T: CT.value(), Count: OldCnt);
1562 UpdatableInstr = &WaitInstrs[CT.value()];
1563 }
1564
1565 // Merge consecutive waitcnt of the same type by erasing multiples.
1566 if (!*UpdatableInstr) {
1567 *UpdatableInstr = &II;
1568 } else {
1569 II.eraseFromParent();
1570 Modified = true;
1571 }
1572 }
1573
1574 if (CombinedLoadDsCntInstr) {
1575 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1576 // to be waited for. Otherwise, let the instruction be deleted so
1577 // the appropriate single counter wait instruction can be inserted
1578 // instead, when new S_WAIT_*CNT instructions are inserted by
1579 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1580 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1581 // the loop below that deals with single counter instructions.
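 // For example, if only Wait.DsCnt is still required at this point, the
 // combined instruction is erased here so that a plain S_WAIT_DSCNT (existing
 // or newly created) is used instead.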
1582 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1583 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
1584 Modified |= updateOperandIfDifferent(MI&: *CombinedLoadDsCntInstr,
1585 OpName: AMDGPU::OpName::simm16, NewEnc);
1586 Modified |= promoteSoftWaitCnt(Waitcnt: CombinedLoadDsCntInstr);
1587 ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
1588 ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1589 Wait.LoadCnt = ~0u;
1590 Wait.DsCnt = ~0u;
1591
1592 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1593 ? dbgs() << "applied pre-existing waitcnt\n"
1594 << "New Instr at block end: "
1595 << *CombinedLoadDsCntInstr << '\n'
1596 : dbgs() << "applied pre-existing waitcnt\n"
1597 << "Old Instr: " << *It << "New Instr: "
1598 << *CombinedLoadDsCntInstr << '\n');
1599 } else {
1600 CombinedLoadDsCntInstr->eraseFromParent();
1601 Modified = true;
1602 }
1603 }
1604
1605 if (CombinedStoreDsCntInstr) {
1606 // Similarly for S_WAIT_STORECNT_DSCNT.
1607 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1608 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
1609 Modified |= updateOperandIfDifferent(MI&: *CombinedStoreDsCntInstr,
1610 OpName: AMDGPU::OpName::simm16, NewEnc);
1611 Modified |= promoteSoftWaitCnt(Waitcnt: CombinedStoreDsCntInstr);
1612 ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
1613 ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1614 Wait.StoreCnt = ~0u;
1615 Wait.DsCnt = ~0u;
1616
1617 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1618 ? dbgs() << "applied pre-existing waitcnt\n"
1619 << "New Instr at block end: "
1620 << *CombinedStoreDsCntInstr << '\n'
1621 : dbgs() << "applied pre-existing waitcnt\n"
1622 << "Old Instr: " << *It << "New Instr: "
1623 << *CombinedStoreDsCntInstr << '\n');
1624 } else {
1625 CombinedStoreDsCntInstr->eraseFromParent();
1626 Modified = true;
1627 }
1628 }
1629
1630 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1631 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1632 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1633 // instructions so that createNewWaitcnt() will create new combined
1634 // instructions to replace them.
1635
1636 if (Wait.DsCnt != ~0u) {
1637 // This is a vector of addresses in WaitInstrs pointing to instructions
1638 // that should be removed if they are present.
1639 SmallVector<MachineInstr **, 2> WaitsToErase;
1640
1641 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1642 // both) need to be waited for, ensure that there are no existing
1643 // individual wait count instructions for these.
1644
1645 if (Wait.LoadCnt != ~0u) {
1646 WaitsToErase.push_back(Elt: &WaitInstrs[LOAD_CNT]);
1647 WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
1648 } else if (Wait.StoreCnt != ~0u) {
1649 WaitsToErase.push_back(Elt: &WaitInstrs[STORE_CNT]);
1650 WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
1651 }
1652
1653 for (MachineInstr **WI : WaitsToErase) {
1654 if (!*WI)
1655 continue;
1656
1657 (*WI)->eraseFromParent();
1658 *WI = nullptr;
1659 Modified = true;
1660 }
1661 }
1662
1663 for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
1664 if (!WaitInstrs[CT])
1665 continue;
1666
1667 unsigned NewCnt = getWait(Wait, T: CT);
1668 if (NewCnt != ~0u) {
1669 Modified |= updateOperandIfDifferent(MI&: *WaitInstrs[CT],
1670 OpName: AMDGPU::OpName::simm16, NewEnc: NewCnt);
1671 Modified |= promoteSoftWaitCnt(Waitcnt: WaitInstrs[CT]);
1672
1673 ScoreBrackets.applyWaitcnt(T: CT, Count: NewCnt);
1674 setNoWait(Wait, T: CT);
1675
1676 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1677 ? dbgs() << "applied pre-existing waitcnt\n"
1678 << "New Instr at block end: " << *WaitInstrs[CT]
1679 << '\n'
1680 : dbgs() << "applied pre-existing waitcnt\n"
1681 << "Old Instr: " << *It
1682 << "New Instr: " << *WaitInstrs[CT] << '\n');
1683 } else {
1684 WaitInstrs[CT]->eraseFromParent();
1685 Modified = true;
1686 }
1687 }
1688
1689 return Modified;
1690}
1691
1692/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
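/// For example, LoadCnt=0 together with DsCnt=0 is emitted as a single
/// combined S_WAIT_LOADCNT_DSCNT, while any remaining counter (e.g. KmCnt=0)
/// gets its own instruction from instrsForExtendedCounterTypes.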
1693bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1694 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1695 AMDGPU::Waitcnt Wait) {
1696 assert(ST);
1697 assert(!isNormalMode(MaxCounter));
1698
1699 bool Modified = false;
1700 const DebugLoc &DL = Block.findDebugLoc(MBBI: It);
1701
1702 // Check for opportunities to use combined wait instructions.
1703 if (Wait.DsCnt != ~0u) {
1704 MachineInstr *SWaitInst = nullptr;
1705
1706 if (Wait.LoadCnt != ~0u) {
1707 unsigned Enc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
1708
1709 SWaitInst = BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT))
1710 .addImm(Val: Enc);
1711
1712 Wait.LoadCnt = ~0u;
1713 Wait.DsCnt = ~0u;
1714 } else if (Wait.StoreCnt != ~0u) {
1715 unsigned Enc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
1716
1717 SWaitInst =
1718 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_STORECNT_DSCNT))
1719 .addImm(Val: Enc);
1720
1721 Wait.StoreCnt = ~0u;
1722 Wait.DsCnt = ~0u;
1723 }
1724
1725 if (SWaitInst) {
1726 Modified = true;
1727
1728 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1729 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1730 dbgs() << "New Instr: " << *SWaitInst << '\n');
1731 }
1732 }
1733
1734 // Generate an instruction for any remaining counter that needs
1735 // waiting for.
1736
1737 for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
1738 unsigned Count = getWait(Wait, T: CT);
1739 if (Count == ~0u)
1740 continue;
1741
1742 [[maybe_unused]] auto SWaitInst =
1743 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: instrsForExtendedCounterTypes[CT]))
1744 .addImm(Val: Count);
1745
1746 Modified = true;
1747
1748 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1749 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1750 dbgs() << "New Instr: " << *SWaitInst << '\n');
1751 }
1752
1753 return Modified;
1754}
1755
1756static bool readsVCCZ(const MachineInstr &MI) {
1757 unsigned Opc = MI.getOpcode();
1758 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1759 !MI.getOperand(i: 1).isUndef();
1760}
1761
1762/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1763static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1764 // Currently all conventions wait, but this may not always be the case.
1765 //
1766 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1767 // sense to omit the wait and do it in the caller.
1768 return true;
1769}
1770
1771/// \returns true if the callee is expected to wait for any outstanding waits
1772/// before returning.
1773static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
1774
1775/// Generate the s_waitcnt instruction(s) to be placed before \p MI.
1776/// Instructions of a given type complete in order,
1777/// but instructions of different types can complete out of order.
1778/// We rely on this in-order completion
1779/// and simply assign a score to each memory access instruction.
1780/// We keep track of the active "score bracket" to determine
1781/// whether a memory access requires an s_waitcnt
1782/// and, if so, what the value of each counter must be.
1783/// The "score bracket" is bounded by the lower bound and upper bound
1784/// scores (*_score_LB and *_score_ub respectively).
1785/// If FlushVmCnt is true, we also generate an s_waitcnt to flush the vmcnt
1786/// counter here.
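/// Roughly: an operand whose score for a counter lies above that counter's
/// current lower bound still depends on an outstanding event, and
/// determineWait() then asks for a count just large enough to cover it, so
/// only the necessary part of that counter's queue is drained.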
1787bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1788 WaitcntBrackets &ScoreBrackets,
1789 MachineInstr *OldWaitcntInstr,
1790 bool FlushVmCnt) {
1791 setForceEmitWaitcnt();
1792
1793 assert(!MI.isMetaInstruction());
1794
1795 AMDGPU::Waitcnt Wait;
1796
1797 // FIXME: This should have already been handled by the memory legalizer.
1798 // Removing this currently doesn't affect any lit tests, but we need to
1799 // verify that nothing was relying on this. The number of buffer invalidates
1800 // being handled here should not be expanded.
1801 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1802 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1803 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1804 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1805 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1806 Wait.LoadCnt = 0;
1807 }
1808
1809 // All waits must be resolved at call return.
1810 // NOTE: this could be improved with knowledge of all call sites or
1811 // with knowledge of the called routines.
1812 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1813 MI.getOpcode() == AMDGPU::SI_RETURN ||
1814 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1815 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1816 Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1817 }
1818 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1819 // Technically the hardware will do this on its own if we don't, but that
1820 // might cost extra cycles compared to doing it explicitly.
1821 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1822 // have to wait for outstanding VMEM stores. In this case it can be useful to
1823 // send a message to explicitly release all VGPRs before the stores have
1824 // completed, but it is only safe to do this if there are no outstanding
1825 // scratch stores.
1826 else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1827 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1828 if (!WCG->isOptNone() &&
1829 (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
1830 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1831 ScoreBrackets.getScoreRange(T: STORE_CNT) != 0 &&
1832 !ScoreBrackets.hasPendingEvent(E: SCRATCH_WRITE_ACCESS))))
1833 ReleaseVGPRInsts.insert(V: &MI);
1834 }
1835 // Resolve vm waits before gs-done.
1836 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1837 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1838 ST->hasLegacyGeometry() &&
1839 ((MI.getOperand(i: 0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1840 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1841 Wait.LoadCnt = 0;
1842 }
1843
1844 // Export & GDS instructions do not read the EXEC mask until after the export
1845 // is granted (which can occur well after the instruction is issued).
1846 // The shader program must flush all EXP operations on the export-count
1847 // before overwriting the EXEC mask.
1848 else {
1849 if (MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI)) {
1850 // Export and GDS are tracked individually, either may trigger a waitcnt
1851 // for EXEC.
1852 if (ScoreBrackets.hasPendingEvent(E: EXP_GPR_LOCK) ||
1853 ScoreBrackets.hasPendingEvent(E: EXP_PARAM_ACCESS) ||
1854 ScoreBrackets.hasPendingEvent(E: EXP_POS_ACCESS) ||
1855 ScoreBrackets.hasPendingEvent(E: GDS_GPR_LOCK)) {
1856 Wait.ExpCnt = 0;
1857 }
1858 }
1859
1860 // Wait for any pending GDS instruction to complete before any
1861 // "Always GDS" instruction.
1862 if (TII->isAlwaysGDS(Opcode: MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
1863 addWait(Wait, T: DS_CNT, Count: ScoreBrackets.getPendingGDSWait());
1864
1865 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1866 // The called function is going to insert a wait on everything in its
1867 // prolog. We still need to be careful if the call target is a load (e.g. a
1868 // GOT load). We also need to check the WAW dependency with the saved PC.
1869 Wait = AMDGPU::Waitcnt();
1870
1871 const auto &CallAddrOp = *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
1872 if (CallAddrOp.isReg()) {
1873 RegInterval CallAddrOpInterval =
1874 ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, Op: CallAddrOp);
1875
1876 ScoreBrackets.determineWait(T: SmemAccessCounter, Interval: CallAddrOpInterval,
1877 Wait);
1878
1879 if (const auto *RtnAddrOp =
1880 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst)) {
1881 RegInterval RtnAddrOpInterval =
1882 ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, Op: *RtnAddrOp);
1883
1884 ScoreBrackets.determineWait(T: SmemAccessCounter, Interval: RtnAddrOpInterval,
1885 Wait);
1886 }
1887 }
1888 } else {
1889 // FIXME: Should not be relying on memoperands.
1890 // Look at the source operands of every instruction to see if
1891 // any of them results from a previous memory operation that affects
1892 // its current usage. If so, an s_waitcnt instruction needs to be
1893 // emitted.
1894 // If the source operand was defined by a load, add the s_waitcnt
1895 // instruction.
1896 //
1897 // Two cases are handled for destination operands:
1898 // 1) If the destination operand was defined by a load, add the s_waitcnt
1899 // instruction to guarantee the right WAW order.
1900 // 2) If a destination operand was used by a recent export/store
1901 // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
1902
1903 for (const MachineMemOperand *Memop : MI.memoperands()) {
1904 const Value *Ptr = Memop->getValue();
1905 if (Memop->isStore()) {
1906 if (auto It = SLoadAddresses.find(Val: Ptr); It != SLoadAddresses.end()) {
1907 addWait(Wait, T: SmemAccessCounter, Count: 0);
1908 if (PDT->dominates(A: MI.getParent(), B: It->second))
1909 SLoadAddresses.erase(I: It);
1910 }
1911 }
1912 unsigned AS = Memop->getAddrSpace();
1913 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1914 continue;
1915 // No need to wait before load from VMEM to LDS.
1916 if (TII->mayWriteLDSThroughDMA(MI))
1917 continue;
1918
1919 // LOAD_CNT is only relevant to vgpr or LDS.
1920 unsigned RegNo = FIRST_LDS_VGPR;
1921 // Only objects with alias scope info were added to LDSDMAScopes array.
1922 // In the absence of the scope info we will not be able to disambiguate
1923 // aliasing here. There is no need to try searching for a corresponding
1924 // store slot. This is conservatively correct because in that case we
1925 // will produce a wait using the first (general) LDS DMA wait slot which
1926 // will wait on all of them anyway.
1927 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1928 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1929 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1930 if (MI.mayAlias(AA, Other: *LDSDMAStores[I], UseTBAA: true))
1931 ScoreBrackets.determineWait(T: LOAD_CNT, RegNo: RegNo + I + 1, Wait);
1932 }
1933 } else {
1934 ScoreBrackets.determineWait(T: LOAD_CNT, RegNo, Wait);
1935 }
1936 if (Memop->isStore()) {
1937 ScoreBrackets.determineWait(T: EXP_CNT, RegNo, Wait);
1938 }
1939 }
1940
1941 // Loop over use and def operands.
1942 for (const MachineOperand &Op : MI.operands()) {
1943 if (!Op.isReg())
1944 continue;
1945
1946 // If the instruction does not read tied source, skip the operand.
1947 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1948 continue;
1949
1950 RegInterval Interval = ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, Op);
1951
1952 const bool IsVGPR = TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg());
1953 if (IsVGPR) {
1954 // Implicit VGPR defs and uses are never a part of the memory
1955 // instruction's description and are usually present only to account
1956 // for super-register liveness.
1957 // TODO: Most of the other instructions also have implicit uses
1958 // for the liveness accounting only.
1959 if (Op.isImplicit() && MI.mayLoadOrStore())
1960 continue;
1961
1962 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1963 // previous write and this write are the same type of VMEM
1964 // instruction, in which case they are (in some architectures)
1965 // guaranteed to write their results in order anyway.
1966 // Additionally check instructions where Point Sample Acceleration
1967 // might be applied.
1968 if (Op.isUse() || !updateVMCntOnly(Inst: MI) ||
1969 ScoreBrackets.hasOtherPendingVmemTypes(Interval,
1970 V: getVmemType(Inst: MI)) ||
1971 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||
1972 !ST->hasVmemWriteVgprInOrder()) {
1973 ScoreBrackets.determineWait(T: LOAD_CNT, Interval, Wait);
1974 ScoreBrackets.determineWait(T: SAMPLE_CNT, Interval, Wait);
1975 ScoreBrackets.determineWait(T: BVH_CNT, Interval, Wait);
1976 ScoreBrackets.clearVgprVmemTypes(Interval);
1977 }
1978
1979 if (Op.isDef() || ScoreBrackets.hasPendingEvent(E: EXP_LDS_ACCESS)) {
1980 ScoreBrackets.determineWait(T: EXP_CNT, Interval, Wait);
1981 }
1982 ScoreBrackets.determineWait(T: DS_CNT, Interval, Wait);
1983 } else {
1984 ScoreBrackets.determineWait(T: SmemAccessCounter, Interval, Wait);
1985 }
1986
1987 if (hasXcnt() && Op.isDef())
1988 ScoreBrackets.determineWait(T: X_CNT, Interval, Wait);
1989 }
1990 }
1991 }
1992
1993 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1994 // not, we need to ensure the subtarget is capable of backing off barrier
1995 // instructions in case there are any outstanding memory operations that may
1996 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1997 if (TII->isBarrierStart(Opcode: MI.getOpcode()) &&
1998 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1999 Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2000 }
2001
2002 // TODO: Remove this work-around, enable the assert for Bug 457939
2003 // after fixing the scheduler. Also, the Shader Compiler code is
2004 // independent of target.
2005 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
2006 if (ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
2007 Wait.DsCnt = 0;
2008 }
2009 }
2010
2011 // Verify that the wait is actually needed.
2012 ScoreBrackets.simplifyWaitcnt(Wait);
2013
2014 // When forcing emit, we need to skip terminators because emitting a waitcnt
2015 // between terminators would break the MBB's terminator sequence.
2016 if (ForceEmitZeroFlag && !MI.isTerminator())
2017 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2018
2019 if (ForceEmitWaitcnt[LOAD_CNT])
2020 Wait.LoadCnt = 0;
2021 if (ForceEmitWaitcnt[EXP_CNT])
2022 Wait.ExpCnt = 0;
2023 if (ForceEmitWaitcnt[DS_CNT])
2024 Wait.DsCnt = 0;
2025 if (ForceEmitWaitcnt[SAMPLE_CNT])
2026 Wait.SampleCnt = 0;
2027 if (ForceEmitWaitcnt[BVH_CNT])
2028 Wait.BvhCnt = 0;
2029 if (ForceEmitWaitcnt[KM_CNT])
2030 Wait.KmCnt = 0;
2031 if (ForceEmitWaitcnt[X_CNT])
2032 Wait.XCnt = 0;
2033
2034 if (FlushVmCnt) {
2035 if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT))
2036 Wait.LoadCnt = 0;
2037 if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT))
2038 Wait.SampleCnt = 0;
2039 if (ScoreBrackets.hasPendingEvent(T: BVH_CNT))
2040 Wait.BvhCnt = 0;
2041 }
2042
2043 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
2044 Wait.LoadCnt = 0;
2045
2046 return generateWaitcnt(Wait, It: MI.getIterator(), Block&: *MI.getParent(), ScoreBrackets,
2047 OldWaitcntInstr);
2048}
2049
2050bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2051 MachineBasicBlock::instr_iterator It,
2052 MachineBasicBlock &Block,
2053 WaitcntBrackets &ScoreBrackets,
2054 MachineInstr *OldWaitcntInstr) {
2055 bool Modified = false;
2056
2057 if (OldWaitcntInstr)
2058 // Try to merge the required wait with preexisting waitcnt instructions.
2059 // Also erase redundant waitcnt.
2060 Modified =
2061 WCG->applyPreexistingWaitcnt(ScoreBrackets, OldWaitcntInstr&: *OldWaitcntInstr, Wait, It);
2062
2063 // Any counts that could have been applied to existing waitcnt
2064 // instructions have been applied by now; deal with any that remain.
2065 ScoreBrackets.applyWaitcnt(Wait);
2066
2067 // ExpCnt can be merged into VINTERP.
2068 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2069 SIInstrInfo::isVINTERP(MI: *It)) {
2070 MachineOperand *WaitExp =
2071 TII->getNamedOperand(MI&: *It, OperandName: AMDGPU::OpName::waitexp);
2072 if (Wait.ExpCnt < WaitExp->getImm()) {
2073 WaitExp->setImm(Wait.ExpCnt);
2074 Modified = true;
2075 }
2076 Wait.ExpCnt = ~0u;
2077
2078 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2079 << "Update Instr: " << *It);
2080 }
2081
2082 // XCnt may already have been consumed by a load wait.
2083 if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
2084 !ScoreBrackets.hasPendingEvent(E: SMEM_GROUP))
2085 Wait.XCnt = ~0u;
2086
2087 if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
2088 !ScoreBrackets.hasPendingEvent(E: VMEM_GROUP))
2089 Wait.XCnt = ~0u;
2090
2091 // Since the translation of VMEM addresses occurs in order, we can skip the
2092 // XCnt if the current instruction is of VMEM type and has a memory dependency
2093 // on another VMEM instruction in flight.
2094 if (Wait.XCnt != ~0u && isVmemAccess(MI: *It))
2095 Wait.XCnt = ~0u;
2096
2097 if (WCG->createNewWaitcnt(Block, It, Wait))
2098 Modified = true;
2099
2100 return Modified;
2101}
2102
2103// This is a flat memory operation. Check to see if it has memory tokens other
2104// than LDS. Other address spaces supported by flat memory operations involve
2105// global memory.
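// For example, a FLAT access whose only memory operand is in the LDS address
// space is treated as LDS-only and returns false here, while a FLAT access
// with no memory operands at all is conservatively assumed to access VMEM.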
2106bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
2107 assert(TII->isFLAT(MI));
2108
2109 // All flat instructions use the VMEM counter.
2110 assert(TII->usesVM_CNT(MI));
2111
2112 // If there are no memory operands then conservatively assume the flat
2113 // operation may access VMEM.
2114 if (MI.memoperands_empty())
2115 return true;
2116
2117 // See if any memory operand specifies an address space that involves VMEM.
2118 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
2119 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
2120 // (GDS) address space is not supported by flat operations. Therefore, simply
2121 // return true unless only the LDS address space is found.
2122 for (const MachineMemOperand *Memop : MI.memoperands()) {
2123 unsigned AS = Memop->getAddrSpace();
2124 assert(AS != AMDGPUAS::REGION_ADDRESS);
2125 if (AS != AMDGPUAS::LOCAL_ADDRESS)
2126 return true;
2127 }
2128
2129 return false;
2130}
2131
2132// This is a flat memory operation. Check to see if it has memory tokens for
2133// either LDS or FLAT.
2134bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
2135 assert(TII->isFLAT(MI));
2136
2137 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
2138 if (!TII->usesLGKM_CNT(MI))
2139 return false;
2140
2141 // If in tgsplit mode then there can be no use of LDS.
2142 if (ST->isTgSplitEnabled())
2143 return false;
2144
2145 // If there are no memory operands then conservatively assume the flat
2146 // operation may access LDS.
2147 if (MI.memoperands_empty())
2148 return true;
2149
2150 // See if any memory operand specifies an address space that involves LDS.
2151 for (const MachineMemOperand *Memop : MI.memoperands()) {
2152 unsigned AS = Memop->getAddrSpace();
2153 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
2154 return true;
2155 }
2156
2157 return false;
2158}
2159
2160// This is a flat memory operation. Check to see if it has memory tokens for
2161// either scratch or FLAT.
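// For example, a FLAT access with a PRIVATE or FLAT address-space memory
// operand is treated as possibly accessing scratch, while a GLOBAL-only
// access is not.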
2162bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
2163 const MachineInstr &MI) const {
2164 assert(TII->isFLAT(MI));
2165
2166 // SCRATCH instructions always access scratch.
2167 if (TII->isFLATScratch(MI))
2168 return true;
2169
2170 // GLOBAL instructions never access scratch.
2171 if (TII->isFLATGlobal(MI))
2172 return false;
2173
2174 // If there are no memory operands then conservatively assume the flat
2175 // operation may access scratch.
2176 if (MI.memoperands_empty())
2177 return true;
2178
2179 // See if any memory operand specifies an address space that involves scratch.
2180 return any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *Memop) {
2181 unsigned AS = Memop->getAddrSpace();
2182 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
2183 });
2184}
2185
2186bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2187 return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
2188 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(Opc: MI.getOpcode()));
2189}
2190
2191static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
2192 auto Opc = Inst.getOpcode();
2193 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
2194 Opc == AMDGPU::GLOBAL_WBINV;
2195}
2196
2197// Return true if the next instruction is S_ENDPGM, following fallthrough
2198// blocks if necessary.
2199bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2200 MachineBasicBlock *Block) const {
2201 auto BlockEnd = Block->getParent()->end();
2202 auto BlockIter = Block->getIterator();
2203
2204 while (true) {
2205 if (It.isEnd()) {
2206 if (++BlockIter != BlockEnd) {
2207 It = BlockIter->instr_begin();
2208 continue;
2209 }
2210
2211 return false;
2212 }
2213
2214 if (!It->isMetaInstruction())
2215 break;
2216
2217 It++;
2218 }
2219
2220 assert(!It.isEnd());
2221
2222 return It->getOpcode() == AMDGPU::S_ENDPGM;
2223}
2224
2225// Add a wait after an instruction if architecture requirements mandate one.
2226bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2227 MachineBasicBlock &Block,
2228 WaitcntBrackets &ScoreBrackets) {
2229 AMDGPU::Waitcnt Wait;
2230 bool NeedsEndPGMCheck = false;
2231
2232 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2233 Wait = WCG->getAllZeroWaitcnt(IncludeVSCnt: Inst.mayStore() &&
2234 !SIInstrInfo::isAtomicRet(MI: Inst));
2235
2236 if (TII->isAlwaysGDS(Opcode: Inst.getOpcode())) {
2237 Wait.DsCnt = 0;
2238 NeedsEndPGMCheck = true;
2239 }
2240
2241 ScoreBrackets.simplifyWaitcnt(Wait);
2242
2243 auto SuccessorIt = std::next(x: Inst.getIterator());
2244 bool Result = generateWaitcnt(Wait, It: SuccessorIt, Block, ScoreBrackets,
2245 /*OldWaitcntInstr=*/nullptr);
2246
2247 if (Result && NeedsEndPGMCheck && isNextENDPGM(It: SuccessorIt, Block: &Block)) {
2248 BuildMI(BB&: Block, I: SuccessorIt, MIMD: Inst.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_NOP))
2249 .addImm(Val: 0);
2250 }
2251
2252 return Result;
2253}
2254
2255void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2256 WaitcntBrackets *ScoreBrackets) {
2257 // Now look at the instruction opcode. If it is a memory access
2258 // instruction, update the upper-bound of the appropriate counter's
2259 // bracket and the destination operand scores.
2260 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
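 // Roughly: VMEM loads and stores bump LOAD_CNT/STORE_CNT (plus the VMEM or
 // SMEM transaction group when XCnt is available), DS operations bump DS_CNT,
 // and SMEM accesses bump the counter tracked by SmemAccessCounter.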
2261
2262 bool IsVMEMAccess = false;
2263 bool IsSMEMAccess = false;
2264 if (TII->isDS(MI: Inst) && TII->usesLGKM_CNT(MI: Inst)) {
2265 if (TII->isAlwaysGDS(Opcode: Inst.getOpcode()) ||
2266 TII->hasModifiersSet(MI: Inst, OpName: AMDGPU::OpName::gds)) {
2267 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_ACCESS, Inst);
2268 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_GPR_LOCK, Inst);
2269 ScoreBrackets->setPendingGDS();
2270 } else {
2271 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst);
2272 }
2273 } else if (TII->isFLAT(MI: Inst)) {
2274 if (isGFX12CacheInvOrWBInst(Inst)) {
2275 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
2276 Inst);
2277 return;
2278 }
2279
2280 assert(Inst.mayLoadOrStore());
2281
2282 int FlatASCount = 0;
2283
2284 if (mayAccessVMEMThroughFlat(MI: Inst)) {
2285 ++FlatASCount;
2286 IsVMEMAccess = true;
2287 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
2288 Inst);
2289 }
2290
2291 if (mayAccessLDSThroughFlat(MI: Inst)) {
2292 ++FlatASCount;
2293 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst);
2294 }
2295
2296 // A Flat memory operation must access at least one address space.
2297 assert(FlatASCount);
2298
2299 // This is a flat memory operation that accesses both VMEM and LDS, so note
2300 // it - it will require that both the VM and LGKM counters be flushed to zero
2301 // if it is pending when a VM or LGKM dependency occurs.
2302 if (FlatASCount > 1)
2303 ScoreBrackets->setPendingFlat();
2304 } else if (SIInstrInfo::isVMEM(MI: Inst) &&
2305 !llvm::AMDGPU::getMUBUFIsBufferInv(Opc: Inst.getOpcode())) {
2306 IsVMEMAccess = true;
2307 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
2308 Inst);
2309
2310 if (ST->vmemWriteNeedsExpWaitcnt() &&
2311 (Inst.mayStore() || SIInstrInfo::isAtomicRet(MI: Inst))) {
2312 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: VMW_GPR_LOCK, Inst);
2313 }
2314 } else if (TII->isSMRD(MI: Inst)) {
2315 IsSMEMAccess = true;
2316 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst);
2317 } else if (Inst.isCall()) {
2318 if (callWaitsOnFunctionReturn(MI: Inst)) {
2319 // Act as a wait on everything
2320 ScoreBrackets->applyWaitcnt(
2321 Wait: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2322 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2323 } else {
2324 // May need to wait for anything.
2325 ScoreBrackets->applyWaitcnt(Wait: AMDGPU::Waitcnt());
2326 }
2327 } else if (SIInstrInfo::isLDSDIR(MI: Inst)) {
2328 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_LDS_ACCESS, Inst);
2329 } else if (TII->isVINTERP(MI: Inst)) {
2330 int64_t Imm = TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::waitexp)->getImm();
2331 ScoreBrackets->applyWaitcnt(T: EXP_CNT, Count: Imm);
2332 } else if (SIInstrInfo::isEXP(MI: Inst)) {
2333 unsigned Imm = TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::tgt)->getImm();
2334 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2335 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_PARAM_ACCESS, Inst);
2336 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2337 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_POS_ACCESS, Inst);
2338 else
2339 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_GPR_LOCK, Inst);
2340 } else {
2341 switch (Inst.getOpcode()) {
2342 case AMDGPU::S_SENDMSG:
2343 case AMDGPU::S_SENDMSG_RTN_B32:
2344 case AMDGPU::S_SENDMSG_RTN_B64:
2345 case AMDGPU::S_SENDMSGHALT:
2346 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SQ_MESSAGE, Inst);
2347 break;
2348 case AMDGPU::S_MEMTIME:
2349 case AMDGPU::S_MEMREALTIME:
2350 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2351 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2352 case AMDGPU::S_GET_BARRIER_STATE_M0:
2353 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2354 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst);
2355 break;
2356 }
2357 }
2358
2359 if (!hasXcnt())
2360 return;
2361
2362 if (IsVMEMAccess)
2363 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: VMEM_GROUP, Inst);
2364
2365 if (IsSMEMAccess)
2366 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_GROUP, Inst);
2367}
2368
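// Rebase a score from this bracket and the corresponding score from the other
// bracket into the common frame computed by merge(), then keep the larger one.
// For example, with pending ranges [LB=4, UB=6] here and [LB=0, UB=5] in the
// other bracket, merge() picks NewUB = 4 + max(2, 5) = 9, so MyShift = 3 and
// OtherShift = 4; a score of 5 here becomes 8, a score of 3 in the other
// bracket becomes 7, and scores at or below their lower bound become 0.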
2369bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2370 unsigned OtherScore) {
2371 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2372 unsigned OtherShifted =
2373 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2374 Score = std::max(a: MyShifted, b: OtherShifted);
2375 return OtherShifted > MyShifted;
2376}
2377
2378/// Merge the pending events and associated score brackets of \p Other into
2379/// this bracket's status.
2380///
2381/// Returns whether the merge resulted in a change that requires tighter waits
2382/// (i.e. the merged brackets strictly dominate the original brackets).
2383bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2384 bool StrictDom = false;
2385
2386 VgprUB = std::max(a: VgprUB, b: Other.VgprUB);
2387 SgprUB = std::max(a: SgprUB, b: Other.SgprUB);
2388
2389 for (auto T : inst_counter_types(MaxCounter)) {
2390 // Merge event flags for this counter
2391 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2392 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2393 if (OtherEvents & ~OldEvents)
2394 StrictDom = true;
2395 PendingEvents |= OtherEvents;
2396
2397 // Merge scores for this counter
2398 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2399 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2400 const unsigned NewUB = ScoreLBs[T] + std::max(a: MyPending, b: OtherPending);
2401 if (NewUB < ScoreLBs[T])
2402 report_fatal_error(reason: "waitcnt score overflow");
2403
2404 MergeInfo M;
2405 M.OldLB = ScoreLBs[T];
2406 M.OtherLB = Other.ScoreLBs[T];
2407 M.MyShift = NewUB - ScoreUBs[T];
2408 M.OtherShift = NewUB - Other.ScoreUBs[T];
2409
2410 ScoreUBs[T] = NewUB;
2411
2412 StrictDom |= mergeScore(M, Score&: LastFlat[T], OtherScore: Other.LastFlat[T]);
2413
2414 if (T == DS_CNT)
2415 StrictDom |= mergeScore(M, Score&: LastGDS, OtherScore: Other.LastGDS);
2416
2417 for (int J = 0; J <= VgprUB; J++)
2418 StrictDom |= mergeScore(M, Score&: VgprScores[T][J], OtherScore: Other.VgprScores[T][J]);
2419
2420 if (isSmemCounter(T)) {
2421 unsigned Idx = getSgprScoresIdx(T);
2422 for (int J = 0; J <= SgprUB; J++)
2423 StrictDom |=
2424 mergeScore(M, Score&: SgprScores[Idx][J], OtherScore: Other.SgprScores[Idx][J]);
2425 }
2426 }
2427
2428 for (int J = 0; J <= VgprUB; J++) {
2429 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2430 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2431 VgprVmemTypes[J] = NewVmemTypes;
2432 }
2433
2434 return StrictDom;
2435}
2436
2437static bool isWaitInstr(MachineInstr &Inst) {
2438 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Inst.getOpcode());
2439 return Opcode == AMDGPU::S_WAITCNT ||
2440 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(i: 0).isReg() &&
2441 Inst.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL) ||
2442 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2443 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2444 counterTypeForInstr(Opcode).has_value();
2445}
2446
2447// Generate s_waitcnt instructions where needed.
2448bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2449 MachineBasicBlock &Block,
2450 WaitcntBrackets &ScoreBrackets) {
2451 bool Modified = false;
2452
2453 LLVM_DEBUG({
2454 dbgs() << "*** Begin Block: ";
2455 Block.printName(dbgs());
2456 ScoreBrackets.dump();
2457 });
2458
2459 // Track the correctness of vccz through this basic block. There are two
2460 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2461 // ST->partialVCCWritesUpdateVCCZ().
2462 bool VCCZCorrect = true;
2463 if (ST->hasReadVCCZBug()) {
2464 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2465 // to vcc and then issued an smem load.
2466 VCCZCorrect = false;
2467 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2468 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2469 // to vcc_lo or vcc_hi.
2470 VCCZCorrect = false;
2471 }
2472
2473 // Walk over the instructions.
2474 MachineInstr *OldWaitcntInstr = nullptr;
2475
2476 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2477 E = Block.instr_end();
2478 Iter != E;) {
2479 MachineInstr &Inst = *Iter;
2480 if (Inst.isMetaInstruction()) {
2481 ++Iter;
2482 continue;
2483 }
2484
2485 // Track pre-existing waitcnts that were added in earlier iterations or by
2486 // the memory legalizer.
2487 if (isWaitInstr(Inst)) {
2488 if (!OldWaitcntInstr)
2489 OldWaitcntInstr = &Inst;
2490 ++Iter;
2491 continue;
2492 }
2493
2494 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2495 isPreheaderToFlush(MBB&: Block, ScoreBrackets);
2496
2497 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2498 Modified |= generateWaitcntInstBefore(MI&: Inst, ScoreBrackets, OldWaitcntInstr,
2499 FlushVmCnt);
2500 OldWaitcntInstr = nullptr;
2501
2502 // Restore vccz if it's not known to be correct already.
2503 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(MI: Inst);
2504
2505 // Don't examine operands unless we need to track vccz correctness.
2506 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2507 if (Inst.definesRegister(Reg: AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2508 Inst.definesRegister(Reg: AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2509 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2510 if (!ST->partialVCCWritesUpdateVCCZ())
2511 VCCZCorrect = false;
2512 } else if (Inst.definesRegister(Reg: AMDGPU::VCC, /*TRI=*/nullptr)) {
2513 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2514 // vccz bit, so when we detect that an instruction may read from a
2515 // corrupt vccz bit, we need to:
2516 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2517 // operations to complete.
2518 // 2. Restore the correct value of vccz by writing the current value
2519 // of vcc back to vcc.
2520 if (ST->hasReadVCCZBug() &&
2521 ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
2522 // Writes to vcc while there's an outstanding smem read may get
2523 // clobbered as soon as any read completes.
2524 VCCZCorrect = false;
2525 } else {
2526 // Writes to vcc will fix any incorrect value in vccz.
2527 VCCZCorrect = true;
2528 }
2529 }
2530 }
2531
2532 if (TII->isSMRD(MI: Inst)) {
2533 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2534 // No need to handle invariant loads when avoiding WAR conflicts, as
2535 // there cannot be a vector store to the same memory location.
2536 if (!Memop->isInvariant()) {
2537 const Value *Ptr = Memop->getValue();
2538 SLoadAddresses.insert(KV: std::pair(Ptr, Inst.getParent()));
2539 }
2540 }
2541 if (ST->hasReadVCCZBug()) {
2542 // This smem read could complete and clobber vccz at any time.
2543 VCCZCorrect = false;
2544 }
2545 }
2546
2547 updateEventWaitcntAfter(Inst, ScoreBrackets: &ScoreBrackets);
2548
2549 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
2550
2551 LLVM_DEBUG({
2552 Inst.print(dbgs());
2553 ScoreBrackets.dump();
2554 });
2555
2556 // TODO: Remove this work-around after fixing the scheduler and enable the
2557 // assert above.
2558 if (RestoreVCCZ) {
2559 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2560 // bit is updated, so we can restore the bit by reading the value of
2561 // vcc and then writing it back to the register.
2562 BuildMI(BB&: Block, I&: Inst, MIMD: Inst.getDebugLoc(),
2563 MCID: TII->get(Opcode: ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2564 DestReg: TRI->getVCC())
2565 .addReg(RegNo: TRI->getVCC());
2566 VCCZCorrect = true;
2567 Modified = true;
2568 }
2569
2570 ++Iter;
2571 }
2572
2573 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2574 // needed.
2575 AMDGPU::Waitcnt Wait;
2576 if (Block.getFirstTerminator() == Block.end() &&
2577 isPreheaderToFlush(MBB&: Block, ScoreBrackets)) {
2578 if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT))
2579 Wait.LoadCnt = 0;
2580 if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT))
2581 Wait.SampleCnt = 0;
2582 if (ScoreBrackets.hasPendingEvent(T: BVH_CNT))
2583 Wait.BvhCnt = 0;
2584 }
2585
2586 // Combine or remove any redundant waitcnts at the end of the block.
2587 Modified |= generateWaitcnt(Wait, It: Block.instr_end(), Block, ScoreBrackets,
2588 OldWaitcntInstr);
2589
2590 LLVM_DEBUG({
2591 dbgs() << "*** End Block: ";
2592 Block.printName(dbgs());
2593 ScoreBrackets.dump();
2594 });
2595
2596 return Modified;
2597}
2598
2599// Return true if the given machine basic block is a preheader of a loop in
2600// which we want to flush the vmcnt counter, and false otherwise.
2601bool SIInsertWaitcnts::isPreheaderToFlush(
2602 MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
2603 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(Key: &MBB, Args: false);
2604 if (!IsInserted)
2605 return Iterator->second;
2606
2607 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2608 if (!Succ)
2609 return false;
2610
2611 MachineLoop *Loop = MLI->getLoopFor(BB: Succ);
2612 if (!Loop)
2613 return false;
2614
2615 if (Loop->getLoopPreheader() == &MBB &&
2616 shouldFlushVmCnt(ML: Loop, Brackets: ScoreBrackets)) {
2617 Iterator->second = true;
2618 return true;
2619 }
2620
2621 return false;
2622}
2623
2624bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2625 if (SIInstrInfo::isFLAT(MI))
2626 return mayAccessVMEMThroughFlat(MI);
2627 return SIInstrInfo::isVMEM(MI);
2628}
2629
2630// Return true if it is better to flush the vmcnt counter in the preheader of
2631// the given loop. We currently decide to flush in two situations:
2632// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2633// vgpr containing a value that is loaded outside of the loop. (Only on
2634// targets with no vscnt counter).
2635// 2. The loop contains vmem load(s), but the loaded values are not used in the
2636// loop, and at least one use of a vgpr containing a value that is loaded
2637// outside of the loop.
2638bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2639 const WaitcntBrackets &Brackets) {
2640 bool HasVMemLoad = false;
2641 bool HasVMemStore = false;
2642 bool UsesVgprLoadedOutside = false;
2643 DenseSet<Register> VgprUse;
2644 DenseSet<Register> VgprDef;
2645
2646 for (MachineBasicBlock *MBB : ML->blocks()) {
2647 for (MachineInstr &MI : *MBB) {
2648 if (isVMEMOrFlatVMEM(MI)) {
2649 if (MI.mayLoad())
2650 HasVMemLoad = true;
2651 if (MI.mayStore())
2652 HasVMemStore = true;
2653 }
2654 for (const MachineOperand &Op : MI.all_uses()) {
2655 if (!TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
2656 continue;
2657 RegInterval Interval = Brackets.getRegInterval(MI: &MI, MRI, TRI, Op);
2658 // Vgpr use
2659 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2660 // If we find a register that is loaded inside the loop, 1. and 2.
2661 // are invalidated and we can exit.
2662 if (VgprDef.contains(V: RegNo))
2663 return false;
2664 VgprUse.insert(V: RegNo);
2665 // If at least one of Op's registers is in the score brackets, the
2666 // value is likely loaded outside of the loop.
2667 if (Brackets.getRegScore(GprNo: RegNo, T: LOAD_CNT) >
2668 Brackets.getScoreLB(T: LOAD_CNT) ||
2669 Brackets.getRegScore(GprNo: RegNo, T: SAMPLE_CNT) >
2670 Brackets.getScoreLB(T: SAMPLE_CNT) ||
2671 Brackets.getRegScore(GprNo: RegNo, T: BVH_CNT) >
2672 Brackets.getScoreLB(T: BVH_CNT)) {
2673 UsesVgprLoadedOutside = true;
2674 break;
2675 }
2676 }
2677 }
2678
2679 // VMem load vgpr def
2680 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
2681 for (const MachineOperand &Op : MI.all_defs()) {
2682 RegInterval Interval = Brackets.getRegInterval(MI: &MI, MRI, TRI, Op);
2683 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2684 // If we find a register that is loaded inside the loop, 1. and 2.
2685 // are invalidated and we can exit.
2686 if (VgprUse.contains(V: RegNo))
2687 return false;
2688 VgprDef.insert(V: RegNo);
2689 }
2690 }
2691 }
2692 }
2693 }
2694 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2695 return true;
2696 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2697}
2698
2699bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
2700 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2701 auto *PDT =
2702 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2703 AliasAnalysis *AA = nullptr;
2704 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2705 AA = &AAR->getAAResults();
2706
2707 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2708}
2709
2710PreservedAnalyses
2711SIInsertWaitcntsPass::run(MachineFunction &MF,
2712 MachineFunctionAnalysisManager &MFAM) {
2713 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(IR&: MF);
2714 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(IR&: MF);
2715 auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(IR&: MF)
2716 .getManager()
2717 .getCachedResult<AAManager>(IR&: MF.getFunction());
2718
2719 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2720 return PreservedAnalyses::all();
2721
2722 return getMachineFunctionPassPreservedAnalyses()
2723 .preserveSet<CFGAnalyses>()
2724 .preserve<AAManager>();
2725}
2726
2727bool SIInsertWaitcnts::run(MachineFunction &MF) {
2728 ST = &MF.getSubtarget<GCNSubtarget>();
2729 TII = ST->getInstrInfo();
2730 TRI = &TII->getRegisterInfo();
2731 MRI = &MF.getRegInfo();
2732 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2733
2734 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST->getCPU());
2735
2736 if (ST->hasExtendedWaitCounts()) {
2737 MaxCounter = NUM_EXTENDED_INST_CNTS;
2738 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2739 WCG = &WCGGFX12Plus;
2740 } else {
2741 MaxCounter = NUM_NORMAL_INST_CNTS;
2742 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2743 WCG = &WCGPreGFX12;
2744 }
2745
2746 for (auto T : inst_counter_types())
2747 ForceEmitWaitcnt[T] = false;
2748
2749 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2750
2751 SmemAccessCounter = eventCounter(masks: WaitEventMaskForInst, E: SMEM_ACCESS);
2752
2753 HardwareLimits Limits = {};
2754 if (ST->hasExtendedWaitCounts()) {
2755 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(Version: IV);
2756 Limits.DscntMax = AMDGPU::getDscntBitMask(Version: IV);
2757 } else {
2758 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(Version: IV);
2759 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(Version: IV);
2760 }
2761 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(Version: IV);
2762 Limits.StorecntMax = AMDGPU::getStorecntBitMask(Version: IV);
2763 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(Version: IV);
2764 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(Version: IV);
2765 Limits.KmcntMax = AMDGPU::getKmcntBitMask(Version: IV);
2766 Limits.XcntMax = AMDGPU::getXcntBitMask(Version: IV);
2767
2768 [[maybe_unused]] unsigned NumVGPRsMax =
2769 ST->getAddressableNumVGPRs(DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize());
2770 [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2771 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2772 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2773
2774 BlockInfos.clear();
2775 bool Modified = false;
2776
2777 MachineBasicBlock &EntryBB = MF.front();
2778 MachineBasicBlock::iterator I = EntryBB.begin();
2779
2780 if (!MFI->isEntryFunction()) {
2781 // Wait for any outstanding memory operations that the input registers may
2782 // depend on. We can't track them and it's better to do the wait after the
2783 // costly call sequence.
2784
2785 // TODO: Could insert earlier and schedule more liberally with operations
2786 // that only use caller preserved registers.
2787 for (MachineBasicBlock::iterator E = EntryBB.end();
2788 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2789 ;
2790
2791 if (ST->hasExtendedWaitCounts()) {
2792 BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT))
2793 .addImm(Val: 0);
2794 for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
2795 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
2796 continue;
2797
2798 if (!ST->hasImageInsts() &&
2799 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
2800 continue;
2801
2802 BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(),
2803 MCID: TII->get(Opcode: instrsForExtendedCounterTypes[CT]))
2804 .addImm(Val: 0);
2805 }
2806 } else {
2807 BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: 0);
2808 }
2809
2810 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2811 args&: ST, args&: MaxCounter, args&: Limits, args&: WaitEventMaskForInst, args&: SmemAccessCounter);
2812 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2813 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2814
2815 Modified = true;
2816 }
2817
2818 // Keep iterating over the blocks in reverse post order, inserting and
2819 // updating s_waitcnt where needed, until a fix point is reached.
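 // A block's outgoing bracket state is propagated to each successor; if the
 // merge changes a successor's incoming state (or the successor has none yet),
 // that successor is marked dirty and revisited, repeating on backedges until
 // nothing changes.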
2820 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2821 BlockInfos.try_emplace(Key: MBB);
2822
2823 std::unique_ptr<WaitcntBrackets> Brackets;
2824 bool Repeat;
2825 do {
2826 Repeat = false;
2827
2828 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2829 ++BII) {
2830 MachineBasicBlock *MBB = BII->first;
2831 BlockInfo &BI = BII->second;
2832 if (!BI.Dirty)
2833 continue;
2834
2835 if (BI.Incoming) {
2836 if (!Brackets)
2837 Brackets = std::make_unique<WaitcntBrackets>(args&: *BI.Incoming);
2838 else
2839 *Brackets = *BI.Incoming;
2840 } else {
2841 if (!Brackets) {
2842 Brackets = std::make_unique<WaitcntBrackets>(
2843 args&: ST, args&: MaxCounter, args&: Limits, args&: WaitEventMaskForInst, args&: SmemAccessCounter);
2844 } else {
2845 // Reinitialize in-place. N.B. do not do this by assigning from a
2846 // temporary because the WaitcntBrackets class is large and it could
2847 // cause this function to use an unreasonable amount of stack space.
2848 Brackets->~WaitcntBrackets();
2849 new (Brackets.get()) WaitcntBrackets(
2850 ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
2851 }
2852 }
2853
2854 Modified |= insertWaitcntInBlock(MF, Block&: *MBB, ScoreBrackets&: *Brackets);
2855 BI.Dirty = false;
2856
2857 if (Brackets->hasPendingEvent()) {
2858 BlockInfo *MoveBracketsToSucc = nullptr;
2859 for (MachineBasicBlock *Succ : MBB->successors()) {
2860 auto *SuccBII = BlockInfos.find(Key: Succ);
2861 BlockInfo &SuccBI = SuccBII->second;
2862 if (!SuccBI.Incoming) {
2863 SuccBI.Dirty = true;
2864 if (SuccBII <= BII) {
2865 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2866 Repeat = true;
2867 }
2868 if (!MoveBracketsToSucc) {
2869 MoveBracketsToSucc = &SuccBI;
2870 } else {
2871 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(args&: *Brackets);
2872 }
2873 } else if (SuccBI.Incoming->merge(Other: *Brackets)) {
2874 SuccBI.Dirty = true;
2875 if (SuccBII <= BII) {
2876 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2877 Repeat = true;
2878 }
2879 }
2880 }
2881 if (MoveBracketsToSucc)
2882 MoveBracketsToSucc->Incoming = std::move(Brackets);
2883 }
2884 }
2885 } while (Repeat);
2886
2887 if (ST->hasScalarStores()) {
2888 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2889 bool HaveScalarStores = false;
2890
2891 for (MachineBasicBlock &MBB : MF) {
2892 for (MachineInstr &MI : MBB) {
2893 if (!HaveScalarStores && TII->isScalarStore(MI))
2894 HaveScalarStores = true;
2895
2896 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2897 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2898 EndPgmBlocks.push_back(Elt: &MBB);
2899 }
2900 }
2901
2902 if (HaveScalarStores) {
2903 // If scalar writes are used, the cache must be flushed or else the next
2904 // wave to reuse the same scratch memory can be clobbered.
2905 //
2906 // Insert s_dcache_wb at wave termination points if there were any scalar
2907 // stores, and only if the cache hasn't already been flushed. This could
2908 // be improved by looking across blocks for flushes in postdominating
2909 // blocks from the stores but an explicitly requested flush is probably
2910 // very rare.
2911 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2912 bool SeenDCacheWB = false;
2913
2914 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2915 I != E; ++I) {
2916 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2917 SeenDCacheWB = true;
2918 else if (TII->isScalarStore(MI: *I))
2919 SeenDCacheWB = false;
2920
2921 // FIXME: It would be better to insert this before a waitcnt if any.
2922 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2923 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2924 !SeenDCacheWB) {
2925 Modified = true;
2926 BuildMI(BB&: *MBB, I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_DCACHE_WB));
2927 }
2928 }
2929 }
2930 }
2931 }
2932
2933 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2934 // This is done in different ways depending on how the VGPRs were allocated
2935 // (i.e. whether we're in dynamic VGPR mode or not).
2936 // Skip deallocation if the kernel is waveslot limited rather than VGPR
2937 // limited. A short waveslot-limited kernel runs slower with the deallocation.
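 // In dynamic VGPR mode the release is done with S_ALLOC_VGPR 0; otherwise a
 // DEALLOC_VGPRS message is sent, preceded by an S_NOP on subtargets that
 // require one.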
2938 if (MFI->isDynamicVGPREnabled()) {
2939 for (MachineInstr *MI : ReleaseVGPRInsts) {
2940 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
2941 MCID: TII->get(Opcode: AMDGPU::S_ALLOC_VGPR))
2942 .addImm(Val: 0);
2943 Modified = true;
2944 }
2945 } else {
2946 if (!ReleaseVGPRInsts.empty() &&
2947 (MF.getFrameInfo().hasCalls() ||
2948 ST->getOccupancyWithNumVGPRs(
2949 VGPRs: TRI->getNumUsedPhysRegs(MRI: *MRI, RC: AMDGPU::VGPR_32RegClass),
2950 /*IsDynamicVGPR=*/DynamicVGPRBlockSize: false) <
2951 AMDGPU::IsaInfo::getMaxWavesPerEU(STI: ST))) {
2952 for (MachineInstr *MI : ReleaseVGPRInsts) {
2953 if (ST->requiresNopBeforeDeallocVGPRs()) {
2954 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
2955 MCID: TII->get(Opcode: AMDGPU::S_NOP))
2956 .addImm(Val: 0);
2957 }
2958 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
2959 MCID: TII->get(Opcode: AMDGPU::S_SENDMSG))
2960 .addImm(Val: AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2961 Modified = true;
2962 }
2963 }
2964 }
2965 ReleaseVGPRInsts.clear();
2966 PreheadersToFlush.clear();
2967 SLoadAddresses.clear();
2968
2969 return Modified;
2970}
2971