1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
28#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29#include "SIMachineFunctionInfo.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/PostOrderIterator.h"
33#include "llvm/ADT/Sequence.h"
34#include "llvm/Analysis/AliasAnalysis.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineLoopInfo.h"
37#include "llvm/CodeGen/MachinePassManager.h"
38#include "llvm/CodeGen/MachinePostDominators.h"
39#include "llvm/IR/Dominators.h"
40#include "llvm/InitializePasses.h"
41#include "llvm/Support/DebugCounter.h"
42#include "llvm/TargetParser/TargetParser.h"
43
44using namespace llvm;
45using namespace llvm::AMDGPU;
46
47#define DEBUG_TYPE "si-insert-waitcnts"
48
// Debug counters that force emission of zeroed waits for specific counter
// groups; consulted only in debug builds (see setForceEmitWaitcnt()).
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

// Command-line overrides for testing/debugging waitcnt insertion.
static cl::opt<bool>
    ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
                      cl::desc("Force all waitcnt instrs to be emitted as "
                               "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
                      cl::init(false), cl::Hidden);

static cl::opt<bool> ForceEmitZeroLoadFlag(
    "amdgpu-waitcnt-load-forcezero",
    cl::desc("Force all waitcnt load counters to wait until 0"),
    cl::init(false), cl::Hidden);

static cl::opt<bool> ExpertSchedulingModeFlag(
    "amdgpu-expert-scheduling-mode",
    cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
    cl::init(false), cl::Hidden);
71
72namespace {
73// Get the maximum wait count value for a given counter type.
74static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
75 InstCounterType T) {
76 switch (T) {
77 case LOAD_CNT:
78 return Limits.LoadcntMax;
79 case DS_CNT:
80 return Limits.DscntMax;
81 case EXP_CNT:
82 return Limits.ExpcntMax;
83 case STORE_CNT:
84 return Limits.StorecntMax;
85 case SAMPLE_CNT:
86 return Limits.SamplecntMax;
87 case BVH_CNT:
88 return Limits.BvhcntMax;
89 case KM_CNT:
90 return Limits.KmcntMax;
91 case X_CNT:
92 return Limits.XcntMax;
93 case VA_VDST:
94 return Limits.VaVdstMax;
95 case VM_VSRC:
96 return Limits.VmVsrcMax;
97 default:
98 return 0;
99 }
100}
101
/// Integer IDs used to track vector memory locations we may have to wait on.
/// Encoded as u16 chunks:
///
/// [0,            REGUNITS_END ): MCRegUnit
/// [LDSDMA_BEGIN, LDSDMA_END  ) : LDS DMA IDs
///
/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
/// It gives 2^16 (65536) entries per category which is more than enough
/// for all register units. MCPhysReg is u16 so we don't even support >u16
/// physical register numbers at this time, let alone >u16 register units.
/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
/// is enough for all register units.
using VMEMID = uint32_t;

enum : VMEMID {
  // Size of each u16 chunk of the VMEMID space.
  TRACKINGID_RANGE_LEN = (1 << 16),

  // Important: MCRegUnits must always be tracked starting from 0, as we
  // need to be able to convert between a MCRegUnit and a VMEMID freely.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
  // entry, which is updated for all LDS DMA operations encountered.
  // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};

/// Convert a MCRegUnit to a VMEMID. This is an identity mapping because
/// register units occupy the [0, REGUNITS_END) chunk (see above).
static constexpr VMEMID toVMEMID(MCRegUnit RU) {
  return static_cast<unsigned>(RU);
}
136
// X-macro listing every wait event together with a short description. It is
// expanded below to generate both the WaitEventType enum and the parallel
// name table used for printing.
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
  DECL(VMEM_ACCESS)              /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */         \
  DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
  DECL(GLOBAL_INV_ACCESS)        /* GLOBAL_INV (gfx12+ only) */                \
  DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
  DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
  DECL(VMEM_GROUP)               /* vmem group */                              \
  DECL(LDS_ACCESS)               /* lds read & write */                        \
  DECL(GDS_ACCESS)               /* gds read & write */                        \
  DECL(SQ_MESSAGE)               /* send message */                            \
  DECL(SCC_WRITE)                /* write to SCC from barrier */               \
  DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
  DECL(SMEM_GROUP)               /* scalar-memory group */                     \
  DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
  DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
  DECL(EXP_POS_ACCESS)           /* write to export position */                \
  DECL(EXP_PARAM_ACCESS)         /* write to export parameter */               \
  DECL(VMW_GPR_LOCK)             /* vmem write holding on its data src */      \
  DECL(EXP_LDS_ACCESS)           /* read by ldsdir counting as export */       \
  DECL(VGPR_CSMACC_WRITE)        /* write VGPR dest in Core/Side-MACC VALU */  \
  DECL(VGPR_DPMACC_WRITE)        /* write VGPR dest in DPMACC VALU */          \
  DECL(VGPR_TRANS_WRITE)         /* write VGPR dest in TRANS VALU */           \
  DECL(VGPR_XDL_WRITE)           /* write VGPR dest in XDL VALU */             \
  DECL(VGPR_LDS_READ)            /* read VGPR source in LDS */                 \
  DECL(VGPR_FLAT_READ)           /* read VGPR source in FLAT */                \
  DECL(VGPR_VMEM_READ)           /* read VGPR source in other VMEM */

// clang-format off
#define AMDGPU_EVENT_ENUM(Name) Name,
// Enum of all wait events; NUM_WAIT_EVENTS is the total count and doubles as
// the "end" sentinel for wait_events() iteration.
enum WaitEventType {
  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
  NUM_WAIT_EVENTS
};
#undef AMDGPU_EVENT_ENUM
172} // namespace
173
namespace llvm {
// Opt WaitEventType into llvm::enum_seq() iteration (used by wait_events()).
template <> struct enum_iteration_traits<WaitEventType> {
  static constexpr bool is_iterable = true;
};
} // namespace llvm
179
180namespace {
181
/// Return an iterator over all events between VMEM_ACCESS (the first event)
/// and \c MaxEvent (exclusive; the default value yields an enumeration over
/// all events).
auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
  return enum_seq(VMEM_ACCESS, MaxEvent);
}
188
#define AMDGPU_EVENT_NAME(Name) #Name,
// Printable event names, indexed by WaitEventType; generated from the same
// X-macro as the enum so the two stay in sync.
static constexpr StringLiteral WaitEventTypeName[] = {
    AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
};
#undef AMDGPU_EVENT_NAME
/// \returns the printable name of wait event \p Event.
static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
  return WaitEventTypeName[Event];
}
// clang-format on
198
// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  // Number of VMEM types; also sizes VmemReadMapping below.
  NUM_VMEM_TYPES
};
213
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true, and does not cover VA_VDST or VM_VSRC.
// Index order follows InstCounterType: LOAD, DS, EXP, STORE, SAMPLE, BVH,
// KM, X.
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT,    AMDGPU::S_WAIT_XCNT};
221
222static bool updateVMCntOnly(const MachineInstr &Inst) {
223 return (SIInstrInfo::isVMEM(MI: Inst) && !SIInstrInfo::isFLAT(MI: Inst)) ||
224 SIInstrInfo::isFLATGlobal(MI: Inst) || SIInstrInfo::isFLATScratch(MI: Inst);
225}
226
#ifndef NDEBUG
// "Normal" mode means only the legacy (pre-gfx12) counters are tracked.
// Assertion helper only.
static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
}
#endif // NDEBUG
232
233VmemType getVmemType(const MachineInstr &Inst) {
234 assert(updateVMCntOnly(Inst));
235 if (!SIInstrInfo::isImage(MI: Inst))
236 return VMEM_NOSAMPLER;
237 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Inst.getOpcode());
238 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
239 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
240
241 if (BaseInfo->BVH)
242 return VMEM_BVH;
243
244 // We have to make an additional check for isVSAMPLE here since some
245 // instructions don't have a sampler, but are still classified as sampler
246 // instructions for the purposes of e.g. waitcnt.
247 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(MI: Inst))
248 return VMEM_SAMPLER;
249
250 return VMEM_NOSAMPLER;
251}
252
253void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
254 Wait.set(T, Val: std::min(a: Wait.get(T), b: Count));
255}
256
257void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { Wait.set(T, Val: ~0u); }
258
259/// A small set of events.
260class WaitEventSet {
261 unsigned Mask = 0;
262
263public:
264 WaitEventSet() = default;
265 explicit constexpr WaitEventSet(WaitEventType Event) {
266 static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
267 "Not enough bits in Mask for all the events");
268 Mask |= 1 << Event;
269 }
270 constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
271 for (auto &E : Events) {
272 Mask |= 1 << E;
273 }
274 }
275 void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
276 void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
277 void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
278 bool contains(const WaitEventType &Event) const {
279 return Mask & (1 << Event);
280 }
281 /// \Returns true if this set contains all elements of \p Other.
282 bool contains(const WaitEventSet &Other) const {
283 return (~Mask & Other.Mask) == 0;
284 }
285 /// \Returns the intersection of this and \p Other.
286 WaitEventSet operator&(const WaitEventSet &Other) const {
287 auto Copy = *this;
288 Copy.Mask &= Other.Mask;
289 return Copy;
290 }
291 /// \Returns the union of this and \p Other.
292 WaitEventSet operator|(const WaitEventSet &Other) const {
293 auto Copy = *this;
294 Copy.Mask |= Other.Mask;
295 return Copy;
296 }
297 /// This set becomes the union of this and \p Other.
298 WaitEventSet &operator|=(const WaitEventSet &Other) {
299 Mask |= Other.Mask;
300 return *this;
301 }
302 /// This set becomes the intersection of this and \p Other.
303 WaitEventSet &operator&=(const WaitEventSet &Other) {
304 Mask &= Other.Mask;
305 return *this;
306 }
307 bool operator==(const WaitEventSet &Other) const {
308 return Mask == Other.Mask;
309 }
310 bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
311 bool empty() const { return Mask == 0; }
312 /// \Returns true if the set contains more than one element.
313 bool twoOrMore() const { return Mask & (Mask - 1); }
314 operator bool() const { return !empty(); }
315 void print(raw_ostream &OS) const {
316 ListSeparator LS(", ");
317 for (WaitEventType Event : wait_events()) {
318 OS << LS << getWaitEventTypeName(Event);
319 }
320 }
321 LLVM_DUMP_METHOD void dump() const;
322};
323
// Debug helper: print the set to the debug stream, newline-terminated.
void WaitEventSet::dump() const {
  print(dbgs());
  dbgs() << "\n";
}
328
class WaitcntBrackets;

// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that determines where they are needed. This was
// done because the set of counters and instructions for waiting on them
// underwent a major shift with gfx12, sufficiently so that having this
// abstraction allows the main analysis logic to be simpler than it would
// otherwise have had to become.
class WaitcntGenerator {
protected:
  const GCNSubtarget &ST;
  const SIInstrInfo &TII;
  AMDGPU::IsaVersion IV;
  // Highest counter kind in use; supplied by whoever constructs the
  // generator.
  InstCounterType MaxCounter;
  // True when the function is compiled without optimization (function
  // attribute or CodeGenOptLevel::None).
  bool OptNone;
  // Set from the "amdgpu-expand-waitcnt-profiling" function attribute.
  bool ExpandWaitcntProfiling = false;
  const AMDGPU::HardwareLimits *Limits = nullptr;

public:
  WaitcntGenerator() = delete;
  WaitcntGenerator(const WaitcntGenerator &) = delete;
  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
                   const AMDGPU::HardwareLimits *Limits)
      : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
        IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
        OptNone(MF.getFunction().hasOptNone() ||
                MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
        ExpandWaitcntProfiling(
            MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
        Limits(Limits) {}

  // Return true if the current function should be compiled with no
  // optimization.
  bool isOptNone() const { return OptNone; }

  const AMDGPU::HardwareLimits &getLimits() const { return *Limits; }

  // Edits an existing sequence of wait count instructions according
  // to an incoming Waitcnt value, which is itself updated to reflect
  // any new wait count instructions which may need to be generated by
  // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
  // were made.
  //
  // This editing will usually be merely updated operands, but it may also
  // delete instructions if the incoming Wait value indicates they are not
  // needed. It may also remove existing instructions for which a wait
  // is needed if it can be determined that it is better to generate new
  // instructions later, as can happen on gfx12.
  virtual bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const = 0;

  // Transform a soft waitcnt into a normal one.
  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;

  // Generates new wait count instructions according to the value of
  // Wait, returning true if any new instructions were created.
  // ScoreBrackets is used for profiling expansion.
  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                MachineBasicBlock::instr_iterator It,
                                AMDGPU::Waitcnt Wait,
                                const WaitcntBrackets &ScoreBrackets) = 0;

  // Returns the WaitEventSet that corresponds to counter \p T.
  virtual const WaitEventSet &getWaitEvents(InstCounterType T) const = 0;

  /// \returns the counter that corresponds to event \p E.
  /// Linear scan over the counters; returns the first counter whose event set
  /// contains \p E.
  InstCounterType getCounterFromEvent(WaitEventType E) const {
    for (auto T : inst_counter_types()) {
      if (getWaitEvents(T).contains(E))
        return T;
    }
    llvm_unreachable("event type has no associated counter");
  }

  // Returns a new waitcnt with all counters except VScnt set to 0. If
  // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;

  virtual ~WaitcntGenerator() = default;
};
411
class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
  // Events feeding each counter, indexed by InstCounterType. Only the first
  // four entries (the legacy counters) are populated; the trailing entries
  // are intentionally empty since the extended counters don't exist before
  // gfx12.
  static constexpr const WaitEventSet
      WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
          WaitEventSet(
              {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
          WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
          WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
                        EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
          WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
          WaitEventSet(),
          WaitEventSet(),
          WaitEventSet(),
          WaitEventSet(),
          WaitEventSet(),
          WaitEventSet()};

public:
  // Inherit the base-class constructors; no extra state is needed.
  using WaitcntGenerator::WaitcntGenerator;
  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait,
                        const WaitcntBrackets &ScoreBrackets) override;

  const WaitEventSet &getWaitEvents(InstCounterType T) const override {
    return WaitEventMaskForInstPreGFX12[T];
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
446
class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
protected:
  // True when expert scheduling mode is enabled for this function.
  bool IsExpertMode;
  // Events feeding each counter, indexed by InstCounterType. Unlike
  // pre-gfx12, every counter has its own event set here.
  static constexpr const WaitEventSet
      WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
          WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
          WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
          WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
                        EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
          WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
          WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
          WaitEventSet({VMEM_BVH_READ_ACCESS}),
          WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
          WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
          WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
                        VGPR_XDL_WRITE}),
          WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};

public:
  WaitcntGeneratorGFX12Plus() = delete;
  WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
                            InstCounterType MaxCounter,
                            const AMDGPU::HardwareLimits *Limits,
                            bool IsExpertMode)
      : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait,
                        const WaitcntBrackets &ScoreBrackets) override;

  const WaitEventSet &getWaitEvents(InstCounterType T) const override {
    return WaitEventMaskForInstGFX12Plus[T];
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
489
// Flags indicating which counters should be flushed in a loop preheader.
struct PreheaderFlushFlags {
  bool FlushVmCnt = false; // Flush the vm counter in the preheader.
  bool FlushDsCnt = false; // Flush the ds counter in the preheader.
};
495
496class SIInsertWaitcnts {
497public:
498 const GCNSubtarget *ST;
499 const SIInstrInfo *TII = nullptr;
500 const SIRegisterInfo *TRI = nullptr;
501 const MachineRegisterInfo *MRI = nullptr;
502 InstCounterType SmemAccessCounter;
503 InstCounterType MaxCounter;
504 bool IsExpertMode = false;
505
506private:
507 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
508 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
509 MachineLoopInfo *MLI;
510 MachinePostDominatorTree *PDT;
511 AliasAnalysis *AA = nullptr;
512
513 struct BlockInfo {
514 std::unique_ptr<WaitcntBrackets> Incoming;
515 bool Dirty = true;
516 };
517
518 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
519
520 bool ForceEmitWaitcnt[NUM_INST_CNTS];
521
522 std::unique_ptr<WaitcntGenerator> WCG;
523
524 // Remember call and return instructions in the function.
525 DenseSet<MachineInstr *> CallInsts;
526 DenseSet<MachineInstr *> ReturnInsts;
527
528 // Remember all S_ENDPGM instructions. The boolean flag is true if there might
529 // be outstanding stores but definitely no outstanding scratch stores, to help
530 // with insertion of DEALLOC_VGPRS messages.
531 DenseMap<MachineInstr *, bool> EndPgmInsts;
532
533 AMDGPU::HardwareLimits Limits;
534
535public:
536 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
537 AliasAnalysis *AA)
538 : MLI(MLI), PDT(PDT), AA(AA) {
539 (void)ForceExpCounter;
540 (void)ForceLgkmCounter;
541 (void)ForceVMCounter;
542 }
543
544 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
545
546 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
547 const WaitcntBrackets &Brackets);
548 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
549 const WaitcntBrackets &ScoreBrackets);
550 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
551 bool isDSRead(const MachineInstr &MI) const;
552 bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
553 bool run(MachineFunction &MF);
554
555 void setForceEmitWaitcnt() {
556// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
557// For debug builds, get the debug counter info and adjust if need be
558#ifndef NDEBUG
559 if (DebugCounter::isCounterSet(ForceExpCounter) &&
560 DebugCounter::shouldExecute(ForceExpCounter)) {
561 ForceEmitWaitcnt[EXP_CNT] = true;
562 } else {
563 ForceEmitWaitcnt[EXP_CNT] = false;
564 }
565
566 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
567 DebugCounter::shouldExecute(ForceLgkmCounter)) {
568 ForceEmitWaitcnt[DS_CNT] = true;
569 ForceEmitWaitcnt[KM_CNT] = true;
570 } else {
571 ForceEmitWaitcnt[DS_CNT] = false;
572 ForceEmitWaitcnt[KM_CNT] = false;
573 }
574
575 if (DebugCounter::isCounterSet(ForceVMCounter) &&
576 DebugCounter::shouldExecute(ForceVMCounter)) {
577 ForceEmitWaitcnt[LOAD_CNT] = true;
578 ForceEmitWaitcnt[SAMPLE_CNT] = true;
579 ForceEmitWaitcnt[BVH_CNT] = true;
580 } else {
581 ForceEmitWaitcnt[LOAD_CNT] = false;
582 ForceEmitWaitcnt[SAMPLE_CNT] = false;
583 ForceEmitWaitcnt[BVH_CNT] = false;
584 }
585
586 ForceEmitWaitcnt[VA_VDST] = false;
587 ForceEmitWaitcnt[VM_VSRC] = false;
588#endif // NDEBUG
589 }
590
591 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
592 // instruction.
593 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
594 switch (Inst.getOpcode()) {
595 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
596 case AMDGPU::GLOBAL_INV:
597 return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
598 // VGPRs
599 case AMDGPU::GLOBAL_WB:
600 case AMDGPU::GLOBAL_WBINV:
601 return VMEM_WRITE_ACCESS; // tracked using storecnt
602 default:
603 break;
604 }
605
606 // Maps VMEM access types to their corresponding WaitEventType.
607 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
608 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
609
610 assert(SIInstrInfo::isVMEM(Inst));
611 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
612 // these should use VM_CNT.
613 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(MI: Inst))
614 return VMEM_ACCESS;
615 if (Inst.mayStore() &&
616 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(MI: Inst))) {
617 if (TII->mayAccessScratch(MI: Inst))
618 return SCRATCH_WRITE_ACCESS;
619 return VMEM_WRITE_ACCESS;
620 }
621 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(MI: Inst))
622 return VMEM_ACCESS;
623 return VmemReadMapping[getVmemType(Inst)];
624 }
625
626 std::optional<WaitEventType>
627 getExpertSchedulingEventType(const MachineInstr &Inst) const;
628
629 bool isAsync(const MachineInstr &MI) const {
630 if (!SIInstrInfo::isLDSDMA(MI))
631 return false;
632 if (SIInstrInfo::usesASYNC_CNT(MI))
633 return true;
634 const MachineOperand *Async =
635 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::IsAsync);
636 return Async && (Async->getImm());
637 }
638
639 bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
640 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
641 }
642
643 bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
644 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
645 }
646
647 bool isVmemAccess(const MachineInstr &MI) const;
648 bool generateWaitcntInstBefore(MachineInstr &MI,
649 WaitcntBrackets &ScoreBrackets,
650 MachineInstr *OldWaitcntInstr,
651 PreheaderFlushFlags FlushFlags);
652 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
653 MachineBasicBlock::instr_iterator It,
654 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
655 MachineInstr *OldWaitcntInstr);
656 /// \returns all events that correspond to \p Inst.
657 WaitEventSet getEventsFor(const MachineInstr &Inst) const;
658 void updateEventWaitcntAfter(MachineInstr &Inst,
659 WaitcntBrackets *ScoreBrackets);
660 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
661 MachineBasicBlock *Block) const;
662 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
663 WaitcntBrackets &ScoreBrackets);
664 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
665 WaitcntBrackets &ScoreBrackets);
666 /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
667 /// Legalizer. Returns true if block was modified.
668 bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
669 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
670 bool ExpertMode) const;
671 const WaitEventSet &getWaitEvents(InstCounterType T) const {
672 return WCG->getWaitEvents(T);
673 }
674 InstCounterType getCounterFromEvent(WaitEventType E) const {
675 return WCG->getCounterFromEvent(E);
676 }
677};
678
// This object maintains the current score brackets of each wait counter, and
680// a per-register scoreboard for each wait counter.
681//
682// We also maintain the latest score for every event type that can change the
683// waitcnt in order to know if there are multiple types of events within
684// the brackets. When multiple types of event happen in the bracket,
685// wait count may get decreased out of order, therefore we need to put in
686// "s_waitcnt 0" before use.
687class WaitcntBrackets {
688public:
689 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
690 assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
691 }
692
693#ifndef NDEBUG
694 ~WaitcntBrackets() {
695 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
696 for (auto &[ID, Val] : VMem) {
697 if (Val.empty())
698 ++NumUnusedVmem;
699 }
700 for (auto &[ID, Val] : SGPRs) {
701 if (Val.empty())
702 ++NumUnusedSGPRs;
703 }
704
705 if (NumUnusedVmem || NumUnusedSGPRs) {
706 errs() << "WaitcntBracket had unused entries at destruction time: "
707 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
708 << " SGPR unused entries\n";
709 std::abort();
710 }
711 }
712#endif
713
714 bool isSmemCounter(InstCounterType T) const {
715 return T == Context->SmemAccessCounter || T == X_CNT;
716 }
717
718 unsigned getSgprScoresIdx(InstCounterType T) const {
719 assert(isSmemCounter(T) && "Invalid SMEM counter");
720 return T == X_CNT ? 1 : 0;
721 }
722
723 unsigned getOutstanding(InstCounterType T) const {
724 return ScoreUBs[T] - ScoreLBs[T];
725 }
726
727 bool hasPendingVMEM(VMEMID ID, InstCounterType T) const {
728 return getVMemScore(TID: ID, T) > getScoreLB(T);
729 }
730
731 /// \Return true if we have no score entries for counter \p T.
732 bool empty(InstCounterType T) const { return getScoreRange(T) == 0; }
733
734private:
735 unsigned getScoreLB(InstCounterType T) const {
736 assert(T < NUM_INST_CNTS);
737 return ScoreLBs[T];
738 }
739
740 unsigned getScoreUB(InstCounterType T) const {
741 assert(T < NUM_INST_CNTS);
742 return ScoreUBs[T];
743 }
744
745 unsigned getScoreRange(InstCounterType T) const {
746 return getScoreUB(T) - getScoreLB(T);
747 }
748
749 unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
750 auto It = SGPRs.find(Val: RU);
751 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
752 }
753
754 unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
755 auto It = VMem.find(Val: TID);
756 return It != VMem.end() ? It->second.Scores[T] : 0;
757 }
758
759public:
760 bool merge(const WaitcntBrackets &Other);
761
762 bool counterOutOfOrder(InstCounterType T) const;
763 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
764 simplifyWaitcnt(CheckWait: Wait, UpdateWait&: Wait);
765 }
766 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
767 AMDGPU::Waitcnt &UpdateWait) const;
768 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
769 void simplifyWaitcnt(Waitcnt &Wait, InstCounterType T) const;
770 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
771 AMDGPU::Waitcnt &UpdateWait) const;
772 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
773 AMDGPU::Waitcnt &UpdateWait) const;
774
775 void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
776 AMDGPU::Waitcnt &Wait) const;
777 void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
778 AMDGPU::Waitcnt &Wait) const;
779 AMDGPU::Waitcnt determineAsyncWait(unsigned N);
780 void tryClearSCCWriteEvent(MachineInstr *Inst);
781
782 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
783 void applyWaitcnt(InstCounterType T, unsigned Count);
784 void applyWaitcnt(const AMDGPU::Waitcnt &Wait, InstCounterType T);
785 void updateByEvent(WaitEventType E, MachineInstr &MI);
786 void recordAsyncMark(MachineInstr &MI);
787
788 bool hasPendingEvent() const { return !PendingEvents.empty(); }
789 bool hasPendingEvent(WaitEventType E) const {
790 return PendingEvents.contains(Event: E);
791 }
792 bool hasPendingEvent(InstCounterType T) const {
793 bool HasPending = PendingEvents & Context->getWaitEvents(T);
794 assert(HasPending == !empty(T) &&
795 "Expected pending events iff scoreboard is not empty");
796 return HasPending;
797 }
798
799 bool hasMixedPendingEvents(InstCounterType T) const {
800 WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
801 // Return true if more than one bit is set in Events.
802 return Events.twoOrMore();
803 }
804
805 bool hasPendingFlat() const {
806 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
807 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
808 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
809 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
810 }
811
812 void setPendingFlat() {
813 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
814 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
815 }
816
817 bool hasPendingGDS() const {
818 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
819 }
820
821 unsigned getPendingGDSWait() const {
822 return std::min(a: getScoreUB(T: DS_CNT) - LastGDS,
823 b: getWaitCountMax(Limits: Context->getLimits(), T: DS_CNT) - 1);
824 }
825
826 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
827
828 // Return true if there might be pending writes to the vgpr-interval by VMEM
829 // instructions with types different from V.
830 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
831 for (MCRegUnit RU : regunits(Reg)) {
832 auto It = VMem.find(Val: toVMEMID(RU));
833 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
834 return true;
835 }
836 return false;
837 }
838
839 void clearVgprVmemTypes(MCPhysReg Reg) {
840 for (MCRegUnit RU : regunits(Reg)) {
841 if (auto It = VMem.find(Val: toVMEMID(RU)); It != VMem.end()) {
842 It->second.VMEMTypes = 0;
843 if (It->second.empty())
844 VMem.erase(I: It);
845 }
846 }
847 }
848
849 void setStateOnFunctionEntryOrReturn() {
850 setScoreUB(T: STORE_CNT, Val: getScoreUB(T: STORE_CNT) +
851 getWaitCountMax(Limits: Context->getLimits(), T: STORE_CNT));
852 PendingEvents |= Context->getWaitEvents(T: STORE_CNT);
853 }
854
  // Representative LDS DMA store instructions recorded so far (see the
  // LDSDMAStores member below for the retention policy).
  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
    return LDSDMAStores;
  }

  // True if the subtarget supports Point Sample Acceleration and \p MI is an
  // image instruction it might apply to (defined out-of-line below).
  bool hasPointSampleAccel(const MachineInstr &MI) const;
  // True if Point Sample Acceleration may apply to \p MI and the register has
  // outstanding writes of vmem-types other than VMEM_NOSAMPLER.
  bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
                                      MCPhysReg RU) const;

  void print(raw_ostream &) const;
  void dump() const { print(dbgs()); }

  // Free up memory by removing empty entries from the DenseMap that track event
  // scores.
  void purgeEmptyTrackingData();
869
private:
  // Describes how scores from another bracket are translated into this one
  // when merging control-flow predecessors (see mergeScore).
  struct MergeInfo {
    unsigned OldLB;
    unsigned OtherLB;
    unsigned MyShift;
    unsigned OtherShift;
  };

  // One score per hardware instruction counter.
  using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>;

  // Add to \p Wait whatever is needed for counter \p T to drain down to
  // \p Score (defined out-of-line below).
  void determineWaitForScore(InstCounterType T, unsigned Score,
                             AMDGPU::Waitcnt &Wait) const;

  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);
  bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
                       ArrayRef<CounterValueArray> OtherMarks);

  // Enumerate the register units of \p Reg. Returns an empty range for
  // non-allocatable registers. On subtargets where a 16-bit write clobbers the
  // whole 32-bit VGPR, widen 16-bit registers to their 32-bit container first
  // so scores are tracked at the granularity the hardware actually touches.
  iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
    assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
    if (!Context->TRI->isInAllocatableClass(RegNo: Reg))
      return {{}, {}};
    const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
    unsigned Size = Context->TRI->getRegSizeInBits(RC: *RC);
    if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
      Reg = Context->TRI->get32BitRegister(Reg);
    return Context->TRI->regunits(Reg);
  }
898
  // Set the lower bound (oldest still-unresolved score) for counter \p T.
  void setScoreLB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreLBs[T] = Val;
  }

  // Set the upper bound (most recently issued score) for counter \p T.
  // EXP_CNT has a small hardware range, so its lower bound is advanced
  // whenever the bracket would exceed the maximum encodable wait count.
  void setScoreUB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreUBs[T] = Val;

    if (T != EXP_CNT)
      return;

    if (getScoreRange(T: EXP_CNT) > getWaitCountMax(Limits: Context->getLimits(), T: EXP_CNT))
      ScoreLBs[EXP_CNT] =
          ScoreUBs[EXP_CNT] - getWaitCountMax(Limits: Context->getLimits(), T: EXP_CNT);
  }
915
  // Record score \p Val for every register unit of \p Reg under counter \p T.
  // SCC is tracked in a dedicated slot; vector registers go into the VMem map
  // and SGPRs into the SGPRs map (which keeps only the SMEM-relevant rows).
  void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
    const SIRegisterInfo *TRI = Context->TRI;
    if (Reg == AMDGPU::SCC) {
      SCCScore = Val;
    } else if (TRI->isVectorRegister(MRI: *Context->MRI, Reg)) {
      for (MCRegUnit RU : regunits(Reg))
        VMem[toVMEMID(RU)].Scores[T] = Val;
    } else if (TRI->isSGPRReg(MRI: *Context->MRI, Reg)) {
      auto STy = getSgprScoresIdx(T);
      for (MCRegUnit RU : regunits(Reg))
        SGPRs[RU].Scores[STy] = Val;
    } else {
      llvm_unreachable("Register cannot be tracked/unknown register!");
    }
  }

  // Record score \p Val for an arbitrary VMem tracking ID (either a vector
  // register unit or an LDS DMA slot).
  void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
    VMem[TID].Scores[T] = Val;
  }
935
  // Score the register of \p Op under counter \p CntTy (defined out-of-line).
  void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
                         unsigned Val);

  // Back-pointer to the pass-wide, read-only context (TRI/TII/MRI, limits).
  const SIInsertWaitcnts *Context;

  // Per-counter score brackets: (LB, UB] delimits in-flight operations.
  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  WaitEventSet PendingEvents;
  // Remember the last flat memory operation.
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  // Remember the last GDS operation.
  unsigned LastGDS = 0;

  // The score tracking logic is fragmented as follows:
  // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
  // - SGPRs: SGPR RegUnits
  // - SCC: Non-allocatable and not general purpose: not a SGPR.
  //
  // For the VMem case, if the key is within the range of LDS DMA IDs,
  // then the corresponding index into the `LDSDMAStores` vector below is:
  //   Key - LDSDMA_BEGIN - 1
  // This is because LDSDMA_BEGIN is a generic entry and does not have an
  // associated MachineInstr.
  //
  // TODO: Could we track SCC alongside SGPRs so it's not longer a special case?

  struct VMEMInfo {
    // Scores for all instruction counters. Zero-initialized.
    CounterValueArray Scores{};
    // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
    unsigned VMEMTypes = 0;

    // An entry is empty once every score is zero and no types are recorded.
    bool empty() const { return all_of(Range: Scores, P: equal_to(Arg: 0)) && !VMEMTypes; }
  };

  struct SGPRInfo {
    // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
    // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
    // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps
    // the X_CNT score.
    std::array<unsigned, 2> Scores = {0};

    bool empty() const { return !Scores[0] && !Scores[1]; }
  };

  DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
  DenseMap<MCRegUnit, SGPRInfo> SGPRs;

  // Reg score for SCC.
  unsigned SCCScore = 0;
  // The unique instruction that has an SCC write pending, if there is one.
  const MachineInstr *PendingSCCWrite = nullptr;

  // Store representative LDS DMA operations. The only useful info here is
  // alias info. One store is kept per unique AAInfo.
  SmallVector<const MachineInstr *> LDSDMAStores;

  // State of all counters at each async mark encountered so far.
  SmallVector<CounterValueArray> AsyncMarks;

  // AsyncMarks normally stays small because waits pop marks off. But in the
  // rare pathological case, a nest of loops that pushes marks
  // without waiting on any mark can cause AsyncMarks to grow very large. We cap
  // it to a reasonable limit. We can tune this later or potentially introduce a
  // user option to control the value.
  static constexpr unsigned MaxAsyncMarks = 16;

  // Track the upper bound score for async operations that are not part of a
  // mark yet. Initialized to all zeros.
  CounterValueArray AsyncScore{};
1005};
1006
// Legacy pass-manager wrapper around the waitcnt insertion pass.
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
  static char ID;
  SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Only inserts instructions; never changes the block structure.
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    // Alias analysis is optional; it is used to disambiguate LDS DMA stores.
    AU.addUsedIfAvailable<AAResultsWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
1027
1028} // end anonymous namespace
1029
1030void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
1031 InstCounterType CntTy, unsigned Score) {
1032 setRegScore(Reg: Op.getReg().asMCReg(), T: CntTy, Val: Score);
1033}
1034
1035// Return true if the subtarget is one that enables Point Sample Acceleration
1036// and the MachineInstr passed in is one to which it might be applied (the
1037// hardware makes this decision based on several factors, but we can't determine
1038// this at compile time, so we have to assume it might be applied if the
1039// instruction supports it).
1040bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
1041 if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
1042 return false;
1043
1044 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode());
1045 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1046 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
1047 return BaseInfo->PointSampleAccel;
1048}
1049
1050// Return true if the subtarget enables Point Sample Acceleration, the supplied
1051// MachineInstr is one to which it might be applied and the supplied interval is
1052// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
1053// (this is the type that a point sample accelerated instruction effectively
1054// becomes)
1055bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
1056 MCPhysReg Reg) const {
1057 if (!hasPointSampleAccel(MI))
1058 return false;
1059
1060 return hasOtherPendingVmemTypes(Reg, V: VMEM_NOSAMPLER);
1061}
1062
// Advance counter state for event \p E produced by \p Inst: bump the counter's
// upper bound and attach the new score to every register (or LDS DMA slot)
// that the event leaves in flight.
void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = Context->getCounterFromEvent(E);
  assert(T < Context->MaxCounter);

  unsigned UB = getScoreUB(T);
  unsigned CurrScore = UB + 1;
  if (CurrScore == 0)
    report_fatal_error(reason: "InsertWaitcnt score wraparound");
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not.
  // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
  PendingEvents.insert(Event: E);
  setScoreUB(T, Val: CurrScore);

  const SIRegisterInfo *TRI = Context->TRI;
  const MachineRegisterInfo *MRI = Context->MRI;
  const SIInstrInfo *TII = Context->TII;

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(MI: Inst) && Inst.mayLoadOrStore()) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (const auto *AddrOp = TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::addr))
        setScoreByOperand(Op: *AddrOp, CntTy: EXP_CNT, Score: CurrScore);

      if (Inst.mayStore()) {
        if (const auto *Data0 =
                TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data0))
          setScoreByOperand(Op: *Data0, CntTy: EXP_CNT, Score: CurrScore);
        if (const auto *Data1 =
                TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data1))
          setScoreByOperand(Op: *Data1, CntTy: EXP_CNT, Score: CurrScore);
      } else if (SIInstrInfo::isAtomicRet(MI: Inst) && !SIInstrInfo::isGWS(MI: Inst) &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (const MachineOperand &Op : Inst.all_uses()) {
          if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
            setScoreByOperand(Op, CntTy: EXP_CNT, Score: CurrScore);
        }
      }
    } else if (TII->isFLAT(MI: Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
                          CntTy: EXP_CNT, Score: CurrScore);
      } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
        setScoreByOperand(Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
                          CntTy: EXP_CNT, Score: CurrScore);
      }
    } else if (TII->isMIMG(MI: Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(Op: Inst.getOperand(i: 0), CntTy: EXP_CNT, Score: CurrScore);
      } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
        setScoreByOperand(Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
                          CntTy: EXP_CNT, Score: CurrScore);
      }
    } else if (TII->isMTBUF(MI: Inst)) {
      if (Inst.mayStore())
        setScoreByOperand(Op: Inst.getOperand(i: 0), CntTy: EXP_CNT, Score: CurrScore);
    } else if (TII->isMUBUF(MI: Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(Op: Inst.getOperand(i: 0), CntTy: EXP_CNT, Score: CurrScore);
      } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
        setScoreByOperand(Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
                          CntTy: EXP_CNT, Score: CurrScore);
      }
    } else if (TII->isLDSDIR(MI: Inst)) {
      // LDSDIR instructions attach the score to the destination.
      setScoreByOperand(Op: *TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::vdst),
                        CntTy: EXP_CNT, Score: CurrScore);
    } else {
      if (TII->isEXP(MI: Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (MachineOperand &DefMO : Inst.all_defs()) {
          if (TRI->isVGPR(MRI: *MRI, Reg: DefMO.getReg())) {
            setScoreByOperand(Op: DefMO, CntTy: EXP_CNT, Score: CurrScore);
          }
        }
      }
      for (const MachineOperand &Op : Inst.all_uses()) {
        if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
          setScoreByOperand(Op, CntTy: EXP_CNT, Score: CurrScore);
      }
    }
  } else if (T == X_CNT) {
    WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
    if (PendingEvents.contains(Event: OtherEvent)) {
      // Hardware inserts an implicit xcnt between interleaved
      // SMEM and VMEM operations. So there will never be
      // outstanding address translations for both SMEM and
      // VMEM at the same time.
      setScoreLB(T, Val: getScoreUB(T) - 1);
      PendingEvents.remove(Event: OtherEvent);
    }
    // X_CNT protects the address operands, i.e. the uses.
    for (const MachineOperand &Op : Inst.all_uses())
      setScoreByOperand(Op, CntTy: T, Score: CurrScore);
  } else if (T == VA_VDST || T == VM_VSRC) {
    // Match the score to the VGPR destination or source registers as
    // appropriate
    for (const MachineOperand &Op : Inst.operands()) {
      if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
          (T == VM_VSRC && Op.isDef()))
        continue;
      if (TRI->isVectorRegister(MRI: *Context->MRI, Reg: Op.getReg()))
        setScoreByOperand(Op, CntTy: T, Score: CurrScore);
    }
  } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
    // Match the score to the destination registers.
    //
    // Check only explicit operands. Stores, especially spill stores, include
    // implicit uses and defs of their super registers which would create an
    // artificial dependency, while these are there only for register liveness
    // accounting purposes.
    //
    // Special cases where implicit register defs exists, such as M0 or VCC,
    // but none with memory instructions.
    for (const MachineOperand &Op : Inst.defs()) {
      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
        if (!TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) // TODO: add wrapper
          continue;
        if (updateVMCntOnly(Inst)) {
          // updateVMCntOnly should only leave us with VGPRs
          // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
          // defs. That's required for a sane index into `VgprMemTypes` below
          assert(TRI->isVectorRegister(*MRI, Op.getReg()));
          VmemType V = getVmemType(Inst);
          unsigned char TypesMask = 1 << V;
          // If instruction can have Point Sample Accel applied, we have to flag
          // this with another potential dependency
          if (hasPointSampleAccel(MI: Inst))
            TypesMask |= 1 << VMEM_NOSAMPLER;
          for (MCRegUnit RU : regunits(Reg: Op.getReg().asMCReg()))
            VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
        }
      }
      setScoreByOperand(Op, CntTy: T, Score: CurrScore);
    }
    if (Inst.mayStore() &&
        (TII->isDS(MI: Inst) || Context->isNonAsyncLdsDmaWrite(MI: Inst))) {
      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
      // written can be accessed. A load from LDS to VMEM does not need a wait.
      //
      // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
      // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
      // store. The "Slot" is the index into LDSDMAStores + 1.
      unsigned Slot = 0;
      for (const auto *MemOp : Inst.memoperands()) {
        if (!MemOp->isStore() ||
            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        // Comparing just AA info does not guarantee memoperands are equal
        // in general, but this is so for LDS DMA in practice.
        auto AAI = MemOp->getAAInfo();
        // Alias scope information gives a way to definitely identify an
        // original memory object and practically produced in the module LDS
        // lowering pass. If there is no scope available we will not be able
        // to disambiguate LDS aliasing as after the module lowering all LDS
        // is squashed into a single big object.
        if (!AAI || !AAI.Scope)
          break;
        // Reuse an existing slot whose representative store has identical
        // alias info.
        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
              Slot = I + 1;
              break;
            }
          }
        }
        if (Slot)
          break;
        // The slot may not be valid because it can be >= NUM_LDSDMA which
        // means the scoreboard cannot track it. We still want to preserve the
        // MI in order to check alias information, though.
        LDSDMAStores.push_back(Elt: &Inst);
        Slot = LDSDMAStores.size();
        break;
      }
      // LDSDMA_BEGIN is the generic "any LDS DMA" entry; the per-store slot
      // is only scored when it fits inside the tracked range.
      setVMemScore(TID: LDSDMA_BEGIN, T, Val: CurrScore);
      if (Slot && Slot < NUM_LDSDMA)
        setVMemScore(TID: LDSDMA_BEGIN + Slot, T, Val: CurrScore);
    }

    // FIXME: Not supported on GFX12 yet. Newer async operations use other
    // counters too, so will need a map from instruction or event types to
    // counter types.
    if (Context->isAsyncLdsDmaWrite(MI: Inst) && T == LOAD_CNT) {
      assert(!SIInstrInfo::usesASYNC_CNT(Inst) &&
             "unexpected GFX1250 instruction");
      AsyncScore[T] = CurrScore;
    }

    if (SIInstrInfo::isSBarrierSCCWrite(Opcode: Inst.getOpcode())) {
      setRegScore(Reg: AMDGPU::SCC, T, Val: CurrScore);
      PendingSCCWrite = &Inst;
    }
  }
}
1265
// Push the accumulated async score as a new mark and reset the accumulator.
// \p Inst is the mark instruction, used only for debug output.
void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
  // In the absence of loops, AsyncMarks can grow linearly with the program
  // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
  // limit every time we push a new mark, but that seems like unnecessary work
  // in practical cases. We do separately truncate the array when processing a
  // loop, which should be sufficient.
  AsyncMarks.push_back(Elt: AsyncScore);
  AsyncScore = {};
  LLVM_DEBUG({
    dbgs() << "recordAsyncMark:\n" << Inst;
    for (const auto &Mark : AsyncMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
  });
}
1282
// Dump the full scoreboard state: per-counter brackets with relative register
// scores, the pending event set, the async score accumulator, and all async
// marks. Counter names follow the subtarget's naming (extended vs. legacy).
void WaitcntBrackets::print(raw_ostream &OS) const {
  const GCNSubtarget *ST = Context->ST;

  for (auto T : inst_counter_types(MaxCounter: Context->MaxCounter)) {
    unsigned SR = getScoreRange(T);
    switch (T) {
    case LOAD_CNT:
      OS << "    " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
         << SR << "):";
      break;
    case DS_CNT:
      OS << "    " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
         << SR << "):";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << SR << "):";
      break;
    case STORE_CNT:
      OS << "    " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
         << SR << "):";
      break;
    case SAMPLE_CNT:
      OS << "    SAMPLE_CNT(" << SR << "):";
      break;
    case BVH_CNT:
      OS << "    BVH_CNT(" << SR << "):";
      break;
    case KM_CNT:
      OS << "    KM_CNT(" << SR << "):";
      break;
    case X_CNT:
      OS << "    X_CNT(" << SR << "):";
      break;
    case VA_VDST:
      OS << "    VA_VDST(" << SR << "): ";
      break;
    case VM_VSRC:
      OS << "    VM_VSRC(" << SR << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << SR << "):";
      break;
    }

    if (SR != 0) {
      // Print vgpr scores. Scores are shown relative to the lower bound so
      // they stay small and comparable across dumps.
      unsigned LB = getScoreLB(T);

      SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
      sort(C&: SortedVMEMIDs);

      for (auto ID : SortedVMEMIDs) {
        unsigned RegScore = VMem.at(Val: ID).Scores[T];
        if (RegScore <= LB)
          continue;
        unsigned RelScore = RegScore - LB - 1;
        if (ID < REGUNITS_END) {
          OS << ' ' << RelScore << ":vRU" << ID;
        } else {
          assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
                 "Unhandled/unexpected ID value!");
          OS << ' ' << RelScore << ":LDSDMA" << ID;
        }
      }

      // Also need to print sgpr scores for lgkm_cnt or xcnt.
      if (isSmemCounter(T)) {
        SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
        sort(C&: SortedSMEMIDs);
        for (auto ID : SortedSMEMIDs) {
          unsigned RegScore = SGPRs.at(Val: ID).Scores[getSgprScoresIdx(T)];
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
        }
      }

      if (T == KM_CNT && SCCScore > 0)
        OS << ' ' << SCCScore << ":scc";
    }
    OS << '\n';
  }

  OS << "Pending Events: ";
  if (hasPendingEvent()) {
    ListSeparator LS;
    for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
      if (hasPendingEvent(E: (WaitEventType)I)) {
        OS << LS << WaitEventTypeName[I];
      }
    }
  } else {
    OS << "none";
  }
  OS << '\n';

  OS << "Async score: ";
  if (AsyncScore.empty())
    OS << "none";
  else
    llvm::interleaveComma(c: AsyncScore, os&: OS);
  OS << '\n';

  OS << "Async marks: " << AsyncMarks.size() << '\n';

  // Each mark is a full snapshot of all counter scores.
  for (const auto &Mark : AsyncMarks) {
    for (auto T : inst_counter_types()) {
      unsigned MarkedScore = Mark[T];
      switch (T) {
      case LOAD_CNT:
        OS << "    " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM")
           << "_CNT: " << MarkedScore;
        break;
      case DS_CNT:
        OS << "    " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM")
           << "_CNT: " << MarkedScore;
        break;
      case EXP_CNT:
        OS << "    EXP_CNT: " << MarkedScore;
        break;
      case STORE_CNT:
        OS << "    " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS")
           << "_CNT: " << MarkedScore;
        break;
      case SAMPLE_CNT:
        OS << "    SAMPLE_CNT: " << MarkedScore;
        break;
      case BVH_CNT:
        OS << "    BVH_CNT: " << MarkedScore;
        break;
      case KM_CNT:
        OS << "    KM_CNT: " << MarkedScore;
        break;
      case X_CNT:
        OS << "    X_CNT: " << MarkedScore;
        break;
      default:
        OS << "    UNKNOWN: " << MarkedScore;
        break;
      }
    }
    OS << '\n';
  }
  OS << '\n';
}
1429
1430/// Simplify \p UpdateWait by removing waits that are redundant based on the
1431/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1432void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1433 AMDGPU::Waitcnt &UpdateWait) const {
1434 simplifyWaitcnt(Wait&: UpdateWait, T: LOAD_CNT);
1435 simplifyWaitcnt(Wait&: UpdateWait, T: EXP_CNT);
1436 simplifyWaitcnt(Wait&: UpdateWait, T: DS_CNT);
1437 simplifyWaitcnt(Wait&: UpdateWait, T: STORE_CNT);
1438 simplifyWaitcnt(Wait&: UpdateWait, T: SAMPLE_CNT);
1439 simplifyWaitcnt(Wait&: UpdateWait, T: BVH_CNT);
1440 simplifyWaitcnt(Wait&: UpdateWait, T: KM_CNT);
1441 simplifyXcnt(CheckWait, UpdateWait);
1442 simplifyWaitcnt(Wait&: UpdateWait, T: VA_VDST);
1443 simplifyVmVsrc(CheckWait, UpdateWait);
1444}
1445
1446void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1447 unsigned &Count) const {
1448 // The number of outstanding events for this type, T, can be calculated
1449 // as (UB - LB). If the current Count is greater than or equal to the number
1450 // of outstanding events, then the wait for this counter is redundant.
1451 if (Count >= getScoreRange(T))
1452 Count = ~0u;
1453}
1454
1455void WaitcntBrackets::simplifyWaitcnt(Waitcnt &Wait, InstCounterType T) const {
1456 unsigned Cnt = Wait.get(T);
1457 simplifyWaitcnt(T, Count&: Cnt);
1458 Wait.set(T, Val: Cnt);
1459}
1460
// Simplify the X_CNT (address translation) field of \p UpdateWait, taking
// into account waits on other counters recorded in \p CheckWait.
void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
                                   AMDGPU::Waitcnt &UpdateWait) const {
  // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
  // optimizations. On entry to a block with multiple predescessors, there may
  // be pending SMEM and VMEM events active at the same time.
  // In such cases, only clear one active event at a time.
  // TODO: Revisit xcnt optimizations for gfx1250.
  // Wait on XCNT is redundant if we are already waiting for a load to complete.
  // SMEM can return out of order, so only omit XCNT wait if we are waiting till
  // zero.
  if (CheckWait.get(T: KM_CNT) == 0 && hasPendingEvent(E: SMEM_GROUP))
    UpdateWait.set(T: X_CNT, Val: ~0u);
  // If we have pending store we cannot optimize XCnt because we do not wait for
  // stores. VMEM loads retun in order, so if we only have loads XCnt is
  // decremented to the same number as LOADCnt.
  if (CheckWait.get(T: LOAD_CNT) != ~0u && hasPendingEvent(E: VMEM_GROUP) &&
      !hasPendingEvent(T: STORE_CNT) &&
      CheckWait.get(T: X_CNT) >= CheckWait.get(T: LOAD_CNT))
    UpdateWait.set(T: X_CNT, Val: ~0u);
  // Finally apply the generic bracket-based simplification.
  simplifyWaitcnt(Wait&: UpdateWait, T: X_CNT);
}
1482
1483void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1484 AMDGPU::Waitcnt &UpdateWait) const {
1485 // Waiting for some counters implies waiting for VM_VSRC, since an
1486 // instruction that decrements a counter on completion would have
1487 // decremented VM_VSRC once its VGPR operands had been read.
1488 if (CheckWait.get(T: VM_VSRC) >=
1489 std::min(l: {CheckWait.get(T: LOAD_CNT), CheckWait.get(T: STORE_CNT),
1490 CheckWait.get(T: SAMPLE_CNT), CheckWait.get(T: BVH_CNT),
1491 CheckWait.get(T: DS_CNT)}))
1492 UpdateWait.set(T: VM_VSRC, Val: ~0u);
1493 simplifyWaitcnt(Wait&: UpdateWait, T: VM_VSRC);
1494}
1495
1496void WaitcntBrackets::purgeEmptyTrackingData() {
1497 for (auto &[K, V] : make_early_inc_range(Range&: VMem)) {
1498 if (V.empty())
1499 VMem.erase(Val: K);
1500 }
1501 for (auto &[K, V] : make_early_inc_range(Range&: SGPRs)) {
1502 if (V.empty())
1503 SGPRs.erase(Val: K);
1504 }
1505}
1506
// Add to \p Wait whatever count is needed so that counter \p T has resolved
// the operation with score \p ScoreToWait. No-op if the score is already
// outside the bracket (i.e. the operation has completed).
void WaitcntBrackets::determineWaitForScore(InstCounterType T,
                                            unsigned ScoreToWait,
                                            AMDGPU::Waitcnt &Wait) const {
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);

  // If the score falls within the bracket, we need a waitcnt.
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
        !Context->ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, Count: 0);
    } else if (counterOutOfOrder(T)) {
      // Counter can get decremented out-of-order when there
      // are multiple types event in the bracket. Also emit an s_wait counter
      // with a conservative value of 0 for the counter.
      addWait(Wait, T, Count: 0);
    } else {
      // If a counter has been maxed out avoid overflow by waiting for
      // MAX(CounterType) - 1 instead.
      unsigned NeededWait = std::min(
          a: UB - ScoreToWait, b: getWaitCountMax(Limits: Context->getLimits(), T) - 1);
      addWait(Wait, T, Count: NeededWait);
    }
  }
}
1535
// Compute the wait needed so that all async operations older than the N
// most recent async marks have completed, then retire the satisfied marks.
AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
  LLVM_DEBUG({
    dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
           << ":\n";
    for (const auto &Mark : AsyncMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
  });

  if (AsyncMarks.size() == MaxAsyncMarks) {
    // Enforcing MaxAsyncMarks here is unnecessary work because the size of
    // MaxAsyncMarks is linear when traversing straightline code. But we do
    // need to check if truncation may have occurred at a merge, and adjust N
    // to ensure that a wait is generated.
    LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
    N = std::min(a: N, b: (unsigned)MaxAsyncMarks - 1);
  }

  AMDGPU::Waitcnt Wait;
  // Fewer marks than requested: everything older has already been waited on.
  if (AsyncMarks.size() <= N) {
    LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
    return Wait;
  }

  size_t MarkIndex = AsyncMarks.size() - N - 1;
  const auto &RequiredMark = AsyncMarks[MarkIndex];
  for (InstCounterType T : inst_counter_types())
    determineWaitForScore(T, ScoreToWait: RequiredMark[T], Wait);

  // Immediately remove the waited mark and all older ones
  // This happens BEFORE the wait is actually inserted, which is fine
  // because we've already extracted the wait requirements
  LLVM_DEBUG({
    dbgs() << "Removing " << (MarkIndex + 1)
           << " async marks after determining wait\n";
  });
  AsyncMarks.erase(CS: AsyncMarks.begin(), CE: AsyncMarks.begin() + MarkIndex + 1);

  LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
  return Wait;
}
1578
1579void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
1580 AMDGPU::Waitcnt &Wait) const {
1581 if (Reg == AMDGPU::SCC) {
1582 determineWaitForScore(T, ScoreToWait: SCCScore, Wait);
1583 } else {
1584 bool IsVGPR = Context->TRI->isVectorRegister(MRI: *Context->MRI, Reg);
1585 for (MCRegUnit RU : regunits(Reg))
1586 determineWaitForScore(
1587 T, ScoreToWait: IsVGPR ? getVMemScore(TID: toVMEMID(RU), T) : getSGPRScore(RU, T),
1588 Wait);
1589 }
1590}
1591
1592void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
1593 AMDGPU::Waitcnt &Wait) const {
1594 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1595 determineWaitForScore(T, ScoreToWait: getVMemScore(TID, T), Wait);
1596}
1597
// If \p Inst is a barrier wait matching the barrier whose signal left an SCC
// write pending, retire that pending write (and, when it was the only KM_CNT
// event, collapse the KM_CNT bracket).
void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
  // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
  // SCC has landed
  if (PendingSCCWrite &&
      PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
      PendingSCCWrite->getOperand(i: 0).getImm() == Inst->getOperand(i: 0).getImm()) {
    WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
    // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
    if ((PendingEvents & Context->getWaitEvents(T: KM_CNT)) ==
        SCC_WRITE_PendingEvent) {
      setScoreLB(T: KM_CNT, Val: getScoreUB(T: KM_CNT));
    }

    PendingEvents.remove(Other: SCC_WRITE_PendingEvent);
    PendingSCCWrite = nullptr;
  }
}
1615
1616void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1617 applyWaitcnt(Wait, T: LOAD_CNT);
1618 applyWaitcnt(Wait, T: EXP_CNT);
1619 applyWaitcnt(Wait, T: DS_CNT);
1620 applyWaitcnt(Wait, T: STORE_CNT);
1621 applyWaitcnt(Wait, T: SAMPLE_CNT);
1622 applyWaitcnt(Wait, T: BVH_CNT);
1623 applyWaitcnt(Wait, T: KM_CNT);
1624 applyWaitcnt(Wait, T: X_CNT);
1625 applyWaitcnt(Wait, T: VA_VDST);
1626 applyWaitcnt(Wait, T: VM_VSRC);
1627}
1628
// Apply a wait of \p Count on counter \p T: advance the lower bound (or fully
// collapse the bracket for a wait-to-zero) and propagate the implied effects
// on X_CNT's group events.
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
  // A wait for more events than are outstanding changes nothing.
  if (Count >= UB)
    return;
  if (Count != 0) {
    // A partial wait on an out-of-order counter gives no ordering guarantee,
    // so the bracket cannot be tightened.
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, Val: std::max(a: getScoreLB(T), b: UB - Count));
  } else {
    // Wait-to-zero: everything outstanding has completed.
    setScoreLB(T, Val: UB);
    PendingEvents.remove(Other: Context->getWaitEvents(T));
  }

  // A kmcnt(0) also drains SMEM address translations tracked by X_CNT.
  if (T == KM_CNT && Count == 0 && hasPendingEvent(E: SMEM_GROUP)) {
    if (!hasMixedPendingEvents(T: X_CNT))
      applyWaitcnt(T: X_CNT, Count: 0);
    else
      PendingEvents.remove(Event: SMEM_GROUP);
  }
  // VMEM loads retire in order, so a loadcnt wait bounds X_CNT equally when
  // only loads (no stores) are in flight.
  if (T == LOAD_CNT && hasPendingEvent(E: VMEM_GROUP) &&
      !hasPendingEvent(T: STORE_CNT)) {
    if (!hasMixedPendingEvents(T: X_CNT))
      applyWaitcnt(T: X_CNT, Count);
    else if (Count == 0)
      PendingEvents.remove(Event: VMEM_GROUP);
  }
}
1656
1657void WaitcntBrackets::applyWaitcnt(const Waitcnt &Wait, InstCounterType T) {
1658 unsigned Cnt = Wait.get(T);
1659 applyWaitcnt(T, Count: Cnt);
1660}
1661
// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory read always can go out of order.
  if ((T == Context->SmemAccessCounter && hasPendingEvent(E: SMEM_ACCESS)) ||
      (T == X_CNT && hasPendingEvent(E: SMEM_GROUP)))
    return true;

  // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
  // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
  // out-of-order completion.
  if (T == LOAD_CNT) {
    unsigned Events = hasPendingEvent(T);
    // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
    // events
    Events &= ~(1 << GLOBAL_INV_ACCESS);
    // Return true only if there are still multiple event types after removing
    // GLOBAL_INV (classic "more than one bit set" test).
    return Events & (Events - 1);
  }

  // Otherwise, out-of-order completion is possible exactly when events of
  // more than one kind are pending for this counter.
  return hasMixedPendingEvents(T);
}
1685
// Register the legacy pass and its analysis dependencies (machine loop info
// and machine post-dominators) with the pass registry.
INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
                    false, false)

// The address of ID (not its value) uniquely identifies the pass.
char SIInsertWaitcntsLegacy::ID = 0;

// Externally visible handle used to reference this pass by identity.
char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1696
/// Factory for the legacy SIInsertWaitcnts pass; the caller takes ownership
/// of the returned pass object.
FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcntsLegacy();
}
1700
1701static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1702 unsigned NewEnc) {
1703 int OpIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpName);
1704 assert(OpIdx >= 0);
1705
1706 MachineOperand &MO = MI.getOperand(i: OpIdx);
1707
1708 if (NewEnc == MO.getImm())
1709 return false;
1710
1711 MO.setImm(NewEnc);
1712 return true;
1713}
1714
1715/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1716/// and if so, which counter it is waiting on.
1717static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1718 switch (Opcode) {
1719 case AMDGPU::S_WAIT_LOADCNT:
1720 return LOAD_CNT;
1721 case AMDGPU::S_WAIT_EXPCNT:
1722 return EXP_CNT;
1723 case AMDGPU::S_WAIT_STORECNT:
1724 return STORE_CNT;
1725 case AMDGPU::S_WAIT_SAMPLECNT:
1726 return SAMPLE_CNT;
1727 case AMDGPU::S_WAIT_BVHCNT:
1728 return BVH_CNT;
1729 case AMDGPU::S_WAIT_DSCNT:
1730 return DS_CNT;
1731 case AMDGPU::S_WAIT_KMCNT:
1732 return KM_CNT;
1733 case AMDGPU::S_WAIT_XCNT:
1734 return X_CNT;
1735 default:
1736 return {};
1737 }
1738}
1739
1740bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1741 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Waitcnt->getOpcode());
1742 if (Opcode == Waitcnt->getOpcode())
1743 return false;
1744
1745 Waitcnt->setDesc(TII.get(Opcode));
1746 return true;
1747}
1748
1749/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1750/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1751/// from \p Wait that were added by previous passes. Currently this pass
1752/// conservatively assumes that these preexisting waits are required for
1753/// correctness.
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  // The first kept instruction of each mergeable kind; later duplicates are
  // erased and their requirements folded into `Wait`.
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;

  LLVM_DEBUG({
    dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
    if (It.isEnd())
      dbgs() << "end of block\n";
    else
      dbgs() << *It;
  });

  // Walk the pre-existing wait instructions in [OldWaitcntInstr, It),
  // using an early-inc range since the loop may erase instructions.
  for (auto &II :
       make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
    if (II.isMetaInstruction()) {
      LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
      continue;
    }

    // A soft waitcnt maps to a different non-soft opcode; only soft waits
    // (and only outside -O0) may be simplified or removed entirely.
    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.
    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(i: 0).getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(Version: IV, Encoded: IEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
      Wait = Wait.combined(Other: OldWait);

      // Merge consecutive waitcnt of the same type by erasing multiples.
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntInstr = &II;
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
      assert(ST.hasVMemToLDSLoad());
      LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
                        << "Before: " << Wait << '\n';);
      ScoreBrackets.determineWaitForLDSDMA(T: LOAD_CNT, TID: LDSDMA_BEGIN, Wait);
      LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);

      // It is possible (but unlikely) that this is the only wait instruction,
      // in which case, we exit this loop without a WaitcntInstr to consume
      // `Wait`. But that works because `Wait` was passed in by reference, and
      // the callee eventually calls createNewWaitcnt on it. We test this
      // possibility in an artificial MIR test since such a situation cannot be
      // recreated by running the memory legalizer.
      II.eraseFromParent();
    } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
      unsigned N = II.getOperand(i: 0).getImm();
      LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
      AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
      Wait = Wait.combined(Other: OldWait);
    } else {
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

      unsigned OldVSCnt =
          TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(T: InstCounterType::STORE_CNT, Count&: OldVSCnt);
      Wait.set(T: STORE_CNT, Val: std::min(a: Wait.get(T: STORE_CNT), b: OldVSCnt));

      // Merge consecutive S_WAITCNT_VSCNT the same way as S_WAITCNT above.
      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntVsCntInstr = &II;
    }
  }

  // Rewrite the kept S_WAITCNT with the merged counts, record those counters
  // as applied in the score brackets, then reset them in `Wait` (~0u = "no
  // wait") so the caller does not emit a second wait for them.
  if (WaitcntInstr) {
    Modified |= updateOperandIfDifferent(MI&: *WaitcntInstr, OpName: AMDGPU::OpName::simm16,
                                         NewEnc: AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait));
    Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntInstr);

    ScoreBrackets.applyWaitcnt(Wait, T: LOAD_CNT);
    ScoreBrackets.applyWaitcnt(Wait, T: EXP_CNT);
    ScoreBrackets.applyWaitcnt(Wait, T: DS_CNT);
    Wait.set(T: LOAD_CNT, Val: ~0u);
    Wait.set(T: EXP_CNT, Val: ~0u);
    Wait.set(T: DS_CNT, Val: ~0u);

    LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                   << "New Instr at block end: "
                                   << *WaitcntInstr << '\n'
                          : dbgs() << "applied pre-existing waitcnt\n"
                                   << "Old Instr: " << *It
                                   << "New Instr: " << *WaitcntInstr << '\n');
  }

  // Likewise rewrite the kept S_WAITCNT_VSCNT for STORE_CNT.
  if (WaitcntVsCntInstr) {
    Modified |= updateOperandIfDifferent(
        MI&: *WaitcntVsCntInstr, OpName: AMDGPU::OpName::simm16, NewEnc: Wait.get(T: STORE_CNT));
    Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntVsCntInstr);

    ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.get(T: STORE_CNT));
    Wait.set(T: STORE_CNT, Val: ~0u);

    LLVM_DEBUG(It.isEnd()
                   ? dbgs() << "applied pre-existing waitcnt\n"
                            << "New Instr at block end: " << *WaitcntVsCntInstr
                            << '\n'
                   : dbgs() << "applied pre-existing waitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
  }

  return Modified;
}
1873
1874/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1875/// required counters in \p Wait
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(MBBI: It);

  // Helper to emit expanded waitcnt sequence for profiling.
  // Emits waitcnts from (Outstanding-1) down to Target.
  // The EmitWaitcnt callback emits a single waitcnt.
  // NOTE(review): if Outstanding == 0, `--Outstanding` wraps to ~0u and one
  // wait with that count is emitted; presumably callers never pass 0 here —
  // confirm (the GFX12+ variant guards this case with an explicit check).
  auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
                                 auto EmitWaitcnt) {
    do {
      EmitWaitcnt(--Outstanding);
    } while (Outstanding > Target);
    Modified = true;
  };

  // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
  // single instruction while VScnt has its own instruction.
  if (Wait.hasWaitExceptStoreCnt()) {
    // If profiling expansion is enabled, emit an expanded sequence
    if (ExpandWaitcntProfiling) {
      // Check if any of the counters to be waited on are out-of-order.
      // If so, fall back to normal (non-expanded) behavior since expansion
      // would provide misleading profiling information.
      bool AnyOutOfOrder = false;
      for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
        unsigned WaitCnt = Wait.get(T: CT);
        // ~0u means "no wait required" for this counter.
        if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(T: CT)) {
          AnyOutOfOrder = true;
          break;
        }
      }

      if (AnyOutOfOrder) {
        // Fall back to non-expanded wait
        unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait);
        BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: Enc);
        Modified = true;
      } else {
        // All counters are in-order, safe to expand
        for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
          unsigned WaitCnt = Wait.get(T: CT);
          if (WaitCnt == ~0u)
            continue;

          // Clamp the number of outstanding events to the largest count the
          // encoding can represent, then count down to the required value.
          unsigned Outstanding = std::min(a: ScoreBrackets.getOutstanding(T: CT),
                                          b: getWaitCountMax(Limits: getLimits(), T: CT) - 1);
          EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
            AMDGPU::Waitcnt W;
            W.set(T: CT, Val: Count);
            BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT))
                .addImm(Val: AMDGPU::encodeWaitcnt(Version: IV, Decoded: W));
          });
        }
      }
    } else {
      // Normal behavior: emit single combined waitcnt
      unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait);
      [[maybe_unused]] auto SWaitInst =
          BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: Enc);
      Modified = true;

      LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  // STORE_CNT (VScnt) is waited on with a separate S_WAITCNT_VSCNT
  // instruction on targets that have it.
  if (Wait.hasWaitStoreCnt()) {
    assert(ST.hasVscnt());

    if (ExpandWaitcntProfiling && Wait.get(T: STORE_CNT) != ~0u &&
        !ScoreBrackets.counterOutOfOrder(T: STORE_CNT)) {
      // Only expand if counter is not out-of-order
      unsigned Outstanding =
          std::min(a: ScoreBrackets.getOutstanding(T: STORE_CNT),
                   b: getWaitCountMax(Limits: getLimits(), T: STORE_CNT) - 1);
      EmitExpandedWaitcnt(
          Outstanding, Wait.get(T: STORE_CNT), [&](unsigned Count) {
            BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
                .addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
                .addImm(Val: Count);
          });
    } else {
      [[maybe_unused]] auto SWaitInst =
          BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
              .addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
              .addImm(Val: Wait.get(T: STORE_CNT));
      Modified = true;

      LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  return Modified;
}
1977
1978AMDGPU::Waitcnt
1979WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1980 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
1981}
1982
1983AMDGPU::Waitcnt
1984WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1985 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
1986 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1987 ~0u /* XCNT */, ExpertVal, ExpertVal);
1988}
1989
1990/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1991/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1992/// were added by previous passes. Currently this pass conservatively
1993/// assumes that these preexisting waits are required for correctness.
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  // First kept instruction of each kind; later duplicates of the same kind
  // are erased after folding their requirements into Wait/RequiredWait.
  MachineInstr *CombinedLoadDsCntInstr = nullptr;
  MachineInstr *CombinedStoreDsCntInstr = nullptr;
  MachineInstr *WaitcntDepctrInstr = nullptr;
  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};

  LLVM_DEBUG({
    dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
    if (It.isEnd())
      dbgs() << "end of block\n";
    else
      dbgs() << *It;
  });

  // Accumulate waits that should not be simplified.
  AMDGPU::Waitcnt RequiredWait;

  // Walk the pre-existing wait instructions in [OldWaitcntInstr, It),
  // using an early-inc range since the loop may erase instructions.
  for (auto &II :
       make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
    if (II.isMetaInstruction()) {
      LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
      continue;
    }

    // Points at the slot tracking the kept instruction of this kind; set by
    // every branch below that does not `continue` out of the iteration.
    MachineInstr **UpdatableInstr;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
    // attempt to do more than that either.
    if (Opcode == AMDGPU::S_WAITCNT)
      continue;

    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
      unsigned OldEnc =
          TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(Version: IV, LoadcntDscnt: OldEnc);
      if (TrySimplify)
        Wait = Wait.combined(Other: OldWait);
      else
        RequiredWait = RequiredWait.combined(Other: OldWait);
      UpdatableInstr = &CombinedLoadDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
      unsigned OldEnc =
          TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(Version: IV, StorecntDscnt: OldEnc);
      if (TrySimplify)
        Wait = Wait.combined(Other: OldWait);
      else
        RequiredWait = RequiredWait.combined(Other: OldWait);
      UpdatableInstr = &CombinedStoreDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
      // Only the VA_VDST and VM_VSRC subfields of the DEPCTR encoding are
      // managed here; the rest of the encoding is left untouched.
      unsigned OldEnc =
          TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait;
      OldWait.set(T: VA_VDST, Val: AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: OldEnc));
      OldWait.set(T: VM_VSRC, Val: AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: OldEnc));
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
      Wait = Wait.combined(Other: OldWait);
      UpdatableInstr = &WaitcntDepctrInstr;
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
      // Architectures higher than GFX10 do not have direct loads to
      // LDS, so no work required here yet.
      II.eraseFromParent();
      continue;
    } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
      reportFatalUsageError(reason: "WAIT_ASYNCMARK is not ready for GFX12 yet");
    } else {
      // Must be one of the gfx12+ single-counter S_WAIT_*CNT instructions.
      std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
      assert(CT.has_value());
      unsigned OldCnt =
          TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        addWait(Wait, T: CT.value(), Count: OldCnt);
      else
        addWait(Wait&: RequiredWait, T: CT.value(), Count: OldCnt);
      UpdatableInstr = &WaitInstrs[CT.value()];
    }

    // Merge consecutive waitcnt of the same type by erasing multiples.
    if (!*UpdatableInstr) {
      *UpdatableInstr = &II;
    } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
      // S_WAITCNT_DEPCTR requires special care. Don't remove a
      // duplicate if it is waiting on things other than VA_VDST or
      // VM_VSRC. If that is the case, just make sure the VA_VDST and
      // VM_VSRC subfields of the operand are set to the "no wait"
      // values.

      unsigned Enc = TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
      Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Encoded: Enc, VmVsrc: ~0u);
      Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Encoded: Enc, VaVdst: ~0u);

      if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(STI: ST)) {
        // Still waits on other subfields: keep it, with VA_VDST/VM_VSRC
        // neutralized.
        Modified |= updateOperandIfDifferent(MI&: II, OpName: AMDGPU::OpName::simm16, NewEnc: Enc);
        Modified |= promoteSoftWaitCnt(Waitcnt: &II);
      } else {
        II.eraseFromParent();
        Modified = true;
      }
    } else {
      II.eraseFromParent();
      Modified = true;
    }
  }

  // Drop any part of Wait already covered by the non-simplifiable
  // RequiredWait, then fold RequiredWait back in.
  ScoreBrackets.simplifyWaitcnt(CheckWait: Wait.combined(Other: RequiredWait), UpdateWait&: Wait);
  Wait = Wait.combined(Other: RequiredWait);

  if (CombinedLoadDsCntInstr) {
    // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
    // to be waited for. Otherwise, let the instruction be deleted so
    // the appropriate single counter wait instruction can be inserted
    // instead, when new S_WAIT_*CNT instructions are inserted by
    // createNewWaitcnt(). As a side effect, resetting the wait counts will
    // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
    // the loop below that deals with single counter instructions.
    //
    // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
    // instructions that have decremented LOAD_CNT or DS_CNT on completion
    // will have needed to wait for their register sources to be available
    // first.
    if (Wait.get(T: LOAD_CNT) != ~0u && Wait.get(T: DS_CNT) != ~0u) {
      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
      Modified |= updateOperandIfDifferent(MI&: *CombinedLoadDsCntInstr,
                                           OpName: AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(Waitcnt: CombinedLoadDsCntInstr);
      ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.get(T: LOAD_CNT));
      ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.get(T: DS_CNT));
      Wait.set(T: LOAD_CNT, Val: ~0u);
      Wait.set(T: DS_CNT, Val: ~0u);

      LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                     << "New Instr at block end: "
                                     << *CombinedLoadDsCntInstr << '\n'
                            : dbgs() << "applied pre-existing waitcnt\n"
                                     << "Old Instr: " << *It << "New Instr: "
                                     << *CombinedLoadDsCntInstr << '\n');
    } else {
      CombinedLoadDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedStoreDsCntInstr) {
    // Similarly for S_WAIT_STORECNT_DSCNT.
    if (Wait.get(T: STORE_CNT) != ~0u && Wait.get(T: DS_CNT) != ~0u) {
      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
      Modified |= updateOperandIfDifferent(MI&: *CombinedStoreDsCntInstr,
                                           OpName: AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(Waitcnt: CombinedStoreDsCntInstr);
      ScoreBrackets.applyWaitcnt(Wait, T: STORE_CNT);
      ScoreBrackets.applyWaitcnt(Wait, T: DS_CNT);
      Wait.set(T: STORE_CNT, Val: ~0u);
      Wait.set(T: DS_CNT, Val: ~0u);

      LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                     << "New Instr at block end: "
                                     << *CombinedStoreDsCntInstr << '\n'
                            : dbgs() << "applied pre-existing waitcnt\n"
                                     << "Old Instr: " << *It << "New Instr: "
                                     << *CombinedStoreDsCntInstr << '\n');
    } else {
      CombinedStoreDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  // Look for an opportunity to convert existing S_WAIT_LOADCNT,
  // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
  // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
  // instructions so that createNewWaitcnt() will create new combined
  // instructions to replace them.

  if (Wait.get(T: DS_CNT) != ~0u) {
    // This is a vector of addresses in WaitInstrs pointing to instructions
    // that should be removed if they are present.
    SmallVector<MachineInstr **, 2> WaitsToErase;

    // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
    // both) need to be waited for, ensure that there are no existing
    // individual wait count instructions for these.

    if (Wait.get(T: LOAD_CNT) != ~0u) {
      WaitsToErase.push_back(Elt: &WaitInstrs[LOAD_CNT]);
      WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
    } else if (Wait.get(T: STORE_CNT) != ~0u) {
      WaitsToErase.push_back(Elt: &WaitInstrs[STORE_CNT]);
      WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
    }

    for (MachineInstr **WI : WaitsToErase) {
      if (!*WI)
        continue;

      (*WI)->eraseFromParent();
      *WI = nullptr;
      Modified = true;
    }
  }

  // Rewrite (or erase, when no wait remains) each kept single-counter
  // instruction, marking rewritten counters as applied so the caller does
  // not emit a second wait for them.
  for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
    if (!WaitInstrs[CT])
      continue;

    unsigned NewCnt = Wait.get(T: CT);
    if (NewCnt != ~0u) {
      Modified |= updateOperandIfDifferent(MI&: *WaitInstrs[CT],
                                           OpName: AMDGPU::OpName::simm16, NewEnc: NewCnt);
      Modified |= promoteSoftWaitCnt(Waitcnt: WaitInstrs[CT]);

      ScoreBrackets.applyWaitcnt(T: CT, Count: NewCnt);
      setNoWait(Wait, T: CT);

      LLVM_DEBUG(It.isEnd()
                     ? dbgs() << "applied pre-existing waitcnt\n"
                              << "New Instr at block end: " << *WaitInstrs[CT]
                              << '\n'
                     : dbgs() << "applied pre-existing waitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitInstrs[CT] << '\n');
    } else {
      WaitInstrs[CT]->eraseFromParent();
      Modified = true;
    }
  }

  if (WaitcntDepctrInstr) {
    // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
    // subfields with the new required values.
    unsigned Enc =
        TII.getNamedOperand(MI&: *WaitcntDepctrInstr, OperandName: AMDGPU::OpName::simm16)
            ->getImm();
    Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Encoded: Enc, VmVsrc: Wait.get(T: VM_VSRC));
    Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Encoded: Enc, VaVdst: Wait.get(T: VA_VDST));

    ScoreBrackets.applyWaitcnt(T: VA_VDST, Count: Wait.get(T: VA_VDST));
    ScoreBrackets.applyWaitcnt(T: VM_VSRC, Count: Wait.get(T: VM_VSRC));
    Wait.set(T: VA_VDST, Val: ~0u);
    Wait.set(T: VM_VSRC, Val: ~0u);

    // If that new encoded Depctr immediate would actually still wait
    // for anything, update the instruction's operand. Otherwise it can
    // just be deleted.
    if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(STI: ST)) {
      Modified |= updateOperandIfDifferent(MI&: *WaitcntDepctrInstr,
                                           OpName: AMDGPU::OpName::simm16, NewEnc: Enc);
      LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
                                     << "New Instr at block end: "
                                     << *WaitcntDepctrInstr << '\n'
                            : dbgs() << "applyPreexistingWaitcnt\n"
                                     << "Old Instr: " << *It << "New Instr: "
                                     << *WaitcntDepctrInstr << '\n');
    } else {
      WaitcntDepctrInstr->eraseFromParent();
      Modified = true;
    }
  }

  return Modified;
}
2266
2267/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(MBBI: It);

  // Helper to emit expanded waitcnt sequence for profiling.
  // Counts down from (Outstanding-1) to Target, emitting one wait per step;
  // the `I != ~0u` test guards the wrap-around when Outstanding is 0.
  auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
                                 auto EmitWaitcnt) {
    for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I)
      EmitWaitcnt(I);
    EmitWaitcnt(Target);
    Modified = true;
  };

  // For GFX12+, we use separate wait instructions, which makes expansion
  // simpler
  if (ExpandWaitcntProfiling) {
    for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
      unsigned Count = Wait.get(T: CT);
      // ~0u means "no wait required" for this counter.
      if (Count == ~0u)
        continue;

      // Skip expansion for out-of-order counters - emit normal wait instead
      if (ScoreBrackets.counterOutOfOrder(T: CT)) {
        BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: instrsForExtendedCounterTypes[CT]))
            .addImm(Val: Count);
        Modified = true;
        continue;
      }

      // Clamp outstanding events to the largest representable count, then
      // emit the countdown sequence.
      unsigned Outstanding = std::min(a: ScoreBrackets.getOutstanding(T: CT),
                                      b: getWaitCountMax(Limits: getLimits(), T: CT) - 1);
      EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
        BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: instrsForExtendedCounterTypes[CT]))
            .addImm(Val);
      });
    }
    return Modified;
  }

  // Normal behavior (no expansion)
  // Check for opportunities to use combined wait instructions.
  if (Wait.get(T: DS_CNT) != ~0u) {
    MachineInstr *SWaitInst = nullptr;

    if (Wait.get(T: LOAD_CNT) != ~0u) {
      unsigned Enc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);

      SWaitInst = BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT))
                      .addImm(Val: Enc);

      Wait.set(T: LOAD_CNT, Val: ~0u);
      Wait.set(T: DS_CNT, Val: ~0u);
    } else if (Wait.get(T: STORE_CNT) != ~0u) {
      unsigned Enc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);

      SWaitInst = BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAIT_STORECNT_DSCNT))
                      .addImm(Val: Enc);

      Wait.set(T: STORE_CNT, Val: ~0u);
      Wait.set(T: DS_CNT, Val: ~0u);
    }

    if (SWaitInst) {
      Modified = true;

      LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  // Generate an instruction for any remaining counter that needs
  // waiting for.

  for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
    unsigned Count = Wait.get(T: CT);
    if (Count == ~0u)
      continue;

    [[maybe_unused]] auto SWaitInst =
        BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: instrsForExtendedCounterTypes[CT]))
            .addImm(Val: Count);

    Modified = true;

    LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  // Expert-mode-only DEPCTR wait covering the VA_VDST / VM_VSRC subfields.
  if (Wait.hasWaitDepctr()) {
    assert(IsExpertMode);
    unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: Wait.get(T: VM_VSRC), STI: ST);
    Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Encoded: Enc, VaVdst: Wait.get(T: VA_VDST));

    [[maybe_unused]] auto SWaitInst =
        BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)).addImm(Val: Enc);

    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
2379
2380/// Generate s_waitcnt instruction to be placed before cur_Inst.
2381/// Instructions of a given type are returned in order,
2382/// but instructions of different types can complete out of order.
2383/// We rely on this in-order completion
2384/// and simply assign a score to the memory access instructions.
2385/// We keep track of the active "score bracket" to determine
2386/// if an access of a memory read requires an s_waitcnt
2387/// and if so what the value of each counter is.
2388/// The "score bracket" is bound by the lower bound and upper bound
2389/// scores (*_score_LB and *_score_ub respectively).
2390/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2391/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2392/// (GFX12+ only, where DS_CNT is a separate counter).
2393bool SIInsertWaitcnts::generateWaitcntInstBefore(
2394 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2395 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2396 LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2397 setForceEmitWaitcnt();
2398
2399 assert(!MI.isMetaInstruction());
2400
2401 AMDGPU::Waitcnt Wait;
2402 const unsigned Opc = MI.getOpcode();
2403
2404 switch (Opc) {
2405 case AMDGPU::BUFFER_WBINVL1:
2406 case AMDGPU::BUFFER_WBINVL1_SC:
2407 case AMDGPU::BUFFER_WBINVL1_VOL:
2408 case AMDGPU::BUFFER_GL0_INV:
2409 case AMDGPU::BUFFER_GL1_INV: {
2410 // FIXME: This should have already been handled by the memory legalizer.
2411 // Removing this currently doesn't affect any lit tests, but we need to
2412 // verify that nothing was relying on this. The number of buffer invalidates
2413 // being handled here should not be expanded.
2414 Wait.set(T: LOAD_CNT, Val: 0);
2415 break;
2416 }
2417 case AMDGPU::SI_RETURN_TO_EPILOG:
2418 case AMDGPU::SI_RETURN:
2419 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2420 case AMDGPU::S_SETPC_B64_return: {
2421 // All waits must be resolved at call return.
2422 // NOTE: this could be improved with knowledge of all call sites or
2423 // with knowledge of the called routines.
2424 ReturnInsts.insert(V: &MI);
2425 AMDGPU::Waitcnt AllZeroWait =
2426 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2427 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2428 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2429 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2430 // no need to wait for it at function boundaries.
2431 if (ST->hasExtendedWaitCounts() &&
2432 !ScoreBrackets.hasPendingEvent(E: VMEM_ACCESS))
2433 AllZeroWait.set(T: LOAD_CNT, Val: ~0u);
2434 Wait = AllZeroWait;
2435 break;
2436 }
2437 case AMDGPU::S_ENDPGM:
2438 case AMDGPU::S_ENDPGM_SAVED: {
2439 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2440 // Technically the hardware will do this on its own if we don't, but that
2441 // might cost extra cycles compared to doing it explicitly.
2442 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2443 // have to wait for outstanding VMEM stores. In this case it can be useful
2444 // to send a message to explicitly release all VGPRs before the stores have
2445 // completed, but it is only safe to do this if there are no outstanding
2446 // scratch stores.
2447 EndPgmInsts[&MI] = !ScoreBrackets.empty(T: STORE_CNT) &&
2448 !ScoreBrackets.hasPendingEvent(E: SCRATCH_WRITE_ACCESS);
2449 break;
2450 }
2451 case AMDGPU::S_SENDMSG:
2452 case AMDGPU::S_SENDMSGHALT: {
2453 if (ST->hasLegacyGeometry() &&
2454 ((MI.getOperand(i: 0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2455 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
2456 // Resolve vm waits before gs-done.
2457 Wait.set(T: LOAD_CNT, Val: 0);
2458 break;
2459 }
2460 [[fallthrough]];
2461 }
2462 default: {
2463
2464 // Export & GDS instructions do not read the EXEC mask until after the
2465 // export is granted (which can occur well after the instruction is issued).
2466 // The shader program must flush all EXP operations on the export-count
2467 // before overwriting the EXEC mask.
2468 if (MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI)) {
2469 // Export and GDS are tracked individually, either may trigger a waitcnt
2470 // for EXEC.
2471 if (ScoreBrackets.hasPendingEvent(E: EXP_GPR_LOCK) ||
2472 ScoreBrackets.hasPendingEvent(E: EXP_PARAM_ACCESS) ||
2473 ScoreBrackets.hasPendingEvent(E: EXP_POS_ACCESS) ||
2474 ScoreBrackets.hasPendingEvent(E: GDS_GPR_LOCK)) {
2475 Wait.set(T: EXP_CNT, Val: 0);
2476 }
2477 }
2478
2479 // Wait for any pending GDS instruction to complete before any
2480 // "Always GDS" instruction.
2481 if (TII->isAlwaysGDS(Opcode: Opc) && ScoreBrackets.hasPendingGDS())
2482 addWait(Wait, T: DS_CNT, Count: ScoreBrackets.getPendingGDSWait());
2483
2484 if (MI.isCall()) {
2485 // The function is going to insert a wait on everything in its prolog.
2486 // This still needs to be careful if the call target is a load (e.g. a GOT
2487 // load). We also need to check WAW dependency with saved PC.
2488 CallInsts.insert(V: &MI);
2489 Wait = AMDGPU::Waitcnt();
2490
2491 const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI);
2492 if (CallAddrOp.isReg()) {
2493 ScoreBrackets.determineWaitForPhysReg(
2494 T: SmemAccessCounter, Reg: CallAddrOp.getReg().asMCReg(), Wait);
2495
2496 if (const auto *RtnAddrOp =
2497 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst)) {
2498 ScoreBrackets.determineWaitForPhysReg(
2499 T: SmemAccessCounter, Reg: RtnAddrOp->getReg().asMCReg(), Wait);
2500 }
2501 }
2502 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2503 ScoreBrackets.tryClearSCCWriteEvent(Inst: &MI);
2504 } else {
2505 // FIXME: Should not be relying on memoperands.
2506 // Look at the source operands of every instruction to see if
2507 // any of them results from a previous memory operation that affects
2508 // its current usage. If so, an s_waitcnt instruction needs to be
2509 // emitted.
2510 // If the source operand was defined by a load, add the s_waitcnt
2511 // instruction.
2512 //
2513 // Two cases are handled for destination operands:
2514 // 1) If the destination operand was defined by a load, add the s_waitcnt
2515 // instruction to guarantee the right WAW order.
2516 // 2) If a destination operand that was used by a recent export/store ins,
2517 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2518
2519 for (const MachineMemOperand *Memop : MI.memoperands()) {
2520 const Value *Ptr = Memop->getValue();
2521 if (Memop->isStore()) {
2522 if (auto It = SLoadAddresses.find(Val: Ptr); It != SLoadAddresses.end()) {
2523 addWait(Wait, T: SmemAccessCounter, Count: 0);
2524 if (PDT->dominates(A: MI.getParent(), B: It->second))
2525 SLoadAddresses.erase(I: It);
2526 }
2527 }
2528 unsigned AS = Memop->getAddrSpace();
2529 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
2530 continue;
2531 // No need to wait before load from VMEM to LDS.
2532 if (TII->mayWriteLDSThroughDMA(MI))
2533 continue;
2534
2535 // LOAD_CNT is only relevant to vgpr or LDS.
2536 unsigned TID = LDSDMA_BEGIN;
2537 if (Ptr && Memop->getAAInfo()) {
2538 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2539 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2540 if (MI.mayAlias(AA, Other: *LDSDMAStores[I], UseTBAA: true)) {
2541 if ((I + 1) >= NUM_LDSDMA) {
2542 // We didn't have enough slot to track this LDS DMA store, it
2543 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2544 ScoreBrackets.determineWaitForLDSDMA(T: LOAD_CNT, TID, Wait);
2545 break;
2546 }
2547
2548 ScoreBrackets.determineWaitForLDSDMA(T: LOAD_CNT, TID: TID + I + 1, Wait);
2549 }
2550 }
2551 } else {
2552 ScoreBrackets.determineWaitForLDSDMA(T: LOAD_CNT, TID, Wait);
2553 }
2554 if (Memop->isStore()) {
2555 ScoreBrackets.determineWaitForLDSDMA(T: EXP_CNT, TID, Wait);
2556 }
2557 }
2558
2559 // Loop over use and def operands.
2560 for (const MachineOperand &Op : MI.operands()) {
2561 if (!Op.isReg())
2562 continue;
2563
2564 // If the instruction does not read tied source, skip the operand.
2565 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
2566 continue;
2567
2568 MCPhysReg Reg = Op.getReg().asMCReg();
2569
2570 const bool IsVGPR = TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg());
2571 if (IsVGPR) {
2572 // Implicit VGPR defs and uses are never a part of the memory
2573 // instructions description and usually present to account for
2574 // super-register liveness.
2575 // TODO: Most of the other instructions also have implicit uses
2576 // for the liveness accounting only.
2577 if (Op.isImplicit() && MI.mayLoadOrStore())
2578 continue;
2579
2580 ScoreBrackets.determineWaitForPhysReg(T: VA_VDST, Reg, Wait);
2581 if (Op.isDef())
2582 ScoreBrackets.determineWaitForPhysReg(T: VM_VSRC, Reg, Wait);
2583 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2584 // previous write and this write are the same type of VMEM
2585 // instruction, in which case they are (in some architectures)
2586 // guaranteed to write their results in order anyway.
2587 // Additionally check instructions where Point Sample Acceleration
2588 // might be applied.
2589 if (Op.isUse() || !updateVMCntOnly(Inst: MI) ||
2590 ScoreBrackets.hasOtherPendingVmemTypes(Reg, V: getVmemType(Inst: MI)) ||
2591 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2592 !ST->hasVmemWriteVgprInOrder()) {
2593 ScoreBrackets.determineWaitForPhysReg(T: LOAD_CNT, Reg, Wait);
2594 ScoreBrackets.determineWaitForPhysReg(T: SAMPLE_CNT, Reg, Wait);
2595 ScoreBrackets.determineWaitForPhysReg(T: BVH_CNT, Reg, Wait);
2596 ScoreBrackets.clearVgprVmemTypes(Reg);
2597 }
2598
2599 if (Op.isDef() || ScoreBrackets.hasPendingEvent(E: EXP_LDS_ACCESS)) {
2600 ScoreBrackets.determineWaitForPhysReg(T: EXP_CNT, Reg, Wait);
2601 }
2602 ScoreBrackets.determineWaitForPhysReg(T: DS_CNT, Reg, Wait);
2603 } else if (Op.getReg() == AMDGPU::SCC) {
2604 ScoreBrackets.determineWaitForPhysReg(T: KM_CNT, Reg, Wait);
2605 } else {
2606 ScoreBrackets.determineWaitForPhysReg(T: SmemAccessCounter, Reg, Wait);
2607 }
2608
2609 if (ST->hasWaitXcnt() && Op.isDef())
2610 ScoreBrackets.determineWaitForPhysReg(T: X_CNT, Reg, Wait);
2611 }
2612 }
2613 }
2614 }
2615
2616 // Ensure safety against exceptions from outstanding memory operations while
2617 // waiting for a barrier:
2618 //
2619 // * Some subtargets safely handle backing off the barrier in hardware
2620 // when an exception occurs.
2621 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2622 // there can be no outstanding memory operations during the wait.
2623 // * Subtargets with split barriers don't need to back off the barrier; it
2624 // is up to the trap handler to preserve the user barrier state correctly.
2625 //
2626 // In all other cases, ensure safety by ensuring that there are no outstanding
2627 // memory operations.
2628 if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
2629 !ST->hasBackOffBarrier()) {
2630 Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2631 }
2632
2633 // TODO: Remove this work-around, enable the assert for Bug 457939
2634 // after fixing the scheduler. Also, the Shader Compiler code is
2635 // independent of target.
2636 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
2637 ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
2638 Wait.set(T: DS_CNT, Val: 0);
2639 }
2640
2641 // Verify that the wait is actually needed.
2642 ScoreBrackets.simplifyWaitcnt(Wait);
2643
2644 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2645 // waits on VA_VDST if the instruction it would precede is not a VALU
2646 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2647 // expert scheduling mode.
2648 if (TII->isVALU(MI))
2649 Wait.set(T: VA_VDST, Val: ~0u);
2650
2651 // Since the translation for VMEM addresses occur in-order, we can apply the
2652 // XCnt if the current instruction is of VMEM type and has a memory
2653 // dependency with another VMEM instruction in flight.
2654 if (Wait.get(T: X_CNT) != ~0u && isVmemAccess(MI)) {
2655 ScoreBrackets.applyWaitcnt(Wait, T: X_CNT);
2656 Wait.set(T: X_CNT, Val: ~0u);
2657 }
2658
2659 // When forcing emit, we need to skip terminators because that would break the
2660 // terminators of the MBB if we emit a waitcnt between terminators.
2661 if (ForceEmitZeroFlag && !MI.isTerminator())
2662 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2663
2664 // If we force waitcnt then update Wait accordingly.
2665 for (InstCounterType T : inst_counter_types()) {
2666 if (!ForceEmitWaitcnt[T])
2667 continue;
2668 Wait.set(T, Val: 0);
2669 }
2670
2671 if (FlushFlags.FlushVmCnt) {
2672 for (InstCounterType T : {LOAD_CNT, SAMPLE_CNT, BVH_CNT})
2673 Wait.set(T, Val: 0);
2674 }
2675
2676 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(T: DS_CNT))
2677 Wait.set(T: DS_CNT, Val: 0);
2678
2679 if (ForceEmitZeroLoadFlag && Wait.get(T: LOAD_CNT) != ~0u)
2680 Wait.set(T: LOAD_CNT, Val: 0);
2681
2682 return generateWaitcnt(Wait, It: MI.getIterator(), Block&: *MI.getParent(), ScoreBrackets,
2683 OldWaitcntInstr);
2684}
2685
// Emit or merge wait instructions so that every counter requested in \p Wait
// is satisfied immediately before \p It in \p Block. Pre-existing waitcnt
// instructions starting at \p OldWaitcntInstr are updated or erased where
// possible. Returns true if any instruction was added, modified or removed.
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  bool Modified = false;

  if (OldWaitcntInstr)
    // Try to merge the required wait with preexisting waitcnt instructions.
    // Also erase redundant waitcnt.
    Modified =
        WCG->applyPreexistingWaitcnt(ScoreBrackets, OldWaitcntInstr&: *OldWaitcntInstr, Wait, It);

  // ExpCnt can be merged into VINTERP.
  if (Wait.get(T: EXP_CNT) != ~0u && It != Block.instr_end() &&
      SIInstrInfo::isVINTERP(MI: *It)) {
    MachineOperand *WaitExp =
        TII->getNamedOperand(MI&: *It, OperandName: AMDGPU::OpName::waitexp);
    // Only tighten the VINTERP's existing waitexp immediate; never loosen it.
    if (Wait.get(T: EXP_CNT) < WaitExp->getImm()) {
      WaitExp->setImm(Wait.get(T: EXP_CNT));
      Modified = true;
    }
    // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
    ScoreBrackets.applyWaitcnt(Wait, T: EXP_CNT);
    Wait.set(T: EXP_CNT, Val: ~0u);

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
    Modified = true;

  // Any counts that could have been applied to any existing waitcnt
  // instructions will have been done so, now deal with any remaining.
  ScoreBrackets.applyWaitcnt(Wait);

  return Modified;
}
2725
2726std::optional<WaitEventType>
2727SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
2728 if (TII->isVALU(MI: Inst)) {
2729 // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
2730 // out-of-order with respect to each other, so each of these classes
2731 // has its own event.
2732
2733 if (TII->isXDL(MI: Inst))
2734 return VGPR_XDL_WRITE;
2735
2736 if (TII->isTRANS(MI: Inst))
2737 return VGPR_TRANS_WRITE;
2738
2739 if (AMDGPU::isDPMACCInstruction(Opc: Inst.getOpcode()))
2740 return VGPR_DPMACC_WRITE;
2741
2742 return VGPR_CSMACC_WRITE;
2743 }
2744
2745 // FLAT and LDS instructions may read their VGPR sources out-of-order
2746 // with respect to each other and all other VMEM instructions, so
2747 // each of these also has a separate event.
2748
2749 if (TII->isFLAT(MI: Inst))
2750 return VGPR_FLAT_READ;
2751
2752 if (TII->isDS(MI: Inst))
2753 return VGPR_LDS_READ;
2754
2755 if (TII->isVMEM(MI: Inst) || TII->isVIMAGE(MI: Inst) || TII->isVSAMPLE(MI: Inst))
2756 return VGPR_VMEM_READ;
2757
2758 // Otherwise, no hazard.
2759
2760 return {};
2761}
2762
2763bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2764 return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
2765 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(Opc: MI.getOpcode()));
2766}
2767
2768// Return true if the next instruction is S_ENDPGM, following fallthrough
2769// blocks if necessary.
2770bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2771 MachineBasicBlock *Block) const {
2772 auto BlockEnd = Block->getParent()->end();
2773 auto BlockIter = Block->getIterator();
2774
2775 while (true) {
2776 if (It.isEnd()) {
2777 if (++BlockIter != BlockEnd) {
2778 It = BlockIter->instr_begin();
2779 continue;
2780 }
2781
2782 return false;
2783 }
2784
2785 if (!It->isMetaInstruction())
2786 break;
2787
2788 It++;
2789 }
2790
2791 assert(!It.isEnd());
2792
2793 return It->getOpcode() == AMDGPU::S_ENDPGM;
2794}
2795
2796// Add a wait after an instruction if architecture requirements mandate one.
2797bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2798 MachineBasicBlock &Block,
2799 WaitcntBrackets &ScoreBrackets) {
2800 AMDGPU::Waitcnt Wait;
2801 bool NeedsEndPGMCheck = false;
2802
2803 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2804 Wait = WCG->getAllZeroWaitcnt(IncludeVSCnt: Inst.mayStore() &&
2805 !SIInstrInfo::isAtomicRet(MI: Inst));
2806
2807 if (TII->isAlwaysGDS(Opcode: Inst.getOpcode())) {
2808 Wait.set(T: DS_CNT, Val: 0);
2809 NeedsEndPGMCheck = true;
2810 }
2811
2812 ScoreBrackets.simplifyWaitcnt(Wait);
2813
2814 auto SuccessorIt = std::next(x: Inst.getIterator());
2815 bool Result = generateWaitcnt(Wait, It: SuccessorIt, Block, ScoreBrackets,
2816 /*OldWaitcntInstr=*/nullptr);
2817
2818 if (Result && NeedsEndPGMCheck && isNextENDPGM(It: SuccessorIt, Block: &Block)) {
2819 BuildMI(BB&: Block, I: SuccessorIt, MIMD: Inst.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_NOP))
2820 .addImm(Val: 0);
2821 }
2822
2823 return Result;
2824}
2825
// Compute the set of wait events generated by \p Inst, i.e. which counters
// the instruction will increment and which export/GPR-lock events it raises.
WaitEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
  WaitEventSet Events;
  // In expert scheduling mode, additionally record the per-class VGPR hazard
  // event (see getExpertSchedulingEventType).
  if (IsExpertMode) {
    if (const auto ET = getExpertSchedulingEventType(Inst))
      Events.insert(Event: *ET);
  }

  if (TII->isDS(MI: Inst) && TII->usesLGKM_CNT(MI: Inst)) {
    // GDS operations also lock GPRs; plain LDS only counts as an LDS access.
    if (TII->isAlwaysGDS(Opcode: Inst.getOpcode()) ||
        TII->hasModifiersSet(MI: Inst, OpName: AMDGPU::OpName::gds)) {
      Events.insert(Event: GDS_ACCESS);
      Events.insert(Event: GDS_GPR_LOCK);
    } else {
      Events.insert(Event: LDS_ACCESS);
    }
  } else if (TII->isFLAT(MI: Inst)) {
    if (SIInstrInfo::isGFX12CacheInvOrWBInst(Opc: Inst.getOpcode())) {
      Events.insert(Event: getVmemWaitEventType(Inst));
    } else {
      assert(Inst.mayLoadOrStore());
      // A FLAT access may touch VMEM, LDS, or both; record events for each
      // address space it may reach.
      if (TII->mayAccessVMEMThroughFlat(MI: Inst)) {
        if (ST->hasWaitXcnt())
          Events.insert(Event: VMEM_GROUP);
        Events.insert(Event: getVmemWaitEventType(Inst));
      }
      if (TII->mayAccessLDSThroughFlat(MI: Inst))
        Events.insert(Event: LDS_ACCESS);
    }
  } else if (SIInstrInfo::isVMEM(MI: Inst) &&
             (!AMDGPU::getMUBUFIsBufferInv(Opc: Inst.getOpcode()) ||
              Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
    // BUFFER_WBL2 is included here because, unlike invalidates, it has to be
    // followed by an "S_WAITCNT vmcnt(0)" to ensure the writeback has
    // completed.
    if (ST->hasWaitXcnt())
      Events.insert(Event: VMEM_GROUP);
    Events.insert(Event: getVmemWaitEventType(Inst));
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(MI: Inst))) {
      Events.insert(Event: VMW_GPR_LOCK);
    }
  } else if (TII->isSMRD(MI: Inst)) {
    if (ST->hasWaitXcnt())
      Events.insert(Event: SMEM_GROUP);
    Events.insert(Event: SMEM_ACCESS);
  } else if (SIInstrInfo::isLDSDIR(MI: Inst)) {
    Events.insert(Event: EXP_LDS_ACCESS);
  } else if (SIInstrInfo::isEXP(MI: Inst)) {
    // Classify exports by target: parameter, position, or anything else
    // (which locks the exporting GPRs).
    unsigned Imm = TII->getNamedOperand(MI: Inst, OperandName: AMDGPU::OpName::tgt)->getImm();
    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
      Events.insert(Event: EXP_PARAM_ACCESS);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      Events.insert(Event: EXP_POS_ACCESS);
    else
      Events.insert(Event: EXP_GPR_LOCK);
  } else if (SIInstrInfo::isSBarrierSCCWrite(Opcode: Inst.getOpcode())) {
    Events.insert(Event: SCC_WRITE);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      Events.insert(Event: SQ_MESSAGE);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      Events.insert(Event: SMEM_ACCESS);
      break;
    }
  }
  return Events;
}
2901
// Record in \p ScoreBrackets the events generated by \p Inst, after any waits
// required before Inst have already been handled.
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {

  // Score all events this instruction raises.
  WaitEventSet InstEvents = getEventsFor(Inst);
  for (WaitEventType E : wait_events()) {
    if (InstEvents.contains(Event: E))
      ScoreBrackets->updateByEvent(E, Inst);
  }

  if (TII->isDS(MI: Inst) && TII->usesLGKM_CNT(MI: Inst)) {
    if (TII->isAlwaysGDS(Opcode: Inst.getOpcode()) ||
        TII->hasModifiersSet(MI: Inst, OpName: AMDGPU::OpName::gds)) {
      // Remember the most recent GDS operation for "Always GDS" ordering.
      ScoreBrackets->setPendingGDS();
    }
  } else if (TII->isFLAT(MI: Inst)) {
    if (Inst.mayLoadOrStore() && TII->mayAccessVMEMThroughFlat(MI: Inst) &&
        TII->mayAccessLDSThroughFlat(MI: Inst) && !SIInstrInfo::isLDSDMA(MI: Inst))
      // Async/LDSDMA operations have FLAT encoding but do not actually use flat
      // pointers. They do have two operands that each access global and LDS,
      // thus making it appear at this point that they are using a flat pointer.
      // Filter them out, and for the rest, generate a dependency on flat
      // pointers so that both VM and LGKM counters are flushed.
      ScoreBrackets->setPendingFlat();
  } else if (Inst.isCall()) {
    // Act as a wait on everything
    ScoreBrackets->applyWaitcnt(Wait: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
    ScoreBrackets->setStateOnFunctionEntryOrReturn();
  } else if (TII->isVINTERP(MI: Inst)) {
    // VINTERP carries its own waitexp immediate; apply it as an EXP_CNT wait.
    int64_t Imm = TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(T: EXP_CNT, Count: Imm);
  }
}
2934
2935bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2936 unsigned OtherScore) {
2937 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2938 unsigned OtherShifted =
2939 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2940 Score = std::max(a: MyShifted, b: OtherShifted);
2941 return OtherShifted > MyShifted;
2942}
2943
// Merge a predecessor's async-mark checkpoints (\p OtherMarks) into our
// AsyncMarks, rebasing every per-counter value through the per-counter
// \p MergeInfos. Returns true if the merge strictly tightened our state.
bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
                                      ArrayRef<CounterValueArray> OtherMarks) {
  bool StrictDom = false;

  LLVM_DEBUG(dbgs() << "Merging async marks ...");
  // Early exit: both empty
  if (AsyncMarks.empty() && OtherMarks.empty()) {
    LLVM_DEBUG(dbgs() << " nothing to merge\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << '\n');

  // Determine maximum length needed after merging, capped at the tracking
  // limit MaxAsyncMarks.
  auto MaxSize = (unsigned)std::max(a: AsyncMarks.size(), b: OtherMarks.size());
  MaxSize = std::min(a: MaxSize, b: MaxAsyncMarks);

  // Keep only the most recent marks within our limit.
  if (AsyncMarks.size() > MaxSize)
    AsyncMarks.erase(CS: AsyncMarks.begin(),
                     CE: AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));

  // Pad with zero-filled marks if our list is shorter. Zero represents "no
  // pending async operations at this checkpoint" and acts as the identity
  // element for max() during merging. We pad at the beginning since the marks
  // need to be aligned in most-recent order.
  constexpr CounterValueArray ZeroMark{};
  AsyncMarks.insert(I: AsyncMarks.begin(), NumToInsert: MaxSize - AsyncMarks.size(), Elt: ZeroMark);

  LLVM_DEBUG({
    dbgs() << "Before merge:\n";
    for (const auto &Mark : AsyncMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
    dbgs() << "Other marks:\n";
    for (const auto &Mark : OtherMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
  });

  // Merge element-wise using the existing mergeScore function and the
  // appropriate MergeInfo for each counter type. Iterate only while we have
  // elements in both vectors. Marks are aligned from the back (most recent
  // last), hence the [Size - Idx] indexing.
  // NOTE(review): if one side has no marks, MergeCount is 0 and
  // seq_inclusive(1, 0) is a degenerate range -- confirm seq_inclusive
  // tolerates Begin > End.
  unsigned OtherSize = OtherMarks.size();
  unsigned OurSize = AsyncMarks.size();
  unsigned MergeCount = std::min(a: OtherSize, b: OurSize);
  for (auto Idx : seq_inclusive<unsigned>(Begin: 1, End: MergeCount)) {
    for (auto T : inst_counter_types(MaxCounter: Context->MaxCounter)) {
      StrictDom |= mergeScore(M: MergeInfos[T], Score&: AsyncMarks[OurSize - Idx][T],
                              OtherScore: OtherMarks[OtherSize - Idx][T]);
    }
  }

  LLVM_DEBUG({
    dbgs() << "After merge:\n";
    for (const auto &Mark : AsyncMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
  });

  return StrictDom;
}
3008
/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  // Check if "other" has keys we don't have, and create default entries for
  // those. If they remain empty after merging, we will clean it up after.
  for (auto K : Other.VMem.keys())
    VMem.try_emplace(Key: K);
  for (auto K : Other.SGPRs.keys())
    SGPRs.try_emplace(Key: K);

  // Array to store MergeInfo for each counter type
  MergeInfo MergeInfos[NUM_INST_CNTS];

  for (auto T : inst_counter_types(MaxCounter: Context->MaxCounter)) {
    // Merge event flags for this counter. Any event pending in Other but not
    // here means the merged state is strictly stronger.
    const WaitEventSet &EventsForT = Context->getWaitEvents(T);
    const WaitEventSet OldEvents = PendingEvents & EventsForT;
    const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
    if (!OldEvents.contains(Other: OtherEvents))
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter. The merged bracket keeps our lower
    // bound and widens to cover the larger pending range of the two sides.
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(a: MyPending, b: OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error(reason: "waitcnt score overflow");

    MergeInfo &M = MergeInfos[T];
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, Score&: LastFlat[T], OtherScore: Other.LastFlat[T]);

    if (T == DS_CNT)
      StrictDom |= mergeScore(M, Score&: LastGDS, OtherScore: Other.LastGDS);

    if (T == KM_CNT) {
      StrictDom |= mergeScore(M, Score&: SCCScore, OtherScore: Other.SCCScore);
      // The tracked SCC-writing instruction only survives the merge if both
      // sides agree on it; otherwise it is dropped (conservative).
      if (Other.hasPendingEvent(E: SCC_WRITE)) {
        if (!OldEvents.contains(Event: SCC_WRITE)) {
          PendingSCCWrite = Other.PendingSCCWrite;
        } else if (PendingSCCWrite != Other.PendingSCCWrite) {
          PendingSCCWrite = nullptr;
        }
      }
    }

    for (auto &[RegID, Info] : VMem)
      StrictDom |= mergeScore(M, Score&: Info.Scores[T], OtherScore: Other.getVMemScore(TID: RegID, T));

    if (isSmemCounter(T)) {
      unsigned Idx = getSgprScoresIdx(T);
      for (auto &[RegID, Info] : SGPRs) {
        auto It = Other.SGPRs.find(Val: RegID);
        unsigned OtherScore =
            (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
        StrictDom |= mergeScore(M, Score&: Info.Scores[Idx], OtherScore);
      }
    }
  }

  // Union the per-register VMEM type masks; gaining a new type is a strictly
  // stronger state.
  for (auto &[TID, Info] : VMem) {
    if (auto It = Other.VMem.find(Val: TID); It != Other.VMem.end()) {
      unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
      StrictDom |= NewVmemTypes != Info.VMEMTypes;
      Info.VMEMTypes = NewVmemTypes;
    }
  }

  StrictDom |= mergeAsyncMarks(MergeInfos, OtherMarks: Other.AsyncMarks);
  for (auto T : inst_counter_types(MaxCounter: Context->MaxCounter))
    StrictDom |= mergeScore(M: MergeInfos[T], Score&: AsyncScore[T], OtherScore: Other.AsyncScore[T]);

  // Drop the default entries created above that remained empty.
  purgeEmptyTrackingData();
  return StrictDom;
}
3096
3097static bool isWaitInstr(MachineInstr &Inst) {
3098 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Inst.getOpcode());
3099 return Opcode == AMDGPU::S_WAITCNT ||
3100 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(i: 0).isReg() &&
3101 Inst.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL) ||
3102 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3103 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3104 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3105 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3106 counterTypeForInstr(Opcode).has_value();
3107}
3108
3109void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
3110 MachineBasicBlock::iterator I,
3111 bool ExpertMode) const {
3112 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
3113 Values: AMDGPU::Hwreg::ID_SCHED_MODE, Values: AMDGPU::Hwreg::HwregOffset::Default, Values: 2);
3114 BuildMI(BB&: MBB, I, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
3115 .addImm(Val: ExpertMode ? 2 : 0)
3116 .addImm(Val: EncodedReg);
3117}
3118
// Generate s_waitcnt instructions where needed: walk every instruction in
// \p Block, insert/merge the waits it requires, then record the events it
// generates in \p ScoreBrackets. Also maintains vccz correctness around the
// hardware read-vccz bugs and flushes counters at the end of loop preheaders.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Begin Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });

  // Track the correctness of vccz through this basic block. There are two
  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
  // ST->partialVCCWritesUpdateVCCZ().
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc and then issued an smem load.
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc_lo or vcc_hi.
    VCCZCorrect = false;
  }

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  // NOTE: We may append instrs after Inst while iterating.
  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E; ++Iter) {
    MachineInstr &Inst = *Iter;
    if (Inst.isMetaInstruction())
      continue;
    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer.
    if (isWaitInstr(Inst) ||
        (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      continue;
    }

    // Loop-preheader counter flushing is decided at the block's first
    // terminator.
    PreheaderFlushFlags FlushFlags;
    if (Block.getFirstTerminator() == Inst)
      FlushFlags = isPreheaderToFlush(MBB&: Block, ScoreBrackets);

    if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
      // FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
      assert(ST->getGeneration() < AMDGPUSubtarget::GFX12);
      ScoreBrackets.recordAsyncMark(Inst);
      continue;
    }

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(MI&: Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushFlags);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(MI: Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      if (Inst.definesRegister(Reg: AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
          Inst.definesRegister(Reg: AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      } else if (Inst.definesRegister(Reg: AMDGPU::VCC, /*TRI=*/nullptr)) {
        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
        // vccz bit, so when we detect that an instruction may read from a
        // corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
          // Writes to vcc while there's an outstanding smem read may get
          // clobbered as soon as any read completes.
          VCCZCorrect = false;
        } else {
          // Writes to vcc will fix any incorrect value in vccz.
          VCCZCorrect = true;
        }
      }
    }

    if (TII->isSMRD(MI: Inst)) {
      // Remember scalar-load addresses so later vector stores to the same
      // location can be ordered against them (WAR avoidance).
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(KV: std::pair(Ptr, Inst.getParent()));
        }
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    updateEventWaitcntAfter(Inst, ScoreBrackets: &ScoreBrackets);

    // Note: insertForcedWaitAfter() may add instrs after Iter that need to be
    // visited by the loop.
    Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (RestoreVCCZ) {
      // Restore the vccz bit. Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(BB&: Block, I&: Inst, MIMD: Inst.getDebugLoc(),
              MCID: TII->get(Opcode: ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              DestReg: TRI->getVCC())
          .addReg(RegNo: TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }
  }

  // Flush counters at the end of the block if needed (for preheaders with no
  // terminator).
  AMDGPU::Waitcnt Wait;
  if (Block.getFirstTerminator() == Block.end()) {
    PreheaderFlushFlags FlushFlags = isPreheaderToFlush(MBB&: Block, ScoreBrackets);
    if (FlushFlags.FlushVmCnt) {
      if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT))
        Wait.set(T: LOAD_CNT, Val: 0);
      if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT))
        Wait.set(T: SAMPLE_CNT, Val: 0);
      if (ScoreBrackets.hasPendingEvent(T: BVH_CNT))
        Wait.set(T: BVH_CNT, Val: 0);
    }
    if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(T: DS_CNT))
      Wait.set(T: DS_CNT, Val: 0);
  }

  // Combine or remove any redundant waitcnts at the end of the block.
  Modified |= generateWaitcnt(Wait, It: Block.instr_end(), Block, ScoreBrackets,
                              OldWaitcntInstr);

  LLVM_DEBUG({
    dbgs() << "*** End Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });

  return Modified;
}
3280
// Remove soft xcnt waits that are redundant between back-to-back atomic RMW
// operations in \p Block. Returns true if any instruction was erased.
bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
  if (Block.size() <= 1)
    return false;
  // The Memory Legalizer conservatively inserts a soft xcnt before each
  // atomic RMW operation. However, for sequences of back-to-back atomic
  // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
  // the redundant soft xcnts.
  bool Modified = false;
  // Remember the last atomic with a soft xcnt right before it.
  MachineInstr *LastAtomicWithSoftXcnt = nullptr;

  // drop_begin skips the first instruction so getPrevNode below is always
  // valid.
  for (MachineInstr &MI : drop_begin(RangeOrContainer&: Block)) {
    // Ignore last atomic if non-LDS VMEM and SMEM.
    // (mayLoad ^ mayStore matches a pure load or pure store; such an
    // intervening access ends the back-to-back RMW sequence.)
    bool IsLDS =
        TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI));
    if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
      LastAtomicWithSoftXcnt = nullptr;

    // An atomic RMW both loads and stores.
    bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
                       MI.mayLoad() && MI.mayStore();
    MachineInstr &PrevMI = *MI.getPrevNode();
    // This is an atomic with a soft xcnt.
    if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
      // If we have already found an atomic with a soft xcnt, remove this soft
      // xcnt as it's redundant.
      if (LastAtomicWithSoftXcnt) {
        PrevMI.eraseFromParent();
        Modified = true;
      }
      LastAtomicWithSoftXcnt = &MI;
    }
  }
  return Modified;
}
3315
3316// Return flags indicating which counters should be flushed in the preheader.
3317PreheaderFlushFlags
3318SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3319 const WaitcntBrackets &ScoreBrackets) {
3320 auto [Iterator, IsInserted] =
3321 PreheadersToFlush.try_emplace(Key: &MBB, Args: PreheaderFlushFlags());
3322 if (!IsInserted)
3323 return Iterator->second;
3324
3325 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3326 if (!Succ)
3327 return PreheaderFlushFlags();
3328
3329 MachineLoop *Loop = MLI->getLoopFor(BB: Succ);
3330 if (!Loop)
3331 return PreheaderFlushFlags();
3332
3333 if (Loop->getLoopPreheader() == &MBB) {
3334 Iterator->second = getPreheaderFlushFlags(ML: Loop, Brackets: ScoreBrackets);
3335 return Iterator->second;
3336 }
3337
3338 return PreheaderFlushFlags();
3339}
3340
3341bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3342 if (SIInstrInfo::isFLAT(MI))
3343 return TII->mayAccessVMEMThroughFlat(MI);
3344 return SIInstrInfo::isVMEM(MI);
3345}
3346
3347bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3348 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3349}
3350
3351// Check if instruction is a store to LDS that is counted via DSCNT
3352// (where that counter exists).
3353bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3354 if (!MI.mayStore())
3355 return false;
3356 if (SIInstrInfo::isDS(MI))
3357 return true;
3358 return false;
3359}
3360
// Return flags indicating which counters should be flushed in the preheader of
// the given loop. We currently decide to flush in a few situations:
// For VMEM (FlushVmCnt):
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
//    vgpr containing a value that is loaded outside of the loop. (Only on
//    targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value that is loaded
//    outside of the loop.
// For DS (FlushDsCnt, GFX12+ only):
// 3. The loop contains no DS reads, and at least one use of a vgpr containing
//    a value that is DS loaded outside of the loop.
// 4. The loop contains DS read(s), loaded values are not used in the same
//    iteration but in the next iteration (prefetch pattern), and at least one
//    use of a vgpr containing a value that is DS loaded outside of the loop.
// Flushing in preheader reduces wait overhead if the wait requirement in
// iteration 1 would otherwise be more strict.
PreheaderFlushFlags
SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
                                         const WaitcntBrackets &Brackets) {
  PreheaderFlushFlags Flags;
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  // Set when some vgpr used inside the loop still has a pending out-of-loop
  // load on the corresponding counter (per the incoming score brackets).
  bool UsesVgprLoadedOutsideVMEM = false;
  bool UsesVgprLoadedOutsideDS = false;
  // Once an *Invalidated flag is set, the corresponding flush optimization is
  // permanently off for this loop.
  bool VMemInvalidated = false;
  // DS optimization only applies to GFX12+ where DS_CNT is separate.
  bool DSInvalidated = !ST->hasExtendedWaitCounts();
  // Register units read (VgprUse) or written by in-loop VMEM/DS loads
  // (VgprDefVMEM/VgprDefDS), accumulated in instruction order.
  DenseSet<MCRegUnit> VgprUse;
  DenseSet<MCRegUnit> VgprDefVMEM;
  DenseSet<MCRegUnit> VgprDefDS;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        HasVMemLoad |= MI.mayLoad();
        HasVMemStore |= MI.mayStore();
      }
      // TODO: Can we relax DSStore check? There may be cases where
      // these DS stores are drained prior to the end of MBB (or loop).
      if (mayStoreIncrementingDSCNT(MI)) {
        // Early exit if both optimizations are invalidated.
        // Otherwise, set invalid status and continue.
        if (VMemInvalidated)
          return Flags;
        DSInvalidated = true;
      }
      for (const MachineOperand &Op : MI.all_uses()) {
        if (Op.isDebug() || !TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
          continue;
        // Vgpr use
        for (MCRegUnit RU : TRI->regunits(Reg: Op.getReg().asMCReg())) {
          // If we find a register that is loaded inside the loop, 1. and 2.
          // are invalidated.
          if (VgprDefVMEM.contains(V: RU))
            VMemInvalidated = true;

          // Check for DS loads used inside the loop
          if (VgprDefDS.contains(V: RU))
            DSInvalidated = true;

          // Early exit if both optimizations are invalidated
          if (VMemInvalidated && DSInvalidated)
            return Flags;

          VgprUse.insert(V: RU);
          // Check if this register has a pending VMEM load from outside the
          // loop (value loaded outside and used inside).
          VMEMID ID = toVMEMID(RU);
          if (Brackets.hasPendingVMEM(ID, T: LOAD_CNT) ||
              Brackets.hasPendingVMEM(ID, T: SAMPLE_CNT) ||
              Brackets.hasPendingVMEM(ID, T: BVH_CNT))
            UsesVgprLoadedOutsideVMEM = true;
          // Check if loaded outside the loop via DS (not VMEM/FLAT).
          // Only consider it a DS load if there's no pending VMEM load for
          // this register, since FLAT can set both counters.
          else if (Brackets.hasPendingVMEM(ID, T: DS_CNT))
            UsesVgprLoadedOutsideDS = true;
        }
      }

      // VMem load vgpr def
      if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
        for (const MachineOperand &Op : MI.all_defs()) {
          for (MCRegUnit RU : TRI->regunits(Reg: Op.getReg().asMCReg())) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated.
            if (VgprUse.contains(V: RU))
              VMemInvalidated = true;
            VgprDefVMEM.insert(V: RU);
          }
        }
        // Early exit if both optimizations are invalidated
        if (VMemInvalidated && DSInvalidated)
          return Flags;
      }

      // DS read vgpr def
      // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo).
      // If USE comes before DEF, it's the prefetch pattern (use value from
      // previous iteration, load for next iteration). We should still flush
      // in preheader so iteration 1 doesn't need to wait inside the loop.
      // Only invalidate when DEF comes before USE (same-iteration consumption,
      // checked above when processing uses).
      if (isDSRead(MI)) {
        for (const MachineOperand &Op : MI.all_defs()) {
          for (MCRegUnit RU : TRI->regunits(Reg: Op.getReg().asMCReg())) {
            VgprDefDS.insert(V: RU);
          }
        }
      }
    }
  }

  // VMEM flush decision: situation 1 (store-only loop on targets without a
  // separate vscnt counter) or situation 2 (loads whose results are unused in
  // the loop, on targets with in-order VMEM vgpr writes).
  if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM &&
      ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
       (HasVMemLoad && ST->hasVmemWriteVgprInOrder())))
    Flags.FlushVmCnt = true;

  // DS flush decision: flush if loop uses DS-loaded values from outside
  // and either has no DS reads in the loop, or DS reads whose results
  // are not used in the loop.
  // DSInvalidated is pre-set to true on non-GFX12+ targets where DS_CNT
  // is LGKM_CNT which also tracks FLAT/SMEM.
  if (!DSInvalidated && UsesVgprLoadedOutsideDS)
    Flags.FlushDsCnt = true;

  return Flags;
}
3491
3492bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3493 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3494 auto *PDT =
3495 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3496 AliasAnalysis *AA = nullptr;
3497 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3498 AA = &AAR->getAAResults();
3499
3500 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
3501}
3502
3503PreservedAnalyses
3504SIInsertWaitcntsPass::run(MachineFunction &MF,
3505 MachineFunctionAnalysisManager &MFAM) {
3506 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(IR&: MF);
3507 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(IR&: MF);
3508 auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(IR&: MF)
3509 .getManager()
3510 .getCachedResult<AAManager>(IR&: MF.getFunction());
3511
3512 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
3513 return PreservedAnalyses::all();
3514
3515 return getMachineFunctionPassPreservedAnalyses()
3516 .preserveSet<CFGAnalyses>()
3517 .preserve<AAManager>();
3518}
3519
// Top-level driver: initialize per-function state, emit the conservative
// entry-point waits for non-kernel functions, then iterate over the blocks in
// reverse post order until waitcnt insertion reaches a fixed point. Finally
// handle scalar-store cache flushes, expert scheduling mode toggles, and VGPR
// deallocation at program end.
bool SIInsertWaitcnts::run(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST->getCPU());

  // Initialize hardware limits first, as they're needed by the generators.
  Limits = AMDGPU::HardwareLimits(IV);

  if (ST->hasExtendedWaitCounts()) {
    // The command-line flag, when explicitly given, overrides the
    // "amdgpu-expert-scheduling-mode" function attribute.
    IsExpertMode = ST->hasExpertSchedulingMode() &&
                   (ExpertSchedulingModeFlag.getNumOccurrences()
                        ? ExpertSchedulingModeFlag
                        : MF.getFunction()
                              .getFnAttribute(Kind: "amdgpu-expert-scheduling-mode")
                              .getValueAsBool());
    MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
    // The generator is created once and reused across runs of this object.
    if (!WCG)
      WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(args&: MF, args&: MaxCounter, args: &Limits,
                                                        args&: IsExpertMode);
  } else {
    MaxCounter = NUM_NORMAL_INST_CNTS;
    if (!WCG)
      WCG = std::make_unique<WaitcntGeneratorPreGFX12>(args&: MF, args: NUM_NORMAL_INST_CNTS,
                                                       args: &Limits);
  }

  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  SmemAccessCounter = getCounterFromEvent(E: SMEM_ACCESS);

  BlockInfos.clear();
  bool Modified = false;

  MachineBasicBlock &EntryBB = MF.front();

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock::iterator I = EntryBB.begin();
    // Skip past meta instructions so the waits land before the first real one.
    while (I != EntryBB.end() && I->isMetaInstruction())
      ++I;

    if (ST->hasExtendedWaitCounts()) {
      // The combined instruction waits on LOAD_CNT and DS_CNT together; the
      // remaining counters get individual zero-waits below.
      BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT))
          .addImm(Val: 0);
      for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
        // LOAD_CNT/DS_CNT are covered above; STORE_CNT and X_CNT are not
        // waited on here (presumably callers' stores need not have completed
        // at function entry — confirm against ABI docs).
        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
          continue;

        if (!ST->hasImageInsts() &&
            (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
          continue;

        BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(),
                MCID: TII->get(Opcode: instrsForExtendedCounterTypes[CT]))
            .addImm(Val: 0);
      }
      if (IsExpertMode) {
        // In expert mode also drain the VA_VDST/VM_VSRC dependency counters.
        unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0, STI: *ST);
        Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Encoded: Enc, VmVsrc: 0);
        BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
            .addImm(Val: Enc);
      }
    } else {
      BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: 0);
    }

    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(args: this);
    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.try_emplace(Key: MBB);

  // Single scratch brackets object, reused (or moved into a successor) to
  // avoid repeated large allocations.
  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(args&: *BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets) {
          Brackets = std::make_unique<WaitcntBrackets>(args: this);
        } else {
          // Reinitialize in-place. N.B. do not do this by assigning from a
          // temporary because the WaitcntBrackets class is large and it could
          // cause this function to use an unreasonable amount of stack space.
          Brackets->~WaitcntBrackets();
          new (Brackets.get()) WaitcntBrackets(this);
        }
      }

      if (ST->hasWaitXcnt())
        Modified |= removeRedundantSoftXcnts(Block&: *MBB);
      Modified |= insertWaitcntInBlock(MF, Block&: *MBB, ScoreBrackets&: *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        // Propagate the outgoing bracket state into each successor. The first
        // successor with no incoming state takes ownership of Brackets by
        // move; any further such successor receives a copy.
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto *SuccBII = BlockInfos.find(Key: Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            // A successor at or before the current block in RPO order is a
            // backedge, requiring another sweep of the loop.
            if (SuccBII <= BII) {
              LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
              Repeat = true;
            }
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(args&: *Brackets);
            }
          } else {
            LLVM_DEBUG({
              dbgs() << "Try to merge ";
              MBB->printName(dbgs());
              dbgs() << " into ";
              Succ->printName(dbgs());
              dbgs() << '\n';
            });
            // Only re-process the successor if merging actually changed its
            // incoming state.
            if (SuccBI.Incoming->merge(Other: *Brackets)) {
              SuccBI.Dirty = true;
              if (SuccBII <= BII) {
                LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
                Repeat = true;
              }
            }
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  if (ST->hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    // First sweep: detect whether any scalar store exists and collect blocks
    // that terminate the wave.
    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(Elt: &MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      //
      // Insert s_dcache_wb at wave termination points if there were any scalar
      // stores, and only if the cache hasn't already been flushed. This could
      // be improved by looking across blocks for flushes in postdominating
      // blocks from the stores but an explicitly requested flush is probably
      // very rare.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(MI: *I))
            SeenDCacheWB = false;

          // FIXME: It would be better to insert this before a waitcnt if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(BB&: *MBB, I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }

  if (IsExpertMode) {
    // Enable expert scheduling on function entry. To satisfy ABI requirements
    // and to allow calls between function with different expert scheduling
    // settings, disable it around calls and before returns.

    MachineBasicBlock::iterator I = EntryBB.begin();
    while (I != EntryBB.end() && I->isMetaInstruction())
      ++I;
    setSchedulingMode(MBB&: EntryBB, I, ExpertMode: true);

    // Disable before each call and re-enable right after it.
    for (MachineInstr *MI : CallInsts) {
      MachineBasicBlock &MBB = *MI->getParent();
      setSchedulingMode(MBB, I: MI, ExpertMode: false);
      setSchedulingMode(MBB, I: std::next(x: MI->getIterator()), ExpertMode: true);
    }

    for (MachineInstr *MI : ReturnInsts)
      setSchedulingMode(MBB&: *MI->getParent(), I: MI, ExpertMode: false);

    Modified = true;
  }

  // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
  // This is done in different ways depending on how the VGPRs were allocated
  // (i.e. whether we're in dynamic VGPR mode or not).
  // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
  // waveslot limited kernel runs slower with the deallocation.
  if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
    for (auto [MI, _] : EndPgmInsts) {
      BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
              MCID: TII->get(Opcode: AMDGPU::S_ALLOC_VGPR))
          .addImm(Val: 0);
      Modified = true;
    }
  } else if (!WCG->isOptNone() &&
             ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
             (MF.getFrameInfo().hasCalls() ||
              ST->getOccupancyWithNumVGPRs(
                  VGPRs: TRI->getNumUsedPhysRegs(MRI: *MRI, RC: AMDGPU::VGPR_32RegClass),
                  /*IsDynamicVGPR=*/DynamicVGPRBlockSize: false) <
                  AMDGPU::IsaInfo::getMaxWavesPerEU(STI: ST))) {
    for (auto [MI, Flag] : EndPgmInsts) {
      if (Flag) {
        if (ST->requiresNopBeforeDeallocVGPRs()) {
          BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
                  MCID: TII->get(Opcode: AMDGPU::S_NOP))
              .addImm(Val: 0);
        }
        BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
                MCID: TII->get(Opcode: AMDGPU::S_SENDMSG))
            .addImm(Val: AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
        Modified = true;
      }
    }
  }

  // Reset per-function scratch state so this pass object can be reused.
  CallInsts.clear();
  ReturnInsts.clear();
  EndPgmInsts.clear();
  PreheadersToFlush.clear();
  SLoadAddresses.clear();

  return Modified;
}
3791