1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "AMDGPUHWEvents.h"
28#include "AMDGPUWaitcntUtils.h"
29#include "GCNSubtarget.h"
30#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
31#include "SIMachineFunctionInfo.h"
32#include "Utils/AMDGPUBaseInfo.h"
33#include "llvm/ADT/MapVector.h"
34#include "llvm/ADT/PostOrderIterator.h"
35#include "llvm/ADT/Sequence.h"
36#include "llvm/Analysis/AliasAnalysis.h"
37#include "llvm/CodeGen/MachineFrameInfo.h"
38#include "llvm/CodeGen/MachineLoopInfo.h"
39#include "llvm/CodeGen/MachinePassManager.h"
40#include "llvm/CodeGen/MachinePostDominators.h"
41#include "llvm/IR/Dominators.h"
42#include "llvm/InitializePasses.h"
43#include "llvm/TargetParser/AMDGPUTargetParser.h"
44
45using namespace llvm;
46
47using HWEvents = AMDGPU::HWEvents;
48
49#define DEBUG_TYPE "si-insert-waitcnts"
50
51static cl::opt<bool>
52 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
53 cl::desc("Force all waitcnt instrs to be emitted as "
54 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
55 cl::init(Val: false), cl::Hidden);
56
57static cl::opt<bool> ForceEmitZeroLoadFlag(
58 "amdgpu-waitcnt-load-forcezero",
59 cl::desc("Force all waitcnt load counters to wait until 0"),
60 cl::init(Val: false), cl::Hidden);
61
62static cl::opt<bool> ExpertSchedulingModeFlag(
63 "amdgpu-expert-scheduling-mode",
64 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
65 cl::init(Val: false), cl::Hidden);
66
67namespace {
68
/// Call \p EmitWaitcnt once for each count from (Outstanding - 1) down to, but
/// not including, \p Target, and then once more for \p Target itself.
///
/// If (Outstanding - 1) is not greater than \p Target (including the case
/// where Outstanding is 0 and the subtraction wraps to ~0u), only the final
/// call for \p Target is made.
template <typename EmitWaitcntFn>
static void EmitExpandedWaitcnt(unsigned Outstanding, unsigned Target,
                                EmitWaitcntFn &&EmitWaitcnt) {
  unsigned Cur = Outstanding - 1;
  // The ~0u test guards against the unsigned counter having wrapped around.
  while (Cur != ~0u && Cur > Target) {
    EmitWaitcnt(Cur);
    --Cur;
  }
  EmitWaitcnt(Target);
}
77
78/// Integer IDs used to track vector memory locations we may have to wait on.
79/// Encoded as u16 chunks:
80///
81/// [0, REGUNITS_END ): MCRegUnit
82/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
83///
/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
/// It gives (1 << 16) entries per category, which is more than enough
/// for all register units. MCPhysReg is u16 so we don't even support >u16
/// physical register numbers at this time, let alone >u16 register units.
/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
/// is enough for all register units.
using VMEMID = uint32_t;

enum : VMEMID {
  // Size of the ID range reserved for each tracked category.
  TRACKINGID_RANGE_LEN = (1 << 16),
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,

  // Register units. Important: these must start at 0 so that a MCRegUnit can
  // be converted to a VMEMID (and back) without any offset.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // LDS DMA IDs, placed directly after the register units. LDSDMA_BEGIN is
  // the "common" entry that is updated for every LDS DMA operation
  // encountered; specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};
107
108/// Convert a MCRegUnit to a VMEMID.
109static constexpr VMEMID toVMEMID(MCRegUnit RU) {
110 return static_cast<unsigned>(RU);
111}
112
113} // namespace
114
115namespace {
116
// Kinds of result-returning VMEM operations. s_waitcnt orders all of them
// with a single vmcnt counter, but without an s_waitcnt only instructions of
// the same VmemType are guaranteed to write their results in order. Hence no
// s_waitcnt is needed between two instructions of the same type that write
// the same vgpr.
enum VmemType {
  VMEM_NOSAMPLER = 0, // BUF instructions and MIMG instructions without a
                      // sampler.
  VMEM_SAMPLER,       // MIMG instructions with a sampler.
  VMEM_BVH,           // BVH instructions.
  NUM_VMEM_TYPES      // Number of types above; not a type itself.
};
131
132// Maps values of InstCounterType to the instruction that waits on that
133// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
134// returns true, and does not cover VA_VDST or VM_VSRC.
135static const unsigned
136 instrsForExtendedCounterTypes[AMDGPU::NUM_EXTENDED_INST_CNTS] = {
137 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
138 AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
139 AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
140 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
141 AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT};
142
143// ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware
144// code but still need to be processed by this pass for async vmcnt tracking.
145static bool isNonWaitcntMetaInst(const MachineInstr &MI) {
146 switch (MI.getOpcode()) {
147 case AMDGPU::ASYNCMARK:
148 case AMDGPU::WAIT_ASYNCMARK:
149 return false;
150 default:
151 return MI.isMetaInstruction();
152 }
153}
154
155static bool updateVMCntOnly(const MachineInstr &Inst) {
156 return (SIInstrInfo::isVMEM(MI: Inst) && !SIInstrInfo::isFLAT(MI: Inst)) ||
157 SIInstrInfo::isFLATGlobal(MI: Inst) || SIInstrInfo::isFLATScratch(MI: Inst);
158}
159
160#ifndef NDEBUG
161static bool isNormalMode(AMDGPU::InstCounterType MaxCounter) {
162 return MaxCounter == AMDGPU::NUM_NORMAL_INST_CNTS;
163}
164#endif // NDEBUG
165
166VmemType getVmemType(const MachineInstr &Inst) {
167 assert(updateVMCntOnly(Inst));
168 if (!SIInstrInfo::isImage(MI: Inst))
169 return VMEM_NOSAMPLER;
170 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Inst.getOpcode());
171 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
172 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
173
174 if (BaseInfo->BVH)
175 return VMEM_BVH;
176
177 // We have to make an additional check for isVSAMPLE here since some
178 // instructions don't have a sampler, but are still classified as sampler
179 // instructions for the purposes of e.g. waitcnt.
180 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(MI: Inst))
181 return VMEM_SAMPLER;
182
183 return VMEM_NOSAMPLER;
184}
185
186class WaitcntBrackets;
187
188// This abstracts the logic for generating and updating S_WAIT* instructions
189// away from the analysis that determines where they are needed. This was
190// done because the set of counters and instructions for waiting on them
191// underwent a major shift with gfx12, sufficiently so that having this
192// abstraction allows the main analysis logic to be simpler than it would
193// otherwise have had to become.
194class WaitcntGenerator {
195protected:
196 const GCNSubtarget &ST;
197 const SIInstrInfo &TII;
198 AMDGPU::IsaVersion IV;
199 AMDGPU::InstCounterType MaxCounter;
200 bool OptNone;
201 bool ExpandWaitcntProfiling = false;
202 const AMDGPU::HardwareLimits &Limits;
203
204public:
205 WaitcntGenerator() = delete;
206 WaitcntGenerator(const WaitcntGenerator &) = delete;
207 WaitcntGenerator(const MachineFunction &MF,
208 AMDGPU::InstCounterType MaxCounter,
209 const AMDGPU::HardwareLimits &Limits)
210 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
211 IV(AMDGPU::getIsaVersion(GPU: ST.getCPU())), MaxCounter(MaxCounter),
212 OptNone(MF.getFunction().hasOptNone() ||
213 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
214 ExpandWaitcntProfiling(
215 MF.getFunction().hasFnAttribute(Kind: "amdgpu-expand-waitcnt-profiling")),
216 Limits(Limits) {}
217
218 // Return true if the current function should be compiled with no
219 // optimization.
220 bool isOptNone() const { return OptNone; }
221
222 unsigned getLimit(AMDGPU::InstCounterType E) const { return Limits.get(T: E); }
223
224 // Edits an existing sequence of wait count instructions according
225 // to an incoming Waitcnt value, which is itself updated to reflect
226 // any new wait count instructions which may need to be generated by
227 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
228 // were made.
229 //
230 // This editing will usually be merely updated operands, but it may also
231 // delete instructions if the incoming Wait value indicates they are not
232 // needed. It may also remove existing instructions for which a wait
233 // is needed if it can be determined that it is better to generate new
234 // instructions later, as can happen on gfx12.
235 virtual bool
236 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
237 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
238 MachineBasicBlock::instr_iterator It) const = 0;
239
240 // Transform a soft waitcnt into a normal one.
241 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
242
243 // Generates new wait count instructions according to the value of
244 // Wait, returning true if any new instructions were created.
245 // ScoreBrackets is used for profiling expansion.
246 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
247 MachineBasicBlock::instr_iterator It,
248 AMDGPU::Waitcnt Wait,
249 const WaitcntBrackets &ScoreBrackets) = 0;
250
251 // Returns the set of HWEvents that corresponds to counter \p T.
252 virtual HWEvents getWaitEvents(AMDGPU::InstCounterType T) const = 0;
253
254 /// \returns the counter that corresponds to event \p E.
255 AMDGPU::InstCounterType getCounterFromEvent(HWEvents E) const {
256 assert(E.size() == 1 && "Cannot handle a mask of events!");
257 for (auto T : AMDGPU::inst_counter_types()) {
258 if (getWaitEvents(T) & E)
259 return T;
260 }
261 llvm_unreachable("event type has no associated counter");
262 }
263
264 // Returns a new waitcnt with all counters except VScnt set to 0. If
265 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
266 // AsyncCnt and TensorCnt always default to ~0u (don't wait for it). They
267 // are only updated when a call to @llvm.amdgcn.wait.asyncmark() is
268 // processed.
269 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
270
271 virtual ~WaitcntGenerator() = default;
272};
273
274class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
275 static constexpr const HWEvents
276 WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
277 HWEvents::VMEM_READ_ACCESS | HWEvents::VMEM_SAMPLER_READ_ACCESS |
278 HWEvents::VMEM_BVH_READ_ACCESS,
279 HWEvents::SMEM_ACCESS | HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS |
280 HWEvents::SQ_MESSAGE,
281 HWEvents::EXP_GPR_LOCK | HWEvents::GDS_GPR_LOCK |
282 HWEvents::VMW_GPR_LOCK | HWEvents::EXP_PARAM_ACCESS |
283 HWEvents::EXP_POS_ACCESS | HWEvents::EXP_LDS_ACCESS,
284 HWEvents::VMEM_WRITE_ACCESS | HWEvents::SCRATCH_WRITE_ACCESS,
285 HWEvents::NONE,
286 HWEvents::NONE,
287 HWEvents::NONE,
288 HWEvents::NONE,
289 HWEvents::NONE,
290 HWEvents::NONE,
291 HWEvents::NONE,
292 HWEvents::NONE};
293
294public:
295 using WaitcntGenerator::WaitcntGenerator;
296 bool
297 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
298 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
299 MachineBasicBlock::instr_iterator It) const override;
300
301 bool createNewWaitcnt(MachineBasicBlock &Block,
302 MachineBasicBlock::instr_iterator It,
303 AMDGPU::Waitcnt Wait,
304 const WaitcntBrackets &ScoreBrackets) override;
305
306 HWEvents getWaitEvents(AMDGPU::InstCounterType T) const override {
307 HWEvents EVs = WaitEventMaskForInstPreGFX12[T];
308 if (T == AMDGPU::LOAD_CNT && !ST.hasVscnt())
309 EVs |= WaitEventMaskForInstPreGFX12[AMDGPU::STORE_CNT];
310 return EVs;
311 }
312
313 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
314};
315
316class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
317protected:
318 bool IsExpertMode;
319 static constexpr const HWEvents
320 WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
321 HWEvents::VMEM_READ_ACCESS | HWEvents::GLOBAL_INV_ACCESS,
322 HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS,
323 HWEvents::EXP_GPR_LOCK | HWEvents::GDS_GPR_LOCK |
324 HWEvents::VMW_GPR_LOCK | HWEvents::EXP_PARAM_ACCESS |
325 HWEvents::EXP_POS_ACCESS | HWEvents::EXP_LDS_ACCESS,
326
327 HWEvents::VMEM_WRITE_ACCESS | HWEvents::SCRATCH_WRITE_ACCESS,
328 HWEvents::VMEM_SAMPLER_READ_ACCESS,
329 HWEvents::VMEM_BVH_READ_ACCESS,
330
331 HWEvents::SMEM_ACCESS | HWEvents::SQ_MESSAGE | HWEvents::SCC_WRITE,
332 HWEvents::VMEM_GROUP | HWEvents::SMEM_GROUP,
333 HWEvents::ASYNC_ACCESS,
334 HWEvents::TENSOR_ACCESS,
335 HWEvents::VGPR_CSMACC_WRITE | HWEvents::VGPR_DPMACC_WRITE |
336 HWEvents::VGPR_TRANS_WRITE | HWEvents::VGPR_XDL_WRITE,
337 HWEvents::VGPR_LDS_READ | HWEvents::VGPR_FLAT_READ |
338 HWEvents::VGPR_VMEM_READ};
339
340public:
341 WaitcntGeneratorGFX12Plus() = delete;
342 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
343 AMDGPU::InstCounterType MaxCounter,
344 const AMDGPU::HardwareLimits &Limits,
345 bool IsExpertMode)
346 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
347
348 bool
349 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
350 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
351 MachineBasicBlock::instr_iterator It) const override;
352
353 bool createNewWaitcnt(MachineBasicBlock &Block,
354 MachineBasicBlock::instr_iterator It,
355 AMDGPU::Waitcnt Wait,
356 const WaitcntBrackets &ScoreBrackets) override;
357
358 HWEvents getWaitEvents(AMDGPU::InstCounterType T) const override {
359 return WaitEventMaskForInstGFX12Plus[T];
360 }
361
362 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
363};
364
// Records which counters should be flushed in a loop preheader. Both flags
// are clear by default.
struct PreheaderFlushFlags {
  bool FlushVmCnt{false};
  bool FlushDsCnt{false};
};
370
371class SIInsertWaitcnts {
372 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
373 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
374 MachineLoopInfo &MLI;
375 MachinePostDominatorTree &PDT;
376 AliasAnalysis *AA = nullptr;
377 MachineFunction &MF;
378
379 struct BlockInfo {
380 std::unique_ptr<WaitcntBrackets> Incoming;
381 bool Dirty = true;
382 BlockInfo() = default;
383 BlockInfo(BlockInfo &&) = default;
384 BlockInfo &operator=(BlockInfo &&) = default;
385 ~BlockInfo();
386 };
387
388 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
389
390 bool ForceEmitWaitcnt[AMDGPU::NUM_INST_CNTS] = {};
391
392 std::unique_ptr<WaitcntGenerator> WCG;
393
394 // Remember call and return instructions in the function.
395 DenseSet<MachineInstr *> CallInsts;
396 DenseSet<MachineInstr *> ReturnInsts;
397
398 // Remember all S_ENDPGM instructions. The boolean flag is true if there might
399 // be outstanding stores but definitely no outstanding scratch stores, to help
400 // with insertion of DEALLOC_VGPRS messages.
401 DenseMap<MachineInstr *, bool> EndPgmInsts;
402
403 AMDGPU::HardwareLimits Limits;
404
405public:
406 const GCNSubtarget &ST;
407 const SIInstrInfo &TII;
408 const SIRegisterInfo &TRI;
409 const MachineRegisterInfo &MRI;
410 AMDGPU::InstCounterType SmemAccessCounter;
411 AMDGPU::InstCounterType MaxCounter;
412 bool IsExpertMode = false;
413
414 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
415 AliasAnalysis *AA, MachineFunction &MF)
416 : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
417 TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
418 MRI(MF.getRegInfo()) {}
419
420 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
421
422 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
423 const WaitcntBrackets &Brackets);
424 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
425 const WaitcntBrackets &ScoreBrackets);
426 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
427 bool isDSRead(const MachineInstr &MI) const;
428 bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
429 bool run();
430
431 bool isAsync(const MachineInstr &MI) const {
432 if (!SIInstrInfo::isLDSDMA(MI))
433 return false;
434 if (SIInstrInfo::usesASYNC_CNT(MI))
435 return true;
436 const MachineOperand *Async =
437 TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::IsAsync);
438 return Async && (Async->getImm());
439 }
440
441 bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
442 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
443 }
444
445 bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
446 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
447 }
448
449 bool shouldUpdateAsyncMark(const MachineInstr &MI,
450 AMDGPU::InstCounterType T) const {
451 if (SIInstrInfo::usesTENSOR_CNT(MI))
452 return T == AMDGPU::TENSOR_CNT;
453 if (!isAsyncLdsDmaWrite(MI))
454 return false;
455 if (SIInstrInfo::usesASYNC_CNT(MI))
456 return T == AMDGPU::ASYNC_CNT;
457 return T == AMDGPU::LOAD_CNT;
458 }
459
460 bool isVmemAccess(const MachineInstr &MI) const;
461 bool generateWaitcntInstBefore(MachineInstr &MI,
462 WaitcntBrackets &ScoreBrackets,
463 MachineInstr *OldWaitcntInstr,
464 PreheaderFlushFlags FlushFlags);
465 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
466 MachineBasicBlock::instr_iterator It,
467 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
468 MachineInstr *OldWaitcntInstr);
469 void updateEventWaitcntAfter(MachineInstr &Inst,
470 WaitcntBrackets *ScoreBrackets);
471 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
472 MachineBasicBlock *Block) const;
473 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
474 WaitcntBrackets &ScoreBrackets);
475 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
476 WaitcntBrackets &ScoreBrackets);
477 /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
478 /// Legalizer. Returns true if block was modified.
479 bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
480 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
481 bool ExpertMode) const;
482 HWEvents getWaitEvents(AMDGPU::InstCounterType T) const {
483 return WCG->getWaitEvents(T);
484 }
485 AMDGPU::InstCounterType getCounterFromEvent(HWEvents E) const {
486 return WCG->getCounterFromEvent(E);
487 }
488};
489
490// This objects maintains the current score brackets of each wait counter, and
491// a per-register scoreboard for each wait counter.
492//
493// We also maintain the latest score for every event type that can change the
494// waitcnt in order to know if there are multiple types of events within
495// the brackets. When multiple types of event happen in the bracket,
496// wait count may get decreased out of order, therefore we need to put in
497// "s_waitcnt 0" before use.
498class WaitcntBrackets {
499public:
500 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
501 assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
502 }
503
504#ifndef NDEBUG
505 ~WaitcntBrackets() {
506 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
507 for (auto &[ID, Val] : VMem) {
508 if (Val.empty())
509 ++NumUnusedVmem;
510 }
511 for (auto &[ID, Val] : SGPRs) {
512 if (Val.empty())
513 ++NumUnusedSGPRs;
514 }
515
516 if (NumUnusedVmem || NumUnusedSGPRs) {
517 errs() << "WaitcntBracket had unused entries at destruction time: "
518 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
519 << " SGPR unused entries\n";
520 std::abort();
521 }
522 }
523#endif
524
525 bool isSmemCounter(AMDGPU::InstCounterType T) const {
526 return T == Context->SmemAccessCounter || T == AMDGPU::X_CNT;
527 }
528
529 unsigned getOutstanding(AMDGPU::InstCounterType T) const {
530 return ScoreUBs[T] - ScoreLBs[T];
531 }
532
533 bool hasPendingVMEM(VMEMID ID, AMDGPU::InstCounterType T) const {
534 return getVMemScore(TID: ID, T) > getScoreLB(T);
535 }
536
537 /// \Return true if we have no score entries for counter \p T.
538 bool empty(AMDGPU::InstCounterType T) const { return getScoreRange(T) == 0; }
539
540private:
541 unsigned getScoreLB(AMDGPU::InstCounterType T) const {
542 assert(T < AMDGPU::NUM_INST_CNTS);
543 return ScoreLBs[T];
544 }
545
546 unsigned getScoreUB(AMDGPU::InstCounterType T) const {
547 assert(T < AMDGPU::NUM_INST_CNTS);
548 return ScoreUBs[T];
549 }
550
551 unsigned getScoreRange(AMDGPU::InstCounterType T) const {
552 return getScoreUB(T) - getScoreLB(T);
553 }
554
555 unsigned getSGPRScore(MCRegUnit RU, AMDGPU::InstCounterType T) const {
556 auto It = SGPRs.find(Val: RU);
557 return It != SGPRs.end() ? It->second.get(T) : 0;
558 }
559
560 unsigned getVMemScore(VMEMID TID, AMDGPU::InstCounterType T) const {
561 auto It = VMem.find(Val: TID);
562 return It != VMem.end() ? It->second.Scores[T] : 0;
563 }
564
565public:
566 bool merge(const WaitcntBrackets &Other);
567
568 bool counterOutOfOrder(AMDGPU::InstCounterType T) const;
569 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
570 simplifyWaitcnt(CheckWait: Wait, UpdateWait&: Wait);
571 }
572 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
573 AMDGPU::Waitcnt &UpdateWait) const;
574 void simplifyWaitcnt(AMDGPU::InstCounterType T, unsigned &Count) const;
575 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) const;
576 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
577 AMDGPU::Waitcnt &UpdateWait) const;
578 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
579 AMDGPU::Waitcnt &UpdateWait) const;
580
581 void determineWaitForPhysReg(AMDGPU::InstCounterType T, MCPhysReg Reg,
582 AMDGPU::Waitcnt &Wait,
583 const MachineInstr &MI) const;
584 MCPhysReg determineVGPR16Dependency(const MachineInstr &MI,
585 AMDGPU::InstCounterType T,
586 MCPhysReg Reg) const;
587 void determineWaitForLDSDMA(AMDGPU::InstCounterType T, VMEMID TID,
588 AMDGPU::Waitcnt &Wait) const;
589 AMDGPU::Waitcnt determineAsyncWait(unsigned N);
590 void tryClearSCCWriteEvent(MachineInstr *Inst);
591
592 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
593 void applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count);
594 void applyWaitcnt(const AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T);
595 void updateByEvent(HWEvents E, MachineInstr &MI);
596 void recordAsyncMark(MachineInstr &MI);
597
598 HWEvents getPendingEvents() const { return PendingEvents; }
599 bool hasPendingEvent() const { return PendingEvents.any(); }
600 bool hasPendingEvent(HWEvents E) const { return PendingEvents.contains(Other: E); }
601 bool hasPendingEvent(AMDGPU::InstCounterType T) const {
602 bool HasPending = (PendingEvents & Context->getWaitEvents(T)).any();
603 assert(HasPending == !empty(T) &&
604 "Expected pending events iff scoreboard is not empty");
605 return HasPending;
606 }
607
608 bool hasMixedPendingEvents(AMDGPU::InstCounterType T) const {
609 HWEvents Events = PendingEvents & Context->getWaitEvents(T);
610 // Return true if more than one bit is set in Events.
611 return Events.size() > 1;
612 }
613
614 bool hasPendingFlat() const {
615 return ((LastFlatDsCnt > ScoreLBs[AMDGPU::DS_CNT] &&
616 LastFlatDsCnt <= ScoreUBs[AMDGPU::DS_CNT]) ||
617 (LastFlatLoadCnt > ScoreLBs[AMDGPU::LOAD_CNT] &&
618 LastFlatLoadCnt <= ScoreUBs[AMDGPU::LOAD_CNT]));
619 }
620
621 void setPendingFlat() {
622 LastFlatLoadCnt = ScoreUBs[AMDGPU::LOAD_CNT];
623 LastFlatDsCnt = ScoreUBs[AMDGPU::DS_CNT];
624 }
625
626 bool hasPendingGDS() const {
627 return LastGDS > ScoreLBs[AMDGPU::DS_CNT] &&
628 LastGDS <= ScoreUBs[AMDGPU::DS_CNT];
629 }
630
631 unsigned getPendingGDSWait() const {
632 return std::min(a: getScoreUB(T: AMDGPU::DS_CNT) - LastGDS,
633 b: getLimit(T: AMDGPU::DS_CNT) - 1);
634 }
635
636 void setPendingGDS() { LastGDS = ScoreUBs[AMDGPU::DS_CNT]; }
637
638 // Return true if there might be pending writes to the vgpr-interval by VMEM
639 // instructions with types different from V.
640 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
641 for (MCRegUnit RU : regunits(Reg)) {
642 auto It = VMem.find(Val: toVMEMID(RU));
643 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
644 return true;
645 }
646 return false;
647 }
648
649 void clearVgprVmemTypes(MCPhysReg Reg) {
650 for (MCRegUnit RU : regunits(Reg)) {
651 if (auto It = VMem.find(Val: toVMEMID(RU)); It != VMem.end()) {
652 It->second.VMEMTypes = 0;
653 if (It->second.empty())
654 VMem.erase(I: It);
655 }
656 }
657 }
658
659 void setStateOnFunctionEntryOrReturn() {
660 setScoreUB(T: AMDGPU::STORE_CNT,
661 Val: getScoreUB(T: AMDGPU::STORE_CNT) + getLimit(T: AMDGPU::STORE_CNT));
662 PendingEvents |= Context->getWaitEvents(T: AMDGPU::STORE_CNT);
663 }
664
665 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
666 return LDSDMAStores;
667 }
668
669 bool hasPointSampleAccel(const MachineInstr &MI) const;
670 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
671 MCPhysReg RU) const;
672
673 void print(raw_ostream &) const;
674 void dump() const { print(dbgs()); }
675
676 // Free up memory by removing empty entries from the DenseMap that track event
677 // scores.
678 void purgeEmptyTrackingData();
679
680private:
681 unsigned getLimit(AMDGPU::InstCounterType T) const {
682 return Context->getLimits().get(T);
683 }
684
685 struct MergeInfo {
686 unsigned OldLB;
687 unsigned OtherLB;
688 unsigned MyShift;
689 unsigned OtherShift;
690 };
691
692 using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;
693
694 void determineWaitForScore(AMDGPU::InstCounterType T, unsigned Score,
695 AMDGPU::Waitcnt &Wait) const;
696
697 static bool mergeScore(const MergeInfo &M, unsigned &Score,
698 unsigned OtherScore);
699 bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
700 ArrayRef<CounterValueArray> OtherMarks);
701
702 iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
703 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
704 if (!Context->TRI.isInAllocatableClass(RegNo: Reg))
705 return {{}, {}};
706 return Context->TRI.regunits(Reg);
707 }
708
709 void setScoreLB(AMDGPU::InstCounterType T, unsigned Val) {
710 assert(T < AMDGPU::NUM_INST_CNTS);
711 ScoreLBs[T] = Val;
712 }
713
714 void setScoreUB(AMDGPU::InstCounterType T, unsigned Val) {
715 assert(T < AMDGPU::NUM_INST_CNTS);
716 ScoreUBs[T] = Val;
717
718 if (T != AMDGPU::EXP_CNT)
719 return;
720
721 if (getScoreRange(T: AMDGPU::EXP_CNT) > getLimit(T: AMDGPU::EXP_CNT))
722 ScoreLBs[AMDGPU::EXP_CNT] =
723 ScoreUBs[AMDGPU::EXP_CNT] - getLimit(T: AMDGPU::EXP_CNT);
724 }
725
726 void setRegScore(MCPhysReg Reg, AMDGPU::InstCounterType T, unsigned Val) {
727 const SIRegisterInfo &TRI = Context->TRI;
728 if (Reg == AMDGPU::SCC) {
729 SCCScore = Val;
730 } else if (TRI.isVectorRegister(MRI: Context->MRI, Reg)) {
731 for (MCRegUnit RU : regunits(Reg))
732 VMem[toVMEMID(RU)].Scores[T] = Val;
733 } else if (TRI.isSGPRReg(MRI: Context->MRI, Reg)) {
734 for (MCRegUnit RU : regunits(Reg))
735 SGPRs[RU].get(T) = Val;
736 } else {
737 llvm_unreachable("Register cannot be tracked/unknown register!");
738 }
739 }
740
741 void setVMemScore(VMEMID TID, AMDGPU::InstCounterType T, unsigned Val) {
742 VMem[TID].Scores[T] = Val;
743 }
744
745 void setScoreByOperand(const MachineOperand &Op,
746 AMDGPU::InstCounterType CntTy, unsigned Val);
747
748 const SIInsertWaitcnts *Context;
749
750 unsigned ScoreLBs[AMDGPU::NUM_INST_CNTS] = {0};
751 unsigned ScoreUBs[AMDGPU::NUM_INST_CNTS] = {0};
752 HWEvents PendingEvents;
753 // Remember the last flat memory operation.
754 unsigned LastFlatDsCnt = 0;
755 unsigned LastFlatLoadCnt = 0;
756 // Remember the last GDS operation.
757 unsigned LastGDS = 0;
758
759 // The score tracking logic is fragmented as follows:
760 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
761 // - SGPRs: SGPR RegUnits
762 // - SCC: Non-allocatable and not general purpose: not a SGPR.
763 //
764 // For the VMem case, if the key is within the range of LDS DMA IDs,
765 // then the corresponding index into the `LDSDMAStores` vector below is:
766 // Key - LDSDMA_BEGIN - 1
767 // This is because LDSDMA_BEGIN is a generic entry and does not have an
768 // associated MachineInstr.
769 //
770 // TODO: Could we track SCC alongside SGPRs so it's not longer a special case?
771
772 struct VMEMInfo {
773 // Scores for all instruction counters. Zero-initialized.
774 CounterValueArray Scores{};
775 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
776 unsigned VMEMTypes = 0;
777
778 bool empty() const { return all_of(Range: Scores, P: equal_to(Arg: 0)) && !VMEMTypes; }
779 };
780
781 /// Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
782 /// pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
783 class SGPRInfo {
784 /// Either DS_CNT or KM_CNT score.
785 unsigned ScoreDsKmCnt = 0;
786 unsigned ScoreXCnt = 0;
787
788 public:
789 unsigned get(AMDGPU::InstCounterType T) const {
790 assert(
791 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
792 "Invalid counter");
793 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
794 }
795 unsigned &get(AMDGPU::InstCounterType T) {
796 assert(
797 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
798 "Invalid counter");
799 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
800 }
801
802 bool empty() const { return !ScoreDsKmCnt && !ScoreXCnt; }
803 };
804
805 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
806 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
807
808 // Reg score for SCC.
809 unsigned SCCScore = 0;
810 // The unique instruction that has an SCC write pending, if there is one.
811 const MachineInstr *PendingSCCWrite = nullptr;
812
813 // Store representative LDS DMA operations. The only useful info here is
814 // alias info. One store is kept per unique AAInfo.
815 SmallVector<const MachineInstr *> LDSDMAStores;
816
817 // State of all counters at each async mark encountered so far.
818 SmallVector<CounterValueArray> AsyncMarks;
819
820 // But in the rare pathological case, a nest of loops that pushes marks
821 // without waiting on any mark can cause AsyncMarks to grow very large. We cap
822 // it to a reasonable limit. We can tune this later or potentially introduce a
823 // user option to control the value.
824 static constexpr unsigned MaxAsyncMarks = 16;
825
826 // Track the upper bound score for async operations that are not part of a
827 // mark yet. Initialized to all zeros.
828 CounterValueArray AsyncScore{};
829};
830
// Defined here, after the full definition of WaitcntBrackets: BlockInfo holds
// a std::unique_ptr<WaitcntBrackets>, and WaitcntBrackets is only
// forward-declared where BlockInfo itself is defined.
SIInsertWaitcnts::BlockInfo::~BlockInfo() = default;
832
833class SIInsertWaitcntsLegacy : public MachineFunctionPass {
834public:
835 static char ID;
836 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
837
838 bool runOnMachineFunction(MachineFunction &MF) override;
839
840 StringRef getPassName() const override {
841 return "SI insert wait instructions";
842 }
843
844 void getAnalysisUsage(AnalysisUsage &AU) const override {
845 AU.setPreservesCFG();
846 AU.addRequired<MachineLoopInfoWrapperPass>();
847 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
848 AU.addUsedIfAvailable<AAResultsWrapperPass>();
849 AU.addPreserved<AAResultsWrapperPass>();
850 MachineFunctionPass::getAnalysisUsage(AU);
851 }
852};
853
854} // end anonymous namespace
855
856void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
857 AMDGPU::InstCounterType CntTy,
858 unsigned Score) {
859 setRegScore(Reg: Op.getReg().asMCReg(), T: CntTy, Val: Score);
860}
861
862// Return true if the subtarget is one that enables Point Sample Acceleration
863// and the MachineInstr passed in is one to which it might be applied (the
864// hardware makes this decision based on several factors, but we can't determine
865// this at compile time, so we have to assume it might be applied if the
866// instruction supports it).
867bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
868 if (!Context->ST.hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
869 return false;
870
871 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode());
872 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
873 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
874 return BaseInfo->PointSampleAccel;
875}
876
877// Return true if the subtarget enables Point Sample Acceleration, the supplied
878// MachineInstr is one to which it might be applied and the supplied interval is
879// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
880// (this is the type that a point sample accelerated instruction effectively
881// becomes)
882bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
883 MCPhysReg Reg) const {
884 if (!hasPointSampleAccel(MI))
885 return false;
886
887 return hasOtherPendingVmemTypes(Reg, V: VMEM_NOSAMPLER);
888}
889
890void WaitcntBrackets::updateByEvent(HWEvents E, MachineInstr &Inst) {
891 assert(E.size() == 1 && "Expected singular event!");
892 AMDGPU::InstCounterType T = Context->getCounterFromEvent(E);
893 assert(T < Context->MaxCounter);
894
895 unsigned UB = getScoreUB(T);
896 unsigned Increment = 1;
897 if (T == AMDGPU::VA_VDST && AMDGPU::getHasMatrixScale(Opc: Inst.getOpcode()) &&
898 Context->ST.hasVOP3PX2IncrementsVaVdstTwice()) {
899 // V_WMMA_SCALE instructions use VOP3PX2 encoding. Hardware treats this as
900 // two VOP3P instructions and increments VA_VDST twice.
901 Increment = 2;
902 }
903 unsigned CurrScore = UB + Increment;
904 if (CurrScore == 0)
905 report_fatal_error(reason: "InsertWaitcnt score wraparound");
906 // PendingEvents and ScoreUB need to be update regardless if this event
907 // changes the score of a register or not.
908 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
909 PendingEvents |= E;
910 setScoreUB(T, Val: CurrScore);
911
912 const SIRegisterInfo &TRI = Context->TRI;
913 const MachineRegisterInfo &MRI = Context->MRI;
914 const SIInstrInfo &TII = Context->TII;
915
916 if (T == AMDGPU::EXP_CNT) {
917 // Put score on the source vgprs. If this is a store, just use those
918 // specific register(s).
919 if (TII.isDS(MI: Inst) && Inst.mayLoadOrStore()) {
920 // All GDS operations must protect their address register (same as
921 // export.)
922 if (const auto *AddrOp = TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::addr))
923 setScoreByOperand(Op: *AddrOp, CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
924
925 if (Inst.mayStore()) {
926 if (const auto *Data0 =
927 TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data0))
928 setScoreByOperand(Op: *Data0, CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
929 if (const auto *Data1 =
930 TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data1))
931 setScoreByOperand(Op: *Data1, CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
932 } else if (SIInstrInfo::isAtomicRet(MI: Inst) && !SIInstrInfo::isGWS(MI: Inst) &&
933 Inst.getOpcode() != AMDGPU::DS_APPEND &&
934 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
935 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
936 for (const MachineOperand &Op : Inst.all_uses()) {
937 if (TRI.isVectorRegister(MRI, Reg: Op.getReg()))
938 setScoreByOperand(Op, CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
939 }
940 }
941 } else if (TII.isFLAT(MI: Inst)) {
942 if (Inst.mayStore()) {
943 setScoreByOperand(Op: *TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
944 CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
945 } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
946 setScoreByOperand(Op: *TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
947 CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
948 }
949 } else if (TII.isMIMG(MI: Inst)) {
950 if (Inst.mayStore()) {
951 setScoreByOperand(Op: Inst.getOperand(i: 0), CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
952 } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
953 setScoreByOperand(Op: *TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
954 CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
955 }
956 } else if (TII.isMTBUF(MI: Inst)) {
957 if (Inst.mayStore())
958 setScoreByOperand(Op: Inst.getOperand(i: 0), CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
959 } else if (TII.isMUBUF(MI: Inst)) {
960 if (Inst.mayStore()) {
961 setScoreByOperand(Op: Inst.getOperand(i: 0), CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
962 } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
963 setScoreByOperand(Op: *TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::data),
964 CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
965 }
966 } else if (TII.isLDSDIR(MI: Inst)) {
967 // LDSDIR instructions attach the score to the destination.
968 setScoreByOperand(Op: *TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::vdst),
969 CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
970 } else {
971 if (TII.isEXP(MI: Inst)) {
972 // For export the destination registers are really temps that
973 // can be used as the actual source after export patching, so
974 // we need to treat them like sources and set the EXP_CNT
975 // score.
976 for (MachineOperand &DefMO : Inst.all_defs()) {
977 if (TRI.isVGPR(MRI, Reg: DefMO.getReg())) {
978 setScoreByOperand(Op: DefMO, CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
979 }
980 }
981 }
982 for (const MachineOperand &Op : Inst.all_uses()) {
983 if (TRI.isVectorRegister(MRI, Reg: Op.getReg()))
984 setScoreByOperand(Op, CntTy: AMDGPU::EXP_CNT, Score: CurrScore);
985 }
986 }
987 } else if (T == AMDGPU::X_CNT) {
988 HWEvents OtherEvent =
989 E == HWEvents::SMEM_GROUP ? HWEvents::VMEM_GROUP : HWEvents::SMEM_GROUP;
990 if (PendingEvents.contains(Other: OtherEvent)) {
991 // Hardware inserts an implicit xcnt between interleaved
992 // SMEM and VMEM operations. So there will never be
993 // outstanding address translations for both SMEM and
994 // VMEM at the same time.
995 setScoreLB(T, Val: getScoreUB(T) - 1);
996 PendingEvents -= OtherEvent;
997 }
998 for (const MachineOperand &Op : Inst.all_uses())
999 setScoreByOperand(Op, CntTy: T, Score: CurrScore);
1000 } else if (T == AMDGPU::VA_VDST || T == AMDGPU::VM_VSRC) {
1001 // Match the score to the VGPR destination or source registers as
1002 // appropriate
1003 for (const MachineOperand &Op : Inst.operands()) {
1004 if (!Op.isReg() || (T == AMDGPU::VA_VDST && Op.isUse()) ||
1005 (T == AMDGPU::VM_VSRC && Op.isDef()))
1006 continue;
1007 if (TRI.isVectorRegister(MRI: Context->MRI, Reg: Op.getReg()))
1008 setScoreByOperand(Op, CntTy: T, Score: CurrScore);
1009 }
1010 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1011 // Match the score to the destination registers.
1012 //
1013 // Check only explicit operands. Stores, especially spill stores, include
1014 // implicit uses and defs of their super registers which would create an
1015 // artificial dependency, while these are there only for register liveness
1016 // accounting purposes.
1017 //
1018 // Special cases where implicit register defs exists, such as M0 or VCC,
1019 // but none with memory instructions.
1020 for (const MachineOperand &Op : Inst.defs()) {
1021 if (T == AMDGPU::LOAD_CNT || T == AMDGPU::SAMPLE_CNT ||
1022 T == AMDGPU::BVH_CNT) {
1023 if (!TRI.isVectorRegister(MRI, Reg: Op.getReg())) // TODO: add wrapper
1024 continue;
1025 if (updateVMCntOnly(Inst)) {
1026 // updateVMCntOnly should only leave us with VGPRs
1027 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1028 // defs. That's required for a sane index into `VgprMemTypes` below
1029 assert(TRI.isVectorRegister(MRI, Op.getReg()));
1030 VmemType V = getVmemType(Inst);
1031 unsigned char TypesMask = 1 << V;
1032 // If instruction can have Point Sample Accel applied, we have to flag
1033 // this with another potential dependency
1034 if (hasPointSampleAccel(MI: Inst))
1035 TypesMask |= 1 << VMEM_NOSAMPLER;
1036 for (MCRegUnit RU : regunits(Reg: Op.getReg().asMCReg()))
1037 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1038 }
1039 }
1040 setScoreByOperand(Op, CntTy: T, Score: CurrScore);
1041 }
1042 if (Inst.mayStore() &&
1043 (TII.isDS(MI: Inst) || Context->isNonAsyncLdsDmaWrite(MI: Inst))) {
1044 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1045 // written can be accessed. A load from LDS to VMEM does not need a wait.
1046 //
1047 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1048 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1049 // store. The "Slot" is the index into LDSDMAStores + 1.
1050 unsigned Slot = 0;
1051 for (const auto *MemOp : Inst.memoperands()) {
1052 if (!MemOp->isStore() ||
1053 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1054 continue;
1055 // Comparing just AA info does not guarantee memoperands are equal
1056 // in general, but this is so for LDS DMA in practice.
1057 auto AAI = MemOp->getAAInfo();
1058 // Alias scope information gives a way to definitely identify an
1059 // original memory object and practically produced in the module LDS
1060 // lowering pass. If there is no scope available we will not be able
1061 // to disambiguate LDS aliasing as after the module lowering all LDS
1062 // is squashed into a single big object.
1063 if (!AAI || !AAI.Scope)
1064 break;
1065 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1066 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1067 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1068 Slot = I + 1;
1069 break;
1070 }
1071 }
1072 }
1073 if (Slot)
1074 break;
1075 // The slot may not be valid because it can be >= NUM_LDSDMA which
1076 // means the scoreboard cannot track it. We still want to preserve the
1077 // MI in order to check alias information, though.
1078 LDSDMAStores.push_back(Elt: &Inst);
1079 Slot = LDSDMAStores.size();
1080 break;
1081 }
1082 setVMemScore(TID: LDSDMA_BEGIN, T, Val: CurrScore);
1083 if (Slot && Slot < NUM_LDSDMA)
1084 setVMemScore(TID: LDSDMA_BEGIN + Slot, T, Val: CurrScore);
1085 }
1086
1087 if (Context->shouldUpdateAsyncMark(MI: Inst, T)) {
1088 AsyncScore[T] = CurrScore;
1089 }
1090
1091 if (SIInstrInfo::isSBarrierSCCWrite(Opcode: Inst.getOpcode())) {
1092 setRegScore(Reg: AMDGPU::SCC, T, Val: CurrScore);
1093 PendingSCCWrite = &Inst;
1094 }
1095 }
1096}
1097
1098void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1099 // In the absence of loops, AsyncMarks can grow linearly with the program
1100 // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
1101 // limit every time we push a new mark, but that seems like unnecessary work
1102 // in practical cases. We do separately truncate the array when processing a
1103 // loop, which should be sufficient.
1104 AsyncMarks.push_back(Elt: AsyncScore);
1105 AsyncScore = {};
1106 LLVM_DEBUG({
1107 dbgs() << "recordAsyncMark:\n" << Inst;
1108 for (const auto &Mark : AsyncMarks) {
1109 llvm::interleaveComma(Mark, dbgs());
1110 dbgs() << '\n';
1111 }
1112 });
1113}
1114
1115void WaitcntBrackets::print(raw_ostream &OS) const {
1116 const GCNSubtarget &ST = Context->ST;
1117
1118 for (auto T : inst_counter_types(MaxCounter: Context->MaxCounter)) {
1119 unsigned SR = getScoreRange(T);
1120 switch (T) {
1121 case AMDGPU::LOAD_CNT:
1122 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1123 << SR << "):";
1124 break;
1125 case AMDGPU::DS_CNT:
1126 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1127 << SR << "):";
1128 break;
1129 case AMDGPU::EXP_CNT:
1130 OS << " EXP_CNT(" << SR << "):";
1131 break;
1132 case AMDGPU::STORE_CNT:
1133 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1134 << SR << "):";
1135 break;
1136 case AMDGPU::SAMPLE_CNT:
1137 OS << " SAMPLE_CNT(" << SR << "):";
1138 break;
1139 case AMDGPU::BVH_CNT:
1140 OS << " BVH_CNT(" << SR << "):";
1141 break;
1142 case AMDGPU::KM_CNT:
1143 OS << " KM_CNT(" << SR << "):";
1144 break;
1145 case AMDGPU::X_CNT:
1146 OS << " X_CNT(" << SR << "):";
1147 break;
1148 case AMDGPU::ASYNC_CNT:
1149 OS << " ASYNC_CNT(" << SR << "):";
1150 break;
1151 case AMDGPU::VA_VDST:
1152 OS << " VA_VDST(" << SR << "): ";
1153 break;
1154 case AMDGPU::VM_VSRC:
1155 OS << " VM_VSRC(" << SR << "): ";
1156 break;
1157 default:
1158 OS << " UNKNOWN(" << SR << "):";
1159 break;
1160 }
1161
1162 if (SR != 0) {
1163 // Print vgpr scores.
1164 unsigned LB = getScoreLB(T);
1165
1166 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1167 sort(C&: SortedVMEMIDs);
1168
1169 for (auto ID : SortedVMEMIDs) {
1170 unsigned RegScore = VMem.at(Val: ID).Scores[T];
1171 if (RegScore <= LB)
1172 continue;
1173 unsigned RelScore = RegScore - LB - 1;
1174 if (ID < REGUNITS_END) {
1175 OS << ' ' << RelScore << ":vRU" << ID;
1176 } else {
1177 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1178 "Unhandled/unexpected ID value!");
1179 OS << ' ' << RelScore << ":LDSDMA" << ID;
1180 }
1181 }
1182
1183 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1184 if (isSmemCounter(T)) {
1185 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1186 sort(C&: SortedSMEMIDs);
1187 for (auto ID : SortedSMEMIDs) {
1188 unsigned RegScore = SGPRs.at(Val: ID).get(T);
1189 if (RegScore <= LB)
1190 continue;
1191 unsigned RelScore = RegScore - LB - 1;
1192 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1193 }
1194 }
1195
1196 if (T == AMDGPU::KM_CNT && SCCScore > 0)
1197 OS << ' ' << SCCScore << ":scc";
1198 }
1199 OS << '\n';
1200 }
1201
1202 OS << "Pending Events: ";
1203 if (hasPendingEvent()) {
1204 OS << getPendingEvents();
1205 } else {
1206 OS << "none";
1207 }
1208 OS << '\n';
1209
1210 OS << "Async score: ";
1211 if (AsyncScore.empty())
1212 OS << "none";
1213 else
1214 llvm::interleaveComma(c: AsyncScore, os&: OS);
1215 OS << '\n';
1216
1217 OS << "Async marks: " << AsyncMarks.size() << '\n';
1218
1219 for (const auto &Mark : AsyncMarks) {
1220 for (auto T : AMDGPU::inst_counter_types()) {
1221 unsigned MarkedScore = Mark[T];
1222 switch (T) {
1223 case AMDGPU::LOAD_CNT:
1224 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
1225 << "_CNT: " << MarkedScore;
1226 break;
1227 case AMDGPU::DS_CNT:
1228 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
1229 << "_CNT: " << MarkedScore;
1230 break;
1231 case AMDGPU::EXP_CNT:
1232 OS << " EXP_CNT: " << MarkedScore;
1233 break;
1234 case AMDGPU::STORE_CNT:
1235 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
1236 << "_CNT: " << MarkedScore;
1237 break;
1238 case AMDGPU::SAMPLE_CNT:
1239 OS << " SAMPLE_CNT: " << MarkedScore;
1240 break;
1241 case AMDGPU::BVH_CNT:
1242 OS << " BVH_CNT: " << MarkedScore;
1243 break;
1244 case AMDGPU::KM_CNT:
1245 OS << " KM_CNT: " << MarkedScore;
1246 break;
1247 case AMDGPU::X_CNT:
1248 OS << " X_CNT: " << MarkedScore;
1249 break;
1250 case AMDGPU::ASYNC_CNT:
1251 OS << " ASYNC_CNT: " << MarkedScore;
1252 break;
1253 default:
1254 OS << " UNKNOWN: " << MarkedScore;
1255 break;
1256 }
1257 }
1258 OS << '\n';
1259 }
1260 OS << '\n';
1261}
1262
1263/// Simplify \p UpdateWait by removing waits that are redundant based on the
1264/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1265void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1266 AMDGPU::Waitcnt &UpdateWait) const {
1267 simplifyWaitcnt(Wait&: UpdateWait, T: AMDGPU::LOAD_CNT);
1268 simplifyWaitcnt(Wait&: UpdateWait, T: AMDGPU::EXP_CNT);
1269 simplifyWaitcnt(Wait&: UpdateWait, T: AMDGPU::DS_CNT);
1270 simplifyWaitcnt(Wait&: UpdateWait, T: AMDGPU::STORE_CNT);
1271 simplifyWaitcnt(Wait&: UpdateWait, T: AMDGPU::SAMPLE_CNT);
1272 simplifyWaitcnt(Wait&: UpdateWait, T: AMDGPU::BVH_CNT);
1273 simplifyWaitcnt(Wait&: UpdateWait, T: AMDGPU::KM_CNT);
1274 simplifyXcnt(CheckWait, UpdateWait);
1275 simplifyWaitcnt(Wait&: UpdateWait, T: AMDGPU::VA_VDST);
1276 simplifyVmVsrc(CheckWait, UpdateWait);
1277 simplifyWaitcnt(Wait&: UpdateWait, T: AMDGPU::ASYNC_CNT);
1278}
1279
1280void WaitcntBrackets::simplifyWaitcnt(AMDGPU::InstCounterType T,
1281 unsigned &Count) const {
1282 // The number of outstanding events for this type, T, can be calculated
1283 // as (UB - LB). If the current Count is greater than or equal to the number
1284 // of outstanding events, then the wait for this counter is redundant.
1285 if (Count >= getScoreRange(T))
1286 Count = ~0u;
1287}
1288
1289void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait,
1290 AMDGPU::InstCounterType T) const {
1291 unsigned Cnt = Wait.get(T);
1292 simplifyWaitcnt(T, Count&: Cnt);
1293 Wait.set(T, Val: Cnt);
1294}
1295
1296void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1297 AMDGPU::Waitcnt &UpdateWait) const {
1298 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1299 // optimizations. On entry to a block with multiple predescessors, there may
1300 // be pending SMEM and VMEM events active at the same time.
1301 // In such cases, only clear one active event at a time.
1302 // TODO: Revisit xcnt optimizations for gfx1250.
1303 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1304 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1305 // zero.
1306 if (CheckWait.get(T: AMDGPU::KM_CNT) == 0 &&
1307 hasPendingEvent(E: HWEvents::SMEM_GROUP))
1308 UpdateWait.set(T: AMDGPU::X_CNT, Val: ~0u);
1309 // If we have pending store we cannot optimize XCnt because we do not wait for
1310 // stores. VMEM loads retun in order, so if we only have loads XCnt is
1311 // decremented to the same number as LOADCnt.
1312 if (CheckWait.get(T: AMDGPU::LOAD_CNT) != ~0u &&
1313 hasPendingEvent(E: HWEvents::VMEM_GROUP) &&
1314 !hasPendingEvent(T: AMDGPU::STORE_CNT) &&
1315 CheckWait.get(T: AMDGPU::X_CNT) >= CheckWait.get(T: AMDGPU::LOAD_CNT))
1316 UpdateWait.set(T: AMDGPU::X_CNT, Val: ~0u);
1317 simplifyWaitcnt(Wait&: UpdateWait, T: AMDGPU::X_CNT);
1318}
1319
1320void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1321 AMDGPU::Waitcnt &UpdateWait) const {
1322 // Waiting for some counters implies waiting for VM_VSRC, since an
1323 // instruction that decrements a counter on completion would have
1324 // decremented VM_VSRC once its VGPR operands had been read.
1325 if (CheckWait.get(T: AMDGPU::VM_VSRC) >=
1326 std::min(l: {CheckWait.get(T: AMDGPU::LOAD_CNT),
1327 CheckWait.get(T: AMDGPU::STORE_CNT),
1328 CheckWait.get(T: AMDGPU::SAMPLE_CNT),
1329 CheckWait.get(T: AMDGPU::BVH_CNT), CheckWait.get(T: AMDGPU::DS_CNT)}))
1330 UpdateWait.set(T: AMDGPU::VM_VSRC, Val: ~0u);
1331 simplifyWaitcnt(Wait&: UpdateWait, T: AMDGPU::VM_VSRC);
1332}
1333
1334void WaitcntBrackets::purgeEmptyTrackingData() {
1335 VMem.remove_if(Pred: [](const auto &P) { return P.second.empty(); });
1336 SGPRs.remove_if(Pred: [](const auto &P) { return P.second.empty(); });
1337}
1338
1339void WaitcntBrackets::determineWaitForScore(AMDGPU::InstCounterType T,
1340 unsigned ScoreToWait,
1341 AMDGPU::Waitcnt &Wait) const {
1342 const unsigned LB = getScoreLB(T);
1343 const unsigned UB = getScoreUB(T);
1344
1345 // If the score falls within the bracket, we need a waitcnt.
1346 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1347 if ((T == AMDGPU::LOAD_CNT || T == AMDGPU::DS_CNT) && hasPendingFlat() &&
1348 !Context->ST.hasFlatLgkmVMemCountInOrder()) {
1349 // If there is a pending FLAT operation, and this is a VMem or LGKM
1350 // waitcnt and the target can report early completion, then we need
1351 // to force a waitcnt 0.
1352 Wait.add(T, Count: 0);
1353 } else if (counterOutOfOrder(T)) {
1354 // Counter can get decremented out-of-order when there
1355 // are multiple types event in the bracket. Also emit an s_wait counter
1356 // with a conservative value of 0 for the counter.
1357 Wait.add(T, Count: 0);
1358 } else {
1359 // If a counter has been maxed out avoid overflow by waiting for
1360 // MAX(CounterType) - 1 instead.
1361 unsigned NeededWait = std::min(a: UB - ScoreToWait, b: getLimit(T) - 1);
1362 Wait.add(T, Count: NeededWait);
1363 }
1364 }
1365}
1366
1367AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
1368 LLVM_DEBUG({
1369 dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
1370 << ":\n";
1371 for (const auto &Mark : AsyncMarks) {
1372 llvm::interleaveComma(Mark, dbgs());
1373 dbgs() << '\n';
1374 }
1375 });
1376
1377 if (AsyncMarks.size() == MaxAsyncMarks) {
1378 // Enforcing MaxAsyncMarks here is unnecessary work because the size of
1379 // MaxAsyncMarks is linear when traversing straightline code. But we do
1380 // need to check if truncation may have occured at a merge, and adjust N
1381 // to ensure that a wait is generated.
1382 LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
1383 N = std::min(a: N, b: (unsigned)MaxAsyncMarks - 1);
1384 }
1385
1386 AMDGPU::Waitcnt Wait;
1387 if (AsyncMarks.size() <= N) {
1388 LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
1389 return Wait;
1390 }
1391
1392 size_t MarkIndex = AsyncMarks.size() - N - 1;
1393 const auto &RequiredMark = AsyncMarks[MarkIndex];
1394 for (AMDGPU::InstCounterType T : AMDGPU::inst_counter_types())
1395 determineWaitForScore(T, ScoreToWait: RequiredMark[T], Wait);
1396
1397 // Immediately remove the waited mark and all older ones
1398 // This happens BEFORE the wait is actually inserted, which is fine
1399 // because we've already extracted the wait requirements
1400 LLVM_DEBUG({
1401 dbgs() << "Removing " << (MarkIndex + 1)
1402 << " async marks after determining wait\n";
1403 });
1404 AsyncMarks.erase(CS: AsyncMarks.begin(), CE: AsyncMarks.begin() + MarkIndex + 1);
1405
1406 LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
1407 return Wait;
1408}
1409
1410// With D16Write32BitVgpr, D16 inst might be clobbered by events running on the
1411// other half 16bit.
1412//
1413// Replace VGPR16 to VGPR32 for wait check if:
1414// 1. MI is a VALU, and there is a wait event on the other half
1415// 2. MI is a LdSt, and there is a wait event on the other half from different
1416// order group
1417MCPhysReg WaitcntBrackets::determineVGPR16Dependency(const MachineInstr &MI,
1418 AMDGPU::InstCounterType T,
1419 MCPhysReg Reg) const {
1420 const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);
1421 unsigned Size = Context->TRI.getRegSizeInBits(RC: *RC);
1422
1423 if (Size != 16 || !Context->ST.hasD16Writes32BitVgpr())
1424 return Reg;
1425
1426 // With D16Writes32BitVgpr, D16 Inst might clobber the whole vgpr32
1427 // check dependency on the other half
1428 Register Reg32 = Context->TRI.get32BitRegister(Reg);
1429 Register OtherHalf = Context->TRI.getSubReg(
1430 Reg: Reg32,
1431 Idx: AMDGPU::isHi16Reg(Reg, MRI: Context->TRI) ? AMDGPU::lo16 : AMDGPU::hi16);
1432
1433 AMDGPU::Waitcnt Wait;
1434 for (MCRegUnit RU : regunits(Reg: OtherHalf))
1435 determineWaitForScore(T, ScoreToWait: getVMemScore(TID: toVMEMID(RU), T), Wait);
1436
1437 // No wait on otherhalf
1438 if (!Wait.hasWait())
1439 return Reg;
1440
1441 if (Context->TII.isVALU(MI, /*AllowLDSDMA=*/true))
1442 return Reg32;
1443
1444 // If hi/lo16 mixed events
1445 HWEvents MIEvents =
1446 AMDGPU::getEventsFor(Inst: MI, ST: Context->ST, IsExpertMode: Context->IsExpertMode);
1447 HWEvents OtherHalfEvents = Context->getWaitEvents(T);
1448 HWEvents Events = MIEvents & OtherHalfEvents;
1449 if (Events.size() > 1)
1450 return Reg32;
1451 return Reg;
1452}
1453
1454void WaitcntBrackets::determineWaitForPhysReg(AMDGPU::InstCounterType T,
1455 MCPhysReg Reg,
1456 AMDGPU::Waitcnt &Wait,
1457 const MachineInstr &MI) const {
1458 if (Reg == AMDGPU::SCC) {
1459 determineWaitForScore(T, ScoreToWait: SCCScore, Wait);
1460 } else {
1461 bool IsVGPR = Context->TRI.isVectorRegister(MRI: Context->MRI, Reg);
1462 if (IsVGPR)
1463 Reg = determineVGPR16Dependency(MI, T, Reg);
1464 for (MCRegUnit RU : regunits(Reg))
1465 determineWaitForScore(
1466 T, ScoreToWait: IsVGPR ? getVMemScore(TID: toVMEMID(RU), T) : getSGPRScore(RU, T),
1467 Wait);
1468 }
1469}
1470
1471void WaitcntBrackets::determineWaitForLDSDMA(AMDGPU::InstCounterType T,
1472 VMEMID TID,
1473 AMDGPU::Waitcnt &Wait) const {
1474 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1475 determineWaitForScore(T, ScoreToWait: getVMemScore(TID, T), Wait);
1476}
1477
1478void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1479 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1480 // SCC has landed
1481 if (PendingSCCWrite &&
1482 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1483 PendingSCCWrite->getOperand(i: 0).getImm() == Inst->getOperand(i: 0).getImm()) {
1484 HWEvents SCC_WRITE_PendingEvent = HWEvents::SCC_WRITE;
1485 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1486 if ((PendingEvents & Context->getWaitEvents(T: AMDGPU::KM_CNT)) ==
1487 SCC_WRITE_PendingEvent) {
1488 setScoreLB(T: AMDGPU::KM_CNT, Val: getScoreUB(T: AMDGPU::KM_CNT));
1489 }
1490
1491 PendingEvents -= SCC_WRITE_PendingEvent;
1492 PendingSCCWrite = nullptr;
1493 }
1494}
1495
1496void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1497 for (AMDGPU::InstCounterType T : AMDGPU::inst_counter_types())
1498 applyWaitcnt(Wait, T);
1499}
1500
1501void WaitcntBrackets::applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count) {
1502 const unsigned UB = getScoreUB(T);
1503 if (Count >= UB)
1504 return;
1505 if (Count != 0) {
1506 if (counterOutOfOrder(T))
1507 return;
1508 setScoreLB(T, Val: std::max(a: getScoreLB(T), b: UB - Count));
1509 } else {
1510 setScoreLB(T, Val: UB);
1511 PendingEvents -= Context->getWaitEvents(T);
1512 }
1513
1514 if (T == AMDGPU::KM_CNT && Count == 0 &&
1515 hasPendingEvent(E: HWEvents::SMEM_GROUP)) {
1516 if (!hasMixedPendingEvents(T: AMDGPU::X_CNT))
1517 applyWaitcnt(T: AMDGPU::X_CNT, Count: 0);
1518 else
1519 PendingEvents -= HWEvents::SMEM_GROUP;
1520 }
1521 if (T == AMDGPU::LOAD_CNT && hasPendingEvent(E: HWEvents::VMEM_GROUP) &&
1522 !hasPendingEvent(T: AMDGPU::STORE_CNT)) {
1523 if (!hasMixedPendingEvents(T: AMDGPU::X_CNT))
1524 applyWaitcnt(T: AMDGPU::X_CNT, Count);
1525 else if (Count == 0)
1526 PendingEvents -= HWEvents::VMEM_GROUP;
1527 }
1528}
1529
1530void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait,
1531 AMDGPU::InstCounterType T) {
1532 unsigned Cnt = Wait.get(T);
1533 applyWaitcnt(T, Count: Cnt);
1534}
1535
1536// Where there are multiple types of event in the bracket of a counter,
1537// the decrement may go out of order.
1538bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
1539 // Scalar memory read always can go out of order.
1540 if ((T == Context->SmemAccessCounter &&
1541 hasPendingEvent(E: HWEvents::SMEM_ACCESS)) ||
1542 (T == AMDGPU::X_CNT && hasPendingEvent(E: HWEvents::SMEM_GROUP)))
1543 return true;
1544
1545 if (T == AMDGPU::LOAD_CNT) {
1546
1547 // On targets without VScnt, LOAD_CNT includes all of STORE_CNT as well.
1548 // All these events use one counter and do not go out of order with respect
1549 // to each other.
1550 if (!Context->ST.hasVscnt())
1551 return false;
1552
1553 HWEvents Events = PendingEvents & Context->getWaitEvents(T);
1554
1555 // GLOBAL_INV completes in-order with other LOAD_CNT events,
1556 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT
1557 // events doesn't cause out-of-order completion.
1558 Events -= HWEvents::GLOBAL_INV_ACCESS;
1559
1560 // Return true only if there are still multiple event types after removing
1561 // GLOBAL_INV
1562 return Events.size() > 1;
1563 }
1564
1565 return hasMixedPendingEvents(T);
1566}
1567
// Register the legacy pass under DEBUG_TYPE, declaring the loop info and
// post-dominator tree wrapper passes as its dependencies.
1568INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1569 false, false)
1570INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1571INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1572INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1573 false, false)
1574
// Definition of the static ID member declared in SIInsertWaitcntsLegacy.
1575char SIInsertWaitcntsLegacy::ID = 0;
1576
// Reference to the pass ID exposed in the llvm namespace; it aliases the
// legacy pass's ID defined above.
1577char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1578
1579FunctionPass *llvm::createSIInsertWaitcntsPass() {
1580 return new SIInsertWaitcntsLegacy();
1581}
1582
1583static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1584 unsigned NewEnc) {
1585 int OpIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpName);
1586 assert(OpIdx >= 0);
1587
1588 MachineOperand &MO = MI.getOperand(i: OpIdx);
1589
1590 if (NewEnc == MO.getImm())
1591 return false;
1592
1593 MO.setImm(NewEnc);
1594 return true;
1595}
1596
1597bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1598 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Waitcnt->getOpcode());
1599 if (Opcode == Waitcnt->getOpcode())
1600 return false;
1601
1602 Waitcnt->setDesc(TII.get(Opcode));
1603 return true;
1604}
1605
1606/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1607/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1608/// from \p Wait that were added by previous passes. Currently this pass
1609/// conservatively assumes that these preexisting waits are required for
1610/// correctness.
1611bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1612 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1613 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1614 assert(isNormalMode(MaxCounter));
1615
1616 bool Modified = false;
1617 MachineInstr *WaitcntInstr = nullptr;
1618 MachineInstr *WaitcntVsCntInstr = nullptr;
1619
1620 LLVM_DEBUG({
1621 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1622 if (It.isEnd())
1623 dbgs() << "end of block\n";
1624 else
1625 dbgs() << *It;
1626 });
1627
1628 for (auto &II :
1629 make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
1630 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1631 if (isNonWaitcntMetaInst(MI: II)) {
1632 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1633 continue;
1634 }
1635
1636 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
1637 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1638
1639 // Update required wait count. If this is a soft waitcnt (= it was added
1640 // by an earlier pass), it may be entirely removed.
1641 if (Opcode == AMDGPU::S_WAITCNT) {
1642 unsigned IEnc = II.getOperand(i: 0).getImm();
1643 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(Version: IV, Encoded: IEnc);
1644 if (TrySimplify)
1645 ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1646 Wait = Wait.combined(Other: OldWait);
1647
1648 // Merge consecutive waitcnt of the same type by erasing multiples.
1649 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1650 II.eraseFromParent();
1651 Modified = true;
1652 } else
1653 WaitcntInstr = &II;
1654 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1655 assert(ST.hasVMemToLDSLoad());
1656 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1657 << "Before: " << Wait << '\n';);
1658 ScoreBrackets.determineWaitForLDSDMA(T: AMDGPU::LOAD_CNT, TID: LDSDMA_BEGIN,
1659 Wait);
1660 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1661
1662 // It is possible (but unlikely) that this is the only wait instruction,
1663 // in which case, we exit this loop without a WaitcntInstr to consume
1664 // `Wait`. But that works because `Wait` was passed in by reference, and
1665 // the callee eventually calls createNewWaitcnt on it. We test this
1666 // possibility in an artificial MIR test since such a situation cannot be
1667 // recreated by running the memory legalizer.
1668 II.eraseFromParent();
1669 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1670 unsigned N = II.getOperand(i: 0).getImm();
1671 LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
1672 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1673 Wait = Wait.combined(Other: OldWait);
1674 } else {
1675 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1676 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1677
1678 unsigned OldVSCnt =
1679 TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1680 if (TrySimplify)
1681 ScoreBrackets.simplifyWaitcnt(T: AMDGPU::STORE_CNT, Count&: OldVSCnt);
1682 Wait.set(T: AMDGPU::STORE_CNT,
1683 Val: std::min(a: Wait.get(T: AMDGPU::STORE_CNT), b: OldVSCnt));
1684
1685 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1686 II.eraseFromParent();
1687 Modified = true;
1688 } else
1689 WaitcntVsCntInstr = &II;
1690 }
1691 }
1692
1693 if (WaitcntInstr) {
1694 Modified |= updateOperandIfDifferent(MI&: *WaitcntInstr, OpName: AMDGPU::OpName::simm16,
1695 NewEnc: AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait));
1696 Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntInstr);
1697
1698 ScoreBrackets.applyWaitcnt(Wait, T: AMDGPU::LOAD_CNT);
1699 ScoreBrackets.applyWaitcnt(Wait, T: AMDGPU::EXP_CNT);
1700 ScoreBrackets.applyWaitcnt(Wait, T: AMDGPU::DS_CNT);
1701 Wait.set(T: AMDGPU::LOAD_CNT, Val: ~0u);
1702 Wait.set(T: AMDGPU::EXP_CNT, Val: ~0u);
1703 Wait.set(T: AMDGPU::DS_CNT, Val: ~0u);
1704
1705 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1706 << "New Instr at block end: "
1707 << *WaitcntInstr << '\n'
1708 : dbgs() << "applied pre-existing waitcnt\n"
1709 << "Old Instr: " << *It
1710 << "New Instr: " << *WaitcntInstr << '\n');
1711 }
1712
1713 if (WaitcntVsCntInstr) {
1714 Modified |=
1715 updateOperandIfDifferent(MI&: *WaitcntVsCntInstr, OpName: AMDGPU::OpName::simm16,
1716 NewEnc: Wait.get(T: AMDGPU::STORE_CNT));
1717 Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntVsCntInstr);
1718
1719 ScoreBrackets.applyWaitcnt(T: AMDGPU::STORE_CNT, Count: Wait.get(T: AMDGPU::STORE_CNT));
1720 Wait.set(T: AMDGPU::STORE_CNT, Val: ~0u);
1721
1722 LLVM_DEBUG(It.isEnd()
1723 ? dbgs() << "applied pre-existing waitcnt\n"
1724 << "New Instr at block end: " << *WaitcntVsCntInstr
1725 << '\n'
1726 : dbgs() << "applied pre-existing waitcnt\n"
1727 << "Old Instr: " << *It
1728 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1729 }
1730
1731 return Modified;
1732}
1733
1734/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1735/// required counters in \p Wait
1736bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1737 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1738 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
1739 assert(isNormalMode(MaxCounter));
1740
1741 bool Modified = false;
1742 const DebugLoc &DL = Block.findDebugLoc(MBBI: It);
1743
1744 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1745 // single instruction while VScnt has its own instruction.
1746 if (Wait.hasWaitExceptStoreCnt()) {
1747 // If profiling expansion is enabled, emit an expanded sequence
1748 if (ExpandWaitcntProfiling) {
1749 // Check if any of the counters to be waited on are out-of-order.
1750 // If so, fall back to normal (non-expanded) behavior since expansion
1751 // would provide misleading profiling information.
1752 bool AnyOutOfOrder = false;
1753 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
1754 unsigned WaitCnt = Wait.get(T: CT);
1755 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(T: CT)) {
1756 AnyOutOfOrder = true;
1757 break;
1758 }
1759 }
1760
1761 if (AnyOutOfOrder) {
1762 // Fall back to non-expanded wait
1763 unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait);
1764 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: Enc);
1765 Modified = true;
1766 } else {
1767 // All counters are in-order, safe to expand
1768 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
1769 unsigned WaitCnt = Wait.get(T: CT);
1770 if (WaitCnt == ~0u)
1771 continue;
1772
1773 unsigned Outstanding =
1774 std::min(a: ScoreBrackets.getOutstanding(T: CT), b: getLimit(E: CT) - 1);
1775 EmitExpandedWaitcnt(Outstanding, Target: WaitCnt, EmitWaitcnt: [&](unsigned Count) {
1776 AMDGPU::Waitcnt W;
1777 W.set(T: CT, Val: Count);
1778 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT))
1779 .addImm(Val: AMDGPU::encodeWaitcnt(Version: IV, Decoded: W));
1780 });
1781 Modified = true;
1782 }
1783 }
1784 } else {
1785 // Normal behavior: emit single combined waitcnt
1786 unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait);
1787 [[maybe_unused]] auto SWaitInst =
1788 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: Enc);
1789 Modified = true;
1790
1791 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1792 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1793 dbgs() << "New Instr: " << *SWaitInst << '\n');
1794 }
1795 }
1796
1797 if (Wait.hasWaitStoreCnt()) {
1798 assert(ST.hasVscnt());
1799
1800 if (ExpandWaitcntProfiling && Wait.get(T: AMDGPU::STORE_CNT) != ~0u &&
1801 !ScoreBrackets.counterOutOfOrder(T: AMDGPU::STORE_CNT)) {
1802 // Only expand if counter is not out-of-order
1803 unsigned Outstanding =
1804 std::min(a: ScoreBrackets.getOutstanding(T: AMDGPU::STORE_CNT),
1805 b: getLimit(E: AMDGPU::STORE_CNT) - 1);
1806 EmitExpandedWaitcnt(
1807 Outstanding, Target: Wait.get(T: AMDGPU::STORE_CNT), EmitWaitcnt: [&](unsigned Count) {
1808 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
1809 .addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
1810 .addImm(Val: Count);
1811 });
1812 Modified = true;
1813 } else {
1814 [[maybe_unused]] auto SWaitInst =
1815 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
1816 .addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
1817 .addImm(Val: Wait.get(T: AMDGPU::STORE_CNT));
1818 Modified = true;
1819
1820 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1821 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1822 dbgs() << "New Instr: " << *SWaitInst << '\n');
1823 }
1824 }
1825
1826 return Modified;
1827}
1828
1829AMDGPU::Waitcnt
1830WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1831 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
1832}
1833
1834AMDGPU::Waitcnt
1835WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1836 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
1837 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1838 ~0u /* XCNT */, ~0u /* ASYNC_CNT */,
1839 ~0u /* TENSOR_CNT */, ExpertVal, ExpertVal);
1840}
1841
1842/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1843/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1844/// were added by previous passes. Currently this pass conservatively
1845/// assumes that these preexisting waits are required for correctness.
1846bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1847 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1848 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1849 assert(!isNormalMode(MaxCounter));
1850
1851 bool Modified = false;
1852 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1853 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1854 MachineInstr *WaitcntDepctrInstr = nullptr;
1855 MachineInstr *WaitInstrs[AMDGPU::NUM_EXTENDED_INST_CNTS] = {};
1856
1857 LLVM_DEBUG({
1858 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1859 if (It.isEnd())
1860 dbgs() << "end of block\n";
1861 else
1862 dbgs() << *It;
1863 });
1864
1865 // Accumulate waits that should not be simplified.
1866 AMDGPU::Waitcnt RequiredWait;
1867
1868 for (auto &II :
1869 make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
1870 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1871 if (isNonWaitcntMetaInst(MI: II)) {
1872 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1873 continue;
1874 }
1875
1876 // Update required wait count. If this is a soft waitcnt (= it was added
1877 // by an earlier pass), it may be entirely removed.
1878
1879 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
1880 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1881
1882 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1883 // attempt to do more than that either.
1884 if (Opcode == AMDGPU::S_WAITCNT)
1885 continue;
1886
1887 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1888 unsigned OldEnc =
1889 TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1890 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(Version: IV, LoadcntDscnt: OldEnc);
1891 if (TrySimplify)
1892 Wait = Wait.combined(Other: OldWait);
1893 else
1894 RequiredWait = RequiredWait.combined(Other: OldWait);
1895 // Keep the first wait_loadcnt, erase the rest.
1896 if (CombinedLoadDsCntInstr == nullptr) {
1897 CombinedLoadDsCntInstr = &II;
1898 } else {
1899 II.eraseFromParent();
1900 Modified = true;
1901 }
1902 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1903 unsigned OldEnc =
1904 TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1905 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(Version: IV, StorecntDscnt: OldEnc);
1906 if (TrySimplify)
1907 Wait = Wait.combined(Other: OldWait);
1908 else
1909 RequiredWait = RequiredWait.combined(Other: OldWait);
1910 // Keep the first wait_storecnt, erase the rest.
1911 if (CombinedStoreDsCntInstr == nullptr) {
1912 CombinedStoreDsCntInstr = &II;
1913 } else {
1914 II.eraseFromParent();
1915 Modified = true;
1916 }
1917 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1918 unsigned OldEnc =
1919 TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1920 AMDGPU::Waitcnt OldWait;
1921 OldWait.set(T: AMDGPU::VA_VDST, Val: AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: OldEnc));
1922 OldWait.set(T: AMDGPU::VM_VSRC, Val: AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: OldEnc));
1923 if (TrySimplify)
1924 ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1925 Wait = Wait.combined(Other: OldWait);
1926 if (WaitcntDepctrInstr == nullptr) {
1927 WaitcntDepctrInstr = &II;
1928 } else {
1929 // S_WAITCNT_DEPCTR requires special care. Don't remove a
1930 // duplicate if it is waiting on things other than VA_VDST or
1931 // VM_VSRC. If that is the case, just make sure the VA_VDST and
1932 // VM_VSRC subfields of the operand are set to the "no wait"
1933 // values.
1934
1935 unsigned Enc =
1936 TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1937 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Encoded: Enc, VmVsrc: ~0u);
1938 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Encoded: Enc, VaVdst: ~0u);
1939
1940 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(STI: ST)) {
1941 Modified |= updateOperandIfDifferent(MI&: II, OpName: AMDGPU::OpName::simm16, NewEnc: Enc);
1942 Modified |= promoteSoftWaitCnt(Waitcnt: &II);
1943 } else {
1944 II.eraseFromParent();
1945 Modified = true;
1946 }
1947 }
1948 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1949 // Architectures higher than GFX10 do not have direct loads to
1950 // LDS, so no work required here yet.
1951 II.eraseFromParent();
1952 Modified = true;
1953 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1954 // Update the Waitcnt, but don't erase the wait.asyncmark() itself. It
1955 // shows up in the assembly as a comment with the original parameter N.
1956 unsigned N = II.getOperand(i: 0).getImm();
1957 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1958 Wait = Wait.combined(Other: OldWait);
1959 } else {
1960 std::optional<AMDGPU::InstCounterType> CT =
1961 AMDGPU::counterTypeForInstr(Opcode);
1962 assert(CT.has_value());
1963 unsigned OldCnt =
1964 TII.getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm();
1965 if (TrySimplify)
1966 Wait.add(T: CT.value(), Count: OldCnt);
1967 else
1968 RequiredWait.add(T: CT.value(), Count: OldCnt);
1969 // Keep the first wait of its kind, erase the rest.
1970 if (WaitInstrs[CT.value()] == nullptr) {
1971 WaitInstrs[CT.value()] = &II;
1972 } else {
1973 II.eraseFromParent();
1974 Modified = true;
1975 }
1976 }
1977 }
1978
1979 ScoreBrackets.simplifyWaitcnt(CheckWait: Wait.combined(Other: RequiredWait), UpdateWait&: Wait);
1980 Wait = Wait.combined(Other: RequiredWait);
1981
1982 if (CombinedLoadDsCntInstr) {
1983 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1984 // to be waited for. Otherwise, let the instruction be deleted so
1985 // the appropriate single counter wait instruction can be inserted
1986 // instead, when new S_WAIT_*CNT instructions are inserted by
1987 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1988 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1989 // the loop below that deals with single counter instructions.
1990 //
1991 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
1992 // instructions that have decremented LOAD_CNT or DS_CNT on completion
1993 // will have needed to wait for their register sources to be available
1994 // first.
1995 if (Wait.get(T: AMDGPU::LOAD_CNT) != ~0u && Wait.get(T: AMDGPU::DS_CNT) != ~0u) {
1996 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
1997 Modified |= updateOperandIfDifferent(MI&: *CombinedLoadDsCntInstr,
1998 OpName: AMDGPU::OpName::simm16, NewEnc);
1999 Modified |= promoteSoftWaitCnt(Waitcnt: CombinedLoadDsCntInstr);
2000 ScoreBrackets.applyWaitcnt(T: AMDGPU::LOAD_CNT, Count: Wait.get(T: AMDGPU::LOAD_CNT));
2001 ScoreBrackets.applyWaitcnt(T: AMDGPU::DS_CNT, Count: Wait.get(T: AMDGPU::DS_CNT));
2002 Wait.set(T: AMDGPU::LOAD_CNT, Val: ~0u);
2003 Wait.set(T: AMDGPU::DS_CNT, Val: ~0u);
2004
2005 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2006 << "New Instr at block end: "
2007 << *CombinedLoadDsCntInstr << '\n'
2008 : dbgs() << "applied pre-existing waitcnt\n"
2009 << "Old Instr: " << *It << "New Instr: "
2010 << *CombinedLoadDsCntInstr << '\n');
2011 } else {
2012 CombinedLoadDsCntInstr->eraseFromParent();
2013 Modified = true;
2014 }
2015 }
2016
2017 if (CombinedStoreDsCntInstr) {
2018 // Similarly for S_WAIT_STORECNT_DSCNT.
2019 if (Wait.get(T: AMDGPU::STORE_CNT) != ~0u && Wait.get(T: AMDGPU::DS_CNT) != ~0u) {
2020 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
2021 Modified |= updateOperandIfDifferent(MI&: *CombinedStoreDsCntInstr,
2022 OpName: AMDGPU::OpName::simm16, NewEnc);
2023 Modified |= promoteSoftWaitCnt(Waitcnt: CombinedStoreDsCntInstr);
2024 ScoreBrackets.applyWaitcnt(Wait, T: AMDGPU::STORE_CNT);
2025 ScoreBrackets.applyWaitcnt(Wait, T: AMDGPU::DS_CNT);
2026 Wait.set(T: AMDGPU::STORE_CNT, Val: ~0u);
2027 Wait.set(T: AMDGPU::DS_CNT, Val: ~0u);
2028
2029 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2030 << "New Instr at block end: "
2031 << *CombinedStoreDsCntInstr << '\n'
2032 : dbgs() << "applied pre-existing waitcnt\n"
2033 << "Old Instr: " << *It << "New Instr: "
2034 << *CombinedStoreDsCntInstr << '\n');
2035 } else {
2036 CombinedStoreDsCntInstr->eraseFromParent();
2037 Modified = true;
2038 }
2039 }
2040
2041 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
2042 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
2043 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
2044 // instructions so that createNewWaitcnt() will create new combined
2045 // instructions to replace them.
2046
2047 if (Wait.get(T: AMDGPU::DS_CNT) != ~0u) {
2048 // This is a vector of addresses in WaitInstrs pointing to instructions
2049 // that should be removed if they are present.
2050 SmallVector<MachineInstr **, 2> WaitsToErase;
2051
2052 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2053 // both) need to be waited for, ensure that there are no existing
2054 // individual wait count instructions for these.
2055
2056 if (Wait.get(T: AMDGPU::LOAD_CNT) != ~0u) {
2057 WaitsToErase.push_back(Elt: &WaitInstrs[AMDGPU::LOAD_CNT]);
2058 WaitsToErase.push_back(Elt: &WaitInstrs[AMDGPU::DS_CNT]);
2059 } else if (Wait.get(T: AMDGPU::STORE_CNT) != ~0u) {
2060 WaitsToErase.push_back(Elt: &WaitInstrs[AMDGPU::STORE_CNT]);
2061 WaitsToErase.push_back(Elt: &WaitInstrs[AMDGPU::DS_CNT]);
2062 }
2063
2064 for (MachineInstr **WI : WaitsToErase) {
2065 if (!*WI)
2066 continue;
2067
2068 (*WI)->eraseFromParent();
2069 *WI = nullptr;
2070 Modified = true;
2071 }
2072 }
2073
2074 for (auto CT : inst_counter_types(MaxCounter: AMDGPU::NUM_EXTENDED_INST_CNTS)) {
2075 if (!WaitInstrs[CT])
2076 continue;
2077
2078 unsigned NewCnt = Wait.get(T: CT);
2079 if (NewCnt != ~0u) {
2080 Modified |= updateOperandIfDifferent(MI&: *WaitInstrs[CT],
2081 OpName: AMDGPU::OpName::simm16, NewEnc: NewCnt);
2082 Modified |= promoteSoftWaitCnt(Waitcnt: WaitInstrs[CT]);
2083
2084 ScoreBrackets.applyWaitcnt(T: CT, Count: NewCnt);
2085 Wait.clear(T: CT);
2086
2087 LLVM_DEBUG(It.isEnd()
2088 ? dbgs() << "applied pre-existing waitcnt\n"
2089 << "New Instr at block end: " << *WaitInstrs[CT]
2090 << '\n'
2091 : dbgs() << "applied pre-existing waitcnt\n"
2092 << "Old Instr: " << *It
2093 << "New Instr: " << *WaitInstrs[CT] << '\n');
2094 } else {
2095 WaitInstrs[CT]->eraseFromParent();
2096 Modified = true;
2097 }
2098 }
2099
2100 if (WaitcntDepctrInstr) {
2101 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2102 // subfields with the new required values.
2103 unsigned Enc =
2104 TII.getNamedOperand(MI&: *WaitcntDepctrInstr, OperandName: AMDGPU::OpName::simm16)
2105 ->getImm();
2106 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Encoded: Enc, VmVsrc: Wait.get(T: AMDGPU::VM_VSRC));
2107 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Encoded: Enc, VaVdst: Wait.get(T: AMDGPU::VA_VDST));
2108
2109 ScoreBrackets.applyWaitcnt(T: AMDGPU::VA_VDST, Count: Wait.get(T: AMDGPU::VA_VDST));
2110 ScoreBrackets.applyWaitcnt(T: AMDGPU::VM_VSRC, Count: Wait.get(T: AMDGPU::VM_VSRC));
2111 Wait.set(T: AMDGPU::VA_VDST, Val: ~0u);
2112 Wait.set(T: AMDGPU::VM_VSRC, Val: ~0u);
2113
2114 // If that new encoded Depctr immediate would actually still wait
2115 // for anything, update the instruction's operand. Otherwise it can
2116 // just be deleted.
2117 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(STI: ST)) {
2118 Modified |= updateOperandIfDifferent(MI&: *WaitcntDepctrInstr,
2119 OpName: AMDGPU::OpName::simm16, NewEnc: Enc);
2120 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2121 << "New Instr at block end: "
2122 << *WaitcntDepctrInstr << '\n'
2123 : dbgs() << "applyPreexistingWaitcnt\n"
2124 << "Old Instr: " << *It << "New Instr: "
2125 << *WaitcntDepctrInstr << '\n');
2126 } else {
2127 WaitcntDepctrInstr->eraseFromParent();
2128 Modified = true;
2129 }
2130 }
2131
2132 return Modified;
2133}
2134
2135/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
2136bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2137 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2138 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2139 assert(!isNormalMode(MaxCounter));
2140
2141 bool Modified = false;
2142 const DebugLoc &DL = Block.findDebugLoc(MBBI: It);
2143
2144 // For GFX12+, we use separate wait instructions, which makes expansion
2145 // simpler
2146 if (ExpandWaitcntProfiling) {
2147 for (auto CT : inst_counter_types(MaxCounter: AMDGPU::NUM_EXTENDED_INST_CNTS)) {
2148 unsigned Count = Wait.get(T: CT);
2149 if (Count == ~0u)
2150 continue;
2151
2152 // Skip expansion for out-of-order counters - emit normal wait instead
2153 if (ScoreBrackets.counterOutOfOrder(T: CT)) {
2154 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: instrsForExtendedCounterTypes[CT]))
2155 .addImm(Val: Count);
2156 Modified = true;
2157 continue;
2158 }
2159
2160 unsigned Outstanding =
2161 std::min(a: ScoreBrackets.getOutstanding(T: CT), b: getLimit(E: CT) - 1);
2162 EmitExpandedWaitcnt(Outstanding, Target: Count, EmitWaitcnt: [&](unsigned Val) {
2163 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: instrsForExtendedCounterTypes[CT]))
2164 .addImm(Val);
2165 });
2166 Modified = true;
2167 }
2168 return Modified;
2169 }
2170
2171 // Normal behavior (no expansion)
2172 // Check for opportunities to use combined wait instructions.
2173 if (Wait.get(T: AMDGPU::DS_CNT) != ~0u) {
2174 MachineInstr *SWaitInst = nullptr;
2175
2176 if (Wait.get(T: AMDGPU::LOAD_CNT) != ~0u) {
2177 unsigned Enc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
2178
2179 SWaitInst = BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT))
2180 .addImm(Val: Enc);
2181
2182 Wait.set(T: AMDGPU::LOAD_CNT, Val: ~0u);
2183 Wait.set(T: AMDGPU::DS_CNT, Val: ~0u);
2184 } else if (Wait.get(T: AMDGPU::STORE_CNT) != ~0u) {
2185 unsigned Enc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
2186
2187 SWaitInst = BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAIT_STORECNT_DSCNT))
2188 .addImm(Val: Enc);
2189
2190 Wait.set(T: AMDGPU::STORE_CNT, Val: ~0u);
2191 Wait.set(T: AMDGPU::DS_CNT, Val: ~0u);
2192 }
2193
2194 if (SWaitInst) {
2195 Modified = true;
2196
2197 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2198 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2199 dbgs() << "New Instr: " << *SWaitInst << '\n');
2200 }
2201 }
2202
2203 // Generate an instruction for any remaining counter that needs
2204 // waiting for.
2205
2206 for (auto CT : inst_counter_types(MaxCounter: AMDGPU::NUM_EXTENDED_INST_CNTS)) {
2207 unsigned Count = Wait.get(T: CT);
2208 if (Count == ~0u)
2209 continue;
2210
2211 [[maybe_unused]] auto SWaitInst =
2212 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: instrsForExtendedCounterTypes[CT]))
2213 .addImm(Val: Count);
2214
2215 Modified = true;
2216
2217 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2218 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2219 dbgs() << "New Instr: " << *SWaitInst << '\n');
2220 }
2221
2222 if (Wait.hasWaitDepctr()) {
2223 assert(IsExpertMode);
2224 unsigned Enc =
2225 AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: Wait.get(T: AMDGPU::VM_VSRC), STI: ST);
2226 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Encoded: Enc, VaVdst: Wait.get(T: AMDGPU::VA_VDST));
2227
2228 [[maybe_unused]] auto SWaitInst =
2229 BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)).addImm(Val: Enc);
2230
2231 Modified = true;
2232
2233 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2234 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2235 dbgs() << "New Instr: " << *SWaitInst << '\n');
2236 }
2237
2238 return Modified;
2239}
2240
2241/// Generate s_waitcnt instruction to be placed before cur_Inst.
2242/// Instructions of a given type are returned in order,
2243/// but instructions of different types can complete out of order.
2244/// We rely on this in-order completion
2245/// and simply assign a score to the memory access instructions.
2246/// We keep track of the active "score bracket" to determine
2247/// if an access of a memory read requires an s_waitcnt
2248/// and if so what the value of each counter is.
2249/// The "score bracket" is bound by the lower bound and upper bound
2250/// scores (*_score_LB and *_score_ub respectively).
2251/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2252/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2253/// (GFX12+ only, where DS_CNT is a separate counter).
2254bool SIInsertWaitcnts::generateWaitcntInstBefore(
2255 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2256 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2257 LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2258
2259 assert(!isNonWaitcntMetaInst(MI));
2260
2261 AMDGPU::Waitcnt Wait;
2262 const unsigned Opc = MI.getOpcode();
2263
2264 switch (Opc) {
2265 case AMDGPU::BUFFER_WBINVL1:
2266 case AMDGPU::BUFFER_WBINVL1_SC:
2267 case AMDGPU::BUFFER_WBINVL1_VOL:
2268 case AMDGPU::BUFFER_GL0_INV:
2269 case AMDGPU::BUFFER_GL1_INV: {
2270 // FIXME: This should have already been handled by the memory legalizer.
2271 // Removing this currently doesn't affect any lit tests, but we need to
2272 // verify that nothing was relying on this. The number of buffer invalidates
2273 // being handled here should not be expanded.
2274 Wait.set(T: AMDGPU::LOAD_CNT, Val: 0);
2275 break;
2276 }
2277 case AMDGPU::SI_RETURN_TO_EPILOG:
2278 case AMDGPU::SI_RETURN:
2279 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2280 case AMDGPU::S_SETPC_B64_return: {
2281 // All waits must be resolved at call return.
2282 // NOTE: this could be improved with knowledge of all call sites or
2283 // with knowledge of the called routines.
2284 ReturnInsts.insert(V: &MI);
2285 AMDGPU::Waitcnt AllZeroWait =
2286 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2287 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2288 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2289 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2290 // no need to wait for it at function boundaries.
2291 if (ST.hasExtendedWaitCounts() &&
2292 !ScoreBrackets.hasPendingEvent(E: HWEvents::VMEM_READ_ACCESS))
2293 AllZeroWait.set(T: AMDGPU::LOAD_CNT, Val: ~0u);
2294 Wait = AllZeroWait;
2295 break;
2296 }
2297 case AMDGPU::S_ENDPGM:
2298 case AMDGPU::S_ENDPGM_SAVED: {
2299 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2300 // Technically the hardware will do this on its own if we don't, but that
2301 // might cost extra cycles compared to doing it explicitly.
2302 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2303 // have to wait for outstanding VMEM stores. In this case it can be useful
2304 // to send a message to explicitly release all VGPRs before the stores have
2305 // completed, but it is only safe to do this if there are no outstanding
2306 // scratch stores.
2307 EndPgmInsts[&MI] =
2308 !ScoreBrackets.empty(T: AMDGPU::STORE_CNT) &&
2309 !ScoreBrackets.hasPendingEvent(E: HWEvents::SCRATCH_WRITE_ACCESS);
2310 break;
2311 }
2312 case AMDGPU::S_SENDMSG:
2313 case AMDGPU::S_SENDMSGHALT: {
2314 if (ST.hasLegacyGeometry() &&
2315 ((MI.getOperand(i: 0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2316 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
2317 // Resolve vm waits before gs-done.
2318 Wait.set(T: AMDGPU::LOAD_CNT, Val: 0);
2319 break;
2320 }
2321 [[fallthrough]];
2322 }
2323 default: {
2324
2325 // Export & GDS instructions do not read the EXEC mask until after the
2326 // export is granted (which can occur well after the instruction is issued).
2327 // The shader program must flush all EXP operations on the export-count
2328 // before overwriting the EXEC mask.
2329 if (MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI)) {
2330 // Export and GDS are tracked individually, either may trigger a waitcnt
2331 // for EXEC.
2332 if (ScoreBrackets.hasPendingEvent(E: HWEvents::EXP_GPR_LOCK) ||
2333 ScoreBrackets.hasPendingEvent(E: HWEvents::EXP_PARAM_ACCESS) ||
2334 ScoreBrackets.hasPendingEvent(E: HWEvents::EXP_POS_ACCESS) ||
2335 ScoreBrackets.hasPendingEvent(E: HWEvents::GDS_GPR_LOCK)) {
2336 Wait.set(T: AMDGPU::EXP_CNT, Val: 0);
2337 }
2338 }
2339
2340 // Wait for any pending GDS instruction to complete before any
2341 // "Always GDS" instruction.
2342 if (TII.isAlwaysGDS(Opcode: Opc) && ScoreBrackets.hasPendingGDS())
2343 Wait.add(T: AMDGPU::DS_CNT, Count: ScoreBrackets.getPendingGDSWait());
2344
2345 if (MI.isCall()) {
2346 // The function is going to insert a wait on everything in its prolog.
2347 // This still needs to be careful if the call target is a load (e.g. a GOT
2348 // load). We also need to check WAW dependency with saved PC.
2349 CallInsts.insert(V: &MI);
2350 Wait = AMDGPU::Waitcnt();
2351
2352 const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
2353 if (CallAddrOp.isReg()) {
2354 ScoreBrackets.determineWaitForPhysReg(
2355 T: SmemAccessCounter, Reg: CallAddrOp.getReg().asMCReg(), Wait, MI);
2356
2357 if (const auto *RtnAddrOp =
2358 TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::dst)) {
2359 ScoreBrackets.determineWaitForPhysReg(
2360 T: SmemAccessCounter, Reg: RtnAddrOp->getReg().asMCReg(), Wait, MI);
2361 }
2362 }
2363 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2364 ScoreBrackets.tryClearSCCWriteEvent(Inst: &MI);
2365 } else {
2366 // FIXME: Should not be relying on memoperands.
2367 // Look at the source operands of every instruction to see if
2368 // any of them results from a previous memory operation that affects
2369 // its current usage. If so, an s_waitcnt instruction needs to be
2370 // emitted.
2371 // If the source operand was defined by a load, add the s_waitcnt
2372 // instruction.
2373 //
2374 // Two cases are handled for destination operands:
2375 // 1) If the destination operand was defined by a load, add the s_waitcnt
2376 // instruction to guarantee the right WAW order.
2377 // 2) If a destination operand that was used by a recent export/store ins,
2378 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2379
2380 for (const MachineMemOperand *Memop : MI.memoperands()) {
2381 const Value *Ptr = Memop->getValue();
2382 if (Memop->isStore()) {
2383 if (auto It = SLoadAddresses.find(Val: Ptr); It != SLoadAddresses.end()) {
2384 Wait.add(T: SmemAccessCounter, Count: 0);
2385 if (PDT.dominates(A: MI.getParent(), B: It->second))
2386 SLoadAddresses.erase(I: It);
2387 }
2388 }
2389 unsigned AS = Memop->getAddrSpace();
2390 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
2391 continue;
2392 // No need to wait before load from VMEM to LDS.
2393 if (TII.mayWriteLDSThroughDMA(MI))
2394 continue;
2395
2396 // LOAD_CNT is only relevant to vgpr or LDS.
2397 unsigned TID = LDSDMA_BEGIN;
2398 if (Ptr && Memop->getAAInfo()) {
2399 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2400 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2401 if (MI.mayAlias(AA, Other: *LDSDMAStores[I], UseTBAA: true)) {
2402 if ((I + 1) >= NUM_LDSDMA) {
2403 // We didn't have enough slot to track this LDS DMA store, it
2404 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2405 ScoreBrackets.determineWaitForLDSDMA(T: AMDGPU::LOAD_CNT, TID,
2406 Wait);
2407 break;
2408 }
2409
2410 ScoreBrackets.determineWaitForLDSDMA(T: AMDGPU::LOAD_CNT,
2411 TID: TID + I + 1, Wait);
2412 }
2413 }
2414 } else {
2415 ScoreBrackets.determineWaitForLDSDMA(T: AMDGPU::LOAD_CNT, TID, Wait);
2416 }
2417 if (Memop->isStore()) {
2418 ScoreBrackets.determineWaitForLDSDMA(T: AMDGPU::EXP_CNT, TID, Wait);
2419 }
2420 }
2421
2422 // Loop over use and def operands.
2423 for (const MachineOperand &Op : MI.operands()) {
2424 if (!Op.isReg())
2425 continue;
2426
2427 // If the instruction does not read tied source, skip the operand.
2428 if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
2429 continue;
2430
2431 MCPhysReg Reg = Op.getReg().asMCReg();
2432
2433 const bool IsVGPR = TRI.isVectorRegister(MRI, Reg: Op.getReg());
2434 if (IsVGPR) {
2435 // Implicit VGPR defs and uses are never a part of the memory
2436 // instructions description and usually present to account for
2437 // super-register liveness.
2438 // TODO: Most of the other instructions also have implicit uses
2439 // for the liveness accounting only.
2440 if (Op.isImplicit() && MI.mayLoadOrStore())
2441 continue;
2442
2443 ScoreBrackets.determineWaitForPhysReg(T: AMDGPU::VA_VDST, Reg, Wait, MI);
2444 if (Op.isDef())
2445 ScoreBrackets.determineWaitForPhysReg(T: AMDGPU::VM_VSRC, Reg, Wait,
2446 MI);
2447 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2448 // previous write and this write are the same type of VMEM
2449 // instruction, in which case they are (in some architectures)
2450 // guaranteed to write their results in order anyway.
2451 // Additionally check instructions where Point Sample Acceleration
2452 // might be applied.
2453 if (Op.isUse() || !updateVMCntOnly(Inst: MI) ||
2454 ScoreBrackets.hasOtherPendingVmemTypes(Reg, V: getVmemType(Inst: MI)) ||
2455 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2456 !ST.hasVmemWriteVgprInOrder()) {
2457 ScoreBrackets.determineWaitForPhysReg(T: AMDGPU::LOAD_CNT, Reg, Wait,
2458 MI);
2459 ScoreBrackets.determineWaitForPhysReg(T: AMDGPU::SAMPLE_CNT, Reg, Wait,
2460 MI);
2461 ScoreBrackets.determineWaitForPhysReg(T: AMDGPU::BVH_CNT, Reg, Wait,
2462 MI);
2463 ScoreBrackets.clearVgprVmemTypes(Reg);
2464 }
2465
2466 if (Op.isDef() ||
2467 ScoreBrackets.hasPendingEvent(E: HWEvents::EXP_LDS_ACCESS)) {
2468 ScoreBrackets.determineWaitForPhysReg(T: AMDGPU::EXP_CNT, Reg, Wait,
2469 MI);
2470 }
2471 ScoreBrackets.determineWaitForPhysReg(T: AMDGPU::DS_CNT, Reg, Wait, MI);
2472 } else if (Op.getReg() == AMDGPU::SCC) {
2473 ScoreBrackets.determineWaitForPhysReg(T: AMDGPU::KM_CNT, Reg, Wait, MI);
2474 } else {
2475 ScoreBrackets.determineWaitForPhysReg(T: SmemAccessCounter, Reg, Wait,
2476 MI);
2477 }
2478
2479 if (ST.hasWaitXcnt() && Op.isDef())
2480 ScoreBrackets.determineWaitForPhysReg(T: AMDGPU::X_CNT, Reg, Wait, MI);
2481 }
2482 }
2483 }
2484 }
2485
2486 // Ensure safety against exceptions from outstanding memory operations while
2487 // waiting for a barrier:
2488 //
2489 // * Some subtargets safely handle backing off the barrier in hardware
2490 // when an exception occurs.
2491 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2492 // there can be no outstanding memory operations during the wait.
2493 // * Subtargets with split barriers don't need to back off the barrier; it
2494 // is up to the trap handler to preserve the user barrier state correctly.
2495 //
2496 // In all other cases, ensure safety by ensuring that there are no outstanding
2497 // memory operations.
2498 if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
2499 !ST.hasBackOffBarrier()) {
2500 Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2501 }
2502
2503 // TODO: Remove this work-around, enable the assert for Bug 457939
2504 // after fixing the scheduler. Also, the Shader Compiler code is
2505 // independent of target.
2506 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
2507 ScoreBrackets.hasPendingEvent(E: HWEvents::SMEM_ACCESS)) {
2508 Wait.set(T: AMDGPU::DS_CNT, Val: 0);
2509 }
2510
2511 // Verify that the wait is actually needed.
2512 ScoreBrackets.simplifyWaitcnt(Wait);
2513
2514 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2515 // waits on VA_VDST if the instruction it would precede is not a VALU
2516 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2517 // expert scheduling mode.
2518 if (TII.isVALU(MI, /*AllowLDSDMA=*/true) && !SIInstrInfo::isLDSDMA(MI))
2519 Wait.set(T: AMDGPU::VA_VDST, Val: ~0u);
2520
2521 // Since the translation for VMEM addresses occur in-order, we can apply the
2522 // XCnt if the current instruction is of VMEM type and has a memory
2523 // dependency with another VMEM instruction in flight.
2524 if (Wait.get(T: AMDGPU::X_CNT) != ~0u && isVmemAccess(MI)) {
2525 ScoreBrackets.applyWaitcnt(Wait, T: AMDGPU::X_CNT);
2526 Wait.set(T: AMDGPU::X_CNT, Val: ~0u);
2527 }
2528
2529 // When forcing emit, we need to skip terminators because that would break the
2530 // terminators of the MBB if we emit a waitcnt between terminators.
2531 if (ForceEmitZeroFlag && !MI.isTerminator())
2532 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2533
2534 // If we force waitcnt then update Wait accordingly.
2535 for (AMDGPU::InstCounterType T : AMDGPU::inst_counter_types()) {
2536 if (!ForceEmitWaitcnt[T])
2537 continue;
2538 Wait.set(T, Val: 0);
2539 }
2540
2541 if (FlushFlags.FlushVmCnt) {
2542 for (AMDGPU::InstCounterType T :
2543 {AMDGPU::LOAD_CNT, AMDGPU::SAMPLE_CNT, AMDGPU::BVH_CNT})
2544 Wait.set(T, Val: 0);
2545 }
2546
2547 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(T: AMDGPU::DS_CNT))
2548 Wait.set(T: AMDGPU::DS_CNT, Val: 0);
2549
2550 if (ForceEmitZeroLoadFlag && Wait.get(T: AMDGPU::LOAD_CNT) != ~0u)
2551 Wait.set(T: AMDGPU::LOAD_CNT, Val: 0);
2552
2553 return generateWaitcnt(Wait, It: MI.getIterator(), Block&: *MI.getParent(), ScoreBrackets,
2554 OldWaitcntInstr);
2555}
2556
2557bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2558 MachineBasicBlock::instr_iterator It,
2559 MachineBasicBlock &Block,
2560 WaitcntBrackets &ScoreBrackets,
2561 MachineInstr *OldWaitcntInstr) {
2562 bool Modified = false;
2563
2564 if (OldWaitcntInstr)
2565 // Try to merge the required wait with preexisting waitcnt instructions.
2566 // Also erase redundant waitcnt.
2567 Modified =
2568 WCG->applyPreexistingWaitcnt(ScoreBrackets, OldWaitcntInstr&: *OldWaitcntInstr, Wait, It);
2569
2570 // ExpCnt can be merged into VINTERP.
2571 if (Wait.get(T: AMDGPU::EXP_CNT) != ~0u && It != Block.instr_end() &&
2572 SIInstrInfo::isVINTERP(MI: *It)) {
2573 MachineOperand *WaitExp = TII.getNamedOperand(MI&: *It, OperandName: AMDGPU::OpName::waitexp);
2574 if (Wait.get(T: AMDGPU::EXP_CNT) < WaitExp->getImm()) {
2575 WaitExp->setImm(Wait.get(T: AMDGPU::EXP_CNT));
2576 Modified = true;
2577 }
2578 // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
2579 ScoreBrackets.applyWaitcnt(Wait, T: AMDGPU::EXP_CNT);
2580 Wait.set(T: AMDGPU::EXP_CNT, Val: ~0u);
2581
2582 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2583 << "Update Instr: " << *It);
2584 }
2585
2586 if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
2587 Modified = true;
2588
2589 // Any counts that could have been applied to any existing waitcnt
2590 // instructions will have been done so, now deal with any remaining.
2591 ScoreBrackets.applyWaitcnt(Wait);
2592
2593 return Modified;
2594}
2595
2596bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2597 return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) ||
2598 (TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(Opc: MI.getOpcode()));
2599}
2600
2601// Return true if the next instruction is S_ENDPGM, following fallthrough
2602// blocks if necessary.
2603bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2604 MachineBasicBlock *Block) const {
2605 auto BlockEnd = Block->getParent()->end();
2606 auto BlockIter = Block->getIterator();
2607
2608 while (true) {
2609 if (It.isEnd()) {
2610 if (++BlockIter != BlockEnd) {
2611 It = BlockIter->instr_begin();
2612 continue;
2613 }
2614
2615 return false;
2616 }
2617
2618 if (!It->isMetaInstruction())
2619 break;
2620
2621 It++;
2622 }
2623
2624 assert(!It.isEnd());
2625
2626 return It->getOpcode() == AMDGPU::S_ENDPGM;
2627}
2628
2629// Add a wait after an instruction if architecture requirements mandate one.
2630bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2631 MachineBasicBlock &Block,
2632 WaitcntBrackets &ScoreBrackets) {
2633 AMDGPU::Waitcnt Wait;
2634 bool NeedsEndPGMCheck = false;
2635
2636 if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2637 Wait = WCG->getAllZeroWaitcnt(IncludeVSCnt: Inst.mayStore() &&
2638 !SIInstrInfo::isAtomicRet(MI: Inst));
2639
2640 if (TII.isAlwaysGDS(Opcode: Inst.getOpcode())) {
2641 Wait.set(T: AMDGPU::DS_CNT, Val: 0);
2642 NeedsEndPGMCheck = true;
2643 }
2644
2645 ScoreBrackets.simplifyWaitcnt(Wait);
2646
2647 auto SuccessorIt = std::next(x: Inst.getIterator());
2648 bool Result = generateWaitcnt(Wait, It: SuccessorIt, Block, ScoreBrackets,
2649 /*OldWaitcntInstr=*/nullptr);
2650
2651 if (Result && NeedsEndPGMCheck && isNextENDPGM(It: SuccessorIt, Block: &Block)) {
2652 BuildMI(BB&: Block, I: SuccessorIt, MIMD: Inst.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_NOP))
2653 .addImm(Val: 0);
2654 }
2655
2656 return Result;
2657}
2658
2659void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2660 WaitcntBrackets *ScoreBrackets) {
2661
2662 HWEvents InstEvents = AMDGPU::getEventsFor(Inst, ST, IsExpertMode);
2663 for (HWEvents E : InstEvents)
2664 ScoreBrackets->updateByEvent(E, Inst);
2665
2666 if (TII.isDS(MI: Inst) && TII.usesLGKM_CNT(MI: Inst)) {
2667 if (TII.isAlwaysGDS(Opcode: Inst.getOpcode()) ||
2668 TII.hasModifiersSet(MI: Inst, OpName: AMDGPU::OpName::gds)) {
2669 ScoreBrackets->setPendingGDS();
2670 }
2671 } else if (TII.isFLAT(MI: Inst)) {
2672 if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(MI: Inst) &&
2673 TII.mayAccessLDSThroughFlat(MI: Inst) && !SIInstrInfo::isLDSDMA(MI: Inst)) {
2674 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2675 // pointers. They do have two operands that each access global and LDS,
2676 // thus making it appear at this point that they are using a flat pointer.
2677 // Filter them out, and for the rest, generate a dependency on flat
2678 // pointers so that both VM and LGKM counters are flushed.
2679 ScoreBrackets->setPendingFlat();
2680 }
2681 } else if (Inst.isCall()) {
2682 // Act as a wait on everything, but AsyncCnt and TensorCnt are never
2683 // included in such blanket waits.
2684 ScoreBrackets->applyWaitcnt(Wait: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2685 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2686 } else if (TII.isVINTERP(MI: Inst)) {
2687 int64_t Imm = TII.getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::waitexp)->getImm();
2688 ScoreBrackets->applyWaitcnt(T: AMDGPU::EXP_CNT, Count: Imm);
2689 }
2690
2691 // Set XCNT to zero in the bracket for instructions that implicitly drain
2692 // XCNT.
2693 if (ST.hasWaitXcnt() && SIInstrInfo::isXcntDrain(MI: Inst))
2694 ScoreBrackets->applyWaitcnt(T: AMDGPU::X_CNT, Count: 0);
2695}
2696
2697bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2698 unsigned OtherScore) {
2699 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2700 unsigned OtherShifted =
2701 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2702 Score = std::max(a: MyShifted, b: OtherShifted);
2703 return OtherShifted > MyShifted;
2704}
2705
2706bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
2707 ArrayRef<CounterValueArray> OtherMarks) {
2708 bool StrictDom = false;
2709
2710 LLVM_DEBUG(dbgs() << "Merging async marks ...");
2711 // Early exit: nothing to merge when both sides are empty.
2712 if (AsyncMarks.empty() && OtherMarks.empty()) {
2713 LLVM_DEBUG(dbgs() << " nothing to merge\n");
2714 return false;
2715 }
2716 LLVM_DEBUG(dbgs() << '\n');
2717
2718 // Determine maximum length needed after merging
2719 auto MaxSize = (unsigned)std::max(a: AsyncMarks.size(), b: OtherMarks.size());
2720 MaxSize = std::min(a: MaxSize, b: MaxAsyncMarks);
2721
2722 // Keep only the most recent marks within our limit.
2723 if (AsyncMarks.size() > MaxSize)
2724 AsyncMarks.erase(CS: AsyncMarks.begin(),
2725 CE: AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2726
2727 // Pad with zero-filled marks if our list is shorter. Zero represents "no
2728 // pending async operations at this checkpoint" and acts as the identity
2729 // element for max() during merging. We pad at the beginning since the marks
2730 // need to be aligned in most-recent order.
2731 constexpr CounterValueArray ZeroMark{};
2732 AsyncMarks.insert(I: AsyncMarks.begin(), NumToInsert: MaxSize - AsyncMarks.size(), Elt: ZeroMark);
2733
2734 LLVM_DEBUG({
2735 dbgs() << "Before merge:\n";
2736 for (const auto &Mark : AsyncMarks) {
2737 llvm::interleaveComma(Mark, dbgs());
2738 dbgs() << '\n';
2739 }
2740 dbgs() << "Other marks:\n";
2741 for (const auto &Mark : OtherMarks) {
2742 llvm::interleaveComma(Mark, dbgs());
2743 dbgs() << '\n';
2744 }
2745 });
2746
2747 // Merge element-wise using the existing mergeScore function and the
2748 // appropriate MergeInfo for each counter type. Iterate only while we have
2749 // elements in both vectors.
2750 unsigned OtherSize = OtherMarks.size();
2751 unsigned OurSize = AsyncMarks.size();
2752 unsigned MergeCount = std::min(a: OtherSize, b: OurSize);
2753 // OtherMarks is empty -> OtherSize == 0 -> MergeCount == 0.
2754 // Our existing marks are the conservative result; return early to avoid
2755 // passing MergeCount == 0 to seq_inclusive which asserts Begin <= End.
2756 if (MergeCount == 0)
2757 return StrictDom;
2758 for (auto Idx : seq_inclusive<unsigned>(Begin: 1, End: MergeCount)) {
2759 for (auto T : inst_counter_types(MaxCounter: Context->MaxCounter)) {
2760 StrictDom |= mergeScore(M: MergeInfos[T], Score&: AsyncMarks[OurSize - Idx][T],
2761 OtherScore: OtherMarks[OtherSize - Idx][T]);
2762 }
2763 }
2764
2765 LLVM_DEBUG({
2766 dbgs() << "After merge:\n";
2767 for (const auto &Mark : AsyncMarks) {
2768 llvm::interleaveComma(Mark, dbgs());
2769 dbgs() << '\n';
2770 }
2771 });
2772
2773 return StrictDom;
2774}
2775
2776/// Merge the pending events and associater score brackets of \p Other into
2777/// this brackets status.
2778///
2779/// Returns whether the merge resulted in a change that requires tighter waits
2780/// (i.e. the merged brackets strictly dominate the original brackets).
2781bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2782 bool StrictDom = false;
2783
2784 // Check if "other" has keys we don't have, and create default entries for
2785 // those. If they remain empty after merging, we will clean it up after.
2786 for (auto K : Other.VMem.keys())
2787 VMem.try_emplace(Key: K);
2788 for (auto K : Other.SGPRs.keys())
2789 SGPRs.try_emplace(Key: K);
2790
2791 // Array to store MergeInfo for each counter type
2792 MergeInfo MergeInfos[AMDGPU::NUM_INST_CNTS];
2793
2794 for (auto T : inst_counter_types(MaxCounter: Context->MaxCounter)) {
2795 // Merge event flags for this counter
2796 const HWEvents &EventsForT = Context->getWaitEvents(T);
2797 const HWEvents OldEvents = PendingEvents & EventsForT;
2798 const HWEvents OtherEvents = Other.PendingEvents & EventsForT;
2799 if (!OldEvents.contains(Other: OtherEvents))
2800 StrictDom = true;
2801 PendingEvents |= OtherEvents;
2802
2803 // Merge scores for this counter
2804 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2805 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2806 const unsigned NewUB = ScoreLBs[T] + std::max(a: MyPending, b: OtherPending);
2807 if (NewUB < ScoreLBs[T])
2808 report_fatal_error(reason: "waitcnt score overflow");
2809
2810 MergeInfo &M = MergeInfos[T];
2811 M.OldLB = ScoreLBs[T];
2812 M.OtherLB = Other.ScoreLBs[T];
2813 M.MyShift = NewUB - ScoreUBs[T];
2814 M.OtherShift = NewUB - Other.ScoreUBs[T];
2815
2816 ScoreUBs[T] = NewUB;
2817
2818 if (T == AMDGPU::LOAD_CNT)
2819 StrictDom |= mergeScore(M, Score&: LastFlatLoadCnt, OtherScore: Other.LastFlatLoadCnt);
2820
2821 if (T == AMDGPU::DS_CNT) {
2822 StrictDom |= mergeScore(M, Score&: LastFlatDsCnt, OtherScore: Other.LastFlatDsCnt);
2823 StrictDom |= mergeScore(M, Score&: LastGDS, OtherScore: Other.LastGDS);
2824 }
2825
2826 if (T == AMDGPU::KM_CNT) {
2827 StrictDom |= mergeScore(M, Score&: SCCScore, OtherScore: Other.SCCScore);
2828 if (Other.hasPendingEvent(E: HWEvents::SCC_WRITE)) {
2829 if (!(OldEvents & HWEvents::SCC_WRITE)) {
2830 PendingSCCWrite = Other.PendingSCCWrite;
2831 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2832 PendingSCCWrite = nullptr;
2833 }
2834 }
2835 }
2836
2837 for (auto &[RegID, Info] : VMem)
2838 StrictDom |= mergeScore(M, Score&: Info.Scores[T], OtherScore: Other.getVMemScore(TID: RegID, T));
2839
2840 if (isSmemCounter(T)) {
2841 for (auto &[RegID, Info] : SGPRs) {
2842 auto It = Other.SGPRs.find(Val: RegID);
2843 unsigned OtherScore = (It != Other.SGPRs.end()) ? It->second.get(T) : 0;
2844 StrictDom |= mergeScore(M, Score&: Info.get(T), OtherScore);
2845 }
2846 }
2847 }
2848
2849 for (auto &[TID, Info] : VMem) {
2850 if (auto It = Other.VMem.find(Val: TID); It != Other.VMem.end()) {
2851 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
2852 StrictDom |= NewVmemTypes != Info.VMEMTypes;
2853 Info.VMEMTypes = NewVmemTypes;
2854 }
2855 }
2856
2857 StrictDom |= mergeAsyncMarks(MergeInfos, OtherMarks: Other.AsyncMarks);
2858 for (auto T : inst_counter_types(MaxCounter: Context->MaxCounter))
2859 StrictDom |= mergeScore(M: MergeInfos[T], Score&: AsyncScore[T], OtherScore: Other.AsyncScore[T]);
2860
2861 purgeEmptyTrackingData();
2862 return StrictDom;
2863}
2864
2865static bool isWaitInstr(MachineInstr &Inst) {
2866 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Inst.getOpcode());
2867 return Opcode == AMDGPU::S_WAITCNT ||
2868 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(i: 0).isReg() &&
2869 Inst.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL) ||
2870 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2871 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2872 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2873 Opcode == AMDGPU::WAIT_ASYNCMARK ||
2874 AMDGPU::counterTypeForInstr(Opcode).has_value();
2875}
2876
2877void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
2878 MachineBasicBlock::iterator I,
2879 bool ExpertMode) const {
2880 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
2881 Values: AMDGPU::Hwreg::ID_SCHED_MODE, Values: AMDGPU::Hwreg::HwregOffset::Default, Values: 2);
2882 BuildMI(BB&: MBB, I, MIMD: DebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
2883 .addImm(Val: ExpertMode ? 2 : 0)
2884 .addImm(Val: EncodedReg);
2885}
2886
2887namespace {
2888// TODO: Remove this work-around after fixing the scheduler.
2889// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()
2890// and ST.partialVCCWritesUpdateVCCZ().
2891// i. VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may
2892// corrupt vccz bit, so when we detect that an instruction may read from
2893// a corrupt vccz bit, we need to:
2894// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2895// operations to complete.
2896// 2. Recompute the correct value of vccz by writing the current value
2897// of vcc back to vcc.
2898// ii. Partial writes to vcc don't update vccz, so we need to recompute the
2899// correct value of vccz by reading vcc and writing it back to vcc.
2900// No waitcnt is needed in this case.
2901class VCCZWorkaround {
2902 const WaitcntBrackets &ScoreBrackets;
2903 const GCNSubtarget &ST;
2904 const SIInstrInfo &TII;
2905 const SIRegisterInfo &TRI;
2906 bool VCCZCorruptionBug = false;
2907 bool VCCZNotUpdatedByPartialWrites = false;
2908 /// vccz could be incorrect at a basic block boundary if a predecessor wrote
2909 /// to vcc and then issued an smem load, so initialize to true.
2910 bool MustRecomputeVCCZ = true;
2911
2912public:
2913 VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,
2914 const SIInstrInfo &TII, const SIRegisterInfo &TRI)
2915 : ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {
2916 VCCZCorruptionBug = ST.hasReadVCCZBug();
2917 VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();
2918 }
2919 /// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,
2920 /// then emit a vccz recompute instruction before \p MI. This needs to be
2921 /// called on every instruction in the basic block because it also tracks the
2922 /// state and updates MustRecomputeVCCZ accordingly. Returns true if it
2923 /// modified the IR.
2924 bool tryRecomputeVCCZ(MachineInstr &MI) {
2925 // No need to run this if neither bug is present.
2926 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
2927 return false;
2928
2929 // If MI is an SMEM and it can corrupt vccz on this target, then we need
2930 // both to emit a waitcnt and to recompute vccz.
2931 // But we don't actually emit a waitcnt here. This is done in
2932 // generateWaitcntInstBefore() because it tracks all the necessary waitcnt
2933 // state, and can either skip emitting a waitcnt if there is already one in
2934 // the IR, or emit an "optimized" combined waitcnt.
2935 // If this is an smem read, it could complete and clobber vccz at any time.
2936 MustRecomputeVCCZ |= VCCZCorruptionBug && TII.isSMRD(MI);
2937
2938 // If the target partial vcc writes don't update vccz, and MI is such an
2939 // instruction then we must recompute vccz.
2940 // Note: We are using PartiallyWritesToVCCOpt optional to avoid calling
2941 // `definesRegister()` more than needed, because it's not very cheap.
2942 std::optional<bool> PartiallyWritesToVCCOpt;
2943 auto PartiallyWritesToVCC = [](MachineInstr &MI) {
2944 return MI.definesRegister(Reg: AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2945 MI.definesRegister(Reg: AMDGPU::VCC_HI, /*TRI=*/nullptr);
2946 };
2947 if (VCCZNotUpdatedByPartialWrites) {
2948 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
2949 // If this is a partial VCC write but won't update vccz, then we must
2950 // recompute vccz.
2951 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
2952 }
2953
2954 // If MI is a vcc write with no pending smem, or there is a pending smem
2955 // but the target does not suffer from the vccz corruption bug, then we
2956 // don't need to recompute vccz as this write will recompute it anyway.
2957 if (!ScoreBrackets.hasPendingEvent(E: HWEvents::SMEM_ACCESS) ||
2958 !VCCZCorruptionBug) {
2959 // Compute PartiallyWritesToVCCOpt if we haven't done so already.
2960 if (!PartiallyWritesToVCCOpt)
2961 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
2962 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
2963 MI.definesRegister(Reg: AMDGPU::VCC, /*TRI=*/nullptr);
2964 // If we write to the full vcc or we write partially and the target
2965 // updates vccz on partial writes, then vccz will be updated correctly.
2966 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
2967 *PartiallyWritesToVCCOpt);
2968 if (UpdatesVCCZ)
2969 MustRecomputeVCCZ = false;
2970 }
2971
2972 // If MI is a branch that reads VCCZ then emit a waitcnt and a vccz
2973 // restore instruction if either is needed.
2974 if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {
2975 // Recompute the vccz bit. Any time a value is written to vcc, the vccz
2976 // bit is updated, so we can restore the bit by reading the value of vcc
2977 // and then writing it back to the register.
2978 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(),
2979 MCID: TII.get(Opcode: ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2980 DestReg: TRI.getVCC())
2981 .addReg(RegNo: TRI.getVCC());
2982 MustRecomputeVCCZ = false;
2983 return true;
2984 }
2985 return false;
2986 }
2987};
2988
2989} // namespace
2990
2991// Generate s_waitcnt instructions where needed.
2992bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2993 MachineBasicBlock &Block,
2994 WaitcntBrackets &ScoreBrackets) {
2995 bool Modified = false;
2996
2997 LLVM_DEBUG({
2998 dbgs() << "*** Begin Block: ";
2999 Block.printName(dbgs());
3000 ScoreBrackets.dump();
3001 });
3002 VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);
3003
3004 // Walk over the instructions.
3005 MachineInstr *OldWaitcntInstr = nullptr;
3006
3007 // NOTE: We may append instrs after Inst while iterating.
3008 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
3009 E = Block.instr_end();
3010 Iter != E; ++Iter) {
3011 MachineInstr &Inst = *Iter;
3012 if (isNonWaitcntMetaInst(MI: Inst))
3013 continue;
3014 // Track pre-existing waitcnts that were added in earlier iterations or by
3015 // the memory legalizer.
3016 if (isWaitInstr(Inst) ||
3017 (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3018 if (!OldWaitcntInstr)
3019 OldWaitcntInstr = &Inst;
3020 continue;
3021 }
3022
3023 PreheaderFlushFlags FlushFlags;
3024 if (Block.getFirstTerminator() == Inst)
3025 FlushFlags = isPreheaderToFlush(MBB&: Block, ScoreBrackets);
3026
3027 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
3028 Modified |= generateWaitcntInstBefore(MI&: Inst, ScoreBrackets, OldWaitcntInstr,
3029 FlushFlags);
3030 OldWaitcntInstr = nullptr;
3031
3032 if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
3033 // Asyncmarks record the current wait state and so should not allow
3034 // waitcnts that occur after them to be merged into waitcnts that occur
3035 // before.
3036 ScoreBrackets.recordAsyncMark(Inst);
3037 continue;
3038 }
3039
3040 if (TII.isSMRD(MI: Inst)) {
3041 for (const MachineMemOperand *Memop : Inst.memoperands()) {
3042 // No need to handle invariant loads when avoiding WAR conflicts, as
3043 // there cannot be a vector store to the same memory location.
3044 if (!Memop->isInvariant()) {
3045 const Value *Ptr = Memop->getValue();
3046 SLoadAddresses.insert(KV: std::pair(Ptr, Inst.getParent()));
3047 }
3048 }
3049 }
3050
3051 updateEventWaitcntAfter(Inst, ScoreBrackets: &ScoreBrackets);
3052
3053 // Note: insertForcedWaitAfter() may add instrs after Iter that need to be
3054 // visited by the loop.
3055 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
3056
3057 LLVM_DEBUG({
3058 Inst.print(dbgs());
3059 ScoreBrackets.dump();
3060 });
3061
3062 // If the target suffers from the vccz bugs, this may emit the necessary
3063 // vccz recompute instruction before \p Inst if needed.
3064 Modified |= VCCZW.tryRecomputeVCCZ(MI&: Inst);
3065 }
3066
3067 // Flush counters at the end of the block if needed (for preheaders with no
3068 // terminator).
3069 AMDGPU::Waitcnt Wait;
3070 if (Block.getFirstTerminator() == Block.end()) {
3071 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(MBB&: Block, ScoreBrackets);
3072 if (FlushFlags.FlushVmCnt) {
3073 if (ScoreBrackets.hasPendingEvent(T: AMDGPU::LOAD_CNT))
3074 Wait.set(T: AMDGPU::LOAD_CNT, Val: 0);
3075 if (ScoreBrackets.hasPendingEvent(T: AMDGPU::SAMPLE_CNT))
3076 Wait.set(T: AMDGPU::SAMPLE_CNT, Val: 0);
3077 if (ScoreBrackets.hasPendingEvent(T: AMDGPU::BVH_CNT))
3078 Wait.set(T: AMDGPU::BVH_CNT, Val: 0);
3079 }
3080 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(T: AMDGPU::DS_CNT))
3081 Wait.set(T: AMDGPU::DS_CNT, Val: 0);
3082 }
3083
3084 // Combine or remove any redundant waitcnts at the end of the block.
3085 Modified |= generateWaitcnt(Wait, It: Block.instr_end(), Block, ScoreBrackets,
3086 OldWaitcntInstr);
3087
3088 LLVM_DEBUG({
3089 dbgs() << "*** End Block: ";
3090 Block.printName(dbgs());
3091 ScoreBrackets.dump();
3092 });
3093
3094 return Modified;
3095}
3096
3097bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
3098 if (Block.size() <= 1)
3099 return false;
3100 // The Memory Legalizer conservatively inserts a soft xcnt before each
3101 // atomic RMW operation. However, for sequences of back-to-back atomic
3102 // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3103 // the redundant soft xcnts.
3104 bool Modified = false;
3105 // Remember the last atomic with a soft xcnt right before it.
3106 MachineInstr *LastAtomicWithSoftXcnt = nullptr;
3107
3108 for (MachineInstr &MI : drop_begin(RangeOrContainer&: Block)) {
3109 // Ignore last atomic if non-LDS VMEM and SMEM.
3110 bool IsLDS =
3111 TII.isDS(MI) || (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
3112 if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
3113 LastAtomicWithSoftXcnt = nullptr;
3114
3115 bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
3116 MI.mayLoad() && MI.mayStore();
3117 MachineInstr &PrevMI = *MI.getPrevNode();
3118 // This is an atomic with a soft xcnt.
3119 if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3120 // If we have already found an atomic with a soft xcnt, remove this soft
3121 // xcnt as it's redundant.
3122 if (LastAtomicWithSoftXcnt) {
3123 PrevMI.eraseFromParent();
3124 Modified = true;
3125 }
3126 LastAtomicWithSoftXcnt = &MI;
3127 }
3128 }
3129 return Modified;
3130}
3131
3132// Return flags indicating which counters should be flushed in the preheader.
3133PreheaderFlushFlags
3134SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3135 const WaitcntBrackets &ScoreBrackets) {
3136 auto [Iterator, IsInserted] =
3137 PreheadersToFlush.try_emplace(Key: &MBB, Args: PreheaderFlushFlags());
3138 if (!IsInserted)
3139 return Iterator->second;
3140
3141 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3142 if (!Succ)
3143 return PreheaderFlushFlags();
3144
3145 MachineLoop *Loop = MLI.getLoopFor(BB: Succ);
3146 if (!Loop)
3147 return PreheaderFlushFlags();
3148
3149 if (Loop->getLoopPreheader() == &MBB) {
3150 Iterator->second = getPreheaderFlushFlags(ML: Loop, Brackets: ScoreBrackets);
3151 return Iterator->second;
3152 }
3153
3154 return PreheaderFlushFlags();
3155}
3156
3157bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3158 if (SIInstrInfo::isFLAT(MI))
3159 return TII.mayAccessVMEMThroughFlat(MI);
3160 return SIInstrInfo::isVMEM(MI);
3161}
3162
3163bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3164 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3165}
3166
3167// Check if instruction is a store to LDS that is counted via DSCNT
3168// (where that counter exists).
3169bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3170 return MI.mayStore() && SIInstrInfo::isDS(MI);
3171}
3172
3173// Return flags indicating which counters should be flushed in the preheader of
3174// the given loop. We currently decide to flush in the following situations:
3175// For VMEM (FlushVmCnt):
3176// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3177// vgpr containing a value that is loaded outside of the loop. (Only on
3178// targets with no vscnt counter).
3179// 2. The loop contains vmem load(s), but the loaded values are not used in the
3180// loop, and at least one use of a vgpr containing a value that is loaded
3181// outside of the loop.
3182// For DS (FlushDsCnt, GFX12+ only):
3183// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3184// a value that is DS read outside of the loop.
3185// 4. The loop contains DS read(s), loaded values are not used in the same
3186// iteration but in the next iteration (prefetch pattern), and at least one
3187// use of a vgpr containing a value that is DS read outside of the loop.
3188// Flushing in preheader reduces wait overhead if the wait requirement in
3189// iteration 1 would otherwise be more strict (but unfortunately preheader
3190// flush decision is taken before knowing that).
3191// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
3192// tracking. Some DS reads may be used in the same iteration (creating
3193// "flush points"), but others remain unflushed at the backedge. When a DS
3194// read is consumed in the same iteration, it and all prior reads are
3195// "flushed" (FIFO order). No DS writes are allowed in the loop.
3196// TODO: Find a way to extend to multi-block loops.
// Walks every instruction of every block in ML once and decides whether the
// preheader should flush the VMEM counter (FlushVmCnt) and/or the DS counter
// (FlushDsCnt). Brackets is only queried through hasPendingVMEM(), to learn
// whether a VGPR used in the loop still has a load or DS read pending.
// Every early `return Flags` hands back the untouched, default-constructed
// flags; those exits are taken only once none of the decisions at the bottom
// of the function could set a flag any more.
3197PreheaderFlushFlags
3198SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
3199 const WaitcntBrackets &Brackets) {
3200 PreheaderFlushFlags Flags;
3201 bool HasVMemLoad = false;
3202 bool HasVMemStore = false;
3203 bool UsesVgprVMEMLoadedOutside = false;
3204 bool UsesVgprDSReadOutside = false;
3205 bool VMemInvalidated = false;
3206 // DS optimization only applies to GFX12+ where DS_CNT is separate.
3207 // Tracking status for "no DS read in loop" or "pure DS prefetch
3208 // (use only in next iteration)".
3209 bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();
 // Register units used so far, defined by a VMEM load so far, and defined by
 // a DS read so far, in instruction order across the loop blocks.
3210 DenseSet<MCRegUnit> VgprUse;
3211 DenseSet<MCRegUnit> VgprDefVMEM;
3212 DenseSet<MCRegUnit> VgprDefDS;
3213
3214 // Track DS reads for prefetch pattern with flush points (single-block only).
3215 // Keeps track of the last DS read (position counted from the top of the loop)
3216 // to each VGPR. Read is considered consumed (and thus needs flushing) if
3217 // the dest register has a use or is overwritten (by any later operations).
3218 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
 // DSReadPosition is the number of DS reads seen so far, so recorded positions
 // start at 1 and LastDSFlushPosition == 0 means "nothing flushed yet".
3219 unsigned DSReadPosition = 0;
3220 bool IsSingleBlock = ML->getNumBlocks() == 1;
3221 bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;
3222 unsigned LastDSFlushPosition = 0;
3223
3224 for (MachineBasicBlock *MBB : ML->blocks()) {
3225 for (MachineInstr &MI : *MBB) {
3226 if (isVMEMOrFlatVMEM(MI)) {
3227 HasVMemLoad |= MI.mayLoad();
3228 HasVMemStore |= MI.mayStore();
3229 }
3230 // TODO: Can we relax DSStore check? There may be cases where
3231 // these DS stores are drained prior to the end of MBB (or loop).
3232 if (mayStoreIncrementingDSCNT(MI)) {
3233 // Early exit if none of the optimizations are feasible.
3234 // Otherwise, set tracking status appropriately and continue.
 // A DS store disables both DS optimizations, so once the VMEM one is
 // already invalidated there is nothing left to decide.
3235 if (VMemInvalidated)
3236 return Flags;
3237 TrackSimpleDSOpt = false;
3238 TrackDSFlushPoint = false;
3239 }
3240 bool IsDSRead = isDSRead(MI);
3241 if (IsDSRead)
3242 ++DSReadPosition;
3243
3244 // Helper: if RU has a pending DS read, update LastDSFlushPosition
 // Does nothing once flush point tracking has been disabled.
3245 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3246 if (!TrackDSFlushPoint)
3247 return;
3248 if (auto It = LastDSReadPositionMap.find(Val: RU);
3249 It != LastDSReadPositionMap.end()) {
3250 // RU defined by DSRead is used or overwritten. Need to complete
3251 // the read, if not already implied by a later DSRead (to any RU)
3252 // needing to complete in FIFO order.
3253 LastDSFlushPosition = std::max(a: LastDSFlushPosition, b: It->second);
3254 }
3255 };
3256
3257 for (const MachineOperand &Op : MI.all_uses()) {
3258 if (Op.isDebug() || !TRI.isVectorRegister(MRI, Reg: Op.getReg()))
3259 continue;
3260 // Vgpr use
3261 for (MCRegUnit RU : TRI.regunits(Reg: Op.getReg().asMCReg())) {
3262 // If we find a register that is loaded inside the loop, 1. and 2.
3263 // are invalidated.
3264 if (VgprDefVMEM.contains(V: RU))
3265 VMemInvalidated = true;
3266
3267 // Check for DS reads used inside the loop
3268 if (VgprDefDS.contains(V: RU))
3269 TrackSimpleDSOpt = false;
3270
3271 // Early exit if all optimizations are invalidated
3272 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3273 return Flags;
3274
3275 // Check for flush points (DS read used in same iteration)
3276 updateDSReadFlushTracking(RU);
3277
3278 VgprUse.insert(V: RU);
3279 // Check if this register has a pending VMEM load from outside the
3280 // loop (value loaded outside and used inside).
3281 VMEMID ID = toVMEMID(RU);
3282 if (Brackets.hasPendingVMEM(ID, T: AMDGPU::LOAD_CNT) ||
3283 Brackets.hasPendingVMEM(ID, T: AMDGPU::SAMPLE_CNT) ||
3284 Brackets.hasPendingVMEM(ID, T: AMDGPU::BVH_CNT))
3285 UsesVgprVMEMLoadedOutside = true;
3286 // Check if loaded outside the loop via DS (not VMEM/FLAT).
3287 // Only consider it a DS read if there's no pending VMEM load for
3288 // this register, since FLAT can set both counters.
3289 else if (Brackets.hasPendingVMEM(ID, T: AMDGPU::DS_CNT))
3290 UsesVgprDSReadOutside = true;
3291 }
3292 }
3293
3294 // VMem load vgpr def
3295 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
 // Every def operand is recorded here; unlike the use scan above there
 // is no isVectorRegister() filter.
3296 for (const MachineOperand &Op : MI.all_defs()) {
3297 for (MCRegUnit RU : TRI.regunits(Reg: Op.getReg().asMCReg())) {
3298 // If we find a register that is loaded inside the loop, 1. and 2.
3299 // are invalidated.
3300 if (VgprUse.contains(V: RU))
3301 VMemInvalidated = true;
3302 VgprDefVMEM.insert(V: RU);
3303 }
3304 }
3305 // Early exit if all optimizations are invalidated
3306 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3307 return Flags;
3308 }
3309
3310 // DS read vgpr def
3311 // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo).
3312 // If USE comes before DEF, it's the prefetch pattern (use value from
3313 // previous iteration, read for next iteration). We should still flush
3314 // in preheader so iteration 1 doesn't need to wait inside the loop.
3315 // Only invalidate when DEF comes before USE (same-iteration consumption,
3316 // checked above when processing uses).
 // The def scan also runs for non-DS instructions while flush point
 // tracking is active, because any overwrite of a pending DS read
 // destination counts as a flush point.
3317 if (IsDSRead || TrackDSFlushPoint) {
3318 for (const MachineOperand &Op : MI.all_defs()) {
3319 if (!TRI.isVectorRegister(MRI, Reg: Op.getReg()))
3320 continue;
3321 for (MCRegUnit RU : TRI.regunits(Reg: Op.getReg().asMCReg())) {
3322 // Check for overwrite of pending DS read (flush point) by any
3323 // instruction
 // This runs before the map update below, so a DS read that redefines
 // RU is checked against the earlier read recorded for RU, not
 // against itself.
3324 updateDSReadFlushTracking(RU);
3325 if (IsDSRead) {
3326 VgprDefDS.insert(V: RU);
3327 if (TrackDSFlushPoint)
3328 LastDSReadPositionMap[RU] = DSReadPosition;
3329 }
3330 }
3331 }
3332 }
3333 }
3334 }
3335
3336 // VMEM flush decision
3337 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3338 ((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3339 (HasVMemLoad && ST.hasVmemWriteVgprInOrder())))
3340 Flags.FlushVmCnt = true;
3341
3342 // DS flush decision:
3343 // Simple DS Opt: flush if loop uses DS read values from outside
3344 // and either has no DS reads in the loop, or DS reads whose results
3345 // are not used in the loop.
3346 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3347 // Prefetch with flush points: some DS reads used in same iteration,
3348 // but unflushed reads remain at backedge
 // True when at least one DS read was seen after the last recorded flush
 // point (including the case of DS reads with no flush point at all).
3349 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3350 bool DSFlushPointPrefetch =
3351 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3352
3353 if (SimpleDSOpt || DSFlushPointPrefetch)
3354 Flags.FlushDsCnt = true;
3355
3356 return Flags;
3357}
3358
3359bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3360 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3361 auto &PDT =
3362 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3363 AliasAnalysis *AA = nullptr;
3364 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3365 AA = &AAR->getAAResults();
3366
3367 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3368}
3369
3370PreservedAnalyses
3371SIInsertWaitcntsPass::run(MachineFunction &MF,
3372 MachineFunctionAnalysisManager &MFAM) {
3373 auto &MLI = MFAM.getResult<MachineLoopAnalysis>(IR&: MF);
3374 auto &PDT = MFAM.getResult<MachinePostDominatorTreeAnalysis>(IR&: MF);
3375 auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(IR&: MF)
3376 .getManager()
3377 .getCachedResult<AAManager>(IR&: MF.getFunction());
3378
3379 if (!SIInsertWaitcnts(MLI, PDT, AA, MF).run())
3380 return PreservedAnalyses::all();
3381
3382 return getMachineFunctionPassPreservedAnalyses()
3383 .preserveSet<CFGAnalyses>()
3384 .preserve<AAManager>();
3385}
3386
// Pass driver for one machine function. In order, it:
//  - initializes the hardware limits, the counter range and the waitcnt
//    generator (WCG) for this function;
//  - for non-entry, non-naked functions, inserts full waits at the top of the
//    entry block and seeds that block's incoming bracket state;
//  - runs insertWaitcntInBlock() over the blocks until no block is dirty on a
//    backedge any more;
//  - performs the end-of-program fixups: S_DCACHE_WB after scalar stores,
//    expert scheduling mode toggles, and VGPR deallocation.
// Returns the accumulated Modified flag.
3387bool SIInsertWaitcnts::run() {
3388 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3389
3390 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU());
3391
3392 // Initialize hardware limits first, as they're needed by the generators.
3393 Limits = AMDGPU::HardwareLimits(IV);
3394
3395 if (ST.hasExtendedWaitCounts()) {
 // An explicit command-line occurrence of ExpertSchedulingModeFlag takes
 // precedence over the "amdgpu-expert-scheduling-mode" function attribute.
3396 IsExpertMode = ST.hasExpertSchedulingMode() &&
3397 (ExpertSchedulingModeFlag.getNumOccurrences()
3398 ? ExpertSchedulingModeFlag
3399 : MF.getFunction()
3400 .getFnAttribute(Kind: "amdgpu-expert-scheduling-mode")
3401 .getValueAsBool());
3402 MaxCounter = IsExpertMode ? AMDGPU::NUM_EXPERT_INST_CNTS
3403 : AMDGPU::NUM_EXTENDED_INST_CNTS;
3404 // Initialize WCG per MF. It contains state that depends on MF attributes.
3405 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(args&: MF, args&: MaxCounter, args&: Limits,
3406 args&: IsExpertMode);
3407 } else {
3408 MaxCounter = AMDGPU::NUM_NORMAL_INST_CNTS;
3409 // Initialize WCG per MF. It contains state that depends on MF attributes.
3410 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
3411 args&: MF, args: AMDGPU::NUM_NORMAL_INST_CNTS, args&: Limits);
3412 }
3413
3414 SmemAccessCounter = getCounterFromEvent(E: HWEvents::SMEM_ACCESS);
3415
3416 bool Modified = false;
3417
3418 MachineBasicBlock &EntryBB = MF.front();
3419
3420 if (!MFI->isEntryFunction() &&
3421 !MF.getFunction().hasFnAttribute(Kind: Attribute::Naked)) {
3422 // Wait for any outstanding memory operations that the input registers may
3423 // depend on. We can't track them and it's better to do the wait after the
3424 // costly call sequence.
3425
3426 // TODO: Could insert earlier and schedule more liberally with operations
3427 // that only use caller preserved registers.
 // Skip leading meta instructions; the waits go in front of the first
 // non-meta instruction (or at the end of an all-meta block).
3428 MachineBasicBlock::iterator I = EntryBB.begin();
3429 while (I != EntryBB.end() && I->isMetaInstruction())
3430 ++I;
3431
3432 if (ST.hasExtendedWaitCounts()) {
 // The combined S_WAIT_LOADCNT_DSCNT is emitted first; the loop below
 // then skips LOAD_CNT and DS_CNT, and also STORE_CNT, X_CNT, ASYNC_CNT
 // and TENSOR_CNT, which get no entry wait here.
3433 BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT))
3434 .addImm(Val: 0);
3435 for (auto CT : inst_counter_types(MaxCounter: AMDGPU::NUM_EXTENDED_INST_CNTS)) {
3436 if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT ||
3437 CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT ||
3438 CT == AMDGPU::ASYNC_CNT || CT == AMDGPU::TENSOR_CNT)
3439 continue;
3440
 // Subtargets without image instructions get no wait for the export,
 // sample and BVH counters.
3441 if (!ST.hasImageInsts() &&
3442 (CT == AMDGPU::EXP_CNT || CT == AMDGPU::SAMPLE_CNT ||
3443 CT == AMDGPU::BVH_CNT))
3444 continue;
3445
3446 BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(),
3447 MCID: TII.get(Opcode: instrsForExtendedCounterTypes[CT]))
3448 .addImm(Val: 0);
3449 }
3450 if (IsExpertMode) {
 // One S_WAITCNT_DEPCTR with both the VA_VDST and VM_VSRC fields
 // encoded as 0.
3451 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0, STI: ST);
3452 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Encoded: Enc, VmVsrc: 0);
3453 BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3454 .addImm(Val: Enc);
3455 }
3456 } else {
3457 BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: 0);
3458 }
3459
 // Seed the entry block's incoming state with brackets initialized by
 // setStateOnFunctionEntryOrReturn().
3460 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(args: this);
3461 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3462 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3463
3464 Modified = true;
3465 }
3466
3467 // Keep iterating over the blocks in reverse post order, inserting and
3468 // updating s_waitcnt where needed, until a fix point is reached.
 // NOTE(review): the backedge tests `SuccBII <= BII` below depend on
 // BlockInfos iterating in this insertion (reverse post) order — presumably
 // it is an insertion-ordered map; confirm at its declaration.
3469 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3470 BlockInfos.try_emplace(Key: MBB);
3471
 // Working bracket state, reused across blocks. It is null at the start and
 // again whenever it has been moved into a successor's Incoming slot.
3472 std::unique_ptr<WaitcntBrackets> Brackets;
3473 bool Repeat;
3474 do {
3475 Repeat = false;
3476
3477 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3478 ++BII) {
3479 MachineBasicBlock *MBB = BII->first;
3480 BlockInfo &BI = BII->second;
3481 if (!BI.Dirty)
3482 continue;
3483
 // Start from this block's incoming state if it has one, otherwise from a
 // freshly constructed bracket set.
3484 if (BI.Incoming) {
3485 if (!Brackets)
3486 Brackets = std::make_unique<WaitcntBrackets>(args&: *BI.Incoming);
3487 else
3488 *Brackets = *BI.Incoming;
3489 } else {
3490 if (!Brackets) {
3491 Brackets = std::make_unique<WaitcntBrackets>(args: this);
3492 } else {
3493 // Reinitialize in-place. N.B. do not do this by assigning from a
3494 // temporary because the WaitcntBrackets class is large and it could
3495 // cause this function to use an unreasonable amount of stack space.
3496 Brackets->~WaitcntBrackets();
3497 new (Brackets.get()) WaitcntBrackets(this);
3498 }
3499 }
3500
3501 if (ST.hasWaitXcnt())
3502 Modified |= removeRedundantSoftXcnts(Block&: *MBB);
3503 Modified |= insertWaitcntInBlock(MF, Block&: *MBB, ScoreBrackets&: *Brackets);
3504 BI.Dirty = false;
3505
 // Propagate the outgoing state to the successors only if something is
 // still pending at the end of the block.
3506 if (Brackets->hasPendingEvent()) {
 // The first successor without incoming state receives Brackets by move
 // (after the loop, so Brackets stays usable inside it); every further
 // such successor gets a copy.
3507 BlockInfo *MoveBracketsToSucc = nullptr;
3508 for (MachineBasicBlock *Succ : MBB->successors()) {
3509 auto *SuccBII = BlockInfos.find(Key: Succ);
3510 BlockInfo &SuccBI = SuccBII->second;
3511 if (!SuccBI.Incoming) {
3512 SuccBI.Dirty = true;
 // A successor at or before the current position has already been
 // passed in this sweep, so another sweep is required.
3513 if (SuccBII <= BII) {
3514 LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
3515 Repeat = true;
3516 }
3517 if (!MoveBracketsToSucc) {
3518 MoveBracketsToSucc = &SuccBI;
3519 } else {
3520 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(args&: *Brackets);
3521 }
3522 } else {
3523 LLVM_DEBUG({
3524 dbgs() << "Try to merge ";
3525 MBB->printName(dbgs());
3526 dbgs() << " into ";
3527 Succ->printName(dbgs());
3528 dbgs() << '\n';
3529 });
 // The successor is revisited only when merge() reports true.
3530 if (SuccBI.Incoming->merge(Other: *Brackets)) {
3531 SuccBI.Dirty = true;
3532 if (SuccBII <= BII) {
3533 LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
3534 Repeat = true;
3535 }
3536 }
3537 }
3538 }
3539 if (MoveBracketsToSucc)
3540 MoveBracketsToSucc->Incoming = std::move(Brackets);
3541 }
3542 }
3543 } while (Repeat);
3544
3545 if (ST.hasScalarStores()) {
3546 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3547 bool HaveScalarStores = false;
3548
 // One scan of the whole function: note whether any scalar store exists and
 // collect the blocks containing S_ENDPGM or SI_RETURN_TO_EPILOG.
3549 for (MachineBasicBlock &MBB : MF) {
3550 for (MachineInstr &MI : MBB) {
3551 if (!HaveScalarStores && TII.isScalarStore(MI))
3552 HaveScalarStores = true;
3553
3554 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3555 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3556 EndPgmBlocks.push_back(Elt: &MBB);
3557 }
3558 }
3559
3560 if (HaveScalarStores) {
3561 // If scalar writes are used, the cache must be flushed or else the next
3562 // wave to reuse the same scratch memory can be clobbered.
3563 //
3564 // Insert s_dcache_wb at wave termination points if there were any scalar
3565 // stores, and only if the cache hasn't already been flushed. This could
3566 // be improved by looking across blocks for flushes in postdominating
3567 // blocks from the stores but an explicitly requested flush is probably
3568 // very rare.
3569 for (MachineBasicBlock *MBB : EndPgmBlocks) {
 // True only while an S_DCACHE_WB has been seen in this block with no
 // scalar store after it.
3570 bool SeenDCacheWB = false;
3571
3572 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
3573 I != E; ++I) {
3574 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3575 SeenDCacheWB = true;
3576 else if (TII.isScalarStore(MI: *I))
3577 SeenDCacheWB = false;
3578
3579 // FIXME: It would be better to insert this before a waitcnt if any.
3580 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3581 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3582 !SeenDCacheWB) {
3583 Modified = true;
 // Inserted directly in front of the terminating instruction I.
3584 BuildMI(BB&: *MBB, I, MIMD: I->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_DCACHE_WB));
3585 }
3586 }
3587 }
3588 }
3589 }
3590
3591 if (IsExpertMode) {
3592 // Enable expert scheduling on function entry. To satisfy ABI requirements
3593 // and to allow calls between function with different expert scheduling
3594 // settings, disable it around calls and before returns.
3595
3596 MachineBasicBlock::iterator I = EntryBB.begin();
3597 while (I != EntryBB.end() && I->isMetaInstruction())
3598 ++I;
3599 setSchedulingMode(MBB&: EntryBB, I, ExpertMode: true);
3600
 // CallInsts and ReturnInsts are not filled in by this function; they are
 // only consumed here. Expert mode is switched off right before each call
 // and back on right after it.
3601 for (MachineInstr *MI : CallInsts) {
3602 MachineBasicBlock &MBB = *MI->getParent();
3603 setSchedulingMode(MBB, I: MI, ExpertMode: false);
3604 setSchedulingMode(MBB, I: std::next(x: MI->getIterator()), ExpertMode: true);
3605 }
3606
3607 for (MachineInstr *MI : ReturnInsts)
3608 setSchedulingMode(MBB&: *MI->getParent(), I: MI, ExpertMode: false);
3609
3610 Modified = true;
3611 }
3612
3613 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3614 // This is done in different ways depending on how the VGPRs were allocated
3615 // (i.e. whether we're in dynamic VGPR mode or not).
3616 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
3617 // waveslot limited kernel runs slower with the deallocation.
 // Both branches are skipped entirely when WCG reports isOptNone().
3618 if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
 // Dynamic VGPR mode: S_ALLOC_VGPR 0 before every recorded instruction,
 // regardless of the flag stored with it.
3619 for (auto [MI, _] : EndPgmInsts) {
3620 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3621 MCID: TII.get(Opcode: AMDGPU::S_ALLOC_VGPR))
3622 .addImm(Val: 0);
3623 Modified = true;
3624 }
3625 } else if (!WCG->isOptNone() &&
3626 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
3627 (MF.getFrameInfo().hasCalls() ||
3628 ST.getOccupancyWithNumVGPRs(
3629 VGPRs: TRI.getNumUsedPhysRegs(MRI, RC: AMDGPU::VGPR_32RegClass),
3630 /*IsDynamicVGPR=*/DynamicVGPRBlockSize: false) <
3631 AMDGPU::IsaInfo::getMaxWavesPerEU(STI: ST))) {
 // NOTE(review): only entries whose Flag is set get the dealloc message;
 // what Flag encodes is decided where EndPgmInsts is filled — confirm
 // there.
3632 for (auto [MI, Flag] : EndPgmInsts) {
3633 if (Flag) {
3634 if (ST.requiresNopBeforeDeallocVGPRs()) {
3635 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3636 MCID: TII.get(Opcode: AMDGPU::S_NOP))
3637 .addImm(Val: 0);
3638 }
3639 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3640 MCID: TII.get(Opcode: AMDGPU::S_SENDMSG))
3641 .addImm(Val: AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
3642 Modified = true;
3643 }
3644 }
3645 }
3646
3647 return Modified;
3648}
3649