1 | //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Insert wait instructions for memory reads and writes. |
11 | /// |
12 | /// Memory reads and writes are issued asynchronously, so we need to insert |
13 | /// S_WAITCNT instructions when we want to access any of their results or |
14 | /// overwrite any register that's used asynchronously. |
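/// For example (an illustrative pre-gfx12 sequence), a value loaded from
/// memory must be waited on before its first use:
///
///   global_load_dword v1, v[2:3], off
///   ...
///   s_waitcnt vmcnt(0)   ; wait for the outstanding vector-memory load
///   v_add_u32 v4, v1, v0 ; first use of v1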
15 | /// |
16 | /// TODO: This pass currently keeps one timeline per hardware counter. A more |
17 | /// finely-grained approach that keeps one timeline per event type could |
18 | /// sometimes get away with generating weaker s_waitcnt instructions. For |
19 | /// example, when both SMEM and LDS are in flight and we need to wait for |
20 | /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient, |
21 | /// but the pass will currently generate a conservative lgkmcnt(0) because |
22 | /// multiple event types are in flight. |
23 | // |
24 | //===----------------------------------------------------------------------===// |
25 | |
26 | #include "AMDGPU.h" |
27 | #include "GCNSubtarget.h" |
28 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
29 | #include "SIMachineFunctionInfo.h" |
30 | #include "Utils/AMDGPUBaseInfo.h" |
31 | #include "llvm/ADT/MapVector.h" |
32 | #include "llvm/ADT/PostOrderIterator.h" |
33 | #include "llvm/ADT/Sequence.h" |
34 | #include "llvm/Analysis/AliasAnalysis.h" |
35 | #include "llvm/CodeGen/MachineLoopInfo.h" |
36 | #include "llvm/CodeGen/MachinePostDominators.h" |
37 | #include "llvm/InitializePasses.h" |
38 | #include "llvm/Support/DebugCounter.h" |
39 | #include "llvm/TargetParser/TargetParser.h" |
40 | using namespace llvm; |
41 | |
42 | #define DEBUG_TYPE "si-insert-waitcnts" |
43 | |
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");
50 | |
static cl::opt<bool> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(false), cl::Hidden);
55 | |
56 | namespace { |
// Class of object that encapsulates the latest instruction counter score
// associated with an operand. Used for determining whether an s_waitcnt
// instruction needs to be emitted.
60 | |
61 | enum InstCounterType { |
62 | LOAD_CNT = 0, // VMcnt prior to gfx12. |
DS_CNT, // LGKMcnt prior to gfx12.
64 | EXP_CNT, // |
65 | STORE_CNT, // VScnt in gfx10/gfx11. |
66 | NUM_NORMAL_INST_CNTS, |
67 | SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only. |
68 | BVH_CNT, // gfx12+ only. |
69 | KM_CNT, // gfx12+ only. |
70 | NUM_EXTENDED_INST_CNTS, |
71 | NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS |
72 | }; |
73 | } // namespace |
74 | |
75 | namespace llvm { |
76 | template <> struct enum_iteration_traits<InstCounterType> { |
77 | static constexpr bool is_iterable = true; |
78 | }; |
79 | } // namespace llvm |
80 | |
81 | namespace { |
82 | // Return an iterator over all counters between LOAD_CNT (the first counter) |
83 | // and \c MaxCounter (exclusive, default value yields an enumeration over |
84 | // all counters). |
85 | auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { |
86 | return enum_seq(Begin: LOAD_CNT, End: MaxCounter); |
87 | } |
88 | |
89 | using RegInterval = std::pair<int, int>; |
90 | |
91 | struct HardwareLimits { |
92 | unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12. |
93 | unsigned ExpcntMax; |
94 | unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. |
95 | unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. |
96 | unsigned SamplecntMax; // gfx12+ only. |
97 | unsigned BvhcntMax; // gfx12+ only. |
98 | unsigned KmcntMax; // gfx12+ only. |
99 | }; |
100 | |
101 | struct RegisterEncoding { |
102 | unsigned VGPR0; |
103 | unsigned VGPRL; |
104 | unsigned SGPR0; |
105 | unsigned SGPRL; |
106 | }; |
107 | |
108 | enum WaitEventType { |
109 | VMEM_ACCESS, // vector-memory read & write |
110 | VMEM_READ_ACCESS, // vector-memory read |
111 | VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only) |
112 | VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only) |
113 | VMEM_WRITE_ACCESS, // vector-memory write that is not scratch |
114 | SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch |
115 | LDS_ACCESS, // lds read & write |
116 | GDS_ACCESS, // gds read & write |
117 | SQ_MESSAGE, // send message |
118 | SMEM_ACCESS, // scalar-memory read & write |
119 | EXP_GPR_LOCK, // export holding on its data src |
120 | GDS_GPR_LOCK, // GDS holding on its data and addr src |
121 | EXP_POS_ACCESS, // write to export position |
122 | EXP_PARAM_ACCESS, // write to export parameter |
123 | VMW_GPR_LOCK, // vector-memory write holding on its data src |
124 | EXP_LDS_ACCESS, // read by ldsdir counting as export |
125 | NUM_WAIT_EVENTS, |
126 | }; |
127 | |
128 | // The mapping is: |
129 | // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs |
130 | // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots |
131 | // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs |
132 | // We reserve a fixed number of VGPR slots in the scoring tables for |
133 | // special tokens like SCMEM_LDS (needed for buffer load to LDS). |
134 | enum RegisterMapping { |
135 | SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets. |
136 | AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets. |
137 | SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. |
NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
139 | // Artificial register slots to track LDS writes into specific LDS locations |
140 | // if a location is known. When slots are exhausted or location is |
141 | // unknown use the first slot. The first slot is also always updated in |
142 | // addition to known location's slot to properly generate waits if dependent |
143 | // instruction's location is unknown. |
EXTRA_VGPR_LDS = 0,
145 | NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. |
146 | }; |
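// For example, with the values above, the flat index space used by the
// scoring tables is: VGPR J lives at index J in [0, 512), the extra LDS DMA
// slots occupy [512, 521), and SGPR N is tracked at index
// NUM_ALL_VGPRS + N == 521 + N.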
147 | |
148 | // Enumerate different types of result-returning VMEM operations. Although |
149 | // s_waitcnt orders them all with a single vmcnt counter, in the absence of |
150 | // s_waitcnt only instructions of the same VmemType are guaranteed to write |
151 | // their results in order -- so there is no need to insert an s_waitcnt between |
152 | // two instructions of the same type that write the same vgpr. |
153 | enum VmemType { |
154 | // BUF instructions and MIMG instructions without a sampler. |
155 | VMEM_NOSAMPLER, |
156 | // MIMG instructions with a sampler. |
157 | VMEM_SAMPLER, |
158 | // BVH instructions |
159 | VMEM_BVH, |
160 | NUM_VMEM_TYPES |
161 | }; |
162 | |
163 | // Maps values of InstCounterType to the instruction that waits on that |
164 | // counter. Only used if GCNSubtarget::hasExtendedWaitCounts() |
165 | // returns true. |
166 | static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = { |
167 | AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT, |
168 | AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT, |
169 | AMDGPU::S_WAIT_KMCNT}; |
170 | |
171 | static bool updateVMCntOnly(const MachineInstr &Inst) { |
172 | return SIInstrInfo::isVMEM(MI: Inst) || SIInstrInfo::isFLATGlobal(MI: Inst) || |
173 | SIInstrInfo::isFLATScratch(MI: Inst); |
174 | } |
175 | |
176 | #ifndef NDEBUG |
177 | static bool isNormalMode(InstCounterType MaxCounter) { |
178 | return MaxCounter == NUM_NORMAL_INST_CNTS; |
179 | } |
180 | #endif // NDEBUG |
181 | |
182 | VmemType getVmemType(const MachineInstr &Inst) { |
183 | assert(updateVMCntOnly(Inst)); |
184 | if (!SIInstrInfo::isMIMG(MI: Inst) && !SIInstrInfo::isVIMAGE(MI: Inst) && |
185 | !SIInstrInfo::isVSAMPLE(MI: Inst)) |
186 | return VMEM_NOSAMPLER; |
187 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Inst.getOpcode()); |
188 | const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = |
189 | AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode); |
190 | // We have to make an additional check for isVSAMPLE here since some |
191 | // instructions don't have a sampler, but are still classified as sampler |
192 | // instructions for the purposes of e.g. waitcnt. |
193 | return BaseInfo->BVH ? VMEM_BVH |
194 | : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(MI: Inst)) ? VMEM_SAMPLER |
195 | : VMEM_NOSAMPLER; |
196 | } |
197 | |
198 | unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { |
199 | switch (T) { |
200 | case LOAD_CNT: |
201 | return Wait.LoadCnt; |
202 | case EXP_CNT: |
203 | return Wait.ExpCnt; |
204 | case DS_CNT: |
205 | return Wait.DsCnt; |
206 | case STORE_CNT: |
207 | return Wait.StoreCnt; |
208 | case SAMPLE_CNT: |
209 | return Wait.SampleCnt; |
210 | case BVH_CNT: |
211 | return Wait.BvhCnt; |
212 | case KM_CNT: |
213 | return Wait.KmCnt; |
214 | default: |
215 | llvm_unreachable("bad InstCounterType" ); |
216 | } |
217 | } |
218 | |
219 | void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { |
220 | unsigned &WC = getCounterRef(Wait, T); |
221 | WC = std::min(a: WC, b: Count); |
222 | } |
223 | |
224 | void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { |
225 | getCounterRef(Wait, T) = ~0u; |
226 | } |
227 | |
228 | unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { |
229 | return getCounterRef(Wait, T); |
230 | } |
231 | |
232 | // Mapping from event to counter according to the table masks. |
233 | InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { |
234 | for (auto T : inst_counter_types()) { |
235 | if (masks[T] & (1 << E)) |
236 | return T; |
237 | } |
238 | llvm_unreachable("event type has no associated counter" ); |
239 | } |
240 | |
// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple event types are pending in a bracket, the
// counter may be decremented out of order, so we need to insert an
// "s_waitcnt 0" before use.
249 | class WaitcntBrackets { |
250 | public: |
251 | WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, |
252 | HardwareLimits Limits, RegisterEncoding Encoding, |
253 | const unsigned *WaitEventMaskForInst, |
254 | InstCounterType SmemAccessCounter) |
255 | : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), |
256 | Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst), |
257 | SmemAccessCounter(SmemAccessCounter) {} |
258 | |
259 | unsigned getWaitCountMax(InstCounterType T) const { |
260 | switch (T) { |
261 | case LOAD_CNT: |
262 | return Limits.LoadcntMax; |
263 | case DS_CNT: |
264 | return Limits.DscntMax; |
265 | case EXP_CNT: |
266 | return Limits.ExpcntMax; |
267 | case STORE_CNT: |
268 | return Limits.StorecntMax; |
269 | case SAMPLE_CNT: |
270 | return Limits.SamplecntMax; |
271 | case BVH_CNT: |
272 | return Limits.BvhcntMax; |
273 | case KM_CNT: |
274 | return Limits.KmcntMax; |
275 | default: |
276 | break; |
277 | } |
278 | return 0; |
279 | } |
280 | |
281 | unsigned getScoreLB(InstCounterType T) const { |
282 | assert(T < NUM_INST_CNTS); |
283 | return ScoreLBs[T]; |
284 | } |
285 | |
286 | unsigned getScoreUB(InstCounterType T) const { |
287 | assert(T < NUM_INST_CNTS); |
288 | return ScoreUBs[T]; |
289 | } |
290 | |
291 | unsigned getScoreRange(InstCounterType T) const { |
292 | return getScoreUB(T) - getScoreLB(T); |
293 | } |
294 | |
295 | unsigned getRegScore(int GprNo, InstCounterType T) const { |
296 | if (GprNo < NUM_ALL_VGPRS) { |
297 | return VgprScores[T][GprNo]; |
298 | } |
299 | assert(T == SmemAccessCounter); |
300 | return SgprScores[GprNo - NUM_ALL_VGPRS]; |
301 | } |
302 | |
303 | bool merge(const WaitcntBrackets &Other); |
304 | |
305 | RegInterval getRegInterval(const MachineInstr *MI, |
306 | const MachineRegisterInfo *MRI, |
307 | const SIRegisterInfo *TRI, unsigned OpNo) const; |
308 | |
309 | bool counterOutOfOrder(InstCounterType T) const; |
310 | void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; |
311 | void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; |
312 | void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const; |
313 | void applyWaitcnt(const AMDGPU::Waitcnt &Wait); |
314 | void applyWaitcnt(InstCounterType T, unsigned Count); |
315 | void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, |
316 | const MachineRegisterInfo *MRI, WaitEventType E, |
317 | MachineInstr &MI); |
318 | |
319 | unsigned hasPendingEvent() const { return PendingEvents; } |
320 | unsigned hasPendingEvent(WaitEventType E) const { |
321 | return PendingEvents & (1 << E); |
322 | } |
323 | unsigned hasPendingEvent(InstCounterType T) const { |
324 | unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; |
325 | assert((HasPending != 0) == (getScoreRange(T) != 0)); |
326 | return HasPending; |
327 | } |
328 | |
329 | bool hasMixedPendingEvents(InstCounterType T) const { |
330 | unsigned Events = hasPendingEvent(T); |
331 | // Return true if more than one bit is set in Events. |
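// For example, Events == 0b0110 gives 0b0110 & 0b0101 == 0b0100 (non-zero,
// mixed event types), while Events == 0b0100 gives 0b0100 & 0b0011 == 0.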
332 | return Events & (Events - 1); |
333 | } |
334 | |
335 | bool hasPendingFlat() const { |
336 | return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && |
337 | LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || |
338 | (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && |
339 | LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); |
340 | } |
341 | |
342 | void setPendingFlat() { |
343 | LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; |
344 | LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; |
345 | } |
346 | |
347 | // Return true if there might be pending writes to the specified vgpr by VMEM |
348 | // instructions with types different from V. |
349 | bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const { |
350 | assert(GprNo < NUM_ALL_VGPRS); |
351 | return VgprVmemTypes[GprNo] & ~(1 << V); |
352 | } |
353 | |
354 | void clearVgprVmemTypes(int GprNo) { |
355 | assert(GprNo < NUM_ALL_VGPRS); |
356 | VgprVmemTypes[GprNo] = 0; |
357 | } |
358 | |
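// Conservatively model stores that may still be in flight on function entry
// or return: bump the STORE_CNT upper bound by the counter maximum and mark
// store events pending, so dependent accesses get a wait.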
359 | void setStateOnFunctionEntryOrReturn() { |
360 | setScoreUB(T: STORE_CNT, Val: getScoreUB(T: STORE_CNT) + getWaitCountMax(T: STORE_CNT)); |
361 | PendingEvents |= WaitEventMaskForInst[STORE_CNT]; |
362 | } |
363 | |
364 | ArrayRef<const MachineInstr *> getLDSDMAStores() const { |
365 | return LDSDMAStores; |
366 | } |
367 | |
368 | void print(raw_ostream &); |
369 | void dump() { print(dbgs()); } |
370 | |
371 | private: |
372 | struct MergeInfo { |
373 | unsigned OldLB; |
374 | unsigned OtherLB; |
375 | unsigned MyShift; |
376 | unsigned OtherShift; |
377 | }; |
378 | static bool mergeScore(const MergeInfo &M, unsigned &Score, |
379 | unsigned OtherScore); |
380 | |
381 | void setScoreLB(InstCounterType T, unsigned Val) { |
382 | assert(T < NUM_INST_CNTS); |
383 | ScoreLBs[T] = Val; |
384 | } |
385 | |
386 | void setScoreUB(InstCounterType T, unsigned Val) { |
387 | assert(T < NUM_INST_CNTS); |
388 | ScoreUBs[T] = Val; |
389 | |
390 | if (T != EXP_CNT) |
391 | return; |
392 | |
393 | if (getScoreRange(T: EXP_CNT) > getWaitCountMax(T: EXP_CNT)) |
394 | ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(T: EXP_CNT); |
395 | } |
396 | |
397 | void setRegScore(int GprNo, InstCounterType T, unsigned Val) { |
398 | if (GprNo < NUM_ALL_VGPRS) { |
399 | VgprUB = std::max(a: VgprUB, b: GprNo); |
400 | VgprScores[T][GprNo] = Val; |
401 | } else { |
402 | assert(T == SmemAccessCounter); |
403 | SgprUB = std::max(a: SgprUB, b: GprNo - NUM_ALL_VGPRS); |
404 | SgprScores[GprNo - NUM_ALL_VGPRS] = Val; |
405 | } |
406 | } |
407 | |
408 | void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, |
409 | const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, |
410 | unsigned OpNo, unsigned Val); |
411 | |
412 | const GCNSubtarget *ST = nullptr; |
413 | InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; |
414 | HardwareLimits Limits = {}; |
415 | RegisterEncoding Encoding = {}; |
416 | const unsigned *WaitEventMaskForInst; |
417 | InstCounterType SmemAccessCounter; |
418 | unsigned ScoreLBs[NUM_INST_CNTS] = {0}; |
419 | unsigned ScoreUBs[NUM_INST_CNTS] = {0}; |
420 | unsigned PendingEvents = 0; |
421 | // Remember the last flat memory operation. |
422 | unsigned LastFlat[NUM_INST_CNTS] = {0}; |
423 | // wait_cnt scores for every vgpr. |
424 | // Keep track of the VgprUB and SgprUB to make merge at join efficient. |
425 | int VgprUB = -1; |
426 | int SgprUB = -1; |
427 | unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; |
428 | // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt |
429 | // pre-gfx12) or KM_CNT (gfx12+ only) are relevant. |
430 | unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; |
431 | // Bitmask of the VmemTypes of VMEM instructions that might have a pending |
432 | // write to each vgpr. |
433 | unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; |
434 | // Store representative LDS DMA operations. The only useful info here is |
435 | // alias info. One store is kept per unique AAInfo. |
436 | SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores; |
437 | }; |
438 | |
439 | // This abstracts the logic for generating and updating S_WAIT* instructions |
440 | // away from the analysis that determines where they are needed. This was |
441 | // done because the set of counters and instructions for waiting on them |
442 | // underwent a major shift with gfx12, sufficiently so that having this |
443 | // abstraction allows the main analysis logic to be simpler than it would |
444 | // otherwise have had to become. |
445 | class WaitcntGenerator { |
446 | protected: |
447 | const GCNSubtarget *ST = nullptr; |
448 | const SIInstrInfo *TII = nullptr; |
449 | AMDGPU::IsaVersion IV; |
450 | InstCounterType MaxCounter; |
451 | bool OptNone; |
452 | |
453 | public: |
454 | WaitcntGenerator() = default; |
455 | WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter) |
456 | : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()), |
457 | IV(AMDGPU::getIsaVersion(GPU: ST->getCPU())), MaxCounter(MaxCounter), |
458 | OptNone(MF.getFunction().hasOptNone() || |
459 | MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {} |
460 | |
461 | // Return true if the current function should be compiled with no |
462 | // optimization. |
463 | bool isOptNone() const { return OptNone; } |
464 | |
465 | // Edits an existing sequence of wait count instructions according |
466 | // to an incoming Waitcnt value, which is itself updated to reflect |
467 | // any new wait count instructions which may need to be generated by |
468 | // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits |
469 | // were made. |
470 | // |
// This editing will usually just update operands, but it may also
472 | // delete instructions if the incoming Wait value indicates they are not |
473 | // needed. It may also remove existing instructions for which a wait |
474 | // is needed if it can be determined that it is better to generate new |
475 | // instructions later, as can happen on gfx12. |
476 | virtual bool |
477 | applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, |
478 | MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, |
479 | MachineBasicBlock::instr_iterator It) const = 0; |
480 | |
481 | // Transform a soft waitcnt into a normal one. |
482 | bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; |
483 | |
484 | // Generates new wait count instructions according to the value of |
485 | // Wait, returning true if any new instructions were created. |
486 | virtual bool createNewWaitcnt(MachineBasicBlock &Block, |
487 | MachineBasicBlock::instr_iterator It, |
488 | AMDGPU::Waitcnt Wait) = 0; |
489 | |
490 | // Returns an array of bit masks which can be used to map values in |
491 | // WaitEventType to corresponding counter values in InstCounterType. |
492 | virtual const unsigned *getWaitEventMask() const = 0; |
493 | |
494 | // Returns a new waitcnt with all counters except VScnt set to 0. If |
495 | // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u. |
496 | virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0; |
497 | |
498 | virtual ~WaitcntGenerator() = default; |
499 | |
500 | // Create a mask value from the initializer list of wait event types. |
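// For example, eventMask({VMEM_ACCESS, SMEM_ACCESS}) yields
// (1 << VMEM_ACCESS) | (1 << SMEM_ACCESS).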
501 | static constexpr unsigned |
502 | eventMask(std::initializer_list<WaitEventType> Events) { |
503 | unsigned Mask = 0; |
504 | for (auto &E : Events) |
505 | Mask |= 1 << E; |
506 | |
507 | return Mask; |
508 | } |
509 | }; |
510 | |
511 | class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { |
512 | public: |
513 | WaitcntGeneratorPreGFX12() = default; |
514 | WaitcntGeneratorPreGFX12(const MachineFunction &MF) |
515 | : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {} |
516 | |
517 | bool |
518 | applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, |
519 | MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, |
520 | MachineBasicBlock::instr_iterator It) const override; |
521 | |
522 | bool createNewWaitcnt(MachineBasicBlock &Block, |
523 | MachineBasicBlock::instr_iterator It, |
524 | AMDGPU::Waitcnt Wait) override; |
525 | |
526 | const unsigned *getWaitEventMask() const override { |
527 | assert(ST); |
528 | |
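// Entries are indexed by InstCounterType: LOAD_CNT (VMcnt), DS_CNT (LGKMcnt),
// EXP_CNT and STORE_CNT (VScnt); the trailing zeros correspond to SAMPLE_CNT,
// BVH_CNT and KM_CNT, which do not exist before gfx12.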
529 | static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = { |
530 | eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, |
531 | VMEM_BVH_READ_ACCESS}), |
532 | eventMask(Events: {SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}), |
533 | eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, |
534 | EXP_POS_ACCESS, EXP_LDS_ACCESS}), |
535 | eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), |
536 | 0, |
537 | 0, |
538 | 0}; |
539 | |
540 | return WaitEventMaskForInstPreGFX12; |
541 | } |
542 | |
543 | AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; |
544 | }; |
545 | |
546 | class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { |
547 | public: |
548 | WaitcntGeneratorGFX12Plus() = default; |
549 | WaitcntGeneratorGFX12Plus(const MachineFunction &MF, |
550 | InstCounterType MaxCounter) |
551 | : WaitcntGenerator(MF, MaxCounter) {} |
552 | |
553 | bool |
554 | applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, |
555 | MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, |
556 | MachineBasicBlock::instr_iterator It) const override; |
557 | |
558 | bool createNewWaitcnt(MachineBasicBlock &Block, |
559 | MachineBasicBlock::instr_iterator It, |
560 | AMDGPU::Waitcnt Wait) override; |
561 | |
562 | const unsigned *getWaitEventMask() const override { |
563 | assert(ST); |
564 | |
565 | static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = { |
566 | eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS}), |
567 | eventMask(Events: {LDS_ACCESS, GDS_ACCESS}), |
568 | eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, |
569 | EXP_POS_ACCESS, EXP_LDS_ACCESS}), |
570 | eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), |
571 | eventMask(Events: {VMEM_SAMPLER_READ_ACCESS}), |
572 | eventMask(Events: {VMEM_BVH_READ_ACCESS}), |
573 | eventMask(Events: {SMEM_ACCESS, SQ_MESSAGE})}; |
574 | |
575 | return WaitEventMaskForInstGFX12Plus; |
576 | } |
577 | |
578 | AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; |
579 | }; |
580 | |
581 | class SIInsertWaitcnts : public MachineFunctionPass { |
582 | private: |
583 | const GCNSubtarget *ST = nullptr; |
584 | const SIInstrInfo *TII = nullptr; |
585 | const SIRegisterInfo *TRI = nullptr; |
586 | const MachineRegisterInfo *MRI = nullptr; |
587 | |
588 | DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; |
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
590 | MachineLoopInfo *MLI; |
591 | MachinePostDominatorTree *PDT; |
592 | AliasAnalysis *AA = nullptr; |
593 | |
594 | struct BlockInfo { |
595 | std::unique_ptr<WaitcntBrackets> Incoming; |
596 | bool Dirty = true; |
597 | }; |
598 | |
599 | InstCounterType SmemAccessCounter; |
600 | |
601 | MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; |
602 | |
// ForceEmitZeroWaitcnts: force all waitcnt instructions to be emitted as
// s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
605 | bool ForceEmitZeroWaitcnts; |
606 | bool ForceEmitWaitcnt[NUM_INST_CNTS]; |
607 | |
608 | // In any given run of this pass, WCG will point to one of these two |
609 | // generator objects, which must have been re-initialised before use |
610 | // from a value made using a subtarget constructor. |
611 | WaitcntGeneratorPreGFX12 WCGPreGFX12; |
612 | WaitcntGeneratorGFX12Plus WCGGFX12Plus; |
613 | |
614 | WaitcntGenerator *WCG = nullptr; |
615 | |
616 | // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS |
617 | // message. |
618 | DenseSet<MachineInstr *> ReleaseVGPRInsts; |
619 | |
620 | InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; |
621 | |
622 | public: |
623 | static char ID; |
624 | |
625 | SIInsertWaitcnts() : MachineFunctionPass(ID) { |
626 | (void)ForceExpCounter; |
627 | (void)ForceLgkmCounter; |
628 | (void)ForceVMCounter; |
629 | } |
630 | |
631 | bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets); |
632 | bool isPreheaderToFlush(MachineBasicBlock &MBB, |
633 | WaitcntBrackets &ScoreBrackets); |
634 | bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; |
635 | bool runOnMachineFunction(MachineFunction &MF) override; |
636 | |
637 | StringRef getPassName() const override { |
638 | return "SI insert wait instructions" ; |
639 | } |
640 | |
641 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
642 | AU.setPreservesCFG(); |
643 | AU.addRequired<MachineLoopInfoWrapperPass>(); |
644 | AU.addRequired<MachinePostDominatorTreeWrapperPass>(); |
645 | AU.addUsedIfAvailable<AAResultsWrapperPass>(); |
646 | AU.addPreserved<AAResultsWrapperPass>(); |
647 | MachineFunctionPass::getAnalysisUsage(AU); |
648 | } |
649 | |
650 | bool isForceEmitWaitcnt() const { |
651 | for (auto T : inst_counter_types()) |
652 | if (ForceEmitWaitcnt[T]) |
653 | return true; |
654 | return false; |
655 | } |
656 | |
657 | void setForceEmitWaitcnt() { |
658 | // For non-debug builds, ForceEmitWaitcnt has been initialized to false; |
659 | // For debug builds, get the debug counter info and adjust if need be |
660 | #ifndef NDEBUG |
661 | if (DebugCounter::isCounterSet(ForceExpCounter) && |
662 | DebugCounter::shouldExecute(ForceExpCounter)) { |
663 | ForceEmitWaitcnt[EXP_CNT] = true; |
664 | } else { |
665 | ForceEmitWaitcnt[EXP_CNT] = false; |
666 | } |
667 | |
668 | if (DebugCounter::isCounterSet(ForceLgkmCounter) && |
669 | DebugCounter::shouldExecute(ForceLgkmCounter)) { |
670 | ForceEmitWaitcnt[DS_CNT] = true; |
671 | ForceEmitWaitcnt[KM_CNT] = true; |
672 | } else { |
673 | ForceEmitWaitcnt[DS_CNT] = false; |
674 | ForceEmitWaitcnt[KM_CNT] = false; |
675 | } |
676 | |
677 | if (DebugCounter::isCounterSet(ForceVMCounter) && |
678 | DebugCounter::shouldExecute(ForceVMCounter)) { |
679 | ForceEmitWaitcnt[LOAD_CNT] = true; |
680 | ForceEmitWaitcnt[SAMPLE_CNT] = true; |
681 | ForceEmitWaitcnt[BVH_CNT] = true; |
682 | } else { |
683 | ForceEmitWaitcnt[LOAD_CNT] = false; |
684 | ForceEmitWaitcnt[SAMPLE_CNT] = false; |
685 | ForceEmitWaitcnt[BVH_CNT] = false; |
686 | } |
687 | #endif // NDEBUG |
688 | } |
689 | |
690 | // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or |
691 | // FLAT instruction. |
692 | WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { |
693 | // Maps VMEM access types to their corresponding WaitEventType. |
694 | static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = { |
695 | VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; |
696 | |
697 | assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst)); |
698 | // LDS DMA loads are also stores, but on the LDS side. On the VMEM side |
699 | // these should use VM_CNT. |
700 | if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(MI: Inst)) |
701 | return VMEM_ACCESS; |
702 | if (Inst.mayStore() && |
703 | (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(MI: Inst))) { |
704 | // FLAT and SCRATCH instructions may access scratch. Other VMEM |
705 | // instructions do not. |
706 | if (SIInstrInfo::isFLAT(MI: Inst) && mayAccessScratchThroughFlat(MI: Inst)) |
707 | return SCRATCH_WRITE_ACCESS; |
708 | return VMEM_WRITE_ACCESS; |
709 | } |
710 | if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(MI: Inst)) |
711 | return VMEM_READ_ACCESS; |
712 | return VmemReadMapping[getVmemType(Inst)]; |
713 | } |
714 | |
715 | bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; |
716 | bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; |
717 | bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; |
718 | bool generateWaitcntInstBefore(MachineInstr &MI, |
719 | WaitcntBrackets &ScoreBrackets, |
720 | MachineInstr *OldWaitcntInstr, |
721 | bool FlushVmCnt); |
722 | bool generateWaitcnt(AMDGPU::Waitcnt Wait, |
723 | MachineBasicBlock::instr_iterator It, |
724 | MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, |
725 | MachineInstr *OldWaitcntInstr); |
726 | void updateEventWaitcntAfter(MachineInstr &Inst, |
727 | WaitcntBrackets *ScoreBrackets); |
728 | bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, |
729 | WaitcntBrackets &ScoreBrackets); |
730 | }; |
731 | |
732 | } // end anonymous namespace |
733 | |
734 | RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, |
735 | const MachineRegisterInfo *MRI, |
736 | const SIRegisterInfo *TRI, |
737 | unsigned OpNo) const { |
738 | const MachineOperand &Op = MI->getOperand(i: OpNo); |
739 | if (!TRI->isInAllocatableClass(RegNo: Op.getReg())) |
740 | return {-1, -1}; |
741 | |
742 | // A use via a PW operand does not need a waitcnt. |
743 | // A partial write is not a WAW. |
744 | assert(!Op.getSubReg() || !Op.isUndef()); |
745 | |
746 | RegInterval Result; |
747 | |
748 | unsigned Reg = TRI->getEncodingValue(RegNo: AMDGPU::getMCReg(Reg: Op.getReg(), STI: *ST)) & |
749 | AMDGPU::HWEncoding::REG_IDX_MASK; |
750 | |
751 | if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) { |
752 | assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL); |
753 | Result.first = Reg - Encoding.VGPR0; |
754 | if (TRI->isAGPR(MRI: *MRI, Reg: Op.getReg())) |
755 | Result.first += AGPR_OFFSET; |
756 | assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); |
757 | } else if (TRI->isSGPRReg(MRI: *MRI, Reg: Op.getReg())) { |
758 | assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); |
759 | Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; |
760 | assert(Result.first >= NUM_ALL_VGPRS && |
761 | Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); |
762 | } |
763 | // TODO: Handle TTMP |
764 | // else if (TRI->isTTMP(*MRI, Reg.getReg())) ... |
765 | else |
766 | return {-1, -1}; |
767 | |
768 | const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg: Op.getReg()); |
769 | unsigned Size = TRI->getRegSizeInBits(RC: *RC); |
770 | Result.second = Result.first + ((Size + 16) / 32); |
771 | |
772 | return Result; |
773 | } |
774 | |
775 | void WaitcntBrackets::setExpScore(const MachineInstr *MI, |
776 | const SIInstrInfo *TII, |
777 | const SIRegisterInfo *TRI, |
778 | const MachineRegisterInfo *MRI, unsigned OpNo, |
779 | unsigned Val) { |
780 | RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo); |
781 | assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg())); |
782 | for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { |
783 | setRegScore(GprNo: RegNo, T: EXP_CNT, Val); |
784 | } |
785 | } |
786 | |
787 | void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, |
788 | const SIRegisterInfo *TRI, |
789 | const MachineRegisterInfo *MRI, |
790 | WaitEventType E, MachineInstr &Inst) { |
791 | InstCounterType T = eventCounter(masks: WaitEventMaskForInst, E); |
792 | |
793 | unsigned UB = getScoreUB(T); |
794 | unsigned CurrScore = UB + 1; |
795 | if (CurrScore == 0) |
796 | report_fatal_error(reason: "InsertWaitcnt score wraparound" ); |
// PendingEvents and ScoreUB need to be updated regardless of whether this
// event changes the score of a register or not, e.g. vm_cnt for a buffer
// store or lgkm_cnt for a send-message.
800 | PendingEvents |= 1 << E; |
801 | setScoreUB(T, Val: CurrScore); |
802 | |
803 | if (T == EXP_CNT) { |
804 | // Put score on the source vgprs. If this is a store, just use those |
805 | // specific register(s). |
806 | if (TII->isDS(MI: Inst) && (Inst.mayStore() || Inst.mayLoad())) { |
807 | int AddrOpIdx = |
808 | AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::addr); |
809 | // All GDS operations must protect their address register (same as |
810 | // export.) |
811 | if (AddrOpIdx != -1) { |
812 | setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: AddrOpIdx, Val: CurrScore); |
813 | } |
814 | |
815 | if (Inst.mayStore()) { |
816 | if (AMDGPU::hasNamedOperand(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data0)) { |
817 | setExpScore( |
818 | MI: &Inst, TII, TRI, MRI, |
819 | OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data0), |
820 | Val: CurrScore); |
821 | } |
822 | if (AMDGPU::hasNamedOperand(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data1)) { |
823 | setExpScore(MI: &Inst, TII, TRI, MRI, |
824 | OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), |
825 | NamedIdx: AMDGPU::OpName::data1), |
826 | Val: CurrScore); |
827 | } |
828 | } else if (SIInstrInfo::isAtomicRet(MI: Inst) && !SIInstrInfo::isGWS(MI: Inst) && |
829 | Inst.getOpcode() != AMDGPU::DS_APPEND && |
830 | Inst.getOpcode() != AMDGPU::DS_CONSUME && |
831 | Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { |
832 | for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { |
833 | const MachineOperand &Op = Inst.getOperand(i: I); |
834 | if (Op.isReg() && !Op.isDef() && |
835 | TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) { |
836 | setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: I, Val: CurrScore); |
837 | } |
838 | } |
839 | } |
840 | } else if (TII->isFLAT(MI: Inst)) { |
841 | if (Inst.mayStore()) { |
842 | setExpScore( |
843 | MI: &Inst, TII, TRI, MRI, |
844 | OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data), |
845 | Val: CurrScore); |
846 | } else if (SIInstrInfo::isAtomicRet(MI: Inst)) { |
847 | setExpScore( |
848 | MI: &Inst, TII, TRI, MRI, |
849 | OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data), |
850 | Val: CurrScore); |
851 | } |
852 | } else if (TII->isMIMG(MI: Inst)) { |
853 | if (Inst.mayStore()) { |
854 | setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: 0, Val: CurrScore); |
855 | } else if (SIInstrInfo::isAtomicRet(MI: Inst)) { |
856 | setExpScore( |
857 | MI: &Inst, TII, TRI, MRI, |
858 | OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data), |
859 | Val: CurrScore); |
860 | } |
861 | } else if (TII->isMTBUF(MI: Inst)) { |
862 | if (Inst.mayStore()) { |
863 | setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: 0, Val: CurrScore); |
864 | } |
865 | } else if (TII->isMUBUF(MI: Inst)) { |
866 | if (Inst.mayStore()) { |
867 | setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: 0, Val: CurrScore); |
868 | } else if (SIInstrInfo::isAtomicRet(MI: Inst)) { |
869 | setExpScore( |
870 | MI: &Inst, TII, TRI, MRI, |
871 | OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::data), |
872 | Val: CurrScore); |
873 | } |
874 | } else if (TII->isLDSDIR(MI: Inst)) { |
875 | // LDSDIR instructions attach the score to the destination. |
876 | setExpScore( |
877 | MI: &Inst, TII, TRI, MRI, |
878 | OpNo: AMDGPU::getNamedOperandIdx(Opcode: Inst.getOpcode(), NamedIdx: AMDGPU::OpName::vdst), |
879 | Val: CurrScore); |
880 | } else { |
881 | if (TII->isEXP(MI: Inst)) { |
882 | // For export the destination registers are really temps that |
883 | // can be used as the actual source after export patching, so |
884 | // we need to treat them like sources and set the EXP_CNT |
885 | // score. |
886 | for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { |
887 | MachineOperand &DefMO = Inst.getOperand(i: I); |
888 | if (DefMO.isReg() && DefMO.isDef() && |
889 | TRI->isVGPR(MRI: *MRI, Reg: DefMO.getReg())) { |
890 | setRegScore( |
891 | GprNo: TRI->getEncodingValue(RegNo: AMDGPU::getMCReg(Reg: DefMO.getReg(), STI: *ST)), |
892 | T: EXP_CNT, Val: CurrScore); |
893 | } |
894 | } |
895 | } |
896 | for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { |
897 | MachineOperand &MO = Inst.getOperand(i: I); |
898 | if (MO.isReg() && !MO.isDef() && |
899 | TRI->isVectorRegister(MRI: *MRI, Reg: MO.getReg())) { |
900 | setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: I, Val: CurrScore); |
901 | } |
902 | } |
903 | } |
904 | } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { |
905 | // Match the score to the destination registers. |
906 | for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { |
907 | auto &Op = Inst.getOperand(i: I); |
908 | if (!Op.isReg() || !Op.isDef()) |
909 | continue; |
910 | RegInterval Interval = getRegInterval(MI: &Inst, MRI, TRI, OpNo: I); |
911 | if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) { |
912 | if (Interval.first >= NUM_ALL_VGPRS) |
913 | continue; |
914 | if (updateVMCntOnly(Inst)) { |
// updateVMCntOnly should only leave us with VGPRs.
// MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
// defs. That's required for a sane index into `VgprVmemTypes` below.
918 | assert(TRI->isVectorRegister(*MRI, Op.getReg())); |
919 | VmemType V = getVmemType(Inst); |
920 | for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) |
921 | VgprVmemTypes[RegNo] |= 1 << V; |
922 | } |
923 | } |
924 | for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { |
925 | setRegScore(GprNo: RegNo, T, Val: CurrScore); |
926 | } |
927 | } |
928 | if (Inst.mayStore() && |
929 | (TII->isDS(MI: Inst) || TII->mayWriteLDSThroughDMA(MI: Inst))) { |
// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS
// that was written can be accessed. A load from LDS to VMEM does not need
// a wait.
932 | unsigned Slot = 0; |
933 | for (const auto *MemOp : Inst.memoperands()) { |
934 | if (!MemOp->isStore() || |
935 | MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS) |
936 | continue; |
937 | // Comparing just AA info does not guarantee memoperands are equal |
938 | // in general, but this is so for LDS DMA in practice. |
939 | auto AAI = MemOp->getAAInfo(); |
// Alias scope information gives a way to definitely identify an original
// memory object; in practice it is produced by the module LDS lowering
// pass. If there is no scope available we will not be able to disambiguate
// LDS aliasing, because after module lowering all LDS is squashed into a
// single big object. Do not waste one of the limited LDSDMAStores slots on
// something we will not be able to use anyway.
947 | if (!AAI || !AAI.Scope) |
948 | break; |
949 | for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) { |
950 | for (const auto *MemOp : LDSDMAStores[I]->memoperands()) { |
951 | if (MemOp->isStore() && AAI == MemOp->getAAInfo()) { |
952 | Slot = I + 1; |
953 | break; |
954 | } |
955 | } |
956 | } |
957 | if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1) |
958 | break; |
959 | LDSDMAStores.push_back(Elt: &Inst); |
960 | Slot = LDSDMAStores.size(); |
961 | break; |
962 | } |
963 | setRegScore(GprNo: SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, Val: CurrScore); |
964 | if (Slot) |
965 | setRegScore(GprNo: SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, Val: CurrScore); |
966 | } |
967 | } |
968 | } |
969 | |
970 | void WaitcntBrackets::print(raw_ostream &OS) { |
971 | OS << '\n'; |
972 | for (auto T : inst_counter_types(MaxCounter)) { |
973 | unsigned SR = getScoreRange(T); |
974 | |
975 | switch (T) { |
976 | case LOAD_CNT: |
977 | OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM" ) << "_CNT(" |
978 | << SR << "): " ; |
979 | break; |
980 | case DS_CNT: |
981 | OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM" ) << "_CNT(" |
982 | << SR << "): " ; |
983 | break; |
984 | case EXP_CNT: |
985 | OS << " EXP_CNT(" << SR << "): " ; |
986 | break; |
987 | case STORE_CNT: |
988 | OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS" ) << "_CNT(" |
989 | << SR << "): " ; |
990 | break; |
991 | case SAMPLE_CNT: |
992 | OS << " SAMPLE_CNT(" << SR << "): " ; |
993 | break; |
994 | case BVH_CNT: |
995 | OS << " BVH_CNT(" << SR << "): " ; |
996 | break; |
997 | case KM_CNT: |
998 | OS << " KM_CNT(" << SR << "): " ; |
999 | break; |
1000 | default: |
1001 | OS << " UNKNOWN(" << SR << "): " ; |
1002 | break; |
1003 | } |
1004 | |
1005 | if (SR != 0) { |
1006 | // Print vgpr scores. |
1007 | unsigned LB = getScoreLB(T); |
1008 | |
1009 | for (int J = 0; J <= VgprUB; J++) { |
1010 | unsigned RegScore = getRegScore(GprNo: J, T); |
1011 | if (RegScore <= LB) |
1012 | continue; |
1013 | unsigned RelScore = RegScore - LB - 1; |
1014 | if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) { |
1015 | OS << RelScore << ":v" << J << " " ; |
1016 | } else { |
1017 | OS << RelScore << ":ds " ; |
1018 | } |
1019 | } |
1020 | // Also need to print sgpr scores for lgkm_cnt. |
1021 | if (T == SmemAccessCounter) { |
1022 | for (int J = 0; J <= SgprUB; J++) { |
1023 | unsigned RegScore = getRegScore(GprNo: J + NUM_ALL_VGPRS, T); |
1024 | if (RegScore <= LB) |
1025 | continue; |
1026 | unsigned RelScore = RegScore - LB - 1; |
1027 | OS << RelScore << ":s" << J << " " ; |
1028 | } |
1029 | } |
1030 | } |
1031 | OS << '\n'; |
1032 | } |
1033 | OS << '\n'; |
1034 | } |
1035 | |
1036 | /// Simplify the waitcnt, in the sense of removing redundant counts, and return |
1037 | /// whether a waitcnt instruction is needed at all. |
1038 | void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { |
1039 | simplifyWaitcnt(T: LOAD_CNT, Count&: Wait.LoadCnt); |
1040 | simplifyWaitcnt(T: EXP_CNT, Count&: Wait.ExpCnt); |
1041 | simplifyWaitcnt(T: DS_CNT, Count&: Wait.DsCnt); |
1042 | simplifyWaitcnt(T: STORE_CNT, Count&: Wait.StoreCnt); |
1043 | simplifyWaitcnt(T: SAMPLE_CNT, Count&: Wait.SampleCnt); |
1044 | simplifyWaitcnt(T: BVH_CNT, Count&: Wait.BvhCnt); |
1045 | simplifyWaitcnt(T: KM_CNT, Count&: Wait.KmCnt); |
1046 | } |
1047 | |
1048 | void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, |
1049 | unsigned &Count) const { |
1050 | // The number of outstanding events for this type, T, can be calculated |
1051 | // as (UB - LB). If the current Count is greater than or equal to the number |
1052 | // of outstanding events, then the wait for this counter is redundant. |
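// For example, if only three events of type T are outstanding, a requested
// wait of 3 or more imposes no ordering and is dropped (reset to ~0u).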
1053 | if (Count >= getScoreRange(T)) |
1054 | Count = ~0u; |
1055 | } |
1056 | |
1057 | void WaitcntBrackets::determineWait(InstCounterType T, int RegNo, |
1058 | AMDGPU::Waitcnt &Wait) const { |
1059 | unsigned ScoreToWait = getRegScore(GprNo: RegNo, T); |
1060 | |
1061 | // If the score of src_operand falls within the bracket, we need an |
1062 | // s_waitcnt instruction. |
1063 | const unsigned LB = getScoreLB(T); |
1064 | const unsigned UB = getScoreUB(T); |
1065 | if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { |
1066 | if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && |
1067 | !ST->hasFlatLgkmVMemCountInOrder()) { |
1068 | // If there is a pending FLAT operation, and this is a VMem or LGKM |
1069 | // waitcnt and the target can report early completion, then we need |
1070 | // to force a waitcnt 0. |
1071 | addWait(Wait, T, Count: 0); |
1072 | } else if (counterOutOfOrder(T)) { |
// The counter can be decremented out of order when there are multiple
// event types in the bracket, so emit a wait with a conservative value
// of 0 for this counter.
1076 | addWait(Wait, T, Count: 0); |
1077 | } else { |
1078 | // If a counter has been maxed out avoid overflow by waiting for |
1079 | // MAX(CounterType) - 1 instead. |
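// For example, if UB == 10 and ScoreToWait == 7, three events of this type
// were issued after the producer, so waiting until at most
// UB - ScoreToWait == 3 remain outstanding is sufficient.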
1080 | unsigned NeededWait = std::min(a: UB - ScoreToWait, b: getWaitCountMax(T) - 1); |
1081 | addWait(Wait, T, Count: NeededWait); |
1082 | } |
1083 | } |
1084 | } |
1085 | |
1086 | void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { |
1087 | applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt); |
1088 | applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt); |
1089 | applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt); |
1090 | applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt); |
1091 | applyWaitcnt(T: SAMPLE_CNT, Count: Wait.SampleCnt); |
1092 | applyWaitcnt(T: BVH_CNT, Count: Wait.BvhCnt); |
1093 | applyWaitcnt(T: KM_CNT, Count: Wait.KmCnt); |
1094 | } |
1095 | |
1096 | void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { |
1097 | const unsigned UB = getScoreUB(T); |
1098 | if (Count >= UB) |
1099 | return; |
1100 | if (Count != 0) { |
1101 | if (counterOutOfOrder(T)) |
1102 | return; |
1103 | setScoreLB(T, Val: std::max(a: getScoreLB(T), b: UB - Count)); |
1104 | } else { |
1105 | setScoreLB(T, Val: UB); |
1106 | PendingEvents &= ~WaitEventMaskForInst[T]; |
1107 | } |
1108 | } |
1109 | |
1110 | // Where there are multiple types of event in the bracket of a counter, |
1111 | // the decrement may go out of order. |
1112 | bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { |
1113 | // Scalar memory read always can go out of order. |
1114 | if (T == SmemAccessCounter && hasPendingEvent(E: SMEM_ACCESS)) |
1115 | return true; |
1116 | return hasMixedPendingEvents(T); |
1117 | } |
1118 | |
1119 | INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts" , false, |
1120 | false) |
1121 | INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) |
1122 | INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) |
1123 | INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts" , false, |
1124 | false) |
1125 | |
1126 | char SIInsertWaitcnts::ID = 0; |
1127 | |
1128 | char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; |
1129 | |
1130 | FunctionPass *llvm::createSIInsertWaitcntsPass() { |
1131 | return new SIInsertWaitcnts(); |
1132 | } |
1133 | |
1134 | static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, |
1135 | unsigned NewEnc) { |
1136 | int OpIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: OpName); |
1137 | assert(OpIdx >= 0); |
1138 | |
1139 | MachineOperand &MO = MI.getOperand(i: OpIdx); |
1140 | |
1141 | if (NewEnc == MO.getImm()) |
1142 | return false; |
1143 | |
1144 | MO.setImm(NewEnc); |
1145 | return true; |
1146 | } |
1147 | |
1148 | /// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction, |
1149 | /// and if so, which counter it is waiting on. |
1150 | static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) { |
1151 | switch (Opcode) { |
1152 | case AMDGPU::S_WAIT_LOADCNT: |
1153 | return LOAD_CNT; |
1154 | case AMDGPU::S_WAIT_EXPCNT: |
1155 | return EXP_CNT; |
1156 | case AMDGPU::S_WAIT_STORECNT: |
1157 | return STORE_CNT; |
1158 | case AMDGPU::S_WAIT_SAMPLECNT: |
1159 | return SAMPLE_CNT; |
1160 | case AMDGPU::S_WAIT_BVHCNT: |
1161 | return BVH_CNT; |
1162 | case AMDGPU::S_WAIT_DSCNT: |
1163 | return DS_CNT; |
1164 | case AMDGPU::S_WAIT_KMCNT: |
1165 | return KM_CNT; |
1166 | default: |
1167 | return {}; |
1168 | } |
1169 | } |
1170 | |
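// A "soft" waitcnt is a placeholder inserted by an earlier pass (e.g. the
// memory legalizer); promoting it rewrites the opcode to the corresponding
// real wait instruction once this pass decides the wait must be kept.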
1171 | bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { |
1172 | unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Waitcnt->getOpcode()); |
1173 | if (Opcode == Waitcnt->getOpcode()) |
1174 | return false; |
1175 | |
1176 | Waitcnt->setDesc(TII->get(Opcode)); |
1177 | return true; |
1178 | } |
1179 | |
1180 | /// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that |
1181 | /// precede \p It and follow \p OldWaitcntInstr and apply any extra waits |
1182 | /// from \p Wait that were added by previous passes. Currently this pass |
1183 | /// conservatively assumes that these preexisting waits are required for |
1184 | /// correctness. |
1185 | bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( |
1186 | WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, |
1187 | AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { |
1188 | assert(ST); |
1189 | assert(isNormalMode(MaxCounter)); |
1190 | |
1191 | bool Modified = false; |
1192 | MachineInstr *WaitcntInstr = nullptr; |
1193 | MachineInstr *WaitcntVsCntInstr = nullptr; |
1194 | |
1195 | for (auto &II : |
1196 | make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) { |
1197 | if (II.isMetaInstruction()) |
1198 | continue; |
1199 | |
1200 | unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode()); |
1201 | bool TrySimplify = Opcode != II.getOpcode() && !OptNone; |
1202 | |
1203 | // Update required wait count. If this is a soft waitcnt (= it was added |
1204 | // by an earlier pass), it may be entirely removed. |
1205 | if (Opcode == AMDGPU::S_WAITCNT) { |
1206 | unsigned IEnc = II.getOperand(i: 0).getImm(); |
1207 | AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(Version: IV, Encoded: IEnc); |
1208 | if (TrySimplify) |
1209 | ScoreBrackets.simplifyWaitcnt(Wait&: OldWait); |
1210 | Wait = Wait.combined(Other: OldWait); |
1211 | |
1212 | // Merge consecutive waitcnt of the same type by erasing multiples. |
1213 | if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) { |
1214 | II.eraseFromParent(); |
1215 | Modified = true; |
1216 | } else |
1217 | WaitcntInstr = &II; |
1218 | } else { |
1219 | assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); |
1220 | assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); |
1221 | |
1222 | unsigned OldVSCnt = |
1223 | TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm(); |
1224 | if (TrySimplify) |
1225 | ScoreBrackets.simplifyWaitcnt(T: InstCounterType::STORE_CNT, Count&: OldVSCnt); |
1226 | Wait.StoreCnt = std::min(a: Wait.StoreCnt, b: OldVSCnt); |
1227 | |
1228 | if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) { |
1229 | II.eraseFromParent(); |
1230 | Modified = true; |
1231 | } else |
1232 | WaitcntVsCntInstr = &II; |
1233 | } |
1234 | } |
1235 | |
1236 | if (WaitcntInstr) { |
1237 | Modified |= updateOperandIfDifferent(MI&: *WaitcntInstr, OpName: AMDGPU::OpName::simm16, |
1238 | NewEnc: AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait)); |
1239 | Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntInstr); |
1240 | |
1241 | ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt); |
1242 | ScoreBrackets.applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt); |
1243 | ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt); |
1244 | Wait.LoadCnt = ~0u; |
1245 | Wait.ExpCnt = ~0u; |
1246 | Wait.DsCnt = ~0u; |
1247 | |
1248 | LLVM_DEBUG(It == WaitcntInstr->getParent()->end() |
1249 | ? dbgs() |
1250 | << "applyPreexistingWaitcnt\n" |
1251 | << "New Instr at block end: " << *WaitcntInstr << '\n' |
1252 | : dbgs() << "applyPreexistingWaitcnt\n" |
1253 | << "Old Instr: " << *It |
1254 | << "New Instr: " << *WaitcntInstr << '\n'); |
1255 | } |
1256 | |
1257 | if (WaitcntVsCntInstr) { |
1258 | Modified |= updateOperandIfDifferent(MI&: *WaitcntVsCntInstr, |
1259 | OpName: AMDGPU::OpName::simm16, NewEnc: Wait.StoreCnt); |
1260 | Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntVsCntInstr); |
1261 | |
1262 | ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt); |
1263 | Wait.StoreCnt = ~0u; |
1264 | |
1265 | LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end() |
1266 | ? dbgs() << "applyPreexistingWaitcnt\n" |
1267 | << "New Instr at block end: " << *WaitcntVsCntInstr |
1268 | << '\n' |
1269 | : dbgs() << "applyPreexistingWaitcnt\n" |
1270 | << "Old Instr: " << *It |
1271 | << "New Instr: " << *WaitcntVsCntInstr << '\n'); |
1272 | } |
1273 | |
1274 | return Modified; |
1275 | } |
1276 | |
1277 | /// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any |
1278 | /// required counters in \p Wait |
1279 | bool WaitcntGeneratorPreGFX12::createNewWaitcnt( |
1280 | MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, |
1281 | AMDGPU::Waitcnt Wait) { |
1282 | assert(ST); |
1283 | assert(isNormalMode(MaxCounter)); |
1284 | |
1285 | bool Modified = false; |
1286 | const DebugLoc &DL = Block.findDebugLoc(MBBI: It); |
1287 | |
// Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
// single instruction while VScnt has its own instruction.
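// For example, a Wait with LoadCnt == 0 and DsCnt == 0 (ExpCnt left at ~0u)
// is encoded into one s_waitcnt that waits for vmcnt(0) and lgkmcnt(0)
// while leaving expcnt at its field maximum, i.e. no export wait.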
1290 | if (Wait.hasWaitExceptStoreCnt()) { |
1291 | unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait); |
1292 | [[maybe_unused]] auto SWaitInst = |
1293 | BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: Enc); |
1294 | Modified = true; |
1295 | |
1296 | LLVM_DEBUG(dbgs() << "generateWaitcnt\n" ; |
1297 | if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; |
1298 | dbgs() << "New Instr: " << *SWaitInst << '\n'); |
1299 | } |
1300 | |
1301 | if (Wait.hasWaitStoreCnt()) { |
1302 | assert(ST->hasVscnt()); |
1303 | |
1304 | [[maybe_unused]] auto SWaitInst = |
1305 | BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT)) |
1306 | .addReg(RegNo: AMDGPU::SGPR_NULL, flags: RegState::Undef) |
1307 | .addImm(Val: Wait.StoreCnt); |
1308 | Modified = true; |
1309 | |
1310 | LLVM_DEBUG(dbgs() << "generateWaitcnt\n" ; |
1311 | if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; |
1312 | dbgs() << "New Instr: " << *SWaitInst << '\n'); |
1313 | } |
1314 | |
1315 | return Modified; |
1316 | } |
1317 | |
1318 | AMDGPU::Waitcnt |
1319 | WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const { |
1320 | return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u); |
1321 | } |
1322 | |
1323 | AMDGPU::Waitcnt |
1324 | WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { |
1325 | return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0); |
1326 | } |
1327 | |
1328 | /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and |
1329 | /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that |
1330 | /// were added by previous passes. Currently this pass conservatively |
1331 | /// assumes that these preexisting waits are required for correctness. |
1332 | bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( |
1333 | WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, |
1334 | AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { |
1335 | assert(ST); |
1336 | assert(!isNormalMode(MaxCounter)); |
1337 | |
1338 | bool Modified = false; |
1339 | MachineInstr *CombinedLoadDsCntInstr = nullptr; |
1340 | MachineInstr *CombinedStoreDsCntInstr = nullptr; |
1341 | MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {}; |
1342 | |
1343 | for (auto &II : |
1344 | make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) { |
1345 | if (II.isMetaInstruction()) |
1346 | continue; |
1347 | |
1348 | MachineInstr **UpdatableInstr; |
1349 | |
1350 | // Update required wait count. If this is a soft waitcnt (= it was added |
1351 | // by an earlier pass), it may be entirely removed. |
1352 | |
1353 | unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode()); |
1354 | bool TrySimplify = Opcode != II.getOpcode() && !OptNone; |
1355 | |
1356 | // Don't crash if the programmer used legacy waitcnt intrinsics, but don't |
1357 | // attempt to do more than that either. |
1358 | if (Opcode == AMDGPU::S_WAITCNT) |
1359 | continue; |
1360 | |
1361 | if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) { |
1362 | unsigned OldEnc = |
1363 | TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm(); |
1364 | AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(Version: IV, LoadcntDscnt: OldEnc); |
1365 | if (TrySimplify) |
1366 | ScoreBrackets.simplifyWaitcnt(Wait&: OldWait); |
1367 | Wait = Wait.combined(Other: OldWait); |
1368 | UpdatableInstr = &CombinedLoadDsCntInstr; |
1369 | } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) { |
1370 | unsigned OldEnc = |
1371 | TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm(); |
1372 | AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(Version: IV, StorecntDscnt: OldEnc); |
1373 | if (TrySimplify) |
1374 | ScoreBrackets.simplifyWaitcnt(Wait&: OldWait); |
1375 | Wait = Wait.combined(Other: OldWait); |
1376 | UpdatableInstr = &CombinedStoreDsCntInstr; |
1377 | } else { |
1378 | std::optional<InstCounterType> CT = counterTypeForInstr(Opcode); |
1379 | assert(CT.has_value()); |
1380 | unsigned OldCnt = |
1381 | TII->getNamedOperand(MI&: II, OperandName: AMDGPU::OpName::simm16)->getImm(); |
1382 | if (TrySimplify) |
1383 | ScoreBrackets.simplifyWaitcnt(T: CT.value(), Count&: OldCnt); |
1384 | addWait(Wait, T: CT.value(), Count: OldCnt); |
1385 | UpdatableInstr = &WaitInstrs[CT.value()]; |
1386 | } |
1387 | |
1388 | // Merge consecutive waitcnt of the same type by erasing multiples. |
1389 | if (!*UpdatableInstr) { |
1390 | *UpdatableInstr = &II; |
1391 | } else { |
1392 | II.eraseFromParent(); |
1393 | Modified = true; |
1394 | } |
1395 | } |
1396 | |
1397 | if (CombinedLoadDsCntInstr) { |
1398 | // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need |
1399 | // to be waited for. Otherwise, let the instruction be deleted so |
1400 | // the appropriate single counter wait instruction can be inserted |
1401 | // instead, when new S_WAIT_*CNT instructions are inserted by |
1402 | // createNewWaitcnt(). As a side effect, resetting the wait counts will |
1403 | // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by |
1404 | // the loop below that deals with single counter instructions. |
1405 | if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) { |
1406 | unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait); |
1407 | Modified |= updateOperandIfDifferent(MI&: *CombinedLoadDsCntInstr, |
1408 | OpName: AMDGPU::OpName::simm16, NewEnc); |
1409 | Modified |= promoteSoftWaitCnt(Waitcnt: CombinedLoadDsCntInstr); |
1410 | ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt); |
1411 | ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt); |
1412 | Wait.LoadCnt = ~0u; |
1413 | Wait.DsCnt = ~0u; |
1414 | |
1415 | LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() |
1416 | ? dbgs() << "applyPreexistingWaitcnt\n" |
1417 | << "New Instr at block end: " |
1418 | << *CombinedLoadDsCntInstr << '\n' |
1419 | : dbgs() << "applyPreexistingWaitcnt\n" |
1420 | << "Old Instr: " << *It << "New Instr: " |
1421 | << *CombinedLoadDsCntInstr << '\n'); |
1422 | } else { |
1423 | CombinedLoadDsCntInstr->eraseFromParent(); |
1424 | Modified = true; |
1425 | } |
1426 | } |
1427 | |
1428 | if (CombinedStoreDsCntInstr) { |
1429 | // Similarly for S_WAIT_STORECNT_DSCNT. |
1430 | if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) { |
1431 | unsigned NewEnc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait); |
1432 | Modified |= updateOperandIfDifferent(MI&: *CombinedStoreDsCntInstr, |
1433 | OpName: AMDGPU::OpName::simm16, NewEnc); |
1434 | Modified |= promoteSoftWaitCnt(Waitcnt: CombinedStoreDsCntInstr); |
1435 | ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt); |
1436 | ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt); |
1437 | Wait.StoreCnt = ~0u; |
1438 | Wait.DsCnt = ~0u; |
1439 | |
1440 | LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() |
1441 | ? dbgs() << "applyPreexistingWaitcnt\n" |
1442 | << "New Instr at block end: " |
1443 | << *CombinedStoreDsCntInstr << '\n' |
1444 | : dbgs() << "applyPreexistingWaitcnt\n" |
1445 | << "Old Instr: " << *It << "New Instr: " |
1446 | << *CombinedStoreDsCntInstr << '\n'); |
1447 | } else { |
1448 | CombinedStoreDsCntInstr->eraseFromParent(); |
1449 | Modified = true; |
1450 | } |
1451 | } |
1452 | |
1453 | // Look for an opportunity to convert existing S_WAIT_LOADCNT, |
1454 | // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT |
1455 | // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing |
1456 | // instructions so that createNewWaitcnt() will create new combined |
1457 | // instructions to replace them. |
1458 | |
1459 | if (Wait.DsCnt != ~0u) { |
1460 | // This is a vector of addresses in WaitInstrs pointing to instructions |
1461 | // that should be removed if they are present. |
1462 | SmallVector<MachineInstr **, 2> WaitsToErase; |
1463 | |
1464 | // If it's known that both DScnt and either LOADcnt or STOREcnt (but not |
1465 | // both) need to be waited for, ensure that there are no existing |
1466 | // individual wait count instructions for these. |
1467 | |
1468 | if (Wait.LoadCnt != ~0u) { |
1469 | WaitsToErase.push_back(Elt: &WaitInstrs[LOAD_CNT]); |
1470 | WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]); |
1471 | } else if (Wait.StoreCnt != ~0u) { |
1472 | WaitsToErase.push_back(Elt: &WaitInstrs[STORE_CNT]); |
1473 | WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]); |
1474 | } |
1475 | |
1476 | for (MachineInstr **WI : WaitsToErase) { |
1477 | if (!*WI) |
1478 | continue; |
1479 | |
1480 | (*WI)->eraseFromParent(); |
1481 | *WI = nullptr; |
1482 | Modified = true; |
1483 | } |
1484 | } |
1485 | |
1486 | for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) { |
1487 | if (!WaitInstrs[CT]) |
1488 | continue; |
1489 | |
1490 | unsigned NewCnt = getWait(Wait, T: CT); |
1491 | if (NewCnt != ~0u) { |
1492 | Modified |= updateOperandIfDifferent(MI&: *WaitInstrs[CT], |
1493 | OpName: AMDGPU::OpName::simm16, NewEnc: NewCnt); |
1494 | Modified |= promoteSoftWaitCnt(Waitcnt: WaitInstrs[CT]); |
1495 | |
1496 | ScoreBrackets.applyWaitcnt(T: CT, Count: NewCnt); |
1497 | setNoWait(Wait, T: CT); |
1498 | |
1499 | LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() |
1500 | ? dbgs() << "applyPreexistingWaitcnt\n" |
1501 | << "New Instr at block end: " << *WaitInstrs[CT] |
1502 | << '\n' |
1503 | : dbgs() << "applyPreexistingWaitcnt\n" |
1504 | << "Old Instr: " << *It |
1505 | << "New Instr: " << *WaitInstrs[CT] << '\n'); |
1506 | } else { |
1507 | WaitInstrs[CT]->eraseFromParent(); |
1508 | Modified = true; |
1509 | } |
1510 | } |
1511 | |
1512 | return Modified; |
1513 | } |
1514 | |
1515 | /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait |
1516 | bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( |
1517 | MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, |
1518 | AMDGPU::Waitcnt Wait) { |
1519 | assert(ST); |
1520 | assert(!isNormalMode(MaxCounter)); |
1521 | |
1522 | bool Modified = false; |
1523 | const DebugLoc &DL = Block.findDebugLoc(MBBI: It); |
1524 | |
1525 | // Check for opportunities to use combined wait instructions. |
1526 | if (Wait.DsCnt != ~0u) { |
1527 | MachineInstr *SWaitInst = nullptr; |
1528 | |
1529 | if (Wait.LoadCnt != ~0u) { |
1530 | unsigned Enc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait); |
1531 | |
1532 | SWaitInst = BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT)) |
1533 | .addImm(Val: Enc); |
1534 | |
1535 | Wait.LoadCnt = ~0u; |
1536 | Wait.DsCnt = ~0u; |
1537 | } else if (Wait.StoreCnt != ~0u) { |
1538 | unsigned Enc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait); |
1539 | |
1540 | SWaitInst = |
1541 | BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_STORECNT_DSCNT)) |
1542 | .addImm(Val: Enc); |
1543 | |
1544 | Wait.StoreCnt = ~0u; |
1545 | Wait.DsCnt = ~0u; |
1546 | } |
1547 | |
1548 | if (SWaitInst) { |
1549 | Modified = true; |
1550 | |
      LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1552 | if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; |
1553 | dbgs() << "New Instr: " << *SWaitInst << '\n'); |
1554 | } |
1555 | } |
1556 | |
1557 | // Generate an instruction for any remaining counter that needs |
1558 | // waiting for. |
1559 | |
1560 | for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) { |
1561 | unsigned Count = getWait(Wait, T: CT); |
1562 | if (Count == ~0u) |
1563 | continue; |
1564 | |
1565 | [[maybe_unused]] auto SWaitInst = |
1566 | BuildMI(BB&: Block, I: It, MIMD: DL, MCID: TII->get(Opcode: instrsForExtendedCounterTypes[CT])) |
1567 | .addImm(Val: Count); |
1568 | |
1569 | Modified = true; |
1570 | |
    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1572 | if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; |
1573 | dbgs() << "New Instr: " << *SWaitInst << '\n'); |
1574 | } |
1575 | |
1576 | return Modified; |
1577 | } |
1578 | |
1579 | static bool readsVCCZ(const MachineInstr &MI) { |
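  // A conditional branch on vccz only depends on a meaningful vccz value when
  // its VCC operand has not been marked undef.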
1580 | unsigned Opc = MI.getOpcode(); |
1581 | return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && |
1582 | !MI.getOperand(i: 1).isUndef(); |
1583 | } |
1584 | |
1585 | /// \returns true if the callee inserts an s_waitcnt 0 on function entry. |
1586 | static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { |
1587 | // Currently all conventions wait, but this may not always be the case. |
1588 | // |
  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
  // sense to omit the wait and do it in the caller.
1591 | return true; |
1592 | } |
1593 | |
/// \returns true if the callee is expected to resolve any outstanding waits
/// before returning.
1596 | static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { |
1597 | return true; |
1598 | } |
1599 | |
/// Generate the s_waitcnt instructions, if any, that must be placed before
/// \p MI. Instructions of a given type are returned in order, but
/// instructions of different types can complete out of order. We rely on
/// this in-order completion and simply assign a score to each memory access
/// instruction. We keep track of the active "score bracket" to determine
/// whether a memory access requires an s_waitcnt, and if so what the value
/// of each counter must be. The "score bracket" is bound by the lower and
/// upper bound scores (*_score_LB and *_score_ub respectively).
/// If FlushVmCnt is true, we also want to generate an s_waitcnt to flush the
/// vmcnt counter here.
1612 | bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, |
1613 | WaitcntBrackets &ScoreBrackets, |
1614 | MachineInstr *OldWaitcntInstr, |
1615 | bool FlushVmCnt) { |
1616 | setForceEmitWaitcnt(); |
1617 | |
1618 | if (MI.isMetaInstruction()) |
1619 | return false; |
1620 | |
1621 | AMDGPU::Waitcnt Wait; |
1622 | |
1623 | // FIXME: This should have already been handled by the memory legalizer. |
1624 | // Removing this currently doesn't affect any lit tests, but we need to |
1625 | // verify that nothing was relying on this. The number of buffer invalidates |
1626 | // being handled here should not be expanded. |
1627 | if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || |
1628 | MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || |
1629 | MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || |
1630 | MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || |
1631 | MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { |
1632 | Wait.LoadCnt = 0; |
1633 | } |
1634 | |
1635 | // All waits must be resolved at call return. |
1636 | // NOTE: this could be improved with knowledge of all call sites or |
1637 | // with knowledge of the called routines. |
1638 | if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || |
1639 | MI.getOpcode() == AMDGPU::SI_RETURN || |
1640 | MI.getOpcode() == AMDGPU::S_SETPC_B64_return || |
1641 | (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { |
1642 | Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); |
1643 | } |
1644 | // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM |
1645 | // stores. In this case it can be useful to send a message to explicitly |
1646 | // release all VGPRs before the stores have completed, but it is only safe to |
1647 | // do this if: |
1648 | // * there are no outstanding scratch stores |
1649 | // * we are not in Dynamic VGPR mode |
1650 | else if (MI.getOpcode() == AMDGPU::S_ENDPGM || |
1651 | MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { |
1652 | if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() && |
1653 | ScoreBrackets.getScoreRange(T: STORE_CNT) != 0 && |
1654 | !ScoreBrackets.hasPendingEvent(E: SCRATCH_WRITE_ACCESS)) |
1655 | ReleaseVGPRInsts.insert(V: &MI); |
1656 | } |
1657 | // Resolve vm waits before gs-done. |
1658 | else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || |
1659 | MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && |
1660 | ST->hasLegacyGeometry() && |
1661 | ((MI.getOperand(i: 0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == |
1662 | AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { |
1663 | Wait.LoadCnt = 0; |
1664 | } |
1665 | |
1666 | // Export & GDS instructions do not read the EXEC mask until after the export |
1667 | // is granted (which can occur well after the instruction is issued). |
1668 | // The shader program must flush all EXP operations on the export-count |
1669 | // before overwriting the EXEC mask. |
1670 | else { |
1671 | if (MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI)) { |
1672 | // Export and GDS are tracked individually, either may trigger a waitcnt |
1673 | // for EXEC. |
1674 | if (ScoreBrackets.hasPendingEvent(E: EXP_GPR_LOCK) || |
1675 | ScoreBrackets.hasPendingEvent(E: EXP_PARAM_ACCESS) || |
1676 | ScoreBrackets.hasPendingEvent(E: EXP_POS_ACCESS) || |
1677 | ScoreBrackets.hasPendingEvent(E: GDS_GPR_LOCK)) { |
1678 | Wait.ExpCnt = 0; |
1679 | } |
1680 | } |
1681 | |
1682 | if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { |
1683 | // The function is going to insert a wait on everything in its prolog. |
      // We still need to be careful if the call target is a load (e.g. a GOT
      // load), and we also need to check the WAW dependency with the saved PC.
1686 | Wait = AMDGPU::Waitcnt(); |
1687 | |
1688 | int CallAddrOpIdx = |
1689 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::src0); |
1690 | |
1691 | if (MI.getOperand(i: CallAddrOpIdx).isReg()) { |
1692 | RegInterval CallAddrOpInterval = |
1693 | ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: CallAddrOpIdx); |
1694 | |
1695 | for (int RegNo = CallAddrOpInterval.first; |
1696 | RegNo < CallAddrOpInterval.second; ++RegNo) |
1697 | ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait); |
1698 | |
1699 | int RtnAddrOpIdx = |
1700 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::dst); |
1701 | if (RtnAddrOpIdx != -1) { |
1702 | RegInterval RtnAddrOpInterval = |
1703 | ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: RtnAddrOpIdx); |
1704 | |
1705 | for (int RegNo = RtnAddrOpInterval.first; |
1706 | RegNo < RtnAddrOpInterval.second; ++RegNo) |
1707 | ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait); |
1708 | } |
1709 | } |
1710 | } else { |
1711 | // FIXME: Should not be relying on memoperands. |
1712 | // Look at the source operands of every instruction to see if |
1713 | // any of them results from a previous memory operation that affects |
1714 | // its current usage. If so, an s_waitcnt instruction needs to be |
1715 | // emitted. |
1716 | // If the source operand was defined by a load, add the s_waitcnt |
1717 | // instruction. |
1718 | // |
1719 | // Two cases are handled for destination operands: |
1720 | // 1) If the destination operand was defined by a load, add the s_waitcnt |
1721 | // instruction to guarantee the right WAW order. |
1722 | // 2) If a destination operand that was used by a recent export/store ins, |
1723 | // add s_waitcnt on exp_cnt to guarantee the WAR order. |
1724 | |
1725 | for (const MachineMemOperand *Memop : MI.memoperands()) { |
1726 | const Value *Ptr = Memop->getValue(); |
1727 | if (Memop->isStore() && SLoadAddresses.count(Val: Ptr)) { |
1728 | addWait(Wait, T: SmemAccessCounter, Count: 0); |
1729 | if (PDT->dominates(A: MI.getParent(), B: SLoadAddresses.find(Val: Ptr)->second)) |
1730 | SLoadAddresses.erase(Val: Ptr); |
1731 | } |
1732 | unsigned AS = Memop->getAddrSpace(); |
1733 | if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS) |
1734 | continue; |
1735 | // No need to wait before load from VMEM to LDS. |
1736 | if (TII->mayWriteLDSThroughDMA(MI)) |
1737 | continue; |
1738 | |
1739 | // LOAD_CNT is only relevant to vgpr or LDS. |
1740 | unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; |
1741 | bool FoundAliasingStore = false; |
1742 | // Only objects with alias scope info were added to LDSDMAScopes array. |
        // In the absence of the scope info we will not be able to disambiguate
1744 | // aliasing here. There is no need to try searching for a corresponding |
1745 | // store slot. This is conservatively correct because in that case we |
1746 | // will produce a wait using the first (general) LDS DMA wait slot which |
1747 | // will wait on all of them anyway. |
1748 | if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) { |
1749 | const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); |
1750 | for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { |
1751 | if (MI.mayAlias(AA, Other: *LDSDMAStores[I], UseTBAA: true)) { |
1752 | FoundAliasingStore = true; |
1753 | ScoreBrackets.determineWait(T: LOAD_CNT, RegNo: RegNo + I + 1, Wait); |
1754 | } |
1755 | } |
1756 | } |
1757 | if (!FoundAliasingStore) |
1758 | ScoreBrackets.determineWait(T: LOAD_CNT, RegNo, Wait); |
1759 | if (Memop->isStore()) { |
1760 | ScoreBrackets.determineWait(T: EXP_CNT, RegNo, Wait); |
1761 | } |
1762 | } |
1763 | |
1764 | // Loop over use and def operands. |
1765 | for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { |
1766 | MachineOperand &Op = MI.getOperand(i: I); |
1767 | if (!Op.isReg()) |
1768 | continue; |
1769 | |
1770 | // If the instruction does not read tied source, skip the operand. |
1771 | if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) |
1772 | continue; |
1773 | |
1774 | RegInterval Interval = ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: I); |
1775 | |
1776 | const bool IsVGPR = TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()); |
1777 | for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { |
1778 | if (IsVGPR) { |
1779 | // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the |
1780 | // previous write and this write are the same type of VMEM |
1781 | // instruction, in which case they are (in some architectures) |
1782 | // guaranteed to write their results in order anyway. |
1783 | if (Op.isUse() || !updateVMCntOnly(Inst: MI) || |
1784 | ScoreBrackets.hasOtherPendingVmemTypes(GprNo: RegNo, |
1785 | V: getVmemType(Inst: MI)) || |
1786 | !ST->hasVmemWriteVgprInOrder()) { |
1787 | ScoreBrackets.determineWait(T: LOAD_CNT, RegNo, Wait); |
1788 | ScoreBrackets.determineWait(T: SAMPLE_CNT, RegNo, Wait); |
1789 | ScoreBrackets.determineWait(T: BVH_CNT, RegNo, Wait); |
1790 | ScoreBrackets.clearVgprVmemTypes(GprNo: RegNo); |
1791 | } |
1792 | if (Op.isDef() || ScoreBrackets.hasPendingEvent(E: EXP_LDS_ACCESS)) { |
1793 | ScoreBrackets.determineWait(T: EXP_CNT, RegNo, Wait); |
1794 | } |
1795 | ScoreBrackets.determineWait(T: DS_CNT, RegNo, Wait); |
1796 | } else { |
1797 | ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait); |
1798 | } |
1799 | } |
1800 | } |
1801 | } |
1802 | } |
1803 | |
1804 | // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does |
1805 | // not, we need to ensure the subtarget is capable of backing off barrier |
1806 | // instructions in case there are any outstanding memory operations that may |
1807 | // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. |
1808 | if (TII->isBarrierStart(Opcode: MI.getOpcode()) && |
1809 | !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { |
1810 | Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); |
1811 | } |
1812 | |
1813 | // TODO: Remove this work-around, enable the assert for Bug 457939 |
1814 | // after fixing the scheduler. Also, the Shader Compiler code is |
1815 | // independent of target. |
1816 | if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { |
1817 | if (ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) { |
1818 | Wait.DsCnt = 0; |
1819 | } |
1820 | } |
1821 | |
1822 | // Verify that the wait is actually needed. |
1823 | ScoreBrackets.simplifyWaitcnt(Wait); |
1824 | |
1825 | if (ForceEmitZeroWaitcnts) |
1826 | Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); |
1827 | |
1828 | if (ForceEmitWaitcnt[LOAD_CNT]) |
1829 | Wait.LoadCnt = 0; |
1830 | if (ForceEmitWaitcnt[EXP_CNT]) |
1831 | Wait.ExpCnt = 0; |
1832 | if (ForceEmitWaitcnt[DS_CNT]) |
1833 | Wait.DsCnt = 0; |
1834 | if (ForceEmitWaitcnt[SAMPLE_CNT]) |
1835 | Wait.SampleCnt = 0; |
1836 | if (ForceEmitWaitcnt[BVH_CNT]) |
1837 | Wait.BvhCnt = 0; |
1838 | if (ForceEmitWaitcnt[KM_CNT]) |
1839 | Wait.KmCnt = 0; |
1840 | |
1841 | if (FlushVmCnt) { |
1842 | if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT)) |
1843 | Wait.LoadCnt = 0; |
1844 | if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT)) |
1845 | Wait.SampleCnt = 0; |
1846 | if (ScoreBrackets.hasPendingEvent(T: BVH_CNT)) |
1847 | Wait.BvhCnt = 0; |
1848 | } |
1849 | |
1850 | return generateWaitcnt(Wait, It: MI.getIterator(), Block&: *MI.getParent(), ScoreBrackets, |
1851 | OldWaitcntInstr); |
1852 | } |
1853 | |
1854 | bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, |
1855 | MachineBasicBlock::instr_iterator It, |
1856 | MachineBasicBlock &Block, |
1857 | WaitcntBrackets &ScoreBrackets, |
1858 | MachineInstr *OldWaitcntInstr) { |
1859 | bool Modified = false; |
1860 | |
1861 | if (OldWaitcntInstr) |
1862 | // Try to merge the required wait with preexisting waitcnt instructions. |
1863 | // Also erase redundant waitcnt. |
1864 | Modified = |
1865 | WCG->applyPreexistingWaitcnt(ScoreBrackets, OldWaitcntInstr&: *OldWaitcntInstr, Wait, It); |
1866 | |
1867 | // Any counts that could have been applied to any existing waitcnt |
1868 | // instructions will have been done so, now deal with any remaining. |
1869 | ScoreBrackets.applyWaitcnt(Wait); |
1870 | |
1871 | // ExpCnt can be merged into VINTERP. |
1872 | if (Wait.ExpCnt != ~0u && It != Block.instr_end() && |
1873 | SIInstrInfo::isVINTERP(MI: *It)) { |
1874 | MachineOperand *WaitExp = |
1875 | TII->getNamedOperand(MI&: *It, OperandName: AMDGPU::OpName::waitexp); |
1876 | if (Wait.ExpCnt < WaitExp->getImm()) { |
1877 | WaitExp->setImm(Wait.ExpCnt); |
1878 | Modified = true; |
1879 | } |
1880 | Wait.ExpCnt = ~0u; |
1881 | |
1882 | LLVM_DEBUG(dbgs() << "generateWaitcnt\n" |
1883 | << "Update Instr: " << *It); |
1884 | } |
1885 | |
1886 | if (WCG->createNewWaitcnt(Block, It, Wait)) |
1887 | Modified = true; |
1888 | |
1889 | return Modified; |
1890 | } |
1891 | |
1892 | // This is a flat memory operation. Check to see if it has memory tokens other |
1893 | // than LDS. Other address spaces supported by flat memory operations involve |
1894 | // global memory. |
1895 | bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { |
1896 | assert(TII->isFLAT(MI)); |
1897 | |
1898 | // All flat instructions use the VMEM counter. |
1899 | assert(TII->usesVM_CNT(MI)); |
1900 | |
1901 | // If there are no memory operands then conservatively assume the flat |
1902 | // operation may access VMEM. |
1903 | if (MI.memoperands_empty()) |
1904 | return true; |
1905 | |
1906 | // See if any memory operand specifies an address space that involves VMEM. |
  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1908 | // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION |
1909 | // (GDS) address space is not supported by flat operations. Therefore, simply |
1910 | // return true unless only the LDS address space is found. |
1911 | for (const MachineMemOperand *Memop : MI.memoperands()) { |
1912 | unsigned AS = Memop->getAddrSpace(); |
1913 | assert(AS != AMDGPUAS::REGION_ADDRESS); |
1914 | if (AS != AMDGPUAS::LOCAL_ADDRESS) |
1915 | return true; |
1916 | } |
1917 | |
1918 | return false; |
1919 | } |
1920 | |
1921 | // This is a flat memory operation. Check to see if it has memory tokens for |
1922 | // either LDS or FLAT. |
1923 | bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { |
1924 | assert(TII->isFLAT(MI)); |
1925 | |
  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1927 | if (!TII->usesLGKM_CNT(MI)) |
1928 | return false; |
1929 | |
1930 | // If in tgsplit mode then there can be no use of LDS. |
1931 | if (ST->isTgSplitEnabled()) |
1932 | return false; |
1933 | |
1934 | // If there are no memory operands then conservatively assume the flat |
1935 | // operation may access LDS. |
1936 | if (MI.memoperands_empty()) |
1937 | return true; |
1938 | |
1939 | // See if any memory operand specifies an address space that involves LDS. |
1940 | for (const MachineMemOperand *Memop : MI.memoperands()) { |
1941 | unsigned AS = Memop->getAddrSpace(); |
1942 | if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) |
1943 | return true; |
1944 | } |
1945 | |
1946 | return false; |
1947 | } |
1948 | |
1949 | // This is a flat memory operation. Check to see if it has memory tokens for |
1950 | // either scratch or FLAT. |
1951 | bool SIInsertWaitcnts::mayAccessScratchThroughFlat( |
1952 | const MachineInstr &MI) const { |
1953 | assert(TII->isFLAT(MI)); |
1954 | |
1955 | // SCRATCH instructions always access scratch. |
1956 | if (TII->isFLATScratch(MI)) |
1957 | return true; |
1958 | |
1959 | // GLOBAL instructions never access scratch. |
1960 | if (TII->isFLATGlobal(MI)) |
1961 | return false; |
1962 | |
1963 | // If there are no memory operands then conservatively assume the flat |
1964 | // operation may access scratch. |
1965 | if (MI.memoperands_empty()) |
1966 | return true; |
1967 | |
1968 | // See if any memory operand specifies an address space that involves scratch. |
1969 | return any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *Memop) { |
1970 | unsigned AS = Memop->getAddrSpace(); |
1971 | return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; |
1972 | }); |
1973 | } |
1974 | |
1975 | static bool isCacheInvOrWBInst(MachineInstr &Inst) { |
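  // GLOBAL_INV, GLOBAL_WB and GLOBAL_WBINV only maintain the memory caches;
  // they are not tracked like ordinary VMEM accesses.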
1976 | auto Opc = Inst.getOpcode(); |
1977 | return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || |
1978 | Opc == AMDGPU::GLOBAL_WBINV; |
1979 | } |
1980 | |
1981 | void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, |
1982 | WaitcntBrackets *ScoreBrackets) { |
1983 | // Now look at the instruction opcode. If it is a memory access |
1984 | // instruction, update the upper-bound of the appropriate counter's |
1985 | // bracket and the destination operand scores. |
1986 | // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere. |
1987 | |
1988 | if (TII->isDS(MI: Inst) && TII->usesLGKM_CNT(MI: Inst)) { |
1989 | if (TII->isAlwaysGDS(Opcode: Inst.getOpcode()) || |
1990 | TII->hasModifiersSet(MI: Inst, OpName: AMDGPU::OpName::gds)) { |
1991 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_ACCESS, Inst); |
1992 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_GPR_LOCK, Inst); |
1993 | } else { |
1994 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst); |
1995 | } |
1996 | } else if (TII->isFLAT(MI: Inst)) { |
1997 | // TODO: Track this properly. |
1998 | if (isCacheInvOrWBInst(Inst)) |
1999 | return; |
2000 | |
2001 | assert(Inst.mayLoadOrStore()); |
2002 | |
2003 | int FlatASCount = 0; |
2004 | |
2005 | if (mayAccessVMEMThroughFlat(MI: Inst)) { |
2006 | ++FlatASCount; |
2007 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst), |
2008 | Inst); |
2009 | } |
2010 | |
2011 | if (mayAccessLDSThroughFlat(MI: Inst)) { |
2012 | ++FlatASCount; |
2013 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst); |
2014 | } |
2015 | |
2016 | // A Flat memory operation must access at least one address space. |
2017 | assert(FlatASCount); |
2018 | |
    // This is a flat memory operation that accesses both VMEM and LDS, so note
    // it - it will require that both the VM and LGKM be flushed to zero if it
    // is pending when a VM or LGKM dependency occurs.
2022 | if (FlatASCount > 1) |
2023 | ScoreBrackets->setPendingFlat(); |
2024 | } else if (SIInstrInfo::isVMEM(MI: Inst) && |
2025 | !llvm::AMDGPU::getMUBUFIsBufferInv(Opc: Inst.getOpcode())) { |
2026 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst), |
2027 | Inst); |
2028 | |
2029 | if (ST->vmemWriteNeedsExpWaitcnt() && |
2030 | (Inst.mayStore() || SIInstrInfo::isAtomicRet(MI: Inst))) { |
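      // Record the GPR lock event so that a later reuse of the store's data
      // registers triggers an EXP_CNT wait (the WAR case handled in
      // generateWaitcntInstBefore()).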
2031 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: VMW_GPR_LOCK, Inst); |
2032 | } |
2033 | } else if (TII->isSMRD(MI: Inst)) { |
2034 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst); |
2035 | } else if (Inst.isCall()) { |
2036 | if (callWaitsOnFunctionReturn(MI: Inst)) { |
2037 | // Act as a wait on everything |
2038 | ScoreBrackets->applyWaitcnt( |
2039 | Wait: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); |
2040 | ScoreBrackets->setStateOnFunctionEntryOrReturn(); |
2041 | } else { |
      // May need to wait for anything.
2043 | ScoreBrackets->applyWaitcnt(Wait: AMDGPU::Waitcnt()); |
2044 | } |
2045 | } else if (SIInstrInfo::isLDSDIR(MI: Inst)) { |
2046 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_LDS_ACCESS, Inst); |
2047 | } else if (TII->isVINTERP(MI: Inst)) { |
2048 | int64_t Imm = TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::waitexp)->getImm(); |
2049 | ScoreBrackets->applyWaitcnt(T: EXP_CNT, Count: Imm); |
2050 | } else if (SIInstrInfo::isEXP(MI: Inst)) { |
2051 | unsigned Imm = TII->getNamedOperand(MI&: Inst, OperandName: AMDGPU::OpName::tgt)->getImm(); |
2052 | if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31) |
2053 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_PARAM_ACCESS, Inst); |
2054 | else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST) |
2055 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_POS_ACCESS, Inst); |
2056 | else |
2057 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_GPR_LOCK, Inst); |
2058 | } else { |
2059 | switch (Inst.getOpcode()) { |
2060 | case AMDGPU::S_SENDMSG: |
2061 | case AMDGPU::S_SENDMSG_RTN_B32: |
2062 | case AMDGPU::S_SENDMSG_RTN_B64: |
2063 | case AMDGPU::S_SENDMSGHALT: |
2064 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SQ_MESSAGE, Inst); |
2065 | break; |
2066 | case AMDGPU::S_MEMTIME: |
2067 | case AMDGPU::S_MEMREALTIME: |
2068 | case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0: |
2069 | case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: |
2070 | case AMDGPU::S_BARRIER_LEAVE: |
2071 | case AMDGPU::S_GET_BARRIER_STATE_M0: |
2072 | case AMDGPU::S_GET_BARRIER_STATE_IMM: |
2073 | ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst); |
2074 | break; |
2075 | } |
2076 | } |
2077 | } |
2078 | |
2079 | bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, |
2080 | unsigned OtherScore) { |
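  // Scores at or below the old lower bound correspond to events that have
  // already completed and collapse to zero; all others are rebased onto the
  // merged timeline using the precomputed shifts.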
2081 | unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift; |
2082 | unsigned OtherShifted = |
2083 | OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift; |
2084 | Score = std::max(a: MyShifted, b: OtherShifted); |
2085 | return OtherShifted > MyShifted; |
2086 | } |
2087 | |
/// Merge the pending events and associated score brackets of \p Other into
/// this bracket's status.
2090 | /// |
2091 | /// Returns whether the merge resulted in a change that requires tighter waits |
2092 | /// (i.e. the merged brackets strictly dominate the original brackets). |
2093 | bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { |
2094 | bool StrictDom = false; |
2095 | |
2096 | VgprUB = std::max(a: VgprUB, b: Other.VgprUB); |
2097 | SgprUB = std::max(a: SgprUB, b: Other.SgprUB); |
2098 | |
2099 | for (auto T : inst_counter_types(MaxCounter)) { |
2100 | // Merge event flags for this counter |
2101 | const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; |
2102 | const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; |
2103 | if (OtherEvents & ~OldEvents) |
2104 | StrictDom = true; |
2105 | PendingEvents |= OtherEvents; |
2106 | |
2107 | // Merge scores for this counter |
2108 | const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T]; |
2109 | const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; |
2110 | const unsigned NewUB = ScoreLBs[T] + std::max(a: MyPending, b: OtherPending); |
2111 | if (NewUB < ScoreLBs[T]) |
      report_fatal_error(reason: "waitcnt score overflow");
2113 | |
2114 | MergeInfo M; |
2115 | M.OldLB = ScoreLBs[T]; |
2116 | M.OtherLB = Other.ScoreLBs[T]; |
2117 | M.MyShift = NewUB - ScoreUBs[T]; |
2118 | M.OtherShift = NewUB - Other.ScoreUBs[T]; |
2119 | |
2120 | ScoreUBs[T] = NewUB; |
2121 | |
2122 | StrictDom |= mergeScore(M, Score&: LastFlat[T], OtherScore: Other.LastFlat[T]); |
2123 | |
2124 | for (int J = 0; J <= VgprUB; J++) |
2125 | StrictDom |= mergeScore(M, Score&: VgprScores[T][J], OtherScore: Other.VgprScores[T][J]); |
2126 | |
2127 | if (T == SmemAccessCounter) { |
2128 | for (int J = 0; J <= SgprUB; J++) |
2129 | StrictDom |= mergeScore(M, Score&: SgprScores[J], OtherScore: Other.SgprScores[J]); |
2130 | } |
2131 | } |
2132 | |
2133 | for (int J = 0; J <= VgprUB; J++) { |
2134 | unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; |
2135 | StrictDom |= NewVmemTypes != VgprVmemTypes[J]; |
2136 | VgprVmemTypes[J] = NewVmemTypes; |
2137 | } |
2138 | |
2139 | return StrictDom; |
2140 | } |
2141 | |
2142 | static bool isWaitInstr(MachineInstr &Inst) { |
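  // Recognize both the real wait instructions and the soft variants added by
  // earlier passes.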
2143 | unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Inst.getOpcode()); |
2144 | return Opcode == AMDGPU::S_WAITCNT || |
2145 | (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(i: 0).isReg() && |
2146 | Inst.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL) || |
2147 | Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || |
2148 | Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || |
2149 | counterTypeForInstr(Opcode).has_value(); |
2150 | } |
2151 | |
2152 | // Generate s_waitcnt instructions where needed. |
2153 | bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, |
2154 | MachineBasicBlock &Block, |
2155 | WaitcntBrackets &ScoreBrackets) { |
2156 | bool Modified = false; |
2157 | |
2158 | LLVM_DEBUG({ |
    dbgs() << "*** Block" << Block.getNumber() << " ***";
2160 | ScoreBrackets.dump(); |
2161 | }); |
2162 | |
2163 | // Track the correctness of vccz through this basic block. There are two |
2164 | // reasons why it might be incorrect; see ST->hasReadVCCZBug() and |
2165 | // ST->partialVCCWritesUpdateVCCZ(). |
2166 | bool VCCZCorrect = true; |
2167 | if (ST->hasReadVCCZBug()) { |
2168 | // vccz could be incorrect at a basic block boundary if a predecessor wrote |
2169 | // to vcc and then issued an smem load. |
2170 | VCCZCorrect = false; |
2171 | } else if (!ST->partialVCCWritesUpdateVCCZ()) { |
2172 | // vccz could be incorrect at a basic block boundary if a predecessor wrote |
2173 | // to vcc_lo or vcc_hi. |
2174 | VCCZCorrect = false; |
2175 | } |
2176 | |
2177 | // Walk over the instructions. |
2178 | MachineInstr *OldWaitcntInstr = nullptr; |
2179 | |
2180 | for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(), |
2181 | E = Block.instr_end(); |
2182 | Iter != E;) { |
2183 | MachineInstr &Inst = *Iter; |
2184 | |
2185 | // Track pre-existing waitcnts that were added in earlier iterations or by |
2186 | // the memory legalizer. |
2187 | if (isWaitInstr(Inst)) { |
2188 | if (!OldWaitcntInstr) |
2189 | OldWaitcntInstr = &Inst; |
2190 | ++Iter; |
2191 | continue; |
2192 | } |
2193 | |
2194 | bool FlushVmCnt = Block.getFirstTerminator() == Inst && |
2195 | isPreheaderToFlush(MBB&: Block, ScoreBrackets); |
2196 | |
2197 | // Generate an s_waitcnt instruction to be placed before Inst, if needed. |
2198 | Modified |= generateWaitcntInstBefore(MI&: Inst, ScoreBrackets, OldWaitcntInstr, |
2199 | FlushVmCnt); |
2200 | OldWaitcntInstr = nullptr; |
2201 | |
2202 | // Restore vccz if it's not known to be correct already. |
2203 | bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(MI: Inst); |
2204 | |
2205 | // Don't examine operands unless we need to track vccz correctness. |
2206 | if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) { |
2207 | if (Inst.definesRegister(Reg: AMDGPU::VCC_LO, /*TRI=*/nullptr) || |
2208 | Inst.definesRegister(Reg: AMDGPU::VCC_HI, /*TRI=*/nullptr)) { |
2209 | // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz. |
2210 | if (!ST->partialVCCWritesUpdateVCCZ()) |
2211 | VCCZCorrect = false; |
2212 | } else if (Inst.definesRegister(Reg: AMDGPU::VCC, /*TRI=*/nullptr)) { |
        // There is a hardware bug on CI/SI where an SMRD instruction may
        // corrupt the vccz bit, so when we detect that an instruction may read
        // from a corrupt vccz bit, we need to:
2216 | // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD |
2217 | // operations to complete. |
2218 | // 2. Restore the correct value of vccz by writing the current value |
2219 | // of vcc back to vcc. |
2220 | if (ST->hasReadVCCZBug() && |
2221 | ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) { |
2222 | // Writes to vcc while there's an outstanding smem read may get |
2223 | // clobbered as soon as any read completes. |
2224 | VCCZCorrect = false; |
2225 | } else { |
2226 | // Writes to vcc will fix any incorrect value in vccz. |
2227 | VCCZCorrect = true; |
2228 | } |
2229 | } |
2230 | } |
2231 | |
2232 | if (TII->isSMRD(MI: Inst)) { |
2233 | for (const MachineMemOperand *Memop : Inst.memoperands()) { |
2234 | // No need to handle invariant loads when avoiding WAR conflicts, as |
2235 | // there cannot be a vector store to the same memory location. |
2236 | if (!Memop->isInvariant()) { |
2237 | const Value *Ptr = Memop->getValue(); |
2238 | SLoadAddresses.insert(KV: std::pair(Ptr, Inst.getParent())); |
2239 | } |
2240 | } |
2241 | if (ST->hasReadVCCZBug()) { |
2242 | // This smem read could complete and clobber vccz at any time. |
2243 | VCCZCorrect = false; |
2244 | } |
2245 | } |
2246 | |
2247 | updateEventWaitcntAfter(Inst, ScoreBrackets: &ScoreBrackets); |
2248 | |
2249 | if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { |
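      // With precise memory enabled, drain all relevant counters immediately
      // after every load or store.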
2250 | AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt( |
2251 | IncludeVSCnt: Inst.mayStore() && !SIInstrInfo::isAtomicRet(MI: Inst)); |
2252 | ScoreBrackets.simplifyWaitcnt(Wait); |
2253 | Modified |= generateWaitcnt(Wait, It: std::next(x: Inst.getIterator()), Block, |
2254 | ScoreBrackets, /*OldWaitcntInstr=*/nullptr); |
2255 | } |
2256 | |
2257 | LLVM_DEBUG({ |
2258 | Inst.print(dbgs()); |
2259 | ScoreBrackets.dump(); |
2260 | }); |
2261 | |
2262 | // TODO: Remove this work-around after fixing the scheduler and enable the |
2263 | // assert above. |
2264 | if (RestoreVCCZ) { |
2265 | // Restore the vccz bit. Any time a value is written to vcc, the vcc |
2266 | // bit is updated, so we can restore the bit by reading the value of |
2267 | // vcc and then writing it back to the register. |
2268 | BuildMI(BB&: Block, I&: Inst, MIMD: Inst.getDebugLoc(), |
2269 | MCID: TII->get(Opcode: ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), |
2270 | DestReg: TRI->getVCC()) |
2271 | .addReg(RegNo: TRI->getVCC()); |
2272 | VCCZCorrect = true; |
2273 | Modified = true; |
2274 | } |
2275 | |
2276 | ++Iter; |
2277 | } |
2278 | |
2279 | // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if |
2280 | // needed. |
2281 | AMDGPU::Waitcnt Wait; |
2282 | if (Block.getFirstTerminator() == Block.end() && |
2283 | isPreheaderToFlush(MBB&: Block, ScoreBrackets)) { |
2284 | if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT)) |
2285 | Wait.LoadCnt = 0; |
2286 | if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT)) |
2287 | Wait.SampleCnt = 0; |
2288 | if (ScoreBrackets.hasPendingEvent(T: BVH_CNT)) |
2289 | Wait.BvhCnt = 0; |
2290 | } |
2291 | |
2292 | // Combine or remove any redundant waitcnts at the end of the block. |
2293 | Modified |= generateWaitcnt(Wait, It: Block.instr_end(), Block, ScoreBrackets, |
2294 | OldWaitcntInstr); |
2295 | |
2296 | return Modified; |
2297 | } |
2298 | |
2299 | // Return true if the given machine basic block is a preheader of a loop in |
2300 | // which we want to flush the vmcnt counter, and false otherwise. |
bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
                                          WaitcntBrackets &ScoreBrackets) {
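  // The result is cached per block because this query is made repeatedly
  // while the outer loop iterates to a fixed point.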
2303 | auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(Key: &MBB, Args: false); |
2304 | if (!IsInserted) |
2305 | return Iterator->second; |
2306 | |
2307 | MachineBasicBlock *Succ = MBB.getSingleSuccessor(); |
2308 | if (!Succ) |
2309 | return false; |
2310 | |
2311 | MachineLoop *Loop = MLI->getLoopFor(BB: Succ); |
2312 | if (!Loop) |
2313 | return false; |
2314 | |
2315 | if (Loop->getLoopPreheader() == &MBB && |
2316 | shouldFlushVmCnt(ML: Loop, Brackets&: ScoreBrackets)) { |
2317 | Iterator->second = true; |
2318 | return true; |
2319 | } |
2320 | |
2321 | return false; |
2322 | } |
2323 | |
2324 | bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { |
2325 | return SIInstrInfo::isVMEM(MI) || |
2326 | (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI)); |
2327 | } |
2328 | |
2329 | // Return true if it is better to flush the vmcnt counter in the preheader of |
2330 | // the given loop. We currently decide to flush in two situations: |
2331 | // 1. The loop contains vmem store(s), no vmem load and at least one use of a |
2332 | // vgpr containing a value that is loaded outside of the loop. (Only on |
2333 | // targets with no vscnt counter). |
2334 | // 2. The loop contains vmem load(s), but the loaded values are not used in the |
2335 | // loop, and at least one use of a vgpr containing a value that is loaded |
2336 | // outside of the loop. |
2337 | bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, |
2338 | WaitcntBrackets &Brackets) { |
2339 | bool HasVMemLoad = false; |
2340 | bool HasVMemStore = false; |
2341 | bool UsesVgprLoadedOutside = false; |
2342 | DenseSet<Register> VgprUse; |
2343 | DenseSet<Register> VgprDef; |
2344 | |
2345 | for (MachineBasicBlock *MBB : ML->blocks()) { |
2346 | for (MachineInstr &MI : *MBB) { |
2347 | if (isVMEMOrFlatVMEM(MI)) { |
2348 | if (MI.mayLoad()) |
2349 | HasVMemLoad = true; |
2350 | if (MI.mayStore()) |
2351 | HasVMemStore = true; |
2352 | } |
2353 | for (unsigned I = 0; I < MI.getNumOperands(); I++) { |
2354 | MachineOperand &Op = MI.getOperand(i: I); |
2355 | if (!Op.isReg() || !TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) |
2356 | continue; |
2357 | RegInterval Interval = Brackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: I); |
2358 | // Vgpr use |
2359 | if (Op.isUse()) { |
2360 | for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { |
2361 | // If we find a register that is loaded inside the loop, 1. and 2. |
2362 | // are invalidated and we can exit. |
2363 | if (VgprDef.contains(V: RegNo)) |
2364 | return false; |
2365 | VgprUse.insert(V: RegNo); |
2366 | // If at least one of Op's registers is in the score brackets, the |
2367 | // value is likely loaded outside of the loop. |
2368 | if (Brackets.getRegScore(GprNo: RegNo, T: LOAD_CNT) > |
2369 | Brackets.getScoreLB(T: LOAD_CNT) || |
2370 | Brackets.getRegScore(GprNo: RegNo, T: SAMPLE_CNT) > |
2371 | Brackets.getScoreLB(T: SAMPLE_CNT) || |
2372 | Brackets.getRegScore(GprNo: RegNo, T: BVH_CNT) > |
2373 | Brackets.getScoreLB(T: BVH_CNT)) { |
2374 | UsesVgprLoadedOutside = true; |
2375 | break; |
2376 | } |
2377 | } |
2378 | } |
2379 | // VMem load vgpr def |
2380 | else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef()) |
2381 | for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { |
2382 | // If we find a register that is loaded inside the loop, 1. and 2. |
2383 | // are invalidated and we can exit. |
2384 | if (VgprUse.contains(V: RegNo)) |
2385 | return false; |
2386 | VgprDef.insert(V: RegNo); |
2387 | } |
2388 | } |
2389 | } |
2390 | } |
2391 | if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) |
2392 | return true; |
2393 | return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder(); |
2394 | } |
2395 | |
2396 | bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { |
2397 | ST = &MF.getSubtarget<GCNSubtarget>(); |
2398 | TII = ST->getInstrInfo(); |
2399 | TRI = &TII->getRegisterInfo(); |
2400 | MRI = &MF.getRegInfo(); |
2401 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
2402 | MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); |
2403 | PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); |
2404 | if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) |
2405 | AA = &AAR->getAAResults(); |
2406 | |
2407 | AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST->getCPU()); |
2408 | |
2409 | if (ST->hasExtendedWaitCounts()) { |
2410 | MaxCounter = NUM_EXTENDED_INST_CNTS; |
2411 | WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter); |
2412 | WCG = &WCGGFX12Plus; |
2413 | } else { |
2414 | MaxCounter = NUM_NORMAL_INST_CNTS; |
2415 | WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF); |
2416 | WCG = &WCGPreGFX12; |
2417 | } |
2418 | |
2419 | ForceEmitZeroWaitcnts = ForceEmitZeroFlag; |
2420 | for (auto T : inst_counter_types()) |
2421 | ForceEmitWaitcnt[T] = false; |
2422 | |
2423 | const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); |
2424 | |
2425 | SmemAccessCounter = eventCounter(masks: WaitEventMaskForInst, E: SMEM_ACCESS); |
2426 | |
2427 | HardwareLimits Limits = {}; |
2428 | if (ST->hasExtendedWaitCounts()) { |
2429 | Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(Version: IV); |
2430 | Limits.DscntMax = AMDGPU::getDscntBitMask(Version: IV); |
2431 | } else { |
2432 | Limits.LoadcntMax = AMDGPU::getVmcntBitMask(Version: IV); |
2433 | Limits.DscntMax = AMDGPU::getLgkmcntBitMask(Version: IV); |
2434 | } |
2435 | Limits.ExpcntMax = AMDGPU::getExpcntBitMask(Version: IV); |
2436 | Limits.StorecntMax = AMDGPU::getStorecntBitMask(Version: IV); |
2437 | Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(Version: IV); |
2438 | Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(Version: IV); |
2439 | Limits.KmcntMax = AMDGPU::getKmcntBitMask(Version: IV); |
2440 | |
2441 | unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); |
2442 | unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); |
2443 | assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); |
2444 | assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); |
2445 | |
2446 | RegisterEncoding Encoding = {}; |
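  // Record the encodings of the first and last addressable VGPR and SGPR so
  // that register operands can be mapped to contiguous score-bracket slots.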
2447 | Encoding.VGPR0 = |
2448 | TRI->getEncodingValue(RegNo: AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK; |
2449 | Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1; |
2450 | Encoding.SGPR0 = |
2451 | TRI->getEncodingValue(RegNo: AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK; |
2452 | Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1; |
2453 | |
2454 | BlockInfos.clear(); |
2455 | bool Modified = false; |
2456 | |
2457 | MachineBasicBlock &EntryBB = MF.front(); |
2458 | MachineBasicBlock::iterator I = EntryBB.begin(); |
2459 | |
2460 | if (!MFI->isEntryFunction()) { |
2461 | // Wait for any outstanding memory operations that the input registers may |
2462 | // depend on. We can't track them and it's better to do the wait after the |
2463 | // costly call sequence. |
2464 | |
2465 | // TODO: Could insert earlier and schedule more liberally with operations |
2466 | // that only use caller preserved registers. |
2467 | for (MachineBasicBlock::iterator E = EntryBB.end(); |
2468 | I != E && (I->isPHI() || I->isMetaInstruction()); ++I) |
2469 | ; |
2470 | |
2471 | if (ST->hasExtendedWaitCounts()) { |
2472 | BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_DSCNT)) |
2473 | .addImm(Val: 0); |
2474 | for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) { |
2475 | if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT) |
2476 | continue; |
2477 | |
2478 | BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), |
2479 | MCID: TII->get(Opcode: instrsForExtendedCounterTypes[CT])) |
2480 | .addImm(Val: 0); |
2481 | } |
2482 | } else { |
2483 | BuildMI(BB&: EntryBB, I, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT)).addImm(Val: 0); |
2484 | } |
2485 | |
2486 | auto NonKernelInitialState = std::make_unique<WaitcntBrackets>( |
2487 | args&: ST, args&: MaxCounter, args&: Limits, args&: Encoding, args&: WaitEventMaskForInst, |
2488 | args&: SmemAccessCounter); |
2489 | NonKernelInitialState->setStateOnFunctionEntryOrReturn(); |
2490 | BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); |
2491 | |
2492 | Modified = true; |
2493 | } |
2494 | |
2495 | // Keep iterating over the blocks in reverse post order, inserting and |
2496 | // updating s_waitcnt where needed, until a fix point is reached. |
2497 | for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF)) |
2498 | BlockInfos.insert(KV: {MBB, BlockInfo()}); |
2499 | |
2500 | std::unique_ptr<WaitcntBrackets> Brackets; |
2501 | bool Repeat; |
2502 | do { |
2503 | Repeat = false; |
2504 | |
2505 | for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE; |
2506 | ++BII) { |
2507 | MachineBasicBlock *MBB = BII->first; |
2508 | BlockInfo &BI = BII->second; |
2509 | if (!BI.Dirty) |
2510 | continue; |
2511 | |
2512 | if (BI.Incoming) { |
2513 | if (!Brackets) |
2514 | Brackets = std::make_unique<WaitcntBrackets>(args&: *BI.Incoming); |
2515 | else |
2516 | *Brackets = *BI.Incoming; |
2517 | } else { |
2518 | if (!Brackets) |
2519 | Brackets = std::make_unique<WaitcntBrackets>( |
2520 | args&: ST, args&: MaxCounter, args&: Limits, args&: Encoding, args&: WaitEventMaskForInst, |
2521 | args&: SmemAccessCounter); |
2522 | else |
2523 | *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding, |
2524 | WaitEventMaskForInst, SmemAccessCounter); |
2525 | } |
2526 | |
2527 | Modified |= insertWaitcntInBlock(MF, Block&: *MBB, ScoreBrackets&: *Brackets); |
2528 | BI.Dirty = false; |
2529 | |
2530 | if (Brackets->hasPendingEvent()) { |
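        // Propagate the bracket state to the successors: move it into the
        // first successor that has no incoming state yet, copy it for any
        // further such successor, and merge into successors that already
        // have incoming state.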
2531 | BlockInfo *MoveBracketsToSucc = nullptr; |
2532 | for (MachineBasicBlock *Succ : MBB->successors()) { |
2533 | auto SuccBII = BlockInfos.find(Key: Succ); |
2534 | BlockInfo &SuccBI = SuccBII->second; |
2535 | if (!SuccBI.Incoming) { |
2536 | SuccBI.Dirty = true; |
2537 | if (SuccBII <= BII) |
2538 | Repeat = true; |
2539 | if (!MoveBracketsToSucc) { |
2540 | MoveBracketsToSucc = &SuccBI; |
2541 | } else { |
2542 | SuccBI.Incoming = std::make_unique<WaitcntBrackets>(args&: *Brackets); |
2543 | } |
2544 | } else if (SuccBI.Incoming->merge(Other: *Brackets)) { |
2545 | SuccBI.Dirty = true; |
2546 | if (SuccBII <= BII) |
2547 | Repeat = true; |
2548 | } |
2549 | } |
2550 | if (MoveBracketsToSucc) |
2551 | MoveBracketsToSucc->Incoming = std::move(Brackets); |
2552 | } |
2553 | } |
2554 | } while (Repeat); |
2555 | |
2556 | if (ST->hasScalarStores()) { |
2557 | SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; |
2558 | bool HaveScalarStores = false; |
2559 | |
2560 | for (MachineBasicBlock &MBB : MF) { |
2561 | for (MachineInstr &MI : MBB) { |
2562 | if (!HaveScalarStores && TII->isScalarStore(MI)) |
2563 | HaveScalarStores = true; |
2564 | |
2565 | if (MI.getOpcode() == AMDGPU::S_ENDPGM || |
2566 | MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) |
2567 | EndPgmBlocks.push_back(Elt: &MBB); |
2568 | } |
2569 | } |
2570 | |
2571 | if (HaveScalarStores) { |
2572 | // If scalar writes are used, the cache must be flushed or else the next |
2573 | // wave to reuse the same scratch memory can be clobbered. |
2574 | // |
2575 | // Insert s_dcache_wb at wave termination points if there were any scalar |
2576 | // stores, and only if the cache hasn't already been flushed. This could |
2577 | // be improved by looking across blocks for flushes in postdominating |
2578 | // blocks from the stores but an explicitly requested flush is probably |
2579 | // very rare. |
2580 | for (MachineBasicBlock *MBB : EndPgmBlocks) { |
2581 | bool SeenDCacheWB = false; |
2582 | |
2583 | for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); |
2584 | I != E; ++I) { |
2585 | if (I->getOpcode() == AMDGPU::S_DCACHE_WB) |
2586 | SeenDCacheWB = true; |
2587 | else if (TII->isScalarStore(MI: *I)) |
2588 | SeenDCacheWB = false; |
2589 | |
2590 | // FIXME: It would be better to insert this before a waitcnt if any. |
2591 | if ((I->getOpcode() == AMDGPU::S_ENDPGM || |
2592 | I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && |
2593 | !SeenDCacheWB) { |
2594 | Modified = true; |
2595 | BuildMI(BB&: *MBB, I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_DCACHE_WB)); |
2596 | } |
2597 | } |
2598 | } |
2599 | } |
2600 | } |
2601 | |
2602 | // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM |
2603 | // instructions. |
2604 | for (MachineInstr *MI : ReleaseVGPRInsts) { |
2605 | if (ST->requiresNopBeforeDeallocVGPRs()) { |
2606 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_NOP)) |
2607 | .addImm(Val: 0); |
2608 | } |
2609 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
2610 | MCID: TII->get(Opcode: AMDGPU::S_SENDMSG)) |
2611 | .addImm(Val: AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); |
2612 | Modified = true; |
2613 | } |
2614 | ReleaseVGPRInsts.clear(); |
2615 | |
2616 | return Modified; |
2617 | } |
2618 | |