1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
17#include "AMDGPUMachineModuleInfo.h"
18#include "GCNSubtarget.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "llvm/ADT/BitmaskEnum.h"
21#include "llvm/ADT/StringExtras.h"
22#include "llvm/CodeGen/MachineBasicBlock.h"
23#include "llvm/CodeGen/MachineFunctionPass.h"
24#include "llvm/CodeGen/MachinePassManager.h"
25#include "llvm/IR/DiagnosticInfo.h"
26#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
27#include "llvm/IR/PassManager.h"
28#include "llvm/Support/AMDGPUAddrSpace.h"
29#include "llvm/Support/AtomicOrdering.h"
30#include "llvm/TargetParser/TargetParser.h"
31
32using namespace llvm;
33using namespace llvm::AMDGPU;
34
35#define DEBUG_TYPE "si-memory-legalizer"
36#define PASS_NAME "SI Memory Legalizer"
37
// Hidden debugging/experimentation knob: when set, the legalizer does not
// insert cache-invalidating instructions. This weakens the implemented
// memory model, so it is not intended for normal use.
static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));
41
42namespace {
43
44LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
45
/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  // Enables the bitwise operators (|, &, ~, ...) on this enum class.
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
53
/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE, // Insert immediately before the reference instruction.
  AFTER   // Insert immediately after the reference instruction.
};
60
/// The atomic synchronization scopes supported by the AMDGPU target.
/// Enumerators are ordered from narrowest to widest scope; SIMemOpInfo
/// relies on this ordering to clamp a scope with std::min.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
  AGENT,
  SYSTEM
};
71
/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,  // GLOBAL_ADDRESS (and buffer address spaces).
  LDS = 1u << 1,     // LOCAL_ADDRESS.
  SCRATCH = 1u << 2, // PRIVATE_ADDRESS.
  GDS = 1u << 3,     // REGION_ADDRESS.
  OTHER = 1u << 4,   // Any address space not otherwise classified.

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  // Enables the bitwise operators (|, &, ~, ...) on this enum class.
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
93
/// Summary of the memory-model-relevant properties of a single memory
/// operation: its success/failure atomic orderings, synchronization scope,
/// the address spaces the instruction accesses and must order, and the
/// volatile/nontemporal/last-use/cooperative flags. Built exclusively by
/// SIMemOpAccess (the constructor is private).
class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;
  bool IsLastUse = false;
  bool IsCooperative = false;

  // The parameter defaults describe the most conservative (strongest)
  // interpretation of a memory operation; they are used when an instruction
  // has no memory operands to inspect.
  // TODO: Should we assume Cooperative=true if no MMO is present?
  SIMemOpInfo(
      const GCNSubtarget &ST,
      AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
      SIAtomicScope Scope = SIAtomicScope::SYSTEM,
      SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
      SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
      bool IsCrossAddressSpaceOrdering = true,
      AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
      bool IsVolatile = false, bool IsNonTemporal = false,
      bool IsLastUse = false, bool IsCooperative = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
        OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
        IsLastUse(IsLastUse), IsCooperative(IsCooperative) {

    // Non-atomic operations carry no scope/ordering information at all.
    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    // Atomic operations must have a scope and touch at least one
    // atomic-capable address space.
    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      // Only scratch is accessed: never shared beyond a single thread.
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      // Only scratch/LDS: shared at most within a workgroup.
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      // Only scratch/LDS/GDS: shared at most within an agent.
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }

    // On targets that have no concept of a workgroup cluster, use
    // AGENT scope as a conservatively correct alternative.
    if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
      this->Scope = SIAtomicScope::AGENT;
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces be accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns Return true iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is last use, false otherwise.
  bool isLastUse() const { return IsLastUse; }

  /// \returns True if this is a cooperative load or store atomic.
  bool isCooperative() const { return IsCooperative; }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};
233
/// Decodes machine instructions and their machine memory operands into
/// \ref SIMemOpInfo, diagnosing constructs the legalizer cannot handle.
class SIMemOpAccess final {
private:
  const AMDGPUMachineModuleInfo *MMI = nullptr;
  const GCNSubtarget &ST;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \return Return a bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least machine memory
  /// operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns DMA to LDS info if \p MI is as a direct-to/from-LDS load/store,
  /// along with an indication of whether this is a load or store. If it is not
  /// a direct-to-LDS operation, returns std::nullopt.
  std::optional<SIMemOpInfo>
  getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
};
288
289class SICacheControl {
290protected:
291
292 /// AMDGPU subtarget info.
293 const GCNSubtarget &ST;
294
295 /// Instruction info.
296 const SIInstrInfo *TII = nullptr;
297
298 IsaVersion IV;
299
300 /// Whether to insert cache invalidating instructions.
301 bool InsertCacheInv;
302
303 SICacheControl(const GCNSubtarget &ST);
304
305 /// Sets CPol \p Bits to "true" if present in instruction \p MI.
306 /// \returns Returns true if \p MI is modified, false otherwise.
307 bool enableCPolBits(const MachineBasicBlock::iterator MI,
308 unsigned Bits) const;
309
310 /// Check if any atomic operation on AS can affect memory accessible via the
311 /// global address space.
312 bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
313
314public:
315 using CPol = AMDGPU::CPol::CPol;
316
317 /// Create a cache control for the subtarget \p ST.
318 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
319
320 /// Update \p MI memory load instruction to bypass any caches up to
321 /// the \p Scope memory scope for address spaces \p
322 /// AddrSpace. Return true iff the instruction was modified.
323 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
324 SIAtomicScope Scope,
325 SIAtomicAddrSpace AddrSpace) const = 0;
326
327 /// Update \p MI memory store instruction to bypass any caches up to
328 /// the \p Scope memory scope for address spaces \p
329 /// AddrSpace. Return true iff the instruction was modified.
330 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
331 SIAtomicScope Scope,
332 SIAtomicAddrSpace AddrSpace) const = 0;
333
334 /// Update \p MI memory read-modify-write instruction to bypass any caches up
335 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
336 /// iff the instruction was modified.
337 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
338 SIAtomicScope Scope,
339 SIAtomicAddrSpace AddrSpace) const = 0;
340
341 /// Update \p MI memory instruction of kind \p Op associated with address
342 /// spaces \p AddrSpace to indicate it is volatile and/or
343 /// nontemporal/last-use. Return true iff the instruction was modified.
344 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
345 SIAtomicAddrSpace AddrSpace,
346 SIMemOp Op, bool IsVolatile,
347 bool IsNonTemporal,
348 bool IsLastUse = false) const = 0;
349
350 /// Add final touches to a `mayStore` instruction \p MI, which may be a
351 /// Store or RMW instruction.
352 /// FIXME: This takes a MI because iterators aren't handled properly. When
353 /// this is called, they often point to entirely different insts. Thus we back
354 /// up the inst early and pass it here instead.
355 virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
356 return false;
357 };
358
359 /// Handle cooperative load/store atomics.
360 virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
361 llvm_unreachable(
362 "cooperative atomics are not available on this architecture");
363 }
364
365 /// Inserts any necessary instructions at position \p Pos relative
366 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
367 /// \p Op associated with address spaces \p AddrSpace have completed. Used
368 /// between memory instructions to enforce the order they become visible as
369 /// observed by other memory instructions executing in memory scope \p Scope.
370 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
371 /// address spaces. If \p AtomicsOnly is true, only insert waits for counters
372 /// that are used by atomic instructions.
373 /// Returns true iff any instructions inserted.
374 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
375 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
376 bool IsCrossAddrSpaceOrdering, Position Pos,
377 AtomicOrdering Order, bool AtomicsOnly) const = 0;
378
379 /// Inserts any necessary instructions at position \p Pos relative to
380 /// instruction \p MI to ensure any subsequent memory instructions of this
381 /// thread with address spaces \p AddrSpace will observe the previous memory
382 /// operations by any thread for memory scopes up to memory scope \p Scope .
383 /// Returns true iff any instructions inserted.
384 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
385 SIAtomicScope Scope,
386 SIAtomicAddrSpace AddrSpace,
387 Position Pos) const = 0;
388
389 /// Inserts any necessary instructions at position \p Pos relative to
390 /// instruction \p MI to ensure previous memory instructions by this thread
391 /// with address spaces \p AddrSpace have completed and can be observed by
392 /// subsequent memory instructions by any thread executing in memory scope \p
393 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
394 /// between address spaces. Returns true iff any instructions inserted.
395 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
396 SIAtomicScope Scope,
397 SIAtomicAddrSpace AddrSpace,
398 bool IsCrossAddrSpaceOrdering,
399 Position Pos) const = 0;
400
401 /// Handle operations that are considered non-volatile.
402 /// See \ref isNonVolatileMemoryAccess
403 virtual bool handleNonVolatile(MachineInstr &MI) const { return false; }
404
405 /// Virtual destructor to allow derivations to be deleted.
406 virtual ~SICacheControl() = default;
407};
408
/// Generates code sequences for the memory model of all GFX targets below
/// GFX10.
class SIGfx6CacheControl final : public SICacheControl {
public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  // The overrides below are the pre-GFX10 implementations of the
  // SICacheControl hooks; see the base class for the contracts.

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order, bool AtomicsOnly) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};
449
/// Generates code sequences for the memory model of GFX10/11.
class SIGfx10CacheControl final : public SICacheControl {
public:
  SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  // No per-instruction change is made for atomic stores; the instruction is
  // reported as unmodified.
  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override {
    return false;
  }

  // Likewise, no per-instruction change for RMW operations.
  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override {
    return false;
  }

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order, bool AtomicsOnly) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  // A release is implemented purely as a wait for all outstanding loads and
  // stores to complete.
  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override {
    return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                      IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
                      /*AtomicsOnly=*/false);
  }
};
492
/// Generates code sequences for the memory model of GFX12+ targets.
class SIGfx12CacheControl final : public SICacheControl {
protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
  // \returns Returns true if \p MI is modified, false otherwise.
  bool setTH(const MachineBasicBlock::iterator MI,
             AMDGPU::CPol::CPol Value) const;

  // Sets Scope policy to \p Value if CPol operand is present in instruction \p
  // MI. \returns Returns true if \p MI is modified, false otherwise.
  bool setScope(const MachineBasicBlock::iterator MI,
                AMDGPU::CPol::CPol Value) const;

  // Stores with system scope (SCOPE_SYS) need to wait for:
  // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
  // - non-returning-atomics - wait for STORECNT==0
  // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
  // since it does not distinguish atomics-with-return from regular stores.
  // There is no need to wait if memory is cached (mtype != UC).
  bool
  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;

  // Shared implementation behind the enable*CacheBypass overrides below.
  bool setAtomicScope(const MachineBasicBlock::iterator &MI,
                      SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;

public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
    // GFX120x and GFX125x memory models greatly overlap, and in some cases
    // the behavior is the same if assuming GFX120x in CU mode.
    assert(!ST.hasGFX1250Insts() || ST.hasGFX13Insts() || ST.isCuModeEnabled());
  }

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order, bool AtomicsOnly) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool finalizeStore(MachineInstr &MI, bool Atomic) const override;

  bool handleCooperativeAtomic(MachineInstr &MI) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  // All three cache-bypass hooks delegate to setAtomicScope.
  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool handleNonVolatile(MachineInstr &MI) const override;
};
565
566class SIMemoryLegalizer final {
567private:
568 const MachineModuleInfo &MMI;
569 /// Cache Control.
570 std::unique_ptr<SICacheControl> CC = nullptr;
571
572 /// List of atomic pseudo instructions.
573 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
574
575 /// Return true iff instruction \p MI is a atomic instruction that
576 /// returns a result.
577 bool isAtomicRet(const MachineInstr &MI) const {
578 return SIInstrInfo::isAtomicRet(MI);
579 }
580
581 /// Removes all processed atomic pseudo instructions from the current
582 /// function. Returns true if current function is modified, false otherwise.
583 bool removeAtomicPseudoMIs();
584
585 /// Expands load operation \p MI. Returns true if instructions are
586 /// added/deleted or \p MI is modified, false otherwise.
587 bool expandLoad(const SIMemOpInfo &MOI,
588 MachineBasicBlock::iterator &MI);
589 /// Expands store operation \p MI. Returns true if instructions are
590 /// added/deleted or \p MI is modified, false otherwise.
591 bool expandStore(const SIMemOpInfo &MOI,
592 MachineBasicBlock::iterator &MI);
593 /// Expands atomic fence operation \p MI. Returns true if
594 /// instructions are added/deleted or \p MI is modified, false otherwise.
595 bool expandAtomicFence(const SIMemOpInfo &MOI,
596 MachineBasicBlock::iterator &MI);
597 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
598 /// instructions are added/deleted or \p MI is modified, false otherwise.
599 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
600 MachineBasicBlock::iterator &MI);
601 /// Expands LDS DMA operation \p MI. Returns true if instructions are
602 /// added/deleted or \p MI is modified, false otherwise.
603 bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
604
605public:
606 SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
607 bool run(MachineFunction &MF);
608};
609
/// Legacy pass-manager wrapper around \ref SIMemoryLegalizer.
class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
public:
  static char ID; // Pass identification.

  SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The legalizer only inserts/removes instructions; it never alters the
    // control-flow graph.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};
627
// Recognized values for the "amdgpu-synchronize-as" MMRA suffix, mapped to
// the address spaces they denote (see getSynchronizeAddrSpaceMD).
static const StringMap<SIAtomicAddrSpace> ASNames = {{
    {"global", SIAtomicAddrSpace::GLOBAL},
    {"local", SIAtomicAddrSpace::LDS},
}};
632
633void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
634 const MachineFunction *MF = MI.getMF();
635 const Function &Fn = MF->getFunction();
636 SmallString<128> Str;
637 raw_svector_ostream OS(Str);
638 OS << "unknown address space '" << AS << "'; expected one of ";
639 ListSeparator LS;
640 for (const auto &[Name, Val] : ASNames)
641 OS << LS << '\'' << Name << '\'';
642 Fn.getContext().diagnose(
643 DI: DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
644}
645
646/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
647/// If this tag isn't present, or if it has no meaningful values, returns
648/// \p none, otherwise returns the address spaces specified by the MD.
649static std::optional<SIAtomicAddrSpace>
650getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
651 static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
652
653 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
654 if (!MMRA)
655 return std::nullopt;
656
657 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
658 for (const auto &[Prefix, Suffix] : MMRA) {
659 if (Prefix != FenceASPrefix)
660 continue;
661
662 if (auto It = ASNames.find(Key: Suffix); It != ASNames.end())
663 Result |= It->second;
664 else
665 diagnoseUnknownMMRAASName(MI, AS: Suffix);
666 }
667
668 if (Result == SIAtomicAddrSpace::NONE)
669 return std::nullopt;
670
671 return Result;
672}
673
674} // end anonymous namespace
675
676void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
677 const char *Msg) const {
678 const Function &Func = MI->getMF()->getFunction();
679 Func.getContext().diagnose(
680 DI: DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
681}
682
// Maps a target sync-scope ID to a tuple of (SI atomic scope, address spaces
// the ordering covers, whether the ordering applies across address spaces).
// The plain scopes order all atomic-capable address spaces and are
// cross-address-space; the "one address space" variants only order the
// address spaces the instruction itself accesses, and are not.
std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getAgentSSID())
    return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getClusterSSID())
    return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getWavefrontSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == SyncScope::SingleThread)
    return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SYSTEM,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::AGENT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getClusterOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::CLUSTER,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WORKGROUP,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SINGLETHREAD,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  // Unrecognized synchronization scope.
  return std::nullopt;
}
721
722SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
723 if (AS == AMDGPUAS::FLAT_ADDRESS)
724 return SIAtomicAddrSpace::FLAT;
725 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
726 return SIAtomicAddrSpace::GLOBAL;
727 if (AS == AMDGPUAS::LOCAL_ADDRESS)
728 return SIAtomicAddrSpace::LDS;
729 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
730 return SIAtomicAddrSpace::SCRATCH;
731 if (AS == AMDGPUAS::REGION_ADDRESS)
732 return SIAtomicAddrSpace::GDS;
733 if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
734 AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
735 return SIAtomicAddrSpace::GLOBAL;
736
737 return SIAtomicAddrSpace::OTHER;
738}
739
// Constructs the accessor over the AMDGPU machine-module info and subtarget.
SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
                             const GCNSubtarget &ST)
    : MMI(&MMI_), ST(ST) {}
743
// Builds a SIMemOpInfo by merging the properties of all of \p MI's machine
// memory operands. Returns std::nullopt (after diagnosing) if the scopes
// or address spaces cannot be represented.
std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  // Merged properties over all MMOs. SSID/orderings start at the weakest
  // values and are widened; IsNonTemporal starts true and is ANDed, so it
  // holds only if every MMO is nontemporal.
  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;
  bool IsLastUse = false;
  bool IsCooperative = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    IsLastUse |= MMO->getFlags() & MOLastUse;
    IsCooperative |= MMO->getFlags() & MOCooperative;
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      // Scopes can only be merged when one includes the other; presumably
      // isSyncScopeInclusion returns nullopt for incomparable scopes —
      // confirm against AMDGPUMachineModuleInfo.
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      // Keep the wider of the two scopes and merge the orderings.
      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  // FIXME: The MMO of buffer atomic instructions does not always have an atomic
  // ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
  // here, but the lowering should really be cleaned up at some point.
  if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(*MI) &&
      SIInstrInfo::isAtomic(*MI) && Ordering == AtomicOrdering::NotAtomic)
    Ordering = AtomicOrdering::Monotonic;

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    // Atomics must order at least one address space, the ordered address
    // spaces must all be atomic-capable, and the instruction itself must
    // access at least one atomic-capable address space.
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
  return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal, IsLastUse, IsCooperative);
}
814
815std::optional<SIMemOpInfo>
816SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
817 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
818
819 if (!(MI->mayLoad() && !MI->mayStore()))
820 return std::nullopt;
821
822 // Be conservative if there are no memory operands.
823 if (MI->getNumMemOperands() == 0)
824 return SIMemOpInfo(ST);
825
826 return constructFromMIWithMMO(MI);
827}
828
829std::optional<SIMemOpInfo>
830SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
831 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
832
833 if (!(!MI->mayLoad() && MI->mayStore()))
834 return std::nullopt;
835
836 // Be conservative if there are no memory operands.
837 if (MI->getNumMemOperands() == 0)
838 return SIMemOpInfo(ST);
839
840 return constructFromMIWithMMO(MI);
841}
842
843std::optional<SIMemOpInfo>
844SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
845 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
846
847 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
848 return std::nullopt;
849
850 AtomicOrdering Ordering =
851 static_cast<AtomicOrdering>(MI->getOperand(i: 0).getImm());
852
853 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(i: 1).getImm());
854 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace: SIAtomicAddrSpace::ATOMIC);
855 if (!ScopeOrNone) {
856 reportUnsupported(MI, Msg: "Unsupported atomic synchronization scope");
857 return std::nullopt;
858 }
859
860 SIAtomicScope Scope = SIAtomicScope::NONE;
861 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
862 bool IsCrossAddressSpaceOrdering = false;
863 std::tie(args&: Scope, args&: OrderingAddrSpace, args&: IsCrossAddressSpaceOrdering) =
864 *ScopeOrNone;
865
866 if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
867 // We currently expect refineOrderingAS to be the only place that
868 // can refine the AS ordered by the fence.
869 // If that changes, we need to review the semantics of that function
870 // in case it needs to preserve certain address spaces.
871 reportUnsupported(MI, Msg: "Unsupported atomic address space");
872 return std::nullopt;
873 }
874
875 auto SynchronizeAS = getSynchronizeAddrSpaceMD(MI: *MI);
876 if (SynchronizeAS)
877 OrderingAddrSpace = *SynchronizeAS;
878
879 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
880 SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
881 AtomicOrdering::NotAtomic);
882}
883
884std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
885 const MachineBasicBlock::iterator &MI) const {
886 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
887
888 if (!(MI->mayLoad() && MI->mayStore()))
889 return std::nullopt;
890
891 // Be conservative if there are no memory operands.
892 if (MI->getNumMemOperands() == 0)
893 return SIMemOpInfo(ST);
894
895 return constructFromMIWithMMO(MI);
896}
897
898std::optional<SIMemOpInfo>
899SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
900 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
901
902 if (!SIInstrInfo::isLDSDMA(MI: *MI))
903 return std::nullopt;
904
905 return constructFromMIWithMMO(MI);
906}
907
908/// \returns true if \p MI has one or more MMO, and all of them are fit for
909/// being marked as non-volatile. This means that either they are accessing the
910/// constant address space, are accessing a known invariant memory location, or
911/// that they are marked with the non-volatile metadata/MMO flag.
912static bool isNonVolatileMemoryAccess(const MachineInstr &MI) {
913 if (MI.getNumMemOperands() == 0)
914 return false;
915 return all_of(Range: MI.memoperands(), P: [&](const MachineMemOperand *MMO) {
916 return MMO->getFlags() & (MOThreadPrivate | MachineMemOperand::MOInvariant);
917 });
918}
919
920SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
921 TII = ST.getInstrInfo();
922 IV = getIsaVersion(GPU: ST.getCPU());
923 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
924}
925
926bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
927 unsigned Bits) const {
928 MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::cpol);
929 if (!CPol)
930 return false;
931
932 CPol->setImm(CPol->getImm() | Bits);
933 return true;
934}
935
936bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
937 assert((!ST.hasGloballyAddressableScratch() ||
938 (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE ||
939 (AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) &&
940 "scratch instructions should already be replaced by flat "
941 "instructions if GloballyAddressableScratch is enabled");
942 return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
943}
944
945/* static */
946std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
947 GCNSubtarget::Generation Generation = ST.getGeneration();
948 if (Generation < AMDGPUSubtarget::GFX10)
949 return std::make_unique<SIGfx6CacheControl>(args: ST);
950 if (Generation < AMDGPUSubtarget::GFX12)
951 return std::make_unique<SIGfx10CacheControl>(args: ST);
952 return std::make_unique<SIGfx12CacheControl>(args: ST);
953}
954
/// Make an atomic load at \p Scope bypass any cache that is not coherent at
/// that scope, so the load observes memory at the requested scope.
/// \returns true if \p MI was modified.
bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());

  if (!canAffectGlobalAddrSpace(AS: AddrSpace)) {
    /// The scratch address space does not need the global memory caches
    /// to be bypassed as all memory operations by the same thread are
    /// sequentially consistent, and no other thread can access scratch
    /// memory.

    /// Other address spaces do not have a cache.
    return false;
  }

  bool Changed = false;
  switch (Scope) {
  case SIAtomicScope::SYSTEM:
    if (ST.hasGFX940Insts()) {
      // Set SC bits to indicate system scope.
      Changed |= enableCPolBits(MI, Bits: CPol::SC0 | CPol::SC1);
      break;
    }
    // Without GFX940's SC bits, system scope is handled like agent scope.
    [[fallthrough]];
  case SIAtomicScope::AGENT:
    if (ST.hasGFX940Insts()) {
      // Set SC bits to indicate agent scope.
      Changed |= enableCPolBits(MI, Bits: CPol::SC1);
    } else {
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableCPolBits(MI, Bits: CPol::GLC);
    }
    break;
  case SIAtomicScope::WORKGROUP:
    if (ST.hasGFX940Insts()) {
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed. Setting
      // SC bits to indicate work-group scope will do this automatically.
      Changed |= enableCPolBits(MI, Bits: CPol::SC0);
    } else if (ST.hasGFX90AInsts()) {
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableCPolBits(MI, Bits: CPol::GLC);
    }
    break;
  case SIAtomicScope::WAVEFRONT:
  case SIAtomicScope::SINGLETHREAD:
    // No cache to bypass.
    break;
  default:
    llvm_unreachable("Unsupported synchronization scope");
  }

  return Changed;
}
1017
/// Make an atomic store at \p Scope bypass any cache that is not coherent at
/// that scope. Only GFX940 has store cache-bypass controls (the SC bits).
/// \returns true if \p MI was modified.
bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// For targets other than GFX940, the L1 cache is write through so does not
  /// need to be bypassed. There is no bypass control for the L2 cache at the
  /// isa level.

  if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AS: AddrSpace)) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC bits to indicate system scope.
      Changed |= enableCPolBits(MI, Bits: CPol::SC0 | CPol::SC1);
      break;
    case SIAtomicScope::AGENT:
      // Set SC bits to indicate agent scope.
      Changed |= enableCPolBits(MI, Bits: CPol::SC1);
      break;
    case SIAtomicScope::WORKGROUP:
      // Set SC bits to indicate workgroup scope.
      Changed |= enableCPolBits(MI, Bits: CPol::SC0);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Leave SC bits unset to indicate wavefront scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }

    /// The scratch address space does not need the global memory caches
    /// to be bypassed as all memory operations by the same thread are
    /// sequentially consistent, and no other thread can access scratch
    /// memory.

    /// Other address spaces do not have a cache.
  }

  return Changed;
}
1061
/// Make an atomic read-modify-write at \p Scope bypass any cache that is not
/// coherent at that scope. Only GFX940 has an RMW cache-bypass control (SC1).
/// \returns true if \p MI was modified.
bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// For targets other than GFX940, do not set GLC for RMW atomic operations as
  /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to
  /// indicate if they are return or no-return. Note: there is no L2 cache
  /// coherent bypass control at the ISA level.
  /// For GFX90A+, RMW atomics implicitly bypass the L1 cache.

  if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AS: AddrSpace)) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC1 bit to indicate system scope.
      Changed |= enableCPolBits(MI, Bits: CPol::SC1);
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // RMW atomic operations implicitly bypass the L1 cache and only use SC1
      // to indicate system or agent scope. The SC0 bit is used to indicate if
      // they are return or no-return. Leave SC1 bit unset to indicate agent
      // scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}
1097
/// Apply the cache policy required for a volatile and/or nontemporal load or
/// store: volatile accesses bypass the caches and then wait for completion at
/// system scope; nontemporal accesses select a streaming/evict policy.
/// \returns true if \p MI (or the code around it) was changed.
bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write insructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // handle it as do not want to pessimize all atomics. Also they do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (ST.hasGFX940Insts()) {
      // Set SC bits to indicate system scope.
      Changed |= enableCPolBits(MI, Bits: CPol::SC0 | CPol::SC1);
    } else if (Op == SIMemOp::LOAD) {
      // Set L1 cache policy to be MISS_EVICT for load instructions
      // and MISS_LRU for store instructions.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableCPolBits(MI, Bits: CPol::GLC);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
                          Pos: Position::AFTER, Order: AtomicOrdering::Unordered,
                          /*AtomicsOnly=*/false);

    return Changed;
  }

  if (IsNonTemporal) {
    if (ST.hasGFX940Insts()) {
      Changed |= enableCPolBits(MI, Bits: CPol::NT);
    } else {
      // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
      // for both loads and stores, and the L2 cache policy to STREAM.
      Changed |= enableCPolBits(MI, Bits: CPol::SLC | CPol::GLC);
    }
    return Changed;
  }

  return Changed;
}
1150
/// Insert an S_WAITCNT before or after \p MI (per \p Pos) that makes prior
/// memory operations on \p AddrSpace of kind \p Op complete as required by
/// \p Scope and \p Order. \returns true if any instruction was inserted.
bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering, Position Pos,
                                    AtomicOrdering Order,
                                    bool AtomicsOnly) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  // GFX90A+
  if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory operations
    // to complete to ensure they are visible to waves in the other CUs.
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same the L1, nor wait for GDS as access are ordered
    // on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as <GFX90A at AGENT scope;
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }

  // Which hardware counters must be waited down to zero.
  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(Version: IV,
                            Vmcnt: VMCnt ? 0 : getVmcntBitMask(Version: IV),
                            Expcnt: getExpcntBitMask(Version: IV),
                            Lgkmcnt: LGKMCnt ? 0 : getLgkmcntBitMask(Version: IV));
    BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_soft))
        .addImm(Val: WaitCntImmediate);
    Changed = true;
  }

  // On architectures that support direct loads to LDS, emit an unknown waitcnt
  // at workgroup-scoped release operations that specify the LDS address space.
  // SIInsertWaitcnts will later replace this with a vmcnt().
  if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(AO: Order) &&
      Scope == SIAtomicScope::WORKGROUP &&
      (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_lds_direct));
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1278
1279static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) {
1280 if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1281 return false;
1282 return !ST.isAmdPalOS() && !ST.isMesa3DOS();
1283}
1284
/// Insert cache-invalidating instructions before or after \p MI (per \p Pos)
/// so that loads following an acquire at \p Scope do not observe stale data.
/// Honors the -amdgcn-skip-cache-invalidations override.
/// \returns true if any instruction was inserted.
bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST)
                                    ? AMDGPU::BUFFER_WBINVL1_VOL
                                    : AMDGPU::BUFFER_WBINVL1;

  if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      if (ST.hasGFX940Insts()) {
        // Ensures that following loads will not see stale remote VMEM data or
        // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
        // and CC will never be stale due to the local memory probes.
        BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV))
            // Set SC bits to indicate system scope.
            .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
        // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
        // hardware does not reorder memory operations by the same wave with
        // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
        // remove any cache lines of earlier writes by the same wave and ensures
        // later reads by the same wave will refetch the cache lines.
        Changed = true;
        break;
      }

      if (ST.hasGFX90AInsts()) {
        // Ensures that following loads will not see stale remote VMEM data or
        // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
        // and CC will never be stale due to the local memory probes.
        BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INVL2));
        BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: InvalidateL1));
        // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
        // hardware does not reorder memory operations by the same wave with
        // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
        // to remove any cache lines of earlier writes by the same wave and
        // ensures later reads by the same wave will refetch the cache lines.
        Changed = true;
        break;
      }
      // Pre-GFX90A, system scope is handled the same as agent scope.
      [[fallthrough]];
    case SIAtomicScope::AGENT:
      if (ST.hasGFX940Insts()) {
        // Ensures that following loads will not see stale remote date or local
        // MTYPE NC global data. Local MTYPE RW and CC memory will never be
        // stale due to the memory probes.
        BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV))
            // Set SC bits to indicate agent scope.
            .addImm(Val: AMDGPU::CPol::SC1);
        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
        // does not reorder memory operations with respect to preceeding buffer
        // invalidate. The invalidate is guaranteed to remove any cache lines of
        // earlier writes and ensures later writes will refetch the cache lines.
      } else
        BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      if (ST.isTgSplitEnabled()) {
        if (ST.hasGFX940Insts()) {
          // In threadgroup split mode the waves of a work-group can be
          // executing on different CUs. Therefore need to invalidate the L1
          // which is per CU. Otherwise in non-threadgroup split mode all waves
          // of a work-group are on the same CU, and so the L1 does not need to
          // be invalidated.

          // Ensures L1 is invalidated if in threadgroup split mode. In
          // non-threadgroup split mode it is a NOP, but no point generating it
          // in that case if know not in that mode.
          BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV))
              // Set SC bits to indicate work-group scope.
              .addImm(Val: AMDGPU::CPol::SC0);
          // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
          // does not reorder memory operations with respect to preceeding
          // buffer invalidate. The invalidate is guaranteed to remove any cache
          // lines of earlier writes and ensures later writes will refetch the
          // cache lines.
          Changed = true;
        } else if (ST.hasGFX90AInsts()) {
          BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: InvalidateL1));
          Changed = true;
        }
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // For GFX940, we could generate "BUFFER_INV" but it would do nothing as
      // there are no caches to invalidate. All other targets have no cache to
      // invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1404
/// Insert the instructions needed for a release at \p Scope: on GFX90A+ a
/// "BUFFER_WBL2" cache writeback where applicable, always followed by the
/// S_WAITCNT that makes prior accesses on \p AddrSpace visible.
/// \returns true if any instruction was inserted.
bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  bool Changed = false;

  if (ST.hasGFX90AInsts()) {
    MachineBasicBlock &MBB = *MI->getParent();
    const DebugLoc &DL = MI->getDebugLoc();

    if (Pos == Position::AFTER)
      ++MI;

    if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
      switch (Scope) {
      case SIAtomicScope::SYSTEM:
        // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
        // hardware does not reorder memory operations by the same wave with
        // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
        // to initiate writeback of any dirty cache lines of earlier writes by
        // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
        // writeback has completed.
        BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2))
            // Set SC bits to indicate system scope.
            .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
        Changed = true;
        break;
      case SIAtomicScope::AGENT:
        if (ST.hasGFX940Insts()) {
          BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2))
              // Set SC bits to indicate agent scope.
              .addImm(Val: AMDGPU::CPol::SC1);

          // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
          // SIAtomicScope::AGENT, the following insertWait will generate the
          // required "S_WAITCNT vmcnt(0)".
          Changed = true;
        }
        break;
      case SIAtomicScope::WORKGROUP:
      case SIAtomicScope::WAVEFRONT:
      case SIAtomicScope::SINGLETHREAD:
        // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
        // would writeback, and would require an otherwise unnecessary
        // "S_WAITCNT vmcnt(0)".
        break;
      default:
        llvm_unreachable("Unsupported synchronization scope");
      }
    }

    if (Pos == Position::AFTER)
      --MI;
  }

  // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
  // S_WAITCNT needed.
  Changed |= insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE,
                        IsCrossAddrSpaceOrdering, Pos, Order: AtomicOrdering::Release,
                        /*AtomicsOnly=*/false);

  return Changed;
}
1469
/// Make an atomic load at \p Scope bypass the GFX10/11 L0/L1 caches that are
/// not coherent at that scope. \returns true if \p MI was modified.
bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
      // For GFX10, set GLC+DLC, for GFX11, only set GLC.
      Changed |=
          enableCPolBits(MI, Bits: CPol::GLC | (AMDGPU::isGFX10(STI: ST) ? CPol::DLC : 0));
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableCPolBits(MI, Bits: CPol::GLC);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}
1512
/// Apply the GFX10/11 cache policy required for a volatile and/or nontemporal
/// load or store: volatile accesses bypass L0/L1 and then wait for completion
/// at system scope; nontemporal accesses select a streaming policy.
/// \returns true if \p MI (or the code around it) was changed.
bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {

  // Only handle load and store, not atomic read-modify-write insructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // handle it as do not want to pessimize all atomics. Also they do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD) {
      Changed |= enableCPolBits(MI, Bits: CPol::GLC | CPol::DLC);
    }

    // GFX11: Set MALL NOALLOC for both load and store instructions.
    if (AMDGPU::isGFX11(STI: ST))
      Changed |= enableCPolBits(MI, Bits: CPol::DLC);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
                          Pos: Position::AFTER, Order: AtomicOrdering::Unordered,
                          /*AtomicsOnly=*/false);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
    if (Op == SIMemOp::STORE)
      Changed |= enableCPolBits(MI, Bits: CPol::GLC);
    Changed |= enableCPolBits(MI, Bits: CPol::SLC);

    // GFX11: Set MALL NOALLOC for both load and store instructions.
    if (AMDGPU::isGFX11(STI: ST))
      Changed |= enableCPolBits(MI, Bits: CPol::DLC);

    return Changed;
  }

  return Changed;
}
1571
bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos, AtomicOrdering Order,
                                     bool AtomicsOnly) const {
  // Insert the waitcnt instructions required on GFX10/GFX11 so that prior
  // memory operations of kind \p Op on \p AddrSpace are complete as required
  // for ordering at \p Scope. Waits are placed before or after \p MI per
  // \p Pos; \p MI is left pointing at the same instruction on return.
  // Returns true if any instruction was inserted.
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  // Temporarily step past MI so the builder inserts after it; undone below.
  if (Pos == Position::AFTER)
    ++MI;

  // GFX10+ splits the vector memory counter: vmcnt tracks VMEM loads, vscnt
  // tracks VMEM stores, and lgkmcnt tracks LDS/GDS (and scalar) operations.
  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode and all waves of a work-group are on the same CU
      // which shares the same L0. Note that we still need to wait when
      // performing a release in this mode to respect the transitivity of
      // happens-before, e.g. other waves of the workgroup must be able to
      // release the memory from another wave at a wider scope.
      if (!ST.isCuModeEnabled() || isReleaseOrStronger(AO: Order)) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  // vmcnt and lgkmcnt share one S_WAITCNT encoding; counters not being waited
  // on are encoded at their maximum (no-wait) value.
  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(Version: IV,
                              Vmcnt: VMCnt ? 0 : getVmcntBitMask(Version: IV),
                              Expcnt: getExpcntBitMask(Version: IV),
                              Lgkmcnt: LGKMCnt ? 0 : getLgkmcntBitMask(Version: IV));
    BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_soft))
        .addImm(Val: WaitCntImmediate);
    Changed = true;
  }

  // On architectures that support direct loads to LDS, emit an unknown waitcnt
  // at workgroup-scoped release operations that specify the LDS address space.
  // SIInsertWaitcnts will later replace this with a vmcnt().
  if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(AO: Order) &&
      Scope == SIAtomicScope::WORKGROUP &&
      (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_lds_direct));
    Changed = true;
  }

  // vscnt has its own instruction; waiting on the null register is the
  // counter-only form.
  if (VSCnt) {
    BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT_soft))
        .addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
        .addImm(Val: 0);
    Changed = true;
  }

  // Restore the caller's iterator to the original instruction.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1705
bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  // Insert the cache invalidations required on GFX10/GFX11 for an acquire
  // operation on \p AddrSpace at \p Scope, before or after \p MI per \p Pos.
  // Returns true if any instruction was inserted.

  // Honors the -amdgcn-skip-cache-invalidations testing flag.
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  // Temporarily step past MI so the builder inserts after it; undone below.
  if (Pos == Position::AFTER)
    ++MI;

  if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // The order of invalidates matter here. We must invalidate "outer in"
      // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
      // invalidated.
      BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL1_INV));
      BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL0_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
      // in CU mode and all waves of a work-group are on the same CU, and so the
      // L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  // Restore the caller's iterator to the original instruction.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1763
1764bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
1765 AMDGPU::CPol::CPol Value) const {
1766 MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol);
1767 if (!CPol)
1768 return false;
1769
1770 uint64_t NewTH = Value & AMDGPU::CPol::TH;
1771 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
1772 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
1773 return true;
1774 }
1775
1776 return false;
1777}
1778
1779bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
1780 AMDGPU::CPol::CPol Value) const {
1781 MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol);
1782 if (!CPol)
1783 return false;
1784
1785 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
1786 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
1787 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
1788 return true;
1789 }
1790
1791 return false;
1792}
1793
1794bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
1795 const MachineBasicBlock::iterator MI) const {
1796 // TODO: implement flag for frontend to give us a hint not to insert waits.
1797
1798 MachineBasicBlock &MBB = *MI->getParent();
1799 const DebugLoc &DL = MI->getDebugLoc();
1800
1801 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_LOADCNT_soft)).addImm(Val: 0);
1802 if (ST.hasImageInsts()) {
1803 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_SAMPLECNT_soft)).addImm(Val: 0);
1804 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_BVHCNT_soft)).addImm(Val: 0);
1805 }
1806 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_KMCNT_soft)).addImm(Val: 0);
1807 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_STORECNT_soft)).addImm(Val: 0);
1808
1809 return true;
1810}
1811
bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos, AtomicOrdering Order,
                                     bool AtomicsOnly) const {
  // Insert the S_WAIT_*CNT instructions required on GFX12 so that prior
  // memory operations of kind \p Op on \p AddrSpace are complete as required
  // for ordering at \p Scope. If \p AtomicsOnly is set, only counters that
  // can track atomic operations are waited on (BVH/SAMPLE waits are skipped).
  // Returns true if any instruction was inserted.
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  // GFX12 splits the counters further: loadcnt tracks VMEM loads, storecnt
  // tracks VMEM stores, and dscnt tracks LDS operations.
  bool LOADCnt = false;
  bool DSCnt = false;
  bool STORECnt = false;

  // Temporarily step past MI so the builder inserts after it; undone below.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::CLUSTER:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        LOADCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        STORECnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // GFX12.0:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore need to wait for operations to complete to
      // ensure they are visible to waves in the other CU as the L0 is per CU.
      //
      // Otherwise in CU mode and all waves of a work-group are on the same CU
      // which shares the same L0. Note that we still need to wait when
      // performing a release in this mode to respect the transitivity of
      // happens-before, e.g. other waves of the workgroup must be able to
      // release the memory from another wave at a wider scope.
      //
      // GFX12.5:
      // CU$ has two ports. To ensure operations are visible at the workgroup
      // level, we need to ensure all operations in this port have completed
      // so the other SIMDs in the WG can see them. There is no ordering
      // guarantee between the ports.
      if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
          isReleaseOrStronger(AO: Order)) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          LOADCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          STORECnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::CLUSTER:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      DSCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (LOADCnt) {
    // Acquire sequences only need to wait on the previous atomic operation.
    // e.g. a typical sequence looks like
    //    atomic load
    //    (wait)
    //    global_inv
    //
    // We do not have BVH or SAMPLE atomics, so the atomic load is always going
    // to be tracked using loadcnt.
    //
    // This also applies to fences. Fences cannot pair with an instruction
    // tracked with bvh/samplecnt as we don't have any atomics that do that.
    if (!AtomicsOnly && ST.hasImageInsts()) {
      BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_BVHCNT_soft)).addImm(Val: 0);
      BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(Val: 0);
    }
    BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_soft)).addImm(Val: 0);
    Changed = true;
  }

  if (STORECnt) {
    BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_STORECNT_soft)).addImm(Val: 0);
    Changed = true;
  }

  if (DSCnt) {
    BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_DSCNT_soft)).addImm(Val: 0);
    Changed = true;
  }

  // Restore the caller's iterator to the original instruction.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1935
bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  // Insert a GLOBAL_INV at the appropriate scope for an acquire operation on
  // \p AddrSpace, before or after \p MI per \p Pos. Returns true if any
  // instruction was inserted.

  // Honors the -amdgcn-skip-cache-invalidations testing flag.
  if (!InsertCacheInv)
    return false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
  if (!canAffectGlobalAddrSpace(AS: AddrSpace))
    return false;

  // Map the synchronization scope to the scope immediate of GLOBAL_INV.
  AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
  switch (Scope) {
  case SIAtomicScope::SYSTEM:
    ScopeImm = AMDGPU::CPol::SCOPE_SYS;
    break;
  case SIAtomicScope::AGENT:
    ScopeImm = AMDGPU::CPol::SCOPE_DEV;
    break;
  case SIAtomicScope::CLUSTER:
    ScopeImm = AMDGPU::CPol::SCOPE_SE;
    break;
  case SIAtomicScope::WORKGROUP:
    // GFX12.0:
    // In WGP mode the waves of a work-group can be executing on either CU of
    // the WGP. Therefore we need to invalidate the L0 which is per CU.
    // Otherwise in CU mode all waves of a work-group are on the same CU, and
    // so the L0 does not need to be invalidated.
    //
    // GFX12.5 has a shared WGP$, so no invalidates are required.
    if (ST.isCuModeEnabled())
      return false;

    ScopeImm = AMDGPU::CPol::SCOPE_SE;
    break;
  case SIAtomicScope::WAVEFRONT:
  case SIAtomicScope::SINGLETHREAD:
    // No cache to invalidate.
    return false;
  default:
    llvm_unreachable("Unsupported synchronization scope");
  }

  // Temporarily step past MI so the builder inserts after it; undone below.
  if (Pos == Position::AFTER)
    ++MI;

  BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GLOBAL_INV)).addImm(Val: ScopeImm);

  if (Pos == Position::AFTER)
    --MI;

  // Target requires a waitcnt to ensure that the preceding INV has completed
  // as it may get reordered with following load instructions.
  if (ST.hasINVWBL2WaitCntRequirement() && Scope > SIAtomicScope::CLUSTER) {
    // insertWait re-advances the iterator for Position::AFTER, so compensate
    // again afterwards to leave MI pointing at the original instruction.
    insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD,
               /*IsCrossAddrSpaceOrdering=*/false, Pos, Order: AtomicOrdering::Acquire,
               /*AtomicsOnly=*/false);

    if (Pos == Position::AFTER)
      --MI;
  }

  return true;
}
2008
bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        bool IsCrossAddrSpaceOrdering,
                                        Position Pos) const {
  // Insert the writeback (GLOBAL_WB) and waits required on GFX12 for a
  // release operation on \p AddrSpace at \p Scope, before or after \p MI per
  // \p Pos. Returns true if any instruction was inserted.
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();

  // The scratch address space does not need the global memory cache
  // writeback as all memory operations by the same thread are
  // sequentially consistent, and no other thread can access scratch
  // memory.
  if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
    if (Pos == Position::AFTER)
      ++MI;

    // global_wb is only necessary at system scope for GFX12.0,
    // they're also necessary at device scope for GFX12.5 as stores
    // cannot report completion earlier than L2.
    //
    // Emitting it for lower scopes is a slow no-op, so we omit it
    // for performance.
    std::optional<AMDGPU::CPol::CPol> NeedsWB;
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      NeedsWB = AMDGPU::CPol::SCOPE_SYS;
      break;
    case SIAtomicScope::AGENT:
      // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
      if (ST.hasGFX1250Insts())
        NeedsWB = AMDGPU::CPol::SCOPE_DEV;
      break;
    case SIAtomicScope::CLUSTER:
    case SIAtomicScope::WORKGROUP:
      // No WB necessary, but we still have to wait.
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No WB or wait necessary here, but insertWait takes care of that.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }

    if (NeedsWB) {
      // Target requires a waitcnt to ensure that the preceding store/rmw
      // operations have completed in L2 so their data will be written back
      // by the WB instruction.
      if (ST.hasINVWBL2WaitCntRequirement())
        insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE,
                   /*IsCrossAddrSpaceOrdering=*/false, Pos,
                   Order: AtomicOrdering::Release,
                   /*AtomicsOnly=*/false);

      BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GLOBAL_WB)).addImm(Val: *NeedsWB);
      Changed = true;
    }

    if (Pos == Position::AFTER)
      --MI;
  }

  // We always have to wait for previous memory operations (load/store) to
  // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
  // we of course need to wait for that as well.
  Changed |= insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE,
                        IsCrossAddrSpaceOrdering, Pos, Order: AtomicOrdering::Release,
                        /*AtomicsOnly=*/false);

  return Changed;
}
2081
2082bool SIGfx12CacheControl::handleNonVolatile(MachineInstr &MI) const {
2083 // On GFX12.5, set the NV CPol bit.
2084 if (!ST.hasGFX1250Insts())
2085 return false;
2086 MachineOperand *CPol = TII->getNamedOperand(MI, OperandName: OpName::cpol);
2087 if (!CPol)
2088 return false;
2089 CPol->setImm(CPol->getImm() | AMDGPU::CPol::NV);
2090 return true;
2091}
2092
bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Apply the cache-policy hints and waits required on GFX12 for a volatile,
  // nontemporal, and/or last-use plain load or store. Returns true if the
  // instruction or surrounding code was changed.

  // Only handle load and store, not atomic read-modify-write instructions.
  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile so cannot sensibly
  // handle it as do not want to pessimize all atomics. Also they do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  // Last-use takes precedence over nontemporal; both are expressed through
  // the TH (temporal hint) field.
  if (IsLastUse) {
    // Set last-use hint.
    Changed |= setTH(MI, Value: AMDGPU::CPol::TH_LU);
  } else if (IsNonTemporal) {
    // Set non-temporal hint for all cache levels.
    Changed |= setTH(MI, Value: AMDGPU::CPol::TH_NT);
  }

  if (IsVolatile) {
    Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SYS);

    if (ST.requiresWaitXCntForSingleAccessInstructions() &&
        SIInstrInfo::isVMEM(MI: *MI)) {
      MachineBasicBlock &MBB = *MI->getParent();
      BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: S_WAIT_XCNT_soft)).addImm(Val: 0);
      Changed = true;
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
                          Pos: Position::AFTER, Order: AtomicOrdering::Unordered,
                          /*AtomicsOnly=*/false);
  }

  return Changed;
}
2138
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
  // Apply GFX12-specific fixups needed after a store (or atomic rmw) has been
  // fully expanded: an XCNT wait for single-access atomics on subtargets that
  // need it, and extra waits before non-atomic system-scope stores on
  // GFX12.0. Returns true if anything was inserted.
  assert(MI.mayStore() && "Not a Store inst");
  const bool IsRMW = (MI.mayLoad() && MI.mayStore());
  bool Changed = false;

  if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
      SIInstrInfo::isVMEM(MI)) {
    MachineBasicBlock &MBB = *MI.getParent();
    BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: S_WAIT_XCNT_soft)).addImm(Val: 0);
    Changed = true;
  }

  // Remaining fixes do not apply to RMWs.
  if (IsRMW)
    return Changed;

  MachineOperand *CPol = TII->getNamedOperand(MI, OperandName: OpName::cpol);
  if (!CPol) // Some vmem operations do not have a scope and are not concerned.
    return Changed;
  const unsigned Scope = CPol->getImm() & CPol::SCOPE;

  // GFX12.0 only: Extra waits needed before system scope stores.
  if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
      Scope == CPol::SCOPE_SYS)
    Changed |= insertWaitsBeforeSystemScopeStore(MI: MI.getIterator());

  return Changed;
}
2167
2168bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2169 if (!ST.hasGFX1250Insts())
2170 return false;
2171
2172 // Cooperative atomics need to be SCOPE_DEV or higher.
2173 MachineOperand *CPol = TII->getNamedOperand(MI, OperandName: OpName::cpol);
2174 assert(CPol && "No CPol operand?");
2175 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2176 if (Scope < CPol::SCOPE_DEV)
2177 return setScope(MI, Value: CPol::SCOPE_DEV);
2178 return false;
2179}
2180
2181bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2182 SIAtomicScope Scope,
2183 SIAtomicAddrSpace AddrSpace) const {
2184 bool Changed = false;
2185
2186 if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
2187 switch (Scope) {
2188 case SIAtomicScope::SYSTEM:
2189 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SYS);
2190 break;
2191 case SIAtomicScope::AGENT:
2192 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_DEV);
2193 break;
2194 case SIAtomicScope::CLUSTER:
2195 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SE);
2196 break;
2197 case SIAtomicScope::WORKGROUP:
2198 // In workgroup mode, SCOPE_SE is needed as waves can executes on
2199 // different CUs that access different L0s.
2200 if (!ST.isCuModeEnabled())
2201 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SE);
2202 break;
2203 case SIAtomicScope::WAVEFRONT:
2204 case SIAtomicScope::SINGLETHREAD:
2205 // No cache to bypass.
2206 break;
2207 default:
2208 llvm_unreachable("Unsupported synchronization scope");
2209 }
2210 }
2211
2212 // The scratch address space does not need the global memory caches
2213 // to be bypassed as all memory operations by the same thread are
2214 // sequentially consistent, and no other thread can access scratch
2215 // memory.
2216
2217 // Other address spaces do not have a cache.
2218
2219 return Changed;
2220}
2221
2222bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2223 if (AtomicPseudoMIs.empty())
2224 return false;
2225
2226 for (auto &MI : AtomicPseudoMIs)
2227 MI->eraseFromParent();
2228
2229 AtomicPseudoMIs.clear();
2230 return true;
2231}
2232
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  // Expand an atomic or volatile/nontemporal load \p MI per the memory model,
  // inserting the cache bypass, waits, and invalidations its ordering and
  // scope require. Returns true if anything was changed.
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    const AtomicOrdering Order = MOI.getOrdering();
    if (Order == AtomicOrdering::Monotonic ||
        Order == AtomicOrdering::Acquire ||
        Order == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, Scope: MOI.getScope(),
                                           AddrSpace: MOI.getOrderingAddrSpace());
    }

    // Handle cooperative atomics after cache bypass step, as it may override
    // the scope of the instruction to a greater scope.
    if (MOI.isCooperative())
      Changed |= CC->handleCooperativeAtomic(MI&: *MI);

    // seq_cst additionally requires all prior memory operations to complete
    // before the load itself executes.
    if (Order == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, Scope: MOI.getScope(), AddrSpace: MOI.getOrderingAddrSpace(),
                                Op: SIMemOp::LOAD | SIMemOp::STORE,
                                IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
                                Pos: Position::BEFORE, Order, /*AtomicsOnly=*/false);

    if (Order == AtomicOrdering::Acquire ||
        Order == AtomicOrdering::SequentiallyConsistent) {
      // The wait below only needs to wait on the prior atomic.
      Changed |=
          CC->insertWait(MI, Scope: MOI.getScope(), AddrSpace: MOI.getInstrAddrSpace(),
                         Op: SIMemOp::LOAD, IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
                         Pos: Position::AFTER, Order, /*AtomicsOnly=*/true);
      Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(),
                                   AddrSpace: MOI.getOrderingAddrSpace(),
                                   Pos: Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
  // instructions need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::LOAD, IsVolatile: MOI.isVolatile(),
      IsNonTemporal: MOI.isNonTemporal(), IsLastUse: MOI.isLastUse());

  return Changed;
}
2283
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  // Expand an atomic or volatile/nontemporal store \p MI per the memory
  // model, inserting the cache bypass, release sequence, and store fixups its
  // ordering and scope require. Returns true if anything was changed.
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;
  // FIXME: Necessary hack because iterator can lose track of the store.
  MachineInstr &StoreMI = *MI;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, Scope: MOI.getScope(),
                                            AddrSpace: MOI.getOrderingAddrSpace());
    }

    // Handle cooperative atomics after cache bypass step, as it may override
    // the scope of the instruction to a greater scope.
    if (MOI.isCooperative())
      Changed |= CC->handleCooperativeAtomic(MI&: *MI);

    // Release (and seq_cst) stores must make prior operations visible before
    // the store itself executes.
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, Scope: MOI.getScope(),
                                   AddrSpace: MOI.getOrderingAddrSpace(),
                                   IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
                                   Pos: Position::BEFORE);

    Changed |= CC->finalizeStore(MI&: StoreMI, /*Atomic=*/true);
    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::STORE, IsVolatile: MOI.isVolatile(),
      IsNonTemporal: MOI.isNonTemporal());

  // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
  // instruction field, do not confuse it with atomic scope.
  Changed |= CC->finalizeStore(MI&: StoreMI, /*Atomic=*/false);
  return Changed;
}
2328
bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  // Expand an ATOMIC_FENCE pseudo into the waits, release, and acquire
  // sequences its ordering requires. The pseudo itself is queued for removal
  // via AtomicPseudoMIs. Returns true if anything was changed.
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(x: MI);
  bool Changed = false;

  const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();

  if (MOI.isAtomic()) {
    const AtomicOrdering Order = MOI.getOrdering();
    if (Order == AtomicOrdering::Acquire) {
      // Acquire fences only need to wait on the previous atomic they pair with.
      Changed |= CC->insertWait(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace,
                                Op: SIMemOp::LOAD | SIMemOp::STORE,
                                IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
                                Pos: Position::BEFORE, Order, /*AtomicsOnly=*/true);
    }

    if (Order == AtomicOrdering::Release ||
        Order == AtomicOrdering::AcquireRelease ||
        Order == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the proceeding LDS operations. If barrier had a memory
      /// ordering and memory scope, then library does not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace,
                                   IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
                                   Pos: Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be combined
    // to use the single "BUFFER_WBINV*" instruction. This could be done by
    // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
    // track cache invalidate and write back instructions.

    if (Order == AtomicOrdering::Acquire ||
        Order == AtomicOrdering::AcquireRelease ||
        Order == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace,
                                   Pos: Position::BEFORE);

    return Changed;
  }

  return Changed;
}
2378
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  // Expand an atomic compare-exchange or read-modify-write \p MI per the
  // memory model, inserting cache bypass, release, wait, and acquire
  // sequences as its success/failure orderings require. Returns true if
  // anything was changed.
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;
  // Keep a direct reference: the iterator may be repositioned by the
  // insertions below, but finalizeStore needs the original instruction.
  MachineInstr &RMWMI = *MI;

  if (MOI.isAtomic()) {
    const AtomicOrdering Order = MOI.getOrdering();
    if (Order == AtomicOrdering::Monotonic ||
        Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
        Order == AtomicOrdering::AcquireRelease ||
        Order == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, Scope: MOI.getScope(),
                                          AddrSpace: MOI.getInstrAddrSpace());
    }

    if (Order == AtomicOrdering::Release ||
        Order == AtomicOrdering::AcquireRelease ||
        Order == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, Scope: MOI.getScope(),
                                   AddrSpace: MOI.getOrderingAddrSpace(),
                                   IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
                                   Pos: Position::BEFORE);

    if (Order == AtomicOrdering::Acquire ||
        Order == AtomicOrdering::AcquireRelease ||
        Order == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      // Only wait on the previous atomic. A returning RMW is tracked by the
      // load counter, a non-returning one by the store counter.
      Changed |=
          CC->insertWait(MI, Scope: MOI.getScope(), AddrSpace: MOI.getInstrAddrSpace(),
                         Op: isAtomicRet(MI: *MI) ? SIMemOp::LOAD : SIMemOp::STORE,
                         IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), Pos: Position::AFTER,
                         Order, /*AtomicsOnly=*/true);
      Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(),
                                   AddrSpace: MOI.getOrderingAddrSpace(),
                                   Pos: Position::AFTER);
    }

    Changed |= CC->finalizeStore(MI&: RMWMI, /*Atomic=*/true);
    return Changed;
  }

  return Changed;
}
2427
bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
                                     MachineBasicBlock::iterator &MI) {
  // Expand a direct-to/from-LDS DMA operation, applying any volatile or
  // nontemporal markers to its global-memory side. Returns true if anything
  // was changed.
  assert(MI->mayLoad() && MI->mayStore());

  // The volatility or nontemporal-ness of the operation is a
  // function of the global memory, not the LDS.
  SIMemOp OpKind =
      SIInstrInfo::mayWriteLDSThroughDMA(MI: *MI) ? SIMemOp::LOAD : SIMemOp::STORE;

  // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
  // stores. The operation is treated as a volatile/nontemporal store
  // to its second argument.
  return CC->enableVolatileAndOrNonTemporal(
      MI, AddrSpace: MOI.getInstrAddrSpace(), Op: OpKind, IsVolatile: MOI.isVolatile(),
      IsNonTemporal: MOI.isNonTemporal(), IsLastUse: MOI.isLastUse());
}
2444
2445bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2446 const MachineModuleInfo &MMI =
2447 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2448 return SIMemoryLegalizer(MMI).run(MF);
2449}
2450
2451PreservedAnalyses
2452SIMemoryLegalizerPass::run(MachineFunction &MF,
2453 MachineFunctionAnalysisManager &MFAM) {
2454 auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(IR&: MF)
2455 .getCachedResult<MachineModuleAnalysis>(
2456 IR&: *MF.getFunction().getParent());
2457 assert(MMI && "MachineModuleAnalysis must be available");
2458 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2459 return PreservedAnalyses::all();
2460 return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
2461}
2462
bool SIMemoryLegalizer::run(MachineFunction &MF) {
  bool Changed = false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
  // Select the cache-control implementation for this subtarget.
  CC = SICacheControl::create(ST);

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        // ++II steps past the BUNDLE header to the first bundled
        // instruction; each one is detached from its predecessor and has
        // its internal-read operand markings (only meaningful inside a
        // bundle) cleared.
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        // Erase the now-detached BUNDLE header and resume the outer loop at
        // the first instruction that was inside the bundle (II), so it is
        // processed normally below.
        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic) {
        // Classify the memory operation and expand it according to the
        // memory model; the first query that yields info wins.
        if (const auto &MOI = MOA.getLoadInfo(MI))
          Changed |= expandLoad(MOI: *MOI, MI);
        else if (const auto &MOI = MOA.getStoreInfo(MI))
          Changed |= expandStore(MOI: *MOI, MI);
        else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
          Changed |= expandLDSDMA(MOI: *MOI, MI);
        else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
          Changed |= expandAtomicFence(MOI: *MOI, MI);
        else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
          Changed |= expandAtomicCmpxchgOrRmw(MOI: *MOI, MI);
      }

      // Non-atomic, non-volatile accesses may still need cache-control
      // handling on some subtargets.
      if (isNonVolatileMemoryAccess(MI: *MI))
        Changed |= CC->handleNonVolatile(MI&: *MI);
    }
  }

  // Drop any atomic pseudo-instructions that remain after expansion.
  Changed |= removeAtomicPseudoMIs();
  return Changed;
}
2509
// Register the legacy pass with the global pass registry.
INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)

// Pass identity: the address of ID uniquely identifies the pass; the
// exported reference lets other code name it without including this file.
char SIMemoryLegalizerLegacy::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
2514
2515FunctionPass *llvm::createSIMemoryLegalizerPass() {
2516 return new SIMemoryLegalizerLegacy();
2517}
2518