1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
17#include "AMDGPUMachineModuleInfo.h"
18#include "GCNSubtarget.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "llvm/ADT/BitmaskEnum.h"
21#include "llvm/ADT/StringExtras.h"
22#include "llvm/CodeGen/MachineBasicBlock.h"
23#include "llvm/CodeGen/MachineFunctionPass.h"
24#include "llvm/CodeGen/MachinePassManager.h"
25#include "llvm/IR/DiagnosticInfo.h"
26#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
27#include "llvm/IR/PassManager.h"
28#include "llvm/Support/AMDGPUAddrSpace.h"
29#include "llvm/Support/AtomicOrdering.h"
30#include "llvm/TargetParser/TargetParser.h"
31
32using namespace llvm;
33using namespace llvm::AMDGPU;
34
35#define DEBUG_TYPE "si-memory-legalizer"
36#define PASS_NAME "SI Memory Legalizer"
37
static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));
41
42namespace {
43
44LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
45
46/// Memory operation flags. Can be ORed together.
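/// For example, waiting on both loads and stores is expressed as
/// (SIMemOp::LOAD | SIMemOp::STORE), and individual bits are tested with
/// expressions like ((Op & SIMemOp::LOAD) != SIMemOp::NONE).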
47enum class SIMemOp {
48 NONE = 0u,
49 LOAD = 1u << 0,
50 STORE = 1u << 1,
51 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
52};
53
54/// Position to insert a new instruction relative to an existing
55/// instruction.
56enum class Position {
57 BEFORE,
58 AFTER
59};
60
61/// The atomic synchronization scopes supported by the AMDGPU target.
62enum class SIAtomicScope {
63 NONE,
64 SINGLETHREAD,
65 WAVEFRONT,
66 WORKGROUP,
67 CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
68 AGENT,
69 SYSTEM
70};
71
72/// The distinct address spaces supported by the AMDGPU target for
73/// atomic memory operation. Can be ORed together.
74enum class SIAtomicAddrSpace {
75 NONE = 0u,
76 GLOBAL = 1u << 0,
77 LDS = 1u << 1,
78 SCRATCH = 1u << 2,
79 GDS = 1u << 3,
80 OTHER = 1u << 4,
81
82 /// The address spaces that can be accessed by a FLAT instruction.
83 FLAT = GLOBAL | LDS | SCRATCH,
84
85 /// The address spaces that support atomic instructions.
86 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
87
88 /// All address spaces.
89 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
90
91 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
92};
93
94class SIMemOpInfo final {
95private:
96
97 friend class SIMemOpAccess;
98
99 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
100 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
101 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
102 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
103 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
104 bool IsCrossAddressSpaceOrdering = false;
105 bool IsVolatile = false;
106 bool IsNonTemporal = false;
107 bool IsLastUse = false;
108 bool IsCooperative = false;
109
110 // TODO: Should we assume Cooperative=true if no MMO is present?
111 SIMemOpInfo(
112 const GCNSubtarget &ST,
113 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
114 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
115 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
116 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
117 bool IsCrossAddressSpaceOrdering = true,
118 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
119 bool IsVolatile = false, bool IsNonTemporal = false,
120 bool IsLastUse = false, bool IsCooperative = false)
121 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
122 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
123 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
124 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
125 IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
126
127 if (Ordering == AtomicOrdering::NotAtomic) {
128 assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
129 assert(Scope == SIAtomicScope::NONE &&
130 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
131 !IsCrossAddressSpaceOrdering &&
132 FailureOrdering == AtomicOrdering::NotAtomic);
133 return;
134 }
135
136 assert(Scope != SIAtomicScope::NONE &&
137 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
138 SIAtomicAddrSpace::NONE &&
139 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
140 SIAtomicAddrSpace::NONE);
141
142 // There is also no cross address space ordering if the ordering
143 // address space is the same as the instruction address space and
144 // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
148
149 // Limit the scope to the maximum supported by the instruction's address
150 // spaces.
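    // For example, an atomic whose only instruction address space is LDS is
    // clamped to at most WORKGROUP scope, since LDS is only visible within a
    // work-group.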
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
163
164 // On targets that have no concept of a workgroup cluster, use
165 // AGENT scope as a conservatively correct alternative.
166 if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
167 this->Scope = SIAtomicScope::AGENT;
168 }
169
170public:
171 /// \returns Atomic synchronization scope of the machine instruction used to
172 /// create this SIMemOpInfo.
173 SIAtomicScope getScope() const {
174 return Scope;
175 }
176
177 /// \returns Ordering constraint of the machine instruction used to
178 /// create this SIMemOpInfo.
179 AtomicOrdering getOrdering() const {
180 return Ordering;
181 }
182
183 /// \returns Failure ordering constraint of the machine instruction used to
184 /// create this SIMemOpInfo.
185 AtomicOrdering getFailureOrdering() const {
186 return FailureOrdering;
187 }
188
  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
191 SIAtomicAddrSpace getInstrAddrSpace() const {
192 return InstrAddrSpace;
193 }
194
195 /// \returns The address spaces that must be ordered by the machine
196 /// instruction used to create this SIMemOpInfo.
197 SIAtomicAddrSpace getOrderingAddrSpace() const {
198 return OrderingAddrSpace;
199 }
200
  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
203 bool getIsCrossAddressSpaceOrdering() const {
204 return IsCrossAddressSpaceOrdering;
205 }
206
207 /// \returns True if memory access of the machine instruction used to
208 /// create this SIMemOpInfo is volatile, false otherwise.
209 bool isVolatile() const {
210 return IsVolatile;
211 }
212
213 /// \returns True if memory access of the machine instruction used to
214 /// create this SIMemOpInfo is nontemporal, false otherwise.
215 bool isNonTemporal() const {
216 return IsNonTemporal;
217 }
218
219 /// \returns True if memory access of the machine instruction used to
220 /// create this SIMemOpInfo is last use, false otherwise.
221 bool isLastUse() const { return IsLastUse; }
222
223 /// \returns True if this is a cooperative load or store atomic.
224 bool isCooperative() const { return IsCooperative; }
225
226 /// \returns True if ordering constraint of the machine instruction used to
227 /// create this SIMemOpInfo is unordered or higher, false otherwise.
228 bool isAtomic() const {
229 return Ordering != AtomicOrdering::NotAtomic;
230 }
231
232};
233
234class SIMemOpAccess final {
235private:
236 const AMDGPUMachineModuleInfo *MMI = nullptr;
237 const GCNSubtarget &ST;
238
239 /// Reports unsupported message \p Msg for \p MI to LLVM context.
240 void reportUnsupported(const MachineBasicBlock::iterator &MI,
241 const char *Msg) const;
242
243 /// Inspects the target synchronization scope \p SSID and determines
244 /// the SI atomic scope it corresponds to, the address spaces it
245 /// covers, and whether the memory ordering applies between address
246 /// spaces.
247 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
248 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
249
  /// \returns A bit set of the address spaces accessed by \p AS.
251 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
252
  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
255 std::optional<SIMemOpInfo>
256 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
257
258public:
259 /// Construct class to support accessing the machine memory operands
260 /// of instructions in the machine function \p MF.
261 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
262
263 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
264 std::optional<SIMemOpInfo>
265 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
266
267 /// \returns Store info if \p MI is a store operation, "std::nullopt"
268 /// otherwise.
269 std::optional<SIMemOpInfo>
270 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
271
272 /// \returns Atomic fence info if \p MI is an atomic fence operation,
273 /// "std::nullopt" otherwise.
274 std::optional<SIMemOpInfo>
275 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
276
277 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
278 /// rmw operation, "std::nullopt" otherwise.
279 std::optional<SIMemOpInfo>
280 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
281
  /// \returns DMA to LDS info if \p MI is a direct-to/from-LDS load/store,
  /// along with an indication of whether this is a load or store. If it is not
  /// a direct-to/from-LDS operation, returns std::nullopt.
285 std::optional<SIMemOpInfo>
286 getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
287};
288
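/// Interface to the subtarget-specific cache-control details. The legalizer
/// below calls the enable*/insert* hooks to implement the AMDGPU memory model
/// described at http://llvm.org/docs/AMDGPUUsage.html#memory-model.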
289class SICacheControl {
290protected:
291
292 /// AMDGPU subtarget info.
293 const GCNSubtarget &ST;
294
295 /// Instruction info.
296 const SIInstrInfo *TII = nullptr;
297
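  /// ISA version of the subtarget, used to encode waitcnt immediates.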
298 IsaVersion IV;
299
300 /// Whether to insert cache invalidating instructions.
301 bool InsertCacheInv;
302
303 SICacheControl(const GCNSubtarget &ST);
304
  /// Sets the given CPol \p Bits if a CPol operand is present in instruction
  /// \p MI.
  /// \returns True if \p MI is modified, false otherwise.
307 bool enableCPolBits(const MachineBasicBlock::iterator MI,
308 unsigned Bits) const;
309
  /// Check if any atomic operation on \p AS can affect memory accessible via
  /// the global address space.
312 bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
313
314public:
315 using CPol = AMDGPU::CPol::CPol;
316
317 /// Create a cache control for the subtarget \p ST.
318 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
319
320 /// Update \p MI memory load instruction to bypass any caches up to
321 /// the \p Scope memory scope for address spaces \p
322 /// AddrSpace. Return true iff the instruction was modified.
323 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
324 SIAtomicScope Scope,
325 SIAtomicAddrSpace AddrSpace) const = 0;
326
327 /// Update \p MI memory store instruction to bypass any caches up to
328 /// the \p Scope memory scope for address spaces \p
329 /// AddrSpace. Return true iff the instruction was modified.
330 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
331 SIAtomicScope Scope,
332 SIAtomicAddrSpace AddrSpace) const = 0;
333
334 /// Update \p MI memory read-modify-write instruction to bypass any caches up
335 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
336 /// iff the instruction was modified.
337 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
338 SIAtomicScope Scope,
339 SIAtomicAddrSpace AddrSpace) const = 0;
340
341 /// Update \p MI memory instruction of kind \p Op associated with address
342 /// spaces \p AddrSpace to indicate it is volatile and/or
343 /// nontemporal/last-use. Return true iff the instruction was modified.
344 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
345 SIAtomicAddrSpace AddrSpace,
346 SIMemOp Op, bool IsVolatile,
347 bool IsNonTemporal,
348 bool IsLastUse = false) const = 0;
349
  /// Add final touches to a `mayStore` instruction \p MI, which may be a
  /// Store or RMW instruction.
  /// FIXME: This takes an MI because iterators aren't handled properly. When
  /// this is called, they often point to entirely different insts. Thus we
  /// back up the inst early and pass it here instead.
  virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
    return false;
  }
358
359 /// Handle cooperative load/store atomics.
360 virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
361 llvm_unreachable(
362 "cooperative atomics are not available on this architecture");
363 }
364
365 /// Inserts any necessary instructions at position \p Pos relative
366 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
367 /// \p Op associated with address spaces \p AddrSpace have completed. Used
368 /// between memory instructions to enforce the order they become visible as
369 /// observed by other memory instructions executing in memory scope \p Scope.
370 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
371 /// address spaces. If \p AtomicsOnly is true, only insert waits for counters
372 /// that are used by atomic instructions.
373 /// Returns true iff any instructions inserted.
374 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
375 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
376 bool IsCrossAddrSpaceOrdering, Position Pos,
377 AtomicOrdering Order, bool AtomicsOnly) const = 0;
378
379 /// Inserts any necessary instructions at position \p Pos relative to
380 /// instruction \p MI to ensure any subsequent memory instructions of this
381 /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
383 /// Returns true iff any instructions inserted.
384 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
385 SIAtomicScope Scope,
386 SIAtomicAddrSpace AddrSpace,
387 Position Pos) const = 0;
388
389 /// Inserts any necessary instructions at position \p Pos relative to
390 /// instruction \p MI to ensure previous memory instructions by this thread
391 /// with address spaces \p AddrSpace have completed and can be observed by
392 /// subsequent memory instructions by any thread executing in memory scope \p
393 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
394 /// between address spaces. Returns true iff any instructions inserted.
395 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
396 SIAtomicScope Scope,
397 SIAtomicAddrSpace AddrSpace,
398 bool IsCrossAddrSpaceOrdering,
399 Position Pos) const = 0;
400
401 /// Virtual destructor to allow derivations to be deleted.
402 virtual ~SICacheControl() = default;
403};
404
405/// Generates code sequences for the memory model of all GFX targets below
406/// GFX10.
407class SIGfx6CacheControl final : public SICacheControl {
408public:
409
410 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
411
412 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
413 SIAtomicScope Scope,
414 SIAtomicAddrSpace AddrSpace) const override;
415
416 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
417 SIAtomicScope Scope,
418 SIAtomicAddrSpace AddrSpace) const override;
419
420 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
421 SIAtomicScope Scope,
422 SIAtomicAddrSpace AddrSpace) const override;
423
424 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
425 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
426 bool IsVolatile, bool IsNonTemporal,
427 bool IsLastUse) const override;
428
429 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
430 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
431 bool IsCrossAddrSpaceOrdering, Position Pos,
432 AtomicOrdering Order, bool AtomicsOnly) const override;
433
434 bool insertAcquire(MachineBasicBlock::iterator &MI,
435 SIAtomicScope Scope,
436 SIAtomicAddrSpace AddrSpace,
437 Position Pos) const override;
438
439 bool insertRelease(MachineBasicBlock::iterator &MI,
440 SIAtomicScope Scope,
441 SIAtomicAddrSpace AddrSpace,
442 bool IsCrossAddrSpaceOrdering,
443 Position Pos) const override;
444};
445
446/// Generates code sequences for the memory model of GFX10/11.
447class SIGfx10CacheControl final : public SICacheControl {
448public:
449 SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
450
451 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
452 SIAtomicScope Scope,
453 SIAtomicAddrSpace AddrSpace) const override;
454
455 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
456 SIAtomicScope Scope,
457 SIAtomicAddrSpace AddrSpace) const override {
458 return false;
459 }
460
461 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
462 SIAtomicScope Scope,
463 SIAtomicAddrSpace AddrSpace) const override {
464 return false;
465 }
466
467 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
468 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
469 bool IsVolatile, bool IsNonTemporal,
470 bool IsLastUse) const override;
471
472 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
473 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
474 bool IsCrossAddrSpaceOrdering, Position Pos,
475 AtomicOrdering Order, bool AtomicsOnly) const override;
476
477 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
478 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
479
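  // A release on GFX10/11 only needs the waits produced by insertWait below;
  // no separate cache write-back instruction is emitted.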
480 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
481 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
482 Position Pos) const override {
483 return insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE,
484 IsCrossAddrSpaceOrdering, Pos, Order: AtomicOrdering::Release,
485 /*AtomicsOnly=*/false);
486 }
487};
488
489class SIGfx12CacheControl final : public SICacheControl {
490protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
  // \returns True if \p MI is modified, false otherwise.
493 bool setTH(const MachineBasicBlock::iterator MI,
494 AMDGPU::CPol::CPol Value) const;
495
  // Sets Scope policy to \p Value if CPol operand is present in instruction \p
  // MI. \returns True if \p MI is modified, false otherwise.
498 bool setScope(const MachineBasicBlock::iterator MI,
499 AMDGPU::CPol::CPol Value) const;
500
501 // Stores with system scope (SCOPE_SYS) need to wait for:
502 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
503 // - non-returning-atomics - wait for STORECNT==0
504 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
505 // since it does not distinguish atomics-with-return from regular stores.
506 // There is no need to wait if memory is cached (mtype != UC).
507 bool
508 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
509
510 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
511 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
512
513public:
514 SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
515 // GFX120x and GFX125x memory models greatly overlap, and in some cases
516 // the behavior is the same if assuming GFX120x in CU mode.
517 assert(!ST.hasGFX1250Insts() || ST.hasGFX13Insts() || ST.isCuModeEnabled());
518 }
519
520 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
521 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
522 bool IsCrossAddrSpaceOrdering, Position Pos,
523 AtomicOrdering Order, bool AtomicsOnly) const override;
524
525 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
526 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
527
528 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
529 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
530 bool IsVolatile, bool IsNonTemporal,
531 bool IsLastUse) const override;
532
533 bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
534
535 bool handleCooperativeAtomic(MachineInstr &MI) const override;
536
537 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
538 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
539 Position Pos) const override;
540
541 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
542 SIAtomicScope Scope,
543 SIAtomicAddrSpace AddrSpace) const override {
544 return setAtomicScope(MI, Scope, AddrSpace);
545 }
546
547 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
548 SIAtomicScope Scope,
549 SIAtomicAddrSpace AddrSpace) const override {
550 return setAtomicScope(MI, Scope, AddrSpace);
551 }
552
553 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
554 SIAtomicScope Scope,
555 SIAtomicAddrSpace AddrSpace) const override {
556 return setAtomicScope(MI, Scope, AddrSpace);
557 }
558};
559
560class SIMemoryLegalizer final {
561private:
562 const MachineModuleInfo &MMI;
563 /// Cache Control.
564 std::unique_ptr<SICacheControl> CC = nullptr;
565
566 /// List of atomic pseudo instructions.
567 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
568
  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
571 bool isAtomicRet(const MachineInstr &MI) const {
572 return SIInstrInfo::isAtomicRet(MI);
573 }
574
  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if the current function is modified, false
  /// otherwise.
577 bool removeAtomicPseudoMIs();
578
579 /// Expands load operation \p MI. Returns true if instructions are
580 /// added/deleted or \p MI is modified, false otherwise.
581 bool expandLoad(const SIMemOpInfo &MOI,
582 MachineBasicBlock::iterator &MI);
583 /// Expands store operation \p MI. Returns true if instructions are
584 /// added/deleted or \p MI is modified, false otherwise.
585 bool expandStore(const SIMemOpInfo &MOI,
586 MachineBasicBlock::iterator &MI);
587 /// Expands atomic fence operation \p MI. Returns true if
588 /// instructions are added/deleted or \p MI is modified, false otherwise.
589 bool expandAtomicFence(const SIMemOpInfo &MOI,
590 MachineBasicBlock::iterator &MI);
591 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
592 /// instructions are added/deleted or \p MI is modified, false otherwise.
593 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
594 MachineBasicBlock::iterator &MI);
595 /// Expands LDS DMA operation \p MI. Returns true if instructions are
596 /// added/deleted or \p MI is modified, false otherwise.
597 bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
598
599public:
  SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {}
601 bool run(MachineFunction &MF);
602};
603
604class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
605public:
606 static char ID;
607
608 SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}
609
610 void getAnalysisUsage(AnalysisUsage &AU) const override {
611 AU.setPreservesCFG();
612 MachineFunctionPass::getAnalysisUsage(AU);
613 }
614
615 StringRef getPassName() const override {
616 return PASS_NAME;
617 }
618
619 bool runOnMachineFunction(MachineFunction &MF) override;
620};
621
622static const StringMap<SIAtomicAddrSpace> ASNames = {{
623 {"global", SIAtomicAddrSpace::GLOBAL},
624 {"local", SIAtomicAddrSpace::LDS},
625}};
626
627void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
628 const MachineFunction *MF = MI.getMF();
629 const Function &Fn = MF->getFunction();
630 SmallString<128> Str;
631 raw_svector_ostream OS(Str);
632 OS << "unknown address space '" << AS << "'; expected one of ";
633 ListSeparator LS;
634 for (const auto &[Name, Val] : ASNames)
635 OS << LS << '\'' << Name << '\'';
636 Fn.getContext().diagnose(
637 DI: DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
638}
639
/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
/// If this tag isn't present, or if it has no meaningful values, returns
/// std::nullopt; otherwise returns the address spaces specified by the
/// metadata.
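/// For example, a fence carrying the MMRA tag ("amdgpu-synchronize-as",
/// "local") yields SIAtomicAddrSpace::LDS, per the ASNames table above.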
643static std::optional<SIAtomicAddrSpace>
644getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
645 static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
646
647 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
648 if (!MMRA)
649 return std::nullopt;
650
651 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
652 for (const auto &[Prefix, Suffix] : MMRA) {
653 if (Prefix != FenceASPrefix)
654 continue;
655
656 if (auto It = ASNames.find(Key: Suffix); It != ASNames.end())
657 Result |= It->second;
658 else
659 diagnoseUnknownMMRAASName(MI, AS: Suffix);
660 }
661
662 if (Result == SIAtomicAddrSpace::NONE)
663 return std::nullopt;
664
665 return Result;
666}
667
668} // end anonymous namespace
669
670void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
671 const char *Msg) const {
672 const Function &Func = MI->getMF()->getFunction();
673 Func.getContext().diagnose(
674 DI: DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
675}
676
677std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
678SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
679 SIAtomicAddrSpace InstrAddrSpace) const {
680 if (SSID == SyncScope::System)
681 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
682 if (SSID == MMI->getAgentSSID())
683 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
684 if (SSID == MMI->getClusterSSID())
685 return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
686 if (SSID == MMI->getWorkgroupSSID())
687 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
688 true);
689 if (SSID == MMI->getWavefrontSSID())
690 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
691 true);
692 if (SSID == SyncScope::SingleThread)
693 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
694 true);
695 if (SSID == MMI->getSystemOneAddressSpaceSSID())
696 return std::tuple(SIAtomicScope::SYSTEM,
697 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
698 if (SSID == MMI->getAgentOneAddressSpaceSSID())
699 return std::tuple(SIAtomicScope::AGENT,
700 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
701 if (SSID == MMI->getClusterOneAddressSpaceSSID())
702 return std::tuple(SIAtomicScope::CLUSTER,
703 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
704 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
705 return std::tuple(SIAtomicScope::WORKGROUP,
706 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
707 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
708 return std::tuple(SIAtomicScope::WAVEFRONT,
709 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
710 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
711 return std::tuple(SIAtomicScope::SINGLETHREAD,
712 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
713 return std::nullopt;
714}
715
716SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
717 if (AS == AMDGPUAS::FLAT_ADDRESS)
718 return SIAtomicAddrSpace::FLAT;
719 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
720 return SIAtomicAddrSpace::GLOBAL;
721 if (AS == AMDGPUAS::LOCAL_ADDRESS)
722 return SIAtomicAddrSpace::LDS;
723 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
724 return SIAtomicAddrSpace::SCRATCH;
725 if (AS == AMDGPUAS::REGION_ADDRESS)
726 return SIAtomicAddrSpace::GDS;
727 if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
728 AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
729 return SIAtomicAddrSpace::GLOBAL;
730
731 return SIAtomicAddrSpace::OTHER;
732}
733
734SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
735 const GCNSubtarget &ST)
736 : MMI(&MMI_), ST(ST) {}
737
738std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
739 const MachineBasicBlock::iterator &MI) const {
740 assert(MI->getNumMemOperands() > 0);
741
742 SyncScope::ID SSID = SyncScope::SingleThread;
743 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
744 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
745 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
746 bool IsNonTemporal = true;
747 bool IsVolatile = false;
748 bool IsLastUse = false;
749 bool IsCooperative = false;
750
751 // Validator should check whether or not MMOs cover the entire set of
752 // locations accessed by the memory instruction.
753 for (const auto &MMO : MI->memoperands()) {
754 IsNonTemporal &= MMO->isNonTemporal();
755 IsVolatile |= MMO->isVolatile();
756 IsLastUse |= MMO->getFlags() & MOLastUse;
757 IsCooperative |= MMO->getFlags() & MOCooperative;
758 InstrAddrSpace |=
759 toSIAtomicAddrSpace(AS: MMO->getPointerInfo().getAddrSpace());
760 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
761 if (OpOrdering != AtomicOrdering::NotAtomic) {
762 const auto &IsSyncScopeInclusion =
763 MMI->isSyncScopeInclusion(A: SSID, B: MMO->getSyncScopeID());
764 if (!IsSyncScopeInclusion) {
765 reportUnsupported(MI,
766 Msg: "Unsupported non-inclusive atomic synchronization scope");
767 return std::nullopt;
768 }
769
770 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
771 Ordering = getMergedAtomicOrdering(AO: Ordering, Other: OpOrdering);
772 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
773 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
774 FailureOrdering =
775 getMergedAtomicOrdering(AO: FailureOrdering, Other: MMO->getFailureOrdering());
776 }
777 }
778
779 // FIXME: The MMO of buffer atomic instructions does not always have an atomic
780 // ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
781 // here, but the lowering should really be cleaned up at some point.
782 if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(MI: *MI) &&
783 SIInstrInfo::isAtomic(MI: *MI) && Ordering == AtomicOrdering::NotAtomic)
784 Ordering = AtomicOrdering::Monotonic;
785
786 SIAtomicScope Scope = SIAtomicScope::NONE;
787 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
788 bool IsCrossAddressSpaceOrdering = false;
789 if (Ordering != AtomicOrdering::NotAtomic) {
790 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
791 if (!ScopeOrNone) {
792 reportUnsupported(MI, Msg: "Unsupported atomic synchronization scope");
793 return std::nullopt;
794 }
795 std::tie(args&: Scope, args&: OrderingAddrSpace, args&: IsCrossAddressSpaceOrdering) =
796 *ScopeOrNone;
797 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
798 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
799 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
800 reportUnsupported(MI, Msg: "Unsupported atomic address space");
801 return std::nullopt;
802 }
803 }
804 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
805 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
806 IsNonTemporal, IsLastUse, IsCooperative);
807}
808
809std::optional<SIMemOpInfo>
810SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
811 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
812
813 if (!(MI->mayLoad() && !MI->mayStore()))
814 return std::nullopt;
815
816 // Be conservative if there are no memory operands.
817 if (MI->getNumMemOperands() == 0)
818 return SIMemOpInfo(ST);
819
820 return constructFromMIWithMMO(MI);
821}
822
823std::optional<SIMemOpInfo>
824SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
825 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
826
827 if (!(!MI->mayLoad() && MI->mayStore()))
828 return std::nullopt;
829
830 // Be conservative if there are no memory operands.
831 if (MI->getNumMemOperands() == 0)
832 return SIMemOpInfo(ST);
833
834 return constructFromMIWithMMO(MI);
835}
836
837std::optional<SIMemOpInfo>
838SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
839 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
840
841 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
842 return std::nullopt;
843
844 AtomicOrdering Ordering =
845 static_cast<AtomicOrdering>(MI->getOperand(i: 0).getImm());
846
847 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(i: 1).getImm());
848 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace: SIAtomicAddrSpace::ATOMIC);
849 if (!ScopeOrNone) {
850 reportUnsupported(MI, Msg: "Unsupported atomic synchronization scope");
851 return std::nullopt;
852 }
853
854 SIAtomicScope Scope = SIAtomicScope::NONE;
855 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
856 bool IsCrossAddressSpaceOrdering = false;
857 std::tie(args&: Scope, args&: OrderingAddrSpace, args&: IsCrossAddressSpaceOrdering) =
858 *ScopeOrNone;
859
860 if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
861 // We currently expect refineOrderingAS to be the only place that
862 // can refine the AS ordered by the fence.
863 // If that changes, we need to review the semantics of that function
864 // in case it needs to preserve certain address spaces.
865 reportUnsupported(MI, Msg: "Unsupported atomic address space");
866 return std::nullopt;
867 }
868
869 auto SynchronizeAS = getSynchronizeAddrSpaceMD(MI: *MI);
870 if (SynchronizeAS)
871 OrderingAddrSpace = *SynchronizeAS;
872
873 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
874 SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
875 AtomicOrdering::NotAtomic);
876}
877
878std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
879 const MachineBasicBlock::iterator &MI) const {
880 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
881
882 if (!(MI->mayLoad() && MI->mayStore()))
883 return std::nullopt;
884
885 // Be conservative if there are no memory operands.
886 if (MI->getNumMemOperands() == 0)
887 return SIMemOpInfo(ST);
888
889 return constructFromMIWithMMO(MI);
890}
891
892std::optional<SIMemOpInfo>
893SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
894 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
895
896 if (!SIInstrInfo::isLDSDMA(MI: *MI))
897 return std::nullopt;
898
899 return constructFromMIWithMMO(MI);
900}
901
902SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
903 TII = ST.getInstrInfo();
904 IV = getIsaVersion(GPU: ST.getCPU());
905 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
906}
907
908bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
909 unsigned Bits) const {
910 MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::cpol);
911 if (!CPol)
912 return false;
913
914 CPol->setImm(CPol->getImm() | Bits);
915 return true;
916}
917
918bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
919 assert((!ST.hasGloballyAddressableScratch() ||
920 (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE ||
921 (AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) &&
922 "scratch instructions should already be replaced by flat "
923 "instructions if GloballyAddressableScratch is enabled");
924 return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
925}
926
927/* static */
928std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
929 GCNSubtarget::Generation Generation = ST.getGeneration();
930 if (Generation < AMDGPUSubtarget::GFX10)
931 return std::make_unique<SIGfx6CacheControl>(args: ST);
932 if (Generation < AMDGPUSubtarget::GFX12)
933 return std::make_unique<SIGfx10CacheControl>(args: ST);
934 return std::make_unique<SIGfx12CacheControl>(args: ST);
935}
936
937bool SIGfx6CacheControl::enableLoadCacheBypass(
938 const MachineBasicBlock::iterator &MI,
939 SIAtomicScope Scope,
940 SIAtomicAddrSpace AddrSpace) const {
941 assert(MI->mayLoad() && !MI->mayStore());
942
943 if (!canAffectGlobalAddrSpace(AS: AddrSpace)) {
944 /// The scratch address space does not need the global memory caches
945 /// to be bypassed as all memory operations by the same thread are
946 /// sequentially consistent, and no other thread can access scratch
947 /// memory.
948
949 /// Other address spaces do not have a cache.
950 return false;
951 }
952
953 bool Changed = false;
954 switch (Scope) {
955 case SIAtomicScope::SYSTEM:
956 if (ST.hasGFX940Insts()) {
957 // Set SC bits to indicate system scope.
958 Changed |= enableCPolBits(MI, Bits: CPol::SC0 | CPol::SC1);
959 break;
960 }
961 [[fallthrough]];
962 case SIAtomicScope::AGENT:
963 if (ST.hasGFX940Insts()) {
964 // Set SC bits to indicate agent scope.
965 Changed |= enableCPolBits(MI, Bits: CPol::SC1);
966 } else {
967 // Set L1 cache policy to MISS_EVICT.
968 // Note: there is no L2 cache bypass policy at the ISA level.
969 Changed |= enableCPolBits(MI, Bits: CPol::GLC);
970 }
971 break;
972 case SIAtomicScope::WORKGROUP:
973 if (ST.hasGFX940Insts()) {
974 // In threadgroup split mode the waves of a work-group can be executing
975 // on different CUs. Therefore need to bypass the L1 which is per CU.
976 // Otherwise in non-threadgroup split mode all waves of a work-group are
977 // on the same CU, and so the L1 does not need to be bypassed. Setting
978 // SC bits to indicate work-group scope will do this automatically.
979 Changed |= enableCPolBits(MI, Bits: CPol::SC0);
980 } else if (ST.hasGFX90AInsts()) {
981 // In threadgroup split mode the waves of a work-group can be executing
982 // on different CUs. Therefore need to bypass the L1 which is per CU.
983 // Otherwise in non-threadgroup split mode all waves of a work-group are
984 // on the same CU, and so the L1 does not need to be bypassed.
985 if (ST.isTgSplitEnabled())
986 Changed |= enableCPolBits(MI, Bits: CPol::GLC);
987 }
988 break;
989 case SIAtomicScope::WAVEFRONT:
990 case SIAtomicScope::SINGLETHREAD:
991 // No cache to bypass.
992 break;
993 default:
994 llvm_unreachable("Unsupported synchronization scope");
995 }
996
997 return Changed;
998}
999
1000bool SIGfx6CacheControl::enableStoreCacheBypass(
1001 const MachineBasicBlock::iterator &MI,
1002 SIAtomicScope Scope,
1003 SIAtomicAddrSpace AddrSpace) const {
1004 assert(!MI->mayLoad() && MI->mayStore());
1005 bool Changed = false;
1006
1007 /// For targets other than GFX940, the L1 cache is write through so does not
1008 /// need to be bypassed. There is no bypass control for the L2 cache at the
1009 /// isa level.
1010
1011 if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AS: AddrSpace)) {
1012 switch (Scope) {
1013 case SIAtomicScope::SYSTEM:
1014 // Set SC bits to indicate system scope.
1015 Changed |= enableCPolBits(MI, Bits: CPol::SC0 | CPol::SC1);
1016 break;
1017 case SIAtomicScope::AGENT:
1018 // Set SC bits to indicate agent scope.
1019 Changed |= enableCPolBits(MI, Bits: CPol::SC1);
1020 break;
1021 case SIAtomicScope::WORKGROUP:
1022 // Set SC bits to indicate workgroup scope.
1023 Changed |= enableCPolBits(MI, Bits: CPol::SC0);
1024 break;
1025 case SIAtomicScope::WAVEFRONT:
1026 case SIAtomicScope::SINGLETHREAD:
1027 // Leave SC bits unset to indicate wavefront scope.
1028 break;
1029 default:
1030 llvm_unreachable("Unsupported synchronization scope");
1031 }
1032
1033 /// The scratch address space does not need the global memory caches
1034 /// to be bypassed as all memory operations by the same thread are
1035 /// sequentially consistent, and no other thread can access scratch
1036 /// memory.
1037
1038 /// Other address spaces do not have a cache.
1039 }
1040
1041 return Changed;
1042}
1043
1044bool SIGfx6CacheControl::enableRMWCacheBypass(
1045 const MachineBasicBlock::iterator &MI,
1046 SIAtomicScope Scope,
1047 SIAtomicAddrSpace AddrSpace) const {
1048 assert(MI->mayLoad() && MI->mayStore());
1049 bool Changed = false;
1050
1051 /// For targets other than GFX940, do not set GLC for RMW atomic operations as
1052 /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to
1053 /// indicate if they are return or no-return. Note: there is no L2 cache
1054 /// coherent bypass control at the ISA level.
1055 /// For GFX90A+, RMW atomics implicitly bypass the L1 cache.
1056
1057 if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AS: AddrSpace)) {
1058 switch (Scope) {
1059 case SIAtomicScope::SYSTEM:
1060 // Set SC1 bit to indicate system scope.
1061 Changed |= enableCPolBits(MI, Bits: CPol::SC1);
1062 break;
1063 case SIAtomicScope::AGENT:
1064 case SIAtomicScope::WORKGROUP:
1065 case SIAtomicScope::WAVEFRONT:
1066 case SIAtomicScope::SINGLETHREAD:
1067 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1068 // to indicate system or agent scope. The SC0 bit is used to indicate if
1069 // they are return or no-return. Leave SC1 bit unset to indicate agent
1070 // scope.
1071 break;
1072 default:
1073 llvm_unreachable("Unsupported synchronization scope");
1074 }
1075 }
1076
1077 return Changed;
1078}
1079
1080bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1081 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1082 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
1084 // latter use glc to indicate if the atomic returns a result and so must not
1085 // be used for cache control.
1086 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1087
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling that
  // here would pessimize all atomics. They also do not support the
  // nontemporal attribute.
1092 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1093
1094 bool Changed = false;
1095
1096 if (IsVolatile) {
1097 if (ST.hasGFX940Insts()) {
1098 // Set SC bits to indicate system scope.
1099 Changed |= enableCPolBits(MI, Bits: CPol::SC0 | CPol::SC1);
1100 } else if (Op == SIMemOp::LOAD) {
1101 // Set L1 cache policy to be MISS_EVICT for load instructions
1102 // and MISS_LRU for store instructions.
1103 // Note: there is no L2 cache bypass policy at the ISA level.
1104 Changed |= enableCPolBits(MI, Bits: CPol::GLC);
1105 }
1106
1107 // Ensure operation has completed at system scope to cause all volatile
1108 // operations to be visible outside the program in a global order. Do not
1109 // request cross address space as only the global address space can be
1110 // observable outside the program, so no need to cause a waitcnt for LDS
1111 // address space operations.
1112 Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
1113 Pos: Position::AFTER, Order: AtomicOrdering::Unordered,
1114 /*AtomicsOnly=*/false);
1115
1116 return Changed;
1117 }
1118
1119 if (IsNonTemporal) {
1120 if (ST.hasGFX940Insts()) {
1121 Changed |= enableCPolBits(MI, Bits: CPol::NT);
1122 } else {
1123 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1124 // for both loads and stores, and the L2 cache policy to STREAM.
1125 Changed |= enableCPolBits(MI, Bits: CPol::SLC | CPol::GLC);
1126 }
1127 return Changed;
1128 }
1129
1130 return Changed;
1131}
1132
1133bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1134 SIAtomicScope Scope,
1135 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1136 bool IsCrossAddrSpaceOrdering, Position Pos,
1137 AtomicOrdering Order,
1138 bool AtomicsOnly) const {
1139 bool Changed = false;
1140
1141 MachineBasicBlock &MBB = *MI->getParent();
1142 const DebugLoc &DL = MI->getDebugLoc();
1143
1144 if (Pos == Position::AFTER)
1145 ++MI;
1146
1147 // GFX90A+
1148 if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
1149 // In threadgroup split mode the waves of a work-group can be executing on
1150 // different CUs. Therefore need to wait for global or GDS memory operations
1151 // to complete to ensure they are visible to waves in the other CUs.
1152 // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
    // on a CU.
1156 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1157 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1158 (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as on targets below GFX90A at AGENT scope.
1160 Scope = SIAtomicScope::AGENT;
1161 }
1162 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1163 // LDS memory operations.
1164 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1165 }
1166
1167 bool VMCnt = false;
1168 bool LGKMCnt = false;
1169
1170 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1171 SIAtomicAddrSpace::NONE) {
1172 switch (Scope) {
1173 case SIAtomicScope::SYSTEM:
1174 case SIAtomicScope::AGENT:
1175 VMCnt |= true;
1176 break;
1177 case SIAtomicScope::WORKGROUP:
1178 case SIAtomicScope::WAVEFRONT:
1179 case SIAtomicScope::SINGLETHREAD:
1180 // The L1 cache keeps all memory operations in order for
1181 // wavefronts in the same work-group.
1182 break;
1183 default:
1184 llvm_unreachable("Unsupported synchronization scope");
1185 }
1186 }
1187
1188 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1189 switch (Scope) {
1190 case SIAtomicScope::SYSTEM:
1191 case SIAtomicScope::AGENT:
1192 case SIAtomicScope::WORKGROUP:
1193 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1194 // not needed as LDS operations for all waves are executed in a total
1195 // global ordering as observed by all waves. Required if also
1196 // synchronizing with global/GDS memory as LDS operations could be
1197 // reordered with respect to later global/GDS memory operations of the
1198 // same wave.
1199 LGKMCnt |= IsCrossAddrSpaceOrdering;
1200 break;
1201 case SIAtomicScope::WAVEFRONT:
1202 case SIAtomicScope::SINGLETHREAD:
1203 // The LDS keeps all memory operations in order for
1204 // the same wavefront.
1205 break;
1206 default:
1207 llvm_unreachable("Unsupported synchronization scope");
1208 }
1209 }
1210
1211 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1212 switch (Scope) {
1213 case SIAtomicScope::SYSTEM:
1214 case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1216 // is not needed as GDS operations for all waves are executed in a total
1217 // global ordering as observed by all waves. Required if also
1218 // synchronizing with global/LDS memory as GDS operations could be
1219 // reordered with respect to later global/LDS memory operations of the
1220 // same wave.
1221 LGKMCnt |= IsCrossAddrSpaceOrdering;
1222 break;
1223 case SIAtomicScope::WORKGROUP:
1224 case SIAtomicScope::WAVEFRONT:
1225 case SIAtomicScope::SINGLETHREAD:
1226 // The GDS keeps all memory operations in order for
1227 // the same work-group.
1228 break;
1229 default:
1230 llvm_unreachable("Unsupported synchronization scope");
1231 }
1232 }
1233
1234 if (VMCnt || LGKMCnt) {
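    // Encode a soft waitcnt that waits only on the required counters; for
    // example, an agent-scope wait over only the global address space
    // reduces to vmcnt(0).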
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
1242 Changed = true;
1243 }
1244
1245 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1246 // at workgroup-scoped release operations that specify the LDS address space.
1247 // SIInsertWaitcnts will later replace this with a vmcnt().
1248 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(AO: Order) &&
1249 Scope == SIAtomicScope::WORKGROUP &&
1250 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1252 Changed = true;
1253 }
1254
1255 if (Pos == Position::AFTER)
1256 --MI;
1257
1258 return Changed;
1259}
1260
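/// \returns True if the target should use BUFFER_WBINVL1_VOL rather than
/// BUFFER_WBINVL1 to invalidate the L1 cache.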
1261static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) {
1262 if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1263 return false;
1264 return !ST.isAmdPalOS() && !ST.isMesa3DOS();
1265}
1266
1267bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1268 SIAtomicScope Scope,
1269 SIAtomicAddrSpace AddrSpace,
1270 Position Pos) const {
1271 if (!InsertCacheInv)
1272 return false;
1273
1274 bool Changed = false;
1275
1276 MachineBasicBlock &MBB = *MI->getParent();
1277 const DebugLoc &DL = MI->getDebugLoc();
1278
1279 if (Pos == Position::AFTER)
1280 ++MI;
1281
1282 const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST)
1283 ? AMDGPU::BUFFER_WBINVL1_VOL
1284 : AMDGPU::BUFFER_WBINVL1;
1285
1286 if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
1287 switch (Scope) {
1288 case SIAtomicScope::SYSTEM:
1289 if (ST.hasGFX940Insts()) {
1290 // Ensures that following loads will not see stale remote VMEM data or
1291 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1292 // and CC will never be stale due to the local memory probes.
1293 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV))
1294 // Set SC bits to indicate system scope.
1295 .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1296 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1297 // hardware does not reorder memory operations by the same wave with
1298 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1299 // remove any cache lines of earlier writes by the same wave and ensures
1300 // later reads by the same wave will refetch the cache lines.
1301 Changed = true;
1302 break;
1303 }
1304
1305 if (ST.hasGFX90AInsts()) {
1306 // Ensures that following loads will not see stale remote VMEM data or
1307 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1308 // and CC will never be stale due to the local memory probes.
1309 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INVL2));
1310 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: InvalidateL1));
1311 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1312 // hardware does not reorder memory operations by the same wave with
1313 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
1314 // to remove any cache lines of earlier writes by the same wave and
1315 // ensures later reads by the same wave will refetch the cache lines.
1316 Changed = true;
1317 break;
1318 }
1319 [[fallthrough]];
1320 case SIAtomicScope::AGENT:
1321 if (ST.hasGFX940Insts()) {
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be
      // stale due to the memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);
      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
      // does not reorder memory operations with respect to a preceding buffer
      // invalidate. The invalidate is guaranteed to remove any cache lines of
      // earlier writes and ensures later reads will refetch the cache lines.
    } else
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1334 Changed = true;
1335 break;
1336 case SIAtomicScope::WORKGROUP:
1337 if (ST.isTgSplitEnabled()) {
1338 if (ST.hasGFX940Insts()) {
1339 // In threadgroup split mode the waves of a work-group can be
1340 // executing on different CUs. Therefore need to invalidate the L1
1341 // which is per CU. Otherwise in non-threadgroup split mode all waves
1342 // of a work-group are on the same CU, and so the L1 does not need to
1343 // be invalidated.
1344
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, but there is no point
        // generating it in that case if we know we are not in that mode.
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
            // Set SC bits to indicate work-group scope.
            .addImm(AMDGPU::CPol::SC0);
        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
        // does not reorder memory operations with respect to a preceding
        // buffer invalidate. The invalidate is guaranteed to remove any cache
        // lines of earlier writes and ensures later reads will refetch the
        // cache lines.
1356 Changed = true;
1357 } else if (ST.hasGFX90AInsts()) {
1358 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: InvalidateL1));
1359 Changed = true;
1360 }
1361 }
1362 break;
1363 case SIAtomicScope::WAVEFRONT:
1364 case SIAtomicScope::SINGLETHREAD:
1365 // For GFX940, we could generate "BUFFER_INV" but it would do nothing as
1366 // there are no caches to invalidate. All other targets have no cache to
1367 // invalidate.
1368 break;
1369 default:
1370 llvm_unreachable("Unsupported synchronization scope");
1371 }
1372 }
1373
1374 /// The scratch address space does not need the global memory cache
1375 /// to be flushed as all memory operations by the same thread are
1376 /// sequentially consistent, and no other thread can access scratch
1377 /// memory.
1378
1379 /// Other address spaces do not have a cache.
1380
1381 if (Pos == Position::AFTER)
1382 --MI;
1383
1384 return Changed;
1385}
1386
1387bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1388 SIAtomicScope Scope,
1389 SIAtomicAddrSpace AddrSpace,
1390 bool IsCrossAddrSpaceOrdering,
1391 Position Pos) const {
1392 bool Changed = false;
1393
1394 if (ST.hasGFX90AInsts()) {
1395 MachineBasicBlock &MBB = *MI->getParent();
1396 const DebugLoc &DL = MI->getDebugLoc();
1397
1398 if (Pos == Position::AFTER)
1399 ++MI;
1400
1401 if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
1402 switch (Scope) {
1403 case SIAtomicScope::SYSTEM:
1404 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1405 // hardware does not reorder memory operations by the same wave with
1406 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1407 // to initiate writeback of any dirty cache lines of earlier writes by
1408 // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1409 // writeback has completed.
1410 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2))
1411 // Set SC bits to indicate system scope.
1412 .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1413 Changed = true;
1414 break;
1415 case SIAtomicScope::AGENT:
1416 if (ST.hasGFX940Insts()) {
1417 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2))
1418 // Set SC bits to indicate agent scope.
1419 .addImm(Val: AMDGPU::CPol::SC1);
1420
1421 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1422 // SIAtomicScope::AGENT, the following insertWait will generate the
1423 // required "S_WAITCNT vmcnt(0)".
1424 Changed = true;
1425 }
1426 break;
1427 case SIAtomicScope::WORKGROUP:
1428 case SIAtomicScope::WAVEFRONT:
1429 case SIAtomicScope::SINGLETHREAD:
1430 // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
1431 // would writeback, and would require an otherwise unnecessary
1432 // "S_WAITCNT vmcnt(0)".
1433 break;
1434 default:
1435 llvm_unreachable("Unsupported synchronization scope");
1436 }
1437 }
1438
1439 if (Pos == Position::AFTER)
1440 --MI;
1441 }
1442
  // Insert the S_WAITCNT needed to complete any "BUFFER_WBL2" emitted above,
  // as well as any other required S_WAITCNT.
1445 Changed |= insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE,
1446 IsCrossAddrSpaceOrdering, Pos, Order: AtomicOrdering::Release,
1447 /*AtomicsOnly=*/false);
1448
1449 return Changed;
1450}
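
// Editorial sketch (not used by the pass): the BUFFER_WBL2 SC-bit selection
// made by the switch above, written as a standalone helper. The GFX940-only
// check on the agent path is omitted here; scopes at workgroup and below map
// to "no writeback required".
[[maybe_unused]] static std::optional<unsigned>
wbl2ScBitsForScope(SIAtomicScope Scope) {
  switch (Scope) {
  case SIAtomicScope::SYSTEM:
    return AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1; // system scope
  case SIAtomicScope::AGENT:
    return AMDGPU::CPol::SC1; // agent scope (GFX940 only in the code above)
  default:
    return std::nullopt; // no BUFFER_WBL2 needed
  }
}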
1451
1452bool SIGfx10CacheControl::enableLoadCacheBypass(
1453 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1454 SIAtomicAddrSpace AddrSpace) const {
1455 assert(MI->mayLoad() && !MI->mayStore());
1456 bool Changed = false;
1457
1458 if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
1459 switch (Scope) {
1460 case SIAtomicScope::SYSTEM:
1461 case SIAtomicScope::AGENT:
1462 // Set the L0 and L1 cache policies to MISS_EVICT.
1463 // Note: there is no L2 cache coherent bypass control at the ISA level.
1464 // For GFX10, set GLC and DLC; for GFX11, set only GLC.
1465 Changed |=
1466 enableCPolBits(MI, Bits: CPol::GLC | (AMDGPU::isGFX10(STI: ST) ? CPol::DLC : 0));
1467 break;
1468 case SIAtomicScope::WORKGROUP:
1469 // In WGP mode the waves of a work-group can be executing on either CU of
1470 // the WGP, so the per-CU L0 must be bypassed. In CU mode all waves of a
1471 // work-group are on the same CU, and so the L0 does not need to be
1472 // bypassed.
1473 if (!ST.isCuModeEnabled())
1474 Changed |= enableCPolBits(MI, Bits: CPol::GLC);
1475 break;
1476 case SIAtomicScope::WAVEFRONT:
1477 case SIAtomicScope::SINGLETHREAD:
1478 // No cache to bypass.
1479 break;
1480 default:
1481 llvm_unreachable("Unsupported synchronization scope");
1482 }
1483 }
1484
1485 /// The scratch address space does not need the global memory caches
1486 /// to be bypassed as all memory operations by the same thread are
1487 /// sequentially consistent, and no other thread can access scratch
1488 /// memory.
1489
1490 /// Other address spaces do not have a cache.
1491
1492 return Changed;
1493}
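
// Editorial sketch (illustrative only): the CPol bits the routine above ORs
// into a global/flat atomic load for each scope. "IsGFX10" and "CuMode" stand
// in for AMDGPU::isGFX10(ST) and ST.isCuModeEnabled().
[[maybe_unused]] static unsigned
loadBypassBitsGfx10(SIAtomicScope Scope, bool IsGFX10, bool CuMode) {
  switch (Scope) {
  case SIAtomicScope::SYSTEM:
  case SIAtomicScope::AGENT:
    return CPol::GLC | (IsGFX10 ? CPol::DLC : 0); // L0/L1 MISS_EVICT
  case SIAtomicScope::WORKGROUP:
    return CuMode ? 0 : CPol::GLC; // bypass the per-CU L0 only in WGP mode
  default:
    return 0; // wavefront/single-thread: no cache to bypass
  }
}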
1494
1495bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1496 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1497 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1498
1499 // Only handle load and store, not atomic read-modify-write instructions. The
1500 // latter use the glc bit to indicate whether the atomic returns a result, so
1501 // glc must not be used for cache control.
1502 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1503
1504 // Only update load and store, not LLVM IR atomic read-modify-write
1505 // instructions. The latter are always marked as volatile, and handling that
1506 // here would pessimize all atomics; they also do not support the nontemporal
1507 // attribute.
1508 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1509
1510 bool Changed = false;
1511
1512 if (IsVolatile) {
1513 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1514 // and MISS_LRU for store instructions.
1515 // Note: there is no L2 cache coherent bypass control at the ISA level.
1516 if (Op == SIMemOp::LOAD) {
1517 Changed |= enableCPolBits(MI, Bits: CPol::GLC | CPol::DLC);
1518 }
1519
1520 // GFX11: Set MALL NOALLOC for both load and store instructions.
1521 if (AMDGPU::isGFX11(STI: ST))
1522 Changed |= enableCPolBits(MI, Bits: CPol::DLC);
1523
1524 // Ensure the operation has completed at system scope to cause all volatile
1525 // operations to be visible outside the program in a global order. Do not
1526 // request cross address space as only the global address space can be
1527 // observable outside the program, so no need to cause a waitcnt for LDS
1528 // address space operations.
1529 Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
1530 Pos: Position::AFTER, Order: AtomicOrdering::Unordered,
1531 /*AtomicsOnly=*/false);
1532 return Changed;
1533 }
1534
1535 if (IsNonTemporal) {
1536 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1537 // and L2 cache policy to STREAM.
1538 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1539 // to MISS_EVICT and the L2 cache policy to STREAM.
1540 if (Op == SIMemOp::STORE)
1541 Changed |= enableCPolBits(MI, Bits: CPol::GLC);
1542 Changed |= enableCPolBits(MI, Bits: CPol::SLC);
1543
1544 // GFX11: Set MALL NOALLOC for both load and store instructions.
1545 if (AMDGPU::isGFX11(STI: ST))
1546 Changed |= enableCPolBits(MI, Bits: CPol::DLC);
1547
1548 return Changed;
1549 }
1550
1551 return Changed;
1552}
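
// Summary of the nontemporal encoding chosen above (editorial note):
//   loads:  SLC        -> L0/L1 HIT_EVICT, L2 STREAM
//   stores: GLC | SLC  -> L0/L1 MISS_EVICT, L2 STREAM
//   GFX11:  DLC additionally selects MALL NOALLOC for both loads and stores.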
1553
1554bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1555 SIAtomicScope Scope,
1556 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1557 bool IsCrossAddrSpaceOrdering,
1558 Position Pos, AtomicOrdering Order,
1559 bool AtomicsOnly) const {
1560 bool Changed = false;
1561
1562 MachineBasicBlock &MBB = *MI->getParent();
1563 const DebugLoc &DL = MI->getDebugLoc();
1564
1565 if (Pos == Position::AFTER)
1566 ++MI;
1567
1568 bool VMCnt = false;
1569 bool VSCnt = false;
1570 bool LGKMCnt = false;
1571
1572 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1573 SIAtomicAddrSpace::NONE) {
1574 switch (Scope) {
1575 case SIAtomicScope::SYSTEM:
1576 case SIAtomicScope::AGENT:
1577 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1578 VMCnt |= true;
1579 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1580 VSCnt |= true;
1581 break;
1582 case SIAtomicScope::WORKGROUP:
1583 // In WGP mode the waves of a work-group can be executing on either CU of
1584 // the WGP, so we need to wait for operations to complete to ensure
1585 // they are visible to waves in the other CU as the L0 is per CU.
1586 // Otherwise, in CU mode, all waves of a work-group are on the same CU and
1587 // share the same L0. Note that we still need to wait when
1588 // performing a release in this mode to respect the transitivity of
1589 // happens-before, e.g. other waves of the workgroup must be able to
1590 // release the memory from another wave at a wider scope.
1591 if (!ST.isCuModeEnabled() || isReleaseOrStronger(AO: Order)) {
1592 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1593 VMCnt |= true;
1594 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1595 VSCnt |= true;
1596 }
1597 break;
1598 case SIAtomicScope::WAVEFRONT:
1599 case SIAtomicScope::SINGLETHREAD:
1600 // The L0 cache keeps all memory operations in order for
1601 // work-items in the same wavefront.
1602 break;
1603 default:
1604 llvm_unreachable("Unsupported synchronization scope");
1605 }
1606 }
1607
1608 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1609 switch (Scope) {
1610 case SIAtomicScope::SYSTEM:
1611 case SIAtomicScope::AGENT:
1612 case SIAtomicScope::WORKGROUP:
1613 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1614 // not needed as LDS operations for all waves are executed in a total
1615 // global ordering as observed by all waves. Required if also
1616 // synchronizing with global/GDS memory as LDS operations could be
1617 // reordered with respect to later global/GDS memory operations of the
1618 // same wave.
1619 LGKMCnt |= IsCrossAddrSpaceOrdering;
1620 break;
1621 case SIAtomicScope::WAVEFRONT:
1622 case SIAtomicScope::SINGLETHREAD:
1623 // The LDS keeps all memory operations in order for
1624 // the same wavefront.
1625 break;
1626 default:
1627 llvm_unreachable("Unsupported synchronization scope");
1628 }
1629 }
1630
1631 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1632 switch (Scope) {
1633 case SIAtomicScope::SYSTEM:
1634 case SIAtomicScope::AGENT:
1635 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1636 // is not needed as GDS operations for all waves are executed in a total
1637 // global ordering as observed by all waves. Required if also
1638 // synchronizing with global/LDS memory as GDS operations could be
1639 // reordered with respect to later global/LDS memory operations of the
1640 // same wave.
1641 LGKMCnt |= IsCrossAddrSpaceOrdering;
1642 break;
1643 case SIAtomicScope::WORKGROUP:
1644 case SIAtomicScope::WAVEFRONT:
1645 case SIAtomicScope::SINGLETHREAD:
1646 // The GDS keeps all memory operations in order for
1647 // the same work-group.
1648 break;
1649 default:
1650 llvm_unreachable("Unsupported synchronization scope");
1651 }
1652 }
1653
1654 if (VMCnt || LGKMCnt) {
1655 unsigned WaitCntImmediate =
1656 AMDGPU::encodeWaitcnt(Version: IV,
1657 Vmcnt: VMCnt ? 0 : getVmcntBitMask(Version: IV),
1658 Expcnt: getExpcntBitMask(Version: IV),
1659 Lgkmcnt: LGKMCnt ? 0 : getLgkmcntBitMask(Version: IV));
1660 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_soft))
1661 .addImm(Val: WaitCntImmediate);
1662 Changed = true;
1663 }
1664
1665 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1666 // at workgroup-scoped release operations that specify the LDS address space.
1667 // SIInsertWaitcnts will later replace this with a vmcnt().
1668 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(AO: Order) &&
1669 Scope == SIAtomicScope::WORKGROUP &&
1670 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1671 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_lds_direct));
1672 Changed = true;
1673 }
1674
1675 if (VSCnt) {
1676 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT_soft))
1677 .addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
1678 .addImm(Val: 0);
1679 Changed = true;
1680 }
1681
1682 if (Pos == Position::AFTER)
1683 --MI;
1684
1685 return Changed;
1686}
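
// Editorial note on the S_WAITCNT immediate built above: a counter that must
// be drained is encoded as 0, while a counter we do not care about is encoded
// as its all-ones bitmask, which makes the wait a no-op for that counter.
// For example, waiting only on vmcnt is:
//   AMDGPU::encodeWaitcnt(IV, /*Vmcnt=*/0, getExpcntBitMask(IV),
//                         getLgkmcntBitMask(IV));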
1687
1688bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1689 SIAtomicScope Scope,
1690 SIAtomicAddrSpace AddrSpace,
1691 Position Pos) const {
1692 if (!InsertCacheInv)
1693 return false;
1694
1695 bool Changed = false;
1696
1697 MachineBasicBlock &MBB = *MI->getParent();
1698 const DebugLoc &DL = MI->getDebugLoc();
1699
1700 if (Pos == Position::AFTER)
1701 ++MI;
1702
1703 if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
1704 switch (Scope) {
1705 case SIAtomicScope::SYSTEM:
1706 case SIAtomicScope::AGENT:
1707 // The order of invalidates matters here. We must invalidate "outer in"
1708 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
1709 // invalidated.
1710 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL1_INV));
1711 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL0_INV));
1712 Changed = true;
1713 break;
1714 case SIAtomicScope::WORKGROUP:
1715 // In WGP mode the waves of a work-group can be executing on either CU of
1716 // the WGP, so the per-CU L0 must be invalidated. Otherwise, in CU mode, all
1717 // waves of a work-group are on the same CU, and so the L0 does not need to
1718 // be invalidated.
1719 if (!ST.isCuModeEnabled()) {
1720 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL0_INV));
1721 Changed = true;
1722 }
1723 break;
1724 case SIAtomicScope::WAVEFRONT:
1725 case SIAtomicScope::SINGLETHREAD:
1726 // No cache to invalidate.
1727 break;
1728 default:
1729 llvm_unreachable("Unsupported synchronization scope");
1730 }
1731 }
1732
1733 /// The scratch address space does not need the global memory cache
1734 /// to be flushed as all memory operations by the same thread are
1735 /// sequentially consistent, and no other thread can access scratch
1736 /// memory.
1737
1738 /// Other address spaces do not have a cache.
1739
1740 if (Pos == Position::AFTER)
1741 --MI;
1742
1743 return Changed;
1744}
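
// For reference (editorial illustration, not emitted text): an agent-scope
// acquire load on GFX10 in WGP mode ends up roughly as
//   global_load_dword v0, v[0:1], off glc dlc
//   s_waitcnt vmcnt(0)
//   buffer_gl1_inv
//   buffer_gl0_inv
// i.e. cache bypass on the load, a wait on the load, then the outer-in
// invalidates inserted above.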
1745
1746bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
1747 AMDGPU::CPol::CPol Value) const {
1748 MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol);
1749 if (!CPol)
1750 return false;
1751
1752 uint64_t NewTH = Value & AMDGPU::CPol::TH;
1753 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
1754 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
1755 return true;
1756 }
1757
1758 return false;
1759}
1760
1761bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
1762 AMDGPU::CPol::CPol Value) const {
1763 MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol);
1764 if (!CPol)
1765 return false;
1766
1767 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
1768 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
1769 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
1770 return true;
1771 }
1772
1773 return false;
1774}
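
// Editorial sketch of the field-update pattern shared by setTH and setScope
// above: clear the field in the cpol immediate, OR in the new value, and
// report whether anything changed.
[[maybe_unused]] static bool updateCPolField(uint64_t &CPolImm,
                                             uint64_t FieldMask,
                                             uint64_t NewValue) {
  uint64_t New = (CPolImm & ~FieldMask) | (NewValue & FieldMask);
  if (New == CPolImm)
    return false;
  CPolImm = New;
  return true;
}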
1775
1776bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
1777 const MachineBasicBlock::iterator MI) const {
1778 // TODO: implement flag for frontend to give us a hint not to insert waits.
1779
1780 MachineBasicBlock &MBB = *MI->getParent();
1781 const DebugLoc &DL = MI->getDebugLoc();
1782
1783 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_LOADCNT_soft)).addImm(Val: 0);
1784 if (ST.hasImageInsts()) {
1785 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_SAMPLECNT_soft)).addImm(Val: 0);
1786 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_BVHCNT_soft)).addImm(Val: 0);
1787 }
1788 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_KMCNT_soft)).addImm(Val: 0);
1789 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_STORECNT_soft)).addImm(Val: 0);
1790
1791 return true;
1792}
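
// Once SIInsertWaitcnts resolves the soft pseudos, the helper above amounts
// to at most the following before a GFX12.0 system-scope store (editorial
// illustration; the sample/BVH waits appear only on targets with image
// instructions):
//   s_wait_loadcnt 0x0
//   s_wait_samplecnt 0x0
//   s_wait_bvhcnt 0x0
//   s_wait_kmcnt 0x0
//   s_wait_storecnt 0x0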
1793
1794bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1795 SIAtomicScope Scope,
1796 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1797 bool IsCrossAddrSpaceOrdering,
1798 Position Pos, AtomicOrdering Order,
1799 bool AtomicsOnly) const {
1800 bool Changed = false;
1801
1802 MachineBasicBlock &MBB = *MI->getParent();
1803 const DebugLoc &DL = MI->getDebugLoc();
1804
1805 bool LOADCnt = false;
1806 bool DSCnt = false;
1807 bool STORECnt = false;
1808
1809 if (Pos == Position::AFTER)
1810 ++MI;
1811
1812 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1813 SIAtomicAddrSpace::NONE) {
1814 switch (Scope) {
1815 case SIAtomicScope::SYSTEM:
1816 case SIAtomicScope::AGENT:
1817 case SIAtomicScope::CLUSTER:
1818 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1819 LOADCnt |= true;
1820 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1821 STORECnt |= true;
1822 break;
1823 case SIAtomicScope::WORKGROUP:
1824 // GFX12.0:
1825 // In WGP mode the waves of a work-group can be executing on either CU
1826 // of the WGP. Therefore need to wait for operations to complete to
1827 // ensure they are visible to waves in the other CU as the L0 is per CU.
1828 //
1829 // Otherwise, in CU mode, all waves of a work-group are on the same CU and
1830 // share the same L0. Note that we still need to wait when
1831 // performing a release in this mode to respect the transitivity of
1832 // happens-before, e.g. other waves of the workgroup must be able to
1833 // release the memory from another wave at a wider scope.
1834 //
1835 // GFX12.5:
1836 // CU$ has two ports. To ensure operations are visible at the workgroup
1837 // level, we need to ensure all operations in this port have completed
1838 // so the other SIMDs in the WG can see them. There is no ordering
1839 // guarantee between the ports.
1840 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
1841 isReleaseOrStronger(AO: Order)) {
1842 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1843 LOADCnt |= true;
1844 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1845 STORECnt |= true;
1846 }
1847 break;
1848 case SIAtomicScope::WAVEFRONT:
1849 case SIAtomicScope::SINGLETHREAD:
1850 // The L0 cache keeps all memory operations in order for
1851 // work-items in the same wavefront.
1852 break;
1853 default:
1854 llvm_unreachable("Unsupported synchronization scope");
1855 }
1856 }
1857
1858 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1859 switch (Scope) {
1860 case SIAtomicScope::SYSTEM:
1861 case SIAtomicScope::AGENT:
1862 case SIAtomicScope::CLUSTER:
1863 case SIAtomicScope::WORKGROUP:
1864 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1865 // not needed as LDS operations for all waves are executed in a total
1866 // global ordering as observed by all waves. Required if also
1867 // synchronizing with global/GDS memory as LDS operations could be
1868 // reordered with respect to later global/GDS memory operations of the
1869 // same wave.
1870 DSCnt |= IsCrossAddrSpaceOrdering;
1871 break;
1872 case SIAtomicScope::WAVEFRONT:
1873 case SIAtomicScope::SINGLETHREAD:
1874 // The LDS keeps all memory operations in order for
1875 // the same wavefront.
1876 break;
1877 default:
1878 llvm_unreachable("Unsupported synchronization scope");
1879 }
1880 }
1881
1882 if (LOADCnt) {
1883 // Acquire sequences only need to wait on the previous atomic operation.
1884 // e.g. a typical sequence looks like
1885 // atomic load
1886 // (wait)
1887 // global_inv
1888 //
1889 // We do not have BVH or SAMPLE atomics, so the atomic load is always going
1890 // to be tracked using loadcnt.
1891 //
1892 // This also applies to fences. Fences cannot pair with an instruction
1893 // tracked with bvh/samplecnt as we don't have any atomics that do that.
1894 if (!AtomicsOnly && ST.hasImageInsts()) {
1895 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_BVHCNT_soft)).addImm(Val: 0);
1896 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(Val: 0);
1897 }
1898 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_soft)).addImm(Val: 0);
1899 Changed = true;
1900 }
1901
1902 if (STORECnt) {
1903 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_STORECNT_soft)).addImm(Val: 0);
1904 Changed = true;
1905 }
1906
1907 if (DSCnt) {
1908 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_DSCNT_soft)).addImm(Val: 0);
1909 Changed = true;
1910 }
1911
1912 if (Pos == Position::AFTER)
1913 --MI;
1914
1915 return Changed;
1916}
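
// Editorial sketch of the workgroup-scope decision above: a vmem wait is
// required unless the target is in CU mode, is not GFX12.5, and the access is
// weaker than a release. "CuMode" and "IsGfx1250" stand in for
// ST.isCuModeEnabled() and ST.hasGFX1250Insts().
[[maybe_unused]] static bool needsWorkgroupVMemWait(bool CuMode, bool IsGfx1250,
                                                    AtomicOrdering Order) {
  return !CuMode || IsGfx1250 || isReleaseOrStronger(Order);
}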
1917
1918bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1919 SIAtomicScope Scope,
1920 SIAtomicAddrSpace AddrSpace,
1921 Position Pos) const {
1922 if (!InsertCacheInv)
1923 return false;
1924
1925 MachineBasicBlock &MBB = *MI->getParent();
1926 const DebugLoc &DL = MI->getDebugLoc();
1927
1928 /// The scratch address space does not need the global memory cache
1929 /// to be flushed as all memory operations by the same thread are
1930 /// sequentially consistent, and no other thread can access scratch
1931 /// memory.
1932
1933 /// Other address spaces do not have a cache.
1934 if (!canAffectGlobalAddrSpace(AS: AddrSpace))
1935 return false;
1936
1937 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
1938 switch (Scope) {
1939 case SIAtomicScope::SYSTEM:
1940 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
1941 break;
1942 case SIAtomicScope::AGENT:
1943 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
1944 break;
1945 case SIAtomicScope::CLUSTER:
1946 ScopeImm = AMDGPU::CPol::SCOPE_SE;
1947 break;
1948 case SIAtomicScope::WORKGROUP:
1949 // GFX12.0:
1950 // In WGP mode the waves of a work-group can be executing on either CU of
1951 // the WGP. Therefore we need to invalidate the L0 which is per CU.
1952 // Otherwise in CU mode all waves of a work-group are on the same CU, and
1953 // so the L0 does not need to be invalidated.
1954 //
1955 // GFX12.5 has a shared WGP$, so no invalidates are required.
1956 if (ST.isCuModeEnabled())
1957 return false;
1958
1959 ScopeImm = AMDGPU::CPol::SCOPE_SE;
1960 break;
1961 case SIAtomicScope::WAVEFRONT:
1962 case SIAtomicScope::SINGLETHREAD:
1963 // No cache to invalidate.
1964 return false;
1965 default:
1966 llvm_unreachable("Unsupported synchronization scope");
1967 }
1968
1969 if (Pos == Position::AFTER)
1970 ++MI;
1971
1972 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GLOBAL_INV)).addImm(Val: ScopeImm);
1973
1974 if (Pos == Position::AFTER)
1975 --MI;
1976
1977 // The target requires a waitcnt to ensure that the preceding INV has
1978 // completed, as it may get reordered with following load instructions.
1979 if (ST.hasINVWBL2WaitCntRequirement() && Scope > SIAtomicScope::CLUSTER) {
1980 insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD,
1981 /*IsCrossAddrSpaceOrdering=*/false, Pos, Order: AtomicOrdering::Acquire,
1982 /*AtomicsOnly=*/false);
1983
1984 if (Pos == Position::AFTER)
1985 --MI;
1986 }
1987
1988 return true;
1989}
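
// Editorial sketch (not used by the pass) of the GLOBAL_INV scope selection
// above; CU-mode workgroups and narrower scopes need no invalidate.
[[maybe_unused]] static std::optional<AMDGPU::CPol::CPol>
globalInvScopeFor(SIAtomicScope Scope, bool CuMode) {
  switch (Scope) {
  case SIAtomicScope::SYSTEM:
    return AMDGPU::CPol::SCOPE_SYS;
  case SIAtomicScope::AGENT:
    return AMDGPU::CPol::SCOPE_DEV;
  case SIAtomicScope::CLUSTER:
    return AMDGPU::CPol::SCOPE_SE;
  case SIAtomicScope::WORKGROUP:
    if (CuMode)
      return std::nullopt; // shared L0 (or WGP$ on GFX12.5): nothing to do
    return AMDGPU::CPol::SCOPE_SE;
  default:
    return std::nullopt; // wavefront/single-thread: no cache to invalidate
  }
}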
1990
1991bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1992 SIAtomicScope Scope,
1993 SIAtomicAddrSpace AddrSpace,
1994 bool IsCrossAddrSpaceOrdering,
1995 Position Pos) const {
1996 bool Changed = false;
1997
1998 MachineBasicBlock &MBB = *MI->getParent();
1999 const DebugLoc &DL = MI->getDebugLoc();
2000
2001 // The scratch address space does not need the global memory cache
2002 // writeback as all memory operations by the same thread are
2003 // sequentially consistent, and no other thread can access scratch
2004 // memory.
2005 if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
2006 if (Pos == Position::AFTER)
2007 ++MI;
2008
2009 // A global_wb is only necessary at system scope for GFX12.0; it is also
2010 // necessary at device scope for GFX12.5, as stores cannot report
2011 // completion earlier than the L2.
2012 //
2013 // Emitting it for lower scopes is a slow no-op, so we omit it
2014 // for performance.
2015 std::optional<AMDGPU::CPol::CPol> NeedsWB;
2016 switch (Scope) {
2017 case SIAtomicScope::SYSTEM:
2018 NeedsWB = AMDGPU::CPol::SCOPE_SYS;
2019 break;
2020 case SIAtomicScope::AGENT:
2021 // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
2022 if (ST.hasGFX1250Insts())
2023 NeedsWB = AMDGPU::CPol::SCOPE_DEV;
2024 break;
2025 case SIAtomicScope::CLUSTER:
2026 case SIAtomicScope::WORKGROUP:
2027 // No WB necessary, but we still have to wait.
2028 case SIAtomicScope::WAVEFRONT:
2029 case SIAtomicScope::SINGLETHREAD:
2030 // No WB or wait necessary here, but insertWait takes care of that.
2031 break;
2032 default:
2033 llvm_unreachable("Unsupported synchronization scope");
2034 }
2035
2036 if (NeedsWB) {
2037 // The target requires a waitcnt to ensure that the preceding store/rmw
2038 // operations have completed in the L2 so that their data will be written
2039 // back by the WB instruction.
2040 if (ST.hasINVWBL2WaitCntRequirement())
2041 insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE,
2042 /*IsCrossAddrSpaceOrdering=*/false, Pos,
2043 Order: AtomicOrdering::Release,
2044 /*AtomicsOnly=*/false);
2045
2046 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GLOBAL_WB)).addImm(Val: *NeedsWB);
2047 Changed = true;
2048 }
2049
2050 if (Pos == Position::AFTER)
2051 --MI;
2052 }
2053
2054 // We always have to wait for previous memory operations (load/store) to
2055 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2056 // we of course need to wait for that as well.
2057 Changed |= insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE,
2058 IsCrossAddrSpaceOrdering, Pos, Order: AtomicOrdering::Release,
2059 /*AtomicsOnly=*/false);
2060
2061 return Changed;
2062}
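
// For reference (editorial illustration): a system-scope release fence on
// GFX12.0 therefore expands to roughly
//   global_wb scope:SCOPE_SYS
//   s_wait_bvhcnt 0x0          ; only on targets with image instructions
//   s_wait_samplecnt 0x0       ; only on targets with image instructions
//   s_wait_loadcnt 0x0
//   s_wait_storecnt 0x0        ; also covers the global_wb itself
//   s_wait_dscnt 0x0           ; only with cross-address-space ordering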
2063
2064bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2065 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2066 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2067
2068 // Only handle load and store, not atomic read-modify-write instructions.
2069 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
2070
2071 // Only update load and store, not LLVM IR atomic read-modify-write
2072 // instructions. The latter are always marked as volatile, and handling that
2073 // here would pessimize all atomics; they also do not support the nontemporal
2074 // attribute.
2075 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2076
2077 bool Changed = false;
2078
2079 if (IsLastUse) {
2080 // Set last-use hint.
2081 Changed |= setTH(MI, Value: AMDGPU::CPol::TH_LU);
2082 } else if (IsNonTemporal) {
2083 // Set non-temporal hint for all cache levels.
2084 Changed |= setTH(MI, Value: AMDGPU::CPol::TH_NT);
2085 }
2086
2087 if (IsVolatile) {
2088 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SYS);
2089
2090 if (ST.requiresWaitXCntForSingleAccessInstructions() &&
2091 SIInstrInfo::isVMEM(MI: *MI)) {
2092 MachineBasicBlock &MBB = *MI->getParent();
2093 BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: S_WAIT_XCNT_soft)).addImm(Val: 0);
2094 Changed = true;
2095 }
2096
2097 // Ensure the operation has completed at system scope to cause all volatile
2098 // operations to be visible outside the program in a global order. Do not
2099 // request cross address space as only the global address space can be
2100 // observable outside the program, so no need to cause a waitcnt for LDS
2101 // address space operations.
2102 Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
2103 Pos: Position::AFTER, Order: AtomicOrdering::Unordered,
2104 /*AtomicsOnly=*/false);
2105 }
2106
2107 return Changed;
2108}
2109
2110bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2111 assert(MI.mayStore() && "Not a Store inst");
2112 const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2113 bool Changed = false;
2114
2115 if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
2116 SIInstrInfo::isVMEM(MI)) {
2117 MachineBasicBlock &MBB = *MI.getParent();
2118 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: S_WAIT_XCNT_soft)).addImm(Val: 0);
2119 Changed = true;
2120 }
2121
2122 // Remaining fixes do not apply to RMWs.
2123 if (IsRMW)
2124 return Changed;
2125
2126 MachineOperand *CPol = TII->getNamedOperand(MI, OperandName: OpName::cpol);
2127 if (!CPol) // Some VMEM operations have no cpol operand and need no fixes.
2128 return Changed;
2129 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2130
2131 // GFX12.0 only: Extra waits needed before system scope stores.
2132 if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
2133 Scope == CPol::SCOPE_SYS)
2134 Changed |= insertWaitsBeforeSystemScopeStore(MI: MI.getIterator());
2135
2136 return Changed;
2137}
2138
2139bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2140 if (!ST.hasGFX1250Insts())
2141 return false;
2142
2143 // Cooperative atomics need to be SCOPE_DEV or higher.
2144 MachineOperand *CPol = TII->getNamedOperand(MI, OperandName: OpName::cpol);
2145 assert(CPol && "No CPol operand?");
2146 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2147 if (Scope < CPol::SCOPE_DEV)
2148 return setScope(MI, Value: CPol::SCOPE_DEV);
2149 return false;
2150}
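
// Editorial note: the comparison above relies on the SCOPE field encodings
// increasing with scope width (CU < SE < DEV < SYS), so "Scope < SCOPE_DEV"
// means exactly "narrower than device scope".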
2151
2152bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2153 SIAtomicScope Scope,
2154 SIAtomicAddrSpace AddrSpace) const {
2155 bool Changed = false;
2156
2157 if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
2158 switch (Scope) {
2159 case SIAtomicScope::SYSTEM:
2160 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SYS);
2161 break;
2162 case SIAtomicScope::AGENT:
2163 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_DEV);
2164 break;
2165 case SIAtomicScope::CLUSTER:
2166 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SE);
2167 break;
2168 case SIAtomicScope::WORKGROUP:
2169 // When not in CU mode (WGP mode), SCOPE_SE is needed as waves can execute
2170 // on different CUs that access different L0s.
2171 if (!ST.isCuModeEnabled())
2172 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SE);
2173 break;
2174 case SIAtomicScope::WAVEFRONT:
2175 case SIAtomicScope::SINGLETHREAD:
2176 // No cache to bypass.
2177 break;
2178 default:
2179 llvm_unreachable("Unsupported synchronization scope");
2180 }
2181 }
2182
2183 // The scratch address space does not need the global memory caches
2184 // to be bypassed as all memory operations by the same thread are
2185 // sequentially consistent, and no other thread can access scratch
2186 // memory.
2187
2188 // Other address spaces do not have a cache.
2189
2190 return Changed;
2191}
2192
2193bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2194 if (AtomicPseudoMIs.empty())
2195 return false;
2196
2197 for (auto &MI : AtomicPseudoMIs)
2198 MI->eraseFromParent();
2199
2200 AtomicPseudoMIs.clear();
2201 return true;
2202}
2203
2204bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2205 MachineBasicBlock::iterator &MI) {
2206 assert(MI->mayLoad() && !MI->mayStore());
2207
2208 bool Changed = false;
2209
2210 if (MOI.isAtomic()) {
2211 const AtomicOrdering Order = MOI.getOrdering();
2212 if (Order == AtomicOrdering::Monotonic ||
2213 Order == AtomicOrdering::Acquire ||
2214 Order == AtomicOrdering::SequentiallyConsistent) {
2215 Changed |= CC->enableLoadCacheBypass(MI, Scope: MOI.getScope(),
2216 AddrSpace: MOI.getOrderingAddrSpace());
2217 }
2218
2219 // Handle cooperative atomics after the cache bypass step, as it may widen
2220 // the scope of the instruction.
2221 if (MOI.isCooperative())
2222 Changed |= CC->handleCooperativeAtomic(MI&: *MI);
2223
2224 if (Order == AtomicOrdering::SequentiallyConsistent)
2225 Changed |= CC->insertWait(MI, Scope: MOI.getScope(), AddrSpace: MOI.getOrderingAddrSpace(),
2226 Op: SIMemOp::LOAD | SIMemOp::STORE,
2227 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2228 Pos: Position::BEFORE, Order, /*AtomicsOnly=*/false);
2229
2230 if (Order == AtomicOrdering::Acquire ||
2231 Order == AtomicOrdering::SequentiallyConsistent) {
2232 // The wait below only needs to wait on the prior atomic.
2233 Changed |=
2234 CC->insertWait(MI, Scope: MOI.getScope(), AddrSpace: MOI.getInstrAddrSpace(),
2235 Op: SIMemOp::LOAD, IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2236 Pos: Position::AFTER, Order, /*AtomicsOnly=*/true);
2237 Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(),
2238 AddrSpace: MOI.getOrderingAddrSpace(),
2239 Pos: Position::AFTER);
2240 }
2241
2242 return Changed;
2243 }
2244
2245 // Atomic instructions already bypass caches to the scope specified by the
2246 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2247 // instructions need additional treatment.
2248 Changed |= CC->enableVolatileAndOrNonTemporal(
2249 MI, AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::LOAD, IsVolatile: MOI.isVolatile(),
2250 IsNonTemporal: MOI.isNonTemporal(), IsLastUse: MOI.isLastUse());
2251
2252 return Changed;
2253}
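
// Example (editorial): for IR such as
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
// the code above (a) enables load cache bypass at agent scope, (b) inserts a
// wait on the just-executed atomic after the load, and (c) inserts the acquire
// cache invalidate, as illustrated for GFX10 after insertAcquire above.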
2254
2255bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2256 MachineBasicBlock::iterator &MI) {
2257 assert(!MI->mayLoad() && MI->mayStore());
2258
2259 bool Changed = false;
2260 // FIXME: Necessary hack because iterator can lose track of the store.
2261 MachineInstr &StoreMI = *MI;
2262
2263 if (MOI.isAtomic()) {
2264 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2265 MOI.getOrdering() == AtomicOrdering::Release ||
2266 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2267 Changed |= CC->enableStoreCacheBypass(MI, Scope: MOI.getScope(),
2268 AddrSpace: MOI.getOrderingAddrSpace());
2269 }
2270
2271 // Handle cooperative atomics after the cache bypass step, as it may widen
2272 // the scope of the instruction.
2273 if (MOI.isCooperative())
2274 Changed |= CC->handleCooperativeAtomic(MI&: *MI);
2275
2276 if (MOI.getOrdering() == AtomicOrdering::Release ||
2277 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2278 Changed |= CC->insertRelease(MI, Scope: MOI.getScope(),
2279 AddrSpace: MOI.getOrderingAddrSpace(),
2280 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2281 Pos: Position::BEFORE);
2282
2283 Changed |= CC->finalizeStore(MI&: StoreMI, /*Atomic=*/true);
2284 return Changed;
2285 }
2286
2287 // Atomic instructions already bypass caches to the scope specified by the
2288 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2289 // need additional treatment.
2290 Changed |= CC->enableVolatileAndOrNonTemporal(
2291 MI, AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::STORE, IsVolatile: MOI.isVolatile(),
2292 IsNonTemporal: MOI.isNonTemporal());
2293
2294 // GFX12 specific: the scope (the desired coherence domain in the cache
2295 // hierarchy) is an instruction field; do not confuse it with atomic scope.
2296 Changed |= CC->finalizeStore(MI&: StoreMI, /*Atomic=*/false);
2297 return Changed;
2298}
2299
2300bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2301 MachineBasicBlock::iterator &MI) {
2302 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2303
2304 AtomicPseudoMIs.push_back(x: MI);
2305 bool Changed = false;
2306
2307 const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
2308
2309 if (MOI.isAtomic()) {
2310 const AtomicOrdering Order = MOI.getOrdering();
2311 if (Order == AtomicOrdering::Acquire) {
2312 // Acquire fences only need to wait on the previous atomic they pair with.
2313 Changed |= CC->insertWait(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace,
2314 Op: SIMemOp::LOAD | SIMemOp::STORE,
2315 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2316 Pos: Position::BEFORE, Order, /*AtomicsOnly=*/true);
2317 }
2318
2319 if (Order == AtomicOrdering::Release ||
2320 Order == AtomicOrdering::AcquireRelease ||
2321 Order == AtomicOrdering::SequentiallyConsistent)
2322 /// TODO: This relies on a barrier always generating a waitcnt
2323 /// for LDS to ensure it is not reordered with the completion of
2324 /// the preceding LDS operations. If the barrier had a memory
2325 /// ordering and memory scope, then the library would not need to
2326 /// generate a fence. Support for barriers could be added in this
2327 /// file; SIInsertWaitcnts.cpp could then stop unconditionally
2328 /// adding S_WAITCNT before an S_BARRIER.
2329 Changed |= CC->insertRelease(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace,
2330 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2331 Pos: Position::BEFORE);
2332
2333 // TODO: If both release and invalidate are happening they could be combined
2334 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2335 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2336 // track cache invalidate and write back instructions.
2337
2338 if (Order == AtomicOrdering::Acquire ||
2339 Order == AtomicOrdering::AcquireRelease ||
2340 Order == AtomicOrdering::SequentiallyConsistent)
2341 Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace,
2342 Pos: Position::BEFORE);
2343
2344 return Changed;
2345 }
2346
2347 return Changed;
2348}
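
// Example (editorial): a "fence syncscope("workgroup") release" reaching this
// point becomes an insertRelease at workgroup scope placed before the
// ATOMIC_FENCE pseudo; the pseudo itself is queued in AtomicPseudoMIs and
// erased by removeAtomicPseudoMIs at the end of the pass.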
2349
2350bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2351 MachineBasicBlock::iterator &MI) {
2352 assert(MI->mayLoad() && MI->mayStore());
2353
2354 bool Changed = false;
2355 MachineInstr &RMWMI = *MI;
2356
2357 if (MOI.isAtomic()) {
2358 const AtomicOrdering Order = MOI.getOrdering();
2359 if (Order == AtomicOrdering::Monotonic ||
2360 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2361 Order == AtomicOrdering::AcquireRelease ||
2362 Order == AtomicOrdering::SequentiallyConsistent) {
2363 Changed |= CC->enableRMWCacheBypass(MI, Scope: MOI.getScope(),
2364 AddrSpace: MOI.getInstrAddrSpace());
2365 }
2366
2367 if (Order == AtomicOrdering::Release ||
2368 Order == AtomicOrdering::AcquireRelease ||
2369 Order == AtomicOrdering::SequentiallyConsistent ||
2370 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2371 Changed |= CC->insertRelease(MI, Scope: MOI.getScope(),
2372 AddrSpace: MOI.getOrderingAddrSpace(),
2373 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2374 Pos: Position::BEFORE);
2375
2376 if (Order == AtomicOrdering::Acquire ||
2377 Order == AtomicOrdering::AcquireRelease ||
2378 Order == AtomicOrdering::SequentiallyConsistent ||
2379 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2380 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2381 // Only wait on the previous atomic.
2382 Changed |=
2383 CC->insertWait(MI, Scope: MOI.getScope(), AddrSpace: MOI.getInstrAddrSpace(),
2384 Op: isAtomicRet(MI: *MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2385 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), Pos: Position::AFTER,
2386 Order, /*AtomicsOnly=*/true);
2387 Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(),
2388 AddrSpace: MOI.getOrderingAddrSpace(),
2389 Pos: Position::AFTER);
2390 }
2391
2392 Changed |= CC->finalizeStore(MI&: RMWMI, /*Atomic=*/true);
2393 return Changed;
2394 }
2395
2396 return Changed;
2397}
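
// Example (editorial): a cmpxchg with "acq_rel acquire" orderings takes both
// paths above: the release sequence is inserted before the instruction, and
// the failure ordering alone is already enough to require the trailing wait
// and cache invalidate after it.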
2398
2399bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
2400 MachineBasicBlock::iterator &MI) {
2401 assert(MI->mayLoad() && MI->mayStore());
2402
2403 // The volatility or nontemporal-ness of the operation is a
2404 // property of the global memory access, not of the LDS access.
2405 SIMemOp OpKind =
2406 SIInstrInfo::mayWriteLDSThroughDMA(MI: *MI) ? SIMemOp::LOAD : SIMemOp::STORE;
2407
2408 // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
2409 // stores. The operation is treated as a volatile/nontemporal store
2410 // to its second argument.
2411 return CC->enableVolatileAndOrNonTemporal(
2412 MI, AddrSpace: MOI.getInstrAddrSpace(), Op: OpKind, IsVolatile: MOI.isVolatile(),
2413 IsNonTemporal: MOI.isNonTemporal(), IsLastUse: MOI.isLastUse());
2414}
2415
2416bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2417 const MachineModuleInfo &MMI =
2418 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2419 return SIMemoryLegalizer(MMI).run(MF);
2420}
2421
2422PreservedAnalyses
2423SIMemoryLegalizerPass::run(MachineFunction &MF,
2424 MachineFunctionAnalysisManager &MFAM) {
2425 auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(IR&: MF)
2426 .getCachedResult<MachineModuleAnalysis>(
2427 IR&: *MF.getFunction().getParent());
2428 assert(MMI && "MachineModuleAnalysis must be available");
2429 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2430 return PreservedAnalyses::all();
2431 return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
2432}
2433
2434bool SIMemoryLegalizer::run(MachineFunction &MF) {
2435 bool Changed = false;
2436
2437 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2438 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
2439 CC = SICacheControl::create(ST);
2440
2441 for (auto &MBB : MF) {
2442 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2443
2444 // Unbundle instructions after the post-RA scheduler.
2445 if (MI->isBundle() && MI->mayLoadOrStore()) {
2446 MachineBasicBlock::instr_iterator II(MI->getIterator());
2447 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2448 I != E && I->isBundledWithPred(); ++I) {
2449 I->unbundleFromPred();
2450 for (MachineOperand &MO : I->operands())
2451 if (MO.isReg())
2452 MO.setIsInternalRead(false);
2453 }
2454
2455 MI->eraseFromParent();
2456 MI = II->getIterator();
2457 }
2458
2459 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2460 continue;
2461
2462 if (const auto &MOI = MOA.getLoadInfo(MI)) {
2463 Changed |= expandLoad(MOI: *MOI, MI);
2464 } else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2465 Changed |= expandStore(MOI: *MOI, MI);
2466 } else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) {
2467 Changed |= expandLDSDMA(MOI: *MOI, MI);
2468 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) {
2469 Changed |= expandAtomicFence(MOI: *MOI, MI);
2470 } else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) {
2471 Changed |= expandAtomicCmpxchgOrRmw(MOI: *MOI, MI);
2472 }
2473 }
2474 }
2475
2476 Changed |= removeAtomicPseudoMIs();
2477 return Changed;
2478}
2479
2480INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
2481
2482char SIMemoryLegalizerLegacy::ID = 0;
2483char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
2484
2485FunctionPass *llvm::createSIMemoryLegalizerPass() {
2486 return new SIMemoryLegalizerLegacy();
2487}
2488