//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;
  bool IsLastUse = false;

  SIMemOpInfo(
      AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
      SIAtomicScope Scope = SIAtomicScope::SYSTEM,
      SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
      SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
      bool IsCrossAddressSpaceOrdering = true,
      AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
      bool IsVolatile = false, bool IsNonTemporal = false,
      bool IsLastUse = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
        OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
        IsLastUse(IsLastUse) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
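    // For example, an operation whose only address space is LDS is limited to
    // at most workgroup scope since LDS is only shared within a work-group,
    // and a scratch-only operation is limited to single-thread scope.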
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is last use, false otherwise.
  bool isLastUse() const { return IsLastUse; }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  const AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns Returns true if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or
  /// nontemporal/last-use. Return true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal,
                                              bool IsLastUse = false) const = 0;

  virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
    return false;
  }

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering, Position Pos,
                          AtomicOrdering Order) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;
};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:
  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;
};

class SIGfx12CacheControl : public SIGfx11CacheControl {
protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
  // \returns Returns true if \p MI is modified, false otherwise.
  bool setTH(const MachineBasicBlock::iterator MI,
             AMDGPU::CPol::CPol Value) const;
  // Sets Scope policy to \p Value if CPol operand is present in instruction \p
  // MI. \returns Returns true if \p MI is modified, false otherwise.
  bool setScope(const MachineBasicBlock::iterator MI,
                AMDGPU::CPol::CPol Value) const;

  // Stores with system scope (SCOPE_SYS) need to wait for:
  // - loads or atomics (returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
  // - non-returning atomics - wait for STORECNT==0
  // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
  // since it does not distinguish atomics-with-return from regular stores.
  // There is no need to wait if memory is cached (mtype != UC).
  bool
  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;

  bool setAtomicScope(const MachineBasicBlock::iterator &MI,
                      SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;

public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos,
                  AtomicOrdering Order) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }
};

class SIMemoryLegalizer final {
private:
  const MachineModuleInfo &MMI;
  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {}
  bool run(MachineFunction &MF);
};

class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
public:
  static char ID;

  SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

static const StringMap<SIAtomicAddrSpace> ASNames = {{
    {"global", SIAtomicAddrSpace::GLOBAL},
    {"local", SIAtomicAddrSpace::LDS},
}};

void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
  const MachineFunction *MF = MI.getMF();
  const Function &Fn = MF->getFunction();
  SmallString<128> Str;
  raw_svector_ostream OS(Str);
  OS << "unknown address space '" << AS << "'; expected one of ";
  ListSeparator LS;
  for (const auto &[Name, Val] : ASNames)
    OS << LS << '\'' << Name << '\'';
  Fn.getContext().diagnose(
      DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
}

/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
/// If this tag isn't present, or if it has no meaningful values, returns \p
/// Default. Otherwise returns all the address spaces concerned by the MMRA.
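///
/// For example, a fence whose MMRA metadata is !{!"amdgpu-as", !"local"} is
/// restricted to the LDS address space, while !{!"amdgpu-as", !"global"}
/// restricts it to the global address space (see ASNames above).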
static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
                                               SIAtomicAddrSpace Default) {
  static constexpr StringLiteral FenceASPrefix = "amdgpu-as";

  auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
  if (!MMRA)
    return Default;

  SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
  for (const auto &[Prefix, Suffix] : MMRA) {
    if (Prefix != FenceASPrefix)
      continue;

    if (auto It = ASNames.find(Suffix); It != ASNames.end())
      Result |= It->second;
    else
      diagnoseUnknownMMRAASName(MI, Suffix);
  }

  return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
}

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  Func.getContext().diagnose(
      DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
}

std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getAgentSSID())
    return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getWavefrontSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == SyncScope::SingleThread)
    return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SYSTEM,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::AGENT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WORKGROUP,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SINGLETHREAD,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  return std::nullopt;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
    : MMI(&MMI_) {}

std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;
  bool IsLastUse = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
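  // Merge the information from all memory operands: the access is nontemporal
  // only if every MMO is nontemporal, while volatility, last-use, accessed
  // address spaces and atomic orderings accumulate across the MMOs.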
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    IsLastUse |= MMO->getFlags() & MOLastUse;
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(
            MI, "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal, IsLastUse);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return std::nullopt;

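  // The ATOMIC_FENCE pseudo carries the atomic ordering as immediate operand 0
  // and the synchronization scope ID as immediate operand 1.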
  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return std::nullopt;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      *ScopeOrNone;

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return std::nullopt;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

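  // OR the requested bit into the existing cache-policy immediate so any bits
  // that are already set are preserved.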
  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
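  // GFX940 and GFX90A are checked by subtarget feature rather than generation
  // because they belong to an older generation but need their own
  // cache-control lowering.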
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX11)
    return std::make_unique<SIGfx10CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX12)
    return std::make_unique<SIGfx11CacheControl>(ST);
  return std::make_unique<SIGfx12CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is
  /// no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so there is no need to cause a
    // waitcnt for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER, AtomicOrdering::Unordered);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering, Position Pos,
                                    AtomicOrdering Order) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

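  // Encode a single soft S_WAITCNT that waits only on the selected counters:
  // a counter that must drain gets a count of zero, while the others keep
  // their maximum (no-wait) value for this ISA version.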
  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

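// On GFX6 a release is implemented purely through the counter waits emitted by
// insertWait(); there is no cache writeback instruction to issue.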
bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so there is no need to cause a
    // waitcnt for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER, AtomicOrdering::Unordered);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos,
                                      AtomicOrdering Order) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so there is no need to wait for global memory as all
    // waves in the work-group access the same L1, nor to wait for GDS as
    // accesses are ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos, Order);
}

1452bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1453 SIAtomicScope Scope,
1454 SIAtomicAddrSpace AddrSpace,
1455 Position Pos) const {
1456 if (!InsertCacheInv)
1457 return false;
1458
1459 bool Changed = false;
1460
1461 MachineBasicBlock &MBB = *MI->getParent();
1462 DebugLoc DL = MI->getDebugLoc();
1463
1464 if (Pos == Position::AFTER)
1465 ++MI;
1466
1467 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1468 switch (Scope) {
1469 case SIAtomicScope::SYSTEM:
1470 // Ensures that following loads will not see stale remote VMEM data or
1471 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1472 // CC will never be stale due to the local memory probes.
1473 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INVL2));
1474 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1475 // hardware does not reorder memory operations by the same wave with
1476 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1477 // remove any cache lines of earlier writes by the same wave and ensures
1478 // later reads by the same wave will refetch the cache lines.
1479 Changed = true;
1480 break;
1481 case SIAtomicScope::AGENT:
1482 // Same as GFX7.
1483 break;
1484 case SIAtomicScope::WORKGROUP:
1485 // In threadgroup split mode the waves of a work-group can be executing on
1486 // different CUs. Therefore need to invalidate the L1 which is per CU.
1487 // Otherwise in non-threadgroup split mode all waves of a work-group are
1488 // on the same CU, and so the L1 does not need to be invalidated.
1489 if (ST.isTgSplitEnabled()) {
1490 // Same as GFX7 using agent scope.
1491 Scope = SIAtomicScope::AGENT;
1492 }
1493 break;
1494 case SIAtomicScope::WAVEFRONT:
1495 case SIAtomicScope::SINGLETHREAD:
1496 // Same as GFX7.
1497 break;
1498 default:
1499 llvm_unreachable("Unsupported synchronization scope");
1500 }
1501 }
1502
1503 /// The scratch address space does not need the global memory cache
1504 /// to be flushed as all memory operations by the same thread are
1505 /// sequentially consistent, and no other thread can access scratch
1506 /// memory.
1507
1508 /// Other address spaces do not have a cache.
1509
1510 if (Pos == Position::AFTER)
1511 --MI;
1512
1513 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1514
1515 return Changed;
1516}
1517
1518bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1519 SIAtomicScope Scope,
1520 SIAtomicAddrSpace AddrSpace,
1521 bool IsCrossAddrSpaceOrdering,
1522 Position Pos) const {
1523 bool Changed = false;
1524
1525 MachineBasicBlock &MBB = *MI->getParent();
1526 const DebugLoc &DL = MI->getDebugLoc();
1527
1528 if (Pos == Position::AFTER)
1529 ++MI;
1530
1531 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1532 switch (Scope) {
1533 case SIAtomicScope::SYSTEM:
1534 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1535 // hardware does not reorder memory operations by the same wave with
1536 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1537 // to initiate writeback of any dirty cache lines of earlier writes by the
1538 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1539 // writeback has completed.
1540 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2))
1541 // Set SC bits to indicate system scope.
1542 .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1543 // This is followed by the same code as GFX7, which will insert the
1544 // "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1545 Changed = true;
1546 break;
1547 case SIAtomicScope::AGENT:
1548 case SIAtomicScope::WORKGROUP:
1549 case SIAtomicScope::WAVEFRONT:
1550 case SIAtomicScope::SINGLETHREAD:
1551 // Same as GFX7.
1552 break;
1553 default:
1554 llvm_unreachable("Unsupported synchronization scope");
1555 }
1556 }
1557
1558 if (Pos == Position::AFTER)
1559 --MI;
1560
1561 Changed |=
1562 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1563 IsCrossAddrSpaceOrdering, Pos);
1564
1565 return Changed;
1566}
1567
1568bool SIGfx940CacheControl::enableLoadCacheBypass(
1569 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1570 SIAtomicAddrSpace AddrSpace) const {
1571 assert(MI->mayLoad() && !MI->mayStore());
1572 bool Changed = false;
1573
1574 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1575 switch (Scope) {
1576 case SIAtomicScope::SYSTEM:
1577 // Set SC bits to indicate system scope.
1578 Changed |= enableSC0Bit(MI);
1579 Changed |= enableSC1Bit(MI);
1580 break;
1581 case SIAtomicScope::AGENT:
1582 // Set SC bits to indicate agent scope.
1583 Changed |= enableSC1Bit(MI);
1584 break;
1585 case SIAtomicScope::WORKGROUP:
1586 // In threadgroup split mode the waves of a work-group can be executing on
1587 // different CUs. Therefore need to bypass the L1 which is per CU.
1588 // Otherwise in non-threadgroup split mode all waves of a work-group are
1589 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1590 // bits to indicate work-group scope will do this automatically.
1591 Changed |= enableSC0Bit(MI);
1592 break;
1593 case SIAtomicScope::WAVEFRONT:
1594 case SIAtomicScope::SINGLETHREAD:
1595 // Leave SC bits unset to indicate wavefront scope.
1596 break;
1597 default:
1598 llvm_unreachable("Unsupported synchronization scope");
1599 }
1600 }
1601
1602 /// The scratch address space does not need the global memory caches
1603 /// to be bypassed as all memory operations by the same thread are
1604 /// sequentially consistent, and no other thread can access scratch
1605 /// memory.
1606
1607 /// Other address spaces do not have a cache.
1608
1609 return Changed;
1610}
1611
1612bool SIGfx940CacheControl::enableStoreCacheBypass(
1613 const MachineBasicBlock::iterator &MI,
1614 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1615 assert(!MI->mayLoad() && MI->mayStore());
1616 bool Changed = false;
1617
1618 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1619 switch (Scope) {
1620 case SIAtomicScope::SYSTEM:
1621 // Set SC bits to indicate system scope.
1622 Changed |= enableSC0Bit(MI);
1623 Changed |= enableSC1Bit(MI);
1624 break;
1625 case SIAtomicScope::AGENT:
1626 // Set SC bits to indicate agent scope.
1627 Changed |= enableSC1Bit(MI);
1628 break;
1629 case SIAtomicScope::WORKGROUP:
1630 // Set SC bits to indicate workgroup scope.
1631 Changed |= enableSC0Bit(MI);
1632 break;
1633 case SIAtomicScope::WAVEFRONT:
1634 case SIAtomicScope::SINGLETHREAD:
1635 // Leave SC bits unset to indicate wavefront scope.
1636 break;
1637 default:
1638 llvm_unreachable("Unsupported synchronization scope");
1639 }
1640 }
1641
1642 /// The scratch address space does not need the global memory caches
1643 /// to be bypassed as all memory operations by the same thread are
1644 /// sequentially consistent, and no other thread can access scratch
1645 /// memory.
1646
1647 /// Other address spaces do not have a cache.
1648
1649 return Changed;
1650}
1651
1652bool SIGfx940CacheControl::enableRMWCacheBypass(
1653 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1654 SIAtomicAddrSpace AddrSpace) const {
1655 assert(MI->mayLoad() && MI->mayStore());
1656 bool Changed = false;
1657
1658 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1659 switch (Scope) {
1660 case SIAtomicScope::SYSTEM:
1661 // Set SC1 bit to indicate system scope.
1662 Changed |= enableSC1Bit(MI);
1663 break;
1664 case SIAtomicScope::AGENT:
1665 case SIAtomicScope::WORKGROUP:
1666 case SIAtomicScope::WAVEFRONT:
1667 case SIAtomicScope::SINGLETHREAD:
1668 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1669 // to indicate system or agent scope. The SC0 bit is used to indicate if
1670 // they are return or no-return. Leave SC1 bit unset to indicate agent
1671 // scope.
1672 break;
1673 default:
1674 llvm_unreachable("Unsupported synchronization scope");
1675 }
1676 }
1677
1678 return Changed;
1679}
1680
1681bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1682 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1683 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1684 // Only handle load and store, not atomic read-modify-write instructions. The
1685 // latter use glc to indicate if the atomic returns a result and so must not
1686 // be used for cache control.
1687 assert(MI->mayLoad() ^ MI->mayStore());
1688
1689 // Only update load and store, not LLVM IR atomic read-modify-write
1690 // instructions. The latter are always marked as volatile, so the volatile
1691 // flag cannot sensibly be honored for them without pessimizing all atomics.
1692 // They also do not support the nontemporal attribute.
1693 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1694
1695 bool Changed = false;
1696
1697 if (IsVolatile) {
1698 // Set SC bits to indicate system scope.
1699 Changed |= enableSC0Bit(MI);
1700 Changed |= enableSC1Bit(MI);
1701
1702 // Ensure operation has completed at system scope to cause all volatile
1703 // operations to be visible outside the program in a global order. Do not
1704 // request cross address space as only the global address space can be
1705 // observable outside the program, so no need to cause a waitcnt for LDS
1706 // address space operations.
1707 Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
1708 Pos: Position::AFTER, Order: AtomicOrdering::Unordered);
1709
1710 return Changed;
1711 }
1712
1713 if (IsNonTemporal) {
1714 Changed |= enableNTBit(MI);
1715 return Changed;
1716 }
1717
1718 return Changed;
1719}
1720
1721bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1722 SIAtomicScope Scope,
1723 SIAtomicAddrSpace AddrSpace,
1724 Position Pos) const {
1725 if (!InsertCacheInv)
1726 return false;
1727
1728 bool Changed = false;
1729
1730 MachineBasicBlock &MBB = *MI->getParent();
1731 DebugLoc DL = MI->getDebugLoc();
1732
1733 if (Pos == Position::AFTER)
1734 ++MI;
1735
1736 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1737 switch (Scope) {
1738 case SIAtomicScope::SYSTEM:
1739 // Ensures that following loads will not see stale remote VMEM data or
1740 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1741 // CC will never be stale due to the local memory probes.
1742 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV))
1743 // Set SC bits to indicate system scope.
1744 .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1745 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1746 // hardware does not reorder memory operations by the same wave with
1747 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1748 // remove any cache lines of earlier writes by the same wave and ensures
1749 // later reads by the same wave will refetch the cache lines.
1750 Changed = true;
1751 break;
1752 case SIAtomicScope::AGENT:
1753 // Ensures that following loads will not see stale remote data or local
1754 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1755 // due to the memory probes.
1756 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV))
1757 // Set SC bits to indicate agent scope.
1758 .addImm(Val: AMDGPU::CPol::SC1);
1759 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1760 // does not reorder memory operations with respect to the preceding buffer
1761 // invalidate. The invalidate is guaranteed to remove any cache lines of
1762 // earlier writes and ensures later reads will refetch the cache lines.
1763 Changed = true;
1764 break;
1765 case SIAtomicScope::WORKGROUP:
1766 // In threadgroup split mode the waves of a work-group can be executing on
1767 // different CUs. Therefore need to invalidate the L1 which is per CU.
1768 // Otherwise in non-threadgroup split mode all waves of a work-group are
1769 // on the same CU, and so the L1 does not need to be invalidated.
1770 if (ST.isTgSplitEnabled()) {
1771 // Ensures L1 is invalidated in threadgroup split mode. In
1772 // non-threadgroup split mode it is a NOP, but there is no point generating
1773 // it in that case when we know we are not in that mode.
1774 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV))
1775 // Set SC bits to indicate work-group scope.
1776 .addImm(Val: AMDGPU::CPol::SC0);
1777 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1778 // does not reorder memory operations with respect to the preceding buffer
1779 // invalidate. The invalidate is guaranteed to remove any cache lines of
1780 // earlier writes and ensures later reads will refetch the cache lines.
1781 Changed = true;
1782 }
1783 break;
1784 case SIAtomicScope::WAVEFRONT:
1785 case SIAtomicScope::SINGLETHREAD:
1786 // Could generate "BUFFER_INV" but it would do nothing as there are no
1787 // caches to invalidate.
1788 break;
1789 default:
1790 llvm_unreachable("Unsupported synchronization scope");
1791 }
1792 }
1793
1794 /// The scratch address space does not need the global memory cache
1795 /// to be flushed as all memory operations by the same thread are
1796 /// sequentially consistent, and no other thread can access scratch
1797 /// memory.
1798
1799 /// Other address spaces do not have a cache.
1800
1801 if (Pos == Position::AFTER)
1802 --MI;
1803
1804 return Changed;
1805}
1806
1807bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1808 SIAtomicScope Scope,
1809 SIAtomicAddrSpace AddrSpace,
1810 bool IsCrossAddrSpaceOrdering,
1811 Position Pos) const {
1812 bool Changed = false;
1813
1814 MachineBasicBlock &MBB = *MI->getParent();
1815 DebugLoc DL = MI->getDebugLoc();
1816
1817 if (Pos == Position::AFTER)
1818 ++MI;
1819
1820 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1821 switch (Scope) {
1822 case SIAtomicScope::SYSTEM:
1823 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1824 // hardware does not reorder memory operations by the same wave with
1825 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1826 // to initiate writeback of any dirty cache lines of earlier writes by the
1827 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1828 // writeback has completed.
1829 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2))
1830 // Set SC bits to indicate system scope.
1831 .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1832 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1833 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1834 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1835 Changed = true;
1836 break;
1837 case SIAtomicScope::AGENT:
1838 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2))
1839 // Set SC bits to indicate agent scope.
1840 .addImm(Val: AMDGPU::CPol::SC1);
1841
1842 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1843 // SIAtomicScope::AGENT, the following insertWait will generate the
1844 // required "S_WAITCNT vmcnt(0)".
1845 Changed = true;
1846 break;
1847 case SIAtomicScope::WORKGROUP:
1848 case SIAtomicScope::WAVEFRONT:
1849 case SIAtomicScope::SINGLETHREAD:
1850 // Do not generate "BUFFER_WBL2" as there are no caches it would write
1851 // back, and it would require an otherwise unnecessary
1852 // "S_WAITCNT vmcnt(0)".
1853 break;
1854 default:
1855 llvm_unreachable("Unsupported synchronization scope");
1856 }
1857 }
1858
1859 if (Pos == Position::AFTER)
1860 --MI;
1861
1862 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1863 // S_WAITCNT needed.
1864 Changed |= insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE,
1865 IsCrossAddrSpaceOrdering, Pos, Order: AtomicOrdering::Release);
1866
1867 return Changed;
1868}
1869
1870bool SIGfx10CacheControl::enableLoadCacheBypass(
1871 const MachineBasicBlock::iterator &MI,
1872 SIAtomicScope Scope,
1873 SIAtomicAddrSpace AddrSpace) const {
1874 assert(MI->mayLoad() && !MI->mayStore());
1875 bool Changed = false;
1876
1877 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1878 switch (Scope) {
1879 case SIAtomicScope::SYSTEM:
1880 case SIAtomicScope::AGENT:
1881 // Set the L0 and L1 cache policies to MISS_EVICT.
1882 // Note: there is no L2 cache coherent bypass control at the ISA level.
1883 Changed |= enableGLCBit(MI);
1884 Changed |= enableDLCBit(MI);
1885 break;
1886 case SIAtomicScope::WORKGROUP:
1887 // In WGP mode the waves of a work-group can be executing on either CU of
1888 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1889 // CU mode all waves of a work-group are on the same CU, and so the L0
1890 // does not need to be bypassed.
1891 if (!ST.isCuModeEnabled())
1892 Changed |= enableGLCBit(MI);
1893 break;
1894 case SIAtomicScope::WAVEFRONT:
1895 case SIAtomicScope::SINGLETHREAD:
1896 // No cache to bypass.
1897 break;
1898 default:
1899 llvm_unreachable("Unsupported synchronization scope");
1900 }
1901 }
1902
1903 /// The scratch address space does not need the global memory caches
1904 /// to be bypassed as all memory operations by the same thread are
1905 /// sequentially consistent, and no other thread can access scratch
1906 /// memory.
1907
1908 /// Other address spaces do not have a cache.
1909
1910 return Changed;
1911}
1912
1913bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1914 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1915 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1916
1917 // Only handle load and store, not atomic read-modify-write instructions. The
1918 // latter use glc to indicate if the atomic returns a result and so must not
1919 // be used for cache control.
1920 assert(MI->mayLoad() ^ MI->mayStore());
1921
1922 // Only update load and store, not LLVM IR atomic read-modify-write
1923 // instructions. The latter are always marked as volatile, so the volatile
1924 // flag cannot sensibly be honored for them without pessimizing all atomics.
1925 // They also do not support the nontemporal attribute.
1926 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1927
1928 bool Changed = false;
1929
1930 if (IsVolatile) {
1931 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1932 // and MISS_LRU for store instructions.
1933 // Note: there is no L2 cache coherent bypass control at the ISA level.
1934 if (Op == SIMemOp::LOAD) {
1935 Changed |= enableGLCBit(MI);
1936 Changed |= enableDLCBit(MI);
1937 }
1938
1939 // Ensure operation has completed at system scope to cause all volatile
1940 // operations to be visible outside the program in a global order. Do not
1941 // request cross address space as only the global address space can be
1942 // observable outside the program, so no need to cause a waitcnt for LDS
1943 // address space operations.
1944 Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
1945 Pos: Position::AFTER, Order: AtomicOrdering::Unordered);
1946 return Changed;
1947 }
1948
1949 if (IsNonTemporal) {
1950 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1951 // and L2 cache policy to STREAM.
1952 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1953 // to MISS_EVICT and the L2 cache policy to STREAM.
1954 if (Op == SIMemOp::STORE)
1955 Changed |= enableGLCBit(MI);
1956 Changed |= enableSLCBit(MI);
1957
1958 return Changed;
1959 }
1960
1961 return Changed;
1962}
1963
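// On GFX10 loads and stores are tracked by separate hardware counters (vmcnt
// for loads, vscnt for stores), so a single wait may require both an
// "S_WAITCNT" and an "S_WAITCNT_VSCNT" to be emitted.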
1964bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1965 SIAtomicScope Scope,
1966 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1967 bool IsCrossAddrSpaceOrdering,
1968 Position Pos, AtomicOrdering Order) const {
1969 bool Changed = false;
1970
1971 MachineBasicBlock &MBB = *MI->getParent();
1972 DebugLoc DL = MI->getDebugLoc();
1973
1974 if (Pos == Position::AFTER)
1975 ++MI;
1976
1977 bool VMCnt = false;
1978 bool VSCnt = false;
1979 bool LGKMCnt = false;
1980
1981 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1982 SIAtomicAddrSpace::NONE) {
1983 switch (Scope) {
1984 case SIAtomicScope::SYSTEM:
1985 case SIAtomicScope::AGENT:
1986 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1987 VMCnt |= true;
1988 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1989 VSCnt |= true;
1990 break;
1991 case SIAtomicScope::WORKGROUP:
1992 // In WGP mode the waves of a work-group can be executing on either CU of
1993 // the WGP. Therefore need to wait for operations to complete to ensure
1994 // they are visible to waves in the other CU as the L0 is per CU.
1995 // Otherwise in CU mode all waves of a work-group are on the same CU,
1996 // which shares the same L0.
1997 if (!ST.isCuModeEnabled()) {
1998 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1999 VMCnt |= true;
2000 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2001 VSCnt |= true;
2002 }
2003 break;
2004 case SIAtomicScope::WAVEFRONT:
2005 case SIAtomicScope::SINGLETHREAD:
2006 // The L0 cache keeps all memory operations in order for
2007 // work-items in the same wavefront.
2008 break;
2009 default:
2010 llvm_unreachable("Unsupported synchronization scope");
2011 }
2012 }
2013
2014 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2015 switch (Scope) {
2016 case SIAtomicScope::SYSTEM:
2017 case SIAtomicScope::AGENT:
2018 case SIAtomicScope::WORKGROUP:
2019 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2020 // not needed as LDS operations for all waves are executed in a total
2021 // global ordering as observed by all waves. Required if also
2022 // synchronizing with global/GDS memory as LDS operations could be
2023 // reordered with respect to later global/GDS memory operations of the
2024 // same wave.
2025 LGKMCnt |= IsCrossAddrSpaceOrdering;
2026 break;
2027 case SIAtomicScope::WAVEFRONT:
2028 case SIAtomicScope::SINGLETHREAD:
2029 // The LDS keeps all memory operations in order for
2030 // the same wavefront.
2031 break;
2032 default:
2033 llvm_unreachable("Unsupported synchronization scope");
2034 }
2035 }
2036
2037 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2038 switch (Scope) {
2039 case SIAtomicScope::SYSTEM:
2040 case SIAtomicScope::AGENT:
2041 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
2042 // is not needed as GDS operations for all waves are executed in a total
2043 // global ordering as observed by all waves. Required if also
2044 // synchronizing with global/LDS memory as GDS operations could be
2045 // reordered with respect to later global/LDS memory operations of the
2046 // same wave.
2047 LGKMCnt |= IsCrossAddrSpaceOrdering;
2048 break;
2049 case SIAtomicScope::WORKGROUP:
2050 case SIAtomicScope::WAVEFRONT:
2051 case SIAtomicScope::SINGLETHREAD:
2052 // The GDS keeps all memory operations in order for
2053 // the same work-group.
2054 break;
2055 default:
2056 llvm_unreachable("Unsupported synchronization scope");
2057 }
2058 }
2059
2060 if (VMCnt || LGKMCnt) {
2061 unsigned WaitCntImmediate =
2062 AMDGPU::encodeWaitcnt(Version: IV,
2063 Vmcnt: VMCnt ? 0 : getVmcntBitMask(Version: IV),
2064 Expcnt: getExpcntBitMask(Version: IV),
2065 Lgkmcnt: LGKMCnt ? 0 : getLgkmcntBitMask(Version: IV));
2066 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_soft))
2067 .addImm(Val: WaitCntImmediate);
2068 Changed = true;
2069 }
2070
2071 if (VSCnt) {
2072 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT_soft))
2073 .addReg(RegNo: AMDGPU::SGPR_NULL, flags: RegState::Undef)
2074 .addImm(Val: 0);
2075 Changed = true;
2076 }
2077
2078 if (Pos == Position::AFTER)
2079 --MI;
2080
2081 return Changed;
2082}
2083
2084bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2085 SIAtomicScope Scope,
2086 SIAtomicAddrSpace AddrSpace,
2087 Position Pos) const {
2088 if (!InsertCacheInv)
2089 return false;
2090
2091 bool Changed = false;
2092
2093 MachineBasicBlock &MBB = *MI->getParent();
2094 DebugLoc DL = MI->getDebugLoc();
2095
2096 if (Pos == Position::AFTER)
2097 ++MI;
2098
2099 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2100 switch (Scope) {
2101 case SIAtomicScope::SYSTEM:
2102 case SIAtomicScope::AGENT:
2103 // The order of invalidates matters here. We must invalidate "outer in"
2104 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2105 // invalidated.
2106 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL1_INV));
2107 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL0_INV));
2108 Changed = true;
2109 break;
2110 case SIAtomicScope::WORKGROUP:
2111 // In WGP mode the waves of a work-group can be executing on either CU of
2112 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2113 // in CU mode all waves of a work-group are on the same CU, and so the
2114 // L0 does not need to be invalidated.
2115 if (!ST.isCuModeEnabled()) {
2116 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL0_INV));
2117 Changed = true;
2118 }
2119 break;
2120 case SIAtomicScope::WAVEFRONT:
2121 case SIAtomicScope::SINGLETHREAD:
2122 // No cache to invalidate.
2123 break;
2124 default:
2125 llvm_unreachable("Unsupported synchronization scope");
2126 }
2127 }
2128
2129 /// The scratch address space does not need the global memory cache
2130 /// to be flushed as all memory operations by the same thread are
2131 /// sequentially consistent, and no other thread can access scratch
2132 /// memory.
2133
2134 /// Other address spaces do not have a cache.
2135
2136 if (Pos == Position::AFTER)
2137 --MI;
2138
2139 return Changed;
2140}
2141
2142bool SIGfx11CacheControl::enableLoadCacheBypass(
2143 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2144 SIAtomicAddrSpace AddrSpace) const {
2145 assert(MI->mayLoad() && !MI->mayStore());
2146 bool Changed = false;
2147
2148 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2149 switch (Scope) {
2150 case SIAtomicScope::SYSTEM:
2151 case SIAtomicScope::AGENT:
2152 // Set the L0 and L1 cache policies to MISS_EVICT.
2153 // Note: there is no L2 cache coherent bypass control at the ISA level.
2154 Changed |= enableGLCBit(MI);
2155 break;
2156 case SIAtomicScope::WORKGROUP:
2157 // In WGP mode the waves of a work-group can be executing on either CU of
2158 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2159 // CU mode all waves of a work-group are on the same CU, and so the L0
2160 // does not need to be bypassed.
2161 if (!ST.isCuModeEnabled())
2162 Changed |= enableGLCBit(MI);
2163 break;
2164 case SIAtomicScope::WAVEFRONT:
2165 case SIAtomicScope::SINGLETHREAD:
2166 // No cache to bypass.
2167 break;
2168 default:
2169 llvm_unreachable("Unsupported synchronization scope");
2170 }
2171 }
2172
2173 /// The scratch address space does not need the global memory caches
2174 /// to be bypassed as all memory operations by the same thread are
2175 /// sequentially consistent, and no other thread can access scratch
2176 /// memory.
2177
2178 /// Other address spaces do not have a cache.
2179
2180 return Changed;
2181}
2182
2183bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2184 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2185 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2186
2187 // Only handle load and store, not atomic read-modify-write instructions. The
2188 // latter use glc to indicate if the atomic returns a result and so must not
2189 // be used for cache control.
2190 assert(MI->mayLoad() ^ MI->mayStore());
2191
2192 // Only update load and store, not LLVM IR atomic read-modify-write
2193 // instructions. The latter are always marked as volatile, so the volatile
2194 // flag cannot sensibly be honored for them without pessimizing all atomics.
2195 // They also do not support the nontemporal attribute.
2196 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2197
2198 bool Changed = false;
2199
2200 if (IsVolatile) {
2201 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2202 // and MISS_LRU for store instructions.
2203 // Note: there is no L2 cache coherent bypass control at the ISA level.
2204 if (Op == SIMemOp::LOAD)
2205 Changed |= enableGLCBit(MI);
2206
2207 // Set MALL NOALLOC for load and store instructions.
2208 Changed |= enableDLCBit(MI);
2209
2210 // Ensure operation has completed at system scope to cause all volatile
2211 // operations to be visible outside the program in a global order. Do not
2212 // request cross address space as only the global address space can be
2213 // observable outside the program, so no need to cause a waitcnt for LDS
2214 // address space operations.
2215 Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
2216 Pos: Position::AFTER, Order: AtomicOrdering::Unordered);
2217 return Changed;
2218 }
2219
2220 if (IsNonTemporal) {
2221 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2222 // and L2 cache policy to STREAM.
2223 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2224 // to MISS_EVICT and the L2 cache policy to STREAM.
2225 if (Op == SIMemOp::STORE)
2226 Changed |= enableGLCBit(MI);
2227 Changed |= enableSLCBit(MI);
2228
2229 // Set MALL NOALLOC for load and store instructions.
2230 Changed |= enableDLCBit(MI);
2231 return Changed;
2232 }
2233
2234 return Changed;
2235}
2236
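// Set the TH (temporal hint) field of the instruction's cache policy operand
// to Value, returning true if the operand was changed. Instructions without a
// cpol operand are left untouched.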
2237bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2238 AMDGPU::CPol::CPol Value) const {
2239 MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol);
2240 if (!CPol)
2241 return false;
2242
2243 uint64_t NewTH = Value & AMDGPU::CPol::TH;
2244 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2245 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2246 return true;
2247 }
2248
2249 return false;
2250}
2251
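// Set the scope field of the instruction's cache policy operand to Value,
// returning true if the operand was changed.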
2252bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2253 AMDGPU::CPol::CPol Value) const {
2254 MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol);
2255 if (!CPol)
2256 return false;
2257
2258 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2259 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2260 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2261 return true;
2262 }
2263
2264 return false;
2265}
2266
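// Drain all outstanding memory counters before a system-scope store by
// emitting zero-valued soft waits for the load, sample/BVH (when image
// instructions exist), KM, and store counters.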
2267bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2268 const MachineBasicBlock::iterator MI) const {
2269 // TODO: implement flag for frontend to give us a hint not to insert waits.
2270
2271 MachineBasicBlock &MBB = *MI->getParent();
2272 const DebugLoc &DL = MI->getDebugLoc();
2273
2274 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_LOADCNT_soft)).addImm(Val: 0);
2275 if (ST.hasImageInsts()) {
2276 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_SAMPLECNT_soft)).addImm(Val: 0);
2277 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_BVHCNT_soft)).addImm(Val: 0);
2278 }
2279 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_KMCNT_soft)).addImm(Val: 0);
2280 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_STORECNT_soft)).addImm(Val: 0);
2281
2282 return true;
2283}
2284
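// GFX12 splits the legacy waitcnt counters into separate load, store, sample,
// BVH, and DS counters, so each required wait is emitted as its own soft wait
// instruction below.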
2285bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2286 SIAtomicScope Scope,
2287 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2288 bool IsCrossAddrSpaceOrdering,
2289 Position Pos, AtomicOrdering Order) const {
2290 bool Changed = false;
2291
2292 MachineBasicBlock &MBB = *MI->getParent();
2293 DebugLoc DL = MI->getDebugLoc();
2294
2295 bool LOADCnt = false;
2296 bool DSCnt = false;
2297 bool STORECnt = false;
2298
2299 if (Pos == Position::AFTER)
2300 ++MI;
2301
2302 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2303 SIAtomicAddrSpace::NONE) {
2304 switch (Scope) {
2305 case SIAtomicScope::SYSTEM:
2306 case SIAtomicScope::AGENT:
2307 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2308 LOADCnt |= true;
2309 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2310 STORECnt |= true;
2311 break;
2312 case SIAtomicScope::WORKGROUP:
2313 // In WGP mode the waves of a work-group can be executing on either CU of
2314 // the WGP. Therefore need to wait for operations to complete to ensure
2315 // they are visible to waves in the other CU as the L0 is per CU.
2316 // Otherwise in CU mode all waves of a work-group are on the same CU,
2317 // which shares the same L0.
2318 if (!ST.isCuModeEnabled()) {
2319 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2320 LOADCnt |= true;
2321 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2322 STORECnt |= true;
2323 }
2324 break;
2325 case SIAtomicScope::WAVEFRONT:
2326 case SIAtomicScope::SINGLETHREAD:
2327 // The L0 cache keeps all memory operations in order for
2328 // work-items in the same wavefront.
2329 break;
2330 default:
2331 llvm_unreachable("Unsupported synchronization scope");
2332 }
2333 }
2334
2335 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2336 switch (Scope) {
2337 case SIAtomicScope::SYSTEM:
2338 case SIAtomicScope::AGENT:
2339 case SIAtomicScope::WORKGROUP:
2340 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2341 // not needed as LDS operations for all waves are executed in a total
2342 // global ordering as observed by all waves. Required if also
2343 // synchronizing with global/GDS memory as LDS operations could be
2344 // reordered with respect to later global/GDS memory operations of the
2345 // same wave.
2346 DSCnt |= IsCrossAddrSpaceOrdering;
2347 break;
2348 case SIAtomicScope::WAVEFRONT:
2349 case SIAtomicScope::SINGLETHREAD:
2350 // The LDS keeps all memory operations in order for
2351 // the same wavefront.
2352 break;
2353 default:
2354 llvm_unreachable("Unsupported synchronization scope");
2355 }
2356 }
2357
2358 if (LOADCnt) {
2359 // Acquire sequences only need to wait on the previous atomic operation.
2360 // e.g. a typical sequence looks like
2361 // atomic load
2362 // (wait)
2363 // global_inv
2364 //
2365 // We do not have BVH or SAMPLE atomics, so the atomic load is always going
2366 // to be tracked using loadcnt.
2367 //
2368 // This also applies to fences. Fences cannot pair with an instruction
2369 // tracked with bvh/samplecnt as we don't have any atomics that do that.
2370 if (Order != AtomicOrdering::Acquire) {
2371 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_BVHCNT_soft)).addImm(Val: 0);
2372 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(Val: 0);
2373 }
2374 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_soft)).addImm(Val: 0);
2375 Changed = true;
2376 }
2377
2378 if (STORECnt) {
2379 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_STORECNT_soft)).addImm(Val: 0);
2380 Changed = true;
2381 }
2382
2383 if (DSCnt) {
2384 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_DSCNT_soft)).addImm(Val: 0);
2385 Changed = true;
2386 }
2387
2388 if (Pos == Position::AFTER)
2389 --MI;
2390
2391 return Changed;
2392}
2393
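// GFX12 performs acquire invalidation with a single "GLOBAL_INV" whose scope
// operand selects how far out in the cache hierarchy the invalidate applies.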
2394bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2395 SIAtomicScope Scope,
2396 SIAtomicAddrSpace AddrSpace,
2397 Position Pos) const {
2398 if (!InsertCacheInv)
2399 return false;
2400
2401 MachineBasicBlock &MBB = *MI->getParent();
2402 DebugLoc DL = MI->getDebugLoc();
2403
2404 /// The scratch address space does not need the global memory cache
2405 /// to be flushed as all memory operations by the same thread are
2406 /// sequentially consistent, and no other thread can access scratch
2407 /// memory.
2408
2409 /// Other address spaces do not have a cache.
2410 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2411 return false;
2412
2413 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2414 switch (Scope) {
2415 case SIAtomicScope::SYSTEM:
2416 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2417 break;
2418 case SIAtomicScope::AGENT:
2419 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2420 break;
2421 case SIAtomicScope::WORKGROUP:
2422 // In WGP mode the waves of a work-group can be executing on either CU of
2423 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2424 // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2425 // the L0 does not need to be invalidated.
2426 if (ST.isCuModeEnabled())
2427 return false;
2428
2429 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2430 break;
2431 case SIAtomicScope::WAVEFRONT:
2432 case SIAtomicScope::SINGLETHREAD:
2433 // No cache to invalidate.
2434 return false;
2435 default:
2436 llvm_unreachable("Unsupported synchronization scope");
2437 }
2438
2439 if (Pos == Position::AFTER)
2440 ++MI;
2441
2442 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GLOBAL_INV)).addImm(Val: ScopeImm);
2443
2444 if (Pos == Position::AFTER)
2445 --MI;
2446
2447 return true;
2448}
2449
2450bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2451 SIAtomicScope Scope,
2452 SIAtomicAddrSpace AddrSpace,
2453 bool IsCrossAddrSpaceOrdering,
2454 Position Pos) const {
2455 MachineBasicBlock &MBB = *MI->getParent();
2456 DebugLoc DL = MI->getDebugLoc();
2457
2458 // The scratch address space does not need the global memory cache
2459 // writeback as all memory operations by the same thread are
2460 // sequentially consistent, and no other thread can access scratch
2461 // memory.
2462
2463 // Other address spaces do not have a cache.
2464 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2465 return false;
2466
2467 if (Pos == Position::AFTER)
2468 ++MI;
2469
2470 // global_wb is only necessary at system scope for gfx120x targets.
2471 //
2472 // Emitting it for lower scopes is a slow no-op, so we omit it
2473 // for performance.
2474 switch (Scope) {
2475 case SIAtomicScope::SYSTEM:
2476 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GLOBAL_WB))
2477 .addImm(Val: AMDGPU::CPol::SCOPE_SYS);
2478 break;
2479 case SIAtomicScope::AGENT:
2480 case SIAtomicScope::WORKGROUP:
2481 // No WB necessary, but we still have to wait.
2482 break;
2483 case SIAtomicScope::WAVEFRONT:
2484 case SIAtomicScope::SINGLETHREAD:
2485 // No WB or wait necessary here.
2486 return false;
2487 default:
2488 llvm_unreachable("Unsupported synchronization scope");
2489 }
2490
2491 if (Pos == Position::AFTER)
2492 --MI;
2493
2494 // We always have to wait for previous memory operations (load/store) to
2495 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2496 // we of course need to wait for that as well.
2497 insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE,
2498 IsCrossAddrSpaceOrdering, Pos, Order: AtomicOrdering::Release);
2499
2500 return true;
2501}
2502
2503bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2504 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2505 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2506
2507 // Only handle load and store, not atomic read-modify-write instructions.
2508 assert(MI->mayLoad() ^ MI->mayStore());
2509
2510 // Only update load and store, not LLVM IR atomic read-modify-write
2511 // instructions. The latter are always marked as volatile, so the volatile
2512 // flag cannot sensibly be honored for them without pessimizing all atomics.
2513 // They also do not support the nontemporal attribute.
2514 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2515
2516 bool Changed = false;
2517
2518 if (IsLastUse) {
2519 // Set last-use hint.
2520 Changed |= setTH(MI, Value: AMDGPU::CPol::TH_LU);
2521 } else if (IsNonTemporal) {
2522 // Set non-temporal hint for all cache levels.
2523 Changed |= setTH(MI, Value: AMDGPU::CPol::TH_NT);
2524 }
2525
2526 if (IsVolatile) {
2527 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SYS);
2528
2529 if (Op == SIMemOp::STORE)
2530 Changed |= insertWaitsBeforeSystemScopeStore(MI);
2531
2532 // Ensure operation has completed at system scope to cause all volatile
2533 // operations to be visible outside the program in a global order. Do not
2534 // request cross address space as only the global address space can be
2535 // observable outside the program, so no need to cause a waitcnt for LDS
2536 // address space operations.
2537 Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
2538 Pos: Position::AFTER, Order: AtomicOrdering::Unordered);
2539 }
2540
2541 return Changed;
2542}
2543
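// If the store has already been given a system scope cache policy, insert the
// waits required before a system-scope store.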
2544bool SIGfx12CacheControl::expandSystemScopeStore(
2545 MachineBasicBlock::iterator &MI) const {
2546 MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol);
2547 if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2548 return insertWaitsBeforeSystemScopeStore(MI);
2549
2550 return false;
2551}
2552
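// Set the scope field of an atomic instruction's cache policy operand to match
// the requested atomic scope for global memory accesses.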
2553bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2554 SIAtomicScope Scope,
2555 SIAtomicAddrSpace AddrSpace) const {
2556 bool Changed = false;
2557
2558 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2559 switch (Scope) {
2560 case SIAtomicScope::SYSTEM:
2561 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SYS);
2562 break;
2563 case SIAtomicScope::AGENT:
2564 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_DEV);
2565 break;
2566 case SIAtomicScope::WORKGROUP:
2567 // In workgroup mode, SCOPE_SE is needed as waves can execute on
2568 // different CUs that access different L0s.
2569 if (!ST.isCuModeEnabled())
2570 Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SE);
2571 break;
2572 case SIAtomicScope::WAVEFRONT:
2573 case SIAtomicScope::SINGLETHREAD:
2574 // No cache to bypass.
2575 break;
2576 default:
2577 llvm_unreachable("Unsupported synchronization scope");
2578 }
2579 }
2580
2581 // The scratch address space does not need the global memory caches
2582 // to be bypassed as all memory operations by the same thread are
2583 // sequentially consistent, and no other thread can access scratch
2584 // memory.
2585
2586 // Other address spaces do not have a cache.
2587
2588 return Changed;
2589}
2590
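// Erase the atomic pseudo instructions (e.g. ATOMIC_FENCE) collected during
// expansion; they have no machine encoding once the required cache operations
// and waits have been inserted.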
2591bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2592 if (AtomicPseudoMIs.empty())
2593 return false;
2594
2595 for (auto &MI : AtomicPseudoMIs)
2596 MI->eraseFromParent();
2597
2598 AtomicPseudoMIs.clear();
2599 return true;
2600}
2601
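// Expand an atomic load by inserting the cache bypass, waits, and acquire
// invalidates required by its ordering and scope; for non-atomic loads apply
// the volatile, nontemporal, and last-use cache modifiers instead.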
2602bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2603 MachineBasicBlock::iterator &MI) {
2604 assert(MI->mayLoad() && !MI->mayStore());
2605
2606 bool Changed = false;
2607
2608 if (MOI.isAtomic()) {
2609 const AtomicOrdering Order = MOI.getOrdering();
2610 if (Order == AtomicOrdering::Monotonic ||
2611 Order == AtomicOrdering::Acquire ||
2612 Order == AtomicOrdering::SequentiallyConsistent) {
2613 Changed |= CC->enableLoadCacheBypass(MI, Scope: MOI.getScope(),
2614 AddrSpace: MOI.getOrderingAddrSpace());
2615 }
2616
2617 if (Order == AtomicOrdering::SequentiallyConsistent)
2618 Changed |= CC->insertWait(MI, Scope: MOI.getScope(), AddrSpace: MOI.getOrderingAddrSpace(),
2619 Op: SIMemOp::LOAD | SIMemOp::STORE,
2620 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2621 Pos: Position::BEFORE, Order);
2622
2623 if (Order == AtomicOrdering::Acquire ||
2624 Order == AtomicOrdering::SequentiallyConsistent) {
2625 Changed |= CC->insertWait(
2626 MI, Scope: MOI.getScope(), AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::LOAD,
2627 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), Pos: Position::AFTER, Order);
2628 Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(),
2629 AddrSpace: MOI.getOrderingAddrSpace(),
2630 Pos: Position::AFTER);
2631 }
2632
2633 return Changed;
2634 }
2635
2636 // Atomic instructions already bypass caches to the scope specified by the
2637 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2638 // instructions need additional treatment.
2639 Changed |= CC->enableVolatileAndOrNonTemporal(
2640 MI, AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::LOAD, IsVolatile: MOI.isVolatile(),
2641 IsNonTemporal: MOI.isNonTemporal(), IsLastUse: MOI.isLastUse());
2642
2643 return Changed;
2644}
2645
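// Expand an atomic store by inserting the cache bypass and release operations
// required by its ordering and scope; for non-atomic stores apply the volatile
// and nontemporal cache modifiers instead.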
2646bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2647 MachineBasicBlock::iterator &MI) {
2648 assert(!MI->mayLoad() && MI->mayStore());
2649
2650 bool Changed = false;
2651
2652 if (MOI.isAtomic()) {
2653 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2654 MOI.getOrdering() == AtomicOrdering::Release ||
2655 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2656 Changed |= CC->enableStoreCacheBypass(MI, Scope: MOI.getScope(),
2657 AddrSpace: MOI.getOrderingAddrSpace());
2658 }
2659
2660 if (MOI.getOrdering() == AtomicOrdering::Release ||
2661 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2662 Changed |= CC->insertRelease(MI, Scope: MOI.getScope(),
2663 AddrSpace: MOI.getOrderingAddrSpace(),
2664 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2665 Pos: Position::BEFORE);
2666
2667 return Changed;
2668 }
2669
2670 // Atomic instructions already bypass caches to the scope specified by the
2671 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2672 // need additional treatment.
2673 Changed |= CC->enableVolatileAndOrNonTemporal(
2674 MI, AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::STORE, IsVolatile: MOI.isVolatile(),
2675 IsNonTemporal: MOI.isNonTemporal());
2676
2677 // GFX12 specific: scope (the desired coherence domain in the cache
2678 // hierarchy) is an instruction field; do not confuse it with atomic scope.
2679 Changed |= CC->expandSystemScopeStore(MI);
2680 return Changed;
2681}
2682
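// Expand an ATOMIC_FENCE pseudo into the waits, release, and acquire
// operations required by its ordering and scope, and queue the pseudo itself
// for removal.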
2683bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2684 MachineBasicBlock::iterator &MI) {
2685 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2686
2687 AtomicPseudoMIs.push_back(x: MI);
2688 bool Changed = false;
2689
2690 // Refine fenced address space based on MMRAs.
2691 //
2692 // TODO: Should we support this MMRA on other atomic operations?
2693 auto OrderingAddrSpace =
2694 getFenceAddrSpaceMMRA(MI: *MI, Default: MOI.getOrderingAddrSpace());
2695
2696 if (MOI.isAtomic()) {
2697 const AtomicOrdering Order = MOI.getOrdering();
2698 if (Order == AtomicOrdering::Acquire) {
2699 Changed |= CC->insertWait(
2700 MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE,
2701 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), Pos: Position::BEFORE, Order);
2702 }
2703
2704 if (Order == AtomicOrdering::Release ||
2705 Order == AtomicOrdering::AcquireRelease ||
2706 Order == AtomicOrdering::SequentiallyConsistent)
2707 /// TODO: This relies on a barrier always generating a waitcnt
2708 /// for LDS to ensure it is not reordered with the completion of
2709 /// the preceding LDS operations. If the barrier had a memory
2710 /// ordering and memory scope, then the library would not need to
2711 /// generate a fence. Could add support in this file for
2712 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2713 /// adding S_WAITCNT before an S_BARRIER.
2714 Changed |= CC->insertRelease(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace,
2715 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2716 Pos: Position::BEFORE);
2717
2718 // TODO: If both release and invalidate are happening they could be combined
2719 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2720 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2721 // track cache invalidate and write back instructions.
2722
2723 if (Order == AtomicOrdering::Acquire ||
2724 Order == AtomicOrdering::AcquireRelease ||
2725 Order == AtomicOrdering::SequentiallyConsistent)
2726 Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace,
2727 Pos: Position::BEFORE);
2728
2729 return Changed;
2730 }
2731
2732 return Changed;
2733}
2734
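// Expand an atomic compare-and-swap or read-modify-write by inserting the
// cache bypass, release, wait, and acquire operations required by its
// ordering, failure ordering, and scope.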
2735bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2736 MachineBasicBlock::iterator &MI) {
2737 assert(MI->mayLoad() && MI->mayStore());
2738
2739 bool Changed = false;
2740
2741 if (MOI.isAtomic()) {
2742 const AtomicOrdering Order = MOI.getOrdering();
2743 if (Order == AtomicOrdering::Monotonic ||
2744 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2745 Order == AtomicOrdering::AcquireRelease ||
2746 Order == AtomicOrdering::SequentiallyConsistent) {
2747 Changed |= CC->enableRMWCacheBypass(MI, Scope: MOI.getScope(),
2748 AddrSpace: MOI.getInstrAddrSpace());
2749 }
2750
2751 if (Order == AtomicOrdering::Release ||
2752 Order == AtomicOrdering::AcquireRelease ||
2753 Order == AtomicOrdering::SequentiallyConsistent ||
2754 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2755 Changed |= CC->insertRelease(MI, Scope: MOI.getScope(),
2756 AddrSpace: MOI.getOrderingAddrSpace(),
2757 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2758 Pos: Position::BEFORE);
2759
2760 if (Order == AtomicOrdering::Acquire ||
2761 Order == AtomicOrdering::AcquireRelease ||
2762 Order == AtomicOrdering::SequentiallyConsistent ||
2763 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2764 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2765 Changed |= CC->insertWait(
2766 MI, Scope: MOI.getScope(), AddrSpace: MOI.getInstrAddrSpace(),
2767 Op: isAtomicRet(MI: *MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2768 IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), Pos: Position::AFTER, Order);
2769 Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(),
2770 AddrSpace: MOI.getOrderingAddrSpace(),
2771 Pos: Position::AFTER);
2772 }
2773
2774 return Changed;
2775 }
2776
2777 return Changed;
2778}
2779
2780bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2781 const MachineModuleInfo &MMI =
2782 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2783 return SIMemoryLegalizer(MMI).run(MF);
2784}
2785
2786PreservedAnalyses
2787SIMemoryLegalizerPass::run(MachineFunction &MF,
2788 MachineFunctionAnalysisManager &MFAM) {
2789 auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(IR&: MF)
2790 .getCachedResult<MachineModuleAnalysis>(
2791 IR&: *MF.getFunction().getParent());
2792 assert(MMI && "MachineModuleAnalysis must be available");
2793 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2794 return PreservedAnalyses::all();
2795 return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
2796}
2797
2798bool SIMemoryLegalizer::run(MachineFunction &MF) {
2799 bool Changed = false;
2800
2801 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
2802 CC = SICacheControl::create(ST: MF.getSubtarget<GCNSubtarget>());
2803
2804 for (auto &MBB : MF) {
2805 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2806
2807 // Unbundle instructions after the post-RA scheduler.
2808 if (MI->isBundle() && MI->mayLoadOrStore()) {
2809 MachineBasicBlock::instr_iterator II(MI->getIterator());
2810 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2811 I != E && I->isBundledWithPred(); ++I) {
2812 I->unbundleFromPred();
2813 for (MachineOperand &MO : I->operands())
2814 if (MO.isReg())
2815 MO.setIsInternalRead(false);
2816 }
2817
2818 MI->eraseFromParent();
2819 MI = II->getIterator();
2820 }
2821
2822 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2823 continue;
2824
2825 if (const auto &MOI = MOA.getLoadInfo(MI))
2826 Changed |= expandLoad(MOI: *MOI, MI);
2827 else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2828 Changed |= expandStore(MOI: *MOI, MI);
2829 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2830 Changed |= expandAtomicFence(MOI: *MOI, MI);
2831 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2832 Changed |= expandAtomicCmpxchgOrRmw(MOI: *MOI, MI);
2833 }
2834 }
2835
2836 Changed |= removeAtomicPseudoMIs();
2837 return Changed;
2838}
2839
2840INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
2841
2842char SIMemoryLegalizerLegacy::ID = 0;
2843char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
2844
2845FunctionPass *llvm::createSIMemoryLegalizerPass() {
2846 return new SIMemoryLegalizerLegacy();
2847}
2848