// SIMemoryLegalizer.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp]

//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//
15
#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"
#include <algorithm>
#include <list>
#include <memory>
#include <optional>
#include <tuple>
31
32	using namespace llvm;
33	using namespace llvm::AMDGPU;
34
35	#define DEBUG_TYPE "si-memory-legalizer"
36	#define PASS_NAME "SI Memory Legalizer"
37
38	static cl::opt<bool> AmdgcnSkipCacheInvalidations(
39	"amdgcn-skip-cache-invalidations", cl::init(Val: false), cl::Hidden,
40	cl::desc ("Use this to skip inserting cache invalidating instructions."));
41
42	namespace {
43
44	LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
45
46	/// Memory operation flags. Can be ORed together.
47	enum class SIMemOp {
48	NONE = `0u`,
49	LOAD = `1u` << `0`,
50	STORE = `1u` << `1`,
51	LLVM_MARK_AS_BITMASK_ENUM(/ LargestFlag = / STORE)
52	};
53
/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};
60
/// The atomic synchronization scopes supported by the AMDGPU target.
///
/// Enumerators are declared from narrowest to widest scope; code in this file
/// relies on that order when clamping a scope with std::min.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
  AGENT,
  SYSTEM
};
71
72	/// The distinct address spaces supported by the AMDGPU target for
73	/// atomic memory operation. Can be ORed together.
74	enum class SIAtomicAddrSpace {
75	NONE = `0u`,
76	GLOBAL = `1u` << `0`,
77	LDS = `1u` << `1`,
78	SCRATCH = `1u` << `2`,
79	GDS = `1u` << `3`,
80	OTHER = `1u` << `4`,
81
82	/// The address spaces that can be accessed by a FLAT instruction.
83	FLAT = GLOBAL \| LDS \| SCRATCH,
84
85	/// The address spaces that support atomic instructions.
86	ATOMIC = GLOBAL \| LDS \| SCRATCH \| GDS,
87
88	/// All address spaces.
89	ALL = GLOBAL \| LDS \| SCRATCH \| GDS \| OTHER,
90
91	LLVM_MARK_AS_BITMASK_ENUM(/ LargestFlag = / ALL)
92	};
93
94	class SIMemOpInfo final {
95	private:
96
97	friend class SIMemOpAccess;
98
99	AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
100	AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
101	SIAtomicScope Scope = SIAtomicScope::SYSTEM;
102	SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
103	SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
104	bool IsCrossAddressSpaceOrdering = false;
105	bool IsVolatile = false;
106	bool IsNonTemporal = false;
107	bool IsLastUse = false;
108	bool IsCooperative = false;
109
110	// TODO: Should we assume Cooperative=true if no MMO is present?
111	SIMemOpInfo(
112	const GCNSubtarget &ST,
113	AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
114	SIAtomicScope Scope = SIAtomicScope::SYSTEM,
115	SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
116	SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
117	bool IsCrossAddressSpaceOrdering = true,
118	AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
119	bool IsVolatile = false, bool IsNonTemporal = false,
120	bool IsLastUse = false, bool IsCooperative = false)
121	: Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
122	OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
123	IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
124	IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
125	IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
126
127	if (Ordering == AtomicOrdering::NotAtomic) {
128	assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
129	assert(Scope == SIAtomicScope::NONE &&
130	OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
131	!IsCrossAddressSpaceOrdering &&
132	FailureOrdering == AtomicOrdering::NotAtomic);
133	return;
134	}
135
136	assert(Scope != SIAtomicScope::NONE &&
137	(OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
138	SIAtomicAddrSpace::NONE &&
139	(InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
140	SIAtomicAddrSpace::NONE);
141
142	// There is also no cross address space ordering if the ordering
143	// address space is the same as the instruction address space and
144	// only contains a single address space.
145	if ((OrderingAddrSpace == InstrAddrSpace) &&
146	isPowerOf2_32(Value: uint32_t(InstrAddrSpace)))
147	this->IsCrossAddressSpaceOrdering = false;
148
149	// Limit the scope to the maximum supported by the instruction's address
150	// spaces.
151	if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
152	SIAtomicAddrSpace::NONE) {
153	this->Scope = std::min(a: Scope, b: SIAtomicScope::SINGLETHREAD);
154	} else if ((InstrAddrSpace &
155	~(SIAtomicAddrSpace::SCRATCH \| SIAtomicAddrSpace::LDS)) ==
156	SIAtomicAddrSpace::NONE) {
157	this->Scope = std::min(a: Scope, b: SIAtomicScope::WORKGROUP);
158	} else if ((InstrAddrSpace &
159	~(SIAtomicAddrSpace::SCRATCH \| SIAtomicAddrSpace::LDS \|
160	SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
161	this->Scope = std::min(a: Scope, b: SIAtomicScope::AGENT);
162	}
163
164	// On targets that have no concept of a workgroup cluster, use
165	// AGENT scope as a conservatively correct alternative.
166	if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
167	this->Scope = SIAtomicScope::AGENT;
168	}
169
170	public:
171	/// \returns Atomic synchronization scope of the machine instruction used to
172	/// create this SIMemOpInfo.
173	SIAtomicScope getScope() const {
174	return Scope;
175	}
176
177	/// \returns Ordering constraint of the machine instruction used to
178	/// create this SIMemOpInfo.
179	AtomicOrdering getOrdering() const {
180	return Ordering;
181	}
182
183	/// \returns Failure ordering constraint of the machine instruction used to
184	/// create this SIMemOpInfo.
185	AtomicOrdering getFailureOrdering() const {
186	return FailureOrdering;
187	}
188
189	/// \returns The address spaces be accessed by the machine
190	/// instruction used to create this SIMemOpInfo.
191	SIAtomicAddrSpace getInstrAddrSpace() const {
192	return InstrAddrSpace;
193	}
194
195	/// \returns The address spaces that must be ordered by the machine
196	/// instruction used to create this SIMemOpInfo.
197	SIAtomicAddrSpace getOrderingAddrSpace() const {
198	return OrderingAddrSpace;
199	}
200
201	/// \returns Return true iff memory ordering of operations on
202	/// different address spaces is required.
203	bool getIsCrossAddressSpaceOrdering() const {
204	return IsCrossAddressSpaceOrdering;
205	}
206
207	/// \returns True if memory access of the machine instruction used to
208	/// create this SIMemOpInfo is volatile, false otherwise.
209	bool isVolatile() const {
210	return IsVolatile;
211	}
212
213	/// \returns True if memory access of the machine instruction used to
214	/// create this SIMemOpInfo is nontemporal, false otherwise.
215	bool isNonTemporal() const {
216	return IsNonTemporal;
217	}
218
219	/// \returns True if memory access of the machine instruction used to
220	/// create this SIMemOpInfo is last use, false otherwise.
221	bool isLastUse() const { return IsLastUse; }
222
223	/// \returns True if this is a cooperative load or store atomic.
224	bool isCooperative() const { return IsCooperative; }
225
226	/// \returns True if ordering constraint of the machine instruction used to
227	/// create this SIMemOpInfo is unordered or higher, false otherwise.
228	bool isAtomic() const {
229	return Ordering != AtomicOrdering::NotAtomic;
230	}
231
232	};
233
234	class SIMemOpAccess final {
235	private:
236	const AMDGPUMachineModuleInfo MMI = nullptr*;
237	const GCNSubtarget &ST;
238
239	/// Reports unsupported message \p Msg for \p MI to LLVM context.
240	void reportUnsupported(const MachineBasicBlock::iterator &MI,
241	const char Msg) const*;
242
243	/// Inspects the target synchronization scope \p SSID and determines
244	/// the SI atomic scope it corresponds to, the address spaces it
245	/// covers, and whether the memory ordering applies between address
246	/// spaces.
247	std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
248	toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
249
250	/// \return Return a bit set of the address spaces accessed by \p AS.
251	SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
252
253	/// \returns Info constructed from \p MI, which has at least machine memory
254	/// operand.
255	std::optional<SIMemOpInfo>
256	constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
257
258	public:
259	/// Construct class to support accessing the machine memory operands
260	/// of instructions in the machine function \p MF.
261	SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
262
263	/// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
264	std::optional<SIMemOpInfo>
265	getLoadInfo(const MachineBasicBlock::iterator &MI) const;
266
267	/// \returns Store info if \p MI is a store operation, "std::nullopt"
268	/// otherwise.
269	std::optional<SIMemOpInfo>
270	getStoreInfo(const MachineBasicBlock::iterator &MI) const;
271
272	/// \returns Atomic fence info if \p MI is an atomic fence operation,
273	/// "std::nullopt" otherwise.
274	std::optional<SIMemOpInfo>
275	getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
276
277	/// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
278	/// rmw operation, "std::nullopt" otherwise.
279	std::optional<SIMemOpInfo>
280	getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
281
282	/// \returns DMA to LDS info if \p MI is as a direct-to/from-LDS load/store,
283	/// along with an indication of whether this is a load or store. If it is not
284	/// a direct-to-LDS operation, returns std::nullopt.
285	std::optional<SIMemOpInfo>
286	getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
287	};
288
289	class SICacheControl {
290	protected:
291
292	/// AMDGPU subtarget info.
293	const GCNSubtarget &ST;
294
295	/// Instruction info.
296	const SIInstrInfo TII = nullptr*;
297
298	IsaVersion IV;
299
300	/// Whether to insert cache invalidating instructions.
301	bool InsertCacheInv;
302
303	SICacheControl(const GCNSubtarget &ST);
304
305	/// Sets CPol \p Bits to "true" if present in instruction \p MI.
306	/// \returns Returns true if \p MI is modified, false otherwise.
307	bool enableCPolBits(const MachineBasicBlock::iterator MI,
308	unsigned Bits) const;
309
310	/// Check if any atomic operation on AS can affect memory accessible via the
311	/// global address space.
312	bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
313
314	public:
315	using CPol = AMDGPU::CPol::CPol;
316
317	/// Create a cache control for the subtarget \p ST.
318	static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
319
320	/// Update \p MI memory load instruction to bypass any caches up to
321	/// the \p Scope memory scope for address spaces \p
322	/// AddrSpace. Return true iff the instruction was modified.
323	virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
324	SIAtomicScope Scope,
325	SIAtomicAddrSpace AddrSpace) const = `0`;
326
327	/// Update \p MI memory store instruction to bypass any caches up to
328	/// the \p Scope memory scope for address spaces \p
329	/// AddrSpace. Return true iff the instruction was modified.
330	virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
331	SIAtomicScope Scope,
332	SIAtomicAddrSpace AddrSpace) const = `0`;
333
334	/// Update \p MI memory read-modify-write instruction to bypass any caches up
335	/// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
336	/// iff the instruction was modified.
337	virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
338	SIAtomicScope Scope,
339	SIAtomicAddrSpace AddrSpace) const = `0`;
340
341	/// Update \p MI memory instruction of kind \p Op associated with address
342	/// spaces \p AddrSpace to indicate it is volatile and/or
343	/// nontemporal/last-use. Return true iff the instruction was modified.
344	virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
345	SIAtomicAddrSpace AddrSpace,
346	SIMemOp Op, bool IsVolatile,
347	bool IsNonTemporal,
348	bool IsLastUse = false) const = `0`;
349
350	/// Add final touches to a `mayStore` instruction \p MI, which may be a
351	/// Store or RMW instruction.
352	/// FIXME: This takes a MI because iterators aren't handled properly. When
353	/// this is called, they often point to entirely different insts. Thus we back
354	/// up the inst early and pass it here instead.
355	virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
356	return false;
357	};
358
359	/// Handle cooperative load/store atomics.
360	virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
361	llvm_unreachable(
362	"cooperative atomics are not available on this architecture");
363	}
364
365	/// Inserts any necessary instructions at position \p Pos relative
366	/// to instruction \p MI to ensure memory instructions before \p Pos of kind
367	/// \p Op associated with address spaces \p AddrSpace have completed. Used
368	/// between memory instructions to enforce the order they become visible as
369	/// observed by other memory instructions executing in memory scope \p Scope.
370	/// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
371	/// address spaces. If \p AtomicsOnly is true, only insert waits for counters
372	/// that are used by atomic instructions.
373	/// Returns true iff any instructions inserted.
374	virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
375	SIAtomicAddrSpace AddrSpace, SIMemOp Op,
376	bool IsCrossAddrSpaceOrdering, Position Pos,
377	AtomicOrdering Order, bool AtomicsOnly) const = `0`;
378
379	/// Inserts any necessary instructions at position \p Pos relative to
380	/// instruction \p MI to ensure any subsequent memory instructions of this
381	/// thread with address spaces \p AddrSpace will observe the previous memory
382	/// operations by any thread for memory scopes up to memory scope \p Scope .
383	/// Returns true iff any instructions inserted.
384	virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
385	SIAtomicScope Scope,
386	SIAtomicAddrSpace AddrSpace,
387	Position Pos) const = `0`;
388
389	/// Inserts any necessary instructions at position \p Pos relative to
390	/// instruction \p MI to ensure previous memory instructions by this thread
391	/// with address spaces \p AddrSpace have completed and can be observed by
392	/// subsequent memory instructions by any thread executing in memory scope \p
393	/// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
394	/// between address spaces. Returns true iff any instructions inserted.
395	virtual bool insertRelease(MachineBasicBlock::iterator &MI,
396	SIAtomicScope Scope,
397	SIAtomicAddrSpace AddrSpace,
398	bool IsCrossAddrSpaceOrdering,
399	Position Pos) const = `0`;
400
401	/// Handle operations that are considered non-volatile.
402	/// See \ref isNonVolatileMemoryAccess
403	virtual bool handleNonVolatile(MachineInstr &MI) const { return false; }
404
405	/// Virtual destructor to allow derivations to be deleted.
406	virtual ~SICacheControl() = default;
407	};
408
409	/// Generates code sequences for the memory model of all GFX targets below
410	/// GFX10.
411	class SIGfx6CacheControl final : public SICacheControl {
412	public:
413
414	SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl (ST) {}
415
416	bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
417	SIAtomicScope Scope,
418	SIAtomicAddrSpace AddrSpace) const override;
419
420	bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
421	SIAtomicScope Scope,
422	SIAtomicAddrSpace AddrSpace) const override;
423
424	bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
425	SIAtomicScope Scope,
426	SIAtomicAddrSpace AddrSpace) const override;
427
428	bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
429	SIAtomicAddrSpace AddrSpace, SIMemOp Op,
430	bool IsVolatile, bool IsNonTemporal,
431	bool IsLastUse) const override;
432
433	bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
434	SIAtomicAddrSpace AddrSpace, SIMemOp Op,
435	bool IsCrossAddrSpaceOrdering, Position Pos,
436	AtomicOrdering Order, bool AtomicsOnly) const override;
437
438	bool insertAcquire(MachineBasicBlock::iterator &MI,
439	SIAtomicScope Scope,
440	SIAtomicAddrSpace AddrSpace,
441	Position Pos) const override;
442
443	bool insertRelease(MachineBasicBlock::iterator &MI,
444	SIAtomicScope Scope,
445	SIAtomicAddrSpace AddrSpace,
446	bool IsCrossAddrSpaceOrdering,
447	Position Pos) const override;
448	};
449
450	/// Generates code sequences for the memory model of GFX10/11.
451	class SIGfx10CacheControl final : public SICacheControl {
452	public:
453	SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl (ST) {}
454
455	bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
456	SIAtomicScope Scope,
457	SIAtomicAddrSpace AddrSpace) const override;
458
459	bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
460	SIAtomicScope Scope,
461	SIAtomicAddrSpace AddrSpace) const override {
462	return false;
463	}
464
465	bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
466	SIAtomicScope Scope,
467	SIAtomicAddrSpace AddrSpace) const override {
468	return false;
469	}
470
471	bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
472	SIAtomicAddrSpace AddrSpace, SIMemOp Op,
473	bool IsVolatile, bool IsNonTemporal,
474	bool IsLastUse) const override;
475
476	bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
477	SIAtomicAddrSpace AddrSpace, SIMemOp Op,
478	bool IsCrossAddrSpaceOrdering, Position Pos,
479	AtomicOrdering Order, bool AtomicsOnly) const override;
480
481	bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
482	SIAtomicAddrSpace AddrSpace, Position Pos) const override;
483
484	bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
485	SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
486	Position Pos) const override {
487	return insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD \| SIMemOp::STORE,
488	IsCrossAddrSpaceOrdering, Pos, Order: AtomicOrdering::Release,
489	/AtomicsOnly=/false);
490	}
491	};
492
493	class SIGfx12CacheControl final : public SICacheControl {
494	protected:
495	// Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
496	// \returns Returns true if \p MI is modified, false otherwise.
497	bool setTH(const MachineBasicBlock::iterator MI,
498	AMDGPU::CPol::CPol Value) const;
499
500	// Sets Scope policy to \p Value if CPol operand is present in instruction \p
501	// MI. \returns Returns true if \p MI is modified, false otherwise.
502	bool setScope(const MachineBasicBlock::iterator MI,
503	AMDGPU::CPol::CPol Value) const;
504
505	// Stores with system scope (SCOPE_SYS) need to wait for:
506	// - loads or atomics(returning) - wait for {LOAD\|SAMPLE\|BVH\|KM}CNT==0
507	// - non-returning-atomics - wait for STORECNT==0
508	// TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
509	// since it does not distinguish atomics-with-return from regular stores.
510	// There is no need to wait if memory is cached (mtype != UC).
511	bool
512	insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
513
514	bool setAtomicScope(const MachineBasicBlock::iterator &MI,
515	SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
516
517	public:
518	SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl (ST) {
519	// GFX120x and GFX125x memory models greatly overlap, and in some cases
520	// the behavior is the same if assuming GFX120x in CU mode.
521	assert(!ST.hasGFX1250Insts() \|\| ST.hasGFX13Insts() \|\| ST.isCuModeEnabled());
522	}
523
524	bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
525	SIAtomicAddrSpace AddrSpace, SIMemOp Op,
526	bool IsCrossAddrSpaceOrdering, Position Pos,
527	AtomicOrdering Order, bool AtomicsOnly) const override;
528
529	bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
530	SIAtomicAddrSpace AddrSpace, Position Pos) const override;
531
532	bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
533	SIAtomicAddrSpace AddrSpace, SIMemOp Op,
534	bool IsVolatile, bool IsNonTemporal,
535	bool IsLastUse) const override;
536
537	bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
538
539	bool handleCooperativeAtomic(MachineInstr &MI) const override;
540
541	bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
542	SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
543	Position Pos) const override;
544
545	bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
546	SIAtomicScope Scope,
547	SIAtomicAddrSpace AddrSpace) const override {
548	return setAtomicScope(MI, Scope, AddrSpace);
549	}
550
551	bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
552	SIAtomicScope Scope,
553	SIAtomicAddrSpace AddrSpace) const override {
554	return setAtomicScope(MI, Scope, AddrSpace);
555	}
556
557	bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
558	SIAtomicScope Scope,
559	SIAtomicAddrSpace AddrSpace) const override {
560	return setAtomicScope(MI, Scope, AddrSpace);
561	}
562
563	bool handleNonVolatile(MachineInstr &MI) const override;
564	};
565
566	class SIMemoryLegalizer final {
567	private:
568	const MachineModuleInfo &MMI;
569	/// Cache Control.
570	std::unique_ptr<SICacheControl> CC = nullptr;
571
572	/// List of atomic pseudo instructions.
573	std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
574
575	/// Return true iff instruction \p MI is a atomic instruction that
576	/// returns a result.
577	bool isAtomicRet(const MachineInstr &MI) const {
578	return SIInstrInfo::isAtomicRet(MI);
579	}
580
581	/// Removes all processed atomic pseudo instructions from the current
582	/// function. Returns true if current function is modified, false otherwise.
583	bool removeAtomicPseudoMIs();
584
585	/// Expands load operation \p MI. Returns true if instructions are
586	/// added/deleted or \p MI is modified, false otherwise.
587	bool expandLoad(const SIMemOpInfo &MOI,
588	MachineBasicBlock::iterator &MI);
589	/// Expands store operation \p MI. Returns true if instructions are
590	/// added/deleted or \p MI is modified, false otherwise.
591	bool expandStore(const SIMemOpInfo &MOI,
592	MachineBasicBlock::iterator &MI);
593	/// Expands atomic fence operation \p MI. Returns true if
594	/// instructions are added/deleted or \p MI is modified, false otherwise.
595	bool expandAtomicFence(const SIMemOpInfo &MOI,
596	MachineBasicBlock::iterator &MI);
597	/// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
598	/// instructions are added/deleted or \p MI is modified, false otherwise.
599	bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
600	MachineBasicBlock::iterator &MI);
601	/// Expands LDS DMA operation \p MI. Returns true if instructions are
602	/// added/deleted or \p MI is modified, false otherwise.
603	bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
604
605	public:
606	SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
607	bool run(MachineFunction &MF);
608	};
609
610	class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
611	public:
612	static char ID;
613
614	SIMemoryLegalizerLegacy() : MachineFunctionPass (ID) {}
615
616	void getAnalysisUsage(AnalysisUsage &AU) const override {
617	AU.setPreservesCFG();
618	MachineFunctionPass::getAnalysisUsage(AU);
619	}
620
621	StringRef getPassName() const override {
622	return PASS_NAME;
623	}
624
625	bool runOnMachineFunction(MachineFunction &MF) override;
626	};
627
628	static const StringMap<SIAtomicAddrSpace> ASNames = {{
629	{"global", SIAtomicAddrSpace::GLOBAL},
630	{"local", SIAtomicAddrSpace::LDS},
631	}};
632
633	void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
634	const MachineFunction *MF = MI.getMF();
635	const Function &Fn = MF->getFunction();
636	SmallString<`128`> Str;
637	raw_svector_ostream OS(Str);
638	OS << "unknown address space '" << AS << "'; expected one of ";
639	ListSeparator LS;
640	for (const auto &[Name, Val] : ASNames)
641	OS << LS << `'\''` << Name << `'\''`;
642	Fn.getContext().diagnose(
643	DI: DiagnosticInfoUnsupported (Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
644	}
645
646	/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
647	/// If this tag isn't present, or if it has no meaningful values, returns
648	/// \p none, otherwise returns the address spaces specified by the MD.
649	static std::optional<SIAtomicAddrSpace>
650	getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
651	static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
652
653	auto MMRA = MMRAMetadata (MI.getMMRAMetadata());
654	if (!MMRA)
655	return std::nullopt;
656
657	SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
658	for (const auto &[Prefix, Suffix] : MMRA) {
659	if (Prefix != FenceASPrefix)
660	continue;
661
662	if (auto It = ASNames.find(Key: Suffix); It != ASNames.end())
663	Result \|= It ->second;
664	else
665	diagnoseUnknownMMRAASName(MI, AS: Suffix);
666	}
667
668	if (Result == SIAtomicAddrSpace::NONE)
669	return std::nullopt;
670
671	return Result;
672	}
673
674	} // end anonymous namespace
675
676	void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
677	const char Msg) const* {
678	const Function &Func = MI ->getMF()->getFunction();
679	Func.getContext().diagnose(
680	DI: DiagnosticInfoUnsupported (Func, Msg, MI ->getDebugLoc()));
681	}
682
683	std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
684	SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
685	SIAtomicAddrSpace InstrAddrSpace) const {
686	if (SSID == SyncScope::System)
687	return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
688	if (SSID == MMI->getAgentSSID())
689	return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
690	if (SSID == MMI->getClusterSSID())
691	return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
692	if (SSID == MMI->getWorkgroupSSID())
693	return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
694	true);
695	if (SSID == MMI->getWavefrontSSID())
696	return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
697	true);
698	if (SSID == SyncScope::SingleThread)
699	return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
700	true);
701	if (SSID == MMI->getSystemOneAddressSpaceSSID())
702	return std::tuple(SIAtomicScope::SYSTEM,
703	SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
704	if (SSID == MMI->getAgentOneAddressSpaceSSID())
705	return std::tuple(SIAtomicScope::AGENT,
706	SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
707	if (SSID == MMI->getClusterOneAddressSpaceSSID())
708	return std::tuple(SIAtomicScope::CLUSTER,
709	SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
710	if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
711	return std::tuple(SIAtomicScope::WORKGROUP,
712	SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
713	if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
714	return std::tuple(SIAtomicScope::WAVEFRONT,
715	SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
716	if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
717	return std::tuple(SIAtomicScope::SINGLETHREAD,
718	SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
719	return std::nullopt;
720	}
721
722	SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
723	if (AS == AMDGPUAS::FLAT_ADDRESS)
724	return SIAtomicAddrSpace::FLAT;
725	if (AS == AMDGPUAS::GLOBAL_ADDRESS)
726	return SIAtomicAddrSpace::GLOBAL;
727	if (AS == AMDGPUAS::LOCAL_ADDRESS)
728	return SIAtomicAddrSpace::LDS;
729	if (AS == AMDGPUAS::PRIVATE_ADDRESS)
730	return SIAtomicAddrSpace::SCRATCH;
731	if (AS == AMDGPUAS::REGION_ADDRESS)
732	return SIAtomicAddrSpace::GDS;
733	if (AS == AMDGPUAS::BUFFER_FAT_POINTER \|\| AS == AMDGPUAS::BUFFER_RESOURCE \|\|
734	AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
735	return SIAtomicAddrSpace::GLOBAL;
736
737	return SIAtomicAddrSpace::OTHER;
738	}
739
740	SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
741	const GCNSubtarget &ST)
742	: MMI(&MMI_), ST(ST) {}
743
744	std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
745	const MachineBasicBlock::iterator &MI) const {
746	assert(MI->getNumMemOperands() > `0`);
747
748	SyncScope::ID SSID = SyncScope::SingleThread;
749	AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
750	AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
751	SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
752	bool IsNonTemporal = true;
753	bool IsVolatile = false;
754	bool IsLastUse = false;
755	bool IsCooperative = false;
756
757	// Validator should check whether or not MMOs cover the entire set of
758	// locations accessed by the memory instruction.
759	for (const auto &MMO : MI ->memoperands()) {
760	IsNonTemporal &= MMO->isNonTemporal();
761	IsVolatile \|= MMO->isVolatile();
762	IsLastUse \|= MMO->getFlags() & MOLastUse;
763	IsCooperative \|= MMO->getFlags() & MOCooperative;
764	InstrAddrSpace \|=
765	toSIAtomicAddrSpace(AS: MMO->getPointerInfo().getAddrSpace());
766	AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
767	if (OpOrdering != AtomicOrdering::NotAtomic) {
768	const auto &IsSyncScopeInclusion =
769	MMI->isSyncScopeInclusion(A: SSID, B: MMO->getSyncScopeID());
770	if (!IsSyncScopeInclusion) {
771	reportUnsupported(MI,
772	Msg: "Unsupported non-inclusive atomic synchronization scope");
773	return std::nullopt;
774	}
775
776	SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
777	Ordering = getMergedAtomicOrdering(AO: Ordering, Other: OpOrdering);
778	assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
779	MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
780	FailureOrdering =
781	getMergedAtomicOrdering(AO: FailureOrdering, Other: MMO->getFailureOrdering());
782	}
783	}
784
785	// FIXME: The MMO of buffer atomic instructions does not always have an atomic
786	// ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
787	// here, but the lowering should really be cleaned up at some point.
788	if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(MI: *MI) &&
789	SIInstrInfo::isAtomic(MI: *MI) && Ordering == AtomicOrdering::NotAtomic)
790	Ordering = AtomicOrdering::Monotonic;
791
792	SIAtomicScope Scope = SIAtomicScope::NONE;
793	SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
794	bool IsCrossAddressSpaceOrdering = false;
795	if (Ordering != AtomicOrdering::NotAtomic) {
796	auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
797	if (!ScopeOrNone) {
798	reportUnsupported(MI, Msg: "Unsupported atomic synchronization scope");
799	return std::nullopt;
800	}
801	std::tie(args&: Scope, args&: OrderingAddrSpace, args&: IsCrossAddressSpaceOrdering) =
802	*ScopeOrNone;
803	if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) \|\|
804	((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) \|\|
805	((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
806	reportUnsupported(MI, Msg: "Unsupported atomic address space");
807	return std::nullopt;
808	}
809	}
810	return SIMemOpInfo (ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
811	IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
812	IsNonTemporal, IsLastUse, IsCooperative);
813	}
814
815	std::optional<SIMemOpInfo>
816	SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
817	assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
818
819	if (!(MI ->mayLoad() && !MI ->mayStore()))
820	return std::nullopt;
821
822	// Be conservative if there are no memory operands.
823	if (MI ->getNumMemOperands() == `0`)
824	return SIMemOpInfo (ST);
825
826	return constructFromMIWithMMO(MI);
827	}
828
829	std::optional<SIMemOpInfo>
830	SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
831	assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
832
833	if (!(!MI ->mayLoad() && MI ->mayStore()))
834	return std::nullopt;
835
836	// Be conservative if there are no memory operands.
837	if (MI ->getNumMemOperands() == `0`)
838	return SIMemOpInfo (ST);
839
840	return constructFromMIWithMMO(MI);
841	}
842
843	std::optional<SIMemOpInfo>
844	SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
845	assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
846
847	if (MI ->getOpcode() != AMDGPU::ATOMIC_FENCE)
848	return std::nullopt;
849
850	AtomicOrdering Ordering =
851	static_cast<AtomicOrdering>(MI ->getOperand(i: `0`).getImm());
852
853	SyncScope::ID SSID = static_cast<SyncScope::ID>(MI ->getOperand(i: `1`).getImm());
854	auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace: SIAtomicAddrSpace::ATOMIC);
855	if (!ScopeOrNone) {
856	reportUnsupported(MI, Msg: "Unsupported atomic synchronization scope");
857	return std::nullopt;
858	}
859
860	SIAtomicScope Scope = SIAtomicScope::NONE;
861	SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
862	bool IsCrossAddressSpaceOrdering = false;
863	std::tie(args&: Scope, args&: OrderingAddrSpace, args&: IsCrossAddressSpaceOrdering) =
864	*ScopeOrNone;
865
866	if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
867	// We currently expect refineOrderingAS to be the only place that
868	// can refine the AS ordered by the fence.
869	// If that changes, we need to review the semantics of that function
870	// in case it needs to preserve certain address spaces.
871	reportUnsupported(MI, Msg: "Unsupported atomic address space");
872	return std::nullopt;
873	}
874
875	auto SynchronizeAS = getSynchronizeAddrSpaceMD(MI: *MI);
876	if (SynchronizeAS)
877	OrderingAddrSpace = *SynchronizeAS;
878
879	return SIMemOpInfo (ST, Ordering, Scope, OrderingAddrSpace,
880	SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
881	AtomicOrdering::NotAtomic);
882	}
883
884	std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
885	const MachineBasicBlock::iterator &MI) const {
886	assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
887
888	if (!(MI ->mayLoad() && MI ->mayStore()))
889	return std::nullopt;
890
891	// Be conservative if there are no memory operands.
892	if (MI ->getNumMemOperands() == `0`)
893	return SIMemOpInfo (ST);
894
895	return constructFromMIWithMMO(MI);
896	}
897
898	std::optional<SIMemOpInfo>
899	SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
900	assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
901
902	if (!SIInstrInfo::isLDSDMA(MI: *MI))
903	return std::nullopt;
904
905	return constructFromMIWithMMO(MI);
906	}
907
908	/// \returns true if \p MI has one or more MMO, and all of them are fit for
909	/// being marked as non-volatile. This means that either they are accessing the
910	/// constant address space, are accessing a known invariant memory location, or
911	/// that they are marked with the non-volatile metadata/MMO flag.
912	static bool isNonVolatileMemoryAccess(const MachineInstr &MI) {
913	if (MI.getNumMemOperands() == `0`)
914	return false;
915	return all_of(Range: MI.memoperands(), P: [&](const MachineMemOperand *MMO) {
916	return MMO->getFlags() & (MOThreadPrivate \| MachineMemOperand::MOInvariant);
917	});
918	}
919
920	SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
921	TII = ST.getInstrInfo();
922	IV = getIsaVersion(GPU: ST.getCPU());
923	InsertCacheInv = !AmdgcnSkipCacheInvalidations;
924	}
925
926	bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
927	unsigned Bits) const {
928	MachineOperand CPol = TII->getNamedOperand(MI&: MI, OperandName: AMDGPU::OpName::cpol);
929	if (!CPol)
930	return false;
931
932	CPol->setImm(CPol->getImm() \| Bits);
933	return true;
934	}
935
936	bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
937	assert((!ST.hasGloballyAddressableScratch() \|\|
938	(AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE \|\|
939	(AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) &&
940	"scratch instructions should already be replaced by flat "
941	"instructions if GloballyAddressableScratch is enabled");
942	return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
943	}
944
945	/ static /
946	std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
947	GCNSubtarget::Generation Generation = ST.getGeneration();
948	if (Generation < AMDGPUSubtarget::GFX10)
949	return std::make_unique<SIGfx6CacheControl>(args: ST);
950	if (Generation < AMDGPUSubtarget::GFX12)
951	return std::make_unique<SIGfx10CacheControl>(args: ST);
952	return std::make_unique<SIGfx12CacheControl>(args: ST);
953	}
954
955	bool SIGfx6CacheControl::enableLoadCacheBypass(
956	const MachineBasicBlock::iterator &MI,
957	SIAtomicScope Scope,
958	SIAtomicAddrSpace AddrSpace) const {
959	assert(MI->mayLoad() && !MI->mayStore());
960
961	if (!canAffectGlobalAddrSpace(AS: AddrSpace)) {
962	/// The scratch address space does not need the global memory caches
963	/// to be bypassed as all memory operations by the same thread are
964	/// sequentially consistent, and no other thread can access scratch
965	/// memory.
966
967	/// Other address spaces do not have a cache.
968	return false;
969	}
970
971	bool Changed = false;
972	switch (Scope) {
973	case SIAtomicScope::SYSTEM:
974	if (ST.hasGFX940Insts()) {
975	// Set SC bits to indicate system scope.
976	Changed \|= enableCPolBits(MI, Bits: CPol::SC0 \| CPol::SC1);
977	break;
978	}
979	[[fallthrough]];
980	case SIAtomicScope::AGENT:
981	if (ST.hasGFX940Insts()) {
982	// Set SC bits to indicate agent scope.
983	Changed \|= enableCPolBits(MI, Bits: CPol::SC1);
984	} else {
985	// Set L1 cache policy to MISS_EVICT.
986	// Note: there is no L2 cache bypass policy at the ISA level.
987	Changed \|= enableCPolBits(MI, Bits: CPol::GLC);
988	}
989	break;
990	case SIAtomicScope::WORKGROUP:
991	if (ST.hasGFX940Insts()) {
992	// In threadgroup split mode the waves of a work-group can be executing
993	// on different CUs. Therefore need to bypass the L1 which is per CU.
994	// Otherwise in non-threadgroup split mode all waves of a work-group are
995	// on the same CU, and so the L1 does not need to be bypassed. Setting
996	// SC bits to indicate work-group scope will do this automatically.
997	Changed \|= enableCPolBits(MI, Bits: CPol::SC0);
998	} else if (ST.hasGFX90AInsts()) {
999	// In threadgroup split mode the waves of a work-group can be executing
1000	// on different CUs. Therefore need to bypass the L1 which is per CU.
1001	// Otherwise in non-threadgroup split mode all waves of a work-group are
1002	// on the same CU, and so the L1 does not need to be bypassed.
1003	if (ST.isTgSplitEnabled())
1004	Changed \|= enableCPolBits(MI, Bits: CPol::GLC);
1005	}
1006	break;
1007	case SIAtomicScope::WAVEFRONT:
1008	case SIAtomicScope::SINGLETHREAD:
1009	// No cache to bypass.
1010	break;
1011	default:
1012	llvm_unreachable("Unsupported synchronization scope");
1013	}
1014
1015	return Changed;
1016	}
1017
1018	bool SIGfx6CacheControl::enableStoreCacheBypass(
1019	const MachineBasicBlock::iterator &MI,
1020	SIAtomicScope Scope,
1021	SIAtomicAddrSpace AddrSpace) const {
1022	assert(!MI->mayLoad() && MI->mayStore());
1023	bool Changed = false;
1024
1025	/// For targets other than GFX940, the L1 cache is write through so does not
1026	/// need to be bypassed. There is no bypass control for the L2 cache at the
1027	/// isa level.
1028
1029	if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AS: AddrSpace)) {
1030	switch (Scope) {
1031	case SIAtomicScope::SYSTEM:
1032	// Set SC bits to indicate system scope.
1033	Changed \|= enableCPolBits(MI, Bits: CPol::SC0 \| CPol::SC1);
1034	break;
1035	case SIAtomicScope::AGENT:
1036	// Set SC bits to indicate agent scope.
1037	Changed \|= enableCPolBits(MI, Bits: CPol::SC1);
1038	break;
1039	case SIAtomicScope::WORKGROUP:
1040	// Set SC bits to indicate workgroup scope.
1041	Changed \|= enableCPolBits(MI, Bits: CPol::SC0);
1042	break;
1043	case SIAtomicScope::WAVEFRONT:
1044	case SIAtomicScope::SINGLETHREAD:
1045	// Leave SC bits unset to indicate wavefront scope.
1046	break;
1047	default:
1048	llvm_unreachable("Unsupported synchronization scope");
1049	}
1050
1051	/// The scratch address space does not need the global memory caches
1052	/// to be bypassed as all memory operations by the same thread are
1053	/// sequentially consistent, and no other thread can access scratch
1054	/// memory.
1055
1056	/// Other address spaces do not have a cache.
1057	}
1058
1059	return Changed;
1060	}
1061
1062	bool SIGfx6CacheControl::enableRMWCacheBypass(
1063	const MachineBasicBlock::iterator &MI,
1064	SIAtomicScope Scope,
1065	SIAtomicAddrSpace AddrSpace) const {
1066	assert(MI->mayLoad() && MI->mayStore());
1067	bool Changed = false;
1068
1069	/// For targets other than GFX940, do not set GLC for RMW atomic operations as
1070	/// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to
1071	/// indicate if they are return or no-return. Note: there is no L2 cache
1072	/// coherent bypass control at the ISA level.
1073	/// For GFX90A+, RMW atomics implicitly bypass the L1 cache.
1074
1075	if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AS: AddrSpace)) {
1076	switch (Scope) {
1077	case SIAtomicScope::SYSTEM:
1078	// Set SC1 bit to indicate system scope.
1079	Changed \|= enableCPolBits(MI, Bits: CPol::SC1);
1080	break;
1081	case SIAtomicScope::AGENT:
1082	case SIAtomicScope::WORKGROUP:
1083	case SIAtomicScope::WAVEFRONT:
1084	case SIAtomicScope::SINGLETHREAD:
1085	// RMW atomic operations implicitly bypass the L1 cache and only use SC1
1086	// to indicate system or agent scope. The SC0 bit is used to indicate if
1087	// they are return or no-return. Leave SC1 bit unset to indicate agent
1088	// scope.
1089	break;
1090	default:
1091	llvm_unreachable("Unsupported synchronization scope");
1092	}
1093	}
1094
1095	return Changed;
1096	}
1097
1098	bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1099	MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1100	bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1101	// Only handle load and store, not atomic read-modify-write insructions. The
1102	// latter use glc to indicate if the atomic returns a result and so must not
1103	// be used for cache control.
1104	assert((MI->mayLoad() ^ MI->mayStore()) \|\| SIInstrInfo::isLDSDMA(*MI));
1105
1106	// Only update load and store, not LLVM IR atomic read-modify-write
1107	// instructions. The latter are always marked as volatile so cannot sensibly
1108	// handle it as do not want to pessimize all atomics. Also they do not support
1109	// the nontemporal attribute.
1110	assert(Op == SIMemOp::LOAD \|\| Op == SIMemOp::STORE);
1111
1112	bool Changed = false;
1113
1114	if (IsVolatile) {
1115	if (ST.hasGFX940Insts()) {
1116	// Set SC bits to indicate system scope.
1117	Changed \|= enableCPolBits(MI, Bits: CPol::SC0 \| CPol::SC1);
1118	} else if (Op == SIMemOp::LOAD) {
1119	// Set L1 cache policy to be MISS_EVICT for load instructions
1120	// and MISS_LRU for store instructions.
1121	// Note: there is no L2 cache bypass policy at the ISA level.
1122	Changed \|= enableCPolBits(MI, Bits: CPol::GLC);
1123	}
1124
1125	// Ensure operation has completed at system scope to cause all volatile
1126	// operations to be visible outside the program in a global order. Do not
1127	// request cross address space as only the global address space can be
1128	// observable outside the program, so no need to cause a waitcnt for LDS
1129	// address space operations.
1130	Changed \|= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
1131	Pos: Position::AFTER, Order: AtomicOrdering::Unordered,
1132	/AtomicsOnly=/false);
1133
1134	return Changed;
1135	}
1136
1137	if (IsNonTemporal) {
1138	if (ST.hasGFX940Insts()) {
1139	Changed \|= enableCPolBits(MI, Bits: CPol::NT);
1140	} else {
1141	// Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1142	// for both loads and stores, and the L2 cache policy to STREAM.
1143	Changed \|= enableCPolBits(MI, Bits: CPol::SLC \| CPol::GLC);
1144	}
1145	return Changed;
1146	}
1147
1148	return Changed;
1149	}
1150
1151	bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1152	SIAtomicScope Scope,
1153	SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1154	bool IsCrossAddrSpaceOrdering, Position Pos,
1155	AtomicOrdering Order,
1156	bool AtomicsOnly) const {
1157	bool Changed = false;
1158
1159	MachineBasicBlock &MBB = *MI ->getParent();
1160	const DebugLoc &DL = MI ->getDebugLoc();
1161
1162	if (Pos == Position::AFTER)
1163	++MI;
1164
1165	// GFX90A+
1166	if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
1167	// In threadgroup split mode the waves of a work-group can be executing on
1168	// different CUs. Therefore need to wait for global or GDS memory operations
1169	// to complete to ensure they are visible to waves in the other CUs.
1170	// Otherwise in non-threadgroup split mode all waves of a work-group are on
1171	// the same CU, so no need to wait for global memory as all waves in the
1172	// work-group access the same the L1, nor wait for GDS as access are ordered
1173	// on a CU.
1174	if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL \| SIAtomicAddrSpace::SCRATCH \|
1175	SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1176	(Scope == SIAtomicScope::WORKGROUP)) {
1177	// Same as <GFX90A at AGENT scope;
1178	Scope = SIAtomicScope::AGENT;
1179	}
1180	// In threadgroup split mode LDS cannot be allocated so no need to wait for
1181	// LDS memory operations.
1182	AddrSpace &= ~SIAtomicAddrSpace::LDS;
1183	}
1184
1185	bool VMCnt = false;
1186	bool LGKMCnt = false;
1187
1188	if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL \| SIAtomicAddrSpace::SCRATCH)) !=
1189	SIAtomicAddrSpace::NONE) {
1190	switch (Scope) {
1191	case SIAtomicScope::SYSTEM:
1192	case SIAtomicScope::AGENT:
1193	VMCnt \|= true;
1194	break;
1195	case SIAtomicScope::WORKGROUP:
1196	case SIAtomicScope::WAVEFRONT:
1197	case SIAtomicScope::SINGLETHREAD:
1198	// The L1 cache keeps all memory operations in order for
1199	// wavefronts in the same work-group.
1200	break;
1201	default:
1202	llvm_unreachable("Unsupported synchronization scope");
1203	}
1204	}
1205
1206	if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1207	switch (Scope) {
1208	case SIAtomicScope::SYSTEM:
1209	case SIAtomicScope::AGENT:
1210	case SIAtomicScope::WORKGROUP:
1211	// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1212	// not needed as LDS operations for all waves are executed in a total
1213	// global ordering as observed by all waves. Required if also
1214	// synchronizing with global/GDS memory as LDS operations could be
1215	// reordered with respect to later global/GDS memory operations of the
1216	// same wave.
1217	LGKMCnt \|= IsCrossAddrSpaceOrdering;
1218	break;
1219	case SIAtomicScope::WAVEFRONT:
1220	case SIAtomicScope::SINGLETHREAD:
1221	// The LDS keeps all memory operations in order for
1222	// the same wavefront.
1223	break;
1224	default:
1225	llvm_unreachable("Unsupported synchronization scope");
1226	}
1227	}
1228
1229	if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1230	switch (Scope) {
1231	case SIAtomicScope::SYSTEM:
1232	case SIAtomicScope::AGENT:
1233	// If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1234	// is not needed as GDS operations for all waves are executed in a total
1235	// global ordering as observed by all waves. Required if also
1236	// synchronizing with global/LDS memory as GDS operations could be
1237	// reordered with respect to later global/LDS memory operations of the
1238	// same wave.
1239	LGKMCnt \|= IsCrossAddrSpaceOrdering;
1240	break;
1241	case SIAtomicScope::WORKGROUP:
1242	case SIAtomicScope::WAVEFRONT:
1243	case SIAtomicScope::SINGLETHREAD:
1244	// The GDS keeps all memory operations in order for
1245	// the same work-group.
1246	break;
1247	default:
1248	llvm_unreachable("Unsupported synchronization scope");
1249	}
1250	}
1251
1252	if (VMCnt \|\| LGKMCnt) {
1253	unsigned WaitCntImmediate =
1254	AMDGPU::encodeWaitcnt(Version: IV,
1255	Vmcnt: VMCnt ? `0` : getVmcntBitMask(Version: IV),
1256	Expcnt: getExpcntBitMask(Version: IV),
1257	Lgkmcnt: LGKMCnt ? `0` : getLgkmcntBitMask(Version: IV));
1258	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_soft))
1259	.addImm(Val: WaitCntImmediate);
1260	Changed = true;
1261	}
1262
1263	// On architectures that support direct loads to LDS, emit an unknown waitcnt
1264	// at workgroup-scoped release operations that specify the LDS address space.
1265	// SIInsertWaitcnts will later replace this with a vmcnt().
1266	if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(AO: Order) &&
1267	Scope == SIAtomicScope::WORKGROUP &&
1268	(AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1269	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_lds_direct));
1270	Changed = true;
1271	}
1272
1273	if (Pos == Position::AFTER)
1274	--MI;
1275
1276	return Changed;
1277	}
1278
1279	static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) {
1280	if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1281	return false;
1282	return !ST.isAmdPalOS() && !ST.isMesa3DOS();
1283	}
1284
1285	bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1286	SIAtomicScope Scope,
1287	SIAtomicAddrSpace AddrSpace,
1288	Position Pos) const {
1289	if (!InsertCacheInv)
1290	return false;
1291
1292	bool Changed = false;
1293
1294	MachineBasicBlock &MBB = *MI ->getParent();
1295	const DebugLoc &DL = MI ->getDebugLoc();
1296
1297	if (Pos == Position::AFTER)
1298	++MI;
1299
1300	const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST)
1301	? AMDGPU::BUFFER_WBINVL1_VOL
1302	: AMDGPU::BUFFER_WBINVL1;
1303
1304	if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
1305	switch (Scope) {
1306	case SIAtomicScope::SYSTEM:
1307	if (ST.hasGFX940Insts()) {
1308	// Ensures that following loads will not see stale remote VMEM data or
1309	// stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1310	// and CC will never be stale due to the local memory probes.
1311	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV))
1312	// Set SC bits to indicate system scope.
1313	.addImm(Val: AMDGPU::CPol::SC0 \| AMDGPU::CPol::SC1);
1314	// Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1315	// hardware does not reorder memory operations by the same wave with
1316	// respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1317	// remove any cache lines of earlier writes by the same wave and ensures
1318	// later reads by the same wave will refetch the cache lines.
1319	Changed = true;
1320	break;
1321	}
1322
1323	if (ST.hasGFX90AInsts()) {
1324	// Ensures that following loads will not see stale remote VMEM data or
1325	// stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1326	// and CC will never be stale due to the local memory probes.
1327	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INVL2));
1328	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: InvalidateL1));
1329	// Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1330	// hardware does not reorder memory operations by the same wave with
1331	// respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
1332	// to remove any cache lines of earlier writes by the same wave and
1333	// ensures later reads by the same wave will refetch the cache lines.
1334	Changed = true;
1335	break;
1336	}
1337	[[fallthrough]];
1338	case SIAtomicScope::AGENT:
1339	if (ST.hasGFX940Insts()) {
1340	// Ensures that following loads will not see stale remote date or local
1341	// MTYPE NC global data. Local MTYPE RW and CC memory will never be
1342	// stale due to the memory probes.
1343	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV))
1344	// Set SC bits to indicate agent scope.
1345	.addImm(Val: AMDGPU::CPol::SC1);
1346	// Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1347	// does not reorder memory operations with respect to preceeding buffer
1348	// invalidate. The invalidate is guaranteed to remove any cache lines of
1349	// earlier writes and ensures later writes will refetch the cache lines.
1350	} else
1351	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: InvalidateL1));
1352	Changed = true;
1353	break;
1354	case SIAtomicScope::WORKGROUP:
1355	if (ST.isTgSplitEnabled()) {
1356	if (ST.hasGFX940Insts()) {
1357	// In threadgroup split mode the waves of a work-group can be
1358	// executing on different CUs. Therefore need to invalidate the L1
1359	// which is per CU. Otherwise in non-threadgroup split mode all waves
1360	// of a work-group are on the same CU, and so the L1 does not need to
1361	// be invalidated.
1362
1363	// Ensures L1 is invalidated if in threadgroup split mode. In
1364	// non-threadgroup split mode it is a NOP, but no point generating it
1365	// in that case if know not in that mode.
1366	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV))
1367	// Set SC bits to indicate work-group scope.
1368	.addImm(Val: AMDGPU::CPol::SC0);
1369	// Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1370	// does not reorder memory operations with respect to preceeding
1371	// buffer invalidate. The invalidate is guaranteed to remove any cache
1372	// lines of earlier writes and ensures later writes will refetch the
1373	// cache lines.
1374	Changed = true;
1375	} else if (ST.hasGFX90AInsts()) {
1376	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: InvalidateL1));
1377	Changed = true;
1378	}
1379	}
1380	break;
1381	case SIAtomicScope::WAVEFRONT:
1382	case SIAtomicScope::SINGLETHREAD:
1383	// For GFX940, we could generate "BUFFER_INV" but it would do nothing as
1384	// there are no caches to invalidate. All other targets have no cache to
1385	// invalidate.
1386	break;
1387	default:
1388	llvm_unreachable("Unsupported synchronization scope");
1389	}
1390	}
1391
1392	/// The scratch address space does not need the global memory cache
1393	/// to be flushed as all memory operations by the same thread are
1394	/// sequentially consistent, and no other thread can access scratch
1395	/// memory.
1396
1397	/// Other address spaces do not have a cache.
1398
1399	if (Pos == Position::AFTER)
1400	--MI;
1401
1402	return Changed;
1403	}
1404
1405	bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1406	SIAtomicScope Scope,
1407	SIAtomicAddrSpace AddrSpace,
1408	bool IsCrossAddrSpaceOrdering,
1409	Position Pos) const {
1410	bool Changed = false;
1411
1412	if (ST.hasGFX90AInsts()) {
1413	MachineBasicBlock &MBB = *MI ->getParent();
1414	const DebugLoc &DL = MI ->getDebugLoc();
1415
1416	if (Pos == Position::AFTER)
1417	++MI;
1418
1419	if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
1420	switch (Scope) {
1421	case SIAtomicScope::SYSTEM:
1422	// Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1423	// hardware does not reorder memory operations by the same wave with
1424	// respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1425	// to initiate writeback of any dirty cache lines of earlier writes by
1426	// the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1427	// writeback has completed.
1428	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2))
1429	// Set SC bits to indicate system scope.
1430	.addImm(Val: AMDGPU::CPol::SC0 \| AMDGPU::CPol::SC1);
1431	Changed = true;
1432	break;
1433	case SIAtomicScope::AGENT:
1434	if (ST.hasGFX940Insts()) {
1435	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2))
1436	// Set SC bits to indicate agent scope.
1437	.addImm(Val: AMDGPU::CPol::SC1);
1438
1439	// Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1440	// SIAtomicScope::AGENT, the following insertWait will generate the
1441	// required "S_WAITCNT vmcnt(0)".
1442	Changed = true;
1443	}
1444	break;
1445	case SIAtomicScope::WORKGROUP:
1446	case SIAtomicScope::WAVEFRONT:
1447	case SIAtomicScope::SINGLETHREAD:
1448	// For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
1449	// would writeback, and would require an otherwise unnecessary
1450	// "S_WAITCNT vmcnt(0)".
1451	break;
1452	default:
1453	llvm_unreachable("Unsupported synchronization scope");
1454	}
1455	}
1456
1457	if (Pos == Position::AFTER)
1458	--MI;
1459	}
1460
1461	// Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1462	// S_WAITCNT needed.
1463	Changed \|= insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD \| SIMemOp::STORE,
1464	IsCrossAddrSpaceOrdering, Pos, Order: AtomicOrdering::Release,
1465	/AtomicsOnly=/false);
1466
1467	return Changed;
1468	}
1469
1470	bool SIGfx10CacheControl::enableLoadCacheBypass(
1471	const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1472	SIAtomicAddrSpace AddrSpace) const {
1473	assert(MI->mayLoad() && !MI->mayStore());
1474	bool Changed = false;
1475
1476	if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
1477	switch (Scope) {
1478	case SIAtomicScope::SYSTEM:
1479	case SIAtomicScope::AGENT:
1480	// Set the L0 and L1 cache policies to MISS_EVICT.
1481	// Note: there is no L2 cache coherent bypass control at the ISA level.
1482	// For GFX10, set GLC+DLC, for GFX11, only set GLC.
1483	Changed \|=
1484	enableCPolBits(MI, Bits: CPol::GLC \| (AMDGPU::isGFX10(STI: ST) ? CPol::DLC : `0`));
1485	break;
1486	case SIAtomicScope::WORKGROUP:
1487	// In WGP mode the waves of a work-group can be executing on either CU of
1488	// the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1489	// CU mode all waves of a work-group are on the same CU, and so the L0
1490	// does not need to be bypassed.
1491	if (!ST.isCuModeEnabled())
1492	Changed \|= enableCPolBits(MI, Bits: CPol::GLC);
1493	break;
1494	case SIAtomicScope::WAVEFRONT:
1495	case SIAtomicScope::SINGLETHREAD:
1496	// No cache to bypass.
1497	break;
1498	default:
1499	llvm_unreachable("Unsupported synchronization scope");
1500	}
1501	}
1502
1503	/// The scratch address space does not need the global memory caches
1504	/// to be bypassed as all memory operations by the same thread are
1505	/// sequentially consistent, and no other thread can access scratch
1506	/// memory.
1507
1508	/// Other address spaces do not have a cache.
1509
1510	return Changed;
1511	}
1512
1513	bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1514	MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1515	bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1516
1517	// Only handle load and store, not atomic read-modify-write insructions. The
1518	// latter use glc to indicate if the atomic returns a result and so must not
1519	// be used for cache control.
1520	assert((MI->mayLoad() ^ MI->mayStore()) \|\| SIInstrInfo::isLDSDMA(*MI));
1521
1522	// Only update load and store, not LLVM IR atomic read-modify-write
1523	// instructions. The latter are always marked as volatile so cannot sensibly
1524	// handle it as do not want to pessimize all atomics. Also they do not support
1525	// the nontemporal attribute.
1526	assert(Op == SIMemOp::LOAD \|\| Op == SIMemOp::STORE);
1527
1528	bool Changed = false;
1529
1530	if (IsVolatile) {
1531	// Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1532	// and MISS_LRU for store instructions.
1533	// Note: there is no L2 cache coherent bypass control at the ISA level.
1534	if (Op == SIMemOp::LOAD) {
1535	Changed \|= enableCPolBits(MI, Bits: CPol::GLC \| CPol::DLC);
1536	}
1537
1538	// GFX11: Set MALL NOALLOC for both load and store instructions.
1539	if (AMDGPU::isGFX11(STI: ST))
1540	Changed \|= enableCPolBits(MI, Bits: CPol::DLC);
1541
1542	// Ensure operation has completed at system scope to cause all volatile
1543	// operations to be visible outside the program in a global order. Do not
1544	// request cross address space as only the global address space can be
1545	// observable outside the program, so no need to cause a waitcnt for LDS
1546	// address space operations.
1547	Changed \|= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
1548	Pos: Position::AFTER, Order: AtomicOrdering::Unordered,
1549	/AtomicsOnly=/false);
1550	return Changed;
1551	}
1552
1553	if (IsNonTemporal) {
1554	// For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1555	// and L2 cache policy to STREAM.
1556	// For stores setting both GLC and SLC configures L0 and L1 cache policy
1557	// to MISS_EVICT and the L2 cache policy to STREAM.
1558	if (Op == SIMemOp::STORE)
1559	Changed \|= enableCPolBits(MI, Bits: CPol::GLC);
1560	Changed \|= enableCPolBits(MI, Bits: CPol::SLC);
1561
1562	// GFX11: Set MALL NOALLOC for both load and store instructions.
1563	if (AMDGPU::isGFX11(STI: ST))
1564	Changed \|= enableCPolBits(MI, Bits: CPol::DLC);
1565
1566	return Changed;
1567	}
1568
1569	return Changed;
1570	}
1571
1572	bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1573	SIAtomicScope Scope,
1574	SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1575	bool IsCrossAddrSpaceOrdering,
1576	Position Pos, AtomicOrdering Order,
1577	bool AtomicsOnly) const {
1578	bool Changed = false;
1579
1580	MachineBasicBlock &MBB = *MI ->getParent();
1581	const DebugLoc &DL = MI ->getDebugLoc();
1582
1583	if (Pos == Position::AFTER)
1584	++MI;
1585
1586	bool VMCnt = false;
1587	bool VSCnt = false;
1588	bool LGKMCnt = false;
1589
1590	if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL \| SIAtomicAddrSpace::SCRATCH)) !=
1591	SIAtomicAddrSpace::NONE) {
1592	switch (Scope) {
1593	case SIAtomicScope::SYSTEM:
1594	case SIAtomicScope::AGENT:
1595	if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1596	VMCnt \|= true;
1597	if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1598	VSCnt \|= true;
1599	break;
1600	case SIAtomicScope::WORKGROUP:
1601	// In WGP mode the waves of a work-group can be executing on either CU of
1602	// the WGP. Therefore need to wait for operations to complete to ensure
1603	// they are visible to waves in the other CU as the L0 is per CU.
1604	// Otherwise in CU mode and all waves of a work-group are on the same CU
1605	// which shares the same L0. Note that we still need to wait when
1606	// performing a release in this mode to respect the transitivity of
1607	// happens-before, e.g. other waves of the workgroup must be able to
1608	// release the memory from another wave at a wider scope.
1609	if (!ST.isCuModeEnabled() \|\| isReleaseOrStronger(AO: Order)) {
1610	if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1611	VMCnt \|= true;
1612	if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1613	VSCnt \|= true;
1614	}
1615	break;
1616	case SIAtomicScope::WAVEFRONT:
1617	case SIAtomicScope::SINGLETHREAD:
1618	// The L0 cache keeps all memory operations in order for
1619	// work-items in the same wavefront.
1620	break;
1621	default:
1622	llvm_unreachable("Unsupported synchronization scope");
1623	}
1624	}
1625
1626	if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1627	switch (Scope) {
1628	case SIAtomicScope::SYSTEM:
1629	case SIAtomicScope::AGENT:
1630	case SIAtomicScope::WORKGROUP:
1631	// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1632	// not needed as LDS operations for all waves are executed in a total
1633	// global ordering as observed by all waves. Required if also
1634	// synchronizing with global/GDS memory as LDS operations could be
1635	// reordered with respect to later global/GDS memory operations of the
1636	// same wave.
1637	LGKMCnt \|= IsCrossAddrSpaceOrdering;
1638	break;
1639	case SIAtomicScope::WAVEFRONT:
1640	case SIAtomicScope::SINGLETHREAD:
1641	// The LDS keeps all memory operations in order for
1642	// the same wavefront.
1643	break;
1644	default:
1645	llvm_unreachable("Unsupported synchronization scope");
1646	}
1647	}
1648
1649	if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1650	switch (Scope) {
1651	case SIAtomicScope::SYSTEM:
1652	case SIAtomicScope::AGENT:
1653	// If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1654	// is not needed as GDS operations for all waves are executed in a total
1655	// global ordering as observed by all waves. Required if also
1656	// synchronizing with global/LDS memory as GDS operations could be
1657	// reordered with respect to later global/LDS memory operations of the
1658	// same wave.
1659	LGKMCnt \|= IsCrossAddrSpaceOrdering;
1660	break;
1661	case SIAtomicScope::WORKGROUP:
1662	case SIAtomicScope::WAVEFRONT:
1663	case SIAtomicScope::SINGLETHREAD:
1664	// The GDS keeps all memory operations in order for
1665	// the same work-group.
1666	break;
1667	default:
1668	llvm_unreachable("Unsupported synchronization scope");
1669	}
1670	}
1671
1672	if (VMCnt \|\| LGKMCnt) {
1673	unsigned WaitCntImmediate =
1674	AMDGPU::encodeWaitcnt(Version: IV,
1675	Vmcnt: VMCnt ? `0` : getVmcntBitMask(Version: IV),
1676	Expcnt: getExpcntBitMask(Version: IV),
1677	Lgkmcnt: LGKMCnt ? `0` : getLgkmcntBitMask(Version: IV));
1678	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_soft))
1679	.addImm(Val: WaitCntImmediate);
1680	Changed = true;
1681	}
1682
1683	// On architectures that support direct loads to LDS, emit an unknown waitcnt
1684	// at workgroup-scoped release operations that specify the LDS address space.
1685	// SIInsertWaitcnts will later replace this with a vmcnt().
1686	if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(AO: Order) &&
1687	Scope == SIAtomicScope::WORKGROUP &&
1688	(AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1689	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_lds_direct));
1690	Changed = true;
1691	}
1692
1693	if (VSCnt) {
1694	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT_soft))
1695	.addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
1696	.addImm(Val: `0`);
1697	Changed = true;
1698	}
1699
1700	if (Pos == Position::AFTER)
1701	--MI;
1702
1703	return Changed;
1704	}
1705
1706	bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1707	SIAtomicScope Scope,
1708	SIAtomicAddrSpace AddrSpace,
1709	Position Pos) const {
1710	if (!InsertCacheInv)
1711	return false;
1712
1713	bool Changed = false;
1714
1715	MachineBasicBlock &MBB = *MI ->getParent();
1716	const DebugLoc &DL = MI ->getDebugLoc();
1717
1718	if (Pos == Position::AFTER)
1719	++MI;
1720
1721	if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
1722	switch (Scope) {
1723	case SIAtomicScope::SYSTEM:
1724	case SIAtomicScope::AGENT:
1725	// The order of invalidates matter here. We must invalidate "outer in"
1726	// so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
1727	// invalidated.
1728	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL1_INV));
1729	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL0_INV));
1730	Changed = true;
1731	break;
1732	case SIAtomicScope::WORKGROUP:
1733	// In WGP mode the waves of a work-group can be executing on either CU of
1734	// the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1735	// in CU mode and all waves of a work-group are on the same CU, and so the
1736	// L0 does not need to be invalidated.
1737	if (!ST.isCuModeEnabled()) {
1738	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL0_INV));
1739	Changed = true;
1740	}
1741	break;
1742	case SIAtomicScope::WAVEFRONT:
1743	case SIAtomicScope::SINGLETHREAD:
1744	// No cache to invalidate.
1745	break;
1746	default:
1747	llvm_unreachable("Unsupported synchronization scope");
1748	}
1749	}
1750
1751	/// The scratch address space does not need the global memory cache
1752	/// to be flushed as all memory operations by the same thread are
1753	/// sequentially consistent, and no other thread can access scratch
1754	/// memory.
1755
1756	/// Other address spaces do not have a cache.
1757
1758	if (Pos == Position::AFTER)
1759	--MI;
1760
1761	return Changed;
1762	}
1763
1764	bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
1765	AMDGPU::CPol::CPol Value) const {
1766	MachineOperand CPol = TII->getNamedOperand(MI&: MI, OperandName: OpName::cpol);
1767	if (!CPol)
1768	return false;
1769
1770	uint64_t NewTH = Value & AMDGPU::CPol::TH;
1771	if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
1772	CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) \| NewTH);
1773	return true;
1774	}
1775
1776	return false;
1777	}
1778
1779	bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
1780	AMDGPU::CPol::CPol Value) const {
1781	MachineOperand CPol = TII->getNamedOperand(MI&: MI, OperandName: OpName::cpol);
1782	if (!CPol)
1783	return false;
1784
1785	uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
1786	if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
1787	CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) \| NewScope);
1788	return true;
1789	}
1790
1791	return false;
1792	}
1793
1794	bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
1795	const MachineBasicBlock::iterator MI) const {
1796	// TODO: implement flag for frontend to give us a hint not to insert waits.
1797
1798	MachineBasicBlock &MBB = *MI ->getParent();
1799	const DebugLoc &DL = MI ->getDebugLoc();
1800
1801	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_LOADCNT_soft)).addImm(Val: `0`);
1802	if (ST.hasImageInsts()) {
1803	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_SAMPLECNT_soft)).addImm(Val: `0`);
1804	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_BVHCNT_soft)).addImm(Val: `0`);
1805	}
1806	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_KMCNT_soft)).addImm(Val: `0`);
1807	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_STORECNT_soft)).addImm(Val: `0`);
1808
1809	return true;
1810	}
1811
1812	bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1813	SIAtomicScope Scope,
1814	SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1815	bool IsCrossAddrSpaceOrdering,
1816	Position Pos, AtomicOrdering Order,
1817	bool AtomicsOnly) const {
1818	bool Changed = false;
1819
1820	MachineBasicBlock &MBB = *MI ->getParent();
1821	const DebugLoc &DL = MI ->getDebugLoc();
1822
1823	bool LOADCnt = false;
1824	bool DSCnt = false;
1825	bool STORECnt = false;
1826
1827	if (Pos == Position::AFTER)
1828	++MI;
1829
1830	if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL \| SIAtomicAddrSpace::SCRATCH)) !=
1831	SIAtomicAddrSpace::NONE) {
1832	switch (Scope) {
1833	case SIAtomicScope::SYSTEM:
1834	case SIAtomicScope::AGENT:
1835	case SIAtomicScope::CLUSTER:
1836	if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1837	LOADCnt \|= true;
1838	if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1839	STORECnt \|= true;
1840	break;
1841	case SIAtomicScope::WORKGROUP:
1842	// GFX12.0:
1843	// In WGP mode the waves of a work-group can be executing on either CU
1844	// of the WGP. Therefore need to wait for operations to complete to
1845	// ensure they are visible to waves in the other CU as the L0 is per CU.
1846	//
1847	// Otherwise in CU mode and all waves of a work-group are on the same CU
1848	// which shares the same L0. Note that we still need to wait when
1849	// performing a release in this mode to respect the transitivity of
1850	// happens-before, e.g. other waves of the workgroup must be able to
1851	// release the memory from another wave at a wider scope.
1852	//
1853	// GFX12.5:
1854	// CU$ has two ports. To ensure operations are visible at the workgroup
1855	// level, we need to ensure all operations in this port have completed
1856	// so the other SIMDs in the WG can see them. There is no ordering
1857	// guarantee between the ports.
1858	if (!ST.isCuModeEnabled() \|\| ST.hasGFX1250Insts() \|\|
1859	isReleaseOrStronger(AO: Order)) {
1860	if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1861	LOADCnt \|= true;
1862	if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1863	STORECnt \|= true;
1864	}
1865	break;
1866	case SIAtomicScope::WAVEFRONT:
1867	case SIAtomicScope::SINGLETHREAD:
1868	// The L0 cache keeps all memory operations in order for
1869	// work-items in the same wavefront.
1870	break;
1871	default:
1872	llvm_unreachable("Unsupported synchronization scope");
1873	}
1874	}
1875
1876	if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1877	switch (Scope) {
1878	case SIAtomicScope::SYSTEM:
1879	case SIAtomicScope::AGENT:
1880	case SIAtomicScope::CLUSTER:
1881	case SIAtomicScope::WORKGROUP:
1882	// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1883	// not needed as LDS operations for all waves are executed in a total
1884	// global ordering as observed by all waves. Required if also
1885	// synchronizing with global/GDS memory as LDS operations could be
1886	// reordered with respect to later global/GDS memory operations of the
1887	// same wave.
1888	DSCnt \|= IsCrossAddrSpaceOrdering;
1889	break;
1890	case SIAtomicScope::WAVEFRONT:
1891	case SIAtomicScope::SINGLETHREAD:
1892	// The LDS keeps all memory operations in order for
1893	// the same wavefront.
1894	break;
1895	default:
1896	llvm_unreachable("Unsupported synchronization scope");
1897	}
1898	}
1899
1900	if (LOADCnt) {
1901	// Acquire sequences only need to wait on the previous atomic operation.
1902	// e.g. a typical sequence looks like
1903	// atomic load
1904	// (wait)
1905	// global_inv
1906	//
1907	// We do not have BVH or SAMPLE atomics, so the atomic load is always going
1908	// to be tracked using loadcnt.
1909	//
1910	// This also applies to fences. Fences cannot pair with an instruction
1911	// tracked with bvh/samplecnt as we don't have any atomics that do that.
1912	if (!AtomicsOnly && ST.hasImageInsts()) {
1913	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_BVHCNT_soft)).addImm(Val: `0`);
1914	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(Val: `0`);
1915	}
1916	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_soft)).addImm(Val: `0`);
1917	Changed = true;
1918	}
1919
1920	if (STORECnt) {
1921	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_STORECNT_soft)).addImm(Val: `0`);
1922	Changed = true;
1923	}
1924
1925	if (DSCnt) {
1926	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_DSCNT_soft)).addImm(Val: `0`);
1927	Changed = true;
1928	}
1929
1930	if (Pos == Position::AFTER)
1931	--MI;
1932
1933	return Changed;
1934	}
1935
1936	bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1937	SIAtomicScope Scope,
1938	SIAtomicAddrSpace AddrSpace,
1939	Position Pos) const {
1940	if (!InsertCacheInv)
1941	return false;
1942
1943	MachineBasicBlock &MBB = *MI ->getParent();
1944	const DebugLoc &DL = MI ->getDebugLoc();
1945
1946	/// The scratch address space does not need the global memory cache
1947	/// to be flushed as all memory operations by the same thread are
1948	/// sequentially consistent, and no other thread can access scratch
1949	/// memory.
1950
1951	/// Other address spaces do not have a cache.
1952	if (!canAffectGlobalAddrSpace(AS: AddrSpace))
1953	return false;
1954
1955	AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
1956	switch (Scope) {
1957	case SIAtomicScope::SYSTEM:
1958	ScopeImm = AMDGPU::CPol::SCOPE_SYS;
1959	break;
1960	case SIAtomicScope::AGENT:
1961	ScopeImm = AMDGPU::CPol::SCOPE_DEV;
1962	break;
1963	case SIAtomicScope::CLUSTER:
1964	ScopeImm = AMDGPU::CPol::SCOPE_SE;
1965	break;
1966	case SIAtomicScope::WORKGROUP:
1967	// GFX12.0:
1968	// In WGP mode the waves of a work-group can be executing on either CU of
1969	// the WGP. Therefore we need to invalidate the L0 which is per CU.
1970	// Otherwise in CU mode all waves of a work-group are on the same CU, and
1971	// so the L0 does not need to be invalidated.
1972	//
1973	// GFX12.5 has a shared WGP$, so no invalidates are required.
1974	if (ST.isCuModeEnabled())
1975	return false;
1976
1977	ScopeImm = AMDGPU::CPol::SCOPE_SE;
1978	break;
1979	case SIAtomicScope::WAVEFRONT:
1980	case SIAtomicScope::SINGLETHREAD:
1981	// No cache to invalidate.
1982	return false;
1983	default:
1984	llvm_unreachable("Unsupported synchronization scope");
1985	}
1986
1987	if (Pos == Position::AFTER)
1988	++MI;
1989
1990	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GLOBAL_INV)).addImm(Val: ScopeImm);
1991
1992	if (Pos == Position::AFTER)
1993	--MI;
1994
1995	// Target requires a waitcnt to ensure that the proceeding INV has completed
1996	// as it may get reorded with following load instructions.
1997	if (ST.hasINVWBL2WaitCntRequirement() && Scope > SIAtomicScope::CLUSTER) {
1998	insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD,
1999	/IsCrossAddrSpaceOrdering=/false, Pos, Order: AtomicOrdering::Acquire,
2000	/AtomicsOnly=/false);
2001
2002	if (Pos == Position::AFTER)
2003	--MI;
2004	}
2005
2006	return true;
2007	}
2008
2009	bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2010	SIAtomicScope Scope,
2011	SIAtomicAddrSpace AddrSpace,
2012	bool IsCrossAddrSpaceOrdering,
2013	Position Pos) const {
2014	bool Changed = false;
2015
2016	MachineBasicBlock &MBB = *MI ->getParent();
2017	const DebugLoc &DL = MI ->getDebugLoc();
2018
2019	// The scratch address space does not need the global memory cache
2020	// writeback as all memory operations by the same thread are
2021	// sequentially consistent, and no other thread can access scratch
2022	// memory.
2023	if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
2024	if (Pos == Position::AFTER)
2025	++MI;
2026
2027	// global_wb is only necessary at system scope for GFX12.0,
2028	// they're also necessary at device scope for GFX12.5 as stores
2029	// cannot report completion earlier than L2.
2030	//
2031	// Emitting it for lower scopes is a slow no-op, so we omit it
2032	// for performance.
2033	std::optional<AMDGPU::CPol::CPol> NeedsWB;
2034	switch (Scope) {
2035	case SIAtomicScope::SYSTEM:
2036	NeedsWB = AMDGPU::CPol::SCOPE_SYS;
2037	break;
2038	case SIAtomicScope::AGENT:
2039	// GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
2040	if (ST.hasGFX1250Insts())
2041	NeedsWB = AMDGPU::CPol::SCOPE_DEV;
2042	break;
2043	case SIAtomicScope::CLUSTER:
2044	case SIAtomicScope::WORKGROUP:
2045	// No WB necessary, but we still have to wait.
2046	case SIAtomicScope::WAVEFRONT:
2047	case SIAtomicScope::SINGLETHREAD:
2048	// No WB or wait necessary here, but insertWait takes care of that.
2049	break;
2050	default:
2051	llvm_unreachable("Unsupported synchronization scope");
2052	}
2053
2054	if (NeedsWB) {
2055	// Target requires a waitcnt to ensure that the proceeding store
2056	// proceeding store/rmw operations have completed in L2 so their data will
2057	// be written back by the WB instruction.
2058	if (ST.hasINVWBL2WaitCntRequirement())
2059	insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD \| SIMemOp::STORE,
2060	/IsCrossAddrSpaceOrdering=/false, Pos,
2061	Order: AtomicOrdering::Release,
2062	/AtomicsOnly=/false);
2063
2064	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GLOBAL_WB)).addImm(Val: *NeedsWB);
2065	Changed = true;
2066	}
2067
2068	if (Pos == Position::AFTER)
2069	--MI;
2070	}
2071
2072	// We always have to wait for previous memory operations (load/store) to
2073	// complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2074	// we of course need to wait for that as well.
2075	Changed \|= insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD \| SIMemOp::STORE,
2076	IsCrossAddrSpaceOrdering, Pos, Order: AtomicOrdering::Release,
2077	/AtomicsOnly=/false);
2078
2079	return Changed;
2080	}
2081
2082	bool SIGfx12CacheControl::handleNonVolatile(MachineInstr &MI) const {
2083	// On GFX12.5, set the NV CPol bit.
2084	if (!ST.hasGFX1250Insts())
2085	return false;
2086	MachineOperand *CPol = TII->getNamedOperand(MI, OperandName: OpName::cpol);
2087	if (!CPol)
2088	return false;
2089	CPol->setImm(CPol->getImm() \| AMDGPU::CPol::NV);
2090	return true;
2091	}
2092
2093	bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2094	MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2095	bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2096
2097	// Only handle load and store, not atomic read-modify-write instructions.
2098	assert((MI->mayLoad() ^ MI->mayStore()) \|\| SIInstrInfo::isLDSDMA(*MI));
2099
2100	// Only update load and store, not LLVM IR atomic read-modify-write
2101	// instructions. The latter are always marked as volatile so cannot sensibly
2102	// handle it as do not want to pessimize all atomics. Also they do not support
2103	// the nontemporal attribute.
2104	assert(Op == SIMemOp::LOAD \|\| Op == SIMemOp::STORE);
2105
2106	bool Changed = false;
2107
2108	if (IsLastUse) {
2109	// Set last-use hint.
2110	Changed \|= setTH(MI, Value: AMDGPU::CPol::TH_LU);
2111	} else if (IsNonTemporal) {
2112	// Set non-temporal hint for all cache levels.
2113	Changed \|= setTH(MI, Value: AMDGPU::CPol::TH_NT);
2114	}
2115
2116	if (IsVolatile) {
2117	Changed \|= setScope(MI, Value: AMDGPU::CPol::SCOPE_SYS);
2118
2119	if (ST.requiresWaitXCntForSingleAccessInstructions() &&
2120	SIInstrInfo::isVMEM(MI: *MI)) {
2121	MachineBasicBlock &MBB = *MI ->getParent();
2122	BuildMI(BB&: MBB, I: MI, MIMD: MI ->getDebugLoc(), MCID: TII->get(Opcode: S_WAIT_XCNT_soft)).addImm(Val: `0`);
2123	Changed = true;
2124	}
2125
2126	// Ensure operation has completed at system scope to cause all volatile
2127	// operations to be visible outside the program in a global order. Do not
2128	// request cross address space as only the global address space can be
2129	// observable outside the program, so no need to cause a waitcnt for LDS
2130	// address space operations.
2131	Changed \|= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false,
2132	Pos: Position::AFTER, Order: AtomicOrdering::Unordered,
2133	/AtomicsOnly=/false);
2134	}
2135
2136	return Changed;
2137	}
2138
2139	bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2140	assert(MI.mayStore() && "Not a Store inst");
2141	const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2142	bool Changed = false;
2143
2144	if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
2145	SIInstrInfo::isVMEM(MI)) {
2146	MachineBasicBlock &MBB = *MI.getParent();
2147	BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: S_WAIT_XCNT_soft)).addImm(Val: `0`);
2148	Changed = true;
2149	}
2150
2151	// Remaining fixes do not apply to RMWs.
2152	if (IsRMW)
2153	return Changed;
2154
2155	MachineOperand *CPol = TII->getNamedOperand(MI, OperandName: OpName::cpol);
2156	if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2157	return Changed;
2158	const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2159
2160	// GFX12.0 only: Extra waits needed before system scope stores.
2161	if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
2162	Scope == CPol::SCOPE_SYS)
2163	Changed \|= insertWaitsBeforeSystemScopeStore(MI: MI.getIterator());
2164
2165	return Changed;
2166	}
2167
2168	bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2169	if (!ST.hasGFX1250Insts())
2170	return false;
2171
2172	// Cooperative atomics need to be SCOPE_DEV or higher.
2173	MachineOperand *CPol = TII->getNamedOperand(MI, OperandName: OpName::cpol);
2174	assert(CPol && "No CPol operand?");
2175	const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2176	if (Scope < CPol::SCOPE_DEV)
2177	return setScope(MI, Value: CPol::SCOPE_DEV);
2178	return false;
2179	}
2180
2181	bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2182	SIAtomicScope Scope,
2183	SIAtomicAddrSpace AddrSpace) const {
2184	bool Changed = false;
2185
2186	if (canAffectGlobalAddrSpace(AS: AddrSpace)) {
2187	switch (Scope) {
2188	case SIAtomicScope::SYSTEM:
2189	Changed \|= setScope(MI, Value: AMDGPU::CPol::SCOPE_SYS);
2190	break;
2191	case SIAtomicScope::AGENT:
2192	Changed \|= setScope(MI, Value: AMDGPU::CPol::SCOPE_DEV);
2193	break;
2194	case SIAtomicScope::CLUSTER:
2195	Changed \|= setScope(MI, Value: AMDGPU::CPol::SCOPE_SE);
2196	break;
2197	case SIAtomicScope::WORKGROUP:
2198	// In workgroup mode, SCOPE_SE is needed as waves can executes on
2199	// different CUs that access different L0s.
2200	if (!ST.isCuModeEnabled())
2201	Changed \|= setScope(MI, Value: AMDGPU::CPol::SCOPE_SE);
2202	break;
2203	case SIAtomicScope::WAVEFRONT:
2204	case SIAtomicScope::SINGLETHREAD:
2205	// No cache to bypass.
2206	break;
2207	default:
2208	llvm_unreachable("Unsupported synchronization scope");
2209	}
2210	}
2211
2212	// The scratch address space does not need the global memory caches
2213	// to be bypassed as all memory operations by the same thread are
2214	// sequentially consistent, and no other thread can access scratch
2215	// memory.
2216
2217	// Other address spaces do not have a cache.
2218
2219	return Changed;
2220	}
2221
2222	bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2223	if (AtomicPseudoMIs.empty())
2224	return false;
2225
2226	for (auto &MI : AtomicPseudoMIs)
2227	MI ->eraseFromParent();
2228
2229	AtomicPseudoMIs.clear();
2230	return true;
2231	}
2232
2233	bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2234	MachineBasicBlock::iterator &MI) {
2235	assert(MI->mayLoad() && !MI->mayStore());
2236
2237	bool Changed = false;
2238
2239	if (MOI.isAtomic()) {
2240	const AtomicOrdering Order = MOI.getOrdering();
2241	if (Order == AtomicOrdering::Monotonic \|\|
2242	Order == AtomicOrdering::Acquire \|\|
2243	Order == AtomicOrdering::SequentiallyConsistent) {
2244	Changed \|= CC ->enableLoadCacheBypass(MI, Scope: MOI.getScope(),
2245	AddrSpace: MOI.getOrderingAddrSpace());
2246	}
2247
2248	// Handle cooperative atomics after cache bypass step, as it may override
2249	// the scope of the instruction to a greater scope.
2250	if (MOI.isCooperative())
2251	Changed \|= CC ->handleCooperativeAtomic(MI&: *MI);
2252
2253	if (Order == AtomicOrdering::SequentiallyConsistent)
2254	Changed \|= CC ->insertWait(MI, Scope: MOI.getScope(), AddrSpace: MOI.getOrderingAddrSpace(),
2255	Op: SIMemOp::LOAD \| SIMemOp::STORE,
2256	IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2257	Pos: Position::BEFORE, Order, /AtomicsOnly=/false);
2258
2259	if (Order == AtomicOrdering::Acquire \|\|
2260	Order == AtomicOrdering::SequentiallyConsistent) {
2261	// The wait below only needs to wait on the prior atomic.
2262	Changed \|=
2263	CC ->insertWait(MI, Scope: MOI.getScope(), AddrSpace: MOI.getInstrAddrSpace(),
2264	Op: SIMemOp::LOAD, IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2265	Pos: Position::AFTER, Order, /AtomicsOnly=/true);
2266	Changed \|= CC ->insertAcquire(MI, Scope: MOI.getScope(),
2267	AddrSpace: MOI.getOrderingAddrSpace(),
2268	Pos: Position::AFTER);
2269	}
2270
2271	return Changed;
2272	}
2273
2274	// Atomic instructions already bypass caches to the scope specified by the
2275	// SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2276	// instructions need additional treatment.
2277	Changed \|= CC ->enableVolatileAndOrNonTemporal(
2278	MI, AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::LOAD, IsVolatile: MOI.isVolatile(),
2279	IsNonTemporal: MOI.isNonTemporal(), IsLastUse: MOI.isLastUse());
2280
2281	return Changed;
2282	}
2283
2284	bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2285	MachineBasicBlock::iterator &MI) {
2286	assert(!MI->mayLoad() && MI->mayStore());
2287
2288	bool Changed = false;
2289	// FIXME: Necessary hack because iterator can lose track of the store.
2290	MachineInstr &StoreMI = *MI;
2291
2292	if (MOI.isAtomic()) {
2293	if (MOI.getOrdering() == AtomicOrdering::Monotonic \|\|
2294	MOI.getOrdering() == AtomicOrdering::Release \|\|
2295	MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2296	Changed \|= CC ->enableStoreCacheBypass(MI, Scope: MOI.getScope(),
2297	AddrSpace: MOI.getOrderingAddrSpace());
2298	}
2299
2300	// Handle cooperative atomics after cache bypass step, as it may override
2301	// the scope of the instruction to a greater scope.
2302	if (MOI.isCooperative())
2303	Changed \|= CC ->handleCooperativeAtomic(MI&: *MI);
2304
2305	if (MOI.getOrdering() == AtomicOrdering::Release \|\|
2306	MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2307	Changed \|= CC ->insertRelease(MI, Scope: MOI.getScope(),
2308	AddrSpace: MOI.getOrderingAddrSpace(),
2309	IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2310	Pos: Position::BEFORE);
2311
2312	Changed \|= CC ->finalizeStore(MI&: StoreMI, /Atomic=/true);
2313	return Changed;
2314	}
2315
2316	// Atomic instructions already bypass caches to the scope specified by the
2317	// SyncScope operand. Only non-atomic volatile and nontemporal instructions
2318	// need additional treatment.
2319	Changed \|= CC ->enableVolatileAndOrNonTemporal(
2320	MI, AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::STORE, IsVolatile: MOI.isVolatile(),
2321	IsNonTemporal: MOI.isNonTemporal());
2322
2323	// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
2324	// instruction field, do not confuse it with atomic scope.
2325	Changed \|= CC ->finalizeStore(MI&: StoreMI, /Atomic=/false);
2326	return Changed;
2327	}
2328
2329	bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2330	MachineBasicBlock::iterator &MI) {
2331	assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2332
2333	AtomicPseudoMIs.push_back(x: MI);
2334	bool Changed = false;
2335
2336	const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
2337
2338	if (MOI.isAtomic()) {
2339	const AtomicOrdering Order = MOI.getOrdering();
2340	if (Order == AtomicOrdering::Acquire) {
2341	// Acquire fences only need to wait on the previous atomic they pair with.
2342	Changed \|= CC ->insertWait(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace,
2343	Op: SIMemOp::LOAD \| SIMemOp::STORE,
2344	IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2345	Pos: Position::BEFORE, Order, /AtomicsOnly=/true);
2346	}
2347
2348	if (Order == AtomicOrdering::Release \|\|
2349	Order == AtomicOrdering::AcquireRelease \|\|
2350	Order == AtomicOrdering::SequentiallyConsistent)
2351	/// TODO: This relies on a barrier always generating a waitcnt
2352	/// for LDS to ensure it is not reordered with the completion of
2353	/// the proceeding LDS operations. If barrier had a memory
2354	/// ordering and memory scope, then library does not need to
2355	/// generate a fence. Could add support in this file for
2356	/// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2357	/// adding S_WAITCNT before a S_BARRIER.
2358	Changed \|= CC ->insertRelease(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace,
2359	IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2360	Pos: Position::BEFORE);
2361
2362	// TODO: If both release and invalidate are happening they could be combined
2363	// to use the single "BUFFER_WBINV" instruction. This could be done by*
2364	// reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2365	// track cache invalidate and write back instructions.
2366
2367	if (Order == AtomicOrdering::Acquire \|\|
2368	Order == AtomicOrdering::AcquireRelease \|\|
2369	Order == AtomicOrdering::SequentiallyConsistent)
2370	Changed \|= CC ->insertAcquire(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace,
2371	Pos: Position::BEFORE);
2372
2373	return Changed;
2374	}
2375
2376	return Changed;
2377	}
2378
2379	bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2380	MachineBasicBlock::iterator &MI) {
2381	assert(MI->mayLoad() && MI->mayStore());
2382
2383	bool Changed = false;
2384	MachineInstr &RMWMI = *MI;
2385
2386	if (MOI.isAtomic()) {
2387	const AtomicOrdering Order = MOI.getOrdering();
2388	if (Order == AtomicOrdering::Monotonic \|\|
2389	Order == AtomicOrdering::Acquire \|\| Order == AtomicOrdering::Release \|\|
2390	Order == AtomicOrdering::AcquireRelease \|\|
2391	Order == AtomicOrdering::SequentiallyConsistent) {
2392	Changed \|= CC ->enableRMWCacheBypass(MI, Scope: MOI.getScope(),
2393	AddrSpace: MOI.getInstrAddrSpace());
2394	}
2395
2396	if (Order == AtomicOrdering::Release \|\|
2397	Order == AtomicOrdering::AcquireRelease \|\|
2398	Order == AtomicOrdering::SequentiallyConsistent \|\|
2399	MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2400	Changed \|= CC ->insertRelease(MI, Scope: MOI.getScope(),
2401	AddrSpace: MOI.getOrderingAddrSpace(),
2402	IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(),
2403	Pos: Position::BEFORE);
2404
2405	if (Order == AtomicOrdering::Acquire \|\|
2406	Order == AtomicOrdering::AcquireRelease \|\|
2407	Order == AtomicOrdering::SequentiallyConsistent \|\|
2408	MOI.getFailureOrdering() == AtomicOrdering::Acquire \|\|
2409	MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2410	// Only wait on the previous atomic.
2411	Changed \|=
2412	CC ->insertWait(MI, Scope: MOI.getScope(), AddrSpace: MOI.getInstrAddrSpace(),
2413	Op: isAtomicRet(MI: *MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2414	IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), Pos: Position::AFTER,
2415	Order, /AtomicsOnly=/true);
2416	Changed \|= CC ->insertAcquire(MI, Scope: MOI.getScope(),
2417	AddrSpace: MOI.getOrderingAddrSpace(),
2418	Pos: Position::AFTER);
2419	}
2420
2421	Changed \|= CC ->finalizeStore(MI&: RMWMI, /Atomic=/true);
2422	return Changed;
2423	}
2424
2425	return Changed;
2426	}
2427
2428	bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
2429	MachineBasicBlock::iterator &MI) {
2430	assert(MI->mayLoad() && MI->mayStore());
2431
2432	// The volatility or nontemporal-ness of the operation is a
2433	// function of the global memory, not the LDS.
2434	SIMemOp OpKind =
2435	SIInstrInfo::mayWriteLDSThroughDMA(MI: *MI) ? SIMemOp::LOAD : SIMemOp::STORE;
2436
2437	// Handle volatile and/or nontemporal markers on direct-to-LDS loads and
2438	// stores. The operation is treated as a volatile/nontemporal store
2439	// to its second argument.
2440	return CC ->enableVolatileAndOrNonTemporal(
2441	MI, AddrSpace: MOI.getInstrAddrSpace(), Op: OpKind, IsVolatile: MOI.isVolatile(),
2442	IsNonTemporal: MOI.isNonTemporal(), IsLastUse: MOI.isLastUse());
2443	}
2444
2445	bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2446	const MachineModuleInfo &MMI =
2447	getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2448	return SIMemoryLegalizer (MMI).run(MF);
2449	}
2450
2451	PreservedAnalyses
2452	SIMemoryLegalizerPass::run(MachineFunction &MF,
2453	MachineFunctionAnalysisManager &MFAM) {
2454	auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(IR&: MF)
2455	.getCachedResult<MachineModuleAnalysis>(
2456	IR&: *MF.getFunction().getParent());
2457	assert(MMI && "MachineModuleAnalysis must be available");
2458	if (!SIMemoryLegalizer (MMI->getMMI()).run(MF))
2459	return PreservedAnalyses::all();
2460	return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
2461	}
2462
2463	bool SIMemoryLegalizer::run(MachineFunction &MF) {
2464	bool Changed = false;
2465
2466	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2467	SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
2468	CC = SICacheControl::create(ST);
2469
2470	for (auto &MBB : MF) {
2471	for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2472
2473	// Unbundle instructions after the post-RA scheduler.
2474	if (MI ->isBundle() && MI ->mayLoadOrStore()) {
2475	MachineBasicBlock::instr_iterator II(MI ->getIterator());
2476	for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2477	I != E && I ->isBundledWithPred(); ++I) {
2478	I ->unbundleFromPred();
2479	for (MachineOperand &MO : I ->operands())
2480	if (MO.isReg())
2481	MO.setIsInternalRead(false);
2482	}
2483
2484	MI = MI ->eraseFromParent();
2485	}
2486
2487	if (MI ->getDesc().TSFlags & SIInstrFlags::maybeAtomic) {
2488	if (const auto &MOI = MOA.getLoadInfo(MI))
2489	Changed \|= expandLoad(MOI: *MOI, MI);
2490	else if (const auto &MOI = MOA.getStoreInfo(MI))
2491	Changed \|= expandStore(MOI: *MOI, MI);
2492	else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
2493	Changed \|= expandLDSDMA(MOI: *MOI, MI);
2494	else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2495	Changed \|= expandAtomicFence(MOI: *MOI, MI);
2496	else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2497	Changed \|= expandAtomicCmpxchgOrRmw(MOI: *MOI, MI);
2498	}
2499
2500	if (isNonVolatileMemoryAccess(MI: *MI))
2501	Changed \|= CC ->handleNonVolatile(MI&: *MI);
2502	}
2503	}
2504
2505	Changed \|= removeAtomicPseudoMIs();
2506	return Changed;
2507	}
2508
2509	INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
2510
2511	char SIMemoryLegalizerLegacy::ID = `0`;
2512	char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
2513
2514	FunctionPass *llvm::createSIMemoryLegalizerPass() {
2515	return new SIMemoryLegalizerLegacy ();
2516	}
2517

// Source: llvm_projects/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp