1 | //===- SIMemoryLegalizer.cpp ----------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Memory legalizer - implements memory model. More information can be |
11 | /// found here: |
12 | /// http://llvm.org/docs/AMDGPUUsage.html#memory-model |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #include "AMDGPU.h" |
17 | #include "AMDGPUMachineModuleInfo.h" |
18 | #include "GCNSubtarget.h" |
19 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
20 | #include "llvm/ADT/BitmaskEnum.h" |
21 | #include "llvm/ADT/StringExtras.h" |
22 | #include "llvm/CodeGen/MachineBasicBlock.h" |
23 | #include "llvm/CodeGen/MachineFunctionPass.h" |
24 | #include "llvm/CodeGen/MachinePassManager.h" |
25 | #include "llvm/IR/DiagnosticInfo.h" |
26 | #include "llvm/IR/MemoryModelRelaxationAnnotations.h" |
27 | #include "llvm/IR/PassManager.h" |
28 | #include "llvm/Support/AtomicOrdering.h" |
29 | #include "llvm/TargetParser/TargetParser.h" |
30 | |
31 | using namespace llvm; |
32 | using namespace llvm::AMDGPU; |
33 | |
34 | #define DEBUG_TYPE "si-memory-legalizer" |
35 | #define PASS_NAME "SI Memory Legalizer" |
36 | |
37 | static cl::opt<bool> AmdgcnSkipCacheInvalidations( |
38 | "amdgcn-skip-cache-invalidations" , cl::init(Val: false), cl::Hidden, |
39 | cl::desc("Use this to skip inserting cache invalidating instructions." )); |
40 | |
41 | namespace { |
42 | |
43 | LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); |
44 | |
45 | /// Memory operation flags. Can be ORed together. |
46 | enum class SIMemOp { |
47 | NONE = 0u, |
48 | LOAD = 1u << 0, |
49 | STORE = 1u << 1, |
50 | LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) |
51 | }; |
52 | |
53 | /// Position to insert a new instruction relative to an existing |
54 | /// instruction. |
55 | enum class Position { |
56 | BEFORE, |
57 | AFTER |
58 | }; |
59 | |
60 | /// The atomic synchronization scopes supported by the AMDGPU target. |
61 | enum class SIAtomicScope { |
62 | NONE, |
63 | SINGLETHREAD, |
64 | WAVEFRONT, |
65 | WORKGROUP, |
66 | AGENT, |
67 | SYSTEM |
68 | }; |
69 | |
70 | /// The distinct address spaces supported by the AMDGPU target for |
/// atomic memory operations. Can be ORed together.
72 | enum class SIAtomicAddrSpace { |
73 | NONE = 0u, |
74 | GLOBAL = 1u << 0, |
75 | LDS = 1u << 1, |
76 | SCRATCH = 1u << 2, |
77 | GDS = 1u << 3, |
78 | OTHER = 1u << 4, |
79 | |
80 | /// The address spaces that can be accessed by a FLAT instruction. |
81 | FLAT = GLOBAL | LDS | SCRATCH, |
82 | |
83 | /// The address spaces that support atomic instructions. |
84 | ATOMIC = GLOBAL | LDS | SCRATCH | GDS, |
85 | |
86 | /// All address spaces. |
87 | ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, |
88 | |
89 | LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) |
90 | }; |
91 | |
92 | class SIMemOpInfo final { |
93 | private: |
94 | |
95 | friend class SIMemOpAccess; |
96 | |
97 | AtomicOrdering Ordering = AtomicOrdering::NotAtomic; |
98 | AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; |
99 | SIAtomicScope Scope = SIAtomicScope::SYSTEM; |
100 | SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; |
101 | SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; |
102 | bool IsCrossAddressSpaceOrdering = false; |
103 | bool IsVolatile = false; |
104 | bool IsNonTemporal = false; |
105 | bool IsLastUse = false; |
106 | |
107 | SIMemOpInfo( |
108 | AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, |
109 | SIAtomicScope Scope = SIAtomicScope::SYSTEM, |
110 | SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, |
111 | SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, |
112 | bool IsCrossAddressSpaceOrdering = true, |
113 | AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent, |
114 | bool IsVolatile = false, bool IsNonTemporal = false, |
115 | bool IsLastUse = false) |
116 | : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope), |
117 | OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace), |
118 | IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), |
119 | IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal), |
120 | IsLastUse(IsLastUse) { |
121 | |
122 | if (Ordering == AtomicOrdering::NotAtomic) { |
123 | assert(Scope == SIAtomicScope::NONE && |
124 | OrderingAddrSpace == SIAtomicAddrSpace::NONE && |
125 | !IsCrossAddressSpaceOrdering && |
126 | FailureOrdering == AtomicOrdering::NotAtomic); |
127 | return; |
128 | } |
129 | |
130 | assert(Scope != SIAtomicScope::NONE && |
131 | (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != |
132 | SIAtomicAddrSpace::NONE && |
133 | (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) != |
134 | SIAtomicAddrSpace::NONE); |
135 | |
136 | // There is also no cross address space ordering if the ordering |
137 | // address space is the same as the instruction address space and |
138 | // only contains a single address space. |
139 | if ((OrderingAddrSpace == InstrAddrSpace) && |
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
141 | this->IsCrossAddressSpaceOrdering = false; |
142 | |
143 | // Limit the scope to the maximum supported by the instruction's address |
144 | // spaces. |
145 | if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) == |
146 | SIAtomicAddrSpace::NONE) { |
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
148 | } else if ((InstrAddrSpace & |
149 | ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) == |
150 | SIAtomicAddrSpace::NONE) { |
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
152 | } else if ((InstrAddrSpace & |
153 | ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS | |
154 | SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) { |
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
156 | } |
157 | } |
158 | |
159 | public: |
160 | /// \returns Atomic synchronization scope of the machine instruction used to |
161 | /// create this SIMemOpInfo. |
162 | SIAtomicScope getScope() const { |
163 | return Scope; |
164 | } |
165 | |
166 | /// \returns Ordering constraint of the machine instruction used to |
167 | /// create this SIMemOpInfo. |
168 | AtomicOrdering getOrdering() const { |
169 | return Ordering; |
170 | } |
171 | |
172 | /// \returns Failure ordering constraint of the machine instruction used to |
173 | /// create this SIMemOpInfo. |
174 | AtomicOrdering getFailureOrdering() const { |
175 | return FailureOrdering; |
176 | } |
177 | |
  /// \returns The address spaces accessed by the machine
179 | /// instruction used to create this SIMemOpInfo. |
180 | SIAtomicAddrSpace getInstrAddrSpace() const { |
181 | return InstrAddrSpace; |
182 | } |
183 | |
184 | /// \returns The address spaces that must be ordered by the machine |
185 | /// instruction used to create this SIMemOpInfo. |
186 | SIAtomicAddrSpace getOrderingAddrSpace() const { |
187 | return OrderingAddrSpace; |
188 | } |
189 | |
  /// \returns True iff memory ordering between operations on different
  /// address spaces is required, false otherwise.
192 | bool getIsCrossAddressSpaceOrdering() const { |
193 | return IsCrossAddressSpaceOrdering; |
194 | } |
195 | |
196 | /// \returns True if memory access of the machine instruction used to |
197 | /// create this SIMemOpInfo is volatile, false otherwise. |
198 | bool isVolatile() const { |
199 | return IsVolatile; |
200 | } |
201 | |
202 | /// \returns True if memory access of the machine instruction used to |
203 | /// create this SIMemOpInfo is nontemporal, false otherwise. |
204 | bool isNonTemporal() const { |
205 | return IsNonTemporal; |
206 | } |
207 | |
208 | /// \returns True if memory access of the machine instruction used to |
209 | /// create this SIMemOpInfo is last use, false otherwise. |
210 | bool isLastUse() const { return IsLastUse; } |
211 | |
212 | /// \returns True if ordering constraint of the machine instruction used to |
213 | /// create this SIMemOpInfo is unordered or higher, false otherwise. |
214 | bool isAtomic() const { |
215 | return Ordering != AtomicOrdering::NotAtomic; |
216 | } |
217 | |
218 | }; |
219 | |
220 | class SIMemOpAccess final { |
221 | private: |
222 | const AMDGPUMachineModuleInfo *MMI = nullptr; |
223 | |
224 | /// Reports unsupported message \p Msg for \p MI to LLVM context. |
225 | void reportUnsupported(const MachineBasicBlock::iterator &MI, |
226 | const char *Msg) const; |
227 | |
228 | /// Inspects the target synchronization scope \p SSID and determines |
229 | /// the SI atomic scope it corresponds to, the address spaces it |
230 | /// covers, and whether the memory ordering applies between address |
231 | /// spaces. |
232 | std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> |
233 | toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const; |
234 | |
  /// \returns A bit set of the SI atomic address spaces corresponding to the
  /// LLVM address space \p AS.
236 | SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; |
237 | |
  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
240 | std::optional<SIMemOpInfo> |
241 | constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const; |
242 | |
243 | public: |
244 | /// Construct class to support accessing the machine memory operands |
245 | /// of instructions in the machine function \p MF. |
246 | SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI); |
247 | |
248 | /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise. |
249 | std::optional<SIMemOpInfo> |
250 | getLoadInfo(const MachineBasicBlock::iterator &MI) const; |
251 | |
252 | /// \returns Store info if \p MI is a store operation, "std::nullopt" |
253 | /// otherwise. |
254 | std::optional<SIMemOpInfo> |
255 | getStoreInfo(const MachineBasicBlock::iterator &MI) const; |
256 | |
257 | /// \returns Atomic fence info if \p MI is an atomic fence operation, |
258 | /// "std::nullopt" otherwise. |
259 | std::optional<SIMemOpInfo> |
260 | getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const; |
261 | |
262 | /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or |
263 | /// rmw operation, "std::nullopt" otherwise. |
264 | std::optional<SIMemOpInfo> |
265 | getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const; |
266 | }; |
267 | |
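/// Interface for inserting the cache operations and waits a subtarget needs
/// to implement the memory model. A concrete implementation for the current
/// subtarget is obtained from SICacheControl::create().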
268 | class SICacheControl { |
269 | protected: |
270 | |
271 | /// AMDGPU subtarget info. |
272 | const GCNSubtarget &ST; |
273 | |
274 | /// Instruction info. |
275 | const SIInstrInfo *TII = nullptr; |
276 | |
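  /// ISA version of the subtarget, used to encode wait count instructions.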
277 | IsaVersion IV; |
278 | |
279 | /// Whether to insert cache invalidating instructions. |
280 | bool InsertCacheInv; |
281 | |
282 | SICacheControl(const GCNSubtarget &ST); |
283 | |
  /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
286 | bool enableNamedBit(const MachineBasicBlock::iterator MI, |
287 | AMDGPU::CPol::CPol Bit) const; |
288 | |
289 | public: |
290 | |
291 | /// Create a cache control for the subtarget \p ST. |
292 | static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); |
293 | |
294 | /// Update \p MI memory load instruction to bypass any caches up to |
295 | /// the \p Scope memory scope for address spaces \p |
296 | /// AddrSpace. Return true iff the instruction was modified. |
297 | virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
298 | SIAtomicScope Scope, |
299 | SIAtomicAddrSpace AddrSpace) const = 0; |
300 | |
301 | /// Update \p MI memory store instruction to bypass any caches up to |
302 | /// the \p Scope memory scope for address spaces \p |
303 | /// AddrSpace. Return true iff the instruction was modified. |
304 | virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, |
305 | SIAtomicScope Scope, |
306 | SIAtomicAddrSpace AddrSpace) const = 0; |
307 | |
308 | /// Update \p MI memory read-modify-write instruction to bypass any caches up |
309 | /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true |
310 | /// iff the instruction was modified. |
311 | virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, |
312 | SIAtomicScope Scope, |
313 | SIAtomicAddrSpace AddrSpace) const = 0; |
314 | |
315 | /// Update \p MI memory instruction of kind \p Op associated with address |
316 | /// spaces \p AddrSpace to indicate it is volatile and/or |
317 | /// nontemporal/last-use. Return true iff the instruction was modified. |
318 | virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
319 | SIAtomicAddrSpace AddrSpace, |
320 | SIMemOp Op, bool IsVolatile, |
321 | bool IsNonTemporal, |
322 | bool IsLastUse = false) const = 0; |
323 | |
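  /// Hook allowing a target to expand a store performed at system scope
  /// (overridden for GFX12). Returns true iff \p MI was modified or new
  /// instructions were inserted; the default implementation does nothing.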
324 | virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const { |
325 | return false; |
326 | }; |
327 | |
328 | /// Inserts any necessary instructions at position \p Pos relative |
329 | /// to instruction \p MI to ensure memory instructions before \p Pos of kind |
330 | /// \p Op associated with address spaces \p AddrSpace have completed. Used |
331 | /// between memory instructions to enforce the order they become visible as |
332 | /// observed by other memory instructions executing in memory scope \p Scope. |
333 | /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between |
  /// address spaces. Returns true iff any instructions are inserted.
335 | virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
336 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
337 | bool IsCrossAddrSpaceOrdering, Position Pos, |
338 | AtomicOrdering Order) const = 0; |
339 | |
340 | /// Inserts any necessary instructions at position \p Pos relative to |
341 | /// instruction \p MI to ensure any subsequent memory instructions of this |
342 | /// thread with address spaces \p AddrSpace will observe the previous memory |
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions are inserted.
345 | virtual bool insertAcquire(MachineBasicBlock::iterator &MI, |
346 | SIAtomicScope Scope, |
347 | SIAtomicAddrSpace AddrSpace, |
348 | Position Pos) const = 0; |
349 | |
350 | /// Inserts any necessary instructions at position \p Pos relative to |
351 | /// instruction \p MI to ensure previous memory instructions by this thread |
352 | /// with address spaces \p AddrSpace have completed and can be observed by |
353 | /// subsequent memory instructions by any thread executing in memory scope \p |
354 | /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is |
  /// between address spaces. Returns true iff any instructions are inserted.
356 | virtual bool insertRelease(MachineBasicBlock::iterator &MI, |
357 | SIAtomicScope Scope, |
358 | SIAtomicAddrSpace AddrSpace, |
359 | bool IsCrossAddrSpaceOrdering, |
360 | Position Pos) const = 0; |
361 | |
362 | /// Virtual destructor to allow derivations to be deleted. |
363 | virtual ~SICacheControl() = default; |
364 | }; |
365 | |
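/// Cache control for GFX6 (Southern Islands) subtargets; later generations
/// derive from this class and override only the behaviour that differs.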
366 | class SIGfx6CacheControl : public SICacheControl { |
367 | protected: |
368 | |
369 | /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI |
370 | /// is modified, false otherwise. |
371 | bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { |
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
373 | } |
374 | |
375 | /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI |
376 | /// is modified, false otherwise. |
377 | bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { |
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
379 | } |
380 | |
381 | public: |
382 | |
383 | SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} |
384 | |
385 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
386 | SIAtomicScope Scope, |
387 | SIAtomicAddrSpace AddrSpace) const override; |
388 | |
389 | bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, |
390 | SIAtomicScope Scope, |
391 | SIAtomicAddrSpace AddrSpace) const override; |
392 | |
393 | bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, |
394 | SIAtomicScope Scope, |
395 | SIAtomicAddrSpace AddrSpace) const override; |
396 | |
397 | bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
398 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
399 | bool IsVolatile, bool IsNonTemporal, |
400 | bool IsLastUse) const override; |
401 | |
402 | bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
403 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
404 | bool IsCrossAddrSpaceOrdering, Position Pos, |
405 | AtomicOrdering Order) const override; |
406 | |
407 | bool insertAcquire(MachineBasicBlock::iterator &MI, |
408 | SIAtomicScope Scope, |
409 | SIAtomicAddrSpace AddrSpace, |
410 | Position Pos) const override; |
411 | |
412 | bool insertRelease(MachineBasicBlock::iterator &MI, |
413 | SIAtomicScope Scope, |
414 | SIAtomicAddrSpace AddrSpace, |
415 | bool IsCrossAddrSpaceOrdering, |
416 | Position Pos) const override; |
417 | }; |
418 | |
419 | class SIGfx7CacheControl : public SIGfx6CacheControl { |
420 | public: |
421 | |
422 | SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} |
423 | |
424 | bool insertAcquire(MachineBasicBlock::iterator &MI, |
425 | SIAtomicScope Scope, |
426 | SIAtomicAddrSpace AddrSpace, |
427 | Position Pos) const override; |
428 | |
429 | }; |
430 | |
431 | class SIGfx90ACacheControl : public SIGfx7CacheControl { |
432 | public: |
433 | |
434 | SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} |
435 | |
436 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
437 | SIAtomicScope Scope, |
438 | SIAtomicAddrSpace AddrSpace) const override; |
439 | |
440 | bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, |
441 | SIAtomicScope Scope, |
442 | SIAtomicAddrSpace AddrSpace) const override; |
443 | |
444 | bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, |
445 | SIAtomicScope Scope, |
446 | SIAtomicAddrSpace AddrSpace) const override; |
447 | |
448 | bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
449 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
450 | bool IsVolatile, bool IsNonTemporal, |
451 | bool IsLastUse) const override; |
452 | |
453 | bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
454 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
455 | bool IsCrossAddrSpaceOrdering, Position Pos, |
456 | AtomicOrdering Order) const override; |
457 | |
458 | bool insertAcquire(MachineBasicBlock::iterator &MI, |
459 | SIAtomicScope Scope, |
460 | SIAtomicAddrSpace AddrSpace, |
461 | Position Pos) const override; |
462 | |
463 | bool insertRelease(MachineBasicBlock::iterator &MI, |
464 | SIAtomicScope Scope, |
465 | SIAtomicAddrSpace AddrSpace, |
466 | bool IsCrossAddrSpaceOrdering, |
467 | Position Pos) const override; |
468 | }; |
469 | |
470 | class SIGfx940CacheControl : public SIGfx90ACacheControl { |
471 | protected: |
472 | |
473 | /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI |
474 | /// is modified, false otherwise. |
475 | bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { |
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
477 | } |
478 | |
479 | /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI |
480 | /// is modified, false otherwise. |
481 | bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { |
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
483 | } |
484 | |
485 | /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI |
486 | /// is modified, false otherwise. |
487 | bool enableNTBit(const MachineBasicBlock::iterator &MI) const { |
    return enableNamedBit(MI, AMDGPU::CPol::NT);
489 | } |
490 | |
491 | public: |
492 | SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; |
493 | |
494 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
495 | SIAtomicScope Scope, |
496 | SIAtomicAddrSpace AddrSpace) const override; |
497 | |
498 | bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, |
499 | SIAtomicScope Scope, |
500 | SIAtomicAddrSpace AddrSpace) const override; |
501 | |
502 | bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, |
503 | SIAtomicScope Scope, |
504 | SIAtomicAddrSpace AddrSpace) const override; |
505 | |
506 | bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
507 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
508 | bool IsVolatile, bool IsNonTemporal, |
509 | bool IsLastUse) const override; |
510 | |
511 | bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
512 | SIAtomicAddrSpace AddrSpace, Position Pos) const override; |
513 | |
514 | bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
515 | SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, |
516 | Position Pos) const override; |
517 | }; |
518 | |
519 | class SIGfx10CacheControl : public SIGfx7CacheControl { |
520 | protected: |
521 | |
522 | /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI |
523 | /// is modified, false otherwise. |
524 | bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { |
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
526 | } |
527 | |
528 | public: |
529 | |
530 | SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} |
531 | |
532 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
533 | SIAtomicScope Scope, |
534 | SIAtomicAddrSpace AddrSpace) const override; |
535 | |
536 | bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
537 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
538 | bool IsVolatile, bool IsNonTemporal, |
539 | bool IsLastUse) const override; |
540 | |
541 | bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
542 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
543 | bool IsCrossAddrSpaceOrdering, Position Pos, |
544 | AtomicOrdering Order) const override; |
545 | |
546 | bool insertAcquire(MachineBasicBlock::iterator &MI, |
547 | SIAtomicScope Scope, |
548 | SIAtomicAddrSpace AddrSpace, |
549 | Position Pos) const override; |
550 | }; |
551 | |
552 | class SIGfx11CacheControl : public SIGfx10CacheControl { |
553 | public: |
554 | SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} |
555 | |
556 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
557 | SIAtomicScope Scope, |
558 | SIAtomicAddrSpace AddrSpace) const override; |
559 | |
560 | bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
561 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
562 | bool IsVolatile, bool IsNonTemporal, |
563 | bool IsLastUse) const override; |
564 | }; |
565 | |
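/// GFX12 encodes cache behaviour with the TH (temporal hint) and Scope fields
/// of the CPol operand instead of the GLC/SLC/DLC bits used by earlier
/// generations.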
566 | class SIGfx12CacheControl : public SIGfx11CacheControl { |
567 | protected: |
568 | // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. |
  // \returns True if \p MI is modified, false otherwise.
570 | bool setTH(const MachineBasicBlock::iterator MI, |
571 | AMDGPU::CPol::CPol Value) const; |
572 | // Sets Scope policy to \p Value if CPol operand is present in instruction \p |
  // MI. \returns True if \p MI is modified, false otherwise.
574 | bool setScope(const MachineBasicBlock::iterator MI, |
575 | AMDGPU::CPol::CPol Value) const; |
576 | |
577 | // Stores with system scope (SCOPE_SYS) need to wait for: |
578 | // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0 |
579 | // - non-returning-atomics - wait for STORECNT==0 |
580 | // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits |
581 | // since it does not distinguish atomics-with-return from regular stores. |
582 | // There is no need to wait if memory is cached (mtype != UC). |
583 | bool |
584 | insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const; |
585 | |
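  // Sets the cache policy scope of \p MI to match the atomic scope \p Scope
  // for the address spaces \p AddrSpace. Returns true iff \p MI is modified.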
586 | bool setAtomicScope(const MachineBasicBlock::iterator &MI, |
587 | SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; |
588 | |
589 | public: |
590 | SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} |
591 | |
592 | bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
593 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
594 | bool IsCrossAddrSpaceOrdering, Position Pos, |
595 | AtomicOrdering Order) const override; |
596 | |
597 | bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
598 | SIAtomicAddrSpace AddrSpace, Position Pos) const override; |
599 | |
600 | bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
601 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
602 | bool IsVolatile, bool IsNonTemporal, |
603 | bool IsLastUse) const override; |
604 | |
605 | bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; |
606 | |
607 | bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
608 | SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, |
609 | Position Pos) const override; |
610 | |
611 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
612 | SIAtomicScope Scope, |
613 | SIAtomicAddrSpace AddrSpace) const override { |
614 | return setAtomicScope(MI, Scope, AddrSpace); |
615 | } |
616 | |
617 | bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, |
618 | SIAtomicScope Scope, |
619 | SIAtomicAddrSpace AddrSpace) const override { |
620 | return setAtomicScope(MI, Scope, AddrSpace); |
621 | } |
622 | |
623 | bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, |
624 | SIAtomicScope Scope, |
625 | SIAtomicAddrSpace AddrSpace) const override { |
626 | return setAtomicScope(MI, Scope, AddrSpace); |
627 | } |
628 | }; |
629 | |
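/// Legalizes memory operations in a machine function to honor the AMDGPU
/// memory model by inserting the required waits and cache operations.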
630 | class SIMemoryLegalizer final { |
631 | private: |
632 | const MachineModuleInfo &MMI; |
633 | /// Cache Control. |
634 | std::unique_ptr<SICacheControl> CC = nullptr; |
635 | |
636 | /// List of atomic pseudo instructions. |
637 | std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; |
638 | |
  /// Return true iff instruction \p MI is an atomic instruction that
640 | /// returns a result. |
641 | bool isAtomicRet(const MachineInstr &MI) const { |
642 | return SIInstrInfo::isAtomicRet(MI); |
643 | } |
644 | |
645 | /// Removes all processed atomic pseudo instructions from the current |
  /// function. Returns true if the function was modified, false otherwise.
647 | bool removeAtomicPseudoMIs(); |
648 | |
649 | /// Expands load operation \p MI. Returns true if instructions are |
650 | /// added/deleted or \p MI is modified, false otherwise. |
651 | bool expandLoad(const SIMemOpInfo &MOI, |
652 | MachineBasicBlock::iterator &MI); |
653 | /// Expands store operation \p MI. Returns true if instructions are |
654 | /// added/deleted or \p MI is modified, false otherwise. |
655 | bool expandStore(const SIMemOpInfo &MOI, |
656 | MachineBasicBlock::iterator &MI); |
657 | /// Expands atomic fence operation \p MI. Returns true if |
658 | /// instructions are added/deleted or \p MI is modified, false otherwise. |
659 | bool expandAtomicFence(const SIMemOpInfo &MOI, |
660 | MachineBasicBlock::iterator &MI); |
661 | /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if |
662 | /// instructions are added/deleted or \p MI is modified, false otherwise. |
663 | bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, |
664 | MachineBasicBlock::iterator &MI); |
665 | |
666 | public: |
667 | SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {}; |
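
  /// Legalizes all memory operations in machine function \p MF. Returns true
  /// if \p MF was modified, false otherwise.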
668 | bool run(MachineFunction &MF); |
669 | }; |
670 | |
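/// Legacy pass manager wrapper around SIMemoryLegalizer.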
671 | class SIMemoryLegalizerLegacy final : public MachineFunctionPass { |
672 | public: |
673 | static char ID; |
674 | |
675 | SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {} |
676 | |
677 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
678 | AU.setPreservesCFG(); |
679 | MachineFunctionPass::getAnalysisUsage(AU); |
680 | } |
681 | |
682 | StringRef getPassName() const override { |
683 | return PASS_NAME; |
684 | } |
685 | |
686 | bool runOnMachineFunction(MachineFunction &MF) override; |
687 | }; |
688 | |
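/// Address space names accepted in the "amdgpu-as" MMRA, mapped to the
/// SIAtomicAddrSpace bits they denote.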
689 | static const StringMap<SIAtomicAddrSpace> ASNames = {{ |
690 | {"global" , SIAtomicAddrSpace::GLOBAL}, |
691 | {"local" , SIAtomicAddrSpace::LDS}, |
692 | }}; |
693 | |
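/// Emits a warning for an unrecognized address space name \p AS found in an
/// "amdgpu-as" MMRA on \p MI, listing the supported names.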
694 | void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) { |
695 | const MachineFunction *MF = MI.getMF(); |
696 | const Function &Fn = MF->getFunction(); |
697 | SmallString<128> Str; |
698 | raw_svector_ostream OS(Str); |
699 | OS << "unknown address space '" << AS << "'; expected one of " ; |
700 | ListSeparator LS; |
701 | for (const auto &[Name, Val] : ASNames) |
702 | OS << LS << '\'' << Name << '\''; |
703 | Fn.getContext().diagnose( |
      DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
705 | } |
706 | |
707 | /// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA. |
708 | /// If this tag isn't present, or if it has no meaningful values, returns \p |
709 | /// Default. Otherwise returns all the address spaces concerned by the MMRA. |
710 | static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI, |
711 | SIAtomicAddrSpace Default) { |
  static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
713 | |
714 | auto MMRA = MMRAMetadata(MI.getMMRAMetadata()); |
715 | if (!MMRA) |
716 | return Default; |
717 | |
718 | SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE; |
719 | for (const auto &[Prefix, Suffix] : MMRA) { |
720 | if (Prefix != FenceASPrefix) |
721 | continue; |
722 | |
    if (auto It = ASNames.find(Suffix); It != ASNames.end())
      Result |= It->second;
    else
      diagnoseUnknownMMRAASName(MI, Suffix);
727 | } |
728 | |
729 | return (Result != SIAtomicAddrSpace::NONE) ? Result : Default; |
730 | } |
731 | |
732 | } // end anonymous namespace |
733 | |
734 | void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, |
735 | const char *Msg) const { |
736 | const Function &Func = MI->getParent()->getParent()->getFunction(); |
737 | Func.getContext().diagnose( |
      DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
739 | } |
740 | |
741 | std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> |
742 | SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, |
743 | SIAtomicAddrSpace InstrAddrSpace) const { |
744 | if (SSID == SyncScope::System) |
745 | return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true); |
746 | if (SSID == MMI->getAgentSSID()) |
747 | return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true); |
748 | if (SSID == MMI->getWorkgroupSSID()) |
749 | return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC, |
750 | true); |
751 | if (SSID == MMI->getWavefrontSSID()) |
752 | return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC, |
753 | true); |
754 | if (SSID == SyncScope::SingleThread) |
755 | return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC, |
756 | true); |
757 | if (SSID == MMI->getSystemOneAddressSpaceSSID()) |
758 | return std::tuple(SIAtomicScope::SYSTEM, |
759 | SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); |
760 | if (SSID == MMI->getAgentOneAddressSpaceSSID()) |
761 | return std::tuple(SIAtomicScope::AGENT, |
762 | SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); |
763 | if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) |
764 | return std::tuple(SIAtomicScope::WORKGROUP, |
765 | SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); |
766 | if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) |
767 | return std::tuple(SIAtomicScope::WAVEFRONT, |
768 | SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); |
769 | if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) |
770 | return std::tuple(SIAtomicScope::SINGLETHREAD, |
771 | SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); |
772 | return std::nullopt; |
773 | } |
774 | |
775 | SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { |
776 | if (AS == AMDGPUAS::FLAT_ADDRESS) |
777 | return SIAtomicAddrSpace::FLAT; |
778 | if (AS == AMDGPUAS::GLOBAL_ADDRESS) |
779 | return SIAtomicAddrSpace::GLOBAL; |
780 | if (AS == AMDGPUAS::LOCAL_ADDRESS) |
781 | return SIAtomicAddrSpace::LDS; |
782 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) |
783 | return SIAtomicAddrSpace::SCRATCH; |
784 | if (AS == AMDGPUAS::REGION_ADDRESS) |
785 | return SIAtomicAddrSpace::GDS; |
786 | |
787 | return SIAtomicAddrSpace::OTHER; |
788 | } |
789 | |
790 | SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_) |
791 | : MMI(&MMI_) {} |
792 | |
793 | std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( |
794 | const MachineBasicBlock::iterator &MI) const { |
795 | assert(MI->getNumMemOperands() > 0); |
796 | |
797 | SyncScope::ID SSID = SyncScope::SingleThread; |
798 | AtomicOrdering Ordering = AtomicOrdering::NotAtomic; |
799 | AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; |
800 | SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; |
801 | bool IsNonTemporal = true; |
802 | bool IsVolatile = false; |
803 | bool IsLastUse = false; |
804 | |
805 | // Validator should check whether or not MMOs cover the entire set of |
806 | // locations accessed by the memory instruction. |
807 | for (const auto &MMO : MI->memoperands()) { |
808 | IsNonTemporal &= MMO->isNonTemporal(); |
809 | IsVolatile |= MMO->isVolatile(); |
810 | IsLastUse |= MMO->getFlags() & MOLastUse; |
811 | InstrAddrSpace |= |
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
813 | AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); |
814 | if (OpOrdering != AtomicOrdering::NotAtomic) { |
815 | const auto &IsSyncScopeInclusion = |
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
817 | if (!IsSyncScopeInclusion) { |
        reportUnsupported(
            MI, "Unsupported non-inclusive atomic synchronization scope");
820 | return std::nullopt; |
821 | } |
822 | |
823 | SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID(); |
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
825 | assert(MMO->getFailureOrdering() != AtomicOrdering::Release && |
826 | MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); |
827 | FailureOrdering = |
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
829 | } |
830 | } |
831 | |
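  // For atomic operations, map the synchronization scope onto the SI scope,
  // the address spaces that must be ordered, and the cross-address-space flag.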
832 | SIAtomicScope Scope = SIAtomicScope::NONE; |
833 | SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; |
834 | bool IsCrossAddressSpaceOrdering = false; |
835 | if (Ordering != AtomicOrdering::NotAtomic) { |
836 | auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); |
837 | if (!ScopeOrNone) { |
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
839 | return std::nullopt; |
840 | } |
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
842 | *ScopeOrNone; |
843 | if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || |
844 | ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || |
845 | ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { |
      reportUnsupported(MI, "Unsupported atomic address space");
847 | return std::nullopt; |
848 | } |
849 | } |
850 | return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, |
851 | IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, |
852 | IsNonTemporal, IsLastUse); |
853 | } |
854 | |
855 | std::optional<SIMemOpInfo> |
856 | SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const { |
857 | assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); |
858 | |
859 | if (!(MI->mayLoad() && !MI->mayStore())) |
860 | return std::nullopt; |
861 | |
862 | // Be conservative if there are no memory operands. |
863 | if (MI->getNumMemOperands() == 0) |
864 | return SIMemOpInfo(); |
865 | |
866 | return constructFromMIWithMMO(MI); |
867 | } |
868 | |
869 | std::optional<SIMemOpInfo> |
870 | SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const { |
871 | assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); |
872 | |
873 | if (!(!MI->mayLoad() && MI->mayStore())) |
874 | return std::nullopt; |
875 | |
876 | // Be conservative if there are no memory operands. |
877 | if (MI->getNumMemOperands() == 0) |
878 | return SIMemOpInfo(); |
879 | |
880 | return constructFromMIWithMMO(MI); |
881 | } |
882 | |
883 | std::optional<SIMemOpInfo> |
884 | SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { |
885 | assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); |
886 | |
887 | if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) |
888 | return std::nullopt; |
889 | |
890 | AtomicOrdering Ordering = |
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
892 | |
  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
895 | if (!ScopeOrNone) { |
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
897 | return std::nullopt; |
898 | } |
899 | |
900 | SIAtomicScope Scope = SIAtomicScope::NONE; |
901 | SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; |
902 | bool IsCrossAddressSpaceOrdering = false; |
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
904 | *ScopeOrNone; |
905 | |
906 | if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || |
907 | ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { |
    reportUnsupported(MI, "Unsupported atomic address space");
909 | return std::nullopt; |
910 | } |
911 | |
912 | return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, |
913 | IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); |
914 | } |
915 | |
916 | std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( |
917 | const MachineBasicBlock::iterator &MI) const { |
918 | assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); |
919 | |
920 | if (!(MI->mayLoad() && MI->mayStore())) |
921 | return std::nullopt; |
922 | |
923 | // Be conservative if there are no memory operands. |
924 | if (MI->getNumMemOperands() == 0) |
925 | return SIMemOpInfo(); |
926 | |
927 | return constructFromMIWithMMO(MI); |
928 | } |
929 | |
930 | SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { |
931 | TII = ST.getInstrInfo(); |
  IV = getIsaVersion(ST.getCPU());
933 | InsertCacheInv = !AmdgcnSkipCacheInvalidations; |
934 | } |
935 | |
936 | bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, |
937 | AMDGPU::CPol::CPol Bit) const { |
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
939 | if (!CPol) |
940 | return false; |
941 | |
942 | CPol->setImm(CPol->getImm() | Bit); |
943 | return true; |
944 | } |
945 | |
946 | /* static */ |
947 | std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { |
948 | GCNSubtarget::Generation Generation = ST.getGeneration(); |
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX11)
    return std::make_unique<SIGfx10CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX12)
    return std::make_unique<SIGfx11CacheControl>(ST);
  return std::make_unique<SIGfx12CacheControl>(ST);
962 | } |
963 | |
964 | bool SIGfx6CacheControl::enableLoadCacheBypass( |
965 | const MachineBasicBlock::iterator &MI, |
966 | SIAtomicScope Scope, |
967 | SIAtomicAddrSpace AddrSpace) const { |
968 | assert(MI->mayLoad() && !MI->mayStore()); |
969 | bool Changed = false; |
970 | |
971 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
972 | switch (Scope) { |
973 | case SIAtomicScope::SYSTEM: |
974 | case SIAtomicScope::AGENT: |
975 | // Set L1 cache policy to MISS_EVICT. |
976 | // Note: there is no L2 cache bypass policy at the ISA level. |
977 | Changed |= enableGLCBit(MI); |
978 | break; |
979 | case SIAtomicScope::WORKGROUP: |
980 | case SIAtomicScope::WAVEFRONT: |
981 | case SIAtomicScope::SINGLETHREAD: |
982 | // No cache to bypass. |
983 | break; |
984 | default: |
      llvm_unreachable("Unsupported synchronization scope");
986 | } |
987 | } |
988 | |
989 | /// The scratch address space does not need the global memory caches |
990 | /// to be bypassed as all memory operations by the same thread are |
991 | /// sequentially consistent, and no other thread can access scratch |
992 | /// memory. |
993 | |
994 | /// Other address spaces do not have a cache. |
995 | |
996 | return Changed; |
997 | } |
998 | |
999 | bool SIGfx6CacheControl::enableStoreCacheBypass( |
1000 | const MachineBasicBlock::iterator &MI, |
1001 | SIAtomicScope Scope, |
1002 | SIAtomicAddrSpace AddrSpace) const { |
1003 | assert(!MI->mayLoad() && MI->mayStore()); |
1004 | bool Changed = false; |
1005 | |
  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.
1008 | |
1009 | return Changed; |
1010 | } |
1011 | |
1012 | bool SIGfx6CacheControl::enableRMWCacheBypass( |
1013 | const MachineBasicBlock::iterator &MI, |
1014 | SIAtomicScope Scope, |
1015 | SIAtomicAddrSpace AddrSpace) const { |
1016 | assert(MI->mayLoad() && MI->mayStore()); |
1017 | bool Changed = false; |
1018 | |
1019 | /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically |
1020 | /// bypassed, and the GLC bit is instead used to indicate if they are |
1021 | /// return or no-return. |
1022 | /// Note: there is no L2 cache coherent bypass control at the ISA level. |
1023 | |
1024 | return Changed; |
1025 | } |
1026 | |
1027 | bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( |
1028 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1029 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
  // Only handle load and store, not atomic read-modify-write instructions. The
1031 | // latter use glc to indicate if the atomic returns a result and so must not |
1032 | // be used for cache control. |
1033 | assert(MI->mayLoad() ^ MI->mayStore()); |
1034 | |
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so volatile cannot
  // sensibly be handled for them without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
1039 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
1040 | |
1041 | bool Changed = false; |
1042 | |
1043 | if (IsVolatile) { |
1044 | // Set L1 cache policy to be MISS_EVICT for load instructions |
1045 | // and MISS_LRU for store instructions. |
1046 | // Note: there is no L2 cache bypass policy at the ISA level. |
1047 | if (Op == SIMemOp::LOAD) |
1048 | Changed |= enableGLCBit(MI); |
1049 | |
1050 | // Ensure operation has completed at system scope to cause all volatile |
1051 | // operations to be visible outside the program in a global order. Do not |
1052 | // request cross address space as only the global address space can be |
1053 | // observable outside the program, so no need to cause a waitcnt for LDS |
1054 | // address space operations. |
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER, AtomicOrdering::Unordered);
1057 | |
1058 | return Changed; |
1059 | } |
1060 | |
1061 | if (IsNonTemporal) { |
1062 | // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT |
1063 | // for both loads and stores, and the L2 cache policy to STREAM. |
1064 | Changed |= enableGLCBit(MI); |
1065 | Changed |= enableSLCBit(MI); |
1066 | return Changed; |
1067 | } |
1068 | |
1069 | return Changed; |
1070 | } |
1071 | |
1072 | bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, |
1073 | SIAtomicScope Scope, |
1074 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1075 | bool IsCrossAddrSpaceOrdering, Position Pos, |
1076 | AtomicOrdering Order) const { |
1077 | bool Changed = false; |
1078 | |
1079 | MachineBasicBlock &MBB = *MI->getParent(); |
1080 | DebugLoc DL = MI->getDebugLoc(); |
1081 | |
1082 | if (Pos == Position::AFTER) |
1083 | ++MI; |
1084 | |
1085 | bool VMCnt = false; |
1086 | bool LGKMCnt = false; |
1087 | |
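  // Determine which counters (vmcnt and/or lgkmcnt) must be waited on to reach
  // zero for the given scope and address spaces.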
1088 | if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != |
1089 | SIAtomicAddrSpace::NONE) { |
1090 | switch (Scope) { |
1091 | case SIAtomicScope::SYSTEM: |
1092 | case SIAtomicScope::AGENT: |
1093 | VMCnt |= true; |
1094 | break; |
1095 | case SIAtomicScope::WORKGROUP: |
1096 | case SIAtomicScope::WAVEFRONT: |
1097 | case SIAtomicScope::SINGLETHREAD: |
1098 | // The L1 cache keeps all memory operations in order for |
1099 | // wavefronts in the same work-group. |
1100 | break; |
1101 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1103 | } |
1104 | } |
1105 | |
1106 | if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { |
1107 | switch (Scope) { |
1108 | case SIAtomicScope::SYSTEM: |
1109 | case SIAtomicScope::AGENT: |
1110 | case SIAtomicScope::WORKGROUP: |
1111 | // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is |
1112 | // not needed as LDS operations for all waves are executed in a total |
1113 | // global ordering as observed by all waves. Required if also |
1114 | // synchronizing with global/GDS memory as LDS operations could be |
1115 | // reordered with respect to later global/GDS memory operations of the |
1116 | // same wave. |
1117 | LGKMCnt |= IsCrossAddrSpaceOrdering; |
1118 | break; |
1119 | case SIAtomicScope::WAVEFRONT: |
1120 | case SIAtomicScope::SINGLETHREAD: |
1121 | // The LDS keeps all memory operations in order for |
1122 | // the same wavefront. |
1123 | break; |
1124 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1126 | } |
1127 | } |
1128 | |
1129 | if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { |
1130 | switch (Scope) { |
1131 | case SIAtomicScope::SYSTEM: |
1132 | case SIAtomicScope::AGENT: |
1133 | // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" |
1134 | // is not needed as GDS operations for all waves are executed in a total |
1135 | // global ordering as observed by all waves. Required if also |
1136 | // synchronizing with global/LDS memory as GDS operations could be |
1137 | // reordered with respect to later global/LDS memory operations of the |
1138 | // same wave. |
1139 | LGKMCnt |= IsCrossAddrSpaceOrdering; |
1140 | break; |
1141 | case SIAtomicScope::WORKGROUP: |
1142 | case SIAtomicScope::WAVEFRONT: |
1143 | case SIAtomicScope::SINGLETHREAD: |
1144 | // The GDS keeps all memory operations in order for |
1145 | // the same work-group. |
1146 | break; |
1147 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1149 | } |
1150 | } |
1151 | |
1152 | if (VMCnt || LGKMCnt) { |
1153 | unsigned WaitCntImmediate = |
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
1160 | Changed = true; |
1161 | } |
1162 | |
1163 | if (Pos == Position::AFTER) |
1164 | --MI; |
1165 | |
1166 | return Changed; |
1167 | } |
1168 | |
1169 | bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
1170 | SIAtomicScope Scope, |
1171 | SIAtomicAddrSpace AddrSpace, |
1172 | Position Pos) const { |
1173 | if (!InsertCacheInv) |
1174 | return false; |
1175 | |
1176 | bool Changed = false; |
1177 | |
1178 | MachineBasicBlock &MBB = *MI->getParent(); |
1179 | DebugLoc DL = MI->getDebugLoc(); |
1180 | |
1181 | if (Pos == Position::AFTER) |
1182 | ++MI; |
1183 | |
1184 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1185 | switch (Scope) { |
1186 | case SIAtomicScope::SYSTEM: |
1187 | case SIAtomicScope::AGENT: |
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1189 | Changed = true; |
1190 | break; |
1191 | case SIAtomicScope::WORKGROUP: |
1192 | case SIAtomicScope::WAVEFRONT: |
1193 | case SIAtomicScope::SINGLETHREAD: |
1194 | // No cache to invalidate. |
1195 | break; |
1196 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1198 | } |
1199 | } |
1200 | |
1201 | /// The scratch address space does not need the global memory cache |
1202 | /// to be flushed as all memory operations by the same thread are |
1203 | /// sequentially consistent, and no other thread can access scratch |
1204 | /// memory. |
1205 | |
1206 | /// Other address spaces do not have a cache. |
1207 | |
1208 | if (Pos == Position::AFTER) |
1209 | --MI; |
1210 | |
1211 | return Changed; |
1212 | } |
1213 | |
1214 | bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, |
1215 | SIAtomicScope Scope, |
1216 | SIAtomicAddrSpace AddrSpace, |
1217 | bool IsCrossAddrSpaceOrdering, |
1218 | Position Pos) const { |
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1221 | } |
1222 | |
1223 | bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
1224 | SIAtomicScope Scope, |
1225 | SIAtomicAddrSpace AddrSpace, |
1226 | Position Pos) const { |
1227 | if (!InsertCacheInv) |
1228 | return false; |
1229 | |
1230 | bool Changed = false; |
1231 | |
1232 | MachineBasicBlock &MBB = *MI->getParent(); |
1233 | DebugLoc DL = MI->getDebugLoc(); |
1234 | |
1235 | const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); |
1236 | |
1237 | const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() |
1238 | ? AMDGPU::BUFFER_WBINVL1 |
1239 | : AMDGPU::BUFFER_WBINVL1_VOL; |
1240 | |
1241 | if (Pos == Position::AFTER) |
1242 | ++MI; |
1243 | |
1244 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1245 | switch (Scope) { |
1246 | case SIAtomicScope::SYSTEM: |
1247 | case SIAtomicScope::AGENT: |
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1249 | Changed = true; |
1250 | break; |
1251 | case SIAtomicScope::WORKGROUP: |
1252 | case SIAtomicScope::WAVEFRONT: |
1253 | case SIAtomicScope::SINGLETHREAD: |
1254 | // No cache to invalidate. |
1255 | break; |
1256 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1258 | } |
1259 | } |
1260 | |
1261 | /// The scratch address space does not need the global memory cache |
1262 | /// to be flushed as all memory operations by the same thread are |
1263 | /// sequentially consistent, and no other thread can access scratch |
1264 | /// memory. |
1265 | |
1266 | /// Other address spaces do not have a cache. |
1267 | |
1268 | if (Pos == Position::AFTER) |
1269 | --MI; |
1270 | |
1271 | return Changed; |
1272 | } |
1273 | |
1274 | bool SIGfx90ACacheControl::enableLoadCacheBypass( |
1275 | const MachineBasicBlock::iterator &MI, |
1276 | SIAtomicScope Scope, |
1277 | SIAtomicAddrSpace AddrSpace) const { |
1278 | assert(MI->mayLoad() && !MI->mayStore()); |
1279 | bool Changed = false; |
1280 | |
1281 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1282 | switch (Scope) { |
1283 | case SIAtomicScope::SYSTEM: |
1284 | case SIAtomicScope::AGENT: |
1285 | // Set the L1 cache policy to MISS_LRU. |
1286 | // Note: there is no L2 cache bypass policy at the ISA level. |
1287 | Changed |= enableGLCBit(MI); |
1288 | break; |
1289 | case SIAtomicScope::WORKGROUP: |
1290 | // In threadgroup split mode the waves of a work-group can be executing on |
1291 | // different CUs. Therefore need to bypass the L1 which is per CU. |
1292 | // Otherwise in non-threadgroup split mode all waves of a work-group are |
1293 | // on the same CU, and so the L1 does not need to be bypassed. |
1294 | if (ST.isTgSplitEnabled()) |
1295 | Changed |= enableGLCBit(MI); |
1296 | break; |
1297 | case SIAtomicScope::WAVEFRONT: |
1298 | case SIAtomicScope::SINGLETHREAD: |
1299 | // No cache to bypass. |
1300 | break; |
1301 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1303 | } |
1304 | } |
1305 | |
1306 | /// The scratch address space does not need the global memory caches |
1307 | /// to be bypassed as all memory operations by the same thread are |
1308 | /// sequentially consistent, and no other thread can access scratch |
1309 | /// memory. |
1310 | |
1311 | /// Other address spaces do not have a cache. |
1312 | |
1313 | return Changed; |
1314 | } |
1315 | |
1316 | bool SIGfx90ACacheControl::enableStoreCacheBypass( |
1317 | const MachineBasicBlock::iterator &MI, |
1318 | SIAtomicScope Scope, |
1319 | SIAtomicAddrSpace AddrSpace) const { |
1320 | assert(!MI->mayLoad() && MI->mayStore()); |
1321 | bool Changed = false; |
1322 | |
1323 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1324 | switch (Scope) { |
1325 | case SIAtomicScope::SYSTEM: |
1326 | case SIAtomicScope::AGENT: |
1327 | /// Do not set glc for store atomic operations as they implicitly write |
1328 | /// through the L1 cache. |
1329 | break; |
1330 | case SIAtomicScope::WORKGROUP: |
1331 | case SIAtomicScope::WAVEFRONT: |
1332 | case SIAtomicScope::SINGLETHREAD: |
1333 | // No cache to bypass. Store atomics implicitly write through the L1 |
1334 | // cache. |
1335 | break; |
1336 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1338 | } |
1339 | } |
1340 | |
1341 | /// The scratch address space does not need the global memory caches |
1342 | /// to be bypassed as all memory operations by the same thread are |
1343 | /// sequentially consistent, and no other thread can access scratch |
1344 | /// memory. |
1345 | |
1346 | /// Other address spaces do not have a cache. |
1347 | |
1348 | return Changed; |
1349 | } |
1350 | |
1351 | bool SIGfx90ACacheControl::enableRMWCacheBypass( |
1352 | const MachineBasicBlock::iterator &MI, |
1353 | SIAtomicScope Scope, |
1354 | SIAtomicAddrSpace AddrSpace) const { |
1355 | assert(MI->mayLoad() && MI->mayStore()); |
1356 | bool Changed = false; |
1357 | |
1358 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1359 | switch (Scope) { |
1360 | case SIAtomicScope::SYSTEM: |
1361 | case SIAtomicScope::AGENT: |
1362 | /// Do not set glc for RMW atomic operations as they implicitly bypass |
1363 | /// the L1 cache, and the glc bit is instead used to indicate if they are |
1364 | /// return or no-return. |
1365 | break; |
1366 | case SIAtomicScope::WORKGROUP: |
1367 | case SIAtomicScope::WAVEFRONT: |
1368 | case SIAtomicScope::SINGLETHREAD: |
1369 | // No cache to bypass. RMW atomics implicitly bypass the L1 cache. |
1370 | break; |
1371 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1373 | } |
1374 | } |
1375 | |
1376 | return Changed; |
1377 | } |
1378 | |
1379 | bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( |
1380 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1381 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
  // Only handle load and store, not atomic read-modify-write instructions. The
1383 | // latter use glc to indicate if the atomic returns a result and so must not |
1384 | // be used for cache control. |
1385 | assert(MI->mayLoad() ^ MI->mayStore()); |
1386 | |
1387 |   // Only update load and store, not LLVM IR atomic read-modify-write
1388 |   // instructions. The latter are always marked as volatile, so they cannot be
1389 |   // handled here without pessimizing all atomics. They also do not support the
1390 |   // nontemporal attribute.
1391 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
1392 | |
1393 | bool Changed = false; |
1394 | |
1395 | if (IsVolatile) { |
1396 | // Set L1 cache policy to be MISS_EVICT for load instructions |
1397 | // and MISS_LRU for store instructions. |
1398 | // Note: there is no L2 cache bypass policy at the ISA level. |
1399 | if (Op == SIMemOp::LOAD) |
1400 | Changed |= enableGLCBit(MI); |
1401 | |
1402 | // Ensure operation has completed at system scope to cause all volatile |
1403 | // operations to be visible outside the program in a global order. Do not |
1404 | // request cross address space as only the global address space can be |
1405 | // observable outside the program, so no need to cause a waitcnt for LDS |
1406 | // address space operations. |
1407 | Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false, |
1408 | Pos: Position::AFTER, Order: AtomicOrdering::Unordered); |
1409 | |
1410 | return Changed; |
1411 | } |
1412 | |
1413 | if (IsNonTemporal) { |
1414 | // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT |
1415 | // for both loads and stores, and the L2 cache policy to STREAM. |
1416 | Changed |= enableGLCBit(MI); |
1417 | Changed |= enableSLCBit(MI); |
1418 | return Changed; |
1419 | } |
1420 | |
1421 | return Changed; |
1422 | } |
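     | // Illustration only (a sketch, not emitted verbatim by this function): on a
     | // GFX90A target, a volatile global load handled above would typically end up
     | // selected as something like
     | //   global_load_dword v0, v[0:1], off glc
     | //   s_waitcnt vmcnt(0)
     | // while a nontemporal access instead carries both glc and slc. Exact operands
     | // depend on the surrounding code.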
1423 | |
1424 | bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, |
1425 | SIAtomicScope Scope, |
1426 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1427 | bool IsCrossAddrSpaceOrdering, |
1428 | Position Pos, |
1429 | AtomicOrdering Order) const { |
1430 | if (ST.isTgSplitEnabled()) { |
1431 |     // In threadgroup split mode the waves of a work-group can be executing on
1432 |     // different CUs. Therefore we need to wait for global or GDS memory
1433 |     // operations to complete to ensure they are visible to waves in the other
1434 |     // CUs. Otherwise, in non-threadgroup split mode, all waves of a work-group
1435 |     // are on the same CU, so there is no need to wait for global memory as all
1436 |     // waves in the work-group access the same L1, nor to wait for GDS as
1437 |     // accesses are ordered on a CU.
1438 | if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | |
1439 | SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && |
1440 | (Scope == SIAtomicScope::WORKGROUP)) { |
1441 | // Same as GFX7 using agent scope. |
1442 | Scope = SIAtomicScope::AGENT; |
1443 | } |
1444 | // In threadgroup split mode LDS cannot be allocated so no need to wait for |
1445 | // LDS memory operations. |
1446 | AddrSpace &= ~SIAtomicAddrSpace::LDS; |
1447 | } |
1448 | return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, |
1449 | IsCrossAddrSpaceOrdering, Pos, Order); |
1450 | } |
1451 | |
1452 | bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
1453 | SIAtomicScope Scope, |
1454 | SIAtomicAddrSpace AddrSpace, |
1455 | Position Pos) const { |
1456 | if (!InsertCacheInv) |
1457 | return false; |
1458 | |
1459 | bool Changed = false; |
1460 | |
1461 | MachineBasicBlock &MBB = *MI->getParent(); |
1462 | DebugLoc DL = MI->getDebugLoc(); |
1463 | |
1464 | if (Pos == Position::AFTER) |
1465 | ++MI; |
1466 | |
1467 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1468 | switch (Scope) { |
1469 | case SIAtomicScope::SYSTEM: |
1470 | // Ensures that following loads will not see stale remote VMEM data or |
1471 | // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and |
1472 | // CC will never be stale due to the local memory probes. |
1473 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INVL2)); |
1474 | // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the |
1475 | // hardware does not reorder memory operations by the same wave with |
1476 | // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to |
1477 | // remove any cache lines of earlier writes by the same wave and ensures |
1478 | // later reads by the same wave will refetch the cache lines. |
1479 | Changed = true; |
1480 | break; |
1481 | case SIAtomicScope::AGENT: |
1482 | // Same as GFX7. |
1483 | break; |
1484 | case SIAtomicScope::WORKGROUP: |
1485 | // In threadgroup split mode the waves of a work-group can be executing on |
1486 | // different CUs. Therefore need to invalidate the L1 which is per CU. |
1487 | // Otherwise in non-threadgroup split mode all waves of a work-group are |
1488 | // on the same CU, and so the L1 does not need to be invalidated. |
1489 | if (ST.isTgSplitEnabled()) { |
1490 | // Same as GFX7 using agent scope. |
1491 | Scope = SIAtomicScope::AGENT; |
1492 | } |
1493 | break; |
1494 | case SIAtomicScope::WAVEFRONT: |
1495 | case SIAtomicScope::SINGLETHREAD: |
1496 | // Same as GFX7. |
1497 | break; |
1498 | default: |
1499 | llvm_unreachable("Unsupported synchronization scope" ); |
1500 | } |
1501 | } |
1502 | |
1503 | /// The scratch address space does not need the global memory cache |
1504 | /// to be flushed as all memory operations by the same thread are |
1505 | /// sequentially consistent, and no other thread can access scratch |
1506 | /// memory. |
1507 | |
1508 | /// Other address spaces do not have a cache. |
1509 | |
1510 | if (Pos == Position::AFTER) |
1511 | --MI; |
1512 | |
1513 | Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); |
1514 | |
1515 | return Changed; |
1516 | } |
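     | // Illustrative sketch of the GFX90A acquire handling above (assuming cache
     | // invalidation is not disabled): at system scope the BUFFER_INVL2 is combined
     | // with the GFX7 handling it delegates to, giving roughly
     | //   buffer_invl2
     | //   buffer_wbinvl1_vol
     | // while agent scope relies on the GFX7 "buffer_wbinvl1_vol" alone.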
1517 | |
1518 | bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, |
1519 | SIAtomicScope Scope, |
1520 | SIAtomicAddrSpace AddrSpace, |
1521 | bool IsCrossAddrSpaceOrdering, |
1522 | Position Pos) const { |
1523 | bool Changed = false; |
1524 | |
1525 | MachineBasicBlock &MBB = *MI->getParent(); |
1526 | const DebugLoc &DL = MI->getDebugLoc(); |
1527 | |
1528 | if (Pos == Position::AFTER) |
1529 | ++MI; |
1530 | |
1531 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1532 | switch (Scope) { |
1533 | case SIAtomicScope::SYSTEM: |
1534 | // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the |
1535 | // hardware does not reorder memory operations by the same wave with |
1536 | // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed |
1537 | // to initiate writeback of any dirty cache lines of earlier writes by the |
1538 | // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the |
1539 | // writeback has completed. |
1540 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2)) |
1541 | // Set SC bits to indicate system scope. |
1542 | .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); |
1543 |       // This is followed by the same handling as GFX7, which will insert the
1544 |       // "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1545 | Changed = true; |
1546 | break; |
1547 | case SIAtomicScope::AGENT: |
1548 | case SIAtomicScope::WORKGROUP: |
1549 | case SIAtomicScope::WAVEFRONT: |
1550 | case SIAtomicScope::SINGLETHREAD: |
1551 | // Same as GFX7. |
1552 | break; |
1553 | default: |
1554 | llvm_unreachable("Unsupported synchronization scope" ); |
1555 | } |
1556 | } |
1557 | |
1558 | if (Pos == Position::AFTER) |
1559 | --MI; |
1560 | |
1561 | Changed |= |
1562 | SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, |
1563 | IsCrossAddrSpaceOrdering, Pos); |
1564 | |
1565 | return Changed; |
1566 | } |
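     | // Illustrative sketch: a system-scope release on GFX90A is expected to expand
     | // to roughly
     | //   buffer_wbl2
     | //   s_waitcnt vmcnt(0)
     | // where the wait comes from the GFX7 release handling invoked above and
     | // ensures the L2 writeback has completed before the releasing operation.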
1567 | |
1568 | bool SIGfx940CacheControl::enableLoadCacheBypass( |
1569 | const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
1570 | SIAtomicAddrSpace AddrSpace) const { |
1571 | assert(MI->mayLoad() && !MI->mayStore()); |
1572 | bool Changed = false; |
1573 | |
1574 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1575 | switch (Scope) { |
1576 | case SIAtomicScope::SYSTEM: |
1577 | // Set SC bits to indicate system scope. |
1578 | Changed |= enableSC0Bit(MI); |
1579 | Changed |= enableSC1Bit(MI); |
1580 | break; |
1581 | case SIAtomicScope::AGENT: |
1582 | // Set SC bits to indicate agent scope. |
1583 | Changed |= enableSC1Bit(MI); |
1584 | break; |
1585 | case SIAtomicScope::WORKGROUP: |
1586 | // In threadgroup split mode the waves of a work-group can be executing on |
1587 | // different CUs. Therefore need to bypass the L1 which is per CU. |
1588 | // Otherwise in non-threadgroup split mode all waves of a work-group are |
1589 | // on the same CU, and so the L1 does not need to be bypassed. Setting SC |
1590 | // bits to indicate work-group scope will do this automatically. |
1591 | Changed |= enableSC0Bit(MI); |
1592 | break; |
1593 | case SIAtomicScope::WAVEFRONT: |
1594 | case SIAtomicScope::SINGLETHREAD: |
1595 | // Leave SC bits unset to indicate wavefront scope. |
1596 | break; |
1597 | default: |
1598 | llvm_unreachable("Unsupported synchronization scope" ); |
1599 | } |
1600 | } |
1601 | |
1602 | /// The scratch address space does not need the global memory caches |
1603 | /// to be bypassed as all memory operations by the same thread are |
1604 | /// sequentially consistent, and no other thread can access scratch |
1605 | /// memory. |
1606 | |
1607 | /// Other address spaces do not have a cache. |
1608 | |
1609 | return Changed; |
1610 | } |
1611 | |
1612 | bool SIGfx940CacheControl::enableStoreCacheBypass( |
1613 | const MachineBasicBlock::iterator &MI, |
1614 | SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { |
1615 | assert(!MI->mayLoad() && MI->mayStore()); |
1616 | bool Changed = false; |
1617 | |
1618 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1619 | switch (Scope) { |
1620 | case SIAtomicScope::SYSTEM: |
1621 | // Set SC bits to indicate system scope. |
1622 | Changed |= enableSC0Bit(MI); |
1623 | Changed |= enableSC1Bit(MI); |
1624 | break; |
1625 | case SIAtomicScope::AGENT: |
1626 | // Set SC bits to indicate agent scope. |
1627 | Changed |= enableSC1Bit(MI); |
1628 | break; |
1629 | case SIAtomicScope::WORKGROUP: |
1630 | // Set SC bits to indicate workgroup scope. |
1631 | Changed |= enableSC0Bit(MI); |
1632 | break; |
1633 | case SIAtomicScope::WAVEFRONT: |
1634 | case SIAtomicScope::SINGLETHREAD: |
1635 | // Leave SC bits unset to indicate wavefront scope. |
1636 | break; |
1637 | default: |
1638 | llvm_unreachable("Unsupported synchronization scope" ); |
1639 | } |
1640 | } |
1641 | |
1642 | /// The scratch address space does not need the global memory caches |
1643 | /// to be bypassed as all memory operations by the same thread are |
1644 | /// sequentially consistent, and no other thread can access scratch |
1645 | /// memory. |
1646 | |
1647 | /// Other address spaces do not have a cache. |
1648 | |
1649 | return Changed; |
1650 | } |
1651 | |
1652 | bool SIGfx940CacheControl::enableRMWCacheBypass( |
1653 | const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
1654 | SIAtomicAddrSpace AddrSpace) const { |
1655 | assert(MI->mayLoad() && MI->mayStore()); |
1656 | bool Changed = false; |
1657 | |
1658 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1659 | switch (Scope) { |
1660 | case SIAtomicScope::SYSTEM: |
1661 | // Set SC1 bit to indicate system scope. |
1662 | Changed |= enableSC1Bit(MI); |
1663 | break; |
1664 | case SIAtomicScope::AGENT: |
1665 | case SIAtomicScope::WORKGROUP: |
1666 | case SIAtomicScope::WAVEFRONT: |
1667 | case SIAtomicScope::SINGLETHREAD: |
1668 | // RMW atomic operations implicitly bypass the L1 cache and only use SC1 |
1669 | // to indicate system or agent scope. The SC0 bit is used to indicate if |
1670 | // they are return or no-return. Leave SC1 bit unset to indicate agent |
1671 | // scope. |
1672 | break; |
1673 | default: |
1674 | llvm_unreachable("Unsupported synchronization scope" ); |
1675 | } |
1676 | } |
1677 | |
1678 | return Changed; |
1679 | } |
1680 | |
1681 | bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( |
1682 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1683 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
1684 |   // Only handle load and store, not atomic read-modify-write instructions. The
1685 |   // latter use glc to indicate if the atomic returns a result, so glc must not
1686 |   // be used for cache control.
1687 | assert(MI->mayLoad() ^ MI->mayStore()); |
1688 | |
1689 |   // Only update load and store, not LLVM IR atomic read-modify-write
1690 |   // instructions. The latter are always marked as volatile, so they cannot be
1691 |   // handled here without pessimizing all atomics. They also do not support the
1692 |   // nontemporal attribute.
1693 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
1694 | |
1695 | bool Changed = false; |
1696 | |
1697 | if (IsVolatile) { |
1698 | // Set SC bits to indicate system scope. |
1699 | Changed |= enableSC0Bit(MI); |
1700 | Changed |= enableSC1Bit(MI); |
1701 | |
1702 | // Ensure operation has completed at system scope to cause all volatile |
1703 | // operations to be visible outside the program in a global order. Do not |
1704 | // request cross address space as only the global address space can be |
1705 | // observable outside the program, so no need to cause a waitcnt for LDS |
1706 | // address space operations. |
1707 | Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false, |
1708 | Pos: Position::AFTER, Order: AtomicOrdering::Unordered); |
1709 | |
1710 | return Changed; |
1711 | } |
1712 | |
1713 | if (IsNonTemporal) { |
1714 | Changed |= enableNTBit(MI); |
1715 | return Changed; |
1716 | } |
1717 | |
1718 | return Changed; |
1719 | } |
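     | // Illustration (assumed assembler spelling, not produced verbatim here): the
     | // bits set above surface as cache-policy modifiers on the GFX940 instruction,
     | // e.g. a volatile global load becomes roughly
     | //   global_load_dword v0, v[0:1], off sc0 sc1
     | //   s_waitcnt vmcnt(0)
     | // and a nontemporal access instead carries the "nt" modifier.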
1720 | |
1721 | bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
1722 | SIAtomicScope Scope, |
1723 | SIAtomicAddrSpace AddrSpace, |
1724 | Position Pos) const { |
1725 | if (!InsertCacheInv) |
1726 | return false; |
1727 | |
1728 | bool Changed = false; |
1729 | |
1730 | MachineBasicBlock &MBB = *MI->getParent(); |
1731 | DebugLoc DL = MI->getDebugLoc(); |
1732 | |
1733 | if (Pos == Position::AFTER) |
1734 | ++MI; |
1735 | |
1736 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1737 | switch (Scope) { |
1738 | case SIAtomicScope::SYSTEM: |
1739 | // Ensures that following loads will not see stale remote VMEM data or |
1740 | // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and |
1741 | // CC will never be stale due to the local memory probes. |
1742 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV)) |
1743 | // Set SC bits to indicate system scope. |
1744 | .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); |
1745 | // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the |
1746 | // hardware does not reorder memory operations by the same wave with |
1747 | // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to |
1748 | // remove any cache lines of earlier writes by the same wave and ensures |
1749 | // later reads by the same wave will refetch the cache lines. |
1750 | Changed = true; |
1751 | break; |
1752 | case SIAtomicScope::AGENT: |
1753 |       // Ensures that following loads will not see stale remote data or local
1754 | // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale |
1755 | // due to the memory probes. |
1756 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV)) |
1757 | // Set SC bits to indicate agent scope. |
1758 | .addImm(Val: AMDGPU::CPol::SC1); |
1759 |       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1760 |       // does not reorder memory operations with respect to a preceding buffer
1761 |       // invalidate. The invalidate is guaranteed to remove any cache lines of
1762 |       // earlier writes and ensures later reads will refetch the cache lines.
1763 | Changed = true; |
1764 | break; |
1765 | case SIAtomicScope::WORKGROUP: |
1766 | // In threadgroup split mode the waves of a work-group can be executing on |
1767 | // different CUs. Therefore need to invalidate the L1 which is per CU. |
1768 | // Otherwise in non-threadgroup split mode all waves of a work-group are |
1769 | // on the same CU, and so the L1 does not need to be invalidated. |
1770 | if (ST.isTgSplitEnabled()) { |
1771 |         // Ensures L1 is invalidated if in threadgroup split mode. In
1772 |         // non-threadgroup split mode it is a NOP, but there is no point
1773 |         // generating it when we know we are not in that mode.
1774 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV)) |
1775 | // Set SC bits to indicate work-group scope. |
1776 | .addImm(Val: AMDGPU::CPol::SC0); |
1777 |         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1778 |         // does not reorder memory operations with respect to a preceding buffer
1779 |         // invalidate. The invalidate is guaranteed to remove any cache lines of
1780 |         // earlier writes and ensures later reads will refetch the cache lines.
1781 | Changed = true; |
1782 | } |
1783 | break; |
1784 | case SIAtomicScope::WAVEFRONT: |
1785 | case SIAtomicScope::SINGLETHREAD: |
1786 | // Could generate "BUFFER_INV" but it would do nothing as there are no |
1787 | // caches to invalidate. |
1788 | break; |
1789 | default: |
1790 | llvm_unreachable("Unsupported synchronization scope" ); |
1791 | } |
1792 | } |
1793 | |
1794 | /// The scratch address space does not need the global memory cache |
1795 | /// to be flushed as all memory operations by the same thread are |
1796 | /// sequentially consistent, and no other thread can access scratch |
1797 | /// memory. |
1798 | |
1799 | /// Other address spaces do not have a cache. |
1800 | |
1801 | if (Pos == Position::AFTER) |
1802 | --MI; |
1803 | |
1804 | return Changed; |
1805 | } |
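     | // Illustrative sketch of the invalidates built above, assuming the usual
     | // assembler spelling of the CPol bits:
     | //   buffer_inv sc0 sc1   ; system scope
     | //   buffer_inv sc1       ; agent scope
     | //   buffer_inv sc0       ; work-group scope, threadgroup split mode only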
1806 | |
1807 | bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, |
1808 | SIAtomicScope Scope, |
1809 | SIAtomicAddrSpace AddrSpace, |
1810 | bool IsCrossAddrSpaceOrdering, |
1811 | Position Pos) const { |
1812 | bool Changed = false; |
1813 | |
1814 | MachineBasicBlock &MBB = *MI->getParent(); |
1815 | DebugLoc DL = MI->getDebugLoc(); |
1816 | |
1817 | if (Pos == Position::AFTER) |
1818 | ++MI; |
1819 | |
1820 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1821 | switch (Scope) { |
1822 | case SIAtomicScope::SYSTEM: |
1823 | // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the |
1824 | // hardware does not reorder memory operations by the same wave with |
1825 | // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed |
1826 | // to initiate writeback of any dirty cache lines of earlier writes by the |
1827 | // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the |
1828 | // writeback has completed. |
1829 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2)) |
1830 | // Set SC bits to indicate system scope. |
1831 | .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); |
1832 | // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is |
1833 | // SIAtomicScope::SYSTEM, the following insertWait will generate the |
1834 | // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". |
1835 | Changed = true; |
1836 | break; |
1837 | case SIAtomicScope::AGENT: |
1838 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2)) |
1839 | // Set SC bits to indicate agent scope. |
1840 | .addImm(Val: AMDGPU::CPol::SC1); |
1841 | |
1842 | // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is |
1843 | // SIAtomicScope::AGENT, the following insertWait will generate the |
1844 | // required "S_WAITCNT vmcnt(0)". |
1845 | Changed = true; |
1846 | break; |
1847 | case SIAtomicScope::WORKGROUP: |
1848 | case SIAtomicScope::WAVEFRONT: |
1849 | case SIAtomicScope::SINGLETHREAD: |
1850 | // Do not generate "BUFFER_WBL2" as there are no caches it would |
1851 | // writeback, and would require an otherwise unnecessary |
1852 | // "S_WAITCNT vmcnt(0)". |
1853 | break; |
1854 | default: |
1855 | llvm_unreachable("Unsupported synchronization scope" ); |
1856 | } |
1857 | } |
1858 | |
1859 | if (Pos == Position::AFTER) |
1860 | --MI; |
1861 | |
1862 |   // Insert the "S_WAITCNT vmcnt(0)" required by any "BUFFER_WBL2" emitted
1863 |   // above, as well as any other waits that are needed.
1864 | Changed |= insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE, |
1865 | IsCrossAddrSpaceOrdering, Pos, Order: AtomicOrdering::Release); |
1866 | |
1867 | return Changed; |
1868 | } |
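     | // Illustrative sketch: an agent-scope release on GFX940 is expected to expand
     | // to roughly
     | //   buffer_wbl2 sc1
     | //   s_waitcnt vmcnt(0)
     | // with the wait produced by the insertWait call above rather than by the
     | // writeback instruction itself.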
1869 | |
1870 | bool SIGfx10CacheControl::enableLoadCacheBypass( |
1871 | const MachineBasicBlock::iterator &MI, |
1872 | SIAtomicScope Scope, |
1873 | SIAtomicAddrSpace AddrSpace) const { |
1874 | assert(MI->mayLoad() && !MI->mayStore()); |
1875 | bool Changed = false; |
1876 | |
1877 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1878 | switch (Scope) { |
1879 | case SIAtomicScope::SYSTEM: |
1880 | case SIAtomicScope::AGENT: |
1881 | // Set the L0 and L1 cache policies to MISS_EVICT. |
1882 | // Note: there is no L2 cache coherent bypass control at the ISA level. |
1883 | Changed |= enableGLCBit(MI); |
1884 | Changed |= enableDLCBit(MI); |
1885 | break; |
1886 | case SIAtomicScope::WORKGROUP: |
1887 | // In WGP mode the waves of a work-group can be executing on either CU of |
1888 | // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in |
1889 | // CU mode all waves of a work-group are on the same CU, and so the L0 |
1890 | // does not need to be bypassed. |
1891 | if (!ST.isCuModeEnabled()) |
1892 | Changed |= enableGLCBit(MI); |
1893 | break; |
1894 | case SIAtomicScope::WAVEFRONT: |
1895 | case SIAtomicScope::SINGLETHREAD: |
1896 | // No cache to bypass. |
1897 | break; |
1898 | default: |
1899 | llvm_unreachable("Unsupported synchronization scope" ); |
1900 | } |
1901 | } |
1902 | |
1903 | /// The scratch address space does not need the global memory caches |
1904 | /// to be bypassed as all memory operations by the same thread are |
1905 | /// sequentially consistent, and no other thread can access scratch |
1906 | /// memory. |
1907 | |
1908 | /// Other address spaces do not have a cache. |
1909 | |
1910 | return Changed; |
1911 | } |
1912 | |
1913 | bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( |
1914 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1915 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
1916 | |
1917 |   // Only handle load and store, not atomic read-modify-write instructions. The
1918 |   // latter use glc to indicate if the atomic returns a result, so glc must not
1919 |   // be used for cache control.
1920 | assert(MI->mayLoad() ^ MI->mayStore()); |
1921 | |
1922 |   // Only update load and store, not LLVM IR atomic read-modify-write
1923 |   // instructions. The latter are always marked as volatile, so they cannot be
1924 |   // handled here without pessimizing all atomics. They also do not support the
1925 |   // nontemporal attribute.
1926 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
1927 | |
1928 | bool Changed = false; |
1929 | |
1930 | if (IsVolatile) { |
1931 | // Set L0 and L1 cache policy to be MISS_EVICT for load instructions |
1932 | // and MISS_LRU for store instructions. |
1933 | // Note: there is no L2 cache coherent bypass control at the ISA level. |
1934 | if (Op == SIMemOp::LOAD) { |
1935 | Changed |= enableGLCBit(MI); |
1936 | Changed |= enableDLCBit(MI); |
1937 | } |
1938 | |
1939 | // Ensure operation has completed at system scope to cause all volatile |
1940 | // operations to be visible outside the program in a global order. Do not |
1941 | // request cross address space as only the global address space can be |
1942 | // observable outside the program, so no need to cause a waitcnt for LDS |
1943 | // address space operations. |
1944 | Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false, |
1945 | Pos: Position::AFTER, Order: AtomicOrdering::Unordered); |
1946 | return Changed; |
1947 | } |
1948 | |
1949 | if (IsNonTemporal) { |
1950 | // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT |
1951 | // and L2 cache policy to STREAM. |
1952 | // For stores setting both GLC and SLC configures L0 and L1 cache policy |
1953 | // to MISS_EVICT and the L2 cache policy to STREAM. |
1954 | if (Op == SIMemOp::STORE) |
1955 | Changed |= enableGLCBit(MI); |
1956 | Changed |= enableSLCBit(MI); |
1957 | |
1958 | return Changed; |
1959 | } |
1960 | |
1961 | return Changed; |
1962 | } |
1963 | |
1964 | bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, |
1965 | SIAtomicScope Scope, |
1966 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1967 | bool IsCrossAddrSpaceOrdering, |
1968 | Position Pos, AtomicOrdering Order) const { |
1969 | bool Changed = false; |
1970 | |
1971 | MachineBasicBlock &MBB = *MI->getParent(); |
1972 | DebugLoc DL = MI->getDebugLoc(); |
1973 | |
1974 | if (Pos == Position::AFTER) |
1975 | ++MI; |
1976 | |
1977 | bool VMCnt = false; |
1978 | bool VSCnt = false; |
1979 | bool LGKMCnt = false; |
1980 | |
1981 | if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != |
1982 | SIAtomicAddrSpace::NONE) { |
1983 | switch (Scope) { |
1984 | case SIAtomicScope::SYSTEM: |
1985 | case SIAtomicScope::AGENT: |
1986 | if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) |
1987 | VMCnt |= true; |
1988 | if ((Op & SIMemOp::STORE) != SIMemOp::NONE) |
1989 | VSCnt |= true; |
1990 | break; |
1991 | case SIAtomicScope::WORKGROUP: |
1992 | // In WGP mode the waves of a work-group can be executing on either CU of |
1993 | // the WGP. Therefore need to wait for operations to complete to ensure |
1994 | // they are visible to waves in the other CU as the L0 is per CU. |
1995 |       // Otherwise, in CU mode, all waves of a work-group are on the same CU
1996 |       // and share the same L0.
1997 | if (!ST.isCuModeEnabled()) { |
1998 | if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) |
1999 | VMCnt |= true; |
2000 | if ((Op & SIMemOp::STORE) != SIMemOp::NONE) |
2001 | VSCnt |= true; |
2002 | } |
2003 | break; |
2004 | case SIAtomicScope::WAVEFRONT: |
2005 | case SIAtomicScope::SINGLETHREAD: |
2006 | // The L0 cache keeps all memory operations in order for |
2007 | // work-items in the same wavefront. |
2008 | break; |
2009 | default: |
2010 | llvm_unreachable("Unsupported synchronization scope" ); |
2011 | } |
2012 | } |
2013 | |
2014 | if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { |
2015 | switch (Scope) { |
2016 | case SIAtomicScope::SYSTEM: |
2017 | case SIAtomicScope::AGENT: |
2018 | case SIAtomicScope::WORKGROUP: |
2019 | // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is |
2020 | // not needed as LDS operations for all waves are executed in a total |
2021 | // global ordering as observed by all waves. Required if also |
2022 | // synchronizing with global/GDS memory as LDS operations could be |
2023 | // reordered with respect to later global/GDS memory operations of the |
2024 | // same wave. |
2025 | LGKMCnt |= IsCrossAddrSpaceOrdering; |
2026 | break; |
2027 | case SIAtomicScope::WAVEFRONT: |
2028 | case SIAtomicScope::SINGLETHREAD: |
2029 | // The LDS keeps all memory operations in order for |
2030 | // the same wavefront. |
2031 | break; |
2032 | default: |
2033 | llvm_unreachable("Unsupported synchronization scope" ); |
2034 | } |
2035 | } |
2036 | |
2037 | if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { |
2038 | switch (Scope) { |
2039 | case SIAtomicScope::SYSTEM: |
2040 | case SIAtomicScope::AGENT: |
2041 | // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" |
2042 | // is not needed as GDS operations for all waves are executed in a total |
2043 | // global ordering as observed by all waves. Required if also |
2044 | // synchronizing with global/LDS memory as GDS operations could be |
2045 | // reordered with respect to later global/LDS memory operations of the |
2046 | // same wave. |
2047 | LGKMCnt |= IsCrossAddrSpaceOrdering; |
2048 | break; |
2049 | case SIAtomicScope::WORKGROUP: |
2050 | case SIAtomicScope::WAVEFRONT: |
2051 | case SIAtomicScope::SINGLETHREAD: |
2052 | // The GDS keeps all memory operations in order for |
2053 | // the same work-group. |
2054 | break; |
2055 | default: |
2056 | llvm_unreachable("Unsupported synchronization scope" ); |
2057 | } |
2058 | } |
2059 | |
2060 | if (VMCnt || LGKMCnt) { |
2061 | unsigned WaitCntImmediate = |
2062 | AMDGPU::encodeWaitcnt(Version: IV, |
2063 | Vmcnt: VMCnt ? 0 : getVmcntBitMask(Version: IV), |
2064 | Expcnt: getExpcntBitMask(Version: IV), |
2065 | Lgkmcnt: LGKMCnt ? 0 : getLgkmcntBitMask(Version: IV)); |
2066 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_soft)) |
2067 | .addImm(Val: WaitCntImmediate); |
2068 | Changed = true; |
2069 | } |
2070 | |
2071 | if (VSCnt) { |
2072 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT_soft)) |
2073 | .addReg(RegNo: AMDGPU::SGPR_NULL, flags: RegState::Undef) |
2074 | .addImm(Val: 0); |
2075 | Changed = true; |
2076 | } |
2077 | |
2078 | if (Pos == Position::AFTER) |
2079 | --MI; |
2080 | |
2081 | return Changed; |
2082 | } |
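     | // Illustrative sketch: for an agent-scope ordering covering both loads and
     | // stores, the waits built above typically print as
     | //   s_waitcnt vmcnt(0)
     | //   s_waitcnt_vscnt null, 0x0
     | // with lgkmcnt(0) folded into the first instruction only when LDS/GDS
     | // ordering is also required.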
2083 | |
2084 | bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
2085 | SIAtomicScope Scope, |
2086 | SIAtomicAddrSpace AddrSpace, |
2087 | Position Pos) const { |
2088 | if (!InsertCacheInv) |
2089 | return false; |
2090 | |
2091 | bool Changed = false; |
2092 | |
2093 | MachineBasicBlock &MBB = *MI->getParent(); |
2094 | DebugLoc DL = MI->getDebugLoc(); |
2095 | |
2096 | if (Pos == Position::AFTER) |
2097 | ++MI; |
2098 | |
2099 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
2100 | switch (Scope) { |
2101 | case SIAtomicScope::SYSTEM: |
2102 | case SIAtomicScope::AGENT: |
2103 |       // The order of invalidates matters here. We must invalidate "outer in"
2104 | // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is |
2105 | // invalidated. |
2106 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL1_INV)); |
2107 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL0_INV)); |
2108 | Changed = true; |
2109 | break; |
2110 | case SIAtomicScope::WORKGROUP: |
2111 | // In WGP mode the waves of a work-group can be executing on either CU of |
2112 |       // the WGP. Therefore need to invalidate the L0 which is per CU.
2113 |       // Otherwise, in CU mode, all waves of a work-group are on the same CU,
2114 |       // and so the L0 does not need to be invalidated.
2115 | if (!ST.isCuModeEnabled()) { |
2116 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL0_INV)); |
2117 | Changed = true; |
2118 | } |
2119 | break; |
2120 | case SIAtomicScope::WAVEFRONT: |
2121 | case SIAtomicScope::SINGLETHREAD: |
2122 | // No cache to invalidate. |
2123 | break; |
2124 | default: |
2125 | llvm_unreachable("Unsupported synchronization scope" ); |
2126 | } |
2127 | } |
2128 | |
2129 | /// The scratch address space does not need the global memory cache |
2130 | /// to be flushed as all memory operations by the same thread are |
2131 | /// sequentially consistent, and no other thread can access scratch |
2132 | /// memory. |
2133 | |
2134 | /// Other address spaces do not have a cache. |
2135 | |
2136 | if (Pos == Position::AFTER) |
2137 | --MI; |
2138 | |
2139 | return Changed; |
2140 | } |
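     | // Illustrative sketch: an agent-scope acquire on GFX10 invalidates outer-in,
     | //   buffer_gl1_inv
     | //   buffer_gl0_inv
     | // while a work-group acquire in WGP mode only needs the buffer_gl0_inv.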
2141 | |
2142 | bool SIGfx11CacheControl::enableLoadCacheBypass( |
2143 | const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
2144 | SIAtomicAddrSpace AddrSpace) const { |
2145 | assert(MI->mayLoad() && !MI->mayStore()); |
2146 | bool Changed = false; |
2147 | |
2148 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
2149 | switch (Scope) { |
2150 | case SIAtomicScope::SYSTEM: |
2151 | case SIAtomicScope::AGENT: |
2152 | // Set the L0 and L1 cache policies to MISS_EVICT. |
2153 | // Note: there is no L2 cache coherent bypass control at the ISA level. |
2154 | Changed |= enableGLCBit(MI); |
2155 | break; |
2156 | case SIAtomicScope::WORKGROUP: |
2157 | // In WGP mode the waves of a work-group can be executing on either CU of |
2158 | // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in |
2159 | // CU mode all waves of a work-group are on the same CU, and so the L0 |
2160 | // does not need to be bypassed. |
2161 | if (!ST.isCuModeEnabled()) |
2162 | Changed |= enableGLCBit(MI); |
2163 | break; |
2164 | case SIAtomicScope::WAVEFRONT: |
2165 | case SIAtomicScope::SINGLETHREAD: |
2166 | // No cache to bypass. |
2167 | break; |
2168 | default: |
2169 | llvm_unreachable("Unsupported synchronization scope" ); |
2170 | } |
2171 | } |
2172 | |
2173 | /// The scratch address space does not need the global memory caches |
2174 | /// to be bypassed as all memory operations by the same thread are |
2175 | /// sequentially consistent, and no other thread can access scratch |
2176 | /// memory. |
2177 | |
2178 | /// Other address spaces do not have a cache. |
2179 | |
2180 | return Changed; |
2181 | } |
2182 | |
2183 | bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( |
2184 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
2185 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
2186 | |
2187 |   // Only handle load and store, not atomic read-modify-write instructions. The
2188 |   // latter use glc to indicate if the atomic returns a result, so glc must not
2189 |   // be used for cache control.
2190 | assert(MI->mayLoad() ^ MI->mayStore()); |
2191 | |
2192 |   // Only update load and store, not LLVM IR atomic read-modify-write
2193 |   // instructions. The latter are always marked as volatile, so they cannot be
2194 |   // handled here without pessimizing all atomics. They also do not support the
2195 |   // nontemporal attribute.
2196 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
2197 | |
2198 | bool Changed = false; |
2199 | |
2200 | if (IsVolatile) { |
2201 | // Set L0 and L1 cache policy to be MISS_EVICT for load instructions |
2202 | // and MISS_LRU for store instructions. |
2203 | // Note: there is no L2 cache coherent bypass control at the ISA level. |
2204 | if (Op == SIMemOp::LOAD) |
2205 | Changed |= enableGLCBit(MI); |
2206 | |
2207 | // Set MALL NOALLOC for load and store instructions. |
2208 | Changed |= enableDLCBit(MI); |
2209 | |
2210 | // Ensure operation has completed at system scope to cause all volatile |
2211 | // operations to be visible outside the program in a global order. Do not |
2212 | // request cross address space as only the global address space can be |
2213 | // observable outside the program, so no need to cause a waitcnt for LDS |
2214 | // address space operations. |
2215 | Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false, |
2216 | Pos: Position::AFTER, Order: AtomicOrdering::Unordered); |
2217 | return Changed; |
2218 | } |
2219 | |
2220 | if (IsNonTemporal) { |
2221 | // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT |
2222 | // and L2 cache policy to STREAM. |
2223 | // For stores setting both GLC and SLC configures L0 and L1 cache policy |
2224 | // to MISS_EVICT and the L2 cache policy to STREAM. |
2225 | if (Op == SIMemOp::STORE) |
2226 | Changed |= enableGLCBit(MI); |
2227 | Changed |= enableSLCBit(MI); |
2228 | |
2229 | // Set MALL NOALLOC for load and store instructions. |
2230 | Changed |= enableDLCBit(MI); |
2231 | return Changed; |
2232 | } |
2233 | |
2234 | return Changed; |
2235 | } |
2236 | |
2237 | bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, |
2238 | AMDGPU::CPol::CPol Value) const { |
2239 | MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol); |
2240 | if (!CPol) |
2241 | return false; |
2242 | |
2243 | uint64_t NewTH = Value & AMDGPU::CPol::TH; |
2244 | if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { |
2245 | CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); |
2246 | return true; |
2247 | } |
2248 | |
2249 | return false; |
2250 | } |
2251 | |
2252 | bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, |
2253 | AMDGPU::CPol::CPol Value) const { |
2254 | MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol); |
2255 | if (!CPol) |
2256 | return false; |
2257 | |
2258 | uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; |
2259 | if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { |
2260 | CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); |
2261 | return true; |
2262 | } |
2263 | |
2264 | return false; |
2265 | } |
2266 | |
2267 | bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore( |
2268 | const MachineBasicBlock::iterator MI) const { |
2269 | // TODO: implement flag for frontend to give us a hint not to insert waits. |
2270 | |
2271 | MachineBasicBlock &MBB = *MI->getParent(); |
2272 | const DebugLoc &DL = MI->getDebugLoc(); |
2273 | |
2274 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_LOADCNT_soft)).addImm(Val: 0); |
2275 | if (ST.hasImageInsts()) { |
2276 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_SAMPLECNT_soft)).addImm(Val: 0); |
2277 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_BVHCNT_soft)).addImm(Val: 0); |
2278 | } |
2279 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_KMCNT_soft)).addImm(Val: 0); |
2280 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_STORECNT_soft)).addImm(Val: 0); |
2281 | |
2282 | return true; |
2283 | } |
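     | // Illustrative sketch: on a GFX12 target with image instructions, the helper
     | // above emits the following soft waits immediately before the store:
     | //   s_wait_loadcnt 0x0
     | //   s_wait_samplecnt 0x0
     | //   s_wait_bvhcnt 0x0
     | //   s_wait_kmcnt 0x0
     | //   s_wait_storecnt 0x0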
2284 | |
2285 | bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, |
2286 | SIAtomicScope Scope, |
2287 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
2288 | bool IsCrossAddrSpaceOrdering, |
2289 | Position Pos, AtomicOrdering Order) const { |
2290 | bool Changed = false; |
2291 | |
2292 | MachineBasicBlock &MBB = *MI->getParent(); |
2293 | DebugLoc DL = MI->getDebugLoc(); |
2294 | |
2295 | bool LOADCnt = false; |
2296 | bool DSCnt = false; |
2297 | bool STORECnt = false; |
2298 | |
2299 | if (Pos == Position::AFTER) |
2300 | ++MI; |
2301 | |
2302 | if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != |
2303 | SIAtomicAddrSpace::NONE) { |
2304 | switch (Scope) { |
2305 | case SIAtomicScope::SYSTEM: |
2306 | case SIAtomicScope::AGENT: |
2307 | if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) |
2308 | LOADCnt |= true; |
2309 | if ((Op & SIMemOp::STORE) != SIMemOp::NONE) |
2310 | STORECnt |= true; |
2311 | break; |
2312 | case SIAtomicScope::WORKGROUP: |
2313 | // In WGP mode the waves of a work-group can be executing on either CU of |
2314 | // the WGP. Therefore need to wait for operations to complete to ensure |
2315 | // they are visible to waves in the other CU as the L0 is per CU. |
2316 |       // Otherwise, in CU mode, all waves of a work-group are on the same CU
2317 |       // and share the same L0.
2318 | if (!ST.isCuModeEnabled()) { |
2319 | if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) |
2320 | LOADCnt |= true; |
2321 | if ((Op & SIMemOp::STORE) != SIMemOp::NONE) |
2322 | STORECnt |= true; |
2323 | } |
2324 | break; |
2325 | case SIAtomicScope::WAVEFRONT: |
2326 | case SIAtomicScope::SINGLETHREAD: |
2327 | // The L0 cache keeps all memory operations in order for |
2328 | // work-items in the same wavefront. |
2329 | break; |
2330 | default: |
2331 | llvm_unreachable("Unsupported synchronization scope" ); |
2332 | } |
2333 | } |
2334 | |
2335 | if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { |
2336 | switch (Scope) { |
2337 | case SIAtomicScope::SYSTEM: |
2338 | case SIAtomicScope::AGENT: |
2339 | case SIAtomicScope::WORKGROUP: |
2340 | // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is |
2341 | // not needed as LDS operations for all waves are executed in a total |
2342 | // global ordering as observed by all waves. Required if also |
2343 | // synchronizing with global/GDS memory as LDS operations could be |
2344 | // reordered with respect to later global/GDS memory operations of the |
2345 | // same wave. |
2346 | DSCnt |= IsCrossAddrSpaceOrdering; |
2347 | break; |
2348 | case SIAtomicScope::WAVEFRONT: |
2349 | case SIAtomicScope::SINGLETHREAD: |
2350 | // The LDS keeps all memory operations in order for |
2351 | // the same wavefront. |
2352 | break; |
2353 | default: |
2354 | llvm_unreachable("Unsupported synchronization scope" ); |
2355 | } |
2356 | } |
2357 | |
2358 | if (LOADCnt) { |
2359 | // Acquire sequences only need to wait on the previous atomic operation. |
2360 | // e.g. a typical sequence looks like |
2361 | // atomic load |
2362 | // (wait) |
2363 | // global_inv |
2364 | // |
2365 | // We do not have BVH or SAMPLE atomics, so the atomic load is always going |
2366 | // to be tracked using loadcnt. |
2367 | // |
2368 | // This also applies to fences. Fences cannot pair with an instruction |
2369 | // tracked with bvh/samplecnt as we don't have any atomics that do that. |
2370 | if (Order != AtomicOrdering::Acquire) { |
2371 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_BVHCNT_soft)).addImm(Val: 0); |
2372 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(Val: 0); |
2373 | } |
2374 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_soft)).addImm(Val: 0); |
2375 | Changed = true; |
2376 | } |
2377 | |
2378 | if (STORECnt) { |
2379 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_STORECNT_soft)).addImm(Val: 0); |
2380 | Changed = true; |
2381 | } |
2382 | |
2383 | if (DSCnt) { |
2384 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_DSCNT_soft)).addImm(Val: 0); |
2385 | Changed = true; |
2386 | } |
2387 | |
2388 | if (Pos == Position::AFTER) |
2389 | --MI; |
2390 | |
2391 | return Changed; |
2392 | } |
2393 | |
2394 | bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
2395 | SIAtomicScope Scope, |
2396 | SIAtomicAddrSpace AddrSpace, |
2397 | Position Pos) const { |
2398 | if (!InsertCacheInv) |
2399 | return false; |
2400 | |
2401 | MachineBasicBlock &MBB = *MI->getParent(); |
2402 | DebugLoc DL = MI->getDebugLoc(); |
2403 | |
2404 | /// The scratch address space does not need the global memory cache |
2405 | /// to be flushed as all memory operations by the same thread are |
2406 | /// sequentially consistent, and no other thread can access scratch |
2407 | /// memory. |
2408 | |
2409 | /// Other address spaces do not have a cache. |
2410 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) |
2411 | return false; |
2412 | |
2413 | AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; |
2414 | switch (Scope) { |
2415 | case SIAtomicScope::SYSTEM: |
2416 | ScopeImm = AMDGPU::CPol::SCOPE_SYS; |
2417 | break; |
2418 | case SIAtomicScope::AGENT: |
2419 | ScopeImm = AMDGPU::CPol::SCOPE_DEV; |
2420 | break; |
2421 | case SIAtomicScope::WORKGROUP: |
2422 | // In WGP mode the waves of a work-group can be executing on either CU of |
2423 | // the WGP. Therefore we need to invalidate the L0 which is per CU. |
2424 | // Otherwise in CU mode all waves of a work-group are on the same CU, and so |
2425 | // the L0 does not need to be invalidated. |
2426 | if (ST.isCuModeEnabled()) |
2427 | return false; |
2428 | |
2429 | ScopeImm = AMDGPU::CPol::SCOPE_SE; |
2430 | break; |
2431 | case SIAtomicScope::WAVEFRONT: |
2432 | case SIAtomicScope::SINGLETHREAD: |
2433 | // No cache to invalidate. |
2434 | return false; |
2435 | default: |
2436 | llvm_unreachable("Unsupported synchronization scope" ); |
2437 | } |
2438 | |
2439 | if (Pos == Position::AFTER) |
2440 | ++MI; |
2441 | |
2442 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GLOBAL_INV)).addImm(Val: ScopeImm); |
2443 | |
2444 | if (Pos == Position::AFTER) |
2445 | --MI; |
2446 | |
2447 | return true; |
2448 | } |
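     | // Illustrative sketch of the invalidate built above (the scope:... spelling is
     | // an assumption about the assembler syntax):
     | //   global_inv scope:SCOPE_SYS   ; system scope
     | //   global_inv scope:SCOPE_DEV   ; agent scope
     | //   global_inv scope:SCOPE_SE    ; work-group scope in WGP mode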
2449 | |
2450 | bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, |
2451 | SIAtomicScope Scope, |
2452 | SIAtomicAddrSpace AddrSpace, |
2453 | bool IsCrossAddrSpaceOrdering, |
2454 | Position Pos) const { |
2455 | MachineBasicBlock &MBB = *MI->getParent(); |
2456 | DebugLoc DL = MI->getDebugLoc(); |
2457 | |
2458 | // The scratch address space does not need the global memory cache |
2459 | // writeback as all memory operations by the same thread are |
2460 | // sequentially consistent, and no other thread can access scratch |
2461 | // memory. |
2462 | |
2463 | // Other address spaces do not have a cache. |
2464 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) |
2465 | return false; |
2466 | |
2467 | if (Pos == Position::AFTER) |
2468 | ++MI; |
2469 | |
2470 | // global_wb is only necessary at system scope for gfx120x targets. |
2471 | // |
2472 | // Emitting it for lower scopes is a slow no-op, so we omit it |
2473 | // for performance. |
2474 | switch (Scope) { |
2475 | case SIAtomicScope::SYSTEM: |
2476 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GLOBAL_WB)) |
2477 | .addImm(Val: AMDGPU::CPol::SCOPE_SYS); |
2478 | break; |
2479 | case SIAtomicScope::AGENT: |
2480 | case SIAtomicScope::WORKGROUP: |
2481 | // No WB necessary, but we still have to wait. |
2482 | break; |
2483 | case SIAtomicScope::WAVEFRONT: |
2484 | case SIAtomicScope::SINGLETHREAD: |
2485 | // No WB or wait necessary here. |
2486 | return false; |
2487 | default: |
2488 | llvm_unreachable("Unsupported synchronization scope" ); |
2489 | } |
2490 | |
2491 | if (Pos == Position::AFTER) |
2492 | --MI; |
2493 | |
2494 | // We always have to wait for previous memory operations (load/store) to |
2495 | // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), |
2496 | // we of course need to wait for that as well. |
2497 | insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE, |
2498 | IsCrossAddrSpaceOrdering, Pos, Order: AtomicOrdering::Release); |
2499 | |
2500 | return true; |
2501 | } |
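     | // Illustrative sketch: a system-scope release on GFX12 is expected to expand
     | // to roughly
     | //   global_wb scope:SCOPE_SYS
     | //   s_wait_loadcnt 0x0
     | //   s_wait_storecnt 0x0
     | // (plus the bvh/sample counter waits on targets that have them), with the
     | // waits coming from the insertWait call above; lower scopes skip the
     | // global_wb but still wait.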
2502 | |
2503 | bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( |
2504 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
2505 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
2506 | |
2507 | // Only handle load and store, not atomic read-modify-write instructions. |
2508 | assert(MI->mayLoad() ^ MI->mayStore()); |
2509 | |
2510 |   // Only update load and store, not LLVM IR atomic read-modify-write
2511 |   // instructions. The latter are always marked as volatile, so they cannot be
2512 |   // handled here without pessimizing all atomics. They also do not support the
2513 |   // nontemporal attribute.
2514 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
2515 | |
2516 | bool Changed = false; |
2517 | |
2518 | if (IsLastUse) { |
2519 | // Set last-use hint. |
2520 | Changed |= setTH(MI, Value: AMDGPU::CPol::TH_LU); |
2521 | } else if (IsNonTemporal) { |
2522 | // Set non-temporal hint for all cache levels. |
2523 | Changed |= setTH(MI, Value: AMDGPU::CPol::TH_NT); |
2524 | } |
2525 | |
2526 | if (IsVolatile) { |
2527 | Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SYS); |
2528 | |
2529 | if (Op == SIMemOp::STORE) |
2530 | Changed |= insertWaitsBeforeSystemScopeStore(MI); |
2531 | |
2532 | // Ensure operation has completed at system scope to cause all volatile |
2533 | // operations to be visible outside the program in a global order. Do not |
2534 | // request cross address space as only the global address space can be |
2535 | // observable outside the program, so no need to cause a waitcnt for LDS |
2536 | // address space operations. |
2537 | Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false, |
2538 | Pos: Position::AFTER, Order: AtomicOrdering::Unordered); |
2539 | } |
2540 | |
2541 | return Changed; |
2542 | } |
2543 | |
2544 | bool SIGfx12CacheControl::expandSystemScopeStore( |
2545 | MachineBasicBlock::iterator &MI) const { |
2546 | MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol); |
2547 | if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) |
2548 | return insertWaitsBeforeSystemScopeStore(MI); |
2549 | |
2550 | return false; |
2551 | } |
2552 | |
2553 | bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, |
2554 | SIAtomicScope Scope, |
2555 | SIAtomicAddrSpace AddrSpace) const { |
2556 | bool Changed = false; |
2557 | |
2558 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
2559 | switch (Scope) { |
2560 | case SIAtomicScope::SYSTEM: |
2561 | Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SYS); |
2562 | break; |
2563 | case SIAtomicScope::AGENT: |
2564 | Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_DEV); |
2565 | break; |
2566 | case SIAtomicScope::WORKGROUP: |
2567 |       // In WGP mode, SCOPE_SE is needed as waves can execute on different CUs
2568 |       // that access different L0s.
2569 | if (!ST.isCuModeEnabled()) |
2570 | Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SE); |
2571 | break; |
2572 | case SIAtomicScope::WAVEFRONT: |
2573 | case SIAtomicScope::SINGLETHREAD: |
2574 | // No cache to bypass. |
2575 | break; |
2576 | default: |
2577 | llvm_unreachable("Unsupported synchronization scope" ); |
2578 | } |
2579 | } |
2580 | |
2581 | // The scratch address space does not need the global memory caches |
2582 | // to be bypassed as all memory operations by the same thread are |
2583 | // sequentially consistent, and no other thread can access scratch |
2584 | // memory. |
2585 | |
2586 | // Other address spaces do not have a cache. |
2587 | |
2588 | return Changed; |
2589 | } |
2590 | |
2591 | bool SIMemoryLegalizer::removeAtomicPseudoMIs() { |
2592 | if (AtomicPseudoMIs.empty()) |
2593 | return false; |
2594 | |
2595 | for (auto &MI : AtomicPseudoMIs) |
2596 | MI->eraseFromParent(); |
2597 | |
2598 | AtomicPseudoMIs.clear(); |
2599 | return true; |
2600 | } |
2601 | |
2602 | bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, |
2603 | MachineBasicBlock::iterator &MI) { |
2604 | assert(MI->mayLoad() && !MI->mayStore()); |
2605 | |
2606 | bool Changed = false; |
2607 | |
2608 | if (MOI.isAtomic()) { |
2609 | const AtomicOrdering Order = MOI.getOrdering(); |
2610 | if (Order == AtomicOrdering::Monotonic || |
2611 | Order == AtomicOrdering::Acquire || |
2612 | Order == AtomicOrdering::SequentiallyConsistent) { |
2613 | Changed |= CC->enableLoadCacheBypass(MI, Scope: MOI.getScope(), |
2614 | AddrSpace: MOI.getOrderingAddrSpace()); |
2615 | } |
2616 | |
2617 | if (Order == AtomicOrdering::SequentiallyConsistent) |
2618 | Changed |= CC->insertWait(MI, Scope: MOI.getScope(), AddrSpace: MOI.getOrderingAddrSpace(), |
2619 | Op: SIMemOp::LOAD | SIMemOp::STORE, |
2620 | IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), |
2621 | Pos: Position::BEFORE, Order); |
2622 | |
2623 | if (Order == AtomicOrdering::Acquire || |
2624 | Order == AtomicOrdering::SequentiallyConsistent) { |
2625 | Changed |= CC->insertWait( |
2626 | MI, Scope: MOI.getScope(), AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::LOAD, |
2627 | IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), Pos: Position::AFTER, Order); |
2628 | Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(), |
2629 | AddrSpace: MOI.getOrderingAddrSpace(), |
2630 | Pos: Position::AFTER); |
2631 | } |
2632 | |
2633 | return Changed; |
2634 | } |
2635 | |
2636 | // Atomic instructions already bypass caches to the scope specified by the |
2637 | // SyncScope operand. Only non-atomic volatile and nontemporal/last-use |
2638 | // instructions need additional treatment. |
2639 | Changed |= CC->enableVolatileAndOrNonTemporal( |
2640 | MI, AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::LOAD, IsVolatile: MOI.isVolatile(), |
2641 | IsNonTemporal: MOI.isNonTemporal(), IsLastUse: MOI.isLastUse()); |
2642 | |
2643 | return Changed; |
2644 | } |
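     | // End-to-end illustration (a sketch, not compiler output): on GFX10, an
     | // agent-scope sequentially consistent atomic load is expected to be legalized
     | // along the lines of
     | //   s_waitcnt vmcnt(0) lgkmcnt(0)
     | //   s_waitcnt_vscnt null, 0x0
     | //   flat_load_dword v0, v[0:1] glc dlc
     | //   s_waitcnt vmcnt(0)
     | //   buffer_gl1_inv
     | //   buffer_gl0_inv
     | // i.e. a wait before, cache bypass on the load, then a wait and an invalidate
     | // after, exactly as the ordering cases above request.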
2645 | |
2646 | bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, |
2647 | MachineBasicBlock::iterator &MI) { |
2648 | assert(!MI->mayLoad() && MI->mayStore()); |
2649 | |
2650 | bool Changed = false; |
2651 | |
2652 | if (MOI.isAtomic()) { |
2653 | if (MOI.getOrdering() == AtomicOrdering::Monotonic || |
2654 | MOI.getOrdering() == AtomicOrdering::Release || |
2655 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { |
2656 | Changed |= CC->enableStoreCacheBypass(MI, Scope: MOI.getScope(), |
2657 | AddrSpace: MOI.getOrderingAddrSpace()); |
2658 | } |
2659 | |
2660 | if (MOI.getOrdering() == AtomicOrdering::Release || |
2661 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) |
2662 | Changed |= CC->insertRelease(MI, Scope: MOI.getScope(), |
2663 | AddrSpace: MOI.getOrderingAddrSpace(), |
2664 | IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), |
2665 | Pos: Position::BEFORE); |
2666 | |
2667 | return Changed; |
2668 | } |
2669 | |
2670 | // Atomic instructions already bypass caches to the scope specified by the |
2671 | // SyncScope operand. Only non-atomic volatile and nontemporal instructions |
2672 | // need additional treatment. |
2673 | Changed |= CC->enableVolatileAndOrNonTemporal( |
2674 | MI, AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::STORE, IsVolatile: MOI.isVolatile(), |
2675 | IsNonTemporal: MOI.isNonTemporal()); |
2676 | |
2677 |   // GFX12 specific: the scope (the desired coherence domain in the cache
2678 |   // hierarchy) is an instruction field; do not confuse it with the atomic scope.
2679 | Changed |= CC->expandSystemScopeStore(MI); |
2680 | return Changed; |
2681 | } |
2682 | |
2683 | bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, |
2684 | MachineBasicBlock::iterator &MI) { |
2685 | assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); |
2686 | |
2687 | AtomicPseudoMIs.push_back(x: MI); |
2688 | bool Changed = false; |
2689 | |
2690 | // Refine fenced address space based on MMRAs. |
2691 | // |
2692 | // TODO: Should we support this MMRA on other atomic operations? |
2693 | auto OrderingAddrSpace = |
2694 | getFenceAddrSpaceMMRA(MI: *MI, Default: MOI.getOrderingAddrSpace()); |
2695 | |
2696 | if (MOI.isAtomic()) { |
2697 | const AtomicOrdering Order = MOI.getOrdering(); |
2698 | if (Order == AtomicOrdering::Acquire) { |
2699 | Changed |= CC->insertWait( |
2700 | MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE, |
2701 | IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), Pos: Position::BEFORE, Order); |
2702 | } |
2703 | |
2704 | if (Order == AtomicOrdering::Release || |
2705 | Order == AtomicOrdering::AcquireRelease || |
2706 | Order == AtomicOrdering::SequentiallyConsistent) |
2707 |       /// TODO: This relies on a barrier always generating a waitcnt
2708 |       /// for LDS to ensure it is not reordered with the completion of
2709 |       /// the preceding LDS operations. If the barrier had a memory
2710 |       /// ordering and memory scope, then the library would not need to
2711 |       /// generate a fence. Support for the barrier could be added in
2712 |       /// this file. SIInsertWaitcnt.cpp could then stop unconditionally
2713 |       /// adding S_WAITCNT before an S_BARRIER.
2714 | Changed |= CC->insertRelease(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace, |
2715 | IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), |
2716 | Pos: Position::BEFORE); |
2717 | |
2718 | // TODO: If both release and invalidate are happening they could be combined |
2719 | // to use the single "BUFFER_WBINV*" instruction. This could be done by |
2720 |     // reorganizing this code or as part of optimizing the SIInsertWaitcnt pass to
2721 | // track cache invalidate and write back instructions. |
2722 | |
2723 | if (Order == AtomicOrdering::Acquire || |
2724 | Order == AtomicOrdering::AcquireRelease || |
2725 | Order == AtomicOrdering::SequentiallyConsistent) |
2726 | Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace, |
2727 | Pos: Position::BEFORE); |
2728 | |
2729 | return Changed; |
2730 | } |
2731 | |
2732 | return Changed; |
2733 | } |
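     | // Illustration (sketch): a fence syncscope("agent") acq_rel is therefore
     | // expected to become, on GFX90A, roughly
     | //   s_waitcnt vmcnt(0) lgkmcnt(0)
     | //   buffer_wbinvl1_vol
     | // with the wait produced by the release handling and the invalidate by the
     | // acquire handling selected above.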
2734 | |
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(
    const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    const AtomicOrdering Order = MOI.getOrdering();
    if (Order == AtomicOrdering::Monotonic ||
        Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
        Order == AtomicOrdering::AcquireRelease ||
        Order == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (Order == AtomicOrdering::Release ||
        Order == AtomicOrdering::AcquireRelease ||
        Order == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (Order == AtomicOrdering::Acquire ||
        Order == AtomicOrdering::AcquireRelease ||
        Order == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD
                                                 : SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER, Order);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}
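// Legacy pass manager entry point: fetch MachineModuleInfo from the wrapper
// pass and delegate to the shared SIMemoryLegalizer implementation.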
bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
  const MachineModuleInfo &MMI =
      getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  return SIMemoryLegalizer(MMI).run(MF);
}
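// New pass manager entry point. MachineModuleAnalysis is a module-level
// analysis, so only its cached result is available through the module proxy
// here. If nothing changed, all analyses are preserved; otherwise the CFG is
// still preserved, since the pass only inserts and erases instructions
// within basic blocks.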
PreservedAnalyses
SIMemoryLegalizerPass::run(MachineFunction &MF,
                           MachineFunctionAnalysisManager &MFAM) {
  auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
                  .getCachedResult<MachineModuleAnalysis>(
                      *MF.getFunction().getParent());
  assert(MMI && "MachineModuleAnalysis must be available");
  if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
    return PreservedAnalyses::all();
  return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
}
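// Walk every instruction in the function, unbundle memory bundles produced
// by the post-RA scheduler, and expand each instruction that may participate
// in the memory model: loads, stores, atomic fences, and atomic
// cmpxchg/read-modify-write operations.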
bool SIMemoryLegalizer::run(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      // Skip instructions that can never participate in the memory model.
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(*MOI, MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizerLegacy::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizerLegacy();
}