1 | //===- SIMemoryLegalizer.cpp ----------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Memory legalizer - implements memory model. More information can be |
11 | /// found here: |
12 | /// http://llvm.org/docs/AMDGPUUsage.html#memory-model |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #include "AMDGPU.h" |
17 | #include "AMDGPUMachineModuleInfo.h" |
18 | #include "GCNSubtarget.h" |
19 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
20 | #include "llvm/ADT/BitmaskEnum.h" |
21 | #include "llvm/ADT/StringExtras.h" |
22 | #include "llvm/CodeGen/MachineBasicBlock.h" |
23 | #include "llvm/CodeGen/MachineFunctionPass.h" |
24 | #include "llvm/IR/DiagnosticInfo.h" |
25 | #include "llvm/IR/MemoryModelRelaxationAnnotations.h" |
26 | #include "llvm/Support/AtomicOrdering.h" |
27 | #include "llvm/TargetParser/TargetParser.h" |
28 | |
29 | using namespace llvm; |
30 | using namespace llvm::AMDGPU; |
31 | |
32 | #define DEBUG_TYPE "si-memory-legalizer" |
33 | #define PASS_NAME "SI Memory Legalizer" |
34 | |
static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));
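// Illustrative use (not part of the pass itself): the flag can be passed to
// llc to measure the cost of the invalidations, e.g.
//   llc -mtriple=amdgcn -mcpu=gfx90a -amdgcn-skip-cache-invalidations in.ll
// where the target and input file above are placeholders.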
38 | |
39 | namespace { |
40 | |
41 | LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); |
42 | |
43 | /// Memory operation flags. Can be ORed together. |
44 | enum class SIMemOp { |
45 | NONE = 0u, |
46 | LOAD = 1u << 0, |
47 | STORE = 1u << 1, |
48 | LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) |
49 | }; |
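// SIMemOp is a bitmask: a read-modify-write access can be described as
// (SIMemOp::LOAD | SIMemOp::STORE), and a test such as
// ((Op & SIMemOp::LOAD) != SIMemOp::NONE) checks for a load component
// (illustrative usage of the enum defined above).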
50 | |
51 | /// Position to insert a new instruction relative to an existing |
52 | /// instruction. |
53 | enum class Position { |
54 | BEFORE, |
55 | AFTER |
56 | }; |
57 | |
58 | /// The atomic synchronization scopes supported by the AMDGPU target. |
59 | enum class SIAtomicScope { |
60 | NONE, |
61 | SINGLETHREAD, |
62 | WAVEFRONT, |
63 | WORKGROUP, |
64 | AGENT, |
65 | SYSTEM |
66 | }; |
67 | |
68 | /// The distinct address spaces supported by the AMDGPU target for |
69 | /// atomic memory operation. Can be ORed together. |
70 | enum class SIAtomicAddrSpace { |
71 | NONE = 0u, |
72 | GLOBAL = 1u << 0, |
73 | LDS = 1u << 1, |
74 | SCRATCH = 1u << 2, |
75 | GDS = 1u << 3, |
76 | OTHER = 1u << 4, |
77 | |
78 | /// The address spaces that can be accessed by a FLAT instruction. |
79 | FLAT = GLOBAL | LDS | SCRATCH, |
80 | |
81 | /// The address spaces that support atomic instructions. |
82 | ATOMIC = GLOBAL | LDS | SCRATCH | GDS, |
83 | |
84 | /// All address spaces. |
85 | ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, |
86 | |
87 | LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) |
88 | }; |
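// For example, a FLAT access reports SIAtomicAddrSpace::FLAT, and masking it
// with SIAtomicAddrSpace::ATOMIC leaves GLOBAL | LDS | SCRATCH, the address
// spaces that can participate in atomic ordering (illustrative).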
89 | |
90 | class SIMemOpInfo final { |
91 | private: |
92 | |
93 | friend class SIMemOpAccess; |
94 | |
95 | AtomicOrdering Ordering = AtomicOrdering::NotAtomic; |
96 | AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; |
97 | SIAtomicScope Scope = SIAtomicScope::SYSTEM; |
98 | SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; |
99 | SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; |
100 | bool IsCrossAddressSpaceOrdering = false; |
101 | bool IsVolatile = false; |
102 | bool IsNonTemporal = false; |
103 | bool IsLastUse = false; |
104 | |
105 | SIMemOpInfo( |
106 | AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, |
107 | SIAtomicScope Scope = SIAtomicScope::SYSTEM, |
108 | SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, |
109 | SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, |
110 | bool IsCrossAddressSpaceOrdering = true, |
111 | AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent, |
112 | bool IsVolatile = false, bool IsNonTemporal = false, |
113 | bool IsLastUse = false) |
114 | : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope), |
115 | OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace), |
116 | IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), |
117 | IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal), |
118 | IsLastUse(IsLastUse) { |
119 | |
120 | if (Ordering == AtomicOrdering::NotAtomic) { |
121 | assert(Scope == SIAtomicScope::NONE && |
122 | OrderingAddrSpace == SIAtomicAddrSpace::NONE && |
123 | !IsCrossAddressSpaceOrdering && |
124 | FailureOrdering == AtomicOrdering::NotAtomic); |
125 | return; |
126 | } |
127 | |
128 | assert(Scope != SIAtomicScope::NONE && |
129 | (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != |
130 | SIAtomicAddrSpace::NONE && |
131 | (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) != |
132 | SIAtomicAddrSpace::NONE); |
133 | |
134 | // There is also no cross address space ordering if the ordering |
135 | // address space is the same as the instruction address space and |
136 | // only contains a single address space. |
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
140 | |
141 | // Limit the scope to the maximum supported by the instruction's address |
142 | // spaces. |
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
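    // For instance (illustrative): an atomic that only touches LDS is
    // narrowed from AGENT to WORKGROUP here, since LDS is only shared within
    // a work-group, and a SCRATCH-only access is narrowed to SINGLETHREAD.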
155 | } |
156 | |
157 | public: |
158 | /// \returns Atomic synchronization scope of the machine instruction used to |
159 | /// create this SIMemOpInfo. |
160 | SIAtomicScope getScope() const { |
161 | return Scope; |
162 | } |
163 | |
164 | /// \returns Ordering constraint of the machine instruction used to |
165 | /// create this SIMemOpInfo. |
166 | AtomicOrdering getOrdering() const { |
167 | return Ordering; |
168 | } |
169 | |
170 | /// \returns Failure ordering constraint of the machine instruction used to |
171 | /// create this SIMemOpInfo. |
172 | AtomicOrdering getFailureOrdering() const { |
173 | return FailureOrdering; |
174 | } |
175 | |
  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
178 | SIAtomicAddrSpace getInstrAddrSpace() const { |
179 | return InstrAddrSpace; |
180 | } |
181 | |
182 | /// \returns The address spaces that must be ordered by the machine |
183 | /// instruction used to create this SIMemOpInfo. |
184 | SIAtomicAddrSpace getOrderingAddrSpace() const { |
185 | return OrderingAddrSpace; |
186 | } |
187 | |
  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
190 | bool getIsCrossAddressSpaceOrdering() const { |
191 | return IsCrossAddressSpaceOrdering; |
192 | } |
193 | |
194 | /// \returns True if memory access of the machine instruction used to |
195 | /// create this SIMemOpInfo is volatile, false otherwise. |
196 | bool isVolatile() const { |
197 | return IsVolatile; |
198 | } |
199 | |
200 | /// \returns True if memory access of the machine instruction used to |
201 | /// create this SIMemOpInfo is nontemporal, false otherwise. |
202 | bool isNonTemporal() const { |
203 | return IsNonTemporal; |
204 | } |
205 | |
206 | /// \returns True if memory access of the machine instruction used to |
207 | /// create this SIMemOpInfo is last use, false otherwise. |
208 | bool isLastUse() const { return IsLastUse; } |
209 | |
210 | /// \returns True if ordering constraint of the machine instruction used to |
211 | /// create this SIMemOpInfo is unordered or higher, false otherwise. |
212 | bool isAtomic() const { |
213 | return Ordering != AtomicOrdering::NotAtomic; |
214 | } |
215 | |
216 | }; |
217 | |
218 | class SIMemOpAccess final { |
219 | private: |
220 | const AMDGPUMachineModuleInfo *MMI = nullptr; |
221 | |
222 | /// Reports unsupported message \p Msg for \p MI to LLVM context. |
223 | void reportUnsupported(const MachineBasicBlock::iterator &MI, |
224 | const char *Msg) const; |
225 | |
226 | /// Inspects the target synchronization scope \p SSID and determines |
227 | /// the SI atomic scope it corresponds to, the address spaces it |
228 | /// covers, and whether the memory ordering applies between address |
229 | /// spaces. |
230 | std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> |
231 | toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const; |
232 | |
  /// \returns The bit set of SI atomic address spaces corresponding to the
  /// LLVM address space \p AS.
234 | SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; |
235 | |
236 | /// \returns Info constructed from \p MI, which has at least machine memory |
237 | /// operand. |
238 | std::optional<SIMemOpInfo> |
239 | constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const; |
240 | |
241 | public: |
242 | /// Construct class to support accessing the machine memory operands |
243 | /// of instructions in the machine function \p MF. |
244 | SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI); |
245 | |
246 | /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise. |
247 | std::optional<SIMemOpInfo> |
248 | getLoadInfo(const MachineBasicBlock::iterator &MI) const; |
249 | |
250 | /// \returns Store info if \p MI is a store operation, "std::nullopt" |
251 | /// otherwise. |
252 | std::optional<SIMemOpInfo> |
253 | getStoreInfo(const MachineBasicBlock::iterator &MI) const; |
254 | |
255 | /// \returns Atomic fence info if \p MI is an atomic fence operation, |
256 | /// "std::nullopt" otherwise. |
257 | std::optional<SIMemOpInfo> |
258 | getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const; |
259 | |
260 | /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or |
261 | /// rmw operation, "std::nullopt" otherwise. |
262 | std::optional<SIMemOpInfo> |
263 | getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const; |
264 | }; |
265 | |
266 | class SICacheControl { |
267 | protected: |
268 | |
269 | /// AMDGPU subtarget info. |
270 | const GCNSubtarget &ST; |
271 | |
272 | /// Instruction info. |
273 | const SIInstrInfo *TII = nullptr; |
274 | |
275 | IsaVersion IV; |
276 | |
277 | /// Whether to insert cache invalidating instructions. |
278 | bool InsertCacheInv; |
279 | |
280 | SICacheControl(const GCNSubtarget &ST); |
281 | |
  /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
284 | bool enableNamedBit(const MachineBasicBlock::iterator MI, |
285 | AMDGPU::CPol::CPol Bit) const; |
286 | |
287 | public: |
288 | |
289 | /// Create a cache control for the subtarget \p ST. |
290 | static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); |
291 | |
292 | /// Update \p MI memory load instruction to bypass any caches up to |
293 | /// the \p Scope memory scope for address spaces \p |
294 | /// AddrSpace. Return true iff the instruction was modified. |
295 | virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
296 | SIAtomicScope Scope, |
297 | SIAtomicAddrSpace AddrSpace) const = 0; |
298 | |
299 | /// Update \p MI memory store instruction to bypass any caches up to |
300 | /// the \p Scope memory scope for address spaces \p |
301 | /// AddrSpace. Return true iff the instruction was modified. |
302 | virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, |
303 | SIAtomicScope Scope, |
304 | SIAtomicAddrSpace AddrSpace) const = 0; |
305 | |
306 | /// Update \p MI memory read-modify-write instruction to bypass any caches up |
307 | /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true |
308 | /// iff the instruction was modified. |
309 | virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, |
310 | SIAtomicScope Scope, |
311 | SIAtomicAddrSpace AddrSpace) const = 0; |
312 | |
313 | /// Update \p MI memory instruction of kind \p Op associated with address |
314 | /// spaces \p AddrSpace to indicate it is volatile and/or |
315 | /// nontemporal/last-use. Return true iff the instruction was modified. |
316 | virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
317 | SIAtomicAddrSpace AddrSpace, |
318 | SIMemOp Op, bool IsVolatile, |
319 | bool IsNonTemporal, |
320 | bool IsLastUse = false) const = 0; |
321 | |
322 | virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const { |
323 | return false; |
324 | }; |
325 | |
326 | /// Inserts any necessary instructions at position \p Pos relative |
327 | /// to instruction \p MI to ensure memory instructions before \p Pos of kind |
328 | /// \p Op associated with address spaces \p AddrSpace have completed. Used |
329 | /// between memory instructions to enforce the order they become visible as |
330 | /// observed by other memory instructions executing in memory scope \p Scope. |
331 | /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between |
332 | /// address spaces. Returns true iff any instructions inserted. |
333 | virtual bool insertWait(MachineBasicBlock::iterator &MI, |
334 | SIAtomicScope Scope, |
335 | SIAtomicAddrSpace AddrSpace, |
336 | SIMemOp Op, |
337 | bool IsCrossAddrSpaceOrdering, |
338 | Position Pos) const = 0; |
339 | |
340 | /// Inserts any necessary instructions at position \p Pos relative to |
341 | /// instruction \p MI to ensure any subsequent memory instructions of this |
342 | /// thread with address spaces \p AddrSpace will observe the previous memory |
  /// operations by any thread for memory scopes up to memory scope \p Scope.
344 | /// Returns true iff any instructions inserted. |
345 | virtual bool insertAcquire(MachineBasicBlock::iterator &MI, |
346 | SIAtomicScope Scope, |
347 | SIAtomicAddrSpace AddrSpace, |
348 | Position Pos) const = 0; |
349 | |
350 | /// Inserts any necessary instructions at position \p Pos relative to |
351 | /// instruction \p MI to ensure previous memory instructions by this thread |
352 | /// with address spaces \p AddrSpace have completed and can be observed by |
353 | /// subsequent memory instructions by any thread executing in memory scope \p |
354 | /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is |
355 | /// between address spaces. Returns true iff any instructions inserted. |
356 | virtual bool insertRelease(MachineBasicBlock::iterator &MI, |
357 | SIAtomicScope Scope, |
358 | SIAtomicAddrSpace AddrSpace, |
359 | bool IsCrossAddrSpaceOrdering, |
360 | Position Pos) const = 0; |
361 | |
362 | /// Virtual destructor to allow derivations to be deleted. |
363 | virtual ~SICacheControl() = default; |
364 | |
365 | virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, |
366 | MachineBasicBlock::iterator &MI) const { |
367 | return false; |
368 | } |
369 | }; |
370 | |
371 | class SIGfx6CacheControl : public SICacheControl { |
372 | protected: |
373 | |
374 | /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI |
375 | /// is modified, false otherwise. |
376 | bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { |
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
378 | } |
379 | |
380 | /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI |
381 | /// is modified, false otherwise. |
382 | bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { |
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
384 | } |
385 | |
386 | public: |
387 | |
388 | SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} |
389 | |
390 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
391 | SIAtomicScope Scope, |
392 | SIAtomicAddrSpace AddrSpace) const override; |
393 | |
394 | bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, |
395 | SIAtomicScope Scope, |
396 | SIAtomicAddrSpace AddrSpace) const override; |
397 | |
398 | bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, |
399 | SIAtomicScope Scope, |
400 | SIAtomicAddrSpace AddrSpace) const override; |
401 | |
402 | bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
403 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
404 | bool IsVolatile, bool IsNonTemporal, |
405 | bool IsLastUse) const override; |
406 | |
407 | bool insertWait(MachineBasicBlock::iterator &MI, |
408 | SIAtomicScope Scope, |
409 | SIAtomicAddrSpace AddrSpace, |
410 | SIMemOp Op, |
411 | bool IsCrossAddrSpaceOrdering, |
412 | Position Pos) const override; |
413 | |
414 | bool insertAcquire(MachineBasicBlock::iterator &MI, |
415 | SIAtomicScope Scope, |
416 | SIAtomicAddrSpace AddrSpace, |
417 | Position Pos) const override; |
418 | |
419 | bool insertRelease(MachineBasicBlock::iterator &MI, |
420 | SIAtomicScope Scope, |
421 | SIAtomicAddrSpace AddrSpace, |
422 | bool IsCrossAddrSpaceOrdering, |
423 | Position Pos) const override; |
424 | }; |
425 | |
426 | class SIGfx7CacheControl : public SIGfx6CacheControl { |
427 | public: |
428 | |
429 | SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} |
430 | |
431 | bool insertAcquire(MachineBasicBlock::iterator &MI, |
432 | SIAtomicScope Scope, |
433 | SIAtomicAddrSpace AddrSpace, |
434 | Position Pos) const override; |
435 | |
436 | }; |
437 | |
438 | class SIGfx90ACacheControl : public SIGfx7CacheControl { |
439 | public: |
440 | |
441 | SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} |
442 | |
443 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
444 | SIAtomicScope Scope, |
445 | SIAtomicAddrSpace AddrSpace) const override; |
446 | |
447 | bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, |
448 | SIAtomicScope Scope, |
449 | SIAtomicAddrSpace AddrSpace) const override; |
450 | |
451 | bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, |
452 | SIAtomicScope Scope, |
453 | SIAtomicAddrSpace AddrSpace) const override; |
454 | |
455 | bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
456 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
457 | bool IsVolatile, bool IsNonTemporal, |
458 | bool IsLastUse) const override; |
459 | |
460 | bool insertWait(MachineBasicBlock::iterator &MI, |
461 | SIAtomicScope Scope, |
462 | SIAtomicAddrSpace AddrSpace, |
463 | SIMemOp Op, |
464 | bool IsCrossAddrSpaceOrdering, |
465 | Position Pos) const override; |
466 | |
467 | bool insertAcquire(MachineBasicBlock::iterator &MI, |
468 | SIAtomicScope Scope, |
469 | SIAtomicAddrSpace AddrSpace, |
470 | Position Pos) const override; |
471 | |
472 | bool insertRelease(MachineBasicBlock::iterator &MI, |
473 | SIAtomicScope Scope, |
474 | SIAtomicAddrSpace AddrSpace, |
475 | bool IsCrossAddrSpaceOrdering, |
476 | Position Pos) const override; |
477 | }; |
478 | |
479 | class SIGfx940CacheControl : public SIGfx90ACacheControl { |
480 | protected: |
481 | |
482 | /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI |
483 | /// is modified, false otherwise. |
484 | bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { |
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
486 | } |
487 | |
488 | /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI |
489 | /// is modified, false otherwise. |
490 | bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { |
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
492 | } |
493 | |
494 | /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI |
495 | /// is modified, false otherwise. |
496 | bool enableNTBit(const MachineBasicBlock::iterator &MI) const { |
    return enableNamedBit(MI, AMDGPU::CPol::NT);
498 | } |
499 | |
500 | public: |
501 | |
502 | SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; |
503 | |
504 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
505 | SIAtomicScope Scope, |
506 | SIAtomicAddrSpace AddrSpace) const override; |
507 | |
508 | bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, |
509 | SIAtomicScope Scope, |
510 | SIAtomicAddrSpace AddrSpace) const override; |
511 | |
512 | bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, |
513 | SIAtomicScope Scope, |
514 | SIAtomicAddrSpace AddrSpace) const override; |
515 | |
516 | bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
517 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
518 | bool IsVolatile, bool IsNonTemporal, |
519 | bool IsLastUse) const override; |
520 | |
521 | bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
522 | SIAtomicAddrSpace AddrSpace, Position Pos) const override; |
523 | |
524 | bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
525 | SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, |
526 | Position Pos) const override; |
527 | |
528 | bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, |
529 | MachineBasicBlock::iterator &MI) const override { |
530 | bool Changed = false; |
531 | if (ST.hasForceStoreSC0SC1() && |
532 | (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | |
533 | SIAtomicAddrSpace::GLOBAL | |
534 | SIAtomicAddrSpace::OTHER)) != |
535 | SIAtomicAddrSpace::NONE) { |
536 | Changed |= enableSC0Bit(MI); |
537 | Changed |= enableSC1Bit(MI); |
538 | } |
539 | return Changed; |
540 | } |
541 | }; |
542 | |
543 | class SIGfx10CacheControl : public SIGfx7CacheControl { |
544 | protected: |
545 | |
546 | /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI |
547 | /// is modified, false otherwise. |
548 | bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { |
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
550 | } |
551 | |
552 | public: |
553 | |
554 | SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} |
555 | |
556 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
557 | SIAtomicScope Scope, |
558 | SIAtomicAddrSpace AddrSpace) const override; |
559 | |
560 | bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
561 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
562 | bool IsVolatile, bool IsNonTemporal, |
563 | bool IsLastUse) const override; |
564 | |
565 | bool insertWait(MachineBasicBlock::iterator &MI, |
566 | SIAtomicScope Scope, |
567 | SIAtomicAddrSpace AddrSpace, |
568 | SIMemOp Op, |
569 | bool IsCrossAddrSpaceOrdering, |
570 | Position Pos) const override; |
571 | |
572 | bool insertAcquire(MachineBasicBlock::iterator &MI, |
573 | SIAtomicScope Scope, |
574 | SIAtomicAddrSpace AddrSpace, |
575 | Position Pos) const override; |
576 | }; |
577 | |
578 | class SIGfx11CacheControl : public SIGfx10CacheControl { |
579 | public: |
580 | SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} |
581 | |
582 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
583 | SIAtomicScope Scope, |
584 | SIAtomicAddrSpace AddrSpace) const override; |
585 | |
586 | bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
587 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
588 | bool IsVolatile, bool IsNonTemporal, |
589 | bool IsLastUse) const override; |
590 | }; |
591 | |
592 | class SIGfx12CacheControl : public SIGfx11CacheControl { |
593 | protected: |
594 | // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. |
595 | // \returns Returns true if \p MI is modified, false otherwise. |
596 | bool setTH(const MachineBasicBlock::iterator MI, |
597 | AMDGPU::CPol::CPol Value) const; |
598 | // Sets Scope policy to \p Value if CPol operand is present in instruction \p |
599 | // MI. \returns Returns true if \p MI is modified, false otherwise. |
600 | bool setScope(const MachineBasicBlock::iterator MI, |
601 | AMDGPU::CPol::CPol Value) const; |
602 | |
603 | // Stores with system scope (SCOPE_SYS) need to wait for: |
604 | // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0 |
605 | // - non-returning-atomics - wait for STORECNT==0 |
606 | // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits |
607 | // since it does not distinguish atomics-with-return from regular stores. |
608 | // There is no need to wait if memory is cached (mtype != UC). |
609 | bool |
610 | insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const; |
611 | |
612 | bool setAtomicScope(const MachineBasicBlock::iterator &MI, |
613 | SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; |
614 | |
615 | public: |
616 | SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} |
617 | |
618 | bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
619 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
620 | bool IsCrossAddrSpaceOrdering, Position Pos) const override; |
621 | |
622 | bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
623 | SIAtomicAddrSpace AddrSpace, Position Pos) const override; |
624 | |
625 | bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, |
626 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
627 | bool IsVolatile, bool IsNonTemporal, |
628 | bool IsLastUse) const override; |
629 | |
630 | bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; |
631 | |
632 | bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
633 | SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, |
634 | Position Pos) const override; |
635 | |
636 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
637 | SIAtomicScope Scope, |
638 | SIAtomicAddrSpace AddrSpace) const override { |
639 | return setAtomicScope(MI, Scope, AddrSpace); |
640 | } |
641 | |
642 | bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, |
643 | SIAtomicScope Scope, |
644 | SIAtomicAddrSpace AddrSpace) const override { |
645 | return setAtomicScope(MI, Scope, AddrSpace); |
646 | } |
647 | |
648 | bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, |
649 | SIAtomicScope Scope, |
650 | SIAtomicAddrSpace AddrSpace) const override { |
651 | return setAtomicScope(MI, Scope, AddrSpace); |
652 | } |
653 | }; |
654 | |
655 | class SIMemoryLegalizer final : public MachineFunctionPass { |
656 | private: |
657 | |
658 | /// Cache Control. |
659 | std::unique_ptr<SICacheControl> CC = nullptr; |
660 | |
661 | /// List of atomic pseudo instructions. |
662 | std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; |
663 | |
  /// Return true iff instruction \p MI is an atomic instruction that
665 | /// returns a result. |
666 | bool isAtomicRet(const MachineInstr &MI) const { |
667 | return SIInstrInfo::isAtomicRet(MI); |
668 | } |
669 | |
670 | /// Removes all processed atomic pseudo instructions from the current |
671 | /// function. Returns true if current function is modified, false otherwise. |
672 | bool removeAtomicPseudoMIs(); |
673 | |
674 | /// Expands load operation \p MI. Returns true if instructions are |
675 | /// added/deleted or \p MI is modified, false otherwise. |
676 | bool expandLoad(const SIMemOpInfo &MOI, |
677 | MachineBasicBlock::iterator &MI); |
678 | /// Expands store operation \p MI. Returns true if instructions are |
679 | /// added/deleted or \p MI is modified, false otherwise. |
680 | bool expandStore(const SIMemOpInfo &MOI, |
681 | MachineBasicBlock::iterator &MI); |
682 | /// Expands atomic fence operation \p MI. Returns true if |
683 | /// instructions are added/deleted or \p MI is modified, false otherwise. |
684 | bool expandAtomicFence(const SIMemOpInfo &MOI, |
685 | MachineBasicBlock::iterator &MI); |
686 | /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if |
687 | /// instructions are added/deleted or \p MI is modified, false otherwise. |
688 | bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, |
689 | MachineBasicBlock::iterator &MI); |
690 | |
691 | public: |
692 | static char ID; |
693 | |
694 | SIMemoryLegalizer() : MachineFunctionPass(ID) {} |
695 | |
696 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
697 | AU.setPreservesCFG(); |
698 | MachineFunctionPass::getAnalysisUsage(AU); |
699 | } |
700 | |
701 | StringRef getPassName() const override { |
702 | return PASS_NAME; |
703 | } |
704 | |
705 | bool runOnMachineFunction(MachineFunction &MF) override; |
706 | }; |
707 | |
static const StringMap<SIAtomicAddrSpace> ASNames = {{
    {"global", SIAtomicAddrSpace::GLOBAL},
    {"local", SIAtomicAddrSpace::LDS},
}};
712 | |
713 | void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) { |
714 | const MachineFunction *MF = MI.getMF(); |
715 | const Function &Fn = MF->getFunction(); |
716 | SmallString<128> Str; |
717 | raw_svector_ostream OS(Str); |
718 | OS << "unknown address space '" << AS << "'; expected one of " ; |
719 | ListSeparator LS; |
720 | for (const auto &[Name, Val] : ASNames) |
721 | OS << LS << '\'' << Name << '\''; |
722 | DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning); |
  Fn.getContext().diagnose(BadTag);
724 | } |
725 | |
726 | /// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA. |
727 | /// If this tag isn't present, or if it has no meaningful values, returns \p |
728 | /// Default. Otherwise returns all the address spaces concerned by the MMRA. |
729 | static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI, |
730 | SIAtomicAddrSpace Default) { |
  static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
732 | |
733 | auto MMRA = MMRAMetadata(MI.getMMRAMetadata()); |
734 | if (!MMRA) |
735 | return Default; |
736 | |
737 | SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE; |
738 | for (const auto &[Prefix, Suffix] : MMRA) { |
739 | if (Prefix != FenceASPrefix) |
740 | continue; |
741 | |
    if (auto It = ASNames.find(Suffix); It != ASNames.end())
      Result |= It->second;
    else
      diagnoseUnknownMMRAASName(MI, Suffix);
746 | } |
747 | |
748 | return (Result != SIAtomicAddrSpace::NONE) ? Result : Default; |
749 | } |
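// Illustrative IR input: a fence carrying the "amdgpu-as" MMRA, e.g.
//   fence syncscope("workgroup") release, !mmra !{!"amdgpu-as", !"local"}
// restricts the fence to the LDS ("local") address space, so the function
// above returns SIAtomicAddrSpace::LDS instead of \p Default (sketch only;
// the exact textual-IR spelling of the attachment is an assumption).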
750 | |
751 | } // end anonymous namespace |
752 | |
753 | void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, |
754 | const char *Msg) const { |
755 | const Function &Func = MI->getParent()->getParent()->getFunction(); |
756 | DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc()); |
  Func.getContext().diagnose(Diag);
758 | } |
759 | |
760 | std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> |
761 | SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, |
762 | SIAtomicAddrSpace InstrAddrSpace) const { |
763 | if (SSID == SyncScope::System) |
764 | return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true); |
765 | if (SSID == MMI->getAgentSSID()) |
766 | return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true); |
767 | if (SSID == MMI->getWorkgroupSSID()) |
768 | return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC, |
769 | true); |
770 | if (SSID == MMI->getWavefrontSSID()) |
771 | return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC, |
772 | true); |
773 | if (SSID == SyncScope::SingleThread) |
774 | return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC, |
775 | true); |
776 | if (SSID == MMI->getSystemOneAddressSpaceSSID()) |
777 | return std::tuple(SIAtomicScope::SYSTEM, |
778 | SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); |
779 | if (SSID == MMI->getAgentOneAddressSpaceSSID()) |
780 | return std::tuple(SIAtomicScope::AGENT, |
781 | SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); |
782 | if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) |
783 | return std::tuple(SIAtomicScope::WORKGROUP, |
784 | SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); |
785 | if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) |
786 | return std::tuple(SIAtomicScope::WAVEFRONT, |
787 | SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); |
788 | if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) |
789 | return std::tuple(SIAtomicScope::SINGLETHREAD, |
790 | SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); |
791 | return std::nullopt; |
792 | } |
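// For example (illustrative): IR syncscope("agent") maps to
// (AGENT, ATOMIC, /*IsCrossAddressSpaceOrdering=*/true), while the
// one-address-space variant syncscope("agent-one-as") maps to
// (AGENT, ATOMIC & InstrAddrSpace, /*IsCrossAddressSpaceOrdering=*/false).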
793 | |
794 | SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { |
795 | if (AS == AMDGPUAS::FLAT_ADDRESS) |
796 | return SIAtomicAddrSpace::FLAT; |
797 | if (AS == AMDGPUAS::GLOBAL_ADDRESS) |
798 | return SIAtomicAddrSpace::GLOBAL; |
799 | if (AS == AMDGPUAS::LOCAL_ADDRESS) |
800 | return SIAtomicAddrSpace::LDS; |
801 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) |
802 | return SIAtomicAddrSpace::SCRATCH; |
803 | if (AS == AMDGPUAS::REGION_ADDRESS) |
804 | return SIAtomicAddrSpace::GDS; |
805 | |
806 | return SIAtomicAddrSpace::OTHER; |
807 | } |
808 | |
809 | SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_) |
810 | : MMI(&MMI_) {} |
811 | |
812 | std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( |
813 | const MachineBasicBlock::iterator &MI) const { |
814 | assert(MI->getNumMemOperands() > 0); |
815 | |
816 | SyncScope::ID SSID = SyncScope::SingleThread; |
817 | AtomicOrdering Ordering = AtomicOrdering::NotAtomic; |
818 | AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; |
819 | SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; |
820 | bool IsNonTemporal = true; |
821 | bool IsVolatile = false; |
822 | bool IsLastUse = false; |
823 | |
824 | // Validator should check whether or not MMOs cover the entire set of |
825 | // locations accessed by the memory instruction. |
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    IsLastUse |= MMO->getFlags() & MOLastUse;
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(
            MI, "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }
850 | |
851 | SIAtomicScope Scope = SIAtomicScope::NONE; |
852 | SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; |
853 | bool IsCrossAddressSpaceOrdering = false; |
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
869 | return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, |
870 | IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, |
871 | IsNonTemporal, IsLastUse); |
872 | } |
873 | |
874 | std::optional<SIMemOpInfo> |
875 | SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const { |
876 | assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); |
877 | |
878 | if (!(MI->mayLoad() && !MI->mayStore())) |
879 | return std::nullopt; |
880 | |
881 | // Be conservative if there are no memory operands. |
882 | if (MI->getNumMemOperands() == 0) |
883 | return SIMemOpInfo(); |
884 | |
885 | return constructFromMIWithMMO(MI); |
886 | } |
887 | |
888 | std::optional<SIMemOpInfo> |
889 | SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const { |
890 | assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); |
891 | |
892 | if (!(!MI->mayLoad() && MI->mayStore())) |
893 | return std::nullopt; |
894 | |
895 | // Be conservative if there are no memory operands. |
896 | if (MI->getNumMemOperands() == 0) |
897 | return SIMemOpInfo(); |
898 | |
899 | return constructFromMIWithMMO(MI); |
900 | } |
901 | |
902 | std::optional<SIMemOpInfo> |
903 | SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { |
904 | assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); |
905 | |
906 | if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) |
907 | return std::nullopt; |
908 | |
  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
911 | |
  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
914 | if (!ScopeOrNone) { |
915 | reportUnsupported(MI, Msg: "Unsupported atomic synchronization scope" ); |
916 | return std::nullopt; |
917 | } |
918 | |
919 | SIAtomicScope Scope = SIAtomicScope::NONE; |
920 | SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; |
921 | bool IsCrossAddressSpaceOrdering = false; |
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      *ScopeOrNone;
924 | |
925 | if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || |
926 | ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { |
927 | reportUnsupported(MI, Msg: "Unsupported atomic address space" ); |
928 | return std::nullopt; |
929 | } |
930 | |
931 | return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, |
932 | IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); |
933 | } |
934 | |
935 | std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( |
936 | const MachineBasicBlock::iterator &MI) const { |
937 | assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); |
938 | |
939 | if (!(MI->mayLoad() && MI->mayStore())) |
940 | return std::nullopt; |
941 | |
942 | // Be conservative if there are no memory operands. |
943 | if (MI->getNumMemOperands() == 0) |
944 | return SIMemOpInfo(); |
945 | |
946 | return constructFromMIWithMMO(MI); |
947 | } |
948 | |
949 | SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { |
950 | TII = ST.getInstrInfo(); |
  IV = getIsaVersion(ST.getCPU());
952 | InsertCacheInv = !AmdgcnSkipCacheInvalidations; |
953 | } |
954 | |
955 | bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, |
956 | AMDGPU::CPol::CPol Bit) const { |
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
958 | if (!CPol) |
959 | return false; |
960 | |
961 | CPol->setImm(CPol->getImm() | Bit); |
962 | return true; |
963 | } |
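// Illustrative call from a subclass: enableNamedBit(MI, AMDGPU::CPol::GLC)
// ORs the GLC bit into MI's cache-policy (cpol) operand when one exists (see
// SIGfx6CacheControl::enableGLCBit below); for instructions without a cpol
// operand it returns false and leaves MI unchanged.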
964 | |
965 | /* static */ |
966 | std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { |
967 | GCNSubtarget::Generation Generation = ST.getGeneration(); |
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX11)
    return std::make_unique<SIGfx10CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX12)
    return std::make_unique<SIGfx11CacheControl>(ST);
  return std::make_unique<SIGfx12CacheControl>(ST);
981 | } |
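// For example (illustrative): gfx90a selects SIGfx90ACacheControl through
// hasGFX90AInsts(), a GFX10 target such as gfx1030 falls through to
// SIGfx10CacheControl, and a GFX12 target reaches the final
// SIGfx12CacheControl case.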
982 | |
983 | bool SIGfx6CacheControl::enableLoadCacheBypass( |
984 | const MachineBasicBlock::iterator &MI, |
985 | SIAtomicScope Scope, |
986 | SIAtomicAddrSpace AddrSpace) const { |
987 | assert(MI->mayLoad() && !MI->mayStore()); |
988 | bool Changed = false; |
989 | |
990 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
991 | switch (Scope) { |
992 | case SIAtomicScope::SYSTEM: |
993 | case SIAtomicScope::AGENT: |
994 | // Set L1 cache policy to MISS_EVICT. |
995 | // Note: there is no L2 cache bypass policy at the ISA level. |
996 | Changed |= enableGLCBit(MI); |
997 | break; |
998 | case SIAtomicScope::WORKGROUP: |
999 | case SIAtomicScope::WAVEFRONT: |
1000 | case SIAtomicScope::SINGLETHREAD: |
1001 | // No cache to bypass. |
1002 | break; |
1003 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1005 | } |
1006 | } |
1007 | |
1008 | /// The scratch address space does not need the global memory caches |
1009 | /// to be bypassed as all memory operations by the same thread are |
1010 | /// sequentially consistent, and no other thread can access scratch |
1011 | /// memory. |
1012 | |
1013 | /// Other address spaces do not have a cache. |
1014 | |
1015 | return Changed; |
1016 | } |
1017 | |
1018 | bool SIGfx6CacheControl::enableStoreCacheBypass( |
1019 | const MachineBasicBlock::iterator &MI, |
1020 | SIAtomicScope Scope, |
1021 | SIAtomicAddrSpace AddrSpace) const { |
1022 | assert(!MI->mayLoad() && MI->mayStore()); |
1023 | bool Changed = false; |
1024 | |
1025 | /// The L1 cache is write through so does not need to be bypassed. There is no |
1026 | /// bypass control for the L2 cache at the isa level. |
1027 | |
1028 | return Changed; |
1029 | } |
1030 | |
1031 | bool SIGfx6CacheControl::enableRMWCacheBypass( |
1032 | const MachineBasicBlock::iterator &MI, |
1033 | SIAtomicScope Scope, |
1034 | SIAtomicAddrSpace AddrSpace) const { |
1035 | assert(MI->mayLoad() && MI->mayStore()); |
1036 | bool Changed = false; |
1037 | |
1038 | /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically |
1039 | /// bypassed, and the GLC bit is instead used to indicate if they are |
1040 | /// return or no-return. |
1041 | /// Note: there is no L2 cache coherent bypass control at the ISA level. |
1042 | |
1043 | return Changed; |
1044 | } |
1045 | |
1046 | bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( |
1047 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1048 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
  // Only handle load and store, not atomic read-modify-write instructions. The
1050 | // latter use glc to indicate if the atomic returns a result and so must not |
1051 | // be used for cache control. |
1052 | assert(MI->mayLoad() ^ MI->mayStore()); |
1053 | |
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling that
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
1058 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
1059 | |
1060 | bool Changed = false; |
1061 | |
1062 | if (IsVolatile) { |
1063 | // Set L1 cache policy to be MISS_EVICT for load instructions |
1064 | // and MISS_LRU for store instructions. |
1065 | // Note: there is no L2 cache bypass policy at the ISA level. |
1066 | if (Op == SIMemOp::LOAD) |
1067 | Changed |= enableGLCBit(MI); |
1068 | |
1069 | // Ensure operation has completed at system scope to cause all volatile |
1070 | // operations to be visible outside the program in a global order. Do not |
1071 | // request cross address space as only the global address space can be |
1072 | // observable outside the program, so no need to cause a waitcnt for LDS |
1073 | // address space operations. |
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
1076 | |
1077 | return Changed; |
1078 | } |
1079 | |
1080 | if (IsNonTemporal) { |
1081 | // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT |
1082 | // for both loads and stores, and the L2 cache policy to STREAM. |
1083 | Changed |= enableGLCBit(MI); |
1084 | Changed |= enableSLCBit(MI); |
1085 | return Changed; |
1086 | } |
1087 | |
1088 | return Changed; |
1089 | } |
1090 | |
1091 | bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, |
1092 | SIAtomicScope Scope, |
1093 | SIAtomicAddrSpace AddrSpace, |
1094 | SIMemOp Op, |
1095 | bool IsCrossAddrSpaceOrdering, |
1096 | Position Pos) const { |
1097 | bool Changed = false; |
1098 | |
1099 | MachineBasicBlock &MBB = *MI->getParent(); |
1100 | DebugLoc DL = MI->getDebugLoc(); |
1101 | |
1102 | if (Pos == Position::AFTER) |
1103 | ++MI; |
1104 | |
1105 | bool VMCnt = false; |
1106 | bool LGKMCnt = false; |
1107 | |
1108 | if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != |
1109 | SIAtomicAddrSpace::NONE) { |
1110 | switch (Scope) { |
1111 | case SIAtomicScope::SYSTEM: |
1112 | case SIAtomicScope::AGENT: |
1113 | VMCnt |= true; |
1114 | break; |
1115 | case SIAtomicScope::WORKGROUP: |
1116 | case SIAtomicScope::WAVEFRONT: |
1117 | case SIAtomicScope::SINGLETHREAD: |
1118 | // The L1 cache keeps all memory operations in order for |
1119 | // wavefronts in the same work-group. |
1120 | break; |
1121 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1123 | } |
1124 | } |
1125 | |
1126 | if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { |
1127 | switch (Scope) { |
1128 | case SIAtomicScope::SYSTEM: |
1129 | case SIAtomicScope::AGENT: |
1130 | case SIAtomicScope::WORKGROUP: |
1131 | // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is |
1132 | // not needed as LDS operations for all waves are executed in a total |
1133 | // global ordering as observed by all waves. Required if also |
1134 | // synchronizing with global/GDS memory as LDS operations could be |
1135 | // reordered with respect to later global/GDS memory operations of the |
1136 | // same wave. |
1137 | LGKMCnt |= IsCrossAddrSpaceOrdering; |
1138 | break; |
1139 | case SIAtomicScope::WAVEFRONT: |
1140 | case SIAtomicScope::SINGLETHREAD: |
1141 | // The LDS keeps all memory operations in order for |
1142 | // the same wavefront. |
1143 | break; |
1144 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1146 | } |
1147 | } |
1148 | |
1149 | if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { |
1150 | switch (Scope) { |
1151 | case SIAtomicScope::SYSTEM: |
1152 | case SIAtomicScope::AGENT: |
1153 | // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" |
1154 | // is not needed as GDS operations for all waves are executed in a total |
1155 | // global ordering as observed by all waves. Required if also |
1156 | // synchronizing with global/LDS memory as GDS operations could be |
1157 | // reordered with respect to later global/LDS memory operations of the |
1158 | // same wave. |
1159 | LGKMCnt |= IsCrossAddrSpaceOrdering; |
1160 | break; |
1161 | case SIAtomicScope::WORKGROUP: |
1162 | case SIAtomicScope::WAVEFRONT: |
1163 | case SIAtomicScope::SINGLETHREAD: |
1164 | // The GDS keeps all memory operations in order for |
1165 | // the same work-group. |
1166 | break; |
1167 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1169 | } |
1170 | } |
1171 | |
  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }
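  // For example (illustrative): an agent-scope ordering over the global and
  // LDS address spaces with cross-address-space ordering sets both VMCnt and
  // LGKMCnt, so the code above emits "s_waitcnt vmcnt(0) lgkmcnt(0)" (as a
  // soft waitcnt that SIInsertWaitcnts may later merge or relax).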
1182 | |
1183 | if (Pos == Position::AFTER) |
1184 | --MI; |
1185 | |
1186 | return Changed; |
1187 | } |
1188 | |
1189 | bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
1190 | SIAtomicScope Scope, |
1191 | SIAtomicAddrSpace AddrSpace, |
1192 | Position Pos) const { |
1193 | if (!InsertCacheInv) |
1194 | return false; |
1195 | |
1196 | bool Changed = false; |
1197 | |
1198 | MachineBasicBlock &MBB = *MI->getParent(); |
1199 | DebugLoc DL = MI->getDebugLoc(); |
1200 | |
1201 | if (Pos == Position::AFTER) |
1202 | ++MI; |
1203 | |
1204 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1205 | switch (Scope) { |
1206 | case SIAtomicScope::SYSTEM: |
1207 | case SIAtomicScope::AGENT: |
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1209 | Changed = true; |
1210 | break; |
1211 | case SIAtomicScope::WORKGROUP: |
1212 | case SIAtomicScope::WAVEFRONT: |
1213 | case SIAtomicScope::SINGLETHREAD: |
1214 | // No cache to invalidate. |
1215 | break; |
1216 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1218 | } |
1219 | } |
1220 | |
1221 | /// The scratch address space does not need the global memory cache |
1222 | /// to be flushed as all memory operations by the same thread are |
1223 | /// sequentially consistent, and no other thread can access scratch |
1224 | /// memory. |
1225 | |
1226 | /// Other address spaces do not have a cache. |
1227 | |
1228 | if (Pos == Position::AFTER) |
1229 | --MI; |
1230 | |
1231 | return Changed; |
1232 | } |
1233 | |
1234 | bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, |
1235 | SIAtomicScope Scope, |
1236 | SIAtomicAddrSpace AddrSpace, |
1237 | bool IsCrossAddrSpaceOrdering, |
1238 | Position Pos) const { |
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
1241 | } |
1242 | |
1243 | bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
1244 | SIAtomicScope Scope, |
1245 | SIAtomicAddrSpace AddrSpace, |
1246 | Position Pos) const { |
1247 | if (!InsertCacheInv) |
1248 | return false; |
1249 | |
1250 | bool Changed = false; |
1251 | |
1252 | MachineBasicBlock &MBB = *MI->getParent(); |
1253 | DebugLoc DL = MI->getDebugLoc(); |
1254 | |
1255 | const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); |
1256 | |
1257 | const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() |
1258 | ? AMDGPU::BUFFER_WBINVL1 |
1259 | : AMDGPU::BUFFER_WBINVL1_VOL; |
1260 | |
1261 | if (Pos == Position::AFTER) |
1262 | ++MI; |
1263 | |
1264 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1265 | switch (Scope) { |
1266 | case SIAtomicScope::SYSTEM: |
1267 | case SIAtomicScope::AGENT: |
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1269 | Changed = true; |
1270 | break; |
1271 | case SIAtomicScope::WORKGROUP: |
1272 | case SIAtomicScope::WAVEFRONT: |
1273 | case SIAtomicScope::SINGLETHREAD: |
1274 | // No cache to invalidate. |
1275 | break; |
1276 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1278 | } |
1279 | } |
1280 | |
1281 | /// The scratch address space does not need the global memory cache |
1282 | /// to be flushed as all memory operations by the same thread are |
1283 | /// sequentially consistent, and no other thread can access scratch |
1284 | /// memory. |
1285 | |
1286 | /// Other address spaces do not have a cache. |
1287 | |
1288 | if (Pos == Position::AFTER) |
1289 | --MI; |
1290 | |
1291 | return Changed; |
1292 | } |
1293 | |
1294 | bool SIGfx90ACacheControl::enableLoadCacheBypass( |
1295 | const MachineBasicBlock::iterator &MI, |
1296 | SIAtomicScope Scope, |
1297 | SIAtomicAddrSpace AddrSpace) const { |
1298 | assert(MI->mayLoad() && !MI->mayStore()); |
1299 | bool Changed = false; |
1300 | |
1301 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1302 | switch (Scope) { |
1303 | case SIAtomicScope::SYSTEM: |
1304 | case SIAtomicScope::AGENT: |
1305 | // Set the L1 cache policy to MISS_LRU. |
1306 | // Note: there is no L2 cache bypass policy at the ISA level. |
1307 | Changed |= enableGLCBit(MI); |
1308 | break; |
1309 | case SIAtomicScope::WORKGROUP: |
1310 | // In threadgroup split mode the waves of a work-group can be executing on |
1311 | // different CUs. Therefore need to bypass the L1 which is per CU. |
1312 | // Otherwise in non-threadgroup split mode all waves of a work-group are |
1313 | // on the same CU, and so the L1 does not need to be bypassed. |
1314 | if (ST.isTgSplitEnabled()) |
1315 | Changed |= enableGLCBit(MI); |
1316 | break; |
1317 | case SIAtomicScope::WAVEFRONT: |
1318 | case SIAtomicScope::SINGLETHREAD: |
1319 | // No cache to bypass. |
1320 | break; |
1321 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1323 | } |
1324 | } |
1325 | |
1326 | /// The scratch address space does not need the global memory caches |
1327 | /// to be bypassed as all memory operations by the same thread are |
1328 | /// sequentially consistent, and no other thread can access scratch |
1329 | /// memory. |
1330 | |
1331 | /// Other address spaces do not have a cache. |
1332 | |
1333 | return Changed; |
1334 | } |
1335 | |
1336 | bool SIGfx90ACacheControl::enableStoreCacheBypass( |
1337 | const MachineBasicBlock::iterator &MI, |
1338 | SIAtomicScope Scope, |
1339 | SIAtomicAddrSpace AddrSpace) const { |
1340 | assert(!MI->mayLoad() && MI->mayStore()); |
1341 | bool Changed = false; |
1342 | |
1343 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1344 | switch (Scope) { |
1345 | case SIAtomicScope::SYSTEM: |
1346 | case SIAtomicScope::AGENT: |
1347 | /// Do not set glc for store atomic operations as they implicitly write |
1348 | /// through the L1 cache. |
1349 | break; |
1350 | case SIAtomicScope::WORKGROUP: |
1351 | case SIAtomicScope::WAVEFRONT: |
1352 | case SIAtomicScope::SINGLETHREAD: |
1353 | // No cache to bypass. Store atomics implicitly write through the L1 |
1354 | // cache. |
1355 | break; |
1356 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1358 | } |
1359 | } |
1360 | |
1361 | /// The scratch address space does not need the global memory caches |
1362 | /// to be bypassed as all memory operations by the same thread are |
1363 | /// sequentially consistent, and no other thread can access scratch |
1364 | /// memory. |
1365 | |
1366 | /// Other address spaces do not have a cache. |
1367 | |
1368 | return Changed; |
1369 | } |
1370 | |
1371 | bool SIGfx90ACacheControl::enableRMWCacheBypass( |
1372 | const MachineBasicBlock::iterator &MI, |
1373 | SIAtomicScope Scope, |
1374 | SIAtomicAddrSpace AddrSpace) const { |
1375 | assert(MI->mayLoad() && MI->mayStore()); |
1376 | bool Changed = false; |
1377 | |
1378 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1379 | switch (Scope) { |
1380 | case SIAtomicScope::SYSTEM: |
1381 | case SIAtomicScope::AGENT: |
1382 | /// Do not set glc for RMW atomic operations as they implicitly bypass |
1383 | /// the L1 cache, and the glc bit is instead used to indicate if they are |
1384 | /// return or no-return. |
1385 | break; |
1386 | case SIAtomicScope::WORKGROUP: |
1387 | case SIAtomicScope::WAVEFRONT: |
1388 | case SIAtomicScope::SINGLETHREAD: |
1389 | // No cache to bypass. RMW atomics implicitly bypass the L1 cache. |
1390 | break; |
1391 | default: |
      llvm_unreachable("Unsupported synchronization scope");
1393 | } |
1394 | } |
1395 | |
1396 | return Changed; |
1397 | } |
1398 | |
1399 | bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( |
1400 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1401 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
  // Only handle load and store, not atomic read-modify-write instructions. The
1403 | // latter use glc to indicate if the atomic returns a result and so must not |
1404 | // be used for cache control. |
1405 | assert(MI->mayLoad() ^ MI->mayStore()); |
1406 | |
1407 | // Only update load and store, not LLVM IR atomic read-modify-write |
1408 | // instructions. The latter are always marked as volatile so cannot sensibly |
1409 | // handle it as do not want to pessimize all atomics. Also they do not support |
1410 | // the nontemporal attribute. |
1411 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
1412 | |
1413 | bool Changed = false; |
1414 | |
1415 | if (IsVolatile) { |
1416 | // Set L1 cache policy to be MISS_EVICT for load instructions |
1417 | // and MISS_LRU for store instructions. |
1418 | // Note: there is no L2 cache bypass policy at the ISA level. |
1419 | if (Op == SIMemOp::LOAD) |
1420 | Changed |= enableGLCBit(MI); |
1421 | |
1422 | // Ensure operation has completed at system scope to cause all volatile |
1423 | // operations to be visible outside the program in a global order. Do not |
1424 | // request cross address space as only the global address space can be |
1425 | // observable outside the program, so no need to cause a waitcnt for LDS |
1426 | // address space operations. |
1427 | Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false, |
1428 | Pos: Position::AFTER); |
1429 | |
1430 | return Changed; |
1431 | } |
1432 | |
1433 | if (IsNonTemporal) { |
1434 | // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT |
1435 | // for both loads and stores, and the L2 cache policy to STREAM. |
1436 | Changed |= enableGLCBit(MI); |
1437 | Changed |= enableSLCBit(MI); |
1438 | return Changed; |
1439 | } |
1440 | |
1441 | return Changed; |
1442 | } |
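// Illustrative sketch of the effect of the above on GFX90A (not verified
// against actual output; register names are made up): a volatile global load
// comes out roughly as
//   global_load_dword v0, v[2:3], off glc
//   s_waitcnt vmcnt(0)
// while a nontemporal load or store instead gets both glc and slc set and has
// no trailing wait.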
1443 | |
1444 | bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, |
1445 | SIAtomicScope Scope, |
1446 | SIAtomicAddrSpace AddrSpace, |
1447 | SIMemOp Op, |
1448 | bool IsCrossAddrSpaceOrdering, |
1449 | Position Pos) const { |
1450 | if (ST.isTgSplitEnabled()) { |
1451 | // In threadgroup split mode the waves of a work-group can be executing on |
1452 | // different CUs. Therefore need to wait for global or GDS memory operations |
1453 | // to complete to ensure they are visible to waves in the other CUs. |
// Otherwise in non-threadgroup split mode all waves of a work-group are on
// the same CU, so there is no need to wait for global memory as all waves in
// the work-group access the same L1, nor to wait for GDS as accesses are
// ordered on a CU.
1458 | if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | |
1459 | SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && |
1460 | (Scope == SIAtomicScope::WORKGROUP)) { |
1461 | // Same as GFX7 using agent scope. |
1462 | Scope = SIAtomicScope::AGENT; |
1463 | } |
1464 | // In threadgroup split mode LDS cannot be allocated so no need to wait for |
1465 | // LDS memory operations. |
1466 | AddrSpace &= ~SIAtomicAddrSpace::LDS; |
1467 | } |
1468 | return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, |
1469 | IsCrossAddrSpaceOrdering, Pos); |
1470 | } |
1471 | |
1472 | bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
1473 | SIAtomicScope Scope, |
1474 | SIAtomicAddrSpace AddrSpace, |
1475 | Position Pos) const { |
1476 | if (!InsertCacheInv) |
1477 | return false; |
1478 | |
1479 | bool Changed = false; |
1480 | |
1481 | MachineBasicBlock &MBB = *MI->getParent(); |
1482 | DebugLoc DL = MI->getDebugLoc(); |
1483 | |
1484 | if (Pos == Position::AFTER) |
1485 | ++MI; |
1486 | |
1487 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1488 | switch (Scope) { |
1489 | case SIAtomicScope::SYSTEM: |
1490 | // Ensures that following loads will not see stale remote VMEM data or |
1491 | // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and |
1492 | // CC will never be stale due to the local memory probes. |
1493 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INVL2)); |
1494 | // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the |
1495 | // hardware does not reorder memory operations by the same wave with |
1496 | // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to |
1497 | // remove any cache lines of earlier writes by the same wave and ensures |
1498 | // later reads by the same wave will refetch the cache lines. |
1499 | Changed = true; |
1500 | break; |
1501 | case SIAtomicScope::AGENT: |
1502 | // Same as GFX7. |
1503 | break; |
1504 | case SIAtomicScope::WORKGROUP: |
1505 | // In threadgroup split mode the waves of a work-group can be executing on |
1506 | // different CUs. Therefore need to invalidate the L1 which is per CU. |
1507 | // Otherwise in non-threadgroup split mode all waves of a work-group are |
1508 | // on the same CU, and so the L1 does not need to be invalidated. |
1509 | if (ST.isTgSplitEnabled()) { |
1510 | // Same as GFX7 using agent scope. |
1511 | Scope = SIAtomicScope::AGENT; |
1512 | } |
1513 | break; |
1514 | case SIAtomicScope::WAVEFRONT: |
1515 | case SIAtomicScope::SINGLETHREAD: |
1516 | // Same as GFX7. |
1517 | break; |
1518 | default: |
1519 | llvm_unreachable("Unsupported synchronization scope" ); |
1520 | } |
1521 | } |
1522 | |
1523 | /// The scratch address space does not need the global memory cache |
1524 | /// to be flushed as all memory operations by the same thread are |
1525 | /// sequentially consistent, and no other thread can access scratch |
1526 | /// memory. |
1527 | |
1528 | /// Other address spaces do not have a cache. |
1529 | |
1530 | if (Pos == Position::AFTER) |
1531 | --MI; |
1532 | |
1533 | Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); |
1534 | |
1535 | return Changed; |
1536 | } |
1537 | |
1538 | bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, |
1539 | SIAtomicScope Scope, |
1540 | SIAtomicAddrSpace AddrSpace, |
1541 | bool IsCrossAddrSpaceOrdering, |
1542 | Position Pos) const { |
1543 | bool Changed = false; |
1544 | |
1545 | MachineBasicBlock &MBB = *MI->getParent(); |
1546 | const DebugLoc &DL = MI->getDebugLoc(); |
1547 | |
1548 | if (Pos == Position::AFTER) |
1549 | ++MI; |
1550 | |
1551 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1552 | switch (Scope) { |
1553 | case SIAtomicScope::SYSTEM: |
1554 | // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the |
1555 | // hardware does not reorder memory operations by the same wave with |
1556 | // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed |
1557 | // to initiate writeback of any dirty cache lines of earlier writes by the |
1558 | // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the |
1559 | // writeback has completed. |
1560 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2)) |
1561 | // Set SC bits to indicate system scope. |
1562 | .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); |
1563 | // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT |
1564 | // vmcnt(0)" needed by the "BUFFER_WBL2". |
1565 | Changed = true; |
1566 | break; |
1567 | case SIAtomicScope::AGENT: |
1568 | case SIAtomicScope::WORKGROUP: |
1569 | case SIAtomicScope::WAVEFRONT: |
1570 | case SIAtomicScope::SINGLETHREAD: |
1571 | // Same as GFX7. |
1572 | break; |
1573 | default: |
1574 | llvm_unreachable("Unsupported synchronization scope" ); |
1575 | } |
1576 | } |
1577 | |
1578 | if (Pos == Position::AFTER) |
1579 | --MI; |
1580 | |
1581 | Changed |= |
1582 | SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, |
1583 | IsCrossAddrSpaceOrdering, Pos); |
1584 | |
1585 | return Changed; |
1586 | } |
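// Illustrative sketch of what a GFX90A system-scope release expands to
// (assuming the GFX7 path supplies the trailing wait; not verified output):
//   BUFFER_WBL2 with SC0|SC1 set  - initiate writeback of dirty L2 lines
//   S_WAITCNT vmcnt(0)            - ensure the writeback has completed
// followed by the releasing operation itself. Narrower scopes skip the
// writeback and fall through to the plain GFX7 sequence.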
1587 | |
1588 | bool SIGfx940CacheControl::enableLoadCacheBypass( |
1589 | const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
1590 | SIAtomicAddrSpace AddrSpace) const { |
1591 | assert(MI->mayLoad() && !MI->mayStore()); |
1592 | bool Changed = false; |
1593 | |
1594 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1595 | switch (Scope) { |
1596 | case SIAtomicScope::SYSTEM: |
1597 | // Set SC bits to indicate system scope. |
1598 | Changed |= enableSC0Bit(MI); |
1599 | Changed |= enableSC1Bit(MI); |
1600 | break; |
1601 | case SIAtomicScope::AGENT: |
1602 | // Set SC bits to indicate agent scope. |
1603 | Changed |= enableSC1Bit(MI); |
1604 | break; |
1605 | case SIAtomicScope::WORKGROUP: |
1606 | // In threadgroup split mode the waves of a work-group can be executing on |
1607 | // different CUs. Therefore need to bypass the L1 which is per CU. |
1608 | // Otherwise in non-threadgroup split mode all waves of a work-group are |
1609 | // on the same CU, and so the L1 does not need to be bypassed. Setting SC |
1610 | // bits to indicate work-group scope will do this automatically. |
1611 | Changed |= enableSC0Bit(MI); |
1612 | break; |
1613 | case SIAtomicScope::WAVEFRONT: |
1614 | case SIAtomicScope::SINGLETHREAD: |
1615 | // Leave SC bits unset to indicate wavefront scope. |
1616 | break; |
1617 | default: |
1618 | llvm_unreachable("Unsupported synchronization scope" ); |
1619 | } |
1620 | } |
1621 | |
1622 | /// The scratch address space does not need the global memory caches |
1623 | /// to be bypassed as all memory operations by the same thread are |
1624 | /// sequentially consistent, and no other thread can access scratch |
1625 | /// memory. |
1626 | |
1627 | /// Other address spaces do not have a cache. |
1628 | |
1629 | return Changed; |
1630 | } |
1631 | |
1632 | bool SIGfx940CacheControl::enableStoreCacheBypass( |
1633 | const MachineBasicBlock::iterator &MI, |
1634 | SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { |
1635 | assert(!MI->mayLoad() && MI->mayStore()); |
1636 | bool Changed = false; |
1637 | |
1638 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1639 | switch (Scope) { |
1640 | case SIAtomicScope::SYSTEM: |
1641 | // Set SC bits to indicate system scope. |
1642 | Changed |= enableSC0Bit(MI); |
1643 | Changed |= enableSC1Bit(MI); |
1644 | break; |
1645 | case SIAtomicScope::AGENT: |
1646 | // Set SC bits to indicate agent scope. |
1647 | Changed |= enableSC1Bit(MI); |
1648 | break; |
1649 | case SIAtomicScope::WORKGROUP: |
1650 | // Set SC bits to indicate workgroup scope. |
1651 | Changed |= enableSC0Bit(MI); |
1652 | break; |
1653 | case SIAtomicScope::WAVEFRONT: |
1654 | case SIAtomicScope::SINGLETHREAD: |
1655 | // Leave SC bits unset to indicate wavefront scope. |
1656 | break; |
1657 | default: |
1658 | llvm_unreachable("Unsupported synchronization scope" ); |
1659 | } |
1660 | } |
1661 | |
1662 | /// The scratch address space does not need the global memory caches |
1663 | /// to be bypassed as all memory operations by the same thread are |
1664 | /// sequentially consistent, and no other thread can access scratch |
1665 | /// memory. |
1666 | |
1667 | /// Other address spaces do not have a cache. |
1668 | |
1669 | return Changed; |
1670 | } |
1671 | |
1672 | bool SIGfx940CacheControl::enableRMWCacheBypass( |
1673 | const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
1674 | SIAtomicAddrSpace AddrSpace) const { |
1675 | assert(MI->mayLoad() && MI->mayStore()); |
1676 | bool Changed = false; |
1677 | |
1678 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1679 | switch (Scope) { |
1680 | case SIAtomicScope::SYSTEM: |
1681 | // Set SC1 bit to indicate system scope. |
1682 | Changed |= enableSC1Bit(MI); |
1683 | break; |
1684 | case SIAtomicScope::AGENT: |
1685 | case SIAtomicScope::WORKGROUP: |
1686 | case SIAtomicScope::WAVEFRONT: |
1687 | case SIAtomicScope::SINGLETHREAD: |
1688 | // RMW atomic operations implicitly bypass the L1 cache and only use SC1 |
1689 | // to indicate system or agent scope. The SC0 bit is used to indicate if |
1690 | // they are return or no-return. Leave SC1 bit unset to indicate agent |
1691 | // scope. |
1692 | break; |
1693 | default: |
1694 | llvm_unreachable("Unsupported synchronization scope" ); |
1695 | } |
1696 | } |
1697 | |
1698 | return Changed; |
1699 | } |
1700 | |
1701 | bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( |
1702 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1703 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result, so glc must not
// be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());

// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile, so they cannot
// sensibly be handled here without pessimizing all atomics. They also do not
// support the nontemporal attribute.
1713 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
1714 | |
1715 | bool Changed = false; |
1716 | |
1717 | if (IsVolatile) { |
1718 | // Set SC bits to indicate system scope. |
1719 | Changed |= enableSC0Bit(MI); |
1720 | Changed |= enableSC1Bit(MI); |
1721 | |
1722 | // Ensure operation has completed at system scope to cause all volatile |
1723 | // operations to be visible outside the program in a global order. Do not |
1724 | // request cross address space as only the global address space can be |
1725 | // observable outside the program, so no need to cause a waitcnt for LDS |
1726 | // address space operations. |
1727 | Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false, |
1728 | Pos: Position::AFTER); |
1729 | |
1730 | return Changed; |
1731 | } |
1732 | |
1733 | if (IsNonTemporal) { |
1734 | Changed |= enableNTBit(MI); |
1735 | return Changed; |
1736 | } |
1737 | |
1738 | return Changed; |
1739 | } |
1740 | |
1741 | bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
1742 | SIAtomicScope Scope, |
1743 | SIAtomicAddrSpace AddrSpace, |
1744 | Position Pos) const { |
1745 | if (!InsertCacheInv) |
1746 | return false; |
1747 | |
1748 | bool Changed = false; |
1749 | |
1750 | MachineBasicBlock &MBB = *MI->getParent(); |
1751 | DebugLoc DL = MI->getDebugLoc(); |
1752 | |
1753 | if (Pos == Position::AFTER) |
1754 | ++MI; |
1755 | |
1756 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1757 | switch (Scope) { |
1758 | case SIAtomicScope::SYSTEM: |
1759 | // Ensures that following loads will not see stale remote VMEM data or |
1760 | // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and |
1761 | // CC will never be stale due to the local memory probes. |
1762 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV)) |
1763 | // Set SC bits to indicate system scope. |
1764 | .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); |
1765 | // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the |
1766 | // hardware does not reorder memory operations by the same wave with |
1767 | // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to |
1768 | // remove any cache lines of earlier writes by the same wave and ensures |
1769 | // later reads by the same wave will refetch the cache lines. |
1770 | Changed = true; |
1771 | break; |
1772 | case SIAtomicScope::AGENT: |
// Ensures that following loads will not see stale remote data or local
// MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
// due to the memory probes.
1776 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV)) |
1777 | // Set SC bits to indicate agent scope. |
1778 | .addImm(Val: AMDGPU::CPol::SC1); |
// Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
// does not reorder memory operations with respect to a preceding buffer
// invalidate. The invalidate is guaranteed to remove any cache lines of
// earlier writes and ensures later reads will refetch the cache lines.
1783 | Changed = true; |
1784 | break; |
1785 | case SIAtomicScope::WORKGROUP: |
1786 | // In threadgroup split mode the waves of a work-group can be executing on |
1787 | // different CUs. Therefore need to invalidate the L1 which is per CU. |
1788 | // Otherwise in non-threadgroup split mode all waves of a work-group are |
1789 | // on the same CU, and so the L1 does not need to be invalidated. |
1790 | if (ST.isTgSplitEnabled()) { |
// Ensures L1 is invalidated if in threadgroup split mode. In
// non-threadgroup split mode it is a NOP, but there is no point generating
// it when we already know we are not in that mode.
1794 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_INV)) |
1795 | // Set SC bits to indicate work-group scope. |
1796 | .addImm(Val: AMDGPU::CPol::SC0); |
// Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
// does not reorder memory operations with respect to a preceding buffer
// invalidate. The invalidate is guaranteed to remove any cache lines of
// earlier writes and ensures later reads will refetch the cache lines.
1801 | Changed = true; |
1802 | } |
1803 | break; |
1804 | case SIAtomicScope::WAVEFRONT: |
1805 | case SIAtomicScope::SINGLETHREAD: |
1806 | // Could generate "BUFFER_INV" but it would do nothing as there are no |
1807 | // caches to invalidate. |
1808 | break; |
1809 | default: |
1810 | llvm_unreachable("Unsupported synchronization scope" ); |
1811 | } |
1812 | } |
1813 | |
1814 | /// The scratch address space does not need the global memory cache |
1815 | /// to be flushed as all memory operations by the same thread are |
1816 | /// sequentially consistent, and no other thread can access scratch |
1817 | /// memory. |
1818 | |
1819 | /// Other address spaces do not have a cache. |
1820 | |
1821 | if (Pos == Position::AFTER) |
1822 | --MI; |
1823 | |
1824 | return Changed; |
1825 | } |
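// Illustrative summary of the cases above - the BUFFER_INV emitted for a
// GFX940 acquire, by scope (a sketch, not verified output):
//   system             -> BUFFER_INV with SC0|SC1
//   agent              -> BUFFER_INV with SC1
//   workgroup          -> BUFFER_INV with SC0, only in threadgroup split mode
//   wavefront/thread   -> nothing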
1826 | |
1827 | bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, |
1828 | SIAtomicScope Scope, |
1829 | SIAtomicAddrSpace AddrSpace, |
1830 | bool IsCrossAddrSpaceOrdering, |
1831 | Position Pos) const { |
1832 | bool Changed = false; |
1833 | |
1834 | MachineBasicBlock &MBB = *MI->getParent(); |
1835 | DebugLoc DL = MI->getDebugLoc(); |
1836 | |
1837 | if (Pos == Position::AFTER) |
1838 | ++MI; |
1839 | |
1840 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1841 | switch (Scope) { |
1842 | case SIAtomicScope::SYSTEM: |
1843 | // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the |
1844 | // hardware does not reorder memory operations by the same wave with |
1845 | // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed |
1846 | // to initiate writeback of any dirty cache lines of earlier writes by the |
1847 | // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the |
1848 | // writeback has completed. |
1849 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2)) |
1850 | // Set SC bits to indicate system scope. |
1851 | .addImm(Val: AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); |
1852 | // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is |
1853 | // SIAtomicScope::SYSTEM, the following insertWait will generate the |
1854 | // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". |
1855 | Changed = true; |
1856 | break; |
1857 | case SIAtomicScope::AGENT: |
1858 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_WBL2)) |
1859 | // Set SC bits to indicate agent scope. |
1860 | .addImm(Val: AMDGPU::CPol::SC1); |
1861 | |
1862 | // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is |
1863 | // SIAtomicScope::AGENT, the following insertWait will generate the |
1864 | // required "S_WAITCNT vmcnt(0)". |
1865 | Changed = true; |
1866 | break; |
1867 | case SIAtomicScope::WORKGROUP: |
1868 | case SIAtomicScope::WAVEFRONT: |
1869 | case SIAtomicScope::SINGLETHREAD: |
1870 | // Do not generate "BUFFER_WBL2" as there are no caches it would |
1871 | // writeback, and would require an otherwise unnecessary |
1872 | // "S_WAITCNT vmcnt(0)". |
1873 | break; |
1874 | default: |
1875 | llvm_unreachable("Unsupported synchronization scope" ); |
1876 | } |
1877 | } |
1878 | |
1879 | if (Pos == Position::AFTER) |
1880 | --MI; |
1881 | |
// Insert the S_WAITCNT required by any "BUFFER_WBL2" above, as well as any
// other S_WAITCNT that is needed.
1884 | Changed |= insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE, |
1885 | IsCrossAddrSpaceOrdering, Pos); |
1886 | |
1887 | return Changed; |
1888 | } |
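// Illustrative sketch of a GFX940 agent-scope release (not verified output):
//   BUFFER_WBL2 with SC1      - write dirty lines back to the agent level
//   S_WAITCNT vmcnt(0)        - produced by the insertWait call above
// System scope additionally sets SC0 on the BUFFER_WBL2; workgroup and
// narrower scopes skip the writeback and keep only the wait.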
1889 | |
1890 | bool SIGfx10CacheControl::enableLoadCacheBypass( |
1891 | const MachineBasicBlock::iterator &MI, |
1892 | SIAtomicScope Scope, |
1893 | SIAtomicAddrSpace AddrSpace) const { |
1894 | assert(MI->mayLoad() && !MI->mayStore()); |
1895 | bool Changed = false; |
1896 | |
1897 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1898 | switch (Scope) { |
1899 | case SIAtomicScope::SYSTEM: |
1900 | case SIAtomicScope::AGENT: |
1901 | // Set the L0 and L1 cache policies to MISS_EVICT. |
1902 | // Note: there is no L2 cache coherent bypass control at the ISA level. |
1903 | Changed |= enableGLCBit(MI); |
1904 | Changed |= enableDLCBit(MI); |
1905 | break; |
1906 | case SIAtomicScope::WORKGROUP: |
1907 | // In WGP mode the waves of a work-group can be executing on either CU of |
1908 | // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in |
1909 | // CU mode all waves of a work-group are on the same CU, and so the L0 |
1910 | // does not need to be bypassed. |
1911 | if (!ST.isCuModeEnabled()) |
1912 | Changed |= enableGLCBit(MI); |
1913 | break; |
1914 | case SIAtomicScope::WAVEFRONT: |
1915 | case SIAtomicScope::SINGLETHREAD: |
1916 | // No cache to bypass. |
1917 | break; |
1918 | default: |
1919 | llvm_unreachable("Unsupported synchronization scope" ); |
1920 | } |
1921 | } |
1922 | |
1923 | /// The scratch address space does not need the global memory caches |
1924 | /// to be bypassed as all memory operations by the same thread are |
1925 | /// sequentially consistent, and no other thread can access scratch |
1926 | /// memory. |
1927 | |
1928 | /// Other address spaces do not have a cache. |
1929 | |
1930 | return Changed; |
1931 | } |
1932 | |
1933 | bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( |
1934 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1935 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
1936 | |
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result, so glc must not
// be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());

// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile, so they cannot
// sensibly be handled here without pessimizing all atomics. They also do not
// support the nontemporal attribute.
1946 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
1947 | |
1948 | bool Changed = false; |
1949 | |
1950 | if (IsVolatile) { |
1951 | // Set L0 and L1 cache policy to be MISS_EVICT for load instructions |
1952 | // and MISS_LRU for store instructions. |
1953 | // Note: there is no L2 cache coherent bypass control at the ISA level. |
1954 | if (Op == SIMemOp::LOAD) { |
1955 | Changed |= enableGLCBit(MI); |
1956 | Changed |= enableDLCBit(MI); |
1957 | } |
1958 | |
1959 | // Ensure operation has completed at system scope to cause all volatile |
1960 | // operations to be visible outside the program in a global order. Do not |
1961 | // request cross address space as only the global address space can be |
1962 | // observable outside the program, so no need to cause a waitcnt for LDS |
1963 | // address space operations. |
1964 | Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false, |
1965 | Pos: Position::AFTER); |
1966 | return Changed; |
1967 | } |
1968 | |
1969 | if (IsNonTemporal) { |
1970 | // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT |
1971 | // and L2 cache policy to STREAM. |
1972 | // For stores setting both GLC and SLC configures L0 and L1 cache policy |
1973 | // to MISS_EVICT and the L2 cache policy to STREAM. |
1974 | if (Op == SIMemOp::STORE) |
1975 | Changed |= enableGLCBit(MI); |
1976 | Changed |= enableSLCBit(MI); |
1977 | |
1978 | return Changed; |
1979 | } |
1980 | |
1981 | return Changed; |
1982 | } |
1983 | |
1984 | bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, |
1985 | SIAtomicScope Scope, |
1986 | SIAtomicAddrSpace AddrSpace, |
1987 | SIMemOp Op, |
1988 | bool IsCrossAddrSpaceOrdering, |
1989 | Position Pos) const { |
1990 | bool Changed = false; |
1991 | |
1992 | MachineBasicBlock &MBB = *MI->getParent(); |
1993 | DebugLoc DL = MI->getDebugLoc(); |
1994 | |
1995 | if (Pos == Position::AFTER) |
1996 | ++MI; |
1997 | |
1998 | bool VMCnt = false; |
1999 | bool VSCnt = false; |
2000 | bool LGKMCnt = false; |
2001 | |
2002 | if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != |
2003 | SIAtomicAddrSpace::NONE) { |
2004 | switch (Scope) { |
2005 | case SIAtomicScope::SYSTEM: |
2006 | case SIAtomicScope::AGENT: |
2007 | if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) |
2008 | VMCnt |= true; |
2009 | if ((Op & SIMemOp::STORE) != SIMemOp::NONE) |
2010 | VSCnt |= true; |
2011 | break; |
2012 | case SIAtomicScope::WORKGROUP: |
2013 | // In WGP mode the waves of a work-group can be executing on either CU of |
2014 | // the WGP. Therefore need to wait for operations to complete to ensure |
2015 | // they are visible to waves in the other CU as the L0 is per CU. |
// Otherwise in CU mode all waves of a work-group are on the same CU,
2017 | // which shares the same L0. |
2018 | if (!ST.isCuModeEnabled()) { |
2019 | if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) |
2020 | VMCnt |= true; |
2021 | if ((Op & SIMemOp::STORE) != SIMemOp::NONE) |
2022 | VSCnt |= true; |
2023 | } |
2024 | break; |
2025 | case SIAtomicScope::WAVEFRONT: |
2026 | case SIAtomicScope::SINGLETHREAD: |
2027 | // The L0 cache keeps all memory operations in order for |
2028 | // work-items in the same wavefront. |
2029 | break; |
2030 | default: |
2031 | llvm_unreachable("Unsupported synchronization scope" ); |
2032 | } |
2033 | } |
2034 | |
2035 | if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { |
2036 | switch (Scope) { |
2037 | case SIAtomicScope::SYSTEM: |
2038 | case SIAtomicScope::AGENT: |
2039 | case SIAtomicScope::WORKGROUP: |
2040 | // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is |
2041 | // not needed as LDS operations for all waves are executed in a total |
2042 | // global ordering as observed by all waves. Required if also |
2043 | // synchronizing with global/GDS memory as LDS operations could be |
2044 | // reordered with respect to later global/GDS memory operations of the |
2045 | // same wave. |
2046 | LGKMCnt |= IsCrossAddrSpaceOrdering; |
2047 | break; |
2048 | case SIAtomicScope::WAVEFRONT: |
2049 | case SIAtomicScope::SINGLETHREAD: |
2050 | // The LDS keeps all memory operations in order for |
2051 | // the same wavefront. |
2052 | break; |
2053 | default: |
2054 | llvm_unreachable("Unsupported synchronization scope" ); |
2055 | } |
2056 | } |
2057 | |
2058 | if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { |
2059 | switch (Scope) { |
2060 | case SIAtomicScope::SYSTEM: |
2061 | case SIAtomicScope::AGENT: |
// If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
2063 | // is not needed as GDS operations for all waves are executed in a total |
2064 | // global ordering as observed by all waves. Required if also |
2065 | // synchronizing with global/LDS memory as GDS operations could be |
2066 | // reordered with respect to later global/LDS memory operations of the |
2067 | // same wave. |
2068 | LGKMCnt |= IsCrossAddrSpaceOrdering; |
2069 | break; |
2070 | case SIAtomicScope::WORKGROUP: |
2071 | case SIAtomicScope::WAVEFRONT: |
2072 | case SIAtomicScope::SINGLETHREAD: |
2073 | // The GDS keeps all memory operations in order for |
2074 | // the same work-group. |
2075 | break; |
2076 | default: |
2077 | llvm_unreachable("Unsupported synchronization scope" ); |
2078 | } |
2079 | } |
2080 | |
2081 | if (VMCnt || LGKMCnt) { |
2082 | unsigned WaitCntImmediate = |
2083 | AMDGPU::encodeWaitcnt(Version: IV, |
2084 | Vmcnt: VMCnt ? 0 : getVmcntBitMask(Version: IV), |
2085 | Expcnt: getExpcntBitMask(Version: IV), |
2086 | Lgkmcnt: LGKMCnt ? 0 : getLgkmcntBitMask(Version: IV)); |
2087 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_soft)) |
2088 | .addImm(Val: WaitCntImmediate); |
2089 | Changed = true; |
2090 | } |
2091 | |
2092 | if (VSCnt) { |
2093 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT_soft)) |
2094 | .addReg(RegNo: AMDGPU::SGPR_NULL, flags: RegState::Undef) |
2095 | .addImm(Val: 0); |
2096 | Changed = true; |
2097 | } |
2098 | |
2099 | if (Pos == Position::AFTER) |
2100 | --MI; |
2101 | |
2102 | return Changed; |
2103 | } |
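// For example (illustrative only, approximate pretty-printing): a
// workgroup-scope wait in WGP mode with Op = LOAD | STORE on the global
// address space ends up emitting roughly
//   S_WAITCNT_soft <imm>           ; imm encodes vmcnt(0), with expcnt and
//                                  ; lgkmcnt left at their no-wait maximums
//   S_WAITCNT_VSCNT_soft null, 0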
2104 | |
2105 | bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
2106 | SIAtomicScope Scope, |
2107 | SIAtomicAddrSpace AddrSpace, |
2108 | Position Pos) const { |
2109 | if (!InsertCacheInv) |
2110 | return false; |
2111 | |
2112 | bool Changed = false; |
2113 | |
2114 | MachineBasicBlock &MBB = *MI->getParent(); |
2115 | DebugLoc DL = MI->getDebugLoc(); |
2116 | |
2117 | if (Pos == Position::AFTER) |
2118 | ++MI; |
2119 | |
2120 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
2121 | switch (Scope) { |
2122 | case SIAtomicScope::SYSTEM: |
2123 | case SIAtomicScope::AGENT: |
// The order of invalidates matters here. We must invalidate "outer in",
// so L1 -> L0, to avoid L0 pulling in stale data from L1 when L1 is
// invalidated.
2127 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL1_INV)); |
2128 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL0_INV)); |
2129 | Changed = true; |
2130 | break; |
2131 | case SIAtomicScope::WORKGROUP: |
2132 | // In WGP mode the waves of a work-group can be executing on either CU of |
2133 | // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise |
// in CU mode all waves of a work-group are on the same CU, and so the
2135 | // L0 does not need to be invalidated. |
2136 | if (!ST.isCuModeEnabled()) { |
2137 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::BUFFER_GL0_INV)); |
2138 | Changed = true; |
2139 | } |
2140 | break; |
2141 | case SIAtomicScope::WAVEFRONT: |
2142 | case SIAtomicScope::SINGLETHREAD: |
2143 | // No cache to invalidate. |
2144 | break; |
2145 | default: |
2146 | llvm_unreachable("Unsupported synchronization scope" ); |
2147 | } |
2148 | } |
2149 | |
2150 | /// The scratch address space does not need the global memory cache |
2151 | /// to be flushed as all memory operations by the same thread are |
2152 | /// sequentially consistent, and no other thread can access scratch |
2153 | /// memory. |
2154 | |
2155 | /// Other address spaces do not have a cache. |
2156 | |
2157 | if (Pos == Position::AFTER) |
2158 | --MI; |
2159 | |
2160 | return Changed; |
2161 | } |
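// Illustrative sketch of a GFX10 agent-scope acquire (not verified output):
//   BUFFER_GL1_INV
//   BUFFER_GL0_INV
// inserted after the acquiring operation. At workgroup scope only the
// BUFFER_GL0_INV is emitted, and only in WGP mode; CU mode needs nothing.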
2162 | |
2163 | bool SIGfx11CacheControl::enableLoadCacheBypass( |
2164 | const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
2165 | SIAtomicAddrSpace AddrSpace) const { |
2166 | assert(MI->mayLoad() && !MI->mayStore()); |
2167 | bool Changed = false; |
2168 | |
2169 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
2170 | switch (Scope) { |
2171 | case SIAtomicScope::SYSTEM: |
2172 | case SIAtomicScope::AGENT: |
2173 | // Set the L0 and L1 cache policies to MISS_EVICT. |
2174 | // Note: there is no L2 cache coherent bypass control at the ISA level. |
2175 | Changed |= enableGLCBit(MI); |
2176 | break; |
2177 | case SIAtomicScope::WORKGROUP: |
2178 | // In WGP mode the waves of a work-group can be executing on either CU of |
2179 | // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in |
2180 | // CU mode all waves of a work-group are on the same CU, and so the L0 |
2181 | // does not need to be bypassed. |
2182 | if (!ST.isCuModeEnabled()) |
2183 | Changed |= enableGLCBit(MI); |
2184 | break; |
2185 | case SIAtomicScope::WAVEFRONT: |
2186 | case SIAtomicScope::SINGLETHREAD: |
2187 | // No cache to bypass. |
2188 | break; |
2189 | default: |
2190 | llvm_unreachable("Unsupported synchronization scope" ); |
2191 | } |
2192 | } |
2193 | |
2194 | /// The scratch address space does not need the global memory caches |
2195 | /// to be bypassed as all memory operations by the same thread are |
2196 | /// sequentially consistent, and no other thread can access scratch |
2197 | /// memory. |
2198 | |
2199 | /// Other address spaces do not have a cache. |
2200 | |
2201 | return Changed; |
2202 | } |
2203 | |
2204 | bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( |
2205 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
2206 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
2207 | |
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result, so glc must not
// be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());

// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile, so they cannot
// sensibly be handled here without pessimizing all atomics. They also do not
// support the nontemporal attribute.
2217 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
2218 | |
2219 | bool Changed = false; |
2220 | |
2221 | if (IsVolatile) { |
2222 | // Set L0 and L1 cache policy to be MISS_EVICT for load instructions |
2223 | // and MISS_LRU for store instructions. |
2224 | // Note: there is no L2 cache coherent bypass control at the ISA level. |
2225 | if (Op == SIMemOp::LOAD) |
2226 | Changed |= enableGLCBit(MI); |
2227 | |
2228 | // Set MALL NOALLOC for load and store instructions. |
2229 | Changed |= enableDLCBit(MI); |
2230 | |
2231 | // Ensure operation has completed at system scope to cause all volatile |
2232 | // operations to be visible outside the program in a global order. Do not |
2233 | // request cross address space as only the global address space can be |
2234 | // observable outside the program, so no need to cause a waitcnt for LDS |
2235 | // address space operations. |
2236 | Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false, |
2237 | Pos: Position::AFTER); |
2238 | return Changed; |
2239 | } |
2240 | |
2241 | if (IsNonTemporal) { |
2242 | // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT |
2243 | // and L2 cache policy to STREAM. |
2244 | // For stores setting both GLC and SLC configures L0 and L1 cache policy |
2245 | // to MISS_EVICT and the L2 cache policy to STREAM. |
2246 | if (Op == SIMemOp::STORE) |
2247 | Changed |= enableGLCBit(MI); |
2248 | Changed |= enableSLCBit(MI); |
2249 | |
2250 | // Set MALL NOALLOC for load and store instructions. |
2251 | Changed |= enableDLCBit(MI); |
2252 | return Changed; |
2253 | } |
2254 | |
2255 | return Changed; |
2256 | } |
2257 | |
2258 | bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, |
2259 | AMDGPU::CPol::CPol Value) const { |
2260 | MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol); |
2261 | if (!CPol) |
2262 | return false; |
2263 | |
2264 | uint64_t NewTH = Value & AMDGPU::CPol::TH; |
2265 | if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { |
2266 | CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); |
2267 | return true; |
2268 | } |
2269 | |
2270 | return false; |
2271 | } |
2272 | |
2273 | bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, |
2274 | AMDGPU::CPol::CPol Value) const { |
2275 | MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol); |
2276 | if (!CPol) |
2277 | return false; |
2278 | |
2279 | uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; |
2280 | if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { |
2281 | CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); |
2282 | return true; |
2283 | } |
2284 | |
2285 | return false; |
2286 | } |
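// Usage sketch: callers such as enableVolatileAndOrNonTemporal and
// setAtomicScope below call e.g. setScope(MI, AMDGPU::CPol::SCOPE_SYS). If the
// instruction has a cpol operand whose SCOPE field differs, the field is
// rewritten in place and true is returned; if it already matches, the call is
// a no-op and returns false.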
2287 | |
2288 | bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore( |
2289 | const MachineBasicBlock::iterator MI) const { |
2290 | // TODO: implement flag for frontend to give us a hint not to insert waits. |
2291 | |
2292 | MachineBasicBlock &MBB = *MI->getParent(); |
2293 | const DebugLoc &DL = MI->getDebugLoc(); |
2294 | |
2295 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_LOADCNT_soft)).addImm(Val: 0); |
2296 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_SAMPLECNT_soft)).addImm(Val: 0); |
2297 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_BVHCNT_soft)).addImm(Val: 0); |
2298 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_KMCNT_soft)).addImm(Val: 0); |
2299 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: S_WAIT_STORECNT_soft)).addImm(Val: 0); |
2300 | |
2301 | return true; |
2302 | } |
2303 | |
2304 | bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, |
2305 | SIAtomicScope Scope, |
2306 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
2307 | bool IsCrossAddrSpaceOrdering, |
2308 | Position Pos) const { |
2309 | bool Changed = false; |
2310 | |
2311 | MachineBasicBlock &MBB = *MI->getParent(); |
2312 | DebugLoc DL = MI->getDebugLoc(); |
2313 | |
2314 | bool LOADCnt = false; |
2315 | bool DSCnt = false; |
2316 | bool STORECnt = false; |
2317 | |
2318 | if (Pos == Position::AFTER) |
2319 | ++MI; |
2320 | |
2321 | if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != |
2322 | SIAtomicAddrSpace::NONE) { |
2323 | switch (Scope) { |
2324 | case SIAtomicScope::SYSTEM: |
2325 | case SIAtomicScope::AGENT: |
2326 | if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) |
2327 | LOADCnt |= true; |
2328 | if ((Op & SIMemOp::STORE) != SIMemOp::NONE) |
2329 | STORECnt |= true; |
2330 | break; |
2331 | case SIAtomicScope::WORKGROUP: |
2332 | // In WGP mode the waves of a work-group can be executing on either CU of |
2333 | // the WGP. Therefore need to wait for operations to complete to ensure |
2334 | // they are visible to waves in the other CU as the L0 is per CU. |
// Otherwise in CU mode all waves of a work-group are on the same CU,
2336 | // which shares the same L0. |
2337 | if (!ST.isCuModeEnabled()) { |
2338 | if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) |
2339 | LOADCnt |= true; |
2340 | if ((Op & SIMemOp::STORE) != SIMemOp::NONE) |
2341 | STORECnt |= true; |
2342 | } |
2343 | break; |
2344 | case SIAtomicScope::WAVEFRONT: |
2345 | case SIAtomicScope::SINGLETHREAD: |
2346 | // The L0 cache keeps all memory operations in order for |
2347 | // work-items in the same wavefront. |
2348 | break; |
2349 | default: |
2350 | llvm_unreachable("Unsupported synchronization scope" ); |
2351 | } |
2352 | } |
2353 | |
2354 | if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { |
2355 | switch (Scope) { |
2356 | case SIAtomicScope::SYSTEM: |
2357 | case SIAtomicScope::AGENT: |
2358 | case SIAtomicScope::WORKGROUP: |
2359 | // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is |
2360 | // not needed as LDS operations for all waves are executed in a total |
2361 | // global ordering as observed by all waves. Required if also |
2362 | // synchronizing with global/GDS memory as LDS operations could be |
2363 | // reordered with respect to later global/GDS memory operations of the |
2364 | // same wave. |
2365 | DSCnt |= IsCrossAddrSpaceOrdering; |
2366 | break; |
2367 | case SIAtomicScope::WAVEFRONT: |
2368 | case SIAtomicScope::SINGLETHREAD: |
2369 | // The LDS keeps all memory operations in order for |
2370 | // the same wavefront. |
2371 | break; |
2372 | default: |
2373 | llvm_unreachable("Unsupported synchronization scope" ); |
2374 | } |
2375 | } |
2376 | |
2377 | if (LOADCnt) { |
2378 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_BVHCNT_soft)).addImm(Val: 0); |
2379 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(Val: 0); |
2380 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_LOADCNT_soft)).addImm(Val: 0); |
2381 | Changed = true; |
2382 | } |
2383 | |
2384 | if (STORECnt) { |
2385 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_STORECNT_soft)).addImm(Val: 0); |
2386 | Changed = true; |
2387 | } |
2388 | |
2389 | if (DSCnt) { |
2390 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_WAIT_DSCNT_soft)).addImm(Val: 0); |
2391 | Changed = true; |
2392 | } |
2393 | |
2394 | if (Pos == Position::AFTER) |
2395 | --MI; |
2396 | |
2397 | return Changed; |
2398 | } |
2399 | |
2400 | bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
2401 | SIAtomicScope Scope, |
2402 | SIAtomicAddrSpace AddrSpace, |
2403 | Position Pos) const { |
2404 | if (!InsertCacheInv) |
2405 | return false; |
2406 | |
2407 | MachineBasicBlock &MBB = *MI->getParent(); |
2408 | DebugLoc DL = MI->getDebugLoc(); |
2409 | |
2410 | /// The scratch address space does not need the global memory cache |
2411 | /// to be flushed as all memory operations by the same thread are |
2412 | /// sequentially consistent, and no other thread can access scratch |
2413 | /// memory. |
2414 | |
2415 | /// Other address spaces do not have a cache. |
2416 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) |
2417 | return false; |
2418 | |
2419 | AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; |
2420 | switch (Scope) { |
2421 | case SIAtomicScope::SYSTEM: |
2422 | ScopeImm = AMDGPU::CPol::SCOPE_SYS; |
2423 | break; |
2424 | case SIAtomicScope::AGENT: |
2425 | ScopeImm = AMDGPU::CPol::SCOPE_DEV; |
2426 | break; |
2427 | case SIAtomicScope::WORKGROUP: |
2428 | // In WGP mode the waves of a work-group can be executing on either CU of |
2429 | // the WGP. Therefore we need to invalidate the L0 which is per CU. |
2430 | // Otherwise in CU mode all waves of a work-group are on the same CU, and so |
2431 | // the L0 does not need to be invalidated. |
2432 | if (ST.isCuModeEnabled()) |
2433 | return false; |
2434 | |
2435 | ScopeImm = AMDGPU::CPol::SCOPE_SE; |
2436 | break; |
2437 | case SIAtomicScope::WAVEFRONT: |
2438 | case SIAtomicScope::SINGLETHREAD: |
2439 | // No cache to invalidate. |
2440 | return false; |
2441 | default: |
2442 | llvm_unreachable("Unsupported synchronization scope" ); |
2443 | } |
2444 | |
2445 | if (Pos == Position::AFTER) |
2446 | ++MI; |
2447 | |
2448 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GLOBAL_INV)).addImm(Val: ScopeImm); |
2449 | |
2450 | if (Pos == Position::AFTER) |
2451 | --MI; |
2452 | |
2453 | return true; |
2454 | } |
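// Illustrative sketch (not verified output): an agent-scope acquire on GFX12
// becomes a single GLOBAL_INV carrying a SCOPE_DEV immediate, placed after
// the acquiring operation; SCOPE_SYS and SCOPE_SE are used for system and
// (WGP-mode) workgroup scope respectively.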
2455 | |
2456 | bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, |
2457 | SIAtomicScope Scope, |
2458 | SIAtomicAddrSpace AddrSpace, |
2459 | bool IsCrossAddrSpaceOrdering, |
2460 | Position Pos) const { |
2461 | MachineBasicBlock &MBB = *MI->getParent(); |
2462 | DebugLoc DL = MI->getDebugLoc(); |
2463 | |
2464 | // The scratch address space does not need the global memory cache |
2465 | // writeback as all memory operations by the same thread are |
2466 | // sequentially consistent, and no other thread can access scratch |
2467 | // memory. |
2468 | |
2469 | // Other address spaces do not have a cache. |
2470 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) |
2471 | return false; |
2472 | |
2473 | if (Pos == Position::AFTER) |
2474 | ++MI; |
2475 | |
2476 | // GLOBAL_WB is always needed, even for write-through caches, as it |
2477 | // additionally ensures all operations have reached the desired cache level. |
2478 | bool SkipWB = false; |
2479 | AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; |
2480 | switch (Scope) { |
2481 | case SIAtomicScope::SYSTEM: |
2482 | ScopeImm = AMDGPU::CPol::SCOPE_SYS; |
2483 | break; |
2484 | case SIAtomicScope::AGENT: |
2485 | ScopeImm = AMDGPU::CPol::SCOPE_DEV; |
2486 | break; |
2487 | case SIAtomicScope::WORKGROUP: |
2488 | // In WGP mode the waves of a work-group can be executing on either CU of |
2489 | // the WGP. Therefore we need to ensure all operations have reached L1, |
2490 | // hence the SCOPE_SE WB. |
// For CU mode, we need operations to reach L0, so the wait is enough -
// there is no way for an operation to report completion without reaching
// at least L0.
2494 | if (ST.isCuModeEnabled()) |
2495 | SkipWB = true; |
2496 | else |
2497 | ScopeImm = AMDGPU::CPol::SCOPE_SE; |
2498 | break; |
2499 | case SIAtomicScope::WAVEFRONT: |
2500 | case SIAtomicScope::SINGLETHREAD: |
// No cache to write back.
2502 | return false; |
2503 | default: |
2504 | llvm_unreachable("Unsupported synchronization scope" ); |
2505 | } |
2506 | |
2507 | if (!SkipWB) |
2508 | BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::GLOBAL_WB)).addImm(Val: ScopeImm); |
2509 | |
2510 | if (Pos == Position::AFTER) |
2511 | --MI; |
2512 | |
2513 | // We always have to wait for previous memory operations (load/store) to |
2514 | // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), |
2515 | // we of course need to wait for that as well. |
2516 | insertWait(MI, Scope, AddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE, |
2517 | IsCrossAddrSpaceOrdering, Pos); |
2518 | |
2519 | return true; |
2520 | } |
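// Illustrative sketch of a GFX12 system-scope release (not verified output):
//   GLOBAL_WB with a SCOPE_SYS immediate
//   the waits emitted by the insertWait call above (e.g. S_WAIT_LOADCNT_soft 0
//   and S_WAIT_STORECNT_soft 0)
// In CU mode at workgroup scope the GLOBAL_WB is skipped and only the waits
// remain.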
2521 | |
2522 | bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( |
2523 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
2524 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
2525 | |
2526 | // Only handle load and store, not atomic read-modify-write instructions. |
2527 | assert(MI->mayLoad() ^ MI->mayStore()); |
2528 | |
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile, so they cannot
// sensibly be handled here without pessimizing all atomics. They also do not
// support the nontemporal attribute.
2533 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
2534 | |
2535 | bool Changed = false; |
2536 | |
2537 | if (IsLastUse) { |
2538 | // Set last-use hint. |
2539 | Changed |= setTH(MI, Value: AMDGPU::CPol::TH_LU); |
2540 | } else if (IsNonTemporal) { |
2541 | // Set non-temporal hint for all cache levels. |
2542 | Changed |= setTH(MI, Value: AMDGPU::CPol::TH_NT); |
2543 | } |
2544 | |
2545 | if (IsVolatile) { |
2546 | Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SYS); |
2547 | |
2548 | if (Op == SIMemOp::STORE) |
2549 | Changed |= insertWaitsBeforeSystemScopeStore(MI); |
2550 | |
2551 | // Ensure operation has completed at system scope to cause all volatile |
2552 | // operations to be visible outside the program in a global order. Do not |
2553 | // request cross address space as only the global address space can be |
2554 | // observable outside the program, so no need to cause a waitcnt for LDS |
2555 | // address space operations. |
2556 | Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false, |
2557 | Pos: Position::AFTER); |
2558 | } |
2559 | |
2560 | return Changed; |
2561 | } |
2562 | |
2563 | bool SIGfx12CacheControl::expandSystemScopeStore( |
2564 | MachineBasicBlock::iterator &MI) const { |
2565 | MachineOperand *CPol = TII->getNamedOperand(MI&: *MI, OperandName: OpName::cpol); |
2566 | if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) |
2567 | return insertWaitsBeforeSystemScopeStore(MI); |
2568 | |
2569 | return false; |
2570 | } |
2571 | |
2572 | bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, |
2573 | SIAtomicScope Scope, |
2574 | SIAtomicAddrSpace AddrSpace) const { |
2575 | bool Changed = false; |
2576 | |
2577 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
2578 | switch (Scope) { |
2579 | case SIAtomicScope::SYSTEM: |
2580 | Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SYS); |
2581 | break; |
2582 | case SIAtomicScope::AGENT: |
2583 | Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_DEV); |
2584 | break; |
2585 | case SIAtomicScope::WORKGROUP: |
// In workgroup mode, SCOPE_SE is needed as waves can execute on
// different CUs that access different L0s.
2588 | if (!ST.isCuModeEnabled()) |
2589 | Changed |= setScope(MI, Value: AMDGPU::CPol::SCOPE_SE); |
2590 | break; |
2591 | case SIAtomicScope::WAVEFRONT: |
2592 | case SIAtomicScope::SINGLETHREAD: |
2593 | // No cache to bypass. |
2594 | break; |
2595 | default: |
2596 | llvm_unreachable("Unsupported synchronization scope" ); |
2597 | } |
2598 | } |
2599 | |
2600 | // The scratch address space does not need the global memory caches |
2601 | // to be bypassed as all memory operations by the same thread are |
2602 | // sequentially consistent, and no other thread can access scratch |
2603 | // memory. |
2604 | |
2605 | // Other address spaces do not have a cache. |
2606 | |
2607 | return Changed; |
2608 | } |
2609 | |
2610 | bool SIMemoryLegalizer::removeAtomicPseudoMIs() { |
2611 | if (AtomicPseudoMIs.empty()) |
2612 | return false; |
2613 | |
2614 | for (auto &MI : AtomicPseudoMIs) |
2615 | MI->eraseFromParent(); |
2616 | |
2617 | AtomicPseudoMIs.clear(); |
2618 | return true; |
2619 | } |
2620 | |
2621 | bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, |
2622 | MachineBasicBlock::iterator &MI) { |
2623 | assert(MI->mayLoad() && !MI->mayStore()); |
2624 | |
2625 | bool Changed = false; |
2626 | |
2627 | if (MOI.isAtomic()) { |
2628 | if (MOI.getOrdering() == AtomicOrdering::Monotonic || |
2629 | MOI.getOrdering() == AtomicOrdering::Acquire || |
2630 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { |
2631 | Changed |= CC->enableLoadCacheBypass(MI, Scope: MOI.getScope(), |
2632 | AddrSpace: MOI.getOrderingAddrSpace()); |
2633 | } |
2634 | |
2635 | if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) |
2636 | Changed |= CC->insertWait(MI, Scope: MOI.getScope(), |
2637 | AddrSpace: MOI.getOrderingAddrSpace(), |
2638 | Op: SIMemOp::LOAD | SIMemOp::STORE, |
2639 | IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), |
2640 | Pos: Position::BEFORE); |
2641 | |
2642 | if (MOI.getOrdering() == AtomicOrdering::Acquire || |
2643 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { |
2644 | Changed |= CC->insertWait(MI, Scope: MOI.getScope(), |
2645 | AddrSpace: MOI.getInstrAddrSpace(), |
2646 | Op: SIMemOp::LOAD, |
2647 | IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), |
2648 | Pos: Position::AFTER); |
2649 | Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(), |
2650 | AddrSpace: MOI.getOrderingAddrSpace(), |
2651 | Pos: Position::AFTER); |
2652 | } |
2653 | |
2654 | return Changed; |
2655 | } |
2656 | |
2657 | // Atomic instructions already bypass caches to the scope specified by the |
2658 | // SyncScope operand. Only non-atomic volatile and nontemporal/last-use |
2659 | // instructions need additional treatment. |
2660 | Changed |= CC->enableVolatileAndOrNonTemporal( |
2661 | MI, AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::LOAD, IsVolatile: MOI.isVolatile(), |
2662 | IsNonTemporal: MOI.isNonTemporal(), IsLastUse: MOI.isLastUse()); |
2663 | |
2664 | return Changed; |
2665 | } |
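// Putting the pieces together, an illustrative (unverified) layout for a
// sequentially consistent atomic load is:
//   waits for all earlier loads and stores      (insertWait, Position::BEFORE)
//   the load, with cache-bypass bits for scope  (enableLoadCacheBypass)
//   a wait on the load itself                   (insertWait, Position::AFTER)
//   a cache invalidate for the ordering scope   (insertAcquire, Position::AFTER)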
2666 | |
2667 | bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, |
2668 | MachineBasicBlock::iterator &MI) { |
2669 | assert(!MI->mayLoad() && MI->mayStore()); |
2670 | |
2671 | bool Changed = false; |
2672 | |
2673 | if (MOI.isAtomic()) { |
2674 | if (MOI.getOrdering() == AtomicOrdering::Monotonic || |
2675 | MOI.getOrdering() == AtomicOrdering::Release || |
2676 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { |
2677 | Changed |= CC->enableStoreCacheBypass(MI, Scope: MOI.getScope(), |
2678 | AddrSpace: MOI.getOrderingAddrSpace()); |
2679 | } |
2680 | |
2681 | if (MOI.getOrdering() == AtomicOrdering::Release || |
2682 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) |
2683 | Changed |= CC->insertRelease(MI, Scope: MOI.getScope(), |
2684 | AddrSpace: MOI.getOrderingAddrSpace(), |
2685 | IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), |
2686 | Pos: Position::BEFORE); |
2687 | |
2688 | return Changed; |
2689 | } |
2690 | |
2691 | // Atomic instructions already bypass caches to the scope specified by the |
2692 | // SyncScope operand. Only non-atomic volatile and nontemporal instructions |
2693 | // need additional treatment. |
2694 | Changed |= CC->enableVolatileAndOrNonTemporal( |
2695 | MI, AddrSpace: MOI.getInstrAddrSpace(), Op: SIMemOp::STORE, IsVolatile: MOI.isVolatile(), |
2696 | IsNonTemporal: MOI.isNonTemporal()); |
2697 | |
// GFX12 specific: scope (the desired coherence domain in the cache hierarchy)
// is an instruction field; do not confuse it with the atomic scope.
2700 | Changed |= CC->expandSystemScopeStore(MI); |
2701 | return Changed; |
2702 | } |
2703 | |
2704 | bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, |
2705 | MachineBasicBlock::iterator &MI) { |
2706 | assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); |
2707 | |
2708 | AtomicPseudoMIs.push_back(x: MI); |
2709 | bool Changed = false; |
2710 | |
2711 | // Refine fenced address space based on MMRAs. |
2712 | // |
2713 | // TODO: Should we support this MMRA on other atomic operations? |
2714 | auto OrderingAddrSpace = |
2715 | getFenceAddrSpaceMMRA(MI: *MI, Default: MOI.getOrderingAddrSpace()); |
2716 | |
2717 | if (MOI.isAtomic()) { |
2718 | if (MOI.getOrdering() == AtomicOrdering::Acquire) |
2719 | Changed |= CC->insertWait( |
2720 | MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace, Op: SIMemOp::LOAD | SIMemOp::STORE, |
2721 | IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), Pos: Position::BEFORE); |
2722 | |
2723 | if (MOI.getOrdering() == AtomicOrdering::Release || |
2724 | MOI.getOrdering() == AtomicOrdering::AcquireRelease || |
2725 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) |
/// TODO: This relies on a barrier always generating a waitcnt
/// for LDS to ensure it is not reordered with the completion of
/// the preceding LDS operations. If the barrier had a memory
/// ordering and memory scope, then the library would not need to
/// generate a fence. Could add support in this file for
/// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
/// adding S_WAITCNT before an S_BARRIER.
2733 | Changed |= CC->insertRelease(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace, |
2734 | IsCrossAddrSpaceOrdering: MOI.getIsCrossAddressSpaceOrdering(), |
2735 | Pos: Position::BEFORE); |
2736 | |
2737 | // TODO: If both release and invalidate are happening they could be combined |
2738 | // to use the single "BUFFER_WBINV*" instruction. This could be done by |
2739 | // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to |
2740 | // track cache invalidate and write back instructions. |
2741 | |
2742 | if (MOI.getOrdering() == AtomicOrdering::Acquire || |
2743 | MOI.getOrdering() == AtomicOrdering::AcquireRelease || |
2744 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) |
2745 | Changed |= CC->insertAcquire(MI, Scope: MOI.getScope(), AddrSpace: OrderingAddrSpace, |
2746 | Pos: Position::BEFORE); |
2747 | |
2748 | return Changed; |
2749 | } |
2750 | |
2751 | return Changed; |
2752 | } |
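// Illustrative summary of the cases above: an acquire fence becomes a wait
// followed by a cache invalidate, a release fence becomes a release (writeback
// plus wait), and an acq_rel or seq_cst fence becomes the release followed by
// the invalidate, all inserted at the fence's position; the ATOMIC_FENCE
// pseudo itself is later deleted by removeAtomicPseudoMIs().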
2753 | |
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD
                                                 : SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

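/// Walks every instruction in the function, unbundling memory-access bundles
/// left by the post-RA scheduler, and expands loads, stores, atomic fences,
/// and cmpxchg/RMW operations according to the AMDGPU memory model.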
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  const MachineModuleInfo &MMI =
      getAnalysis<MachineModuleInfoWrapperPass>().getMMI();

  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

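      // Only instructions marked maybeAtomic in their TSFlags can require
      // memory-model handling; everything else is skipped.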
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(*MOI, MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI)) {
        Changed |= expandStore(*MOI, MI);
        Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
      } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(*MOI, MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

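// Legacy PassManager registration; createSIMemoryLegalizerPass() is the
// factory the target pass pipeline uses to add this pass.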
INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}