AtomicExpandPass.cpp source code [llvm_projects/llvm/lib/CodeGen/AtomicExpandPass.cpp]

1	//===- AtomicExpandPass.cpp - Expand atomic instructions ------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file contains a pass (at IR level) to replace atomic instructions with
// __atomic_* library calls, or target specific instruction which implement the
11	// same semantics in a way which better fits the target backend. This can
12	// include the use of (intrinsic-based) load-linked/store-conditional loops,
13	// AtomicCmpXchg, or type coercions.
14	//
15	//===----------------------------------------------------------------------===//
16
17	#include "llvm/ADT/ArrayRef.h"
18	#include "llvm/ADT/STLFunctionalExtras.h"
19	#include "llvm/ADT/SmallVector.h"
20	#include "llvm/Analysis/InstSimplifyFolder.h"
21	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
22	#include "llvm/CodeGen/AtomicExpand.h"
23	#include "llvm/CodeGen/AtomicExpandUtils.h"
24	#include "llvm/CodeGen/TargetLowering.h"
25	#include "llvm/CodeGen/TargetPassConfig.h"
26	#include "llvm/CodeGen/TargetSubtargetInfo.h"
27	#include "llvm/CodeGen/ValueTypes.h"
28	#include "llvm/IR/Attributes.h"
29	#include "llvm/IR/BasicBlock.h"
30	#include "llvm/IR/Constant.h"
31	#include "llvm/IR/Constants.h"
32	#include "llvm/IR/DataLayout.h"
33	#include "llvm/IR/DerivedTypes.h"
34	#include "llvm/IR/Function.h"
35	#include "llvm/IR/IRBuilder.h"
36	#include "llvm/IR/Instruction.h"
37	#include "llvm/IR/Instructions.h"
38	#include "llvm/IR/MDBuilder.h"
39	#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
40	#include "llvm/IR/Module.h"
41	#include "llvm/IR/Type.h"
42	#include "llvm/IR/User.h"
43	#include "llvm/IR/Value.h"
44	#include "llvm/InitializePasses.h"
45	#include "llvm/Pass.h"
46	#include "llvm/Support/AtomicOrdering.h"
47	#include "llvm/Support/Casting.h"
48	#include "llvm/Support/Debug.h"
49	#include "llvm/Support/ErrorHandling.h"
50	#include "llvm/Support/raw_ostream.h"
51	#include "llvm/Target/TargetMachine.h"
52	#include "llvm/Transforms/Utils/LowerAtomic.h"
53	#include <cassert>
54	#include <cstdint>
55	#include <iterator>
56
57	using namespace llvm;
58
59	#define DEBUG_TYPE "atomic-expand"
60
61	namespace {
62
63	class AtomicExpandImpl {
64	const TargetLowering TLI = nullptr*;
65	const DataLayout DL = nullptr*;
66
67	private:
68	bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
69	IntegerType getCorrespondingIntegerType(Type T, const DataLayout &DL);
70	LoadInst convertAtomicLoadToIntegerType(LoadInst LI);
71	bool tryExpandAtomicLoad(LoadInst *LI);
72	bool expandAtomicLoadToLL(LoadInst *LI);
73	bool expandAtomicLoadToCmpXchg(LoadInst *LI);
74	StoreInst convertAtomicStoreToIntegerType(StoreInst SI);
75	bool tryExpandAtomicStore(StoreInst *SI);
76	void expandAtomicStore(StoreInst *SI);
77	bool tryExpandAtomicRMW(AtomicRMWInst *AI);
78	AtomicRMWInst convertAtomicXchgToIntegerType(AtomicRMWInst RMWI);
79	Value *
80	insertRMWLLSCLoop(IRBuilderBase &Builder, Type ResultTy, Value Addr,
81	Align AddrAlign, AtomicOrdering MemOpOrder,
82	function_ref<Value (IRBuilderBase &, Value )> PerformOp);
83	void expandAtomicOpToLLSC(
84	Instruction I, Type ResultTy, Value *Addr, Align AddrAlign,
85	AtomicOrdering MemOpOrder,
86	function_ref<Value (IRBuilderBase &, Value )> PerformOp);
87	void expandPartwordAtomicRMW(
88	AtomicRMWInst *I, TargetLoweringBase::AtomicExpansionKind ExpansionKind);
89	AtomicRMWInst widenPartwordAtomicRMW(AtomicRMWInst AI);
90	bool expandPartwordCmpXchg(AtomicCmpXchgInst *I);
91	void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI);
92	void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI);
93
94	AtomicCmpXchgInst convertCmpXchgToIntegerType(AtomicCmpXchgInst CI);
95	static Value *insertRMWCmpXchgLoop(
96	IRBuilderBase &Builder, Type ResultType, Value Addr, Align AddrAlign,
97	AtomicOrdering MemOpOrder, SyncScope::ID SSID,
98	function_ref<Value (IRBuilderBase &, Value )> PerformOp,
99	CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc);
100	bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);
101
102	bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
103	bool isIdempotentRMW(AtomicRMWInst *RMWI);
104	bool simplifyIdempotentRMW(AtomicRMWInst *RMWI);
105
106	bool expandAtomicOpToLibcall(Instruction I, unsigned* Size, Align Alignment,
107	Value PointerOperand, Value ValueOperand,
108	Value *CASExpected, AtomicOrdering Ordering,
109	AtomicOrdering Ordering2,
110	ArrayRef<RTLIB::Libcall> Libcalls);
111	void expandAtomicLoadToLibcall(LoadInst *LI);
112	void expandAtomicStoreToLibcall(StoreInst *LI);
113	void expandAtomicRMWToLibcall(AtomicRMWInst *I);
114	void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);
115
116	friend bool
117	llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
118	CreateCmpXchgInstFun CreateCmpXchg);
119
120	bool processAtomicInstr(Instruction *I);
121
122	public:
123	bool run(Function &F, const TargetMachine *TM);
124	};
125
126	class AtomicExpandLegacy : public FunctionPass {
127	public:
128	static char ID; // Pass identification, replacement for typeid
129
130	AtomicExpandLegacy() : FunctionPass (ID) {
131	initializeAtomicExpandLegacyPass(*PassRegistry::getPassRegistry());
132	}
133
134	bool runOnFunction(Function &F) override;
135	};
136
137	// IRBuilder to be used for replacement atomic instructions.
138	struct ReplacementIRBuilder
139	: IRBuilder<InstSimplifyFolder, IRBuilderCallbackInserter> {
140	MDNode MMRAMD = nullptr*;
141
142	// Preserves the DebugLoc from I, and preserves still valid metadata.
143	// Enable StrictFP builder mode when appropriate.
144	explicit ReplacementIRBuilder(Instruction I, const* DataLayout &DL)
145	: IRBuilder (I->getContext(), InstSimplifyFolder (DL),
146	IRBuilderCallbackInserter (
147	[this](Instruction *I) { addMMRAMD(I); })) {
148	SetInsertPoint(I);
149	this->CollectMetadataToCopy(Src: I, MetadataKinds: {LLVMContext::MD_pcsections});
150	if (BB->getParent()->getAttributes().hasFnAttr(Kind: Attribute::StrictFP))
151	this->setIsFPConstrained(true);
152
153	MMRAMD = I->getMetadata(KindID: LLVMContext::MD_mmra);
154	}
155
156	void addMMRAMD(Instruction *I) {
157	if (canInstructionHaveMMRAs(I: *I))
158	I->setMetadata(KindID: LLVMContext::MD_mmra, Node: MMRAMD);
159	}
160	};
161
162	} // end anonymous namespace
163
164	char AtomicExpandLegacy::ID = `0`;
165
166	char &llvm::AtomicExpandID = AtomicExpandLegacy::ID;
167
168	INITIALIZE_PASS_BEGIN(AtomicExpandLegacy, DEBUG_TYPE,
169	"Expand Atomic instructions", false, false)
170	INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
171	INITIALIZE_PASS_END(AtomicExpandLegacy, DEBUG_TYPE,
172	"Expand Atomic instructions", false, false)
173
174	// Helper functions to retrieve the size of atomic instructions.
175	static unsigned getAtomicOpSize(LoadInst *LI) {
176	const DataLayout &DL = LI->getDataLayout();
177	return DL.getTypeStoreSize(Ty: LI->getType());
178	}
179
180	static unsigned getAtomicOpSize(StoreInst *SI) {
181	const DataLayout &DL = SI->getDataLayout();
182	return DL.getTypeStoreSize(Ty: SI->getValueOperand()->getType());
183	}
184
185	static unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
186	const DataLayout &DL = RMWI->getDataLayout();
187	return DL.getTypeStoreSize(Ty: RMWI->getValOperand()->getType());
188	}
189
190	static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
191	const DataLayout &DL = CASI->getDataLayout();
192	return DL.getTypeStoreSize(Ty: CASI->getCompareOperand()->getType());
193	}
194
195	/// Copy metadata that's safe to preserve when widening atomics.
196	static void copyMetadataForAtomic(Instruction &Dest,
197	const Instruction &Source) {
198	SmallVector<std::pair<unsigned, MDNode *>, `8`> MD;
199	Source.getAllMetadata(MDs&: MD);
200	LLVMContext &Ctx = Dest.getContext();
201	MDBuilder MDB(Ctx);
202
203	for (auto [ID, N] : MD) {
204	switch (ID) {
205	case LLVMContext::MD_dbg:
206	case LLVMContext::MD_tbaa:
207	case LLVMContext::MD_tbaa_struct:
208	case LLVMContext::MD_alias_scope:
209	case LLVMContext::MD_noalias:
210	case LLVMContext::MD_noalias_addrspace:
211	case LLVMContext::MD_access_group:
212	case LLVMContext::MD_mmra:
213	Dest.setMetadata(KindID: ID, Node: N);
214	break;
215	default:
216	if (ID == Ctx.getMDKindID(Name: "amdgpu.no.remote.memory"))
217	Dest.setMetadata(KindID: ID, Node: N);
218	else if (ID == Ctx.getMDKindID(Name: "amdgpu.no.fine.grained.memory"))
219	Dest.setMetadata(KindID: ID, Node: N);
220
221	// Losing amdgpu.ignore.denormal.mode, but it doesn't matter for current
222	// uses.
223	break;
224	}
225	}
226	}
227
228	// Determine if a particular atomic operation has a supported size,
229	// and is of appropriate alignment, to be passed through for target
230	// lowering. (Versus turning into a __atomic libcall)
231	template <typename Inst>
232	static bool atomicSizeSupported(const TargetLowering TLI, Inst I) {
233	unsigned Size = getAtomicOpSize(I);
234	Align Alignment = I->getAlign();
235	return Alignment >= Size &&
236	Size <= TLI->getMaxAtomicSizeInBitsSupported() / `8`;
237	}
238
239	bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
240	auto *LI = dyn_cast<LoadInst>(Val: I);
241	auto *SI = dyn_cast<StoreInst>(Val: I);
242	auto *RMWI = dyn_cast<AtomicRMWInst>(Val: I);
243	auto *CASI = dyn_cast<AtomicCmpXchgInst>(Val: I);
244
245	bool MadeChange = false;
246
247	// If the Size/Alignment is not supported, replace with a libcall.
248	if (LI) {
249	if (!LI->isAtomic())
250	return false;
251
252	if (!atomicSizeSupported(TLI, I: LI)) {
253	expandAtomicLoadToLibcall(LI);
254	return true;
255	}
256
257	if (TLI->shouldCastAtomicLoadInIR(LI) ==
258	TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
259	I = LI = convertAtomicLoadToIntegerType(LI);
260	MadeChange = true;
261	}
262	} else if (SI) {
263	if (!SI->isAtomic())
264	return false;
265
266	if (!atomicSizeSupported(TLI, I: SI)) {
267	expandAtomicStoreToLibcall(LI: SI);
268	return true;
269	}
270
271	if (TLI->shouldCastAtomicStoreInIR(SI) ==
272	TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
273	I = SI = convertAtomicStoreToIntegerType(SI);
274	MadeChange = true;
275	}
276	} else if (RMWI) {
277	if (!atomicSizeSupported(TLI, I: RMWI)) {
278	expandAtomicRMWToLibcall(I: RMWI);
279	return true;
280	}
281
282	if (TLI->shouldCastAtomicRMWIInIR(RMWI) ==
283	TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
284	I = RMWI = convertAtomicXchgToIntegerType(RMWI);
285	MadeChange = true;
286	}
287	} else if (CASI) {
288	if (!atomicSizeSupported(TLI, I: CASI)) {
289	expandAtomicCASToLibcall(I: CASI);
290	return true;
291	}
292
293	// TODO: when we're ready to make the change at the IR level, we can
294	// extend convertCmpXchgToInteger for floating point too.
295	if (CASI->getCompareOperand()->getType()->isPointerTy()) {
296	// TODO: add a TLI hook to control this so that each target can
297	// convert to lowering the original type one at a time.
298	I = CASI = convertCmpXchgToIntegerType(CI: CASI);
299	MadeChange = true;
300	}
301	} else
302	return false;
303
304	if (TLI->shouldInsertFencesForAtomic(I)) {
305	auto FenceOrdering = AtomicOrdering::Monotonic;
306	if (LI && isAcquireOrStronger(AO: LI->getOrdering())) {
307	FenceOrdering = LI->getOrdering();
308	LI->setOrdering(AtomicOrdering::Monotonic);
309	} else if (SI && isReleaseOrStronger(AO: SI->getOrdering())) {
310	FenceOrdering = SI->getOrdering();
311	SI->setOrdering(AtomicOrdering::Monotonic);
312	} else if (RMWI && (isReleaseOrStronger(AO: RMWI->getOrdering()) \|\|
313	isAcquireOrStronger(AO: RMWI->getOrdering()))) {
314	FenceOrdering = RMWI->getOrdering();
315	RMWI->setOrdering(AtomicOrdering::Monotonic);
316	} else if (CASI &&
317	TLI->shouldExpandAtomicCmpXchgInIR(AI: CASI) ==
318	TargetLoweringBase::AtomicExpansionKind::None &&
319	(isReleaseOrStronger(AO: CASI->getSuccessOrdering()) \|\|
320	isAcquireOrStronger(AO: CASI->getSuccessOrdering()) \|\|
321	isAcquireOrStronger(AO: CASI->getFailureOrdering()))) {
322	// If a compare and swap is lowered to LL/SC, we can do smarter fence
323	// insertion, with a stronger one on the success path than on the
324	// failure path. As a result, fence insertion is directly done by
325	// expandAtomicCmpXchg in that case.
326	FenceOrdering = CASI->getMergedOrdering();
327	auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(I: CASI);
328
329	CASI->setSuccessOrdering(CASOrdering);
330	CASI->setFailureOrdering(CASOrdering);
331	}
332
333	if (FenceOrdering != AtomicOrdering::Monotonic) {
334	MadeChange \|= bracketInstWithFences(I, Order: FenceOrdering);
335	}
336	} else if (I->hasAtomicStore() &&
337	TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
338	auto FenceOrdering = AtomicOrdering::Monotonic;
339	if (SI)
340	FenceOrdering = SI->getOrdering();
341	else if (RMWI)
342	FenceOrdering = RMWI->getOrdering();
343	else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(AI: CASI) !=
344	TargetLoweringBase::AtomicExpansionKind::LLSC)
345	// LLSC is handled in expandAtomicCmpXchg().
346	FenceOrdering = CASI->getSuccessOrdering();
347
348	IRBuilder Builder(I);
349	if (auto TrailingFence =
350	TLI->emitTrailingFence(Builder, Inst: I, Ord: FenceOrdering)) {
351	TrailingFence->moveAfter(MovePos: I);
352	MadeChange = true;
353	}
354	}
355
356	if (LI)
357	MadeChange \|= tryExpandAtomicLoad(LI);
358	else if (SI)
359	MadeChange \|= tryExpandAtomicStore(SI);
360	else if (RMWI) {
361	// There are two different ways of expanding RMW instructions:
362	// - into a load if it is idempotent
363	// - into a Cmpxchg/LL-SC loop otherwise
364	// we try them in that order.
365
366	if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
367	MadeChange = true;
368
369	} else {
370	MadeChange \|= tryExpandAtomicRMW(AI: RMWI);
371	}
372	} else if (CASI)
373	MadeChange \|= tryExpandAtomicCmpXchg(CI: CASI);
374
375	return MadeChange;
376	}
377
378	bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) {
379	const auto *Subtarget = TM->getSubtargetImpl(F);
380	if (!Subtarget->enableAtomicExpand())
381	return false;
382	TLI = Subtarget->getTargetLowering();
383	DL = &F.getDataLayout();
384
385	bool MadeChange = false;
386
387	for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) {
388	BasicBlock BB = &BBI;
389
390	BasicBlock::reverse_iterator Next;
391
392	for (BasicBlock::reverse_iterator I = BB->rbegin(), E = BB->rend(); I != E;
393	I = Next) {
394	Instruction &Inst = *I;
395	Next = std::next(x: I);
396
397	if (processAtomicInstr(I: &Inst)) {
398	MadeChange = true;
399
400	// New blocks may have been inserted.
401	BBE = F.end();
402	}
403	}
404	}
405
406	return MadeChange;
407	}
408
409	bool AtomicExpandLegacy::runOnFunction(Function &F) {
410
411	auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
412	if (!TPC)
413	return false;
414	auto *TM = &TPC->getTM<TargetMachine>();
415	AtomicExpandImpl AE;
416	return AE.run(F, TM);
417	}
418
419	FunctionPass *llvm::createAtomicExpandLegacyPass() {
420	return new AtomicExpandLegacy ();
421	}
422
423	PreservedAnalyses AtomicExpandPass::run(Function &F,
424	FunctionAnalysisManager &AM) {
425	AtomicExpandImpl AE;
426
427	bool Changed = AE.run(F, TM);
428	if (!Changed)
429	return PreservedAnalyses::all();
430
431	return PreservedAnalyses::none();
432	}
433
434	bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
435	AtomicOrdering Order) {
436	ReplacementIRBuilder Builder(I, *DL);
437
438	auto LeadingFence = TLI->emitLeadingFence(Builder, Inst: I, Ord: Order);
439
440	auto TrailingFence = TLI->emitTrailingFence(Builder, Inst: I, Ord: Order);
441	// We have a guard here because not every atomic operation generates a
442	// trailing fence.
443	if (TrailingFence)
444	TrailingFence->moveAfter(MovePos: I);
445
446	return (LeadingFence \|\| TrailingFence);
447	}
448
449	/// Get the iX type with the same bitwidth as T.
450	IntegerType *
451	AtomicExpandImpl::getCorrespondingIntegerType(Type T, const* DataLayout &DL) {
452	EVT VT = TLI->getMemValueType(DL, Ty: T);
453	unsigned BitWidth = VT.getStoreSizeInBits();
454	assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
455	return IntegerType::get(C&: T->getContext(), NumBits: BitWidth);
456	}
457
458	/// Convert an atomic load of a non-integral type to an integer load of the
459	/// equivalent bitwidth. See the function comment on
460	/// convertAtomicStoreToIntegerType for background.
461	LoadInst AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst LI) {
462	auto *M = LI->getModule();
463	Type *NewTy = getCorrespondingIntegerType(T: LI->getType(), DL: M->getDataLayout());
464
465	ReplacementIRBuilder Builder(LI, *DL);
466
467	Value *Addr = LI->getPointerOperand();
468
469	auto *NewLI = Builder.CreateLoad(Ty: NewTy, Ptr: Addr);
470	NewLI->setAlignment(LI->getAlign());
471	NewLI->setVolatile(LI->isVolatile());
472	NewLI->setAtomic(Ordering: LI->getOrdering(), SSID: LI->getSyncScopeID());
473	LLVM_DEBUG(dbgs() << "Replaced " << LI << " with " << NewLI << "\n");
474
475	Value *NewVal = Builder.CreateBitCast(V: NewLI, DestTy: LI->getType());
476	LI->replaceAllUsesWith(V: NewVal);
477	LI->eraseFromParent();
478	return NewLI;
479	}
480
481	AtomicRMWInst *
482	AtomicExpandImpl::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) {
483	assert(RMWI->getOperation() == AtomicRMWInst::Xchg);
484
485	auto *M = RMWI->getModule();
486	Type *NewTy =
487	getCorrespondingIntegerType(T: RMWI->getType(), DL: M->getDataLayout());
488
489	ReplacementIRBuilder Builder(RMWI, *DL);
490
491	Value *Addr = RMWI->getPointerOperand();
492	Value *Val = RMWI->getValOperand();
493	Value *NewVal = Val->getType()->isPointerTy()
494	? Builder.CreatePtrToInt(V: Val, DestTy: NewTy)
495	: Builder.CreateBitCast(V: Val, DestTy: NewTy);
496
497	auto *NewRMWI = Builder.CreateAtomicRMW(Op: AtomicRMWInst::Xchg, Ptr: Addr, Val: NewVal,
498	Align: RMWI->getAlign(), Ordering: RMWI->getOrdering(),
499	SSID: RMWI->getSyncScopeID());
500	NewRMWI->setVolatile(RMWI->isVolatile());
501	copyMetadataForAtomic(Dest&: NewRMWI, Source: RMWI);
502	LLVM_DEBUG(dbgs() << "Replaced " << RMWI << " with " << NewRMWI << "\n");
503
504	Value *NewRVal = RMWI->getType()->isPointerTy()
505	? Builder.CreateIntToPtr(V: NewRMWI, DestTy: RMWI->getType())
506	: Builder.CreateBitCast(V: NewRMWI, DestTy: RMWI->getType());
507	RMWI->replaceAllUsesWith(V: NewRVal);
508	RMWI->eraseFromParent();
509	return NewRMWI;
510	}
511
512	bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
513	switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
514	case TargetLoweringBase::AtomicExpansionKind::None:
515	return false;
516	case TargetLoweringBase::AtomicExpansionKind::LLSC:
517	expandAtomicOpToLLSC(
518	I: LI, ResultTy: LI->getType(), Addr: LI->getPointerOperand(), AddrAlign: LI->getAlign(),
519	MemOpOrder: LI->getOrdering(),
520	PerformOp: [](IRBuilderBase &Builder, Value Loaded) { return* Loaded; });
521	return true;
522	case TargetLoweringBase::AtomicExpansionKind::LLOnly:
523	return expandAtomicLoadToLL(LI);
524	case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
525	return expandAtomicLoadToCmpXchg(LI);
526	case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
527	LI->setAtomic(Ordering: AtomicOrdering::NotAtomic);
528	return true;
529	default:
530	llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
531	}
532	}
533
534	bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) {
535	switch (TLI->shouldExpandAtomicStoreInIR(SI)) {
536	case TargetLoweringBase::AtomicExpansionKind::None:
537	return false;
538	case TargetLoweringBase::AtomicExpansionKind::Expand:
539	expandAtomicStore(SI);
540	return true;
541	case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
542	SI->setAtomic(Ordering: AtomicOrdering::NotAtomic);
543	return true;
544	default:
545	llvm_unreachable("Unhandled case in tryExpandAtomicStore");
546	}
547	}
548
549	bool AtomicExpandImpl::expandAtomicLoadToLL(LoadInst *LI) {
550	ReplacementIRBuilder Builder(LI, *DL);
551
552	// On some architectures, load-linked instructions are atomic for larger
553	// sizes than normal loads. For example, the only 64-bit load guaranteed
554	// to be single-copy atomic by ARM is an ldrexd (A3.5.3).
555	Value *Val = TLI->emitLoadLinked(Builder, ValueTy: LI->getType(),
556	Addr: LI->getPointerOperand(), Ord: LI->getOrdering());
557	TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
558
559	LI->replaceAllUsesWith(V: Val);
560	LI->eraseFromParent();
561
562	return true;
563	}
564
565	bool AtomicExpandImpl::expandAtomicLoadToCmpXchg(LoadInst *LI) {
566	ReplacementIRBuilder Builder(LI, *DL);
567	AtomicOrdering Order = LI->getOrdering();
568	if (Order == AtomicOrdering::Unordered)
569	Order = AtomicOrdering::Monotonic;
570
571	Value *Addr = LI->getPointerOperand();
572	Type *Ty = LI->getType();
573	Constant *DummyVal = Constant::getNullValue(Ty);
574
575	Value *Pair = Builder.CreateAtomicCmpXchg(
576	Ptr: Addr, Cmp: DummyVal, New: DummyVal, Align: LI->getAlign(), SuccessOrdering: Order,
577	FailureOrdering: AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering: Order));
578	Value *Loaded = Builder.CreateExtractValue(Agg: Pair, Idxs: `0`, Name: "loaded");
579
580	LI->replaceAllUsesWith(V: Loaded);
581	LI->eraseFromParent();
582
583	return true;
584	}
585
586	/// Convert an atomic store of a non-integral type to an integer store of the
587	/// equivalent bitwidth. We used to not support floating point or vector
588	/// atomics in the IR at all. The backends learned to deal with the bitcast
589	/// idiom because that was the only way of expressing the notion of a atomic
590	/// float or vector store. The long term plan is to teach each backend to
591	/// instruction select from the original atomic store, but as a migration
592	/// mechanism, we convert back to the old format which the backends understand.
593	/// Each backend will need individual work to recognize the new format.
594	StoreInst AtomicExpandImpl::convertAtomicStoreToIntegerType(StoreInst SI) {
595	ReplacementIRBuilder Builder(SI, *DL);
596	auto *M = SI->getModule();
597	Type *NewTy = getCorrespondingIntegerType(T: SI->getValueOperand()->getType(),
598	DL: M->getDataLayout());
599	Value *NewVal = Builder.CreateBitCast(V: SI->getValueOperand(), DestTy: NewTy);
600
601	Value *Addr = SI->getPointerOperand();
602
603	StoreInst *NewSI = Builder.CreateStore(Val: NewVal, Ptr: Addr);
604	NewSI->setAlignment(SI->getAlign());
605	NewSI->setVolatile(SI->isVolatile());
606	NewSI->setAtomic(Ordering: SI->getOrdering(), SSID: SI->getSyncScopeID());
607	LLVM_DEBUG(dbgs() << "Replaced " << SI << " with " << NewSI << "\n");
608	SI->eraseFromParent();
609	return NewSI;
610	}
611
612	void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) {
613	// This function is only called on atomic stores that are too large to be
614	// atomic if implemented as a native store. So we replace them by an
615	// atomic swap, that can be implemented for example as a ldrex/strex on ARM
616	// or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
617	// It is the responsibility of the target to only signal expansion via
618	// shouldExpandAtomicRMW in cases where this is required and possible.
619	ReplacementIRBuilder Builder(SI, *DL);
620	AtomicOrdering Ordering = SI->getOrdering();
621	assert(Ordering != AtomicOrdering::NotAtomic);
622	AtomicOrdering RMWOrdering = Ordering == AtomicOrdering::Unordered
623	? AtomicOrdering::Monotonic
624	: Ordering;
625	AtomicRMWInst *AI = Builder.CreateAtomicRMW(
626	Op: AtomicRMWInst::Xchg, Ptr: SI->getPointerOperand(), Val: SI->getValueOperand(),
627	Align: SI->getAlign(), Ordering: RMWOrdering);
628	SI->eraseFromParent();
629
630	// Now we have an appropriate swap instruction, lower it as usual.
631	tryExpandAtomicRMW(AI);
632	}
633
634	static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
635	Value Loaded, Value NewVal, Align AddrAlign,
636	AtomicOrdering MemOpOrder, SyncScope::ID SSID,
637	Value &Success, Value &NewLoaded,
638	Instruction *MetadataSrc) {
639	Type *OrigTy = NewVal->getType();
640
641	// This code can go away when cmpxchg supports FP and vector types.
642	assert(!OrigTy->isPointerTy());
643	bool NeedBitcast = OrigTy->isFloatingPointTy() \|\| OrigTy->isVectorTy();
644	if (NeedBitcast) {
645	IntegerType *IntTy = Builder.getIntNTy(N: OrigTy->getPrimitiveSizeInBits());
646	NewVal = Builder.CreateBitCast(V: NewVal, DestTy: IntTy);
647	Loaded = Builder.CreateBitCast(V: Loaded, DestTy: IntTy);
648	}
649
650	AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
651	Ptr: Addr, Cmp: Loaded, New: NewVal, Align: AddrAlign, SuccessOrdering: MemOpOrder,
652	FailureOrdering: AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering: MemOpOrder), SSID);
653	if (MetadataSrc)
654	copyMetadataForAtomic(Dest&: Pair, Source: MetadataSrc);
655
656	Success = Builder.CreateExtractValue(Agg: Pair, Idxs: `1`, Name: "success");
657	NewLoaded = Builder.CreateExtractValue(Agg: Pair, Idxs: `0`, Name: "newloaded");
658
659	if (NeedBitcast)
660	NewLoaded = Builder.CreateBitCast(V: NewLoaded, DestTy: OrigTy);
661	}
662
663	bool AtomicExpandImpl::tryExpandAtomicRMW(AtomicRMWInst *AI) {
664	LLVMContext &Ctx = AI->getModule()->getContext();
665	TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(RMW: AI);
666	switch (Kind) {
667	case TargetLoweringBase::AtomicExpansionKind::None:
668	return false;
669	case TargetLoweringBase::AtomicExpansionKind::LLSC: {
670	unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / `8`;
671	unsigned ValueSize = getAtomicOpSize(RMWI: AI);
672	if (ValueSize < MinCASSize) {
673	expandPartwordAtomicRMW(I: AI,
674	ExpansionKind: TargetLoweringBase::AtomicExpansionKind::LLSC);
675	} else {
676	auto PerformOp = [&](IRBuilderBase &Builder, Value *Loaded) {
677	return buildAtomicRMWValue(Op: AI->getOperation(), Builder, Loaded,
678	Val: AI->getValOperand());
679	};
680	expandAtomicOpToLLSC(I: AI, ResultTy: AI->getType(), Addr: AI->getPointerOperand(),
681	AddrAlign: AI->getAlign(), MemOpOrder: AI->getOrdering(), PerformOp);
682	}
683	return true;
684	}
685	case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
686	unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / `8`;
687	unsigned ValueSize = getAtomicOpSize(RMWI: AI);
688	if (ValueSize < MinCASSize) {
689	expandPartwordAtomicRMW(I: AI,
690	ExpansionKind: TargetLoweringBase::AtomicExpansionKind::CmpXChg);
691	} else {
692	SmallVector<StringRef> SSNs;
693	Ctx.getSyncScopeNames(SSNs);
694	auto MemScope = SSNs [AI->getSyncScopeID()].empty()
695	? "system"
696	: SSNs [AI->getSyncScopeID()];
697	OptimizationRemarkEmitter ORE(AI->getFunction());
698	ORE.emit(RemarkBuilder: [&]() {
699	return OptimizationRemark (DEBUG_TYPE, "Passed", AI)
700	<< "A compare and swap loop was generated for an atomic "
701	<< AI->getOperationName(Op: AI->getOperation()) << " operation at "
702	<< MemScope << " memory scope";
703	});
704	expandAtomicRMWToCmpXchg(AI, CreateCmpXchg: createCmpXchgInstFun);
705	}
706	return true;
707	}
708	case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic: {
709	unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / `8`;
710	unsigned ValueSize = getAtomicOpSize(RMWI: AI);
711	if (ValueSize < MinCASSize) {
712	AtomicRMWInst::BinOp Op = AI->getOperation();
713	// Widen And/Or/Xor and give the target another chance at expanding it.
714	if (Op == AtomicRMWInst::Or \|\| Op == AtomicRMWInst::Xor \|\|
715	Op == AtomicRMWInst::And) {
716	tryExpandAtomicRMW(AI: widenPartwordAtomicRMW(AI));
717	return true;
718	}
719	}
720	expandAtomicRMWToMaskedIntrinsic(AI);
721	return true;
722	}
723	case TargetLoweringBase::AtomicExpansionKind::BitTestIntrinsic: {
724	TLI->emitBitTestAtomicRMWIntrinsic(AI);
725	return true;
726	}
727	case TargetLoweringBase::AtomicExpansionKind::CmpArithIntrinsic: {
728	TLI->emitCmpArithAtomicRMWIntrinsic(AI);
729	return true;
730	}
731	case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
732	return lowerAtomicRMWInst(RMWI: AI);
733	case TargetLoweringBase::AtomicExpansionKind::Expand:
734	TLI->emitExpandAtomicRMW(AI);
735	return true;
736	default:
737	llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
738	}
739	}
740
741	namespace {
742
743	struct PartwordMaskValues {
744	// These three fields are guaranteed to be set by createMaskInstrs.
745	Type WordType = nullptr*;
746	Type ValueType = nullptr*;
747	Type IntValueType = nullptr*;
748	Value AlignedAddr = nullptr*;
749	Align AlignedAddrAlignment;
750	// The remaining fields can be null.
751	Value ShiftAmt = nullptr*;
752	Value Mask = nullptr*;
753	Value Inv_Mask = nullptr*;
754	};
755
756	LLVM_ATTRIBUTE_UNUSED
757	raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) {
758	auto PrintObj = [&O](auto *V) {
759	if (V)
760	O << *V;
761	else
762	O << "nullptr";
763	O << `'\n'`;
764	};
765	O << "PartwordMaskValues {\n";
766	O << " WordType: ";
767	PrintObj (PMV.WordType);
768	O << " ValueType: ";
769	PrintObj (PMV.ValueType);
770	O << " AlignedAddr: ";
771	PrintObj (PMV.AlignedAddr);
772	O << " AlignedAddrAlignment: " << PMV.AlignedAddrAlignment.value() << `'\n'`;
773	O << " ShiftAmt: ";
774	PrintObj (PMV.ShiftAmt);
775	O << " Mask: ";
776	PrintObj (PMV.Mask);
777	O << " Inv_Mask: ";
778	PrintObj (PMV.Inv_Mask);
779	O << "}\n";
780	return O;
781	}
782
783	} // end anonymous namespace
784
785	/// This is a helper function which builds instructions to provide
786	/// values necessary for partword atomic operations. It takes an
787	/// incoming address, Addr, and ValueType, and constructs the address,
788	/// shift-amounts and masks needed to work with a larger value of size
789	/// WordSize.
790	///
791	/// AlignedAddr: Addr rounded down to a multiple of WordSize
792	///
793	/// ShiftAmt: Number of bits to right-shift a WordSize value loaded
794	/// from AlignAddr for it to have the same value as if
795	/// ValueType was loaded from Addr.
796	///
797	/// Mask: Value to mask with the value loaded from AlignAddr to
798	/// include only the part that would've been loaded from Addr.
799	///
800	/// Inv_Mask: The inverse of Mask.
801	static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
802	Instruction I, Type ValueType,
803	Value *Addr, Align AddrAlign,
804	unsigned MinWordSize) {
805	PartwordMaskValues PMV;
806
807	Module *M = I->getModule();
808	LLVMContext &Ctx = M->getContext();
809	const DataLayout &DL = M->getDataLayout();
810	unsigned ValueSize = DL.getTypeStoreSize(Ty: ValueType);
811
812	PMV.ValueType = PMV.IntValueType = ValueType;
813	if (PMV.ValueType->isFloatingPointTy() \|\| PMV.ValueType->isVectorTy())
814	PMV.IntValueType =
815	Type::getIntNTy(C&: Ctx, N: ValueType->getPrimitiveSizeInBits());
816
817	PMV.WordType = MinWordSize > ValueSize ? Type::getIntNTy(C&: Ctx, N: MinWordSize * `8`)
818	: ValueType;
819	if (PMV.ValueType == PMV.WordType) {
820	PMV.AlignedAddr = Addr;
821	PMV.AlignedAddrAlignment = AddrAlign;
822	PMV.ShiftAmt = ConstantInt::get(Ty: PMV.ValueType, V: `0`);
823	PMV.Mask = ConstantInt::get(Ty: PMV.ValueType, V: ~`0`, /isSigned/ IsSigned: true);
824	return PMV;
825	}
826
827	PMV.AlignedAddrAlignment = Align (MinWordSize);
828
829	assert(ValueSize < MinWordSize);
830
831	PointerType *PtrTy = cast<PointerType>(Val: Addr->getType());
832	IntegerType *IntTy = DL.getIndexType(C&: Ctx, AddressSpace: PtrTy->getAddressSpace());
833	Value *PtrLSB;
834
835	if (AddrAlign < MinWordSize) {
836	PMV.AlignedAddr = Builder.CreateIntrinsic(
837	ID: Intrinsic::ptrmask, Types: {PtrTy, IntTy},
838	Args: {Addr, ConstantInt::get(Ty: IntTy, V: ~(uint64_t)(MinWordSize - `1`))}, FMFSource: nullptr,
839	Name: "AlignedAddr");
840
841	Value *AddrInt = Builder.CreatePtrToInt(V: Addr, DestTy: IntTy);
842	PtrLSB = Builder.CreateAnd(LHS: AddrInt, RHS: MinWordSize - `1`, Name: "PtrLSB");
843	} else {
844	// If the alignment is high enough, the LSB are known 0.
845	PMV.AlignedAddr = Addr;
846	PtrLSB = ConstantInt::getNullValue(Ty: IntTy);
847	}
848
849	if (DL.isLittleEndian()) {
850	// turn bytes into bits
851	PMV.ShiftAmt = Builder.CreateShl(LHS: PtrLSB, RHS: `3`);
852	} else {
853	// turn bytes into bits, and count from the other side.
854	PMV.ShiftAmt = Builder.CreateShl(
855	LHS: Builder.CreateXor(LHS: PtrLSB, RHS: MinWordSize - ValueSize), RHS: `3`);
856	}
857
858	PMV.ShiftAmt = Builder.CreateTrunc(V: PMV.ShiftAmt, DestTy: PMV.WordType, Name: "ShiftAmt");
859	PMV.Mask = Builder.CreateShl(
860	LHS: ConstantInt::get(Ty: PMV.WordType, V: (`1` << (ValueSize * `8`)) - `1`), RHS: PMV.ShiftAmt,
861	Name: "Mask");
862
863	PMV.Inv_Mask = Builder.CreateNot(V: PMV.Mask, Name: "Inv_Mask");
864
865	return PMV;
866	}
867
868	static Value extractMaskedValue(IRBuilderBase &Builder, Value WideWord,
869	const PartwordMaskValues &PMV) {
870	assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
871	if (PMV.WordType == PMV.ValueType)
872	return WideWord;
873
874	Value *Shift = Builder.CreateLShr(LHS: WideWord, RHS: PMV.ShiftAmt, Name: "shifted");
875	Value *Trunc = Builder.CreateTrunc(V: Shift, DestTy: PMV.IntValueType, Name: "extracted");
876	return Builder.CreateBitCast(V: Trunc, DestTy: PMV.ValueType);
877	}
878
879	static Value insertMaskedValue(IRBuilderBase &Builder, Value WideWord,
880	Value Updated, const* PartwordMaskValues &PMV) {
881	assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
882	assert(Updated->getType() == PMV.ValueType && "Value type mismatch");
883	if (PMV.WordType == PMV.ValueType)
884	return Updated;
885
886	Updated = Builder.CreateBitCast(V: Updated, DestTy: PMV.IntValueType);
887
888	Value *ZExt = Builder.CreateZExt(V: Updated, DestTy: PMV.WordType, Name: "extended");
889	Value *Shift =
890	Builder.CreateShl(LHS: ZExt, RHS: PMV.ShiftAmt, Name: "shifted", /HasNUW/ true);
891	Value *And = Builder.CreateAnd(LHS: WideWord, RHS: PMV.Inv_Mask, Name: "unmasked");
892	Value *Or = Builder.CreateOr(LHS: And, RHS: Shift, Name: "inserted");
893	return Or;
894	}
895
896	/// Emit IR to implement a masked version of a given atomicrmw
897	/// operation. (That is, only the bits under the Mask should be
898	/// affected by the operation)
899	static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
900	IRBuilderBase &Builder, Value *Loaded,
901	Value Shifted_Inc, Value Inc,
902	const PartwordMaskValues &PMV) {
903	// TODO: update to use
904	// https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge in order
905	// to merge bits from two values without requiring PMV.Inv_Mask.
906	switch (Op) {
907	case AtomicRMWInst::Xchg: {
908	Value *Loaded_MaskOut = Builder.CreateAnd(LHS: Loaded, RHS: PMV.Inv_Mask);
909	Value *FinalVal = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: Shifted_Inc);
910	return FinalVal;
911	}
912	case AtomicRMWInst::Or:
913	case AtomicRMWInst::Xor:
914	case AtomicRMWInst::And:
915	llvm_unreachable("Or/Xor/And handled by widenPartwordAtomicRMW");
916	case AtomicRMWInst::Add:
917	case AtomicRMWInst::Sub:
918	case AtomicRMWInst::Nand: {
919	// The other arithmetic ops need to be masked into place.
920	Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded, Val: Shifted_Inc);
921	Value *NewVal_Masked = Builder.CreateAnd(LHS: NewVal, RHS: PMV.Mask);
922	Value *Loaded_MaskOut = Builder.CreateAnd(LHS: Loaded, RHS: PMV.Inv_Mask);
923	Value *FinalVal = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: NewVal_Masked);
924	return FinalVal;
925	}
926	case AtomicRMWInst::Max:
927	case AtomicRMWInst::Min:
928	case AtomicRMWInst::UMax:
929	case AtomicRMWInst::UMin:
930	case AtomicRMWInst::FAdd:
931	case AtomicRMWInst::FSub:
932	case AtomicRMWInst::FMin:
933	case AtomicRMWInst::FMax:
934	case AtomicRMWInst::FMaximum:
935	case AtomicRMWInst::FMinimum:
936	case AtomicRMWInst::UIncWrap:
937	case AtomicRMWInst::UDecWrap:
938	case AtomicRMWInst::USubCond:
939	case AtomicRMWInst::USubSat: {
940	// Finally, other ops will operate on the full value, so truncate down to
941	// the original size, and expand out again after doing the
942	// operation. Bitcasts will be inserted for FP values.
943	Value *Loaded_Extract = extractMaskedValue(Builder, WideWord: Loaded, PMV);
944	Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded: Loaded_Extract, Val: Inc);
945	Value *FinalVal = insertMaskedValue(Builder, WideWord: Loaded, Updated: NewVal, PMV);
946	return FinalVal;
947	}
948	default:
949	llvm_unreachable("Unknown atomic op");
950	}
951	}
952
953	/// Expand a sub-word atomicrmw operation into an appropriate
954	/// word-sized operation.
955	///
956	/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
957	/// way as a typical atomicrmw expansion. The only difference here is
958	/// that the operation inside of the loop may operate upon only a
959	/// part of the value.
960	void AtomicExpandImpl::expandPartwordAtomicRMW(
961	AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
962	// Widen And/Or/Xor and give the target another chance at expanding it.
963	AtomicRMWInst::BinOp Op = AI->getOperation();
964	if (Op == AtomicRMWInst::Or \|\| Op == AtomicRMWInst::Xor \|\|
965	Op == AtomicRMWInst::And) {
966	tryExpandAtomicRMW(AI: widenPartwordAtomicRMW(AI));
967	return;
968	}
969	AtomicOrdering MemOpOrder = AI->getOrdering();
970	SyncScope::ID SSID = AI->getSyncScopeID();
971
972	ReplacementIRBuilder Builder(AI, *DL);
973
974	PartwordMaskValues PMV =
975	createMaskInstrs(Builder, I: AI, ValueType: AI->getType(), Addr: AI->getPointerOperand(),
976	AddrAlign: AI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / `8`);
977
978	Value ValOperand_Shifted = nullptr*;
979	if (Op == AtomicRMWInst::Xchg \|\| Op == AtomicRMWInst::Add \|\|
980	Op == AtomicRMWInst::Sub \|\| Op == AtomicRMWInst::Nand) {
981	Value *ValOp = Builder.CreateBitCast(V: AI->getValOperand(), DestTy: PMV.IntValueType);
982	ValOperand_Shifted =
983	Builder.CreateShl(LHS: Builder.CreateZExt(V: ValOp, DestTy: PMV.WordType), RHS: PMV.ShiftAmt,
984	Name: "ValOperand_Shifted");
985	}
986
987	auto PerformPartwordOp = [&](IRBuilderBase &Builder, Value *Loaded) {
988	return performMaskedAtomicOp(Op, Builder, Loaded, Shifted_Inc: ValOperand_Shifted,
989	Inc: AI->getValOperand(), PMV);
990	};
991
992	Value *OldResult;
993	if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
994	OldResult = insertRMWCmpXchgLoop(
995	Builder, ResultType: PMV.WordType, Addr: PMV.AlignedAddr, AddrAlign: PMV.AlignedAddrAlignment,
996	MemOpOrder, SSID, PerformOp: PerformPartwordOp, CreateCmpXchg: createCmpXchgInstFun, MetadataSrc: AI);
997	} else {
998	assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
999	OldResult = insertRMWLLSCLoop(Builder, ResultTy: PMV.WordType, Addr: PMV.AlignedAddr,
1000	AddrAlign: PMV.AlignedAddrAlignment, MemOpOrder,
1001	PerformOp: PerformPartwordOp);
1002	}
1003
1004	Value *FinalOldResult = extractMaskedValue(Builder, WideWord: OldResult, PMV);
1005	AI->replaceAllUsesWith(V: FinalOldResult);
1006	AI->eraseFromParent();
1007	}
1008
1009	// Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width.
1010	AtomicRMWInst AtomicExpandImpl::widenPartwordAtomicRMW(AtomicRMWInst AI) {
1011	ReplacementIRBuilder Builder(AI, *DL);
1012	AtomicRMWInst::BinOp Op = AI->getOperation();
1013
1014	assert((Op == AtomicRMWInst::Or \|\| Op == AtomicRMWInst::Xor \|\|
1015	Op == AtomicRMWInst::And) &&
1016	"Unable to widen operation");
1017
1018	PartwordMaskValues PMV =
1019	createMaskInstrs(Builder, I: AI, ValueType: AI->getType(), Addr: AI->getPointerOperand(),
1020	AddrAlign: AI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / `8`);
1021
1022	Value *ValOperand_Shifted =
1023	Builder.CreateShl(LHS: Builder.CreateZExt(V: AI->getValOperand(), DestTy: PMV.WordType),
1024	RHS: PMV.ShiftAmt, Name: "ValOperand_Shifted");
1025
1026	Value *NewOperand;
1027
1028	if (Op == AtomicRMWInst::And)
1029	NewOperand =
1030	Builder.CreateOr(LHS: ValOperand_Shifted, RHS: PMV.Inv_Mask, Name: "AndOperand");
1031	else
1032	NewOperand = ValOperand_Shifted;
1033
1034	AtomicRMWInst *NewAI = Builder.CreateAtomicRMW(
1035	Op, Ptr: PMV.AlignedAddr, Val: NewOperand, Align: PMV.AlignedAddrAlignment,
1036	Ordering: AI->getOrdering(), SSID: AI->getSyncScopeID());
1037
1038	copyMetadataForAtomic(Dest&: NewAI, Source: AI);
1039
1040	Value *FinalOldResult = extractMaskedValue(Builder, WideWord: NewAI, PMV);
1041	AI->replaceAllUsesWith(V: FinalOldResult);
1042	AI->eraseFromParent();
1043	return NewAI;
1044	}
1045
1046	bool AtomicExpandImpl::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
1047	// The basic idea here is that we're expanding a cmpxchg of a
1048	// smaller memory size up to a word-sized cmpxchg. To do this, we
1049	// need to add a retry-loop for strong cmpxchg, so that
1050	// modifications to other parts of the word don't cause a spurious
1051	// failure.
1052
1053	// This generates code like the following:
1054	// [[Setup mask values PMV.]]*
1055	// %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
1056	// %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
1057	// %InitLoaded = load i32 %addr*
1058	// %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
1059	// br partword.cmpxchg.loop
1060	// partword.cmpxchg.loop:
1061	// %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
1062	// [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
1063	// %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
1064	// %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
1065	// %NewCI = cmpxchg i32 %PMV.AlignedAddr, i32 %FullWord_Cmp,*
1066	// i32 %FullWord_NewVal success_ordering failure_ordering
1067	// %OldVal = extractvalue { i32, i1 } %NewCI, 0
1068	// %Success = extractvalue { i32, i1 } %NewCI, 1
1069	// br i1 %Success, label %partword.cmpxchg.end,
1070	// label %partword.cmpxchg.failure
1071	// partword.cmpxchg.failure:
1072	// %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
1073	// %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
1074	// br i1 %ShouldContinue, label %partword.cmpxchg.loop,
1075	// label %partword.cmpxchg.end
1076	// partword.cmpxchg.end:
1077	// %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
1078	// %FinalOldVal = trunc i32 %tmp1 to i8
1079	// %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
1080	// %Res = insertvalue { i8, i1 } %25, i1 %Success, 1
1081
1082	Value *Addr = CI->getPointerOperand();
1083	Value *Cmp = CI->getCompareOperand();
1084	Value *NewVal = CI->getNewValOperand();
1085
1086	BasicBlock *BB = CI->getParent();
1087	Function *F = BB->getParent();
1088	ReplacementIRBuilder Builder(CI, *DL);
1089	LLVMContext &Ctx = Builder.getContext();
1090
1091	BasicBlock *EndBB =
1092	BB->splitBasicBlock(I: CI->getIterator(), BBName: "partword.cmpxchg.end");
1093	auto FailureBB =
1094	BasicBlock::Create(Context&: Ctx, Name: "partword.cmpxchg.failure", Parent: F, InsertBefore: EndBB);
1095	auto LoopBB = BasicBlock::Create(Context&: Ctx, Name: "partword.cmpxchg.loop", Parent: F, InsertBefore: FailureBB);
1096
1097	// The split call above "helpfully" added a branch at the end of BB
1098	// (to the wrong place).
1099	std::prev(x: BB->end())->eraseFromParent();
1100	Builder.SetInsertPoint(BB);
1101
1102	PartwordMaskValues PMV =
1103	createMaskInstrs(Builder, I: CI, ValueType: CI->getCompareOperand()->getType(), Addr,
1104	AddrAlign: CI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / `8`);
1105
1106	// Shift the incoming values over, into the right location in the word.
1107	Value *NewVal_Shifted =
1108	Builder.CreateShl(LHS: Builder.CreateZExt(V: NewVal, DestTy: PMV.WordType), RHS: PMV.ShiftAmt);
1109	Value *Cmp_Shifted =
1110	Builder.CreateShl(LHS: Builder.CreateZExt(V: Cmp, DestTy: PMV.WordType), RHS: PMV.ShiftAmt);
1111
1112	// Load the entire current word, and mask into place the expected and new
1113	// values
1114	LoadInst *InitLoaded = Builder.CreateLoad(Ty: PMV.WordType, Ptr: PMV.AlignedAddr);
1115	InitLoaded->setVolatile(CI->isVolatile());
1116	Value *InitLoaded_MaskOut = Builder.CreateAnd(LHS: InitLoaded, RHS: PMV.Inv_Mask);
1117	Builder.CreateBr(Dest: LoopBB);
1118
1119	// partword.cmpxchg.loop:
1120	Builder.SetInsertPoint(LoopBB);
1121	PHINode *Loaded_MaskOut = Builder.CreatePHI(Ty: PMV.WordType, NumReservedValues: `2`);
1122	Loaded_MaskOut->addIncoming(V: InitLoaded_MaskOut, BB);
1123
1124	// Mask/Or the expected and new values into place in the loaded word.
1125	Value *FullWord_NewVal = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: NewVal_Shifted);
1126	Value *FullWord_Cmp = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: Cmp_Shifted);
1127	AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
1128	Ptr: PMV.AlignedAddr, Cmp: FullWord_Cmp, New: FullWord_NewVal, Align: PMV.AlignedAddrAlignment,
1129	SuccessOrdering: CI->getSuccessOrdering(), FailureOrdering: CI->getFailureOrdering(), SSID: CI->getSyncScopeID());
1130	NewCI->setVolatile(CI->isVolatile());
1131	// When we're building a strong cmpxchg, we need a loop, so you
1132	// might think we could use a weak cmpxchg inside. But, using strong
1133	// allows the below comparison for ShouldContinue, and we're
1134	// expecting the underlying cmpxchg to be a machine instruction,
1135	// which is strong anyways.
1136	NewCI->setWeak(CI->isWeak());
1137
1138	Value *OldVal = Builder.CreateExtractValue(Agg: NewCI, Idxs: `0`);
1139	Value *Success = Builder.CreateExtractValue(Agg: NewCI, Idxs: `1`);
1140
1141	if (CI->isWeak())
1142	Builder.CreateBr(Dest: EndBB);
1143	else
1144	Builder.CreateCondBr(Cond: Success, True: EndBB, False: FailureBB);
1145
1146	// partword.cmpxchg.failure:
1147	Builder.SetInsertPoint(FailureBB);
1148	// Upon failure, verify that the masked-out part of the loaded value
1149	// has been modified. If it didn't, abort the cmpxchg, since the
1150	// masked-in part must've.
1151	Value *OldVal_MaskOut = Builder.CreateAnd(LHS: OldVal, RHS: PMV.Inv_Mask);
1152	Value *ShouldContinue = Builder.CreateICmpNE(LHS: Loaded_MaskOut, RHS: OldVal_MaskOut);
1153	Builder.CreateCondBr(Cond: ShouldContinue, True: LoopBB, False: EndBB);
1154
1155	// Add the second value to the phi from above
1156	Loaded_MaskOut->addIncoming(V: OldVal_MaskOut, BB: FailureBB);
1157
1158	// partword.cmpxchg.end:
1159	Builder.SetInsertPoint(CI);
1160
1161	Value *FinalOldVal = extractMaskedValue(Builder, WideWord: OldVal, PMV);
1162	Value *Res = PoisonValue::get(T: CI->getType());
1163	Res = Builder.CreateInsertValue(Agg: Res, Val: FinalOldVal, Idxs: `0`);
1164	Res = Builder.CreateInsertValue(Agg: Res, Val: Success, Idxs: `1`);
1165
1166	CI->replaceAllUsesWith(V: Res);
1167	CI->eraseFromParent();
1168	return true;
1169	}
1170
1171	void AtomicExpandImpl::expandAtomicOpToLLSC(
1172	Instruction I, Type ResultType, Value *Addr, Align AddrAlign,
1173	AtomicOrdering MemOpOrder,
1174	function_ref<Value (IRBuilderBase &, Value )> PerformOp) {
1175	ReplacementIRBuilder Builder(I, *DL);
1176	Value *Loaded = insertRMWLLSCLoop(Builder, ResultTy: ResultType, Addr, AddrAlign,
1177	MemOpOrder, PerformOp);
1178
1179	I->replaceAllUsesWith(V: Loaded);
1180	I->eraseFromParent();
1181	}
1182
1183	void AtomicExpandImpl::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) {
1184	ReplacementIRBuilder Builder(AI, *DL);
1185
1186	PartwordMaskValues PMV =
1187	createMaskInstrs(Builder, I: AI, ValueType: AI->getType(), Addr: AI->getPointerOperand(),
1188	AddrAlign: AI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / `8`);
1189
1190	// The value operand must be sign-extended for signed min/max so that the
1191	// target's signed comparison instructions can be used. Otherwise, just
1192	// zero-ext.
1193	Instruction::CastOps CastOp = Instruction::ZExt;
1194	AtomicRMWInst::BinOp RMWOp = AI->getOperation();
1195	if (RMWOp == AtomicRMWInst::Max \|\| RMWOp == AtomicRMWInst::Min)
1196	CastOp = Instruction::SExt;
1197
1198	Value *ValOperand_Shifted = Builder.CreateShl(
1199	LHS: Builder.CreateCast(Op: CastOp, V: AI->getValOperand(), DestTy: PMV.WordType),
1200	RHS: PMV.ShiftAmt, Name: "ValOperand_Shifted");
1201	Value *OldResult = TLI->emitMaskedAtomicRMWIntrinsic(
1202	Builder, AI, AlignedAddr: PMV.AlignedAddr, Incr: ValOperand_Shifted, Mask: PMV.Mask, ShiftAmt: PMV.ShiftAmt,
1203	Ord: AI->getOrdering());
1204	Value *FinalOldResult = extractMaskedValue(Builder, WideWord: OldResult, PMV);
1205	AI->replaceAllUsesWith(V: FinalOldResult);
1206	AI->eraseFromParent();
1207	}
1208
1209	void AtomicExpandImpl::expandAtomicCmpXchgToMaskedIntrinsic(
1210	AtomicCmpXchgInst *CI) {
1211	ReplacementIRBuilder Builder(CI, *DL);
1212
1213	PartwordMaskValues PMV = createMaskInstrs(
1214	Builder, I: CI, ValueType: CI->getCompareOperand()->getType(), Addr: CI->getPointerOperand(),
1215	AddrAlign: CI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / `8`);
1216
1217	Value *CmpVal_Shifted = Builder.CreateShl(
1218	LHS: Builder.CreateZExt(V: CI->getCompareOperand(), DestTy: PMV.WordType), RHS: PMV.ShiftAmt,
1219	Name: "CmpVal_Shifted");
1220	Value *NewVal_Shifted = Builder.CreateShl(
1221	LHS: Builder.CreateZExt(V: CI->getNewValOperand(), DestTy: PMV.WordType), RHS: PMV.ShiftAmt,
1222	Name: "NewVal_Shifted");
1223	Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic(
1224	Builder, CI, AlignedAddr: PMV.AlignedAddr, CmpVal: CmpVal_Shifted, NewVal: NewVal_Shifted, Mask: PMV.Mask,
1225	Ord: CI->getMergedOrdering());
1226	Value *FinalOldVal = extractMaskedValue(Builder, WideWord: OldVal, PMV);
1227	Value *Res = PoisonValue::get(T: CI->getType());
1228	Res = Builder.CreateInsertValue(Agg: Res, Val: FinalOldVal, Idxs: `0`);
1229	Value *Success = Builder.CreateICmpEQ(
1230	LHS: CmpVal_Shifted, RHS: Builder.CreateAnd(LHS: OldVal, RHS: PMV.Mask), Name: "Success");
1231	Res = Builder.CreateInsertValue(Agg: Res, Val: Success, Idxs: `1`);
1232
1233	CI->replaceAllUsesWith(V: Res);
1234	CI->eraseFromParent();
1235	}
1236
1237	Value *AtomicExpandImpl::insertRMWLLSCLoop(
1238	IRBuilderBase &Builder, Type ResultTy, Value Addr, Align AddrAlign,
1239	AtomicOrdering MemOpOrder,
1240	function_ref<Value (IRBuilderBase &, Value )> PerformOp) {
1241	LLVMContext &Ctx = Builder.getContext();
1242	BasicBlock *BB = Builder.GetInsertBlock();
1243	Function *F = BB->getParent();
1244
1245	assert(AddrAlign >=
1246	F->getDataLayout().getTypeStoreSize(ResultTy) &&
1247	"Expected at least natural alignment at this point.");
1248
1249	// Given: atomicrmw some_op iN %addr, iN %incr ordering*
1250	//
1251	// The standard expansion we produce is:
1252	// [...]
1253	// atomicrmw.start:
1254	// %loaded = @load.linked(%addr)
1255	// %new = some_op iN %loaded, %incr
1256	// %stored = @store_conditional(%new, %addr)
1257	// %try_again = icmp i32 ne %stored, 0
1258	// br i1 %try_again, label %loop, label %atomicrmw.end
1259	// atomicrmw.end:
1260	// [...]
1261	BasicBlock *ExitBB =
1262	BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
1263	BasicBlock *LoopBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.start", Parent: F, InsertBefore: ExitBB);
1264
1265	// The split call above "helpfully" added a branch at the end of BB (to the
1266	// wrong place).
1267	std::prev(x: BB->end())->eraseFromParent();
1268	Builder.SetInsertPoint(BB);
1269	Builder.CreateBr(Dest: LoopBB);
1270
1271	// Start the main loop block now that we've taken care of the preliminaries.
1272	Builder.SetInsertPoint(LoopBB);
1273	Value *Loaded = TLI->emitLoadLinked(Builder, ValueTy: ResultTy, Addr, Ord: MemOpOrder);
1274
1275	Value *NewVal = PerformOp (Builder, Loaded);
1276
1277	Value *StoreSuccess =
1278	TLI->emitStoreConditional(Builder, Val: NewVal, Addr, Ord: MemOpOrder);
1279	Value *TryAgain = Builder.CreateICmpNE(
1280	LHS: StoreSuccess, RHS: ConstantInt::get(Ty: IntegerType::get(C&: Ctx, NumBits: `32`), V: `0`), Name: "tryagain");
1281	Builder.CreateCondBr(Cond: TryAgain, True: LoopBB, False: ExitBB);
1282
1283	Builder.SetInsertPoint(TheBB: ExitBB, IP: ExitBB->begin());
1284	return Loaded;
1285	}
1286
1287	/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
1288	/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
1289	/// IR. As a migration step, we convert back to what use to be the standard
1290	/// way to represent a pointer cmpxchg so that we can update backends one by
1291	/// one.
1292	AtomicCmpXchgInst *
1293	AtomicExpandImpl::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
1294	auto *M = CI->getModule();
1295	Type *NewTy = getCorrespondingIntegerType(T: CI->getCompareOperand()->getType(),
1296	DL: M->getDataLayout());
1297
1298	ReplacementIRBuilder Builder(CI, *DL);
1299
1300	Value *Addr = CI->getPointerOperand();
1301
1302	Value *NewCmp = Builder.CreatePtrToInt(V: CI->getCompareOperand(), DestTy: NewTy);
1303	Value *NewNewVal = Builder.CreatePtrToInt(V: CI->getNewValOperand(), DestTy: NewTy);
1304
1305	auto *NewCI = Builder.CreateAtomicCmpXchg(
1306	Ptr: Addr, Cmp: NewCmp, New: NewNewVal, Align: CI->getAlign(), SuccessOrdering: CI->getSuccessOrdering(),
1307	FailureOrdering: CI->getFailureOrdering(), SSID: CI->getSyncScopeID());
1308	NewCI->setVolatile(CI->isVolatile());
1309	NewCI->setWeak(CI->isWeak());
1310	LLVM_DEBUG(dbgs() << "Replaced " << CI << " with " << NewCI << "\n");
1311
1312	Value *OldVal = Builder.CreateExtractValue(Agg: NewCI, Idxs: `0`);
1313	Value *Succ = Builder.CreateExtractValue(Agg: NewCI, Idxs: `1`);
1314
1315	OldVal = Builder.CreateIntToPtr(V: OldVal, DestTy: CI->getCompareOperand()->getType());
1316
1317	Value *Res = PoisonValue::get(T: CI->getType());
1318	Res = Builder.CreateInsertValue(Agg: Res, Val: OldVal, Idxs: `0`);
1319	Res = Builder.CreateInsertValue(Agg: Res, Val: Succ, Idxs: `1`);
1320
1321	CI->replaceAllUsesWith(V: Res);
1322	CI->eraseFromParent();
1323	return NewCI;
1324	}
1325
1326	bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
1327	AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
1328	AtomicOrdering FailureOrder = CI->getFailureOrdering();
1329	Value *Addr = CI->getPointerOperand();
1330	BasicBlock *BB = CI->getParent();
1331	Function *F = BB->getParent();
1332	LLVMContext &Ctx = F->getContext();
1333	// If shouldInsertFencesForAtomic() returns true, then the target does not
1334	// want to deal with memory orders, and emitLeading/TrailingFence should take
1335	// care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
1336	// should preserve the ordering.
1337	bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(I: CI);
1338	AtomicOrdering MemOpOrder = ShouldInsertFencesForAtomic
1339	? AtomicOrdering::Monotonic
1340	: CI->getMergedOrdering();
1341
1342	// In implementations which use a barrier to achieve release semantics, we can
1343	// delay emitting this barrier until we know a store is actually going to be
1344	// attempted. The cost of this delay is that we need 2 copies of the block
1345	// emitting the load-linked, affecting code size.
1346	//
1347	// Ideally, this logic would be unconditional except for the minsize check
1348	// since in other cases the extra blocks naturally collapse down to the
1349	// minimal loop. Unfortunately, this puts too much stress on later
1350	// optimisations so we avoid emitting the extra logic in those cases too.
1351	bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
1352	SuccessOrder != AtomicOrdering::Monotonic &&
1353	SuccessOrder != AtomicOrdering::Acquire &&
1354	!F->hasMinSize();
1355
1356	// There's no overhead for sinking the release barrier in a weak cmpxchg, so
1357	// do it even on minsize.
1358	bool UseUnconditionalReleaseBarrier = F->hasMinSize() && !CI->isWeak();
1359
1360	// Given: cmpxchg some_op iN %addr, iN %desired, iN %new success_ord fail_ord*
1361	//
1362	// The full expansion we produce is:
1363	// [...]
1364	// %aligned.addr = ...
1365	// cmpxchg.start:
1366	// %unreleasedload = @load.linked(%aligned.addr)
1367	// %unreleasedload.extract = extract value from %unreleasedload
1368	// %should_store = icmp eq %unreleasedload.extract, %desired
1369	// br i1 %should_store, label %cmpxchg.releasingstore,
1370	// label %cmpxchg.nostore
1371	// cmpxchg.releasingstore:
1372	// fence?
1373	// br label cmpxchg.trystore
1374	// cmpxchg.trystore:
1375	// %loaded.trystore = phi [%unreleasedload, %cmpxchg.releasingstore],
1376	// [%releasedload, %cmpxchg.releasedload]
1377	// %updated.new = insert %new into %loaded.trystore
1378	// %stored = @store_conditional(%updated.new, %aligned.addr)
1379	// %success = icmp eq i32 %stored, 0
1380	// br i1 %success, label %cmpxchg.success,
1381	// label %cmpxchg.releasedload/%cmpxchg.failure
1382	// cmpxchg.releasedload:
1383	// %releasedload = @load.linked(%aligned.addr)
1384	// %releasedload.extract = extract value from %releasedload
1385	// %should_store = icmp eq %releasedload.extract, %desired
1386	// br i1 %should_store, label %cmpxchg.trystore,
1387	// label %cmpxchg.failure
1388	// cmpxchg.success:
1389	// fence?
1390	// br label %cmpxchg.end
1391	// cmpxchg.nostore:
1392	// %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
1393	// [%releasedload,
1394	// %cmpxchg.releasedload/%cmpxchg.trystore]
1395	// @load_linked_fail_balance()?
1396	// br label %cmpxchg.failure
1397	// cmpxchg.failure:
1398	// fence?
1399	// br label %cmpxchg.end
1400	// cmpxchg.end:
1401	// %loaded.exit = phi [%loaded.nostore, %cmpxchg.failure],
1402	// [%loaded.trystore, %cmpxchg.trystore]
1403	// %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
1404	// %loaded = extract value from %loaded.exit
1405	// %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
1406	// %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
1407	// [...]
1408	BasicBlock *ExitBB = BB->splitBasicBlock(I: CI->getIterator(), BBName: "cmpxchg.end");
1409	auto FailureBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.failure", Parent: F, InsertBefore: ExitBB);
1410	auto NoStoreBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.nostore", Parent: F, InsertBefore: FailureBB);
1411	auto SuccessBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.success", Parent: F, InsertBefore: NoStoreBB);
1412	auto ReleasedLoadBB =
1413	BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.releasedload", Parent: F, InsertBefore: SuccessBB);
1414	auto TryStoreBB =
1415	BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.trystore", Parent: F, InsertBefore: ReleasedLoadBB);
1416	auto ReleasingStoreBB =
1417	BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.fencedstore", Parent: F, InsertBefore: TryStoreBB);
1418	auto StartBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.start", Parent: F, InsertBefore: ReleasingStoreBB);
1419
1420	ReplacementIRBuilder Builder(CI, *DL);
1421
1422	// The split call above "helpfully" added a branch at the end of BB (to the
1423	// wrong place), but we might want a fence too. It's easiest to just remove
1424	// the branch entirely.
1425	std::prev(x: BB->end())->eraseFromParent();
1426	Builder.SetInsertPoint(BB);
1427	if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
1428	TLI->emitLeadingFence(Builder, Inst: CI, Ord: SuccessOrder);
1429
1430	PartwordMaskValues PMV =
1431	createMaskInstrs(Builder, I: CI, ValueType: CI->getCompareOperand()->getType(), Addr,
1432	AddrAlign: CI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / `8`);
1433	Builder.CreateBr(Dest: StartBB);
1434
1435	// Start the main loop block now that we've taken care of the preliminaries.
1436	Builder.SetInsertPoint(StartBB);
1437	Value *UnreleasedLoad =
1438	TLI->emitLoadLinked(Builder, ValueTy: PMV.WordType, Addr: PMV.AlignedAddr, Ord: MemOpOrder);
1439	Value *UnreleasedLoadExtract =
1440	extractMaskedValue(Builder, WideWord: UnreleasedLoad, PMV);
1441	Value *ShouldStore = Builder.CreateICmpEQ(
1442	LHS: UnreleasedLoadExtract, RHS: CI->getCompareOperand(), Name: "should_store");
1443
1444	// If the cmpxchg doesn't actually need any ordering when it fails, we can
1445	// jump straight past that fence instruction (if it exists).
1446	Builder.CreateCondBr(Cond: ShouldStore, True: ReleasingStoreBB, False: NoStoreBB);
1447
1448	Builder.SetInsertPoint(ReleasingStoreBB);
1449	if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
1450	TLI->emitLeadingFence(Builder, Inst: CI, Ord: SuccessOrder);
1451	Builder.CreateBr(Dest: TryStoreBB);
1452
1453	Builder.SetInsertPoint(TryStoreBB);
1454	PHINode *LoadedTryStore =
1455	Builder.CreatePHI(Ty: PMV.WordType, NumReservedValues: `2`, Name: "loaded.trystore");
1456	LoadedTryStore->addIncoming(V: UnreleasedLoad, BB: ReleasingStoreBB);
1457	Value *NewValueInsert =
1458	insertMaskedValue(Builder, WideWord: LoadedTryStore, Updated: CI->getNewValOperand(), PMV);
1459	Value *StoreSuccess = TLI->emitStoreConditional(Builder, Val: NewValueInsert,
1460	Addr: PMV.AlignedAddr, Ord: MemOpOrder);
1461	StoreSuccess = Builder.CreateICmpEQ(
1462	LHS: StoreSuccess, RHS: ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: `0`), Name: "success");
1463	BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
1464	Builder.CreateCondBr(Cond: StoreSuccess, True: SuccessBB,
1465	False: CI->isWeak() ? FailureBB : RetryBB);
1466
1467	Builder.SetInsertPoint(ReleasedLoadBB);
1468	Value *SecondLoad;
1469	if (HasReleasedLoadBB) {
1470	SecondLoad =
1471	TLI->emitLoadLinked(Builder, ValueTy: PMV.WordType, Addr: PMV.AlignedAddr, Ord: MemOpOrder);
1472	Value *SecondLoadExtract = extractMaskedValue(Builder, WideWord: SecondLoad, PMV);
1473	ShouldStore = Builder.CreateICmpEQ(LHS: SecondLoadExtract,
1474	RHS: CI->getCompareOperand(), Name: "should_store");
1475
1476	// If the cmpxchg doesn't actually need any ordering when it fails, we can
1477	// jump straight past that fence instruction (if it exists).
1478	Builder.CreateCondBr(Cond: ShouldStore, True: TryStoreBB, False: NoStoreBB);
1479	// Update PHI node in TryStoreBB.
1480	LoadedTryStore->addIncoming(V: SecondLoad, BB: ReleasedLoadBB);
1481	} else
1482	Builder.CreateUnreachable();
1483
1484	// Make sure later instructions don't get reordered with a fence if
1485	// necessary.
1486	Builder.SetInsertPoint(SuccessBB);
1487	if (ShouldInsertFencesForAtomic \|\|
1488	TLI->shouldInsertTrailingFenceForAtomicStore(I: CI))
1489	TLI->emitTrailingFence(Builder, Inst: CI, Ord: SuccessOrder);
1490	Builder.CreateBr(Dest: ExitBB);
1491
1492	Builder.SetInsertPoint(NoStoreBB);
1493	PHINode *LoadedNoStore =
1494	Builder.CreatePHI(Ty: UnreleasedLoad->getType(), NumReservedValues: `2`, Name: "loaded.nostore");
1495	LoadedNoStore->addIncoming(V: UnreleasedLoad, BB: StartBB);
1496	if (HasReleasedLoadBB)
1497	LoadedNoStore->addIncoming(V: SecondLoad, BB: ReleasedLoadBB);
1498
1499	// In the failing case, where we don't execute the store-conditional, the
1500	// target might want to balance out the load-linked with a dedicated
1501	// instruction (e.g., on ARM, clearing the exclusive monitor).
1502	TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
1503	Builder.CreateBr(Dest: FailureBB);
1504
1505	Builder.SetInsertPoint(FailureBB);
1506	PHINode *LoadedFailure =
1507	Builder.CreatePHI(Ty: UnreleasedLoad->getType(), NumReservedValues: `2`, Name: "loaded.failure");
1508	LoadedFailure->addIncoming(V: LoadedNoStore, BB: NoStoreBB);
1509	if (CI->isWeak())
1510	LoadedFailure->addIncoming(V: LoadedTryStore, BB: TryStoreBB);
1511	if (ShouldInsertFencesForAtomic)
1512	TLI->emitTrailingFence(Builder, Inst: CI, Ord: FailureOrder);
1513	Builder.CreateBr(Dest: ExitBB);
1514
1515	// Finally, we have control-flow based knowledge of whether the cmpxchg
1516	// succeeded or not. We expose this to later passes by converting any
1517	// subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
1518	// PHI.
1519	Builder.SetInsertPoint(TheBB: ExitBB, IP: ExitBB->begin());
1520	PHINode *LoadedExit =
1521	Builder.CreatePHI(Ty: UnreleasedLoad->getType(), NumReservedValues: `2`, Name: "loaded.exit");
1522	LoadedExit->addIncoming(V: LoadedTryStore, BB: SuccessBB);
1523	LoadedExit->addIncoming(V: LoadedFailure, BB: FailureBB);
1524	PHINode *Success = Builder.CreatePHI(Ty: Type::getInt1Ty(C&: Ctx), NumReservedValues: `2`, Name: "success");
1525	Success->addIncoming(V: ConstantInt::getTrue(Context&: Ctx), BB: SuccessBB);
1526	Success->addIncoming(V: ConstantInt::getFalse(Context&: Ctx), BB: FailureBB);
1527
1528	// This is the "exit value" from the cmpxchg expansion. It may be of
1529	// a type wider than the one in the cmpxchg instruction.
1530	Value *LoadedFull = LoadedExit;
1531
1532	Builder.SetInsertPoint(TheBB: ExitBB, IP: std::next(x: Success->getIterator()));
1533	Value *Loaded = extractMaskedValue(Builder, WideWord: LoadedFull, PMV);
1534
1535	// Look for any users of the cmpxchg that are just comparing the loaded value
1536	// against the desired one, and replace them with the CFG-derived version.
1537	SmallVector<ExtractValueInst *, `2`> PrunedInsts;
1538	for (auto *User : CI->users()) {
1539	ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Val: User);
1540	if (!EV)
1541	continue;
1542
1543	assert(EV->getNumIndices() == `1` && EV->getIndices()[`0`] <= `1` &&
1544	"weird extraction from { iN, i1 }");
1545
1546	if (EV->getIndices()[`0`] == `0`)
1547	EV->replaceAllUsesWith(V: Loaded);
1548	else
1549	EV->replaceAllUsesWith(V: Success);
1550
1551	PrunedInsts.push_back(Elt: EV);
1552	}
1553
1554	// We can remove the instructions now we're no longer iterating through them.
1555	for (auto *EV : PrunedInsts)
1556	EV->eraseFromParent();
1557
1558	if (!CI->use_empty()) {
1559	// Some use of the full struct return that we don't understand has happened,
1560	// so we've got to reconstruct it properly.
1561	Value *Res;
1562	Res = Builder.CreateInsertValue(Agg: PoisonValue::get(T: CI->getType()), Val: Loaded, Idxs: `0`);
1563	Res = Builder.CreateInsertValue(Agg: Res, Val: Success, Idxs: `1`);
1564
1565	CI->replaceAllUsesWith(V: Res);
1566	}
1567
1568	CI->eraseFromParent();
1569	return true;
1570	}
1571
1572	bool AtomicExpandImpl::isIdempotentRMW(AtomicRMWInst *RMWI) {
1573	// TODO: Add floating point support.
1574	auto C = dyn_cast<ConstantInt>(Val: RMWI->getValOperand());
1575	if (!C)
1576	return false;
1577
1578	switch (RMWI->getOperation()) {
1579	case AtomicRMWInst::Add:
1580	case AtomicRMWInst::Sub:
1581	case AtomicRMWInst::Or:
1582	case AtomicRMWInst::Xor:
1583	return C->isZero();
1584	case AtomicRMWInst::And:
1585	return C->isMinusOne();
1586	case AtomicRMWInst::Min:
1587	return C->isMaxValue(IsSigned: true);
1588	case AtomicRMWInst::Max:
1589	return C->isMinValue(IsSigned: true);
1590	case AtomicRMWInst::UMin:
1591	return C->isMaxValue(IsSigned: false);
1592	case AtomicRMWInst::UMax:
1593	return C->isMinValue(IsSigned: false);
1594	default:
1595	return false;
1596	}
1597	}
1598
1599	bool AtomicExpandImpl::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
1600	if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
1601	tryExpandAtomicLoad(LI: ResultingLoad);
1602	return true;
1603	}
1604	return false;
1605	}
1606
1607	Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
1608	IRBuilderBase &Builder, Type ResultTy, Value Addr, Align AddrAlign,
1609	AtomicOrdering MemOpOrder, SyncScope::ID SSID,
1610	function_ref<Value (IRBuilderBase &, Value )> PerformOp,
1611	CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc) {
1612	LLVMContext &Ctx = Builder.getContext();
1613	BasicBlock *BB = Builder.GetInsertBlock();
1614	Function *F = BB->getParent();
1615
1616	// Given: atomicrmw some_op iN %addr, iN %incr ordering*
1617	//
1618	// The standard expansion we produce is:
1619	// [...]
1620	// %init_loaded = load atomic iN %addr*
1621	// br label %loop
1622	// loop:
1623	// %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
1624	// %new = some_op iN %loaded, %incr
1625	// %pair = cmpxchg iN %addr, iN %loaded, iN %new*
1626	// %new_loaded = extractvalue { iN, i1 } %pair, 0
1627	// %success = extractvalue { iN, i1 } %pair, 1
1628	// br i1 %success, label %atomicrmw.end, label %loop
1629	// atomicrmw.end:
1630	// [...]
1631	BasicBlock *ExitBB =
1632	BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
1633	BasicBlock *LoopBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.start", Parent: F, InsertBefore: ExitBB);
1634
1635	// The split call above "helpfully" added a branch at the end of BB (to the
1636	// wrong place), but we want a load. It's easiest to just remove
1637	// the branch entirely.
1638	std::prev(x: BB->end())->eraseFromParent();
1639	Builder.SetInsertPoint(BB);
1640	LoadInst *InitLoaded = Builder.CreateAlignedLoad(Ty: ResultTy, Ptr: Addr, Align: AddrAlign);
1641	Builder.CreateBr(Dest: LoopBB);
1642
1643	// Start the main loop block now that we've taken care of the preliminaries.
1644	Builder.SetInsertPoint(LoopBB);
1645	PHINode *Loaded = Builder.CreatePHI(Ty: ResultTy, NumReservedValues: `2`, Name: "loaded");
1646	Loaded->addIncoming(V: InitLoaded, BB);
1647
1648	Value *NewVal = PerformOp (Builder, Loaded);
1649
1650	Value NewLoaded = nullptr*;
1651	Value Success = nullptr*;
1652
1653	CreateCmpXchg (Builder, Addr, Loaded, NewVal, AddrAlign,
1654	MemOpOrder == AtomicOrdering::Unordered
1655	? AtomicOrdering::Monotonic
1656	: MemOpOrder,
1657	SSID, Success, NewLoaded, MetadataSrc);
1658	assert(Success && NewLoaded);
1659
1660	Loaded->addIncoming(V: NewLoaded, BB: LoopBB);
1661
1662	Builder.CreateCondBr(Cond: Success, True: ExitBB, False: LoopBB);
1663
1664	Builder.SetInsertPoint(TheBB: ExitBB, IP: ExitBB->begin());
1665	return NewLoaded;
1666	}
1667
1668	bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
1669	unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / `8`;
1670	unsigned ValueSize = getAtomicOpSize(CASI: CI);
1671
1672	switch (TLI->shouldExpandAtomicCmpXchgInIR(AI: CI)) {
1673	default:
1674	llvm_unreachable("Unhandled case in tryExpandAtomicCmpXchg");
1675	case TargetLoweringBase::AtomicExpansionKind::None:
1676	if (ValueSize < MinCASSize)
1677	return expandPartwordCmpXchg(CI);
1678	return false;
1679	case TargetLoweringBase::AtomicExpansionKind::LLSC: {
1680	return expandAtomicCmpXchg(CI);
1681	}
1682	case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic:
1683	expandAtomicCmpXchgToMaskedIntrinsic(CI);
1684	return true;
1685	case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
1686	return lowerAtomicCmpXchgInst(CXI: CI);
1687	case TargetLoweringBase::AtomicExpansionKind::Expand: {
1688	TLI->emitExpandAtomicCmpXchg(CI);
1689	return true;
1690	}
1691	}
1692	}
1693
1694	// Note: This function is exposed externally by AtomicExpandUtils.h
1695	bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
1696	CreateCmpXchgInstFun CreateCmpXchg) {
1697	ReplacementIRBuilder Builder(AI, AI->getDataLayout());
1698	Builder.setIsFPConstrained(
1699	AI->getFunction()->hasFnAttribute(Kind: Attribute::StrictFP));
1700
1701	// FIXME: If FP exceptions are observable, we should force them off for the
1702	// loop for the FP atomics.
1703	Value *Loaded = AtomicExpandImpl::insertRMWCmpXchgLoop(
1704	Builder, ResultTy: AI->getType(), Addr: AI->getPointerOperand(), AddrAlign: AI->getAlign(),
1705	MemOpOrder: AI->getOrdering(), SSID: AI->getSyncScopeID(),
1706	PerformOp: [&](IRBuilderBase &Builder, Value *Loaded) {
1707	return buildAtomicRMWValue(Op: AI->getOperation(), Builder, Loaded,
1708	Val: AI->getValOperand());
1709	},
1710	CreateCmpXchg, /MetadataSrc=/AI);
1711
1712	AI->replaceAllUsesWith(V: Loaded);
1713	AI->eraseFromParent();
1714	return true;
1715	}
1716
1717	// In order to use one of the sized library calls such as
1718	// __atomic_fetch_add_4, the alignment must be sufficient, the size
1719	// must be one of the potentially-specialized sizes, and the value
1720	// type must actually exist in C on the target (otherwise, the
1721	// function wouldn't actually be defined.)
1722	static bool canUseSizedAtomicCall(unsigned Size, Align Alignment,
1723	const DataLayout &DL) {
1724	// TODO: "LargestSize" is an approximation for "largest type that
1725	// you can express in C". It seems to be the case that int128 is
1726	// supported on all 64-bit platforms, otherwise only up to 64-bit
1727	// integers are supported. If we get this wrong, then we'll try to
1728	// call a sized libcall that doesn't actually exist. There should
1729	// really be some more reliable way in LLVM of determining integer
1730	// sizes which are valid in the target's C ABI...
1731	unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= `64` ? `16` : `8`;
1732	return Alignment >= Size &&
1733	(Size == `1` \|\| Size == `2` \|\| Size == `4` \|\| Size == `8` \|\| Size == `16`) &&
1734	Size <= LargestSize;
1735	}
1736
1737	void AtomicExpandImpl::expandAtomicLoadToLibcall(LoadInst *I) {
1738	static const RTLIB::Libcall Libcalls[`6`] = {
1739	RTLIB::ATOMIC_LOAD, RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
1740	RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
1741	unsigned Size = getAtomicOpSize(LI: I);
1742
1743	bool expanded = expandAtomicOpToLibcall(
1744	I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: nullptr, CASExpected: nullptr,
1745	Ordering: I->getOrdering(), Ordering2: AtomicOrdering::NotAtomic, Libcalls);
1746	if (!expanded)
1747	report_fatal_error(reason: "expandAtomicOpToLibcall shouldn't fail for Load");
1748	}
1749
1750	void AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) {
1751	static const RTLIB::Libcall Libcalls[`6`] = {
1752	RTLIB::ATOMIC_STORE, RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
1753	RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
1754	unsigned Size = getAtomicOpSize(SI: I);
1755
1756	bool expanded = expandAtomicOpToLibcall(
1757	I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: I->getValueOperand(),
1758	CASExpected: nullptr, Ordering: I->getOrdering(), Ordering2: AtomicOrdering::NotAtomic, Libcalls);
1759	if (!expanded)
1760	report_fatal_error(reason: "expandAtomicOpToLibcall shouldn't fail for Store");
1761	}
1762
1763	void AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
1764	static const RTLIB::Libcall Libcalls[`6`] = {
1765	RTLIB::ATOMIC_COMPARE_EXCHANGE, RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
1766	RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
1767	RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
1768	unsigned Size = getAtomicOpSize(CASI: I);
1769
1770	bool expanded = expandAtomicOpToLibcall(
1771	I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: I->getNewValOperand(),
1772	CASExpected: I->getCompareOperand(), Ordering: I->getSuccessOrdering(), Ordering2: I->getFailureOrdering(),
1773	Libcalls);
1774	if (!expanded)
1775	report_fatal_error(reason: "expandAtomicOpToLibcall shouldn't fail for CAS");
1776	}
1777
1778	static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
1779	static const RTLIB::Libcall LibcallsXchg[`6`] = {
1780	RTLIB::ATOMIC_EXCHANGE, RTLIB::ATOMIC_EXCHANGE_1,
1781	RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
1782	RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
1783	static const RTLIB::Libcall LibcallsAdd[`6`] = {
1784	RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_ADD_1,
1785	RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
1786	RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
1787	static const RTLIB::Libcall LibcallsSub[`6`] = {
1788	RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_SUB_1,
1789	RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
1790	RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
1791	static const RTLIB::Libcall LibcallsAnd[`6`] = {
1792	RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_AND_1,
1793	RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
1794	RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
1795	static const RTLIB::Libcall LibcallsOr[`6`] = {
1796	RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_OR_1,
1797	RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
1798	RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
1799	static const RTLIB::Libcall LibcallsXor[`6`] = {
1800	RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_XOR_1,
1801	RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
1802	RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
1803	static const RTLIB::Libcall LibcallsNand[`6`] = {
1804	RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_NAND_1,
1805	RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
1806	RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};
1807
1808	switch (Op) {
1809	case AtomicRMWInst::BAD_BINOP:
1810	llvm_unreachable("Should not have BAD_BINOP.");
1811	case AtomicRMWInst::Xchg:
1812	return ArrayRef(LibcallsXchg);
1813	case AtomicRMWInst::Add:
1814	return ArrayRef(LibcallsAdd);
1815	case AtomicRMWInst::Sub:
1816	return ArrayRef(LibcallsSub);
1817	case AtomicRMWInst::And:
1818	return ArrayRef(LibcallsAnd);
1819	case AtomicRMWInst::Or:
1820	return ArrayRef(LibcallsOr);
1821	case AtomicRMWInst::Xor:
1822	return ArrayRef(LibcallsXor);
1823	case AtomicRMWInst::Nand:
1824	return ArrayRef(LibcallsNand);
1825	case AtomicRMWInst::Max:
1826	case AtomicRMWInst::Min:
1827	case AtomicRMWInst::UMax:
1828	case AtomicRMWInst::UMin:
1829	case AtomicRMWInst::FMax:
1830	case AtomicRMWInst::FMin:
1831	case AtomicRMWInst::FMaximum:
1832	case AtomicRMWInst::FMinimum:
1833	case AtomicRMWInst::FAdd:
1834	case AtomicRMWInst::FSub:
1835	case AtomicRMWInst::UIncWrap:
1836	case AtomicRMWInst::UDecWrap:
1837	case AtomicRMWInst::USubCond:
1838	case AtomicRMWInst::USubSat:
1839	// No atomic libcalls are available for these.
1840	return {};
1841	}
1842	llvm_unreachable("Unexpected AtomicRMW operation.");
1843	}
1844
1845	void AtomicExpandImpl::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
1846	ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(Op: I->getOperation());
1847
1848	unsigned Size = getAtomicOpSize(RMWI: I);
1849
1850	bool Success = false;
1851	if (!Libcalls.empty())
1852	Success = expandAtomicOpToLibcall(
1853	I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: I->getValOperand(),
1854	CASExpected: nullptr, Ordering: I->getOrdering(), Ordering2: AtomicOrdering::NotAtomic, Libcalls);
1855
1856	// The expansion failed: either there were no libcalls at all for
1857	// the operation (min/max), or there were only size-specialized
1858	// libcalls (add/sub/etc) and we needed a generic. So, expand to a
1859	// CAS libcall, via a CAS loop, instead.
1860	if (!Success) {
1861	expandAtomicRMWToCmpXchg(
1862	AI: I, CreateCmpXchg: [this](IRBuilderBase &Builder, Value Addr, Value Loaded,
1863	Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder,
1864	SyncScope::ID SSID, Value &Success, Value &NewLoaded,
1865	Instruction *MetadataSrc) {
1866	// Create the CAS instruction normally...
1867	AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
1868	Ptr: Addr, Cmp: Loaded, New: NewVal, Align: Alignment, SuccessOrdering: MemOpOrder,
1869	FailureOrdering: AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering: MemOpOrder), SSID);
1870	if (MetadataSrc)
1871	copyMetadataForAtomic(Dest&: Pair, Source: MetadataSrc);
1872
1873	Success = Builder.CreateExtractValue(Agg: Pair, Idxs: `1`, Name: "success");
1874	NewLoaded = Builder.CreateExtractValue(Agg: Pair, Idxs: `0`, Name: "newloaded");
1875
1876	// ...and then expand the CAS into a libcall.
1877	expandAtomicCASToLibcall(I: Pair);
1878	});
1879	}
1880	}
1881
1882	// A helper routine for the above expandAtomicToLibcall functions.*
1883	//
1884	// 'Libcalls' contains an array of enum values for the particular
1885	// ATOMIC libcalls to be emitted. All of the other arguments besides
1886	// 'I' are extracted from the Instruction subclass by the
1887	// caller. Depending on the particular call, some will be null.
1888	bool AtomicExpandImpl::expandAtomicOpToLibcall(
1889	Instruction I, unsigned* Size, Align Alignment, Value *PointerOperand,
1890	Value ValueOperand, Value CASExpected, AtomicOrdering Ordering,
1891	AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
1892	assert(Libcalls.size() == `6`);
1893
1894	LLVMContext &Ctx = I->getContext();
1895	Module *M = I->getModule();
1896	const DataLayout &DL = M->getDataLayout();
1897	IRBuilder<> Builder(I);
1898	IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
1899
1900	bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
1901	Type SizedIntTy = Type::getIntNTy(C&: Ctx, N: Size `8`);
1902
1903	const Align AllocaAlignment = DL.getPrefTypeAlign(Ty: SizedIntTy);
1904
1905	// TODO: the "order" argument type is "int", not int32. So
1906	// getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
1907	ConstantInt *SizeVal64 = ConstantInt::get(Ty: Type::getInt64Ty(C&: Ctx), V: Size);
1908	assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
1909	Constant *OrderingVal =
1910	ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: (int)toCABI(AO: Ordering));
1911	Constant Ordering2Val = nullptr*;
1912	if (CASExpected) {
1913	assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
1914	Ordering2Val =
1915	ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: (int)toCABI(AO: Ordering2));
1916	}
1917	bool HasResult = I->getType() != Type::getVoidTy(C&: Ctx);
1918
1919	RTLIB::Libcall RTLibType;
1920	if (UseSizedLibcall) {
1921	switch (Size) {
1922	case `1`:
1923	RTLibType = Libcalls [`1`];
1924	break;
1925	case `2`:
1926	RTLibType = Libcalls [`2`];
1927	break;
1928	case `4`:
1929	RTLibType = Libcalls [`3`];
1930	break;
1931	case `8`:
1932	RTLibType = Libcalls [`4`];
1933	break;
1934	case `16`:
1935	RTLibType = Libcalls [`5`];
1936	break;
1937	}
1938	} else if (Libcalls [`0`] != RTLIB::UNKNOWN_LIBCALL) {
1939	RTLibType = Libcalls [`0`];
1940	} else {
1941	// Can't use sized function, and there's no generic for this
1942	// operation, so give up.
1943	return false;
1944	}
1945
1946	if (!TLI->getLibcallName(Call: RTLibType)) {
1947	// This target does not implement the requested atomic libcall so give up.
1948	return false;
1949	}
1950
1951	// Build up the function call. There's two kinds. First, the sized
1952	// variants. These calls are going to be one of the following (with
1953	// N=1,2,4,8,16):
1954	// iN __atomic_load_N(iN ptr, int ordering)*
1955	// void __atomic_store_N(iN ptr, iN val, int ordering)*
1956	// iN __atomic_{exchange\|fetch_}_N(iN ptr, iN val, int ordering)
1957	// bool __atomic_compare_exchange_N(iN ptr, iN expected, iN desired,
1958	// int success_order, int failure_order)
1959	//
1960	// Note that these functions can be used for non-integer atomic
1961	// operations, the values just need to be bitcast to integers on the
1962	// way in and out.
1963	//
1964	// And, then, the generic variants. They look like the following:
1965	// void __atomic_load(size_t size, void ptr, void ret, int ordering)
1966	// void __atomic_store(size_t size, void ptr, void val, int ordering)
1967	// void __atomic_exchange(size_t size, void ptr, void val, void ret,*
1968	// int ordering)
1969	// bool __atomic_compare_exchange(size_t size, void ptr, void expected,
1970	// void desired, int success_order,*
1971	// int failure_order)
1972	//
1973	// The different signatures are built up depending on the
1974	// 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
1975	// variables.
1976
1977	AllocaInst AllocaCASExpected = nullptr*;
1978	AllocaInst AllocaValue = nullptr*;
1979	AllocaInst AllocaResult = nullptr*;
1980
1981	Type *ResultTy;
1982	SmallVector<Value *, `6`> Args;
1983	AttributeList Attr;
1984
1985	// 'size' argument.
1986	if (!UseSizedLibcall) {
1987	// Note, getIntPtrType is assumed equivalent to size_t.
1988	Args.push_back(Elt: ConstantInt::get(Ty: DL.getIntPtrType(C&: Ctx), V: Size));
1989	}
1990
1991	// 'ptr' argument.
1992	// note: This assumes all address spaces share a common libfunc
1993	// implementation and that addresses are convertable. For systems without
1994	// that property, we'd need to extend this mechanism to support AS-specific
1995	// families of atomic intrinsics.
1996	Value *PtrVal = PointerOperand;
1997	PtrVal = Builder.CreateAddrSpaceCast(V: PtrVal, DestTy: PointerType::getUnqual(C&: Ctx));
1998	Args.push_back(Elt: PtrVal);
1999
2000	// 'expected' argument, if present.
2001	if (CASExpected) {
2002	AllocaCASExpected = AllocaBuilder.CreateAlloca(Ty: CASExpected->getType());
2003	AllocaCASExpected->setAlignment(AllocaAlignment);
2004	Builder.CreateLifetimeStart(Ptr: AllocaCASExpected, Size: SizeVal64);
2005	Builder.CreateAlignedStore(Val: CASExpected, Ptr: AllocaCASExpected, Align: AllocaAlignment);
2006	Args.push_back(Elt: AllocaCASExpected);
2007	}
2008
2009	// 'val' argument ('desired' for cas), if present.
2010	if (ValueOperand) {
2011	if (UseSizedLibcall) {
2012	Value *IntValue =
2013	Builder.CreateBitOrPointerCast(V: ValueOperand, DestTy: SizedIntTy);
2014	Args.push_back(Elt: IntValue);
2015	} else {
2016	AllocaValue = AllocaBuilder.CreateAlloca(Ty: ValueOperand->getType());
2017	AllocaValue->setAlignment(AllocaAlignment);
2018	Builder.CreateLifetimeStart(Ptr: AllocaValue, Size: SizeVal64);
2019	Builder.CreateAlignedStore(Val: ValueOperand, Ptr: AllocaValue, Align: AllocaAlignment);
2020	Args.push_back(Elt: AllocaValue);
2021	}
2022	}
2023
2024	// 'ret' argument.
2025	if (!CASExpected && HasResult && !UseSizedLibcall) {
2026	AllocaResult = AllocaBuilder.CreateAlloca(Ty: I->getType());
2027	AllocaResult->setAlignment(AllocaAlignment);
2028	Builder.CreateLifetimeStart(Ptr: AllocaResult, Size: SizeVal64);
2029	Args.push_back(Elt: AllocaResult);
2030	}
2031
2032	// 'ordering' ('success_order' for cas) argument.
2033	Args.push_back(Elt: OrderingVal);
2034
2035	// 'failure_order' argument, if present.
2036	if (Ordering2Val)
2037	Args.push_back(Elt: Ordering2Val);
2038
2039	// Now, the return type.
2040	if (CASExpected) {
2041	ResultTy = Type::getInt1Ty(C&: Ctx);
2042	Attr = Attr.addRetAttribute(C&: Ctx, Kind: Attribute::ZExt);
2043	} else if (HasResult && UseSizedLibcall)
2044	ResultTy = SizedIntTy;
2045	else
2046	ResultTy = Type::getVoidTy(C&: Ctx);
2047
2048	// Done with setting up arguments and return types, create the call:
2049	SmallVector<Type *, `6`> ArgTys;
2050	for (Value *Arg : Args)
2051	ArgTys.push_back(Elt: Arg->getType());
2052	FunctionType FnType = FunctionType::get(Result: ResultTy, Params: ArgTys, isVarArg: false*);
2053	FunctionCallee LibcallFn =
2054	M->getOrInsertFunction(Name: TLI->getLibcallName(Call: RTLibType), T: FnType, AttributeList: Attr);
2055	CallInst *Call = Builder.CreateCall(Callee: LibcallFn, Args);
2056	Call->setAttributes(Attr);
2057	Value *Result = Call;
2058
2059	// And then, extract the results...
2060	if (ValueOperand && !UseSizedLibcall)
2061	Builder.CreateLifetimeEnd(Ptr: AllocaValue, Size: SizeVal64);
2062
2063	if (CASExpected) {
2064	// The final result from the CAS is {load of 'expected' alloca, bool result
2065	// from call}
2066	Type *FinalResultTy = I->getType();
2067	Value *V = PoisonValue::get(T: FinalResultTy);
2068	Value *ExpectedOut = Builder.CreateAlignedLoad(
2069	Ty: CASExpected->getType(), Ptr: AllocaCASExpected, Align: AllocaAlignment);
2070	Builder.CreateLifetimeEnd(Ptr: AllocaCASExpected, Size: SizeVal64);
2071	V = Builder.CreateInsertValue(Agg: V, Val: ExpectedOut, Idxs: `0`);
2072	V = Builder.CreateInsertValue(Agg: V, Val: Result, Idxs: `1`);
2073	I->replaceAllUsesWith(V);
2074	} else if (HasResult) {
2075	Value *V;
2076	if (UseSizedLibcall)
2077	V = Builder.CreateBitOrPointerCast(V: Result, DestTy: I->getType());
2078	else {
2079	V = Builder.CreateAlignedLoad(Ty: I->getType(), Ptr: AllocaResult,
2080	Align: AllocaAlignment);
2081	Builder.CreateLifetimeEnd(Ptr: AllocaResult, Size: SizeVal64);
2082	}
2083	I->replaceAllUsesWith(V);
2084	}
2085	I->eraseFromParent();
2086	return true;
2087	}
2088

Browse the source code of llvm_projects/llvm/lib/CodeGen/AtomicExpandPass.cpp