1//===- AtomicExpandPass.cpp - Expand atomic instructions ------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains a pass (at IR level) to replace atomic instructions with
10// __atomic_* library calls, or target specific instruction which implement the
11// same semantics in a way which better fits the target backend. This can
12// include the use of (intrinsic-based) load-linked/store-conditional loops,
13// AtomicCmpXchg, or type coercions.
14//
15//===----------------------------------------------------------------------===//
16
17#include "llvm/ADT/ArrayRef.h"
18#include "llvm/ADT/STLFunctionalExtras.h"
19#include "llvm/ADT/SmallVector.h"
20#include "llvm/Analysis/InstSimplifyFolder.h"
21#include "llvm/Analysis/OptimizationRemarkEmitter.h"
22#include "llvm/CodeGen/AtomicExpand.h"
23#include "llvm/CodeGen/TargetLowering.h"
24#include "llvm/CodeGen/TargetPassConfig.h"
25#include "llvm/CodeGen/TargetSubtargetInfo.h"
26#include "llvm/CodeGen/ValueTypes.h"
27#include "llvm/IR/Attributes.h"
28#include "llvm/IR/BasicBlock.h"
29#include "llvm/IR/Constant.h"
30#include "llvm/IR/Constants.h"
31#include "llvm/IR/DataLayout.h"
32#include "llvm/IR/DerivedTypes.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/Instruction.h"
36#include "llvm/IR/Instructions.h"
37#include "llvm/IR/MDBuilder.h"
38#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
39#include "llvm/IR/Module.h"
40#include "llvm/IR/ProfDataUtils.h"
41#include "llvm/IR/Type.h"
42#include "llvm/IR/User.h"
43#include "llvm/IR/Value.h"
44#include "llvm/InitializePasses.h"
45#include "llvm/Pass.h"
46#include "llvm/Support/AtomicOrdering.h"
47#include "llvm/Support/Casting.h"
48#include "llvm/Support/Debug.h"
49#include "llvm/Support/ErrorHandling.h"
50#include "llvm/Support/raw_ostream.h"
51#include "llvm/Target/TargetMachine.h"
52#include "llvm/Transforms/Utils/LowerAtomic.h"
53#include <cassert>
54#include <cstdint>
55#include <iterator>
56
57using namespace llvm;
58
59#define DEBUG_TYPE "atomic-expand"
60
61namespace {
62
/// Worker that performs atomic-instruction expansion for one function.
/// Decisions about *how* each atomic is lowered are delegated to the
/// TargetLowering hooks (shouldExpandAtomic*InIR etc.); this class only
/// performs the IR rewriting those hooks request.
class AtomicExpandImpl {
  // Target hooks consulted for every expansion decision.
  const TargetLowering *TLI = nullptr;
  // Lowering info used when emitting __atomic_* runtime calls.
  const LibcallLoweringInfo *LibcallLowering = nullptr;
  // Data layout of the function currently being processed.
  const DataLayout *DL = nullptr;

private:
  /// Callback type for emitting a cmpxchg instruction during RMW expansion.
  /// Parameters: (Builder, Addr, Loaded, NewVal, AddrAlign, MemOpOrder,
  /// SSID, IsVolatile, /* OUT */ Success, /* OUT */ NewLoaded,
  /// MetadataSrc)
  using CreateCmpXchgInstFun = function_ref<void(
      IRBuilderBase &, Value *, Value *, Value *, Align, AtomicOrdering,
      SyncScope::ID, Value *&, Value *&, Instruction *)>;

  /// Report an atomic that cannot be expanded as a context error, then delete
  /// it, replacing any non-void result with poison so the IR stays valid.
  void handleFailure(Instruction &FailedInst, const Twine &Msg) const {
    LLVMContext &Ctx = FailedInst.getContext();

    // TODO: Do not use generic error type.
    Ctx.emitError(I: &FailedInst, ErrorStr: Msg);

    if (!FailedInst.getType()->isVoidTy())
      FailedInst.replaceAllUsesWith(V: PoisonValue::get(T: FailedInst.getType()));
    FailedInst.eraseFromParent();
  }

  // Fence insertion around atomics whose ordering is split into explicit
  // fence instructions.
  bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
  bool tryInsertTrailingSeqCstFence(Instruction *AtomicI);
  template <typename AtomicInst>
  bool tryInsertFencesForAtomic(AtomicInst *AtomicI, bool OrderingRequiresFence,
                                AtomicOrdering NewOrdering);
  // Integer type coercion for non-integer (FP/pointer/vector) atomics.
  IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
  LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
  // Expansion entry points, one family per atomic instruction kind.
  bool tryExpandAtomicLoad(LoadInst *LI);
  bool expandAtomicLoadToLL(LoadInst *LI);
  bool expandAtomicLoadToCmpXchg(LoadInst *LI);
  StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
  bool tryExpandAtomicStore(StoreInst *SI);
  void expandAtomicStoreToXChg(StoreInst *SI);
  bool tryExpandAtomicRMW(AtomicRMWInst *AI);
  AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI);
  Value *
  insertRMWLLSCLoop(IRBuilderBase &Builder, Type *ResultTy, Value *Addr,
                    Align AddrAlign, AtomicOrdering MemOpOrder,
                    function_ref<Value *(IRBuilderBase &, Value *)> PerformOp);
  void expandAtomicOpToLLSC(
      Instruction *I, Type *ResultTy, Value *Addr, Align AddrAlign,
      AtomicOrdering MemOpOrder,
      function_ref<Value *(IRBuilderBase &, Value *)> PerformOp);
  // Partword (sub-word) expansion helpers.
  void expandPartwordAtomicRMW(
      AtomicRMWInst *I, TargetLoweringBase::AtomicExpansionKind ExpansionKind);
  AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI);
  bool expandPartwordCmpXchg(AtomicCmpXchgInst *I);
  void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI);
  void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI);

  AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
  static Value *insertRMWCmpXchgLoop(
      IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign,
      AtomicOrdering MemOpOrder, SyncScope::ID SSID,
      function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
      CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc);
  bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);

  bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
  bool isIdempotentRMW(AtomicRMWInst *RMWI);
  bool simplifyIdempotentRMW(AtomicRMWInst *RMWI);

  // __atomic_* libcall lowering for unsupported sizes/alignments.
  bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, Align Alignment,
                               Value *PointerOperand, Value *ValueOperand,
                               Value *CASExpected, AtomicOrdering Ordering,
                               AtomicOrdering Ordering2,
                               ArrayRef<RTLIB::Libcall> Libcalls);
  void expandAtomicLoadToLibcall(LoadInst *LI);
  void expandAtomicStoreToLibcall(StoreInst *LI);
  void expandAtomicRMWToLibcall(AtomicRMWInst *I);
  void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);

  bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                CreateCmpXchgInstFun CreateCmpXchg);

  /// Dispatch on the instruction kind; returns true if the IR was changed.
  bool processAtomicInstr(Instruction *I);

public:
  /// Run the expansion over \p F. Returns true if any change was made.
  bool run(Function &F,
           const LibcallLoweringModuleAnalysisResult &LibcallResult,
           const TargetMachine *TM);
};
150
/// Legacy pass-manager wrapper around AtomicExpandImpl.
class AtomicExpandLegacy : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  AtomicExpandLegacy() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Needs the module-level libcall lowering info to emit __atomic_* calls.
    AU.addRequired<LibcallLoweringInfoWrapper>();
    FunctionPass::getAnalysisUsage(AU);
  }

  bool runOnFunction(Function &F) override;
};
164
165// IRBuilder to be used for replacement atomic instructions.
// IRBuilder to be used for replacement atomic instructions.
struct ReplacementIRBuilder
    : IRBuilder<InstSimplifyFolder, IRBuilderCallbackInserter> {
  // MMRA metadata of the instruction being replaced; re-attached (via the
  // insertion callback below) to every instruction this builder creates.
  MDNode *MMRAMD = nullptr;

  // Preserves the DebugLoc from I, and preserves still valid metadata.
  // Enable StrictFP builder mode when appropriate.
  explicit ReplacementIRBuilder(Instruction *I, const DataLayout &DL)
      : IRBuilder(I->getContext(), InstSimplifyFolder(DL),
                  // Every inserted instruction gets the saved MMRA metadata.
                  IRBuilderCallbackInserter(
                      [this](Instruction *I) { addMMRAMD(I); })) {
    SetInsertPoint(I);
    this->CollectMetadataToCopy(Src: I, MetadataKinds: {LLVMContext::MD_pcsections});
    // Replacement code emitted inside a strictfp function must itself use
    // constrained FP operations.
    if (BB->getParent()->getAttributes().hasFnAttr(Kind: Attribute::StrictFP))
      this->setIsFPConstrained(true);

    MMRAMD = I->getMetadata(KindID: LLVMContext::MD_mmra);
  }

  // Propagate the saved MMRA metadata onto a newly inserted instruction,
  // if that instruction kind may carry MMRAs.
  void addMMRAMD(Instruction *I) {
    if (canInstructionHaveMMRAs(I: *I))
      I->setMetadata(KindID: LLVMContext::MD_mmra, Node: MMRAMD);
  }
};
189
190} // end anonymous namespace
191
// Legacy pass-manager registration boilerplate.
char AtomicExpandLegacy::ID = 0;

// Exported pass ID so other code can reference this pass by address.
char &llvm::AtomicExpandID = AtomicExpandLegacy::ID;

INITIALIZE_PASS_BEGIN(AtomicExpandLegacy, DEBUG_TYPE,
                      "Expand Atomic instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(LibcallLoweringInfoWrapper)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AtomicExpandLegacy, DEBUG_TYPE,
                    "Expand Atomic instructions", false, false)
202
203// Helper functions to retrieve the size of atomic instructions.
204static unsigned getAtomicOpSize(LoadInst *LI) {
205 const DataLayout &DL = LI->getDataLayout();
206 return DL.getTypeStoreSize(Ty: LI->getType());
207}
208
209static unsigned getAtomicOpSize(StoreInst *SI) {
210 const DataLayout &DL = SI->getDataLayout();
211 return DL.getTypeStoreSize(Ty: SI->getValueOperand()->getType());
212}
213
214static unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
215 const DataLayout &DL = RMWI->getDataLayout();
216 return DL.getTypeStoreSize(Ty: RMWI->getValOperand()->getType());
217}
218
219static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
220 const DataLayout &DL = CASI->getDataLayout();
221 return DL.getTypeStoreSize(Ty: CASI->getCompareOperand()->getType());
222}
223
/// Copy metadata that's safe to preserve when widening atomics.
/// Only a whitelist of kinds is transferred; anything that could become
/// invalid when the access size changes is dropped.
static void copyMetadataForAtomic(Instruction &Dest,
                                  const Instruction &Source) {
  SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
  Source.getAllMetadata(MDs&: MD);
  LLVMContext &Ctx = Dest.getContext();
  MDBuilder MDB(Ctx);

  for (auto [ID, N] : MD) {
    switch (ID) {
    // Kinds known to remain valid on the widened access.
    case LLVMContext::MD_dbg:
    case LLVMContext::MD_tbaa:
    case LLVMContext::MD_tbaa_struct:
    case LLVMContext::MD_alias_scope:
    case LLVMContext::MD_noalias:
    case LLVMContext::MD_noalias_addrspace:
    case LLVMContext::MD_access_group:
    case LLVMContext::MD_mmra:
      Dest.setMetadata(KindID: ID, Node: N);
      break;
    default:
      // AMDGPU-specific string kinds have no fixed numeric ID, so compare
      // by name.
      if (ID == Ctx.getMDKindID(Name: "amdgpu.no.remote.memory"))
        Dest.setMetadata(KindID: ID, Node: N);
      else if (ID == Ctx.getMDKindID(Name: "amdgpu.no.fine.grained.memory"))
        Dest.setMetadata(KindID: ID, Node: N);

      // Losing amdgpu.ignore.denormal.mode, but it doesn't matter for current
      // uses.
      break;
    }
  }
}
256
257// Determine if a particular atomic operation has a supported size,
258// and is of appropriate alignment, to be passed through for target
259// lowering. (Versus turning into a __atomic libcall)
260template <typename Inst>
261static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
262 unsigned Size = getAtomicOpSize(I);
263 Align Alignment = I->getAlign();
264 return Alignment >= Size &&
265 Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
266}
267
/// If the target requests it, insert a fence *after* \p AtomicI giving it
/// trailing seq_cst semantics. Returns true if a fence was inserted.
bool AtomicExpandImpl::tryInsertTrailingSeqCstFence(Instruction *AtomicI) {
  if (!TLI->shouldInsertTrailingSeqCstFenceForAtomicStore(I: AtomicI))
    return false;

  IRBuilder Builder(AtomicI);
  if (auto *TrailingFence = TLI->emitTrailingFence(
          Builder, Inst: AtomicI, Ord: AtomicOrdering::SequentiallyConsistent)) {
    // The builder inserts before AtomicI; a trailing fence must follow it.
    TrailingFence->moveAfter(MovePos: AtomicI);
    return true;
  }
  return false;
}
280
/// Common fence-insertion logic for loads, stores and RMWs. If the target
/// wants explicit fences and the instruction's ordering requires one, the
/// ordering is weakened to \p NewOrdering and explicit fences are emitted
/// around the instruction. Otherwise a trailing seq_cst fence may still be
/// added. Returns true if the IR was changed.
template <typename AtomicInst>
bool AtomicExpandImpl::tryInsertFencesForAtomic(AtomicInst *AtomicI,
                                                bool OrderingRequiresFence,
                                                AtomicOrdering NewOrdering) {
  bool ShouldInsertFences = TLI->shouldInsertFencesForAtomic(I: AtomicI);
  if (OrderingRequiresFence && ShouldInsertFences) {
    // Save the original ordering before downgrading; the fences carry it.
    AtomicOrdering FenceOrdering = AtomicI->getOrdering();
    AtomicI->setOrdering(NewOrdering);
    return bracketInstWithFences(I: AtomicI, Order: FenceOrdering);
  }
  if (!ShouldInsertFences)
    return tryInsertTrailingSeqCstFence(AtomicI);
  return false;
}
295
/// Expand a single instruction if it is an atomic operation. Dispatches on
/// the instruction kind (load/store/atomicrmw/cmpxchg); unsupported sizes go
/// straight to libcalls, otherwise type coercion, fence insertion and the
/// target-selected expansion are applied in order. Returns true if the IR
/// was changed.
bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
  if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
    if (!LI->isAtomic())
      return false;

    // Too big or under-aligned for the target: lower to __atomic_load.
    if (!atomicSizeSupported(TLI, I: LI)) {
      expandAtomicLoadToLibcall(LI);
      return true;
    }

    bool MadeChange = false;
    if (TLI->shouldCastAtomicLoadInIR(LI) ==
        TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
      // Note: LI is replaced; later steps must use the new instruction.
      LI = convertAtomicLoadToIntegerType(LI);
      MadeChange = true;
    }

    MadeChange |= tryInsertFencesForAtomic(
        AtomicI: LI, OrderingRequiresFence: isAcquireOrStronger(AO: LI->getOrdering()), NewOrdering: AtomicOrdering::Monotonic);

    MadeChange |= tryExpandAtomicLoad(LI);
    return MadeChange;
  }

  if (auto *SI = dyn_cast<StoreInst>(Val: I)) {
    if (!SI->isAtomic())
      return false;

    // Too big or under-aligned for the target: lower to __atomic_store.
    if (!atomicSizeSupported(TLI, I: SI)) {
      expandAtomicStoreToLibcall(LI: SI);
      return true;
    }

    bool MadeChange = false;
    if (TLI->shouldCastAtomicStoreInIR(SI) ==
        TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
      SI = convertAtomicStoreToIntegerType(SI);
      MadeChange = true;
    }

    MadeChange |= tryInsertFencesForAtomic(
        AtomicI: SI, OrderingRequiresFence: isReleaseOrStronger(AO: SI->getOrdering()), NewOrdering: AtomicOrdering::Monotonic);

    MadeChange |= tryExpandAtomicStore(SI);
    return MadeChange;
  }

  if (auto *RMWI = dyn_cast<AtomicRMWInst>(Val: I)) {
    if (!atomicSizeSupported(TLI, I: RMWI)) {
      expandAtomicRMWToLibcall(I: RMWI);
      return true;
    }

    bool MadeChange = false;
    if (TLI->shouldCastAtomicRMWIInIR(RMWI) ==
        TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
      RMWI = convertAtomicXchgToIntegerType(RMWI);
      MadeChange = true;
    }

    // An RMW both reads and writes, so either acquire or release semantics
    // force a fence.
    MadeChange |= tryInsertFencesForAtomic(
        AtomicI: RMWI,
        OrderingRequiresFence: isReleaseOrStronger(AO: RMWI->getOrdering()) ||
            isAcquireOrStronger(AO: RMWI->getOrdering()),
        NewOrdering: TLI->atomicOperationOrderAfterFenceSplit(I: RMWI));

    // There are two different ways of expanding RMW instructions:
    // - into a load if it is idempotent
    // - into a Cmpxchg/LL-SC loop otherwise
    // we try them in that order.
    MadeChange |= (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) ||
                  tryExpandAtomicRMW(AI: RMWI);
    return MadeChange;
  }

  if (auto *CASI = dyn_cast<AtomicCmpXchgInst>(Val: I)) {
    if (!atomicSizeSupported(TLI, I: CASI)) {
      expandAtomicCASToLibcall(I: CASI);
      return true;
    }

    // TODO: when we're ready to make the change at the IR level, we can
    // extend convertCmpXchgToInteger for floating point too.
    bool MadeChange = false;
    if (CASI->getCompareOperand()->getType()->isPointerTy()) {
      // TODO: add a TLI hook to control this so that each target can
      // convert to lowering the original type one at a time.
      CASI = convertCmpXchgToIntegerType(CI: CASI);
      MadeChange = true;
    }

    auto CmpXchgExpansion = TLI->shouldExpandAtomicCmpXchgInIR(AI: CASI);
    if (TLI->shouldInsertFencesForAtomic(I: CASI)) {
      if (CmpXchgExpansion == TargetLoweringBase::AtomicExpansionKind::None &&
          (isReleaseOrStronger(AO: CASI->getSuccessOrdering()) ||
           isAcquireOrStronger(AO: CASI->getSuccessOrdering()) ||
           isAcquireOrStronger(AO: CASI->getFailureOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        AtomicOrdering FenceOrdering = CASI->getMergedOrdering();
        AtomicOrdering CASOrdering =
            TLI->atomicOperationOrderAfterFenceSplit(I: CASI);
        CASI->setSuccessOrdering(CASOrdering);
        CASI->setFailureOrdering(CASOrdering);
        MadeChange |= bracketInstWithFences(I: CASI, Order: FenceOrdering);
      }
    } else if (CmpXchgExpansion !=
               TargetLoweringBase::AtomicExpansionKind::LLSC) {
      // CmpXchg LLSC is handled in expandAtomicCmpXchg().
      MadeChange |= tryInsertTrailingSeqCstFence(AtomicI: CASI);
    }

    MadeChange |= tryExpandAtomicCmpXchg(CI: CASI);
    return MadeChange;
  }

  return false;
}
416
/// Entry point: expand all atomic instructions in \p F for the subtarget
/// selected by \p TM. Returns true if any instruction was changed.
bool AtomicExpandImpl::run(
    Function &F, const LibcallLoweringModuleAnalysisResult &LibcallResult,
    const TargetMachine *TM) {
  const auto *Subtarget = TM->getSubtargetImpl(F);
  if (!Subtarget->enableAtomicExpand())
    return false;
  TLI = Subtarget->getTargetLowering();
  LibcallLowering = &LibcallResult.getLibcallLowering(Subtarget: *Subtarget);
  DL = &F.getDataLayout();

  bool MadeChange = false;

  for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) {
    BasicBlock *BB = &*BBI;

    BasicBlock::reverse_iterator Next;

    // Walk instructions in reverse, capturing the successor before each
    // call so erasing/replacing the current instruction doesn't invalidate
    // the traversal. NOTE(review): reverse order appears chosen so newly
    // inserted replacement code is not revisited — confirm against upstream.
    for (BasicBlock::reverse_iterator I = BB->rbegin(), E = BB->rend(); I != E;
         I = Next) {
      Instruction &Inst = *I;
      Next = std::next(x: I);

      if (processAtomicInstr(I: &Inst)) {
        MadeChange = true;

        // New blocks may have been inserted.
        BBE = F.end();
      }
    }
  }

  return MadeChange;
}
450
451bool AtomicExpandLegacy::runOnFunction(Function &F) {
452
453 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
454 if (!TPC)
455 return false;
456 auto *TM = &TPC->getTM<TargetMachine>();
457
458 const LibcallLoweringModuleAnalysisResult &LibcallResult =
459 getAnalysis<LibcallLoweringInfoWrapper>().getResult(M: *F.getParent());
460 AtomicExpandImpl AE;
461 return AE.run(F, LibcallResult, TM);
462}
463
464FunctionPass *llvm::createAtomicExpandLegacyPass() {
465 return new AtomicExpandLegacy();
466}
467
/// New-PM entry point. Requires the module-level libcall lowering analysis
/// to already be cached; otherwise reports an error and bails out.
PreservedAnalyses AtomicExpandPass::run(Function &F,
                                        FunctionAnalysisManager &FAM) {
  auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);

  // A function pass cannot run a module analysis itself, only query a
  // cached result.
  const LibcallLoweringModuleAnalysisResult *LibcallResult =
      MAMProxy.getCachedResult<LibcallLoweringModuleAnalysis>(IR&: *F.getParent());

  if (!LibcallResult) {
    F.getContext().emitError(ErrorStr: "'" + LibcallLoweringModuleAnalysis::name() +
                             "' analysis required");
    return PreservedAnalyses::all();
  }

  AtomicExpandImpl AE;

  // TM here is presumably a member of AtomicExpandPass (declared outside
  // this file) — confirm against the pass header.
  bool Changed = AE.run(F, LibcallResult: *LibcallResult, TM);
  if (!Changed)
    return PreservedAnalyses::all();

  return PreservedAnalyses::none();
}
489
/// Surround \p I with the target's leading and trailing fences for ordering
/// \p Order. Returns true if at least one fence was emitted.
bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
                                             AtomicOrdering Order) {
  ReplacementIRBuilder Builder(I, *DL);

  auto LeadingFence = TLI->emitLeadingFence(Builder, Inst: I, Ord: Order);

  auto TrailingFence = TLI->emitTrailingFence(Builder, Inst: I, Ord: Order);
  // We have a guard here because not every atomic operation generates a
  // trailing fence.
  if (TrailingFence)
    TrailingFence->moveAfter(MovePos: I);

  return (LeadingFence || TrailingFence);
}
504
/// Get the iX type with the same bitwidth as T. Asserts that T's store size
/// equals its bit size (i.e. the width is a whole number of bytes with no
/// padding), so the bitcast round-trip is lossless.
IntegerType *
AtomicExpandImpl::getCorrespondingIntegerType(Type *T, const DataLayout &DL) {
  EVT VT = TLI->getMemValueType(DL, Ty: T);
  unsigned BitWidth = VT.getStoreSizeInBits();
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
  return IntegerType::get(C&: T->getContext(), NumBits: BitWidth);
}
513
/// Convert an atomic load of a non-integral type to an integer load of the
/// equivalent bitwidth. See the function comment on
/// convertAtomicStoreToIntegerType for background. Returns the replacement
/// load; the original is erased.
LoadInst *AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst *LI) {
  auto *M = LI->getModule();
  Type *NewTy = getCorrespondingIntegerType(T: LI->getType(), DL: M->getDataLayout());

  ReplacementIRBuilder Builder(LI, *DL);

  Value *Addr = LI->getPointerOperand();

  // Recreate the load with integer type, carrying over alignment,
  // volatility, ordering and sync scope.
  auto *NewLI = Builder.CreateLoad(Ty: NewTy, Ptr: Addr);
  NewLI->setAlignment(LI->getAlign());
  NewLI->setVolatile(LI->isVolatile());
  NewLI->setAtomic(Ordering: LI->getOrdering(), SSID: LI->getSyncScopeID());
  LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");

  // Bitcast the result back to the original type for existing users.
  Value *NewVal = Builder.CreateBitCast(V: NewLI, DestTy: LI->getType());
  LI->replaceAllUsesWith(V: NewVal);
  LI->eraseFromParent();
  return NewLI;
}
536
/// Convert an atomicrmw xchg of a pointer/FP/vector type into an xchg of the
/// same-width integer type, bitcasting (or ptrtoint/inttoptr) the operand
/// and the result. Only valid for Xchg, which is the one RMW operation that
/// is type-agnostic.
AtomicRMWInst *
AtomicExpandImpl::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) {
  assert(RMWI->getOperation() == AtomicRMWInst::Xchg);

  auto *M = RMWI->getModule();
  Type *NewTy =
      getCorrespondingIntegerType(T: RMWI->getType(), DL: M->getDataLayout());

  ReplacementIRBuilder Builder(RMWI, *DL);

  Value *Addr = RMWI->getPointerOperand();
  Value *Val = RMWI->getValOperand();
  // Pointers need ptrtoint; FP/vector values can be bitcast directly.
  Value *NewVal = Val->getType()->isPointerTy()
                      ? Builder.CreatePtrToInt(V: Val, DestTy: NewTy)
                      : Builder.CreateBitCast(V: Val, DestTy: NewTy);

  auto *NewRMWI = Builder.CreateAtomicRMW(Op: AtomicRMWInst::Xchg, Ptr: Addr, Val: NewVal,
                                          Align: RMWI->getAlign(), Ordering: RMWI->getOrdering(),
                                          SSID: RMWI->getSyncScopeID());
  NewRMWI->setVolatile(RMWI->isVolatile());
  copyMetadataForAtomic(Dest&: *NewRMWI, Source: *RMWI);
  LLVM_DEBUG(dbgs() << "Replaced " << *RMWI << " with " << *NewRMWI << "\n");

  // Convert the returned old value back to the original type.
  Value *NewRVal = RMWI->getType()->isPointerTy()
                       ? Builder.CreateIntToPtr(V: NewRMWI, DestTy: RMWI->getType())
                       : Builder.CreateBitCast(V: NewRMWI, DestTy: RMWI->getType());
  RMWI->replaceAllUsesWith(V: NewRVal);
  RMWI->eraseFromParent();
  return NewRMWI;
}
567
/// Expand an atomic load according to the target's chosen strategy.
/// Returns true if the IR was changed.
bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    // An LL/SC "load" is just the loop with an identity operation.
    expandAtomicOpToLLSC(
        I: LI, ResultTy: LI->getType(), Addr: LI->getPointerOperand(), AddrAlign: LI->getAlign(),
        MemOpOrder: LI->getOrdering(),
        PerformOp: [](IRBuilderBase &Builder, Value *Loaded) { return Loaded; });
    return true;
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    return expandAtomicLoadToLL(LI);
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicLoadToCmpXchg(LI);
  case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
    // Target says a plain load is sufficient; just drop the atomicity.
    LI->setAtomic(Ordering: AtomicOrdering::NotAtomic);
    return true;
  case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
    TLI->emitExpandAtomicLoad(LI);
    return true;
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
  }
}
592
/// Expand an atomic store according to the target's chosen strategy.
/// Returns true if the IR was changed.
bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) {
  switch (TLI->shouldExpandAtomicStoreInIR(SI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
    TLI->emitExpandAtomicStore(SI);
    return true;
  case TargetLoweringBase::AtomicExpansionKind::Expand:
    // Rewrite as an atomicrmw xchg whose result is unused.
    expandAtomicStoreToXChg(SI);
    return true;
  case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
    // Target says a plain store is sufficient; just drop the atomicity.
    SI->setAtomic(Ordering: AtomicOrdering::NotAtomic);
    return true;
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicStore");
  }
}
610
/// Replace an atomic load with a bare load-linked instruction (no paired
/// store-conditional). Always returns true.
bool AtomicExpandImpl::expandAtomicLoadToLL(LoadInst *LI) {
  ReplacementIRBuilder Builder(LI, *DL);

  // On some architectures, load-linked instructions are atomic for larger
  // sizes than normal loads. For example, the only 64-bit load guaranteed
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
  Value *Val = TLI->emitLoadLinked(Builder, ValueTy: LI->getType(),
                                   Addr: LI->getPointerOperand(), Ord: LI->getOrdering());
  // Let the target clear the exclusive monitor since no store follows.
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);

  LI->replaceAllUsesWith(V: Val);
  LI->eraseFromParent();

  return true;
}
626
/// Replace an atomic load with cmpxchg(addr, 0, 0): the compare-exchange
/// returns the current memory value and, whether it "succeeds" or "fails",
/// leaves memory unchanged. Always returns true.
bool AtomicExpandImpl::expandAtomicLoadToCmpXchg(LoadInst *LI) {
  ReplacementIRBuilder Builder(LI, *DL);
  AtomicOrdering Order = LI->getOrdering();
  // cmpxchg does not accept unordered; monotonic is the weakest valid order.
  if (Order == AtomicOrdering::Unordered)
    Order = AtomicOrdering::Monotonic;

  Value *Addr = LI->getPointerOperand();
  Type *Ty = LI->getType();
  Constant *DummyVal = Constant::getNullValue(Ty);

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Ptr: Addr, Cmp: DummyVal, New: DummyVal, Align: LI->getAlign(), SuccessOrdering: Order,
      FailureOrdering: AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering: Order));
  // Element 0 of the {value, success} pair is the loaded value.
  Value *Loaded = Builder.CreateExtractValue(Agg: Pair, Idxs: 0, Name: "loaded");

  LI->replaceAllUsesWith(V: Loaded);
  LI->eraseFromParent();

  return true;
}
647
/// Convert an atomic store of a non-integral type to an integer store of the
/// equivalent bitwidth. We used to not support floating point or vector
/// atomics in the IR at all. The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of a atomic
/// float or vector store. The long term plan is to teach each backend to
/// instruction select from the original atomic store, but as a migration
/// mechanism, we convert back to the old format which the backends understand.
/// Each backend will need individual work to recognize the new format.
StoreInst *AtomicExpandImpl::convertAtomicStoreToIntegerType(StoreInst *SI) {
  ReplacementIRBuilder Builder(SI, *DL);
  auto *M = SI->getModule();
  Type *NewTy = getCorrespondingIntegerType(T: SI->getValueOperand()->getType(),
                                            DL: M->getDataLayout());
  // Bitcast the value to the same-width integer, then recreate the store
  // with identical alignment, volatility, ordering and sync scope.
  Value *NewVal = Builder.CreateBitCast(V: SI->getValueOperand(), DestTy: NewTy);

  Value *Addr = SI->getPointerOperand();

  StoreInst *NewSI = Builder.CreateStore(Val: NewVal, Ptr: Addr);
  NewSI->setAlignment(SI->getAlign());
  NewSI->setVolatile(SI->isVolatile());
  NewSI->setAtomic(Ordering: SI->getOrdering(), SSID: SI->getSyncScopeID());
  LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
  SI->eraseFromParent();
  return NewSI;
}
673
/// Replace an atomic store with an atomicrmw xchg whose result is ignored,
/// then expand the xchg via the usual RMW path.
void AtomicExpandImpl::expandAtomicStoreToXChg(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them by an
  // atomic swap, that can be implemented for example as a ldrex/strex on ARM
  // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
  // It is the responsibility of the target to only signal expansion via
  // shouldExpandAtomicRMW in cases where this is required and possible.
  ReplacementIRBuilder Builder(SI, *DL);
  AtomicOrdering Ordering = SI->getOrdering();
  assert(Ordering != AtomicOrdering::NotAtomic);
  // atomicrmw does not accept unordered; upgrade to monotonic.
  AtomicOrdering RMWOrdering = Ordering == AtomicOrdering::Unordered
                                   ? AtomicOrdering::Monotonic
                                   : Ordering;
  AtomicRMWInst *AI = Builder.CreateAtomicRMW(
      Op: AtomicRMWInst::Xchg, Ptr: SI->getPointerOperand(), Val: SI->getValueOperand(),
      Align: SI->getAlign(), Ordering: RMWOrdering);
  SI->eraseFromParent();

  // Now we have an appropriate swap instruction, lower it as usual.
  tryExpandAtomicRMW(AI);
}
695
/// Default CreateCmpXchgInstFun: emit a plain cmpxchg of (Loaded -> NewVal),
/// bitcasting FP/vector operands through integers since cmpxchg only takes
/// integer or pointer types. Outputs: \p Success (i1) and \p NewLoaded (the
/// value read from memory, converted back to the original type).
static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal, Align AddrAlign,
                                 AtomicOrdering MemOpOrder, SyncScope::ID SSID,
                                 Value *&Success, Value *&NewLoaded,
                                 Instruction *MetadataSrc) {
  Type *OrigTy = NewVal->getType();

  // This code can go away when cmpxchg supports FP and vector types.
  assert(!OrigTy->isPointerTy());
  bool NeedBitcast = OrigTy->isFloatingPointTy() || OrigTy->isVectorTy();
  if (NeedBitcast) {
    IntegerType *IntTy = Builder.getIntNTy(N: OrigTy->getPrimitiveSizeInBits());
    NewVal = Builder.CreateBitCast(V: NewVal, DestTy: IntTy);
    Loaded = Builder.CreateBitCast(V: Loaded, DestTy: IntTy);
  }

  AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
      Ptr: Addr, Cmp: Loaded, New: NewVal, Align: AddrAlign, SuccessOrdering: MemOpOrder,
      FailureOrdering: AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering: MemOpOrder), SSID);
  // Carry over metadata (TBAA, noalias, ...) from the instruction being
  // expanded, when one was provided.
  if (MetadataSrc)
    copyMetadataForAtomic(Dest&: *Pair, Source: *MetadataSrc);

  Success = Builder.CreateExtractValue(Agg: Pair, Idxs: 1, Name: "success");
  NewLoaded = Builder.CreateExtractValue(Agg: Pair, Idxs: 0, Name: "newloaded");

  // Undo the bitcast so callers see the original type.
  if (NeedBitcast)
    NewLoaded = Builder.CreateBitCast(V: NewLoaded, DestTy: OrigTy);
}
724
/// Expand an atomicrmw according to the target's chosen strategy: LL/SC or
/// cmpxchg loops (with partword handling for sub-minimum-CAS sizes), masked
/// or bit-test/cmp-arith intrinsics, plain non-atomic lowering, or a fully
/// custom target expansion. Returns true if the IR was changed.
bool AtomicExpandImpl::tryExpandAtomicRMW(AtomicRMWInst *AI) {
  LLVMContext &Ctx = AI->getModule()->getContext();
  TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(RMW: AI);
  switch (Kind) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(RMWI: AI);
    if (ValueSize < MinCASSize) {
      // Too narrow for the target's exclusive access: operate on a
      // containing word with masks.
      expandPartwordAtomicRMW(I: AI,
                              ExpansionKind: TargetLoweringBase::AtomicExpansionKind::LLSC);
    } else {
      auto PerformOp = [&](IRBuilderBase &Builder, Value *Loaded) {
        return buildAtomicRMWValue(Op: AI->getOperation(), Builder, Loaded,
                                   Val: AI->getValOperand());
      };
      expandAtomicOpToLLSC(I: AI, ResultTy: AI->getType(), Addr: AI->getPointerOperand(),
                           AddrAlign: AI->getAlign(), MemOpOrder: AI->getOrdering(), PerformOp);
    }
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(RMWI: AI);
    if (ValueSize < MinCASSize) {
      expandPartwordAtomicRMW(I: AI,
                              ExpansionKind: TargetLoweringBase::AtomicExpansionKind::CmpXChg);
    } else {
      // Emit an optimization remark naming the memory scope of the CAS loop.
      SmallVector<StringRef> SSNs;
      Ctx.getSyncScopeNames(SSNs);
      auto MemScope = SSNs[AI->getSyncScopeID()].empty()
                          ? "system"
                          : SSNs[AI->getSyncScopeID()];
      OptimizationRemarkEmitter ORE(AI->getFunction());
      ORE.emit(RemarkBuilder: [&]() {
        return OptimizationRemark(DEBUG_TYPE, "Passed", AI)
               << "A compare and swap loop was generated for an atomic "
               << AI->getOperationName(Op: AI->getOperation()) << " operation at "
               << MemScope << " memory scope";
      });
      expandAtomicRMWToCmpXchg(AI, CreateCmpXchg: createCmpXchgInstFun);
    }
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(RMWI: AI);
    if (ValueSize < MinCASSize) {
      AtomicRMWInst::BinOp Op = AI->getOperation();
      // Widen And/Or/Xor and give the target another chance at expanding it.
      if (Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
          Op == AtomicRMWInst::And) {
        tryExpandAtomicRMW(AI: widenPartwordAtomicRMW(AI));
        return true;
      }
    }
    expandAtomicRMWToMaskedIntrinsic(AI);
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::BitTestIntrinsic: {
    TLI->emitBitTestAtomicRMWIntrinsic(AI);
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::CmpArithIntrinsic: {
    TLI->emitCmpArithAtomicRMWIntrinsic(AI);
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
    return lowerAtomicRMWInst(RMWI: AI);
  case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
    TLI->emitExpandAtomicRMW(AI);
    return true;
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
  }
}
802
803namespace {
804
/// Bundle of values needed to perform a sub-word atomic operation on a
/// containing aligned word (see createMaskInstrs).
struct PartwordMaskValues {
  // These three fields are guaranteed to be set by createMaskInstrs.
  Type *WordType = nullptr;      // Type of the containing word.
  Type *ValueType = nullptr;     // Original (possibly FP/vector) value type.
  Type *IntValueType = nullptr;  // Integer type of the same width as ValueType.
  Value *AlignedAddr = nullptr;  // Address rounded down to the word boundary.
  Align AlignedAddrAlignment;
  // The remaining fields can be null.
  Value *ShiftAmt = nullptr;  // Bit offset of the value within the word.
  Value *Mask = nullptr;      // Selects the value's bits within the word.
  Value *Inv_Mask = nullptr;  // Complement of Mask.
};
817
818[[maybe_unused]]
819raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) {
820 auto PrintObj = [&O](auto *V) {
821 if (V)
822 O << *V;
823 else
824 O << "nullptr";
825 O << '\n';
826 };
827 O << "PartwordMaskValues {\n";
828 O << " WordType: ";
829 PrintObj(PMV.WordType);
830 O << " ValueType: ";
831 PrintObj(PMV.ValueType);
832 O << " AlignedAddr: ";
833 PrintObj(PMV.AlignedAddr);
834 O << " AlignedAddrAlignment: " << PMV.AlignedAddrAlignment.value() << '\n';
835 O << " ShiftAmt: ";
836 PrintObj(PMV.ShiftAmt);
837 O << " Mask: ";
838 PrintObj(PMV.Mask);
839 O << " Inv_Mask: ";
840 PrintObj(PMV.Inv_Mask);
841 O << "}\n";
842 return O;
843}
844
845} // end anonymous namespace
846
847/// This is a helper function which builds instructions to provide
848/// values necessary for partword atomic operations. It takes an
849/// incoming address, Addr, and ValueType, and constructs the address,
850/// shift-amounts and masks needed to work with a larger value of size
851/// WordSize.
852///
853/// AlignedAddr: Addr rounded down to a multiple of WordSize
854///
855/// ShiftAmt: Number of bits to right-shift a WordSize value loaded
856/// from AlignAddr for it to have the same value as if
857/// ValueType was loaded from Addr.
858///
859/// Mask: Value to mask with the value loaded from AlignAddr to
860/// include only the part that would've been loaded from Addr.
861///
862/// Inv_Mask: The inverse of Mask.
863static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
864 Instruction *I, Type *ValueType,
865 Value *Addr, Align AddrAlign,
866 unsigned MinWordSize) {
867 PartwordMaskValues PMV;
868
869 Module *M = I->getModule();
870 LLVMContext &Ctx = M->getContext();
871 const DataLayout &DL = M->getDataLayout();
872 unsigned ValueSize = DL.getTypeStoreSize(Ty: ValueType);
873
874 PMV.ValueType = PMV.IntValueType = ValueType;
875 if (PMV.ValueType->isFloatingPointTy() || PMV.ValueType->isVectorTy())
876 PMV.IntValueType =
877 Type::getIntNTy(C&: Ctx, N: ValueType->getPrimitiveSizeInBits());
878
879 PMV.WordType = MinWordSize > ValueSize ? Type::getIntNTy(C&: Ctx, N: MinWordSize * 8)
880 : ValueType;
881 if (PMV.ValueType == PMV.WordType) {
882 PMV.AlignedAddr = Addr;
883 PMV.AlignedAddrAlignment = AddrAlign;
884 PMV.ShiftAmt = ConstantInt::get(Ty: PMV.ValueType, V: 0);
885 PMV.Mask = ConstantInt::get(Ty: PMV.ValueType, V: ~0, /*isSigned*/ IsSigned: true);
886 return PMV;
887 }
888
889 PMV.AlignedAddrAlignment = Align(MinWordSize);
890
891 assert(ValueSize < MinWordSize);
892
893 PointerType *PtrTy = cast<PointerType>(Val: Addr->getType());
894 IntegerType *IntTy = DL.getIndexType(C&: Ctx, AddressSpace: PtrTy->getAddressSpace());
895 Value *PtrLSB;
896
897 if (AddrAlign < MinWordSize) {
898 PMV.AlignedAddr = Builder.CreateIntrinsic(
899 ID: Intrinsic::ptrmask, Types: {PtrTy, IntTy},
900 Args: {Addr, ConstantInt::getSigned(Ty: IntTy, V: ~(uint64_t)(MinWordSize - 1))},
901 FMFSource: nullptr, Name: "AlignedAddr");
902
903 Value *AddrInt = Builder.CreatePtrToInt(V: Addr, DestTy: IntTy);
904 PtrLSB = Builder.CreateAnd(LHS: AddrInt, RHS: MinWordSize - 1, Name: "PtrLSB");
905 } else {
906 // If the alignment is high enough, the LSB are known 0.
907 PMV.AlignedAddr = Addr;
908 PtrLSB = ConstantInt::getNullValue(Ty: IntTy);
909 }
910
911 if (DL.isLittleEndian()) {
912 // turn bytes into bits
913 PMV.ShiftAmt = Builder.CreateShl(LHS: PtrLSB, RHS: 3);
914 } else {
915 // turn bytes into bits, and count from the other side.
916 PMV.ShiftAmt = Builder.CreateShl(
917 LHS: Builder.CreateXor(LHS: PtrLSB, RHS: MinWordSize - ValueSize), RHS: 3);
918 }
919
920 PMV.ShiftAmt = Builder.CreateTrunc(V: PMV.ShiftAmt, DestTy: PMV.WordType, Name: "ShiftAmt");
921 PMV.Mask = Builder.CreateShl(
922 LHS: ConstantInt::get(Ty: PMV.WordType, V: (1 << (ValueSize * 8)) - 1), RHS: PMV.ShiftAmt,
923 Name: "Mask");
924
925 PMV.Inv_Mask = Builder.CreateNot(V: PMV.Mask, Name: "Inv_Mask");
926
927 return PMV;
928}
929
930static Value *extractMaskedValue(IRBuilderBase &Builder, Value *WideWord,
931 const PartwordMaskValues &PMV) {
932 assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
933 if (PMV.WordType == PMV.ValueType)
934 return WideWord;
935
936 Value *Shift = Builder.CreateLShr(LHS: WideWord, RHS: PMV.ShiftAmt, Name: "shifted");
937 Value *Trunc = Builder.CreateTrunc(V: Shift, DestTy: PMV.IntValueType, Name: "extracted");
938 return Builder.CreateBitCast(V: Trunc, DestTy: PMV.ValueType);
939}
940
941static Value *insertMaskedValue(IRBuilderBase &Builder, Value *WideWord,
942 Value *Updated, const PartwordMaskValues &PMV) {
943 assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
944 assert(Updated->getType() == PMV.ValueType && "Value type mismatch");
945 if (PMV.WordType == PMV.ValueType)
946 return Updated;
947
948 Updated = Builder.CreateBitCast(V: Updated, DestTy: PMV.IntValueType);
949
950 Value *ZExt = Builder.CreateZExt(V: Updated, DestTy: PMV.WordType, Name: "extended");
951 Value *Shift =
952 Builder.CreateShl(LHS: ZExt, RHS: PMV.ShiftAmt, Name: "shifted", /*HasNUW*/ true);
953 Value *And = Builder.CreateAnd(LHS: WideWord, RHS: PMV.Inv_Mask, Name: "unmasked");
954 Value *Or = Builder.CreateOr(LHS: And, RHS: Shift, Name: "inserted");
955 return Or;
956}
957
/// Emit IR to implement a masked version of a given atomicrmw
/// operation. (That is, only the bits under the Mask should be
/// affected by the operation)
///
/// \p Loaded is the current full-word value, \p Shifted_Inc is the operand
/// pre-shifted into position within the word (only provided for the ops
/// that use it; see expandPartwordAtomicRMW), and \p Inc is the original,
/// unshifted operand. Returns the new full-word value to store back.
static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
                                    IRBuilderBase &Builder, Value *Loaded,
                                    Value *Shifted_Inc, Value *Inc,
                                    const PartwordMaskValues &PMV) {
  // TODO: update to use
  // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge in order
  // to merge bits from two values without requiring PMV.Inv_Mask.
  switch (Op) {
  case AtomicRMWInst::Xchg: {
    // Exchange: clear the masked bits, then OR in the pre-shifted new value.
    Value *Loaded_MaskOut = Builder.CreateAnd(LHS: Loaded, RHS: PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: Shifted_Inc);
    return FinalVal;
  }
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::And:
    llvm_unreachable("Or/Xor/And handled by widenPartwordAtomicRMW");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Nand: {
    // The other arithmetic ops need to be masked into place.
    Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded, Val: Shifted_Inc);
    Value *NewVal_Masked = Builder.CreateAnd(LHS: NewVal, RHS: PMV.Mask);
    Value *Loaded_MaskOut = Builder.CreateAnd(LHS: Loaded, RHS: PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: NewVal_Masked);
    return FinalVal;
  }
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMaximum:
  case AtomicRMWInst::FMinimum:
  case AtomicRMWInst::FMaximumNum:
  case AtomicRMWInst::FMinimumNum:
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
  case AtomicRMWInst::USubCond:
  case AtomicRMWInst::USubSat: {
    // Finally, other ops will operate on the full value, so truncate down to
    // the original size, and expand out again after doing the
    // operation. Bitcasts will be inserted for FP values.
    Value *Loaded_Extract = extractMaskedValue(Builder, WideWord: Loaded, PMV);
    Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded: Loaded_Extract, Val: Inc);
    Value *FinalVal = insertMaskedValue(Builder, WideWord: Loaded, Updated: NewVal, PMV);
    return FinalVal;
  }
  default:
    llvm_unreachable("Unknown atomic op");
  }
}
1016
/// Expand a sub-word atomicrmw operation into an appropriate
/// word-sized operation.
///
/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
/// way as a typical atomicrmw expansion. The only difference here is
/// that the operation inside of the loop may operate upon only a
/// part of the value.
void AtomicExpandImpl::expandPartwordAtomicRMW(
    AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
  // Widen And/Or/Xor and give the target another chance at expanding it.
  AtomicRMWInst::BinOp Op = AI->getOperation();
  if (Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
      Op == AtomicRMWInst::And) {
    tryExpandAtomicRMW(AI: widenPartwordAtomicRMW(AI));
    return;
  }
  AtomicOrdering MemOpOrder = AI->getOrdering();
  SyncScope::ID SSID = AI->getSyncScopeID();

  ReplacementIRBuilder Builder(AI, *DL);

  // Compute the aligned word address, shift amount, and masks.
  PartwordMaskValues PMV =
      createMaskInstrs(Builder, I: AI, ValueType: AI->getType(), Addr: AI->getPointerOperand(),
                       AddrAlign: AI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / 8);

  // Xchg/Add/Sub/Nand work on the value in its shifted position within the
  // word, so the operand can be pre-shifted once, outside the loop.
  Value *ValOperand_Shifted = nullptr;
  if (Op == AtomicRMWInst::Xchg || Op == AtomicRMWInst::Add ||
      Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Nand) {
    Value *ValOp = Builder.CreateBitCast(V: AI->getValOperand(), DestTy: PMV.IntValueType);
    ValOperand_Shifted =
        Builder.CreateShl(LHS: Builder.CreateZExt(V: ValOp, DestTy: PMV.WordType), RHS: PMV.ShiftAmt,
                          Name: "ValOperand_Shifted");
  }

  // Invoked inside the loop on the currently loaded word; returns the new
  // word to attempt to store.
  auto PerformPartwordOp = [&](IRBuilderBase &Builder, Value *Loaded) {
    return performMaskedAtomicOp(Op, Builder, Loaded, Shifted_Inc: ValOperand_Shifted,
                                 Inc: AI->getValOperand(), PMV);
  };

  Value *OldResult;
  if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
    OldResult = insertRMWCmpXchgLoop(
        Builder, ResultType: PMV.WordType, Addr: PMV.AlignedAddr, AddrAlign: PMV.AlignedAddrAlignment,
        MemOpOrder, SSID, PerformOp: PerformPartwordOp, CreateCmpXchg: createCmpXchgInstFun, MetadataSrc: AI);
  } else {
    assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
    OldResult = insertRMWLLSCLoop(Builder, ResultTy: PMV.WordType, Addr: PMV.AlignedAddr,
                                  AddrAlign: PMV.AlignedAddrAlignment, MemOpOrder,
                                  PerformOp: PerformPartwordOp);
  }

  // The atomicrmw's result is the old sub-word value, extracted from the
  // old full word returned by the loop.
  Value *FinalOldResult = extractMaskedValue(Builder, WideWord: OldResult, PMV);
  AI->replaceAllUsesWith(V: FinalOldResult);
  AI->eraseFromParent();
}
1072
// Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width.
// Returns the new word-sized atomicrmw that replaces AI.
AtomicRMWInst *AtomicExpandImpl::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
  ReplacementIRBuilder Builder(AI, *DL);
  AtomicRMWInst::BinOp Op = AI->getOperation();

  assert((Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
          Op == AtomicRMWInst::And) &&
         "Unable to widen operation");

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, I: AI, ValueType: AI->getType(), Addr: AI->getPointerOperand(),
                       AddrAlign: AI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / 8);

  Value *ValOperand_Shifted =
      Builder.CreateShl(LHS: Builder.CreateZExt(V: AI->getValOperand(), DestTy: PMV.WordType),
                        RHS: PMV.ShiftAmt, Name: "ValOperand_Shifted");

  Value *NewOperand;

  // Or/Xor with the zero-extension padding leave the bits outside the
  // sub-word unchanged. For And, those bits must be 1 to be preserved
  // (x & 1 == x), so OR the inverted mask into the operand.
  if (Op == AtomicRMWInst::And)
    NewOperand =
        Builder.CreateOr(LHS: ValOperand_Shifted, RHS: PMV.Inv_Mask, Name: "AndOperand");
  else
    NewOperand = ValOperand_Shifted;

  AtomicRMWInst *NewAI = Builder.CreateAtomicRMW(
      Op, Ptr: PMV.AlignedAddr, Val: NewOperand, Align: PMV.AlignedAddrAlignment,
      Ordering: AI->getOrdering(), SSID: AI->getSyncScopeID());

  copyMetadataForAtomic(Dest&: *NewAI, Source: *AI);

  // Extract the old sub-word value from the wide result for AI's users.
  Value *FinalOldResult = extractMaskedValue(Builder, WideWord: NewAI, PMV);
  AI->replaceAllUsesWith(V: FinalOldResult);
  AI->eraseFromParent();
  return NewAI;
}
1109
/// Expand a sub-word cmpxchg into a cmpxchg on the containing, naturally
/// aligned word, with a retry loop so that concurrent changes to the other
/// bytes of the word do not cause spurious failures of a strong cmpxchg.
bool AtomicExpandImpl::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
  // The basic idea here is that we're expanding a cmpxchg of a
  // smaller memory size up to a word-sized cmpxchg. To do this, we
  // need to add a retry-loop for strong cmpxchg, so that
  // modifications to other parts of the word don't cause a spurious
  // failure.

  // This generates code like the following:
  //     [[Setup mask values PMV.*]]
  //     %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
  //     %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
  //     %InitLoaded = load i32* %addr
  //     %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
  //     br partword.cmpxchg.loop
  // partword.cmpxchg.loop:
  //     %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
  //        [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
  //     %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
  //     %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
  //     %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
  //        i32 %FullWord_NewVal success_ordering failure_ordering
  //     %OldVal = extractvalue { i32, i1 } %NewCI, 0
  //     %Success = extractvalue { i32, i1 } %NewCI, 1
  //     br i1 %Success, label %partword.cmpxchg.end,
  //        label %partword.cmpxchg.failure
  // partword.cmpxchg.failure:
  //     %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
  //     %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
  //     br i1 %ShouldContinue, label %partword.cmpxchg.loop,
  //        label %partword.cmpxchg.end
  // partword.cmpxchg.end:
  //    %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
  //    %FinalOldVal = trunc i32 %tmp1 to i8
  //    %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
  //    %Res = insertvalue { i8, i1 } %25, i1 %Success, 1

  Value *Addr = CI->getPointerOperand();
  Value *Cmp = CI->getCompareOperand();
  Value *NewVal = CI->getNewValOperand();

  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  ReplacementIRBuilder Builder(CI, *DL);
  LLVMContext &Ctx = Builder.getContext();

  BasicBlock *EndBB =
      BB->splitBasicBlock(I: CI->getIterator(), BBName: "partword.cmpxchg.end");
  auto FailureBB =
      BasicBlock::Create(Context&: Ctx, Name: "partword.cmpxchg.failure", Parent: F, InsertBefore: EndBB);
  auto LoopBB = BasicBlock::Create(Context&: Ctx, Name: "partword.cmpxchg.loop", Parent: F, InsertBefore: FailureBB);

  // The split call above "helpfully" added a branch at the end of BB
  // (to the wrong place).
  std::prev(x: BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, I: CI, ValueType: CI->getCompareOperand()->getType(), Addr,
                       AddrAlign: CI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / 8);

  // Shift the incoming values over, into the right location in the word.
  Value *NewVal_Shifted =
      Builder.CreateShl(LHS: Builder.CreateZExt(V: NewVal, DestTy: PMV.WordType), RHS: PMV.ShiftAmt);
  Value *Cmp_Shifted =
      Builder.CreateShl(LHS: Builder.CreateZExt(V: Cmp, DestTy: PMV.WordType), RHS: PMV.ShiftAmt);

  // Load the entire current word, and mask into place the expected and new
  // values
  LoadInst *InitLoaded = Builder.CreateLoad(Ty: PMV.WordType, Ptr: PMV.AlignedAddr);
  InitLoaded->setVolatile(CI->isVolatile());
  Value *InitLoaded_MaskOut = Builder.CreateAnd(LHS: InitLoaded, RHS: PMV.Inv_Mask);
  Builder.CreateBr(Dest: LoopBB);

  // partword.cmpxchg.loop:
  Builder.SetInsertPoint(LoopBB);
  // Carries the masked-out (other bytes) part of the most recent load.
  PHINode *Loaded_MaskOut = Builder.CreatePHI(Ty: PMV.WordType, NumReservedValues: 2);
  Loaded_MaskOut->addIncoming(V: InitLoaded_MaskOut, BB);

  // Mask/Or the expected and new values into place in the loaded word.
  Value *FullWord_NewVal = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: NewVal_Shifted);
  Value *FullWord_Cmp = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: Cmp_Shifted);
  AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
      Ptr: PMV.AlignedAddr, Cmp: FullWord_Cmp, New: FullWord_NewVal, Align: PMV.AlignedAddrAlignment,
      SuccessOrdering: CI->getSuccessOrdering(), FailureOrdering: CI->getFailureOrdering(), SSID: CI->getSyncScopeID());
  NewCI->setVolatile(CI->isVolatile());
  // When we're building a strong cmpxchg, we need a loop, so you
  // might think we could use a weak cmpxchg inside. But, using strong
  // allows the below comparison for ShouldContinue, and we're
  // expecting the underlying cmpxchg to be a machine instruction,
  // which is strong anyways.
  NewCI->setWeak(CI->isWeak());

  Value *OldVal = Builder.CreateExtractValue(Agg: NewCI, Idxs: 0);
  Value *Success = Builder.CreateExtractValue(Agg: NewCI, Idxs: 1);

  // A weak cmpxchg is allowed to fail spuriously, so no retry loop is needed.
  if (CI->isWeak())
    Builder.CreateBr(Dest: EndBB);
  else
    Builder.CreateCondBr(Cond: Success, True: EndBB, False: FailureBB);

  // partword.cmpxchg.failure:
  Builder.SetInsertPoint(FailureBB);
  // Upon failure, verify that the masked-out part of the loaded value
  // has been modified. If it didn't, abort the cmpxchg, since the
  // masked-in part must've.
  Value *OldVal_MaskOut = Builder.CreateAnd(LHS: OldVal, RHS: PMV.Inv_Mask);
  Value *ShouldContinue = Builder.CreateICmpNE(LHS: Loaded_MaskOut, RHS: OldVal_MaskOut);
  Builder.CreateCondBr(Cond: ShouldContinue, True: LoopBB, False: EndBB);

  // Add the second value to the phi from above
  Loaded_MaskOut->addIncoming(V: OldVal_MaskOut, BB: FailureBB);

  // partword.cmpxchg.end:
  Builder.SetInsertPoint(CI);

  // Reassemble the { value, success } result pair with the narrow old value.
  Value *FinalOldVal = extractMaskedValue(Builder, WideWord: OldVal, PMV);
  Value *Res = PoisonValue::get(T: CI->getType());
  Res = Builder.CreateInsertValue(Agg: Res, Val: FinalOldVal, Idxs: 0);
  Res = Builder.CreateInsertValue(Agg: Res, Val: Success, Idxs: 1);

  CI->replaceAllUsesWith(V: Res);
  CI->eraseFromParent();
  return true;
}
1234
1235void AtomicExpandImpl::expandAtomicOpToLLSC(
1236 Instruction *I, Type *ResultType, Value *Addr, Align AddrAlign,
1237 AtomicOrdering MemOpOrder,
1238 function_ref<Value *(IRBuilderBase &, Value *)> PerformOp) {
1239 ReplacementIRBuilder Builder(I, *DL);
1240 Value *Loaded = insertRMWLLSCLoop(Builder, ResultTy: ResultType, Addr, AddrAlign,
1241 MemOpOrder, PerformOp);
1242
1243 I->replaceAllUsesWith(V: Loaded);
1244 I->eraseFromParent();
1245}
1246
/// Lower a partword atomicrmw via a target-supplied masked intrinsic
/// (TLI->emitMaskedAtomicRMWIntrinsic) operating on the containing word.
void AtomicExpandImpl::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) {
  ReplacementIRBuilder Builder(AI, *DL);

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, I: AI, ValueType: AI->getType(), Addr: AI->getPointerOperand(),
                       AddrAlign: AI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / 8);

  // The value operand must be sign-extended for signed min/max so that the
  // target's signed comparison instructions can be used. Otherwise, just
  // zero-ext.
  Instruction::CastOps CastOp = Instruction::ZExt;
  AtomicRMWInst::BinOp RMWOp = AI->getOperation();
  if (RMWOp == AtomicRMWInst::Max || RMWOp == AtomicRMWInst::Min)
    CastOp = Instruction::SExt;

  // Move the operand into its position within the containing word.
  Value *ValOperand_Shifted = Builder.CreateShl(
      LHS: Builder.CreateCast(Op: CastOp, V: AI->getValOperand(), DestTy: PMV.WordType),
      RHS: PMV.ShiftAmt, Name: "ValOperand_Shifted");
  Value *OldResult = TLI->emitMaskedAtomicRMWIntrinsic(
      Builder, AI, AlignedAddr: PMV.AlignedAddr, Incr: ValOperand_Shifted, Mask: PMV.Mask, ShiftAmt: PMV.ShiftAmt,
      Ord: AI->getOrdering());
  // Extract the old sub-word value from the wide result for AI's users.
  Value *FinalOldResult = extractMaskedValue(Builder, WideWord: OldResult, PMV);
  AI->replaceAllUsesWith(V: FinalOldResult);
  AI->eraseFromParent();
}
1272
/// Lower a partword cmpxchg via a target-supplied masked intrinsic
/// (TLI->emitMaskedAtomicCmpXchgIntrinsic) operating on the containing word.
void AtomicExpandImpl::expandAtomicCmpXchgToMaskedIntrinsic(
    AtomicCmpXchgInst *CI) {
  ReplacementIRBuilder Builder(CI, *DL);

  PartwordMaskValues PMV = createMaskInstrs(
      Builder, I: CI, ValueType: CI->getCompareOperand()->getType(), Addr: CI->getPointerOperand(),
      AddrAlign: CI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / 8);

  // Move both operands into their positions within the containing word.
  Value *CmpVal_Shifted = Builder.CreateShl(
      LHS: Builder.CreateZExt(V: CI->getCompareOperand(), DestTy: PMV.WordType), RHS: PMV.ShiftAmt,
      Name: "CmpVal_Shifted");
  Value *NewVal_Shifted = Builder.CreateShl(
      LHS: Builder.CreateZExt(V: CI->getNewValOperand(), DestTy: PMV.WordType), RHS: PMV.ShiftAmt,
      Name: "NewVal_Shifted");
  Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic(
      Builder, CI, AlignedAddr: PMV.AlignedAddr, CmpVal: CmpVal_Shifted, NewVal: NewVal_Shifted, Mask: PMV.Mask,
      Ord: CI->getMergedOrdering());
  // Rebuild the { value, success } pair: success iff the masked part of the
  // returned old word equals the (shifted) expected value.
  Value *FinalOldVal = extractMaskedValue(Builder, WideWord: OldVal, PMV);
  Value *Res = PoisonValue::get(T: CI->getType());
  Res = Builder.CreateInsertValue(Agg: Res, Val: FinalOldVal, Idxs: 0);
  Value *Success = Builder.CreateICmpEQ(
      LHS: CmpVal_Shifted, RHS: Builder.CreateAnd(LHS: OldVal, RHS: PMV.Mask), Name: "Success");
  Res = Builder.CreateInsertValue(Agg: Res, Val: Success, Idxs: 1);

  CI->replaceAllUsesWith(V: Res);
  CI->eraseFromParent();
}
1300
/// Emit a load-linked/store-conditional loop that applies \p PerformOp to
/// the loaded value and retries until the conditional store succeeds.
/// Returns the loaded (pre-operation) value; on return, \p Builder's insert
/// point is at the start of the exit block.
Value *AtomicExpandImpl::insertRMWLLSCLoop(
    IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
    AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilderBase &, Value *)> PerformOp) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  assert(AddrAlign >= F->getDataLayout().getTypeStoreSize(ResultTy) &&
         "Expected at least natural alignment at this point.");

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  // atomicrmw.start:
  //     %loaded = @load.linked(%addr)
  //     %new = some_op iN %loaded, %incr
  //     %stored = @store_conditional(%new, %addr)
  //     %try_again = icmp i32 ne %stored, 0
  //     br i1 %try_again, label %loop, label %atomicrmw.end
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.start", Parent: F, InsertBefore: ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place).
  std::prev(x: BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(Dest: LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, ValueTy: ResultTy, Addr, Ord: MemOpOrder);

  Value *NewVal = PerformOp(Builder, Loaded);

  // A nonzero status from the conditional store means the reservation was
  // lost; retry the whole loop.
  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, Val: NewVal, Addr, Ord: MemOpOrder);
  Value *TryAgain = Builder.CreateICmpNE(
      LHS: StoreSuccess, RHS: ConstantInt::get(Ty: IntegerType::get(C&: Ctx, NumBits: 32), V: 0), Name: "tryagain");

  Instruction *CondBr = Builder.CreateCondBr(Cond: TryAgain, True: LoopBB, False: ExitBB);

  // Atomic RMW expands to a Load-linked / Store-Conditional loop, because it is
  // hard to predict precise branch weights we mark the branch as "unknown"
  // (50/50) to prevent misleading optimizations.
  setExplicitlyUnknownBranchWeightsIfProfiled(I&: *CondBr, DEBUG_TYPE);

  Builder.SetInsertPoint(TheBB: ExitBB, IP: ExitBB->begin());
  return Loaded;
}
1355
/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
/// IR. As a migration step, we convert back to what used to be the standard
/// way to represent a pointer cmpxchg so that we can update backends one by
/// one.
AtomicCmpXchgInst *
AtomicExpandImpl::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
  auto *M = CI->getModule();
  Type *NewTy = getCorrespondingIntegerType(T: CI->getCompareOperand()->getType(),
                                            DL: M->getDataLayout());

  ReplacementIRBuilder Builder(CI, *DL);

  Value *Addr = CI->getPointerOperand();

  // Convert both value operands to the equivalent-width integer type.
  Value *NewCmp = Builder.CreatePtrToInt(V: CI->getCompareOperand(), DestTy: NewTy);
  Value *NewNewVal = Builder.CreatePtrToInt(V: CI->getNewValOperand(), DestTy: NewTy);

  // Recreate the cmpxchg with identical orderings, scope, and flags.
  auto *NewCI = Builder.CreateAtomicCmpXchg(
      Ptr: Addr, Cmp: NewCmp, New: NewNewVal, Align: CI->getAlign(), SuccessOrdering: CI->getSuccessOrdering(),
      FailureOrdering: CI->getFailureOrdering(), SSID: CI->getSyncScopeID());
  NewCI->setVolatile(CI->isVolatile());
  NewCI->setWeak(CI->isWeak());
  LLVM_DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");

  Value *OldVal = Builder.CreateExtractValue(Agg: NewCI, Idxs: 0);
  Value *Succ = Builder.CreateExtractValue(Agg: NewCI, Idxs: 1);

  // Convert the returned old value back to the original pointer type.
  OldVal = Builder.CreateIntToPtr(V: OldVal, DestTy: CI->getCompareOperand()->getType());

  Value *Res = PoisonValue::get(T: CI->getType());
  Res = Builder.CreateInsertValue(Agg: Res, Val: OldVal, Idxs: 0);
  Res = Builder.CreateInsertValue(Agg: Res, Val: Succ, Idxs: 1);

  CI->replaceAllUsesWith(V: Res);
  CI->eraseFromParent();
  return NewCI;
}
1394
1395bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
1396 AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
1397 AtomicOrdering FailureOrder = CI->getFailureOrdering();
1398 Value *Addr = CI->getPointerOperand();
1399 BasicBlock *BB = CI->getParent();
1400 Function *F = BB->getParent();
1401 LLVMContext &Ctx = F->getContext();
1402 // If shouldInsertFencesForAtomic() returns true, then the target does not
1403 // want to deal with memory orders, and emitLeading/TrailingFence should take
1404 // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
1405 // should preserve the ordering.
1406 bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(I: CI);
1407 AtomicOrdering MemOpOrder = ShouldInsertFencesForAtomic
1408 ? AtomicOrdering::Monotonic
1409 : CI->getMergedOrdering();
1410
1411 // In implementations which use a barrier to achieve release semantics, we can
1412 // delay emitting this barrier until we know a store is actually going to be
1413 // attempted. The cost of this delay is that we need 2 copies of the block
1414 // emitting the load-linked, affecting code size.
1415 //
1416 // Ideally, this logic would be unconditional except for the minsize check
1417 // since in other cases the extra blocks naturally collapse down to the
1418 // minimal loop. Unfortunately, this puts too much stress on later
1419 // optimisations so we avoid emitting the extra logic in those cases too.
1420 bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
1421 SuccessOrder != AtomicOrdering::Monotonic &&
1422 SuccessOrder != AtomicOrdering::Acquire &&
1423 !F->hasMinSize();
1424
1425 // There's no overhead for sinking the release barrier in a weak cmpxchg, so
1426 // do it even on minsize.
1427 bool UseUnconditionalReleaseBarrier = F->hasMinSize() && !CI->isWeak();
1428
1429 // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
1430 //
1431 // The full expansion we produce is:
1432 // [...]
1433 // %aligned.addr = ...
1434 // cmpxchg.start:
1435 // %unreleasedload = @load.linked(%aligned.addr)
1436 // %unreleasedload.extract = extract value from %unreleasedload
1437 // %should_store = icmp eq %unreleasedload.extract, %desired
1438 // br i1 %should_store, label %cmpxchg.releasingstore,
1439 // label %cmpxchg.nostore
1440 // cmpxchg.releasingstore:
1441 // fence?
1442 // br label cmpxchg.trystore
1443 // cmpxchg.trystore:
1444 // %loaded.trystore = phi [%unreleasedload, %cmpxchg.releasingstore],
1445 // [%releasedload, %cmpxchg.releasedload]
1446 // %updated.new = insert %new into %loaded.trystore
1447 // %stored = @store_conditional(%updated.new, %aligned.addr)
1448 // %success = icmp eq i32 %stored, 0
1449 // br i1 %success, label %cmpxchg.success,
1450 // label %cmpxchg.releasedload/%cmpxchg.failure
1451 // cmpxchg.releasedload:
1452 // %releasedload = @load.linked(%aligned.addr)
1453 // %releasedload.extract = extract value from %releasedload
1454 // %should_store = icmp eq %releasedload.extract, %desired
1455 // br i1 %should_store, label %cmpxchg.trystore,
1456 // label %cmpxchg.failure
1457 // cmpxchg.success:
1458 // fence?
1459 // br label %cmpxchg.end
1460 // cmpxchg.nostore:
1461 // %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
1462 // [%releasedload,
1463 // %cmpxchg.releasedload/%cmpxchg.trystore]
1464 // @load_linked_fail_balance()?
1465 // br label %cmpxchg.failure
1466 // cmpxchg.failure:
1467 // fence?
1468 // br label %cmpxchg.end
1469 // cmpxchg.end:
1470 // %loaded.exit = phi [%loaded.nostore, %cmpxchg.failure],
1471 // [%loaded.trystore, %cmpxchg.trystore]
1472 // %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
1473 // %loaded = extract value from %loaded.exit
1474 // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
1475 // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
1476 // [...]
1477 BasicBlock *ExitBB = BB->splitBasicBlock(I: CI->getIterator(), BBName: "cmpxchg.end");
1478 auto FailureBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.failure", Parent: F, InsertBefore: ExitBB);
1479 auto NoStoreBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.nostore", Parent: F, InsertBefore: FailureBB);
1480 auto SuccessBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.success", Parent: F, InsertBefore: NoStoreBB);
1481 auto ReleasedLoadBB =
1482 BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.releasedload", Parent: F, InsertBefore: SuccessBB);
1483 auto TryStoreBB =
1484 BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.trystore", Parent: F, InsertBefore: ReleasedLoadBB);
1485 auto ReleasingStoreBB =
1486 BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.fencedstore", Parent: F, InsertBefore: TryStoreBB);
1487 auto StartBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.start", Parent: F, InsertBefore: ReleasingStoreBB);
1488
1489 ReplacementIRBuilder Builder(CI, *DL);
1490
1491 // The split call above "helpfully" added a branch at the end of BB (to the
1492 // wrong place), but we might want a fence too. It's easiest to just remove
1493 // the branch entirely.
1494 std::prev(x: BB->end())->eraseFromParent();
1495 Builder.SetInsertPoint(BB);
1496 if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
1497 TLI->emitLeadingFence(Builder, Inst: CI, Ord: SuccessOrder);
1498
1499 PartwordMaskValues PMV =
1500 createMaskInstrs(Builder, I: CI, ValueType: CI->getCompareOperand()->getType(), Addr,
1501 AddrAlign: CI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / 8);
1502 Builder.CreateBr(Dest: StartBB);
1503
1504 // Start the main loop block now that we've taken care of the preliminaries.
1505 Builder.SetInsertPoint(StartBB);
1506 Value *UnreleasedLoad =
1507 TLI->emitLoadLinked(Builder, ValueTy: PMV.WordType, Addr: PMV.AlignedAddr, Ord: MemOpOrder);
1508 Value *UnreleasedLoadExtract =
1509 extractMaskedValue(Builder, WideWord: UnreleasedLoad, PMV);
1510 Value *ShouldStore = Builder.CreateICmpEQ(
1511 LHS: UnreleasedLoadExtract, RHS: CI->getCompareOperand(), Name: "should_store");
1512
1513 // If the cmpxchg doesn't actually need any ordering when it fails, we can
1514 // jump straight past that fence instruction (if it exists).
1515 Builder.CreateCondBr(Cond: ShouldStore, True: ReleasingStoreBB, False: NoStoreBB,
1516 BranchWeights: MDBuilder(F->getContext()).createLikelyBranchWeights());
1517
1518 Builder.SetInsertPoint(ReleasingStoreBB);
1519 if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
1520 TLI->emitLeadingFence(Builder, Inst: CI, Ord: SuccessOrder);
1521 Builder.CreateBr(Dest: TryStoreBB);
1522
1523 Builder.SetInsertPoint(TryStoreBB);
1524 PHINode *LoadedTryStore =
1525 Builder.CreatePHI(Ty: PMV.WordType, NumReservedValues: 2, Name: "loaded.trystore");
1526 LoadedTryStore->addIncoming(V: UnreleasedLoad, BB: ReleasingStoreBB);
1527 Value *NewValueInsert =
1528 insertMaskedValue(Builder, WideWord: LoadedTryStore, Updated: CI->getNewValOperand(), PMV);
1529 Value *StoreSuccess = TLI->emitStoreConditional(Builder, Val: NewValueInsert,
1530 Addr: PMV.AlignedAddr, Ord: MemOpOrder);
1531 StoreSuccess = Builder.CreateICmpEQ(
1532 LHS: StoreSuccess, RHS: ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: 0), Name: "success");
1533 BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
1534 Builder.CreateCondBr(Cond: StoreSuccess, True: SuccessBB,
1535 False: CI->isWeak() ? FailureBB : RetryBB,
1536 BranchWeights: MDBuilder(F->getContext()).createLikelyBranchWeights());
1537
1538 Builder.SetInsertPoint(ReleasedLoadBB);
1539 Value *SecondLoad;
1540 if (HasReleasedLoadBB) {
1541 SecondLoad =
1542 TLI->emitLoadLinked(Builder, ValueTy: PMV.WordType, Addr: PMV.AlignedAddr, Ord: MemOpOrder);
1543 Value *SecondLoadExtract = extractMaskedValue(Builder, WideWord: SecondLoad, PMV);
1544 ShouldStore = Builder.CreateICmpEQ(LHS: SecondLoadExtract,
1545 RHS: CI->getCompareOperand(), Name: "should_store");
1546
1547 // If the cmpxchg doesn't actually need any ordering when it fails, we can
1548 // jump straight past that fence instruction (if it exists).
1549 Builder.CreateCondBr(
1550 Cond: ShouldStore, True: TryStoreBB, False: NoStoreBB,
1551 BranchWeights: MDBuilder(F->getContext()).createLikelyBranchWeights());
1552 // Update PHI node in TryStoreBB.
1553 LoadedTryStore->addIncoming(V: SecondLoad, BB: ReleasedLoadBB);
1554 } else
1555 Builder.CreateUnreachable();
1556
1557 // Make sure later instructions don't get reordered with a fence if
1558 // necessary.
1559 Builder.SetInsertPoint(SuccessBB);
1560 if (ShouldInsertFencesForAtomic ||
1561 TLI->shouldInsertTrailingSeqCstFenceForAtomicStore(I: CI))
1562 TLI->emitTrailingFence(Builder, Inst: CI, Ord: SuccessOrder);
1563 Builder.CreateBr(Dest: ExitBB);
1564
1565 Builder.SetInsertPoint(NoStoreBB);
1566 PHINode *LoadedNoStore =
1567 Builder.CreatePHI(Ty: UnreleasedLoad->getType(), NumReservedValues: 2, Name: "loaded.nostore");
1568 LoadedNoStore->addIncoming(V: UnreleasedLoad, BB: StartBB);
1569 if (HasReleasedLoadBB)
1570 LoadedNoStore->addIncoming(V: SecondLoad, BB: ReleasedLoadBB);
1571
1572 // In the failing case, where we don't execute the store-conditional, the
1573 // target might want to balance out the load-linked with a dedicated
1574 // instruction (e.g., on ARM, clearing the exclusive monitor).
1575 TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
1576 Builder.CreateBr(Dest: FailureBB);
1577
1578 Builder.SetInsertPoint(FailureBB);
1579 PHINode *LoadedFailure =
1580 Builder.CreatePHI(Ty: UnreleasedLoad->getType(), NumReservedValues: 2, Name: "loaded.failure");
1581 LoadedFailure->addIncoming(V: LoadedNoStore, BB: NoStoreBB);
1582 if (CI->isWeak())
1583 LoadedFailure->addIncoming(V: LoadedTryStore, BB: TryStoreBB);
1584 if (ShouldInsertFencesForAtomic)
1585 TLI->emitTrailingFence(Builder, Inst: CI, Ord: FailureOrder);
1586 Builder.CreateBr(Dest: ExitBB);
1587
1588 // Finally, we have control-flow based knowledge of whether the cmpxchg
1589 // succeeded or not. We expose this to later passes by converting any
1590 // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
1591 // PHI.
1592 Builder.SetInsertPoint(TheBB: ExitBB, IP: ExitBB->begin());
1593 PHINode *LoadedExit =
1594 Builder.CreatePHI(Ty: UnreleasedLoad->getType(), NumReservedValues: 2, Name: "loaded.exit");
1595 LoadedExit->addIncoming(V: LoadedTryStore, BB: SuccessBB);
1596 LoadedExit->addIncoming(V: LoadedFailure, BB: FailureBB);
1597 PHINode *Success = Builder.CreatePHI(Ty: Type::getInt1Ty(C&: Ctx), NumReservedValues: 2, Name: "success");
1598 Success->addIncoming(V: ConstantInt::getTrue(Context&: Ctx), BB: SuccessBB);
1599 Success->addIncoming(V: ConstantInt::getFalse(Context&: Ctx), BB: FailureBB);
1600
1601 // This is the "exit value" from the cmpxchg expansion. It may be of
1602 // a type wider than the one in the cmpxchg instruction.
1603 Value *LoadedFull = LoadedExit;
1604
1605 Builder.SetInsertPoint(TheBB: ExitBB, IP: std::next(x: Success->getIterator()));
1606 Value *Loaded = extractMaskedValue(Builder, WideWord: LoadedFull, PMV);
1607
1608 // Look for any users of the cmpxchg that are just comparing the loaded value
1609 // against the desired one, and replace them with the CFG-derived version.
1610 SmallVector<ExtractValueInst *, 2> PrunedInsts;
1611 for (auto *User : CI->users()) {
1612 ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Val: User);
1613 if (!EV)
1614 continue;
1615
1616 assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
1617 "weird extraction from { iN, i1 }");
1618
1619 if (EV->getIndices()[0] == 0)
1620 EV->replaceAllUsesWith(V: Loaded);
1621 else
1622 EV->replaceAllUsesWith(V: Success);
1623
1624 PrunedInsts.push_back(Elt: EV);
1625 }
1626
1627 // We can remove the instructions now we're no longer iterating through them.
1628 for (auto *EV : PrunedInsts)
1629 EV->eraseFromParent();
1630
1631 if (!CI->use_empty()) {
1632 // Some use of the full struct return that we don't understand has happened,
1633 // so we've got to reconstruct it properly.
1634 Value *Res;
1635 Res = Builder.CreateInsertValue(Agg: PoisonValue::get(T: CI->getType()), Val: Loaded, Idxs: 0);
1636 Res = Builder.CreateInsertValue(Agg: Res, Val: Success, Idxs: 1);
1637
1638 CI->replaceAllUsesWith(V: Res);
1639 }
1640
1641 CI->eraseFromParent();
1642 return true;
1643}
1644
1645bool AtomicExpandImpl::isIdempotentRMW(AtomicRMWInst *RMWI) {
1646 // TODO: Add floating point support.
1647 auto C = dyn_cast<ConstantInt>(Val: RMWI->getValOperand());
1648 if (!C)
1649 return false;
1650
1651 switch (RMWI->getOperation()) {
1652 case AtomicRMWInst::Add:
1653 case AtomicRMWInst::Sub:
1654 case AtomicRMWInst::Or:
1655 case AtomicRMWInst::Xor:
1656 return C->isZero();
1657 case AtomicRMWInst::And:
1658 return C->isMinusOne();
1659 case AtomicRMWInst::Min:
1660 return C->isMaxValue(IsSigned: true);
1661 case AtomicRMWInst::Max:
1662 return C->isMinValue(IsSigned: true);
1663 case AtomicRMWInst::UMin:
1664 return C->isMaxValue(IsSigned: false);
1665 case AtomicRMWInst::UMax:
1666 return C->isMinValue(IsSigned: false);
1667 default:
1668 return false;
1669 }
1670}
1671
1672bool AtomicExpandImpl::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
1673 if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
1674 tryExpandAtomicLoad(LI: ResultingLoad);
1675 return true;
1676 }
1677 return false;
1678}
1679
// Emit a load + cmpxchg retry loop that atomically applies PerformOp to the
// value at Addr. PerformOp computes the desired new value from the currently
// observed one; CreateCmpXchg emits the compare-exchange itself (a real
// cmpxchg instruction, a masked form, or a libcall-flavored one). Returns the
// value observed by the final, successful cmpxchg iteration (the atomic "old"
// value), with the builder left at the start of the exit block.
Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
    IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
    AtomicOrdering MemOpOrder, SyncScope::ID SSID,
    function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
    CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.start", Parent: F, InsertBefore: ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(x: BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateAlignedLoad(Ty: ResultTy, Ptr: Addr, Align: AddrAlign);
  // TODO: The initial load must be atomic with the same synchronization scope
  // to avoid a data race with concurrent stores. If the instruction being
  // emulated is volatile, issue a volatile load.
  Builder.CreateBr(Dest: LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(Ty: ResultTy, NumReservedValues: 2, Name: "loaded");
  Loaded->addIncoming(V: InitLoaded, BB);

  // Let the caller compute the updated value from the observed one.
  Value *NewVal = PerformOp(Builder, Loaded);

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  // cmpxchg does not accept the Unordered ordering; Monotonic is the weakest
  // ordering it supports, so weaken Unordered to that.
  CreateCmpXchg(Builder, Addr, Loaded, NewVal, AddrAlign,
                MemOpOrder == AtomicOrdering::Unordered
                    ? AtomicOrdering::Monotonic
                    : MemOpOrder,
                SSID, Success, NewLoaded, MetadataSrc);
  assert(Success && NewLoaded);

  // Close the loop: the value observed by a failed cmpxchg feeds the next
  // iteration's PHI.
  Loaded->addIncoming(V: NewLoaded, BB: LoopBB);

  Instruction *CondBr = Builder.CreateCondBr(Cond: Success, True: ExitBB, False: LoopBB);

  // Atomic RMW expands to a cmpxchg loop, Since precise branch weights
  // cannot be easily determined here, we mark the branch as "unknown" (50/50)
  // to prevent misleading optimizations.
  setExplicitlyUnknownBranchWeightsIfProfiled(I&: *CondBr, DEBUG_TYPE);

  Builder.SetInsertPoint(TheBB: ExitBB, IP: ExitBB->begin());
  return NewLoaded;
}
1748
1749bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
1750 unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
1751 unsigned ValueSize = getAtomicOpSize(CASI: CI);
1752
1753 switch (TLI->shouldExpandAtomicCmpXchgInIR(AI: CI)) {
1754 default:
1755 llvm_unreachable("Unhandled case in tryExpandAtomicCmpXchg");
1756 case TargetLoweringBase::AtomicExpansionKind::None:
1757 if (ValueSize < MinCASSize)
1758 return expandPartwordCmpXchg(CI);
1759 return false;
1760 case TargetLoweringBase::AtomicExpansionKind::LLSC: {
1761 return expandAtomicCmpXchg(CI);
1762 }
1763 case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic:
1764 expandAtomicCmpXchgToMaskedIntrinsic(CI);
1765 return true;
1766 case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
1767 return lowerAtomicCmpXchgInst(CXI: CI);
1768 case TargetLoweringBase::AtomicExpansionKind::CustomExpand: {
1769 TLI->emitExpandAtomicCmpXchg(CI);
1770 return true;
1771 }
1772 }
1773}
1774
1775bool AtomicExpandImpl::expandAtomicRMWToCmpXchg(
1776 AtomicRMWInst *AI, CreateCmpXchgInstFun CreateCmpXchg) {
1777 ReplacementIRBuilder Builder(AI, AI->getDataLayout());
1778 Builder.setIsFPConstrained(
1779 AI->getFunction()->hasFnAttribute(Kind: Attribute::StrictFP));
1780
1781 // FIXME: If FP exceptions are observable, we should force them off for the
1782 // loop for the FP atomics.
1783 Value *Loaded = AtomicExpandImpl::insertRMWCmpXchgLoop(
1784 Builder, ResultTy: AI->getType(), Addr: AI->getPointerOperand(), AddrAlign: AI->getAlign(),
1785 MemOpOrder: AI->getOrdering(), SSID: AI->getSyncScopeID(),
1786 PerformOp: [&](IRBuilderBase &Builder, Value *Loaded) {
1787 return buildAtomicRMWValue(Op: AI->getOperation(), Builder, Loaded,
1788 Val: AI->getValOperand());
1789 },
1790 CreateCmpXchg, /*MetadataSrc=*/AI);
1791
1792 AI->replaceAllUsesWith(V: Loaded);
1793 AI->eraseFromParent();
1794 return true;
1795}
1796
1797// In order to use one of the sized library calls such as
1798// __atomic_fetch_add_4, the alignment must be sufficient, the size
1799// must be one of the potentially-specialized sizes, and the value
1800// type must actually exist in C on the target (otherwise, the
1801// function wouldn't actually be defined.)
1802static bool canUseSizedAtomicCall(unsigned Size, Align Alignment,
1803 const DataLayout &DL) {
1804 // TODO: "LargestSize" is an approximation for "largest type that
1805 // you can express in C". It seems to be the case that int128 is
1806 // supported on all 64-bit platforms, otherwise only up to 64-bit
1807 // integers are supported. If we get this wrong, then we'll try to
1808 // call a sized libcall that doesn't actually exist. There should
1809 // really be some more reliable way in LLVM of determining integer
1810 // sizes which are valid in the target's C ABI...
1811 unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
1812 return Alignment >= Size &&
1813 (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
1814 Size <= LargestSize;
1815}
1816
1817void AtomicExpandImpl::expandAtomicLoadToLibcall(LoadInst *I) {
1818 static const RTLIB::Libcall Libcalls[6] = {
1819 RTLIB::ATOMIC_LOAD, RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
1820 RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
1821 unsigned Size = getAtomicOpSize(LI: I);
1822
1823 bool expanded = expandAtomicOpToLibcall(
1824 I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: nullptr, CASExpected: nullptr,
1825 Ordering: I->getOrdering(), Ordering2: AtomicOrdering::NotAtomic, Libcalls);
1826 if (!expanded)
1827 handleFailure(FailedInst&: *I, Msg: "unsupported atomic load");
1828}
1829
1830void AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) {
1831 static const RTLIB::Libcall Libcalls[6] = {
1832 RTLIB::ATOMIC_STORE, RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
1833 RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
1834 unsigned Size = getAtomicOpSize(SI: I);
1835
1836 bool expanded = expandAtomicOpToLibcall(
1837 I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: I->getValueOperand(),
1838 CASExpected: nullptr, Ordering: I->getOrdering(), Ordering2: AtomicOrdering::NotAtomic, Libcalls);
1839 if (!expanded)
1840 handleFailure(FailedInst&: *I, Msg: "unsupported atomic store");
1841}
1842
1843void AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
1844 static const RTLIB::Libcall Libcalls[6] = {
1845 RTLIB::ATOMIC_COMPARE_EXCHANGE, RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
1846 RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
1847 RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
1848 unsigned Size = getAtomicOpSize(CASI: I);
1849
1850 bool expanded = expandAtomicOpToLibcall(
1851 I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: I->getNewValOperand(),
1852 CASExpected: I->getCompareOperand(), Ordering: I->getSuccessOrdering(), Ordering2: I->getFailureOrdering(),
1853 Libcalls);
1854 if (!expanded)
1855 handleFailure(FailedInst&: *I, Msg: "unsupported cmpxchg");
1856}
1857
1858static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
1859 static const RTLIB::Libcall LibcallsXchg[6] = {
1860 RTLIB::ATOMIC_EXCHANGE, RTLIB::ATOMIC_EXCHANGE_1,
1861 RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
1862 RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
1863 static const RTLIB::Libcall LibcallsAdd[6] = {
1864 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_ADD_1,
1865 RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
1866 RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
1867 static const RTLIB::Libcall LibcallsSub[6] = {
1868 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_SUB_1,
1869 RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
1870 RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
1871 static const RTLIB::Libcall LibcallsAnd[6] = {
1872 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_AND_1,
1873 RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
1874 RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
1875 static const RTLIB::Libcall LibcallsOr[6] = {
1876 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_OR_1,
1877 RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
1878 RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
1879 static const RTLIB::Libcall LibcallsXor[6] = {
1880 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_XOR_1,
1881 RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
1882 RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
1883 static const RTLIB::Libcall LibcallsNand[6] = {
1884 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_NAND_1,
1885 RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
1886 RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};
1887
1888 switch (Op) {
1889 case AtomicRMWInst::BAD_BINOP:
1890 llvm_unreachable("Should not have BAD_BINOP.");
1891 case AtomicRMWInst::Xchg:
1892 return ArrayRef(LibcallsXchg);
1893 case AtomicRMWInst::Add:
1894 return ArrayRef(LibcallsAdd);
1895 case AtomicRMWInst::Sub:
1896 return ArrayRef(LibcallsSub);
1897 case AtomicRMWInst::And:
1898 return ArrayRef(LibcallsAnd);
1899 case AtomicRMWInst::Or:
1900 return ArrayRef(LibcallsOr);
1901 case AtomicRMWInst::Xor:
1902 return ArrayRef(LibcallsXor);
1903 case AtomicRMWInst::Nand:
1904 return ArrayRef(LibcallsNand);
1905 case AtomicRMWInst::Max:
1906 case AtomicRMWInst::Min:
1907 case AtomicRMWInst::UMax:
1908 case AtomicRMWInst::UMin:
1909 case AtomicRMWInst::FMax:
1910 case AtomicRMWInst::FMin:
1911 case AtomicRMWInst::FMaximum:
1912 case AtomicRMWInst::FMinimum:
1913 case AtomicRMWInst::FMaximumNum:
1914 case AtomicRMWInst::FMinimumNum:
1915 case AtomicRMWInst::FAdd:
1916 case AtomicRMWInst::FSub:
1917 case AtomicRMWInst::UIncWrap:
1918 case AtomicRMWInst::UDecWrap:
1919 case AtomicRMWInst::USubCond:
1920 case AtomicRMWInst::USubSat:
1921 // No atomic libcalls are available for these.
1922 return {};
1923 }
1924 llvm_unreachable("Unexpected AtomicRMW operation.");
1925}
1926
1927void AtomicExpandImpl::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
1928 ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(Op: I->getOperation());
1929
1930 unsigned Size = getAtomicOpSize(RMWI: I);
1931
1932 bool Success = false;
1933 if (!Libcalls.empty())
1934 Success = expandAtomicOpToLibcall(
1935 I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: I->getValOperand(),
1936 CASExpected: nullptr, Ordering: I->getOrdering(), Ordering2: AtomicOrdering::NotAtomic, Libcalls);
1937
1938 // The expansion failed: either there were no libcalls at all for
1939 // the operation (min/max), or there were only size-specialized
1940 // libcalls (add/sub/etc) and we needed a generic. So, expand to a
1941 // CAS libcall, via a CAS loop, instead.
1942 if (!Success) {
1943 expandAtomicRMWToCmpXchg(
1944 AI: I, CreateCmpXchg: [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded,
1945 Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder,
1946 SyncScope::ID SSID, Value *&Success, Value *&NewLoaded,
1947 Instruction *MetadataSrc) {
1948 // Create the CAS instruction normally...
1949 AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
1950 Ptr: Addr, Cmp: Loaded, New: NewVal, Align: Alignment, SuccessOrdering: MemOpOrder,
1951 FailureOrdering: AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering: MemOpOrder), SSID);
1952 if (MetadataSrc)
1953 copyMetadataForAtomic(Dest&: *Pair, Source: *MetadataSrc);
1954
1955 Success = Builder.CreateExtractValue(Agg: Pair, Idxs: 1, Name: "success");
1956 NewLoaded = Builder.CreateExtractValue(Agg: Pair, Idxs: 0, Name: "newloaded");
1957
1958 // ...and then expand the CAS into a libcall.
1959 expandAtomicCASToLibcall(I: Pair);
1960 });
1961 }
1962}
1963
// A helper routine for the above expandAtomic*ToLibcall functions.
//
// 'Libcalls' contains an array of enum values for the particular
// ATOMIC libcalls to be emitted. All of the other arguments besides
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
//
// Returns true when 'I' was replaced by a libcall (and erased); returns false
// — leaving 'I' untouched — when no suitable libcall exists for this
// size/alignment/target combination.
bool AtomicExpandImpl::expandAtomicOpToLibcall(
    Instruction *I, unsigned Size, Align Alignment, Value *PointerOperand,
    Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
    AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
  assert(Libcalls.size() == 6);

  LLVMContext &Ctx = I->getContext();
  Module *M = I->getModule();
  const DataLayout &DL = M->getDataLayout();
  IRBuilder<> Builder(I);
  // Temporary allocas go at the top of the entry block so they are static.
  IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());

  bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
  Type *SizedIntTy = Type::getIntNTy(C&: Ctx, N: Size * 8);

  if (M->getTargetTriple().isOSWindows() && M->getTargetTriple().isX86_64() &&
      Size == 16) {
    // x86_64 Windows passes i128 as an XMM vector; on return, it is in
    // XMM0, and as a parameter, it is passed indirectly. The generic lowering
    // rules handles this correctly if we pass it as a v2i64 rather than
    // i128. This is what Clang does in the frontend for such types as well
    // (see WinX86_64ABIInfo::classify in Clang).
    SizedIntTy = FixedVectorType::get(ElementType: Type::getInt64Ty(C&: Ctx), NumElts: 2);
  }

  const Align AllocaAlignment = DL.getPrefTypeAlign(Ty: SizedIntTy);

  // TODO: the "order" argument type is "int", not int32. So
  // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
  assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
  Constant *OrderingVal =
      ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: (int)toCABI(AO: Ordering));
  Constant *Ordering2Val = nullptr;
  if (CASExpected) {
    // Only cmpxchg supplies a second (failure) ordering.
    assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
    Ordering2Val =
        ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: (int)toCABI(AO: Ordering2));
  }
  // True when the original instruction produces a value (it is void e.g. for
  // atomic stores).
  bool HasResult = I->getType() != Type::getVoidTy(C&: Ctx);

  RTLIB::Libcall RTLibType;
  if (UseSizedLibcall) {
    // canUseSizedAtomicCall limits Size to exactly these values, so every
    // path through this switch initializes RTLibType.
    switch (Size) {
    case 1:
      RTLibType = Libcalls[1];
      break;
    case 2:
      RTLibType = Libcalls[2];
      break;
    case 4:
      RTLibType = Libcalls[3];
      break;
    case 8:
      RTLibType = Libcalls[4];
      break;
    case 16:
      RTLibType = Libcalls[5];
      break;
    }
  } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
    RTLibType = Libcalls[0];
  } else {
    // Can't use sized function, and there's no generic for this
    // operation, so give up.
    return false;
  }

  RTLIB::LibcallImpl LibcallImpl = LibcallLowering->getLibcallImpl(Call: RTLibType);
  if (LibcallImpl == RTLIB::Unsupported) {
    // This target does not implement the requested atomic libcall so give up.
    return false;
  }

  // Build up the function call. There's two kinds. First, the sized
  // variants. These calls are going to be one of the following (with
  // N=1,2,4,8,16):
  //  iN    __atomic_load_N(iN *ptr, int ordering)
  //  void  __atomic_store_N(iN *ptr, iN val, int ordering)
  //  iN    __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
  //  bool  __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
  //                                    int success_order, int failure_order)
  //
  // Note that these functions can be used for non-integer atomic
  // operations, the values just need to be bitcast to integers on the
  // way in and out.
  //
  // And, then, the generic variants. They look like the following:
  //  void  __atomic_load(size_t size, void *ptr, void *ret, int ordering)
  //  void  __atomic_store(size_t size, void *ptr, void *val, int ordering)
  //  void  __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
  //                          int ordering)
  //  bool  __atomic_compare_exchange(size_t size, void *ptr, void *expected,
  //                                  void *desired, int success_order,
  //                                  int failure_order)
  //
  // The different signatures are built up depending on the
  // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
  // variables.

  AllocaInst *AllocaCASExpected = nullptr;
  AllocaInst *AllocaValue = nullptr;
  AllocaInst *AllocaResult = nullptr;

  Type *ResultTy;
  SmallVector<Value *, 6> Args;
  AttributeList Attr;

  // 'size' argument.
  if (!UseSizedLibcall) {
    // Note, getIntPtrType is assumed equivalent to size_t.
    Args.push_back(Elt: ConstantInt::get(Ty: DL.getIntPtrType(C&: Ctx), V: Size));
  }

  // 'ptr' argument.
  // note: This assumes all address spaces share a common libfunc
  // implementation and that addresses are convertable.  For systems without
  // that property, we'd need to extend this mechanism to support AS-specific
  // families of atomic intrinsics.
  Value *PtrVal = PointerOperand;
  PtrVal = Builder.CreateAddrSpaceCast(V: PtrVal, DestTy: PointerType::getUnqual(C&: Ctx));
  Args.push_back(Elt: PtrVal);

  // 'expected' argument, if present.
  // Passed indirectly: spill the compare value to a temporary alloca, which
  // the libcall also uses to report the observed value on failure.
  if (CASExpected) {
    AllocaCASExpected = AllocaBuilder.CreateAlloca(Ty: CASExpected->getType());
    AllocaCASExpected->setAlignment(AllocaAlignment);
    Builder.CreateLifetimeStart(Ptr: AllocaCASExpected);
    Builder.CreateAlignedStore(Val: CASExpected, Ptr: AllocaCASExpected, Align: AllocaAlignment);
    Args.push_back(Elt: AllocaCASExpected);
  }

  // 'val' argument ('desired' for cas), if present.
  // Sized calls take it by value (bitcast to the integer type); generic calls
  // take it indirectly through an alloca.
  if (ValueOperand) {
    if (UseSizedLibcall) {
      Value *IntValue =
          Builder.CreateBitOrPointerCast(V: ValueOperand, DestTy: SizedIntTy);
      Args.push_back(Elt: IntValue);
    } else {
      AllocaValue = AllocaBuilder.CreateAlloca(Ty: ValueOperand->getType());
      AllocaValue->setAlignment(AllocaAlignment);
      Builder.CreateLifetimeStart(Ptr: AllocaValue);
      Builder.CreateAlignedStore(Val: ValueOperand, Ptr: AllocaValue, Align: AllocaAlignment);
      Args.push_back(Elt: AllocaValue);
    }
  }

  // 'ret' argument.
  // Only the generic non-CAS calls return their result through memory.
  if (!CASExpected && HasResult && !UseSizedLibcall) {
    AllocaResult = AllocaBuilder.CreateAlloca(Ty: I->getType());
    AllocaResult->setAlignment(AllocaAlignment);
    Builder.CreateLifetimeStart(Ptr: AllocaResult);
    Args.push_back(Elt: AllocaResult);
  }

  // 'ordering' ('success_order' for cas) argument.
  Args.push_back(Elt: OrderingVal);

  // 'failure_order' argument, if present.
  if (Ordering2Val)
    Args.push_back(Elt: Ordering2Val);

  // Now, the return type.
  if (CASExpected) {
    ResultTy = Type::getInt1Ty(C&: Ctx);
    Attr = Attr.addRetAttribute(C&: Ctx, Kind: Attribute::ZExt);
  } else if (HasResult && UseSizedLibcall)
    ResultTy = SizedIntTy;
  else
    ResultTy = Type::getVoidTy(C&: Ctx);

  // Done with setting up arguments and return types, create the call:
  SmallVector<Type *, 6> ArgTys;
  for (Value *Arg : Args)
    ArgTys.push_back(Elt: Arg->getType());
  FunctionType *FnType = FunctionType::get(Result: ResultTy, Params: ArgTys, isVarArg: false);
  FunctionCallee LibcallFn = M->getOrInsertFunction(
      Name: RTLIB::RuntimeLibcallsInfo::getLibcallImplName(CallImpl: LibcallImpl), T: FnType,
      AttributeList: Attr);
  CallInst *Call = Builder.CreateCall(Callee: LibcallFn, Args);
  Call->setAttributes(Attr);
  Value *Result = Call;

  // And then, extract the results...
  if (ValueOperand && !UseSizedLibcall)
    Builder.CreateLifetimeEnd(Ptr: AllocaValue);

  if (CASExpected) {
    // The final result from the CAS is {load of 'expected' alloca, bool result
    // from call}
    Type *FinalResultTy = I->getType();
    Value *V = PoisonValue::get(T: FinalResultTy);
    Value *ExpectedOut = Builder.CreateAlignedLoad(
        Ty: CASExpected->getType(), Ptr: AllocaCASExpected, Align: AllocaAlignment);
    Builder.CreateLifetimeEnd(Ptr: AllocaCASExpected);
    V = Builder.CreateInsertValue(Agg: V, Val: ExpectedOut, Idxs: 0);
    V = Builder.CreateInsertValue(Agg: V, Val: Result, Idxs: 1);
    I->replaceAllUsesWith(V);
  } else if (HasResult) {
    Value *V;
    if (UseSizedLibcall)
      // Sized calls return the value directly; cast back to the original type.
      V = Builder.CreateBitOrPointerCast(V: Result, DestTy: I->getType());
    else {
      // Generic calls wrote the result into the 'ret' alloca; load it back.
      V = Builder.CreateAlignedLoad(Ty: I->getType(), Ptr: AllocaResult,
                                    Align: AllocaAlignment);
      Builder.CreateLifetimeEnd(Ptr: AllocaResult);
    }
    I->replaceAllUsesWith(V);
  }
  I->eraseFromParent();
  return true;
}
2181