1//===- AtomicExpandPass.cpp - Expand atomic instructions ------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains a pass (at IR level) to replace atomic instructions with
10// __atomic_* library calls, or target specific instruction which implement the
11// same semantics in a way which better fits the target backend. This can
12// include the use of (intrinsic-based) load-linked/store-conditional loops,
13// AtomicCmpXchg, or type coercions.
14//
15//===----------------------------------------------------------------------===//
16
17#include "llvm/ADT/ArrayRef.h"
18#include "llvm/ADT/STLFunctionalExtras.h"
19#include "llvm/ADT/SmallVector.h"
20#include "llvm/Analysis/InstSimplifyFolder.h"
21#include "llvm/Analysis/OptimizationRemarkEmitter.h"
22#include "llvm/CodeGen/AtomicExpand.h"
23#include "llvm/CodeGen/TargetLowering.h"
24#include "llvm/CodeGen/TargetPassConfig.h"
25#include "llvm/CodeGen/TargetSubtargetInfo.h"
26#include "llvm/CodeGen/ValueTypes.h"
27#include "llvm/IR/Attributes.h"
28#include "llvm/IR/BasicBlock.h"
29#include "llvm/IR/Constant.h"
30#include "llvm/IR/Constants.h"
31#include "llvm/IR/DataLayout.h"
32#include "llvm/IR/DerivedTypes.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/Instruction.h"
36#include "llvm/IR/Instructions.h"
37#include "llvm/IR/MDBuilder.h"
38#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
39#include "llvm/IR/Module.h"
40#include "llvm/IR/ProfDataUtils.h"
41#include "llvm/IR/Type.h"
42#include "llvm/IR/User.h"
43#include "llvm/IR/Value.h"
44#include "llvm/InitializePasses.h"
45#include "llvm/Pass.h"
46#include "llvm/Support/AtomicOrdering.h"
47#include "llvm/Support/Casting.h"
48#include "llvm/Support/Debug.h"
49#include "llvm/Support/ErrorHandling.h"
50#include "llvm/Support/raw_ostream.h"
51#include "llvm/Target/TargetMachine.h"
52#include "llvm/Transforms/Utils/LowerAtomic.h"
53#include <cassert>
54#include <cstdint>
55#include <iterator>
56
57using namespace llvm;
58
59#define DEBUG_TYPE "atomic-expand"
60
61namespace {
62
/// Worker that performs atomic-instruction expansion for one function.
/// Decisions about *how* each atomic is lowered are delegated to the
/// TargetLowering hooks (shouldExpandAtomic*InIR etc.); this class only
/// performs the IR rewriting those hooks request.
class AtomicExpandImpl {
  // Target hooks consulted for every expansion decision.
  const TargetLowering *TLI = nullptr;
  // Lowering info used when emitting __atomic_* runtime calls.
  const LibcallLoweringInfo *LibcallLowering = nullptr;
  // Data layout of the function currently being processed.
  const DataLayout *DL = nullptr;

private:
  /// Callback type for emitting a cmpxchg instruction during RMW expansion.
  /// Parameters: (Builder, Addr, Loaded, NewVal, AddrAlign, MemOpOrder,
  /// SSID, IsVolatile, /* OUT */ Success, /* OUT */ NewLoaded,
  /// MetadataSrc)
  using CreateCmpXchgInstFun = function_ref<void(
      IRBuilderBase &, Value *, Value *, Value *, Align, AtomicOrdering,
      SyncScope::ID, Value *&, Value *&, Instruction *)>;

  /// Report an atomic that cannot be expanded as a context error, then delete
  /// it, replacing any non-void result with poison so the IR stays valid.
  void handleFailure(Instruction &FailedInst, const Twine &Msg) const {
    LLVMContext &Ctx = FailedInst.getContext();

    // TODO: Do not use generic error type.
    Ctx.emitError(I: &FailedInst, ErrorStr: Msg);

    if (!FailedInst.getType()->isVoidTy())
      FailedInst.replaceAllUsesWith(V: PoisonValue::get(T: FailedInst.getType()));
    FailedInst.eraseFromParent();
  }

  // Fence insertion around atomics whose ordering is split into explicit
  // fence instructions.
  bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
  bool tryInsertTrailingSeqCstFence(Instruction *AtomicI);
  template <typename AtomicInst>
  bool tryInsertFencesForAtomic(AtomicInst *AtomicI, bool OrderingRequiresFence,
                                AtomicOrdering NewOrdering);
  // Integer type coercion for non-integer (FP/pointer/vector) atomics.
  IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
  LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
  // Expansion entry points, one family per atomic instruction kind.
  bool tryExpandAtomicLoad(LoadInst *LI);
  bool expandAtomicLoadToLL(LoadInst *LI);
  bool expandAtomicLoadToCmpXchg(LoadInst *LI);
  StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
  bool tryExpandAtomicStore(StoreInst *SI);
  void expandAtomicStoreToXChg(StoreInst *SI);
  bool tryExpandAtomicRMW(AtomicRMWInst *AI);
  AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI);
  Value *
  insertRMWLLSCLoop(IRBuilderBase &Builder, Type *ResultTy, Value *Addr,
                    Align AddrAlign, AtomicOrdering MemOpOrder,
                    function_ref<Value *(IRBuilderBase &, Value *)> PerformOp);
  void expandAtomicOpToLLSC(
      Instruction *I, Type *ResultTy, Value *Addr, Align AddrAlign,
      AtomicOrdering MemOpOrder,
      function_ref<Value *(IRBuilderBase &, Value *)> PerformOp);
  // Partword (sub-word) expansion helpers.
  void expandPartwordAtomicRMW(
      AtomicRMWInst *I, TargetLoweringBase::AtomicExpansionKind ExpansionKind);
  AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI);
  bool expandPartwordCmpXchg(AtomicCmpXchgInst *I);
  void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI);
  void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI);

  AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
  static Value *insertRMWCmpXchgLoop(
      IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign,
      AtomicOrdering MemOpOrder, SyncScope::ID SSID,
      function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
      CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc);
  bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);

  bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
  bool isIdempotentRMW(AtomicRMWInst *RMWI);
  bool simplifyIdempotentRMW(AtomicRMWInst *RMWI);

  // __atomic_* libcall lowering for unsupported sizes/alignments.
  bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, Align Alignment,
                               Value *PointerOperand, Value *ValueOperand,
                               Value *CASExpected, AtomicOrdering Ordering,
                               AtomicOrdering Ordering2,
                               ArrayRef<RTLIB::Libcall> Libcalls);
  void expandAtomicLoadToLibcall(LoadInst *LI);
  void expandAtomicStoreToLibcall(StoreInst *LI);
  void expandAtomicRMWToLibcall(AtomicRMWInst *I);
  void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);

  bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                CreateCmpXchgInstFun CreateCmpXchg);

  /// Dispatch on the instruction kind; returns true if the IR was changed.
  bool processAtomicInstr(Instruction *I);

public:
  /// Run the expansion over \p F. Returns true if any change was made.
  bool run(Function &F,
           const LibcallLoweringModuleAnalysisResult &LibcallResult,
           const TargetMachine *TM);
};
150
/// Legacy pass-manager wrapper around AtomicExpandImpl.
class AtomicExpandLegacy : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  AtomicExpandLegacy() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Needs the module-level libcall lowering info to emit __atomic_* calls.
    AU.addRequired<LibcallLoweringInfoWrapper>();
    FunctionPass::getAnalysisUsage(AU);
  }

  bool runOnFunction(Function &F) override;
};
164
165// IRBuilder to be used for replacement atomic instructions.
// IRBuilder to be used for replacement atomic instructions.
struct ReplacementIRBuilder
    : IRBuilder<InstSimplifyFolder, IRBuilderCallbackInserter> {
  // MMRA metadata of the instruction being replaced; re-attached (via the
  // insertion callback below) to every instruction this builder creates.
  MDNode *MMRAMD = nullptr;

  // Preserves the DebugLoc from I, and preserves still valid metadata.
  // Enable StrictFP builder mode when appropriate.
  explicit ReplacementIRBuilder(Instruction *I, const DataLayout &DL)
      : IRBuilder(I->getContext(), InstSimplifyFolder(DL),
                  // Every inserted instruction gets the saved MMRA metadata.
                  IRBuilderCallbackInserter(
                      [this](Instruction *I) { addMMRAMD(I); })) {
    SetInsertPoint(I);
    this->CollectMetadataToCopy(Src: I, MetadataKinds: {LLVMContext::MD_pcsections});
    // Replacement code emitted inside a strictfp function must itself use
    // constrained FP operations.
    if (BB->getParent()->getAttributes().hasFnAttr(Kind: Attribute::StrictFP))
      this->setIsFPConstrained(true);

    MMRAMD = I->getMetadata(KindID: LLVMContext::MD_mmra);
  }

  // Propagate the saved MMRA metadata onto a newly inserted instruction,
  // if that instruction kind may carry MMRAs.
  void addMMRAMD(Instruction *I) {
    if (canInstructionHaveMMRAs(I: *I))
      I->setMetadata(KindID: LLVMContext::MD_mmra, Node: MMRAMD);
  }
};
189
190} // end anonymous namespace
191
// Legacy pass-manager registration boilerplate.
char AtomicExpandLegacy::ID = 0;

// Exported pass ID so other code can reference this pass by address.
char &llvm::AtomicExpandID = AtomicExpandLegacy::ID;

INITIALIZE_PASS_BEGIN(AtomicExpandLegacy, DEBUG_TYPE,
                      "Expand Atomic instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(LibcallLoweringInfoWrapper)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AtomicExpandLegacy, DEBUG_TYPE,
                    "Expand Atomic instructions", false, false)
202
203// Helper functions to retrieve the size of atomic instructions.
204static unsigned getAtomicOpSize(LoadInst *LI) {
205 const DataLayout &DL = LI->getDataLayout();
206 return DL.getTypeStoreSize(Ty: LI->getType());
207}
208
209static unsigned getAtomicOpSize(StoreInst *SI) {
210 const DataLayout &DL = SI->getDataLayout();
211 return DL.getTypeStoreSize(Ty: SI->getValueOperand()->getType());
212}
213
214static unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
215 const DataLayout &DL = RMWI->getDataLayout();
216 return DL.getTypeStoreSize(Ty: RMWI->getValOperand()->getType());
217}
218
219static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
220 const DataLayout &DL = CASI->getDataLayout();
221 return DL.getTypeStoreSize(Ty: CASI->getCompareOperand()->getType());
222}
223
/// Copy metadata that's safe to preserve when widening atomics.
/// Only a whitelist of kinds is transferred; anything that could become
/// invalid when the access size changes is dropped.
static void copyMetadataForAtomic(Instruction &Dest,
                                  const Instruction &Source) {
  SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
  Source.getAllMetadata(MDs&: MD);
  LLVMContext &Ctx = Dest.getContext();
  MDBuilder MDB(Ctx);

  for (auto [ID, N] : MD) {
    switch (ID) {
    // Kinds known to remain valid on the widened access.
    case LLVMContext::MD_dbg:
    case LLVMContext::MD_tbaa:
    case LLVMContext::MD_tbaa_struct:
    case LLVMContext::MD_alias_scope:
    case LLVMContext::MD_noalias:
    case LLVMContext::MD_noalias_addrspace:
    case LLVMContext::MD_access_group:
    case LLVMContext::MD_mmra:
      Dest.setMetadata(KindID: ID, Node: N);
      break;
    default:
      // AMDGPU-specific string kinds have no fixed numeric ID, so compare
      // by name.
      if (ID == Ctx.getMDKindID(Name: "amdgpu.no.remote.memory"))
        Dest.setMetadata(KindID: ID, Node: N);
      else if (ID == Ctx.getMDKindID(Name: "amdgpu.no.fine.grained.memory"))
        Dest.setMetadata(KindID: ID, Node: N);

      // Losing amdgpu.ignore.denormal.mode, but it doesn't matter for current
      // uses.
      break;
    }
  }
}
256
257// Determine if a particular atomic operation has a supported size,
258// and is of appropriate alignment, to be passed through for target
259// lowering. (Versus turning into a __atomic libcall)
260template <typename Inst>
261static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
262 unsigned Size = getAtomicOpSize(I);
263 Align Alignment = I->getAlign();
264 return Alignment >= Size &&
265 Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
266}
267
/// If the target requests it, insert a fence *after* \p AtomicI giving it
/// trailing seq_cst semantics. Returns true if a fence was inserted.
bool AtomicExpandImpl::tryInsertTrailingSeqCstFence(Instruction *AtomicI) {
  if (!TLI->shouldInsertTrailingSeqCstFenceForAtomicStore(I: AtomicI))
    return false;

  IRBuilder Builder(AtomicI);
  if (auto *TrailingFence = TLI->emitTrailingFence(
          Builder, Inst: AtomicI, Ord: AtomicOrdering::SequentiallyConsistent)) {
    // The builder inserts before AtomicI; a trailing fence must follow it.
    TrailingFence->moveAfter(MovePos: AtomicI);
    return true;
  }
  return false;
}
280
/// Common fence-insertion logic for loads, stores and RMWs. If the target
/// wants explicit fences and the instruction's ordering requires one, the
/// ordering is weakened to \p NewOrdering and explicit fences are emitted
/// around the instruction. Otherwise a trailing seq_cst fence may still be
/// added. Returns true if the IR was changed.
template <typename AtomicInst>
bool AtomicExpandImpl::tryInsertFencesForAtomic(AtomicInst *AtomicI,
                                                bool OrderingRequiresFence,
                                                AtomicOrdering NewOrdering) {
  bool ShouldInsertFences = TLI->shouldInsertFencesForAtomic(I: AtomicI);
  if (OrderingRequiresFence && ShouldInsertFences) {
    // Save the original ordering before downgrading; the fences carry it.
    AtomicOrdering FenceOrdering = AtomicI->getOrdering();
    AtomicI->setOrdering(NewOrdering);
    return bracketInstWithFences(I: AtomicI, Order: FenceOrdering);
  }
  if (!ShouldInsertFences)
    return tryInsertTrailingSeqCstFence(AtomicI);
  return false;
}
295
/// Expand a single instruction if it is an atomic operation. Dispatches on
/// the instruction kind (load/store/atomicrmw/cmpxchg); unsupported sizes go
/// straight to libcalls, otherwise type coercion, fence insertion and the
/// target-selected expansion are applied in order. Returns true if the IR
/// was changed.
bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
  if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
    if (!LI->isAtomic())
      return false;

    // Too big or under-aligned for the target: lower to __atomic_load.
    if (!atomicSizeSupported(TLI, I: LI)) {
      expandAtomicLoadToLibcall(LI);
      return true;
    }

    bool MadeChange = false;
    if (TLI->shouldCastAtomicLoadInIR(LI) ==
        TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
      // Note: LI is replaced; later steps must use the new instruction.
      LI = convertAtomicLoadToIntegerType(LI);
      MadeChange = true;
    }

    MadeChange |= tryInsertFencesForAtomic(
        AtomicI: LI, OrderingRequiresFence: isAcquireOrStronger(AO: LI->getOrdering()), NewOrdering: AtomicOrdering::Monotonic);

    MadeChange |= tryExpandAtomicLoad(LI);
    return MadeChange;
  }

  if (auto *SI = dyn_cast<StoreInst>(Val: I)) {
    if (!SI->isAtomic())
      return false;

    // Too big or under-aligned for the target: lower to __atomic_store.
    if (!atomicSizeSupported(TLI, I: SI)) {
      expandAtomicStoreToLibcall(LI: SI);
      return true;
    }

    bool MadeChange = false;
    if (TLI->shouldCastAtomicStoreInIR(SI) ==
        TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
      SI = convertAtomicStoreToIntegerType(SI);
      MadeChange = true;
    }

    MadeChange |= tryInsertFencesForAtomic(
        AtomicI: SI, OrderingRequiresFence: isReleaseOrStronger(AO: SI->getOrdering()), NewOrdering: AtomicOrdering::Monotonic);

    MadeChange |= tryExpandAtomicStore(SI);
    return MadeChange;
  }

  if (auto *RMWI = dyn_cast<AtomicRMWInst>(Val: I)) {
    if (!atomicSizeSupported(TLI, I: RMWI)) {
      expandAtomicRMWToLibcall(I: RMWI);
      return true;
    }

    bool MadeChange = false;
    if (TLI->shouldCastAtomicRMWIInIR(RMWI) ==
        TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
      RMWI = convertAtomicXchgToIntegerType(RMWI);
      MadeChange = true;
    }

    // An RMW both reads and writes, so either acquire or release semantics
    // force a fence.
    MadeChange |= tryInsertFencesForAtomic(
        AtomicI: RMWI,
        OrderingRequiresFence: isReleaseOrStronger(AO: RMWI->getOrdering()) ||
            isAcquireOrStronger(AO: RMWI->getOrdering()),
        NewOrdering: TLI->atomicOperationOrderAfterFenceSplit(I: RMWI));

    // There are two different ways of expanding RMW instructions:
    // - into a load if it is idempotent
    // - into a Cmpxchg/LL-SC loop otherwise
    // we try them in that order.
    MadeChange |= (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) ||
                  tryExpandAtomicRMW(AI: RMWI);
    return MadeChange;
  }

  if (auto *CASI = dyn_cast<AtomicCmpXchgInst>(Val: I)) {
    if (!atomicSizeSupported(TLI, I: CASI)) {
      expandAtomicCASToLibcall(I: CASI);
      return true;
    }

    // TODO: when we're ready to make the change at the IR level, we can
    // extend convertCmpXchgToInteger for floating point too.
    bool MadeChange = false;
    if (CASI->getCompareOperand()->getType()->isPointerTy()) {
      // TODO: add a TLI hook to control this so that each target can
      // convert to lowering the original type one at a time.
      CASI = convertCmpXchgToIntegerType(CI: CASI);
      MadeChange = true;
    }

    auto CmpXchgExpansion = TLI->shouldExpandAtomicCmpXchgInIR(AI: CASI);
    if (TLI->shouldInsertFencesForAtomic(I: CASI)) {
      if (CmpXchgExpansion == TargetLoweringBase::AtomicExpansionKind::None &&
          (isReleaseOrStronger(AO: CASI->getSuccessOrdering()) ||
           isAcquireOrStronger(AO: CASI->getSuccessOrdering()) ||
           isAcquireOrStronger(AO: CASI->getFailureOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        AtomicOrdering FenceOrdering = CASI->getMergedOrdering();
        AtomicOrdering CASOrdering =
            TLI->atomicOperationOrderAfterFenceSplit(I: CASI);
        CASI->setSuccessOrdering(CASOrdering);
        CASI->setFailureOrdering(CASOrdering);
        MadeChange |= bracketInstWithFences(I: CASI, Order: FenceOrdering);
      }
    } else if (CmpXchgExpansion !=
               TargetLoweringBase::AtomicExpansionKind::LLSC) {
      // CmpXchg LLSC is handled in expandAtomicCmpXchg().
      MadeChange |= tryInsertTrailingSeqCstFence(AtomicI: CASI);
    }

    MadeChange |= tryExpandAtomicCmpXchg(CI: CASI);
    return MadeChange;
  }

  return false;
}
416
/// Entry point: expand all atomic instructions in \p F for the subtarget
/// selected by \p TM. Returns true if any instruction was changed.
bool AtomicExpandImpl::run(
    Function &F, const LibcallLoweringModuleAnalysisResult &LibcallResult,
    const TargetMachine *TM) {
  const auto *Subtarget = TM->getSubtargetImpl(F);
  if (!Subtarget->enableAtomicExpand())
    return false;
  TLI = Subtarget->getTargetLowering();
  LibcallLowering = &LibcallResult.getLibcallLowering(Subtarget: *Subtarget);
  DL = &F.getDataLayout();

  bool MadeChange = false;

  for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) {
    BasicBlock *BB = &*BBI;

    BasicBlock::reverse_iterator Next;

    // Walk instructions in reverse, capturing the successor before each
    // call so erasing/replacing the current instruction doesn't invalidate
    // the traversal. NOTE(review): reverse order appears chosen so newly
    // inserted replacement code is not revisited — confirm against upstream.
    for (BasicBlock::reverse_iterator I = BB->rbegin(), E = BB->rend(); I != E;
         I = Next) {
      Instruction &Inst = *I;
      Next = std::next(x: I);

      if (processAtomicInstr(I: &Inst)) {
        MadeChange = true;

        // New blocks may have been inserted.
        BBE = F.end();
      }
    }
  }

  return MadeChange;
}
450
451bool AtomicExpandLegacy::runOnFunction(Function &F) {
452
453 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
454 if (!TPC)
455 return false;
456 auto *TM = &TPC->getTM<TargetMachine>();
457
458 const LibcallLoweringModuleAnalysisResult &LibcallResult =
459 getAnalysis<LibcallLoweringInfoWrapper>().getResult(M: *F.getParent());
460 AtomicExpandImpl AE;
461 return AE.run(F, LibcallResult, TM);
462}
463
464FunctionPass *llvm::createAtomicExpandLegacyPass() {
465 return new AtomicExpandLegacy();
466}
467
/// New-PM entry point. Requires the module-level libcall lowering analysis
/// to already be cached; otherwise reports an error and bails out.
PreservedAnalyses AtomicExpandPass::run(Function &F,
                                        FunctionAnalysisManager &FAM) {
  auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);

  // A function pass cannot run a module analysis itself, only query a
  // cached result.
  const LibcallLoweringModuleAnalysisResult *LibcallResult =
      MAMProxy.getCachedResult<LibcallLoweringModuleAnalysis>(IR&: *F.getParent());

  if (!LibcallResult) {
    F.getContext().emitError(ErrorStr: "'" + LibcallLoweringModuleAnalysis::name() +
                             "' analysis required");
    return PreservedAnalyses::all();
  }

  AtomicExpandImpl AE;

  // TM here is presumably a member of AtomicExpandPass (declared outside
  // this file) — confirm against the pass header.
  bool Changed = AE.run(F, LibcallResult: *LibcallResult, TM);
  if (!Changed)
    return PreservedAnalyses::all();

  return PreservedAnalyses::none();
}
489
/// Surround \p I with the target's leading and trailing fences for ordering
/// \p Order. Returns true if at least one fence was emitted.
bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
                                             AtomicOrdering Order) {
  ReplacementIRBuilder Builder(I, *DL);

  auto LeadingFence = TLI->emitLeadingFence(Builder, Inst: I, Ord: Order);

  auto TrailingFence = TLI->emitTrailingFence(Builder, Inst: I, Ord: Order);
  // We have a guard here because not every atomic operation generates a
  // trailing fence.
  if (TrailingFence)
    TrailingFence->moveAfter(MovePos: I);

  return (LeadingFence || TrailingFence);
}
504
/// Get the iX type with the same bitwidth as T. Asserts that T's store size
/// equals its bit size (i.e. the width is a whole number of bytes with no
/// padding), so the bitcast round-trip is lossless.
IntegerType *
AtomicExpandImpl::getCorrespondingIntegerType(Type *T, const DataLayout &DL) {
  EVT VT = TLI->getMemValueType(DL, Ty: T);
  unsigned BitWidth = VT.getStoreSizeInBits();
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
  return IntegerType::get(C&: T->getContext(), NumBits: BitWidth);
}
513
/// Convert an atomic load of a non-integral type to an integer load of the
/// equivalent bitwidth. See the function comment on
/// convertAtomicStoreToIntegerType for background. Returns the replacement
/// load; the original is erased.
LoadInst *AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst *LI) {
  auto *M = LI->getModule();
  Type *NewTy = getCorrespondingIntegerType(T: LI->getType(), DL: M->getDataLayout());

  ReplacementIRBuilder Builder(LI, *DL);

  Value *Addr = LI->getPointerOperand();

  // Recreate the load with integer type, carrying over alignment,
  // volatility, ordering and sync scope.
  auto *NewLI = Builder.CreateLoad(Ty: NewTy, Ptr: Addr);
  NewLI->setAlignment(LI->getAlign());
  NewLI->setVolatile(LI->isVolatile());
  NewLI->setAtomic(Ordering: LI->getOrdering(), SSID: LI->getSyncScopeID());
  LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");

  // Bitcast the result back to the original type for existing users.
  Value *NewVal = Builder.CreateBitCast(V: NewLI, DestTy: LI->getType());
  LI->replaceAllUsesWith(V: NewVal);
  LI->eraseFromParent();
  return NewLI;
}
536
/// Convert an atomicrmw xchg of a pointer/FP/vector type into an xchg of the
/// same-width integer type, bitcasting (or ptrtoint/inttoptr) the operand
/// and the result. Only valid for Xchg, which is the one RMW operation that
/// is type-agnostic.
AtomicRMWInst *
AtomicExpandImpl::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) {
  assert(RMWI->getOperation() == AtomicRMWInst::Xchg);

  auto *M = RMWI->getModule();
  Type *NewTy =
      getCorrespondingIntegerType(T: RMWI->getType(), DL: M->getDataLayout());

  ReplacementIRBuilder Builder(RMWI, *DL);

  Value *Addr = RMWI->getPointerOperand();
  Value *Val = RMWI->getValOperand();
  // Pointers need ptrtoint; FP/vector values can be bitcast directly.
  Value *NewVal = Val->getType()->isPointerTy()
                      ? Builder.CreatePtrToInt(V: Val, DestTy: NewTy)
                      : Builder.CreateBitCast(V: Val, DestTy: NewTy);

  auto *NewRMWI = Builder.CreateAtomicRMW(Op: AtomicRMWInst::Xchg, Ptr: Addr, Val: NewVal,
                                          Align: RMWI->getAlign(), Ordering: RMWI->getOrdering(),
                                          SSID: RMWI->getSyncScopeID());
  NewRMWI->setVolatile(RMWI->isVolatile());
  copyMetadataForAtomic(Dest&: *NewRMWI, Source: *RMWI);
  LLVM_DEBUG(dbgs() << "Replaced " << *RMWI << " with " << *NewRMWI << "\n");

  // Convert the returned old value back to the original type.
  Value *NewRVal = RMWI->getType()->isPointerTy()
                       ? Builder.CreateIntToPtr(V: NewRMWI, DestTy: RMWI->getType())
                       : Builder.CreateBitCast(V: NewRMWI, DestTy: RMWI->getType());
  RMWI->replaceAllUsesWith(V: NewRVal);
  RMWI->eraseFromParent();
  return NewRMWI;
}
567
/// Expand an atomic load according to the target's chosen strategy.
/// Returns true if the IR was changed.
bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    // An LL/SC "load" is just the loop with an identity operation.
    expandAtomicOpToLLSC(
        I: LI, ResultTy: LI->getType(), Addr: LI->getPointerOperand(), AddrAlign: LI->getAlign(),
        MemOpOrder: LI->getOrdering(),
        PerformOp: [](IRBuilderBase &Builder, Value *Loaded) { return Loaded; });
    return true;
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    return expandAtomicLoadToLL(LI);
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicLoadToCmpXchg(LI);
  case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
    // Target says a plain load is sufficient; just drop the atomicity.
    LI->setAtomic(Ordering: AtomicOrdering::NotAtomic);
    return true;
  case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
    TLI->emitExpandAtomicLoad(LI);
    return true;
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
  }
}
592
/// Expand an atomic store according to the target's chosen strategy.
/// Returns true if the IR was changed.
bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) {
  switch (TLI->shouldExpandAtomicStoreInIR(SI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
    TLI->emitExpandAtomicStore(SI);
    return true;
  case TargetLoweringBase::AtomicExpansionKind::Expand:
    // Rewrite as an atomicrmw xchg whose result is unused.
    expandAtomicStoreToXChg(SI);
    return true;
  case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
    // Target says a plain store is sufficient; just drop the atomicity.
    SI->setAtomic(Ordering: AtomicOrdering::NotAtomic);
    return true;
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicStore");
  }
}
610
/// Replace an atomic load with a bare load-linked instruction (no paired
/// store-conditional). Always returns true.
bool AtomicExpandImpl::expandAtomicLoadToLL(LoadInst *LI) {
  ReplacementIRBuilder Builder(LI, *DL);

  // On some architectures, load-linked instructions are atomic for larger
  // sizes than normal loads. For example, the only 64-bit load guaranteed
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
  Value *Val = TLI->emitLoadLinked(Builder, ValueTy: LI->getType(),
                                   Addr: LI->getPointerOperand(), Ord: LI->getOrdering());
  // Let the target clear the exclusive monitor since no store follows.
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);

  LI->replaceAllUsesWith(V: Val);
  LI->eraseFromParent();

  return true;
}
626
/// Replace an atomic load with cmpxchg(addr, 0, 0): the compare-exchange
/// returns the current memory value and, whether it "succeeds" or "fails",
/// leaves memory unchanged. Always returns true.
bool AtomicExpandImpl::expandAtomicLoadToCmpXchg(LoadInst *LI) {
  ReplacementIRBuilder Builder(LI, *DL);
  AtomicOrdering Order = LI->getOrdering();
  // cmpxchg does not accept unordered; monotonic is the weakest valid order.
  if (Order == AtomicOrdering::Unordered)
    Order = AtomicOrdering::Monotonic;

  Value *Addr = LI->getPointerOperand();
  Type *Ty = LI->getType();
  Constant *DummyVal = Constant::getNullValue(Ty);

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Ptr: Addr, Cmp: DummyVal, New: DummyVal, Align: LI->getAlign(), SuccessOrdering: Order,
      FailureOrdering: AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering: Order));
  // Element 0 of the {value, success} pair is the loaded value.
  Value *Loaded = Builder.CreateExtractValue(Agg: Pair, Idxs: 0, Name: "loaded");

  LI->replaceAllUsesWith(V: Loaded);
  LI->eraseFromParent();

  return true;
}
647
/// Convert an atomic store of a non-integral type to an integer store of the
/// equivalent bitwidth. We used to not support floating point or vector
/// atomics in the IR at all. The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of a atomic
/// float or vector store. The long term plan is to teach each backend to
/// instruction select from the original atomic store, but as a migration
/// mechanism, we convert back to the old format which the backends understand.
/// Each backend will need individual work to recognize the new format.
StoreInst *AtomicExpandImpl::convertAtomicStoreToIntegerType(StoreInst *SI) {
  ReplacementIRBuilder Builder(SI, *DL);
  auto *M = SI->getModule();
  Type *NewTy = getCorrespondingIntegerType(T: SI->getValueOperand()->getType(),
                                            DL: M->getDataLayout());
  // Bitcast the value to the same-width integer, then recreate the store
  // with identical alignment, volatility, ordering and sync scope.
  Value *NewVal = Builder.CreateBitCast(V: SI->getValueOperand(), DestTy: NewTy);

  Value *Addr = SI->getPointerOperand();

  StoreInst *NewSI = Builder.CreateStore(Val: NewVal, Ptr: Addr);
  NewSI->setAlignment(SI->getAlign());
  NewSI->setVolatile(SI->isVolatile());
  NewSI->setAtomic(Ordering: SI->getOrdering(), SSID: SI->getSyncScopeID());
  LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
  SI->eraseFromParent();
  return NewSI;
}
673
/// Replace an atomic store with an atomicrmw xchg whose result is ignored,
/// then expand the xchg via the usual RMW path.
void AtomicExpandImpl::expandAtomicStoreToXChg(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them by an
  // atomic swap, that can be implemented for example as a ldrex/strex on ARM
  // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
  // It is the responsibility of the target to only signal expansion via
  // shouldExpandAtomicRMW in cases where this is required and possible.
  ReplacementIRBuilder Builder(SI, *DL);
  AtomicOrdering Ordering = SI->getOrdering();
  assert(Ordering != AtomicOrdering::NotAtomic);
  // atomicrmw does not accept unordered; upgrade to monotonic.
  AtomicOrdering RMWOrdering = Ordering == AtomicOrdering::Unordered
                                   ? AtomicOrdering::Monotonic
                                   : Ordering;
  AtomicRMWInst *AI = Builder.CreateAtomicRMW(
      Op: AtomicRMWInst::Xchg, Ptr: SI->getPointerOperand(), Val: SI->getValueOperand(),
      Align: SI->getAlign(), Ordering: RMWOrdering);
  SI->eraseFromParent();

  // Now we have an appropriate swap instruction, lower it as usual.
  tryExpandAtomicRMW(AI);
}
695
/// Default CreateCmpXchgInstFun: emit a plain cmpxchg of (Loaded -> NewVal),
/// bitcasting FP/vector operands through integers since cmpxchg only takes
/// integer or pointer types. Outputs: \p Success (i1) and \p NewLoaded (the
/// value read from memory, converted back to the original type).
static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal, Align AddrAlign,
                                 AtomicOrdering MemOpOrder, SyncScope::ID SSID,
                                 Value *&Success, Value *&NewLoaded,
                                 Instruction *MetadataSrc) {
  Type *OrigTy = NewVal->getType();

  // This code can go away when cmpxchg supports FP and vector types.
  assert(!OrigTy->isPointerTy());
  bool NeedBitcast = OrigTy->isFloatingPointTy() || OrigTy->isVectorTy();
  if (NeedBitcast) {
    IntegerType *IntTy = Builder.getIntNTy(N: OrigTy->getPrimitiveSizeInBits());
    NewVal = Builder.CreateBitCast(V: NewVal, DestTy: IntTy);
    Loaded = Builder.CreateBitCast(V: Loaded, DestTy: IntTy);
  }

  AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
      Ptr: Addr, Cmp: Loaded, New: NewVal, Align: AddrAlign, SuccessOrdering: MemOpOrder,
      FailureOrdering: AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering: MemOpOrder), SSID);
  // Carry over metadata (TBAA, noalias, ...) from the instruction being
  // expanded, when one was provided.
  if (MetadataSrc)
    copyMetadataForAtomic(Dest&: *Pair, Source: *MetadataSrc);

  Success = Builder.CreateExtractValue(Agg: Pair, Idxs: 1, Name: "success");
  NewLoaded = Builder.CreateExtractValue(Agg: Pair, Idxs: 0, Name: "newloaded");

  // Undo the bitcast so callers see the original type.
  if (NeedBitcast)
    NewLoaded = Builder.CreateBitCast(V: NewLoaded, DestTy: OrigTy);
}
724
/// Expand an atomicrmw according to the target's chosen strategy: LL/SC or
/// cmpxchg loops (with partword handling for sub-minimum-CAS sizes), masked
/// or bit-test/cmp-arith intrinsics, plain non-atomic lowering, or a fully
/// custom target expansion. Returns true if the IR was changed.
bool AtomicExpandImpl::tryExpandAtomicRMW(AtomicRMWInst *AI) {
  LLVMContext &Ctx = AI->getModule()->getContext();
  TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(RMW: AI);
  switch (Kind) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(RMWI: AI);
    if (ValueSize < MinCASSize) {
      // Too narrow for the target's exclusive access: operate on a
      // containing word with masks.
      expandPartwordAtomicRMW(I: AI,
                              ExpansionKind: TargetLoweringBase::AtomicExpansionKind::LLSC);
    } else {
      auto PerformOp = [&](IRBuilderBase &Builder, Value *Loaded) {
        return buildAtomicRMWValue(Op: AI->getOperation(), Builder, Loaded,
                                   Val: AI->getValOperand());
      };
      expandAtomicOpToLLSC(I: AI, ResultTy: AI->getType(), Addr: AI->getPointerOperand(),
                           AddrAlign: AI->getAlign(), MemOpOrder: AI->getOrdering(), PerformOp);
    }
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(RMWI: AI);
    if (ValueSize < MinCASSize) {
      expandPartwordAtomicRMW(I: AI,
                              ExpansionKind: TargetLoweringBase::AtomicExpansionKind::CmpXChg);
    } else {
      // Emit an optimization remark naming the memory scope of the CAS loop.
      SmallVector<StringRef> SSNs;
      Ctx.getSyncScopeNames(SSNs);
      auto MemScope = SSNs[AI->getSyncScopeID()].empty()
                          ? "system"
                          : SSNs[AI->getSyncScopeID()];
      OptimizationRemarkEmitter ORE(AI->getFunction());
      ORE.emit(RemarkBuilder: [&]() {
        return OptimizationRemark(DEBUG_TYPE, "Passed", AI)
               << "A compare and swap loop was generated for an atomic "
               << AI->getOperationName(Op: AI->getOperation()) << " operation at "
               << MemScope << " memory scope";
      });
      expandAtomicRMWToCmpXchg(AI, CreateCmpXchg: createCmpXchgInstFun);
    }
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(RMWI: AI);
    if (ValueSize < MinCASSize) {
      AtomicRMWInst::BinOp Op = AI->getOperation();
      // Widen And/Or/Xor and give the target another chance at expanding it.
      if (Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
          Op == AtomicRMWInst::And) {
        tryExpandAtomicRMW(AI: widenPartwordAtomicRMW(AI));
        return true;
      }
    }
    expandAtomicRMWToMaskedIntrinsic(AI);
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::BitTestIntrinsic: {
    TLI->emitBitTestAtomicRMWIntrinsic(AI);
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::CmpArithIntrinsic: {
    TLI->emitCmpArithAtomicRMWIntrinsic(AI);
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
    return lowerAtomicRMWInst(RMWI: AI);
  case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
    TLI->emitExpandAtomicRMW(AI);
    return true;
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
  }
}
802
803namespace {
804
/// Bundle of values needed to perform a sub-word atomic operation on a
/// containing aligned word (see createMaskInstrs).
struct PartwordMaskValues {
  // These three fields are guaranteed to be set by createMaskInstrs.
  Type *WordType = nullptr;      // Type of the containing word.
  Type *ValueType = nullptr;     // Original (possibly FP/vector) value type.
  Type *IntValueType = nullptr;  // Integer type of the same width as ValueType.
  Value *AlignedAddr = nullptr;  // Address rounded down to the word boundary.
  Align AlignedAddrAlignment;
  // The remaining fields can be null.
  Value *ShiftAmt = nullptr;  // Bit offset of the value within the word.
  Value *Mask = nullptr;      // Selects the value's bits within the word.
  Value *Inv_Mask = nullptr;  // Complement of Mask.
};
817
818[[maybe_unused]]
819raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) {
820 auto PrintObj = [&O](auto *V) {
821 if (V)
822 O << *V;
823 else
824 O << "nullptr";
825 O << '\n';
826 };
827 O << "PartwordMaskValues {\n";
828 O << " WordType: ";
829 PrintObj(PMV.WordType);
830 O << " ValueType: ";
831 PrintObj(PMV.ValueType);
832 O << " AlignedAddr: ";
833 PrintObj(PMV.AlignedAddr);
834 O << " AlignedAddrAlignment: " << PMV.AlignedAddrAlignment.value() << '\n';
835 O << " ShiftAmt: ";
836 PrintObj(PMV.ShiftAmt);
837 O << " Mask: ";
838 PrintObj(PMV.Mask);
839 O << " Inv_Mask: ";
840 PrintObj(PMV.Inv_Mask);
841 O << "}\n";
842 return O;
843}
844
845} // end anonymous namespace
846
847/// This is a helper function which builds instructions to provide
848/// values necessary for partword atomic operations. It takes an
849/// incoming address, Addr, and ValueType, and constructs the address,
850/// shift-amounts and masks needed to work with a larger value of size
851/// WordSize.
852///
853/// AlignedAddr: Addr rounded down to a multiple of WordSize
854///
855/// ShiftAmt: Number of bits to right-shift a WordSize value loaded
856/// from AlignAddr for it to have the same value as if
857/// ValueType was loaded from Addr.
858///
859/// Mask: Value to mask with the value loaded from AlignAddr to
860/// include only the part that would've been loaded from Addr.
861///
862/// Inv_Mask: The inverse of Mask.
863static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
864 Instruction *I, Type *ValueType,
865 Value *Addr, Align AddrAlign,
866 unsigned MinWordSize) {
867 PartwordMaskValues PMV;
868
869 Module *M = I->getModule();
870 LLVMContext &Ctx = M->getContext();
871 const DataLayout &DL = M->getDataLayout();
872 unsigned ValueSize = DL.getTypeStoreSize(Ty: ValueType);
873
874 PMV.ValueType = PMV.IntValueType = ValueType;
875 if (PMV.ValueType->isFloatingPointTy() || PMV.ValueType->isVectorTy())
876 PMV.IntValueType =
877 Type::getIntNTy(C&: Ctx, N: ValueType->getPrimitiveSizeInBits());
878
879 PMV.WordType = MinWordSize > ValueSize ? Type::getIntNTy(C&: Ctx, N: MinWordSize * 8)
880 : ValueType;
881 if (PMV.ValueType == PMV.WordType) {
882 PMV.AlignedAddr = Addr;
883 PMV.AlignedAddrAlignment = AddrAlign;
884 PMV.ShiftAmt = ConstantInt::get(Ty: PMV.ValueType, V: 0);
885 PMV.Mask = ConstantInt::get(Ty: PMV.ValueType, V: ~0, /*isSigned*/ IsSigned: true);
886 return PMV;
887 }
888
889 PMV.AlignedAddrAlignment = Align(MinWordSize);
890
891 assert(ValueSize < MinWordSize);
892
893 PointerType *PtrTy = cast<PointerType>(Val: Addr->getType());
894 IntegerType *IntTy = DL.getIndexType(C&: Ctx, AddressSpace: PtrTy->getAddressSpace());
895 Value *PtrLSB;
896
897 if (AddrAlign < MinWordSize) {
898 PMV.AlignedAddr = Builder.CreateIntrinsic(
899 ID: Intrinsic::ptrmask, Types: {PtrTy, IntTy},
900 Args: {Addr, ConstantInt::getSigned(Ty: IntTy, V: ~(uint64_t)(MinWordSize - 1))},
901 FMFSource: nullptr, Name: "AlignedAddr");
902
903 Value *AddrInt = Builder.CreatePtrToInt(V: Addr, DestTy: IntTy);
904 PtrLSB = Builder.CreateAnd(LHS: AddrInt, RHS: MinWordSize - 1, Name: "PtrLSB");
905 } else {
906 // If the alignment is high enough, the LSB are known 0.
907 PMV.AlignedAddr = Addr;
908 PtrLSB = ConstantInt::getNullValue(Ty: IntTy);
909 }
910
911 if (DL.isLittleEndian()) {
912 // turn bytes into bits
913 PMV.ShiftAmt = Builder.CreateShl(LHS: PtrLSB, RHS: 3);
914 } else {
915 // turn bytes into bits, and count from the other side.
916 PMV.ShiftAmt = Builder.CreateShl(
917 LHS: Builder.CreateXor(LHS: PtrLSB, RHS: MinWordSize - ValueSize), RHS: 3);
918 }
919
920 PMV.ShiftAmt = Builder.CreateTrunc(V: PMV.ShiftAmt, DestTy: PMV.WordType, Name: "ShiftAmt");
921 PMV.Mask = Builder.CreateShl(
922 LHS: ConstantInt::get(Ty: PMV.WordType, V: (1 << (ValueSize * 8)) - 1), RHS: PMV.ShiftAmt,
923 Name: "Mask");
924
925 PMV.Inv_Mask = Builder.CreateNot(V: PMV.Mask, Name: "Inv_Mask");
926
927 return PMV;
928}
929
930static Value *extractMaskedValue(IRBuilderBase &Builder, Value *WideWord,
931 const PartwordMaskValues &PMV) {
932 assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
933 if (PMV.WordType == PMV.ValueType)
934 return WideWord;
935
936 Value *Shift = Builder.CreateLShr(LHS: WideWord, RHS: PMV.ShiftAmt, Name: "shifted");
937 Value *Trunc = Builder.CreateTrunc(V: Shift, DestTy: PMV.IntValueType, Name: "extracted");
938 return Builder.CreateBitCast(V: Trunc, DestTy: PMV.ValueType);
939}
940
941static Value *insertMaskedValue(IRBuilderBase &Builder, Value *WideWord,
942 Value *Updated, const PartwordMaskValues &PMV) {
943 assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
944 assert(Updated->getType() == PMV.ValueType && "Value type mismatch");
945 if (PMV.WordType == PMV.ValueType)
946 return Updated;
947
948 Updated = Builder.CreateBitCast(V: Updated, DestTy: PMV.IntValueType);
949
950 Value *ZExt = Builder.CreateZExt(V: Updated, DestTy: PMV.WordType, Name: "extended");
951 Value *Shift =
952 Builder.CreateShl(LHS: ZExt, RHS: PMV.ShiftAmt, Name: "shifted", /*HasNUW*/ true);
953 Value *And = Builder.CreateAnd(LHS: WideWord, RHS: PMV.Inv_Mask, Name: "unmasked");
954 Value *Or = Builder.CreateOr(LHS: And, RHS: Shift, Name: "inserted");
955 return Or;
956}
957
/// Emit IR to implement a masked version of a given atomicrmw
/// operation. (That is, only the bits under the Mask should be
/// affected by the operation)
///
/// \p Loaded is the current full-word value, \p Shifted_Inc is the operand
/// pre-shifted into position within the word (only provided for the ops
/// that use it; see expandPartwordAtomicRMW), and \p Inc is the original,
/// unshifted operand. Returns the new full-word value to store back.
static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
                                    IRBuilderBase &Builder, Value *Loaded,
                                    Value *Shifted_Inc, Value *Inc,
                                    const PartwordMaskValues &PMV) {
  // TODO: update to use
  // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge in order
  // to merge bits from two values without requiring PMV.Inv_Mask.
  switch (Op) {
  case AtomicRMWInst::Xchg: {
    // Exchange: clear the masked bits, then OR in the pre-shifted new value.
    Value *Loaded_MaskOut = Builder.CreateAnd(LHS: Loaded, RHS: PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: Shifted_Inc);
    return FinalVal;
  }
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::And:
    llvm_unreachable("Or/Xor/And handled by widenPartwordAtomicRMW");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Nand: {
    // The other arithmetic ops need to be masked into place.
    Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded, Val: Shifted_Inc);
    Value *NewVal_Masked = Builder.CreateAnd(LHS: NewVal, RHS: PMV.Mask);
    Value *Loaded_MaskOut = Builder.CreateAnd(LHS: Loaded, RHS: PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: NewVal_Masked);
    return FinalVal;
  }
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMaximum:
  case AtomicRMWInst::FMinimum:
  case AtomicRMWInst::FMaximumNum:
  case AtomicRMWInst::FMinimumNum:
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
  case AtomicRMWInst::USubCond:
  case AtomicRMWInst::USubSat: {
    // Finally, other ops will operate on the full value, so truncate down to
    // the original size, and expand out again after doing the
    // operation. Bitcasts will be inserted for FP values.
    Value *Loaded_Extract = extractMaskedValue(Builder, WideWord: Loaded, PMV);
    Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded: Loaded_Extract, Val: Inc);
    Value *FinalVal = insertMaskedValue(Builder, WideWord: Loaded, Updated: NewVal, PMV);
    return FinalVal;
  }
  default:
    llvm_unreachable("Unknown atomic op");
  }
}
1016
/// Expand a sub-word atomicrmw operation into an appropriate
/// word-sized operation.
///
/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
/// way as a typical atomicrmw expansion. The only difference here is
/// that the operation inside of the loop may operate upon only a
/// part of the value.
void AtomicExpandImpl::expandPartwordAtomicRMW(
    AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
  // Widen And/Or/Xor and give the target another chance at expanding it.
  AtomicRMWInst::BinOp Op = AI->getOperation();
  if (Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
      Op == AtomicRMWInst::And) {
    tryExpandAtomicRMW(AI: widenPartwordAtomicRMW(AI));
    return;
  }
  AtomicOrdering MemOpOrder = AI->getOrdering();
  SyncScope::ID SSID = AI->getSyncScopeID();

  ReplacementIRBuilder Builder(AI, *DL);

  // Compute the aligned word address, shift amount, and masks.
  PartwordMaskValues PMV =
      createMaskInstrs(Builder, I: AI, ValueType: AI->getType(), Addr: AI->getPointerOperand(),
                       AddrAlign: AI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / 8);

  // Xchg/Add/Sub/Nand work on the value in its shifted position within the
  // word, so the operand can be pre-shifted once, outside the loop.
  Value *ValOperand_Shifted = nullptr;
  if (Op == AtomicRMWInst::Xchg || Op == AtomicRMWInst::Add ||
      Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Nand) {
    Value *ValOp = Builder.CreateBitCast(V: AI->getValOperand(), DestTy: PMV.IntValueType);
    ValOperand_Shifted =
        Builder.CreateShl(LHS: Builder.CreateZExt(V: ValOp, DestTy: PMV.WordType), RHS: PMV.ShiftAmt,
                          Name: "ValOperand_Shifted");
  }

  // Invoked inside the loop on the currently loaded word; returns the new
  // word to attempt to store.
  auto PerformPartwordOp = [&](IRBuilderBase &Builder, Value *Loaded) {
    return performMaskedAtomicOp(Op, Builder, Loaded, Shifted_Inc: ValOperand_Shifted,
                                 Inc: AI->getValOperand(), PMV);
  };

  Value *OldResult;
  if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
    OldResult = insertRMWCmpXchgLoop(
        Builder, ResultType: PMV.WordType, Addr: PMV.AlignedAddr, AddrAlign: PMV.AlignedAddrAlignment,
        MemOpOrder, SSID, PerformOp: PerformPartwordOp, CreateCmpXchg: createCmpXchgInstFun, MetadataSrc: AI);
  } else {
    assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
    OldResult = insertRMWLLSCLoop(Builder, ResultTy: PMV.WordType, Addr: PMV.AlignedAddr,
                                  AddrAlign: PMV.AlignedAddrAlignment, MemOpOrder,
                                  PerformOp: PerformPartwordOp);
  }

  // The atomicrmw's result is the old sub-word value, extracted from the
  // old full word returned by the loop.
  Value *FinalOldResult = extractMaskedValue(Builder, WideWord: OldResult, PMV);
  AI->replaceAllUsesWith(V: FinalOldResult);
  AI->eraseFromParent();
}
1072
// Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width.
// Returns the new word-sized atomicrmw that replaces AI.
AtomicRMWInst *AtomicExpandImpl::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
  ReplacementIRBuilder Builder(AI, *DL);
  AtomicRMWInst::BinOp Op = AI->getOperation();

  assert((Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
          Op == AtomicRMWInst::And) &&
         "Unable to widen operation");

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, I: AI, ValueType: AI->getType(), Addr: AI->getPointerOperand(),
                       AddrAlign: AI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / 8);

  Value *ValOperand_Shifted =
      Builder.CreateShl(LHS: Builder.CreateZExt(V: AI->getValOperand(), DestTy: PMV.WordType),
                        RHS: PMV.ShiftAmt, Name: "ValOperand_Shifted");

  Value *NewOperand;

  // Or/Xor with the zero-extension padding leave the bits outside the
  // sub-word unchanged. For And, those bits must be 1 to be preserved
  // (x & 1 == x), so OR the inverted mask into the operand.
  if (Op == AtomicRMWInst::And)
    NewOperand =
        Builder.CreateOr(LHS: ValOperand_Shifted, RHS: PMV.Inv_Mask, Name: "AndOperand");
  else
    NewOperand = ValOperand_Shifted;

  AtomicRMWInst *NewAI = Builder.CreateAtomicRMW(
      Op, Ptr: PMV.AlignedAddr, Val: NewOperand, Align: PMV.AlignedAddrAlignment,
      Ordering: AI->getOrdering(), SSID: AI->getSyncScopeID());

  copyMetadataForAtomic(Dest&: *NewAI, Source: *AI);

  // Extract the old sub-word value from the wide result for AI's users.
  Value *FinalOldResult = extractMaskedValue(Builder, WideWord: NewAI, PMV);
  AI->replaceAllUsesWith(V: FinalOldResult);
  AI->eraseFromParent();
  return NewAI;
}
1109
/// Expand a sub-word cmpxchg into a cmpxchg on the containing, naturally
/// aligned word, with a retry loop so that concurrent changes to the other
/// bytes of the word do not cause spurious failures of a strong cmpxchg.
bool AtomicExpandImpl::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
  // The basic idea here is that we're expanding a cmpxchg of a
  // smaller memory size up to a word-sized cmpxchg. To do this, we
  // need to add a retry-loop for strong cmpxchg, so that
  // modifications to other parts of the word don't cause a spurious
  // failure.

  // This generates code like the following:
  //     [[Setup mask values PMV.*]]
  //     %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
  //     %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
  //     %InitLoaded = load i32* %addr
  //     %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
  //     br partword.cmpxchg.loop
  // partword.cmpxchg.loop:
  //     %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
  //        [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
  //     %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
  //     %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
  //     %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
  //        i32 %FullWord_NewVal success_ordering failure_ordering
  //     %OldVal = extractvalue { i32, i1 } %NewCI, 0
  //     %Success = extractvalue { i32, i1 } %NewCI, 1
  //     br i1 %Success, label %partword.cmpxchg.end,
  //        label %partword.cmpxchg.failure
  // partword.cmpxchg.failure:
  //     %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
  //     %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
  //     br i1 %ShouldContinue, label %partword.cmpxchg.loop,
  //        label %partword.cmpxchg.end
  // partword.cmpxchg.end:
  //    %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
  //    %FinalOldVal = trunc i32 %tmp1 to i8
  //    %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
  //    %Res = insertvalue { i8, i1 } %25, i1 %Success, 1

  Value *Addr = CI->getPointerOperand();
  Value *Cmp = CI->getCompareOperand();
  Value *NewVal = CI->getNewValOperand();

  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  ReplacementIRBuilder Builder(CI, *DL);
  LLVMContext &Ctx = Builder.getContext();

  BasicBlock *EndBB =
      BB->splitBasicBlock(I: CI->getIterator(), BBName: "partword.cmpxchg.end");
  auto FailureBB =
      BasicBlock::Create(Context&: Ctx, Name: "partword.cmpxchg.failure", Parent: F, InsertBefore: EndBB);
  auto LoopBB = BasicBlock::Create(Context&: Ctx, Name: "partword.cmpxchg.loop", Parent: F, InsertBefore: FailureBB);

  // The split call above "helpfully" added a branch at the end of BB
  // (to the wrong place).
  std::prev(x: BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, I: CI, ValueType: CI->getCompareOperand()->getType(), Addr,
                       AddrAlign: CI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / 8);

  // Shift the incoming values over, into the right location in the word.
  Value *NewVal_Shifted =
      Builder.CreateShl(LHS: Builder.CreateZExt(V: NewVal, DestTy: PMV.WordType), RHS: PMV.ShiftAmt);
  Value *Cmp_Shifted =
      Builder.CreateShl(LHS: Builder.CreateZExt(V: Cmp, DestTy: PMV.WordType), RHS: PMV.ShiftAmt);

  // Load the entire current word, and mask into place the expected and new
  // values
  LoadInst *InitLoaded = Builder.CreateLoad(Ty: PMV.WordType, Ptr: PMV.AlignedAddr);
  InitLoaded->setVolatile(CI->isVolatile());
  Value *InitLoaded_MaskOut = Builder.CreateAnd(LHS: InitLoaded, RHS: PMV.Inv_Mask);
  Builder.CreateBr(Dest: LoopBB);

  // partword.cmpxchg.loop:
  Builder.SetInsertPoint(LoopBB);
  // Carries the masked-out (other bytes) part of the most recent load.
  PHINode *Loaded_MaskOut = Builder.CreatePHI(Ty: PMV.WordType, NumReservedValues: 2);
  Loaded_MaskOut->addIncoming(V: InitLoaded_MaskOut, BB);

  // Mask/Or the expected and new values into place in the loaded word.
  Value *FullWord_NewVal = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: NewVal_Shifted);
  Value *FullWord_Cmp = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: Cmp_Shifted);
  AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
      Ptr: PMV.AlignedAddr, Cmp: FullWord_Cmp, New: FullWord_NewVal, Align: PMV.AlignedAddrAlignment,
      SuccessOrdering: CI->getSuccessOrdering(), FailureOrdering: CI->getFailureOrdering(), SSID: CI->getSyncScopeID());
  NewCI->setVolatile(CI->isVolatile());
  // When we're building a strong cmpxchg, we need a loop, so you
  // might think we could use a weak cmpxchg inside. But, using strong
  // allows the below comparison for ShouldContinue, and we're
  // expecting the underlying cmpxchg to be a machine instruction,
  // which is strong anyways.
  NewCI->setWeak(CI->isWeak());

  Value *OldVal = Builder.CreateExtractValue(Agg: NewCI, Idxs: 0);
  Value *Success = Builder.CreateExtractValue(Agg: NewCI, Idxs: 1);

  // A weak cmpxchg is allowed to fail spuriously, so no retry loop is needed.
  if (CI->isWeak())
    Builder.CreateBr(Dest: EndBB);
  else
    Builder.CreateCondBr(Cond: Success, True: EndBB, False: FailureBB);

  // partword.cmpxchg.failure:
  Builder.SetInsertPoint(FailureBB);
  // Upon failure, verify that the masked-out part of the loaded value
  // has been modified. If it didn't, abort the cmpxchg, since the
  // masked-in part must've.
  Value *OldVal_MaskOut = Builder.CreateAnd(LHS: OldVal, RHS: PMV.Inv_Mask);
  Value *ShouldContinue = Builder.CreateICmpNE(LHS: Loaded_MaskOut, RHS: OldVal_MaskOut);
  Builder.CreateCondBr(Cond: ShouldContinue, True: LoopBB, False: EndBB);

  // Add the second value to the phi from above
  Loaded_MaskOut->addIncoming(V: OldVal_MaskOut, BB: FailureBB);

  // partword.cmpxchg.end:
  Builder.SetInsertPoint(CI);

  // Reassemble the { value, success } result pair with the narrow old value.
  Value *FinalOldVal = extractMaskedValue(Builder, WideWord: OldVal, PMV);
  Value *Res = PoisonValue::get(T: CI->getType());
  Res = Builder.CreateInsertValue(Agg: Res, Val: FinalOldVal, Idxs: 0);
  Res = Builder.CreateInsertValue(Agg: Res, Val: Success, Idxs: 1);

  CI->replaceAllUsesWith(V: Res);
  CI->eraseFromParent();
  return true;
}
1234
1235void AtomicExpandImpl::expandAtomicOpToLLSC(
1236 Instruction *I, Type *ResultType, Value *Addr, Align AddrAlign,
1237 AtomicOrdering MemOpOrder,
1238 function_ref<Value *(IRBuilderBase &, Value *)> PerformOp) {
1239 ReplacementIRBuilder Builder(I, *DL);
1240 Value *Loaded = insertRMWLLSCLoop(Builder, ResultTy: ResultType, Addr, AddrAlign,
1241 MemOpOrder, PerformOp);
1242
1243 I->replaceAllUsesWith(V: Loaded);
1244 I->eraseFromParent();
1245}
1246
/// Lower a partword atomicrmw via a target-supplied masked intrinsic
/// (TLI->emitMaskedAtomicRMWIntrinsic) operating on the containing word.
void AtomicExpandImpl::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) {
  ReplacementIRBuilder Builder(AI, *DL);

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, I: AI, ValueType: AI->getType(), Addr: AI->getPointerOperand(),
                       AddrAlign: AI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / 8);

  // The value operand must be sign-extended for signed min/max so that the
  // target's signed comparison instructions can be used. Otherwise, just
  // zero-ext.
  Instruction::CastOps CastOp = Instruction::ZExt;
  AtomicRMWInst::BinOp RMWOp = AI->getOperation();
  if (RMWOp == AtomicRMWInst::Max || RMWOp == AtomicRMWInst::Min)
    CastOp = Instruction::SExt;

  // Move the operand into its position within the containing word.
  Value *ValOperand_Shifted = Builder.CreateShl(
      LHS: Builder.CreateCast(Op: CastOp, V: AI->getValOperand(), DestTy: PMV.WordType),
      RHS: PMV.ShiftAmt, Name: "ValOperand_Shifted");
  Value *OldResult = TLI->emitMaskedAtomicRMWIntrinsic(
      Builder, AI, AlignedAddr: PMV.AlignedAddr, Incr: ValOperand_Shifted, Mask: PMV.Mask, ShiftAmt: PMV.ShiftAmt,
      Ord: AI->getOrdering());
  // Extract the old sub-word value from the wide result for AI's users.
  Value *FinalOldResult = extractMaskedValue(Builder, WideWord: OldResult, PMV);
  AI->replaceAllUsesWith(V: FinalOldResult);
  AI->eraseFromParent();
}
1272
/// Lower a partword cmpxchg via a target-supplied masked intrinsic
/// (TLI->emitMaskedAtomicCmpXchgIntrinsic) operating on the containing word.
void AtomicExpandImpl::expandAtomicCmpXchgToMaskedIntrinsic(
    AtomicCmpXchgInst *CI) {
  ReplacementIRBuilder Builder(CI, *DL);

  PartwordMaskValues PMV = createMaskInstrs(
      Builder, I: CI, ValueType: CI->getCompareOperand()->getType(), Addr: CI->getPointerOperand(),
      AddrAlign: CI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / 8);

  // Move both operands into their positions within the containing word.
  Value *CmpVal_Shifted = Builder.CreateShl(
      LHS: Builder.CreateZExt(V: CI->getCompareOperand(), DestTy: PMV.WordType), RHS: PMV.ShiftAmt,
      Name: "CmpVal_Shifted");
  Value *NewVal_Shifted = Builder.CreateShl(
      LHS: Builder.CreateZExt(V: CI->getNewValOperand(), DestTy: PMV.WordType), RHS: PMV.ShiftAmt,
      Name: "NewVal_Shifted");
  Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic(
      Builder, CI, AlignedAddr: PMV.AlignedAddr, CmpVal: CmpVal_Shifted, NewVal: NewVal_Shifted, Mask: PMV.Mask,
      Ord: CI->getMergedOrdering());
  // Rebuild the { value, success } pair: success iff the masked part of the
  // returned old word equals the (shifted) expected value.
  Value *FinalOldVal = extractMaskedValue(Builder, WideWord: OldVal, PMV);
  Value *Res = PoisonValue::get(T: CI->getType());
  Res = Builder.CreateInsertValue(Agg: Res, Val: FinalOldVal, Idxs: 0);
  Value *Success = Builder.CreateICmpEQ(
      LHS: CmpVal_Shifted, RHS: Builder.CreateAnd(LHS: OldVal, RHS: PMV.Mask), Name: "Success");
  Res = Builder.CreateInsertValue(Agg: Res, Val: Success, Idxs: 1);

  CI->replaceAllUsesWith(V: Res);
  CI->eraseFromParent();
}
1300
/// Emit a load-linked/store-conditional loop that applies \p PerformOp to
/// the loaded value and retries until the conditional store succeeds.
/// Returns the loaded (pre-operation) value; on return, \p Builder's insert
/// point is at the start of the exit block.
Value *AtomicExpandImpl::insertRMWLLSCLoop(
    IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
    AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilderBase &, Value *)> PerformOp) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  assert(AddrAlign >= F->getDataLayout().getTypeStoreSize(ResultTy) &&
         "Expected at least natural alignment at this point.");

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  // atomicrmw.start:
  //     %loaded = @load.linked(%addr)
  //     %new = some_op iN %loaded, %incr
  //     %stored = @store_conditional(%new, %addr)
  //     %try_again = icmp i32 ne %stored, 0
  //     br i1 %try_again, label %loop, label %atomicrmw.end
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.start", Parent: F, InsertBefore: ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place).
  std::prev(x: BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(Dest: LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, ValueTy: ResultTy, Addr, Ord: MemOpOrder);

  Value *NewVal = PerformOp(Builder, Loaded);

  // A nonzero status from the conditional store means the reservation was
  // lost; retry the whole loop.
  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, Val: NewVal, Addr, Ord: MemOpOrder);
  Value *TryAgain = Builder.CreateICmpNE(
      LHS: StoreSuccess, RHS: ConstantInt::get(Ty: IntegerType::get(C&: Ctx, NumBits: 32), V: 0), Name: "tryagain");

  Instruction *CondBr = Builder.CreateCondBr(Cond: TryAgain, True: LoopBB, False: ExitBB);

  // Atomic RMW expands to a Load-linked / Store-Conditional loop, because it is
  // hard to predict precise branch weights we mark the branch as "unknown"
  // (50/50) to prevent misleading optimizations.
  setExplicitlyUnknownBranchWeightsIfProfiled(I&: *CondBr, DEBUG_TYPE);

  Builder.SetInsertPoint(TheBB: ExitBB, IP: ExitBB->begin());
  return Loaded;
}
1355
/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
/// IR. As a migration step, we convert back to what used to be the standard
/// way to represent a pointer cmpxchg so that we can update backends one by
/// one.
AtomicCmpXchgInst *
AtomicExpandImpl::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
  auto *M = CI->getModule();
  Type *NewTy = getCorrespondingIntegerType(T: CI->getCompareOperand()->getType(),
                                            DL: M->getDataLayout());

  ReplacementIRBuilder Builder(CI, *DL);

  Value *Addr = CI->getPointerOperand();

  // Convert both value operands to the equivalent-width integer type.
  Value *NewCmp = Builder.CreatePtrToInt(V: CI->getCompareOperand(), DestTy: NewTy);
  Value *NewNewVal = Builder.CreatePtrToInt(V: CI->getNewValOperand(), DestTy: NewTy);

  // Recreate the cmpxchg with identical orderings, scope, and flags.
  auto *NewCI = Builder.CreateAtomicCmpXchg(
      Ptr: Addr, Cmp: NewCmp, New: NewNewVal, Align: CI->getAlign(), SuccessOrdering: CI->getSuccessOrdering(),
      FailureOrdering: CI->getFailureOrdering(), SSID: CI->getSyncScopeID());
  NewCI->setVolatile(CI->isVolatile());
  NewCI->setWeak(CI->isWeak());
  LLVM_DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");

  Value *OldVal = Builder.CreateExtractValue(Agg: NewCI, Idxs: 0);
  Value *Succ = Builder.CreateExtractValue(Agg: NewCI, Idxs: 1);

  // Convert the returned old value back to the original pointer type.
  OldVal = Builder.CreateIntToPtr(V: OldVal, DestTy: CI->getCompareOperand()->getType());

  Value *Res = PoisonValue::get(T: CI->getType());
  Res = Builder.CreateInsertValue(Agg: Res, Val: OldVal, Idxs: 0);
  Res = Builder.CreateInsertValue(Agg: Res, Val: Succ, Idxs: 1);

  CI->replaceAllUsesWith(V: Res);
  CI->eraseFromParent();
  return NewCI;
}
1394
1395bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
1396 AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
1397 AtomicOrdering FailureOrder = CI->getFailureOrdering();
1398 Value *Addr = CI->getPointerOperand();
1399 BasicBlock *BB = CI->getParent();
1400 Function *F = BB->getParent();
1401 LLVMContext &Ctx = F->getContext();
1402 // If shouldInsertFencesForAtomic() returns true, then the target does not
1403 // want to deal with memory orders, and emitLeading/TrailingFence should take
1404 // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
1405 // should preserve the ordering.
1406 bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(I: CI);
1407 AtomicOrdering MemOpOrder = ShouldInsertFencesForAtomic
1408 ? AtomicOrdering::Monotonic
1409 : CI->getMergedOrdering();
1410
1411 // In implementations which use a barrier to achieve release semantics, we can
1412 // delay emitting this barrier until we know a store is actually going to be
1413 // attempted. The cost of this delay is that we need 2 copies of the block
1414 // emitting the load-linked, affecting code size.
1415 //
1416 // Ideally, this logic would be unconditional except for the minsize check
1417 // since in other cases the extra blocks naturally collapse down to the
1418 // minimal loop. Unfortunately, this puts too much stress on later
1419 // optimisations so we avoid emitting the extra logic in those cases too.
1420 bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
1421 SuccessOrder != AtomicOrdering::Monotonic &&
1422 SuccessOrder != AtomicOrdering::Acquire &&
1423 !F->hasMinSize();
1424
1425 // There's no overhead for sinking the release barrier in a weak cmpxchg, so
1426 // do it even on minsize.
1427 bool UseUnconditionalReleaseBarrier = F->hasMinSize() && !CI->isWeak();
1428
1429 // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
1430 //
1431 // The full expansion we produce is:
1432 // [...]
1433 // %aligned.addr = ...
1434 // cmpxchg.start:
1435 // %unreleasedload = @load.linked(%aligned.addr)
1436 // %unreleasedload.extract = extract value from %unreleasedload
1437 // %should_store = icmp eq %unreleasedload.extract, %desired
1438 // br i1 %should_store, label %cmpxchg.releasingstore,
1439 // label %cmpxchg.nostore
1440 // cmpxchg.releasingstore:
1441 // fence?
1442 // br label cmpxchg.trystore
1443 // cmpxchg.trystore:
1444 // %loaded.trystore = phi [%unreleasedload, %cmpxchg.releasingstore],
1445 // [%releasedload, %cmpxchg.releasedload]
1446 // %updated.new = insert %new into %loaded.trystore
1447 // %stored = @store_conditional(%updated.new, %aligned.addr)
1448 // %success = icmp eq i32 %stored, 0
1449 // br i1 %success, label %cmpxchg.success,
1450 // label %cmpxchg.releasedload/%cmpxchg.failure
1451 // cmpxchg.releasedload:
1452 // %releasedload = @load.linked(%aligned.addr)
1453 // %releasedload.extract = extract value from %releasedload
1454 // %should_store = icmp eq %releasedload.extract, %desired
1455 // br i1 %should_store, label %cmpxchg.trystore,
1456 // label %cmpxchg.failure
1457 // cmpxchg.success:
1458 // fence?
1459 // br label %cmpxchg.end
1460 // cmpxchg.nostore:
1461 // %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
1462 // [%releasedload,
1463 // %cmpxchg.releasedload/%cmpxchg.trystore]
1464 // @load_linked_fail_balance()?
1465 // br label %cmpxchg.failure
1466 // cmpxchg.failure:
1467 // fence?
1468 // br label %cmpxchg.end
1469 // cmpxchg.end:
1470 // %loaded.exit = phi [%loaded.nostore, %cmpxchg.failure],
1471 // [%loaded.trystore, %cmpxchg.trystore]
1472 // %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
1473 // %loaded = extract value from %loaded.exit
1474 // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
1475 // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
1476 // [...]
1477 BasicBlock *ExitBB = BB->splitBasicBlock(I: CI->getIterator(), BBName: "cmpxchg.end");
1478 auto FailureBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.failure", Parent: F, InsertBefore: ExitBB);
1479 auto NoStoreBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.nostore", Parent: F, InsertBefore: FailureBB);
1480 auto SuccessBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.success", Parent: F, InsertBefore: NoStoreBB);
1481 auto ReleasedLoadBB =
1482 BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.releasedload", Parent: F, InsertBefore: SuccessBB);
1483 auto TryStoreBB =
1484 BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.trystore", Parent: F, InsertBefore: ReleasedLoadBB);
1485 auto ReleasingStoreBB =
1486 BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.fencedstore", Parent: F, InsertBefore: TryStoreBB);
1487 auto StartBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.start", Parent: F, InsertBefore: ReleasingStoreBB);
1488
1489 ReplacementIRBuilder Builder(CI, *DL);
1490
1491 // The split call above "helpfully" added a branch at the end of BB (to the
1492 // wrong place), but we might want a fence too. It's easiest to just remove
1493 // the branch entirely.
1494 std::prev(x: BB->end())->eraseFromParent();
1495 Builder.SetInsertPoint(BB);
1496 if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
1497 TLI->emitLeadingFence(Builder, Inst: CI, Ord: SuccessOrder);
1498
1499 PartwordMaskValues PMV =
1500 createMaskInstrs(Builder, I: CI, ValueType: CI->getCompareOperand()->getType(), Addr,
1501 AddrAlign: CI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / 8);
1502 Builder.CreateBr(Dest: StartBB);
1503
1504 // Start the main loop block now that we've taken care of the preliminaries.
1505 Builder.SetInsertPoint(StartBB);
1506 Value *UnreleasedLoad =
1507 TLI->emitLoadLinked(Builder, ValueTy: PMV.WordType, Addr: PMV.AlignedAddr, Ord: MemOpOrder);
1508 Value *UnreleasedLoadExtract =
1509 extractMaskedValue(Builder, WideWord: UnreleasedLoad, PMV);
1510 Value *ShouldStore = Builder.CreateICmpEQ(
1511 LHS: UnreleasedLoadExtract, RHS: CI->getCompareOperand(), Name: "should_store");
1512
1513 // If the cmpxchg doesn't actually need any ordering when it fails, we can
1514 // jump straight past that fence instruction (if it exists).
1515 Builder.CreateCondBr(Cond: ShouldStore, True: ReleasingStoreBB, False: NoStoreBB,
1516 BranchWeights: MDBuilder(F->getContext()).createLikelyBranchWeights());
1517
1518 Builder.SetInsertPoint(ReleasingStoreBB);
1519 if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
1520 TLI->emitLeadingFence(Builder, Inst: CI, Ord: SuccessOrder);
1521 Builder.CreateBr(Dest: TryStoreBB);
1522
1523 Builder.SetInsertPoint(TryStoreBB);
1524 PHINode *LoadedTryStore =
1525 Builder.CreatePHI(Ty: PMV.WordType, NumReservedValues: 2, Name: "loaded.trystore");
1526 LoadedTryStore->addIncoming(V: UnreleasedLoad, BB: ReleasingStoreBB);
1527 Value *NewValueInsert =
1528 insertMaskedValue(Builder, WideWord: LoadedTryStore, Updated: CI->getNewValOperand(), PMV);
1529 Value *StoreSuccess = TLI->emitStoreConditional(Builder, Val: NewValueInsert,
1530 Addr: PMV.AlignedAddr, Ord: MemOpOrder);
1531 StoreSuccess = Builder.CreateICmpEQ(
1532 LHS: StoreSuccess, RHS: ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: 0), Name: "success");
1533 BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
1534 Builder.CreateCondBr(Cond: StoreSuccess, True: SuccessBB,
1535 False: CI->isWeak() ? FailureBB : RetryBB,
1536 BranchWeights: MDBuilder(F->getContext()).createLikelyBranchWeights());
1537
1538 Builder.SetInsertPoint(ReleasedLoadBB);
1539 Value *SecondLoad;
1540 if (HasReleasedLoadBB) {
1541 SecondLoad =
1542 TLI->emitLoadLinked(Builder, ValueTy: PMV.WordType, Addr: PMV.AlignedAddr, Ord: MemOpOrder);
1543 Value *SecondLoadExtract = extractMaskedValue(Builder, WideWord: SecondLoad, PMV);
1544 ShouldStore = Builder.CreateICmpEQ(LHS: SecondLoadExtract,
1545 RHS: CI->getCompareOperand(), Name: "should_store");
1546
1547 // If the cmpxchg doesn't actually need any ordering when it fails, we can
1548 // jump straight past that fence instruction (if it exists).
1549 Builder.CreateCondBr(
1550 Cond: ShouldStore, True: TryStoreBB, False: NoStoreBB,
1551 BranchWeights: MDBuilder(F->getContext()).createLikelyBranchWeights());
1552 // Update PHI node in TryStoreBB.
1553 LoadedTryStore->addIncoming(V: SecondLoad, BB: ReleasedLoadBB);
1554 } else
1555 Builder.CreateUnreachable();
1556
1557 // Make sure later instructions don't get reordered with a fence if
1558 // necessary.
1559 Builder.SetInsertPoint(SuccessBB);
1560 if (ShouldInsertFencesForAtomic ||
1561 TLI->shouldInsertTrailingSeqCstFenceForAtomicStore(I: CI))
1562 TLI->emitTrailingFence(Builder, Inst: CI, Ord: SuccessOrder);
1563 Builder.CreateBr(Dest: ExitBB);
1564
1565 Builder.SetInsertPoint(NoStoreBB);
1566 PHINode *LoadedNoStore =
1567 Builder.CreatePHI(Ty: UnreleasedLoad->getType(), NumReservedValues: 2, Name: "loaded.nostore");
1568 LoadedNoStore->addIncoming(V: UnreleasedLoad, BB: StartBB);
1569 if (HasReleasedLoadBB)
1570 LoadedNoStore->addIncoming(V: SecondLoad, BB: ReleasedLoadBB);
1571
1572 // In the failing case, where we don't execute the store-conditional, the
1573 // target might want to balance out the load-linked with a dedicated
1574 // instruction (e.g., on ARM, clearing the exclusive monitor).
1575 TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
1576 Builder.CreateBr(Dest: FailureBB);
1577
1578 Builder.SetInsertPoint(FailureBB);
1579 PHINode *LoadedFailure =
1580 Builder.CreatePHI(Ty: UnreleasedLoad->getType(), NumReservedValues: 2, Name: "loaded.failure");
1581 LoadedFailure->addIncoming(V: LoadedNoStore, BB: NoStoreBB);
1582 if (CI->isWeak())
1583 LoadedFailure->addIncoming(V: LoadedTryStore, BB: TryStoreBB);
1584 if (ShouldInsertFencesForAtomic)
1585 TLI->emitTrailingFence(Builder, Inst: CI, Ord: FailureOrder);
1586 Builder.CreateBr(Dest: ExitBB);
1587
1588 // Finally, we have control-flow based knowledge of whether the cmpxchg
1589 // succeeded or not. We expose this to later passes by converting any
1590 // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
1591 // PHI.
1592 Builder.SetInsertPoint(TheBB: ExitBB, IP: ExitBB->begin());
1593 PHINode *LoadedExit =
1594 Builder.CreatePHI(Ty: UnreleasedLoad->getType(), NumReservedValues: 2, Name: "loaded.exit");
1595 LoadedExit->addIncoming(V: LoadedTryStore, BB: SuccessBB);
1596 LoadedExit->addIncoming(V: LoadedFailure, BB: FailureBB);
1597 PHINode *Success = Builder.CreatePHI(Ty: Type::getInt1Ty(C&: Ctx), NumReservedValues: 2, Name: "success");
1598 Success->addIncoming(V: ConstantInt::getTrue(Context&: Ctx), BB: SuccessBB);
1599 Success->addIncoming(V: ConstantInt::getFalse(Context&: Ctx), BB: FailureBB);
1600
1601 // This is the "exit value" from the cmpxchg expansion. It may be of
1602 // a type wider than the one in the cmpxchg instruction.
1603 Value *LoadedFull = LoadedExit;
1604
1605 Builder.SetInsertPoint(TheBB: ExitBB, IP: std::next(x: Success->getIterator()));
1606 Value *Loaded = extractMaskedValue(Builder, WideWord: LoadedFull, PMV);
1607
1608 // Look for any users of the cmpxchg that are just comparing the loaded value
1609 // against the desired one, and replace them with the CFG-derived version.
1610 SmallVector<ExtractValueInst *, 2> PrunedInsts;
1611 for (auto *User : CI->users()) {
1612 ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Val: User);
1613 if (!EV)
1614 continue;
1615
1616 assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
1617 "weird extraction from { iN, i1 }");
1618
1619 if (EV->getIndices()[0] == 0)
1620 EV->replaceAllUsesWith(V: Loaded);
1621 else
1622 EV->replaceAllUsesWith(V: Success);
1623
1624 PrunedInsts.push_back(Elt: EV);
1625 }
1626
1627 // We can remove the instructions now we're no longer iterating through them.
1628 for (auto *EV : PrunedInsts)
1629 EV->eraseFromParent();
1630
1631 if (!CI->use_empty()) {
1632 // Some use of the full struct return that we don't understand has happened,
1633 // so we've got to reconstruct it properly.
1634 Value *Res;
1635 Res = Builder.CreateInsertValue(Agg: PoisonValue::get(T: CI->getType()), Val: Loaded, Idxs: 0);
1636 Res = Builder.CreateInsertValue(Agg: Res, Val: Success, Idxs: 1);
1637
1638 CI->replaceAllUsesWith(V: Res);
1639 }
1640
1641 CI->eraseFromParent();
1642 return true;
1643}
1644
1645bool AtomicExpandImpl::isIdempotentRMW(AtomicRMWInst *RMWI) {
1646 // TODO: Add floating point support.
1647 auto C = dyn_cast<ConstantInt>(Val: RMWI->getValOperand());
1648 if (!C)
1649 return false;
1650
1651 switch (RMWI->getOperation()) {
1652 case AtomicRMWInst::Add:
1653 case AtomicRMWInst::Sub:
1654 case AtomicRMWInst::Or:
1655 case AtomicRMWInst::Xor:
1656 return C->isZero();
1657 case AtomicRMWInst::And:
1658 return C->isMinusOne();
1659 case AtomicRMWInst::Min:
1660 return C->isMaxValue(IsSigned: true);
1661 case AtomicRMWInst::Max:
1662 return C->isMinValue(IsSigned: true);
1663 case AtomicRMWInst::UMin:
1664 return C->isMaxValue(IsSigned: false);
1665 case AtomicRMWInst::UMax:
1666 return C->isMinValue(IsSigned: false);
1667 default:
1668 return false;
1669 }
1670}
1671
1672bool AtomicExpandImpl::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
1673 if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
1674 tryExpandAtomicLoad(LI: ResultingLoad);
1675 return true;
1676 }
1677 return false;
1678}
1679
// Emit a load + cmpxchg retry loop that atomically applies PerformOp to the
// value at Addr. PerformOp computes the desired new value from the currently
// observed one; CreateCmpXchg emits the compare-exchange itself (a real
// cmpxchg instruction, a masked form, or a libcall-flavored one). Returns the
// value observed by the final, successful cmpxchg iteration (the atomic "old"
// value), with the builder left at the start of the exit block.
Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
    IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
    AtomicOrdering MemOpOrder, SyncScope::ID SSID,
    function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
    CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.start", Parent: F, InsertBefore: ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(x: BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateAlignedLoad(Ty: ResultTy, Ptr: Addr, Align: AddrAlign);
  // TODO: The initial load must be atomic with the same synchronization scope
  // to avoid a data race with concurrent stores. If the instruction being
  // emulated is volatile, issue a volatile load.
  Builder.CreateBr(Dest: LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(Ty: ResultTy, NumReservedValues: 2, Name: "loaded");
  Loaded->addIncoming(V: InitLoaded, BB);

  // Let the caller compute the updated value from the observed one.
  Value *NewVal = PerformOp(Builder, Loaded);

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  // cmpxchg does not accept the Unordered ordering; Monotonic is the weakest
  // ordering it supports, so weaken Unordered to that.
  CreateCmpXchg(Builder, Addr, Loaded, NewVal, AddrAlign,
                MemOpOrder == AtomicOrdering::Unordered
                    ? AtomicOrdering::Monotonic
                    : MemOpOrder,
                SSID, Success, NewLoaded, MetadataSrc);
  assert(Success && NewLoaded);

  // Close the loop: the value observed by a failed cmpxchg feeds the next
  // iteration's PHI.
  Loaded->addIncoming(V: NewLoaded, BB: LoopBB);

  Instruction *CondBr = Builder.CreateCondBr(Cond: Success, True: ExitBB, False: LoopBB);

  // Atomic RMW expands to a cmpxchg loop, Since precise branch weights
  // cannot be easily determined here, we mark the branch as "unknown" (50/50)
  // to prevent misleading optimizations.
  setExplicitlyUnknownBranchWeightsIfProfiled(I&: *CondBr, DEBUG_TYPE);

  Builder.SetInsertPoint(TheBB: ExitBB, IP: ExitBB->begin());
  return NewLoaded;
}
1748
1749bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
1750 unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
1751 unsigned ValueSize = getAtomicOpSize(CASI: CI);
1752
1753 switch (TLI->shouldExpandAtomicCmpXchgInIR(AI: CI)) {
1754 default:
1755 llvm_unreachable("Unhandled case in tryExpandAtomicCmpXchg");
1756 case TargetLoweringBase::AtomicExpansionKind::None:
1757 if (ValueSize < MinCASSize)
1758 return expandPartwordCmpXchg(CI);
1759 return false;
1760 case TargetLoweringBase::AtomicExpansionKind::LLSC: {
1761 return expandAtomicCmpXchg(CI);
1762 }
1763 case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic:
1764 expandAtomicCmpXchgToMaskedIntrinsic(CI);
1765 return true;
1766 case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
1767 return lowerAtomicCmpXchgInst(CXI: CI);
1768 case TargetLoweringBase::AtomicExpansionKind::CustomExpand: {
1769 TLI->emitExpandAtomicCmpXchg(CI);
1770 return true;
1771 }
1772 }
1773}
1774
1775bool AtomicExpandImpl::expandAtomicRMWToCmpXchg(
1776 AtomicRMWInst *AI, CreateCmpXchgInstFun CreateCmpXchg) {
1777 ReplacementIRBuilder Builder(AI, AI->getDataLayout());
1778 Builder.setIsFPConstrained(
1779 AI->getFunction()->hasFnAttribute(Kind: Attribute::StrictFP));
1780
1781 // FIXME: If FP exceptions are observable, we should force them off for the
1782 // loop for the FP atomics.
1783 Value *Loaded = AtomicExpandImpl::insertRMWCmpXchgLoop(
1784 Builder, ResultTy: AI->getType(), Addr: AI->getPointerOperand(), AddrAlign: AI->getAlign(),
1785 MemOpOrder: AI->getOrdering(), SSID: AI->getSyncScopeID(),
1786 PerformOp: [&](IRBuilderBase &Builder, Value *Loaded) {
1787 return buildAtomicRMWValue(Op: AI->getOperation(), Builder, Loaded,
1788 Val: AI->getValOperand());
1789 },
1790 CreateCmpXchg, /*MetadataSrc=*/AI);
1791
1792 AI->replaceAllUsesWith(V: Loaded);
1793 AI->eraseFromParent();
1794 return true;
1795}
1796
1797// In order to use one of the sized library calls such as
1798// __atomic_fetch_add_4, the alignment must be sufficient, the size
1799// must be one of the potentially-specialized sizes, and the value
1800// type must actually exist in C on the target (otherwise, the
1801// function wouldn't actually be defined.)
1802static bool canUseSizedAtomicCall(unsigned Size, Align Alignment,
1803 const DataLayout &DL) {
1804 // TODO: "LargestSize" is an approximation for "largest type that
1805 // you can express in C". It seems to be the case that int128 is
1806 // supported on all 64-bit platforms, otherwise only up to 64-bit
1807 // integers are supported. If we get this wrong, then we'll try to
1808 // call a sized libcall that doesn't actually exist. There should
1809 // really be some more reliable way in LLVM of determining integer
1810 // sizes which are valid in the target's C ABI...
1811 unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
1812 return Alignment >= Size &&
1813 (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
1814 Size <= LargestSize;
1815}
1816
1817void AtomicExpandImpl::expandAtomicLoadToLibcall(LoadInst *I) {
1818 static const RTLIB::Libcall Libcalls[6] = {
1819 RTLIB::ATOMIC_LOAD, RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
1820 RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
1821 unsigned Size = getAtomicOpSize(LI: I);
1822
1823 bool expanded = expandAtomicOpToLibcall(
1824 I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: nullptr, CASExpected: nullptr,
1825 Ordering: I->getOrdering(), Ordering2: AtomicOrdering::NotAtomic, Libcalls);
1826 if (!expanded)
1827 handleFailure(FailedInst&: *I, Msg: "unsupported atomic load");
1828}
1829
1830void AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) {
1831 static const RTLIB::Libcall Libcalls[6] = {
1832 RTLIB::ATOMIC_STORE, RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
1833 RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
1834 unsigned Size = getAtomicOpSize(SI: I);
1835
1836 bool expanded = expandAtomicOpToLibcall(
1837 I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: I->getValueOperand(),
1838 CASExpected: nullptr, Ordering: I->getOrdering(), Ordering2: AtomicOrdering::NotAtomic, Libcalls);
1839 if (!expanded)
1840 handleFailure(FailedInst&: *I, Msg: "unsupported atomic store");
1841}
1842
1843void AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
1844 static const RTLIB::Libcall Libcalls[6] = {
1845 RTLIB::ATOMIC_COMPARE_EXCHANGE, RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
1846 RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
1847 RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
1848 unsigned Size = getAtomicOpSize(CASI: I);
1849
1850 bool expanded = expandAtomicOpToLibcall(
1851 I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: I->getNewValOperand(),
1852 CASExpected: I->getCompareOperand(), Ordering: I->getSuccessOrdering(), Ordering2: I->getFailureOrdering(),
1853 Libcalls);
1854 if (!expanded)
1855 handleFailure(FailedInst&: *I, Msg: "unsupported cmpxchg");
1856}
1857
1858static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
1859 static const RTLIB::Libcall LibcallsXchg[6] = {
1860 RTLIB::ATOMIC_EXCHANGE, RTLIB::ATOMIC_EXCHANGE_1,
1861 RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
1862 RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
1863 static const RTLIB::Libcall LibcallsAdd[6] = {
1864 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_ADD_1,
1865 RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
1866 RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
1867 static const RTLIB::Libcall LibcallsSub[6] = {
1868 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_SUB_1,
1869 RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
1870 RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
1871 static const RTLIB::Libcall LibcallsAnd[6] = {
1872 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_AND_1,
1873 RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
1874 RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
1875 static const RTLIB::Libcall LibcallsOr[6] = {
1876 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_OR_1,
1877 RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
1878 RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
1879 static const RTLIB::Libcall LibcallsXor[6] = {
1880 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_XOR_1,
1881 RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
1882 RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
1883 static const RTLIB::Libcall LibcallsNand[6] = {
1884 RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_NAND_1,
1885 RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
1886 RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};
1887
1888 switch (Op) {
1889 case AtomicRMWInst::BAD_BINOP:
1890 llvm_unreachable("Should not have BAD_BINOP.");
1891 case AtomicRMWInst::Xchg:
1892 return ArrayRef(LibcallsXchg);
1893 case AtomicRMWInst::Add:
1894 return ArrayRef(LibcallsAdd);
1895 case AtomicRMWInst::Sub:
1896 return ArrayRef(LibcallsSub);
1897 case AtomicRMWInst::And:
1898 return ArrayRef(LibcallsAnd);
1899 case AtomicRMWInst::Or:
1900 return ArrayRef(LibcallsOr);
1901 case AtomicRMWInst::Xor:
1902 return ArrayRef(LibcallsXor);
1903 case AtomicRMWInst::Nand:
1904 return ArrayRef(LibcallsNand);
1905 case AtomicRMWInst::Max:
1906 case AtomicRMWInst::Min:
1907 case AtomicRMWInst::UMax:
1908 case AtomicRMWInst::UMin:
1909 case AtomicRMWInst::FMax:
1910 case AtomicRMWInst::FMin:
1911 case AtomicRMWInst::FMaximum:
1912 case AtomicRMWInst::FMinimum:
1913 case AtomicRMWInst::FMaximumNum:
1914 case AtomicRMWInst::FMinimumNum:
1915 case AtomicRMWInst::FAdd:
1916 case AtomicRMWInst::FSub:
1917 case AtomicRMWInst::UIncWrap:
1918 case AtomicRMWInst::UDecWrap:
1919 case AtomicRMWInst::USubCond:
1920 case AtomicRMWInst::USubSat:
1921 // No atomic libcalls are available for these.
1922 return {};
1923 }
1924 llvm_unreachable("Unexpected AtomicRMW operation.");
1925}
1926
1927void AtomicExpandImpl::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
1928 ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(Op: I->getOperation());
1929
1930 unsigned Size = getAtomicOpSize(RMWI: I);
1931
1932 bool Success = false;
1933 if (!Libcalls.empty())
1934 Success = expandAtomicOpToLibcall(
1935 I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: I->getValOperand(),
1936 CASExpected: nullptr, Ordering: I->getOrdering(), Ordering2: AtomicOrdering::NotAtomic, Libcalls);
1937
1938 // The expansion failed: either there were no libcalls at all for
1939 // the operation (min/max), or there were only size-specialized
1940 // libcalls (add/sub/etc) and we needed a generic. So, expand to a
1941 // CAS libcall, via a CAS loop, instead.
1942 if (!Success) {
1943 expandAtomicRMWToCmpXchg(
1944 AI: I, CreateCmpXchg: [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded,
1945 Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder,
1946 SyncScope::ID SSID, Value *&Success, Value *&NewLoaded,
1947 Instruction *MetadataSrc) {
1948 // Create the CAS instruction normally...
1949 AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
1950 Ptr: Addr, Cmp: Loaded, New: NewVal, Align: Alignment, SuccessOrdering: MemOpOrder,
1951 FailureOrdering: AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering: MemOpOrder), SSID);
1952 if (MetadataSrc)
1953 copyMetadataForAtomic(Dest&: *Pair, Source: *MetadataSrc);
1954
1955 Success = Builder.CreateExtractValue(Agg: Pair, Idxs: 1, Name: "success");
1956 NewLoaded = Builder.CreateExtractValue(Agg: Pair, Idxs: 0, Name: "newloaded");
1957
1958 // ...and then expand the CAS into a libcall.
1959 expandAtomicCASToLibcall(I: Pair);
1960 });
1961 }
1962}
1963
// A helper routine for the above expandAtomic*ToLibcall functions.
//
// 'Libcalls' contains an array of enum values for the particular
// ATOMIC libcalls to be emitted. All of the other arguments besides
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
//
// Returns true when 'I' was replaced by a libcall (and erased); returns false
// — leaving 'I' untouched — when no suitable libcall exists for this
// size/alignment/target combination.
bool AtomicExpandImpl::expandAtomicOpToLibcall(
    Instruction *I, unsigned Size, Align Alignment, Value *PointerOperand,
    Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
    AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
  assert(Libcalls.size() == 6);

  LLVMContext &Ctx = I->getContext();
  Module *M = I->getModule();
  const DataLayout &DL = M->getDataLayout();
  IRBuilder<> Builder(I);
  // Temporary allocas go at the top of the entry block so they are static.
  IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());

  bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
  Type *SizedIntTy = Type::getIntNTy(C&: Ctx, N: Size * 8);

  if (M->getTargetTriple().isOSWindows() && M->getTargetTriple().isX86_64() &&
      Size == 16) {
    // x86_64 Windows passes i128 as an XMM vector; on return, it is in
    // XMM0, and as a parameter, it is passed indirectly. The generic lowering
    // rules handles this correctly if we pass it as a v2i64 rather than
    // i128. This is what Clang does in the frontend for such types as well
    // (see WinX86_64ABIInfo::classify in Clang).
    SizedIntTy = FixedVectorType::get(ElementType: Type::getInt64Ty(C&: Ctx), NumElts: 2);
  }

  const Align AllocaAlignment = DL.getPrefTypeAlign(Ty: SizedIntTy);

  // TODO: the "order" argument type is "int", not int32. So
  // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
  assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
  Constant *OrderingVal =
      ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: (int)toCABI(AO: Ordering));
  Constant *Ordering2Val = nullptr;
  if (CASExpected) {
    // Only cmpxchg supplies a second (failure) ordering.
    assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
    Ordering2Val =
        ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: (int)toCABI(AO: Ordering2));
  }
  // True when the original instruction produces a value (it is void e.g. for
  // atomic stores).
  bool HasResult = I->getType() != Type::getVoidTy(C&: Ctx);

  RTLIB::Libcall RTLibType;
  if (UseSizedLibcall) {
    // canUseSizedAtomicCall limits Size to exactly these values, so every
    // path through this switch initializes RTLibType.
    switch (Size) {
    case 1:
      RTLibType = Libcalls[1];
      break;
    case 2:
      RTLibType = Libcalls[2];
      break;
    case 4:
      RTLibType = Libcalls[3];
      break;
    case 8:
      RTLibType = Libcalls[4];
      break;
    case 16:
      RTLibType = Libcalls[5];
      break;
    }
  } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
    RTLibType = Libcalls[0];
  } else {
    // Can't use sized function, and there's no generic for this
    // operation, so give up.
    return false;
  }

  RTLIB::LibcallImpl LibcallImpl = LibcallLowering->getLibcallImpl(Call: RTLibType);
  if (LibcallImpl == RTLIB::Unsupported) {
    // This target does not implement the requested atomic libcall so give up.
    return false;
  }

  // Build up the function call. There's two kinds. First, the sized
  // variants. These calls are going to be one of the following (with
  // N=1,2,4,8,16):
  //  iN    __atomic_load_N(iN *ptr, int ordering)
  //  void  __atomic_store_N(iN *ptr, iN val, int ordering)
  //  iN    __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
  //  bool  __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
  //                                    int success_order, int failure_order)
  //
  // Note that these functions can be used for non-integer atomic
  // operations, the values just need to be bitcast to integers on the
  // way in and out.
  //
  // And, then, the generic variants. They look like the following:
  //  void  __atomic_load(size_t size, void *ptr, void *ret, int ordering)
  //  void  __atomic_store(size_t size, void *ptr, void *val, int ordering)
  //  void  __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
  //                          int ordering)
  //  bool  __atomic_compare_exchange(size_t size, void *ptr, void *expected,
  //                                  void *desired, int success_order,
  //                                  int failure_order)
  //
  // The different signatures are built up depending on the
  // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
  // variables.

  AllocaInst *AllocaCASExpected = nullptr;
  AllocaInst *AllocaValue = nullptr;
  AllocaInst *AllocaResult = nullptr;

  Type *ResultTy;
  SmallVector<Value *, 6> Args;
  AttributeList Attr;

  // 'size' argument.
  if (!UseSizedLibcall) {
    // Note, getIntPtrType is assumed equivalent to size_t.
    Args.push_back(Elt: ConstantInt::get(Ty: DL.getIntPtrType(C&: Ctx), V: Size));
  }

  // 'ptr' argument.
  // note: This assumes all address spaces share a common libfunc
  // implementation and that addresses are convertable.  For systems without
  // that property, we'd need to extend this mechanism to support AS-specific
  // families of atomic intrinsics.
  Value *PtrVal = PointerOperand;
  PtrVal = Builder.CreateAddrSpaceCast(V: PtrVal, DestTy: PointerType::getUnqual(C&: Ctx));
  Args.push_back(Elt: PtrVal);

  // 'expected' argument, if present.
  // Passed indirectly: spill the compare value to a temporary alloca, which
  // the libcall also uses to report the observed value on failure.
  if (CASExpected) {
    AllocaCASExpected = AllocaBuilder.CreateAlloca(Ty: CASExpected->getType());
    AllocaCASExpected->setAlignment(AllocaAlignment);
    Builder.CreateLifetimeStart(Ptr: AllocaCASExpected);
    Builder.CreateAlignedStore(Val: CASExpected, Ptr: AllocaCASExpected, Align: AllocaAlignment);
    Args.push_back(Elt: AllocaCASExpected);
  }

  // 'val' argument ('desired' for cas), if present.
  // Sized calls take it by value (bitcast to the integer type); generic calls
  // take it indirectly through an alloca.
  if (ValueOperand) {
    if (UseSizedLibcall) {
      Value *IntValue =
          Builder.CreateBitOrPointerCast(V: ValueOperand, DestTy: SizedIntTy);
      Args.push_back(Elt: IntValue);
    } else {
      AllocaValue = AllocaBuilder.CreateAlloca(Ty: ValueOperand->getType());
      AllocaValue->setAlignment(AllocaAlignment);
      Builder.CreateLifetimeStart(Ptr: AllocaValue);
      Builder.CreateAlignedStore(Val: ValueOperand, Ptr: AllocaValue, Align: AllocaAlignment);
      Args.push_back(Elt: AllocaValue);
    }
  }

  // 'ret' argument.
  // Only the generic non-CAS calls return their result through memory.
  if (!CASExpected && HasResult && !UseSizedLibcall) {
    AllocaResult = AllocaBuilder.CreateAlloca(Ty: I->getType());
    AllocaResult->setAlignment(AllocaAlignment);
    Builder.CreateLifetimeStart(Ptr: AllocaResult);
    Args.push_back(Elt: AllocaResult);
  }

  // 'ordering' ('success_order' for cas) argument.
  Args.push_back(Elt: OrderingVal);

  // 'failure_order' argument, if present.
  if (Ordering2Val)
    Args.push_back(Elt: Ordering2Val);

  // Now, the return type.
  if (CASExpected) {
    ResultTy = Type::getInt1Ty(C&: Ctx);
    Attr = Attr.addRetAttribute(C&: Ctx, Kind: Attribute::ZExt);
  } else if (HasResult && UseSizedLibcall)
    ResultTy = SizedIntTy;
  else
    ResultTy = Type::getVoidTy(C&: Ctx);

  // Done with setting up arguments and return types, create the call:
  SmallVector<Type *, 6> ArgTys;
  for (Value *Arg : Args)
    ArgTys.push_back(Elt: Arg->getType());
  FunctionType *FnType = FunctionType::get(Result: ResultTy, Params: ArgTys, isVarArg: false);
  FunctionCallee LibcallFn = M->getOrInsertFunction(
      Name: RTLIB::RuntimeLibcallsInfo::getLibcallImplName(CallImpl: LibcallImpl), T: FnType,
      AttributeList: Attr);
  CallInst *Call = Builder.CreateCall(Callee: LibcallFn, Args);
  Call->setAttributes(Attr);
  Value *Result = Call;

  // And then, extract the results...
  if (ValueOperand && !UseSizedLibcall)
    Builder.CreateLifetimeEnd(Ptr: AllocaValue);

  if (CASExpected) {
    // The final result from the CAS is {load of 'expected' alloca, bool result
    // from call}
    Type *FinalResultTy = I->getType();
    Value *V = PoisonValue::get(T: FinalResultTy);
    Value *ExpectedOut = Builder.CreateAlignedLoad(
        Ty: CASExpected->getType(), Ptr: AllocaCASExpected, Align: AllocaAlignment);
    Builder.CreateLifetimeEnd(Ptr: AllocaCASExpected);
    V = Builder.CreateInsertValue(Agg: V, Val: ExpectedOut, Idxs: 0);
    V = Builder.CreateInsertValue(Agg: V, Val: Result, Idxs: 1);
    I->replaceAllUsesWith(V);
  } else if (HasResult) {
    Value *V;
    if (UseSizedLibcall)
      // Sized calls return the value directly; cast back to the original type.
      V = Builder.CreateBitOrPointerCast(V: Result, DestTy: I->getType());
    else {
      // Generic calls wrote the result into the 'ret' alloca; load it back.
      V = Builder.CreateAlignedLoad(Ty: I->getType(), Ptr: AllocaResult,
                                    Align: AllocaAlignment);
      Builder.CreateLifetimeEnd(Ptr: AllocaResult);
    }
    I->replaceAllUsesWith(V);
  }
  I->eraseFromParent();
  return true;
}
2181