1//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
10#include "llvm/Analysis/ScalarEvolution.h"
11#include "llvm/Analysis/TargetTransformInfo.h"
12#include "llvm/IR/IRBuilder.h"
13#include "llvm/IR/IntrinsicInst.h"
14#include "llvm/IR/MDBuilder.h"
15#include "llvm/IR/ProfDataUtils.h"
16#include "llvm/ProfileData/InstrProf.h"
17#include "llvm/Support/Debug.h"
18#include "llvm/Support/MathExtras.h"
19#include "llvm/Transforms/Utils/BasicBlockUtils.h"
20#include "llvm/Transforms/Utils/LoopUtils.h"
21#include <limits>
22#include <optional>
23
24#define DEBUG_TYPE "lower-mem-intrinsics"
25
26using namespace llvm;
27
28namespace llvm {
29extern cl::opt<bool> ProfcheckDisableMetadataFixes;
30}
31
32/// \returns \p Len urem \p OpSize, checking for optimization opportunities.
33/// \p OpSizeVal must be the integer value of the \c ConstantInt \p OpSize.
34static Value *getRuntimeLoopRemainder(IRBuilderBase &B, Value *Len,
35 Value *OpSize, unsigned OpSizeVal) {
36 // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem.
37 if (isPowerOf2_32(Value: OpSizeVal))
38 return B.CreateAnd(LHS: Len, RHS: OpSizeVal - 1);
39 return B.CreateURem(LHS: Len, RHS: OpSize);
40}
41
42/// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization
43/// opportunities.
44/// If \p RTLoopRemainder is provided, it must be the result of
45/// \c getRuntimeLoopRemainder() with the same arguments.
46static Value *getRuntimeLoopUnits(IRBuilderBase &B, Value *Len, Value *OpSize,
47 unsigned OpSizeVal,
48 Value *RTLoopRemainder = nullptr) {
49 if (!RTLoopRemainder)
50 RTLoopRemainder = getRuntimeLoopRemainder(B, Len, OpSize, OpSizeVal);
51 return B.CreateSub(LHS: Len, RHS: RTLoopRemainder);
52}
53
namespace {
/// Container for the return values of insertLoopExpansion.
struct LoopExpansionInfo {
  /// The instruction at the end of the main loop body.
  Instruction *MainLoopIP = nullptr;

  /// The unit index in the main loop body.
  Value *MainLoopIndex = nullptr;

  /// The instruction at the end of the residual loop body. Can be nullptr if no
  /// residual is required.
  Instruction *ResidualLoopIP = nullptr;

  /// The unit index in the residual loop body. Can be nullptr if no residual is
  /// required.
  Value *ResidualLoopIndex = nullptr;
};

/// \returns an estimate for the average length of the mem op \p I, usable as
/// an expected loop trip count for branch-weight annotation: the constant
/// length if \p I has one, otherwise the count-weighted average of the
/// IPVK_MemOPSize value-profile records attached to \p I.
/// \returns std::nullopt when metadata fixes are disabled or no usable
/// profile data (entry count / size profile) is available.
std::optional<uint64_t> getAverageMemOpLoopTripCount(const MemIntrinsic &I) {
  if (ProfcheckDisableMetadataFixes)
    return std::nullopt;
  // Only trust the estimate in functions that were actually profiled (have a
  // non-zero entry count).
  if (std::optional<Function::ProfileCount> EC =
          I.getFunction()->getEntryCount();
      !EC || !EC->getCount())
    return std::nullopt;
  // A constant length is exact; no need to consult value profiling.
  if (const auto Len = I.getLengthInBytes())
    return Len->getZExtValue();
  uint64_t Total = 0;
  SmallVector<InstrProfValueData> ProfData =
      getValueProfDataFromInst(I, InstrProfValueKind::IPVK_MemOPSize,
                               std::numeric_limits<uint32_t>::max(), Total);
  if (!Total)
    return std::nullopt;
  uint64_t TripCount = 0;
  // Count-weighted sum of profiled sizes, averaged over all samples and
  // rounded to the nearest integer.
  for (const auto &P : ProfData)
    TripCount += P.Count * P.Value;
  return std::round(1.0 * TripCount / Total);
}

} // namespace
94
95/// Insert the control flow and loop counters for a memcpy/memset loop
96/// expansion.
97///
98/// This function inserts IR corresponding to the following C code before
99/// \p InsertBefore:
100/// \code
101/// LoopUnits = (Len / MainLoopStep) * MainLoopStep;
102/// ResidualUnits = Len - LoopUnits;
103/// MainLoopIndex = 0;
104/// if (LoopUnits > 0) {
105/// do {
106/// // MainLoopIP
107/// MainLoopIndex += MainLoopStep;
108/// } while (MainLoopIndex < LoopUnits);
109/// }
110/// for (size_t i = 0; i < ResidualUnits; i += ResidualLoopStep) {
111/// ResidualLoopIndex = LoopUnits + i;
112/// // ResidualLoopIP
113/// }
114/// \endcode
115///
116/// \p MainLoopStep and \p ResidualLoopStep determine by how many "units" the
117/// loop index is increased in each iteration of the main and residual loops,
118/// respectively. In most cases, the "unit" will be bytes, but larger units are
119/// useful for lowering memset.pattern.
120///
121/// The computation of \c LoopUnits and \c ResidualUnits is performed at compile
122/// time if \p Len is a \c ConstantInt.
123/// The second (residual) loop is omitted if \p ResidualLoopStep is 0 or equal
124/// to \p MainLoopStep.
125/// The generated \c MainLoopIP, \c MainLoopIndex, \c ResidualLoopIP, and
126/// \c ResidualLoopIndex are returned in a \c LoopExpansionInfo object.
static LoopExpansionInfo
insertLoopExpansion(Instruction *InsertBefore, Value *Len,
                    unsigned MainLoopStep, unsigned ResidualLoopStep,
                    StringRef BBNamePrefix,
                    std::optional<uint64_t> AverageTripCount) {
  assert((ResidualLoopStep == 0 || MainLoopStep % ResidualLoopStep == 0) &&
         "ResidualLoopStep must divide MainLoopStep if specified");
  assert(ResidualLoopStep <= MainLoopStep &&
         "ResidualLoopStep cannot be larger than MainLoopStep");
  assert(MainLoopStep > 0 && "MainLoopStep must be non-zero");
  LoopExpansionInfo LEI;
  // Split off everything from InsertBefore onwards into the post-expansion
  // block; the loop CFG is built between PreLoopBB and PostLoopBB.
  BasicBlock *PreLoopBB = InsertBefore->getParent();
  BasicBlock *PostLoopBB = PreLoopBB->splitBasicBlock(
      InsertBefore, BBNamePrefix + "-post-expansion");
  Function *ParentFunc = PreLoopBB->getParent();
  LLVMContext &Ctx = PreLoopBB->getContext();
  const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
  IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());
  PreLoopBuilder.SetCurrentDebugLocation(DbgLoc);

  // Calculate the main loop trip count and remaining units to cover after the
  // loop.
  Type *LenType = Len->getType();
  IntegerType *ILenType = cast<IntegerType>(LenType);
  ConstantInt *CIMainLoopStep = ConstantInt::get(ILenType, MainLoopStep);

  Value *LoopUnits = Len;
  Value *ResidualUnits = nullptr;
  // We can make a conditional branch unconditional if we know that the
  // MainLoop must be executed at least once.
  bool MustTakeMainLoop = false;
  if (MainLoopStep != 1) {
    if (auto *CLen = dyn_cast<ConstantInt>(Len)) {
      // Constant length: fold the units/remainder split at compile time.
      uint64_t TotalUnits = CLen->getZExtValue();
      uint64_t LoopEndCount = alignDown(TotalUnits, MainLoopStep);
      uint64_t ResidualCount = TotalUnits - LoopEndCount;
      LoopUnits = ConstantInt::get(LenType, LoopEndCount);
      ResidualUnits = ConstantInt::get(LenType, ResidualCount);
      MustTakeMainLoop = LoopEndCount > 0;
      // As an optimization, we could skip generating the residual loop if
      // ResidualCount is known to be 0. However, current uses of this function
      // don't request a residual loop if the length is constant (they generate
      // a (potentially empty) sequence of loads and stores instead), so this
      // optimization would have no effect here.
    } else {
      // Runtime length: emit the remainder and subtraction in PreLoopBB.
      ResidualUnits = getRuntimeLoopRemainder(PreLoopBuilder, Len,
                                              CIMainLoopStep, MainLoopStep);
      LoopUnits = getRuntimeLoopUnits(PreLoopBuilder, Len, CIMainLoopStep,
                                      MainLoopStep, ResidualUnits);
    }
  } else if (auto *CLen = dyn_cast<ConstantInt>(Len)) {
    // Step of 1: every unit is handled by the main loop; it is taken iff the
    // constant length is non-zero.
    MustTakeMainLoop = CLen->getZExtValue() > 0;
  }

  BasicBlock *MainLoopBB = BasicBlock::Create(
      Ctx, BBNamePrefix + "-expansion-main-body", ParentFunc, PostLoopBB);
  IRBuilder<> LoopBuilder(MainLoopBB);
  LoopBuilder.SetCurrentDebugLocation(DbgLoc);

  PHINode *LoopIndex = LoopBuilder.CreatePHI(LenType, 2, "loop-index");
  LEI.MainLoopIndex = LoopIndex;
  LoopIndex->addIncoming(ConstantInt::get(LenType, 0U), PreLoopBB);

  Value *NewIndex =
      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(LenType, MainLoopStep));
  LoopIndex->addIncoming(NewIndex, MainLoopBB);

  // One argument of the addition is a loop-variant PHI, so it must be an
  // Instruction (i.e., it cannot be a Constant).
  LEI.MainLoopIP = cast<Instruction>(NewIndex);

  if (ResidualLoopStep > 0 && ResidualLoopStep < MainLoopStep) {
    // Loop body for the residual accesses.
    BasicBlock *ResLoopBB =
        BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-residual-body",
                           PreLoopBB->getParent(), PostLoopBB);
    // BB to check if the residual loop is needed.
    BasicBlock *ResidualCondBB =
        BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-residual-cond",
                           PreLoopBB->getParent(), ResLoopBB);

    // Enter the MainLoop unless no main loop iteration is required.
    ConstantInt *Zero = ConstantInt::get(ILenType, 0U);
    if (MustTakeMainLoop)
      PreLoopBuilder.CreateBr(MainLoopBB);
    else {
      auto *BR = PreLoopBuilder.CreateCondBr(
          PreLoopBuilder.CreateICmpNE(LoopUnits, Zero), MainLoopBB,
          ResidualCondBB);
      if (AverageTripCount.has_value()) {
        MDBuilder MDB(ParentFunc->getContext());
        // NOTE(review): this loop-entry branch is weighted with
        // AverageTripCount % MainLoopStep (the average *residual* units),
        // while the analogous no-residual path below weights its backedge
        // with AverageTripCount / MainLoopStep (the average iteration
        // count). Confirm the '%' here is intentional and not a typo for '/'.
        setFittedBranchWeights(*BR,
                               {AverageTripCount.value() % MainLoopStep, 1},
                               /*IsExpected=*/false);
      } else {
        setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE);
      }
    }
    // Remove the unconditional branch created by splitBasicBlock; it has been
    // replaced by the branch(es) built above.
    PreLoopBB->getTerminator()->eraseFromParent();

    // Stay in the MainLoop until we have handled all the LoopUnits. Then go to
    // the residual condition BB.
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopUnits),
                             MainLoopBB, ResidualCondBB);

    // Determine if we need to branch to the residual loop or bypass it.
    IRBuilder<> RCBuilder(ResidualCondBB);
    RCBuilder.SetCurrentDebugLocation(DbgLoc);
    RCBuilder.CreateCondBr(RCBuilder.CreateICmpNE(ResidualUnits, Zero),
                           ResLoopBB, PostLoopBB);

    IRBuilder<> ResBuilder(ResLoopBB);
    ResBuilder.SetCurrentDebugLocation(DbgLoc);
    PHINode *ResidualIndex =
        ResBuilder.CreatePHI(LenType, 2, "residual-loop-index");
    ResidualIndex->addIncoming(Zero, ResidualCondBB);

    // Add the offset at the end of the main loop to the loop counter of the
    // residual loop to get the proper index.
    Value *FullOffset = ResBuilder.CreateAdd(LoopUnits, ResidualIndex);
    LEI.ResidualLoopIndex = FullOffset;

    Value *ResNewIndex = ResBuilder.CreateAdd(
        ResidualIndex, ConstantInt::get(LenType, ResidualLoopStep));
    ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);

    // One argument of the addition is a loop-variant PHI, so it must be an
    // Instruction (i.e., it cannot be a Constant).
    LEI.ResidualLoopIP = cast<Instruction>(ResNewIndex);

    // Stay in the residual loop until all ResidualUnits are handled.
    ResBuilder.CreateCondBr(
        ResBuilder.CreateICmpULT(ResNewIndex, ResidualUnits), ResLoopBB,
        PostLoopBB);
  } else {
    // There is no need for a residual loop after the main loop. We do however
    // need to patch up the control flow by creating the terminators for the
    // preloop block and the main loop.

    // Enter the MainLoop unless no main loop iteration is required.
    if (MustTakeMainLoop) {
      PreLoopBuilder.CreateBr(MainLoopBB);
    } else {
      ConstantInt *Zero = ConstantInt::get(ILenType, 0U);
      MDBuilder B(ParentFunc->getContext());
      PreLoopBuilder.CreateCondBr(PreLoopBuilder.CreateICmpNE(LoopUnits, Zero),
                                  MainLoopBB, PostLoopBB,
                                  B.createLikelyBranchWeights());
    }
    // Remove the unconditional branch created by splitBasicBlock.
    PreLoopBB->getTerminator()->eraseFromParent();
    // Stay in the MainLoop until we have handled all the LoopUnits.
    auto *Br = LoopBuilder.CreateCondBr(
        LoopBuilder.CreateICmpULT(NewIndex, LoopUnits), MainLoopBB, PostLoopBB);
    if (AverageTripCount.has_value())
      // Expected backedge count: average units divided by units per iteration.
      setFittedBranchWeights(*Br, {AverageTripCount.value() / MainLoopStep, 1},
                             /*IsExpected=*/false);
    else
      setExplicitlyUnknownBranchWeightsIfProfiled(*Br, DEBUG_TYPE);
  }
  return LEI;
}
288
/// Expand a memcpy with a compile-time-constant length into a wide-access
/// main loop (if at least one full-width iteration is needed) followed by
/// straight-line loads/stores for the remaining bytes. \p CanOverlap == false
/// lets the expansion attach noalias/alias.scope metadata; \p
/// AtomicElementSize marks the expansion of an element-atomic memcpy.
void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
                                     Value *DstAddr, ConstantInt *CopyLen,
                                     Align SrcAlign, Align DstAlign,
                                     bool SrcIsVolatile, bool DstIsVolatile,
                                     bool CanOverlap,
                                     const TargetTransformInfo &TTI,
                                     std::optional<uint32_t> AtomicElementSize,
                                     std::optional<uint64_t> AverageTripCount) {
  // No need to expand zero length copies.
  if (CopyLen->isZero())
    return;

  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  LLVMContext &Ctx = PreLoopBB->getContext();
  const DataLayout &DL = ParentFunc->getDataLayout();
  MDBuilder MDB(Ctx);
  // Fresh alias scope so that, for non-overlapping copies, the emitted stores
  // can be marked independent of the emitted loads.
  MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
  StringRef Name = "MemCopyAliasScope";
  MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);

  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *TypeOfCopyLen = CopyLen->getType();
  // Let the target choose the widest profitable access type for the loop.
  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
      Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
  assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
         "Atomic memcpy lowering is not supported for vector operand type");

  Type *Int8Type = Type::getInt8Ty(Ctx);
  TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");
  assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
         "Atomic memcpy lowering is not supported for selected operand size");

  // Bytes covered by full-width loop iterations; the rest is handled by
  // straight-line code below.
  uint64_t LoopEndCount =
      alignDown(CopyLen->getZExtValue(), LoopOpSize.getFixedValue());

  // Skip the loop expansion entirely if the loop would never be taken.
  if (LoopEndCount != 0) {
    // ResidualLoopStep == 0: no residual loop is requested; leftovers are
    // emitted as unrolled loads/stores instead.
    LoopExpansionInfo LEI =
        insertLoopExpansion(InsertBefore, CopyLen, LoopOpSize, 0,
                            "static-memcpy", AverageTripCount);

    // Fill MainLoopBB
    IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
    Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
    Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));

    // If we used LoopOpType as GEP element type, we would iterate over the
    // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
    // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
    // byte offsets computed from the TypeStoreSize.
    Value *SrcGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LEI.MainLoopIndex);
    LoadInst *Load = MainLoopBuilder.CreateAlignedLoad(
        LoopOpType, SrcGEP, PartSrcAlign, SrcIsVolatile);
    if (!CanOverlap) {
      // Set alias scope for loads.
      Load->setMetadata(LLVMContext::MD_alias_scope,
                        MDNode::get(Ctx, NewScope));
    }
    Value *DstGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
    StoreInst *Store = MainLoopBuilder.CreateAlignedStore(
        Load, DstGEP, PartDstAlign, DstIsVolatile);
    if (!CanOverlap) {
      // Indicate that stores don't overlap loads.
      Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
    }
    if (AtomicElementSize) {
      Load->setAtomic(AtomicOrdering::Unordered);
      Store->setAtomic(AtomicOrdering::Unordered);
    }
    assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
           "No residual loop was requested");
  }

  // Copy the remaining bytes with straight-line code.
  uint64_t BytesCopied = LoopEndCount;
  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
  if (RemainingBytes == 0)
    return;

  IRBuilder<> RBuilder(InsertBefore);
  // The target decomposes the residual byte count into a short sequence of
  // operand types (typically of decreasing width).
  SmallVector<Type *, 5> RemainingOps;
  TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                        SrcAS, DstAS, SrcAlign, DstAlign,
                                        AtomicElementSize);

  for (auto *OpTy : RemainingOps) {
    // Alignment of each residual access follows from the running byte offset.
    Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
    Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));

    TypeSize OperandSize = DL.getTypeStoreSize(OpTy);
    assert((!AtomicElementSize || OperandSize % *AtomicElementSize == 0) &&
           "Atomic memcpy lowering is not supported for selected operand size");

    Value *SrcGEP = RBuilder.CreateInBoundsGEP(
        Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
    LoadInst *Load =
        RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
    if (!CanOverlap) {
      // Set alias scope for loads.
      Load->setMetadata(LLVMContext::MD_alias_scope,
                        MDNode::get(Ctx, NewScope));
    }
    Value *DstGEP = RBuilder.CreateInBoundsGEP(
        Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
    StoreInst *Store =
        RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
    if (!CanOverlap) {
      // Indicate that stores don't overlap loads.
      Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
    }
    if (AtomicElementSize) {
      Load->setAtomic(AtomicOrdering::Unordered);
      Store->setAtomic(AtomicOrdering::Unordered);
    }
    BytesCopied += OperandSize;
  }
  assert(BytesCopied == CopyLen->getZExtValue() &&
         "Bytes copied should match size in the call!");
}
414
/// Expand a memcpy whose length is only known at runtime into a wide-access
/// main loop plus a byte-wise (or atomic-element-wise) residual loop for the
/// remaining bytes. \p CanOverlap == false lets the expansion attach
/// noalias/alias.scope metadata; \p AtomicElementSize marks the expansion of
/// an element-atomic memcpy.
void llvm::createMemCpyLoopUnknownSize(
    Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
    Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
    bool CanOverlap, const TargetTransformInfo &TTI,
    std::optional<uint32_t> AtomicElementSize,
    std::optional<uint64_t> AverageTripCount) {
  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();
  MDBuilder MDB(Ctx);
  // Fresh alias scope so that, for non-overlapping copies, the emitted stores
  // can be marked independent of the emitted loads.
  MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
  StringRef Name = "MemCopyAliasScope";
  MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);

  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  // Let the target choose the widest profitable access type for the loop.
  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
      Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
  assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
         "Atomic memcpy lowering is not supported for vector operand type");
  TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
         "Atomic memcpy lowering is not supported for selected operand size");

  Type *Int8Type = Type::getInt8Ty(Ctx);

  // The residual loop copies one byte per iteration, unless atomicity
  // requires whole atomic elements.
  Type *ResidualLoopOpType = AtomicElementSize
                                 ? Type::getIntNTy(Ctx, *AtomicElementSize * 8)
                                 : Int8Type;
  TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
  assert(ResidualLoopOpSize == (AtomicElementSize ? *AtomicElementSize : 1) &&
         "Store size is expected to match type size");

  LoopExpansionInfo LEI =
      insertLoopExpansion(InsertBefore, CopyLen, LoopOpSize, ResidualLoopOpSize,
                          "dynamic-memcpy", AverageTripCount);

  // Fill MainLoopBB
  IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  // If we used LoopOpType as GEP element type, we would iterate over the
  // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
  // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use byte
  // offsets computed from the TypeStoreSize.
  Value *SrcGEP =
      MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LEI.MainLoopIndex);
  LoadInst *Load = MainLoopBuilder.CreateAlignedLoad(
      LoopOpType, SrcGEP, PartSrcAlign, SrcIsVolatile);
  if (!CanOverlap) {
    // Set alias scope for loads.
    Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope));
  }
  Value *DstGEP =
      MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
  StoreInst *Store = MainLoopBuilder.CreateAlignedStore(
      Load, DstGEP, PartDstAlign, DstIsVolatile);
  if (!CanOverlap) {
    // Indicate that stores don't overlap loads.
    Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
  }
  if (AtomicElementSize) {
    Load->setAtomic(AtomicOrdering::Unordered);
    Store->setAtomic(AtomicOrdering::Unordered);
  }

  // Fill ResidualLoopBB.
  if (!LEI.ResidualLoopIP)
    return;

  // Residual accesses may be narrower than the main-loop accesses, so their
  // alignment is derived from the main-loop alignment and the residual width.
  Align ResSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
  Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));

  IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
  Value *ResSrcGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
                                                      LEI.ResidualLoopIndex);
  LoadInst *ResLoad = ResLoopBuilder.CreateAlignedLoad(
      ResidualLoopOpType, ResSrcGEP, ResSrcAlign, SrcIsVolatile);
  if (!CanOverlap) {
    // Set alias scope for loads.
    ResLoad->setMetadata(LLVMContext::MD_alias_scope,
                         MDNode::get(Ctx, NewScope));
  }
  Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
                                                      LEI.ResidualLoopIndex);
  StoreInst *ResStore = ResLoopBuilder.CreateAlignedStore(
      ResLoad, ResDstGEP, ResDstAlign, DstIsVolatile);
  if (!CanOverlap) {
    // Indicate that stores don't overlap loads.
    ResStore->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
  }
  if (AtomicElementSize) {
    ResLoad->setAtomic(AtomicOrdering::Unordered);
    ResStore->setAtomic(AtomicOrdering::Unordered);
  }
}
514
515// If \p Addr1 and \p Addr2 are pointers to different address spaces, create an
516// addresspacecast to obtain a pair of pointers in the same addressspace. The
517// caller needs to ensure that addrspacecasting is possible.
518// No-op if the pointers are in the same address space.
519static std::pair<Value *, Value *>
520tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2,
521 const TargetTransformInfo &TTI) {
522 Value *ResAddr1 = Addr1;
523 Value *ResAddr2 = Addr2;
524
525 unsigned AS1 = cast<PointerType>(Val: Addr1->getType())->getAddressSpace();
526 unsigned AS2 = cast<PointerType>(Val: Addr2->getType())->getAddressSpace();
527 if (AS1 != AS2) {
528 if (TTI.isValidAddrSpaceCast(FromAS: AS2, ToAS: AS1))
529 ResAddr2 = B.CreateAddrSpaceCast(V: Addr2, DestTy: Addr1->getType());
530 else if (TTI.isValidAddrSpaceCast(FromAS: AS1, ToAS: AS2))
531 ResAddr1 = B.CreateAddrSpaceCast(V: Addr1, DestTy: Addr2->getType());
532 else
533 llvm_unreachable("Can only lower memmove between address spaces if they "
534 "support addrspacecast");
535 }
536 return {ResAddr1, ResAddr2};
537}
538
539// Lower memmove to IR. memmove is required to correctly copy overlapping memory
540// regions; therefore, it has to check the relative positions of the source and
541// destination pointers and choose the copy direction accordingly.
542//
543// The code below is an IR rendition of this C function:
544//
545// void* memmove(void* dst, const void* src, size_t n) {
546// unsigned char* d = dst;
547// const unsigned char* s = src;
548// if (s < d) {
549// // copy backwards
550// while (n--) {
551// d[n] = s[n];
552// }
553// } else {
554// // copy forward
555// for (size_t i = 0; i < n; ++i) {
556// d[i] = s[i];
557// }
558// }
559// return dst;
560// }
561//
562// If the TargetTransformInfo specifies a wider MemcpyLoopLoweringType, it is
563// used for the memory accesses in the loops. Then, additional loops with
564// byte-wise accesses are added for the remaining bytes.
565static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
566 Value *SrcAddr, Value *DstAddr,
567 Value *CopyLen, Align SrcAlign,
568 Align DstAlign, bool SrcIsVolatile,
569 bool DstIsVolatile,
570 const TargetTransformInfo &TTI) {
571 Type *TypeOfCopyLen = CopyLen->getType();
572 BasicBlock *OrigBB = InsertBefore->getParent();
573 Function *F = OrigBB->getParent();
574 const DataLayout &DL = F->getDataLayout();
575 LLVMContext &Ctx = OrigBB->getContext();
576 unsigned SrcAS = cast<PointerType>(Val: SrcAddr->getType())->getAddressSpace();
577 unsigned DstAS = cast<PointerType>(Val: DstAddr->getType())->getAddressSpace();
578
579 Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Context&: Ctx, Length: CopyLen, SrcAddrSpace: SrcAS, DestAddrSpace: DstAS,
580 SrcAlign, DestAlign: DstAlign);
581 TypeSize LoopOpSize = DL.getTypeStoreSize(Ty: LoopOpType);
582 Type *Int8Type = Type::getInt8Ty(C&: Ctx);
583 bool LoopOpIsInt8 = LoopOpType == Int8Type;
584
585 // If the memory accesses are wider than one byte, residual loops with
586 // i8-accesses are required to move remaining bytes.
587 bool RequiresResidual = !LoopOpIsInt8;
588
589 Type *ResidualLoopOpType = Int8Type;
590 TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(Ty: ResidualLoopOpType);
591
592 // Calculate the loop trip count and remaining bytes to copy after the loop.
593 IntegerType *ILengthType = cast<IntegerType>(Val: TypeOfCopyLen);
594 ConstantInt *CILoopOpSize = ConstantInt::get(Ty: ILengthType, V: LoopOpSize);
595 ConstantInt *CIResidualLoopOpSize =
596 ConstantInt::get(Ty: ILengthType, V: ResidualLoopOpSize);
597 ConstantInt *Zero = ConstantInt::get(Ty: ILengthType, V: 0);
598
599 const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
600 IRBuilder<> PLBuilder(InsertBefore);
601 PLBuilder.SetCurrentDebugLocation(DbgLoc);
602
603 Value *RuntimeLoopBytes = CopyLen;
604 Value *RuntimeLoopRemainder = nullptr;
605 Value *SkipResidualCondition = nullptr;
606 if (RequiresResidual) {
607 RuntimeLoopRemainder =
608 getRuntimeLoopRemainder(B&: PLBuilder, Len: CopyLen, OpSize: CILoopOpSize, OpSizeVal: LoopOpSize);
609 RuntimeLoopBytes = getRuntimeLoopUnits(B&: PLBuilder, Len: CopyLen, OpSize: CILoopOpSize,
610 OpSizeVal: LoopOpSize, RTLoopRemainder: RuntimeLoopRemainder);
611 SkipResidualCondition =
612 PLBuilder.CreateICmpEQ(LHS: RuntimeLoopRemainder, RHS: Zero, Name: "skip_residual");
613 }
614 Value *SkipMainCondition =
615 PLBuilder.CreateICmpEQ(LHS: RuntimeLoopBytes, RHS: Zero, Name: "skip_main");
616
  // Create a comparison of src and dst, based on which we jump to either
618 // the forward-copy part of the function (if src >= dst) or the backwards-copy
619 // part (if src < dst).
620 // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
621 // structure. Its block terminators (unconditional branches) are replaced by
622 // the appropriate conditional branches when the loop is built.
623 // If the pointers are in different address spaces, they need to be converted
624 // to a compatible one. Cases where memory ranges in the different address
625 // spaces cannot overlap are lowered as memcpy and not handled here.
626 auto [CmpSrcAddr, CmpDstAddr] =
627 tryInsertCastToCommonAddrSpace(B&: PLBuilder, Addr1: SrcAddr, Addr2: DstAddr, TTI);
628 Value *PtrCompare =
629 PLBuilder.CreateICmpULT(LHS: CmpSrcAddr, RHS: CmpDstAddr, Name: "compare_src_dst");
630 Instruction *ThenTerm, *ElseTerm;
631 SplitBlockAndInsertIfThenElse(Cond: PtrCompare, SplitBefore: InsertBefore->getIterator(),
632 ThenTerm: &ThenTerm, ElseTerm: &ElseTerm);
633
634 // If the LoopOpSize is greater than 1, each part of the function consists of
635 // four blocks:
636 // memmove_copy_backwards:
637 // skip the residual loop when 0 iterations are required
638 // memmove_bwd_residual_loop:
639 // copy the last few bytes individually so that the remaining length is
640 // a multiple of the LoopOpSize
641 // memmove_bwd_middle: skip the main loop when 0 iterations are required
642 // memmove_bwd_main_loop: the actual backwards loop BB with wide accesses
643 // memmove_copy_forward: skip the main loop when 0 iterations are required
644 // memmove_fwd_main_loop: the actual forward loop BB with wide accesses
645 // memmove_fwd_middle: skip the residual loop when 0 iterations are required
646 // memmove_fwd_residual_loop: copy the last few bytes individually
647 //
648 // The main and residual loop are switched between copying forward and
649 // backward so that the residual loop always operates on the end of the moved
650 // range. This is based on the assumption that buffers whose start is aligned
651 // with the LoopOpSize are more common than buffers whose end is.
652 //
653 // If the LoopOpSize is 1, each part of the function consists of two blocks:
654 // memmove_copy_backwards: skip the loop when 0 iterations are required
655 // memmove_bwd_main_loop: the actual backwards loop BB
656 // memmove_copy_forward: skip the loop when 0 iterations are required
657 // memmove_fwd_main_loop: the actual forward loop BB
658 BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
659 CopyBackwardsBB->setName("memmove_copy_backwards");
660 BasicBlock *CopyForwardBB = ElseTerm->getParent();
661 CopyForwardBB->setName("memmove_copy_forward");
662 BasicBlock *ExitBB = InsertBefore->getParent();
663 ExitBB->setName("memmove_done");
664
665 Align PartSrcAlign(commonAlignment(A: SrcAlign, Offset: LoopOpSize));
666 Align PartDstAlign(commonAlignment(A: DstAlign, Offset: LoopOpSize));
667
668 // Accesses in the residual loops do not share the same alignment as those in
669 // the main loops.
670 Align ResidualSrcAlign(commonAlignment(A: PartSrcAlign, Offset: ResidualLoopOpSize));
671 Align ResidualDstAlign(commonAlignment(A: PartDstAlign, Offset: ResidualLoopOpSize));
672
673 // Copying backwards.
674 {
675 BasicBlock *MainLoopBB = BasicBlock::Create(
676 Context&: F->getContext(), Name: "memmove_bwd_main_loop", Parent: F, InsertBefore: CopyForwardBB);
677
678 // The predecessor of the memmove_bwd_main_loop. Updated in the
679 // following if a residual loop is emitted first.
680 BasicBlock *PredBB = CopyBackwardsBB;
681
682 if (RequiresResidual) {
683 // backwards residual loop
684 BasicBlock *ResidualLoopBB = BasicBlock::Create(
685 Context&: F->getContext(), Name: "memmove_bwd_residual_loop", Parent: F, InsertBefore: MainLoopBB);
686 IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
687 ResidualLoopBuilder.SetCurrentDebugLocation(DbgLoc);
688 PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(Ty: ILengthType, NumReservedValues: 0);
689 Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
690 LHS: ResidualLoopPhi, RHS: CIResidualLoopOpSize, Name: "bwd_residual_index");
691 // If we used LoopOpType as GEP element type, we would iterate over the
692 // buffers in TypeStoreSize strides while copying TypeAllocSize bytes,
693 // i.e., we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore,
694 // use byte offsets computed from the TypeStoreSize.
695 Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr,
696 IdxList: ResidualIndex);
697 Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
698 Ty: ResidualLoopOpType, Ptr: LoadGEP, Align: ResidualSrcAlign, isVolatile: SrcIsVolatile,
699 Name: "element");
700 Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr,
701 IdxList: ResidualIndex);
702 ResidualLoopBuilder.CreateAlignedStore(Val: Element, Ptr: StoreGEP,
703 Align: ResidualDstAlign, isVolatile: DstIsVolatile);
704
705 // After the residual loop, go to an intermediate block.
706 BasicBlock *IntermediateBB = BasicBlock::Create(
707 Context&: F->getContext(), Name: "memmove_bwd_middle", Parent: F, InsertBefore: MainLoopBB);
708 // Later code expects a terminator in the PredBB.
709 IRBuilder<> IntermediateBuilder(IntermediateBB);
710 IntermediateBuilder.SetCurrentDebugLocation(DbgLoc);
711 IntermediateBuilder.CreateUnreachable();
712 ResidualLoopBuilder.CreateCondBr(
713 Cond: ResidualLoopBuilder.CreateICmpEQ(LHS: ResidualIndex, RHS: RuntimeLoopBytes),
714 True: IntermediateBB, False: ResidualLoopBB);
715
716 ResidualLoopPhi->addIncoming(V: ResidualIndex, BB: ResidualLoopBB);
717 ResidualLoopPhi->addIncoming(V: CopyLen, BB: CopyBackwardsBB);
718
719 // How to get to the residual:
720 BranchInst *BrInst =
721 BranchInst::Create(IfTrue: IntermediateBB, IfFalse: ResidualLoopBB,
722 Cond: SkipResidualCondition, InsertBefore: ThenTerm->getIterator());
723 BrInst->setDebugLoc(DbgLoc);
724 ThenTerm->eraseFromParent();
725
726 PredBB = IntermediateBB;
727 }
728
729 // main loop
730 IRBuilder<> MainLoopBuilder(MainLoopBB);
731 MainLoopBuilder.SetCurrentDebugLocation(DbgLoc);
732 PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(Ty: ILengthType, NumReservedValues: 0);
733 Value *MainIndex =
734 MainLoopBuilder.CreateSub(LHS: MainLoopPhi, RHS: CILoopOpSize, Name: "bwd_main_index");
735 Value *LoadGEP =
736 MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr, IdxList: MainIndex);
737 Value *Element = MainLoopBuilder.CreateAlignedLoad(
738 Ty: LoopOpType, Ptr: LoadGEP, Align: PartSrcAlign, isVolatile: SrcIsVolatile, Name: "element");
739 Value *StoreGEP =
740 MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: MainIndex);
741 MainLoopBuilder.CreateAlignedStore(Val: Element, Ptr: StoreGEP, Align: PartDstAlign,
742 isVolatile: DstIsVolatile);
743 MainLoopBuilder.CreateCondBr(Cond: MainLoopBuilder.CreateICmpEQ(LHS: MainIndex, RHS: Zero),
744 True: ExitBB, False: MainLoopBB);
745 MainLoopPhi->addIncoming(V: MainIndex, BB: MainLoopBB);
746 MainLoopPhi->addIncoming(V: RuntimeLoopBytes, BB: PredBB);
747
748 // How to get to the main loop:
749 Instruction *PredBBTerm = PredBB->getTerminator();
750 BranchInst *BrInst = BranchInst::Create(
751 IfTrue: ExitBB, IfFalse: MainLoopBB, Cond: SkipMainCondition, InsertBefore: PredBBTerm->getIterator());
752 BrInst->setDebugLoc(DbgLoc);
753 PredBBTerm->eraseFromParent();
754 }
755
756 // Copying forward.
757 // main loop
758 {
759 BasicBlock *MainLoopBB =
760 BasicBlock::Create(Context&: F->getContext(), Name: "memmove_fwd_main_loop", Parent: F, InsertBefore: ExitBB);
761 IRBuilder<> MainLoopBuilder(MainLoopBB);
762 MainLoopBuilder.SetCurrentDebugLocation(DbgLoc);
763 PHINode *MainLoopPhi =
764 MainLoopBuilder.CreatePHI(Ty: ILengthType, NumReservedValues: 0, Name: "fwd_main_index");
765 Value *LoadGEP =
766 MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr, IdxList: MainLoopPhi);
767 Value *Element = MainLoopBuilder.CreateAlignedLoad(
768 Ty: LoopOpType, Ptr: LoadGEP, Align: PartSrcAlign, isVolatile: SrcIsVolatile, Name: "element");
769 Value *StoreGEP =
770 MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: MainLoopPhi);
771 MainLoopBuilder.CreateAlignedStore(Val: Element, Ptr: StoreGEP, Align: PartDstAlign,
772 isVolatile: DstIsVolatile);
773 Value *MainIndex = MainLoopBuilder.CreateAdd(LHS: MainLoopPhi, RHS: CILoopOpSize);
774 MainLoopPhi->addIncoming(V: MainIndex, BB: MainLoopBB);
775 MainLoopPhi->addIncoming(V: Zero, BB: CopyForwardBB);
776
777 Instruction *CopyFwdBBTerm = CopyForwardBB->getTerminator();
778 BasicBlock *SuccessorBB = ExitBB;
779 if (RequiresResidual)
780 SuccessorBB =
781 BasicBlock::Create(Context&: F->getContext(), Name: "memmove_fwd_middle", Parent: F, InsertBefore: ExitBB);
782
783 // leaving or staying in the main loop
784 MainLoopBuilder.CreateCondBr(
785 Cond: MainLoopBuilder.CreateICmpEQ(LHS: MainIndex, RHS: RuntimeLoopBytes), True: SuccessorBB,
786 False: MainLoopBB);
787
788 // getting in or skipping the main loop
789 BranchInst *BrInst =
790 BranchInst::Create(IfTrue: SuccessorBB, IfFalse: MainLoopBB, Cond: SkipMainCondition,
791 InsertBefore: CopyFwdBBTerm->getIterator());
792 BrInst->setDebugLoc(DbgLoc);
793 CopyFwdBBTerm->eraseFromParent();
794
795 if (RequiresResidual) {
796 BasicBlock *IntermediateBB = SuccessorBB;
797 IRBuilder<> IntermediateBuilder(IntermediateBB);
798 IntermediateBuilder.SetCurrentDebugLocation(DbgLoc);
799 BasicBlock *ResidualLoopBB = BasicBlock::Create(
800 Context&: F->getContext(), Name: "memmove_fwd_residual_loop", Parent: F, InsertBefore: ExitBB);
801 IntermediateBuilder.CreateCondBr(Cond: SkipResidualCondition, True: ExitBB,
802 False: ResidualLoopBB);
803
804 // Residual loop
805 IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
806 ResidualLoopBuilder.SetCurrentDebugLocation(DbgLoc);
807 PHINode *ResidualLoopPhi =
808 ResidualLoopBuilder.CreatePHI(Ty: ILengthType, NumReservedValues: 0, Name: "fwd_residual_index");
809 Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr,
810 IdxList: ResidualLoopPhi);
811 Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
812 Ty: ResidualLoopOpType, Ptr: LoadGEP, Align: ResidualSrcAlign, isVolatile: SrcIsVolatile,
813 Name: "element");
814 Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr,
815 IdxList: ResidualLoopPhi);
816 ResidualLoopBuilder.CreateAlignedStore(Val: Element, Ptr: StoreGEP,
817 Align: ResidualDstAlign, isVolatile: DstIsVolatile);
818 Value *ResidualIndex =
819 ResidualLoopBuilder.CreateAdd(LHS: ResidualLoopPhi, RHS: CIResidualLoopOpSize);
820 ResidualLoopBuilder.CreateCondBr(
821 Cond: ResidualLoopBuilder.CreateICmpEQ(LHS: ResidualIndex, RHS: CopyLen), True: ExitBB,
822 False: ResidualLoopBB);
823 ResidualLoopPhi->addIncoming(V: ResidualIndex, BB: ResidualLoopBB);
824 ResidualLoopPhi->addIncoming(V: RuntimeLoopBytes, BB: IntermediateBB);
825 }
826 }
827}
828
829// Similar to createMemMoveLoopUnknownSize, only the trip counts are computed at
830// compile time, obsolete loops and branches are omitted, and the residual code
831// is straight-line code instead of a loop.
static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
                                       Value *SrcAddr, Value *DstAddr,
                                       ConstantInt *CopyLen, Align SrcAlign,
                                       Align DstAlign, bool SrcIsVolatile,
                                       bool DstIsVolatile,
                                       const TargetTransformInfo &TTI) {
  // No need to expand zero length moves.
  if (CopyLen->isZero())
    return;

  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  LLVMContext &Ctx = OrigBB->getContext();
  unsigned SrcAS = cast<PointerType>(Val: SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(Val: DstAddr->getType())->getAddressSpace();

  // Let the target choose the access type (and thus width) for the main loops.
  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Context&: Ctx, Length: CopyLen, SrcAddrSpace: SrcAS, DestAddrSpace: DstAS,
                                                   SrcAlign, DestAlign: DstAlign);
  TypeSize LoopOpSize = DL.getTypeStoreSize(Ty: LoopOpType);
  assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");
  Type *Int8Type = Type::getInt8Ty(C&: Ctx);

  // Calculate the loop trip count and remaining bytes to copy after the loop.
  uint64_t BytesCopiedInLoop =
      alignDown(Value: CopyLen->getZExtValue(), Align: LoopOpSize.getFixedValue());
  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;

  IntegerType *ILengthType = cast<IntegerType>(Val: TypeOfCopyLen);
  ConstantInt *Zero = ConstantInt::get(Ty: ILengthType, V: 0);
  ConstantInt *LoopBound = ConstantInt::get(Ty: ILengthType, V: BytesCopiedInLoop);
  ConstantInt *CILoopOpSize = ConstantInt::get(Ty: ILengthType, V: LoopOpSize);

  const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
  IRBuilder<> PLBuilder(InsertBefore);
  PLBuilder.SetCurrentDebugLocation(DbgLoc);

  // Compare the (possibly address-space-adjusted) pointers at run time: if
  // SrcAddr < DstAddr the ranges may overlap such that a forward copy would
  // clobber not-yet-read source bytes, so we copy backwards in that case.
  auto [CmpSrcAddr, CmpDstAddr] =
      tryInsertCastToCommonAddrSpace(B&: PLBuilder, Addr1: SrcAddr, Addr2: DstAddr, TTI);
  Value *PtrCompare =
      PLBuilder.CreateICmpULT(LHS: CmpSrcAddr, RHS: CmpDstAddr, Name: "compare_src_dst");
  Instruction *ThenTerm, *ElseTerm;
  SplitBlockAndInsertIfThenElse(Cond: PtrCompare, SplitBefore: InsertBefore->getIterator(),
                                ThenTerm: &ThenTerm, ElseTerm: &ElseTerm);

  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
  BasicBlock *CopyForwardBB = ElseTerm->getParent();
  BasicBlock *ExitBB = InsertBefore->getParent();
  ExitBB->setName("memmove_done");

  // Alignment that main-loop accesses are guaranteed to have.
  Align PartSrcAlign(commonAlignment(A: SrcAlign, Offset: LoopOpSize));
  Align PartDstAlign(commonAlignment(A: DstAlign, Offset: LoopOpSize));

  // Helper function to generate a load/store pair of a given type in the
  // residual. Used in the forward and backward branches.
  auto GenerateResidualLdStPair = [&](Type *OpTy, IRBuilderBase &Builder,
                                      uint64_t &BytesCopied) {
    // Residual accesses land at offset BytesCopied, so their alignment is
    // derived from the original alignment and that offset.
    Align ResSrcAlign(commonAlignment(A: SrcAlign, Offset: BytesCopied));
    Align ResDstAlign(commonAlignment(A: DstAlign, Offset: BytesCopied));

    TypeSize OperandSize = DL.getTypeStoreSize(Ty: OpTy);

    // If we used LoopOpType as GEP element type, we would iterate over the
    // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
    // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
    // byte offsets computed from the TypeStoreSize.
    Value *SrcGEP = Builder.CreateInBoundsGEP(
        Ty: Int8Type, Ptr: SrcAddr, IdxList: ConstantInt::get(Ty: TypeOfCopyLen, V: BytesCopied));
    LoadInst *Load =
        Builder.CreateAlignedLoad(Ty: OpTy, Ptr: SrcGEP, Align: ResSrcAlign, isVolatile: SrcIsVolatile);
    Value *DstGEP = Builder.CreateInBoundsGEP(
        Ty: Int8Type, Ptr: DstAddr, IdxList: ConstantInt::get(Ty: TypeOfCopyLen, V: BytesCopied));
    Builder.CreateAlignedStore(Val: Load, Ptr: DstGEP, Align: ResDstAlign, isVolatile: DstIsVolatile);
    BytesCopied += OperandSize;
  };

  // Copying backwards.
  if (RemainingBytes != 0) {
    CopyBackwardsBB->setName("memmove_bwd_residual");
    uint64_t BytesCopied = BytesCopiedInLoop;

    // Residual code is required to move the remaining bytes. We need the same
    // instructions as in the forward case, only in reverse. So we generate code
    // the same way, except that we change the IRBuilder insert point for each
    // load/store pair so that each one is inserted before the previous one
    // instead of after it.
    IRBuilder<> BwdResBuilder(CopyBackwardsBB,
                              CopyBackwardsBB->getFirstNonPHIIt());
    BwdResBuilder.SetCurrentDebugLocation(DbgLoc);
    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(OpsOut&: RemainingOps, Context&: Ctx, RemainingBytes,
                                          SrcAddrSpace: SrcAS, DestAddrSpace: DstAS, SrcAlign: PartSrcAlign,
                                          DestAlign: PartDstAlign);
    for (auto *OpTy : RemainingOps) {
      // reverse the order of the emitted operations
      BwdResBuilder.SetInsertPoint(TheBB: CopyBackwardsBB,
                                   IP: CopyBackwardsBB->getFirstNonPHIIt());
      GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied);
    }
  }
  if (BytesCopiedInLoop != 0) {
    BasicBlock *LoopBB = CopyBackwardsBB;
    BasicBlock *PredBB = OrigBB;
    if (RemainingBytes != 0) {
      // if we introduce residual code, it needs its separate BB
      LoopBB = CopyBackwardsBB->splitBasicBlock(
          I: CopyBackwardsBB->getTerminator(), BBName: "memmove_bwd_loop");
      PredBB = CopyBackwardsBB;
    } else {
      CopyBackwardsBB->setName("memmove_bwd_loop");
    }
    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
    LoopBuilder.SetCurrentDebugLocation(DbgLoc);
    // Backwards loop: the phi starts at LoopBound and is decremented by
    // LoopOpSize each iteration; Index addresses the chunk being copied.
    PHINode *LoopPhi = LoopBuilder.CreatePHI(Ty: ILengthType, NumReservedValues: 0);
    Value *Index = LoopBuilder.CreateSub(LHS: LoopPhi, RHS: CILoopOpSize, Name: "bwd_index");
    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr, IdxList: Index);
    Value *Element = LoopBuilder.CreateAlignedLoad(
        Ty: LoopOpType, Ptr: LoadGEP, Align: PartSrcAlign, isVolatile: SrcIsVolatile, Name: "element");
    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: Index);
    LoopBuilder.CreateAlignedStore(Val: Element, Ptr: StoreGEP, Align: PartDstAlign,
                                   isVolatile: DstIsVolatile);

    // Replace the unconditional branch introduced by
    // SplitBlockAndInsertIfThenElse to turn LoopBB into a loop.
    Instruction *UncondTerm = LoopBB->getTerminator();
    LoopBuilder.CreateCondBr(Cond: LoopBuilder.CreateICmpEQ(LHS: Index, RHS: Zero), True: ExitBB,
                             False: LoopBB);
    UncondTerm->eraseFromParent();

    LoopPhi->addIncoming(V: Index, BB: LoopBB);
    LoopPhi->addIncoming(V: LoopBound, BB: PredBB);
  }

  // Copying forward.
  BasicBlock *FwdResidualBB = CopyForwardBB;
  if (BytesCopiedInLoop != 0) {
    CopyForwardBB->setName("memmove_fwd_loop");
    BasicBlock *LoopBB = CopyForwardBB;
    BasicBlock *SuccBB = ExitBB;
    if (RemainingBytes != 0) {
      // if we introduce residual code, it needs its separate BB
      SuccBB = CopyForwardBB->splitBasicBlock(I: CopyForwardBB->getTerminator(),
                                              BBName: "memmove_fwd_residual");
      FwdResidualBB = SuccBB;
    }
    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
    LoopBuilder.SetCurrentDebugLocation(DbgLoc);
    // Forward loop: the phi counts up from 0 to LoopBound in LoopOpSize steps.
    PHINode *LoopPhi = LoopBuilder.CreatePHI(Ty: ILengthType, NumReservedValues: 0, Name: "fwd_index");
    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr, IdxList: LoopPhi);
    Value *Element = LoopBuilder.CreateAlignedLoad(
        Ty: LoopOpType, Ptr: LoadGEP, Align: PartSrcAlign, isVolatile: SrcIsVolatile, Name: "element");
    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: LoopPhi);
    LoopBuilder.CreateAlignedStore(Val: Element, Ptr: StoreGEP, Align: PartDstAlign,
                                   isVolatile: DstIsVolatile);
    Value *Index = LoopBuilder.CreateAdd(LHS: LoopPhi, RHS: CILoopOpSize);
    LoopPhi->addIncoming(V: Index, BB: LoopBB);
    LoopPhi->addIncoming(V: Zero, BB: OrigBB);

    // Replace the unconditional branch to turn LoopBB into a loop.
    Instruction *UncondTerm = LoopBB->getTerminator();
    LoopBuilder.CreateCondBr(Cond: LoopBuilder.CreateICmpEQ(LHS: Index, RHS: LoopBound), True: SuccBB,
                             False: LoopBB);
    UncondTerm->eraseFromParent();
  }

  if (RemainingBytes != 0) {
    uint64_t BytesCopied = BytesCopiedInLoop;

    // Residual code is required to move the remaining bytes. In the forward
    // case, we emit it in the normal order.
    IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
    FwdResBuilder.SetCurrentDebugLocation(DbgLoc);
    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(OpsOut&: RemainingOps, Context&: Ctx, RemainingBytes,
                                          SrcAddrSpace: SrcAS, DestAddrSpace: DstAS, SrcAlign: PartSrcAlign,
                                          DestAlign: PartDstAlign);
    for (auto *OpTy : RemainingOps)
      GenerateResidualLdStPair(OpTy, FwdResBuilder, BytesCopied);
  }
}
1013
1014/// Create a Value of \p DstType that consists of a sequence of copies of
1015/// \p SetValue, using bitcasts and a vector splat.
1016static Value *createMemSetSplat(const DataLayout &DL, IRBuilderBase &B,
1017 Value *SetValue, Type *DstType) {
1018 TypeSize DstSize = DL.getTypeStoreSize(Ty: DstType);
1019 Type *SetValueType = SetValue->getType();
1020 TypeSize SetValueSize = DL.getTypeStoreSize(Ty: SetValueType);
1021 assert(SetValueSize == DL.getTypeAllocSize(SetValueType) &&
1022 "Store size and alloc size of SetValue's type must match");
1023 assert(SetValueSize != 0 && DstSize % SetValueSize == 0 &&
1024 "DstType size must be a multiple of SetValue size");
1025
1026 Value *Result = SetValue;
1027 if (DstSize != SetValueSize) {
1028 if (!SetValueType->isIntegerTy() && !SetValueType->isFloatingPointTy()) {
1029 // If the type cannot be put into a vector, bitcast to iN first.
1030 LLVMContext &Ctx = SetValue->getContext();
1031 Result = B.CreateBitCast(V: Result, DestTy: Type::getIntNTy(C&: Ctx, N: SetValueSize * 8),
1032 Name: "setvalue.toint");
1033 }
1034 // Form a sufficiently large vector consisting of SetValue, repeated.
1035 Result =
1036 B.CreateVectorSplat(NumElts: DstSize / SetValueSize, V: Result, Name: "setvalue.splat");
1037 }
1038
1039 // The value has the right size, but we might have to bitcast it to the right
1040 // type.
1041 Result = B.CreateBitCast(V: Result, DestTy: DstType, Name: "setvalue.splat.cast");
1042 return Result;
1043}
1044
/// Expand a memset whose length \p Len is a compile-time constant into an
/// explicit store loop inserted before \p InsertBefore. The main loop stores
/// a splat of \p SetValue in target-chosen chunks; any leftover bytes are
/// written with straight-line stores rather than a residual loop.
/// \p AverageTripCount, if known, is used for branch-weight metadata.
static void
createMemSetLoopKnownSize(Instruction *InsertBefore, Value *DstAddr,
                          ConstantInt *Len, Value *SetValue, Align DstAlign,
                          bool IsVolatile, const TargetTransformInfo *TTI,
                          std::optional<uint64_t> AverageTripCount) {
  // No need to expand zero length memsets.
  if (Len->isZero())
    return;

  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();

  unsigned DstAS = cast<PointerType>(Val: DstAddr->getType())->getAddressSpace();

  Type *TypeOfLen = Len->getType();
  Type *Int8Type = Type::getInt8Ty(C&: Ctx);
  assert(SetValue->getType() == Int8Type && "Can only set bytes");

  // Without TTI we conservatively store byte-by-byte.
  Type *LoopOpType = Int8Type;
  if (TTI) {
    // Use the same memory access type as for a memcpy with the same Dst and Src
    // alignment and address space.
    LoopOpType = TTI->getMemcpyLoopLoweringType(
        Context&: Ctx, Length: Len, SrcAddrSpace: DstAS, DestAddrSpace: DstAS, SrcAlign: DstAlign, DestAlign: DstAlign, AtomicElementSize: std::nullopt);
  }
  TypeSize LoopOpSize = DL.getTypeStoreSize(Ty: LoopOpType);
  assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");

  // Number of bytes covered by whole LoopOpSize-wide stores.
  uint64_t LoopEndCount =
      alignDown(Value: Len->getZExtValue(), Align: LoopOpSize.getFixedValue());

  if (LoopEndCount != 0) {
    Value *SplatSetValue = nullptr;
    {
      IRBuilder<> PreLoopBuilder(InsertBefore);
      SplatSetValue =
          createMemSetSplat(DL, B&: PreLoopBuilder, SetValue, DstType: LoopOpType);
    }

    // Don't generate a residual loop, the remaining bytes are set with
    // straight-line code.
    LoopExpansionInfo LEI = insertLoopExpansion(
        InsertBefore, Len, MainLoopStep: LoopOpSize, ResidualLoopStep: 0, BBNamePrefix: "static-memset", AverageTripCount);

    // Fill MainLoopBB
    IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
    Align PartDstAlign(commonAlignment(A: DstAlign, Offset: LoopOpSize));

    Value *DstGEP =
        MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: LEI.MainLoopIndex);

    MainLoopBuilder.CreateAlignedStore(Val: SplatSetValue, Ptr: DstGEP, Align: PartDstAlign,
                                       isVolatile: IsVolatile);

    assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
           "No residual loop was requested");
  }

  uint64_t BytesSet = LoopEndCount;
  uint64_t RemainingBytes = Len->getZExtValue() - BytesSet;
  if (RemainingBytes == 0)
    return;

  // Emit straight-line stores for the residual bytes, using the operand types
  // the target suggests for a residual of this size.
  IRBuilder<> RBuilder(InsertBefore);

  assert(TTI && "there cannot be a residual loop without TTI");
  SmallVector<Type *, 5> RemainingOps;
  TTI->getMemcpyLoopResidualLoweringType(OpsOut&: RemainingOps, Context&: Ctx, RemainingBytes,
                                         SrcAddrSpace: DstAS, DestAddrSpace: DstAS, SrcAlign: DstAlign, DestAlign: DstAlign,
                                         AtomicCpySize: std::nullopt);

  Type *PreviousOpTy = nullptr;
  Value *SplatSetValue = nullptr;
  for (auto *OpTy : RemainingOps) {
    TypeSize OperandSize = DL.getTypeStoreSize(Ty: OpTy);
    assert(OperandSize.isFixed() &&
           "Operand types cannot be scalable vector types");
    Align PartDstAlign(commonAlignment(A: DstAlign, Offset: BytesSet));

    // Avoid recomputing the splat SetValue if it's the same as for the last
    // iteration.
    if (OpTy != PreviousOpTy)
      SplatSetValue = createMemSetSplat(DL, B&: RBuilder, SetValue, DstType: OpTy);

    Value *DstGEP = RBuilder.CreateInBoundsGEP(
        Ty: Int8Type, Ptr: DstAddr, IdxList: ConstantInt::get(Ty: TypeOfLen, V: BytesSet));
    RBuilder.CreateAlignedStore(Val: SplatSetValue, Ptr: DstGEP, Align: PartDstAlign,
                                isVolatile: IsVolatile);
    BytesSet += OperandSize;
    PreviousOpTy = OpTy;
  }
  assert(BytesSet == Len->getZExtValue() &&
         "Bytes set should match size in the call!");
}
1141
/// Expand a memset with a runtime length \p Len into a main store loop plus a
/// byte-wise residual loop, inserted before \p InsertBefore. The main loop
/// stores a splat of \p SetValue in target-chosen chunks; the residual loop
/// handles the remaining bytes one at a time.
/// \p AverageTripCount, if known, is used for branch-weight metadata.
static void
createMemSetLoopUnknownSize(Instruction *InsertBefore, Value *DstAddr,
                            Value *Len, Value *SetValue, Align DstAlign,
                            bool IsVolatile, const TargetTransformInfo *TTI,
                            std::optional<uint64_t> AverageTripCount) {
  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();

  unsigned DstAS = cast<PointerType>(Val: DstAddr->getType())->getAddressSpace();

  Type *Int8Type = Type::getInt8Ty(C&: Ctx);
  assert(SetValue->getType() == Int8Type && "Can only set bytes");

  // Without TTI we conservatively store byte-by-byte.
  Type *LoopOpType = Int8Type;
  if (TTI) {
    LoopOpType = TTI->getMemcpyLoopLoweringType(
        Context&: Ctx, Length: Len, SrcAddrSpace: DstAS, DestAddrSpace: DstAS, SrcAlign: DstAlign, DestAlign: DstAlign, AtomicElementSize: std::nullopt);
  }
  TypeSize LoopOpSize = DL.getTypeStoreSize(Ty: LoopOpType);
  assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");

  // The residual loop always advances byte-by-byte.
  Type *ResidualLoopOpType = Int8Type;
  TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(Ty: ResidualLoopOpType);

  Value *SplatSetValue = SetValue;
  {
    IRBuilder<> PreLoopBuilder(InsertBefore);
    SplatSetValue = createMemSetSplat(DL, B&: PreLoopBuilder, SetValue, DstType: LoopOpType);
  }

  LoopExpansionInfo LEI =
      insertLoopExpansion(InsertBefore, Len, MainLoopStep: LoopOpSize, ResidualLoopStep: ResidualLoopOpSize,
                          BBNamePrefix: "dynamic-memset", AverageTripCount);

  // Fill MainLoopBB
  IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
  Align PartDstAlign(commonAlignment(A: DstAlign, Offset: LoopOpSize));

  Value *DstGEP =
      MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: LEI.MainLoopIndex);
  MainLoopBuilder.CreateAlignedStore(Val: SplatSetValue, Ptr: DstGEP, Align: PartDstAlign,
                                     isVolatile: IsVolatile);

  // Fill ResidualLoopBB
  if (!LEI.ResidualLoopIP)
    return;

  Align ResDstAlign(commonAlignment(A: PartDstAlign, Offset: ResidualLoopOpSize));

  IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);

  // The residual stores the original (unsplatted) byte value.
  Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr,
                                                      IdxList: LEI.ResidualLoopIndex);
  ResLoopBuilder.CreateAlignedStore(Val: SetValue, Ptr: ResDstGEP, Align: ResDstAlign,
                                    isVolatile: IsVolatile);
}
1200
/// Expand a pattern-store into a simple element-at-a-time store loop inserted
/// before \p InsertBefore. \p CopyLen counts elements of SetValue's type (not
/// bytes). Branch weights are derived from \p AverageTripCount when available.
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
                             Value *CopyLen, Value *SetValue, Align DstAlign,
                             std::optional<uint64_t> AverageTripCount,
                             bool IsVolatile) {
  // Currently no longer used for memset, only for memset.pattern.
  // TODO: Update the memset.pattern lowering to also use the loop expansion
  // framework and remove this function.
  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  // Split off the code after the memset; LoopBB is placed between the halves.
  BasicBlock *NewBB =
      OrigBB->splitBasicBlock(I: InsertBefore, BBName: "split");
  BasicBlock *LoopBB
    = BasicBlock::Create(Context&: F->getContext(), Name: "loadstoreloop", Parent: F, InsertBefore: NewBB);

  const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
  IRBuilder<> Builder(OrigBB->getTerminator());
  Builder.SetCurrentDebugLocation(DbgLoc);

  // Skip the loop entirely when CopyLen is zero.
  auto *ToLoopBR = Builder.CreateCondBr(
      Cond: Builder.CreateICmpEQ(LHS: ConstantInt::get(Ty: TypeOfCopyLen, V: 0), RHS: CopyLen), True: NewBB,
      False: LoopBB);
  MDBuilder MDB(F->getContext());
  // A known average trip count implies the loop is usually entered; otherwise
  // mark the weights as explicitly unknown for profile checking.
  if (AverageTripCount.has_value())
    ToLoopBR->setMetadata(KindID: LLVMContext::MD_prof,
                          Node: MDB.createLikelyBranchWeights());
  else
    setExplicitlyUnknownBranchWeightsIfProfiled(I&: *ToLoopBR, DEBUG_TYPE);

  OrigBB->getTerminator()->eraseFromParent();

  TypeSize PartSize = DL.getTypeStoreSize(Ty: SetValue->getType());
  Align PartAlign(commonAlignment(A: DstAlign, Offset: PartSize));

  IRBuilder<> LoopBuilder(LoopBB);
  LoopBuilder.SetCurrentDebugLocation(DbgLoc);
  // Induction variable counts stored elements, starting at 0.
  PHINode *LoopIndex = LoopBuilder.CreatePHI(Ty: TypeOfCopyLen, NumReservedValues: 0);
  LoopIndex->addIncoming(V: ConstantInt::get(Ty: TypeOfCopyLen, V: 0), BB: OrigBB);

  LoopBuilder.CreateAlignedStore(
      Val: SetValue,
      Ptr: LoopBuilder.CreateInBoundsGEP(Ty: SetValue->getType(), Ptr: DstAddr, IdxList: LoopIndex),
      Align: PartAlign, isVolatile: IsVolatile);

  Value *NewIndex =
      LoopBuilder.CreateAdd(LHS: LoopIndex, RHS: ConstantInt::get(Ty: TypeOfCopyLen, V: 1));
  LoopIndex->addIncoming(V: NewIndex, BB: LoopBB);

  // Loop back-edge; weight it by the average trip count when known.
  auto *LoopBR = LoopBuilder.CreateCondBr(
      Cond: LoopBuilder.CreateICmpULT(LHS: NewIndex, RHS: CopyLen), True: LoopBB, False: NewBB);
  if (AverageTripCount.has_value())
    setFittedBranchWeights(I&: *LoopBR, Weights: {AverageTripCount.value(), 1},
                           /*IsExpected=*/false);
  else
    setExplicitlyUnknownBranchWeightsIfProfiled(I&: *LoopBR, DEBUG_TYPE);
}
1258
1259template <typename T>
1260static bool canOverlap(MemTransferBase<T> *Memcpy, ScalarEvolution *SE) {
1261 if (SE) {
1262 const SCEV *SrcSCEV = SE->getSCEV(V: Memcpy->getRawSource());
1263 const SCEV *DestSCEV = SE->getSCEV(V: Memcpy->getRawDest());
1264 if (SE->isKnownPredicateAt(Pred: CmpInst::ICMP_NE, LHS: SrcSCEV, RHS: DestSCEV, CtxI: Memcpy))
1265 return false;
1266 }
1267 return true;
1268}
1269
1270void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
1271 const TargetTransformInfo &TTI,
1272 ScalarEvolution *SE) {
1273 bool CanOverlap = canOverlap(Memcpy, SE);
1274 auto TripCount = getAverageMemOpLoopTripCount(I: *Memcpy);
1275 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: Memcpy->getLength())) {
1276 createMemCpyLoopKnownSize(
1277 /*InsertBefore=*/Memcpy,
1278 /*SrcAddr=*/Memcpy->getRawSource(),
1279 /*DstAddr=*/Memcpy->getRawDest(),
1280 /*CopyLen=*/CI,
1281 /*SrcAlign=*/Memcpy->getSourceAlign().valueOrOne(),
1282 /*DstAlign=*/Memcpy->getDestAlign().valueOrOne(),
1283 /*SrcIsVolatile=*/Memcpy->isVolatile(),
1284 /*DstIsVolatile=*/Memcpy->isVolatile(),
1285 /*CanOverlap=*/CanOverlap,
1286 /*TTI=*/TTI,
1287 /*AtomicElementSize=*/std::nullopt,
1288 /*AverageTripCount=*/TripCount);
1289 } else {
1290 createMemCpyLoopUnknownSize(
1291 /*InsertBefore=*/Memcpy,
1292 /*SrcAddr=*/Memcpy->getRawSource(),
1293 /*DstAddr=*/Memcpy->getRawDest(),
1294 /*CopyLen=*/Memcpy->getLength(),
1295 /*SrcAlign=*/Memcpy->getSourceAlign().valueOrOne(),
1296 /*DstAlign=*/Memcpy->getDestAlign().valueOrOne(),
1297 /*SrcIsVolatile=*/Memcpy->isVolatile(),
1298 /*DstIsVolatile=*/Memcpy->isVolatile(),
1299 /*CanOverlap=*/CanOverlap,
1300 /*TTI=*/TTI,
1301 /*AtomicElementSize=*/std::nullopt,
1302 /*AverageTripCount=*/TripCount);
1303 }
1304}
1305
/// Replace \p Memmove with explicit copy loops. Returns true on success;
/// returns false (leaving the intrinsic in place) if the operands live in
/// different address spaces that may alias but cannot be cast to a common one,
/// since then no direction-deciding pointer comparison can be emitted.
bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
                               const TargetTransformInfo &TTI) {
  Value *CopyLen = Memmove->getLength();
  Value *SrcAddr = Memmove->getRawSource();
  Value *DstAddr = Memmove->getRawDest();
  Align SrcAlign = Memmove->getSourceAlign().valueOrOne();
  Align DstAlign = Memmove->getDestAlign().valueOrOne();
  // memmove has a single volatile flag covering both operands.
  bool SrcIsVolatile = Memmove->isVolatile();
  bool DstIsVolatile = SrcIsVolatile;
  IRBuilder<> CastBuilder(Memmove);
  CastBuilder.SetCurrentDebugLocation(Memmove->getStableDebugLoc());

  unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
  unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
  if (SrcAS != DstAS) {
    if (!TTI.addrspacesMayAlias(AS0: SrcAS, AS1: DstAS)) {
      // We may not be able to emit a pointer comparison, but we don't have
      // to. Expand as memcpy.
      auto AverageTripCount = getAverageMemOpLoopTripCount(I: *Memmove);
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: CopyLen)) {
        createMemCpyLoopKnownSize(
            /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen: CI, SrcAlign, DstAlign,
            SrcIsVolatile, DstIsVolatile,
            /*CanOverlap=*/false, TTI, AtomicElementSize: std::nullopt, AverageTripCount);
      } else {
        createMemCpyLoopUnknownSize(
            /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign,
            DstAlign, SrcIsVolatile, DstIsVolatile,
            /*CanOverlap=*/false, TTI, AtomicElementSize: std::nullopt, AverageTripCount);
      }

      return true;
    }

    if (!(TTI.isValidAddrSpaceCast(FromAS: DstAS, ToAS: SrcAS) ||
          TTI.isValidAddrSpaceCast(FromAS: SrcAS, ToAS: DstAS))) {
      // We don't know generically if it's legal to introduce an
      // addrspacecast. We need to know either if it's legal to insert an
      // addrspacecast, or if the address spaces cannot alias.
      LLVM_DEBUG(
          dbgs() << "Do not know how to expand memmove between different "
                    "address spaces\n");
      return false;
    }
  }

  // Same (or castable) address spaces: emit the real memmove expansion with a
  // runtime direction check.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: CopyLen)) {
    createMemMoveLoopKnownSize(
        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen: CI, SrcAlign, DstAlign,
        SrcIsVolatile, DstIsVolatile, TTI);
  } else {
    createMemMoveLoopUnknownSize(
        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
        SrcIsVolatile, DstIsVolatile, TTI);
  }
  return true;
}
1363
1364void llvm::expandMemSetAsLoop(MemSetInst *Memset,
1365 const TargetTransformInfo *TTI) {
1366 auto AverageTripCount = getAverageMemOpLoopTripCount(I: *Memset);
1367 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: Memset->getLength())) {
1368 createMemSetLoopKnownSize(
1369 /*InsertBefore=*/Memset,
1370 /*DstAddr=*/Memset->getRawDest(),
1371 /*Len=*/CI,
1372 /*SetValue=*/Memset->getValue(),
1373 /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
1374 /*IsVolatile=*/Memset->isVolatile(),
1375 /*TTI=*/TTI,
1376 /*AverageTripCount=*/AverageTripCount);
1377 } else {
1378 createMemSetLoopUnknownSize(
1379 /*InsertBefore=*/Memset,
1380 /*DstAddr=*/Memset->getRawDest(),
1381 /*Len=*/Memset->getLength(),
1382 /*SetValue=*/Memset->getValue(),
1383 /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
1384 /*IsVolatile=*/Memset->isVolatile(),
1385 /*TTI=*/TTI,
1386 /*AverageTripCount=*/AverageTripCount);
1387 }
1388}
1389
1390void llvm::expandMemSetAsLoop(MemSetInst *MemSet,
1391 const TargetTransformInfo &TTI) {
1392 expandMemSetAsLoop(Memset: MemSet, TTI: &TTI);
1393}
1394
1395void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
1396 createMemSetLoop(/*InsertBefore=*/Memset,
1397 /*DstAddr=*/Memset->getRawDest(),
1398 /*CopyLen=*/Memset->getLength(),
1399 /*SetValue=*/Memset->getValue(),
1400 /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
1401 /*AverageTripCount=*/getAverageMemOpLoopTripCount(I: *Memset),
1402 /*IsVolatile=*/Memset->isVolatile());
1403}
1404
1405void llvm::expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemcpy,
1406 const TargetTransformInfo &TTI,
1407 ScalarEvolution *SE) {
1408 assert(AtomicMemcpy->isAtomic());
1409 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: AtomicMemcpy->getLength())) {
1410 createMemCpyLoopKnownSize(
1411 /*InsertBefore=*/AtomicMemcpy,
1412 /*SrcAddr=*/AtomicMemcpy->getRawSource(),
1413 /*DstAddr=*/AtomicMemcpy->getRawDest(),
1414 /*CopyLen=*/CI,
1415 /*SrcAlign=*/AtomicMemcpy->getSourceAlign().valueOrOne(),
1416 /*DstAlign=*/AtomicMemcpy->getDestAlign().valueOrOne(),
1417 /*SrcIsVolatile=*/AtomicMemcpy->isVolatile(),
1418 /*DstIsVolatile=*/AtomicMemcpy->isVolatile(),
1419 /*CanOverlap=*/false, // SrcAddr & DstAddr may not overlap by spec.
1420 /*TTI=*/TTI,
1421 /*AtomicElementSize=*/AtomicMemcpy->getElementSizeInBytes());
1422 } else {
1423 createMemCpyLoopUnknownSize(
1424 /*InsertBefore=*/AtomicMemcpy,
1425 /*SrcAddr=*/AtomicMemcpy->getRawSource(),
1426 /*DstAddr=*/AtomicMemcpy->getRawDest(),
1427 /*CopyLen=*/AtomicMemcpy->getLength(),
1428 /*SrcAlign=*/AtomicMemcpy->getSourceAlign().valueOrOne(),
1429 /*DstAlign=*/AtomicMemcpy->getDestAlign().valueOrOne(),
1430 /*SrcIsVolatile=*/AtomicMemcpy->isVolatile(),
1431 /*DstIsVolatile=*/AtomicMemcpy->isVolatile(),
1432 /*CanOverlap=*/false, // SrcAddr & DstAddr may not overlap by spec.
1433 /*TargetTransformInfo=*/TTI,
1434 /*AtomicElementSize=*/AtomicMemcpy->getElementSizeInBytes());
1435 }
1436}
1437