//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cmath>
#include <limits>
#include <optional>
23
24#define DEBUG_TYPE "lower-mem-intrinsics"
25
26using namespace llvm;
27
28namespace llvm {
29extern cl::opt<bool> ProfcheckDisableMetadataFixes;
30}
31
32/// \returns \p Len urem \p OpSize, checking for optimization opportunities.
33/// \p OpSizeVal must be the integer value of the \c ConstantInt \p OpSize.
34static Value *getRuntimeLoopRemainder(IRBuilderBase &B, Value *Len,
35 Value *OpSize, unsigned OpSizeVal) {
36 // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem.
37 if (isPowerOf2_32(Value: OpSizeVal))
38 return B.CreateAnd(LHS: Len, RHS: OpSizeVal - 1);
39 return B.CreateURem(LHS: Len, RHS: OpSize);
40}
41
42/// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization
43/// opportunities.
44/// If \p RTLoopRemainder is provided, it must be the result of
45/// \c getRuntimeLoopRemainder() with the same arguments.
46static Value *getRuntimeLoopUnits(IRBuilderBase &B, Value *Len, Value *OpSize,
47 unsigned OpSizeVal,
48 Value *RTLoopRemainder = nullptr) {
49 if (!RTLoopRemainder)
50 RTLoopRemainder = getRuntimeLoopRemainder(B, Len, OpSize, OpSizeVal);
51 return B.CreateSub(LHS: Len, RHS: RTLoopRemainder);
52}
53
54namespace {
55/// Container for the return values of insertLoopExpansion.
56struct LoopExpansionInfo {
57 /// The instruction at the end of the main loop body.
58 Instruction *MainLoopIP = nullptr;
59
60 /// The unit index in the main loop body.
61 Value *MainLoopIndex = nullptr;
62
63 /// The instruction at the end of the residual loop body. Can be nullptr if no
64 /// residual is required.
65 Instruction *ResidualLoopIP = nullptr;
66
67 /// The unit index in the residual loop body. Can be nullptr if no residual is
68 /// required.
69 Value *ResidualLoopIndex = nullptr;
70};
71
72std::optional<uint64_t> getAverageMemOpLoopTripCount(const MemIntrinsic &I) {
73 if (ProfcheckDisableMetadataFixes)
74 return std::nullopt;
75 if (std::optional<Function::ProfileCount> EC =
76 I.getFunction()->getEntryCount();
77 !EC || !EC->getCount())
78 return std::nullopt;
79 if (const auto Len = I.getLengthInBytes())
80 return Len->getZExtValue();
81 uint64_t Total = 0;
82 SmallVector<InstrProfValueData> ProfData =
83 getValueProfDataFromInst(Inst: I, ValueKind: InstrProfValueKind::IPVK_MemOPSize,
84 MaxNumValueData: std::numeric_limits<uint32_t>::max(), TotalC&: Total);
85 if (!Total)
86 return std::nullopt;
87 uint64_t TripCount = 0;
88 for (const auto &P : ProfData)
89 TripCount += P.Count * P.Value;
90 return std::round(x: 1.0 * TripCount / Total);
91}
92
93} // namespace
94
95/// Insert the control flow and loop counters for a memcpy/memset loop
96/// expansion.
97///
98/// This function inserts IR corresponding to the following C code before
99/// \p InsertBefore:
100/// \code
101/// LoopUnits = (Len / MainLoopStep) * MainLoopStep;
102/// ResidualUnits = Len - LoopUnits;
103/// MainLoopIndex = 0;
104/// if (LoopUnits > 0) {
105/// do {
106/// // MainLoopIP
107/// MainLoopIndex += MainLoopStep;
108/// } while (MainLoopIndex < LoopUnits);
109/// }
110/// for (size_t i = 0; i < ResidualUnits; i += ResidualLoopStep) {
111/// ResidualLoopIndex = LoopUnits + i;
112/// // ResidualLoopIP
113/// }
114/// \endcode
115///
116/// \p MainLoopStep and \p ResidualLoopStep determine by how many "units" the
117/// loop index is increased in each iteration of the main and residual loops,
118/// respectively. In most cases, the "unit" will be bytes, but larger units are
119/// useful for lowering memset.pattern.
120///
121/// The computation of \c LoopUnits and \c ResidualUnits is performed at compile
122/// time if \p Len is a \c ConstantInt.
123/// The second (residual) loop is omitted if \p ResidualLoopStep is 0 or equal
124/// to \p MainLoopStep.
125/// The generated \c MainLoopIP, \c MainLoopIndex, \c ResidualLoopIP, and
126/// \c ResidualLoopIndex are returned in a \c LoopExpansionInfo object.
127///
128/// If provided, \p ExpectedUnits is used as the expected number of units
129/// handled by the loop expansion when computing branch weights.
130static LoopExpansionInfo
131insertLoopExpansion(Instruction *InsertBefore, Value *Len,
132 unsigned MainLoopStep, unsigned ResidualLoopStep,
133 StringRef BBNamePrefix,
134 std::optional<uint64_t> ExpectedUnits) {
135 assert((ResidualLoopStep == 0 || MainLoopStep % ResidualLoopStep == 0) &&
136 "ResidualLoopStep must divide MainLoopStep if specified");
137 assert(ResidualLoopStep <= MainLoopStep &&
138 "ResidualLoopStep cannot be larger than MainLoopStep");
139 assert(MainLoopStep > 0 && "MainLoopStep must be non-zero");
140 LoopExpansionInfo LEI;
141
142 // If the length is known to be zero, there is nothing to do.
143 if (auto *CLen = dyn_cast<ConstantInt>(Val: Len))
144 if (CLen->isZero())
145 return LEI;
146
147 BasicBlock *PreLoopBB = InsertBefore->getParent();
148 BasicBlock *PostLoopBB = PreLoopBB->splitBasicBlock(
149 I: InsertBefore, BBName: BBNamePrefix + "-post-expansion");
150 Function *ParentFunc = PreLoopBB->getParent();
151 LLVMContext &Ctx = PreLoopBB->getContext();
152 const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
153 IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());
154 PreLoopBuilder.SetCurrentDebugLocation(DbgLoc);
155
156 // Calculate the main loop trip count and remaining units to cover after the
157 // loop.
158 Type *LenType = Len->getType();
159 IntegerType *ILenType = cast<IntegerType>(Val: LenType);
160 ConstantInt *CIMainLoopStep = ConstantInt::get(Ty: ILenType, V: MainLoopStep);
161 ConstantInt *Zero = ConstantInt::get(Ty: ILenType, V: 0U);
162
163 // We can avoid conditional branches and/or entire loops if we know any of the
164 // following:
165 // - that the main loop must be executed at least once
166 // - that the main loop will not be executed at all
167 // - that the residual loop must be executed at least once
168 // - that the residual loop will not be executed at all
169 bool MustTakeMainLoop = false;
170 bool MayTakeMainLoop = true;
171 bool MustTakeResidualLoop = false;
172 bool MayTakeResidualLoop = true;
173
174 Value *LoopUnits = Len;
175 Value *ResidualUnits = nullptr;
176 if (MainLoopStep != 1) {
177 if (auto *CLen = dyn_cast<ConstantInt>(Val: Len)) {
178 uint64_t TotalUnits = CLen->getZExtValue();
179 uint64_t LoopEndCount = alignDown(Value: TotalUnits, Align: MainLoopStep);
180 uint64_t ResidualCount = TotalUnits - LoopEndCount;
181 LoopUnits = ConstantInt::get(Ty: LenType, V: LoopEndCount);
182 ResidualUnits = ConstantInt::get(Ty: LenType, V: ResidualCount);
183 MustTakeMainLoop = LoopEndCount > 0;
184 MayTakeMainLoop = MustTakeMainLoop;
185 MustTakeResidualLoop = ResidualCount > 0;
186 MayTakeResidualLoop = MustTakeResidualLoop;
187 // TODO: This could also use known bits to check if a non-constant loop
188 // count is guaranteed to be a multiple of MainLoopStep, in which case we
189 // could omit the residual loop. It's unclear if that is worthwhile.
190 } else {
191 ResidualUnits = getRuntimeLoopRemainder(B&: PreLoopBuilder, Len,
192 OpSize: CIMainLoopStep, OpSizeVal: MainLoopStep);
193 LoopUnits = getRuntimeLoopUnits(B&: PreLoopBuilder, Len, OpSize: CIMainLoopStep,
194 OpSizeVal: MainLoopStep, RTLoopRemainder: ResidualUnits);
195 }
196 } else if (auto *CLen = dyn_cast<ConstantInt>(Val: Len)) {
197 MustTakeMainLoop = CLen->getZExtValue() > 0;
198 MayTakeMainLoop = MustTakeMainLoop;
199 }
200
201 // The case where both loops are omitted (i.e., the length is known zero) is
202 // already handled at the beginning of this function.
203 assert((MayTakeMainLoop || MayTakeResidualLoop) &&
204 "At least one of the loops must be generated");
205
206 BasicBlock *MainLoopBB = nullptr;
207 CondBrInst *MainLoopBr = nullptr;
208
209 // Construct the main loop unless we statically known that it is not taken.
210 if (MayTakeMainLoop) {
211 MainLoopBB = BasicBlock::Create(Context&: Ctx, Name: BBNamePrefix + "-expansion-main-body",
212 Parent: ParentFunc, InsertBefore: PostLoopBB);
213 IRBuilder<> LoopBuilder(MainLoopBB);
214 LoopBuilder.SetCurrentDebugLocation(DbgLoc);
215
216 PHINode *LoopIndex = LoopBuilder.CreatePHI(Ty: LenType, NumReservedValues: 2, Name: "loop-index");
217 LEI.MainLoopIndex = LoopIndex;
218 LoopIndex->addIncoming(V: ConstantInt::get(Ty: LenType, V: 0U), BB: PreLoopBB);
219
220 Value *NewIndex = LoopBuilder.CreateAdd(
221 LHS: LoopIndex, RHS: ConstantInt::get(Ty: LenType, V: MainLoopStep));
222 LoopIndex->addIncoming(V: NewIndex, BB: MainLoopBB);
223
224 // One argument of the addition is a loop-variant PHI, so it must be an
225 // Instruction (i.e., it cannot be a Constant).
226 LEI.MainLoopIP = cast<Instruction>(Val: NewIndex);
227
228 // Stay in the MainLoop until we have handled all the LoopUnits. The False
229 // target is adjusted below if a residual is generated.
230 MainLoopBr = LoopBuilder.CreateCondBr(
231 Cond: LoopBuilder.CreateICmpULT(LHS: NewIndex, RHS: LoopUnits), True: MainLoopBB, False: PostLoopBB);
232
233 if (ExpectedUnits.has_value()) {
234 uint64_t BackedgeTakenCount = ExpectedUnits.value() / MainLoopStep;
235 if (BackedgeTakenCount > 0)
236 BackedgeTakenCount -= 1; // The last iteration goes to the False target.
237 MDBuilder MDB(ParentFunc->getContext());
238 setFittedBranchWeights(I&: *MainLoopBr, Weights: {BackedgeTakenCount, 1},
239 /*IsExpected=*/false);
240 } else {
241 setExplicitlyUnknownBranchWeightsIfProfiled(I&: *MainLoopBr, DEBUG_TYPE);
242 }
243 }
244
245 // Construct the residual loop if it is requested from the caller unless we
246 // statically know that it won't be taken.
247 bool ResidualLoopRequested =
248 ResidualLoopStep > 0 && ResidualLoopStep < MainLoopStep;
249 BasicBlock *ResidualLoopBB = nullptr;
250 BasicBlock *ResidualCondBB = nullptr;
251 if (ResidualLoopRequested && MayTakeResidualLoop) {
252 ResidualLoopBB =
253 BasicBlock::Create(Context&: Ctx, Name: BBNamePrefix + "-expansion-residual-body",
254 Parent: PreLoopBB->getParent(), InsertBefore: PostLoopBB);
255
256 // The residual loop body is either reached from the ResidualCondBB (which
257 // checks if the residual loop needs to be executed), from the main loop
258 // body if we know statically that the residual must be executed, or from
259 // the pre-loop BB (conditionally or unconditionally) if the main loop is
260 // omitted.
261 BasicBlock *PredOfResLoopBody = PreLoopBB;
262 if (MainLoopBB) {
263 // If it's statically known that the residual must be executed, we don't
264 // need to create a preheader BB.
265 if (MustTakeResidualLoop) {
266 MainLoopBr->setSuccessor(idx: 1, NewSucc: ResidualLoopBB);
267 PredOfResLoopBody = MainLoopBB;
268 } else {
269 // Construct a preheader BB to check if the residual loop is executed.
270 ResidualCondBB =
271 BasicBlock::Create(Context&: Ctx, Name: BBNamePrefix + "-expansion-residual-cond",
272 Parent: PreLoopBB->getParent(), InsertBefore: ResidualLoopBB);
273
274 // Determine if we need to branch to the residual loop or bypass it.
275 IRBuilder<> RCBuilder(ResidualCondBB);
276 RCBuilder.SetCurrentDebugLocation(DbgLoc);
277 auto *BR =
278 RCBuilder.CreateCondBr(Cond: RCBuilder.CreateICmpNE(LHS: ResidualUnits, RHS: Zero),
279 True: ResidualLoopBB, False: PostLoopBB);
280 if (ExpectedUnits.has_value()) {
281 MDBuilder MDB(ParentFunc->getContext());
282 BR->setMetadata(KindID: LLVMContext::MD_prof,
283 Node: MDB.createLikelyBranchWeights());
284 } else {
285 setExplicitlyUnknownBranchWeightsIfProfiled(I&: *BR, DEBUG_TYPE);
286 }
287
288 MainLoopBr->setSuccessor(idx: 1, NewSucc: ResidualCondBB);
289 PredOfResLoopBody = ResidualCondBB;
290 }
291 }
292
293 IRBuilder<> ResBuilder(ResidualLoopBB);
294 ResBuilder.SetCurrentDebugLocation(DbgLoc);
295 PHINode *ResidualIndex =
296 ResBuilder.CreatePHI(Ty: LenType, NumReservedValues: 2, Name: "residual-loop-index");
297 ResidualIndex->addIncoming(V: Zero, BB: PredOfResLoopBody);
298
299 // Add the offset at the end of the main loop to the loop counter of the
300 // residual loop to get the proper index. If the main loop was omitted, we
301 // can also omit the addition.
302 if (MainLoopBB)
303 LEI.ResidualLoopIndex = ResBuilder.CreateAdd(LHS: LoopUnits, RHS: ResidualIndex);
304 else
305 LEI.ResidualLoopIndex = ResidualIndex;
306
307 Value *ResNewIndex = ResBuilder.CreateAdd(
308 LHS: ResidualIndex, RHS: ConstantInt::get(Ty: LenType, V: ResidualLoopStep));
309 ResidualIndex->addIncoming(V: ResNewIndex, BB: ResidualLoopBB);
310
311 // One argument of the addition is a loop-variant PHI, so it must be an
312 // Instruction (i.e., it cannot be a Constant).
313 LEI.ResidualLoopIP = cast<Instruction>(Val: ResNewIndex);
314
315 // Stay in the residual loop until all ResidualUnits are handled.
316 CondBrInst *BR = ResBuilder.CreateCondBr(
317 Cond: ResBuilder.CreateICmpULT(LHS: ResNewIndex, RHS: ResidualUnits), True: ResidualLoopBB,
318 False: PostLoopBB);
319
320 if (ExpectedUnits.has_value()) {
321 uint64_t BackedgeTakenCount =
322 (ExpectedUnits.value() % MainLoopStep) / ResidualLoopStep;
323 if (BackedgeTakenCount > 0)
324 BackedgeTakenCount -= 1; // The last iteration goes to the False target.
325 MDBuilder MDB(ParentFunc->getContext());
326 setFittedBranchWeights(I&: *BR, Weights: {BackedgeTakenCount, 1},
327 /*IsExpected=*/false);
328 } else {
329 setExplicitlyUnknownBranchWeightsIfProfiled(I&: *BR, DEBUG_TYPE);
330 }
331 }
332
333 // Create the branch in the pre-loop block.
334 if (MustTakeMainLoop) {
335 // Go unconditionally to the main loop if it's statically known that it must
336 // be executed.
337 assert(MainLoopBB);
338 PreLoopBuilder.CreateBr(Dest: MainLoopBB);
339 } else if (!MainLoopBB && ResidualLoopBB) {
340 if (MustTakeResidualLoop) {
341 // If the main loop is omitted and the residual loop is statically known
342 // to be executed, go there unconditionally.
343 PreLoopBuilder.CreateBr(Dest: ResidualLoopBB);
344 } else {
345 // If the main loop is omitted and we don't know if the residual loop is
346 // executed, go there if necessary. The PreLoopBB takes the role of the
347 // preheader for the residual loop in this case.
348 auto *BR = PreLoopBuilder.CreateCondBr(
349 Cond: PreLoopBuilder.CreateICmpNE(LHS: ResidualUnits, RHS: Zero), True: ResidualLoopBB,
350 False: PostLoopBB);
351 if (ExpectedUnits.has_value()) {
352 MDBuilder MDB(ParentFunc->getContext());
353 BR->setMetadata(KindID: LLVMContext::MD_prof, Node: MDB.createLikelyBranchWeights());
354 } else {
355 setExplicitlyUnknownBranchWeightsIfProfiled(I&: *BR, DEBUG_TYPE);
356 }
357 }
358 } else {
359 // Otherwise, go conditionally to the main loop or its successor.
360 // If there is no residual loop, the successor is the post-loop BB.
361 BasicBlock *FalseBB = PostLoopBB;
362 if (ResidualCondBB) {
363 // If we constructed a pre-header for the residual loop, that is the
364 // successor.
365 FalseBB = ResidualCondBB;
366 } else if (ResidualLoopBB) {
367 // If there is a residual loop but the preheader is omitted (because the
368 // residual loop is statically known to be executed), the successor
369 // is the residual loop body.
370 assert(MustTakeResidualLoop);
371 FalseBB = ResidualLoopBB;
372 }
373
374 auto *BR = PreLoopBuilder.CreateCondBr(
375 Cond: PreLoopBuilder.CreateICmpNE(LHS: LoopUnits, RHS: Zero), True: MainLoopBB, False: FalseBB);
376
377 if (ExpectedUnits.has_value()) {
378 MDBuilder MDB(ParentFunc->getContext());
379 BR->setMetadata(KindID: LLVMContext::MD_prof, Node: MDB.createLikelyBranchWeights());
380 } else {
381 setExplicitlyUnknownBranchWeightsIfProfiled(I&: *BR, DEBUG_TYPE);
382 }
383 }
384 // Delete the unconditional branch inserted by splitBasicBlock.
385 PreLoopBB->getTerminator()->eraseFromParent();
386
387 return LEI;
388}
389
390void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
391 Value *DstAddr, ConstantInt *CopyLen,
392 Align SrcAlign, Align DstAlign,
393 bool SrcIsVolatile, bool DstIsVolatile,
394 bool CanOverlap,
395 const TargetTransformInfo &TTI,
396 std::optional<uint32_t> AtomicElementSize,
397 std::optional<uint64_t> AverageTripCount) {
398 // No need to expand zero length copies.
399 if (CopyLen->isZero())
400 return;
401
402 BasicBlock *PreLoopBB = InsertBefore->getParent();
403 Function *ParentFunc = PreLoopBB->getParent();
404 LLVMContext &Ctx = PreLoopBB->getContext();
405 const DataLayout &DL = ParentFunc->getDataLayout();
406 MDBuilder MDB(Ctx);
407 MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain(Name: "MemCopyDomain");
408 StringRef Name = "MemCopyAliasScope";
409 MDNode *NewScope = MDB.createAnonymousAliasScope(Domain: NewDomain, Name);
410
411 unsigned SrcAS = cast<PointerType>(Val: SrcAddr->getType())->getAddressSpace();
412 unsigned DstAS = cast<PointerType>(Val: DstAddr->getType())->getAddressSpace();
413
414 Type *TypeOfCopyLen = CopyLen->getType();
415 Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
416 Context&: Ctx, Length: CopyLen, SrcAddrSpace: SrcAS, DestAddrSpace: DstAS, SrcAlign, DestAlign: DstAlign, AtomicElementSize);
417 assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
418 "Atomic memcpy lowering is not supported for vector operand type");
419
420 Type *Int8Type = Type::getInt8Ty(C&: Ctx);
421 TypeSize LoopOpSize = DL.getTypeStoreSize(Ty: LoopOpType);
422 assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");
423 assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
424 "Atomic memcpy lowering is not supported for selected operand size");
425
426 uint64_t LoopEndCount =
427 alignDown(Value: CopyLen->getZExtValue(), Align: LoopOpSize.getFixedValue());
428
429 // Skip the loop expansion entirely if the loop would never be taken.
430 if (LoopEndCount != 0) {
431 LoopExpansionInfo LEI =
432 insertLoopExpansion(InsertBefore, Len: CopyLen, MainLoopStep: LoopOpSize, ResidualLoopStep: 0,
433 BBNamePrefix: "static-memcpy", ExpectedUnits: AverageTripCount);
434 assert(LEI.MainLoopIP && LEI.MainLoopIndex &&
435 "Main loop should be generated for non-zero loop count");
436
437 // Fill MainLoopBB
438 IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
439 Align PartDstAlign(commonAlignment(A: DstAlign, Offset: LoopOpSize));
440 Align PartSrcAlign(commonAlignment(A: SrcAlign, Offset: LoopOpSize));
441
442 // If we used LoopOpType as GEP element type, we would iterate over the
443 // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
444 // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
445 // byte offsets computed from the TypeStoreSize.
446 Value *SrcGEP =
447 MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr, IdxList: LEI.MainLoopIndex);
448 LoadInst *Load = MainLoopBuilder.CreateAlignedLoad(
449 Ty: LoopOpType, Ptr: SrcGEP, Align: PartSrcAlign, isVolatile: SrcIsVolatile);
450 if (!CanOverlap) {
451 // Set alias scope for loads.
452 Load->setMetadata(KindID: LLVMContext::MD_alias_scope,
453 Node: MDNode::get(Context&: Ctx, MDs: NewScope));
454 }
455 Value *DstGEP =
456 MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: LEI.MainLoopIndex);
457 StoreInst *Store = MainLoopBuilder.CreateAlignedStore(
458 Val: Load, Ptr: DstGEP, Align: PartDstAlign, isVolatile: DstIsVolatile);
459 if (!CanOverlap) {
460 // Indicate that stores don't overlap loads.
461 Store->setMetadata(KindID: LLVMContext::MD_noalias, Node: MDNode::get(Context&: Ctx, MDs: NewScope));
462 }
463 if (AtomicElementSize) {
464 Load->setAtomic(Ordering: AtomicOrdering::Unordered);
465 Store->setAtomic(Ordering: AtomicOrdering::Unordered);
466 }
467 assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
468 "No residual loop was requested");
469 }
470
471 // Copy the remaining bytes with straight-line code.
472 uint64_t BytesCopied = LoopEndCount;
473 uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
474 if (RemainingBytes == 0)
475 return;
476
477 IRBuilder<> RBuilder(InsertBefore);
478 SmallVector<Type *, 5> RemainingOps;
479 TTI.getMemcpyLoopResidualLoweringType(OpsOut&: RemainingOps, Context&: Ctx, RemainingBytes,
480 SrcAddrSpace: SrcAS, DestAddrSpace: DstAS, SrcAlign, DestAlign: DstAlign,
481 AtomicCpySize: AtomicElementSize);
482
483 for (auto *OpTy : RemainingOps) {
484 Align PartSrcAlign(commonAlignment(A: SrcAlign, Offset: BytesCopied));
485 Align PartDstAlign(commonAlignment(A: DstAlign, Offset: BytesCopied));
486
487 TypeSize OperandSize = DL.getTypeStoreSize(Ty: OpTy);
488 assert((!AtomicElementSize || OperandSize % *AtomicElementSize == 0) &&
489 "Atomic memcpy lowering is not supported for selected operand size");
490
491 Value *SrcGEP = RBuilder.CreateInBoundsGEP(
492 Ty: Int8Type, Ptr: SrcAddr, IdxList: ConstantInt::get(Ty: TypeOfCopyLen, V: BytesCopied));
493 LoadInst *Load =
494 RBuilder.CreateAlignedLoad(Ty: OpTy, Ptr: SrcGEP, Align: PartSrcAlign, isVolatile: SrcIsVolatile);
495 if (!CanOverlap) {
496 // Set alias scope for loads.
497 Load->setMetadata(KindID: LLVMContext::MD_alias_scope,
498 Node: MDNode::get(Context&: Ctx, MDs: NewScope));
499 }
500 Value *DstGEP = RBuilder.CreateInBoundsGEP(
501 Ty: Int8Type, Ptr: DstAddr, IdxList: ConstantInt::get(Ty: TypeOfCopyLen, V: BytesCopied));
502 StoreInst *Store =
503 RBuilder.CreateAlignedStore(Val: Load, Ptr: DstGEP, Align: PartDstAlign, isVolatile: DstIsVolatile);
504 if (!CanOverlap) {
505 // Indicate that stores don't overlap loads.
506 Store->setMetadata(KindID: LLVMContext::MD_noalias, Node: MDNode::get(Context&: Ctx, MDs: NewScope));
507 }
508 if (AtomicElementSize) {
509 Load->setAtomic(Ordering: AtomicOrdering::Unordered);
510 Store->setAtomic(Ordering: AtomicOrdering::Unordered);
511 }
512 BytesCopied += OperandSize;
513 }
514 assert(BytesCopied == CopyLen->getZExtValue() &&
515 "Bytes copied should match size in the call!");
516}
517
518void llvm::createMemCpyLoopUnknownSize(
519 Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
520 Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
521 bool CanOverlap, const TargetTransformInfo &TTI,
522 std::optional<uint32_t> AtomicElementSize,
523 std::optional<uint64_t> AverageTripCount) {
524 BasicBlock *PreLoopBB = InsertBefore->getParent();
525 Function *ParentFunc = PreLoopBB->getParent();
526 const DataLayout &DL = ParentFunc->getDataLayout();
527 LLVMContext &Ctx = PreLoopBB->getContext();
528 MDBuilder MDB(Ctx);
529 MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain(Name: "MemCopyDomain");
530 StringRef Name = "MemCopyAliasScope";
531 MDNode *NewScope = MDB.createAnonymousAliasScope(Domain: NewDomain, Name);
532
533 unsigned SrcAS = cast<PointerType>(Val: SrcAddr->getType())->getAddressSpace();
534 unsigned DstAS = cast<PointerType>(Val: DstAddr->getType())->getAddressSpace();
535
536 Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
537 Context&: Ctx, Length: CopyLen, SrcAddrSpace: SrcAS, DestAddrSpace: DstAS, SrcAlign, DestAlign: DstAlign, AtomicElementSize);
538 assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
539 "Atomic memcpy lowering is not supported for vector operand type");
540 TypeSize LoopOpSize = DL.getTypeStoreSize(Ty: LoopOpType);
541 assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
542 "Atomic memcpy lowering is not supported for selected operand size");
543
544 Type *Int8Type = Type::getInt8Ty(C&: Ctx);
545
546 Type *ResidualLoopOpType = AtomicElementSize
547 ? Type::getIntNTy(C&: Ctx, N: *AtomicElementSize * 8)
548 : Int8Type;
549 TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(Ty: ResidualLoopOpType);
550 assert(ResidualLoopOpSize == (AtomicElementSize ? *AtomicElementSize : 1) &&
551 "Store size is expected to match type size");
552
553 LoopExpansionInfo LEI =
554 insertLoopExpansion(InsertBefore, Len: CopyLen, MainLoopStep: LoopOpSize, ResidualLoopStep: ResidualLoopOpSize,
555 BBNamePrefix: "dynamic-memcpy", ExpectedUnits: AverageTripCount);
556 assert(LEI.MainLoopIP && LEI.MainLoopIndex &&
557 "Main loop should be generated for unknown size copy");
558
559 // Fill MainLoopBB
560 IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
561 Align PartSrcAlign(commonAlignment(A: SrcAlign, Offset: LoopOpSize));
562 Align PartDstAlign(commonAlignment(A: DstAlign, Offset: LoopOpSize));
563
564 // If we used LoopOpType as GEP element type, we would iterate over the
565 // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
566 // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use byte
567 // offsets computed from the TypeStoreSize.
568 Value *SrcGEP =
569 MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr, IdxList: LEI.MainLoopIndex);
570 LoadInst *Load = MainLoopBuilder.CreateAlignedLoad(
571 Ty: LoopOpType, Ptr: SrcGEP, Align: PartSrcAlign, isVolatile: SrcIsVolatile);
572 if (!CanOverlap) {
573 // Set alias scope for loads.
574 Load->setMetadata(KindID: LLVMContext::MD_alias_scope, Node: MDNode::get(Context&: Ctx, MDs: NewScope));
575 }
576 Value *DstGEP =
577 MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: LEI.MainLoopIndex);
578 StoreInst *Store = MainLoopBuilder.CreateAlignedStore(
579 Val: Load, Ptr: DstGEP, Align: PartDstAlign, isVolatile: DstIsVolatile);
580 if (!CanOverlap) {
581 // Indicate that stores don't overlap loads.
582 Store->setMetadata(KindID: LLVMContext::MD_noalias, Node: MDNode::get(Context&: Ctx, MDs: NewScope));
583 }
584 if (AtomicElementSize) {
585 Load->setAtomic(Ordering: AtomicOrdering::Unordered);
586 Store->setAtomic(Ordering: AtomicOrdering::Unordered);
587 }
588
589 // Fill ResidualLoopBB.
590 if (!LEI.ResidualLoopIP)
591 return;
592
593 Align ResSrcAlign(commonAlignment(A: PartSrcAlign, Offset: ResidualLoopOpSize));
594 Align ResDstAlign(commonAlignment(A: PartDstAlign, Offset: ResidualLoopOpSize));
595
596 IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
597 Value *ResSrcGEP = ResLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr,
598 IdxList: LEI.ResidualLoopIndex);
599 LoadInst *ResLoad = ResLoopBuilder.CreateAlignedLoad(
600 Ty: ResidualLoopOpType, Ptr: ResSrcGEP, Align: ResSrcAlign, isVolatile: SrcIsVolatile);
601 if (!CanOverlap) {
602 // Set alias scope for loads.
603 ResLoad->setMetadata(KindID: LLVMContext::MD_alias_scope,
604 Node: MDNode::get(Context&: Ctx, MDs: NewScope));
605 }
606 Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr,
607 IdxList: LEI.ResidualLoopIndex);
608 StoreInst *ResStore = ResLoopBuilder.CreateAlignedStore(
609 Val: ResLoad, Ptr: ResDstGEP, Align: ResDstAlign, isVolatile: DstIsVolatile);
610 if (!CanOverlap) {
611 // Indicate that stores don't overlap loads.
612 ResStore->setMetadata(KindID: LLVMContext::MD_noalias, Node: MDNode::get(Context&: Ctx, MDs: NewScope));
613 }
614 if (AtomicElementSize) {
615 ResLoad->setAtomic(Ordering: AtomicOrdering::Unordered);
616 ResStore->setAtomic(Ordering: AtomicOrdering::Unordered);
617 }
618}
619
620// If \p Addr1 and \p Addr2 are pointers to different address spaces, create an
621// addresspacecast to obtain a pair of pointers in the same addressspace. The
622// caller needs to ensure that addrspacecasting is possible.
623// No-op if the pointers are in the same address space.
624static std::pair<Value *, Value *>
625tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2,
626 const TargetTransformInfo &TTI) {
627 Value *ResAddr1 = Addr1;
628 Value *ResAddr2 = Addr2;
629
630 unsigned AS1 = cast<PointerType>(Val: Addr1->getType())->getAddressSpace();
631 unsigned AS2 = cast<PointerType>(Val: Addr2->getType())->getAddressSpace();
632 if (AS1 != AS2) {
633 if (TTI.isValidAddrSpaceCast(FromAS: AS2, ToAS: AS1))
634 ResAddr2 = B.CreateAddrSpaceCast(V: Addr2, DestTy: Addr1->getType());
635 else if (TTI.isValidAddrSpaceCast(FromAS: AS1, ToAS: AS2))
636 ResAddr1 = B.CreateAddrSpaceCast(V: Addr1, DestTy: Addr2->getType());
637 else
638 llvm_unreachable("Can only lower memmove between address spaces if they "
639 "support addrspacecast");
640 }
641 return {ResAddr1, ResAddr2};
642}
643
// Lower memmove to IR. memmove is required to correctly copy overlapping memory
// regions; therefore, it has to check the relative positions of the source and
// destination pointers and choose the copy direction accordingly.
//
// The code below is an IR rendition of this C function:
//
// void* memmove(void* dst, const void* src, size_t n) {
//   unsigned char* d = dst;
//   const unsigned char* s = src;
//   if (s < d) {
//     // copy backwards
//     while (n--) {
//       d[n] = s[n];
//     }
//   } else {
//     // copy forward
//     for (size_t i = 0; i < n; ++i) {
//       d[i] = s[i];
//     }
//   }
//   return dst;
// }
//
// If the TargetTransformInfo specifies a wider MemcpyLoopLoweringType, it is
// used for the memory accesses in the loops. Then, additional loops with
// byte-wise accesses are added for the remaining bytes.
670static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
671 Value *SrcAddr, Value *DstAddr,
672 Value *CopyLen, Align SrcAlign,
673 Align DstAlign, bool SrcIsVolatile,
674 bool DstIsVolatile,
675 const TargetTransformInfo &TTI) {
676 Type *TypeOfCopyLen = CopyLen->getType();
677 BasicBlock *OrigBB = InsertBefore->getParent();
678 Function *F = OrigBB->getParent();
679 const DataLayout &DL = F->getDataLayout();
680 LLVMContext &Ctx = OrigBB->getContext();
681 unsigned SrcAS = cast<PointerType>(Val: SrcAddr->getType())->getAddressSpace();
682 unsigned DstAS = cast<PointerType>(Val: DstAddr->getType())->getAddressSpace();
683
684 Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Context&: Ctx, Length: CopyLen, SrcAddrSpace: SrcAS, DestAddrSpace: DstAS,
685 SrcAlign, DestAlign: DstAlign);
686 TypeSize LoopOpSize = DL.getTypeStoreSize(Ty: LoopOpType);
687 Type *Int8Type = Type::getInt8Ty(C&: Ctx);
688 bool LoopOpIsInt8 = LoopOpType == Int8Type;
689
690 // If the memory accesses are wider than one byte, residual loops with
691 // i8-accesses are required to move remaining bytes.
692 bool RequiresResidual = !LoopOpIsInt8;
693
694 Type *ResidualLoopOpType = Int8Type;
695 TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(Ty: ResidualLoopOpType);
696
697 // Calculate the loop trip count and remaining bytes to copy after the loop.
698 IntegerType *ILengthType = cast<IntegerType>(Val: TypeOfCopyLen);
699 ConstantInt *CILoopOpSize = ConstantInt::get(Ty: ILengthType, V: LoopOpSize);
700 ConstantInt *CIResidualLoopOpSize =
701 ConstantInt::get(Ty: ILengthType, V: ResidualLoopOpSize);
702 ConstantInt *Zero = ConstantInt::get(Ty: ILengthType, V: 0);
703
704 const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
705 IRBuilder<> PLBuilder(InsertBefore);
706 PLBuilder.SetCurrentDebugLocation(DbgLoc);
707
708 Value *RuntimeLoopBytes = CopyLen;
709 Value *RuntimeLoopRemainder = nullptr;
710 Value *SkipResidualCondition = nullptr;
711 if (RequiresResidual) {
712 RuntimeLoopRemainder =
713 getRuntimeLoopRemainder(B&: PLBuilder, Len: CopyLen, OpSize: CILoopOpSize, OpSizeVal: LoopOpSize);
714 RuntimeLoopBytes = getRuntimeLoopUnits(B&: PLBuilder, Len: CopyLen, OpSize: CILoopOpSize,
715 OpSizeVal: LoopOpSize, RTLoopRemainder: RuntimeLoopRemainder);
716 SkipResidualCondition =
717 PLBuilder.CreateICmpEQ(LHS: RuntimeLoopRemainder, RHS: Zero, Name: "skip_residual");
718 }
719 Value *SkipMainCondition =
720 PLBuilder.CreateICmpEQ(LHS: RuntimeLoopBytes, RHS: Zero, Name: "skip_main");
721
722 // Create the a comparison of src and dst, based on which we jump to either
723 // the forward-copy part of the function (if src >= dst) or the backwards-copy
724 // part (if src < dst).
725 // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
726 // structure. Its block terminators (unconditional branches) are replaced by
727 // the appropriate conditional branches when the loop is built.
728 // If the pointers are in different address spaces, they need to be converted
729 // to a compatible one. Cases where memory ranges in the different address
730 // spaces cannot overlap are lowered as memcpy and not handled here.
731 auto [CmpSrcAddr, CmpDstAddr] =
732 tryInsertCastToCommonAddrSpace(B&: PLBuilder, Addr1: SrcAddr, Addr2: DstAddr, TTI);
733 Value *PtrCompare =
734 PLBuilder.CreateICmpULT(LHS: CmpSrcAddr, RHS: CmpDstAddr, Name: "compare_src_dst");
735 Instruction *ThenTerm, *ElseTerm;
736 SplitBlockAndInsertIfThenElse(Cond: PtrCompare, SplitBefore: InsertBefore->getIterator(),
737 ThenTerm: &ThenTerm, ElseTerm: &ElseTerm);
738
739 // If the LoopOpSize is greater than 1, each part of the function consists of
740 // four blocks:
741 // memmove_copy_backwards:
742 // skip the residual loop when 0 iterations are required
743 // memmove_bwd_residual_loop:
744 // copy the last few bytes individually so that the remaining length is
745 // a multiple of the LoopOpSize
746 // memmove_bwd_middle: skip the main loop when 0 iterations are required
747 // memmove_bwd_main_loop: the actual backwards loop BB with wide accesses
748 // memmove_copy_forward: skip the main loop when 0 iterations are required
749 // memmove_fwd_main_loop: the actual forward loop BB with wide accesses
750 // memmove_fwd_middle: skip the residual loop when 0 iterations are required
751 // memmove_fwd_residual_loop: copy the last few bytes individually
752 //
753 // The main and residual loop are switched between copying forward and
754 // backward so that the residual loop always operates on the end of the moved
755 // range. This is based on the assumption that buffers whose start is aligned
756 // with the LoopOpSize are more common than buffers whose end is.
757 //
758 // If the LoopOpSize is 1, each part of the function consists of two blocks:
759 // memmove_copy_backwards: skip the loop when 0 iterations are required
760 // memmove_bwd_main_loop: the actual backwards loop BB
761 // memmove_copy_forward: skip the loop when 0 iterations are required
762 // memmove_fwd_main_loop: the actual forward loop BB
763 BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
764 CopyBackwardsBB->setName("memmove_copy_backwards");
765 BasicBlock *CopyForwardBB = ElseTerm->getParent();
766 CopyForwardBB->setName("memmove_copy_forward");
767 BasicBlock *ExitBB = InsertBefore->getParent();
768 ExitBB->setName("memmove_done");
769
770 Align PartSrcAlign(commonAlignment(A: SrcAlign, Offset: LoopOpSize));
771 Align PartDstAlign(commonAlignment(A: DstAlign, Offset: LoopOpSize));
772
773 // Accesses in the residual loops do not share the same alignment as those in
774 // the main loops.
775 Align ResidualSrcAlign(commonAlignment(A: PartSrcAlign, Offset: ResidualLoopOpSize));
776 Align ResidualDstAlign(commonAlignment(A: PartDstAlign, Offset: ResidualLoopOpSize));
777
778 // Copying backwards.
779 {
780 BasicBlock *MainLoopBB = BasicBlock::Create(
781 Context&: F->getContext(), Name: "memmove_bwd_main_loop", Parent: F, InsertBefore: CopyForwardBB);
782
783 // The predecessor of the memmove_bwd_main_loop. Updated in the
784 // following if a residual loop is emitted first.
785 BasicBlock *PredBB = CopyBackwardsBB;
786
787 if (RequiresResidual) {
788 // backwards residual loop
789 BasicBlock *ResidualLoopBB = BasicBlock::Create(
790 Context&: F->getContext(), Name: "memmove_bwd_residual_loop", Parent: F, InsertBefore: MainLoopBB);
791 IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
792 ResidualLoopBuilder.SetCurrentDebugLocation(DbgLoc);
793 PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(Ty: ILengthType, NumReservedValues: 0);
794 Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
795 LHS: ResidualLoopPhi, RHS: CIResidualLoopOpSize, Name: "bwd_residual_index");
796 // If we used LoopOpType as GEP element type, we would iterate over the
797 // buffers in TypeStoreSize strides while copying TypeAllocSize bytes,
798 // i.e., we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore,
799 // use byte offsets computed from the TypeStoreSize.
800 Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr,
801 IdxList: ResidualIndex);
802 Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
803 Ty: ResidualLoopOpType, Ptr: LoadGEP, Align: ResidualSrcAlign, isVolatile: SrcIsVolatile,
804 Name: "element");
805 Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr,
806 IdxList: ResidualIndex);
807 ResidualLoopBuilder.CreateAlignedStore(Val: Element, Ptr: StoreGEP,
808 Align: ResidualDstAlign, isVolatile: DstIsVolatile);
809
810 // After the residual loop, go to an intermediate block.
811 BasicBlock *IntermediateBB = BasicBlock::Create(
812 Context&: F->getContext(), Name: "memmove_bwd_middle", Parent: F, InsertBefore: MainLoopBB);
813 // Later code expects a terminator in the PredBB.
814 IRBuilder<> IntermediateBuilder(IntermediateBB);
815 IntermediateBuilder.SetCurrentDebugLocation(DbgLoc);
816 IntermediateBuilder.CreateUnreachable();
817 ResidualLoopBuilder.CreateCondBr(
818 Cond: ResidualLoopBuilder.CreateICmpEQ(LHS: ResidualIndex, RHS: RuntimeLoopBytes),
819 True: IntermediateBB, False: ResidualLoopBB);
820
821 ResidualLoopPhi->addIncoming(V: ResidualIndex, BB: ResidualLoopBB);
822 ResidualLoopPhi->addIncoming(V: CopyLen, BB: CopyBackwardsBB);
823
824 // How to get to the residual:
825 CondBrInst *BrInst =
826 CondBrInst::Create(Cond: SkipResidualCondition, IfTrue: IntermediateBB,
827 IfFalse: ResidualLoopBB, InsertBefore: ThenTerm->getIterator());
828 BrInst->setDebugLoc(DbgLoc);
829 ThenTerm->eraseFromParent();
830
831 PredBB = IntermediateBB;
832 }
833
834 // main loop
835 IRBuilder<> MainLoopBuilder(MainLoopBB);
836 MainLoopBuilder.SetCurrentDebugLocation(DbgLoc);
837 PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(Ty: ILengthType, NumReservedValues: 0);
838 Value *MainIndex =
839 MainLoopBuilder.CreateSub(LHS: MainLoopPhi, RHS: CILoopOpSize, Name: "bwd_main_index");
840 Value *LoadGEP =
841 MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr, IdxList: MainIndex);
842 Value *Element = MainLoopBuilder.CreateAlignedLoad(
843 Ty: LoopOpType, Ptr: LoadGEP, Align: PartSrcAlign, isVolatile: SrcIsVolatile, Name: "element");
844 Value *StoreGEP =
845 MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: MainIndex);
846 MainLoopBuilder.CreateAlignedStore(Val: Element, Ptr: StoreGEP, Align: PartDstAlign,
847 isVolatile: DstIsVolatile);
848 MainLoopBuilder.CreateCondBr(Cond: MainLoopBuilder.CreateICmpEQ(LHS: MainIndex, RHS: Zero),
849 True: ExitBB, False: MainLoopBB);
850 MainLoopPhi->addIncoming(V: MainIndex, BB: MainLoopBB);
851 MainLoopPhi->addIncoming(V: RuntimeLoopBytes, BB: PredBB);
852
853 // How to get to the main loop:
854 Instruction *PredBBTerm = PredBB->getTerminator();
855 CondBrInst *BrInst = CondBrInst::Create(
856 Cond: SkipMainCondition, IfTrue: ExitBB, IfFalse: MainLoopBB, InsertBefore: PredBBTerm->getIterator());
857 BrInst->setDebugLoc(DbgLoc);
858 PredBBTerm->eraseFromParent();
859 }
860
861 // Copying forward.
862 // main loop
863 {
864 BasicBlock *MainLoopBB =
865 BasicBlock::Create(Context&: F->getContext(), Name: "memmove_fwd_main_loop", Parent: F, InsertBefore: ExitBB);
866 IRBuilder<> MainLoopBuilder(MainLoopBB);
867 MainLoopBuilder.SetCurrentDebugLocation(DbgLoc);
868 PHINode *MainLoopPhi =
869 MainLoopBuilder.CreatePHI(Ty: ILengthType, NumReservedValues: 0, Name: "fwd_main_index");
870 Value *LoadGEP =
871 MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr, IdxList: MainLoopPhi);
872 Value *Element = MainLoopBuilder.CreateAlignedLoad(
873 Ty: LoopOpType, Ptr: LoadGEP, Align: PartSrcAlign, isVolatile: SrcIsVolatile, Name: "element");
874 Value *StoreGEP =
875 MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: MainLoopPhi);
876 MainLoopBuilder.CreateAlignedStore(Val: Element, Ptr: StoreGEP, Align: PartDstAlign,
877 isVolatile: DstIsVolatile);
878 Value *MainIndex = MainLoopBuilder.CreateAdd(LHS: MainLoopPhi, RHS: CILoopOpSize);
879 MainLoopPhi->addIncoming(V: MainIndex, BB: MainLoopBB);
880 MainLoopPhi->addIncoming(V: Zero, BB: CopyForwardBB);
881
882 Instruction *CopyFwdBBTerm = CopyForwardBB->getTerminator();
883 BasicBlock *SuccessorBB = ExitBB;
884 if (RequiresResidual)
885 SuccessorBB =
886 BasicBlock::Create(Context&: F->getContext(), Name: "memmove_fwd_middle", Parent: F, InsertBefore: ExitBB);
887
888 // leaving or staying in the main loop
889 MainLoopBuilder.CreateCondBr(
890 Cond: MainLoopBuilder.CreateICmpEQ(LHS: MainIndex, RHS: RuntimeLoopBytes), True: SuccessorBB,
891 False: MainLoopBB);
892
893 // getting in or skipping the main loop
894 CondBrInst *BrInst =
895 CondBrInst::Create(Cond: SkipMainCondition, IfTrue: SuccessorBB, IfFalse: MainLoopBB,
896 InsertBefore: CopyFwdBBTerm->getIterator());
897 BrInst->setDebugLoc(DbgLoc);
898 CopyFwdBBTerm->eraseFromParent();
899
900 if (RequiresResidual) {
901 BasicBlock *IntermediateBB = SuccessorBB;
902 IRBuilder<> IntermediateBuilder(IntermediateBB);
903 IntermediateBuilder.SetCurrentDebugLocation(DbgLoc);
904 BasicBlock *ResidualLoopBB = BasicBlock::Create(
905 Context&: F->getContext(), Name: "memmove_fwd_residual_loop", Parent: F, InsertBefore: ExitBB);
906 IntermediateBuilder.CreateCondBr(Cond: SkipResidualCondition, True: ExitBB,
907 False: ResidualLoopBB);
908
909 // Residual loop
910 IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
911 ResidualLoopBuilder.SetCurrentDebugLocation(DbgLoc);
912 PHINode *ResidualLoopPhi =
913 ResidualLoopBuilder.CreatePHI(Ty: ILengthType, NumReservedValues: 0, Name: "fwd_residual_index");
914 Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr,
915 IdxList: ResidualLoopPhi);
916 Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
917 Ty: ResidualLoopOpType, Ptr: LoadGEP, Align: ResidualSrcAlign, isVolatile: SrcIsVolatile,
918 Name: "element");
919 Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr,
920 IdxList: ResidualLoopPhi);
921 ResidualLoopBuilder.CreateAlignedStore(Val: Element, Ptr: StoreGEP,
922 Align: ResidualDstAlign, isVolatile: DstIsVolatile);
923 Value *ResidualIndex =
924 ResidualLoopBuilder.CreateAdd(LHS: ResidualLoopPhi, RHS: CIResidualLoopOpSize);
925 ResidualLoopBuilder.CreateCondBr(
926 Cond: ResidualLoopBuilder.CreateICmpEQ(LHS: ResidualIndex, RHS: CopyLen), True: ExitBB,
927 False: ResidualLoopBB);
928 ResidualLoopPhi->addIncoming(V: ResidualIndex, BB: ResidualLoopBB);
929 ResidualLoopPhi->addIncoming(V: RuntimeLoopBytes, BB: IntermediateBB);
930 }
931 }
932}
933
934// Similar to createMemMoveLoopUnknownSize, only the trip counts are computed at
935// compile time, obsolete loops and branches are omitted, and the residual code
936// is straight-line code instead of a loop.
static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
                                       Value *SrcAddr, Value *DstAddr,
                                       ConstantInt *CopyLen, Align SrcAlign,
                                       Align DstAlign, bool SrcIsVolatile,
                                       bool DstIsVolatile,
                                       const TargetTransformInfo &TTI) {
  // No need to expand zero length moves.
  if (CopyLen->isZero())
    return;

  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  LLVMContext &Ctx = OrigBB->getContext();
  unsigned SrcAS = cast<PointerType>(Val: SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(Val: DstAddr->getType())->getAddressSpace();

  // Ask the target for the widest access type to use in the main loops; the
  // residual code uses (possibly narrower) types chosen by the target below.
  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Context&: Ctx, Length: CopyLen, SrcAddrSpace: SrcAS, DestAddrSpace: DstAS,
                                                   SrcAlign, DestAlign: DstAlign);
  TypeSize LoopOpSize = DL.getTypeStoreSize(Ty: LoopOpType);
  assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");
  Type *Int8Type = Type::getInt8Ty(C&: Ctx);

  // Calculate the loop trip count and remaining bytes to copy after the loop.
  uint64_t BytesCopiedInLoop =
      alignDown(Value: CopyLen->getZExtValue(), Align: LoopOpSize.getFixedValue());
  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;

  IntegerType *ILengthType = cast<IntegerType>(Val: TypeOfCopyLen);
  ConstantInt *Zero = ConstantInt::get(Ty: ILengthType, V: 0);
  ConstantInt *LoopBound = ConstantInt::get(Ty: ILengthType, V: BytesCopiedInLoop);
  ConstantInt *CILoopOpSize = ConstantInt::get(Ty: ILengthType, V: LoopOpSize);

  const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
  IRBuilder<> PLBuilder(InsertBefore);
  PLBuilder.SetCurrentDebugLocation(DbgLoc);

  // Branch on a runtime src/dst comparison: copy backwards if src < dst
  // (destructive forward overlap), forwards otherwise. Pointers in different
  // address spaces are first cast to a common one for the comparison.
  auto [CmpSrcAddr, CmpDstAddr] =
      tryInsertCastToCommonAddrSpace(B&: PLBuilder, Addr1: SrcAddr, Addr2: DstAddr, TTI);
  Value *PtrCompare =
      PLBuilder.CreateICmpULT(LHS: CmpSrcAddr, RHS: CmpDstAddr, Name: "compare_src_dst");
  Instruction *ThenTerm, *ElseTerm;
  SplitBlockAndInsertIfThenElse(Cond: PtrCompare, SplitBefore: InsertBefore->getIterator(),
                                ThenTerm: &ThenTerm, ElseTerm: &ElseTerm);

  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
  BasicBlock *CopyForwardBB = ElseTerm->getParent();
  BasicBlock *ExitBB = InsertBefore->getParent();
  ExitBB->setName("memmove_done");

  Align PartSrcAlign(commonAlignment(A: SrcAlign, Offset: LoopOpSize));
  Align PartDstAlign(commonAlignment(A: DstAlign, Offset: LoopOpSize));

  // Helper function to generate a load/store pair of a given type in the
  // residual. Used in the forward and backward branches. Advances BytesCopied
  // by the store size of OpTy as a side effect.
  auto GenerateResidualLdStPair = [&](Type *OpTy, IRBuilderBase &Builder,
                                      uint64_t &BytesCopied) {
    Align ResSrcAlign(commonAlignment(A: SrcAlign, Offset: BytesCopied));
    Align ResDstAlign(commonAlignment(A: DstAlign, Offset: BytesCopied));

    TypeSize OperandSize = DL.getTypeStoreSize(Ty: OpTy);

    // If we used LoopOpType as GEP element type, we would iterate over the
    // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
    // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
    // byte offsets computed from the TypeStoreSize.
    Value *SrcGEP = Builder.CreateInBoundsGEP(
        Ty: Int8Type, Ptr: SrcAddr, IdxList: ConstantInt::get(Ty: TypeOfCopyLen, V: BytesCopied));
    LoadInst *Load =
        Builder.CreateAlignedLoad(Ty: OpTy, Ptr: SrcGEP, Align: ResSrcAlign, isVolatile: SrcIsVolatile);
    Value *DstGEP = Builder.CreateInBoundsGEP(
        Ty: Int8Type, Ptr: DstAddr, IdxList: ConstantInt::get(Ty: TypeOfCopyLen, V: BytesCopied));
    Builder.CreateAlignedStore(Val: Load, Ptr: DstGEP, Align: ResDstAlign, isVolatile: DstIsVolatile);
    BytesCopied += OperandSize;
  };

  // Copying backwards: the residual covers the highest addresses, so it must
  // run before the (backwards-iterating) main loop.
  if (RemainingBytes != 0) {
    CopyBackwardsBB->setName("memmove_bwd_residual");
    uint64_t BytesCopied = BytesCopiedInLoop;

    // Residual code is required to move the remaining bytes. We need the same
    // instructions as in the forward case, only in reverse. So we generate code
    // the same way, except that we change the IRBuilder insert point for each
    // load/store pair so that each one is inserted before the previous one
    // instead of after it.
    IRBuilder<> BwdResBuilder(CopyBackwardsBB,
                              CopyBackwardsBB->getFirstNonPHIIt());
    BwdResBuilder.SetCurrentDebugLocation(DbgLoc);
    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(OpsOut&: RemainingOps, Context&: Ctx, RemainingBytes,
                                          SrcAddrSpace: SrcAS, DestAddrSpace: DstAS, SrcAlign: PartSrcAlign,
                                          DestAlign: PartDstAlign);
    for (auto *OpTy : RemainingOps) {
      // reverse the order of the emitted operations
      BwdResBuilder.SetInsertPoint(TheBB: CopyBackwardsBB,
                                   IP: CopyBackwardsBB->getFirstNonPHIIt());
      GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied);
    }
  }
  // Backwards main loop: the index PHI starts at LoopBound and is decremented
  // by the loop-op size until it reaches zero.
  if (BytesCopiedInLoop != 0) {
    BasicBlock *LoopBB = CopyBackwardsBB;
    BasicBlock *PredBB = OrigBB;
    if (RemainingBytes != 0) {
      // if we introduce residual code, it needs its separate BB
      LoopBB = CopyBackwardsBB->splitBasicBlock(
          I: CopyBackwardsBB->getTerminator(), BBName: "memmove_bwd_loop");
      PredBB = CopyBackwardsBB;
    } else {
      CopyBackwardsBB->setName("memmove_bwd_loop");
    }
    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
    LoopBuilder.SetCurrentDebugLocation(DbgLoc);
    PHINode *LoopPhi = LoopBuilder.CreatePHI(Ty: ILengthType, NumReservedValues: 0);
    Value *Index = LoopBuilder.CreateSub(LHS: LoopPhi, RHS: CILoopOpSize, Name: "bwd_index");
    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr, IdxList: Index);
    Value *Element = LoopBuilder.CreateAlignedLoad(
        Ty: LoopOpType, Ptr: LoadGEP, Align: PartSrcAlign, isVolatile: SrcIsVolatile, Name: "element");
    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: Index);
    LoopBuilder.CreateAlignedStore(Val: Element, Ptr: StoreGEP, Align: PartDstAlign,
                                   isVolatile: DstIsVolatile);

    // Replace the unconditional branch introduced by
    // SplitBlockAndInsertIfThenElse to turn LoopBB into a loop.
    Instruction *UncondTerm = LoopBB->getTerminator();
    LoopBuilder.CreateCondBr(Cond: LoopBuilder.CreateICmpEQ(LHS: Index, RHS: Zero), True: ExitBB,
                             False: LoopBB);
    UncondTerm->eraseFromParent();

    LoopPhi->addIncoming(V: Index, BB: LoopBB);
    LoopPhi->addIncoming(V: LoopBound, BB: PredBB);
  }

  // Copying forward: main loop first, then the residual for the highest
  // addresses in straight-line code.
  BasicBlock *FwdResidualBB = CopyForwardBB;
  if (BytesCopiedInLoop != 0) {
    CopyForwardBB->setName("memmove_fwd_loop");
    BasicBlock *LoopBB = CopyForwardBB;
    BasicBlock *SuccBB = ExitBB;
    if (RemainingBytes != 0) {
      // if we introduce residual code, it needs its separate BB
      SuccBB = CopyForwardBB->splitBasicBlock(I: CopyForwardBB->getTerminator(),
                                              BBName: "memmove_fwd_residual");
      FwdResidualBB = SuccBB;
    }
    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
    LoopBuilder.SetCurrentDebugLocation(DbgLoc);
    PHINode *LoopPhi = LoopBuilder.CreatePHI(Ty: ILengthType, NumReservedValues: 0, Name: "fwd_index");
    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: SrcAddr, IdxList: LoopPhi);
    Value *Element = LoopBuilder.CreateAlignedLoad(
        Ty: LoopOpType, Ptr: LoadGEP, Align: PartSrcAlign, isVolatile: SrcIsVolatile, Name: "element");
    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: LoopPhi);
    LoopBuilder.CreateAlignedStore(Val: Element, Ptr: StoreGEP, Align: PartDstAlign,
                                   isVolatile: DstIsVolatile);
    Value *Index = LoopBuilder.CreateAdd(LHS: LoopPhi, RHS: CILoopOpSize);
    LoopPhi->addIncoming(V: Index, BB: LoopBB);
    LoopPhi->addIncoming(V: Zero, BB: OrigBB);

    // Replace the unconditional branch to turn LoopBB into a loop.
    Instruction *UncondTerm = LoopBB->getTerminator();
    LoopBuilder.CreateCondBr(Cond: LoopBuilder.CreateICmpEQ(LHS: Index, RHS: LoopBound), True: SuccBB,
                             False: LoopBB);
    UncondTerm->eraseFromParent();
  }

  if (RemainingBytes != 0) {
    uint64_t BytesCopied = BytesCopiedInLoop;

    // Residual code is required to move the remaining bytes. In the forward
    // case, we emit it in the normal order.
    IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
    FwdResBuilder.SetCurrentDebugLocation(DbgLoc);
    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(OpsOut&: RemainingOps, Context&: Ctx, RemainingBytes,
                                          SrcAddrSpace: SrcAS, DestAddrSpace: DstAS, SrcAlign: PartSrcAlign,
                                          DestAlign: PartDstAlign);
    for (auto *OpTy : RemainingOps)
      GenerateResidualLdStPair(OpTy, FwdResBuilder, BytesCopied);
  }
}
1118
1119/// Create a Value of \p DstType that consists of a sequence of copies of
1120/// \p SetValue, using bitcasts and a vector splat.
1121static Value *createMemSetSplat(const DataLayout &DL, IRBuilderBase &B,
1122 Value *SetValue, Type *DstType) {
1123 TypeSize DstSize = DL.getTypeStoreSize(Ty: DstType);
1124 Type *SetValueType = SetValue->getType();
1125 TypeSize SetValueSize = DL.getTypeStoreSize(Ty: SetValueType);
1126 assert(SetValueSize == DL.getTypeAllocSize(SetValueType) &&
1127 "Store size and alloc size of SetValue's type must match");
1128 assert(SetValueSize != 0 && DstSize % SetValueSize == 0 &&
1129 "DstType size must be a multiple of SetValue size");
1130
1131 Value *Result = SetValue;
1132 if (DstSize != SetValueSize) {
1133 if (!SetValueType->isIntegerTy() && !SetValueType->isFloatingPointTy()) {
1134 // If the type cannot be put into a vector, bitcast to iN first.
1135 LLVMContext &Ctx = SetValue->getContext();
1136 Result = B.CreateBitCast(V: Result, DestTy: Type::getIntNTy(C&: Ctx, N: SetValueSize * 8),
1137 Name: "setvalue.toint");
1138 }
1139 // Form a sufficiently large vector consisting of SetValue, repeated.
1140 Result =
1141 B.CreateVectorSplat(NumElts: DstSize / SetValueSize, V: Result, Name: "setvalue.splat");
1142 }
1143
1144 // The value has the right size, but we might have to bitcast it to the right
1145 // type.
1146 Result = B.CreateBitCast(V: Result, DestTy: DstType, Name: "setvalue.splat.cast");
1147 return Result;
1148}
1149
/// Expand a memset with a compile-time-constant \p Len into a store loop using
/// a target-chosen wide access type, followed by straight-line residual stores
/// for the bytes that do not fill a whole loop iteration.
static void
createMemSetLoopKnownSize(Instruction *InsertBefore, Value *DstAddr,
                          ConstantInt *Len, Value *SetValue, Align DstAlign,
                          bool IsVolatile, const TargetTransformInfo *TTI,
                          std::optional<uint64_t> AverageTripCount) {
  // No need to expand zero length memsets.
  if (Len->isZero())
    return;

  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();

  unsigned DstAS = cast<PointerType>(Val: DstAddr->getType())->getAddressSpace();

  Type *TypeOfLen = Len->getType();
  Type *Int8Type = Type::getInt8Ty(C&: Ctx);
  assert(SetValue->getType() == Int8Type && "Can only set bytes");

  Type *LoopOpType = Int8Type;
  if (TTI) {
    // Use the same memory access type as for a memcpy with the same Dst and Src
    // alignment and address space.
    LoopOpType = TTI->getMemcpyLoopLoweringType(
        Context&: Ctx, Length: Len, SrcAddrSpace: DstAS, DestAddrSpace: DstAS, SrcAlign: DstAlign, DestAlign: DstAlign, AtomicElementSize: std::nullopt);
  }
  TypeSize LoopOpSize = DL.getTypeStoreSize(Ty: LoopOpType);
  assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");

  // Bytes covered by whole loop iterations; anything beyond this is handled by
  // the residual stores below.
  uint64_t LoopEndCount =
      alignDown(Value: Len->getZExtValue(), Align: LoopOpSize.getFixedValue());

  if (LoopEndCount != 0) {
    // Materialize the loop-invariant splat of SetValue before the loop.
    Value *SplatSetValue = nullptr;
    {
      IRBuilder<> PreLoopBuilder(InsertBefore);
      SplatSetValue =
          createMemSetSplat(DL, B&: PreLoopBuilder, SetValue, DstType: LoopOpType);
    }

    // Don't generate a residual loop, the remaining bytes are set with
    // straight-line code.
    LoopExpansionInfo LEI = insertLoopExpansion(
        InsertBefore, Len, MainLoopStep: LoopOpSize, ResidualLoopStep: 0, BBNamePrefix: "static-memset", ExpectedUnits: AverageTripCount);
    assert(LEI.MainLoopIP && LEI.MainLoopIndex &&
           "Main loop should be generated for non-zero loop count");

    // Fill MainLoopBB
    IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
    Align PartDstAlign(commonAlignment(A: DstAlign, Offset: LoopOpSize));

    // Byte-based GEP: the loop index counts bytes, not LoopOpType elements.
    Value *DstGEP =
        MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: LEI.MainLoopIndex);

    MainLoopBuilder.CreateAlignedStore(Val: SplatSetValue, Ptr: DstGEP, Align: PartDstAlign,
                                       isVolatile: IsVolatile);

    assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
           "No residual loop was requested");
  }

  uint64_t BytesSet = LoopEndCount;
  uint64_t RemainingBytes = Len->getZExtValue() - BytesSet;
  if (RemainingBytes == 0)
    return;

  IRBuilder<> RBuilder(InsertBefore);

  // Without TTI, LoopOpType is i8, the loop covers every byte, and we return
  // above; so reaching the residual implies TTI is present.
  assert(TTI && "there cannot be a residual loop without TTI");
  SmallVector<Type *, 5> RemainingOps;
  TTI->getMemcpyLoopResidualLoweringType(OpsOut&: RemainingOps, Context&: Ctx, RemainingBytes,
                                         SrcAddrSpace: DstAS, DestAddrSpace: DstAS, SrcAlign: DstAlign, DestAlign: DstAlign,
                                         AtomicCpySize: std::nullopt);

  Type *PreviousOpTy = nullptr;
  Value *SplatSetValue = nullptr;
  for (auto *OpTy : RemainingOps) {
    TypeSize OperandSize = DL.getTypeStoreSize(Ty: OpTy);
    assert(OperandSize.isFixed() &&
           "Operand types cannot be scalable vector types");
    Align PartDstAlign(commonAlignment(A: DstAlign, Offset: BytesSet));

    // Avoid recomputing the splat SetValue if it's the same as for the last
    // iteration.
    if (OpTy != PreviousOpTy)
      SplatSetValue = createMemSetSplat(DL, B&: RBuilder, SetValue, DstType: OpTy);

    Value *DstGEP = RBuilder.CreateInBoundsGEP(
        Ty: Int8Type, Ptr: DstAddr, IdxList: ConstantInt::get(Ty: TypeOfLen, V: BytesSet));
    RBuilder.CreateAlignedStore(Val: SplatSetValue, Ptr: DstGEP, Align: PartDstAlign,
                                isVolatile: IsVolatile);
    BytesSet += OperandSize;
    PreviousOpTy = OpTy;
  }
  assert(BytesSet == Len->getZExtValue() &&
         "Bytes set should match size in the call!");
}
1248
/// Expand a memset whose length is only known at runtime into a main store
/// loop using a target-chosen wide access type plus an i8 residual store loop
/// for the trailing bytes.
static void
createMemSetLoopUnknownSize(Instruction *InsertBefore, Value *DstAddr,
                            Value *Len, Value *SetValue, Align DstAlign,
                            bool IsVolatile, const TargetTransformInfo *TTI,
                            std::optional<uint64_t> AverageTripCount) {
  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();

  unsigned DstAS = cast<PointerType>(Val: DstAddr->getType())->getAddressSpace();

  Type *Int8Type = Type::getInt8Ty(C&: Ctx);
  assert(SetValue->getType() == Int8Type && "Can only set bytes");

  // Use the same access type the target would pick for an equivalent memcpy.
  Type *LoopOpType = Int8Type;
  if (TTI) {
    LoopOpType = TTI->getMemcpyLoopLoweringType(
        Context&: Ctx, Length: Len, SrcAddrSpace: DstAS, DestAddrSpace: DstAS, SrcAlign: DstAlign, DestAlign: DstAlign, AtomicElementSize: std::nullopt);
  }
  TypeSize LoopOpSize = DL.getTypeStoreSize(Ty: LoopOpType);
  assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");

  // The residual loop stores one byte per iteration.
  Type *ResidualLoopOpType = Int8Type;
  TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(Ty: ResidualLoopOpType);

  // Materialize the loop-invariant splat of SetValue before the loop.
  Value *SplatSetValue = SetValue;
  {
    IRBuilder<> PreLoopBuilder(InsertBefore);
    SplatSetValue = createMemSetSplat(DL, B&: PreLoopBuilder, SetValue, DstType: LoopOpType);
  }

  LoopExpansionInfo LEI =
      insertLoopExpansion(InsertBefore, Len, MainLoopStep: LoopOpSize, ResidualLoopStep: ResidualLoopOpSize,
                          BBNamePrefix: "dynamic-memset", ExpectedUnits: AverageTripCount);
  assert(LEI.MainLoopIP && LEI.MainLoopIndex &&
         "Main loop should be generated for unknown size memset");

  // Fill MainLoopBB
  IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
  Align PartDstAlign(commonAlignment(A: DstAlign, Offset: LoopOpSize));

  // Byte-based GEP: the loop index counts bytes, not LoopOpType elements.
  Value *DstGEP =
      MainLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr, IdxList: LEI.MainLoopIndex);
  MainLoopBuilder.CreateAlignedStore(Val: SplatSetValue, Ptr: DstGEP, Align: PartDstAlign,
                                     isVolatile: IsVolatile);

  // Fill ResidualLoopBB
  if (!LEI.ResidualLoopIP)
    return;

  Align ResDstAlign(commonAlignment(A: PartDstAlign, Offset: ResidualLoopOpSize));

  IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);

  // The residual stores the original (un-splatted) byte value.
  Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Ty: Int8Type, Ptr: DstAddr,
                                                      IdxList: LEI.ResidualLoopIndex);
  ResLoopBuilder.CreateAlignedStore(Val: SetValue, Ptr: ResDstGEP, Align: ResDstAlign,
                                    isVolatile: IsVolatile);
}
1309
/// Expand a memset.pattern intrinsic into a store loop. The pattern element
/// may be wider than a byte; several elements are only combined into a wider
/// (vector) store when the size constraints documented below guarantee that
/// no padding bytes would be overwritten.
static void createMemSetPatternLoop(Instruction *InsertBefore, Value *DstAddr,
                                    Value *Len, Value *SetValue, Align DstAlign,
                                    bool IsVolatile,
                                    const TargetTransformInfo *TTI,
                                    std::optional<uint64_t> AverageTripCount) {
  // No need to expand zero length memset.pattern.
  if (auto *CLen = dyn_cast<ConstantInt>(Val: Len))
    if (CLen->isZero())
      return;

  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();

  unsigned DstAS = cast<PointerType>(Val: DstAddr->getType())->getAddressSpace();

  Type *PreferredLoopOpType = SetValue->getType();
  if (TTI) {
    PreferredLoopOpType = TTI->getMemcpyLoopLoweringType(
        Context&: Ctx, Length: Len, SrcAddrSpace: DstAS, DestAddrSpace: DstAS, SrcAlign: DstAlign, DestAlign: DstAlign, AtomicElementSize: std::nullopt);
  }
  TypeSize PreferredLoopOpStoreSize = DL.getTypeStoreSize(Ty: PreferredLoopOpType);
  assert(PreferredLoopOpStoreSize.isFixed() &&
         "PreferredLoopOpType cannot be a scalable vector type");

  TypeSize PreferredLoopOpAllocSize = DL.getTypeAllocSize(Ty: PreferredLoopOpType);

  Type *OriginalType = SetValue->getType();
  TypeSize OriginalTypeStoreSize = DL.getTypeStoreSize(Ty: OriginalType);
  TypeSize OriginalTypeAllocSize = DL.getTypeAllocSize(Ty: OriginalType);

  // The semantics of memset.pattern restrict what vectorization we can do: It
  // has to behave like a series of stores of the SetValue type at offsets that
  // are spaced by the alloc size of the SetValue type. If store and alloc size
  // of the SetValue type don't match, the bytes that aren't covered by these
  // stores must not be overwritten. We therefore only vectorize memset.pattern
  // if the store and alloc sizes of the SetValue are equal and properly divide
  // the size of the preferred lowering type (and only if store and alloc size
  // for the preferred lowering type are also equal).

  // Defaults: store one pattern element per iteration, no residual loop.
  unsigned MainLoopStep = 1;
  Type *MainLoopType = OriginalType;
  TypeSize MainLoopAllocSize = OriginalTypeAllocSize;
  unsigned ResidualLoopStep = 0;
  Type *ResidualLoopType = nullptr;

  if (PreferredLoopOpStoreSize == PreferredLoopOpAllocSize &&
      OriginalTypeStoreSize == OriginalTypeAllocSize &&
      OriginalTypeStoreSize < PreferredLoopOpStoreSize &&
      PreferredLoopOpStoreSize % OriginalTypeStoreSize == 0) {
    // Multiple instances of SetValue can be combined to reach the preferred
    // loop op size.
    MainLoopStep = PreferredLoopOpStoreSize / OriginalTypeStoreSize;
    MainLoopType = PreferredLoopOpType;
    MainLoopAllocSize = PreferredLoopOpStoreSize;

    // A residual loop of single-element stores handles leftover elements.
    ResidualLoopStep = 1;
    ResidualLoopType = OriginalType;
  }

  // The step arguments here are in terms of the alloc size of the SetValue, not
  // in terms of bytes.
  LoopExpansionInfo LEI =
      insertLoopExpansion(InsertBefore, Len, MainLoopStep, ResidualLoopStep,
                          BBNamePrefix: "memset.pattern", ExpectedUnits: AverageTripCount);

  Align PartDstAlign(commonAlignment(A: DstAlign, Offset: MainLoopAllocSize));

  if (LEI.MainLoopIP) {
    // Create the loop-invariant splat value before the loop.
    IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());
    Value *MainLoopSetValue = SetValue;
    if (MainLoopType != OriginalType)
      MainLoopSetValue =
          createMemSetSplat(DL, B&: PreLoopBuilder, SetValue, DstType: MainLoopType);

    // Fill MainLoopBB. Note: the GEP is element-typed here, so the loop index
    // counts MainLoopType elements rather than bytes.
    IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
    Value *DstGEP = MainLoopBuilder.CreateInBoundsGEP(Ty: MainLoopType, Ptr: DstAddr,
                                                      IdxList: LEI.MainLoopIndex);
    MainLoopBuilder.CreateAlignedStore(Val: MainLoopSetValue, Ptr: DstGEP, Align: PartDstAlign,
                                       isVolatile: IsVolatile);
  }

  if (!LEI.ResidualLoopIP)
    return;

  // Fill ResidualLoopBB: single-element stores of the original pattern value.
  Align ResDstAlign(
      commonAlignment(A: PartDstAlign, Offset: DL.getTypeAllocSize(Ty: ResidualLoopType)));

  IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
  Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Ty: ResidualLoopType, Ptr: DstAddr,
                                                      IdxList: LEI.ResidualLoopIndex);
  ResLoopBuilder.CreateAlignedStore(Val: SetValue, Ptr: ResDstGEP, Align: ResDstAlign,
                                    isVolatile: IsVolatile);
}
1408
1409template <typename T>
1410static bool canOverlap(MemTransferBase<T> *Memcpy, ScalarEvolution *SE) {
1411 if (SE) {
1412 const SCEV *SrcSCEV = SE->getSCEV(V: Memcpy->getRawSource());
1413 const SCEV *DestSCEV = SE->getSCEV(V: Memcpy->getRawDest());
1414 if (SE->isKnownPredicateAt(Pred: CmpInst::ICMP_NE, LHS: SrcSCEV, RHS: DestSCEV, CtxI: Memcpy))
1415 return false;
1416 }
1417 return true;
1418}
1419
1420void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
1421 const TargetTransformInfo &TTI,
1422 ScalarEvolution *SE) {
1423 bool CanOverlap = canOverlap(Memcpy, SE);
1424 auto TripCount = getAverageMemOpLoopTripCount(I: *Memcpy);
1425 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: Memcpy->getLength())) {
1426 createMemCpyLoopKnownSize(
1427 /*InsertBefore=*/Memcpy,
1428 /*SrcAddr=*/Memcpy->getRawSource(),
1429 /*DstAddr=*/Memcpy->getRawDest(),
1430 /*CopyLen=*/CI,
1431 /*SrcAlign=*/Memcpy->getSourceAlign().valueOrOne(),
1432 /*DstAlign=*/Memcpy->getDestAlign().valueOrOne(),
1433 /*SrcIsVolatile=*/Memcpy->isVolatile(),
1434 /*DstIsVolatile=*/Memcpy->isVolatile(),
1435 /*CanOverlap=*/CanOverlap,
1436 /*TTI=*/TTI,
1437 /*AtomicElementSize=*/std::nullopt,
1438 /*AverageTripCount=*/TripCount);
1439 } else {
1440 createMemCpyLoopUnknownSize(
1441 /*InsertBefore=*/Memcpy,
1442 /*SrcAddr=*/Memcpy->getRawSource(),
1443 /*DstAddr=*/Memcpy->getRawDest(),
1444 /*CopyLen=*/Memcpy->getLength(),
1445 /*SrcAlign=*/Memcpy->getSourceAlign().valueOrOne(),
1446 /*DstAlign=*/Memcpy->getDestAlign().valueOrOne(),
1447 /*SrcIsVolatile=*/Memcpy->isVolatile(),
1448 /*DstIsVolatile=*/Memcpy->isVolatile(),
1449 /*CanOverlap=*/CanOverlap,
1450 /*TTI=*/TTI,
1451 /*AtomicElementSize=*/std::nullopt,
1452 /*AverageTripCount=*/TripCount);
1453 }
1454}
1455
1456bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
1457 const TargetTransformInfo &TTI) {
1458 Value *CopyLen = Memmove->getLength();
1459 Value *SrcAddr = Memmove->getRawSource();
1460 Value *DstAddr = Memmove->getRawDest();
1461 Align SrcAlign = Memmove->getSourceAlign().valueOrOne();
1462 Align DstAlign = Memmove->getDestAlign().valueOrOne();
1463 bool SrcIsVolatile = Memmove->isVolatile();
1464 bool DstIsVolatile = SrcIsVolatile;
1465 IRBuilder<> CastBuilder(Memmove);
1466 CastBuilder.SetCurrentDebugLocation(Memmove->getStableDebugLoc());
1467
1468 unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
1469 unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
1470 if (SrcAS != DstAS) {
1471 if (!TTI.addrspacesMayAlias(AS0: SrcAS, AS1: DstAS)) {
1472 // We may not be able to emit a pointer comparison, but we don't have
1473 // to. Expand as memcpy.
1474 auto AverageTripCount = getAverageMemOpLoopTripCount(I: *Memmove);
1475 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: CopyLen)) {
1476 createMemCpyLoopKnownSize(
1477 /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen: CI, SrcAlign, DstAlign,
1478 SrcIsVolatile, DstIsVolatile,
1479 /*CanOverlap=*/false, TTI, AtomicElementSize: std::nullopt, AverageTripCount);
1480 } else {
1481 createMemCpyLoopUnknownSize(
1482 /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign,
1483 DstAlign, SrcIsVolatile, DstIsVolatile,
1484 /*CanOverlap=*/false, TTI, AtomicElementSize: std::nullopt, AverageTripCount);
1485 }
1486
1487 return true;
1488 }
1489
1490 if (!(TTI.isValidAddrSpaceCast(FromAS: DstAS, ToAS: SrcAS) ||
1491 TTI.isValidAddrSpaceCast(FromAS: SrcAS, ToAS: DstAS))) {
1492 // We don't know generically if it's legal to introduce an
1493 // addrspacecast. We need to know either if it's legal to insert an
1494 // addrspacecast, or if the address spaces cannot alias.
1495 LLVM_DEBUG(
1496 dbgs() << "Do not know how to expand memmove between different "
1497 "address spaces\n");
1498 return false;
1499 }
1500 }
1501
1502 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: CopyLen)) {
1503 createMemMoveLoopKnownSize(
1504 /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen: CI, SrcAlign, DstAlign,
1505 SrcIsVolatile, DstIsVolatile, TTI);
1506 } else {
1507 createMemMoveLoopUnknownSize(
1508 /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
1509 SrcIsVolatile, DstIsVolatile, TTI);
1510 }
1511 return true;
1512}
1513
1514void llvm::expandMemSetAsLoop(MemSetInst *Memset,
1515 const TargetTransformInfo *TTI) {
1516 auto AverageTripCount = getAverageMemOpLoopTripCount(I: *Memset);
1517 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: Memset->getLength())) {
1518 createMemSetLoopKnownSize(
1519 /*InsertBefore=*/Memset,
1520 /*DstAddr=*/Memset->getRawDest(),
1521 /*Len=*/CI,
1522 /*SetValue=*/Memset->getValue(),
1523 /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
1524 /*IsVolatile=*/Memset->isVolatile(),
1525 /*TTI=*/TTI,
1526 /*AverageTripCount=*/AverageTripCount);
1527 } else {
1528 createMemSetLoopUnknownSize(
1529 /*InsertBefore=*/Memset,
1530 /*DstAddr=*/Memset->getRawDest(),
1531 /*Len=*/Memset->getLength(),
1532 /*SetValue=*/Memset->getValue(),
1533 /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
1534 /*IsVolatile=*/Memset->isVolatile(),
1535 /*TTI=*/TTI,
1536 /*AverageTripCount=*/AverageTripCount);
1537 }
1538}
1539
1540void llvm::expandMemSetAsLoop(MemSetInst *MemSet,
1541 const TargetTransformInfo &TTI) {
1542 expandMemSetAsLoop(Memset: MemSet, TTI: &TTI);
1543}
1544
1545void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset,
1546 const TargetTransformInfo *TTI) {
1547 createMemSetPatternLoop(
1548 /*InsertBefore=*/Memset,
1549 /*DstAddr=*/Memset->getRawDest(),
1550 /*Len=*/Memset->getLength(),
1551 /*SetValue=*/Memset->getValue(),
1552 /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
1553 /*IsVolatile=*/Memset->isVolatile(),
1554 /*TTI=*/TTI,
1555 /*AverageTripCount=*/getAverageMemOpLoopTripCount(I: *Memset));
1556}
1557
1558void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *MemSet,
1559 const TargetTransformInfo &TTI) {
1560 expandMemSetPatternAsLoop(Memset: MemSet, TTI: &TTI);
1561}
1562
1563void llvm::expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemcpy,
1564 const TargetTransformInfo &TTI,
1565 ScalarEvolution *SE) {
1566 assert(AtomicMemcpy->isAtomic());
1567 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: AtomicMemcpy->getLength())) {
1568 createMemCpyLoopKnownSize(
1569 /*InsertBefore=*/AtomicMemcpy,
1570 /*SrcAddr=*/AtomicMemcpy->getRawSource(),
1571 /*DstAddr=*/AtomicMemcpy->getRawDest(),
1572 /*CopyLen=*/CI,
1573 /*SrcAlign=*/AtomicMemcpy->getSourceAlign().valueOrOne(),
1574 /*DstAlign=*/AtomicMemcpy->getDestAlign().valueOrOne(),
1575 /*SrcIsVolatile=*/AtomicMemcpy->isVolatile(),
1576 /*DstIsVolatile=*/AtomicMemcpy->isVolatile(),
1577 /*CanOverlap=*/false, // SrcAddr & DstAddr may not overlap by spec.
1578 /*TTI=*/TTI,
1579 /*AtomicElementSize=*/AtomicMemcpy->getElementSizeInBytes());
1580 } else {
1581 createMemCpyLoopUnknownSize(
1582 /*InsertBefore=*/AtomicMemcpy,
1583 /*SrcAddr=*/AtomicMemcpy->getRawSource(),
1584 /*DstAddr=*/AtomicMemcpy->getRawDest(),
1585 /*CopyLen=*/AtomicMemcpy->getLength(),
1586 /*SrcAlign=*/AtomicMemcpy->getSourceAlign().valueOrOne(),
1587 /*DstAlign=*/AtomicMemcpy->getDestAlign().valueOrOne(),
1588 /*SrcIsVolatile=*/AtomicMemcpy->isVolatile(),
1589 /*DstIsVolatile=*/AtomicMemcpy->isVolatile(),
1590 /*CanOverlap=*/false, // SrcAddr & DstAddr may not overlap by spec.
1591 /*TargetTransformInfo=*/TTI,
1592 /*AtomicElementSize=*/AtomicMemcpy->getElementSizeInBytes());
1593 }
1594}
1595