//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <optional>

#define DEBUG_TYPE "lower-mem-intrinsics"

using namespace llvm;

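// Expand a memcpy whose length is a compile-time constant: copy as many bytes
// as possible in a loop of wide load/store operations and emit straight-line
// copies for any remaining bytes.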
void llvm::createMemCpyLoopKnownSize(
    Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr,
    ConstantInt *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile,
    bool DstIsVolatile, bool CanOverlap, const TargetTransformInfo &TTI,
    std::optional<uint32_t> AtomicElementSize) {
  // No need to expand zero length copies.
  if (CopyLen->isZero())
    return;

  BasicBlock *PreLoopBB = InsertBefore->getParent();
  BasicBlock *PostLoopBB = nullptr;
  Function *ParentFunc = PreLoopBB->getParent();
  LLVMContext &Ctx = PreLoopBB->getContext();
  const DataLayout &DL = ParentFunc->getDataLayout();
  MDBuilder MDB(Ctx);
  MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
  StringRef Name = "MemCopyAliasScope";
  MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);

  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *TypeOfCopyLen = CopyLen->getType();
  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
      Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
  assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
         "Atomic memcpy lowering is not supported for vector operand type");

  Type *Int8Type = Type::getInt8Ty(Ctx);
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
         "Atomic memcpy lowering is not supported for selected operand size");

  uint64_t LoopEndCount = alignDown(CopyLen->getZExtValue(), LoopOpSize);

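  // Emit the wide-access copy loop only if at least one full LoopOpType access
  // fits into CopyLen; any remaining bytes are handled by the residual code
  // further below.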
  if (LoopEndCount != 0) {
    // Split
    PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split");
    BasicBlock *LoopBB =
        BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB);
    PreLoopBB->getTerminator()->setSuccessor(0, LoopBB);

    IRBuilder<> PLBuilder(PreLoopBB->getTerminator());

    Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
    Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));

    IRBuilder<> LoopBuilder(LoopBB);
    PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
    LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
    // Loop Body

    // If we used LoopOpType as GEP element type, we would iterate over the
    // buffers in TypeAllocSize strides while copying TypeStoreSize bytes, i.e.,
    // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
    // byte offsets computed from the TypeStoreSize.
    Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopIndex);
    LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
                                                   PartSrcAlign, SrcIsVolatile);
    if (!CanOverlap) {
      // Set alias scope for loads.
      Load->setMetadata(LLVMContext::MD_alias_scope,
                        MDNode::get(Ctx, NewScope));
    }
    Value *DstGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopIndex);
    StoreInst *Store = LoopBuilder.CreateAlignedStore(
        Load, DstGEP, PartDstAlign, DstIsVolatile);
    if (!CanOverlap) {
      // Indicate that stores don't overlap loads.
      Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
    }
    if (AtomicElementSize) {
      Load->setAtomic(AtomicOrdering::Unordered);
      Store->setAtomic(AtomicOrdering::Unordered);
    }
    Value *NewIndex = LoopBuilder.CreateAdd(
        LoopIndex, ConstantInt::get(TypeOfCopyLen, LoopOpSize));
    LoopIndex->addIncoming(NewIndex, LoopBB);

    // Create the loop branch condition.
    Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount);
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI),
                             LoopBB, PostLoopBB);
  }

  uint64_t BytesCopied = LoopEndCount;
  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
  if (RemainingBytes) {
    BasicBlock::iterator InsertIt = PostLoopBB ? PostLoopBB->getFirstNonPHIIt()
                                               : InsertBefore->getIterator();
    IRBuilder<> RBuilder(InsertIt->getParent(), InsertIt);

    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                          SrcAS, DstAS, SrcAlign, DstAlign,
                                          AtomicElementSize);

    for (auto *OpTy : RemainingOps) {
      Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
      Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));

      unsigned OperandSize = DL.getTypeStoreSize(OpTy);
      assert(
          (!AtomicElementSize || OperandSize % *AtomicElementSize == 0) &&
          "Atomic memcpy lowering is not supported for selected operand size");

      Value *SrcGEP = RBuilder.CreateInBoundsGEP(
          Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
      LoadInst *Load =
          RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
      if (!CanOverlap) {
        // Set alias scope for loads.
        Load->setMetadata(LLVMContext::MD_alias_scope,
                          MDNode::get(Ctx, NewScope));
      }
      Value *DstGEP = RBuilder.CreateInBoundsGEP(
          Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
      StoreInst *Store = RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign,
                                                     DstIsVolatile);
      if (!CanOverlap) {
        // Indicate that stores don't overlap loads.
        Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
      }
      if (AtomicElementSize) {
        Load->setAtomic(AtomicOrdering::Unordered);
        Store->setAtomic(AtomicOrdering::Unordered);
      }
      BytesCopied += OperandSize;
    }
  }
  assert(BytesCopied == CopyLen->getZExtValue() &&
         "Bytes copied should match size in the call!");
}

// \returns \p Len urem \p OpSize, checking for optimization opportunities.
static Value *getRuntimeLoopRemainder(const DataLayout &DL, IRBuilderBase &B,
                                      Value *Len, Value *OpSize,
                                      unsigned OpSizeVal) {
  // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem.
  if (isPowerOf2_32(OpSizeVal))
    return B.CreateAnd(Len, OpSizeVal - 1);
  return B.CreateURem(Len, OpSize);
}

// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization
// opportunities.
// If RTLoopRemainder is provided, it must be the result of
// getRuntimeLoopRemainder() with the same arguments.
static Value *getRuntimeLoopBytes(const DataLayout &DL, IRBuilderBase &B,
                                  Value *Len, Value *OpSize, unsigned OpSizeVal,
                                  Value *RTLoopRemainder = nullptr) {
  if (!RTLoopRemainder)
    RTLoopRemainder = getRuntimeLoopRemainder(DL, B, Len, OpSize, OpSizeVal);
  return B.CreateSub(Len, RTLoopRemainder);
}

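// Expand a memcpy whose length is only known at runtime: emit a main loop of
// wide load/store accesses and, unless the loop operand is a single byte (or a
// single atomic element), a residual loop that copies the bytes left over
// after the main loop.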
void llvm::createMemCpyLoopUnknownSize(
    Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
    Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
    bool CanOverlap, const TargetTransformInfo &TTI,
    std::optional<uint32_t> AtomicElementSize) {
  BasicBlock *PreLoopBB = InsertBefore->getParent();
  BasicBlock *PostLoopBB =
      PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");

  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();
  MDBuilder MDB(Ctx);
  MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
  StringRef Name = "MemCopyAliasScope";
  MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);

  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
      Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
  assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
         "Atomic memcpy lowering is not supported for vector operand type");
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
         "Atomic memcpy lowering is not supported for selected operand size");

  IRBuilder<> PLBuilder(PreLoopBB->getTerminator());

  // Calculate the loop trip count, and remaining bytes to copy after the loop.
  Type *CopyLenType = CopyLen->getType();
  IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
  assert(ILengthType &&
         "expected size argument to memcpy to be an integer type!");
  Type *Int8Type = Type::getInt8Ty(Ctx);
  bool LoopOpIsInt8 = LoopOpType == Int8Type;
  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);

  Value *RuntimeLoopBytes = CopyLen;
  Value *RuntimeResidualBytes = nullptr;
  if (!LoopOpIsInt8) {
    RuntimeResidualBytes = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
                                                   CILoopOpSize, LoopOpSize);
    RuntimeLoopBytes = getRuntimeLoopBytes(DL, PLBuilder, CopyLen, CILoopOpSize,
                                           LoopOpSize, RuntimeResidualBytes);
  }

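  // Emit the main copy loop with wide accesses. The pre-loop branch created
  // further below skips it entirely when RuntimeLoopBytes is zero.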
  BasicBlock *LoopBB =
      BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
  IRBuilder<> LoopBuilder(LoopBB);

  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
  LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);

  // If we used LoopOpType as GEP element type, we would iterate over the
  // buffers in TypeAllocSize strides while copying TypeStoreSize bytes, i.e.,
  // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use byte
  // offsets computed from the TypeStoreSize.
  Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopIndex);
  LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
                                                 PartSrcAlign, SrcIsVolatile);
  if (!CanOverlap) {
    // Set alias scope for loads.
    Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope));
  }
  Value *DstGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopIndex);
  StoreInst *Store =
      LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
  if (!CanOverlap) {
    // Indicate that stores don't overlap loads.
    Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
  }
  if (AtomicElementSize) {
    Load->setAtomic(AtomicOrdering::Unordered);
    Store->setAtomic(AtomicOrdering::Unordered);
  }
  Value *NewIndex = LoopBuilder.CreateAdd(
      LoopIndex, ConstantInt::get(CopyLenType, LoopOpSize));
  LoopIndex->addIncoming(NewIndex, LoopBB);

  bool RequiresResidual =
      !LoopOpIsInt8 && !(AtomicElementSize && LoopOpSize == AtomicElementSize);
  if (RequiresResidual) {
    Type *ResLoopOpType = AtomicElementSize
                              ? Type::getIntNTy(Ctx, *AtomicElementSize * 8)
                              : Int8Type;
    unsigned ResLoopOpSize = DL.getTypeStoreSize(ResLoopOpType);
    assert(ResLoopOpSize == (AtomicElementSize ? *AtomicElementSize : 1) &&
           "Store size is expected to match type size");

    Align ResSrcAlign(commonAlignment(PartSrcAlign, ResLoopOpSize));
    Align ResDstAlign(commonAlignment(PartDstAlign, ResLoopOpSize));

    // Loop body for the residual copy.
    BasicBlock *ResLoopBB = BasicBlock::Create(
        Ctx, "loop-memcpy-residual", PreLoopBB->getParent(), PostLoopBB);
    // Residual loop header.
    BasicBlock *ResHeaderBB = BasicBlock::Create(
        Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);

    // Need to update the pre-loop basic block to branch to the correct place:
    // branch to the main loop if the count is non-zero, to the residual loop
    // if the copy size is smaller than one iteration of the main loop but
    // non-zero, and to after the residual loop if the memcpy size is zero.
    ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopBytes, Zero),
                           LoopBB, ResHeaderBB);
    PreLoopBB->getTerminator()->eraseFromParent();

    LoopBuilder.CreateCondBr(
        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopBytes), LoopBB,
        ResHeaderBB);

    // Determine if we need to branch to the residual loop or bypass it.
    IRBuilder<> RHBuilder(ResHeaderBB);
    RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidualBytes, Zero),
                           ResLoopBB, PostLoopBB);

    // Copy the residual with single byte load/store loop.
    IRBuilder<> ResBuilder(ResLoopBB);
    PHINode *ResidualIndex =
        ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
    ResidualIndex->addIncoming(Zero, ResHeaderBB);

    Value *FullOffset = ResBuilder.CreateAdd(RuntimeLoopBytes, ResidualIndex);
    Value *SrcGEP = ResBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, FullOffset);
    LoadInst *Load = ResBuilder.CreateAlignedLoad(ResLoopOpType, SrcGEP,
                                                  ResSrcAlign, SrcIsVolatile);
    if (!CanOverlap) {
      // Set alias scope for loads.
      Load->setMetadata(LLVMContext::MD_alias_scope,
                        MDNode::get(Ctx, NewScope));
    }
    Value *DstGEP = ResBuilder.CreateInBoundsGEP(Int8Type, DstAddr, FullOffset);
    StoreInst *Store =
        ResBuilder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
    if (!CanOverlap) {
      // Indicate that stores don't overlap loads.
      Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
    }
    if (AtomicElementSize) {
      Load->setAtomic(AtomicOrdering::Unordered);
      Store->setAtomic(AtomicOrdering::Unordered);
    }
    Value *ResNewIndex = ResBuilder.CreateAdd(
        ResidualIndex, ConstantInt::get(CopyLenType, ResLoopOpSize));
    ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);

    // Create the loop branch condition.
    ResBuilder.CreateCondBr(
        ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidualBytes), ResLoopBB,
        PostLoopBB);
  } else {
    // In this case the loop operand type was a byte, and there is no need for
    // a residual loop to copy the remaining memory after the main loop.
    // We do however need to patch up the control flow by creating the
    // terminators for the preloop block and the memcpy loop.
    ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopBytes, Zero),
                           LoopBB, PostLoopBB);
    PreLoopBB->getTerminator()->eraseFromParent();
    LoopBuilder.CreateCondBr(
        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopBytes), LoopBB,
        PostLoopBB);
  }
}

// If \p Addr1 and \p Addr2 are pointers to different address spaces, create an
// addrspacecast to obtain a pair of pointers in the same address space. The
// caller needs to ensure that the addrspacecast is possible.
// No-op if the pointers are in the same address space.
static std::pair<Value *, Value *>
tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2,
                               const TargetTransformInfo &TTI) {
  Value *ResAddr1 = Addr1;
  Value *ResAddr2 = Addr2;

  unsigned AS1 = cast<PointerType>(Addr1->getType())->getAddressSpace();
  unsigned AS2 = cast<PointerType>(Addr2->getType())->getAddressSpace();
  if (AS1 != AS2) {
    if (TTI.isValidAddrSpaceCast(AS2, AS1))
      ResAddr2 = B.CreateAddrSpaceCast(Addr2, Addr1->getType());
    else if (TTI.isValidAddrSpaceCast(AS1, AS2))
      ResAddr1 = B.CreateAddrSpaceCast(Addr1, Addr2->getType());
    else
      llvm_unreachable("Can only lower memmove between address spaces if they "
                       "support addrspacecast");
  }
  return {ResAddr1, ResAddr2};
}

// Lower memmove to IR. memmove is required to correctly copy overlapping
// memory regions; therefore, it has to check the relative positions of the
// source and destination pointers and choose the copy direction accordingly.
//
// The code below is an IR rendition of this C function:
//
// void* memmove(void* dst, const void* src, size_t n) {
//   unsigned char* d = dst;
//   const unsigned char* s = src;
//   if (s < d) {
//     // copy backwards
//     while (n--) {
//       d[n] = s[n];
//     }
//   } else {
//     // copy forward
//     for (size_t i = 0; i < n; ++i) {
//       d[i] = s[i];
//     }
//   }
//   return dst;
// }
//
// If the TargetTransformInfo specifies a wider MemcpyLoopLoweringType, it is
// used for the memory accesses in the loops. Then, additional loops with
// byte-wise accesses are added for the remaining bytes.
static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
                                         Value *SrcAddr, Value *DstAddr,
                                         Value *CopyLen, Align SrcAlign,
                                         Align DstAlign, bool SrcIsVolatile,
                                         bool DstIsVolatile,
                                         const TargetTransformInfo &TTI) {
  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  LLVMContext &Ctx = OrigBB->getContext();
  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
                                                   SrcAlign, DstAlign);
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  Type *Int8Type = Type::getInt8Ty(Ctx);
  bool LoopOpIsInt8 = LoopOpType == Int8Type;

  // If the memory accesses are wider than one byte, residual loops with
  // i8-accesses are required to move remaining bytes.
  bool RequiresResidual = !LoopOpIsInt8;

  Type *ResidualLoopOpType = Int8Type;
  unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);

  // Calculate the loop trip count and remaining bytes to copy after the loop.
  IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
  ConstantInt *CIResidualLoopOpSize =
      ConstantInt::get(ILengthType, ResidualLoopOpSize);
  ConstantInt *Zero = ConstantInt::get(ILengthType, 0);

  IRBuilder<> PLBuilder(InsertBefore);

  Value *RuntimeLoopBytes = CopyLen;
  Value *RuntimeLoopRemainder = nullptr;
  Value *SkipResidualCondition = nullptr;
  if (RequiresResidual) {
    RuntimeLoopRemainder = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
                                                   CILoopOpSize, LoopOpSize);
    RuntimeLoopBytes = getRuntimeLoopBytes(DL, PLBuilder, CopyLen, CILoopOpSize,
                                           LoopOpSize, RuntimeLoopRemainder);
    SkipResidualCondition =
        PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual");
  }
  Value *SkipMainCondition =
      PLBuilder.CreateICmpEQ(RuntimeLoopBytes, Zero, "skip_main");

  // Create a comparison of src and dst, based on which we jump to either the
  // forward-copy part of the function (if src >= dst) or the backwards-copy
  // part (if src < dst).
  // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
  // structure. Its block terminators (unconditional branches) are replaced by
  // the appropriate conditional branches when the loop is built.
  // If the pointers are in different address spaces, they need to be converted
  // to a compatible one. Cases where memory ranges in the different address
  // spaces cannot overlap are lowered as memcpy and not handled here.
  auto [CmpSrcAddr, CmpDstAddr] =
      tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
  Value *PtrCompare =
      PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
  Instruction *ThenTerm, *ElseTerm;
  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
                                &ThenTerm, &ElseTerm);

  // If the LoopOpSize is greater than 1, each part of the function consists of
  // four blocks:
  //   memmove_copy_backwards:
  //       skip the residual loop when 0 iterations are required
  //   memmove_bwd_residual_loop:
  //       copy the last few bytes individually so that the remaining length is
  //       a multiple of the LoopOpSize
  //   memmove_bwd_middle: skip the main loop when 0 iterations are required
  //   memmove_bwd_main_loop: the actual backwards loop BB with wide accesses
  //   memmove_copy_forward: skip the main loop when 0 iterations are required
  //   memmove_fwd_main_loop: the actual forward loop BB with wide accesses
  //   memmove_fwd_middle: skip the residual loop when 0 iterations are required
  //   memmove_fwd_residual_loop: copy the last few bytes individually
  //
  // The main and residual loop are switched between copying forward and
  // backward so that the residual loop always operates on the end of the moved
  // range. This is based on the assumption that buffers whose start is aligned
  // with the LoopOpSize are more common than buffers whose end is.
  //
  // If the LoopOpSize is 1, each part of the function consists of two blocks:
  //   memmove_copy_backwards: skip the loop when 0 iterations are required
  //   memmove_bwd_main_loop: the actual backwards loop BB
  //   memmove_copy_forward: skip the loop when 0 iterations are required
  //   memmove_fwd_main_loop: the actual forward loop BB
  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
  CopyBackwardsBB->setName("memmove_copy_backwards");
  BasicBlock *CopyForwardBB = ElseTerm->getParent();
  CopyForwardBB->setName("memmove_copy_forward");
  BasicBlock *ExitBB = InsertBefore->getParent();
  ExitBB->setName("memmove_done");

  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  // Accesses in the residual loops do not share the same alignment as those in
  // the main loops.
  Align ResidualSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
  Align ResidualDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));

  // Copying backwards.
  {
    BasicBlock *MainLoopBB = BasicBlock::Create(
        F->getContext(), "memmove_bwd_main_loop", F, CopyForwardBB);

    // The predecessor of the memmove_bwd_main_loop. Updated in the
    // following if a residual loop is emitted first.
    BasicBlock *PredBB = CopyBackwardsBB;

    if (RequiresResidual) {
      // backwards residual loop
      BasicBlock *ResidualLoopBB = BasicBlock::Create(
          F->getContext(), "memmove_bwd_residual_loop", F, MainLoopBB);
      IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
      PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
      Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
          ResidualLoopPhi, CIResidualLoopOpSize, "bwd_residual_index");
      // If we used LoopOpType as GEP element type, we would iterate over the
      // buffers in TypeAllocSize strides while copying TypeStoreSize bytes,
      // i.e., we would miss bytes if TypeStoreSize != TypeAllocSize.
      // Therefore, use byte offsets computed from the TypeStoreSize.
      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
                                                             ResidualIndex);
      Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
          ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
          "element");
      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
                                                              ResidualIndex);
      ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
                                             ResidualDstAlign, DstIsVolatile);

      // After the residual loop, go to an intermediate block.
      BasicBlock *IntermediateBB = BasicBlock::Create(
          F->getContext(), "memmove_bwd_middle", F, MainLoopBB);
      // Later code expects a terminator in the PredBB.
      IRBuilder<> IntermediateBuilder(IntermediateBB);
      IntermediateBuilder.CreateUnreachable();
      ResidualLoopBuilder.CreateCondBr(
          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, RuntimeLoopBytes),
          IntermediateBB, ResidualLoopBB);

      ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
      ResidualLoopPhi->addIncoming(CopyLen, CopyBackwardsBB);

      // How to get to the residual:
      BranchInst::Create(IntermediateBB, ResidualLoopBB, SkipResidualCondition,
                         ThenTerm->getIterator());
      ThenTerm->eraseFromParent();

      PredBB = IntermediateBB;
    }

    // main loop
    IRBuilder<> MainLoopBuilder(MainLoopBB);
    PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
    Value *MainIndex =
        MainLoopBuilder.CreateSub(MainLoopPhi, CILoopOpSize, "bwd_main_index");
    Value *LoadGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainIndex);
    Value *Element = MainLoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainIndex);
    MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                       DstIsVolatile);
    MainLoopBuilder.CreateCondBr(MainLoopBuilder.CreateICmpEQ(MainIndex, Zero),
                                 ExitBB, MainLoopBB);
    MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
    MainLoopPhi->addIncoming(RuntimeLoopBytes, PredBB);

    // How to get to the main loop:
    Instruction *PredBBTerm = PredBB->getTerminator();
    BranchInst::Create(ExitBB, MainLoopBB, SkipMainCondition,
                       PredBBTerm->getIterator());
    PredBBTerm->eraseFromParent();
  }

  // Copying forward.
  // main loop
  {
    BasicBlock *MainLoopBB =
        BasicBlock::Create(F->getContext(), "memmove_fwd_main_loop", F, ExitBB);
    IRBuilder<> MainLoopBuilder(MainLoopBB);
    PHINode *MainLoopPhi =
        MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
    Value *LoadGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainLoopPhi);
    Value *Element = MainLoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainLoopPhi);
    MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                       DstIsVolatile);
    Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, CILoopOpSize);
    MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
    MainLoopPhi->addIncoming(Zero, CopyForwardBB);

    Instruction *CopyFwdBBTerm = CopyForwardBB->getTerminator();
    BasicBlock *SuccessorBB = ExitBB;
    if (RequiresResidual)
      SuccessorBB =
          BasicBlock::Create(F->getContext(), "memmove_fwd_middle", F, ExitBB);

    // leaving or staying in the main loop
    MainLoopBuilder.CreateCondBr(
        MainLoopBuilder.CreateICmpEQ(MainIndex, RuntimeLoopBytes), SuccessorBB,
        MainLoopBB);

    // getting in or skipping the main loop
    BranchInst::Create(SuccessorBB, MainLoopBB, SkipMainCondition,
                       CopyFwdBBTerm->getIterator());
    CopyFwdBBTerm->eraseFromParent();

    if (RequiresResidual) {
      BasicBlock *IntermediateBB = SuccessorBB;
      IRBuilder<> IntermediateBuilder(IntermediateBB);
      BasicBlock *ResidualLoopBB = BasicBlock::Create(
          F->getContext(), "memmove_fwd_residual_loop", F, ExitBB);
      IntermediateBuilder.CreateCondBr(SkipResidualCondition, ExitBB,
                                       ResidualLoopBB);

      // Residual loop
      IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
      PHINode *ResidualLoopPhi =
          ResidualLoopBuilder.CreatePHI(ILengthType, 0, "fwd_residual_index");
      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
                                                             ResidualLoopPhi);
      Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
          ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
          "element");
      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
                                                              ResidualLoopPhi);
      ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
                                             ResidualDstAlign, DstIsVolatile);
      Value *ResidualIndex =
          ResidualLoopBuilder.CreateAdd(ResidualLoopPhi, CIResidualLoopOpSize);
      ResidualLoopBuilder.CreateCondBr(
          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, CopyLen), ExitBB,
          ResidualLoopBB);
      ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
      ResidualLoopPhi->addIncoming(RuntimeLoopBytes, IntermediateBB);
    }
  }
}

// Similar to createMemMoveLoopUnknownSize, only the trip counts are computed
// at compile time, obsolete loops and branches are omitted, and the residual
// code is straight-line code instead of a loop.
static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
                                       Value *SrcAddr, Value *DstAddr,
                                       ConstantInt *CopyLen, Align SrcAlign,
                                       Align DstAlign, bool SrcIsVolatile,
                                       bool DstIsVolatile,
                                       const TargetTransformInfo &TTI) {
  // No need to expand zero length moves.
  if (CopyLen->isZero())
    return;

  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  LLVMContext &Ctx = OrigBB->getContext();
  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
                                                   SrcAlign, DstAlign);
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  Type *Int8Type = Type::getInt8Ty(Ctx);

  // Calculate the loop trip count and remaining bytes to copy after the loop.
  uint64_t BytesCopiedInLoop = alignDown(CopyLen->getZExtValue(), LoopOpSize);
  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;

  IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
  ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
  ConstantInt *LoopBound = ConstantInt::get(ILengthType, BytesCopiedInLoop);
  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);

  IRBuilder<> PLBuilder(InsertBefore);

  auto [CmpSrcAddr, CmpDstAddr] =
      tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
  Value *PtrCompare =
      PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
  Instruction *ThenTerm, *ElseTerm;
  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
                                &ThenTerm, &ElseTerm);

  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
  BasicBlock *CopyForwardBB = ElseTerm->getParent();
  BasicBlock *ExitBB = InsertBefore->getParent();
  ExitBB->setName("memmove_done");

  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  // Helper function to generate a load/store pair of a given type in the
  // residual. Used in the forward and backward branches.
  auto GenerateResidualLdStPair = [&](Type *OpTy, IRBuilderBase &Builder,
                                      uint64_t &BytesCopied) {
    Align ResSrcAlign(commonAlignment(SrcAlign, BytesCopied));
    Align ResDstAlign(commonAlignment(DstAlign, BytesCopied));

    unsigned OperandSize = DL.getTypeStoreSize(OpTy);

    // If we used LoopOpType as GEP element type, we would iterate over the
    // buffers in TypeAllocSize strides while copying TypeStoreSize bytes, i.e.,
    // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
    // byte offsets computed from the TypeStoreSize.
    Value *SrcGEP = Builder.CreateInBoundsGEP(
        Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
    LoadInst *Load =
        Builder.CreateAlignedLoad(OpTy, SrcGEP, ResSrcAlign, SrcIsVolatile);
    Value *DstGEP = Builder.CreateInBoundsGEP(
        Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
    Builder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
    BytesCopied += OperandSize;
  };

  // Copying backwards.
  if (RemainingBytes != 0) {
    CopyBackwardsBB->setName("memmove_bwd_residual");
    uint64_t BytesCopied = BytesCopiedInLoop;

    // Residual code is required to move the remaining bytes. We need the same
    // instructions as in the forward case, only in reverse. So we generate
    // code the same way, except that we change the IRBuilder insert point for
    // each load/store pair so that each one is inserted before the previous
    // one instead of after it.
    IRBuilder<> BwdResBuilder(CopyBackwardsBB,
                              CopyBackwardsBB->getFirstNonPHIIt());
    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                          SrcAS, DstAS, PartSrcAlign,
                                          PartDstAlign);
    for (auto *OpTy : RemainingOps) {
      // reverse the order of the emitted operations
      BwdResBuilder.SetInsertPoint(CopyBackwardsBB,
                                   CopyBackwardsBB->getFirstNonPHIIt());
      GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied);
    }
  }
  if (BytesCopiedInLoop != 0) {
    BasicBlock *LoopBB = CopyBackwardsBB;
    BasicBlock *PredBB = OrigBB;
    if (RemainingBytes != 0) {
      // if we introduce residual code, it needs its separate BB
      LoopBB = CopyBackwardsBB->splitBasicBlock(
          CopyBackwardsBB->getTerminator(), "memmove_bwd_loop");
      PredBB = CopyBackwardsBB;
    } else {
      CopyBackwardsBB->setName("memmove_bwd_loop");
    }
    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
    PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0);
    Value *Index = LoopBuilder.CreateSub(LoopPhi, CILoopOpSize, "bwd_index");
    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, Index);
    Value *Element = LoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, Index);
    LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                   DstIsVolatile);

    // Replace the unconditional branch introduced by
    // SplitBlockAndInsertIfThenElse to turn LoopBB into a loop.
    Instruction *UncondTerm = LoopBB->getTerminator();
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, Zero), ExitBB,
                             LoopBB);
    UncondTerm->eraseFromParent();

    LoopPhi->addIncoming(Index, LoopBB);
    LoopPhi->addIncoming(LoopBound, PredBB);
  }

  // Copying forward.
  BasicBlock *FwdResidualBB = CopyForwardBB;
  if (BytesCopiedInLoop != 0) {
    CopyForwardBB->setName("memmove_fwd_loop");
    BasicBlock *LoopBB = CopyForwardBB;
    BasicBlock *SuccBB = ExitBB;
    if (RemainingBytes != 0) {
      // if we introduce residual code, it needs its separate BB
      SuccBB = CopyForwardBB->splitBasicBlock(CopyForwardBB->getTerminator(),
                                              "memmove_fwd_residual");
      FwdResidualBB = SuccBB;
    }
    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
    PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0, "fwd_index");
    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopPhi);
    Value *Element = LoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopPhi);
    LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                   DstIsVolatile);
    Value *Index = LoopBuilder.CreateAdd(LoopPhi, CILoopOpSize);
    LoopPhi->addIncoming(Index, LoopBB);
    LoopPhi->addIncoming(Zero, OrigBB);

    // Replace the unconditional branch to turn LoopBB into a loop.
    Instruction *UncondTerm = LoopBB->getTerminator();
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, LoopBound), SuccBB,
                             LoopBB);
    UncondTerm->eraseFromParent();
  }

  if (RemainingBytes != 0) {
    uint64_t BytesCopied = BytesCopiedInLoop;

    // Residual code is required to move the remaining bytes. In the forward
    // case, we emit it in the normal order.
    IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                          SrcAS, DstAS, PartSrcAlign,
                                          PartDstAlign);
    for (auto *OpTy : RemainingOps)
      GenerateResidualLdStPair(OpTy, FwdResBuilder, BytesCopied);
  }
}

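// Lower a memset-like operation to a store loop: write \p SetValue to
// \p DstAddr \p CopyLen times, advancing by one element of SetValue's type per
// iteration.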
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
                             Value *CopyLen, Value *SetValue, Align DstAlign,
                             bool IsVolatile) {
  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  BasicBlock *NewBB =
      OrigBB->splitBasicBlock(InsertBefore, "split");
  BasicBlock *LoopBB =
      BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);

  IRBuilder<> Builder(OrigBB->getTerminator());

  Builder.CreateCondBr(
      Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
      LoopBB);
  OrigBB->getTerminator()->eraseFromParent();

  unsigned PartSize = DL.getTypeStoreSize(SetValue->getType());
  Align PartAlign(commonAlignment(DstAlign, PartSize));

  IRBuilder<> LoopBuilder(LoopBB);
  PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
  LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);

  LoopBuilder.CreateAlignedStore(
      SetValue,
      LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
      PartAlign, IsVolatile);

  Value *NewIndex =
      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
  LoopIndex->addIncoming(NewIndex, LoopBB);

  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
                           NewBB);
}

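// Conservatively assume that the source and destination of \p Memcpy may
// overlap; return false only if ScalarEvolution can prove that the two
// pointers are distinct at the call site.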
template <typename T>
static bool canOverlap(MemTransferBase<T> *Memcpy, ScalarEvolution *SE) {
  if (SE) {
    const SCEV *SrcSCEV = SE->getSCEV(Memcpy->getRawSource());
    const SCEV *DestSCEV = SE->getSCEV(Memcpy->getRawDest());
    if (SE->isKnownPredicateAt(CmpInst::ICMP_NE, SrcSCEV, DestSCEV, Memcpy))
      return false;
  }
  return true;
}

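// Expand a memcpy intrinsic into an explicit loop, choosing the known-size or
// unknown-size expansion depending on whether the length is a constant.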
void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
                              const TargetTransformInfo &TTI,
                              ScalarEvolution *SE) {
  bool CanOverlap = canOverlap(Memcpy, SE);
  if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
    createMemCpyLoopKnownSize(
        /* InsertBefore */ Memcpy,
        /* SrcAddr */ Memcpy->getRawSource(),
        /* DstAddr */ Memcpy->getRawDest(),
        /* CopyLen */ CI,
        /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
        /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ Memcpy->isVolatile(),
        /* DstIsVolatile */ Memcpy->isVolatile(),
        /* CanOverlap */ CanOverlap,
        /* TargetTransformInfo */ TTI);
  } else {
    createMemCpyLoopUnknownSize(
        /* InsertBefore */ Memcpy,
        /* SrcAddr */ Memcpy->getRawSource(),
        /* DstAddr */ Memcpy->getRawDest(),
        /* CopyLen */ Memcpy->getLength(),
        /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
        /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ Memcpy->isVolatile(),
        /* DstIsVolatile */ Memcpy->isVolatile(),
        /* CanOverlap */ CanOverlap,
        /* TargetTransformInfo */ TTI);
  }
}

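// Expand a memmove intrinsic into explicit control flow. Returns false if the
// expansion is not possible, i.e., when the source and destination are in
// different address spaces that may alias but cannot be cast to a common
// address space for the pointer comparison.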
bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
                               const TargetTransformInfo &TTI) {
  Value *CopyLen = Memmove->getLength();
  Value *SrcAddr = Memmove->getRawSource();
  Value *DstAddr = Memmove->getRawDest();
  Align SrcAlign = Memmove->getSourceAlign().valueOrOne();
  Align DstAlign = Memmove->getDestAlign().valueOrOne();
  bool SrcIsVolatile = Memmove->isVolatile();
  bool DstIsVolatile = SrcIsVolatile;
  IRBuilder<> CastBuilder(Memmove);

  unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
  unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
  if (SrcAS != DstAS) {
    if (!TTI.addrspacesMayAlias(SrcAS, DstAS)) {
      // We may not be able to emit a pointer comparison, but we don't have
      // to. Expand as memcpy.
      if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
        createMemCpyLoopKnownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
                                  CI, SrcAlign, DstAlign, SrcIsVolatile,
                                  DstIsVolatile,
                                  /*CanOverlap=*/false, TTI);
      } else {
        createMemCpyLoopUnknownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
                                    CopyLen, SrcAlign, DstAlign, SrcIsVolatile,
                                    DstIsVolatile,
                                    /*CanOverlap=*/false, TTI);
      }

      return true;
    }

    if (!(TTI.isValidAddrSpaceCast(DstAS, SrcAS) ||
          TTI.isValidAddrSpaceCast(SrcAS, DstAS))) {
      // We don't know generically if it's legal to introduce an
      // addrspacecast. We need to know either if it's legal to insert an
      // addrspacecast, or if the address spaces cannot alias.
      LLVM_DEBUG(
          dbgs() << "Do not know how to expand memmove between different "
                    "address spaces\n");
      return false;
    }
  }

  if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
    createMemMoveLoopKnownSize(
        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
        SrcIsVolatile, DstIsVolatile, TTI);
  } else {
    createMemMoveLoopUnknownSize(
        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
        SrcIsVolatile, DstIsVolatile, TTI);
  }
  return true;
}

void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
  createMemSetLoop(/* InsertBefore */ Memset,
                   /* DstAddr */ Memset->getRawDest(),
                   /* CopyLen */ Memset->getLength(),
                   /* SetValue */ Memset->getValue(),
                   /* Alignment */ Memset->getDestAlign().valueOrOne(),
                   Memset->isVolatile());
}

void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
  createMemSetLoop(/* InsertBefore=*/Memset,
                   /* DstAddr=*/Memset->getRawDest(),
                   /* CopyLen=*/Memset->getLength(),
                   /* SetValue=*/Memset->getValue(),
                   /* Alignment=*/Memset->getDestAlign().valueOrOne(),
                   Memset->isVolatile());
}

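// Expand an element-wise atomic memcpy into a loop of unordered atomic loads
// and stores. Source and destination are guaranteed not to overlap by the
// intrinsic's specification, so the expansion passes CanOverlap=false and the
// generated accesses receive no-alias metadata.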
void llvm::expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemcpy,
                                    const TargetTransformInfo &TTI,
                                    ScalarEvolution *SE) {
  assert(AtomicMemcpy->isAtomic());
  if (ConstantInt *CI = dyn_cast<ConstantInt>(AtomicMemcpy->getLength())) {
    createMemCpyLoopKnownSize(
        /* InsertBefore */ AtomicMemcpy,
        /* SrcAddr */ AtomicMemcpy->getRawSource(),
        /* DstAddr */ AtomicMemcpy->getRawDest(),
        /* CopyLen */ CI,
        /* SrcAlign */ AtomicMemcpy->getSourceAlign().valueOrOne(),
        /* DestAlign */ AtomicMemcpy->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ AtomicMemcpy->isVolatile(),
        /* DstIsVolatile */ AtomicMemcpy->isVolatile(),
        /* CanOverlap */ false, // SrcAddr & DstAddr may not overlap by spec.
        /* TargetTransformInfo */ TTI,
        /* AtomicCpySize */ AtomicMemcpy->getElementSizeInBytes());
  } else {
    createMemCpyLoopUnknownSize(
        /* InsertBefore */ AtomicMemcpy,
        /* SrcAddr */ AtomicMemcpy->getRawSource(),
        /* DstAddr */ AtomicMemcpy->getRawDest(),
        /* CopyLen */ AtomicMemcpy->getLength(),
        /* SrcAlign */ AtomicMemcpy->getSourceAlign().valueOrOne(),
        /* DestAlign */ AtomicMemcpy->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ AtomicMemcpy->isVolatile(),
        /* DstIsVolatile */ AtomicMemcpy->isVolatile(),
        /* CanOverlap */ false, // SrcAddr & DstAddr may not overlap by spec.
        /* TargetTransformInfo */ TTI,
        /* AtomicCpySize */ AtomicMemcpy->getElementSizeInBytes());
  }
}