//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs various transformations related to eliminating memcpy
// calls, or transforming sets of stores into memset's.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "memcpyopt"

static cl::opt<bool> EnableMemCpyOptWithoutLibcalls(
    "enable-memcpyopt-without-libcalls", cl::Hidden,
    cl::desc("Enable memcpyopt even when libcalls are disabled"));

STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
STATISTIC(NumStackMove, "Number of stack-move optimizations performed");

namespace {

/// Represents a range of memset'd bytes with the ByteVal value.
/// This allows us to analyze stores like:
///   store 0 -> P+1
///   store 0 -> P+0
///   store 0 -> P+3
///   store 0 -> P+2
/// which sometimes happens with stores to arrays of structs etc. When we see
/// the first store, we make a range [1, 2). The second store extends the range
/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
/// two ranges into [0, 3) which is memset'able.
struct MemsetRange {
  // Start/End - A half-open range describing the span of bytes that this
  // range covers. The range is closed at the start and open at the end:
  // [Start, End).
  int64_t Start, End;

  /// StartPtr - The getelementptr instruction that points to the start of the
  /// range.
  Value *StartPtr;

  /// Alignment - The known alignment of the first store.
  MaybeAlign Alignment;

  /// TheStores - The actual stores that make up this range.
  SmallVector<Instruction *, 16> TheStores;

  bool isProfitableToUseMemset(const DataLayout &DL) const;
};

} // end anonymous namespace

bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
  // If we found at least 4 stores to merge, or the range covers at least 16
  // bytes, use memset.
  if (TheStores.size() >= 4 || End - Start >= 16)
    return true;

  // If there is nothing to merge, don't do anything.
  if (TheStores.size() < 2)
    return false;

  // If any of the stores are a memset, then it is always good to extend the
  // memset.
  for (Instruction *SI : TheStores)
    if (!isa<StoreInst>(SI))
      return true;

  // Assume that the code generator is capable of merging pairs of stores
  // together if it wants to.
  if (TheStores.size() == 2)
    return false;

  // If we have fewer than 8 stores, it can still be worthwhile to do this.
  // For example, merging 4 i8 stores into an i32 store is useful almost always.
  // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
  // memset will be split into 2 32-bit stores anyway) and doing so can
  // pessimize the llvm optimizer.
  //
  // Since we don't have perfect knowledge here, make some assumptions: assume
  // the maximum GPR width is the same size as the largest legal integer
  // size. If so, check to see whether we will end up actually reducing the
  // number of stores used.
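  // For example (illustrative, assuming the largest legal integer is 64
  // bits): at this point exactly three plain stores remain under
  // consideration. Three stores covering 8 contiguous bytes lower to a single
  // i64 store (3 > 1, so profitable), whereas three i32 stores covering 12
  // bytes would need an i64 plus four i8 stores (3 > 5 fails, so not
  // profitable).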
  unsigned Bytes = unsigned(End - Start);
  unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
  if (MaxIntSize == 0)
    MaxIntSize = 1;
  unsigned NumPointerStores = Bytes / MaxIntSize;

  // Assume the remaining bytes if any are done a byte at a time.
  unsigned NumByteStores = Bytes % MaxIntSize;

  // If we will reduce the # stores (according to this heuristic), do the
  // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
  // etc.
  return TheStores.size() > NumPointerStores + NumByteStores;
}

namespace {

class MemsetRanges {
  using range_iterator = SmallVectorImpl<MemsetRange>::iterator;

  /// A sorted list of the memset ranges.
  SmallVector<MemsetRange, 8> Ranges;

  const DataLayout &DL;

public:
  MemsetRanges(const DataLayout &DL) : DL(DL) {}

  using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;

  const_iterator begin() const { return Ranges.begin(); }
  const_iterator end() const { return Ranges.end(); }
  bool empty() const { return Ranges.empty(); }

  void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
    if (auto *SI = dyn_cast<StoreInst>(Inst))
      addStore(OffsetFromFirst, SI);
    else
      addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
  }

  void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
    TypeSize StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
    assert(!StoreSize.isScalable() && "Can't track scalable-typed stores");
    addRange(OffsetFromFirst, StoreSize.getFixedValue(),
             SI->getPointerOperand(), SI->getAlign(), SI);
  }

  void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
    int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlign(), MSI);
  }

  void addRange(int64_t Start, int64_t Size, Value *Ptr, MaybeAlign Alignment,
                Instruction *Inst);
};

} // end anonymous namespace

/// Add a new store to the MemsetRanges data structure. This adds a
/// new range for the specified store at the specified offset, merging into
/// existing ranges as appropriate.
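/// For example, if the existing ranges are [0, 4) and [8, 12), adding a store
/// that covers [4, 8) first extends the former range to [0, 8) and then folds
/// the latter in, leaving a single range [0, 12).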
void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
                            MaybeAlign Alignment, Instruction *Inst) {
  int64_t End = Start + Size;

  range_iterator I = partition_point(
      Ranges, [=](const MemsetRange &O) { return O.End < Start; });

  // We now know that I == E, in which case we didn't find anything to merge
  // with, or that Start <= I->End. If End < I->Start or I == E, then we need
  // to insert a new range. Handle this now.
  if (I == Ranges.end() || End < I->Start) {
    MemsetRange &R = *Ranges.insert(I, MemsetRange());
    R.Start = Start;
    R.End = End;
    R.StartPtr = Ptr;
    R.Alignment = Alignment;
    R.TheStores.push_back(Inst);
    return;
  }

  // This store overlaps with I, add it.
  I->TheStores.push_back(Inst);

  // At this point, we may have an interval that completely contains our store.
  // If so, just add it to the interval and return.
  if (I->Start <= Start && I->End >= End)
    return;

  // Now we know that Start <= I->End and End >= I->Start so the range overlaps
  // but is not entirely contained within the range.

  // See if this store extends the start of the range. In this case, it
  // couldn't possibly cause it to join the prior range, because otherwise we
  // would have stopped on *it*.
  if (Start < I->Start) {
    I->Start = Start;
    I->StartPtr = Ptr;
    I->Alignment = Alignment;
  }

  // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
  // is in or right at the end of I), and that End >= I->Start. Extend I out to
  // End.
  if (End > I->End) {
    I->End = End;
    range_iterator NextI = I;
    while (++NextI != Ranges.end() && End >= NextI->Start) {
      // Merge the range in.
      I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
      if (NextI->End > I->End)
        I->End = NextI->End;
      Ranges.erase(NextI);
      NextI = I;
    }
  }
}

//===----------------------------------------------------------------------===//
//                         MemCpyOptLegacyPass Pass
//===----------------------------------------------------------------------===//

// Check that V is either not accessible by the caller, or unwinding cannot
// occur between Start and End.
static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
                                         Instruction *End) {
  assert(Start->getParent() == End->getParent() && "Must be in same block");
  // Function can't unwind, so it also can't be visible through unwinding.
  if (Start->getFunction()->doesNotThrow())
    return false;

  // Object is not visible on unwind.
  // TODO: Support RequiresNoCaptureBeforeUnwind case.
  bool RequiresNoCaptureBeforeUnwind;
  if (isNotVisibleOnUnwind(getUnderlyingObject(V),
                           RequiresNoCaptureBeforeUnwind) &&
      !RequiresNoCaptureBeforeUnwind)
    return false;

  // Check whether there are any unwinding instructions in the range.
  return any_of(make_range(Start->getIterator(), End->getIterator()),
                [](const Instruction &I) { return I.mayThrow(); });
}

void MemCpyOptPass::eraseInstruction(Instruction *I) {
  MSSAU->removeMemoryAccess(I);
  I->eraseFromParent();
}

// Check for mod or ref of Loc between Start and End, excluding both
// boundaries. Start and End must be in the same block.
// If SkippedLifetimeStart is provided, skip over one clobbering
// lifetime.start intrinsic and store it inside SkippedLifetimeStart.
static bool accessedBetween(BatchAAResults &AA, MemoryLocation Loc,
                            const MemoryUseOrDef *Start,
                            const MemoryUseOrDef *End,
                            Instruction **SkippedLifetimeStart = nullptr) {
  assert(Start->getBlock() == End->getBlock() && "Only local supported");
  for (const MemoryAccess &MA :
       make_range(++Start->getIterator(), End->getIterator())) {
    Instruction *I = cast<MemoryUseOrDef>(MA).getMemoryInst();
    if (isModOrRefSet(AA.getModRefInfo(I, Loc))) {
      auto *II = dyn_cast<IntrinsicInst>(I);
      if (II && II->getIntrinsicID() == Intrinsic::lifetime_start &&
          SkippedLifetimeStart && !*SkippedLifetimeStart) {
        *SkippedLifetimeStart = I;
        continue;
      }

      return true;
    }
  }
  return false;
}

// Check for mod of Loc between Start and End, excluding both boundaries.
// Start and End can be in different blocks.
static bool writtenBetween(MemorySSA *MSSA, BatchAAResults &AA,
                           MemoryLocation Loc, const MemoryUseOrDef *Start,
                           const MemoryUseOrDef *End) {
  if (isa<MemoryUse>(End)) {
    // For MemoryUses, getClobberingMemoryAccess may skip non-clobbering
    // writes. Manually check read accesses between Start and End, if they are
    // in the same block, for clobbers. Otherwise assume Loc is clobbered.
    return Start->getBlock() != End->getBlock() ||
           any_of(
               make_range(std::next(Start->getIterator()), End->getIterator()),
               [&AA, Loc](const MemoryAccess &Acc) {
                 if (isa<MemoryUse>(&Acc))
                   return false;
                 Instruction *AccInst =
                     cast<MemoryUseOrDef>(&Acc)->getMemoryInst();
                 return isModSet(AA.getModRefInfo(AccInst, Loc));
               });
  }

  // TODO: Only walk until we hit Start.
  MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
      End->getDefiningAccess(), Loc, AA);
  return !MSSA->dominates(Clobber, Start);
}

// Update AA metadata.
static void combineAAMetadata(Instruction *ReplInst, Instruction *I) {
  // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
  // handled here, but combineMetadata doesn't support them yet.
  unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
                         LLVMContext::MD_noalias,
                         LLVMContext::MD_invariant_group,
                         LLVMContext::MD_access_group};
  combineMetadata(ReplInst, I, KnownIDs, true);
}

/// When scanning forward over instructions, we look for some other patterns to
/// fold away. In particular, this looks for stores to neighboring locations of
/// memory. If it sees enough consecutive ones, it attempts to merge them
/// together into a memcpy/memset.
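/// For example (illustrative IR), four adjacent byte stores such as
/// \code
///   store i8 0, ptr %p
///   %p1 = getelementptr i8, ptr %p, i64 1
///   store i8 0, ptr %p1
///   %p2 = getelementptr i8, ptr %p, i64 2
///   store i8 0, ptr %p2
///   %p3 = getelementptr i8, ptr %p, i64 3
///   store i8 0, ptr %p3
/// \endcode
/// can be rewritten as a single llvm.memset of the four bytes starting at %p.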
Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
                                                 Value *StartPtr,
                                                 Value *ByteVal) {
  const DataLayout &DL = StartInst->getDataLayout();

  // We can't track scalable types.
  if (auto *SI = dyn_cast<StoreInst>(StartInst))
    if (DL.getTypeStoreSize(SI->getOperand(0)->getType()).isScalable())
      return nullptr;

  // Okay, so we now have a single store with a splatable value. Scan to find
  // all subsequent stores of the same value to offsets from the same pointer.
  // Join these together into ranges, so we can decide whether contiguous
  // blocks are stored.
  MemsetRanges Ranges(DL);

  BasicBlock::iterator BI(StartInst);

  // Keeps track of the last memory use or def before the insertion point for
  // the new memset. The new MemoryDef for the inserted memsets will be
  // inserted after MemInsertPoint.
  MemoryUseOrDef *MemInsertPoint = nullptr;
  for (++BI; !BI->isTerminator(); ++BI) {
    auto *CurrentAcc = cast_or_null<MemoryUseOrDef>(
        MSSAU->getMemorySSA()->getMemoryAccess(&*BI));
    if (CurrentAcc)
      MemInsertPoint = CurrentAcc;

    // Calls that only access inaccessible memory do not block merging
    // accessible stores.
    if (auto *CB = dyn_cast<CallBase>(BI)) {
      if (CB->onlyAccessesInaccessibleMemory())
        continue;
    }

    if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
      // If the instruction is readnone, ignore it, otherwise bail out. We
      // don't even allow readonly here because we don't want something like:
      // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
      if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
        break;
      continue;
    }

    if (auto *NextStore = dyn_cast<StoreInst>(BI)) {
      // If this is a store, see if we can merge it in.
      if (!NextStore->isSimple())
        break;

      Value *StoredVal = NextStore->getValueOperand();

      // Don't convert stores of non-integral pointer types to memsets (which
      // stores integers).
      if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
        break;

      // We can't track ranges involving scalable types.
      if (DL.getTypeStoreSize(StoredVal->getType()).isScalable())
        break;

      // Check to see if this stored value is of the same byte-splattable value.
      Value *StoredByte = isBytewiseValue(StoredVal, DL);
      if (isa<UndefValue>(ByteVal) && StoredByte)
        ByteVal = StoredByte;
      if (ByteVal != StoredByte)
        break;

      // Check to see if this store is to a constant offset from the start ptr.
      std::optional<int64_t> Offset =
          NextStore->getPointerOperand()->getPointerOffsetFrom(StartPtr, DL);
      if (!Offset)
        break;

      Ranges.addStore(*Offset, NextStore);
    } else {
      auto *MSI = cast<MemSetInst>(BI);

      if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
          !isa<ConstantInt>(MSI->getLength()))
        break;

      // Check to see if this store is to a constant offset from the start ptr.
      std::optional<int64_t> Offset =
          MSI->getDest()->getPointerOffsetFrom(StartPtr, DL);
      if (!Offset)
        break;

      Ranges.addMemSet(*Offset, MSI);
    }
  }

  // If we have no ranges, then we just had a single store with nothing that
  // could be merged in. This is a very common case of course.
  if (Ranges.empty())
    return nullptr;

  // If we had at least one store that could be merged in, add the starting
  // store as well. We try to avoid this unless there is at least something
  // interesting as a small compile-time optimization.
  Ranges.addInst(0, StartInst);

  // If we create any memsets, we put them right before the first instruction
  // that isn't part of the memset block. This ensures that the memset is
  // dominated by any addressing instruction needed by the start of the block.
  IRBuilder<> Builder(&*BI);

  // Now that we have full information about ranges, loop over the ranges and
  // emit memset's for anything big enough to be worthwhile.
  Instruction *AMemSet = nullptr;
  for (const MemsetRange &Range : Ranges) {
    if (Range.TheStores.size() == 1)
      continue;

    // If it is profitable to lower this range to memset, do so now.
    if (!Range.isProfitableToUseMemset(DL))
      continue;

    // Otherwise, we do want to transform this! Create a new memset.
    // Get the starting pointer of the block.
    StartPtr = Range.StartPtr;

    AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
                                   Range.Alignment);
    AMemSet->mergeDIAssignID(Range.TheStores);

    LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
                                                   : Range.TheStores) dbgs()
                                              << *SI << '\n';
               dbgs() << "With: " << *AMemSet << '\n');
    if (!Range.TheStores.empty())
      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());

    auto *NewDef = cast<MemoryDef>(
        MemInsertPoint->getMemoryInst() == &*BI
            ? MSSAU->createMemoryAccessBefore(AMemSet, nullptr, MemInsertPoint)
            : MSSAU->createMemoryAccessAfter(AMemSet, nullptr, MemInsertPoint));
    MSSAU->insertDef(NewDef, /*RenameUses=*/true);
    MemInsertPoint = NewDef;

    // Zap all the stores.
    for (Instruction *SI : Range.TheStores)
      eraseInstruction(SI);

    ++NumMemSetInfer;
  }

  return AMemSet;
}

// This method tries to lift a store instruction before position P.
// It will lift the store and its operands, plus anything else that
// may alias with these.
// The method returns true if it was successful.
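// For example (illustrative), when processStoreOfLoad wants to place a memcpy
// at the position of an intervening clobber P:
//   %v = load %T, ptr %src
//   call void @f()        ; P, a call that may write to %src (hypothetical)
//   store %T %v, ptr %dst
// the store, together with any of its operands computed after P, must first
// be lifted above P.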
bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
  // If the store aliases this position, early bail out.
  MemoryLocation StoreLoc = MemoryLocation::get(SI);
  if (isModOrRefSet(AA->getModRefInfo(P, StoreLoc)))
    return false;

  // Keep track of the arguments of all instructions we plan to lift
  // so we can make sure to lift them as well if appropriate.
  DenseSet<Instruction *> Args;
  auto AddArg = [&](Value *Arg) {
    auto *I = dyn_cast<Instruction>(Arg);
    if (I && I->getParent() == SI->getParent()) {
      // Cannot hoist user of P above P.
      if (I == P)
        return false;
      Args.insert(I);
    }
    return true;
  };
  if (!AddArg(SI->getPointerOperand()))
    return false;

  // Instructions to lift before P.
  SmallVector<Instruction *, 8> ToLift{SI};

  // Memory locations of lifted instructions.
  SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};

  // Lifted calls.
  SmallVector<const CallBase *, 8> Calls;

  const MemoryLocation LoadLoc = MemoryLocation::get(LI);

  for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
    auto *C = &*I;

    // Make sure hoisting does not perform a store that was not guaranteed to
    // happen.
    if (!isGuaranteedToTransferExecutionToSuccessor(C))
      return false;

    bool MayAlias = isModOrRefSet(AA->getModRefInfo(C, std::nullopt));

    bool NeedLift = false;
    if (Args.erase(C))
      NeedLift = true;
    else if (MayAlias) {
      NeedLift = llvm::any_of(MemLocs, [C, this](const MemoryLocation &ML) {
        return isModOrRefSet(AA->getModRefInfo(C, ML));
      });

      if (!NeedLift)
        NeedLift = llvm::any_of(Calls, [C, this](const CallBase *Call) {
          return isModOrRefSet(AA->getModRefInfo(C, Call));
        });
    }

    if (!NeedLift)
      continue;

    if (MayAlias) {
      // Since LI is implicitly moved downwards past the lifted instructions,
      // none of them may modify its source.
      if (isModSet(AA->getModRefInfo(C, LoadLoc)))
        return false;
      else if (const auto *Call = dyn_cast<CallBase>(C)) {
        // If we can't lift this before P, it's game over.
        if (isModOrRefSet(AA->getModRefInfo(P, Call)))
          return false;

        Calls.push_back(Call);
      } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
        // If we can't lift this before P, it's game over.
        auto ML = MemoryLocation::get(C);
        if (isModOrRefSet(AA->getModRefInfo(P, ML)))
          return false;

        MemLocs.push_back(ML);
      } else
        // We don't know how to lift this instruction.
        return false;
    }

    ToLift.push_back(C);
    for (Value *Op : C->operands())
      if (!AddArg(Op))
        return false;
  }

  // Find MSSA insertion point. Normally P will always have a corresponding
  // memory access before which we can insert. However, with non-standard AA
  // pipelines, there may be a mismatch between AA and MSSA, in which case we
  // will scan for a memory access before P. In either case, we know for sure
  // that at least the load will have a memory access.
  // TODO: Simplify this once P will be determined by MSSA, in which case the
  // discrepancy can no longer occur.
  MemoryUseOrDef *MemInsertPoint = nullptr;
  if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(P)) {
    MemInsertPoint = cast<MemoryUseOrDef>(--MA->getIterator());
  } else {
    const Instruction *ConstP = P;
    for (const Instruction &I : make_range(++ConstP->getReverseIterator(),
                                           ++LI->getReverseIterator())) {
      if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
        MemInsertPoint = MA;
        break;
      }
    }
  }

  // We made it, we need to lift.
  for (auto *I : llvm::reverse(ToLift)) {
    LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
    I->moveBefore(P);
    assert(MemInsertPoint && "Must have found insert point");
    if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(I)) {
      MSSAU->moveAfter(MA, MemInsertPoint);
      MemInsertPoint = MA;
    }
  }

  return true;
}

bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
                                       const DataLayout &DL,
                                       BasicBlock::iterator &BBI) {
  if (!LI->isSimple() || !LI->hasOneUse() || LI->getParent() != SI->getParent())
    return false;

  auto *T = LI->getType();
  // Don't introduce calls to memcpy/memmove intrinsics out of thin air if
  // the corresponding libcalls are not available.
  // TODO: We should really distinguish between libcall availability and
  // our ability to introduce intrinsics.
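  // For example (illustrative IR), an aggregate copy written as
  //   %v = load %struct.S, ptr %src
  //   store %struct.S %v, ptr %dst
  // can be rewritten as a memcpy (or a memmove if the two locations may
  // alias) of the store size of %struct.S.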
  if (T->isAggregateType() &&
      (EnableMemCpyOptWithoutLibcalls ||
       (TLI->has(LibFunc_memcpy) && TLI->has(LibFunc_memmove)))) {
    MemoryLocation LoadLoc = MemoryLocation::get(LI);

    // We use alias analysis to check if an instruction may store to
    // the memory we load from in between the load and the store. If
    // such an instruction is found, we try to promote there instead
    // of at the store position.
    // TODO: Can use MSSA for this.
    Instruction *P = SI;
    for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
      if (isModSet(AA->getModRefInfo(&I, LoadLoc))) {
        P = &I;
        break;
      }
    }

    // We found an instruction that may write to the loaded memory.
    // We can try to promote at this position instead of the store
    // position if nothing aliases the store memory after this and the store
    // destination is not in the range.
    if (P && P != SI) {
      if (!moveUp(SI, P, LI))
        P = nullptr;
    }

    // If a valid insertion position is found, then we can promote
    // the load/store pair to a memcpy.
    if (P) {
      // If we load from memory that may alias the memory we store to,
      // memmove must be used to preserve semantics. If not, memcpy can
      // be used. Also, if we load from constant memory, memcpy can be used
      // as the constant memory won't be modified.
      bool UseMemMove = false;
      if (isModSet(AA->getModRefInfo(SI, LoadLoc)))
        UseMemMove = true;

      IRBuilder<> Builder(P);
      Value *Size =
          Builder.CreateTypeSize(Builder.getInt64Ty(), DL.getTypeStoreSize(T));
      Instruction *M;
      if (UseMemMove)
        M = Builder.CreateMemMove(SI->getPointerOperand(), SI->getAlign(),
                                  LI->getPointerOperand(), LI->getAlign(),
                                  Size);
      else
        M = Builder.CreateMemCpy(SI->getPointerOperand(), SI->getAlign(),
                                 LI->getPointerOperand(), LI->getAlign(), Size);
      M->copyMetadata(*SI, LLVMContext::MD_DIAssignID);

      LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => " << *M
                        << "\n");

      auto *LastDef =
          cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
      auto *NewAccess = MSSAU->createMemoryAccessAfter(M, nullptr, LastDef);
      MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);

      eraseInstruction(SI);
      eraseInstruction(LI);
      ++NumMemCpyInstr;

      // Make sure we do not invalidate the iterator.
      BBI = M->getIterator();
      return true;
    }
  }

  // Detect cases where we're performing call slot forwarding, but
  // happen to be using a load-store pair to implement it, rather than
  // a memcpy.
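  // For example (illustrative IR):
  //   call void @f(ptr sret(%T) %tmp)
  //   %v = load %T, ptr %tmp
  //   store %T %v, ptr %dst
  // may allow @f to write its result directly into %dst, making the
  // temporary and the load/store pair dead.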
  BatchAAResults BAA(*AA);
  auto GetCall = [&]() -> CallInst * {
    // We defer this expensive clobber walk until the cheap checks
    // have been done on the source inside performCallSlotOptzn.
    if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
            MSSA->getWalker()->getClobberingMemoryAccess(LI, BAA)))
      return dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
    return nullptr;
  };

  bool Changed = performCallSlotOptzn(
      LI, SI, SI->getPointerOperand()->stripPointerCasts(),
      LI->getPointerOperand()->stripPointerCasts(),
      DL.getTypeStoreSize(SI->getOperand(0)->getType()),
      std::min(SI->getAlign(), LI->getAlign()), BAA, GetCall);
  if (Changed) {
    eraseInstruction(SI);
    eraseInstruction(LI);
    ++NumMemCpyInstr;
    return true;
  }

  // If this is a load-store pair from a stack slot to a stack slot, we
  // might be able to perform the stack-move optimization just as we do for
  // memcpys from an alloca to an alloca.
  if (auto *DestAlloca = dyn_cast<AllocaInst>(SI->getPointerOperand())) {
    if (auto *SrcAlloca = dyn_cast<AllocaInst>(LI->getPointerOperand())) {
      if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca,
                                DL.getTypeStoreSize(T), BAA)) {
        // Avoid invalidating the iterator.
        BBI = SI->getNextNonDebugInstruction()->getIterator();
        eraseInstruction(SI);
        eraseInstruction(LI);
        ++NumMemCpyInstr;
        return true;
      }
    }
  }

  return false;
}

bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
  if (!SI->isSimple())
    return false;

  // Avoid merging nontemporal stores since the resulting
  // memcpy/memset would not be able to preserve the nontemporal hint.
  // In theory we could teach how to propagate the !nontemporal metadata to
  // memset calls. However, that change would force the backend to
  // conservatively expand !nontemporal memset calls back to sequences of
  // store instructions (effectively undoing the merging).
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return false;

  const DataLayout &DL = SI->getDataLayout();

  Value *StoredVal = SI->getValueOperand();

  // Not all the transforms below are correct for non-integral pointers, bail
  // until we've audited the individual pieces.
  if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return false;

  // Load-to-store forwarding can be interpreted as a memcpy.
  if (auto *LI = dyn_cast<LoadInst>(StoredVal))
    return processStoreOfLoad(SI, LI, DL, BBI);

  // The following code creates memset intrinsics out of thin air. Don't do
  // this if the corresponding libfunc is not available.
  // TODO: We should really distinguish between libcall availability and
  // our ability to introduce intrinsics.
  if (!(TLI->has(LibFunc_memset) || EnableMemCpyOptWithoutLibcalls))
    return false;

  // There are two cases that are interesting for this code to handle: memcpy
  // and memset. Right now we only handle memset.

  // Ensure that the value being stored is something that is memset'able a
  // byte at a time, like "0" or "-1" of any width, as well as things like
  // 0xA0A0A0A0 and 0.0.
  auto *V = SI->getOperand(0);
  if (Value *ByteVal = isBytewiseValue(V, DL)) {
    if (Instruction *I =
            tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) {
      BBI = I->getIterator(); // Don't invalidate iterator.
      return true;
    }

    // If we have an aggregate, we try to promote it to memset regardless
    // of opportunity for merging as it can expose optimization opportunities
    // in subsequent passes.
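    // For example (illustrative IR), storing a zero-initialized aggregate:
    //   store %struct.S zeroinitializer, ptr %p
    // becomes an llvm.memset of value 0 over the store size of %struct.S.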
    auto *T = V->getType();
    if (T->isAggregateType()) {
      uint64_t Size = DL.getTypeStoreSize(T);
      IRBuilder<> Builder(SI);
      auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size,
                                     SI->getAlign());
      M->copyMetadata(*SI, LLVMContext::MD_DIAssignID);

      LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");

      // The newly inserted memset is immediately overwritten by the original
      // store, so we do not need to rename uses.
      auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI));
      auto *NewAccess = MSSAU->createMemoryAccessBefore(M, nullptr, StoreDef);
      MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false);

      eraseInstruction(SI);
      NumMemSetInfer++;

      // Make sure we do not invalidate the iterator.
      BBI = M->getIterator();
      return true;
    }
  }

  return false;
}

bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
  // See if there is another memset or store neighboring this memset which
  // allows us to widen out the memset to do a single larger store.
  if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
    if (Instruction *I =
            tryMergingIntoMemset(MSI, MSI->getDest(), MSI->getValue())) {
      BBI = I->getIterator(); // Don't invalidate iterator.
      return true;
    }
  return false;
}

/// Takes a memcpy and a call that it depends on,
/// and checks for the possibility of a call slot optimization by having
/// the call write its result directly into the destination of the memcpy.
bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
                                         Instruction *cpyStore, Value *cpyDest,
                                         Value *cpySrc, TypeSize cpySize,
                                         Align cpyDestAlign,
                                         BatchAAResults &BAA,
                                         std::function<CallInst *()> GetC) {
  // The general transformation to keep in mind is
  //
  //   call @func(..., src, ...)
  //   memcpy(dest, src, ...)
  //
  // ->
  //
  //   memcpy(dest, src, ...)
  //   call @func(..., dest, ...)
  //
  // Since moving the memcpy is technically awkward, we additionally check that
  // src only holds uninitialized values at the moment of the call, meaning
  // that the memcpy can be discarded rather than moved.

  // We can't optimize scalable types.
  if (cpySize.isScalable())
    return false;

  // Require that src be an alloca. This simplifies the reasoning considerably.
  auto *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
  if (!srcAlloca)
    return false;

  ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
  if (!srcArraySize)
    return false;

  const DataLayout &DL = cpyLoad->getDataLayout();
  TypeSize SrcAllocaSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType());
  // We can't optimize scalable types.
  if (SrcAllocaSize.isScalable())
    return false;
  uint64_t srcSize = SrcAllocaSize * srcArraySize->getZExtValue();

  if (cpySize < srcSize)
    return false;

  CallInst *C = GetC();
  if (!C)
    return false;

  // Lifetime marks shouldn't be operated on.
  if (Function *F = C->getCalledFunction())
    if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
      return false;

  if (C->getParent() != cpyStore->getParent()) {
    LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n");
    return false;
  }

  MemoryLocation DestLoc =
      isa<StoreInst>(cpyStore)
          ? MemoryLocation::get(cpyStore)
          : MemoryLocation::getForDest(cast<MemCpyInst>(cpyStore));

  // Check that nothing touches the dest of the copy between
  // the call and the store/memcpy.
  Instruction *SkippedLifetimeStart = nullptr;
  if (accessedBetween(BAA, DestLoc, MSSA->getMemoryAccess(C),
                      MSSA->getMemoryAccess(cpyStore), &SkippedLifetimeStart)) {
    LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer modified after call\n");
    return false;
  }

  // If we need to move a lifetime.start above the call, make sure that we can
  // actually do so. If the argument is bitcasted for example, we would have to
  // move the bitcast as well, which we don't handle.
  if (SkippedLifetimeStart) {
    auto *LifetimeArg =
        dyn_cast<Instruction>(SkippedLifetimeStart->getOperand(1));
    if (LifetimeArg && LifetimeArg->getParent() == C->getParent() &&
        C->comesBefore(LifetimeArg))
      return false;
  }

  // Check that storing to the first srcSize bytes of dest will not cause a
  // trap or data race.
  bool ExplicitlyDereferenceableOnly;
  if (!isWritableObject(getUnderlyingObject(cpyDest),
                        ExplicitlyDereferenceableOnly) ||
      !isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpySize),
                                          DL, C, AC, DT)) {
    LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer not dereferenceable\n");
    return false;
  }

  // Make sure that nothing can observe cpyDest being written early. There are
  // a number of cases to consider:
  //  1. cpyDest cannot be accessed between C and cpyStore as a precondition of
  //     the transform.
  //  2. C itself may not access cpyDest (prior to the transform). This is
  //     checked further below.
  //  3. If cpyDest is accessible to the caller of this function (potentially
  //     captured and not based on an alloca), we need to ensure that we cannot
  //     unwind between C and cpyStore. This is checked here.
  //  4. If cpyDest is potentially captured, there may be accesses to it from
  //     another thread. In this case, we need to check that cpyStore is
  //     guaranteed to be executed if C is. As it is a non-atomic access, it
  //     renders accesses from other threads undefined.
  //     TODO: This is currently not checked.
  if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore)) {
    LLVM_DEBUG(dbgs() << "Call Slot: Dest may be visible through unwinding\n");
    return false;
  }

  // Check that dest points to memory that is at least as aligned as src.
  Align srcAlign = srcAlloca->getAlign();
  bool isDestSufficientlyAligned = srcAlign <= cpyDestAlign;
  // If dest is not aligned enough and we can't increase its alignment then
  // bail out.
  if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest)) {
    LLVM_DEBUG(dbgs() << "Call Slot: Dest not sufficiently aligned\n");
    return false;
  }

  // Check that src is not accessed except via the call and the memcpy. This
  // guarantees that it holds only undefined values when passed in (so the
  // final memcpy can be dropped), that it is not read or written between the
  // call and the memcpy, and that writing beyond the end of it is undefined.
  SmallVector<User *, 8> srcUseList(srcAlloca->users());
  while (!srcUseList.empty()) {
    User *U = srcUseList.pop_back_val();

    if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
      append_range(srcUseList, U->users());
      continue;
    }
    if (const auto *G = dyn_cast<GetElementPtrInst>(U);
        G && G->hasAllZeroIndices()) {
      append_range(srcUseList, U->users());
      continue;
    }
    if (const auto *IT = dyn_cast<IntrinsicInst>(U))
      if (IT->isLifetimeStartOrEnd())
        continue;

    if (U != C && U != cpyLoad) {
      LLVM_DEBUG(dbgs() << "Call slot: Source accessed by " << *U << "\n");
      return false;
    }
  }

  // Check whether src is captured by the called function, in which case there
  // may be further indirect uses of src.
  bool SrcIsCaptured = any_of(C->args(), [&](Use &U) {
    return U->stripPointerCasts() == cpySrc &&
           !C->doesNotCapture(C->getArgOperandNo(&U));
  });

  // If src is captured, then check whether there are any potential uses of
  // src through the captured pointer before the lifetime of src ends, either
  // due to a lifetime.end or a return from the function.
  if (SrcIsCaptured) {
    // Check that dest is not captured before/at the call. We have already
    // checked that src is not captured before it. If either had been captured,
    // then the call might be comparing the argument against the captured dest
    // or src pointer.
    Value *DestObj = getUnderlyingObject(cpyDest);
    if (!isIdentifiedFunctionLocal(DestObj) ||
        PointerMayBeCapturedBefore(DestObj, /* ReturnCaptures */ true,
                                   /* StoreCaptures */ true, C, DT,
                                   /* IncludeI */ true))
      return false;

    MemoryLocation SrcLoc =
        MemoryLocation(srcAlloca, LocationSize::precise(srcSize));
    for (Instruction &I :
         make_range(++C->getIterator(), C->getParent()->end())) {
      // Lifetime of srcAlloca ends at lifetime.end.
      if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
        if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
            II->getArgOperand(1)->stripPointerCasts() == srcAlloca &&
            cast<ConstantInt>(II->getArgOperand(0))->uge(srcSize))
          break;
      }

      // Lifetime of srcAlloca ends at return.
      if (isa<ReturnInst>(&I))
        break;

      // Ignore the direct read of src in the load.
      if (&I == cpyLoad)
        continue;

      // Check whether this instruction may mod/ref src through the captured
      // pointer (we have already handled any direct mod/refs in the loop
      // above). Also bail if we hit a terminator, as we don't want to scan
      // into other blocks.
      if (isModOrRefSet(BAA.getModRefInfo(&I, SrcLoc)) || I.isTerminator())
        return false;
    }
  }

  // Since we're changing the parameter to the callsite, we need to make sure
  // that what would be the new parameter dominates the callsite.
  bool NeedMoveGEP = false;
  if (!DT->dominates(cpyDest, C)) {
    // Support moving a constant index GEP before the call.
    auto *GEP = dyn_cast<GetElementPtrInst>(cpyDest);
    if (GEP && GEP->hasAllConstantIndices() &&
        DT->dominates(GEP->getPointerOperand(), C))
      NeedMoveGEP = true;
    else
      return false;
  }

  // In addition to knowing that the call does not access src in some
  // unexpected manner, for example via a global, which we deduce from
  // the use analysis, we also need to know that it does not sneakily
  // access dest. We rely on AA to figure this out for us.
  MemoryLocation DestWithSrcSize(cpyDest, LocationSize::precise(srcSize));
  ModRefInfo MR = BAA.getModRefInfo(C, DestWithSrcSize);
  // If necessary, perform additional analysis.
  if (isModOrRefSet(MR))
    MR = BAA.callCapturesBefore(C, DestWithSrcSize, DT);
  if (isModOrRefSet(MR))
    return false;

  // We can't create address space casts here because we don't know if they're
  // safe for the target.
  if (cpySrc->getType() != cpyDest->getType())
    return false;
  for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
    if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc &&
        cpySrc->getType() != C->getArgOperand(ArgI)->getType())
      return false;

  // All the checks have passed, so do the transformation.
  bool changedArgument = false;
  for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
    if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc) {
      changedArgument = true;
      C->setArgOperand(ArgI, cpyDest);
    }

  if (!changedArgument)
    return false;

  // If the destination wasn't sufficiently aligned then increase its
  // alignment.
  if (!isDestSufficientlyAligned) {
    assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
    cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
  }

  if (NeedMoveGEP) {
    auto *GEP = dyn_cast<GetElementPtrInst>(cpyDest);
    GEP->moveBefore(C);
  }

  if (SkippedLifetimeStart) {
    SkippedLifetimeStart->moveBefore(C);
    MSSAU->moveBefore(MSSA->getMemoryAccess(SkippedLifetimeStart),
                      MSSA->getMemoryAccess(C));
  }

  combineAAMetadata(C, cpyLoad);
  if (cpyLoad != cpyStore)
    combineAAMetadata(C, cpyStore);

  ++NumCallSlot;
  return true;
}

/// We've found that the (upward scanning) memory dependence of memcpy 'M' is
/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
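/// For example:
/// \code
///   memcpy(b <- a)
///   memcpy(c <- b)
/// \endcode
/// can, when nothing clobbers 'a' between the two transfers, be rewritten as
/// \code
///   memcpy(b <- a)
///   memcpy(c <- a)
/// \endcode
/// which may allow the first memcpy to be removed later if 'b' has no other
/// uses.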
1125 | bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, |
1126 | MemCpyInst *MDep, |
1127 | BatchAAResults &BAA) { |
1128 | // If dep instruction is reading from our current input, then it is a noop |
1129 | // transfer and substituting the input won't change this instruction. Just |
1130 | // ignore the input and let someone else zap MDep. This handles cases like: |
1131 | // memcpy(a <- a) |
1132 | // memcpy(b <- a) |
1133 | if (M->getSource() == MDep->getSource()) |
1134 | return false; |
1135 | |
1136 | // We can only optimize non-volatile memcpy's. |
1137 | if (MDep->isVolatile()) |
1138 | return false; |
1139 | |
1140 | int64_t MForwardOffset = 0; |
1141 | const DataLayout &DL = M->getModule()->getDataLayout(); |
1142 | // We can only transforms memcpy's where the dest of one is the source of the |
1143 | // other, or they have an offset in a range. |
1144 | if (M->getSource() != MDep->getDest()) { |
1145 | std::optional<int64_t> Offset = |
1146 | M->getSource()->getPointerOffsetFrom(Other: MDep->getDest(), DL); |
1147 | if (!Offset || *Offset < 0) |
1148 | return false; |
1149 | MForwardOffset = *Offset; |
1150 | } |
1151 | |
1152 | // The length of the memcpy's must be the same, or the preceding one |
1153 | // must be larger than the following one. |
1154 | if (MForwardOffset != 0 || MDep->getLength() != M->getLength()) { |
1155 | auto *MDepLen = dyn_cast<ConstantInt>(Val: MDep->getLength()); |
1156 | auto *MLen = dyn_cast<ConstantInt>(Val: M->getLength()); |
1157 | if (!MDepLen || !MLen || |
1158 | MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset) |
1159 | return false; |
1160 | } |
1161 | |
1162 | IRBuilder<> Builder(M); |
1163 | auto *CopySource = MDep->getSource(); |
1164 | Instruction *NewCopySource = nullptr; |
1165 | auto CleanupOnRet = llvm::make_scope_exit(F: [&NewCopySource] { |
1166 | if (NewCopySource && NewCopySource->use_empty()) |
1167 | // Safety: It's safe here because we will only allocate more instructions |
1168 | // after finishing all BatchAA queries, but we have to be careful if we |
1169 | // want to do something like this in another place. Then we'd probably |
1170 | // have to delay instruction removal until all transforms on an |
1171 | // instruction finished. |
1172 | NewCopySource->eraseFromParent(); |
1173 | }); |
1174 | MaybeAlign CopySourceAlign = MDep->getSourceAlign(); |
1175 | // We just need to calculate the actual size of the copy. |
1176 | auto MCopyLoc = MemoryLocation::getForSource(MTI: MDep).getWithNewSize( |
1177 | NewSize: MemoryLocation::getForSource(MTI: M).Size); |
1178 | |
1179 | // When the forwarding offset is greater than 0, we transform |
1180 | // memcpy(d1 <- s1) |
1181 | // memcpy(d2 <- d1+o) |
1182 | // to |
1183 | // memcpy(d2 <- s1+o) |
1184 | if (MForwardOffset > 0) { |
1185 | // The copy destination of `M` maybe can serve as the source of copying. |
1186 | std::optional<int64_t> MDestOffset = |
1187 | M->getRawDest()->getPointerOffsetFrom(Other: MDep->getRawSource(), DL); |
1188 | if (MDestOffset == MForwardOffset) |
1189 | CopySource = M->getDest(); |
1190 | else { |
1191 | CopySource = Builder.CreateInBoundsPtrAdd( |
1192 | Ptr: CopySource, Offset: Builder.getInt64(C: MForwardOffset)); |
1193 | NewCopySource = dyn_cast<Instruction>(Val: CopySource); |
1194 | } |
1195 | // We need to update `MCopyLoc` if an offset exists. |
1196 | MCopyLoc = MCopyLoc.getWithNewPtr(NewPtr: CopySource); |
1197 | if (CopySourceAlign) |
1198 | CopySourceAlign = commonAlignment(A: *CopySourceAlign, Offset: MForwardOffset); |
1199 | } |
1200 | |
1201 | // Verify that the copied-from memory doesn't change in between the two |
1202 | // transfers. For example, in: |
1203 | // memcpy(a <- b) |
1204 | // *b = 42; |
1205 | // memcpy(c <- a) |
1206 | // It would be invalid to transform the second memcpy into memcpy(c <- b). |
1207 | // |
1208 | // TODO: If the code between M and MDep is transparent to the destination "c", |
1209 | // then we could still perform the xform by moving M up to the first memcpy. |
1210 | if (writtenBetween(MSSA, AA&: BAA, Loc: MCopyLoc, Start: MSSA->getMemoryAccess(I: MDep), |
1211 | End: MSSA->getMemoryAccess(I: M))) |
1212 | return false; |
1213 | |
1214 | // No need to create `memcpy(a <- a)`. |
1215 | if (BAA.isMustAlias(V1: M->getDest(), V2: CopySource)) { |
1216 | // Remove the instruction we're replacing. |
1217 | eraseInstruction(I: M); |
1218 | ++NumMemCpyInstr; |
1219 | return true; |
1220 | } |
1221 | |
1222 | // If the dest of the second might alias the source of the first, then the |
1223 | // source and dest might overlap. In addition, if the source of the first |
1224 | // points to constant memory, they won't overlap by definition. Otherwise, we |
1225 | // still want to eliminate the intermediate value, but we have to generate a |
1226 | // memmove instead of memcpy. |
1227 | bool UseMemMove = false; |
1228 | if (isModSet(MRI: BAA.getModRefInfo(I: M, OptLoc: MemoryLocation::getForSource(MTI: MDep)))) { |
1229 | // Don't convert llvm.memcpy.inline into memmove because memmove can be |
1230 | // lowered as a call, and that is not allowed for llvm.memcpy.inline (and |
1231 | // there is no inline version of llvm.memmove) |
1232 | if (isa<MemCpyInlineInst>(Val: M)) |
1233 | return false; |
1234 | UseMemMove = true; |
1235 | } |
1236 | |
1237 | // If all checks passed, then we can transform M. |
1238 | LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n" |
1239 | << *MDep << '\n' |
1240 | << *M << '\n'); |
1241 | |
1242 | // TODO: Is this worth it if we're creating a less aligned memcpy? For |
1243 | // example we could be moving from movaps -> movq on x86. |
1244 | Instruction *NewM; |
1245 | if (UseMemMove) |
1246 | NewM = |
1247 | Builder.CreateMemMove(Dst: M->getDest(), DstAlign: M->getDestAlign(), Src: CopySource, |
1248 | SrcAlign: CopySourceAlign, Size: M->getLength(), isVolatile: M->isVolatile()); |
1249 | else if (isa<MemCpyInlineInst>(Val: M)) { |
1250 | // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is |
1251 | // never allowed since that would allow the latter to be lowered as a call |
1252 | // to an external function. |
1253 | NewM = Builder.CreateMemCpyInline(Dst: M->getDest(), DstAlign: M->getDestAlign(), |
1254 | Src: CopySource, SrcAlign: CopySourceAlign, |
1255 | Size: M->getLength(), isVolatile: M->isVolatile()); |
1256 | } else |
1257 | NewM = |
1258 | Builder.CreateMemCpy(Dst: M->getDest(), DstAlign: M->getDestAlign(), Src: CopySource, |
1259 | SrcAlign: CopySourceAlign, Size: M->getLength(), isVolatile: M->isVolatile()); |
1260 | NewM->copyMetadata(SrcInst: *M, WL: LLVMContext::MD_DIAssignID); |
1261 | |
1262 | assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M))); |
1263 | auto *LastDef = cast<MemoryDef>(Val: MSSAU->getMemorySSA()->getMemoryAccess(I: M)); |
1264 | auto *NewAccess = MSSAU->createMemoryAccessAfter(I: NewM, Definition: nullptr, InsertPt: LastDef); |
1265 | MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /*RenameUses=*/true); |
1266 | |
1267 | // Remove the instruction we're replacing. |
1268 | eraseInstruction(I: M); |
1269 | ++NumMemCpyInstr; |
1270 | return true; |
1271 | } |
1272 | |
1273 | /// We've found that the (upward scanning) memory dependence of \p MemCpy is |
1274 | /// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that |
1275 | /// weren't copied over by \p MemCpy. |
1276 | /// |
1277 | /// In other words, transform: |
1278 | /// \code |
1279 | /// memset(dst, c, dst_size); |
1280 | /// ... |
1281 | /// memcpy(dst, src, src_size); |
1282 | /// \endcode |
1283 | /// into: |
1284 | /// \code |
1285 | /// ... |
1286 | /// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size); |
1287 | /// memcpy(dst, src, src_size); |
1288 | /// \endcode |
1289 | /// |
1290 | /// The memset is sunk to just before the memcpy to ensure that src_size is |
1291 | /// present when emitting the simplified memset. |
1292 | bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, |
1293 | MemSetInst *MemSet, |
1294 | BatchAAResults &BAA) { |
1295 | // We can only transform memset/memcpy with the same destination. |
if (!BAA.isMustAlias(MemSet->getDest(), MemCpy->getDest()))
1297 | return false; |
1298 | |
1299 | // Don't perform the transform if src_size may be zero. In that case, the |
1300 | // transform is essentially a complex no-op and may lead to an infinite |
1301 | // loop if BasicAA is smart enough to understand that dst and dst + src_size |
1302 | // are still MustAlias after the transform. |
1303 | Value *SrcSize = MemCpy->getLength(); |
if (!isKnownNonZero(SrcSize,
                    SimplifyQuery(MemCpy->getDataLayout(), DT, AC, MemCpy)))
1306 | return false; |
1307 | |
1308 | // Check that src and dst of the memcpy aren't the same. While memcpy |
1309 | // operands cannot partially overlap, exact equality is allowed. |
if (isModSet(BAA.getModRefInfo(MemCpy, MemoryLocation::getForSource(MemCpy))))
1311 | return false; |
1312 | |
1313 | // We know that dst up to src_size is not written. We now need to make sure |
1314 | // that dst up to dst_size is not accessed. (If we did not move the memset, |
1315 | // checking for reads would be sufficient.) |
if (accessedBetween(BAA, MemoryLocation::getForDest(MemSet),
                    MSSA->getMemoryAccess(MemSet),
                    MSSA->getMemoryAccess(MemCpy)))
1319 | return false; |
1320 | |
1321 | // Use the same i8* dest as the memcpy, killing the memset dest if different. |
1322 | Value *Dest = MemCpy->getRawDest(); |
1323 | Value *DestSize = MemSet->getLength(); |
1324 | |
if (mayBeVisibleThroughUnwinding(Dest, MemSet, MemCpy))
1326 | return false; |
1327 | |
1328 | // If the sizes are the same, simply drop the memset instead of generating |
1329 | // a replacement with zero size. |
1330 | if (DestSize == SrcSize) { |
eraseInstruction(MemSet);
1332 | return true; |
1333 | } |
1334 | |
1335 | // By default, create an unaligned memset. |
1336 | Align Alignment = Align(1); |
1337 | // If Dest is aligned, and SrcSize is constant, use the minimum alignment |
1338 | // of the sum. |
const Align DestAlign = std::max(MemSet->getDestAlign().valueOrOne(),
                                 MemCpy->getDestAlign().valueOrOne());
if (DestAlign > 1)
if (auto *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
Alignment = commonAlignment(DestAlign, SrcSizeC->getZExtValue());
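// For instance (illustrative numbers only): with a common destination
// alignment of 8 and a constant SrcSize of 4, the trailing memset starts at
// Dest + 4 and gets commonAlignment(8, 4) == 4.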
1344 | |
1345 | IRBuilder<> Builder(MemCpy); |
1346 | |
1347 | // Preserve the debug location of the old memset for the code emitted here |
1348 | // related to the new memset. This is correct according to the rules in |
1349 | // https://llvm.org/docs/HowToUpdateDebugInfo.html about "when to preserve an |
1350 | // instruction location", given that we move the memset within the basic |
1351 | // block. |
1352 | assert(MemSet->getParent() == MemCpy->getParent() && |
1353 | "Preserving debug location based on moving memset within BB." ); |
1354 | Builder.SetCurrentDebugLocation(MemSet->getDebugLoc()); |
1355 | |
1356 | // If the sizes have different types, zext the smaller one. |
1357 | if (DestSize->getType() != SrcSize->getType()) { |
1358 | if (DestSize->getType()->getIntegerBitWidth() > |
1359 | SrcSize->getType()->getIntegerBitWidth()) |
SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
else
DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
1363 | } |
1364 | |
Value *Ule = Builder.CreateICmpULE(DestSize, SrcSize);
Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
Value *MemsetLen = Builder.CreateSelect(
    Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
Instruction *NewMemSet =
    Builder.CreateMemSet(Builder.CreatePtrAdd(Dest, SrcSize),
                         MemSet->getOperand(1), MemsetLen, Alignment);
1372 | |
1373 | assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) && |
1374 | "MemCpy must be a MemoryDef" ); |
1375 | // The new memset is inserted before the memcpy, and it is known that the |
1376 | // memcpy's defining access is the memset about to be removed. |
auto *LastDef =
    cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
auto *NewAccess =
    MSSAU->createMemoryAccessBefore(NewMemSet, nullptr, LastDef);
MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
1382 | |
eraseInstruction(MemSet);
1384 | return true; |
1385 | } |
1386 | |
1387 | /// Determine whether the instruction has undefined content for the given Size, |
1388 | /// either because it was freshly alloca'd or started its lifetime. |
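///
/// As an illustrative sketch (the IR below is hypothetical, not taken from a
/// test), a query against %a here would report undefined contents:
/// \code
///   %a = alloca [16 x i8]
///   call void @llvm.lifetime.start.p0(i64 16, ptr %a)
///   ; any read of up to 16 bytes from %a sees undefined content
/// \endcode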
1389 | static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V, |
1390 | MemoryDef *Def, Value *Size) { |
if (MSSA->isLiveOnEntryDef(Def))
return isa<AllocaInst>(getUnderlyingObject(V));
1393 | |
if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
auto *LTSize = cast<ConstantInt>(II->getArgOperand(0));

if (auto *CSize = dyn_cast<ConstantInt>(Size)) {
if (AA.isMustAlias(V, II->getArgOperand(1)) &&
1400 | LTSize->getZExtValue() >= CSize->getZExtValue()) |
1401 | return true; |
1402 | } |
1403 | |
1404 | // If the lifetime.start covers a whole alloca (as it almost always |
1405 | // does) and we're querying a pointer based on that alloca, then we know |
1406 | // the memory is definitely undef, regardless of how exactly we alias. |
1407 | // The size also doesn't matter, as an out-of-bounds access would be UB. |
if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) {
if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
1410 | const DataLayout &DL = Alloca->getDataLayout(); |
1411 | if (std::optional<TypeSize> AllocaSize = |
1412 | Alloca->getAllocationSize(DL)) |
1413 | if (*AllocaSize == LTSize->getValue()) |
1414 | return true; |
1415 | } |
1416 | } |
1417 | } |
1418 | } |
1419 | |
1420 | return false; |
1421 | } |
1422 | |
1423 | /// Transform memcpy to memset when its source was just memset. |
1424 | /// In other words, turn: |
1425 | /// \code |
1426 | /// memset(dst1, c, dst1_size); |
1427 | /// memcpy(dst2, dst1, dst2_size); |
1428 | /// \endcode |
1429 | /// into: |
1430 | /// \code |
1431 | /// memset(dst1, c, dst1_size); |
1432 | /// memset(dst2, c, dst2_size); |
1433 | /// \endcode |
1434 | /// When dst2_size <= dst1_size. |
1435 | bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, |
1436 | MemSetInst *MemSet, |
1437 | BatchAAResults &BAA) { |
// Make sure this is a memcpy(..., memset(...), ...) pattern, i.e. we are
// memsetting and memcpying from the same address. Otherwise it is hard to
// reason about.
if (!BAA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
return false;
1442 | |
1443 | Value *MemSetSize = MemSet->getLength(); |
1444 | Value *CopySize = MemCpy->getLength(); |
1445 | |
1446 | if (MemSetSize != CopySize) { |
1447 | // Make sure the memcpy doesn't read any more than what the memset wrote. |
1448 | // Don't worry about sizes larger than i64. |
1449 | |
1450 | // A known memset size is required. |
auto *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize);
1452 | if (!CMemSetSize) |
1453 | return false; |
1454 | |
1455 | // A known memcpy size is also required. |
auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
1457 | if (!CCopySize) |
1458 | return false; |
1459 | if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) { |
1460 | // If the memcpy is larger than the memset, but the memory was undef prior |
1461 | // to the memset, we can just ignore the tail. Technically we're only |
1462 | // interested in the bytes from MemSetSize..CopySize here, but as we can't |
1463 | // easily represent this location, we use the full 0..CopySize range. |
MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
bool CanReduceSize = false;
MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
    MemSetAccess->getDefiningAccess(), MemCpyLoc, BAA);
if (auto *MD = dyn_cast<MemoryDef>(Clobber))
if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize))
1471 | CanReduceSize = true; |
1472 | |
1473 | if (!CanReduceSize) |
1474 | return false; |
1475 | CopySize = MemSetSize; |
1476 | } |
1477 | } |
1478 | |
1479 | IRBuilder<> Builder(MemCpy); |
Instruction *NewM =
    Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
                         CopySize, MemCpy->getDestAlign());
auto *LastDef =
    cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, nullptr, LastDef);
MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
1487 | |
1488 | return true; |
1489 | } |
1490 | |
1491 | // Attempts to optimize the pattern whereby memory is copied from an alloca to |
1492 | // another alloca, where the two allocas don't have conflicting mod/ref. If |
1493 | // successful, the two allocas can be merged into one and the transfer can be |
1494 | // deleted. This pattern is generated frequently in Rust, due to the ubiquity of |
1495 | // move operations in that language. |
1496 | // |
1497 | // Once we determine that the optimization is safe to perform, we replace all |
1498 | // uses of the destination alloca with the source alloca. We also "shrink wrap" |
1499 | // the lifetime markers of the single merged alloca to before the first use |
1500 | // and after the last use. Note that the "shrink wrapping" procedure is a safe |
1501 | // transformation only because we restrict the scope of this optimization to |
1502 | // allocas that aren't captured. |
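//
// As a rough illustration (the buffers and size below are hypothetical):
//
//   %src  = alloca [32 x i8]
//   %dest = alloca [32 x i8]
//   ... writes into %src ...
//   call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 32, i1 false)
//   ... only uses of %dest from here on ...
//
// becomes a single %src alloca: every use of %dest is rewritten to %src, and
// both the memcpy and the %dest alloca can then be removed.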
1503 | bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, |
1504 | AllocaInst *DestAlloca, |
1505 | AllocaInst *SrcAlloca, TypeSize Size, |
1506 | BatchAAResults &BAA) { |
1507 | LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n" |
1508 | << *Store << "\n" ); |
1509 | |
1510 | // Make sure the two allocas are in the same address space. |
1511 | if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) { |
1512 | LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n" ); |
1513 | return false; |
1514 | } |
1515 | |
// Check that the copy covers the full allocation and has a static size.
1517 | const DataLayout &DL = DestAlloca->getDataLayout(); |
1518 | std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL); |
1519 | if (!SrcSize || Size != *SrcSize) { |
1520 | LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n" ); |
1521 | return false; |
1522 | } |
1523 | std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL); |
1524 | if (!DestSize || Size != *DestSize) { |
1525 | LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n" ); |
1526 | return false; |
1527 | } |
1528 | |
1529 | if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca()) |
1530 | return false; |
1531 | |
1532 | // Check that src and dest are never captured, unescaped allocas. Also |
1533 | // find the nearest common dominator and postdominator for all users in |
1534 | // order to shrink wrap the lifetimes, and instructions with noalias metadata |
1535 | // to remove them. |
1536 | |
1537 | SmallVector<Instruction *, 4> LifetimeMarkers; |
1538 | SmallSet<Instruction *, 4> NoAliasInstrs; |
1539 | bool SrcNotDom = false; |
1540 | |
// Recursively track the users and check whether any modifying accesses exist.
1542 | auto IsDereferenceableOrNull = [](Value *V, const DataLayout &DL) -> bool { |
1543 | bool CanBeNull, CanBeFreed; |
1544 | return V->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); |
1545 | }; |
1546 | |
1547 | auto CaptureTrackingWithModRef = |
1548 | [&](Instruction *AI, |
1549 | function_ref<bool(Instruction *)> ModRefCallback) -> bool { |
1550 | SmallVector<Instruction *, 8> Worklist; |
Worklist.push_back(AI);
unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
Worklist.reserve(MaxUsesToExplore);
1554 | SmallSet<const Use *, 20> Visited; |
1555 | while (!Worklist.empty()) { |
1556 | Instruction *I = Worklist.back(); |
1557 | Worklist.pop_back(); |
1558 | for (const Use &U : I->uses()) { |
auto *UI = cast<Instruction>(U.getUser());
1560 | // If any use that isn't dominated by SrcAlloca exists, we move src |
1561 | // alloca to the entry before the transformation. |
if (!DT->dominates(SrcAlloca, UI))
1563 | SrcNotDom = true; |
1564 | |
1565 | if (Visited.size() >= MaxUsesToExplore) { |
1566 | LLVM_DEBUG( |
1567 | dbgs() |
1568 | << "Stack Move: Exceeded max uses to see ModRef, bailing\n" ); |
1569 | return false; |
1570 | } |
if (!Visited.insert(&U).second)
1572 | continue; |
1573 | switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { |
1574 | case UseCaptureKind::MAY_CAPTURE: |
1575 | return false; |
1576 | case UseCaptureKind::PASSTHROUGH: |
1577 | // Instructions cannot have non-instruction users. |
Worklist.push_back(UI);
1579 | continue; |
1580 | case UseCaptureKind::NO_CAPTURE: { |
1581 | if (UI->isLifetimeStartOrEnd()) { |
1582 | // We note the locations of these intrinsic calls so that we can |
1583 | // delete them later if the optimization succeeds, this is safe |
1584 | // since both llvm.lifetime.start and llvm.lifetime.end intrinsics |
1585 | // practically fill all the bytes of the alloca with an undefined |
1586 | // value, although conceptually marked as alive/dead. |
int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
if (Size < 0 || Size == DestSize) {
LifetimeMarkers.push_back(UI);
1590 | continue; |
1591 | } |
1592 | } |
if (UI->hasMetadata(LLVMContext::MD_noalias))
NoAliasInstrs.insert(UI);
1595 | if (!ModRefCallback(UI)) |
1596 | return false; |
1597 | } |
1598 | } |
1599 | } |
1600 | } |
1601 | return true; |
1602 | }; |
1603 | |
// Check that Dest has no Mod/Ref from the alloca to the Store, except for
// full-sized lifetime intrinsics. Also collect the Mod/Ref instructions for
// the reachability check.
1607 | ModRefInfo DestModRef = ModRefInfo::NoModRef; |
MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
1609 | SmallVector<BasicBlock *, 8> ReachabilityWorklist; |
1610 | auto DestModRefCallback = [&](Instruction *UI) -> bool { |
1611 | // We don't care about the store itself. |
1612 | if (UI == Store) |
1613 | return true; |
ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc);
DestModRef |= Res;
if (isModOrRefSet(Res)) {
1617 | // Instructions reachability checks. |
1618 | // FIXME: adding the Instruction version isPotentiallyReachableFromMany on |
1619 | // lib/Analysis/CFG.cpp (currently only for BasicBlocks) might be helpful. |
1620 | if (UI->getParent() == Store->getParent()) { |
1621 | // The same block case is special because it's the only time we're |
1622 | // looking within a single block to see which instruction comes first. |
1623 | // Once we start looking at multiple blocks, the first instruction of |
1624 | // the block is reachable, so we only need to determine reachability |
1625 | // between whole blocks. |
1626 | BasicBlock *BB = UI->getParent(); |
1627 | |
1628 | // If A comes before B, then B is definitively reachable from A. |
if (UI->comesBefore(Store))
1630 | return false; |
1631 | |
1632 | // If the user's parent block is entry, no predecessor exists. |
1633 | if (BB->isEntryBlock()) |
1634 | return true; |
1635 | |
1636 | // Otherwise, continue doing the normal per-BB CFG walk. |
ReachabilityWorklist.append(succ_begin(BB), succ_end(BB));
} else {
ReachabilityWorklist.push_back(UI->getParent());
1640 | } |
1641 | } |
1642 | return true; |
1643 | }; |
1644 | |
1645 | if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback)) |
1646 | return false; |
1647 | // Bailout if Dest may have any ModRef before Store. |
if (!ReachabilityWorklist.empty() &&
    isPotentiallyReachableFromMany(ReachabilityWorklist, Store->getParent(),
                                   nullptr, DT, nullptr))
1651 | return false; |
1652 | |
1653 | // Check that, from after the Load to the end of the BB, |
1654 | // - if the dest has any Mod, src has no Ref, and |
1655 | // - if the dest has any Ref, src has no Mod except full-sized lifetimes. |
MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Size));
1657 | |
1658 | auto SrcModRefCallback = [&](Instruction *UI) -> bool { |
1659 | // Any ModRef post-dominated by Load doesn't matter, also Load and Store |
1660 | // themselves can be ignored. |
if (PDT->dominates(Load, UI) || UI == Load || UI == Store)
return true;
ModRefInfo Res = BAA.getModRefInfo(UI, SrcLoc);
if ((isModSet(DestModRef) && isRefSet(Res)) ||
    (isRefSet(DestModRef) && isModSet(Res)))
1666 | return false; |
1667 | |
1668 | return true; |
1669 | }; |
1670 | |
1671 | if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback)) |
1672 | return false; |
1673 | |
1674 | // We can do the transformation. First, move the SrcAlloca to the start of the |
1675 | // BB. |
1676 | if (SrcNotDom) |
SrcAlloca->moveBefore(*SrcAlloca->getParent(),
                      SrcAlloca->getParent()->getFirstInsertionPt());
// Align the allocas appropriately.
SrcAlloca->setAlignment(
    std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
1682 | |
1683 | // Merge the two allocas. |
DestAlloca->replaceAllUsesWith(SrcAlloca);
eraseInstruction(DestAlloca);
1686 | |
1687 | // Drop metadata on the source alloca. |
1688 | SrcAlloca->dropUnknownNonDebugMetadata(); |
1689 | |
1690 | // TODO: Reconstruct merged lifetime markers. |
// Remove all other lifetime markers, if the original lifetime intrinsics
// exist.
1693 | if (!LifetimeMarkers.empty()) { |
1694 | for (Instruction *I : LifetimeMarkers) |
1695 | eraseInstruction(I); |
1696 | } |
1697 | |
1698 | // As this transformation can cause memory accesses that didn't previously |
1699 | // alias to begin to alias one another, we remove !noalias metadata from any |
1700 | // uses of either alloca. This is conservative, but more precision doesn't |
1701 | // seem worthwhile right now. |
1702 | for (Instruction *I : NoAliasInstrs) |
I->setMetadata(LLVMContext::MD_noalias, nullptr);
1704 | |
1705 | LLVM_DEBUG(dbgs() << "Stack Move: Performed staack-move optimization\n" ); |
1706 | NumStackMove++; |
1707 | return true; |
1708 | } |
1709 | |
1710 | static bool isZeroSize(Value *Size) { |
if (auto *I = dyn_cast<Instruction>(Size))
if (auto *Res = simplifyInstruction(I, I->getDataLayout()))
Size = Res;
// Treat undef/poison size like zero.
if (auto *C = dyn_cast<Constant>(Size))
return isa<UndefValue>(C) || C->isNullValue();
1717 | return false; |
1718 | } |
1719 | |
1720 | /// Perform simplification of memcpy's. If we have memcpy A |
1721 | /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite |
1722 | /// B to be a memcpy from X to Z (or potentially a memmove, depending on |
1723 | /// circumstances). This allows later passes to remove the first memcpy |
1724 | /// altogether. |
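///
/// A condensed sketch of the memcpy-memcpy case (pseudocode, names
/// illustrative):
/// \code
///   memcpy(Y, X, N);   // memcpy A
///   ...
///   memcpy(Z, Y, N);   // memcpy B
/// \endcode
/// becomes, when Y is not clobbered in between:
/// \code
///   memcpy(Y, X, N);   // now a candidate for removal by DSE
///   ...
///   memcpy(Z, X, N);   // or a memmove, if Z may overlap X
/// \endcode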
1725 | bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { |
1726 | // We can only optimize non-volatile memcpy's. |
1727 | if (M->isVolatile()) |
1728 | return false; |
1729 | |
1730 | // If the source and destination of the memcpy are the same, then zap it. |
1731 | if (M->getSource() == M->getDest()) { |
1732 | ++BBI; |
eraseInstruction(M);
1734 | return true; |
1735 | } |
1736 | |
1737 | // If the size is zero, remove the memcpy. |
if (isZeroSize(M->getLength())) {
++BBI;
eraseInstruction(M);
1741 | return true; |
1742 | } |
1743 | |
MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
1745 | if (!MA) |
1746 | // Degenerate case: memcpy marked as not accessing memory. |
1747 | return false; |
1748 | |
1749 | // If copying from a constant, try to turn the memcpy into a memset. |
if (auto *GV = dyn_cast<GlobalVariable>(M->getSource()))
if (GV->isConstant() && GV->hasDefinitiveInitializer())
if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
                                     M->getDataLayout())) {
IRBuilder<> Builder(M);
Instruction *NewM = Builder.CreateMemSet(
    M->getRawDest(), ByteVal, M->getLength(), M->getDestAlign(), false);
auto *LastDef = cast<MemoryDef>(MA);
auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, nullptr, LastDef);
MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);

eraseInstruction(M);
1763 | ++NumCpyToSet; |
1764 | return true; |
1765 | } |
1766 | |
1767 | BatchAAResults BAA(*AA); |
1768 | // FIXME: Not using getClobberingMemoryAccess() here due to PR54682. |
1769 | MemoryAccess *AnyClobber = MA->getDefiningAccess(); |
MemoryLocation DestLoc = MemoryLocation::getForDest(M);
const MemoryAccess *DestClobber =
    MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc, BAA);
1773 | |
1774 | // Try to turn a partially redundant memset + memcpy into |
1775 | // smaller memset + memcpy. We don't need the memcpy size for this. |
1776 | // The memcpy must post-dom the memset, so limit this to the same basic |
1777 | // block. A non-local generalization is likely not worthwhile. |
if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
if (auto *MDep = dyn_cast_or_null<MemSetInst>(MD->getMemoryInst()))
if (DestClobber->getBlock() == M->getParent())
if (processMemSetMemCpyDependence(M, MDep, BAA))
return true;

MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
    AnyClobber, MemoryLocation::getForSource(M), BAA);
1786 | |
1787 | // There are five possible optimizations we can do for memcpy: |
// a) memcpy-memcpy xform which exposes redundancy for DSE.
1789 | // b) call-memcpy xform for return slot optimization. |
1790 | // c) memcpy from freshly alloca'd space or space that has just started |
1791 | // its lifetime copies undefined data, and we can therefore eliminate |
1792 | // the memcpy in favor of the data that was already at the destination. |
1793 | // d) memcpy from a just-memset'd source can be turned into memset. |
1794 | // e) elimination of memcpy via stack-move optimization. |
if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
if (Instruction *MI = MD->getMemoryInst()) {
if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
if (auto *C = dyn_cast<CallInst>(MI)) {
if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
                         TypeSize::getFixed(CopySize->getZExtValue()),
                         M->getDestAlign().valueOrOne(), BAA,
                         [C]() -> CallInst * { return C; })) {
LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
                  << " call: " << *C << "\n"
                  << " memcpy: " << *M << "\n");
eraseInstruction(M);
1807 | ++NumMemCpyInstr; |
1808 | return true; |
1809 | } |
1810 | } |
1811 | } |
if (auto *MDep = dyn_cast<MemCpyInst>(MI))
1813 | if (processMemCpyMemCpyDependence(M, MDep, BAA)) |
1814 | return true; |
if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
if (performMemCpyToMemSetOptzn(M, MDep, BAA)) {
LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
eraseInstruction(M);
1819 | ++NumCpyToSet; |
1820 | return true; |
1821 | } |
1822 | } |
1823 | } |
1824 | |
if (hasUndefContents(MSSA, BAA, M->getSource(), MD, M->getLength())) {
LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
eraseInstruction(M);
1828 | ++NumMemCpyInstr; |
1829 | return true; |
1830 | } |
1831 | } |
1832 | |
1833 | // If the transfer is from a stack slot to a stack slot, then we may be able |
1834 | // to perform the stack-move optimization. See the comments in |
1835 | // performStackMoveOptzn() for more details. |
auto *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
if (!DestAlloca)
return false;
auto *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
if (!SrcAlloca)
return false;
ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
if (Len == nullptr)
return false;
if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca,
                          TypeSize::getFixed(Len->getZExtValue()), BAA)) {
// Avoid invalidating the iterator.
BBI = M->getNextNonDebugInstruction()->getIterator();
eraseInstruction(M);
1850 | ++NumMemCpyInstr; |
1851 | return true; |
1852 | } |
1853 | |
1854 | return false; |
1855 | } |
1856 | |
1857 | /// Transforms memmove calls to memcpy calls when the src/dst are guaranteed |
1858 | /// not to alias. |
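///
/// For example (pseudocode sketch; dst/src/n are illustrative), when alias
/// analysis shows the memmove cannot modify its own source:
/// \code
///   memmove(dst, src, n);
/// \endcode
/// is rewritten in place into:
/// \code
///   memcpy(dst, src, n);
/// \endcode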
1859 | bool MemCpyOptPass::processMemMove(MemMoveInst *M) { |
1860 | // See if the source could be modified by this memmove potentially. |
if (isModSet(AA->getModRefInfo(M, MemoryLocation::getForSource(M))))
1862 | return false; |
1863 | |
1864 | LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M |
1865 | << "\n" ); |
1866 | |
1867 | // If not, then we know we can transform this. |
1868 | Type *ArgTys[3] = {M->getRawDest()->getType(), M->getRawSource()->getType(), |
1869 | M->getLength()->getType()}; |
M->setCalledFunction(
    Intrinsic::getDeclaration(M->getModule(), Intrinsic::memcpy, ArgTys));
1872 | |
1873 | // For MemorySSA nothing really changes (except that memcpy may imply stricter |
1874 | // aliasing guarantees). |
1875 | |
1876 | ++NumMoveToCpy; |
1877 | return true; |
1878 | } |
1879 | |
1880 | /// This is called on every byval argument in call sites. |
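///
/// A rough sketch of the rewrite (pseudocode; tmp/src/foo are illustrative):
/// \code
///   memcpy(tmp, src, size);
///   call foo(byval tmp)
/// \endcode
/// can, when src is not written in between, become:
/// \code
///   memcpy(tmp, src, size);   // may now be dead
///   call foo(byval src)
/// \endcode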
1881 | bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { |
1882 | const DataLayout &DL = CB.getDataLayout(); |
1883 | // Find out what feeds this byval argument. |
Value *ByValArg = CB.getArgOperand(ArgNo);
Type *ByValTy = CB.getParamByValType(ArgNo);
TypeSize ByValSize = DL.getTypeAllocSize(ByValTy);
MemoryLocation Loc(ByValArg, LocationSize::precise(ByValSize));
MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB);
if (!CallAccess)
return false;
MemCpyInst *MDep = nullptr;
BatchAAResults BAA(*AA);
MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
    CallAccess->getDefiningAccess(), Loc, BAA);
if (auto *MD = dyn_cast<MemoryDef>(Clobber))
MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst());
1897 | |
1898 | // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by |
1899 | // a memcpy, see if we can byval from the source of the memcpy instead of the |
1900 | // result. |
1901 | if (!MDep || MDep->isVolatile() || |
1902 | ByValArg->stripPointerCasts() != MDep->getDest()) |
1903 | return false; |
1904 | |
// The length of the memcpy must be larger than or equal to the size of the
// byval.
auto *C1 = dyn_cast<ConstantInt>(MDep->getLength());
if (!C1 || !TypeSize::isKnownGE(
               TypeSize::getFixed(C1->getValue().getZExtValue()), ByValSize))
1909 | return false; |
1910 | |
1911 | // Get the alignment of the byval. If the call doesn't specify the alignment, |
1912 | // then it is some target specific value that we can't know. |
1913 | MaybeAlign ByValAlign = CB.getParamAlign(ArgNo); |
1914 | if (!ByValAlign) |
1915 | return false; |
1916 | |
1917 | // If it is greater than the memcpy, then we check to see if we can force the |
1918 | // source of the memcpy to the alignment we need. If we fail, we bail out. |
1919 | MaybeAlign MemDepAlign = MDep->getSourceAlign(); |
if ((!MemDepAlign || *MemDepAlign < *ByValAlign) &&
    getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &CB, AC,
                               DT) < *ByValAlign)
1923 | return false; |
1924 | |
1925 | // The type of the memcpy source must match the byval argument |
1926 | if (MDep->getSource()->getType() != ByValArg->getType()) |
1927 | return false; |
1928 | |
1929 | // Verify that the copied-from memory doesn't change in between the memcpy and |
1930 | // the byval call. |
1931 | // memcpy(a <- b) |
1932 | // *b = 42; |
1933 | // foo(*a) |
1934 | // It would be invalid to transform the second memcpy into foo(*b). |
if (writtenBetween(MSSA, BAA, MemoryLocation::getForSource(MDep),
                   MSSA->getMemoryAccess(MDep), CallAccess))
1937 | return false; |
1938 | |
1939 | LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n" |
1940 | << " " << *MDep << "\n" |
1941 | << " " << CB << "\n" ); |
1942 | |
1943 | // Otherwise we're good! Update the byval argument. |
combineAAMetadata(&CB, MDep);
CB.setArgOperand(ArgNo, MDep->getSource());
1946 | ++NumMemCpyInstr; |
1947 | return true; |
1948 | } |
1949 | |
1950 | /// This is called on memcpy dest pointer arguments attributed as immutable |
1951 | /// during call. Try to use memcpy source directly if all of the following |
1952 | /// conditions are satisfied. |
1953 | /// 1. The memcpy dst is neither modified during the call nor captured by the |
1954 | /// call. (if readonly, noalias, nocapture attributes on call-site.) |
1955 | /// 2. The memcpy dst is an alloca with known alignment & size. |
1956 | /// 2-1. The memcpy length == the alloca size which ensures that the new |
1957 | /// pointer is dereferenceable for the required range |
1958 | /// 2-2. The src pointer has alignment >= the alloca alignment or can be |
1959 | /// enforced so. |
/// 3. The memcpy dst and src are not modified between the memcpy and the call.
///    (if MSSA clobber check is safe.)
1962 | /// 4. The memcpy src is not modified during the call. (ModRef check shows no |
1963 | /// Mod.) |
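///
/// A condensed sketch (pseudocode; %buf, src and foo are illustrative):
/// \code
///   %buf = alloca ...
///   memcpy(%buf, src, sizeof(%buf));
///   call foo(noalias nocapture readonly %buf)
/// \endcode
/// When all of the conditions above hold, the call can read src directly:
/// \code
///   memcpy(%buf, src, sizeof(%buf));   // may now be dead
///   call foo(noalias nocapture readonly src)
/// \endcode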
1964 | bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) { |
1965 | // 1. Ensure passed argument is immutable during call. |
if (!(CB.paramHasAttr(ArgNo, Attribute::NoAlias) &&
      CB.paramHasAttr(ArgNo, Attribute::NoCapture)))
return false;
const DataLayout &DL = CB.getDataLayout();
Value *ImmutArg = CB.getArgOperand(ArgNo);
1971 | |
1972 | // 2. Check that arg is alloca |
1973 | // TODO: Even if the arg gets back to branches, we can remove memcpy if all |
1974 | // the alloca alignments can be enforced to source alignment. |
auto *AI = dyn_cast<AllocaInst>(ImmutArg->stripPointerCasts());
1976 | if (!AI) |
1977 | return false; |
1978 | |
1979 | std::optional<TypeSize> AllocaSize = AI->getAllocationSize(DL); |
1980 | // Can't handle unknown size alloca. |
1981 | // (e.g. Variable Length Array, Scalable Vector) |
1982 | if (!AllocaSize || AllocaSize->isScalable()) |
1983 | return false; |
MemoryLocation Loc(ImmutArg, LocationSize::precise(*AllocaSize));
MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB);
1986 | if (!CallAccess) |
1987 | return false; |
1988 | |
1989 | MemCpyInst *MDep = nullptr; |
1990 | BatchAAResults BAA(*AA); |
MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
    CallAccess->getDefiningAccess(), Loc, BAA);
if (auto *MD = dyn_cast<MemoryDef>(Clobber))
MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst());
1995 | |
1996 | // If the immut argument isn't fed by a memcpy, ignore it. If it is fed by |
1997 | // a memcpy, check that the arg equals the memcpy dest. |
1998 | if (!MDep || MDep->isVolatile() || AI != MDep->getDest()) |
1999 | return false; |
2000 | |
2001 | // The type of the memcpy source must match the immut argument |
2002 | if (MDep->getSource()->getType() != ImmutArg->getType()) |
2003 | return false; |
2004 | |
2005 | // 2-1. The length of the memcpy must be equal to the size of the alloca. |
auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
2007 | if (!MDepLen || AllocaSize != MDepLen->getValue()) |
2008 | return false; |
2009 | |
// 2-2. The memcpy source alignment must be larger than or equal to the
// alloca's alignment. If not, we check to see if we can force the source of
// the memcpy to the alignment we need. If we fail, we bail out.
2013 | Align MemDepAlign = MDep->getSourceAlign().valueOrOne(); |
2014 | Align AllocaAlign = AI->getAlign(); |
2015 | if (MemDepAlign < AllocaAlign && |
    getOrEnforceKnownAlignment(MDep->getSource(), AllocaAlign, DL, &CB, AC,
                               DT) < AllocaAlign)
2018 | return false; |
2019 | |
2020 | // 3. Verify that the source doesn't change in between the memcpy and |
2021 | // the call. |
2022 | // memcpy(a <- b) |
2023 | // *b = 42; |
2024 | // foo(*a) |
2025 | // It would be invalid to transform the second memcpy into foo(*b). |
if (writtenBetween(MSSA, BAA, MemoryLocation::getForSource(MDep),
                   MSSA->getMemoryAccess(MDep), CallAccess))
2028 | return false; |
2029 | |
2030 | // 4. The memcpy src must not be modified during the call. |
if (isModSet(AA->getModRefInfo(&CB, MemoryLocation::getForSource(MDep))))
2032 | return false; |
2033 | |
2034 | LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to Immut src:\n" |
2035 | << " " << *MDep << "\n" |
2036 | << " " << CB << "\n" ); |
2037 | |
2038 | // Otherwise we're good! Update the immut argument. |
combineAAMetadata(&CB, MDep);
CB.setArgOperand(ArgNo, MDep->getSource());
2041 | ++NumMemCpyInstr; |
2042 | return true; |
2043 | } |
2044 | |
2045 | /// Executes one iteration of MemCpyOptPass. |
2046 | bool MemCpyOptPass::iterateOnFunction(Function &F) { |
2047 | bool MadeChange = false; |
2048 | |
// Walk all instructions in the function.
2050 | for (BasicBlock &BB : F) { |
2051 | // Skip unreachable blocks. For example processStore assumes that an |
2052 | // instruction in a BB can't be dominated by a later instruction in the |
2053 | // same BB (which is a scenario that can happen for an unreachable BB that |
2054 | // has itself as a predecessor). |
if (!DT->isReachableFromEntry(&BB))
2056 | continue; |
2057 | |
2058 | for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { |
2059 | // Avoid invalidating the iterator. |
2060 | Instruction *I = &*BI++; |
2061 | |
2062 | bool RepeatInstruction = false; |
2063 | |
if (auto *SI = dyn_cast<StoreInst>(I))
MadeChange |= processStore(SI, BI);
else if (auto *M = dyn_cast<MemSetInst>(I))
RepeatInstruction = processMemSet(M, BI);
else if (auto *M = dyn_cast<MemCpyInst>(I))
RepeatInstruction = processMemCpy(M, BI);
else if (auto *M = dyn_cast<MemMoveInst>(I))
RepeatInstruction = processMemMove(M);
else if (auto *CB = dyn_cast<CallBase>(I)) {
for (unsigned i = 0, e = CB->arg_size(); i != e; ++i) {
if (CB->isByValArgument(i))
MadeChange |= processByValArgument(*CB, i);
else if (CB->onlyReadsMemory(i))
MadeChange |= processImmutArgument(*CB, i);
2078 | } |
2079 | } |
2080 | |
2081 | // Reprocess the instruction if desired. |
2082 | if (RepeatInstruction) { |
2083 | if (BI != BB.begin()) |
2084 | --BI; |
2085 | MadeChange = true; |
2086 | } |
2087 | } |
2088 | } |
2089 | |
2090 | return MadeChange; |
2091 | } |
2092 | |
2093 | PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) { |
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *PDT = &AM.getResult<PostDominatorTreeAnalysis>(F);
auto *MSSA = &AM.getResult<MemorySSAAnalysis>(F);
2100 | |
bool MadeChange = runImpl(F, &TLI, AA, AC, DT, PDT, &MSSA->getMSSA());
2102 | if (!MadeChange) |
2103 | return PreservedAnalyses::all(); |
2104 | |
2105 | PreservedAnalyses PA; |
2106 | PA.preserveSet<CFGAnalyses>(); |
2107 | PA.preserve<MemorySSAAnalysis>(); |
2108 | return PA; |
2109 | } |
2110 | |
2111 | bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_, |
2112 | AliasAnalysis *AA_, AssumptionCache *AC_, |
2113 | DominatorTree *DT_, PostDominatorTree *PDT_, |
2114 | MemorySSA *MSSA_) { |
2115 | bool MadeChange = false; |
2116 | TLI = TLI_; |
2117 | AA = AA_; |
2118 | AC = AC_; |
2119 | DT = DT_; |
2120 | PDT = PDT_; |
2121 | MSSA = MSSA_; |
2122 | MemorySSAUpdater MSSAU_(MSSA_); |
2123 | MSSAU = &MSSAU_; |
2124 | |
2125 | while (true) { |
2126 | if (!iterateOnFunction(F)) |
2127 | break; |
2128 | MadeChange = true; |
2129 | } |
2130 | |
2131 | if (VerifyMemorySSA) |
2132 | MSSA_->verifyMemorySSA(); |
2133 | |
2134 | return MadeChange; |
2135 | } |
2136 | |