MemCpyOptimizer.cpp source code [llvm_projects/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp]

1	//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This pass performs various transformations related to eliminating memcpy
10	// calls, or transforming sets of stores into memset's.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
15	#include "llvm/ADT/DenseSet.h"
16	#include "llvm/ADT/STLExtras.h"
17	#include "llvm/ADT/ScopeExit.h"
18	#include "llvm/ADT/SmallVector.h"
19	#include "llvm/ADT/Statistic.h"
20	#include "llvm/ADT/iterator_range.h"
21	#include "llvm/Analysis/AliasAnalysis.h"
22	#include "llvm/Analysis/AssumptionCache.h"
23	#include "llvm/Analysis/CFG.h"
24	#include "llvm/Analysis/CaptureTracking.h"
25	#include "llvm/Analysis/GlobalsModRef.h"
26	#include "llvm/Analysis/InstructionSimplify.h"
27	#include "llvm/Analysis/Loads.h"
28	#include "llvm/Analysis/MemoryLocation.h"
29	#include "llvm/Analysis/MemorySSA.h"
30	#include "llvm/Analysis/MemorySSAUpdater.h"
31	#include "llvm/Analysis/PostDominators.h"
32	#include "llvm/Analysis/TargetLibraryInfo.h"
33	#include "llvm/Analysis/ValueTracking.h"
34	#include "llvm/IR/BasicBlock.h"
35	#include "llvm/IR/Constants.h"
36	#include "llvm/IR/DataLayout.h"
37	#include "llvm/IR/DerivedTypes.h"
38	#include "llvm/IR/Dominators.h"
39	#include "llvm/IR/Function.h"
40	#include "llvm/IR/GlobalVariable.h"
41	#include "llvm/IR/IRBuilder.h"
42	#include "llvm/IR/InstrTypes.h"
43	#include "llvm/IR/Instruction.h"
44	#include "llvm/IR/Instructions.h"
45	#include "llvm/IR/IntrinsicInst.h"
46	#include "llvm/IR/Intrinsics.h"
47	#include "llvm/IR/LLVMContext.h"
48	#include "llvm/IR/Module.h"
49	#include "llvm/IR/PassManager.h"
50	#include "llvm/IR/Type.h"
51	#include "llvm/IR/User.h"
52	#include "llvm/IR/Value.h"
53	#include "llvm/Support/Casting.h"
54	#include "llvm/Support/Debug.h"
55	#include "llvm/Support/raw_ostream.h"
56	#include "llvm/Transforms/Utils/Local.h"
57	#include <algorithm>
58	#include <cassert>
59	#include <cstdint>
60	#include <optional>
61
62	using namespace llvm;
63
64	#define DEBUG_TYPE "memcpyopt"
65
/// Hidden command-line switch. The checks in this file that consult it treat
/// it as an override for the "is the corresponding libcall available" tests
/// that otherwise gate the introduction of memset/memcpy/memmove intrinsics.
static cl::opt<bool> EnableMemCpyOptWithoutLibcalls(
    "enable-memcpyopt-without-libcalls", cl::Hidden,
    cl::desc("Enable memcpyopt even when libcalls are disabled"));
69
// Pass statistics, reported under DEBUG_TYPE.
STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
STATISTIC(NumMemMoveInstr, "Number of memmove instructions deleted");
STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
STATISTIC(NumStackMove, "Number of stack-move optimizations performed");
77
namespace {

/// Represents a range of memset'd bytes with the ByteVal value.
/// This allows us to analyze stores like:
///   store 0 -> P+1
///   store 0 -> P+0
///   store 0 -> P+3
///   store 0 -> P+2
/// which sometimes happens with stores to arrays of structs etc. Taking each
/// store to be one byte wide: when we see the first store, we make a range
/// [1, 2). The second store extends the range to [0, 2). The third makes a new
/// range [3, 4). The fourth store joins the two ranges into [0, 4) which is
/// memset'able.
struct MemsetRange {
  // Start/End - A semi range that describes the span that this range covers.
  // The range is closed at the start and open at the end: [Start, End).
  int64_t Start, End;

  /// StartPtr - The getelementptr instruction that points to the start of the
  /// range.
  Value *StartPtr;

  /// Alignment - The known alignment of the first store.
  MaybeAlign Alignment;

  /// TheStores - The actual stores that make up this range.
  SmallVector<Instruction *, 16> TheStores;

  /// Heuristic deciding whether replacing TheStores with a single memset is
  /// worthwhile; defined out of line.
  bool isProfitableToUseMemset(const DataLayout &DL) const;
};

} // end anonymous namespace
109
// Forward declaration; the definition is not part of this section of the file.
static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy,
                                  MemIntrinsic *MemSrc, BatchAAResults &BAA);
112
/// Decide whether the stores collected in this range should be replaced by a
/// single memset.
///
/// \param DL data layout, used to query the largest legal integer width.
/// \returns true if the heuristic below considers the memset profitable.
bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
  // If we found at least 4 stores to merge or at least 16 bytes, use memset.
  if (TheStores.size() >= 4 || End - Start >= 16)
    return true;

  // If there is nothing to merge, don't do anything.
  if (TheStores.size() < 2)
    return false;

  // If any of the stores are a memset, then it is always good to extend the
  // memset.
  for (Instruction *SI : TheStores)
    if (!isa<StoreInst>(SI))
      return true;

  // Assume that the code generator is capable of merging pairs of stores
  // together if it wants to.
  if (TheStores.size() == 2)
    return false;

  // Only ranges made of exactly three plain stores covering fewer than 16
  // bytes reach this point. It can still be worthwhile to merge them: for
  // example, merging several i8 stores into one wider store is useful almost
  // always. However, merging 2 32-bit stores isn't useful on a 32-bit
  // architecture (the memset will be split into 2 32-bit stores anyway) and
  // doing so can pessimize the llvm optimizer.
  //
  // Since we don't have perfect knowledge here, make some assumptions: assume
  // the maximum GPR width is the same size as the largest legal integer
  // size. If so, check to see whether we will end up actually reducing the
  // number of stores used.
  unsigned Bytes = unsigned(End - Start);
  unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
  // Guard the division/modulo below when no legal integer width is reported.
  if (MaxIntSize == 0)
    MaxIntSize = 1;
  unsigned NumPointerStores = Bytes / MaxIntSize;

  // Assume the remaining bytes if any are done a byte at a time.
  unsigned NumByteStores = Bytes % MaxIntSize;

  // If we will reduce the # stores (according to this heuristic), do the
  // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
  // etc.
  return TheStores.size() > NumPointerStores + NumByteStores;
}
157
namespace {

/// A collection of MemsetRange entries, each recording the instructions that
/// write it. New writes are folded in through addInst/addStore/addMemSet,
/// which all forward to addRange.
class MemsetRanges {
  using range_iterator = SmallVectorImpl<MemsetRange>::iterator;

  /// A sorted list of the memset ranges.
  SmallVector<MemsetRange, 8> Ranges;

  const DataLayout &DL;

public:
  MemsetRanges(const DataLayout &DL) : DL(DL) {}

  using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;

  const_iterator begin() const { return Ranges.begin(); }
  const_iterator end() const { return Ranges.end(); }
  bool empty() const { return Ranges.empty(); }

  /// Record \p Inst, which must be either a StoreInst or a MemSetInst (the
  /// cast<> below asserts otherwise), at byte offset \p OffsetFromFirst.
  void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
    if (auto *SI = dyn_cast<StoreInst>(Inst))
      addStore(OffsetFromFirst, SI);
    else
      addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
  }

  /// Record a store; the range size is the store size of the stored value's
  /// type, which must not be scalable.
  void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
    TypeSize StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
    assert(!StoreSize.isScalable() && "Can't track scalable-typed stores");
    addRange(OffsetFromFirst, StoreSize.getFixedValue(),
             SI->getPointerOperand(), SI->getAlign(), SI);
  }

  /// Record a memset; its length operand must be a ConstantInt (the cast<>
  /// below asserts otherwise).
  void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
    int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlign(), MSI);
  }

  void addRange(int64_t Start, int64_t Size, Value *Ptr, MaybeAlign Alignment,
                Instruction *Inst);
};

} // end anonymous namespace
201
/// Add a new store to the MemsetRanges data structure.  This adds a
/// new range for the specified store at the specified offset, merging into
/// existing ranges as appropriate.
///
/// \param Start     byte offset of the write.
/// \param Size      number of bytes written; the covered span is
///                  [Start, Start + Size).
/// \param Ptr       pointer operand of the write; becomes the range's StartPtr
///                  when this write defines the new start of a range.
/// \param Alignment alignment recorded together with \p Ptr.
/// \param Inst      the instruction performing the write.
void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
                            MaybeAlign Alignment, Instruction *Inst) {
  int64_t End = Start + Size;

  // Find the first range whose end is not strictly before Start.
  range_iterator I = partition_point(
      Ranges, [=](const MemsetRange &O) { return O.End < Start; });

  // We now know that I == E, in which case we didn't find anything to merge
  // with, or that Start <= I->End.  If End < I->Start or I == E, then we need
  // to insert a new range.  Handle this now.
  if (I == Ranges.end() || End < I->Start) {
    MemsetRange &R = *Ranges.insert(I, MemsetRange());
    R.Start = Start;
    R.End = End;
    R.StartPtr = Ptr;
    R.Alignment = Alignment;
    R.TheStores.push_back(Inst);
    return;
  }

  // This store overlaps with I, add it.
  I->TheStores.push_back(Inst);

  // At this point, we may have an interval that completely contains our store.
  // If so, just add it to the interval and return.
  if (I->Start <= Start && I->End >= End)
    return;

  // Now we know that Start <= I->End and End >= I->Start so the range overlaps
  // but is not entirely contained within the range.

  // See if the range extends the start of the range.  In this case, it couldn't
  // possibly cause it to join the prior range, because otherwise we would have
  // stopped on it.
  if (Start < I->Start) {
    I->Start = Start;
    I->StartPtr = Ptr;
    I->Alignment = Alignment;
  }

  // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
  // is in or right at the end of I), and that End >= I->Start.  Extend I out to
  // End.
  if (End > I->End) {
    I->End = End;
    range_iterator NextI = I;
    while (++NextI != Ranges.end() && End >= NextI->Start) {
      // Merge the range in.
      I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
      if (NextI->End > I->End)
        I->End = NextI->End;
      Ranges.erase(NextI);
      // Restart from I so that the pre-increment in the loop condition lands
      // on the element that now follows I.
      NextI = I;
    }
  }
}
261
262	//===----------------------------------------------------------------------===//
263	// MemCpyOptLegacyPass Pass
264	//===----------------------------------------------------------------------===//
265
// Returns false if V is known not to be observable through unwinding between
// Start and End: either the function cannot throw, or V's underlying object is
// not visible on unwind, or no instruction in [Start, End) may throw.
// Returns true otherwise. Start and End must be in the same basic block.
static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
                                         Instruction *End) {
  assert(Start->getParent() == End->getParent() && "Must be in same block");
  // Function can't unwind, so it also can't be visible through unwinding.
  if (Start->getFunction()->doesNotThrow())
    return false;

  // Object is not visible on unwind.
  // TODO: Support RequiresNoCaptureBeforeUnwind case.
  bool RequiresNoCaptureBeforeUnwind;
  if (isNotVisibleOnUnwind(getUnderlyingObject(V),
                           RequiresNoCaptureBeforeUnwind) &&
      !RequiresNoCaptureBeforeUnwind)
    return false;

  // Check whether there are any unwinding instructions in the range.
  return any_of(make_range(Start->getIterator(), End->getIterator()),
                [](const Instruction &I) { return I.mayThrow(); });
}
287
/// Erase \p I from its parent, first removing it from the MemorySSA updater
/// (MSSAU) and from EEA so that neither keeps referring to the deleted
/// instruction.
void MemCpyOptPass::eraseInstruction(Instruction *I) {
  MSSAU->removeMemoryAccess(I);
  EEA->removeInstruction(I);
  I->eraseFromParent();
}
293
// Check for mod or ref of Loc between Start and End, excluding both boundaries.
// Start and End must be in the same block.
// If SkippedLifetimeStart is provided, skip over one clobbering lifetime.start
// intrinsic and store it inside SkippedLifetimeStart. The caller must
// initialize *SkippedLifetimeStart to nullptr; only the first such intrinsic is
// skipped, a second one counts as an access.
static bool accessedBetween(BatchAAResults &AA, MemoryLocation Loc,
                            const MemoryUseOrDef *Start,
                            const MemoryUseOrDef *End,
                            Instruction **SkippedLifetimeStart = nullptr) {
  assert(Start->getBlock() == End->getBlock() && "Only local supported");
  for (const MemoryAccess &MA :
       make_range(++Start->getIterator(), End->getIterator())) {
    Instruction *I = cast<MemoryUseOrDef>(MA).getMemoryInst();
    if (isModOrRefSet(AA.getModRefInfo(I, Loc))) {
      auto *II = dyn_cast<IntrinsicInst>(I);
      if (II && II->getIntrinsicID() == Intrinsic::lifetime_start &&
          SkippedLifetimeStart && !*SkippedLifetimeStart) {
        *SkippedLifetimeStart = I;
        continue;
      }

      return true;
    }
  }
  return false;
}
319
// Check for mod of Loc between Start and End, excluding both boundaries.
// Start and End can be in different blocks. When End is a MemoryUse and the
// two accesses are in different blocks, this conservatively returns true.
static bool writtenBetween(MemorySSA *MSSA, BatchAAResults &AA,
                           MemoryLocation Loc, const MemoryUseOrDef *Start,
                           const MemoryUseOrDef *End) {
  if (isa<MemoryUse>(End)) {
    // For MemoryUses, getClobberingMemoryAccess may skip non-clobbering writes.
    // Manually check read accesses between Start and End, if they are in the
    // same block, for clobbers. Otherwise assume Loc is clobbered.
    return Start->getBlock() != End->getBlock() ||
           any_of(
               make_range(std::next(Start->getIterator()), End->getIterator()),
               [&AA, Loc](const MemoryAccess &Acc) {
                 // Pure uses cannot modify Loc.
                 if (isa<MemoryUse>(&Acc))
                   return false;
                 Instruction *AccInst =
                     cast<MemoryUseOrDef>(&Acc)->getMemoryInst();
                 return isModSet(AA.getModRefInfo(AccInst, Loc));
               });
  }

  // TODO: Only walk until we hit Start.
  MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
      End->getDefiningAccess(), Loc, AA);
  // Loc is written between the two iff the clobber found walking up from End
  // does not dominate Start.
  return !MSSA->dominates(Clobber, Start);
}
346
/// When scanning forward over instructions, we look for some other patterns to
/// fold away. In particular, this looks for stores to neighboring locations of
/// memory. If it sees enough consecutive ones, it attempts to merge them
/// together into a memcpy/memset.
///
/// \param StartInst the store or memset the scan starts from.
/// \param StartPtr  the pointer written by \p StartInst; offsets of later
///                  writes are computed relative to it.
/// \param ByteVal   the byte value being splatted by \p StartInst.
/// \returns the last memset created, or nullptr if nothing was merged.
Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
                                                 Value *StartPtr,
                                                 Value *ByteVal) {
  const DataLayout &DL = StartInst->getDataLayout();

  // We can't track scalable types
  if (auto *SI = dyn_cast<StoreInst>(StartInst))
    if (DL.getTypeStoreSize(SI->getOperand(0)->getType()).isScalable())
      return nullptr;

  // Okay, so we now have a single store that can be splatable.  Scan to find
  // all subsequent stores of the same value to offset from the same pointer.
  // Join these together into ranges, so we can decide whether contiguous blocks
  // are stored.
  MemsetRanges Ranges(DL);

  BasicBlock::iterator BI(StartInst);

  // Keeps track of the last memory use or def before the insertion point for
  // the new memset. The new MemoryDef for the inserted memsets will be inserted
  // after MemInsertPoint.
  MemoryUseOrDef *MemInsertPoint = nullptr;
  for (++BI; !BI->isTerminator(); ++BI) {
    auto *CurrentAcc =
        cast_or_null<MemoryUseOrDef>(MSSA->getMemoryAccess(&*BI));
    if (CurrentAcc)
      MemInsertPoint = CurrentAcc;

    // Calls that only access inaccessible memory do not block merging
    // accessible stores.
    if (auto *CB = dyn_cast<CallBase>(BI)) {
      if (CB->onlyAccessesInaccessibleMemory())
        continue;
    }

    if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
      // If the instruction is readnone, ignore it, otherwise bail out.  We
      // don't even allow readonly here because we don't want something like:
      // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
      if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
        break;
      continue;
    }

    if (auto *NextStore = dyn_cast<StoreInst>(BI)) {
      // If this is a store, see if we can merge it in.
      if (!NextStore->isSimple())
        break;

      Value *StoredVal = NextStore->getValueOperand();

      // Don't convert stores of non-integral pointer types to memsets (which
      // stores integers).
      if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
        break;

      // We can't track ranges involving scalable types.
      if (DL.getTypeStoreSize(StoredVal->getType()).isScalable())
        break;

      // Check to see if this stored value is of the same byte-splattable value.
      Value *StoredByte = isBytewiseValue(StoredVal, DL);
      // An undef ByteVal adopts the byte of the first splattable store seen.
      if (isa<UndefValue>(ByteVal) && StoredByte)
        ByteVal = StoredByte;
      if (ByteVal != StoredByte)
        break;

      // Check to see if this store is to a constant offset from the start ptr.
      std::optional<int64_t> Offset =
          NextStore->getPointerOperand()->getPointerOffsetFrom(StartPtr, DL);
      if (!Offset)
        break;

      Ranges.addStore(*Offset, NextStore);
    } else {
      auto *MSI = cast<MemSetInst>(BI);

      if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
          !isa<ConstantInt>(MSI->getLength()))
        break;

      // Check to see if this store is to a constant offset from the start ptr.
      std::optional<int64_t> Offset =
          MSI->getDest()->getPointerOffsetFrom(StartPtr, DL);
      if (!Offset)
        break;

      Ranges.addMemSet(*Offset, MSI);
    }
  }

  // If we have no ranges, then we just had a single store with nothing that
  // could be merged in.  This is a very common case of course.
  if (Ranges.empty())
    return nullptr;

  // If we had at least one store that could be merged in, add the starting
  // store as well.  We try to avoid this unless there is at least something
  // interesting as a small compile-time optimization.
  Ranges.addInst(0, StartInst);

  // If we create any memsets, we put it right before the first instruction that
  // isn't part of the memset block.  This ensure that the memset is dominated
  // by any addressing instruction needed by the start of the block.
  IRBuilder<> Builder(&*BI);

  // Now that we have full information about ranges, loop over the ranges and
  // emit memset's for anything big enough to be worthwhile.
  Instruction *AMemSet = nullptr;
  for (const MemsetRange &Range : Ranges) {
    if (Range.TheStores.size() == 1)
      continue;

    // If it is profitable to lower this range to memset, do so now.
    if (!Range.isProfitableToUseMemset(DL))
      continue;

    // Otherwise, we do want to transform this!  Create a new memset.
    // Get the starting pointer of the block.
    StartPtr = Range.StartPtr;

    AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
                                   Range.Alignment);
    AMemSet->mergeDIAssignID(Range.TheStores);

    LLVM_DEBUG({
      dbgs() << "Replace stores:\n";
      for (Instruction *SI : Range.TheStores)
        dbgs() << *SI << '\n';
      dbgs() << "With: " << *AMemSet << '\n';
    });
    if (!Range.TheStores.empty())
      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());

    // Insert the new def before MemInsertPoint when that access belongs to the
    // instruction the builder is positioned at, otherwise after it.
    auto *NewDef = cast<MemoryDef>(
        MemInsertPoint->getMemoryInst() == &*BI
            ? MSSAU->createMemoryAccessBefore(AMemSet, nullptr, MemInsertPoint)
            : MSSAU->createMemoryAccessAfter(AMemSet, nullptr, MemInsertPoint));
    MSSAU->insertDef(NewDef, /*RenameUses=*/true);
    MemInsertPoint = NewDef;

    // Zap all the stores.
    for (Instruction *SI : Range.TheStores)
      eraseInstruction(SI);

    ++NumMemSetInfer;
  }

  return AMemSet;
}
499
// This method try to lift a store instruction before position P.
// It will lift the store and its argument + that anything that
// may alias with these.
// The method returns true if it was successful.
//
// SI is the store to lift, P the instruction to lift it in front of, and LI
// the load whose value SI stores. On failure nothing has been moved.
bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
  // If the store alias this position, early bail out.
  MemoryLocation StoreLoc = MemoryLocation::get(SI);
  if (isModOrRefSet(AA->getModRefInfo(P, StoreLoc)))
    return false;

  // Keep track of the arguments of all instruction we plan to lift
  // so we can make sure to lift them as well if appropriate.
  DenseSet<Instruction *> Args;
  auto AddArg = [&](Value *Arg) {
    auto *I = dyn_cast<Instruction>(Arg);
    if (I && I->getParent() == SI->getParent()) {
      // Cannot hoist user of P above P
      if (I == P)
        return false;
      Args.insert(I);
    }
    return true;
  };
  if (!AddArg(SI->getPointerOperand()))
    return false;

  // Instruction to lift before P.
  SmallVector<Instruction *, 8> ToLift{SI};

  // Memory locations of lifted instructions.
  SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};

  // Lifted calls.
  SmallVector<const CallBase *, 8> Calls;

  const MemoryLocation LoadLoc = MemoryLocation::get(LI);

  // Walk backwards from the instruction just before SI up to, but not
  // including, P.
  for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
    auto *C = &*I;

    // Make sure hoisting does not perform a store that was not guaranteed to
    // happen.
    if (!isGuaranteedToTransferExecutionToSuccessor(C))
      return false;

    bool MayAlias = isModOrRefSet(AA->getModRefInfo(C, std::nullopt));

    bool NeedLift = false;
    if (Args.erase(C))
      NeedLift = true;
    else if (MayAlias) {
      NeedLift = llvm::any_of(MemLocs, [C, this](const MemoryLocation &ML) {
        return isModOrRefSet(AA->getModRefInfo(C, ML));
      });

      if (!NeedLift)
        NeedLift = llvm::any_of(Calls, [C, this](const CallBase *Call) {
          return isModOrRefSet(AA->getModRefInfo(C, Call));
        });
    }

    if (!NeedLift)
      continue;

    if (MayAlias) {
      // Since LI is implicitly moved downwards past the lifted instructions,
      // none of them may modify its source.
      if (isModSet(AA->getModRefInfo(C, LoadLoc)))
        return false;
      else if (const auto *Call = dyn_cast<CallBase>(C)) {
        // If we can't lift this before P, it's game over.
        if (isModOrRefSet(AA->getModRefInfo(P, Call)))
          return false;

        Calls.push_back(Call);
      } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
        // If we can't lift this before P, it's game over.
        auto ML = MemoryLocation::get(C);
        if (isModOrRefSet(AA->getModRefInfo(P, ML)))
          return false;

        MemLocs.push_back(ML);
      } else
        // We don't know how to lift this instruction.
        return false;
    }

    ToLift.push_back(C);
    for (Value *Op : C->operands())
      if (!AddArg(Op))
        return false;
  }

  // Find MSSA insertion point. Normally P will always have a corresponding
  // memory access before which we can insert. However, with non-standard AA
  // pipelines, there may be a mismatch between AA and MSSA, in which case we
  // will scan for a memory access before P. In either case, we know for sure
  // that at least the load will have a memory access.
  // TODO: Simplify this once P will be determined by MSSA, in which case the
  // discrepancy can no longer occur.
  MemoryUseOrDef *MemInsertPoint = nullptr;
  if (MemoryUseOrDef *MA = MSSA->getMemoryAccess(P)) {
    MemInsertPoint = cast<MemoryUseOrDef>(--MA->getIterator());
  } else {
    const Instruction *ConstP = P;
    for (const Instruction &I : make_range(++ConstP->getReverseIterator(),
                                           ++LI->getReverseIterator())) {
      if (MemoryUseOrDef *MA = MSSA->getMemoryAccess(&I)) {
        MemInsertPoint = MA;
        break;
      }
    }
  }

  // We made it, we need to lift.
  // ToLift was filled while walking backwards, so reverse it to keep the
  // lifted instructions in their original relative order.
  for (auto *I : llvm::reverse(ToLift)) {
    LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
    I->moveBefore(P->getIterator());
    assert(MemInsertPoint && "Must have found insert point");
    if (MemoryUseOrDef *MA = MSSA->getMemoryAccess(I)) {
      MSSAU->moveAfter(MA, MemInsertPoint);
      MemInsertPoint = MA;
    }
  }

  return true;
}
627
628	bool MemCpyOptPass::processStoreOfLoad(StoreInst SI, LoadInst LI,
629	const DataLayout &DL,
630	BasicBlock::iterator &BBI) {
631	if (!LI->isSimple() \|\| !LI->hasOneUse() \|\| LI->getParent() != SI->getParent())
632	return false;
633
634	BatchAAResults BAA(*AA, EEA);
635	auto *T = LI->getType();
636	// Don't introduce calls to memcpy/memmove intrinsics out of thin air if
637	// the corresponding libcalls are not available.
638	// TODO: We should really distinguish between libcall availability and
639	// our ability to introduce intrinsics.
640	if (T->isAggregateType() &&
641	(EnableMemCpyOptWithoutLibcalls \|\|
642	(TLI->has(F: LibFunc_memcpy) && TLI->has(F: LibFunc_memmove)))) {
643	MemoryLocation LoadLoc = MemoryLocation::get(LI);
644
645	// We use alias analysis to check if an instruction may store to
646	// the memory we load from in between the load and the store. If
647	// such an instruction is found, we try to promote there instead
648	// of at the store position.
649	// TODO: Can use MSSA for this.
650	Instruction *P = SI;
651	for (auto &I : make_range(x: ++LI->getIterator(), y: SI->getIterator())) {
652	if (isModSet(MRI: BAA.getModRefInfo(I: &I, OptLoc: LoadLoc))) {
653	P = &I;
654	break;
655	}
656	}
657
658	// If we found an instruction that may write to the loaded memory,
659	// we can try to promote at this position instead of the store
660	// position if nothing aliases the store memory after this and the store
661	// destination is not in the range.
662	if (P == SI \|\| moveUp(SI, P, LI)) {
663	// If we load from memory that may alias the memory we store to,
664	// memmove must be used to preserve semantic. If not, memcpy can
665	// be used. Also, if we load from constant memory, memcpy can be used
666	// as the constant memory won't be modified.
667	bool UseMemMove = false;
668	if (isModSet(MRI: AA->getModRefInfo(I: SI, OptLoc: LoadLoc)))
669	UseMemMove = true;
670
671	IRBuilder<> Builder(P);
672	Value *Size =
673	Builder.CreateTypeSize(Ty: Builder.getInt64Ty(), Size: DL.getTypeStoreSize(Ty: T));
674	Instruction *M;
675	if (UseMemMove)
676	M = Builder.CreateMemMove(Dst: SI->getPointerOperand(), DstAlign: SI->getAlign(),
677	Src: LI->getPointerOperand(), SrcAlign: LI->getAlign(),
678	Size);
679	else
680	M = Builder.CreateMemCpy(Dst: SI->getPointerOperand(), DstAlign: SI->getAlign(),
681	Src: LI->getPointerOperand(), SrcAlign: LI->getAlign(), Size);
682	M->copyMetadata(SrcInst: *SI, WL: LLVMContext::MD_DIAssignID);
683
684	LLVM_DEBUG(dbgs() << "Promoting " << LI << " to " << SI << " => " << *M
685	<< "\n");
686
687	auto *LastDef = cast<MemoryDef>(Val: MSSA->getMemoryAccess(I: SI));
688	auto NewAccess = MSSAU->createMemoryAccessAfter(I: M, Definition: nullptr*, InsertPt: LastDef);
689	MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /RenameUses=/true);
690
691	eraseInstruction(I: SI);
692	eraseInstruction(I: LI);
693	++NumMemCpyInstr;
694
695	// Make sure we do not invalidate the iterator.
696	BBI = M->getIterator();
697	return true;
698	}
699	}
700
701	// Detect cases where we're performing call slot forwarding, but
702	// happen to be using a load-store pair to implement it, rather than
703	// a memcpy.
704	auto GetCall = [&]() -> CallInst * {
705	// We defer this expensive clobber walk until the cheap checks
706	// have been done on the source inside performCallSlotOptzn.
707	if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
708	Val: MSSA->getWalker()->getClobberingMemoryAccess(I: LI, AA&: BAA)))
709	return dyn_cast_or_null<CallInst>(Val: LoadClobber->getMemoryInst());
710	return nullptr;
711	};
712
713	bool Changed = performCallSlotOptzn(
714	cpyLoad: LI, cpyStore: SI, cpyDst: SI->getPointerOperand()->stripPointerCasts(),
715	cpySrc: LI->getPointerOperand()->stripPointerCasts(),
716	cpyLen: DL.getTypeStoreSize(Ty: SI->getOperand(i_nocapture: `0`)->getType()),
717	cpyAlign: std::min(a: SI->getAlign(), b: LI->getAlign()), BAA, GetC: GetCall);
718	if (Changed) {
719	eraseInstruction(I: SI);
720	eraseInstruction(I: LI);
721	++NumMemCpyInstr;
722	return true;
723	}
724
725	// If this is a load-store pair from a stack slot to a stack slot, we
726	// might be able to perform the stack-move optimization just as we do for
727	// memcpys from an alloca to an alloca.
728	if (auto *DestAlloca = dyn_cast<AllocaInst>(Val: SI->getPointerOperand())) {
729	if (auto *SrcAlloca = dyn_cast<AllocaInst>(Val: LI->getPointerOperand())) {
730	if (performStackMoveOptzn(Load: LI, Store: SI, DestAlloca, SrcAlloca,
731	Size: DL.getTypeStoreSize(Ty: T), BAA)) {
732	// Avoid invalidating the iterator.
733	BBI = SI->getNextNonDebugInstruction()->getIterator();
734	eraseInstruction(I: SI);
735	eraseInstruction(I: LI);
736	++NumMemCpyInstr;
737	return true;
738	}
739	}
740	}
741
742	return false;
743	}
744
745	bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
746	if (!SI->isSimple())
747	return false;
748
749	// Avoid merging nontemporal stores since the resulting
750	// memcpy/memset would not be able to preserve the nontemporal hint.
751	// In theory we could teach how to propagate the !nontemporal metadata to
752	// memset calls. However, that change would force the backend to
753	// conservatively expand !nontemporal memset calls back to sequences of
754	// store instructions (effectively undoing the merging).
755	if (SI->getMetadata(KindID: LLVMContext::MD_nontemporal))
756	return false;
757
758	const DataLayout &DL = SI->getDataLayout();
759
760	Value *StoredVal = SI->getValueOperand();
761
762	// Not all the transforms below are correct for non-integral pointers, bail
763	// until we've audited the individual pieces.
764	if (DL.isNonIntegralPointerType(Ty: StoredVal->getType()->getScalarType()))
765	return false;
766
767	// Load to store forwarding can be interpreted as memcpy.
768	if (auto *LI = dyn_cast<LoadInst>(Val: StoredVal))
769	return processStoreOfLoad(SI, LI, DL, BBI);
770
771	// The following code creates memset intrinsics out of thin air. Don't do
772	// this if the corresponding libfunc is not available.
773	// TODO: We should really distinguish between libcall availability and
774	// our ability to introduce intrinsics.
775	if (!(TLI->has(F: LibFunc_memset) \|\| EnableMemCpyOptWithoutLibcalls))
776	return false;
777
778	// There are two cases that are interesting for this code to handle: memcpy
779	// and memset. Right now we only handle memset.
780
781	// Ensure that the value being stored is something that can be memset'able a
782	// byte at a time like "0" or "-1" or any width, as well as things like
783	// 0xA0A0A0A0 and 0.0.
784	Value *V = SI->getOperand(i_nocapture: `0`);
785	Value *ByteVal = isBytewiseValue(V, DL);
786	if (!ByteVal)
787	return false;
788
789	if (Instruction *I =
790	tryMergingIntoMemset(StartInst: SI, StartPtr: SI->getPointerOperand(), ByteVal)) {
791	BBI = I->getIterator(); // Don't invalidate iterator.
792	return true;
793	}
794
795	// If we have an aggregate, we try to promote it to memset regardless
796	// of opportunity for merging as it can expose optimization opportunities
797	// in subsequent passes.
798	auto *T = V->getType();
799	if (!T->isAggregateType())
800	return false;
801
802	TypeSize Size = DL.getTypeStoreSize(Ty: T);
803	if (Size.isScalable())
804	return false;
805
806	IRBuilder<> Builder(SI);
807	auto *M = Builder.CreateMemSet(Ptr: SI->getPointerOperand(), Val: ByteVal, Size,
808	Align: SI->getAlign());
809	M->copyMetadata(SrcInst: *SI, WL: LLVMContext::MD_DIAssignID);
810
811	LLVM_DEBUG(dbgs() << "Promoting " << SI << " to " << M << "\n");
812
813	// The newly inserted memset is immediately overwritten by the original
814	// store, so we do not need to rename uses.
815	auto *StoreDef = cast<MemoryDef>(Val: MSSA->getMemoryAccess(I: SI));
816	auto NewAccess = MSSAU->createMemoryAccessBefore(I: M, Definition: nullptr*, InsertPt: StoreDef);
817	MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /RenameUses=/false);
818
819	eraseInstruction(I: SI);
820	NumMemSetInfer ++;
821
822	// Make sure we do not invalidate the iterator.
823	BBI = M->getIterator();
824	return true;
825	}
826
827	bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
828	// See if there is another memset or store neighboring this memset which
829	// allows us to widen out the memset to do a single larger store.
830	if (isa<ConstantInt>(Val: MSI->getLength()) && !MSI->isVolatile())
831	if (Instruction *I =
832	tryMergingIntoMemset(StartInst: MSI, StartPtr: MSI->getDest(), ByteVal: MSI->getValue())) {
833	BBI = I->getIterator(); // Don't invalidate iterator.
834	return true;
835	}
836	return false;
837	}
838
839	/// Takes a memcpy and a call that it depends on,
840	/// and checks for the possibility of a call slot optimization by having
841	/// the call write its result directly into the destination of the memcpy.
842	bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
843	Instruction cpyStore, Value cpyDest,
844	Value *cpySrc, TypeSize cpySize,
845	Align cpyDestAlign,
846	BatchAAResults &BAA,
847	std::function<CallInst *()> GetC) {
848	// The general transformation to keep in mind is
849	//
850	// call @func(..., src, ...)
851	// memcpy(dest, src, ...)
852	//
853	// ->
854	//
855	// memcpy(dest, src, ...)
856	// call @func(..., dest, ...)
857	//
858	// Since moving the memcpy is technically awkward, we additionally check that
859	// src only holds uninitialized values at the moment of the call, meaning that
860	// the memcpy can be discarded rather than moved.
861
862	// We can't optimize scalable types.
863	if (cpySize.isScalable())
864	return false;
865
866	// Require that src be an alloca. This simplifies the reasoning considerably.
867	auto *srcAlloca = dyn_cast<AllocaInst>(Val: cpySrc);
868	if (!srcAlloca)
869	return false;
870
871	ConstantInt *srcArraySize = dyn_cast<ConstantInt>(Val: srcAlloca->getArraySize());
872	if (!srcArraySize)
873	return false;
874
875	const DataLayout &DL = cpyLoad->getDataLayout();
876	TypeSize SrcAllocaSize = DL.getTypeAllocSize(Ty: srcAlloca->getAllocatedType());
877	// We can't optimize scalable types.
878	if (SrcAllocaSize.isScalable())
879	return false;
880	uint64_t srcSize = SrcAllocaSize * srcArraySize->getZExtValue();
881
882	if (cpySize < srcSize)
883	return false;
884
885	CallInst *C = GetC ();
886	if (!C)
887	return false;
888
889	// Lifetime marks shouldn't be operated on.
890	if (Function *F = C->getCalledFunction())
891	if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
892	return false;
893
894	if (C->getParent() != cpyStore->getParent()) {
895	LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n");
896	return false;
897	}
898
899	MemoryLocation DestLoc =
900	isa<StoreInst>(Val: cpyStore)
901	? MemoryLocation::get(Inst: cpyStore)
902	: MemoryLocation::getForDest(MI: cast<MemCpyInst>(Val: cpyStore));
903
904	// Check that nothing touches the dest of the copy between
905	// the call and the store/memcpy.
906	Instruction SkippedLifetimeStart = nullptr*;
907	if (accessedBetween(AA&: BAA, Loc: DestLoc, Start: MSSA->getMemoryAccess(I: C),
908	End: MSSA->getMemoryAccess(I: cpyStore), SkippedLifetimeStart: &SkippedLifetimeStart)) {
909	LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer modified after call\n");
910	return false;
911	}
912
913	// If we need to move a lifetime.start above the call, make sure that we can
914	// actually do so. If the argument is bitcasted for example, we would have to
915	// move the bitcast as well, which we don't handle.
916	if (SkippedLifetimeStart) {
917	auto *LifetimeArg =
918	dyn_cast<Instruction>(Val: SkippedLifetimeStart->getOperand(i: `1`));
919	if (LifetimeArg && LifetimeArg->getParent() == C->getParent() &&
920	C->comesBefore(Other: LifetimeArg))
921	return false;
922	}
923
924	// Check that storing to the first srcSize bytes of dest will not cause a
925	// trap or data race.
926	bool ExplicitlyDereferenceableOnly;
927	if (!isWritableObject(Object: getUnderlyingObject(V: cpyDest),
928	ExplicitlyDereferenceableOnly) \|\|
929	!isDereferenceableAndAlignedPointer(V: cpyDest, Alignment: Align (`1`), Size: APInt (`64`, cpySize),
930	DL, CtxI: C, AC, DT)) {
931	LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer not dereferenceable\n");
932	return false;
933	}
934
935	// Make sure that nothing can observe cpyDest being written early. There are
936	// a number of cases to consider:
937	// 1. cpyDest cannot be accessed between C and cpyStore as a precondition of
938	// the transform.
939	// 2. C itself may not access cpyDest (prior to the transform). This is
940	// checked further below.
941	// 3. If cpyDest is accessible to the caller of this function (potentially
942	// captured and not based on an alloca), we need to ensure that we cannot
943	// unwind between C and cpyStore. This is checked here.
944	// 4. If cpyDest is potentially captured, there may be accesses to it from
945	// another thread. In this case, we need to check that cpyStore is
946	// guaranteed to be executed if C is. As it is a non-atomic access, it
947	// renders accesses from other threads undefined.
948	// TODO: This is currently not checked.
949	if (mayBeVisibleThroughUnwinding(V: cpyDest, Start: C, End: cpyStore)) {
950	LLVM_DEBUG(dbgs() << "Call Slot: Dest may be visible through unwinding\n");
951	return false;
952	}
953
954	// Check that dest points to memory that is at least as aligned as src.
955	Align srcAlign = srcAlloca->getAlign();
956	bool isDestSufficientlyAligned = srcAlign <= cpyDestAlign;
957	// If dest is not aligned enough and we can't increase its alignment then
958	// bail out.
959	if (!isDestSufficientlyAligned && !isa<AllocaInst>(Val: cpyDest)) {
960	LLVM_DEBUG(dbgs() << "Call Slot: Dest not sufficiently aligned\n");
961	return false;
962	}
963
964	// Check that src is not accessed except via the call and the memcpy. This
965	// guarantees that it holds only undefined values when passed in (so the final
966	// memcpy can be dropped), that it is not read or written between the call and
967	// the memcpy, and that writing beyond the end of it is undefined.
968	SmallVector<User *, `8`> srcUseList(srcAlloca->users());
969	while (!srcUseList.empty()) {
970	User *U = srcUseList.pop_back_val();
971
972	if (isa<AddrSpaceCastInst>(Val: U)) {
973	append_range(C&: srcUseList, R: U->users());
974	continue;
975	}
976	if (isa<LifetimeIntrinsic>(Val: U))
977	continue;
978
979	if (U != C && U != cpyLoad) {
980	LLVM_DEBUG(dbgs() << "Call slot: Source accessed by " << *U << "\n");
981	return false;
982	}
983	}
984
985	// Check whether src is captured by the called function, in which case there
986	// may be further indirect uses of src.
987	bool SrcIsCaptured = any_of(Range: C->args(), P: [&](Use &U) {
988	return U ->stripPointerCasts() == cpySrc &&
989	!C->doesNotCapture(OpNo: C->getArgOperandNo(U: &U));
990	});
991
992	// If src is captured, then check whether there are any potential uses of
993	// src through the captured pointer before the lifetime of src ends, either
994	// due to a lifetime.end or a return from the function.
995	if (SrcIsCaptured) {
996	// Check that dest is not captured before/at the call. We have already
997	// checked that src is not captured before it. If either had been captured,
998	// then the call might be comparing the argument against the captured dest
999	// or src pointer.
1000	Value *DestObj = getUnderlyingObject(V: cpyDest);
1001	if (!isIdentifiedFunctionLocal(V: DestObj) \|\|
1002	PointerMayBeCapturedBefore(V: DestObj, / ReturnCaptures / true, I: C, DT,
1003	/ IncludeI / true))
1004	return false;
1005
1006	MemoryLocation SrcLoc =
1007	MemoryLocation (srcAlloca, LocationSize::precise(Value: srcSize));
1008	for (Instruction &I :
1009	make_range(x: ++C->getIterator(), y: C->getParent()->end())) {
1010	// Lifetime of srcAlloca ends at lifetime.end.
1011	if (auto *II = dyn_cast<IntrinsicInst>(Val: &I)) {
1012	if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
1013	II->getArgOperand(i: `1`)->stripPointerCasts() == srcAlloca &&
1014	cast<ConstantInt>(Val: II->getArgOperand(i: `0`))->uge(Num: srcSize))
1015	break;
1016	}
1017
1018	// Lifetime of srcAlloca ends at return.
1019	if (isa<ReturnInst>(Val: &I))
1020	break;
1021
1022	// Ignore the direct read of src in the load.
1023	if (&I == cpyLoad)
1024	continue;
1025
1026	// Check whether this instruction may mod/ref src through the captured
1027	// pointer (we have already any direct mod/refs in the loop above).
1028	// Also bail if we hit a terminator, as we don't want to scan into other
1029	// blocks.
1030	if (isModOrRefSet(MRI: BAA.getModRefInfo(I: &I, OptLoc: SrcLoc)) \|\| I.isTerminator())
1031	return false;
1032	}
1033	}
1034
1035	// Since we're changing the parameter to the callsite, we need to make sure
1036	// that what would be the new parameter dominates the callsite.
1037	bool NeedMoveGEP = false;
1038	if (!DT->dominates(Def: cpyDest, User: C)) {
1039	// Support moving a constant index GEP before the call.
1040	auto *GEP = dyn_cast<GetElementPtrInst>(Val: cpyDest);
1041	if (GEP && GEP->hasAllConstantIndices() &&
1042	DT->dominates(Def: GEP->getPointerOperand(), User: C))
1043	NeedMoveGEP = true;
1044	else
1045	return false;
1046	}
1047
1048	// In addition to knowing that the call does not access src in some
1049	// unexpected manner, for example via a global, which we deduce from
1050	// the use analysis, we also need to know that it does not sneakily
1051	// access dest. We rely on AA to figure this out for us.
1052	MemoryLocation DestWithSrcSize(cpyDest, LocationSize::precise(Value: srcSize));
1053	ModRefInfo MR = BAA.getModRefInfo(I: C, OptLoc: DestWithSrcSize);
1054	// If necessary, perform additional analysis.
1055	if (isModOrRefSet(MRI: MR))
1056	MR = BAA.callCapturesBefore(I: C, MemLoc: DestWithSrcSize, DT);
1057	if (isModOrRefSet(MRI: MR))
1058	return false;
1059
1060	// We can't create address space casts here because we don't know if they're
1061	// safe for the target.
1062	if (cpySrc->getType() != cpyDest->getType())
1063	return false;
1064	for (unsigned ArgI = `0`; ArgI < C->arg_size(); ++ArgI)
1065	if (C->getArgOperand(i: ArgI)->stripPointerCasts() == cpySrc &&
1066	cpySrc->getType() != C->getArgOperand(i: ArgI)->getType())
1067	return false;
1068
1069	// All the checks have passed, so do the transformation.
1070	bool changedArgument = false;
1071	for (unsigned ArgI = `0`; ArgI < C->arg_size(); ++ArgI)
1072	if (C->getArgOperand(i: ArgI)->stripPointerCasts() == cpySrc) {
1073	changedArgument = true;
1074	C->setArgOperand(i: ArgI, v: cpyDest);
1075	}
1076
1077	if (!changedArgument)
1078	return false;
1079
1080	// If the destination wasn't sufficiently aligned then increase its alignment.
1081	if (!isDestSufficientlyAligned) {
1082	assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
1083	cast<AllocaInst>(Val: cpyDest)->setAlignment(srcAlign);
1084	}
1085
1086	if (NeedMoveGEP) {
1087	auto *GEP = dyn_cast<GetElementPtrInst>(Val: cpyDest);
1088	GEP->moveBefore(InsertPos: C->getIterator());
1089	}
1090
1091	if (SkippedLifetimeStart) {
1092	SkippedLifetimeStart->moveBefore(InsertPos: C->getIterator());
1093	MSSAU->moveBefore(What: MSSA->getMemoryAccess(I: SkippedLifetimeStart),
1094	Where: MSSA->getMemoryAccess(I: C));
1095	}
1096
1097	combineAAMetadata(K: C, J: cpyLoad);
1098	if (cpyLoad != cpyStore)
1099	combineAAMetadata(K: C, J: cpyStore);
1100
1101	++NumCallSlot;
1102	return true;
1103	}
1104
1105	/// We've found that the (upward scanning) memory dependence of memcpy 'M' is
1106	/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
1107	bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
1108	MemCpyInst *MDep,
1109	BatchAAResults &BAA) {
1110	// We can only optimize non-volatile memcpy's.
1111	if (MDep->isVolatile())
1112	return false;
1113
1114	// If dep instruction is reading from our current input, then it is a noop
1115	// transfer and substituting the input won't change this instruction. Just
1116	// ignore the input and let someone else zap MDep. This handles cases like:
1117	// memcpy(a <- a)
1118	// memcpy(b <- a)
1119	// This also avoids infinite loops.
1120	if (BAA.isMustAlias(V1: MDep->getDest(), V2: MDep->getSource()))
1121	return false;
1122
1123	int64_t MForwardOffset = `0`;
1124	const DataLayout &DL = M->getModule()->getDataLayout();
1125	// We can only transforms memcpy's where the dest of one is the source of the
1126	// other, or they have an offset in a range.
1127	if (M->getSource() != MDep->getDest()) {
1128	std::optional<int64_t> Offset =
1129	M->getSource()->getPointerOffsetFrom(Other: MDep->getDest(), DL);
1130	if (!Offset \|\| *Offset < `0`)
1131	return false;
1132	MForwardOffset = *Offset;
1133	}
1134
1135	Value *CopyLength = M->getLength();
1136
1137	// The length of the memcpy's must be the same, or the preceding one must be
1138	// larger than the following one, or the contents of the overread must be
1139	// undefined bytes of a defined size.
1140	if (MForwardOffset != `0` \|\| MDep->getLength() != CopyLength) {
1141	auto *MDepLen = dyn_cast<ConstantInt>(Val: MDep->getLength());
1142	auto *MLen = dyn_cast<ConstantInt>(Val: CopyLength);
1143	// This could be converted to a runtime test (%CopyLength =
1144	// min(max(0, MDepLen - MForwardOffset), MLen)), but it is
1145	// unclear if that is useful
1146	if (!MDepLen \|\| !MLen)
1147	return false;
1148	if (MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset) {
1149	if (!overreadUndefContents(MSSA, MemCpy: M, MemSrc: MDep, BAA))
1150	return false;
1151	if (MDepLen->getZExtValue() <= (uint64_t)MForwardOffset)
1152	return false; // Should not reach here (there is obviously no aliasing
1153	// with MDep), so just bail in case it had incomplete info
1154	// somehow
1155	CopyLength = ConstantInt::get(Ty: CopyLength->getType(),
1156	V: MDepLen->getZExtValue() - MForwardOffset);
1157	}
1158	}
1159
1160	IRBuilder<> Builder(M);
1161	auto *CopySource = MDep->getSource();
1162	Instruction NewCopySource = nullptr*;
1163	auto CleanupOnRet = llvm::make_scope_exit(F: [&] {
1164	if (NewCopySource && NewCopySource->use_empty())
1165	// Safety: It's safe here because we will only allocate more instructions
1166	// after finishing all BatchAA queries, but we have to be careful if we
1167	// want to do something like this in another place. Then we'd probably
1168	// have to delay instruction removal until all transforms on an
1169	// instruction finished.
1170	eraseInstruction(I: NewCopySource);
1171	});
1172	MaybeAlign CopySourceAlign = MDep->getSourceAlign();
1173	auto MCopyLoc = MemoryLocation::getForSource(MTI: MDep);
1174	// Truncate the size of the MDep access to just the bytes read
1175	if (MDep->getLength() != CopyLength) {
1176	auto *ConstLength = cast<ConstantInt>(Val: CopyLength);
1177	MCopyLoc = MCopyLoc.getWithNewSize(
1178	NewSize: LocationSize::precise(Value: ConstLength->getZExtValue()));
1179	}
1180
1181	// When the forwarding offset is greater than 0, we transform
1182	// memcpy(d1 <- s1)
1183	// memcpy(d2 <- d1+o)
1184	// to
1185	// memcpy(d2 <- s1+o)
1186	if (MForwardOffset > `0`) {
1187	// The copy destination of `M` maybe can serve as the source of copying.
1188	std::optional<int64_t> MDestOffset =
1189	M->getRawDest()->getPointerOffsetFrom(Other: MDep->getRawSource(), DL);
1190	if (MDestOffset == MForwardOffset)
1191	CopySource = M->getDest();
1192	else {
1193	CopySource = Builder.CreateInBoundsPtrAdd(
1194	Ptr: CopySource, Offset: Builder.getInt64(C: MForwardOffset));
1195	NewCopySource = dyn_cast<Instruction>(Val: CopySource);
1196	}
1197	// We need to update `MCopyLoc` if an offset exists.
1198	MCopyLoc = MCopyLoc.getWithNewPtr(NewPtr: CopySource);
1199	if (CopySourceAlign)
1200	CopySourceAlign = commonAlignment(A: *CopySourceAlign, Offset: MForwardOffset);
1201	}
1202
1203	// Verify that the copied-from memory doesn't change in between the two
1204	// transfers. For example, in:
1205	// memcpy(a <- b)
1206	// b = 42;*
1207	// memcpy(c <- a)
1208	// It would be invalid to transform the second memcpy into memcpy(c <- b).
1209	//
1210	// TODO: If the code between M and MDep is transparent to the destination "c",
1211	// then we could still perform the xform by moving M up to the first memcpy.
1212	if (writtenBetween(MSSA, AA&: BAA, Loc: MCopyLoc, Start: MSSA->getMemoryAccess(I: MDep),
1213	End: MSSA->getMemoryAccess(I: M)))
1214	return false;
1215
1216	// No need to create `memcpy(a <- a)`.
1217	if (BAA.isMustAlias(V1: M->getDest(), V2: CopySource)) {
1218	// Remove the instruction we're replacing.
1219	eraseInstruction(I: M);
1220	++NumMemCpyInstr;
1221	return true;
1222	}
1223
1224	// If the dest of the second might alias the source of the first, then the
1225	// source and dest might overlap. In addition, if the source of the first
1226	// points to constant memory, they won't overlap by definition. Otherwise, we
1227	// still want to eliminate the intermediate value, but we have to generate a
1228	// memmove instead of memcpy.
1229	bool UseMemMove = false;
1230	if (isModSet(MRI: BAA.getModRefInfo(I: M, OptLoc: MemoryLocation::getForSource(MTI: MDep)))) {
1231	// Don't convert llvm.memcpy.inline into memmove because memmove can be
1232	// lowered as a call, and that is not allowed for llvm.memcpy.inline (and
1233	// there is no inline version of llvm.memmove)
1234	if (M->isForceInlined())
1235	return false;
1236	UseMemMove = true;
1237	}
1238
1239	// If all checks passed, then we can transform M.
1240	LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
1241	<< *MDep << `'\n'`
1242	<< *M << `'\n'`);
1243
1244	// TODO: Is this worth it if we're creating a less aligned memcpy? For
1245	// example we could be moving from movaps -> movq on x86.
1246	Instruction *NewM;
1247	if (UseMemMove)
1248	NewM = Builder.CreateMemMove(Dst: M->getDest(), DstAlign: M->getDestAlign(), Src: CopySource,
1249	SrcAlign: CopySourceAlign, Size: CopyLength, isVolatile: M->isVolatile());
1250	else if (M->isForceInlined())
1251	// llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
1252	// never allowed since that would allow the latter to be lowered as a call
1253	// to an external function.
1254	NewM = Builder.CreateMemCpyInline(Dst: M->getDest(), DstAlign: M->getDestAlign(),
1255	Src: CopySource, SrcAlign: CopySourceAlign, Size: CopyLength,
1256	isVolatile: M->isVolatile());
1257	else
1258	NewM = Builder.CreateMemCpy(Dst: M->getDest(), DstAlign: M->getDestAlign(), Src: CopySource,
1259	SrcAlign: CopySourceAlign, Size: CopyLength, isVolatile: M->isVolatile());
1260
1261	NewM->copyMetadata(SrcInst: *M, WL: LLVMContext::MD_DIAssignID);
1262
1263	assert(isa<MemoryDef>(MSSA->getMemoryAccess(M)));
1264	auto *LastDef = cast<MemoryDef>(Val: MSSA->getMemoryAccess(I: M));
1265	auto NewAccess = MSSAU->createMemoryAccessAfter(I: NewM, Definition: nullptr*, InsertPt: LastDef);
1266	MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /RenameUses=/true);
1267
1268	// Remove the instruction we're replacing.
1269	eraseInstruction(I: M);
1270	++NumMemCpyInstr;
1271	return true;
1272	}
1273
1274	/// We've found that the (upward scanning) memory dependence of \p MemCpy is
1275	/// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that
1276	/// weren't copied over by \p MemCpy.
1277	///
1278	/// In other words, transform:
1279	/// \code
1280	/// memset(dst, c, dst_size);
1281	/// ...
1282	/// memcpy(dst, src, src_size);
1283	/// \endcode
1284	/// into:
1285	/// \code
1286	/// ...
1287	/// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
1288	/// memcpy(dst, src, src_size);
1289	/// \endcode
1290	///
1291	/// The memset is sunk to just before the memcpy to ensure that src_size is
1292	/// present when emitting the simplified memset.
1293	bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
1294	MemSetInst *MemSet,
1295	BatchAAResults &BAA) {
1296	// We can only transform memset/memcpy with the same destination.
1297	if (!BAA.isMustAlias(V1: MemSet->getDest(), V2: MemCpy->getDest()))
1298	return false;
1299
1300	// Don't perform the transform if src_size may be zero. In that case, the
1301	// transform is essentially a complex no-op and may lead to an infinite
1302	// loop if BasicAA is smart enough to understand that dst and dst + src_size
1303	// are still MustAlias after the transform.
1304	Value *SrcSize = MemCpy->getLength();
1305	if (!isKnownNonZero(V: SrcSize,
1306	Q: SimplifyQuery (MemCpy->getDataLayout(), DT, AC, MemCpy)))
1307	return false;
1308
1309	// Check that src and dst of the memcpy aren't the same. While memcpy
1310	// operands cannot partially overlap, exact equality is allowed.
1311	if (isModSet(MRI: BAA.getModRefInfo(I: MemCpy, OptLoc: MemoryLocation::getForSource(MTI: MemCpy))))
1312	return false;
1313
1314	// We know that dst up to src_size is not written. We now need to make sure
1315	// that dst up to dst_size is not accessed. (If we did not move the memset,
1316	// checking for reads would be sufficient.)
1317	if (accessedBetween(AA&: BAA, Loc: MemoryLocation::getForDest(MI: MemSet),
1318	Start: MSSA->getMemoryAccess(I: MemSet),
1319	End: MSSA->getMemoryAccess(I: MemCpy)))
1320	return false;
1321
1322	// Use the same i8 dest as the memcpy, killing the memset dest if different.*
1323	Value *Dest = MemCpy->getRawDest();
1324	Value *DestSize = MemSet->getLength();
1325
1326	if (mayBeVisibleThroughUnwinding(V: Dest, Start: MemSet, End: MemCpy))
1327	return false;
1328
1329	// If the sizes are the same, simply drop the memset instead of generating
1330	// a replacement with zero size.
1331	if (DestSize == SrcSize) {
1332	eraseInstruction(I: MemSet);
1333	return true;
1334	}
1335
1336	// By default, create an unaligned memset.
1337	Align Alignment = Align (`1`);
1338	// If Dest is aligned, and SrcSize is constant, use the minimum alignment
1339	// of the sum.
1340	const Align DestAlign = std::max(a: MemSet->getDestAlign().valueOrOne(),
1341	b: MemCpy->getDestAlign().valueOrOne());
1342	if (DestAlign > `1`)
1343	if (auto *SrcSizeC = dyn_cast<ConstantInt>(Val: SrcSize))
1344	Alignment = commonAlignment(A: DestAlign, Offset: SrcSizeC->getZExtValue());
1345
1346	IRBuilder<> Builder(MemCpy);
1347
1348	// Preserve the debug location of the old memset for the code emitted here
1349	// related to the new memset. This is correct according to the rules in
1350	// https://llvm.org/docs/HowToUpdateDebugInfo.html about "when to preserve an
1351	// instruction location", given that we move the memset within the basic
1352	// block.
1353	assert(MemSet->getParent() == MemCpy->getParent() &&
1354	"Preserving debug location based on moving memset within BB.");
1355	Builder.SetCurrentDebugLocation(MemSet->getDebugLoc());
1356
1357	// If the sizes have different types, zext the smaller one.
1358	if (DestSize->getType() != SrcSize->getType()) {
1359	if (DestSize->getType()->getIntegerBitWidth() >
1360	SrcSize->getType()->getIntegerBitWidth())
1361	SrcSize = Builder.CreateZExt(V: SrcSize, DestTy: DestSize->getType());
1362	else
1363	DestSize = Builder.CreateZExt(V: DestSize, DestTy: SrcSize->getType());
1364	}
1365
1366	Value *Ule = Builder.CreateICmpULE(LHS: DestSize, RHS: SrcSize);
1367	Value *SizeDiff = Builder.CreateSub(LHS: DestSize, RHS: SrcSize);
1368	Value *MemsetLen = Builder.CreateSelect(
1369	C: Ule, True: ConstantInt::getNullValue(Ty: DestSize->getType()), False: SizeDiff);
1370	Instruction *NewMemSet =
1371	Builder.CreateMemSet(Ptr: Builder.CreatePtrAdd(Ptr: Dest, Offset: SrcSize),
1372	Val: MemSet->getOperand(i_nocapture: `1`), Size: MemsetLen, Align: Alignment);
1373
1374	assert(isa<MemoryDef>(MSSA->getMemoryAccess(MemCpy)) &&
1375	"MemCpy must be a MemoryDef");
1376	// The new memset is inserted before the memcpy, and it is known that the
1377	// memcpy's defining access is the memset about to be removed.
1378	auto *LastDef = cast<MemoryDef>(Val: MSSA->getMemoryAccess(I: MemCpy));
1379	auto *NewAccess =
1380	MSSAU->createMemoryAccessBefore(I: NewMemSet, Definition: nullptr, InsertPt: LastDef);
1381	MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /RenameUses=/true);
1382
1383	eraseInstruction(I: MemSet);
1384	return true;
1385	}
1386
1387	/// Determine whether the pointer V had only undefined content (due to Def) up
1388	/// to the given Size, either because it was freshly alloca'd or started its
1389	/// lifetime.
1390	static bool hasUndefContents(MemorySSA MSSA, BatchAAResults &AA, Value V,
1391	MemoryDef Def, Value Size) {
1392	if (MSSA->isLiveOnEntryDef(MA: Def))
1393	return isa<AllocaInst>(Val: getUnderlyingObject(V));
1394
1395	if (auto *II = dyn_cast_or_null<IntrinsicInst>(Val: Def->getMemoryInst())) {
1396	if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
1397	auto *LTSize = cast<ConstantInt>(Val: II->getArgOperand(i: `0`));
1398
1399	if (auto *CSize = dyn_cast<ConstantInt>(Val: Size)) {
1400	if (AA.isMustAlias(V1: V, V2: II->getArgOperand(i: `1`)) &&
1401	LTSize->getZExtValue() >= CSize->getZExtValue())
1402	return true;
1403	}
1404
1405	// If the lifetime.start covers a whole alloca (as it almost always
1406	// does) and we're querying a pointer based on that alloca, then we know
1407	// the memory is definitely undef, regardless of how exactly we alias.
1408	// The size also doesn't matter, as an out-of-bounds access would be UB.
1409	if (auto *Alloca = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V))) {
1410	if (getUnderlyingObject(V: II->getArgOperand(i: `1`)) == Alloca) {
1411	const DataLayout &DL = Alloca->getDataLayout();
1412	if (std::optional<TypeSize> AllocaSize =
1413	Alloca->getAllocationSize(DL))
1414	if (*AllocaSize == LTSize->getValue())
1415	return true;
1416	}
1417	}
1418	}
1419	}
1420
1421	return false;
1422	}
1423
1424	// If the memcpy is larger than the previous, but the memory was undef prior to
1425	// that, we can just ignore the tail. Technically we're only interested in the
1426	// bytes from 0..MemSrcOffset and MemSrcLength+MemSrcOffset..CopySize here, but
1427	// as we can't easily represent this location (hasUndefContents uses mustAlias
1428	// which cannot deal with offsets), we use the full 0..CopySize range.
1429	static bool overreadUndefContents(MemorySSA MSSA, MemCpyInst MemCpy,
1430	MemIntrinsic *MemSrc, BatchAAResults &BAA) {
1431	Value *CopySize = MemCpy->getLength();
1432	MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MTI: MemCpy);
1433	MemoryUseOrDef *MemSrcAccess = MSSA->getMemoryAccess(I: MemSrc);
1434	MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
1435	MemSrcAccess->getDefiningAccess(), MemCpyLoc, AA&: BAA);
1436	if (auto *MD = dyn_cast<MemoryDef>(Val: Clobber))
1437	if (hasUndefContents(MSSA, AA&: BAA, V: MemCpy->getSource(), Def: MD, Size: CopySize))
1438	return true;
1439	return false;
1440	}
1441
1442	/// Transform memcpy to memset when its source was just memset.
1443	/// In other words, turn:
1444	/// \code
1445	/// memset(dst1, c, dst1_size);
1446	/// memcpy(dst2, dst1, dst2_size);
1447	/// \endcode
1448	/// into:
1449	/// \code
1450	/// memset(dst1, c, dst1_size);
1451	/// memset(dst2, c, dst2_size);
1452	/// \endcode
1453	/// When dst2_size <= dst1_size.
1454	bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
1455	MemSetInst *MemSet,
1456	BatchAAResults &BAA) {
1457	Value *MemSetSize = MemSet->getLength();
1458	Value *CopySize = MemCpy->getLength();
1459
1460	int64_t MOffset = `0`;
1461	const DataLayout &DL = MemCpy->getModule()->getDataLayout();
1462	// We can only transforms memcpy's where the dest of one is the source of the
1463	// other, or they have a known offset.
1464	if (MemCpy->getSource() != MemSet->getDest()) {
1465	std::optional<int64_t> Offset =
1466	MemCpy->getSource()->getPointerOffsetFrom(Other: MemSet->getDest(), DL);
1467	if (!Offset \|\| *Offset < `0`)
1468	return false;
1469	MOffset = *Offset;
1470	}
1471
1472	if (MOffset != `0` \|\| MemSetSize != CopySize) {
1473	// Make sure the memcpy doesn't read any more than what the memset wrote,
1474	// other than undef. Don't worry about sizes larger than i64.
1475	auto *CMemSetSize = dyn_cast<ConstantInt>(Val: MemSetSize);
1476	auto *CCopySize = dyn_cast<ConstantInt>(Val: CopySize);
1477	if (!CMemSetSize \|\| !CCopySize \|\|
1478	CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) {
1479	if (!overreadUndefContents(MSSA, MemCpy, MemSrc: MemSet, BAA))
1480	return false;
1481
1482	if (CMemSetSize && CCopySize) {
1483	// If both have constant sizes and offsets, clip the memcpy to the
1484	// bounds of the memset if applicable.
1485	assert(CCopySize->getZExtValue() + MOffset >
1486	CMemSetSize->getZExtValue());
1487	if (MOffset == `0`)
1488	CopySize = MemSetSize;
1489	else
1490	CopySize =
1491	ConstantInt::get(Ty: CopySize->getType(),
1492	V: CMemSetSize->getZExtValue() <= (uint64_t)MOffset
1493	? `0`
1494	: CMemSetSize->getZExtValue() - MOffset);
1495	}
1496	}
1497	}
1498
1499	IRBuilder<> Builder(MemCpy);
1500	Instruction *NewM =
1501	Builder.CreateMemSet(Ptr: MemCpy->getRawDest(), Val: MemSet->getOperand(i_nocapture: `1`),
1502	Size: CopySize, Align: MemCpy->getDestAlign());
1503	auto *LastDef = cast<MemoryDef>(Val: MSSA->getMemoryAccess(I: MemCpy));
1504	auto NewAccess = MSSAU->createMemoryAccessAfter(I: NewM, Definition: nullptr*, InsertPt: LastDef);
1505	MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /RenameUses=/true);
1506
1507	return true;
1508	}
1509
1510	// Attempts to optimize the pattern whereby memory is copied from an alloca to
1511	// another alloca, where the two allocas don't have conflicting mod/ref. If
1512	// successful, the two allocas can be merged into one and the transfer can be
1513	// deleted. This pattern is generated frequently in Rust, due to the ubiquity of
1514	// move operations in that language.
1515	//
1516	// Once we determine that the optimization is safe to perform, we replace all
1517	// uses of the destination alloca with the source alloca. We also "shrink wrap"
1518	// the lifetime markers of the single merged alloca to before the first use
1519	// and after the last use. Note that the "shrink wrapping" procedure is a safe
1520	// transformation only because we restrict the scope of this optimization to
1521	// allocas that aren't captured.
1522	bool MemCpyOptPass::performStackMoveOptzn(Instruction Load, Instruction Store,
1523	AllocaInst *DestAlloca,
1524	AllocaInst *SrcAlloca, TypeSize Size,
1525	BatchAAResults &BAA) {
1526	LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
1527	<< *Store << "\n");
1528
1529	// Make sure the two allocas are in the same address space.
1530	if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) {
1531	LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n");
1532	return false;
1533	}
1534
1535	// Check that copy is full with static size.
1536	const DataLayout &DL = DestAlloca->getDataLayout();
1537	std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
1538	if (!SrcSize \|\| Size != *SrcSize) {
1539	LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
1540	return false;
1541	}
1542	std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
1543	if (!DestSize \|\| Size != *DestSize) {
1544	LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
1545	return false;
1546	}
1547
1548	if (!SrcAlloca->isStaticAlloca() \|\| !DestAlloca->isStaticAlloca())
1549	return false;
1550
1551	// Check that src and dest are never captured, unescaped allocas. Also
1552	// find the nearest common dominator and postdominator for all users in
1553	// order to shrink wrap the lifetimes, and instructions with noalias metadata
1554	// to remove them.
1555
1556	SmallVector<Instruction *, `4`> LifetimeMarkers;
1557	SmallSet<Instruction *, `4`> AAMetadataInstrs;
1558	bool SrcNotDom = false;
1559
1560	auto CaptureTrackingWithModRef =
1561	[&](Instruction *AI,
1562	function_ref<bool(Instruction )> ModRefCallback) -> bool* {
1563	SmallVector<Instruction *, `8`> Worklist;
1564	Worklist.push_back(Elt: AI);
1565	unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
1566	Worklist.reserve(N: MaxUsesToExplore);
1567	SmallSet<const Use *, `20`> Visited;
1568	while (!Worklist.empty()) {
1569	Instruction *I = Worklist.pop_back_val();
1570	for (const Use &U : I->uses()) {
1571	auto *UI = cast<Instruction>(Val: U.getUser());
1572	// If any use that isn't dominated by SrcAlloca exists, we move src
1573	// alloca to the entry before the transformation.
1574	if (!DT->dominates(Def: SrcAlloca, User: UI))
1575	SrcNotDom = true;
1576
1577	if (Visited.size() >= MaxUsesToExplore) {
1578	LLVM_DEBUG(
1579	dbgs()
1580	<< "Stack Move: Exceeded max uses to see ModRef, bailing\n");
1581	return false;
1582	}
1583	if (!Visited.insert(Ptr: &U).second)
1584	continue;
1585	UseCaptureInfo CI = DetermineUseCaptureKind(U, Base: AI);
1586	if (capturesAnything(CC: CI.UseCC))
1587	return false;
1588
1589	if (UI->mayReadOrWriteMemory()) {
1590	if (UI->isLifetimeStartOrEnd()) {
1591	// We note the locations of these intrinsic calls so that we can
1592	// delete them later if the optimization succeeds, this is safe
1593	// since both llvm.lifetime.start and llvm.lifetime.end intrinsics
1594	// practically fill all the bytes of the alloca with an undefined
1595	// value, although conceptually marked as alive/dead.
1596	int64_t Size = cast<ConstantInt>(Val: UI->getOperand(i: `0`))->getSExtValue();
1597	if (Size < `0` \|\| Size == DestSize) {
1598	LifetimeMarkers.push_back(Elt: UI);
1599	continue;
1600	}
1601	}
1602	AAMetadataInstrs.insert(Ptr: UI);
1603
1604	if (!ModRefCallback (UI))
1605	return false;
1606	}
1607
1608	if (capturesAnything(CC: CI.ResultCC)) {
1609	Worklist.push_back(Elt: UI);
1610	continue;
1611	}
1612	}
1613	}
1614	return true;
1615	};
1616
1617	// Check that dest has no Mod/Ref, from the alloca to the Store, except full
1618	// size lifetime intrinsics. And collect modref inst for the reachability
1619	// check.
1620	ModRefInfo DestModRef = ModRefInfo::NoModRef;
1621	MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Value: Size));
1622	SmallVector<BasicBlock *, `8`> ReachabilityWorklist;
1623	auto DestModRefCallback = [&](Instruction UI) -> bool* {
1624	// We don't care about the store itself.
1625	if (UI == Store)
1626	return true;
1627	ModRefInfo Res = BAA.getModRefInfo(I: UI, OptLoc: DestLoc);
1628	DestModRef \|= Res;
1629	if (isModOrRefSet(MRI: Res)) {
1630	// Instructions reachability checks.
1631	// FIXME: adding the Instruction version isPotentiallyReachableFromMany on
1632	// lib/Analysis/CFG.cpp (currently only for BasicBlocks) might be helpful.
1633	if (UI->getParent() == Store->getParent()) {
1634	// The same block case is special because it's the only time we're
1635	// looking within a single block to see which instruction comes first.
1636	// Once we start looking at multiple blocks, the first instruction of
1637	// the block is reachable, so we only need to determine reachability
1638	// between whole blocks.
1639	BasicBlock *BB = UI->getParent();
1640
1641	// If A comes before B, then B is definitively reachable from A.
1642	if (UI->comesBefore(Other: Store))
1643	return false;
1644
1645	// If the user's parent block is entry, no predecessor exists.
1646	if (BB->isEntryBlock())
1647	return true;
1648
1649	// Otherwise, continue doing the normal per-BB CFG walk.
1650	ReachabilityWorklist.append(in_start: succ_begin(BB), in_end: succ_end(BB));
1651	} else {
1652	ReachabilityWorklist.push_back(Elt: UI->getParent());
1653	}
1654	}
1655	return true;
1656	};
1657
1658	if (!CaptureTrackingWithModRef (DestAlloca, DestModRefCallback))
1659	return false;
1660	// Bailout if Dest may have any ModRef before Store.
1661	if (!ReachabilityWorklist.empty() &&
1662	isPotentiallyReachableFromMany(Worklist&: ReachabilityWorklist, StopBB: Store->getParent(),
1663	ExclusionSet: nullptr, DT, LI: nullptr))
1664	return false;
1665
1666	// Check that, from after the Load to the end of the BB,
1667	// - if the dest has any Mod, src has no Ref, and
1668	// - if the dest has any Ref, src has no Mod except full-sized lifetimes.
1669	MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Value: Size));
1670
1671	auto SrcModRefCallback = [&](Instruction UI) -> bool* {
1672	// Any ModRef post-dominated by Load doesn't matter, also Load and Store
1673	// themselves can be ignored.
1674	if (PDT->dominates(I1: Load, I2: UI) \|\| UI == Load \|\| UI == Store)
1675	return true;
1676	ModRefInfo Res = BAA.getModRefInfo(I: UI, OptLoc: SrcLoc);
1677	if ((isModSet(MRI: DestModRef) && isRefSet(MRI: Res)) \|\|
1678	(isRefSet(MRI: DestModRef) && isModSet(MRI: Res)))
1679	return false;
1680
1681	return true;
1682	};
1683
1684	if (!CaptureTrackingWithModRef (SrcAlloca, SrcModRefCallback))
1685	return false;
1686
1687	// We can do the transformation. First, move the SrcAlloca to the start of the
1688	// BB.
1689	if (SrcNotDom)
1690	SrcAlloca->moveBefore(BB&: *SrcAlloca->getParent(),
1691	I: SrcAlloca->getParent()->getFirstInsertionPt());
1692	// Align the allocas appropriately.
1693	SrcAlloca->setAlignment(
1694	std::max(a: SrcAlloca->getAlign(), b: DestAlloca->getAlign()));
1695
1696	// Merge the two allocas.
1697	DestAlloca->replaceAllUsesWith(V: SrcAlloca);
1698	eraseInstruction(I: DestAlloca);
1699
1700	// Drop metadata on the source alloca.
1701	SrcAlloca->dropUnknownNonDebugMetadata();
1702
1703	// TODO: Reconstruct merged lifetime markers.
1704	// Remove all other lifetime markers. if the original lifetime intrinsics
1705	// exists.
1706	if (!LifetimeMarkers.empty()) {
1707	for (Instruction *I : LifetimeMarkers)
1708	eraseInstruction(I);
1709	}
1710
1711	// As this transformation can cause memory accesses that didn't previously
1712	// alias to begin to alias one another, we remove !alias.scope, !noalias,
1713	// !tbaa and !tbaa_struct metadata from any uses of either alloca.
1714	// This is conservative, but more precision doesn't seem worthwhile
1715	// right now.
1716	for (Instruction *I : AAMetadataInstrs) {
1717	I->setMetadata(KindID: LLVMContext::MD_alias_scope, Node: nullptr);
1718	I->setMetadata(KindID: LLVMContext::MD_noalias, Node: nullptr);
1719	I->setMetadata(KindID: LLVMContext::MD_tbaa, Node: nullptr);
1720	I->setMetadata(KindID: LLVMContext::MD_tbaa_struct, Node: nullptr);
1721	}
1722
1723	LLVM_DEBUG(dbgs() << "Stack Move: Performed staack-move optimization\n");
1724	NumStackMove ++;
1725	return true;
1726	}
1727
1728	static bool isZeroSize(Value *Size) {
1729	if (auto *I = dyn_cast<Instruction>(Val: Size))
1730	if (auto *Res = simplifyInstruction(I, Q: I->getDataLayout()))
1731	Size = Res;
1732	// Treat undef/poison size like zero.
1733	if (auto *C = dyn_cast<Constant>(Val: Size))
1734	return isa<UndefValue>(Val: C) \|\| C->isNullValue();
1735	return false;
1736	}
1737
1738	/// Perform simplification of memcpy's. If we have memcpy A
1739	/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
1740	/// B to be a memcpy from X to Z (or potentially a memmove, depending on
1741	/// circumstances). This allows later passes to remove the first memcpy
1742	/// altogether.
1743	bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
1744	// We can only optimize non-volatile memcpy's.
1745	if (M->isVolatile())
1746	return false;
1747
1748	// If the source and destination of the memcpy are the same, then zap it.
1749	if (M->getSource() == M->getDest()) {
1750	++BBI;
1751	eraseInstruction(I: M);
1752	return true;
1753	}
1754
1755	// If the size is zero, remove the memcpy.
1756	if (isZeroSize(Size: M->getLength())) {
1757	++BBI;
1758	eraseInstruction(I: M);
1759	return true;
1760	}
1761
1762	MemoryUseOrDef *MA = MSSA->getMemoryAccess(I: M);
1763	if (!MA)
1764	// Degenerate case: memcpy marked as not accessing memory.
1765	return false;
1766
1767	// If copying from a constant, try to turn the memcpy into a memset.
1768	if (auto *GV = dyn_cast<GlobalVariable>(Val: M->getSource()))
1769	if (GV->isConstant() && GV->hasDefinitiveInitializer())
1770	if (Value *ByteVal = isBytewiseValue(V: GV->getInitializer(),
1771	DL: M->getDataLayout())) {
1772	IRBuilder<> Builder(M);
1773	Instruction *NewM = Builder.CreateMemSet(
1774	Ptr: M->getRawDest(), Val: ByteVal, Size: M->getLength(), Align: M->getDestAlign(), isVolatile: false);
1775	auto *LastDef = cast<MemoryDef>(Val: MA);
1776	auto *NewAccess =
1777	MSSAU->createMemoryAccessAfter(I: NewM, Definition: nullptr, InsertPt: LastDef);
1778	MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /RenameUses=/true);
1779
1780	eraseInstruction(I: M);
1781	++NumCpyToSet;
1782	return true;
1783	}
1784
1785	BatchAAResults BAA(*AA, EEA);
1786	// FIXME: Not using getClobberingMemoryAccess() here due to PR54682.
1787	MemoryAccess *AnyClobber = MA->getDefiningAccess();
1788	MemoryLocation DestLoc = MemoryLocation::getForDest(MI: M);
1789	const MemoryAccess *DestClobber =
1790	MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc, AA&: BAA);
1791
1792	// Try to turn a partially redundant memset + memcpy into
1793	// smaller memset + memcpy. We don't need the memcpy size for this.
1794	// The memcpy must post-dom the memset, so limit this to the same basic
1795	// block. A non-local generalization is likely not worthwhile.
1796	if (auto *MD = dyn_cast<MemoryDef>(Val: DestClobber))
1797	if (auto *MDep = dyn_cast_or_null<MemSetInst>(Val: MD->getMemoryInst()))
1798	if (DestClobber->getBlock() == M->getParent())
1799	if (processMemSetMemCpyDependence(MemCpy: M, MemSet: MDep, BAA))
1800	return true;
1801
1802	MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
1803	AnyClobber, MemoryLocation::getForSource(MTI: M), AA&: BAA);
1804
1805	// There are five possible optimizations we can do for memcpy:
1806	// a) memcpy-memcpy xform which exposes redundance for DSE.
1807	// b) call-memcpy xform for return slot optimization.
1808	// c) memcpy from freshly alloca'd space or space that has just started
1809	// its lifetime copies undefined data, and we can therefore eliminate
1810	// the memcpy in favor of the data that was already at the destination.
1811	// d) memcpy from a just-memset'd source can be turned into memset.
1812	// e) elimination of memcpy via stack-move optimization.
1813	if (auto *MD = dyn_cast<MemoryDef>(Val: SrcClobber)) {
1814	if (Instruction *MI = MD->getMemoryInst()) {
1815	if (auto *CopySize = dyn_cast<ConstantInt>(Val: M->getLength())) {
1816	if (auto *C = dyn_cast<CallInst>(Val: MI)) {
1817	if (performCallSlotOptzn(cpyLoad: M, cpyStore: M, cpyDest: M->getDest(), cpySrc: M->getSource(),
1818	cpySize: TypeSize::getFixed(ExactSize: CopySize->getZExtValue()),
1819	cpyDestAlign: M->getDestAlign().valueOrOne(), BAA,
1820	GetC: [C]() -> CallInst * { return C; })) {
1821	LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
1822	<< " call: " << *C << "\n"
1823	<< " memcpy: " << *M << "\n");
1824	eraseInstruction(I: M);
1825	++NumMemCpyInstr;
1826	return true;
1827	}
1828	}
1829	}
1830	if (auto *MDep = dyn_cast<MemCpyInst>(Val: MI))
1831	if (processMemCpyMemCpyDependence(M, MDep, BAA))
1832	return true;
1833	if (auto *MDep = dyn_cast<MemSetInst>(Val: MI)) {
1834	if (performMemCpyToMemSetOptzn(MemCpy: M, MemSet: MDep, BAA)) {
1835	LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
1836	eraseInstruction(I: M);
1837	++NumCpyToSet;
1838	return true;
1839	}
1840	}
1841	}
1842
1843	if (hasUndefContents(MSSA, AA&: BAA, V: M->getSource(), Def: MD, Size: M->getLength())) {
1844	LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
1845	eraseInstruction(I: M);
1846	++NumMemCpyInstr;
1847	return true;
1848	}
1849	}
1850
1851	// If the transfer is from a stack slot to a stack slot, then we may be able
1852	// to perform the stack-move optimization. See the comments in
1853	// performStackMoveOptzn() for more details.
1854	auto *DestAlloca = dyn_cast<AllocaInst>(Val: M->getDest());
1855	if (!DestAlloca)
1856	return false;
1857	auto *SrcAlloca = dyn_cast<AllocaInst>(Val: M->getSource());
1858	if (!SrcAlloca)
1859	return false;
1860	ConstantInt *Len = dyn_cast<ConstantInt>(Val: M->getLength());
1861	if (Len == nullptr)
1862	return false;
1863	if (performStackMoveOptzn(Load: M, Store: M, DestAlloca, SrcAlloca,
1864	Size: TypeSize::getFixed(ExactSize: Len->getZExtValue()), BAA)) {
1865	// Avoid invalidating the iterator.
1866	BBI = M->getNextNonDebugInstruction()->getIterator();
1867	eraseInstruction(I: M);
1868	++NumMemCpyInstr;
1869	return true;
1870	}
1871
1872	return false;
1873	}
1874
1875	/// Memmove calls with overlapping src/dest buffers that come after a memset may
1876	/// be removed.
1877	bool MemCpyOptPass::isMemMoveMemSetDependency(MemMoveInst *M) {
1878	const auto &DL = M->getDataLayout();
1879	MemoryUseOrDef *MemMoveAccess = MSSA->getMemoryAccess(I: M);
1880	if (!MemMoveAccess)
1881	return false;
1882
1883	// The memmove is of form memmove(x, x + A, B).
1884	MemoryLocation SourceLoc = MemoryLocation::getForSource(MTI: M);
1885	auto *MemMoveSourceOp = M->getSource();
1886	auto *Source = dyn_cast<GEPOperator>(Val: MemMoveSourceOp);
1887	if (!Source)
1888	return false;
1889
1890	APInt Offset(DL.getIndexTypeSizeInBits(Ty: Source->getType()), `0`);
1891	LocationSize MemMoveLocSize = SourceLoc.Size;
1892	if (Source->getPointerOperand() != M->getDest() \|\|
1893	!MemMoveLocSize.hasValue() \|\|
1894	!Source->accumulateConstantOffset(DL, Offset) \|\| Offset.isNegative()) {
1895	return false;
1896	}
1897
1898	uint64_t MemMoveSize = MemMoveLocSize.getValue();
1899	LocationSize TotalSize =
1900	LocationSize::precise(Value: Offset.getZExtValue() + MemMoveSize);
1901	MemoryLocation CombinedLoc(M->getDest(), TotalSize);
1902
1903	// The first dominating clobbering MemoryAccess for the combined location
1904	// needs to be a memset.
1905	BatchAAResults BAA(*AA);
1906	MemoryAccess *FirstDef = MemMoveAccess->getDefiningAccess();
1907	auto *DestClobber = dyn_cast<MemoryDef>(
1908	Val: MSSA->getWalker()->getClobberingMemoryAccess(FirstDef, CombinedLoc, AA&: BAA));
1909	if (!DestClobber)
1910	return false;
1911
1912	auto *MS = dyn_cast_or_null<MemSetInst>(Val: DestClobber->getMemoryInst());
1913	if (!MS)
1914	return false;
1915
1916	// Memset length must be sufficiently large.
1917	auto *MemSetLength = dyn_cast<ConstantInt>(Val: MS->getLength());
1918	if (!MemSetLength \|\| MemSetLength->getZExtValue() < MemMoveSize)
1919	return false;
1920
1921	// The destination buffer must have been memset'd.
1922	if (!BAA.isMustAlias(V1: MS->getDest(), V2: M->getDest()))
1923	return false;
1924
1925	return true;
1926	}
1927
1928	/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
1929	/// not to alias.
1930	bool MemCpyOptPass::processMemMove(MemMoveInst *M, BasicBlock::iterator &BBI) {
1931	// See if the source could be modified by this memmove potentially.
1932	if (isModSet(MRI: AA->getModRefInfo(I: M, OptLoc: MemoryLocation::getForSource(MTI: M)))) {
1933	// On the off-chance the memmove clobbers src with previously memset'd
1934	// bytes, the memmove may be redundant.
1935	if (!M->isVolatile() && isMemMoveMemSetDependency(M)) {
1936	LLVM_DEBUG(dbgs() << "Removed redundant memmove.\n");
1937	++BBI;
1938	eraseInstruction(I: M);
1939	++NumMemMoveInstr;
1940	return true;
1941	}
1942	return false;
1943	}
1944
1945	LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
1946	<< "\n");
1947
1948	// If not, then we know we can transform this.
1949	Type *ArgTys[`3`] = {M->getRawDest()->getType(), M->getRawSource()->getType(),
1950	M->getLength()->getType()};
1951	M->setCalledFunction(Intrinsic::getOrInsertDeclaration(
1952	M: M->getModule(), id: Intrinsic::memcpy, Tys: ArgTys));
1953
1954	// For MemorySSA nothing really changes (except that memcpy may imply stricter
1955	// aliasing guarantees).
1956
1957	++NumMoveToCpy;
1958	return true;
1959	}
1960
1961	/// This is called on every byval argument in call sites.
1962	bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
1963	const DataLayout &DL = CB.getDataLayout();
1964	// Find out what feeds this byval argument.
1965	Value *ByValArg = CB.getArgOperand(i: ArgNo);
1966	Type *ByValTy = CB.getParamByValType(ArgNo);
1967	TypeSize ByValSize = DL.getTypeAllocSize(Ty: ByValTy);
1968	MemoryLocation Loc(ByValArg, LocationSize::precise(Value: ByValSize));
1969	MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(I: &CB);
1970	if (!CallAccess)
1971	return false;
1972	MemCpyInst MDep = nullptr*;
1973	BatchAAResults BAA(*AA, EEA);
1974	MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
1975	CallAccess->getDefiningAccess(), Loc, AA&: BAA);
1976	if (auto *MD = dyn_cast<MemoryDef>(Val: Clobber))
1977	MDep = dyn_cast_or_null<MemCpyInst>(Val: MD->getMemoryInst());
1978
1979	// If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
1980	// a memcpy, see if we can byval from the source of the memcpy instead of the
1981	// result.
1982	if (!MDep \|\| MDep->isVolatile() \|\|
1983	ByValArg->stripPointerCasts() != MDep->getDest())
1984	return false;
1985
1986	// The length of the memcpy must be larger or equal to the size of the byval.
1987	auto *C1 = dyn_cast<ConstantInt>(Val: MDep->getLength());
1988	if (!C1 \|\| !TypeSize::isKnownGE(
1989	LHS: TypeSize::getFixed(ExactSize: C1->getValue().getZExtValue()), RHS: ByValSize))
1990	return false;
1991
1992	// Get the alignment of the byval. If the call doesn't specify the alignment,
1993	// then it is some target specific value that we can't know.
1994	MaybeAlign ByValAlign = CB.getParamAlign(ArgNo);
1995	if (!ByValAlign)
1996	return false;
1997
1998	// If it is greater than the memcpy, then we check to see if we can force the
1999	// source of the memcpy to the alignment we need. If we fail, we bail out.
2000	MaybeAlign MemDepAlign = MDep->getSourceAlign();
2001	if ((!MemDepAlign \|\| MemDepAlign < ByValAlign) &&
2002	getOrEnforceKnownAlignment(V: MDep->getSource(), PrefAlign: ByValAlign, DL, CxtI: &CB, AC,
2003	DT) < *ByValAlign)
2004	return false;
2005
2006	// The type of the memcpy source must match the byval argument
2007	if (MDep->getSource()->getType() != ByValArg->getType())
2008	return false;
2009
2010	// Verify that the copied-from memory doesn't change in between the memcpy and
2011	// the byval call.
2012	// memcpy(a <- b)
2013	// b = 42;*
2014	// foo(a)*
2015	// It would be invalid to transform the second memcpy into foo(b).*
2016	if (writtenBetween(MSSA, AA&: BAA, Loc: MemoryLocation::getForSource(MTI: MDep),
2017	Start: MSSA->getMemoryAccess(I: MDep), End: CallAccess))
2018	return false;
2019
2020	LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
2021	<< " " << *MDep << "\n"
2022	<< " " << CB << "\n");
2023
2024	// Otherwise we're good! Update the byval argument.
2025	combineAAMetadata(K: &CB, J: MDep);
2026	CB.setArgOperand(i: ArgNo, v: MDep->getSource());
2027	++NumMemCpyInstr;
2028	return true;
2029	}
2030
2031	/// This is called on memcpy dest pointer arguments attributed as immutable
2032	/// during call. Try to use memcpy source directly if all of the following
2033	/// conditions are satisfied.
2034	/// 1. The memcpy dst is neither modified during the call nor captured by the
2035	/// call.
2036	/// 2. The memcpy dst is an alloca with known alignment & size.
2037	/// 2-1. The memcpy length == the alloca size which ensures that the new
2038	/// pointer is dereferenceable for the required range
2039	/// 2-2. The src pointer has alignment >= the alloca alignment or can be
2040	/// enforced so.
2041	/// 3. The memcpy dst and src is not modified between the memcpy and the call.
2042	/// (if MSSA clobber check is safe.)
2043	/// 4. The memcpy src is not modified during the call. (ModRef check shows no
2044	/// Mod.)
2045	bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) {
2046	BatchAAResults BAA(*AA, EEA);
2047	Value *ImmutArg = CB.getArgOperand(i: ArgNo);
2048
2049	// 1. Ensure passed argument is immutable during call.
2050	if (!CB.doesNotCapture(OpNo: ArgNo))
2051	return false;
2052
2053	// We know that the argument is readonly at this point, but the function
2054	// might still modify the same memory through a different pointer. Exclude
2055	// this either via noalias, or alias analysis.
2056	if (!CB.paramHasAttr(ArgNo, Kind: Attribute::NoAlias) &&
2057	isModSet(
2058	MRI: BAA.getModRefInfo(I: &CB, OptLoc: MemoryLocation::getBeforeOrAfter(Ptr: ImmutArg))))
2059	return false;
2060
2061	const DataLayout &DL = CB.getDataLayout();
2062
2063	// 2. Check that arg is alloca
2064	// TODO: Even if the arg gets back to branches, we can remove memcpy if all
2065	// the alloca alignments can be enforced to source alignment.
2066	auto *AI = dyn_cast<AllocaInst>(Val: ImmutArg->stripPointerCasts());
2067	if (!AI)
2068	return false;
2069
2070	std::optional<TypeSize> AllocaSize = AI->getAllocationSize(DL);
2071	// Can't handle unknown size alloca.
2072	// (e.g. Variable Length Array, Scalable Vector)
2073	if (!AllocaSize \|\| AllocaSize ->isScalable())
2074	return false;
2075	MemoryLocation Loc(ImmutArg, LocationSize::precise(Value: *AllocaSize));
2076	MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(I: &CB);
2077	if (!CallAccess)
2078	return false;
2079
2080	MemCpyInst MDep = nullptr*;
2081	MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
2082	CallAccess->getDefiningAccess(), Loc, AA&: BAA);
2083	if (auto *MD = dyn_cast<MemoryDef>(Val: Clobber))
2084	MDep = dyn_cast_or_null<MemCpyInst>(Val: MD->getMemoryInst());
2085
2086	// If the immut argument isn't fed by a memcpy, ignore it. If it is fed by
2087	// a memcpy, check that the arg equals the memcpy dest.
2088	if (!MDep \|\| MDep->isVolatile() \|\| AI != MDep->getDest())
2089	return false;
2090
2091	// The type of the memcpy source must match the immut argument
2092	if (MDep->getSource()->getType() != ImmutArg->getType())
2093	return false;
2094
2095	// 2-1. The length of the memcpy must be equal to the size of the alloca.
2096	auto *MDepLen = dyn_cast<ConstantInt>(Val: MDep->getLength());
2097	if (!MDepLen \|\| AllocaSize != MDepLen->getValue())
2098	return false;
2099
2100	// 2-2. the memcpy source align must be larger than or equal the alloca's
2101	// align. If not so, we check to see if we can force the source of the memcpy
2102	// to the alignment we need. If we fail, we bail out.
2103	Align MemDepAlign = MDep->getSourceAlign().valueOrOne();
2104	Align AllocaAlign = AI->getAlign();
2105	if (MemDepAlign < AllocaAlign &&
2106	getOrEnforceKnownAlignment(V: MDep->getSource(), PrefAlign: AllocaAlign, DL, CxtI: &CB, AC,
2107	DT) < AllocaAlign)
2108	return false;
2109
2110	// 3. Verify that the source doesn't change in between the memcpy and
2111	// the call.
2112	// memcpy(a <- b)
2113	// b = 42;*
2114	// foo(a)*
2115	// It would be invalid to transform the second memcpy into foo(b).*
2116	if (writtenBetween(MSSA, AA&: BAA, Loc: MemoryLocation::getForSource(MTI: MDep),
2117	Start: MSSA->getMemoryAccess(I: MDep), End: CallAccess))
2118	return false;
2119
2120	// 4. The memcpy src must not be modified during the call.
2121	if (isModSet(MRI: BAA.getModRefInfo(I: &CB, OptLoc: MemoryLocation::getForSource(MTI: MDep))))
2122	return false;
2123
2124	LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to Immut src:\n"
2125	<< " " << *MDep << "\n"
2126	<< " " << CB << "\n");
2127
2128	// Otherwise we're good! Update the immut argument.
2129	combineAAMetadata(K: &CB, J: MDep);
2130	CB.setArgOperand(i: ArgNo, v: MDep->getSource());
2131	++NumMemCpyInstr;
2132	return true;
2133	}
2134
2135	/// Executes one iteration of MemCpyOptPass.
2136	bool MemCpyOptPass::iterateOnFunction(Function &F) {
2137	bool MadeChange = false;
2138
2139	// Walk all instruction in the function.
2140	for (BasicBlock &BB : F) {
2141	// Skip unreachable blocks. For example processStore assumes that an
2142	// instruction in a BB can't be dominated by a later instruction in the
2143	// same BB (which is a scenario that can happen for an unreachable BB that
2144	// has itself as a predecessor).
2145	if (!DT->isReachableFromEntry(A: &BB))
2146	continue;
2147
2148	for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
2149	// Avoid invalidating the iterator.
2150	Instruction I = &BI ++;
2151
2152	bool RepeatInstruction = false;
2153
2154	if (auto *SI = dyn_cast<StoreInst>(Val: I))
2155	MadeChange \|= processStore(SI, BBI&: BI);
2156	else if (auto *M = dyn_cast<MemSetInst>(Val: I))
2157	RepeatInstruction = processMemSet(MSI: M, BBI&: BI);
2158	else if (auto *M = dyn_cast<MemCpyInst>(Val: I))
2159	RepeatInstruction = processMemCpy(M, BBI&: BI);
2160	else if (auto *M = dyn_cast<MemMoveInst>(Val: I))
2161	RepeatInstruction = processMemMove(M, BBI&: BI);
2162	else if (auto *CB = dyn_cast<CallBase>(Val: I)) {
2163	for (unsigned i = `0`, e = CB->arg_size(); i != e; ++i) {
2164	if (CB->isByValArgument(ArgNo: i))
2165	MadeChange \|= processByValArgument(CB&: *CB, ArgNo: i);
2166	else if (CB->onlyReadsMemory(OpNo: i))
2167	MadeChange \|= processImmutArgument(CB&: *CB, ArgNo: i);
2168	}
2169	}
2170
2171	// Reprocess the instruction if desired.
2172	if (RepeatInstruction) {
2173	if (BI != BB.begin())
2174	--BI;
2175	MadeChange = true;
2176	}
2177	}
2178	}
2179
2180	return MadeChange;
2181	}
2182
2183	PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
2184	auto &TLI = AM.getResult<TargetLibraryAnalysis>(IR&: F);
2185	auto *AA = &AM.getResult<AAManager>(IR&: F);
2186	auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
2187	auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
2188	auto *PDT = &AM.getResult<PostDominatorTreeAnalysis>(IR&: F);
2189	auto *MSSA = &AM.getResult<MemorySSAAnalysis>(IR&: F);
2190
2191	bool MadeChange = runImpl(F, TLI: &TLI, AA, AC, DT, PDT, MSSA: &MSSA->getMSSA());
2192	if (!MadeChange)
2193	return PreservedAnalyses::all();
2194
2195	PreservedAnalyses PA;
2196	PA.preserveSet<CFGAnalyses>();
2197	PA.preserve<MemorySSAAnalysis>();
2198	return PA;
2199	}
2200
2201	bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
2202	AliasAnalysis AA_, AssumptionCache AC_,
2203	DominatorTree DT_, PostDominatorTree PDT_,
2204	MemorySSA *MSSA_) {
2205	bool MadeChange = false;
2206	TLI = TLI_;
2207	AA = AA_;
2208	AC = AC_;
2209	DT = DT_;
2210	PDT = PDT_;
2211	MSSA = MSSA_;
2212	MemorySSAUpdater MSSAU_(MSSA_);
2213	MSSAU = &MSSAU_;
2214	EarliestEscapeAnalysis EEA_(*DT);
2215	EEA = &EEA_;
2216
2217	while (true) {
2218	if (!iterateOnFunction(F))
2219	break;
2220	MadeChange = true;
2221	}
2222
2223	if (VerifyMemorySSA)
2224	MSSA_->verifyMemorySSA();
2225
2226	return MadeChange;
2227	}
2228

Browse the source code of llvm_projects/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp