1 | //===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // The implementation of the loop memory dependence analysis that was |
10 | // originally developed for the loop vectorizer. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
15 | #include "llvm/ADT/APInt.h" |
16 | #include "llvm/ADT/DenseMap.h" |
17 | #include "llvm/ADT/EquivalenceClasses.h" |
18 | #include "llvm/ADT/PointerIntPair.h" |
19 | #include "llvm/ADT/STLExtras.h" |
20 | #include "llvm/ADT/SetVector.h" |
21 | #include "llvm/ADT/SmallPtrSet.h" |
22 | #include "llvm/ADT/SmallSet.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/Analysis/AliasAnalysis.h" |
25 | #include "llvm/Analysis/AliasSetTracker.h" |
26 | #include "llvm/Analysis/LoopAnalysisManager.h" |
27 | #include "llvm/Analysis/LoopInfo.h" |
28 | #include "llvm/Analysis/LoopIterator.h" |
29 | #include "llvm/Analysis/MemoryLocation.h" |
30 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
31 | #include "llvm/Analysis/ScalarEvolution.h" |
32 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
33 | #include "llvm/Analysis/ScalarEvolutionPatternMatch.h" |
34 | #include "llvm/Analysis/TargetLibraryInfo.h" |
35 | #include "llvm/Analysis/TargetTransformInfo.h" |
36 | #include "llvm/Analysis/ValueTracking.h" |
37 | #include "llvm/Analysis/VectorUtils.h" |
38 | #include "llvm/IR/BasicBlock.h" |
39 | #include "llvm/IR/Constants.h" |
40 | #include "llvm/IR/DataLayout.h" |
41 | #include "llvm/IR/DebugLoc.h" |
42 | #include "llvm/IR/DerivedTypes.h" |
43 | #include "llvm/IR/DiagnosticInfo.h" |
44 | #include "llvm/IR/Dominators.h" |
45 | #include "llvm/IR/Function.h" |
46 | #include "llvm/IR/InstrTypes.h" |
47 | #include "llvm/IR/Instruction.h" |
48 | #include "llvm/IR/Instructions.h" |
49 | #include "llvm/IR/IntrinsicInst.h" |
50 | #include "llvm/IR/PassManager.h" |
51 | #include "llvm/IR/Type.h" |
52 | #include "llvm/IR/Value.h" |
53 | #include "llvm/IR/ValueHandle.h" |
54 | #include "llvm/Support/Casting.h" |
55 | #include "llvm/Support/CommandLine.h" |
56 | #include "llvm/Support/Debug.h" |
57 | #include "llvm/Support/ErrorHandling.h" |
58 | #include "llvm/Support/raw_ostream.h" |
59 | #include <algorithm> |
60 | #include <cassert> |
61 | #include <cstdint> |
62 | #include <iterator> |
63 | #include <utility> |
64 | #include <variant> |
65 | #include <vector> |
66 | |
67 | using namespace llvm; |
68 | using namespace llvm::SCEVPatternMatch; |
69 | |
70 | #define DEBUG_TYPE "loop-accesses" |
71 | |
72 | static cl::opt<unsigned, true> |
73 | VectorizationFactor("force-vector-width" , cl::Hidden, |
74 | cl::desc("Sets the SIMD width. Zero is autoselect." ), |
75 | cl::location(L&: VectorizerParams::VectorizationFactor)); |
76 | unsigned VectorizerParams::VectorizationFactor; |
77 | |
78 | static cl::opt<unsigned, true> |
79 | VectorizationInterleave("force-vector-interleave" , cl::Hidden, |
80 | cl::desc("Sets the vectorization interleave count. " |
81 | "Zero is autoselect." ), |
82 | cl::location( |
83 | L&: VectorizerParams::VectorizationInterleave)); |
84 | unsigned VectorizerParams::VectorizationInterleave; |
85 | |
86 | static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold( |
87 | "runtime-memory-check-threshold" , cl::Hidden, |
88 | cl::desc("When performing memory disambiguation checks at runtime do not " |
89 | "generate more than this number of comparisons (default = 8)." ), |
90 | cl::location(L&: VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(Val: 8)); |
91 | unsigned VectorizerParams::RuntimeMemoryCheckThreshold; |
92 | |
93 | /// The maximum number of comparisons allowed when merging runtime memory checks. |
94 | static cl::opt<unsigned> MemoryCheckMergeThreshold( |
95 | "memory-check-merge-threshold" , cl::Hidden, |
96 | cl::desc("Maximum number of comparisons done when trying to merge " |
97 | "runtime memory checks. (default = 100)" ), |
98 | cl::init(Val: 100)); |
99 | |
100 | /// Maximum SIMD width. |
101 | const unsigned VectorizerParams::MaxVectorWidth = 64; |
102 | |
103 | /// We collect dependences up to this threshold. |
104 | static cl::opt<unsigned> |
105 | MaxDependences("max-dependences" , cl::Hidden, |
106 | cl::desc("Maximum number of dependences collected by " |
107 | "loop-access analysis (default = 100)" ), |
108 | cl::init(Val: 100)); |
109 | |
110 | /// This enables versioning on the strides of symbolically striding memory |
111 | /// accesses in code like the following. |
112 | /// for (i = 0; i < N; ++i) |
113 | /// A[i * Stride1] += B[i * Stride2] ... |
114 | /// |
115 | /// Will be roughly translated to |
116 | /// if (Stride1 == 1 && Stride2 == 1) { |
117 | /// for (i = 0; i < N; i+=4) |
118 | /// A[i:i+3] += ... |
119 | /// } else |
120 | /// ... |
121 | static cl::opt<bool> EnableMemAccessVersioning( |
122 | "enable-mem-access-versioning" , cl::init(Val: true), cl::Hidden, |
123 | cl::desc("Enable symbolic stride memory access versioning" )); |
124 | |
125 | /// Enable store-to-load forwarding conflict detection. This option can |
126 | /// be disabled for correctness testing. |
127 | static cl::opt<bool> EnableForwardingConflictDetection( |
128 | "store-to-load-forwarding-conflict-detection" , cl::Hidden, |
129 | cl::desc("Enable conflict detection in loop-access analysis" ), |
130 | cl::init(Val: true)); |
131 | |
132 | static cl::opt<unsigned> MaxForkedSCEVDepth( |
133 | "max-forked-scev-depth" , cl::Hidden, |
134 | cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)" ), |
135 | cl::init(Val: 5)); |
136 | |
137 | static cl::opt<bool> SpeculateUnitStride( |
138 | "laa-speculate-unit-stride" , cl::Hidden, |
139 | cl::desc("Speculate that non-constant strides are unit in LAA" ), |
140 | cl::init(Val: true)); |
141 | |
142 | static cl::opt<bool, true> HoistRuntimeChecks( |
143 | "hoist-runtime-checks" , cl::Hidden, |
144 | cl::desc( |
145 | "Hoist inner loop runtime memory checks to outer loop if possible" ), |
146 | cl::location(L&: VectorizerParams::HoistRuntimeChecks), cl::init(Val: true)); |
147 | bool VectorizerParams::HoistRuntimeChecks; |
148 | |
149 | bool VectorizerParams::isInterleaveForced() { |
150 | return ::VectorizationInterleave.getNumOccurrences() > 0; |
151 | } |
152 | |
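    | // For example, for a load of A[i * Stride] whose symbolic, loop-invariant |
    | // Stride is recorded in PtrToStride, the pointer SCEV {%A,+,(4 * %Stride)} is |
    | // rewritten under the added predicate "%Stride == 1" to {%A,+,4}, which the |
    | // rest of the analysis can reason about. (Illustrative only; the element size |
    | // of 4 is just an example.) |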
153 | const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, |
154 | const DenseMap<Value *, const SCEV *> &PtrToStride, |
155 | Value *Ptr) { |
156 | const SCEV *OrigSCEV = PSE.getSCEV(V: Ptr); |
157 | |
158 | // If there is an entry in the map return the SCEV of the pointer with the |
159 | // symbolic stride replaced by one. |
160 | const SCEV *StrideSCEV = PtrToStride.lookup(Val: Ptr); |
161 | if (!StrideSCEV) |
162 | // For a non-symbolic stride, just return the original expression. |
163 | return OrigSCEV; |
164 | |
165 | // Note: This assert is both overly strong and overly weak. The actual |
166 | // invariant here is that StrideSCEV should be loop invariant. The only |
167 | // such invariant strides we happen to speculate right now are unknowns |
168 | // and thus this is a reasonable proxy of the actual invariant. |
169 | assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map" ); |
170 | |
171 | ScalarEvolution *SE = PSE.getSE(); |
172 | const SCEV *CT = SE->getOne(Ty: StrideSCEV->getType()); |
173 | PSE.addPredicate(Pred: *SE->getEqualPredicate(LHS: StrideSCEV, RHS: CT)); |
174 | const SCEV *Expr = PSE.getSCEV(V: Ptr); |
175 | |
176 | LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV |
177 | << " by: " << *Expr << "\n" ); |
178 | return Expr; |
179 | } |
180 | |
181 | RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup( |
182 | unsigned Index, const RuntimePointerChecking &RtCheck) |
183 | : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start), |
184 | AddressSpace(RtCheck.Pointers[Index] |
185 | .PointerValue->getType() |
186 | ->getPointerAddressSpace()), |
187 | NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) { |
188 | Members.push_back(Elt: Index); |
189 | } |
190 | |
191 | /// Returns \p A + \p B if it is guaranteed not to unsigned wrap. Otherwise |
192 | /// returns nullptr. \p A and \p B must have the same type. |
193 | static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B, |
194 | ScalarEvolution &SE) { |
195 | if (!SE.willNotOverflow(BinOp: Instruction::Add, /*IsSigned=*/Signed: false, LHS: A, RHS: B)) |
196 | return nullptr; |
197 | return SE.getAddExpr(LHS: A, RHS: B); |
198 | } |
199 | |
200 | /// Returns \p A * \p B if it is guaranteed not to unsigned wrap. Otherwise |
201 | /// returns nullptr. \p A and \p B must have the same type. |
202 | static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B, |
203 | ScalarEvolution &SE) { |
204 | if (!SE.willNotOverflow(BinOp: Instruction::Mul, /*IsSigned=*/Signed: false, LHS: A, RHS: B)) |
205 | return nullptr; |
206 | return SE.getMulExpr(LHS: A, RHS: B); |
207 | } |
208 | |
209 | /// Return true if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at |
210 | /// \p MaxBTC is guaranteed to be in bounds of the accessed object. |
211 | static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, |
212 | const SCEV *MaxBTC, |
213 | const SCEV *EltSize, |
214 | ScalarEvolution &SE, |
215 | const DataLayout &DL) { |
216 | auto *PointerBase = SE.getPointerBase(V: AR->getStart()); |
217 | auto *StartPtr = dyn_cast<SCEVUnknown>(Val: PointerBase); |
218 | if (!StartPtr) |
219 | return false; |
220 | bool CheckForNonNull, CheckForFreed; |
221 | uint64_t DerefBytes = StartPtr->getValue()->getPointerDereferenceableBytes( |
222 | DL, CanBeNull&: CheckForNonNull, CanBeFreed&: CheckForFreed); |
223 | |
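    | // Only continue if the object is known dereferenceable unconditionally (not |
    | // just when non-null) and cannot be freed while the loop executes. |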
224 | if (CheckForNonNull || CheckForFreed) |
225 | return false; |
226 | |
227 | const SCEV *Step = AR->getStepRecurrence(SE); |
228 | bool IsKnownNonNegative = SE.isKnownNonNegative(S: Step); |
229 | if (!IsKnownNonNegative && !SE.isKnownNegative(S: Step)) |
230 | return false; |
231 | |
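    | // Widen Step and MaxBTC to a common type so they can be combined in the |
    | // computations below. |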
232 | Type *WiderTy = SE.getWiderType(Ty1: MaxBTC->getType(), Ty2: Step->getType()); |
233 | Step = SE.getNoopOrSignExtend(V: Step, Ty: WiderTy); |
234 | MaxBTC = SE.getNoopOrZeroExtend(V: MaxBTC, Ty: WiderTy); |
235 | |
236 | // For the computations below, make sure they don't unsigned wrap. |
237 | if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_UGE, LHS: AR->getStart(), RHS: StartPtr)) |
238 | return false; |
239 | const SCEV *StartOffset = SE.getNoopOrZeroExtend( |
240 | V: SE.getMinusSCEV(LHS: AR->getStart(), RHS: StartPtr), Ty: WiderTy); |
241 | |
242 | const SCEV *OffsetAtLastIter = |
243 | mulSCEVOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE); |
244 | if (!OffsetAtLastIter) |
245 | return false; |
246 | |
247 | const SCEV *OffsetEndBytes = addSCEVNoOverflow( |
248 | A: OffsetAtLastIter, B: SE.getNoopOrZeroExtend(V: EltSize, Ty: WiderTy), SE); |
249 | if (!OffsetEndBytes) |
250 | return false; |
251 | |
252 | if (IsKnownNonNegative) { |
253 | // For positive steps, check if |
254 | // (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes, |
255 | // while making sure none of the computations unsigned wrap themselves. |
256 | const SCEV *EndBytes = addSCEVNoOverflow(A: StartOffset, B: OffsetEndBytes, SE); |
257 | if (!EndBytes) |
258 | return false; |
259 | return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: EndBytes, |
260 | RHS: SE.getConstant(Ty: WiderTy, V: DerefBytes)); |
261 | } |
262 | |
263 | // For negative steps check if |
264 | // * StartOffset >= (MaxBTC * Step + EltSize) |
265 | // * StartOffset <= DerefBytes. |
266 | assert(SE.isKnownNegative(Step) && "must be known negative" ); |
267 | return SE.isKnownPredicate(Pred: CmpInst::ICMP_SGE, LHS: StartOffset, RHS: OffsetEndBytes) && |
268 | SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: StartOffset, |
269 | RHS: SE.getConstant(Ty: WiderTy, V: DerefBytes)); |
270 | } |
271 | |
272 | std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess( |
273 | const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC, |
274 | const SCEV *MaxBTC, ScalarEvolution *SE, |
275 | DenseMap<std::pair<const SCEV *, Type *>, |
276 | std::pair<const SCEV *, const SCEV *>> *PointerBounds) { |
277 | std::pair<const SCEV *, const SCEV *> *PtrBoundsPair; |
278 | if (PointerBounds) { |
279 | auto [Iter, Ins] = PointerBounds->insert( |
280 | KV: {{PtrExpr, AccessTy}, |
281 | {SE->getCouldNotCompute(), SE->getCouldNotCompute()}}); |
282 | if (!Ins) |
283 | return Iter->second; |
284 | PtrBoundsPair = &Iter->second; |
285 | } |
286 | |
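    | // ScStart and ScEnd will hold the lowest and highest address accessed |
    | // through PtrExpr across the loop's iterations; ScEnd is made exclusive by |
    | // adding EltSize at the end. |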
287 | const SCEV *ScStart; |
288 | const SCEV *ScEnd; |
289 | |
290 | auto &DL = Lp->getHeader()->getDataLayout(); |
291 | Type *IdxTy = DL.getIndexType(PtrTy: PtrExpr->getType()); |
292 | const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IntTy: IdxTy, StoreTy: AccessTy); |
293 | if (SE->isLoopInvariant(S: PtrExpr, L: Lp)) { |
294 | ScStart = ScEnd = PtrExpr; |
295 | } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrExpr)) { |
296 | ScStart = AR->getStart(); |
297 | if (!isa<SCEVCouldNotCompute>(Val: BTC)) |
298 | // Evaluating AR at an exact BTC is safe: LAA separately checks that |
299 | // accesses cannot wrap in the loop. If evaluating AR at BTC wraps, then |
300 | // the loop either triggers UB when executing a memory access with a |
301 | // poison pointer or the wrapping/poisoned pointer is not used. |
302 | ScEnd = AR->evaluateAtIteration(It: BTC, SE&: *SE); |
303 | else { |
304 | // Evaluating AR at MaxBTC may wrap and create an expression that is less |
305 | // than the start of the AddRec due to wrapping (for example consider |
306 | // MaxBTC = -2). If that's the case, set ScEnd to -(EltSize + 1). ScEnd |
307 | // will get incremented by EltSize before returning, so this effectively |
308 | // sets ScEnd to the maximum unsigned value for the type. Note that LAA |
309 | // separately checks that accesses cannot wrap, so unsigned max |
310 | // represents an upper bound. |
311 | if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSize: EltSizeSCEV, SE&: *SE, |
312 | DL)) { |
313 | ScEnd = AR->evaluateAtIteration(It: MaxBTC, SE&: *SE); |
314 | } else { |
315 | ScEnd = SE->getAddExpr( |
316 | LHS: SE->getNegativeSCEV(V: EltSizeSCEV), |
317 | RHS: SE->getSCEV(V: ConstantExpr::getIntToPtr( |
318 | C: ConstantInt::get(Ty: EltSizeSCEV->getType(), V: -1), Ty: AR->getType()))); |
319 | } |
320 | } |
321 | const SCEV *Step = AR->getStepRecurrence(SE&: *SE); |
322 | |
323 | // For expressions with negative step, the upper bound is ScStart and the |
324 | // lower bound is ScEnd. |
325 | if (const auto *CStep = dyn_cast<SCEVConstant>(Val: Step)) { |
326 | if (CStep->getValue()->isNegative()) |
327 | std::swap(a&: ScStart, b&: ScEnd); |
328 | } else { |
329 | // Fallback case: the step is not constant, but we can still |
330 | // get the upper and lower bounds of the interval by using min/max |
331 | // expressions. |
332 | ScStart = SE->getUMinExpr(LHS: ScStart, RHS: ScEnd); |
333 | ScEnd = SE->getUMaxExpr(LHS: AR->getStart(), RHS: ScEnd); |
334 | } |
335 | } else |
336 | return {SE->getCouldNotCompute(), SE->getCouldNotCompute()}; |
337 | |
338 | assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant" ); |
339 | assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant" ); |
340 | |
341 | // Add the size of the pointed element to ScEnd. |
342 | ScEnd = SE->getAddExpr(LHS: ScEnd, RHS: EltSizeSCEV); |
343 | |
344 | std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd}; |
345 | if (PointerBounds) |
346 | *PtrBoundsPair = Res; |
347 | return Res; |
348 | } |
349 | |
350 | /// Calculate Start and End points of memory access using |
351 | /// getStartAndEndForAccess. |
352 | void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, |
353 | Type *AccessTy, bool WritePtr, |
354 | unsigned DepSetId, unsigned ASId, |
355 | PredicatedScalarEvolution &PSE, |
356 | bool NeedsFreeze) { |
357 | const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); |
358 | const SCEV *BTC = PSE.getBackedgeTakenCount(); |
359 | const auto &[ScStart, ScEnd] = |
360 | getStartAndEndForAccess(Lp, PtrExpr, AccessTy, BTC, MaxBTC: SymbolicMaxBTC, |
361 | SE: PSE.getSE(), PointerBounds: &DC.getPointerBounds()); |
362 | assert(!isa<SCEVCouldNotCompute>(ScStart) && |
363 | !isa<SCEVCouldNotCompute>(ScEnd) && |
364 | "must be able to compute both start and end expressions" ); |
365 | Pointers.emplace_back(Args&: Ptr, Args: ScStart, Args: ScEnd, Args&: WritePtr, Args&: DepSetId, Args&: ASId, Args&: PtrExpr, |
366 | Args&: NeedsFreeze); |
367 | } |
368 | |
369 | bool RuntimePointerChecking::tryToCreateDiffCheck( |
370 | const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) { |
371 | // If either group contains multiple different pointers, bail out. |
372 | // TODO: Support multiple pointers by using the minimum or maximum pointer, |
373 | // depending on src & sink. |
374 | if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) |
375 | return false; |
376 | |
377 | const PointerInfo *Src = &Pointers[CGI.Members[0]]; |
378 | const PointerInfo *Sink = &Pointers[CGJ.Members[0]]; |
379 | |
380 | // If either pointer is read and written, multiple checks may be needed. Bail |
381 | // out. |
382 | if (!DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: !Src->IsWritePtr).empty() || |
383 | !DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: !Sink->IsWritePtr).empty()) |
384 | return false; |
385 | |
386 | ArrayRef<unsigned> AccSrc = |
387 | DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: Src->IsWritePtr); |
388 | ArrayRef<unsigned> AccSink = |
389 | DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: Sink->IsWritePtr); |
390 | // If either pointer is accessed multiple times, there may not be a clear |
391 | // src/sink relation. Bail out for now. |
392 | if (AccSrc.size() != 1 || AccSink.size() != 1) |
393 | return false; |
394 | |
395 | // If the sink is accessed before src, swap src/sink. |
396 | if (AccSink[0] < AccSrc[0]) |
397 | std::swap(a&: Src, b&: Sink); |
398 | |
399 | const SCEVConstant *Step; |
400 | const SCEV *SrcStart; |
401 | const SCEV *SinkStart; |
402 | const Loop *InnerLoop = DC.getInnermostLoop(); |
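    | // Both pointers must be affine AddRecs over the innermost loop with the same |
    | // constant step; otherwise a single start-distance check is not sufficient. |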
403 | if (!match(S: Src->Expr, |
404 | P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SrcStart), Op1: m_SCEVConstant(V&: Step), |
405 | L: m_SpecificLoop(L: InnerLoop))) || |
406 | !match(S: Sink->Expr, |
407 | P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SinkStart), Op1: m_scev_Specific(S: Step), |
408 | L: m_SpecificLoop(L: InnerLoop)))) |
409 | return false; |
410 | |
411 | SmallVector<Instruction *, 4> SrcInsts = |
412 | DC.getInstructionsForAccess(Ptr: Src->PointerValue, isWrite: Src->IsWritePtr); |
413 | SmallVector<Instruction *, 4> SinkInsts = |
414 | DC.getInstructionsForAccess(Ptr: Sink->PointerValue, isWrite: Sink->IsWritePtr); |
415 | Type *SrcTy = getLoadStoreType(I: SrcInsts[0]); |
416 | Type *DstTy = getLoadStoreType(I: SinkInsts[0]); |
417 | if (isa<ScalableVectorType>(Val: SrcTy) || isa<ScalableVectorType>(Val: DstTy)) |
418 | return false; |
419 | |
420 | const DataLayout &DL = InnerLoop->getHeader()->getDataLayout(); |
421 | unsigned AllocSize = |
422 | std::max(a: DL.getTypeAllocSize(Ty: SrcTy), b: DL.getTypeAllocSize(Ty: DstTy)); |
423 | |
424 | // Only constant steps matching the AllocSize are supported at the |
425 | // moment. This simplifies the difference computation. Can be extended in the |
426 | // future. |
427 | if (Step->getAPInt().abs() != AllocSize) |
428 | return false; |
429 | |
430 | IntegerType *IntTy = |
431 | IntegerType::get(C&: Src->PointerValue->getContext(), |
432 | NumBits: DL.getPointerSizeInBits(AS: CGI.AddressSpace)); |
433 | |
434 | // When counting down, the dependence distance needs to be swapped. |
435 | if (Step->getValue()->isNegative()) |
436 | std::swap(a&: SinkStart, b&: SrcStart); |
437 | |
438 | const SCEV *SinkStartInt = SE->getPtrToIntExpr(Op: SinkStart, Ty: IntTy); |
439 | const SCEV *SrcStartInt = SE->getPtrToIntExpr(Op: SrcStart, Ty: IntTy); |
440 | if (isa<SCEVCouldNotCompute>(Val: SinkStartInt) || |
441 | isa<SCEVCouldNotCompute>(Val: SrcStartInt)) |
442 | return false; |
443 | |
444 | // If the start values for both Src and Sink also vary according to an outer |
445 | // loop, then it's probably better to avoid creating diff checks because |
446 | // they may not be hoisted. We should instead let llvm::addRuntimeChecks |
447 | // do the expanded full range overlap checks, which can be hoisted. |
448 | if (HoistRuntimeChecks && InnerLoop->getParentLoop() && |
449 | isa<SCEVAddRecExpr>(Val: SinkStartInt) && isa<SCEVAddRecExpr>(Val: SrcStartInt)) { |
450 | auto *SrcStartAR = cast<SCEVAddRecExpr>(Val: SrcStartInt); |
451 | auto *SinkStartAR = cast<SCEVAddRecExpr>(Val: SinkStartInt); |
452 | const Loop *StartARLoop = SrcStartAR->getLoop(); |
453 | if (StartARLoop == SinkStartAR->getLoop() && |
454 | StartARLoop == InnerLoop->getParentLoop() && |
455 | // If the diff check would already be loop invariant (due to the |
456 | // recurrences being the same), then we prefer to keep the diff checks |
457 | // because they are cheaper. |
458 | SrcStartAR->getStepRecurrence(SE&: *SE) != |
459 | SinkStartAR->getStepRecurrence(SE&: *SE)) { |
460 | LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these " |
461 | "cannot be hoisted out of the outer loop\n" ); |
462 | return false; |
463 | } |
464 | } |
465 | |
466 | LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n" |
467 | << "SrcStart: " << *SrcStartInt << '\n' |
468 | << "SinkStartInt: " << *SinkStartInt << '\n'); |
469 | DiffChecks.emplace_back(Args&: SrcStartInt, Args&: SinkStartInt, Args&: AllocSize, |
470 | Args: Src->NeedsFreeze || Sink->NeedsFreeze); |
471 | return true; |
472 | } |
473 | |
474 | SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() { |
475 | SmallVector<RuntimePointerCheck, 4> Checks; |
476 | |
477 | for (unsigned I = 0; I < CheckingGroups.size(); ++I) { |
478 | for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) { |
479 | const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I]; |
480 | const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J]; |
481 | |
482 | if (needsChecking(M: CGI, N: CGJ)) { |
483 | CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ); |
484 | Checks.emplace_back(Args: &CGI, Args: &CGJ); |
485 | } |
486 | } |
487 | } |
488 | return Checks; |
489 | } |
490 | |
491 | void RuntimePointerChecking::generateChecks( |
492 | MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) { |
493 | assert(Checks.empty() && "Checks is not empty" ); |
494 | groupChecks(DepCands, UseDependencies); |
495 | Checks = generateChecks(); |
496 | } |
497 | |
498 | bool RuntimePointerChecking::needsChecking( |
499 | const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const { |
500 | for (const auto &I : M.Members) |
501 | for (const auto &J : N.Members) |
502 | if (needsChecking(I, J)) |
503 | return true; |
504 | return false; |
505 | } |
506 | |
507 | /// Compare \p I and \p J and return the minimum. |
508 | /// Return nullptr in case we couldn't find an answer. |
509 | static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J, |
510 | ScalarEvolution *SE) { |
511 | std::optional<APInt> Diff = SE->computeConstantDifference(LHS: J, RHS: I); |
512 | if (!Diff) |
513 | return nullptr; |
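    | // computeConstantDifference returns J - I; a negative difference means J is |
    | // the smaller expression and therefore the minimum. |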
514 | return Diff->isNegative() ? J : I; |
515 | } |
516 | |
517 | bool RuntimeCheckingPtrGroup::addPointer( |
518 | unsigned Index, const RuntimePointerChecking &RtCheck) { |
519 | return addPointer( |
520 | Index, Start: RtCheck.Pointers[Index].Start, End: RtCheck.Pointers[Index].End, |
521 | AS: RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(), |
522 | NeedsFreeze: RtCheck.Pointers[Index].NeedsFreeze, SE&: *RtCheck.SE); |
523 | } |
524 | |
525 | bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start, |
526 | const SCEV *End, unsigned AS, |
527 | bool NeedsFreeze, |
528 | ScalarEvolution &SE) { |
529 | assert(AddressSpace == AS && |
530 | "all pointers in a checking group must be in the same address space" ); |
531 | |
532 | // Compare the starts and ends with the known minimum and maximum |
533 | // of this set. We need to know how we compare against the min/max |
534 | // of the set in order to be able to emit memchecks. |
535 | const SCEV *Min0 = getMinFromExprs(I: Start, J: Low, SE: &SE); |
536 | if (!Min0) |
537 | return false; |
538 | |
539 | const SCEV *Min1 = getMinFromExprs(I: End, J: High, SE: &SE); |
540 | if (!Min1) |
541 | return false; |
542 | |
543 | // Update the low bound expression if we've found a new min value. |
544 | if (Min0 == Start) |
545 | Low = Start; |
546 | |
547 | // Update the high bound expression if we've found a new max value. |
548 | if (Min1 != End) |
549 | High = End; |
550 | |
551 | Members.push_back(Elt: Index); |
552 | this->NeedsFreeze |= NeedsFreeze; |
553 | return true; |
554 | } |
555 | |
556 | void RuntimePointerChecking::groupChecks( |
557 | MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) { |
558 | // We build the groups from dependency candidates equivalence classes |
559 | // because: |
560 | // - We know that pointers in the same equivalence class share |
561 | // the same underlying object and therefore there is a chance |
562 | // that we can compare pointers |
563 | // - We wouldn't be able to merge two pointers for which we need |
564 | // to emit a memcheck. The classes in DepCands are already |
565 | // conveniently built such that no two pointers in the same |
566 | // class need checking against each other. |
567 | |
568 | // We use the following (greedy) algorithm to construct the groups |
569 | // For every pointer in the equivalence class: |
570 | // For each existing group: |
571 | // - if the difference between this pointer and the min/max bounds |
572 | // of the group is a constant, then make the pointer part of the |
573 | // group and update the min/max bounds of that group as required. |
574 | |
575 | CheckingGroups.clear(); |
576 | |
577 | // If we need to check two pointers to the same underlying object |
578 | // with a non-constant difference, we shouldn't perform any pointer |
579 | // grouping with those pointers. This is because we can easily get |
580 | // into cases where the resulting check would return false, even when |
581 | // the accesses are safe. |
582 | // |
583 | // The following example shows this: |
584 | // for (i = 0; i < 1000; ++i) |
585 | // a[5000 + i * m] = a[i] + a[i + 9000] |
586 | // |
587 | // Here grouping gives a check of (5000, 5000 + 1000 * m) against |
588 | // (0, 10000) which is always false. However, if m is 1, there is no |
589 | // dependence. Not grouping the checks for a[i] and a[i + 9000] allows |
590 | // us to perform an accurate check in this case. |
591 | // |
592 | // The above case requires that we have an UnknownDependence between |
593 | // accesses to the same underlying object. This cannot happen unless |
594 | // FoundNonConstantDistanceDependence is set, and therefore UseDependencies |
595 | // is also false. In this case we will use the fallback path and create |
596 | // separate checking groups for all pointers. |
597 | |
598 | // If we don't have the dependency partitions, construct a new |
599 | // checking pointer group for each pointer. This is also required |
600 | // for correctness, because in this case we can have checking between |
601 | // pointers to the same underlying object. |
602 | if (!UseDependencies) { |
603 | for (unsigned I = 0; I < Pointers.size(); ++I) |
604 | CheckingGroups.emplace_back(Args&: I, Args&: *this); |
605 | return; |
606 | } |
607 | |
608 | unsigned TotalComparisons = 0; |
609 | |
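    | // Index the runtime-check pointers by their IR value. The same value can |
    | // occur more than once, e.g. for forked pointers, hence the vector of |
    | // positions. |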
610 | DenseMap<Value *, SmallVector<unsigned>> PositionMap; |
611 | for (unsigned Index = 0; Index < Pointers.size(); ++Index) |
612 | PositionMap[Pointers[Index].PointerValue].push_back(Elt: Index); |
613 | |
614 | // We need to keep track of what pointers we've already seen so we |
615 | // don't process them twice. |
616 | SmallSet<unsigned, 2> Seen; |
617 | |
618 | // Go through all equivalence classes, get the "pointer check groups" |
619 | // and add them to the overall solution. We use the order in which accesses |
620 | // appear in 'Pointers' to enforce determinism. |
621 | for (unsigned I = 0; I < Pointers.size(); ++I) { |
622 | // We've seen this pointer before, and therefore already processed |
623 | // its equivalence class. |
624 | if (Seen.contains(V: I)) |
625 | continue; |
626 | |
627 | MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue, |
628 | Pointers[I].IsWritePtr); |
629 | |
630 | SmallVector<RuntimeCheckingPtrGroup, 2> Groups; |
631 | |
632 | // Because DepCands is constructed by visiting accesses in the order in |
633 | // which they appear in alias sets (which is deterministic) and the |
634 | // iteration order within an equivalence class member is only dependent on |
635 | // the order in which unions and insertions are performed on the |
636 | // equivalence class, the iteration order is deterministic. |
637 | for (auto M : DepCands.members(V: Access)) { |
638 | auto PointerI = PositionMap.find(Val: M.getPointer()); |
639 | // If we can't find the pointer in PositionMap that means we can't |
640 | // generate a memcheck for it. |
641 | if (PointerI == PositionMap.end()) |
642 | continue; |
643 | for (unsigned Pointer : PointerI->second) { |
644 | bool Merged = false; |
645 | // Mark this pointer as seen. |
646 | Seen.insert(V: Pointer); |
647 | |
648 | // Go through all the existing sets and see if we can find one |
649 | // which can include this pointer. |
650 | for (RuntimeCheckingPtrGroup &Group : Groups) { |
651 | // Don't perform more than a certain amount of comparisons. |
652 | // This should limit the cost of grouping the pointers to something |
653 | // reasonable. If we do end up hitting this threshold, the algorithm |
654 | // will create separate groups for all remaining pointers. |
655 | if (TotalComparisons > MemoryCheckMergeThreshold) |
656 | break; |
657 | |
658 | TotalComparisons++; |
659 | |
660 | if (Group.addPointer(Index: Pointer, RtCheck: *this)) { |
661 | Merged = true; |
662 | break; |
663 | } |
664 | } |
665 | |
666 | if (!Merged) |
667 | // We couldn't add this pointer to any existing set or the threshold |
668 | // for the number of comparisons has been reached. Create a new group |
669 | // to hold the current pointer. |
670 | Groups.emplace_back(Args&: Pointer, Args&: *this); |
671 | } |
672 | } |
673 | |
674 | // We've computed the grouped checks for this partition. |
675 | // Save the results and continue with the next one. |
676 | llvm::append_range(C&: CheckingGroups, R&: Groups); |
677 | } |
678 | } |
679 | |
680 | bool RuntimePointerChecking::arePointersInSamePartition( |
681 | const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1, |
682 | unsigned PtrIdx2) { |
683 | return (PtrToPartition[PtrIdx1] != -1 && |
684 | PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]); |
685 | } |
686 | |
687 | bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const { |
688 | const PointerInfo &PointerI = Pointers[I]; |
689 | const PointerInfo &PointerJ = Pointers[J]; |
690 | |
691 | // No need to check if two readonly pointers intersect. |
692 | if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr) |
693 | return false; |
694 | |
695 | // Only need to check pointers between two different dependency sets. |
696 | if (PointerI.DependencySetId == PointerJ.DependencySetId) |
697 | return false; |
698 | |
699 | // Only need to check pointers in the same alias set. |
700 | return PointerI.AliasSetId == PointerJ.AliasSetId; |
701 | } |
702 | |
703 | /// Assign each RuntimeCheckingPtrGroup pointer an index for stable UTC output. |
704 | static DenseMap<const RuntimeCheckingPtrGroup *, unsigned> |
705 | getPtrToIdxMap(ArrayRef<RuntimeCheckingPtrGroup> CheckingGroups) { |
706 | DenseMap<const RuntimeCheckingPtrGroup *, unsigned> PtrIndices; |
707 | for (const auto &[Idx, CG] : enumerate(First&: CheckingGroups)) |
708 | PtrIndices[&CG] = Idx; |
709 | return PtrIndices; |
710 | } |
711 | |
712 | void RuntimePointerChecking::printChecks( |
713 | raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks, |
714 | unsigned Depth) const { |
715 | unsigned N = 0; |
716 | auto PtrIndices = getPtrToIdxMap(CheckingGroups); |
717 | for (const auto &[Check1, Check2] : Checks) { |
718 | const auto &First = Check1->Members, &Second = Check2->Members; |
719 | OS.indent(NumSpaces: Depth) << "Check " << N++ << ":\n" ; |
720 | OS.indent(NumSpaces: Depth + 2) << "Comparing group GRP" << PtrIndices.at(Val: Check1) |
721 | << ":\n" ; |
722 | for (unsigned K : First) |
723 | OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n" ; |
724 | OS.indent(NumSpaces: Depth + 2) << "Against group GRP" << PtrIndices.at(Val: Check2) |
725 | << ":\n" ; |
726 | for (unsigned K : Second) |
727 | OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n" ; |
728 | } |
729 | } |
730 | |
731 | void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const { |
732 | |
733 | OS.indent(NumSpaces: Depth) << "Run-time memory checks:\n" ; |
734 | printChecks(OS, Checks, Depth); |
735 | |
736 | OS.indent(NumSpaces: Depth) << "Grouped accesses:\n" ; |
737 | auto PtrIndices = getPtrToIdxMap(CheckingGroups); |
738 | for (const auto &CG : CheckingGroups) { |
739 | OS.indent(NumSpaces: Depth + 2) << "Group GRP" << PtrIndices.at(Val: &CG) << ":\n" ; |
740 | OS.indent(NumSpaces: Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High |
741 | << ")\n" ; |
742 | for (unsigned Member : CG.Members) { |
743 | OS.indent(NumSpaces: Depth + 6) << "Member: " << *Pointers[Member].Expr << "\n" ; |
744 | } |
745 | } |
746 | } |
747 | |
748 | namespace { |
749 | |
750 | /// Analyses memory accesses in a loop. |
751 | /// |
752 | /// Checks whether run time pointer checks are needed and builds sets for data |
753 | /// dependence checking. |
754 | class AccessAnalysis { |
755 | public: |
756 | /// Read or write access location. |
757 | typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; |
758 | typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList; |
759 | |
760 | AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI, |
761 | MemoryDepChecker::DepCandidates &DA, |
762 | PredicatedScalarEvolution &PSE, |
763 | SmallPtrSetImpl<MDNode *> &LoopAliasScopes) |
764 | : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), |
765 | LoopAliasScopes(LoopAliasScopes) { |
766 | // We're analyzing dependences across loop iterations. |
767 | BAA.enableCrossIterationMode(); |
768 | } |
769 | |
770 | /// Register a load and whether it is only read from. |
771 | void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) { |
772 | Value *Ptr = const_cast<Value *>(Loc.Ptr); |
773 | AST.add(Loc: adjustLoc(Loc)); |
774 | Accesses[MemAccessInfo(Ptr, false)].insert(X: AccessTy); |
775 | if (IsReadOnly) |
776 | ReadOnlyPtr.insert(Ptr); |
777 | } |
778 | |
779 | /// Register a store. |
780 | void addStore(const MemoryLocation &Loc, Type *AccessTy) { |
781 | Value *Ptr = const_cast<Value *>(Loc.Ptr); |
782 | AST.add(Loc: adjustLoc(Loc)); |
783 | Accesses[MemAccessInfo(Ptr, true)].insert(X: AccessTy); |
784 | } |
785 | |
786 | /// Check if we can emit a run-time no-alias check for \p Access. |
787 | /// |
788 | /// Returns true if we can emit a run-time no alias check for \p Access. |
789 | /// If we can check this access, this also adds it to a dependence set and |
790 | /// adds a run-time check for it to \p RtCheck. If \p Assume is true, |
791 | /// we will attempt to use additional run-time checks in order to get |
792 | /// the bounds of the pointer. |
793 | bool createCheckForAccess(RuntimePointerChecking &RtCheck, |
794 | MemAccessInfo Access, Type *AccessTy, |
795 | const DenseMap<Value *, const SCEV *> &Strides, |
796 | DenseMap<Value *, unsigned> &DepSetId, |
797 | Loop *TheLoop, unsigned &RunningDepId, |
798 | unsigned ASId, bool Assume); |
799 | |
800 | /// Check whether we can check the pointers at runtime for |
801 | /// non-intersection. |
802 | /// |
803 | /// Returns true if we need no check or if we do and we can generate them |
804 | /// (i.e. the pointers have computable bounds). A return value of false means |
805 | /// we couldn't analyze and generate runtime checks for all pointers in the |
806 | /// loop, but if \p AllowPartial is set then we will have checks for those |
807 | /// pointers we could analyze. |
808 | bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, Loop *TheLoop, |
809 | const DenseMap<Value *, const SCEV *> &Strides, |
810 | Value *&UncomputablePtr, bool AllowPartial); |
811 | |
812 | /// Goes over all memory accesses, checks whether a RT check is needed |
813 | /// and builds sets of dependent accesses. |
814 | void buildDependenceSets() { |
815 | processMemAccesses(); |
816 | } |
817 | |
818 | /// Initial processing of memory accesses determined that we need to |
819 | /// perform dependency checking. |
820 | /// |
821 | /// Note that this can later be cleared if we retry memcheck analysis without |
822 | /// dependency checking (i.e. FoundNonConstantDistanceDependence). |
823 | bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); } |
824 | |
825 | /// We decided that no dependence analysis would be used. Reset the state. |
826 | void resetDepChecks(MemoryDepChecker &DepChecker) { |
827 | CheckDeps.clear(); |
828 | DepChecker.clearDependences(); |
829 | } |
830 | |
831 | const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; } |
832 | |
833 | private: |
834 | typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap; |
835 | |
836 | /// Adjust the MemoryLocation so that it represents accesses to this |
837 | /// location across all iterations, rather than a single one. |
838 | MemoryLocation adjustLoc(MemoryLocation Loc) const { |
839 | // The accessed location varies within the loop, but remains within the |
840 | // underlying object. |
841 | Loc.Size = LocationSize::beforeOrAfterPointer(); |
842 | Loc.AATags.Scope = adjustAliasScopeList(ScopeList: Loc.AATags.Scope); |
843 | Loc.AATags.NoAlias = adjustAliasScopeList(ScopeList: Loc.AATags.NoAlias); |
844 | return Loc; |
845 | } |
846 | |
847 | /// Drop alias scopes that are only valid within a single loop iteration. |
848 | MDNode *adjustAliasScopeList(MDNode *ScopeList) const { |
849 | if (!ScopeList) |
850 | return nullptr; |
851 | |
852 | // For the sake of simplicity, drop the whole scope list if any scope is |
853 | // iteration-local. |
854 | if (any_of(Range: ScopeList->operands(), P: [&](Metadata *Scope) { |
855 | return LoopAliasScopes.contains(Ptr: cast<MDNode>(Val: Scope)); |
856 | })) |
857 | return nullptr; |
858 | |
859 | return ScopeList; |
860 | } |
861 | |
862 | /// Go over all memory accesses and check whether runtime pointer checks |
863 | /// are needed and build sets of dependency check candidates. |
864 | void processMemAccesses(); |
865 | |
866 | /// Map of all accesses. Values are the types used to access memory pointed to |
867 | /// by the pointer. |
868 | PtrAccessMap Accesses; |
869 | |
870 | /// The loop being checked. |
871 | const Loop *TheLoop; |
872 | |
873 | /// List of accesses that need a further dependence check. |
874 | MemAccessInfoList CheckDeps; |
875 | |
876 | /// Set of pointers that are read only. |
877 | SmallPtrSet<Value*, 16> ReadOnlyPtr; |
878 | |
879 | /// Batched alias analysis results. |
880 | BatchAAResults BAA; |
881 | |
882 | /// An alias set tracker to partition the access set by underlying object and |
883 | /// intrinsic property (such as TBAA metadata). |
884 | AliasSetTracker AST; |
885 | |
886 | /// The LoopInfo of the loop being checked. |
887 | const LoopInfo *LI; |
888 | |
889 | /// Sets of potentially dependent accesses - members of one set share an |
890 | /// underlying pointer. The set "CheckDeps" identifies which sets really need a |
891 | /// dependence check. |
892 | MemoryDepChecker::DepCandidates &DepCands; |
893 | |
894 | /// Initial processing of memory accesses determined that we may need |
895 | /// to add memchecks. Perform the analysis to determine the necessary checks. |
896 | /// |
897 | /// Note that this is different from isDependencyCheckNeeded. When we retry |
898 | /// memcheck analysis without dependency checking |
899 | /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is |
900 | /// cleared while this remains set if we have potentially dependent accesses. |
901 | bool IsRTCheckAnalysisNeeded = false; |
902 | |
903 | /// The SCEV predicate containing all the SCEV-related assumptions. |
904 | PredicatedScalarEvolution &PSE; |
905 | |
906 | DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects; |
907 | |
908 | /// Alias scopes that are declared inside the loop, and as such not valid |
909 | /// across iterations. |
910 | SmallPtrSetImpl<MDNode *> &LoopAliasScopes; |
911 | }; |
912 | |
913 | } // end anonymous namespace |
914 | |
915 | /// Try to compute a constant stride for \p AR. Used by getPtrStride and |
916 | /// isNoWrap. |
917 | static std::optional<int64_t> |
918 | getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, |
919 | Value *Ptr, PredicatedScalarEvolution &PSE) { |
920 | // The access function must stride over the innermost loop. |
921 | if (Lp != AR->getLoop()) { |
922 | LLVM_DEBUG({ |
923 | dbgs() << "LAA: Bad stride - Not striding over innermost loop " ; |
924 | if (Ptr) |
925 | dbgs() << *Ptr << " " ; |
926 | |
927 | dbgs() << "SCEV: " << *AR << "\n" ; |
928 | }); |
929 | return std::nullopt; |
930 | } |
931 | |
932 | // Check the step is constant. |
933 | const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE()); |
934 | |
935 | // Calculate the pointer stride and check if it is constant. |
936 | const APInt *APStepVal; |
937 | if (!match(S: Step, P: m_scev_APInt(C&: APStepVal))) { |
938 | LLVM_DEBUG({ |
939 | dbgs() << "LAA: Bad stride - Not a constant strided " ; |
940 | if (Ptr) |
941 | dbgs() << *Ptr << " " ; |
942 | dbgs() << "SCEV: " << *AR << "\n" ; |
943 | }); |
944 | return std::nullopt; |
945 | } |
946 | |
947 | const auto &DL = Lp->getHeader()->getDataLayout(); |
948 | TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy); |
949 | int64_t Size = AllocSize.getFixedValue(); |
950 | |
951 | // Huge step value - give up. |
952 | std::optional<int64_t> StepVal = APStepVal->trySExtValue(); |
953 | if (!StepVal) |
954 | return std::nullopt; |
955 | |
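    | // The returned stride is expressed in units of AccessTy; a step that is not |
    | // a multiple of the access size is not a simple strided access. |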
956 | // Strided access. |
957 | return *StepVal % Size ? std::nullopt : std::make_optional(t: *StepVal / Size); |
958 | } |
959 | |
960 | /// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use |
961 | /// information from the IR pointer value to determine no-wrap. |
962 | static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, |
963 | Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, |
964 | std::optional<int64_t> Stride = std::nullopt) { |
965 | // FIXME: This should probably only return true for NUW. |
966 | if (AR->getNoWrapFlags(Mask: SCEV::NoWrapMask)) |
967 | return true; |
968 | |
969 | if (Ptr && PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW)) |
970 | return true; |
971 | |
972 | // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap, |
973 | // the distance between the previously accessed location and the wrapped |
974 | // location will be larger than half the pointer index type space. In that |
975 | // case, the GEP would be poison and any memory access dependent on it would |
976 | // be immediate UB when executed. |
977 | if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Val: Ptr); |
978 | GEP && GEP->hasNoUnsignedSignedWrap()) |
979 | return true; |
980 | |
981 | if (!Stride) |
982 | Stride = getStrideFromAddRec(AR, Lp: L, AccessTy, Ptr, PSE); |
983 | if (Stride) { |
984 | // If the null pointer is undefined, then an access sequence which would |
985 | // otherwise access it can be assumed not to unsigned wrap. Note that this |
986 | // assumes the object in memory is aligned to the natural alignment. |
987 | unsigned AddrSpace = AR->getType()->getPointerAddressSpace(); |
988 | if (!NullPointerIsDefined(F: L->getHeader()->getParent(), AS: AddrSpace) && |
989 | (Stride == 1 || Stride == -1)) |
990 | return true; |
991 | } |
992 | |
993 | if (Ptr && Assume) { |
994 | PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW); |
995 | LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n" |
996 | << "LAA: Pointer: " << *Ptr << "\n" |
997 | << "LAA: SCEV: " << *AR << "\n" |
998 | << "LAA: Added an overflow assumption\n" ); |
999 | return true; |
1000 | } |
1001 | |
1002 | return false; |
1003 | } |
1004 | |
1005 | static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, |
1006 | function_ref<void(Value *)> AddPointer) { |
1007 | SmallPtrSet<Value *, 8> Visited; |
1008 | SmallVector<Value *> WorkList; |
1009 | WorkList.push_back(Elt: StartPtr); |
1010 | |
1011 | while (!WorkList.empty()) { |
1012 | Value *Ptr = WorkList.pop_back_val(); |
1013 | if (!Visited.insert(Ptr).second) |
1014 | continue; |
1015 | auto *PN = dyn_cast<PHINode>(Val: Ptr); |
1016 | // SCEV does not look through non-header PHIs inside the loop. Such phis |
1017 | // can be analyzed by adding separate accesses for each incoming pointer |
1018 | // value. |
1019 | if (PN && InnermostLoop.contains(BB: PN->getParent()) && |
1020 | PN->getParent() != InnermostLoop.getHeader()) { |
1021 | llvm::append_range(C&: WorkList, R: PN->incoming_values()); |
1022 | } else |
1023 | AddPointer(Ptr); |
1024 | } |
1025 | } |
1026 | |
1027 | // Walk back through the IR for a pointer, looking for a select like the |
1028 | // following: |
1029 | // |
1030 | // %offset = select i1 %cmp, i64 %a, i64 %b |
1031 | // %addr = getelementptr double, double* %base, i64 %offset |
1032 | // %ld = load double, double* %addr, align 8 |
1033 | // |
1034 | // We won't be able to form a single SCEVAddRecExpr from this since the |
1035 | // address for each loop iteration depends on %cmp. We could potentially |
1036 | // produce multiple valid SCEVAddRecExprs, though, and check all of them for |
1037 | // memory safety/aliasing if needed. |
1038 | // |
1039 | // If we encounter some IR we don't yet handle, or something obviously fine |
1040 | // like a constant, then we just add the SCEV for that term to the list passed |
1041 | // in by the caller. If we have a node that may potentially yield a valid |
1042 | // SCEVAddRecExpr then we decompose it into parts and build the SCEV terms |
1043 | // ourselves before adding to the list. |
1044 | static void findForkedSCEVs( |
1045 | ScalarEvolution *SE, const Loop *L, Value *Ptr, |
1046 | SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList, |
1047 | unsigned Depth) { |
1048 | // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or |
1049 | // we've exceeded our limit on recursion, just return whatever we have |
1050 | // regardless of whether it can be used for a forked pointer or not, along |
1051 | // with an indication of whether it might be a poison or undef value. |
1052 | const SCEV *Scev = SE->getSCEV(V: Ptr); |
1053 | if (isa<SCEVAddRecExpr>(Val: Scev) || L->isLoopInvariant(V: Ptr) || |
1054 | !isa<Instruction>(Val: Ptr) || Depth == 0) { |
1055 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
1056 | return; |
1057 | } |
1058 | |
1059 | Depth--; |
1060 | |
1061 | auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) { |
1062 | return get<1>(Pair: S); |
1063 | }; |
1064 | |
1065 | auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) { |
1066 | switch (Opcode) { |
1067 | case Instruction::Add: |
1068 | return SE->getAddExpr(LHS: L, RHS: R); |
1069 | case Instruction::Sub: |
1070 | return SE->getMinusSCEV(LHS: L, RHS: R); |
1071 | default: |
1072 | llvm_unreachable("Unexpected binary operator when walking ForkedPtrs" ); |
1073 | } |
1074 | }; |
1075 | |
1076 | Instruction *I = cast<Instruction>(Val: Ptr); |
1077 | unsigned Opcode = I->getOpcode(); |
1078 | switch (Opcode) { |
1079 | case Instruction::GetElementPtr: { |
1080 | auto *GEP = cast<GetElementPtrInst>(Val: I); |
1081 | Type *SourceTy = GEP->getSourceElementType(); |
1082 | // We only handle base + single offset GEPs here for now. |
1083 | // Not dealing with preexisting gathers yet, so no vectors. |
1084 | if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) { |
1085 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: GEP)); |
1086 | break; |
1087 | } |
1088 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs; |
1089 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs; |
1090 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: BaseScevs, Depth); |
1091 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: OffsetScevs, Depth); |
1092 | |
1093 | // See if we need to freeze our fork... |
1094 | bool NeedsFreeze = any_of(Range&: BaseScevs, P: UndefPoisonCheck) || |
1095 | any_of(Range&: OffsetScevs, P: UndefPoisonCheck); |
1096 | |
1097 | // Check that we only have a single fork, on either the base or the offset. |
1098 | // Copy the SCEV across for the one without a fork in order to generate |
1099 | // the full SCEV for both sides of the GEP. |
1100 | if (OffsetScevs.size() == 2 && BaseScevs.size() == 1) |
1101 | BaseScevs.push_back(Elt: BaseScevs[0]); |
1102 | else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1) |
1103 | OffsetScevs.push_back(Elt: OffsetScevs[0]); |
1104 | else { |
1105 | ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze); |
1106 | break; |
1107 | } |
1108 | |
1109 | Type *IntPtrTy = SE->getEffectiveSCEVType(Ty: GEP->getPointerOperandType()); |
1110 | |
1111 | // Find the size of the type being pointed to. We only have a single |
1112 | // index term (guarded above) so we don't need to index into arrays or |
1113 | // structures, just get the size of the scalar value. |
1114 | const SCEV *Size = SE->getSizeOfExpr(IntTy: IntPtrTy, AllocTy: SourceTy); |
1115 | |
1116 | for (auto [B, O] : zip(t&: BaseScevs, u&: OffsetScevs)) { |
1117 | const SCEV *Base = get<0>(Pair: B); |
1118 | const SCEV *Offset = get<0>(Pair: O); |
1119 | |
1120 | // Scale up the offsets by the size of the type, then add to the bases. |
1121 | const SCEV *Scaled = |
1122 | SE->getMulExpr(LHS: Size, RHS: SE->getTruncateOrSignExtend(V: Offset, Ty: IntPtrTy)); |
1123 | ScevList.emplace_back(Args: SE->getAddExpr(LHS: Base, RHS: Scaled), Args&: NeedsFreeze); |
1124 | } |
1125 | break; |
1126 | } |
1127 | case Instruction::Select: { |
1128 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs; |
1129 | // A select means we've found a forked pointer, but we currently only |
1130 | // support a single select per pointer so if there's another behind this |
1131 | // then we just bail out and return the generic SCEV. |
1132 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth); |
1133 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 2), ScevList&: ChildScevs, Depth); |
1134 | if (ChildScevs.size() == 2) |
1135 | append_range(C&: ScevList, R&: ChildScevs); |
1136 | else |
1137 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
1138 | break; |
1139 | } |
1140 | case Instruction::PHI: { |
1141 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs; |
1142 | // A phi means we've found a forked pointer, but we currently only |
1143 | // support a single phi per pointer so if there's another behind this |
1144 | // then we just bail out and return the generic SCEV. |
1145 | if (I->getNumOperands() == 2) { |
1146 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: ChildScevs, Depth); |
1147 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth); |
1148 | } |
1149 | if (ChildScevs.size() == 2) |
1150 | append_range(C&: ScevList, R&: ChildScevs); |
1151 | else |
1152 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
1153 | break; |
1154 | } |
1155 | case Instruction::Add: |
1156 | case Instruction::Sub: { |
1157 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs; |
1158 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs; |
1159 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: LScevs, Depth); |
1160 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: RScevs, Depth); |
1161 | |
1162 | // See if we need to freeze our fork... |
1163 | bool NeedsFreeze = |
1164 | any_of(Range&: LScevs, P: UndefPoisonCheck) || any_of(Range&: RScevs, P: UndefPoisonCheck); |
1165 | |
1166 | // Check that we only have a single fork, on either the left or right side. |
1167 | // Copy the SCEV across for the one without a fork in order to generate |
1168 | // the full SCEV for both sides of the BinOp. |
1169 | if (LScevs.size() == 2 && RScevs.size() == 1) |
1170 | RScevs.push_back(Elt: RScevs[0]); |
1171 | else if (RScevs.size() == 2 && LScevs.size() == 1) |
1172 | LScevs.push_back(Elt: LScevs[0]); |
1173 | else { |
1174 | ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze); |
1175 | break; |
1176 | } |
1177 | |
1178 | for (auto [L, R] : zip(t&: LScevs, u&: RScevs)) |
1179 | ScevList.emplace_back(Args: GetBinOpExpr(Opcode, get<0>(Pair: L), get<0>(Pair: R)), |
1180 | Args&: NeedsFreeze); |
1181 | break; |
1182 | } |
1183 | default: |
1184 | // Just return the current SCEV if we haven't handled the instruction yet. |
1185 | LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n" ); |
1186 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
1187 | break; |
1188 | } |
1189 | } |
1190 | |
1191 | static SmallVector<PointerIntPair<const SCEV *, 1, bool>> |
1192 | findForkedPointer(PredicatedScalarEvolution &PSE, |
1193 | const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr, |
1194 | const Loop *L) { |
1195 | ScalarEvolution *SE = PSE.getSE(); |
1196 | assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!" ); |
1197 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs; |
1198 | findForkedSCEVs(SE, L, Ptr, ScevList&: Scevs, Depth: MaxForkedSCEVDepth); |
1199 | |
1200 | // For now, we will only accept a forked pointer with two possible SCEVs |
1201 | // that are either SCEVAddRecExprs or loop invariant. |
1202 | if (Scevs.size() == 2 && |
1203 | (isa<SCEVAddRecExpr>(Val: get<0>(Pair: Scevs[0])) || |
1204 | SE->isLoopInvariant(S: get<0>(Pair: Scevs[0]), L)) && |
1205 | (isa<SCEVAddRecExpr>(Val: get<0>(Pair: Scevs[1])) || |
1206 | SE->isLoopInvariant(S: get<0>(Pair: Scevs[1]), L))) { |
1207 | LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n" ); |
1208 | LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n" ); |
1209 | LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n" ); |
1210 | return Scevs; |
1211 | } |
1212 | |
1213 | return {{replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr), false}}; |
1214 | } |
1215 | |
1216 | bool AccessAnalysis::createCheckForAccess( |
1217 | RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy, |
1218 | const DenseMap<Value *, const SCEV *> &StridesMap, |
1219 | DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop, |
1220 | unsigned &RunningDepId, unsigned ASId, bool Assume) { |
1221 | Value *Ptr = Access.getPointer(); |
1222 | |
1223 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs = |
1224 | findForkedPointer(PSE, StridesMap, Ptr, L: TheLoop); |
1225 | assert(!TranslatedPtrs.empty() && "must have some translated pointers" ); |
1226 | |
1227 | // Check whether all pointers can participate in a runtime bounds check. They |
1228 | // must either be invariant or non-wrapping AddRecs. If \p Assume is true, |
1229 | // missing no-wrap facts may be established by adding run-time SCEV predicates. |
1230 | for (auto &P : TranslatedPtrs) { |
1231 | // The bounds for a loop-invariant pointer are trivial. |
1232 | if (PSE.getSE()->isLoopInvariant(S: P.getPointer(), L: TheLoop)) |
1233 | continue; |
1234 | |
1235 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: P.getPointer()); |
1236 | if (!AR && Assume) |
1237 | AR = PSE.getAsAddRec(V: Ptr); |
1238 | if (!AR || !AR->isAffine()) |
1239 | return false; |
1240 | |
1241 | // If there's only one option for Ptr, look it up after bounds and wrap |
1242 | // checking, because assumptions might have been added to PSE. |
1243 | if (TranslatedPtrs.size() == 1) { |
1244 | AR = |
1245 | cast<SCEVAddRecExpr>(Val: replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr)); |
1246 | P.setPointer(AR); |
1247 | } |
1248 | |
1249 | // When we run after a failing dependency check we have to make sure |
1250 | // we don't have wrapping pointers. |
1251 | if (!isNoWrap(PSE, AR, Ptr: TranslatedPtrs.size() == 1 ? Ptr : nullptr, AccessTy, |
1252 | L: TheLoop, Assume)) { |
1253 | return false; |
1254 | } |
1255 | } |
1256 | |
1257 | for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) { |
1258 | // The id of the dependence set. |
1259 | unsigned DepId; |
1260 | |
1261 | if (isDependencyCheckNeeded()) { |
1262 | Value *Leader = DepCands.getLeaderValue(V: Access).getPointer(); |
1263 | unsigned &LeaderId = DepSetId[Leader]; |
1264 | if (!LeaderId) |
1265 | LeaderId = RunningDepId++; |
1266 | DepId = LeaderId; |
1267 | } else |
1268 | // Each access has its own dependence set. |
1269 | DepId = RunningDepId++; |
1270 | |
1271 | bool IsWrite = Access.getInt(); |
1272 | RtCheck.insert(Lp: TheLoop, Ptr, PtrExpr, AccessTy, WritePtr: IsWrite, DepSetId: DepId, ASId, PSE, |
1273 | NeedsFreeze); |
1274 | LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); |
1275 | } |
1276 | |
1277 | return true; |
1278 | } |
1279 | |
1280 | bool AccessAnalysis::canCheckPtrAtRT( |
1281 | RuntimePointerChecking &RtCheck, Loop *TheLoop, |
1282 | const DenseMap<Value *, const SCEV *> &StridesMap, Value *&UncomputablePtr, |
1283 | bool AllowPartial) { |
1284 | // Find pointers with computable bounds. We are going to use this information |
1285 | // to place a runtime bound check. |
1286 | bool CanDoRT = true; |
1287 | |
1288 | bool MayNeedRTCheck = false; |
1289 | if (!IsRTCheckAnalysisNeeded) return true; |
1290 | |
1291 | bool IsDepCheckNeeded = isDependencyCheckNeeded(); |
1292 | |
// We assign a consecutive id to accesses from different alias sets.
// Accesses between different groups don't need to be checked.
1295 | unsigned ASId = 0; |
1296 | for (const auto &AS : AST) { |
1297 | int NumReadPtrChecks = 0; |
1298 | int NumWritePtrChecks = 0; |
1299 | bool CanDoAliasSetRT = true; |
1300 | ++ASId; |
1301 | auto ASPointers = AS.getPointers(); |
1302 | |
// We assign consecutive ids to accesses from different dependence sets.
// Accesses within the same set don't need a runtime check.
1305 | unsigned RunningDepId = 1; |
1306 | DenseMap<Value *, unsigned> DepSetId; |
1307 | |
1308 | SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries; |
1309 | |
1310 | // First, count how many write and read accesses are in the alias set. Also |
1311 | // collect MemAccessInfos for later. |
1312 | SmallVector<MemAccessInfo, 4> AccessInfos; |
1313 | for (const Value *ConstPtr : ASPointers) { |
1314 | Value *Ptr = const_cast<Value *>(ConstPtr); |
1315 | bool IsWrite = Accesses.contains(Key: MemAccessInfo(Ptr, true)); |
1316 | if (IsWrite) |
1317 | ++NumWritePtrChecks; |
1318 | else |
1319 | ++NumReadPtrChecks; |
1320 | AccessInfos.emplace_back(Args&: Ptr, Args&: IsWrite); |
1321 | } |
1322 | |
// We do not need runtime checks for this alias set if there are no writes,
// or there is a single write and no reads.
1325 | if (NumWritePtrChecks == 0 || |
1326 | (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) { |
1327 | assert((ASPointers.size() <= 1 || |
1328 | all_of(ASPointers, |
1329 | [this](const Value *Ptr) { |
1330 | MemAccessInfo AccessWrite(const_cast<Value *>(Ptr), |
1331 | true); |
1332 | return !DepCands.contains(AccessWrite); |
1333 | })) && |
1334 | "Can only skip updating CanDoRT below, if all entries in AS " |
1335 | "are reads or there is at most 1 entry" ); |
1336 | continue; |
1337 | } |
1338 | |
1339 | for (auto &Access : AccessInfos) { |
1340 | for (const auto &AccessTy : Accesses[Access]) { |
1341 | if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, |
1342 | DepSetId, TheLoop, RunningDepId, ASId, |
1343 | Assume: false)) { |
1344 | LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" |
1345 | << *Access.getPointer() << '\n'); |
1346 | Retries.emplace_back(Args&: Access, Args: AccessTy); |
1347 | CanDoAliasSetRT = false; |
1348 | } |
1349 | } |
1350 | } |
1351 | |
1352 | // Note that this function computes CanDoRT and MayNeedRTCheck |
1353 | // independently. For example CanDoRT=false, MayNeedRTCheck=false means that |
1354 | // we have a pointer for which we couldn't find the bounds but we don't |
1355 | // actually need to emit any checks so it does not matter. |
1356 | // |
1357 | // We need runtime checks for this alias set, if there are at least 2 |
1358 | // dependence sets (in which case RunningDepId > 2) or if we need to re-try |
1359 | // any bound checks (because in that case the number of dependence sets is |
1360 | // incomplete). |
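// E.g. (illustrative): with two pointers that end up in different dependence
// sets, RunningDepId goes 1 -> 2 -> 3 above, so RunningDepId > 2 and checks
// between the two sets are required.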
1361 | bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty(); |
1362 | |
1363 | // We need to perform run-time alias checks, but some pointers had bounds |
1364 | // that couldn't be checked. |
1365 | if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) { |
1366 | // Reset the CanDoSetRt flag and retry all accesses that have failed. |
1367 | // We know that we need these checks, so we can now be more aggressive |
1368 | // and add further checks if required (overflow checks). |
1369 | CanDoAliasSetRT = true; |
1370 | for (const auto &[Access, AccessTy] : Retries) { |
1371 | if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, |
1372 | DepSetId, TheLoop, RunningDepId, ASId, |
1373 | /*Assume=*/true)) { |
1374 | CanDoAliasSetRT = false; |
1375 | UncomputablePtr = Access.getPointer(); |
1376 | if (!AllowPartial) |
1377 | break; |
1378 | } |
1379 | } |
1380 | } |
1381 | |
1382 | CanDoRT &= CanDoAliasSetRT; |
1383 | MayNeedRTCheck |= NeedsAliasSetRTCheck; |
1385 | } |
1386 | |
1387 | // If the pointers that we would use for the bounds comparison have different |
1388 | // address spaces, assume the values aren't directly comparable, so we can't |
1389 | // use them for the runtime check. We also have to assume they could |
1390 | // overlap. In the future there should be metadata for whether address spaces |
1391 | // are disjoint. |
1392 | unsigned NumPointers = RtCheck.Pointers.size(); |
1393 | for (unsigned i = 0; i < NumPointers; ++i) { |
1394 | for (unsigned j = i + 1; j < NumPointers; ++j) { |
1395 | // Only need to check pointers between two different dependency sets. |
1396 | if (RtCheck.Pointers[i].DependencySetId == |
1397 | RtCheck.Pointers[j].DependencySetId) |
1398 | continue; |
1399 | // Only need to check pointers in the same alias set. |
1400 | if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId) |
1401 | continue; |
1402 | |
1403 | Value *PtrI = RtCheck.Pointers[i].PointerValue; |
1404 | Value *PtrJ = RtCheck.Pointers[j].PointerValue; |
1405 | |
1406 | unsigned ASi = PtrI->getType()->getPointerAddressSpace(); |
1407 | unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); |
1408 | if (ASi != ASj) { |
1409 | LLVM_DEBUG( |
1410 | dbgs() << "LAA: Runtime check would require comparison between" |
1411 | " different address spaces\n" ); |
1412 | return false; |
1413 | } |
1414 | } |
1415 | } |
1416 | |
1417 | if (MayNeedRTCheck && (CanDoRT || AllowPartial)) |
1418 | RtCheck.generateChecks(DepCands, UseDependencies: IsDepCheckNeeded); |
1419 | |
1420 | LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks() |
1421 | << " pointer comparisons.\n" ); |
1422 | |
// If we can do run-time checks but no checks were generated, no runtime checks
// are needed. This can happen, for example, when all pointers point to the
// same underlying object.
1426 | RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck; |
1427 | |
1428 | bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT; |
1429 | assert(CanDoRTIfNeeded == (CanDoRT || !MayNeedRTCheck) && |
1430 | "CanDoRTIfNeeded depends on RtCheck.Need" ); |
1431 | if (!CanDoRTIfNeeded && !AllowPartial) |
1432 | RtCheck.reset(); |
1433 | return CanDoRTIfNeeded; |
1434 | } |
1435 | |
1436 | void AccessAnalysis::processMemAccesses() { |
1437 | // We process the set twice: first we process read-write pointers, last we |
1438 | // process read-only pointers. This allows us to skip dependence tests for |
1439 | // read-only pointers. |
1440 | |
1441 | LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n" ); |
1442 | LLVM_DEBUG(dbgs() << " AST: " ; AST.dump()); |
1443 | LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n" ); |
1444 | LLVM_DEBUG({ |
1445 | for (const auto &[A, _] : Accesses) |
1446 | dbgs() << "\t" << *A.getPointer() << " (" |
1447 | << (A.getInt() |
1448 | ? "write" |
1449 | : (ReadOnlyPtr.contains(A.getPointer()) ? "read-only" |
1450 | : "read" )) |
1451 | << ")\n" ; |
1452 | }); |
1453 | |
1454 | // The AliasSetTracker has nicely partitioned our pointers by metadata |
1455 | // compatibility and potential for underlying-object overlap. As a result, we |
1456 | // only need to check for potential pointer dependencies within each alias |
1457 | // set. |
1458 | for (const auto &AS : AST) { |
// Note that both the alias-set tracker and the alias sets themselves use
// ordered collections internally and so the iteration order here is
// deterministic.
1462 | auto ASPointers = AS.getPointers(); |
1463 | |
1464 | bool SetHasWrite = false; |
1465 | |
1466 | // Map of (pointer to underlying objects, accessed address space) to last |
1467 | // access encountered. |
1468 | typedef DenseMap<std::pair<const Value *, unsigned>, MemAccessInfo> |
1469 | UnderlyingObjToAccessMap; |
1470 | UnderlyingObjToAccessMap ObjToLastAccess; |
1471 | |
// Set of accesses to check after all writes have been processed.
1473 | PtrAccessMap DeferredAccesses; |
1474 | |
1475 | // Iterate over each alias set twice, once to process read/write pointers, |
1476 | // and then to process read-only pointers. |
1477 | for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { |
1478 | bool UseDeferred = SetIteration > 0; |
1479 | PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses; |
1480 | |
1481 | for (const Value *ConstPtr : ASPointers) { |
1482 | Value *Ptr = const_cast<Value *>(ConstPtr); |
1483 | |
1484 | // For a single memory access in AliasSetTracker, Accesses may contain |
1485 | // both read and write, and they both need to be handled for CheckDeps. |
1486 | for (const auto &[AC, _] : S) { |
1487 | if (AC.getPointer() != Ptr) |
1488 | continue; |
1489 | |
1490 | bool IsWrite = AC.getInt(); |
1491 | |
1492 | // If we're using the deferred access set, then it contains only |
1493 | // reads. |
1494 | bool IsReadOnlyPtr = ReadOnlyPtr.contains(Ptr) && !IsWrite; |
1495 | if (UseDeferred && !IsReadOnlyPtr) |
1496 | continue; |
1497 | // Otherwise, the pointer must be in the PtrAccessSet, either as a |
1498 | // read or a write. |
1499 | assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || |
1500 | S.contains(MemAccessInfo(Ptr, false))) && |
1501 | "Alias-set pointer not in the access set?" ); |
1502 | |
1503 | MemAccessInfo Access(Ptr, IsWrite); |
1504 | DepCands.insert(Data: Access); |
1505 | |
// Memorize read-only pointers for later processing and skip them in
// the first round (they need to be checked after we have seen all
// write pointers). Note: we also mark pointers that are not
// consecutive as "read-only" pointers (so that we check
// "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1511 | if (!UseDeferred && IsReadOnlyPtr) { |
1512 | // We only use the pointer keys, the types vector values don't |
1513 | // matter. |
1514 | DeferredAccesses.insert(KV: {Access, {}}); |
1515 | continue; |
1516 | } |
1517 | |
// If this is a write, check other reads and writes for conflicts. If
// this is a read, only check other writes for conflicts (but only if
// there is no other write to the ptr - this is an optimization to
// catch "a[i] = a[i] + " without having to do a dependence check).
1522 | if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { |
1523 | CheckDeps.push_back(Elt: Access); |
1524 | IsRTCheckAnalysisNeeded = true; |
1525 | } |
1526 | |
1527 | if (IsWrite) |
1528 | SetHasWrite = true; |
1529 | |
1530 | // Create sets of pointers connected by a shared alias set and |
1531 | // underlying object. |
1532 | SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr]; |
1533 | UOs = {}; |
1534 | ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI); |
1535 | LLVM_DEBUG(dbgs() |
1536 | << "Underlying objects for pointer " << *Ptr << "\n" ); |
1537 | for (const Value *UnderlyingObj : UOs) { |
// nullptr never aliases; don't join sets for pointers that have "null"
// in their UnderlyingObjects list.
1540 | if (isa<ConstantPointerNull>(Val: UnderlyingObj) && |
1541 | !NullPointerIsDefined( |
1542 | F: TheLoop->getHeader()->getParent(), |
1543 | AS: UnderlyingObj->getType()->getPointerAddressSpace())) |
1544 | continue; |
1545 | |
1546 | auto [It, Inserted] = ObjToLastAccess.try_emplace( |
1547 | Key: {UnderlyingObj, |
1548 | cast<PointerType>(Val: Ptr->getType())->getAddressSpace()}, |
1549 | Args&: Access); |
1550 | if (!Inserted) { |
1551 | DepCands.unionSets(V1: Access, V2: It->second); |
1552 | It->second = Access; |
1553 | } |
1554 | |
1555 | LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n" ); |
1556 | } |
1557 | } |
1558 | } |
1559 | } |
1560 | } |
1561 | } |
1562 | |
1563 | /// Check whether the access through \p Ptr has a constant stride. |
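/// For example (illustrative): a consecutive i32 access A[i] has the pointer
/// SCEV {A,+,4}<L> and a stride of 1 (in units of AccessTy), while a
/// loop-invariant pointer is reported with a stride of 0.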
1564 | std::optional<int64_t> |
1565 | llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, |
1566 | const Loop *Lp, |
1567 | const DenseMap<Value *, const SCEV *> &StridesMap, |
1568 | bool Assume, bool ShouldCheckWrap) { |
1569 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr); |
1570 | if (PSE.getSE()->isLoopInvariant(S: PtrScev, L: Lp)) |
1571 | return 0; |
1572 | |
1573 | assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr" ); |
1574 | if (isa<ScalableVectorType>(Val: AccessTy)) { |
1575 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy |
1576 | << "\n" ); |
1577 | return std::nullopt; |
1578 | } |
1579 | |
1580 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev); |
1581 | if (Assume && !AR) |
1582 | AR = PSE.getAsAddRec(V: Ptr); |
1583 | |
1584 | if (!AR) { |
1585 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr |
1586 | << " SCEV: " << *PtrScev << "\n" ); |
1587 | return std::nullopt; |
1588 | } |
1589 | |
1590 | std::optional<int64_t> Stride = |
1591 | getStrideFromAddRec(AR, Lp, AccessTy, Ptr, PSE); |
1592 | if (!ShouldCheckWrap || !Stride) |
1593 | return Stride; |
1594 | |
1595 | if (isNoWrap(PSE, AR, Ptr, AccessTy, L: Lp, Assume, Stride)) |
1596 | return Stride; |
1597 | |
1598 | LLVM_DEBUG( |
1599 | dbgs() << "LAA: Bad stride - Pointer may wrap in the address space " |
1600 | << *Ptr << " SCEV: " << *AR << "\n" ); |
1601 | return std::nullopt; |
1602 | } |
1603 | |
1604 | std::optional<int64_t> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, |
1605 | Type *ElemTyB, Value *PtrB, |
1606 | const DataLayout &DL, |
1607 | ScalarEvolution &SE, |
1608 | bool StrictCheck, bool CheckType) { |
1609 | assert(PtrA && PtrB && "Expected non-nullptr pointers." ); |
1610 | |
1611 | // Make sure that A and B are different pointers. |
1612 | if (PtrA == PtrB) |
1613 | return 0; |
1614 | |
1615 | // Make sure that the element types are the same if required. |
1616 | if (CheckType && ElemTyA != ElemTyB) |
1617 | return std::nullopt; |
1618 | |
1619 | unsigned ASA = PtrA->getType()->getPointerAddressSpace(); |
1620 | unsigned ASB = PtrB->getType()->getPointerAddressSpace(); |
1621 | |
1622 | // Check that the address spaces match. |
1623 | if (ASA != ASB) |
1624 | return std::nullopt; |
1625 | unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
1626 | |
1627 | APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0); |
1628 | const Value *PtrA1 = PtrA->stripAndAccumulateConstantOffsets( |
1629 | DL, Offset&: OffsetA, /*AllowNonInbounds=*/true); |
1630 | const Value *PtrB1 = PtrB->stripAndAccumulateConstantOffsets( |
1631 | DL, Offset&: OffsetB, /*AllowNonInbounds=*/true); |
1632 | |
1633 | std::optional<int64_t> Val; |
1634 | if (PtrA1 == PtrB1) { |
1635 | // Retrieve the address space again as pointer stripping now tracks through |
1636 | // `addrspacecast`. |
1637 | ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace(); |
1638 | ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace(); |
1639 | // Check that the address spaces match and that the pointers are valid. |
1640 | if (ASA != ASB) |
1641 | return std::nullopt; |
1642 | |
1643 | IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
1644 | OffsetA = OffsetA.sextOrTrunc(width: IdxWidth); |
1645 | OffsetB = OffsetB.sextOrTrunc(width: IdxWidth); |
1646 | |
1647 | OffsetB -= OffsetA; |
1648 | Val = OffsetB.trySExtValue(); |
1649 | } else { |
1650 | // Otherwise compute the distance with SCEV between the base pointers. |
1651 | const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA); |
1652 | const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB); |
1653 | std::optional<APInt> Diff = |
1654 | SE.computeConstantDifference(LHS: PtrSCEVB, RHS: PtrSCEVA); |
1655 | if (!Diff) |
1656 | return std::nullopt; |
1657 | Val = Diff->trySExtValue(); |
1658 | } |
1659 | |
1660 | if (!Val) |
1661 | return std::nullopt; |
1662 | |
1663 | int64_t Size = DL.getTypeStoreSize(Ty: ElemTyA); |
1664 | int64_t Dist = *Val / Size; |
1665 | |
// Ensure that the calculated distance matches the type-based one after all
// the bitcasts have been removed from the provided pointers.
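// E.g. (illustrative): with i32 elements and a byte distance of 6, Dist is
// 6 / 4 = 1 but 1 * 4 != 6, so under StrictCheck we return std::nullopt.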
1668 | if (!StrictCheck || Dist * Size == Val) |
1669 | return Dist; |
1670 | return std::nullopt; |
1671 | } |
1672 | |
1673 | bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, |
1674 | const DataLayout &DL, ScalarEvolution &SE, |
1675 | SmallVectorImpl<unsigned> &SortedIndices) { |
1676 | assert(llvm::all_of( |
1677 | VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && |
1678 | "Expected list of pointer operands." ); |
// Walk over the pointers, and map each of them to an offset relative to the
// first pointer in the array.
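// E.g. (illustrative): for VL = {&A[1], &A[0], &A[2]} over i32 elements the
// offsets relative to VL[0] are {0, -1, 1}; the given order is not
// consecutive, so SortedIndices becomes {1, 0, 2}, i.e. &A[0], &A[1], &A[2].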
1681 | Value *Ptr0 = VL[0]; |
1682 | |
1683 | using DistOrdPair = std::pair<int64_t, unsigned>; |
1684 | auto Compare = llvm::less_first(); |
1685 | std::set<DistOrdPair, decltype(Compare)> Offsets(Compare); |
1686 | Offsets.emplace(args: 0, args: 0); |
1687 | bool IsConsecutive = true; |
1688 | for (auto [Idx, Ptr] : drop_begin(RangeOrContainer: enumerate(First&: VL))) { |
1689 | std::optional<int64_t> Diff = |
1690 | getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE, |
1691 | /*StrictCheck=*/true); |
1692 | if (!Diff) |
1693 | return false; |
1694 | |
// Check if a pointer with the same offset has already been found.
1696 | int64_t Offset = *Diff; |
1697 | auto [It, IsInserted] = Offsets.emplace(args&: Offset, args&: Idx); |
1698 | if (!IsInserted) |
1699 | return false; |
1700 | // Consecutive order if the inserted element is the last one. |
1701 | IsConsecutive &= std::next(x: It) == Offsets.end(); |
1702 | } |
1703 | SortedIndices.clear(); |
1704 | if (!IsConsecutive) { |
1705 | // Fill SortedIndices array only if it is non-consecutive. |
1706 | SortedIndices.resize(N: VL.size()); |
1707 | for (auto [Idx, Off] : enumerate(First&: Offsets)) |
1708 | SortedIndices[Idx] = Off.second; |
1709 | } |
1710 | return true; |
1711 | } |
1712 | |
1713 | /// Returns true if the memory operations \p A and \p B are consecutive. |
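/// For example (illustrative): two i32 loads from A[i] and A[i+1] are
/// consecutive (their pointer difference is exactly one element), while loads
/// from A[i] and A[i+2] are not.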
1714 | bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, |
1715 | ScalarEvolution &SE, bool CheckType) { |
1716 | Value *PtrA = getLoadStorePointerOperand(V: A); |
1717 | Value *PtrB = getLoadStorePointerOperand(V: B); |
1718 | if (!PtrA || !PtrB) |
1719 | return false; |
1720 | Type *ElemTyA = getLoadStoreType(I: A); |
1721 | Type *ElemTyB = getLoadStoreType(I: B); |
1722 | std::optional<int64_t> Diff = |
1723 | getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE, |
1724 | /*StrictCheck=*/true, CheckType); |
1725 | return Diff == 1; |
1726 | } |
1727 | |
1728 | void MemoryDepChecker::addAccess(StoreInst *SI) { |
1729 | visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
1730 | AddPointer: [this, SI](Value *Ptr) { |
1731 | Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx); |
1732 | InstMap.push_back(Elt: SI); |
1733 | ++AccessIdx; |
1734 | }); |
1735 | } |
1736 | |
1737 | void MemoryDepChecker::addAccess(LoadInst *LI) { |
1738 | visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
1739 | AddPointer: [this, LI](Value *Ptr) { |
1740 | Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx); |
1741 | InstMap.push_back(Elt: LI); |
1742 | ++AccessIdx; |
1743 | }); |
1744 | } |
1745 | |
1746 | MemoryDepChecker::VectorizationSafetyStatus |
1747 | MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) { |
1748 | switch (Type) { |
1749 | case NoDep: |
1750 | case Forward: |
1751 | case BackwardVectorizable: |
1752 | return VectorizationSafetyStatus::Safe; |
1753 | |
1754 | case Unknown: |
1755 | return VectorizationSafetyStatus::PossiblySafeWithRtChecks; |
1756 | case ForwardButPreventsForwarding: |
1757 | case Backward: |
1758 | case BackwardVectorizableButPreventsForwarding: |
1759 | case IndirectUnsafe: |
1760 | return VectorizationSafetyStatus::Unsafe; |
1761 | } |
1762 | llvm_unreachable("unexpected DepType!" ); |
1763 | } |
1764 | |
1765 | bool MemoryDepChecker::Dependence::isBackward() const { |
1766 | switch (Type) { |
1767 | case NoDep: |
1768 | case Forward: |
1769 | case ForwardButPreventsForwarding: |
1770 | case Unknown: |
1771 | case IndirectUnsafe: |
1772 | return false; |
1773 | |
1774 | case BackwardVectorizable: |
1775 | case Backward: |
1776 | case BackwardVectorizableButPreventsForwarding: |
1777 | return true; |
1778 | } |
1779 | llvm_unreachable("unexpected DepType!" ); |
1780 | } |
1781 | |
1782 | bool MemoryDepChecker::Dependence::isPossiblyBackward() const { |
1783 | return isBackward() || Type == Unknown || Type == IndirectUnsafe; |
1784 | } |
1785 | |
1786 | bool MemoryDepChecker::Dependence::isForward() const { |
1787 | switch (Type) { |
1788 | case Forward: |
1789 | case ForwardButPreventsForwarding: |
1790 | return true; |
1791 | |
1792 | case NoDep: |
1793 | case Unknown: |
1794 | case BackwardVectorizable: |
1795 | case Backward: |
1796 | case BackwardVectorizableButPreventsForwarding: |
1797 | case IndirectUnsafe: |
1798 | return false; |
1799 | } |
1800 | llvm_unreachable("unexpected DepType!" ); |
1801 | } |
1802 | |
1803 | bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance, |
1804 | uint64_t TypeByteSize, |
1805 | unsigned CommonStride) { |
// If loads occur at a distance that is not a multiple of a feasible vector
// factor, store-load forwarding does not take place.
// Positive dependences might cause trouble because vectorizing them might
// prevent store-load forwarding, making vectorized code run a lot slower.
//   a[i] = a[i-3] ^ a[i-8];
// The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
// hence on your typical architecture store-load forwarding does not take
// place. Vectorizing in such cases does not make sense.
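// E.g. (illustrative): with TypeByteSize = 4 and Distance = 12 bytes, the
// first candidate VF of 8 bytes already conflicts (12 % 8 != 0 and
// 12 / 8 = 1 is below the 8 * TypeByteSize iteration threshold used below),
// so the largest conflict-free power-of-2 VF is 4 bytes, which is less than
// 2 * TypeByteSize, and we report a possible forwarding conflict.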
1815 | |
1816 | // After this many iterations store-to-load forwarding conflicts should not |
1817 | // cause any slowdowns. |
1818 | const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize; |
1819 | // Maximum vector factor. |
1820 | uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 = |
1821 | std::min(a: VectorizerParams::MaxVectorWidth * TypeByteSize, |
1822 | b: MaxStoreLoadForwardSafeDistanceInBits); |
1823 | |
1824 | // Compute the smallest VF at which the store and load would be misaligned. |
1825 | for (uint64_t VF = 2 * TypeByteSize; |
1826 | VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) { |
// If the number of vector iterations between the store and the load is
// small we could incur conflicts.
1829 | if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) { |
1830 | MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1); |
1831 | break; |
1832 | } |
1833 | } |
1834 | |
1835 | if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) { |
1836 | LLVM_DEBUG( |
1837 | dbgs() << "LAA: Distance " << Distance |
1838 | << " that could cause a store-load forwarding conflict\n" ); |
1839 | return true; |
1840 | } |
1841 | |
1842 | if (CommonStride && |
1843 | MaxVFWithoutSLForwardIssuesPowerOf2 < |
1844 | MaxStoreLoadForwardSafeDistanceInBits && |
1845 | MaxVFWithoutSLForwardIssuesPowerOf2 != |
1846 | VectorizerParams::MaxVectorWidth * TypeByteSize) { |
1847 | uint64_t MaxVF = |
1848 | bit_floor(Value: MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride); |
1849 | uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; |
1850 | MaxStoreLoadForwardSafeDistanceInBits = |
1851 | std::min(a: MaxStoreLoadForwardSafeDistanceInBits, b: MaxVFInBits); |
1852 | } |
1853 | return false; |
1854 | } |
1855 | |
1856 | void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) { |
1857 | if (Status < S) |
1858 | Status = S; |
1859 | } |
1860 | |
/// Given a dependence-distance \p Dist between two memory accesses that have
/// strides in the same direction, with \p MaxStride being the maximum absolute
/// stride, in a loop whose maximum backedge taken count is \p MaxBTC, check if
/// it is possible to prove statically that the dependence distance is larger
/// than the range that the accesses will travel through the execution of the
/// loop. If so, return true; false otherwise. This is useful
1867 | /// for example in loops such as the following (PR31098): |
1868 | /// |
1869 | /// for (i = 0; i < D; ++i) { |
1870 | /// = out[i]; |
1871 | /// out[i+D] = |
1872 | /// } |
1873 | static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, |
1874 | const SCEV &MaxBTC, const SCEV &Dist, |
1875 | uint64_t MaxStride) { |
1876 | |
1877 | // If we can prove that |
1878 | // (**) |Dist| > MaxBTC * Step |
1879 | // where Step is the absolute stride of the memory accesses in bytes, |
1880 | // then there is no dependence. |
1881 | // |
1882 | // Rationale: |
1883 | // We basically want to check if the absolute distance (|Dist/Step|) |
1884 | // is >= the loop iteration count (or > MaxBTC). |
1885 | // This is equivalent to the Strong SIV Test (Practical Dependence Testing, |
1886 | // Section 4.2.1); Note, that for vectorization it is sufficient to prove |
1887 | // that the dependence distance is >= VF; This is checked elsewhere. |
1888 | // But in some cases we can prune dependence distances early, and |
1889 | // even before selecting the VF, and without a runtime test, by comparing |
1890 | // the distance against the loop iteration count. Since the vectorized code |
1891 | // will be executed only if LoopCount >= VF, proving distance >= LoopCount |
1892 | // also guarantees that distance >= VF. |
1893 | // |
1894 | const SCEV *Step = SE.getConstant(Ty: MaxBTC.getType(), V: MaxStride); |
1895 | const SCEV *Product = SE.getMulExpr(LHS: &MaxBTC, RHS: Step); |
1896 | |
1897 | const SCEV *CastedDist = &Dist; |
1898 | const SCEV *CastedProduct = Product; |
1899 | uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType()); |
1900 | uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType()); |
1901 | |
1902 | // The dependence distance can be positive/negative, so we sign extend Dist; |
1903 | // The multiplication of the absolute stride in bytes and the |
1904 | // backedgeTakenCount is non-negative, so we zero extend Product. |
1905 | if (DistTypeSizeBits > ProductTypeSizeBits) |
1906 | CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType()); |
1907 | else |
1908 | CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType()); |
1909 | |
1910 | // Is Dist - (MaxBTC * Step) > 0 ? |
1911 | // (If so, then we have proven (**) because |Dist| >= Dist) |
1912 | const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct); |
1913 | if (SE.isKnownPositive(S: Minus)) |
1914 | return true; |
1915 | |
1916 | // Second try: Is -Dist - (MaxBTC * Step) > 0 ? |
1917 | // (If so, then we have proven (**) because |Dist| >= -1*Dist) |
1918 | const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist); |
1919 | Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct); |
1920 | return SE.isKnownPositive(S: Minus); |
1921 | } |
1922 | |
/// Check the dependence for two accesses with the same stride \p Stride.
/// \p Distance is the positive distance in bytes, and \p TypeByteSize is the
/// type size in bytes.
1926 | /// |
1927 | /// \returns true if they are independent. |
1928 | static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, |
1929 | uint64_t TypeByteSize) { |
1930 | assert(Stride > 1 && "The stride must be greater than 1" ); |
1931 | assert(TypeByteSize > 0 && "The type size in byte must be non-zero" ); |
1932 | assert(Distance > 0 && "The distance must be non-zero" ); |
1933 | |
// Skip if the distance is not a multiple of the type byte size.
1935 | if (Distance % TypeByteSize) |
1936 | return false; |
1937 | |
// No dependence if the distance is not a multiple of the stride.
1939 | // E.g. |
1940 | // for (i = 0; i < 1024 ; i += 4) |
1941 | // A[i+2] = A[i] + 1; |
1942 | // |
1943 | // Two accesses in memory (distance is 2, stride is 4): |
1944 | // | A[0] | | | | A[4] | | | | |
1945 | // | | | A[2] | | | | A[6] | | |
1946 | // |
1947 | // E.g. |
1948 | // for (i = 0; i < 1024 ; i += 3) |
1949 | // A[i+4] = A[i] + 1; |
1950 | // |
1951 | // Two accesses in memory (distance is 4, stride is 3): |
1952 | // | A[0] | | | A[3] | | | A[6] | | | |
1953 | // | | | | | A[4] | | | A[7] | | |
1954 | return Distance % Stride; |
1955 | } |
1956 | |
1957 | std::variant<MemoryDepChecker::Dependence::DepType, |
1958 | MemoryDepChecker::DepDistanceStrideAndSizeInfo> |
1959 | MemoryDepChecker::getDependenceDistanceStrideAndSize( |
1960 | const AccessAnalysis::MemAccessInfo &A, Instruction *AInst, |
1961 | const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) { |
1962 | const auto &DL = InnermostLoop->getHeader()->getDataLayout(); |
1963 | auto &SE = *PSE.getSE(); |
1964 | const auto &[APtr, AIsWrite] = A; |
1965 | const auto &[BPtr, BIsWrite] = B; |
1966 | |
1967 | // Two reads are independent. |
1968 | if (!AIsWrite && !BIsWrite) |
1969 | return MemoryDepChecker::Dependence::NoDep; |
1970 | |
1971 | Type *ATy = getLoadStoreType(I: AInst); |
1972 | Type *BTy = getLoadStoreType(I: BInst); |
1973 | |
1974 | // We cannot check pointers in different address spaces. |
1975 | if (APtr->getType()->getPointerAddressSpace() != |
1976 | BPtr->getType()->getPointerAddressSpace()) |
1977 | return MemoryDepChecker::Dependence::Unknown; |
1978 | |
1979 | std::optional<int64_t> StrideAPtr = |
1980 | getPtrStride(PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true); |
1981 | std::optional<int64_t> StrideBPtr = |
1982 | getPtrStride(PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true); |
1983 | |
1984 | const SCEV *Src = PSE.getSCEV(V: APtr); |
1985 | const SCEV *Sink = PSE.getSCEV(V: BPtr); |
1986 | |
1987 | // If the induction step is negative we have to invert source and sink of the |
1988 | // dependence when measuring the distance between them. We should not swap |
1989 | // AIsWrite with BIsWrite, as their uses expect them in program order. |
1990 | if (StrideAPtr && *StrideAPtr < 0) { |
1991 | std::swap(a&: Src, b&: Sink); |
1992 | std::swap(a&: AInst, b&: BInst); |
1993 | std::swap(a&: ATy, b&: BTy); |
1994 | std::swap(lhs&: StrideAPtr, rhs&: StrideBPtr); |
1995 | } |
1996 | |
1997 | const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src); |
1998 | |
1999 | LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink |
2000 | << "\n" ); |
2001 | LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst |
2002 | << ": " << *Dist << "\n" ); |
2003 | |
2004 | // Check if we can prove that Sink only accesses memory after Src's end or |
2005 | // vice versa. At the moment this is limited to cases where either source or |
2006 | // sink are loop invariant to avoid compile-time increases. This is not |
2007 | // required for correctness. |
2008 | if (SE.isLoopInvariant(S: Src, L: InnermostLoop) || |
2009 | SE.isLoopInvariant(S: Sink, L: InnermostLoop)) { |
2010 | const SCEV *BTC = PSE.getBackedgeTakenCount(); |
2011 | const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); |
2012 | const auto &[SrcStart_, SrcEnd_] = |
2013 | getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Src, AccessTy: ATy, BTC, MaxBTC: SymbolicMaxBTC, |
2014 | SE: PSE.getSE(), PointerBounds: &PointerBounds); |
2015 | const auto &[SinkStart_, SinkEnd_] = |
2016 | getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Sink, AccessTy: BTy, BTC, MaxBTC: SymbolicMaxBTC, |
2017 | SE: PSE.getSE(), PointerBounds: &PointerBounds); |
2018 | if (!isa<SCEVCouldNotCompute>(Val: SrcStart_) && |
2019 | !isa<SCEVCouldNotCompute>(Val: SrcEnd_) && |
2020 | !isa<SCEVCouldNotCompute>(Val: SinkStart_) && |
2021 | !isa<SCEVCouldNotCompute>(Val: SinkEnd_)) { |
2022 | if (!LoopGuards) |
2023 | LoopGuards.emplace( |
2024 | args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE)); |
2025 | auto SrcEnd = SE.applyLoopGuards(Expr: SrcEnd_, Guards: *LoopGuards); |
2026 | auto SinkStart = SE.applyLoopGuards(Expr: SinkStart_, Guards: *LoopGuards); |
2027 | if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SrcEnd, RHS: SinkStart)) |
2028 | return MemoryDepChecker::Dependence::NoDep; |
2029 | |
2030 | auto SinkEnd = SE.applyLoopGuards(Expr: SinkEnd_, Guards: *LoopGuards); |
2031 | auto SrcStart = SE.applyLoopGuards(Expr: SrcStart_, Guards: *LoopGuards); |
2032 | if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SinkEnd, RHS: SrcStart)) |
2033 | return MemoryDepChecker::Dependence::NoDep; |
2034 | } |
2035 | } |
2036 | |
2037 | // Need accesses with constant strides and the same direction for further |
2038 | // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and |
2039 | // similar code or pointer arithmetic that could wrap in the address space. |
2040 | |
// If either Src or Sink is not strided (i.e. not a non-wrapping AddRec) and
// not loop-invariant (stride will be 0 in that case), we cannot analyze the
// dependence further and also cannot generate runtime checks.
2044 | if (!StrideAPtr || !StrideBPtr) { |
2045 | LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n" ); |
2046 | return MemoryDepChecker::Dependence::IndirectUnsafe; |
2047 | } |
2048 | |
2049 | int64_t StrideAPtrInt = *StrideAPtr; |
2050 | int64_t StrideBPtrInt = *StrideBPtr; |
2051 | LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt |
2052 | << " Sink induction step: " << StrideBPtrInt << "\n" ); |
// At least one of Src or Sink is loop invariant and the other is strided or
// invariant. We can generate a runtime check to disambiguate the accesses.
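// E.g. (illustrative): a loop-invariant store *q = 0 against a strided access
// A[i] can be disambiguated by checking at runtime that q does not fall
// within the byte range accessed through A in this loop.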
2055 | if (!StrideAPtrInt || !StrideBPtrInt) |
2056 | return MemoryDepChecker::Dependence::Unknown; |
2057 | |
2058 | // Both Src and Sink have a constant stride, check if they are in the same |
2059 | // direction. |
2060 | if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) { |
2061 | LLVM_DEBUG( |
2062 | dbgs() << "Pointer access with strides in different directions\n" ); |
2063 | return MemoryDepChecker::Dependence::Unknown; |
2064 | } |
2065 | |
2066 | TypeSize AStoreSz = DL.getTypeStoreSize(Ty: ATy); |
2067 | TypeSize BStoreSz = DL.getTypeStoreSize(Ty: BTy); |
2068 | |
2069 | // If store sizes are not the same, set TypeByteSize to zero, so we can check |
2070 | // it in the caller isDependent. |
2071 | uint64_t ASz = DL.getTypeAllocSize(Ty: ATy); |
2072 | uint64_t BSz = DL.getTypeAllocSize(Ty: BTy); |
2073 | uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0; |
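// E.g. (illustrative): an i32 access against an i64 access at related
// addresses has store sizes 4 and 8, so TypeByteSize is 0 and isDependent
// treats the pair conservatively.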
2074 | |
2075 | uint64_t StrideAScaled = std::abs(i: StrideAPtrInt) * ASz; |
2076 | uint64_t StrideBScaled = std::abs(i: StrideBPtrInt) * BSz; |
2077 | |
2078 | uint64_t MaxStride = std::max(a: StrideAScaled, b: StrideBScaled); |
2079 | |
2080 | std::optional<uint64_t> CommonStride; |
2081 | if (StrideAScaled == StrideBScaled) |
2082 | CommonStride = StrideAScaled; |
2083 | |
2084 | // TODO: Historically, we don't retry with runtime checks unless the |
2085 | // (unscaled) strides are the same. Fix this once the condition for runtime |
2086 | // checks in isDependent is fixed. |
2087 | bool ShouldRetryWithRuntimeCheck = StrideAPtrInt == StrideBPtrInt; |
2088 | |
2089 | return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride, |
2090 | ShouldRetryWithRuntimeCheck, TypeByteSize, |
2091 | AIsWrite, BIsWrite); |
2092 | } |
2093 | |
2094 | MemoryDepChecker::Dependence::DepType |
2095 | MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, |
2096 | const MemAccessInfo &B, unsigned BIdx) { |
2097 | assert(AIdx < BIdx && "Must pass arguments in program order" ); |
2098 | |
// Get the dependence distance, strides, type size and which accesses write,
// for the dependence between A and B.
2101 | auto Res = |
2102 | getDependenceDistanceStrideAndSize(A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx]); |
2103 | if (std::holds_alternative<Dependence::DepType>(v: Res)) |
2104 | return std::get<Dependence::DepType>(v&: Res); |
2105 | |
2106 | auto &[Dist, MaxStride, CommonStride, ShouldRetryWithRuntimeCheck, |
2107 | TypeByteSize, AIsWrite, BIsWrite] = |
2108 | std::get<DepDistanceStrideAndSizeInfo>(v&: Res); |
2109 | bool HasSameSize = TypeByteSize > 0; |
2110 | |
2111 | if (isa<SCEVCouldNotCompute>(Val: Dist)) { |
2112 | // TODO: Relax requirement that there is a common unscaled stride to retry |
2113 | // with non-constant distance dependencies. |
2114 | FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; |
2115 | LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n" ); |
2116 | return Dependence::Unknown; |
2117 | } |
2118 | |
2119 | ScalarEvolution &SE = *PSE.getSE(); |
2120 | auto &DL = InnermostLoop->getHeader()->getDataLayout(); |
2121 | |
// If the distance between the accesses is larger than their maximum absolute
// stride multiplied by the symbolic maximum backedge taken count (which is an
// upper bound of the number of iterations), the accesses are independent, i.e.
// they are far enough apart that they won't access the same location across
// all loop iterations.
2127 | if (HasSameSize && |
2128 | isSafeDependenceDistance( |
2129 | DL, SE, MaxBTC: *(PSE.getSymbolicMaxBackedgeTakenCount()), Dist: *Dist, MaxStride)) |
2130 | return Dependence::NoDep; |
2131 | |
2132 | // The rest of this function relies on ConstDist being at most 64-bits, which |
2133 | // is checked earlier. Will assert if the calling code changes. |
2134 | const APInt *APDist = nullptr; |
2135 | uint64_t ConstDist = |
2136 | match(S: Dist, P: m_scev_APInt(C&: APDist)) ? APDist->abs().getZExtValue() : 0; |
2137 | |
2138 | // Attempt to prove strided accesses independent. |
2139 | if (APDist) { |
2140 | // If the distance between accesses and their strides are known constants, |
2141 | // check whether the accesses interlace each other. |
2142 | if (ConstDist > 0 && CommonStride && CommonStride > 1 && HasSameSize && |
2143 | areStridedAccessesIndependent(Distance: ConstDist, Stride: *CommonStride, TypeByteSize)) { |
2144 | LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n" ); |
2145 | return Dependence::NoDep; |
2146 | } |
2147 | } else { |
2148 | if (!LoopGuards) |
2149 | LoopGuards.emplace( |
2150 | args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE)); |
2151 | Dist = SE.applyLoopGuards(Expr: Dist, Guards: *LoopGuards); |
2152 | } |
2153 | |
2154 | // Negative distances are not plausible dependencies. |
2155 | if (SE.isKnownNonPositive(S: Dist)) { |
2156 | if (SE.isKnownNonNegative(S: Dist)) { |
2157 | if (HasSameSize) { |
2158 | // Write to the same location with the same size. |
2159 | return Dependence::Forward; |
2160 | } |
2161 | LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but " |
2162 | "different type sizes\n" ); |
2163 | return Dependence::Unknown; |
2164 | } |
2165 | |
2166 | bool IsTrueDataDependence = (AIsWrite && !BIsWrite); |
2167 | // Check if the first access writes to a location that is read in a later |
2168 | // iteration, where the distance between them is not a multiple of a vector |
2169 | // factor and relatively small. |
2170 | // |
2171 | // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to |
2172 | // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a |
2173 | // forward dependency will allow vectorization using any width. |
2174 | |
2175 | if (IsTrueDataDependence && EnableForwardingConflictDetection) { |
2176 | if (!ConstDist) { |
2177 | // TODO: FoundNonConstantDistanceDependence is used as a necessary |
2178 | // condition to consider retrying with runtime checks. Historically, we |
2179 | // did not set it when strides were different but there is no inherent |
2180 | // reason to. |
2181 | FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; |
2182 | return Dependence::Unknown; |
2183 | } |
2184 | if (!HasSameSize || |
2185 | couldPreventStoreLoadForward(Distance: ConstDist, TypeByteSize)) { |
2186 | LLVM_DEBUG( |
2187 | dbgs() << "LAA: Forward but may prevent st->ld forwarding\n" ); |
2188 | return Dependence::ForwardButPreventsForwarding; |
2189 | } |
2190 | } |
2191 | |
2192 | LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n" ); |
2193 | return Dependence::Forward; |
2194 | } |
2195 | |
2196 | int64_t MinDistance = SE.getSignedRangeMin(S: Dist).getSExtValue(); |
2197 | // Below we only handle strictly positive distances. |
2198 | if (MinDistance <= 0) { |
2199 | FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; |
2200 | return Dependence::Unknown; |
2201 | } |
2202 | |
2203 | if (!ConstDist) { |
2204 | // Previously this case would be treated as Unknown, possibly setting |
2205 | // FoundNonConstantDistanceDependence to force re-trying with runtime |
2206 | // checks. Until the TODO below is addressed, set it here to preserve |
2207 | // original behavior w.r.t. re-trying with runtime checks. |
2208 | // TODO: FoundNonConstantDistanceDependence is used as a necessary |
2209 | // condition to consider retrying with runtime checks. Historically, we |
2210 | // did not set it when strides were different but there is no inherent |
2211 | // reason to. |
2212 | FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; |
2213 | } |
2214 | |
2215 | if (!HasSameSize) { |
2216 | LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with " |
2217 | "different type sizes\n" ); |
2218 | return Dependence::Unknown; |
2219 | } |
2220 | // Bail out early if passed-in parameters make vectorization not feasible. |
2221 | unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? |
2222 | VectorizerParams::VectorizationFactor : 1); |
2223 | unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? |
2224 | VectorizerParams::VectorizationInterleave : 1); |
2225 | // The minimum number of iterations for a vectorized/unrolled version. |
2226 | unsigned MinNumIter = std::max(a: ForcedFactor * ForcedUnroll, b: 2U); |
2227 | |
// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectorized/unrolled version. Vectorizing one iteration in
// front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
// (There is no need to add the last gap distance.)
2232 | // |
2233 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
2234 | // foo(int *A) { |
2235 | // int *B = (int *)((char *)A + 14); |
2236 | // for (i = 0 ; i < 1024 ; i += 2) |
2237 | // B[i] = A[i] + 1; |
2238 | // } |
2239 | // |
2240 | // Two accesses in memory (stride is 4 * 2): |
2241 | // | A[0] | | A[2] | | A[4] | | A[6] | | |
2242 | // | B[0] | | B[2] | | B[4] | |
2243 | // |
// The distance needed for vectorizing iterations except the last iteration:
// 4 * 2 * (MinNumIter - 1). The distance needed for the last iteration: 4.
2246 | // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4. |
2247 | // |
2248 | // If MinNumIter is 2, it is vectorizable as the minimum distance needed is |
2249 | // 12, which is less than distance. |
2250 | // |
2251 | // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4), |
2252 | // the minimum distance needed is 28, which is greater than distance. It is |
2253 | // not safe to do vectorization. |
2254 | // |
2255 | // We use MaxStride (maximum of src and sink strides) to get a conservative |
2256 | // lower bound on the MinDistanceNeeded in case of different strides. |
2257 | |
2258 | // We know that Dist is positive, but it may not be constant. Use the signed |
2259 | // minimum for computations below, as this ensures we compute the closest |
2260 | // possible dependence distance. |
2261 | uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize; |
2262 | if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) { |
2263 | if (!ConstDist) { |
2264 | // For non-constant distances, we checked the lower bound of the |
2265 | // dependence distance and the distance may be larger at runtime (and safe |
2266 | // for vectorization). Classify it as Unknown, so we re-try with runtime |
2267 | // checks. |
2268 | return Dependence::Unknown; |
2269 | } |
2270 | LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance " |
2271 | << MinDistance << '\n'); |
2272 | return Dependence::Backward; |
2273 | } |
2274 | |
// Unsafe if the minimum distance needed is greater than the smallest
// dependence distance.
2277 | if (MinDistanceNeeded > MinDepDistBytes) { |
2278 | LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least " |
2279 | << MinDistanceNeeded << " size in bytes\n" ); |
2280 | return Dependence::Backward; |
2281 | } |
2282 | |
2283 | // Positive distance bigger than max vectorization factor. |
// FIXME: Should use max factor instead of max distance in bytes, which
// cannot handle different types.
2286 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
2287 | // void foo (int *A, char *B) { |
2288 | // for (unsigned i = 0; i < 1024; i++) { |
2289 | // A[i+2] = A[i] + 1; |
2290 | // B[i+2] = B[i] + 1; |
2291 | // } |
2292 | // } |
2293 | // |
2294 | // This case is currently unsafe according to the max safe distance. If we |
2295 | // analyze the two accesses on array B, the max safe dependence distance |
// is 2. Then we analyze the accesses on array A, where the minimum distance
// needed is 8, which is greater than 2 and thus forbids vectorization. But
// actually both A and B could be vectorized with a VF of 2.
2299 | MinDepDistBytes = |
2300 | std::min(a: static_cast<uint64_t>(MinDistance), b: MinDepDistBytes); |
2301 | |
2302 | bool IsTrueDataDependence = (!AIsWrite && BIsWrite); |
2303 | if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist && |
2304 | couldPreventStoreLoadForward(Distance: MinDistance, TypeByteSize, CommonStride: *CommonStride)) |
2305 | return Dependence::BackwardVectorizableButPreventsForwarding; |
2306 | |
2307 | uint64_t MaxVF = MinDepDistBytes / MaxStride; |
2308 | LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance |
2309 | << " with max VF = " << MaxVF << '\n'); |
2310 | |
2311 | uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; |
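// E.g. (illustrative): a backward dependence with MinDepDistBytes = 16 over
// i32 accesses with a common stride of 1 (MaxStride = 4 bytes) gives
// MaxVF = 4 lanes, i.e. MaxVFInBits = 128.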
2312 | if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) { |
2313 | // For non-constant distances, we checked the lower bound of the dependence |
2314 | // distance and the distance may be larger at runtime (and safe for |
2315 | // vectorization). Classify it as Unknown, so we re-try with runtime checks. |
2316 | return Dependence::Unknown; |
2317 | } |
2318 | |
2319 | MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits); |
2320 | return Dependence::BackwardVectorizable; |
2321 | } |
2322 | |
2323 | bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands, |
2324 | const MemAccessInfoList &CheckDeps) { |
2325 | |
2326 | MinDepDistBytes = -1; |
2327 | SmallPtrSet<MemAccessInfo, 8> Visited; |
2328 | for (MemAccessInfo CurAccess : CheckDeps) { |
2329 | if (Visited.contains(Ptr: CurAccess)) |
2330 | continue; |
2331 | |
2332 | // Check accesses within this set. |
2333 | EquivalenceClasses<MemAccessInfo>::member_iterator AI = |
2334 | DepCands.findLeader(V: CurAccess); |
2335 | EquivalenceClasses<MemAccessInfo>::member_iterator AE = |
2336 | DepCands.member_end(); |
2337 | |
2338 | // Check every access pair. |
2339 | while (AI != AE) { |
2340 | Visited.insert(Ptr: *AI); |
2341 | bool AIIsWrite = AI->getInt(); |
2342 | // Check loads only against next equivalent class, but stores also against |
2343 | // other stores in the same equivalence class - to the same address. |
2344 | EquivalenceClasses<MemAccessInfo>::member_iterator OI = |
2345 | (AIIsWrite ? AI : std::next(x: AI)); |
2346 | while (OI != AE) { |
2347 | // Check every accessing instruction pair in program order. |
2348 | auto &Acc = Accesses[*AI]; |
2349 | for (std::vector<unsigned>::iterator I1 = Acc.begin(), I1E = Acc.end(); |
2350 | I1 != I1E; ++I1) |
2351 | // Scan all accesses of another equivalence class, but only the next |
2352 | // accesses of the same equivalent class. |
2353 | for (std::vector<unsigned>::iterator |
2354 | I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()), |
2355 | I2E = (OI == AI ? I1E : Accesses[*OI].end()); |
2356 | I2 != I2E; ++I2) { |
2357 | auto A = std::make_pair(x: &*AI, y&: *I1); |
2358 | auto B = std::make_pair(x: &*OI, y&: *I2); |
2359 | |
2360 | assert(*I1 != *I2); |
2361 | if (*I1 > *I2) |
2362 | std::swap(x&: A, y&: B); |
2363 | |
2364 | Dependence::DepType Type = |
2365 | isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second); |
2366 | mergeInStatus(S: Dependence::isSafeForVectorization(Type)); |
2367 | |
2368 | // Gather dependences unless we accumulated MaxDependences |
2369 | // dependences. In that case return as soon as we find the first |
2370 | // unsafe dependence. This puts a limit on this quadratic |
2371 | // algorithm. |
2372 | if (RecordDependences) { |
2373 | if (Type != Dependence::NoDep) |
2374 | Dependences.emplace_back(Args&: A.second, Args&: B.second, Args&: Type); |
2375 | |
2376 | if (Dependences.size() >= MaxDependences) { |
2377 | RecordDependences = false; |
2378 | Dependences.clear(); |
2379 | LLVM_DEBUG(dbgs() |
2380 | << "Too many dependences, stopped recording\n" ); |
2381 | } |
2382 | } |
2383 | if (!RecordDependences && !isSafeForVectorization()) |
2384 | return false; |
2385 | } |
2386 | ++OI; |
2387 | } |
2388 | ++AI; |
2389 | } |
2390 | } |
2391 | |
2392 | LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n" ); |
2393 | return isSafeForVectorization(); |
2394 | } |
2395 | |
2396 | SmallVector<Instruction *, 4> |
2397 | MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const { |
2398 | MemAccessInfo Access(Ptr, IsWrite); |
2399 | auto &IndexVector = Accesses.find(Val: Access)->second; |
2400 | |
2401 | SmallVector<Instruction *, 4> Insts; |
2402 | transform(Range: IndexVector, |
2403 | d_first: std::back_inserter(x&: Insts), |
2404 | F: [&](unsigned Idx) { return this->InstMap[Idx]; }); |
2405 | return Insts; |
2406 | } |
2407 | |
2408 | const char *MemoryDepChecker::Dependence::DepName[] = { |
2409 | "NoDep" , |
2410 | "Unknown" , |
2411 | "IndirectUnsafe" , |
2412 | "Forward" , |
2413 | "ForwardButPreventsForwarding" , |
2414 | "Backward" , |
2415 | "BackwardVectorizable" , |
2416 | "BackwardVectorizableButPreventsForwarding" }; |
2417 | |
2418 | void MemoryDepChecker::Dependence::print( |
2419 | raw_ostream &OS, unsigned Depth, |
2420 | const SmallVectorImpl<Instruction *> &Instrs) const { |
2421 | OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n" ; |
2422 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n" ; |
2423 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n" ; |
2424 | } |
2425 | |
2426 | bool LoopAccessInfo::canAnalyzeLoop() { |
2427 | // We need to have a loop header. |
2428 | LLVM_DEBUG(dbgs() << "\nLAA: Checking a loop in '" |
2429 | << TheLoop->getHeader()->getParent()->getName() << "' from " |
2430 | << TheLoop->getLocStr() << "\n" ); |
2431 | |
2432 | // We can only analyze innermost loops. |
2433 | if (!TheLoop->isInnermost()) { |
2434 | LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n" ); |
2435 | recordAnalysis(RemarkName: "NotInnerMostLoop" ) << "loop is not the innermost loop" ; |
2436 | return false; |
2437 | } |
2438 | |
2439 | // We must have a single backedge. |
2440 | if (TheLoop->getNumBackEdges() != 1) { |
2441 | LLVM_DEBUG( |
2442 | dbgs() << "LAA: loop control flow is not understood by analyzer\n" ); |
2443 | recordAnalysis(RemarkName: "CFGNotUnderstood" ) |
2444 | << "loop control flow is not understood by analyzer" ; |
2445 | return false; |
2446 | } |
2447 | |
2448 | // ScalarEvolution needs to be able to find the symbolic max backedge taken |
2449 | // count, which is an upper bound on the number of loop iterations. The loop |
2450 | // may execute fewer iterations, if it exits via an uncountable exit. |
2451 | const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount(); |
2452 | if (isa<SCEVCouldNotCompute>(Val: ExitCount)) { |
2453 | recordAnalysis(RemarkName: "CantComputeNumberOfIterations" ) |
2454 | << "could not determine number of loop iterations" ; |
2455 | LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n" ); |
2456 | return false; |
2457 | } |
2458 | |
2459 | LLVM_DEBUG(dbgs() << "LAA: Found an analyzable loop: " |
2460 | << TheLoop->getHeader()->getName() << "\n" ); |
2461 | return true; |
2462 | } |
2463 | |
2464 | bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, |
2465 | const TargetLibraryInfo *TLI, |
2466 | DominatorTree *DT) { |
2467 | // Holds the Load and Store instructions. |
2468 | SmallVector<LoadInst *, 16> Loads; |
2469 | SmallVector<StoreInst *, 16> Stores; |
2470 | SmallPtrSet<MDNode *, 8> LoopAliasScopes; |
2471 | |
2472 | // Holds all the different accesses in the loop. |
2473 | unsigned NumReads = 0; |
2474 | unsigned NumReadWrites = 0; |
2475 | |
2476 | bool HasComplexMemInst = false; |
2477 | |
2478 | // A runtime check is only legal to insert if there are no convergent calls. |
2479 | HasConvergentOp = false; |
2480 | |
2481 | PtrRtChecking->Pointers.clear(); |
2482 | PtrRtChecking->Need = false; |
2483 | |
2484 | const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); |
2485 | |
2486 | const bool EnableMemAccessVersioningOfLoop = |
2487 | EnableMemAccessVersioning && |
2488 | !TheLoop->getHeader()->getParent()->hasOptSize(); |
2489 | |
2490 | // Traverse blocks in fixed RPOT order, regardless of their storage in the |
2491 | // loop info, as it may be arbitrary. |
2492 | LoopBlocksRPO RPOT(TheLoop); |
2493 | RPOT.perform(LI); |
2494 | for (BasicBlock *BB : RPOT) { |
2495 | // Scan the BB and collect legal loads and stores. Also detect any |
2496 | // convergent instructions. |
2497 | for (Instruction &I : *BB) { |
2498 | if (auto *Call = dyn_cast<CallBase>(Val: &I)) { |
2499 | if (Call->isConvergent()) |
2500 | HasConvergentOp = true; |
2501 | } |
2502 | |
// If we have found both a non-vectorizable memory instruction and a
// convergent operation in this loop, there is no reason to continue the
// search.
2505 | if (HasComplexMemInst && HasConvergentOp) |
2506 | return false; |
2507 | |
2508 | // Avoid hitting recordAnalysis multiple times. |
2509 | if (HasComplexMemInst) |
2510 | continue; |
2511 | |
2512 | // Record alias scopes defined inside the loop. |
2513 | if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I)) |
2514 | for (Metadata *Op : Decl->getScopeList()->operands()) |
2515 | LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op)); |
2516 | |
2517 | // Many math library functions read the rounding mode. We will only |
2518 | // vectorize a loop if it contains known function calls that don't set |
2519 | // the flag. Therefore, it is safe to ignore this read from memory. |
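// Illustrative example (assumed IR): a libm call such as
//   %r = call float @sinf(float %x)
// may be recognized through TLI as the llvm.sin intrinsic, so it is skipped
// here even though it is conservatively marked as reading memory.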
2520 | auto *Call = dyn_cast<CallInst>(Val: &I); |
2521 | if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI)) |
2522 | continue; |
2523 | |
2524 | // If this is a load, save it. If this instruction can read from memory |
2525 | // but is not a load, we only allow it if it's a call to a function with a |
2526 | // vector mapping and no pointer arguments. |
2527 | if (I.mayReadFromMemory()) { |
2528 | auto hasPointerArgs = [](CallBase *CB) { |
2529 | return any_of(Range: CB->args(), P: [](Value const *Arg) { |
2530 | return Arg->getType()->isPointerTy(); |
2531 | }); |
2532 | }; |
2533 | |
2534 | // If the function has an explicit vectorized counterpart, and does not |
2535 | // take output/input pointers, we can safely assume that it can be |
2536 | // vectorized. |
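// Illustrative sketch (assumed function name): a call
//   %r = call float @foo(float %x)
// where @foo carries a "vector-function-abi-variant" mapping (e.g. from
// "#pragma omp declare simd") and has no pointer arguments is accepted here
// instead of being treated as a complex memory instruction.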
2537 | if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() && |
2538 | !hasPointerArgs(Call) && !VFDatabase::getMappings(CI: *Call).empty()) |
2539 | continue; |
2540 | |
2541 | auto *Ld = dyn_cast<LoadInst>(Val: &I); |
2542 | if (!Ld) { |
2543 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: Ld) |
2544 | << "instruction cannot be vectorized" ; |
2545 | HasComplexMemInst = true; |
2546 | continue; |
2547 | } |
2548 | if (!Ld->isSimple() && !IsAnnotatedParallel) { |
2549 | recordAnalysis(RemarkName: "NonSimpleLoad" , Instr: Ld) |
2550 | << "read with atomic ordering or volatile read" ; |
2551 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n" ); |
2552 | HasComplexMemInst = true; |
2553 | continue; |
2554 | } |
2555 | NumLoads++; |
2556 | Loads.push_back(Elt: Ld); |
2557 | DepChecker->addAccess(LI: Ld); |
2558 | if (EnableMemAccessVersioningOfLoop) |
2559 | collectStridedAccess(LoadOrStoreInst: Ld); |
2560 | continue; |
2561 | } |
2562 | |
2563 | // Save 'store' instructions. Abort if other instructions write to memory. |
2564 | if (I.mayWriteToMemory()) { |
2565 | auto *St = dyn_cast<StoreInst>(Val: &I); |
2566 | if (!St) { |
2567 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: St) |
2568 | << "instruction cannot be vectorized" ; |
2569 | HasComplexMemInst = true; |
2570 | continue; |
2571 | } |
2572 | if (!St->isSimple() && !IsAnnotatedParallel) { |
2573 | recordAnalysis(RemarkName: "NonSimpleStore" , Instr: St) |
2574 | << "write with atomic ordering or volatile write" ; |
2575 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n" ); |
2576 | HasComplexMemInst = true; |
2577 | continue; |
2578 | } |
2579 | NumStores++; |
2580 | Stores.push_back(Elt: St); |
2581 | DepChecker->addAccess(SI: St); |
2582 | if (EnableMemAccessVersioningOfLoop) |
2583 | collectStridedAccess(LoadOrStoreInst: St); |
2584 | } |
2585 | } // Next instr. |
2586 | } // Next block. |
2587 | |
2588 | if (HasComplexMemInst) |
2589 | return false; |
2590 | |
2591 | // Now we have two lists that hold the loads and the stores. |
2592 | // Next, we find the pointers that they use. |
2593 | |
2594 | // Check if we see any stores. If there are no stores, then we don't |
2595 | // care if the pointers are *restrict*. |
2596 | if (!Stores.size()) { |
2597 | LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n" ); |
2598 | return true; |
2599 | } |
2600 | |
2601 | MemoryDepChecker::DepCandidates DepCands; |
2602 | AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes); |
2603 | |
2604 | // Holds the analyzed pointers. We don't want to call getUnderlyingObjects |
2605 | // multiple times on the same object. If the ptr is accessed twice, once |
2606 | // for read and once for write, it will only appear once (on the write |
2607 | // list). This is okay, since we are going to check for conflicts between |
2608 | // writes and between reads and writes, but not between reads and reads. |
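// Illustrative example: if the loop contains both
//   store i32 %v, ptr %p   and   %x = load i32, ptr %p
// then {%p, i32} is recorded once via the store, so the load is later not
// classified as read-only (assuming its stride is computable).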
2609 | SmallSet<std::pair<Value *, Type *>, 16> Seen; |
2610 | |
2611 | // Record uniform store addresses to identify if we have multiple stores |
2612 | // to the same address. |
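// A small sketch (not from the source): in
//   for (i = 0; i < n; ++i) { *q = a[i]; *q = b[i]; }
// the second store to the invariant address q fails the insert in the store
// loop below, which flags a store-store dependence involving a loop-invariant
// address.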
2613 | SmallPtrSet<Value *, 16> UniformStores; |
2614 | |
2615 | for (StoreInst *ST : Stores) { |
2616 | Value *Ptr = ST->getPointerOperand(); |
2617 | |
2618 | if (isInvariant(V: Ptr)) { |
2619 | // Record store instructions to loop invariant addresses |
2620 | StoresToInvariantAddresses.push_back(Elt: ST); |
2621 | HasStoreStoreDependenceInvolvingLoopInvariantAddress |= |
2622 | !UniformStores.insert(Ptr).second; |
2623 | } |
2624 | |
2625 | // If we did *not* see this pointer before, insert it to the read-write |
2626 | // list. At this phase it is only a 'write' list. |
2627 | Type *AccessTy = getLoadStoreType(I: ST); |
2628 | if (Seen.insert(V: {Ptr, AccessTy}).second) { |
2629 | ++NumReadWrites; |
2630 | |
2631 | MemoryLocation Loc = MemoryLocation::get(SI: ST); |
2632 | // The TBAA metadata could have a control dependency on the predication |
2633 | // condition, so we cannot rely on it when determining whether or not we |
2634 | // need runtime pointer checks. |
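// Illustrative example: for a predicated store such as
//   for (i...) { if (c[i]) p[i] = x; }   // store carries !tbaa
// the store's block is predicated, so its TBAA tag is not trusted when
// deciding whether runtime pointer checks are required.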
2635 | if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT)) |
2636 | Loc.AATags.TBAA = nullptr; |
2637 | |
2638 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
2639 | AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) { |
2640 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
2641 | Accesses.addStore(Loc: NewLoc, AccessTy); |
2642 | }); |
2643 | } |
2644 | } |
2645 | |
2646 | if (IsAnnotatedParallel) { |
2647 | LLVM_DEBUG( |
2648 | dbgs() << "LAA: A loop annotated parallel, ignore memory dependency " |
2649 | << "checks.\n" ); |
2650 | return true; |
2651 | } |
2652 | |
2653 | for (LoadInst *LD : Loads) { |
2654 | Value *Ptr = LD->getPointerOperand(); |
2655 | // If we did *not* see this pointer before, insert it into the
2656 | // read list. If we *did* see it before, then it is already in
2657 | // the read-write list. This allows us to vectorize expressions
2658 | // such as A[i] += x, because the address of A[i] is a read-write
2659 | // pointer. This only works if the index of A[i] is consecutive.
2660 | // If the accessed address is unknown (for example A[B[i]]), then we may
2661 | // read a few words, modify, and write a few words, and some of the
2662 | // words may be written to the same address.
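// Illustrative contrast (not from the source):
//   for (i...) A[i] += x;      // consecutive read-write pointer: analyzable
//   for (i...) A[B[i]] += x;   // indirect address: reads and writes may overlap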
2663 | bool IsReadOnlyPtr = false; |
2664 | Type *AccessTy = getLoadStoreType(I: LD); |
2665 | if (Seen.insert(V: {Ptr, AccessTy}).second || |
2666 | !getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, StridesMap: SymbolicStrides)) { |
2667 | ++NumReads; |
2668 | IsReadOnlyPtr = true; |
2669 | } |
2670 | |
2671 | // See if there is an unsafe dependency between a load from a uniform address
2672 | // and a store to the same uniform address.
2673 | if (UniformStores.contains(Ptr)) { |
2674 | LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform " |
2675 | "load and uniform store to the same address!\n" ); |
2676 | HasLoadStoreDependenceInvolvingLoopInvariantAddress = true; |
2677 | } |
2678 | |
2679 | MemoryLocation Loc = MemoryLocation::get(LI: LD); |
2680 | // The TBAA metadata could have a control dependency on the predication |
2681 | // condition, so we cannot rely on it when determining whether or not we |
2682 | // need runtime pointer checks. |
2683 | if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT)) |
2684 | Loc.AATags.TBAA = nullptr; |
2685 | |
2686 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
2687 | AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) { |
2688 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
2689 | Accesses.addLoad(Loc: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr); |
2690 | }); |
2691 | } |
2692 | |
2693 | // If we write (or read-write) to a single destination and there are no
2694 | // other reads in this loop then it is safe to vectorize.
2695 | if (NumReadWrites == 1 && NumReads == 0) { |
2696 | LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n" ); |
2697 | return true; |
2698 | } |
2699 | |
2700 | // Build dependence sets and check whether we need a runtime pointer bounds |
2701 | // check. |
2702 | Accesses.buildDependenceSets(); |
2703 | |
2704 | // Find pointers with computable bounds. We are going to use this information |
2705 | // to place a runtime bound check. |
2706 | Value *UncomputablePtr = nullptr; |
2707 | HasCompletePtrRtChecking = Accesses.canCheckPtrAtRT( |
2708 | RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, UncomputablePtr, AllowPartial); |
2709 | if (!HasCompletePtrRtChecking) { |
2710 | const auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr); |
2711 | recordAnalysis(RemarkName: "CantIdentifyArrayBounds" , Instr: I) |
2712 | << "cannot identify array bounds" ; |
2713 | LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " |
2714 | << "the array bounds.\n" ); |
2715 | return false; |
2716 | } |
2717 | |
2718 | LLVM_DEBUG( |
2719 | dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n" ); |
2720 | |
2721 | bool DepsAreSafe = true; |
2722 | if (Accesses.isDependencyCheckNeeded()) { |
2723 | LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n" ); |
2724 | DepsAreSafe = |
2725 | DepChecker->areDepsSafe(DepCands, CheckDeps: Accesses.getDependenciesToCheck()); |
2726 | |
2727 | if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) { |
2728 | LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n" ); |
2729 | |
2730 | // Clear the dependency checks. We assume they are not needed. |
2731 | Accesses.resetDepChecks(DepChecker&: *DepChecker); |
2732 | |
2733 | PtrRtChecking->reset(); |
2734 | PtrRtChecking->Need = true; |
2735 | |
2736 | UncomputablePtr = nullptr; |
2737 | HasCompletePtrRtChecking = |
2738 | Accesses.canCheckPtrAtRT(RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, |
2739 | UncomputablePtr, AllowPartial); |
2740 | |
2741 | // Check that we found the bounds for the pointer. |
2742 | if (!HasCompletePtrRtChecking) { |
2743 | auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr); |
2744 | recordAnalysis(RemarkName: "CantCheckMemDepsAtRunTime" , Instr: I) |
2745 | << "cannot check memory dependencies at runtime" ; |
2746 | LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n" ); |
2747 | return false; |
2748 | } |
2749 | DepsAreSafe = true; |
2750 | } |
2751 | } |
2752 | |
2753 | if (HasConvergentOp) { |
2754 | recordAnalysis(RemarkName: "CantInsertRuntimeCheckWithConvergent" ) |
2755 | << "cannot add control dependency to convergent operation" ; |
2756 | LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check " |
2757 | "would be needed with a convergent operation\n" ); |
2758 | return false; |
2759 | } |
2760 | |
2761 | if (DepsAreSafe) { |
2762 | LLVM_DEBUG( |
2763 | dbgs() << "LAA: No unsafe dependent memory operations in loop. We" |
2764 | << (PtrRtChecking->Need ? "" : " don't" ) |
2765 | << " need runtime memory checks.\n" ); |
2766 | return true; |
2767 | } |
2768 | |
2769 | emitUnsafeDependenceRemark(); |
2770 | return false; |
2771 | } |
2772 | |
2773 | void LoopAccessInfo::emitUnsafeDependenceRemark() {
2774 | const auto *Deps = getDepChecker().getDependences(); |
2775 | if (!Deps) |
2776 | return; |
2777 | const auto *Found = |
2778 | llvm::find_if(Range: *Deps, P: [](const MemoryDepChecker::Dependence &D) { |
2779 | return MemoryDepChecker::Dependence::isSafeForVectorization(Type: D.Type) != |
2780 | MemoryDepChecker::VectorizationSafetyStatus::Safe; |
2781 | }); |
2782 | if (Found == Deps->end()) |
2783 | return; |
2784 | MemoryDepChecker::Dependence Dep = *Found; |
2785 | |
2786 | LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n" ); |
2787 | |
2788 | // Emit remark for first unsafe dependence |
2789 | bool HasForcedDistribution = false; |
2790 | std::optional<const MDOperand *> Value = |
2791 | findStringMetadataForLoop(TheLoop, Name: "llvm.loop.distribute.enable" ); |
2792 | if (Value) { |
2793 | const MDOperand *Op = *Value; |
2794 | assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata" ); |
2795 | HasForcedDistribution = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue(); |
2796 | } |
2797 | |
2798 | const std::string Info = |
2799 | HasForcedDistribution |
2800 | ? "unsafe dependent memory operations in loop." |
2801 | : "unsafe dependent memory operations in loop. Use " |
2802 | "#pragma clang loop distribute(enable) to allow loop distribution " |
2803 | "to attempt to isolate the offending operations into a separate " |
2804 | "loop" ; |
2805 | OptimizationRemarkAnalysis &R = |
2806 | recordAnalysis(RemarkName: "UnsafeDep" , Instr: Dep.getDestination(DepChecker: getDepChecker())) << Info; |
2807 | |
2808 | switch (Dep.Type) { |
2809 | case MemoryDepChecker::Dependence::NoDep: |
2810 | case MemoryDepChecker::Dependence::Forward: |
2811 | case MemoryDepChecker::Dependence::BackwardVectorizable: |
2812 | llvm_unreachable("Unexpected dependence" ); |
2813 | case MemoryDepChecker::Dependence::Backward: |
2814 | R << "\nBackward loop carried data dependence." ; |
2815 | break; |
2816 | case MemoryDepChecker::Dependence::ForwardButPreventsForwarding: |
2817 | R << "\nForward loop carried data dependence that prevents " |
2818 | "store-to-load forwarding." ; |
2819 | break; |
2820 | case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding: |
2821 | R << "\nBackward loop carried data dependence that prevents " |
2822 | "store-to-load forwarding." ; |
2823 | break; |
2824 | case MemoryDepChecker::Dependence::IndirectUnsafe: |
2825 | R << "\nUnsafe indirect dependence." ; |
2826 | break; |
2827 | case MemoryDepChecker::Dependence::Unknown: |
2828 | R << "\nUnknown data dependence." ; |
2829 | break; |
2830 | } |
2831 | |
2832 | if (Instruction *I = Dep.getSource(DepChecker: getDepChecker())) { |
2833 | DebugLoc SourceLoc = I->getDebugLoc(); |
2834 | if (auto *DD = dyn_cast_or_null<Instruction>(Val: getPointerOperand(V: I))) |
2835 | SourceLoc = DD->getDebugLoc(); |
2836 | if (SourceLoc) |
2837 | R << " Memory location is the same as accessed at " |
2838 | << ore::NV("Location" , SourceLoc); |
2839 | } |
2840 | } |
2841 | |
2842 | bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, |
2843 | DominatorTree *DT) { |
2844 | assert(TheLoop->contains(BB) && "Unknown block used" ); |
2845 | |
2846 | // Blocks that do not dominate the latch need predication. |
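// Illustrative example: in "for (...) { if (c) { S; } T; }", S's block does
// not dominate the latch and needs predication, whereas T's block does
// dominate it and does not.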
2847 | const BasicBlock *Latch = TheLoop->getLoopLatch(); |
2848 | return !DT->dominates(A: BB, B: Latch); |
2849 | } |
2850 | |
2851 | OptimizationRemarkAnalysis & |
2852 | LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
2853 | assert(!Report && "Multiple reports generated" ); |
2854 | |
2855 | const BasicBlock *CodeRegion = TheLoop->getHeader(); |
2856 | DebugLoc DL = TheLoop->getStartLoc(); |
2857 | |
2858 | if (I) { |
2859 | CodeRegion = I->getParent(); |
2860 | // If there is no debug location attached to the instruction, fall back to
2861 | // the loop's start location.
2862 | if (I->getDebugLoc()) |
2863 | DL = I->getDebugLoc(); |
2864 | } |
2865 | |
2866 | Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, args&: RemarkName, |
2867 | args&: DL, args&: CodeRegion); |
2868 | return *Report; |
2869 | } |
2870 | |
2871 | bool LoopAccessInfo::isInvariant(Value *V) const { |
2872 | auto *SE = PSE->getSE(); |
2873 | if (TheLoop->isLoopInvariant(V)) |
2874 | return true; |
2875 | if (!SE->isSCEVable(Ty: V->getType())) |
2876 | return false; |
2877 | const SCEV *S = SE->getSCEV(V); |
2878 | return SE->isLoopInvariant(S, L: TheLoop); |
2879 | } |
2880 | |
2881 | /// If \p Ptr is a GEP, which has a loop-variant operand, return that operand. |
2882 | /// Otherwise, return \p Ptr. |
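/// Illustrative example: for "%gep = getelementptr i32, ptr %A, i64 %iv" with
/// a loop-invariant %A and loop-variant %iv, this returns %iv; if more than
/// one operand is loop-variant, \p Ptr is returned unchanged.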
2883 | static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE, |
2884 | Loop *Lp) { |
2885 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
2886 | if (!GEP) |
2887 | return Ptr; |
2888 | |
2889 | Value *V = Ptr; |
2890 | for (const Use &U : GEP->operands()) { |
2891 | if (!SE->isLoopInvariant(S: SE->getSCEV(V: U), L: Lp)) { |
2892 | if (V == Ptr) |
2893 | V = U; |
2894 | else |
2895 | // There must be exactly one loop-variant operand. |
2896 | return Ptr; |
2897 | } |
2898 | } |
2899 | return V; |
2900 | } |
2901 | |
2902 | /// Get the stride of a pointer access in a loop. Looks for symbolic |
2903 | /// strides "a[i*stride]". Returns the symbolic stride, or null otherwise. |
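/// Illustrative example: for an access "A[i * Stride]" with a loop-invariant,
/// symbolic Stride, the loop-variant index has the SCEV {0,+,Stride}, and this
/// returns the SCEV for Stride (possibly behind an integral cast) so that
/// callers can version the loop on "Stride == 1".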
2904 | static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { |
2905 | auto *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType()); |
2906 | if (!PtrTy) |
2907 | return nullptr; |
2908 | |
2909 | // Try to strip off a GEP instruction to make the pointer (actually the index
2910 | // at this point) easier to analyze. If OrigPtr is equal to Ptr we are
2911 | // analyzing the pointer, otherwise we are analyzing the index.
2912 | Value *OrigPtr = Ptr; |
2913 | |
2914 | Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp); |
2915 | const SCEV *V = SE->getSCEV(V: Ptr); |
2916 | |
2917 | if (Ptr != OrigPtr) |
2918 | // Strip off casts. |
2919 | while (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V)) |
2920 | V = C->getOperand(); |
2921 | |
2922 | if (!match(S: V, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V), L: m_SpecificLoop(L: Lp)))) |
2923 | return nullptr; |
2924 | |
2925 | // Note that the restrictions after this loop-invariant check are only
2926 | // profitability restrictions.
2927 | if (!SE->isLoopInvariant(S: V, L: Lp)) |
2928 | return nullptr; |
2929 | |
2930 | // Look for the loop invariant symbolic value. |
2931 | if (isa<SCEVUnknown>(Val: V)) |
2932 | return V; |
2933 | |
2934 | if (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V)) |
2935 | if (isa<SCEVUnknown>(Val: C->getOperand())) |
2936 | return V; |
2937 | |
2938 | return nullptr; |
2939 | } |
2940 | |
2941 | void LoopAccessInfo::collectStridedAccess(Value *MemAccess) { |
2942 | Value *Ptr = getLoadStorePointerOperand(V: MemAccess); |
2943 | if (!Ptr) |
2944 | return; |
2945 | |
2946 | // Note: getStrideFromPointer is a *profitability* heuristic. We |
2947 | // could broaden the scope of values returned here - to anything |
2948 | // which happens to be loop invariant and contributes to the |
2949 | // computation of an interesting IV - but we chose not to as we |
2950 | // don't have a cost model here, and broadening the scope exposes |
2951 | // far too many unprofitable cases. |
2952 | const SCEV *StrideExpr = getStrideFromPointer(Ptr, SE: PSE->getSE(), Lp: TheLoop); |
2953 | if (!StrideExpr) |
2954 | return; |
2955 | |
2956 | LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for " |
2957 | "versioning:" ); |
2958 | LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n" ); |
2959 | |
2960 | if (!SpeculateUnitStride) { |
2961 | LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n" ); |
2962 | return; |
2963 | } |
2964 | |
2965 | // Avoid adding the "Stride == 1" predicate when we know that |
2966 | // Stride >= Trip-Count. Such a predicate will effectively optimize a single |
2967 | // or zero iteration loop, as Trip-Count <= Stride == 1. |
2968 | // |
2969 | // TODO: We are currently not making a very informed decision on when it is |
2970 | // beneficial to apply stride versioning. It might make more sense that the |
2971 | // users of this analysis (such as the vectorizer) will trigger it, based on |
2972 | // their specific cost considerations; For example, in cases where stride |
2973 | // versioning does not help resolving memory accesses/dependences, the |
2974 | // vectorizer should evaluate the cost of the runtime test, and the benefit |
2975 | // of various possible stride specializations, considering the alternatives |
2976 | // of using gather/scatters (if available). |
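// Spelled out (illustrative): the check below forms Stride - MaxBTC and, if it
// is known positive, concludes Stride >= TripCount; combined with a
// "Stride == 1" predicate this would force TripCount <= 1, so versioning is
// not worthwhile and is skipped.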
2977 | |
2978 | const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount(); |
2979 | |
2980 | // Match the types so we can compare the stride and the MaxBTC. |
2981 | // The Stride can be positive/negative, so we sign extend Stride; |
2982 | // The backedgeTakenCount is non-negative, so we zero extend MaxBTC. |
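// For example (illustrative): with an i32 Stride and an i64 MaxBTC, Stride is
// sign-extended to i64 before forming Stride - MaxBTC; in the opposite case
// MaxBTC would be zero-extended to the stride's type instead.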
2983 | const DataLayout &DL = TheLoop->getHeader()->getDataLayout(); |
2984 | uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(Ty: StrideExpr->getType()); |
2985 | uint64_t BETypeSizeBits = DL.getTypeSizeInBits(Ty: MaxBTC->getType()); |
2986 | const SCEV *CastedStride = StrideExpr; |
2987 | const SCEV *CastedBECount = MaxBTC; |
2988 | ScalarEvolution *SE = PSE->getSE(); |
2989 | if (BETypeSizeBits >= StrideTypeSizeBits) |
2990 | CastedStride = SE->getNoopOrSignExtend(V: StrideExpr, Ty: MaxBTC->getType()); |
2991 | else |
2992 | CastedBECount = SE->getZeroExtendExpr(Op: MaxBTC, Ty: StrideExpr->getType()); |
2993 | const SCEV *StrideMinusBETaken = SE->getMinusSCEV(LHS: CastedStride, RHS: CastedBECount); |
2994 | // Since TripCount == BackEdgeTakenCount + 1, checking: |
2995 | // "Stride >= TripCount" is equivalent to checking: |
2996 | // Stride - MaxBTC > 0
2997 | if (SE->isKnownPositive(S: StrideMinusBETaken)) { |
2998 | LLVM_DEBUG( |
2999 | dbgs() << "LAA: Stride>=TripCount; No point in versioning as the " |
3000 | "Stride==1 predicate will imply that the loop executes " |
3001 | "at most once.\n" ); |
3002 | return; |
3003 | } |
3004 | LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n" ); |
3005 | |
3006 | // Strip back off the integer cast, and check that our result is a |
3007 | // SCEVUnknown as we expect. |
3008 | const SCEV *StrideBase = StrideExpr; |
3009 | if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: StrideBase)) |
3010 | StrideBase = C->getOperand(); |
3011 | SymbolicStrides[Ptr] = cast<SCEVUnknown>(Val: StrideBase); |
3012 | } |
3013 | |
3014 | LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, |
3015 | const TargetTransformInfo *TTI, |
3016 | const TargetLibraryInfo *TLI, AAResults *AA, |
3017 | DominatorTree *DT, LoopInfo *LI, |
3018 | bool AllowPartial) |
3019 | : PSE(std::make_unique<PredicatedScalarEvolution>(args&: *SE, args&: *L)), |
3020 | PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) { |
3021 | unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max(); |
3022 | if (TTI && !TTI->enableScalableVectorization()) |
3023 | // Scale the vector width by 2 as a rough estimate to also consider
3024 | // interleaving.
3025 | MaxTargetVectorWidthInBits = |
3026 | TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) * 2; |
3027 | |
3028 | DepChecker = std::make_unique<MemoryDepChecker>(args&: *PSE, args&: L, args&: SymbolicStrides, |
3029 | args&: MaxTargetVectorWidthInBits); |
3030 | PtrRtChecking = std::make_unique<RuntimePointerChecking>(args&: *DepChecker, args&: SE); |
3031 | if (canAnalyzeLoop()) |
3032 | CanVecMem = analyzeLoop(AA, LI, TLI, DT); |
3033 | } |
3034 | |
3035 | void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { |
3036 | if (CanVecMem) { |
3037 | OS.indent(NumSpaces: Depth) << "Memory dependences are safe" ; |
3038 | const MemoryDepChecker &DC = getDepChecker(); |
3039 | if (!DC.isSafeForAnyVectorWidth()) |
3040 | OS << " with a maximum safe vector width of " |
3041 | << DC.getMaxSafeVectorWidthInBits() << " bits" ; |
3042 | if (!DC.isSafeForAnyStoreLoadForwardDistances()) { |
3043 | uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits(); |
3044 | OS << ", with a maximum safe store-load forward width of " << SLDist |
3045 | << " bits" ; |
3046 | } |
3047 | if (PtrRtChecking->Need) |
3048 | OS << " with run-time checks" ; |
3049 | OS << "\n" ; |
3050 | } |
3051 | |
3052 | if (HasConvergentOp) |
3053 | OS.indent(NumSpaces: Depth) << "Has convergent operation in loop\n" ; |
3054 | |
3055 | if (Report) |
3056 | OS.indent(NumSpaces: Depth) << "Report: " << Report->getMsg() << "\n" ; |
3057 | |
3058 | if (auto *Dependences = DepChecker->getDependences()) { |
3059 | OS.indent(NumSpaces: Depth) << "Dependences:\n" ; |
3060 | for (const auto &Dep : *Dependences) { |
3061 | Dep.print(OS, Depth: Depth + 2, Instrs: DepChecker->getMemoryInstructions()); |
3062 | OS << "\n" ; |
3063 | } |
3064 | } else |
3065 | OS.indent(NumSpaces: Depth) << "Too many dependences, not recorded\n" ; |
3066 | |
3067 | // List the pairs of accesses that need run-time checks to prove independence.
3068 | PtrRtChecking->print(OS, Depth); |
3069 | if (PtrRtChecking->Need && !HasCompletePtrRtChecking) |
3070 | OS.indent(NumSpaces: Depth) << "Generated run-time checks are incomplete\n" ; |
3071 | OS << "\n" ; |
3072 | |
3073 | OS.indent(NumSpaces: Depth) |
3074 | << "Non vectorizable stores to invariant address were " |
3075 | << (HasStoreStoreDependenceInvolvingLoopInvariantAddress || |
3076 | HasLoadStoreDependenceInvolvingLoopInvariantAddress |
3077 | ? "" |
3078 | : "not " ) |
3079 | << "found in loop.\n" ; |
3080 | |
3081 | OS.indent(NumSpaces: Depth) << "SCEV assumptions:\n" ; |
3082 | PSE->getPredicate().print(OS, Depth); |
3083 | |
3084 | OS << "\n" ; |
3085 | |
3086 | OS.indent(NumSpaces: Depth) << "Expressions re-written:\n" ; |
3087 | PSE->print(OS, Depth); |
3088 | } |
3089 | |
3090 | const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L, |
3091 | bool AllowPartial) { |
3092 | const auto &[It, Inserted] = LoopAccessInfoMap.try_emplace(Key: &L); |
3093 | |
3094 | // We need to create the LoopAccessInfo if either we don't already have one, |
3095 | // or if it was created with a different value of AllowPartial. |
3096 | if (Inserted || It->second->hasAllowPartial() != AllowPartial) |
3097 | It->second = std::make_unique<LoopAccessInfo>(args: &L, args: &SE, args&: TTI, args&: TLI, args: &AA, args: &DT, |
3098 | args: &LI, args&: AllowPartial); |
3099 | |
3100 | return *It->second; |
3101 | } |
3102 | void LoopAccessInfoManager::clear() { |
3103 | // Collect LoopAccessInfo entries that may keep references to IR outside the |
3104 | // analyzed loop or SCEVs that may have been modified or invalidated. At the |
3105 | // moment, that is loops requiring memory or SCEV runtime checks, as those cache |
3106 | // SCEVs, e.g. for pointer expressions. |
3107 | for (const auto &[L, LAI] : LoopAccessInfoMap) { |
3108 | if (LAI->getRuntimePointerChecking()->getChecks().empty() && |
3109 | LAI->getPSE().getPredicate().isAlwaysTrue()) |
3110 | continue; |
3111 | LoopAccessInfoMap.erase(Val: L); |
3112 | } |
3113 | } |
3114 | |
3115 | bool LoopAccessInfoManager::invalidate( |
3116 | Function &F, const PreservedAnalyses &PA, |
3117 | FunctionAnalysisManager::Invalidator &Inv) { |
3118 | // Check whether our analysis is preserved. |
3119 | auto PAC = PA.getChecker<LoopAccessAnalysis>(); |
3120 | if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>()) |
3121 | // If not, give up now. |
3122 | return true; |
3123 | |
3124 | // Check whether the analyses we depend on became invalid for any reason. |
3125 | // Skip checking TargetLibraryAnalysis as it is immutable and can't become |
3126 | // invalid. |
3127 | return Inv.invalidate<AAManager>(IR&: F, PA) || |
3128 | Inv.invalidate<ScalarEvolutionAnalysis>(IR&: F, PA) || |
3129 | Inv.invalidate<LoopAnalysis>(IR&: F, PA) || |
3130 | Inv.invalidate<DominatorTreeAnalysis>(IR&: F, PA); |
3131 | } |
3132 | |
3133 | LoopAccessInfoManager LoopAccessAnalysis::run(Function &F, |
3134 | FunctionAnalysisManager &FAM) { |
3135 | auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
3136 | auto &AA = FAM.getResult<AAManager>(IR&: F); |
3137 | auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F); |
3138 | auto &LI = FAM.getResult<LoopAnalysis>(IR&: F); |
3139 | auto &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F); |
3140 | auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
3141 | return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI); |
3142 | } |
3143 | |
3144 | AnalysisKey LoopAccessAnalysis::Key; |
3145 | |