1//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The implementation for the loop memory dependence that was originally
10// developed for the loop vectorizer.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Analysis/LoopAccessAnalysis.h"
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/EquivalenceClasses.h"
18#include "llvm/ADT/PointerIntPair.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SetVector.h"
21#include "llvm/ADT/SmallPtrSet.h"
22#include "llvm/ADT/SmallSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/AliasAnalysis.h"
25#include "llvm/Analysis/AliasSetTracker.h"
26#include "llvm/Analysis/AssumeBundleQueries.h"
27#include "llvm/Analysis/AssumptionCache.h"
28#include "llvm/Analysis/LoopAnalysisManager.h"
29#include "llvm/Analysis/LoopInfo.h"
30#include "llvm/Analysis/LoopIterator.h"
31#include "llvm/Analysis/MemoryLocation.h"
32#include "llvm/Analysis/OptimizationRemarkEmitter.h"
33#include "llvm/Analysis/ScalarEvolution.h"
34#include "llvm/Analysis/ScalarEvolutionExpressions.h"
35#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
36#include "llvm/Analysis/TargetLibraryInfo.h"
37#include "llvm/Analysis/TargetTransformInfo.h"
38#include "llvm/Analysis/ValueTracking.h"
39#include "llvm/Analysis/VectorUtils.h"
40#include "llvm/IR/BasicBlock.h"
41#include "llvm/IR/Constants.h"
42#include "llvm/IR/DataLayout.h"
43#include "llvm/IR/DebugLoc.h"
44#include "llvm/IR/DerivedTypes.h"
45#include "llvm/IR/DiagnosticInfo.h"
46#include "llvm/IR/Dominators.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/InstrTypes.h"
49#include "llvm/IR/Instruction.h"
50#include "llvm/IR/Instructions.h"
51#include "llvm/IR/IntrinsicInst.h"
52#include "llvm/IR/PassManager.h"
53#include "llvm/IR/Type.h"
54#include "llvm/IR/Value.h"
55#include "llvm/IR/ValueHandle.h"
56#include "llvm/Support/Casting.h"
57#include "llvm/Support/CommandLine.h"
58#include "llvm/Support/Debug.h"
59#include "llvm/Support/ErrorHandling.h"
60#include "llvm/Support/raw_ostream.h"
61#include <algorithm>
62#include <cassert>
63#include <cstdint>
64#include <iterator>
65#include <utility>
66#include <variant>
67#include <vector>
68
69using namespace llvm;
70using namespace llvm::SCEVPatternMatch;
71
72#define DEBUG_TYPE "loop-accesses"
73
/// Command-line override for the SIMD width; writes through to
/// VectorizerParams::VectorizationFactor (0 = autoselect).
static cl::opt<unsigned, true>
VectorizationFactor("force-vector-width", cl::Hidden,
                    cl::desc("Sets the SIMD width. Zero is autoselect."),
                    cl::location(L&: VectorizerParams::VectorizationFactor));
/// Storage for the externally visible parameter backing the option above.
unsigned VectorizerParams::VectorizationFactor;

/// Command-line override for the interleave count; writes through to
/// VectorizerParams::VectorizationInterleave (0 = autoselect).
static cl::opt<unsigned, true>
VectorizationInterleave("force-vector-interleave", cl::Hidden,
                        cl::desc("Sets the vectorization interleave count. "
                                 "Zero is autoselect."),
                        cl::location(
                            L&: VectorizerParams::VectorizationInterleave));
unsigned VectorizerParams::VectorizationInterleave;

/// Upper bound on the number of runtime memory-disambiguation comparisons;
/// writes through to VectorizerParams::RuntimeMemoryCheckThreshold.
static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
    "runtime-memory-check-threshold", cl::Hidden,
    cl::desc("When performing memory disambiguation checks at runtime do not "
             "generate more than this number of comparisons (default = 8)."),
    cl::location(L&: VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(Val: 8));
unsigned VectorizerParams::RuntimeMemoryCheckThreshold;

/// The maximum iterations used to merge memory checks
static cl::opt<unsigned> MemoryCheckMergeThreshold(
    "memory-check-merge-threshold", cl::Hidden,
    cl::desc("Maximum number of comparisons done when trying to merge "
             "runtime memory checks. (default = 100)"),
    cl::init(Val: 100));

/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;

/// We collect dependences up to this threshold.
static cl::opt<unsigned>
    MaxDependences("max-dependences", cl::Hidden,
                   cl::desc("Maximum number of dependences collected by "
                            "loop-access analysis (default = 100)"),
                   cl::init(Val: 100));

/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
///   for (i = 0; i < N; ++i)
///     A[i * Stride1] += B[i * Stride2] ...
///
/// Will be roughly translated to
///    if (Stride1 == 1 && Stride2 == 1) {
///      for (i = 0; i < N; i+=4)
///       A[i:i+3] += ...
///    } else
///      ...
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));

/// Enable store-to-load forwarding conflict detection. This option can
/// be disabled for correctness testing.
static cl::opt<bool> EnableForwardingConflictDetection(
    "store-to-load-forwarding-conflict-detection", cl::Hidden,
    cl::desc("Enable conflict detection in loop-access analysis"),
    cl::init(Val: true));

/// Recursion limit when decomposing "forked" (select/phi-based) pointer SCEVs.
static cl::opt<unsigned> MaxForkedSCEVDepth(
    "max-forked-scev-depth", cl::Hidden,
    cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
    cl::init(Val: 5));

/// When set, LAA may assume (and version on) unit stride for non-constant
/// strides.
static cl::opt<bool> SpeculateUnitStride(
    "laa-speculate-unit-stride", cl::Hidden,
    cl::desc("Speculate that non-constant strides are unit in LAA"),
    cl::init(Val: true));

/// Controls hoisting of inner-loop runtime checks to the outer loop; writes
/// through to VectorizerParams::HoistRuntimeChecks.
static cl::opt<bool, true> HoistRuntimeChecks(
    "hoist-runtime-checks", cl::Hidden,
    cl::desc(
        "Hoist inner loop runtime memory checks to outer loop if possible"),
    cl::location(L&: VectorizerParams::HoistRuntimeChecks), cl::init(Val: true));
bool VectorizerParams::HoistRuntimeChecks;
150
151bool VectorizerParams::isInterleaveForced() {
152 return ::VectorizationInterleave.getNumOccurrences() > 0;
153}
154
/// Return the SCEV for \p Ptr, versioning any symbolic stride recorded in
/// \p PtrToStride to the constant 1. If \p Ptr has no entry in the map, the
/// unmodified SCEV is returned. NOTE: on a hit this mutates \p PSE by adding
/// an equality predicate "Stride == 1".
const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
                                            const DenseMap<Value *, const SCEV *> &PtrToStride,
                                            Value *Ptr) {
  const SCEV *OrigSCEV = PSE.getSCEV(V: Ptr);

  // If there is an entry in the map return the SCEV of the pointer with the
  // symbolic stride replaced by one.
  const SCEV *StrideSCEV = PtrToStride.lookup(Val: Ptr);
  if (!StrideSCEV)
    // For a non-symbolic stride, just return the original expression.
    return OrigSCEV;

  // Note: This assert is both overly strong and overly weak.  The actual
  // invariant here is that StrideSCEV should be loop invariant.  The only
  // such invariant strides we happen to speculate right now are unknowns
  // and thus this is a reasonable proxy of the actual invariant.
  assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");

  // Record the predicate "Stride == 1" in PSE; re-querying the SCEV for Ptr
  // afterwards yields the expression rewritten under that assumption.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *CT = SE->getOne(Ty: StrideSCEV->getType());
  PSE.addPredicate(Pred: *SE->getEqualPredicate(LHS: StrideSCEV, RHS: CT));
  const SCEV *Expr = PSE.getSCEV(V: Ptr);

  LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
                    << " by: " << *Expr << "\n");
  return Expr;
}
182
/// Construct a singleton checking group from the pointer at \p Index: the
/// group's Low/High bounds, address space and freeze requirement are taken
/// directly from that pointer's entry in \p RtCheck, and the pointer becomes
/// the group's sole member.
RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
    unsigned Index, const RuntimePointerChecking &RtCheck)
    : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
      AddressSpace(RtCheck.Pointers[Index]
                       .PointerValue->getType()
                       ->getPointerAddressSpace()),
      NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
  Members.push_back(Elt: Index);
}
192
193/// Returns \p A + \p B, if it is guaranteed not to unsigned wrap. Otherwise
194/// return nullptr. \p A and \p B must have the same type.
195static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B,
196 ScalarEvolution &SE) {
197 if (!SE.willNotOverflow(BinOp: Instruction::Add, /*IsSigned=*/Signed: false, LHS: A, RHS: B))
198 return nullptr;
199 return SE.getAddExpr(LHS: A, RHS: B);
200}
201
202/// Returns \p A * \p B, if it is guaranteed not to unsigned wrap. Otherwise
203/// return nullptr. \p A and \p B must have the same type.
204static const SCEV *mulSCEVNoOverflow(const SCEV *A, const SCEV *B,
205 ScalarEvolution &SE) {
206 if (!SE.willNotOverflow(BinOp: Instruction::Mul, /*IsSigned=*/Signed: false, LHS: A, RHS: B))
207 return nullptr;
208 return SE.getMulExpr(LHS: A, RHS: B);
209}
210
/// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
/// \p MaxBTC is guaranteed inbounds of the accessed object. The proof is based
/// on the number of dereferenceable bytes of the underlying object, combining
/// the pointer's dereferenceable attribute with any applicable
/// llvm.assume-carried dereferenceable knowledge.
static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
    const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize,
    ScalarEvolution &SE, const DataLayout &DL, DominatorTree *DT,
    AssumptionCache *AC,
    std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
  // The reasoning below requires a single identifiable base pointer.
  auto *PointerBase = SE.getPointerBase(V: AR->getStart());
  auto *StartPtr = dyn_cast<SCEVUnknown>(Val: PointerBase);
  if (!StartPtr)
    return false;
  const Loop *L = AR->getLoop();
  bool CheckForNonNull, CheckForFreed;
  Value *StartPtrV = StartPtr->getValue();
  uint64_t DerefBytes = StartPtrV->getPointerDereferenceableBytes(
      DL, CanBeNull&: CheckForNonNull, CanBeFreed&: CheckForFreed);

  // Bail out if the dereferenceability guarantee is conditional on the
  // pointer being non-null or not having been freed.
  if (DerefBytes && (CheckForNonNull || CheckForFreed))
    return false;

  const SCEV *Step = AR->getStepRecurrence(SE);
  // All byte-offset arithmetic below is done in a type wide enough for both
  // the trip count and the step.
  Type *WiderTy = SE.getWiderType(Ty1: MaxBTC->getType(), Ty2: Step->getType());
  const SCEV *DerefBytesSCEV = SE.getConstant(Ty: WiderTy, V: DerefBytes);

  // Check if we have a suitable dereferencable assumption we can use.
  // Prefer the loop predecessor's branch as context, if available, so
  // assumptions dominating the loop entry qualify.
  Instruction *CtxI = &*L->getHeader()->getFirstNonPHIIt();
  if (BasicBlock *LoopPred = L->getLoopPredecessor()) {
    if (isa<UncondBrInst, CondBrInst>(Val: LoopPred->getTerminator()))
      CtxI = LoopPred->getTerminator();
  }
  RetainedKnowledge DerefRK;
  getKnowledgeForValue(V: StartPtrV, AttrKinds: {Attribute::Dereferenceable}, AC&: *AC,
                       Filter: [&](RetainedKnowledge RK, Instruction *Assume, auto) {
                         if (!isValidAssumeForContext(I: Assume, CxtI: CtxI, DT))
                           return false;
                         // A freeable pointer invalidates the assumption
                         // unless nothing can free it before the context.
                         if (StartPtrV->canBeFreed() &&
                             !willNotFreeBetween(Assume, CtxI))
                           return false;
                         DerefRK = std::max(a: DerefRK, b: RK);
                         return true;
                       });
  if (DerefRK) {
    // Combine attribute- and assumption-derived byte counts, keeping the
    // larger of the two (after widening to a common type).
    const SCEV *DerefRKSCEV = SE.getSCEV(V: DerefRK.IRArgValue);
    Type *CommonTy =
        SE.getWiderType(Ty1: DerefBytesSCEV->getType(), Ty2: DerefRKSCEV->getType());
    DerefBytesSCEV = SE.getNoopOrZeroExtend(V: DerefBytesSCEV, Ty: CommonTy);
    DerefRKSCEV = SE.getNoopOrZeroExtend(V: DerefRKSCEV, Ty: CommonTy);
    DerefBytesSCEV = SE.getUMaxExpr(LHS: DerefBytesSCEV, RHS: DerefRKSCEV);
  }

  // No dereferenceable bytes known at all: nothing to prove against.
  if (DerefBytesSCEV->isZero())
    return false;

  // The step's sign must be known to pick the right bound check below.
  bool IsKnownNonNegative = SE.isKnownNonNegative(S: Step);
  if (!IsKnownNonNegative && !SE.isKnownNegative(S: Step))
    return false;

  Step = SE.getNoopOrSignExtend(V: Step, Ty: WiderTy);
  MaxBTC = SE.getNoopOrZeroExtend(V: MaxBTC, Ty: WiderTy);

  // For the computations below, make sure they don't unsigned wrap.
  if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_UGE, LHS: AR->getStart(), RHS: StartPtr))
    return false;
  const SCEV *StartOffset = SE.getNoopOrZeroExtend(
      V: SE.getMinusSCEV(LHS: AR->getStart(), RHS: StartPtr), Ty: WiderTy);

  // Lazily collect and apply loop guards to tighten MaxBTC.
  if (!LoopGuards)
    LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: AR->getLoop(), SE));
  MaxBTC = SE.applyLoopGuards(Expr: MaxBTC, Guards: *LoopGuards);

  const SCEV *OffsetAtLastIter =
      mulSCEVNoOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE);
  if (!OffsetAtLastIter) {
    // Re-try with constant max backedge-taken count if using the symbolic one
    // failed.
    MaxBTC = SE.getConstantMaxBackedgeTakenCount(L: AR->getLoop());
    if (isa<SCEVCouldNotCompute>(Val: MaxBTC))
      return false;
    MaxBTC = SE.getNoopOrZeroExtend(
        V: MaxBTC, Ty: WiderTy);
    OffsetAtLastIter =
        mulSCEVNoOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE);
    if (!OffsetAtLastIter)
      return false;
  }

  // Total extent of the access range in bytes, beyond the start offset.
  const SCEV *OffsetEndBytes = addSCEVNoOverflow(
      A: OffsetAtLastIter, B: SE.getNoopOrZeroExtend(V: EltSize, Ty: WiderTy), SE);
  if (!OffsetEndBytes)
    return false;

  if (IsKnownNonNegative) {
    // For positive steps, check if
    //  (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes,
    // while making sure none of the computations unsigned wrap themselves.
    const SCEV *EndBytes = addSCEVNoOverflow(A: StartOffset, B: OffsetEndBytes, SE);
    if (!EndBytes)
      return false;

    DerefBytesSCEV = SE.applyLoopGuards(Expr: DerefBytesSCEV, Guards: *LoopGuards);
    return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: EndBytes, RHS: DerefBytesSCEV);
  }

  // For negative steps check if
  //  * StartOffset >= (MaxBTC * Step + EltSize)
  //  * StartOffset <= DerefBytes.
  assert(SE.isKnownNegative(Step) && "must be known negative");
  return SE.isKnownPredicate(Pred: CmpInst::ICMP_SGE, LHS: StartOffset, RHS: OffsetEndBytes) &&
         SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: StartOffset, RHS: DerefBytesSCEV);
}
321
322std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
323 const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC,
324 const SCEV *MaxBTC, ScalarEvolution *SE,
325 DenseMap<std::pair<const SCEV *, const SCEV *>,
326 std::pair<const SCEV *, const SCEV *>> *PointerBounds,
327 DominatorTree *DT, AssumptionCache *AC,
328 std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
329 auto &DL = Lp->getHeader()->getDataLayout();
330 Type *IdxTy = DL.getIndexType(PtrTy: PtrExpr->getType());
331 const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IntTy: IdxTy, StoreTy: AccessTy);
332
333 // Delegate to the SCEV-based overload, passing through the cache.
334 return getStartAndEndForAccess(Lp, PtrExpr, EltSizeSCEV, BTC, MaxBTC, SE,
335 PointerBounds, DT, AC, LoopGuards);
336}
337
/// Compute loop-invariant [ScStart, ScEnd) bounds covering all locations the
/// access through \p PtrExpr may touch during \p Lp. Results are memoized in
/// \p PointerBounds (keyed by pointer expression and element size) when a
/// cache is provided; {CouldNotCompute, CouldNotCompute} is returned for
/// pointers that are neither loop-invariant nor affine add-recs.
std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
    const Loop *Lp, const SCEV *PtrExpr, const SCEV *EltSizeSCEV,
    const SCEV *BTC, const SCEV *MaxBTC, ScalarEvolution *SE,
    DenseMap<std::pair<const SCEV *, const SCEV *>,
             std::pair<const SCEV *, const SCEV *>> *PointerBounds,
    DominatorTree *DT, AssumptionCache *AC,
    std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
  // Reserve/lookup a cache slot first; a prior result is returned as-is.
  std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
  if (PointerBounds) {
    auto [Iter, Ins] = PointerBounds->insert(
        KV: {{PtrExpr, EltSizeSCEV},
         {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
    if (!Ins)
      return Iter->second;
    PtrBoundsPair = &Iter->second;
  }

  const SCEV *ScStart;
  const SCEV *ScEnd;

  auto &DL = Lp->getHeader()->getDataLayout();
  if (SE->isLoopInvariant(S: PtrExpr, L: Lp)) {
    // Invariant pointer: a single location is accessed every iteration.
    ScStart = ScEnd = PtrExpr;
  } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrExpr)) {
    ScStart = AR->getStart();
    if (!isa<SCEVCouldNotCompute>(Val: BTC))
      // Evaluating AR at an exact BTC is safe: LAA separately checks that
      // accesses cannot wrap in the loop. If evaluating AR at BTC wraps, then
      // the loop either triggers UB when executing a memory access with a
      // poison pointer or the wrapping/poisoned pointer is not used.
      ScEnd = AR->evaluateAtIteration(It: BTC, SE&: *SE);
    else {
      // Evaluating AR at MaxBTC may wrap and create an expression that is less
      // than the start of the AddRec due to wrapping (for example consider
      // MaxBTC = -2). If that's the case, set ScEnd to -(EltSize + 1). ScEnd
      // will get incremented by EltSize before returning, so this effectively
      // sets ScEnd to the maximum unsigned value for the type. Note that LAA
      // separately checks that accesses cannot not wrap, so unsigned max
      // represents an upper bound.
      if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSize: EltSizeSCEV, SE&: *SE, DL,
                                               DT, AC, LoopGuards)) {
        ScEnd = AR->evaluateAtIteration(It: MaxBTC, SE&: *SE);
      } else {
        ScEnd = SE->getAddExpr(
            LHS: SE->getNegativeSCEV(V: EltSizeSCEV),
            RHS: SE->getSCEV(V: ConstantExpr::getIntToPtr(
                C: ConstantInt::getAllOnesValue(Ty: EltSizeSCEV->getType()),
                Ty: AR->getType())));
      }
    }
    const SCEV *Step = AR->getStepRecurrence(SE&: *SE);

    // For expressions with negative step, the upper bound is ScStart and the
    // lower bound is ScEnd.
    if (const auto *CStep = dyn_cast<SCEVConstant>(Val: Step)) {
      if (CStep->getValue()->isNegative())
        std::swap(a&: ScStart, b&: ScEnd);
    } else {
      // Fallback case: the step is not constant, but we can still
      // get the upper and lower bounds of the interval by using min/max
      // expressions.
      ScStart = SE->getUMinExpr(LHS: ScStart, RHS: ScEnd);
      ScEnd = SE->getUMaxExpr(LHS: AR->getStart(), RHS: ScEnd);
    }
  } else
    return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};

  assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");

  // Add the size of the pointed element to ScEnd.
  ScEnd = SE->getAddExpr(LHS: ScEnd, RHS: EltSizeSCEV);

  // Store the result in the cache slot (if caching) and return it.
  std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
  if (PointerBounds)
    *PtrBoundsPair = Res;
  return Res;
}
416
/// Calculate Start and End points of memory access using
/// getStartAndEndForAccess, and record the pointer (with its dependency-set
/// and alias-set ids) for runtime checking.
void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                    Type *AccessTy, bool WritePtr,
                                    unsigned DepSetId, unsigned ASId,
                                    PredicatedScalarEvolution &PSE,
                                    bool NeedsFreeze) {
  const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
  const SCEV *BTC = PSE.getBackedgeTakenCount();
  const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
      Lp, PtrExpr, AccessTy, BTC, MaxBTC: SymbolicMaxBTC, SE: PSE.getSE(),
      PointerBounds: &DC.getPointerBounds(), DT: DC.getDT(), AC: DC.getAC(), LoopGuards);
  // Callers are expected to only insert pointers with computable bounds.
  assert(!isa<SCEVCouldNotCompute>(ScStart) &&
         !isa<SCEVCouldNotCompute>(ScEnd) &&
         "must be able to compute both start and end expressions");
  Pointers.emplace_back(Args&: Ptr, Args: ScStart, Args: ScEnd, Args&: WritePtr, Args&: DepSetId, Args&: ASId, Args&: PtrExpr,
                        Args&: NeedsFreeze);
}
435
/// Try to replace a full overlap check between the (single-member) groups
/// \p CGI and \p CGJ with a cheaper pointer-difference check. On success the
/// check is appended to DiffChecks and true is returned; any bail-out returns
/// false, meaning the regular range-overlap check must be used instead.
bool RuntimePointerChecking::tryToCreateDiffCheck(
    const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
  // If either group contains multiple different pointers, bail out.
  // TODO: Support multiple pointers by using the minimum or maximum pointer,
  // depending on src & sink.
  if (CGI.Members.size() != 1 || CGJ.Members.size() != 1)
    return false;

  const PointerInfo *Src = &Pointers[CGI.Members[0]];
  const PointerInfo *Sink = &Pointers[CGJ.Members[0]];

  // If either pointer is read and written, multiple checks may be needed. Bail
  // out.
  if (!DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: !Src->IsWritePtr).empty() ||
      !DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: !Sink->IsWritePtr).empty())
    return false;

  ArrayRef<unsigned> AccSrc =
      DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: Src->IsWritePtr);
  ArrayRef<unsigned> AccSink =
      DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: Sink->IsWritePtr);
  // If either pointer is accessed multiple times, there may not be a clear
  // src/sink relation. Bail out for now.
  if (AccSrc.size() != 1 || AccSink.size() != 1)
    return false;

  // If the sink is accessed before src, swap src/sink.
  if (AccSink[0] < AccSrc[0])
    std::swap(a&: Src, b&: Sink);

  // Both accesses must be affine add-recs over the innermost loop with the
  // same constant step for the difference check to be meaningful.
  const SCEVConstant *Step;
  const SCEV *SrcStart;
  const SCEV *SinkStart;
  const Loop *InnerLoop = DC.getInnermostLoop();
  if (!match(S: Src->Expr,
             P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SrcStart), Op1: m_SCEVConstant(V&: Step),
                                  L: m_SpecificLoop(L: InnerLoop))) ||
      !match(S: Sink->Expr,
             P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SinkStart), Op1: m_scev_Specific(S: Step),
                                  L: m_SpecificLoop(L: InnerLoop))))
    return false;

  // Scalable access types have no fixed alloc size; bail out.
  SmallVector<Instruction *, 4> SrcInsts =
      DC.getInstructionsForAccess(Ptr: Src->PointerValue, isWrite: Src->IsWritePtr);
  SmallVector<Instruction *, 4> SinkInsts =
      DC.getInstructionsForAccess(Ptr: Sink->PointerValue, isWrite: Sink->IsWritePtr);
  Type *SrcTy = getLoadStoreType(I: SrcInsts[0]);
  Type *DstTy = getLoadStoreType(I: SinkInsts[0]);
  if (isa<ScalableVectorType>(Val: SrcTy) || isa<ScalableVectorType>(Val: DstTy))
    return false;

  const DataLayout &DL = InnerLoop->getHeader()->getDataLayout();
  unsigned AllocSize =
      std::max(a: DL.getTypeAllocSize(Ty: SrcTy), b: DL.getTypeAllocSize(Ty: DstTy));

  // Only matching constant steps matching the AllocSize are supported at the
  // moment. This simplifies the difference computation. Can be extended in the
  // future.
  if (Step->getAPInt().abs() != AllocSize)
    return false;

  // When counting down, the dependence distance needs to be swapped.
  if (Step->getValue()->isNegative())
    std::swap(a&: SinkStart, b&: SrcStart);

  // The difference check operates on integer addresses.
  const SCEV *SinkStartInt = SE->getPtrToAddrExpr(Op: SinkStart);
  const SCEV *SrcStartInt = SE->getPtrToAddrExpr(Op: SrcStart);
  if (isa<SCEVCouldNotCompute>(Val: SinkStartInt) ||
      isa<SCEVCouldNotCompute>(Val: SrcStartInt))
    return false;

  // If the start values for both Src and Sink also vary according to an outer
  // loop, then it's probably better to avoid creating diff checks because
  // they may not be hoisted. We should instead let llvm::addRuntimeChecks
  // do the expanded full range overlap checks, which can be hoisted.
  if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
      isa<SCEVAddRecExpr>(Val: SinkStartInt) && isa<SCEVAddRecExpr>(Val: SrcStartInt)) {
    auto *SrcStartAR = cast<SCEVAddRecExpr>(Val: SrcStartInt);
    auto *SinkStartAR = cast<SCEVAddRecExpr>(Val: SinkStartInt);
    const Loop *StartARLoop = SrcStartAR->getLoop();
    if (StartARLoop == SinkStartAR->getLoop() &&
        StartARLoop == InnerLoop->getParentLoop() &&
        // If the diff check would already be loop invariant (due to the
        // recurrences being the same), then we prefer to keep the diff checks
        // because they are cheaper.
        SrcStartAR->getStepRecurrence(SE&: *SE) !=
            SinkStartAR->getStepRecurrence(SE&: *SE)) {
      LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
                           "cannot be hoisted out of the outer loop\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
                    << "SrcStart: " << *SrcStartInt << '\n'
                    << "SinkStartInt: " << *SinkStartInt << '\n');
  DiffChecks.emplace_back(Args&: SrcStartInt, Args&: SinkStartInt, Args&: AllocSize,
                          Args: Src->NeedsFreeze || Sink->NeedsFreeze);
  return true;
}
536
537SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
538 SmallVector<RuntimePointerCheck, 4> Checks;
539
540 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
541 for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
542 const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
543 const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];
544
545 if (needsChecking(M: CGI, N: CGJ)) {
546 CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ);
547 Checks.emplace_back(Args: &CGI, Args: &CGJ);
548 }
549 }
550 }
551 return Checks;
552}
553
/// Group the registered pointers using the dependency equivalence classes in
/// \p DepCands, then populate the member list of runtime checks. Must only be
/// called once (Checks must still be empty).
void RuntimePointerChecking::generateChecks(
    MemoryDepChecker::DepCandidates &DepCands) {
  assert(Checks.empty() && "Checks is not empty");
  groupChecks(DepCands);
  Checks = generateChecks();
}
560
561bool RuntimePointerChecking::needsChecking(
562 const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
563 for (const auto &I : M.Members)
564 for (const auto &J : N.Members)
565 if (needsChecking(I, J))
566 return true;
567 return false;
568}
569
570/// Compare \p I and \p J and return the minimum.
571/// Return nullptr in case we couldn't find an answer.
572static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
573 ScalarEvolution *SE) {
574 std::optional<APInt> Diff = SE->computeConstantDifference(LHS: J, RHS: I);
575 if (!Diff)
576 return nullptr;
577 return Diff->isNegative() ? J : I;
578}
579
580bool RuntimeCheckingPtrGroup::addPointer(
581 unsigned Index, const RuntimePointerChecking &RtCheck) {
582 return addPointer(
583 Index, Start: RtCheck.Pointers[Index].Start, End: RtCheck.Pointers[Index].End,
584 AS: RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
585 NeedsFreeze: RtCheck.Pointers[Index].NeedsFreeze, SE&: *RtCheck.SE);
586}
587
588bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
589 const SCEV *End, unsigned AS,
590 bool NeedsFreeze,
591 ScalarEvolution &SE) {
592 assert(AddressSpace == AS &&
593 "all pointers in a checking group must be in the same address space");
594
595 // Compare the starts and ends with the known minimum and maximum
596 // of this set. We need to know how we compare against the min/max
597 // of the set in order to be able to emit memchecks.
598 const SCEV *Min0 = getMinFromExprs(I: Start, J: Low, SE: &SE);
599 if (!Min0)
600 return false;
601
602 const SCEV *Min1 = getMinFromExprs(I: End, J: High, SE: &SE);
603 if (!Min1)
604 return false;
605
606 // Update the low bound expression if we've found a new min value.
607 if (Min0 == Start)
608 Low = Start;
609
610 // Update the high bound expression if we've found a new max value.
611 if (Min1 != End)
612 High = End;
613
614 Members.push_back(Elt: Index);
615 this->NeedsFreeze |= NeedsFreeze;
616 return true;
617}
618
/// Partition the registered pointers into checking groups, greedily merging
/// pointers from the same dependency equivalence class when their bounds have
/// a constant distance. Populates CheckingGroups.
void RuntimePointerChecking::groupChecks(
    MemoryDepChecker::DepCandidates &DepCands) {
  // We build the groups from dependency candidates equivalence classes
  // because:
  //    - We know that pointers in the same equivalence class share
  //      the same underlying object and therefore there is a chance
  //      that we can compare pointers
  //    - We wouldn't be able to merge two pointers for which we need
  //      to emit a memcheck. The classes in DepCands are already
  //      conveniently built such that no two pointers in the same
  //      class need checking against each other.

  // We use the following (greedy) algorithm to construct the groups
  // For every pointer in the equivalence class:
  //   For each existing group:
  //   - if the difference between this pointer and the min/max bounds
  //     of the group is a constant, then make the pointer part of the
  //     group and update the min/max bounds of that group as required.

  CheckingGroups.clear();

  // If we need to check two pointers to the same underlying object
  // with a non-constant difference, we shouldn't perform any pointer
  // grouping with those pointers. This is because we can easily get
  // into cases where the resulting check would return false, even when
  // the accesses are safe.
  //
  // The following example shows this:
  // for (i = 0; i < 1000; ++i)
  //   a[5000 + i * m] = a[i] + a[i + 9000]
  //
  // Here grouping gives a check of (5000, 5000 + 1000 * m) against
  // (0, 10000) which is always false. However, if m is 1, there is no
  // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
  // us to perform an accurate check in this case.
  //
  // In the above case, we have a non-constant distance and an Unknown
  // dependence between accesses to the same underlying object, and could retry
  // with runtime checks without dependency information being available. In this
  // case we will use the fallback path and create separate checking groups for
  // accesses not present in DepCands.

  unsigned TotalComparisons = 0;

  // Map each pointer value to the indices of its (possibly multiple) entries
  // in Pointers.
  DenseMap<Value *, SmallVector<unsigned>> PositionMap;
  for (unsigned Index = 0; Index < Pointers.size(); ++Index)
    PositionMap[Pointers[Index].PointerValue].push_back(Elt: Index);

  // We need to keep track of what pointers we've already seen so we
  // don't process them twice.
  SmallSet<unsigned, 2> Seen;

  // Go through all equivalence classes, get the "pointer check groups"
  // and add them to the overall solution. We use the order in which accesses
  // appear in 'Pointers' to enforce determinism.
  for (unsigned I = 0; I < Pointers.size(); ++I) {
    // We've seen this pointer before, and therefore already processed
    // its equivalence class.
    if (Seen.contains(V: I))
      continue;

    MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
                                           Pointers[I].IsWritePtr);

    // If there is no entry in the dependency partition, there are no potential
    // accesses to merge; simply add a new pointer checking group.
    if (!DepCands.contains(V: Access)) {
      CheckingGroups.push_back(Elt: RuntimeCheckingPtrGroup(I, *this));
      continue;
    }

    SmallVector<RuntimeCheckingPtrGroup, 2> Groups;

    // Because DepCands is constructed by visiting accesses in the order in
    // which they appear in alias sets (which is deterministic) and the
    // iteration order within an equivalence class member is only dependent on
    // the order in which unions and insertions are performed on the
    // equivalence class, the iteration order is deterministic.
    for (auto M : DepCands.members(V: Access)) {
      auto PointerI = PositionMap.find(Val: M.getPointer());
      // If we can't find the pointer in PositionMap that means we can't
      // generate a memcheck for it.
      if (PointerI == PositionMap.end())
        continue;
      for (unsigned Pointer : PointerI->second) {
        bool Merged = false;
        // Mark this pointer as seen.
        Seen.insert(V: Pointer);

        // Go through all the existing sets and see if we can find one
        // which can include this pointer.
        for (RuntimeCheckingPtrGroup &Group : Groups) {
          // Don't perform more than a certain amount of comparisons.
          // This should limit the cost of grouping the pointers to something
          // reasonable.  If we do end up hitting this threshold, the algorithm
          // will create separate groups for all remaining pointers.
          if (TotalComparisons > MemoryCheckMergeThreshold)
            break;

          TotalComparisons++;

          if (Group.addPointer(Index: Pointer, RtCheck: *this)) {
            Merged = true;
            break;
          }
        }

        if (!Merged)
          // We couldn't add this pointer to any existing set or the threshold
          // for the number of comparisons has been reached. Create a new group
          // to hold the current pointer.
          Groups.emplace_back(Args&: Pointer, Args&: *this);
      }
    }

    // We've computed the grouped checks for this partition.
    // Save the results and continue with the next one.
    llvm::append_range(C&: CheckingGroups, R&: Groups);
  }
}
739
740bool RuntimePointerChecking::arePointersInSamePartition(
741 const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
742 unsigned PtrIdx2) {
743 return (PtrToPartition[PtrIdx1] != -1 &&
744 PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
745}
746
747bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
748 const PointerInfo &PointerI = Pointers[I];
749 const PointerInfo &PointerJ = Pointers[J];
750
751 // No need to check if two readonly pointers intersect.
752 if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
753 return false;
754
755 // Only need to check pointers between two different dependency sets.
756 if (PointerI.DependencySetId == PointerJ.DependencySetId)
757 return false;
758
759 // Only need to check pointers in the same alias set.
760 return PointerI.AliasSetId == PointerJ.AliasSetId;
761}
762
763/// Assign each RuntimeCheckingPtrGroup pointer an index for stable UTC output.
764static DenseMap<const RuntimeCheckingPtrGroup *, unsigned>
765getPtrToIdxMap(ArrayRef<RuntimeCheckingPtrGroup> CheckingGroups) {
766 DenseMap<const RuntimeCheckingPtrGroup *, unsigned> PtrIndices;
767 for (const auto &[Idx, CG] : enumerate(First&: CheckingGroups))
768 PtrIndices[&CG] = Idx;
769 return PtrIndices;
770}
771
772void RuntimePointerChecking::printChecks(
773 raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
774 unsigned Depth) const {
775 unsigned N = 0;
776 auto PtrIndices = getPtrToIdxMap(CheckingGroups);
777 for (const auto &[Check1, Check2] : Checks) {
778 const auto &First = Check1->Members, &Second = Check2->Members;
779 OS.indent(NumSpaces: Depth) << "Check " << N++ << ":\n";
780 OS.indent(NumSpaces: Depth + 2) << "Comparing group GRP" << PtrIndices.at(Val: Check1)
781 << ":\n";
782 for (unsigned K : First)
783 OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n";
784 OS.indent(NumSpaces: Depth + 2) << "Against group GRP" << PtrIndices.at(Val: Check2)
785 << ":\n";
786 for (unsigned K : Second)
787 OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n";
788 }
789}
790
791void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
792
793 OS.indent(NumSpaces: Depth) << "Run-time memory checks:\n";
794 printChecks(OS, Checks, Depth);
795
796 OS.indent(NumSpaces: Depth) << "Grouped accesses:\n";
797 auto PtrIndices = getPtrToIdxMap(CheckingGroups);
798 for (const auto &CG : CheckingGroups) {
799 OS.indent(NumSpaces: Depth + 2) << "Group GRP" << PtrIndices.at(Val: &CG) << ":\n";
800 OS.indent(NumSpaces: Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
801 << ")\n";
802 for (unsigned Member : CG.Members) {
803 OS.indent(NumSpaces: Depth + 6) << "Member: " << *Pointers[Member].Expr << "\n";
804 }
805 }
806}
807
namespace {

/// Analyses memory accesses in a loop.
///
/// Checks whether run time pointer checks are needed and builds sets for data
/// dependence checking.
class AccessAnalysis {
public:
  /// Read or write access location. The pointer int encodes whether the
  /// access is a write (true) or a read (false).
  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;

  AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
                 DominatorTree &DT, MemoryDepChecker::DepCandidates &DA,
                 PredicatedScalarEvolution &PSE,
                 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
      : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA),
        PSE(PSE), LoopAliasScopes(LoopAliasScopes) {
    // We're analyzing dependences across loop iterations.
    BAA.enableCrossIterationMode();
  }

  /// Register a load and whether it is only read from.
  void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
    Value *Ptr = const_cast<Value *>(Loc.Ptr);
    // Widen the location to cover all iterations before handing it to AST.
    AST.add(Loc: adjustLoc(Loc));
    Accesses[MemAccessInfo(Ptr, false)].insert(X: AccessTy);
    if (IsReadOnly)
      ReadOnlyPtr.insert(Ptr);
  }

  /// Register a store.
  void addStore(const MemoryLocation &Loc, Type *AccessTy) {
    Value *Ptr = const_cast<Value *>(Loc.Ptr);
    // Widen the location to cover all iterations before handing it to AST.
    AST.add(Loc: adjustLoc(Loc));
    Accesses[MemAccessInfo(Ptr, true)].insert(X: AccessTy);
  }

  /// Check if we can emit a run-time no-alias check for \p Access.
  ///
  /// Returns true if we can emit a run-time no alias check for \p Access.
  /// If we can check this access, this also adds it to a dependence set and
  /// adds a run-time to check for it to \p RtCheck. If \p Assume is true,
  /// we will attempt to use additional run-time checks in order to get
  /// the bounds of the pointer.
  bool createCheckForAccess(RuntimePointerChecking &RtCheck,
                            MemAccessInfo Access, Type *AccessTy,
                            const DenseMap<Value *, const SCEV *> &Strides,
                            DenseMap<Value *, unsigned> &DepSetId,
                            Loop *TheLoop, unsigned &RunningDepId,
                            unsigned ASId, bool Assume);

  /// Check whether we can check the pointers at runtime for
  /// non-intersection.
  ///
  /// Returns true if we need no check or if we do and we can generate them
  /// (i.e. the pointers have computable bounds). A return value of false means
  /// we couldn't analyze and generate runtime checks for all pointers in the
  /// loop, but if \p AllowPartial is set then we will have checks for those
  /// pointers we could analyze. \p DepChecker is used to remove unknown
  /// dependences from DepCands.
  bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, Loop *TheLoop,
                       const DenseMap<Value *, const SCEV *> &Strides,
                       Value *&UncomputablePtr, bool AllowPartial,
                       const MemoryDepChecker &DepChecker);

  /// Goes over all memory accesses, checks whether a RT check is needed
  /// and builds sets of dependent accesses.
  void buildDependenceSets() {
    processMemAccesses();
  }

  /// Initial processing of memory accesses determined that we need to
  /// perform dependency checking.
  ///
  /// Note that this can later be cleared if we retry memcheck analysis without
  /// dependency checking (i.e. ShouldRetryWithRuntimeChecks).
  bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); }

  /// We decided that no dependence analysis would be used. Reset the state.
  void resetDepChecks(MemoryDepChecker &DepChecker) {
    CheckDeps.clear();
    DepChecker.clearDependences();
  }

  const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; }

private:
  typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;

  /// Adjust the MemoryLocation so that it represents accesses to this
  /// location across all iterations, rather than a single one.
  MemoryLocation adjustLoc(MemoryLocation Loc) const {
    // The accessed location varies within the loop, but remains within the
    // underlying object.
    Loc.Size = LocationSize::beforeOrAfterPointer();
    Loc.AATags.Scope = adjustAliasScopeList(ScopeList: Loc.AATags.Scope);
    Loc.AATags.NoAlias = adjustAliasScopeList(ScopeList: Loc.AATags.NoAlias);
    return Loc;
  }

  /// Drop alias scopes that are only valid within a single loop iteration.
  MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
    if (!ScopeList)
      return nullptr;

    // For the sake of simplicity, drop the whole scope list if any scope is
    // iteration-local.
    if (any_of(Range: ScopeList->operands(), P: [&](Metadata *Scope) {
          return LoopAliasScopes.contains(Ptr: cast<MDNode>(Val: Scope));
        }))
      return nullptr;

    return ScopeList;
  }

  /// Go over all memory access and check whether runtime pointer checks
  /// are needed and build sets of dependency check candidates.
  void processMemAccesses();

  /// Map of all accesses. Values are the types used to access memory pointed to
  /// by the pointer.
  PtrAccessMap Accesses;

  /// The loop being checked.
  const Loop *TheLoop;

  /// List of accesses that need a further dependence check.
  MemAccessInfoList CheckDeps;

  /// Set of pointers that are read only.
  SmallPtrSet<Value*, 16> ReadOnlyPtr;

  /// Batched alias analysis results.
  BatchAAResults BAA;

  /// An alias set tracker to partition the access set by underlying object and
  /// intrinsic property (such as TBAA metadata).
  AliasSetTracker AST;

  /// The LoopInfo of the loop being checked.
  const LoopInfo *LI;

  /// The dominator tree of the function.
  DominatorTree &DT;

  /// Sets of potentially dependent accesses - members of one set share an
  /// underlying pointer. The set "CheckDeps" identifies which sets really need
  /// a dependence check.
  MemoryDepChecker::DepCandidates &DepCands;

  /// Initial processing of memory accesses determined that we may need
  /// to add memchecks. Perform the analysis to determine the necessary checks.
  ///
  /// Note that this is different from isDependencyCheckNeeded. When we retry
  /// memcheck analysis without dependency checking
  /// (i.e. ShouldRetryWithRuntimeChecks), isDependencyCheckNeeded is
  /// cleared while this remains set if we have potentially dependent accesses.
  bool IsRTCheckAnalysisNeeded = false;

  /// The SCEV predicate containing all the SCEV-related assumptions.
  PredicatedScalarEvolution &PSE;

  /// Maps pointer values to their underlying objects. NOTE(review): populated
  /// and consumed outside this excerpt; semantics inferred from the name —
  /// confirm at the use sites.
  DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;

  /// Alias scopes that are declared inside the loop, and as such not valid
  /// across iterations.
  SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
};

} // end anonymous namespace
979
980/// Try to compute a constant stride for \p AR. Used by getPtrStride and
981/// isNoWrap.
982static std::optional<int64_t>
983getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
984 Value *Ptr, PredicatedScalarEvolution &PSE) {
985 if (isa<ScalableVectorType>(Val: AccessTy)) {
986 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
987 << "\n");
988 return std::nullopt;
989 }
990
991 // The access function must stride over the innermost loop.
992 if (Lp != AR->getLoop()) {
993 LLVM_DEBUG({
994 dbgs() << "LAA: Bad stride - Not striding over innermost loop ";
995 if (Ptr)
996 dbgs() << *Ptr << " ";
997
998 dbgs() << "SCEV: " << *AR << "\n";
999 });
1000 return std::nullopt;
1001 }
1002
1003 // Check the step is constant.
1004 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
1005
1006 // Calculate the pointer stride and check if it is constant.
1007 const APInt *APStepVal;
1008 if (!match(S: Step, P: m_scev_APInt(C&: APStepVal))) {
1009 LLVM_DEBUG({
1010 dbgs() << "LAA: Bad stride - Not a constant strided ";
1011 if (Ptr)
1012 dbgs() << *Ptr << " ";
1013 dbgs() << "SCEV: " << *AR << "\n";
1014 });
1015 return std::nullopt;
1016 }
1017
1018 const auto &DL = Lp->getHeader()->getDataLayout();
1019 TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy);
1020 int64_t Size = AllocSize.getFixedValue();
1021
1022 // Huge step value - give up.
1023 std::optional<int64_t> StepVal = APStepVal->trySExtValue();
1024 if (!StepVal)
1025 return std::nullopt;
1026
1027 // Strided access.
1028 return *StepVal % Size ? std::nullopt : std::make_optional(t: *StepVal / Size);
1029}
1030
/// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use
/// information from the IR pointer value to determine no-wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
                     Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
                     const DominatorTree &DT,
                     std::optional<int64_t> Stride = std::nullopt) {
  // Cheapest case first: the AddRec already carries a no-wrap flag.
  // FIXME: This should probably only return true for NUW.
  if (AR->getNoWrapFlags(Mask: SCEV::NoWrapMask))
    return true;

  // A previously recorded PSE no-overflow predicate for this pointer also
  // suffices.
  if (Ptr && PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW))
    return true;

  // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap,
  // the distance between the previously accessed location and the wrapped
  // location will be larger than half the pointer index type space. In that
  // case, the GEP would be poison and any memory access dependent on it would
  // be immediate UB when executed.
  if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Val: Ptr);
      GEP && GEP->hasNoUnsignedSignedWrap()) {
    // For the above reasoning to apply, the pointer must be dereferenced in
    // every iteration. That holds trivially for a single-block loop, otherwise
    // require some load/store user of the GEP inside the loop in a block that
    // needs no predication (i.e. executes unconditionally every iteration).
    if (L->getHeader() == L->getLoopLatch() ||
        any_of(Range: GEP->users(), P: [L, &DT, GEP](User *U) {
          if (getLoadStorePointerOperand(V: U) != GEP)
            return false;
          BasicBlock *UserBB = cast<Instruction>(Val: U)->getParent();
          if (!L->contains(BB: UserBB))
            return false;
          return !LoopAccessInfo::blockNeedsPredication(BB: UserBB, TheLoop: L, DT: &DT);
        }))
      return true;
  }

  // Compute the stride lazily if the caller did not supply one.
  if (!Stride)
    Stride = getStrideFromAddRec(AR, Lp: L, AccessTy, Ptr, PSE);
  if (Stride) {
    // If the null pointer is undefined, then a access sequence which would
    // otherwise access it can be assumed not to unsigned wrap. Note that this
    // assumes the object in memory is aligned to the natural alignment.
    unsigned AddrSpace = AR->getType()->getPointerAddressSpace();
    if (!NullPointerIsDefined(F: L->getHeader()->getParent(), AS: AddrSpace) &&
        (Stride == 1 || Stride == -1))
      return true;
  }

  // Last resort: if allowed, record a run-time no-overflow assumption in PSE
  // and report no-wrap.
  if (Ptr && Assume) {
    PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW);
    LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n"
                      << "LAA: Pointer: " << *Ptr << "\n"
                      << "LAA: SCEV: " << *AR << "\n"
                      << "LAA: Added an overflow assumption\n");
    return true;
  }

  return false;
}
1088
1089static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
1090 function_ref<void(Value *)> AddPointer) {
1091 SmallPtrSet<Value *, 8> Visited;
1092 SmallVector<Value *> WorkList;
1093 WorkList.push_back(Elt: StartPtr);
1094
1095 while (!WorkList.empty()) {
1096 Value *Ptr = WorkList.pop_back_val();
1097 if (!Visited.insert(Ptr).second)
1098 continue;
1099 auto *PN = dyn_cast<PHINode>(Val: Ptr);
1100 // SCEV does not look through non-header PHIs inside the loop. Such phis
1101 // can be analyzed by adding separate accesses for each incoming pointer
1102 // value.
1103 if (PN && InnermostLoop.contains(BB: PN->getParent()) &&
1104 PN->getParent() != InnermostLoop.getHeader()) {
1105 llvm::append_range(C&: WorkList, R: PN->incoming_values());
1106 } else
1107 AddPointer(Ptr);
1108 }
1109}
1110
// Walk back through the IR for a pointer, looking for a select like the
// following:
//
//  %offset = select i1 %cmp, i64 %a, i64 %b
//  %addr = getelementptr double, double* %base, i64 %offset
//  %ld = load double, double* %addr, align 8
//
// We won't be able to form a single SCEVAddRecExpr from this since the
// address for each loop iteration depends on %cmp. We could potentially
// produce multiple valid SCEVAddRecExprs, though, and check all of them for
// memory safety/aliasing if needed.
//
// If we encounter some IR we don't yet handle, or something obviously fine
// like a constant, then we just add the SCEV for that term to the list passed
// in by the caller. If we have a node that may potentially yield a valid
// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
// ourselves before adding to the list.
static void findForkedSCEVs(
    ScalarEvolution *SE, const Loop *L, Value *Ptr,
    SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
    unsigned Depth) {
  // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
  // we've exceeded our limit on recursion, just return whatever we have
  // regardless of whether it can be used for a forked pointer or not, along
  // with an indication of whether it might be a poison or undef value.
  const SCEV *Scev = SE->getSCEV(V: Ptr);
  if (isa<SCEVAddRecExpr>(Val: Scev) || L->isLoopInvariant(V: Ptr) ||
      !isa<Instruction>(Val: Ptr) || Depth == 0) {
    ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
    return;
  }

  // All recursive calls below descend one level.
  Depth--;

  // Extracts the "may be undef/poison" bit stored alongside each SCEV.
  auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
    return get<1>(Pair: S);
  };

  // Builds the SCEV for a two-operand add/sub from its operand SCEVs.
  auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
    switch (Opcode) {
    case Instruction::Add:
      return SE->getAddExpr(LHS: L, RHS: R);
    case Instruction::Sub:
      return SE->getMinusSCEV(LHS: L, RHS: R);
    default:
      llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
    }
  };

  Instruction *I = cast<Instruction>(Val: Ptr);
  unsigned Opcode = I->getOpcode();
  switch (Opcode) {
  case Instruction::GetElementPtr: {
    auto *GEP = cast<GetElementPtrInst>(Val: I);
    Type *SourceTy = GEP->getSourceElementType();
    // We only handle base + single offset GEPs here for now.
    // Not dealing with preexisting gathers yet, so no vectors.
    if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
      ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: GEP));
      break;
    }
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
    findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: BaseScevs, Depth);
    findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: OffsetScevs, Depth);

    // See if we need to freeze our fork...
    bool NeedsFreeze = any_of(Range&: BaseScevs, P: UndefPoisonCheck) ||
                       any_of(Range&: OffsetScevs, P: UndefPoisonCheck);

    // Check that we only have a single fork, on either the base or the offset.
    // Copy the SCEV across for the one without a fork in order to generate
    // the full SCEV for both sides of the GEP.
    if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
      BaseScevs.push_back(Elt: BaseScevs[0]);
    else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
      OffsetScevs.push_back(Elt: OffsetScevs[0]);
    else {
      // Zero forks or more than one fork: give up and emit the plain SCEV.
      ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze);
      break;
    }

    Type *IntPtrTy = SE->getEffectiveSCEVType(Ty: GEP->getPointerOperandType());

    // Find the size of the type being pointed to. We only have a single
    // index term (guarded above) so we don't need to index into arrays or
    // structures, just get the size of the scalar value.
    const SCEV *Size = SE->getSizeOfExpr(IntTy: IntPtrTy, AllocTy: SourceTy);

    for (auto [B, O] : zip(t&: BaseScevs, u&: OffsetScevs)) {
      const SCEV *Base = get<0>(Pair: B);
      const SCEV *Offset = get<0>(Pair: O);

      // Scale up the offsets by the size of the type, then add to the bases.
      const SCEV *Scaled =
          SE->getMulExpr(LHS: Size, RHS: SE->getTruncateOrSignExtend(V: Offset, Ty: IntPtrTy));
      ScevList.emplace_back(Args: SE->getAddExpr(LHS: Base, RHS: Scaled), Args&: NeedsFreeze);
    }
    break;
  }
  case Instruction::Select: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A select means we've found a forked pointer, but we currently only
    // support a single select per pointer so if there's another behind this
    // then we just bail out and return the generic SCEV.
    findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth);
    findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 2), ScevList&: ChildScevs, Depth);
    if (ChildScevs.size() == 2)
      append_range(C&: ScevList, R&: ChildScevs);
    else
      ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
    break;
  }
  case Instruction::PHI: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A phi means we've found a forked pointer, but we currently only
    // support a single phi per pointer so if there's another behind this
    // then we just bail out and return the generic SCEV.
    if (I->getNumOperands() == 2) {
      findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: ChildScevs, Depth);
      findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth);
    }
    if (ChildScevs.size() == 2)
      append_range(C&: ScevList, R&: ChildScevs);
    else
      ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
    break;
  }
  case Instruction::Add:
  case Instruction::Sub: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs;
    findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: LScevs, Depth);
    findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: RScevs, Depth);

    // See if we need to freeze our fork...
    bool NeedsFreeze =
        any_of(Range&: LScevs, P: UndefPoisonCheck) || any_of(Range&: RScevs, P: UndefPoisonCheck);

    // Check that we only have a single fork, on either the left or right side.
    // Copy the SCEV across for the one without a fork in order to generate
    // the full SCEV for both sides of the BinOp.
    if (LScevs.size() == 2 && RScevs.size() == 1)
      RScevs.push_back(Elt: RScevs[0]);
    else if (RScevs.size() == 2 && LScevs.size() == 1)
      LScevs.push_back(Elt: LScevs[0]);
    else {
      // Zero forks or more than one fork: give up and emit the plain SCEV.
      ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze);
      break;
    }

    for (auto [L, R] : zip(t&: LScevs, u&: RScevs))
      ScevList.emplace_back(Args: GetBinOpExpr(Opcode, get<0>(Pair: L), get<0>(Pair: R)),
                            Args&: NeedsFreeze);
    break;
  }
  default:
    // Just return the current SCEV if we haven't handled the instruction yet.
    LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
    ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
    break;
  }
}
1274
bool AccessAnalysis::createCheckForAccess(
    RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy,
    const DenseMap<Value *, const SCEV *> &StridesMap,
    DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop,
    unsigned &RunningDepId, unsigned ASId, bool Assume) {
  Value *Ptr = Access.getPointer();
  ScalarEvolution *SE = PSE.getSE();
  assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");

  // Decompose the pointer: forked pointers (select/phi of two addresses)
  // yield two candidate SCEVs, anything else yields one.
  SmallVector<PointerIntPair<const SCEV *, 1, bool>> RTCheckPtrs;
  findForkedSCEVs(SE, L: TheLoop, Ptr, ScevList&: RTCheckPtrs, Depth: MaxForkedSCEVDepth);
  assert(!RTCheckPtrs.empty() &&
         "Must have some runtime-check pointer candidates");

  // RTCheckPtrs must have size 2 if there are forked pointers. Otherwise, there
  // are no forked pointers; replaceSymbolicStridesSCEV in this case.
  auto IsLoopInvariantOrAR =
      [&SE, &TheLoop](const PointerIntPair<const SCEV *, 1, bool> &P) {
        return SE->isLoopInvariant(S: P.getPointer(), L: TheLoop) ||
               isa<SCEVAddRecExpr>(Val: P.getPointer());
      };
  if (RTCheckPtrs.size() == 2 && all_of(Range&: RTCheckPtrs, P: IsLoopInvariantOrAR)) {
    LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n";
               for (const auto &[Idx, Q] : enumerate(RTCheckPtrs)) dbgs()
               << "\t(" << Idx << ") " << *Q.getPointer() << "\n");
  } else {
    // Unforked (or unusable fork): fall back to the single symbolic-stride
    // rewritten SCEV for Ptr. The 'false' bit means no freeze is needed.
    RTCheckPtrs = {{replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr), false}};
  }

  /// Check whether all pointers can participate in a runtime bounds check. They
  /// must either be invariant or non-wrapping affine AddRecs.
  for (auto &P : RTCheckPtrs) {
    // The bounds for loop-invariant pointer is trivial.
    if (SE->isLoopInvariant(S: P.getPointer(), L: TheLoop))
      continue;

    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: P.getPointer());
    // With Assume set, PSE may add predicates to coerce Ptr into an AddRec.
    if (!AR && Assume)
      AR = PSE.getAsAddRec(V: Ptr);
    if (!AR || !AR->isAffine())
      return false;

    // If there's only one option for Ptr, look it up after bounds and wrap
    // checking, because assumptions might have been added to PSE.
    if (RTCheckPtrs.size() == 1) {
      AR =
          cast<SCEVAddRecExpr>(Val: replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr));
      P.setPointer(AR);
    }

    // Wrapping pointers cannot be used for runtime bounds; isNoWrap may also
    // record new PSE assumptions when Assume is set.
    if (!isNoWrap(PSE, AR, Ptr: RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
                  L: TheLoop, Assume, DT))
      return false;
  }

  // All candidates are usable: assign each a dependence set id (shared with
  // the equivalence-class leader when one exists) and register the runtime
  // check.
  for (const auto &[PtrExpr, NeedsFreeze] : RTCheckPtrs) {
    // The id of the dependence set.
    unsigned DepId;

    if (DepCands.contains(V: Access)) {
      Value *Leader = DepCands.getLeaderValue(V: Access).getPointer();
      unsigned &LeaderId = DepSetId[Leader];
      if (!LeaderId)
        LeaderId = RunningDepId++;
      DepId = LeaderId;
    } else
      // Each access has its own dependence set.
      DepId = RunningDepId++;

    bool IsWrite = Access.getInt();
    RtCheck.insert(Lp: TheLoop, Ptr, PtrExpr, AccessTy, WritePtr: IsWrite, DepSetId: DepId, ASId, PSE,
                   NeedsFreeze);
    LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
  }

  return true;
}
1352
1353bool AccessAnalysis::canCheckPtrAtRT(
1354 RuntimePointerChecking &RtCheck, Loop *TheLoop,
1355 const DenseMap<Value *, const SCEV *> &StridesMap, Value *&UncomputablePtr,
1356 bool AllowPartial, const MemoryDepChecker &DepChecker) {
1357 // Find pointers with computable bounds. We are going to use this information
1358 // to place a runtime bound check.
1359 bool CanDoRT = true;
1360
1361 bool MayNeedRTCheck = false;
1362 if (!IsRTCheckAnalysisNeeded) return true;
1363
1364 if (auto *Deps = DepChecker.getDependences()) {
1365 // If there are unknown dependences, this means runtime checks are needed to
1366 // ensure there's no overlap between accesses to the same underlying object.
1367 // Remove the equivalence classes containing both source and destination
1368 // accesses from DepCands. This ensures runtime checks will be generated
1369 // between those accesses and prevents them from being grouped together.
1370 for (const auto &Dep : *Deps) {
1371 if (Dep.Type != MemoryDepChecker::Dependence::Unknown) {
1372 assert(MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) ==
1373 MemoryDepChecker::VectorizationSafetyStatus::Safe &&
1374 "Should only skip safe dependences");
1375 continue;
1376 }
1377 Instruction *Src = Dep.getSource(DepChecker);
1378 Instruction *Dst = Dep.getDestination(DepChecker);
1379 DepCands.eraseClass(V: {getPointerOperand(V: Src), Src->mayWriteToMemory()});
1380 DepCands.eraseClass(V: {getPointerOperand(V: Dst), Dst->mayWriteToMemory()});
1381 }
1382 } else {
1383 CheckDeps.clear();
1384 DepCands = {};
1385 }
1386
1387 // We assign a consecutive id to access from different alias sets.
1388 // Accesses between different groups doesn't need to be checked.
1389 unsigned ASId = 0;
1390 for (const auto &AS : AST) {
1391 int NumReadPtrChecks = 0;
1392 int NumWritePtrChecks = 0;
1393 bool CanDoAliasSetRT = true;
1394 ++ASId;
1395 auto ASPointers = AS.getPointers();
1396
1397 // We assign consecutive id to access from different dependence sets.
1398 // Accesses within the same set don't need a runtime check.
1399 unsigned RunningDepId = 1;
1400 DenseMap<Value *, unsigned> DepSetId;
1401
1402 SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;
1403
1404 // First, count how many write and read accesses are in the alias set. Also
1405 // collect MemAccessInfos for later.
1406 SmallVector<MemAccessInfo, 4> AccessInfos;
1407 for (const Value *ConstPtr : ASPointers) {
1408 Value *Ptr = const_cast<Value *>(ConstPtr);
1409 bool IsWrite = Accesses.contains(Key: MemAccessInfo(Ptr, true));
1410 if (IsWrite)
1411 ++NumWritePtrChecks;
1412 else
1413 ++NumReadPtrChecks;
1414 AccessInfos.emplace_back(Args&: Ptr, Args&: IsWrite);
1415 }
1416
1417 // We do not need runtime checks for this alias set, if there are no writes
1418 // or a single write and no reads.
1419 if (NumWritePtrChecks == 0 ||
1420 (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
1421 assert((ASPointers.size() <= 1 ||
1422 all_of(ASPointers,
1423 [this](const Value *Ptr) {
1424 MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
1425 true);
1426 return !DepCands.contains(AccessWrite);
1427 })) &&
1428 "Can only skip updating CanDoRT below, if all entries in AS "
1429 "are reads or there is at most 1 entry");
1430 continue;
1431 }
1432
1433 for (auto &Access : AccessInfos) {
1434 for (const auto &AccessTy : Accesses[Access]) {
1435 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1436 DepSetId, TheLoop, RunningDepId, ASId,
1437 Assume: false)) {
1438 LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
1439 << *Access.getPointer() << '\n');
1440 Retries.emplace_back(Args&: Access, Args: AccessTy);
1441 CanDoAliasSetRT = false;
1442 }
1443 }
1444 }
1445
1446 // Note that this function computes CanDoRT and MayNeedRTCheck
1447 // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
1448 // we have a pointer for which we couldn't find the bounds but we don't
1449 // actually need to emit any checks so it does not matter.
1450 //
1451 // We need runtime checks for this alias set, if there are at least 2
1452 // dependence sets (in which case RunningDepId > 2) or if we need to re-try
1453 // any bound checks (because in that case the number of dependence sets is
1454 // incomplete).
1455 bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
1456
1457 // We need to perform run-time alias checks, but some pointers had bounds
1458 // that couldn't be checked.
1459 if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
1460 // Reset the CanDoSetRt flag and retry all accesses that have failed.
1461 // We know that we need these checks, so we can now be more aggressive
1462 // and add further checks if required (overflow checks).
1463 CanDoAliasSetRT = true;
1464 for (const auto &[Access, AccessTy] : Retries) {
1465 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1466 DepSetId, TheLoop, RunningDepId, ASId,
1467 /*Assume=*/true)) {
1468 CanDoAliasSetRT = false;
1469 UncomputablePtr = Access.getPointer();
1470 if (!AllowPartial)
1471 break;
1472 }
1473 }
1474 }
1475
1476 CanDoRT &= CanDoAliasSetRT;
1477 MayNeedRTCheck |= NeedsAliasSetRTCheck;
1478 ++ASId;
1479 }
1480
1481 // If the pointers that we would use for the bounds comparison have different
1482 // address spaces, assume the values aren't directly comparable, so we can't
1483 // use them for the runtime check. We also have to assume they could
1484 // overlap. In the future there should be metadata for whether address spaces
1485 // are disjoint.
1486 unsigned NumPointers = RtCheck.Pointers.size();
1487 for (unsigned i = 0; i < NumPointers; ++i) {
1488 for (unsigned j = i + 1; j < NumPointers; ++j) {
1489 // Only need to check pointers between two different dependency sets.
1490 if (RtCheck.Pointers[i].DependencySetId ==
1491 RtCheck.Pointers[j].DependencySetId)
1492 continue;
1493 // Only need to check pointers in the same alias set.
1494 if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
1495 continue;
1496
1497 Value *PtrI = RtCheck.Pointers[i].PointerValue;
1498 Value *PtrJ = RtCheck.Pointers[j].PointerValue;
1499
1500 unsigned ASi = PtrI->getType()->getPointerAddressSpace();
1501 unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
1502 if (ASi != ASj) {
1503 LLVM_DEBUG(
1504 dbgs() << "LAA: Runtime check would require comparison between"
1505 " different address spaces\n");
1506 return false;
1507 }
1508 }
1509 }
1510
1511 if (MayNeedRTCheck && (CanDoRT || AllowPartial))
1512 RtCheck.generateChecks(DepCands);
1513
1514 LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
1515 << " pointer comparisons.\n");
1516
1517 // If we can do run-time checks, but there are no checks, no runtime checks
1518 // are needed. This can happen when all pointers point to the same underlying
1519 // object for example.
1520 RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
1521
1522 bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
1523 assert(CanDoRTIfNeeded == (CanDoRT || !MayNeedRTCheck) &&
1524 "CanDoRTIfNeeded depends on RtCheck.Need");
1525 if (!CanDoRTIfNeeded && !AllowPartial)
1526 RtCheck.reset();
1527 return CanDoRTIfNeeded;
1528}
1529
/// Partition the recorded accesses into dependency-candidate equivalence
/// classes and collect the accesses that need a dependence check.
void AccessAnalysis::processMemAccesses() {
  // We process the set twice: first we process read-write pointers, last we
  // process read-only pointers. This allows us to skip dependence tests for
  // read-only pointers.

  LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
  LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
  LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
  LLVM_DEBUG({
    for (const auto &[A, _] : Accesses)
      dbgs() << "\t" << *A.getPointer() << " ("
             << (A.getInt()
                     ? "write"
                     : (ReadOnlyPtr.contains(A.getPointer()) ? "read-only"
                                                             : "read"))
             << ")\n";
  });

  // The AliasSetTracker has nicely partitioned our pointers by metadata
  // compatibility and potential for underlying-object overlap. As a result, we
  // only need to check for potential pointer dependencies within each alias
  // set.
  for (const auto &AS : AST) {
    // Note that both the alias-set tracker and the alias sets themselves used
    // ordered collections internally and so the iteration order here is
    // deterministic.
    auto ASPointers = AS.getPointers();

    // Whether a write to any pointer in this alias set has been seen yet;
    // reads encountered before the first write need no dependence check.
    bool SetHasWrite = false;

    // Map of (pointer to underlying objects, accessed address space) to last
    // access encountered.
    typedef DenseMap<std::pair<const Value *, unsigned>, MemAccessInfo>
        UnderlyingObjToAccessMap;
    UnderlyingObjToAccessMap ObjToLastAccess;

    // Set of access to check after all writes have been processed.
    PtrAccessMap DeferredAccesses;

    // Iterate over each alias set twice, once to process read/write pointers,
    // and then to process read-only pointers.
    for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
      bool UseDeferred = SetIteration > 0;
      // The second iteration only revisits accesses deferred by the first.
      PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;

      for (const Value *ConstPtr : ASPointers) {
        Value *Ptr = const_cast<Value *>(ConstPtr);

        // For a single memory access in AliasSetTracker, Accesses may contain
        // both read and write, and they both need to be handled for CheckDeps.
        for (const auto &[AC, _] : S) {
          if (AC.getPointer() != Ptr)
            continue;

          bool IsWrite = AC.getInt();

          // If we're using the deferred access set, then it contains only
          // reads.
          bool IsReadOnlyPtr = ReadOnlyPtr.contains(Ptr) && !IsWrite;
          if (UseDeferred && !IsReadOnlyPtr)
            continue;
          // Otherwise, the pointer must be in the PtrAccessSet, either as a
          // read or a write.
          assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
                  S.contains(MemAccessInfo(Ptr, false))) &&
                 "Alias-set pointer not in the access set?");

          MemAccessInfo Access(Ptr, IsWrite);
          // Every access starts in its own dependency-candidate class; the
          // classes are unioned below via shared underlying objects.
          DepCands.insert(Data: Access);

          // Memorize read-only pointers for later processing and skip them in
          // the first round (they need to be checked after we have seen all
          // write pointers). Note: we also mark pointer that are not
          // consecutive as "read-only" pointers (so that we check
          // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
          if (!UseDeferred && IsReadOnlyPtr) {
            // We only use the pointer keys, the types vector values don't
            // matter.
            DeferredAccesses.insert(KV: {Access, {}});
            continue;
          }

          // If this is a write - check other reads and writes for conflicts. If
          // this is a read only check other writes for conflicts (but only if
          // there is no other write to the ptr - this is an optimization to
          // catch "a[i] = a[i] + " without having to do a dependence check).
          if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
            CheckDeps.push_back(Elt: Access);
            IsRTCheckAnalysisNeeded = true;
          }

          if (IsWrite)
            SetHasWrite = true;

          // Create sets of pointers connected by a shared alias set and
          // underlying object.
          SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
          // Reset any objects cached for Ptr by a previous computation.
          UOs = {};
          ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI);
          LLVM_DEBUG(dbgs()
                     << "Underlying objects for pointer " << *Ptr << "\n");
          for (const Value *UnderlyingObj : UOs) {
            // nullptr never alias, don't join sets for pointer that have "null"
            // in their UnderlyingObjects list.
            if (isa<ConstantPointerNull>(Val: UnderlyingObj) &&
                !NullPointerIsDefined(
                    F: TheLoop->getHeader()->getParent(),
                    AS: UnderlyingObj->getType()->getPointerAddressSpace()))
              continue;

            auto [It, Inserted] = ObjToLastAccess.try_emplace(
                Key: {UnderlyingObj,
                     cast<PointerType>(Val: Ptr->getType())->getAddressSpace()},
                Args&: Access);
            if (!Inserted) {
              // Chain accesses that share an underlying object (and address
              // space) into a single dependency-candidate class.
              DepCands.unionSets(V1: Access, V2: It->second);
              It->second = Access;
            }

            LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
          }
        }
      }
    }
  }
}
1656
/// Check whether the access through \p Ptr has a constant stride.
///
/// Returns 0 if \p Ptr is loop-invariant in \p Lp, the constant stride
/// computed from the pointer's AddRec if one exists (and, when
/// \p ShouldCheckWrap is set, the pointer provably does not wrap), or
/// std::nullopt otherwise.
std::optional<int64_t>
llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
                   const Loop *Lp, const DominatorTree &DT,
                   const DenseMap<Value *, const SCEV *> &StridesMap,
                   bool Assume, bool ShouldCheckWrap) {
  // First rewrite symbolic strides from StridesMap into the SCEV so pointers
  // with a symbolic stride still get an analyzable form.
  const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr);
  if (PSE.getSE()->isLoopInvariant(S: PtrScev, L: Lp))
    return 0;

  assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");

  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev);
  // If allowed, retry with runtime assumptions (predicates) added to PSE.
  if (Assume && !AR)
    AR = PSE.getAsAddRec(V: Ptr);

  if (!AR) {
    LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
                      << " SCEV: " << *PtrScev << "\n");
    return std::nullopt;
  }

  std::optional<int64_t> Stride =
      getStrideFromAddRec(AR, Lp, AccessTy, Ptr, PSE);
  if (!ShouldCheckWrap || !Stride)
    return Stride;

  // A stride is only usable if the pointer cannot wrap around the address
  // space; otherwise distances derived from it are meaningless.
  if (isNoWrap(PSE, AR, Ptr, AccessTy, L: Lp, Assume, DT, Stride))
    return Stride;

  LLVM_DEBUG(
      dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
             << *Ptr << " SCEV: " << *AR << "\n");
  return std::nullopt;
}
1692
/// Compute the distance from \p PtrA to \p PtrB (i.e. PtrB - PtrA), in units
/// of \p ElemTyA's store size. Returns std::nullopt when the distance cannot
/// be determined as a constant, when address spaces mismatch, or (with
/// \p StrictCheck) when the byte distance is not an exact multiple of the
/// element size.
std::optional<int64_t> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
                                             Type *ElemTyB, Value *PtrB,
                                             const DataLayout &DL,
                                             ScalarEvolution &SE,
                                             bool StrictCheck, bool CheckType) {
  assert(PtrA && PtrB && "Expected non-nullptr pointers.");

  // Make sure that A and B are different pointers.
  if (PtrA == PtrB)
    return 0;

  // Make sure that the element types are the same if required.
  if (CheckType && ElemTyA != ElemTyB)
    return std::nullopt;

  unsigned ASA = PtrA->getType()->getPointerAddressSpace();
  unsigned ASB = PtrB->getType()->getPointerAddressSpace();

  // Check that the address spaces match.
  if (ASA != ASB)
    return std::nullopt;
  unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA);

  // Strip constant offsets from both pointers to try to reach a common base.
  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
  const Value *PtrA1 = PtrA->stripAndAccumulateConstantOffsets(
      DL, Offset&: OffsetA, /*AllowNonInbounds=*/true);
  const Value *PtrB1 = PtrB->stripAndAccumulateConstantOffsets(
      DL, Offset&: OffsetB, /*AllowNonInbounds=*/true);

  std::optional<int64_t> Val;
  if (PtrA1 == PtrB1) {
    // Same base pointer: the distance is the difference of the accumulated
    // constant offsets.
    // Retrieve the address space again as pointer stripping now tracks through
    // `addrspacecast`.
    ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace();
    ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace();
    // Check that the address spaces match and that the pointers are valid.
    if (ASA != ASB)
      return std::nullopt;

    IdxWidth = DL.getIndexSizeInBits(AS: ASA);
    OffsetA = OffsetA.sextOrTrunc(width: IdxWidth);
    OffsetB = OffsetB.sextOrTrunc(width: IdxWidth);

    OffsetB -= OffsetA;
    Val = OffsetB.trySExtValue();
  } else {
    // Otherwise compute the distance with SCEV between the base pointers.
    const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA);
    const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB);
    std::optional<APInt> Diff =
        SE.computeConstantDifference(LHS: PtrSCEVB, RHS: PtrSCEVA);
    if (!Diff)
      return std::nullopt;
    Val = Diff->trySExtValue();
  }

  if (!Val)
    return std::nullopt;

  // Convert the byte distance into a distance in elements of ElemTyA.
  int64_t Size = DL.getTypeStoreSize(Ty: ElemTyA);
  int64_t Dist = *Val / Size;

  // Ensure that the calculated distance matches the type-based one after all
  // the bitcasts removal in the provided pointers.
  if (!StrictCheck || Dist * Size == Val)
    return Dist;
  return std::nullopt;
}
1761
1762bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
1763 const DataLayout &DL, ScalarEvolution &SE,
1764 SmallVectorImpl<unsigned> &SortedIndices) {
1765 assert(llvm::all_of(
1766 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1767 "Expected list of pointer operands.");
1768 // Walk over the pointers, and map each of them to an offset relative to
1769 // first pointer in the array.
1770 Value *Ptr0 = VL[0];
1771
1772 using DistOrdPair = std::pair<int64_t, unsigned>;
1773 auto Compare = llvm::less_first();
1774 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
1775 Offsets.emplace(args: 0, args: 0);
1776 bool IsConsecutive = true;
1777 for (auto [Idx, Ptr] : drop_begin(RangeOrContainer: enumerate(First&: VL))) {
1778 std::optional<int64_t> Diff =
1779 getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE,
1780 /*StrictCheck=*/true);
1781 if (!Diff)
1782 return false;
1783
1784 // Check if the pointer with the same offset is found.
1785 int64_t Offset = *Diff;
1786 auto [It, IsInserted] = Offsets.emplace(args&: Offset, args&: Idx);
1787 if (!IsInserted)
1788 return false;
1789 // Consecutive order if the inserted element is the last one.
1790 IsConsecutive &= std::next(x: It) == Offsets.end();
1791 }
1792 SortedIndices.clear();
1793 if (!IsConsecutive) {
1794 // Fill SortedIndices array only if it is non-consecutive.
1795 SortedIndices.resize(N: VL.size());
1796 for (auto [Idx, Off] : enumerate(First&: Offsets))
1797 SortedIndices[Idx] = Off.second;
1798 }
1799 return true;
1800}
1801
1802/// Returns true if the memory operations \p A and \p B are consecutive.
1803bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
1804 ScalarEvolution &SE, bool CheckType) {
1805 Value *PtrA = getLoadStorePointerOperand(V: A);
1806 Value *PtrB = getLoadStorePointerOperand(V: B);
1807 if (!PtrA || !PtrB)
1808 return false;
1809 Type *ElemTyA = getLoadStoreType(I: A);
1810 Type *ElemTyB = getLoadStoreType(I: B);
1811 std::optional<int64_t> Diff =
1812 getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
1813 /*StrictCheck=*/true, CheckType);
1814 return Diff == 1;
1815}
1816
/// Record the store \p SI: visitPointers invokes the callback for each
/// pointer reached from the store's pointer operand, registering each as a
/// write at the current access index.
void MemoryDepChecker::addAccess(StoreInst *SI) {
  visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop,
                AddPointer: [this, SI](Value *Ptr) {
                  // Keyed by (pointer, is-write); InstMap records the
                  // instruction for each access index in program order.
                  Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx);
                  InstMap.push_back(Elt: SI);
                  ++AccessIdx;
                });
}
1825
/// Record the load \p LI: visitPointers invokes the callback for each
/// pointer reached from the load's pointer operand, registering each as a
/// read at the current access index.
void MemoryDepChecker::addAccess(LoadInst *LI) {
  visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop,
                AddPointer: [this, LI](Value *Ptr) {
                  // Keyed by (pointer, is-write); InstMap records the
                  // instruction for each access index in program order.
                  Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx);
                  InstMap.push_back(Elt: LI);
                  ++AccessIdx;
                });
}
1834
/// Map a dependence type onto the three-way vectorization safety verdict.
MemoryDepChecker::VectorizationSafetyStatus
MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
  switch (Type) {
  // These dependence kinds never block vectorization.
  case NoDep:
  case Forward:
  case BackwardVectorizable:
    return VectorizationSafetyStatus::Safe;

  // Unknown dependences may still be disambiguated with runtime checks.
  case Unknown:
    return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
  // All remaining kinds make vectorization unsafe outright.
  case ForwardButPreventsForwarding:
  case Backward:
  case BackwardVectorizableButPreventsForwarding:
  case IndirectUnsafe:
    return VectorizationSafetyStatus::Unsafe;
  }
  llvm_unreachable("unexpected DepType!");
}
1853
/// Return true if this dependence is of one of the backward kinds (the sink
/// accesses memory the earlier source will touch in a later iteration).
bool MemoryDepChecker::Dependence::isBackward() const {
  switch (Type) {
  case NoDep:
  case Forward:
  case ForwardButPreventsForwarding:
  case Unknown:
  case IndirectUnsafe:
    return false;

  case BackwardVectorizable:
  case Backward:
  case BackwardVectorizableButPreventsForwarding:
    return true;
  }
  llvm_unreachable("unexpected DepType!");
}
1870
/// Conservative superset of isBackward(): also treat dependences whose
/// direction could not be determined (Unknown, IndirectUnsafe) as possibly
/// backward.
bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
  return isBackward() || Type == Unknown || Type == IndirectUnsafe;
}
1874
/// Return true if this dependence is of one of the forward kinds.
bool MemoryDepChecker::Dependence::isForward() const {
  switch (Type) {
  case Forward:
  case ForwardButPreventsForwarding:
    return true;

  case NoDep:
  case Unknown:
  case BackwardVectorizable:
  case Backward:
  case BackwardVectorizableButPreventsForwarding:
  case IndirectUnsafe:
    return false;
  }
  llvm_unreachable("unexpected DepType!");
}
1891
/// Return true if a dependence at byte distance \p Distance could stall
/// store-to-load forwarding for every feasible power-of-two vector factor.
/// As a side effect, may tighten MaxStoreLoadForwardSafeDistanceInBits when
/// only some VFs are problematic.
bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
                                                    uint64_t TypeByteSize,
                                                    unsigned CommonStride) {
  // If loads occur at a distance that is not a multiple of a feasible vector
  // factor store-load forwarding does not take place.
  // Positive dependences might cause troubles because vectorizing them might
  // prevent store-load forwarding making vectorized code run a lot slower.
  //   a[i] = a[i-3] ^ a[i-8];
  // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
  // hence on your typical architecture store-load forwarding does not take
  // place. Vectorizing in such cases does not make sense.
  // Store-load forwarding distance.

  // After this many iterations store-to-load forwarding conflicts should not
  // cause any slowdowns.
  const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
  // Maximum vector factor.
  uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
      std::min(a: VectorizerParams::MaxVectorWidth * TypeByteSize,
               b: MaxStoreLoadForwardSafeDistanceInBits);

  // Compute the smallest VF at which the store and load would be misaligned.
  for (uint64_t VF = 2 * TypeByteSize;
       VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
    // If the number of vector iteration between the store and the load are
    // small we could incur conflicts.
    if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
      // Record the largest conflict-free VF and stop searching.
      MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
      break;
    }
  }

  // Even the smallest candidate VF (two elements) conflicts: report failure.
  if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) {
    LLVM_DEBUG(
        dbgs() << "LAA: Distance " << Distance
               << " that could cause a store-load forwarding conflict\n");
    return true;
  }

  // Only some VFs conflict: remember the tightened safe distance so later
  // queries can cap the vector factor accordingly.
  if (CommonStride &&
      MaxVFWithoutSLForwardIssuesPowerOf2 <
          MaxStoreLoadForwardSafeDistanceInBits &&
      MaxVFWithoutSLForwardIssuesPowerOf2 !=
          VectorizerParams::MaxVectorWidth * TypeByteSize) {
    uint64_t MaxVF =
        bit_floor(Value: MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride);
    uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
    MaxStoreLoadForwardSafeDistanceInBits =
        std::min(a: MaxStoreLoadForwardSafeDistanceInBits, b: MaxVFInBits);
  }
  return false;
}
1944
1945void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1946 if (Status < S)
1947 Status = S;
1948}
1949
/// Given a dependence-distance \p Dist between two memory accesses, that have
/// strides in the same direction whose absolute value of the maximum stride is
/// given in \p MaxStride, in a loop whose maximum backedge taken count is \p
/// MaxBTC, check if it is possible to prove statically that the dependence
/// distance is larger than the range that the accesses will travel through the
/// execution of the loop. If so, return true; false otherwise. This is useful
/// for example in loops such as the following (PR31098):
///
///   for (i = 0; i < D; ++i) {
///     = out[i];
///     out[i+D] =
///   }
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
                                     const SCEV &MaxBTC, const SCEV &Dist,
                                     uint64_t MaxStride) {

  // If we can prove that
  //      (**) |Dist| > MaxBTC * Step
  // where Step is the absolute stride of the memory accesses in bytes,
  // then there is no dependence.
  //
  // Rationale:
  // We basically want to check if the absolute distance (|Dist/Step|)
  // is >= the loop iteration count (or > MaxBTC).
  // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
  // Section 4.2.1); Note, that for vectorization it is sufficient to prove
  // that the dependence distance is >= VF; This is checked elsewhere.
  // But in some cases we can prune dependence distances early, and
  // even before selecting the VF, and without a runtime test, by comparing
  // the distance against the loop iteration count. Since the vectorized code
  // will be executed only if LoopCount >= VF, proving distance >= LoopCount
  // also guarantees that distance >= VF.
  //
  const SCEV *Step = SE.getConstant(Ty: MaxBTC.getType(), V: MaxStride);
  const SCEV *Product = SE.getMulExpr(LHS: &MaxBTC, RHS: Step);

  // Widen the narrower of Dist and Product to a common type before comparing.
  const SCEV *CastedDist = &Dist;
  const SCEV *CastedProduct = Product;
  uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType());
  uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType());

  // The dependence distance can be positive/negative, so we sign extend Dist;
  // The multiplication of the absolute stride in bytes and the
  // backedgeTakenCount is non-negative, so we zero extend Product.
  if (DistTypeSizeBits > ProductTypeSizeBits)
    CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType());
  else
    CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType());

  // Is  Dist - (MaxBTC * Step) > 0 ?
  // (If so, then we have proven (**) because |Dist| >= Dist)
  const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct);
  if (SE.isKnownPositive(S: Minus))
    return true;

  // Second try: Is  -Dist - (MaxBTC * Step) > 0 ?
  // (If so, then we have proven (**) because |Dist| >= -1*Dist)
  const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist);
  Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct);
  return SE.isKnownPositive(S: Minus);
}
2011
2012/// Check the dependence for two accesses with the same stride \p Stride.
2013/// \p Distance is the positive distance in bytes, and \p TypeByteSize is type
2014/// size in bytes.
2015///
2016/// \returns true if they are independent.
2017static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
2018 uint64_t TypeByteSize) {
2019 assert(Stride > 1 && "The stride must be greater than 1");
2020 assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
2021 assert(Distance > 0 && "The distance must be non-zero");
2022
2023 // Skip if the distance is not multiple of type byte size.
2024 if (Distance % TypeByteSize)
2025 return false;
2026
2027 // No dependence if the distance is not multiple of the stride.
2028 // E.g.
2029 // for (i = 0; i < 1024 ; i += 4)
2030 // A[i+2] = A[i] + 1;
2031 //
2032 // Two accesses in memory (distance is 2, stride is 4):
2033 // | A[0] | | | | A[4] | | | |
2034 // | | | A[2] | | | | A[6] | |
2035 //
2036 // E.g.
2037 // for (i = 0; i < 1024 ; i += 3)
2038 // A[i+4] = A[i] + 1;
2039 //
2040 // Two accesses in memory (distance is 4, stride is 3):
2041 // | A[0] | | | A[3] | | | A[6] | | |
2042 // | | | | | A[4] | | | A[7] | |
2043 return Distance % Stride;
2044}
2045
/// Return true if we can statically prove that the address ranges accessed
/// through \p Src and \p Sink over the whole loop do not overlap, i.e. one
/// access's range ends at or before the other's begins.
bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src,
                                                          Type *SrcTy,
                                                          const SCEV *Sink,
                                                          Type *SinkTy) {
  const SCEV *BTC = PSE.getBackedgeTakenCount();
  const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
  ScalarEvolution &SE = *PSE.getSE();
  // Compute the start and end bounds of each access across all loop
  // iterations; bail out if either bound cannot be computed.
  const auto &[SrcStart_, SrcEnd_] =
      getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Src, AccessTy: SrcTy, BTC, MaxBTC: SymbolicMaxBTC,
                              SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards);
  if (isa<SCEVCouldNotCompute>(Val: SrcStart_) || isa<SCEVCouldNotCompute>(Val: SrcEnd_))
    return false;

  const auto &[SinkStart_, SinkEnd_] =
      getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Sink, AccessTy: SinkTy, BTC, MaxBTC: SymbolicMaxBTC,
                              SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards);
  if (isa<SCEVCouldNotCompute>(Val: SinkStart_) ||
      isa<SCEVCouldNotCompute>(Val: SinkEnd_))
    return false;

  // Collect loop guards lazily; they may refine the bound comparisons below.
  if (!LoopGuards)
    LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE));

  // Is Src's range entirely before Sink's?
  auto SrcEnd = SE.applyLoopGuards(Expr: SrcEnd_, Guards: *LoopGuards);
  auto SinkStart = SE.applyLoopGuards(Expr: SinkStart_, Guards: *LoopGuards);
  if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SrcEnd, RHS: SinkStart))
    return true;

  // Otherwise, is Sink's range entirely before Src's?
  auto SinkEnd = SE.applyLoopGuards(Expr: SinkEnd_, Guards: *LoopGuards);
  auto SrcStart = SE.applyLoopGuards(Expr: SrcStart_, Guards: *LoopGuards);
  return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SinkEnd, RHS: SrcStart);
}
2078
/// Compute the dependence distance (Sink - Src), the scaled strides and the
/// access size for the dependence between \p A and \p B, or return an early
/// DepType verdict when no distance-based analysis is possible or needed.
std::variant<MemoryDepChecker::Dependence::DepType,
             MemoryDepChecker::DepDistanceStrideAndSizeInfo>
MemoryDepChecker::getDependenceDistanceStrideAndSize(
    const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
    const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) {
  const auto &DL = InnermostLoop->getHeader()->getDataLayout();
  auto &SE = *PSE.getSE();
  const auto &[APtr, AIsWrite] = A;
  const auto &[BPtr, BIsWrite] = B;

  // Two reads are independent.
  if (!AIsWrite && !BIsWrite)
    return MemoryDepChecker::Dependence::NoDep;

  Type *ATy = getLoadStoreType(I: AInst);
  Type *BTy = getLoadStoreType(I: BInst);

  // We cannot check pointers in different address spaces.
  if (APtr->getType()->getPointerAddressSpace() !=
      BPtr->getType()->getPointerAddressSpace())
    return MemoryDepChecker::Dependence::Unknown;

  std::optional<int64_t> StrideAPtr = getPtrStride(
      PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true);
  std::optional<int64_t> StrideBPtr = getPtrStride(
      PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true);

  const SCEV *Src = PSE.getSCEV(V: APtr);
  const SCEV *Sink = PSE.getSCEV(V: BPtr);

  // If the induction step is negative we have to invert source and sink of the
  // dependence when measuring the distance between them. We should not swap
  // AIsWrite with BIsWrite, as their uses expect them in program order.
  if (StrideAPtr && *StrideAPtr < 0) {
    std::swap(a&: Src, b&: Sink);
    std::swap(a&: AInst, b&: BInst);
    std::swap(a&: ATy, b&: BTy);
    std::swap(lhs&: StrideAPtr, rhs&: StrideBPtr);
  }

  const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src);

  LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
                    << "\n");
  LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
                    << ": " << *Dist << "\n");

  // Need accesses with constant strides and the same direction for further
  // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and
  // similar code or pointer arithmetic that could wrap in the address space.

  // If either Src or Sink are not strided (i.e. not a non-wrapping AddRec) and
  // not loop-invariant (stride will be 0 in that case), we cannot analyze the
  // dependence further and also cannot generate runtime checks.
  if (!StrideAPtr || !StrideBPtr) {
    LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
    return MemoryDepChecker::Dependence::IndirectUnsafe;
  }

  int64_t StrideAPtrInt = *StrideAPtr;
  int64_t StrideBPtrInt = *StrideBPtr;
  LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
                    << " Sink induction step: " << StrideBPtrInt << "\n");
  // At least Src or Sink are loop invariant and the other is strided or
  // invariant. We can generate a runtime check to disambiguate the accesses.
  if (!StrideAPtrInt || !StrideBPtrInt)
    return MemoryDepChecker::Dependence::Unknown;

  // Both Src and Sink have a constant stride, check if they are in the same
  // direction.
  if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) {
    LLVM_DEBUG(
        dbgs() << "Pointer access with strides in different directions\n");
    return MemoryDepChecker::Dependence::Unknown;
  }

  TypeSize AStoreSz = DL.getTypeStoreSize(Ty: ATy);
  TypeSize BStoreSz = DL.getTypeStoreSize(Ty: BTy);

  // If store sizes are not the same, set TypeByteSize to zero, so we can check
  // it in the caller isDependent.
  uint64_t ASz = DL.getTypeAllocSize(Ty: ATy);
  uint64_t BSz = DL.getTypeAllocSize(Ty: BTy);
  uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0;

  // Scale the element strides by the element sizes to get byte strides.
  uint64_t StrideAScaled = std::abs(i: StrideAPtrInt) * ASz;
  uint64_t StrideBScaled = std::abs(i: StrideBPtrInt) * BSz;

  uint64_t MaxStride = std::max(a: StrideAScaled, b: StrideBScaled);

  // CommonStride is only set when both scaled strides agree.
  std::optional<uint64_t> CommonStride;
  if (StrideAScaled == StrideBScaled)
    CommonStride = StrideAScaled;

  // TODO: Historically, we didn't retry with runtime checks when (unscaled)
  // strides were different but there is no inherent reason to.
  if (!isa<SCEVConstant>(Val: Dist))
    ShouldRetryWithRuntimeChecks |= StrideAPtrInt == StrideBPtrInt;

  // If distance is a SCEVCouldNotCompute, return Unknown immediately.
  if (isa<SCEVCouldNotCompute>(Val: Dist)) {
    LLVM_DEBUG(dbgs() << "LAA: Uncomputable distance.\n");
    return Dependence::Unknown;
  }

  return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
                                      TypeByteSize, AIsWrite, BIsWrite);
}
2187
2188MemoryDepChecker::Dependence::DepType
2189MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2190 const MemAccessInfo &B, unsigned BIdx) {
2191 assert(AIdx < BIdx && "Must pass arguments in program order");
2192
2193 // Check if we can prove that Sink only accesses memory after Src's end or
2194 // vice versa. The helper is used to perform the checks only on the exit paths
2195 // where it helps to improve the analysis result.
2196 auto CheckCompletelyBeforeOrAfter = [&]() {
2197 auto *APtr = A.getPointer();
2198 auto *BPtr = B.getPointer();
2199 Type *ATy = getLoadStoreType(I: InstMap[AIdx]);
2200 Type *BTy = getLoadStoreType(I: InstMap[BIdx]);
2201 const SCEV *Src = PSE.getSCEV(V: APtr);
2202 const SCEV *Sink = PSE.getSCEV(V: BPtr);
2203 return areAccessesCompletelyBeforeOrAfter(Src, SrcTy: ATy, Sink, SinkTy: BTy);
2204 };
2205
2206 // Get the dependence distance, stride, type size and what access writes for
2207 // the dependence between A and B.
2208 auto Res =
2209 getDependenceDistanceStrideAndSize(A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx]);
2210 if (std::holds_alternative<Dependence::DepType>(v: Res)) {
2211 if (std::get<Dependence::DepType>(v&: Res) == Dependence::Unknown &&
2212 CheckCompletelyBeforeOrAfter())
2213 return Dependence::NoDep;
2214 return std::get<Dependence::DepType>(v&: Res);
2215 }
2216
2217 auto &[Dist, MaxStride, CommonStride, TypeByteSize, AIsWrite, BIsWrite] =
2218 std::get<DepDistanceStrideAndSizeInfo>(v&: Res);
2219 bool HasSameSize = TypeByteSize > 0;
2220
2221 ScalarEvolution &SE = *PSE.getSE();
2222 auto &DL = InnermostLoop->getHeader()->getDataLayout();
2223
2224 // If the distance between the acecsses is larger than their maximum absolute
2225 // stride multiplied by the symbolic maximum backedge taken count (which is an
2226 // upper bound of the number of iterations), the accesses are independet, i.e.
2227 // they are far enough appart that accesses won't access the same location
2228 // across all loop ierations.
2229 if (HasSameSize &&
2230 isSafeDependenceDistance(
2231 DL, SE, MaxBTC: *(PSE.getSymbolicMaxBackedgeTakenCount()), Dist: *Dist, MaxStride))
2232 return Dependence::NoDep;
2233
2234 // The rest of this function relies on ConstDist being at most 64-bits, which
2235 // is checked earlier. Will assert if the calling code changes.
2236 const APInt *APDist = nullptr;
2237 uint64_t ConstDist =
2238 match(S: Dist, P: m_scev_APInt(C&: APDist)) ? APDist->abs().getZExtValue() : 0;
2239
2240 // Attempt to prove strided accesses independent.
2241 if (APDist) {
2242 // If the distance between accesses and their strides are known constants,
2243 // check whether the accesses interlace each other.
2244 if (ConstDist > 0 && CommonStride && CommonStride > 1 && HasSameSize &&
2245 areStridedAccessesIndependent(Distance: ConstDist, Stride: *CommonStride, TypeByteSize)) {
2246 LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
2247 return Dependence::NoDep;
2248 }
2249 } else {
2250 if (!LoopGuards)
2251 LoopGuards.emplace(
2252 args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE));
2253 Dist = SE.applyLoopGuards(Expr: Dist, Guards: *LoopGuards);
2254 }
2255
2256 // Negative distances are not plausible dependencies.
2257 if (SE.isKnownNonPositive(S: Dist)) {
2258 if (SE.isKnownNonNegative(S: Dist)) {
2259 if (HasSameSize) {
2260 // Write to the same location with the same size.
2261 return Dependence::Forward;
2262 }
2263 LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
2264 "different type sizes\n");
2265 return Dependence::Unknown;
2266 }
2267
2268 bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2269 // Check if the first access writes to a location that is read in a later
2270 // iteration, where the distance between them is not a multiple of a vector
2271 // factor and relatively small.
2272 //
2273 // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to
2274 // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a
2275 // forward dependency will allow vectorization using any width.
2276
2277 if (IsTrueDataDependence && EnableForwardingConflictDetection) {
2278 if (!ConstDist) {
2279 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2280 : Dependence::Unknown;
2281 }
2282 if (!HasSameSize ||
2283 couldPreventStoreLoadForward(Distance: ConstDist, TypeByteSize)) {
2284 LLVM_DEBUG(
2285 dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
2286 return Dependence::ForwardButPreventsForwarding;
2287 }
2288 }
2289
2290 LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
2291 return Dependence::Forward;
2292 }
2293
2294 int64_t MinDistance = SE.getSignedRangeMin(S: Dist).getSExtValue();
2295 // Below we only handle strictly positive distances.
2296 if (MinDistance <= 0) {
2297 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2298 : Dependence::Unknown;
2299 }
2300
2301 if (!HasSameSize) {
2302 if (CheckCompletelyBeforeOrAfter())
2303 return Dependence::NoDep;
2304 LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2305 "different type sizes\n");
2306 return Dependence::Unknown;
2307 }
2308 // Bail out early if passed-in parameters make vectorization not feasible.
2309 unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
2310 VectorizerParams::VectorizationFactor : 1);
2311 unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
2312 VectorizerParams::VectorizationInterleave : 1);
2313 // The minimum number of iterations for a vectorized/unrolled version.
2314 unsigned MinNumIter = std::max(a: ForcedFactor * ForcedUnroll, b: 2U);
2315
2316 // It's not vectorizable if the distance is smaller than the minimum distance
2317 // needed for a vectroized/unrolled version. Vectorizing one iteration in
2318 // front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
2319 // (No need to plus the last gap distance).
2320 //
2321 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2322 // foo(int *A) {
2323 // int *B = (int *)((char *)A + 14);
2324 // for (i = 0 ; i < 1024 ; i += 2)
2325 // B[i] = A[i] + 1;
2326 // }
2327 //
2328 // Two accesses in memory (stride is 4 * 2):
2329 // | A[0] | | A[2] | | A[4] | | A[6] | |
2330 // | B[0] | | B[2] | | B[4] |
2331 //
2332 // MinDistance needs for vectorizing iterations except the last iteration:
2333 // 4 * 2 * (MinNumIter - 1). MinDistance needs for the last iteration: 4.
2334 // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
2335 //
2336 // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
2337 // 12, which is less than distance.
2338 //
2339 // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
2340 // the minimum distance needed is 28, which is greater than distance. It is
2341 // not safe to do vectorization.
2342 //
2343 // We use MaxStride (maximum of src and sink strides) to get a conservative
2344 // lower bound on the MinDistanceNeeded in case of different strides.
2345
2346 // We know that Dist is positive, but it may not be constant. Use the signed
2347 // minimum for computations below, as this ensures we compute the closest
2348 // possible dependence distance.
2349 uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
2350 if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
2351 if (!ConstDist) {
2352 // For non-constant distances, we checked the lower bound of the
2353 // dependence distance and the distance may be larger at runtime (and safe
2354 // for vectorization). Classify it as Unknown, so we re-try with runtime
2355 // checks, unless we can prove both accesses cannot overlap.
2356 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2357 : Dependence::Unknown;
2358 }
2359 LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance "
2360 << MinDistance << '\n');
2361 return Dependence::Backward;
2362 }
2363
2364 // Unsafe if the minimum distance needed is greater than smallest dependence
// distance.
2366 if (MinDistanceNeeded > MinDepDistBytes) {
2367 LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
2368 << MinDistanceNeeded << " size in bytes\n");
2369 return Dependence::Backward;
2370 }
2371
2372 MinDepDistBytes =
2373 std::min(a: static_cast<uint64_t>(MinDistance), b: MinDepDistBytes);
2374
2375 bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
2376 if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist &&
2377 couldPreventStoreLoadForward(Distance: MinDistance, TypeByteSize, CommonStride: *CommonStride))
2378 return Dependence::BackwardVectorizableButPreventsForwarding;
2379
2380 uint64_t MaxVF = MinDepDistBytes / MaxStride;
2381 LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
2382 << " with max VF = " << MaxVF << '\n');
2383
2384 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2385 if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) {
2386 // For non-constant distances, we checked the lower bound of the dependence
2387 // distance and the distance may be larger at runtime (and safe for
2388 // vectorization). Classify it as Unknown, so we re-try with runtime checks,
2389 // unless we can prove both accesses cannot overlap.
2390 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2391 : Dependence::Unknown;
2392 }
2393
2394 if (CheckCompletelyBeforeOrAfter())
2395 return Dependence::NoDep;
2396
2397 MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits);
2398 return Dependence::BackwardVectorizable;
2399}
2400
// Check every pair of possibly-aliasing accesses (grouped into equivalence
// classes in \p DepCands) for unsafe memory dependences, recording the
// dependences found (up to MaxDependences). Returns true if all dependences
// are safe for vectorization.
bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands,
                                   const MemAccessInfoList &CheckDeps) {

  MinDepDistBytes = -1;
  SmallPtrSet<MemAccessInfo, 8> Visited;
  for (MemAccessInfo CurAccess : CheckDeps) {
    // Each equivalence class is fully processed the first time one of its
    // members is encountered; skip later members.
    if (Visited.contains(Ptr: CurAccess))
      continue;

    // Check accesses within this set.
    EquivalenceClasses<MemAccessInfo>::member_iterator AI =
        DepCands.findLeader(V: CurAccess);
    EquivalenceClasses<MemAccessInfo>::member_iterator AE =
        DepCands.member_end();

    // Check every access pair.
    while (AI != AE) {
      Visited.insert(Ptr: *AI);
      bool AIIsWrite = AI->getInt();
      // Check loads only against next equivalent class, but stores also against
      // other stores in the same equivalence class - to the same address.
      EquivalenceClasses<MemAccessInfo>::member_iterator OI =
          (AIIsWrite ? AI : std::next(x: AI));
      while (OI != AE) {
        // Check every accessing instruction pair in program order.
        auto &Acc = Accesses[*AI];
        for (std::vector<unsigned>::iterator I1 = Acc.begin(), I1E = Acc.end();
             I1 != I1E; ++I1)
          // Scan all accesses of another equivalence class, but only the next
          // accesses of the same equivalent class.
          for (std::vector<unsigned>::iterator
                   I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()),
                   I2E = (OI == AI ? I1E : Accesses[*OI].end());
               I2 != I2E; ++I2) {
            auto A = std::make_pair(x: &*AI, y&: *I1);
            auto B = std::make_pair(x: &*OI, y&: *I2);

            // Normalize the pair so A is always the earlier access in
            // program order (indices are program-order positions).
            assert(*I1 != *I2);
            if (*I1 > *I2)
              std::swap(x&: A, y&: B);

            Dependence::DepType Type =
                isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second);
            mergeInStatus(S: Dependence::isSafeForVectorization(Type));

            // Gather dependences unless we accumulated MaxDependences
            // dependences. In that case return as soon as we find the first
            // unsafe dependence. This puts a limit on this quadratic
            // algorithm.
            if (RecordDependences) {
              if (Type != Dependence::NoDep)
                Dependences.emplace_back(Args&: A.second, Args&: B.second, Args&: Type);

              if (Dependences.size() >= MaxDependences) {
                RecordDependences = false;
                Dependences.clear();
                LLVM_DEBUG(dbgs()
                           << "Too many dependences, stopped recording\n");
              }
            }
            if (!RecordDependences && !isSafeForVectorization())
              return false;
          }
        ++OI;
      }
      ++AI;
    }
  }

  LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
  return isSafeForVectorization();
}
2473
2474SmallVector<Instruction *, 4>
2475MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const {
2476 MemAccessInfo Access(Ptr, IsWrite);
2477 auto I = Accesses.find(Val: Access);
2478 SmallVector<Instruction *, 4> Insts;
2479 if (I != Accesses.end()) {
2480 transform(Range: I->second, d_first: std::back_inserter(x&: Insts),
2481 F: [&](unsigned Idx) { return this->InstMap[Idx]; });
2482 }
2483
2484 return Insts;
2485}
2486
// Human-readable names for Dependence::DepType values, used by
// Dependence::print. The array is indexed directly by DepType, so the order
// here must stay in sync with the enum declaration.
const char *MemoryDepChecker::Dependence::DepName[] = {
    "NoDep",
    "Unknown",
    "IndirectUnsafe",
    "Forward",
    "ForwardButPreventsForwarding",
    "Backward",
    "BackwardVectorizable",
    "BackwardVectorizableButPreventsForwarding"};
2496
2497void MemoryDepChecker::Dependence::print(
2498 raw_ostream &OS, unsigned Depth,
2499 const SmallVectorImpl<Instruction *> &Instrs) const {
2500 OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n";
2501 OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n";
2502 OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n";
2503}
2504
// Check that the loop has a shape this analysis can handle: it must be an
// innermost loop with a single backedge and a computable symbolic-max
// backedge-taken count. Emits an analysis remark on failure.
bool LoopAccessInfo::canAnalyzeLoop() {
  // We need to have a loop header.
  LLVM_DEBUG(dbgs() << "\nLAA: Checking a loop in '"
                    << TheLoop->getHeader()->getParent()->getName() << "' from "
                    << TheLoop->getLocStr() << "\n");

  // We can only analyze innermost loops.
  if (!TheLoop->isInnermost()) {
    LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
    recordAnalysis(RemarkName: "NotInnerMostLoop") << "loop is not the innermost loop";
    return false;
  }

  // We must have a single backedge.
  if (TheLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(
        dbgs() << "LAA: loop control flow is not understood by analyzer\n");
    recordAnalysis(RemarkName: "CFGNotUnderstood")
        << "loop control flow is not understood by analyzer";
    return false;
  }

  // ScalarEvolution needs to be able to find the symbolic max backedge taken
  // count, which is an upper bound on the number of loop iterations. The loop
  // may execute fewer iterations, if it exits via an uncountable exit.
  const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount();
  if (isa<SCEVCouldNotCompute>(Val: ExitCount)) {
    recordAnalysis(RemarkName: "CantComputeNumberOfIterations")
        << "could not determine number of loop iterations";
    LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "LAA: Found an analyzable loop: "
                    << TheLoop->getHeader()->getName() << "\n");
  return true;
}
2542
// Collect all loads and stores in the loop, build the alias/dependence sets,
// and decide whether the loop's memory accesses are safe for vectorization —
// either unconditionally, or guarded by runtime pointer checks. Returns true
// if vectorization is (conditionally) safe; emits an analysis remark and
// returns false otherwise.
bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
                                 const TargetLibraryInfo *TLI,
                                 DominatorTree *DT) {
  // Holds the Load and Store instructions.
  SmallVector<LoadInst *, 16> Loads;
  SmallVector<StoreInst *, 16> Stores;
  // Alias scopes declared by llvm.experimental.noalias.scope.decl inside the
  // loop; passed to AccessAnalysis below.
  SmallPtrSet<MDNode *, 8> LoopAliasScopes;

  // Holds all the different accesses in the loop.
  unsigned NumReads = 0;
  unsigned NumReadWrites = 0;

  // Set when we see a memory instruction we cannot analyze (non-simple or
  // neither a plain load nor a plain store).
  bool HasComplexMemInst = false;

  // A runtime check is only legal to insert if there are no convergent calls.
  HasConvergentOp = false;

  PtrRtChecking->Pointers.clear();
  PtrRtChecking->Need = false;

  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();

  const bool EnableMemAccessVersioningOfLoop =
      EnableMemAccessVersioning &&
      !TheLoop->getHeader()->getParent()->hasOptSize();

  // Traverse blocks in fixed RPOT order, regardless of their storage in the
  // loop info, as it may be arbitrary.
  LoopBlocksRPO RPOT(TheLoop);
  RPOT.perform(LI);
  for (BasicBlock *BB : RPOT) {
    // Scan the BB and collect legal loads and stores. Also detect any
    // convergent instructions.
    for (Instruction &I : *BB) {
      if (auto *Call = dyn_cast<CallBase>(Val: &I)) {
        if (Call->isConvergent())
          HasConvergentOp = true;
      }

      // With both a non-vectorizable memory instruction and a convergent
      // operation, found in this loop, no reason to continue the search.
      if (HasComplexMemInst && HasConvergentOp)
        return false;

      // Avoid hitting recordAnalysis multiple times.
      if (HasComplexMemInst)
        continue;

      // Record alias scopes defined inside the loop.
      if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I))
        for (Metadata *Op : Decl->getScopeList()->operands())
          LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op));

      // Many math library functions read the rounding mode. We will only
      // vectorize a loop if it contains known function calls that don't set
      // the flag. Therefore, it is safe to ignore this read from memory.
      auto *Call = dyn_cast<CallInst>(Val: &I);
      if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI))
        continue;

      // If this is a load, save it. If this instruction can read from memory
      // but is not a load, we only allow it if it's a call to a function with a
      // vector mapping and no pointer arguments.
      if (I.mayReadFromMemory()) {
        auto hasPointerArgs = [](CallBase *CB) {
          return any_of(Range: CB->args(), P: [](Value const *Arg) {
            return Arg->getType()->isPointerTy();
          });
        };

        // If the function has an explicit vectorized counterpart, and does not
        // take output/input pointers, we can safely assume that it can be
        // vectorized.
        if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
            !hasPointerArgs(Call) && !VFDatabase::getMappings(CI: *Call).empty())
          continue;

        auto *Ld = dyn_cast<LoadInst>(Val: &I);
        if (!Ld) {
          recordAnalysis(RemarkName: "CantVectorizeInstruction", Instr: &I)
              << "instruction cannot be vectorized";
          HasComplexMemInst = true;
          continue;
        }
        if (!Ld->isSimple() && !IsAnnotatedParallel) {
          recordAnalysis(RemarkName: "NonSimpleLoad", Instr: Ld)
              << "read with atomic ordering or volatile read";
          LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
          HasComplexMemInst = true;
          continue;
        }
        NumLoads++;
        Loads.push_back(Elt: Ld);
        DepChecker->addAccess(LI: Ld);
        if (EnableMemAccessVersioningOfLoop)
          collectStridedAccess(LoadOrStoreInst: Ld);
        continue;
      }

      // Save 'store' instructions. Abort if other instructions write to memory.
      if (I.mayWriteToMemory()) {
        auto *St = dyn_cast<StoreInst>(Val: &I);
        if (!St) {
          recordAnalysis(RemarkName: "CantVectorizeInstruction", Instr: &I)
              << "instruction cannot be vectorized";
          HasComplexMemInst = true;
          continue;
        }
        if (!St->isSimple() && !IsAnnotatedParallel) {
          recordAnalysis(RemarkName: "NonSimpleStore", Instr: St)
              << "write with atomic ordering or volatile write";
          LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
          HasComplexMemInst = true;
          continue;
        }
        NumStores++;
        Stores.push_back(Elt: St);
        DepChecker->addAccess(SI: St);
        if (EnableMemAccessVersioningOfLoop)
          collectStridedAccess(LoadOrStoreInst: St);
      }
    } // Next instr.
  } // Next block.

  if (HasComplexMemInst)
    return false;

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
    return true;
  }

  MemoryDepChecker::DepCandidates DepCands;
  AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE,
                          LoopAliasScopes);

  // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  SmallSet<std::pair<Value *, Type *>, 16> Seen;

  // Record uniform store addresses to identify if we have multiple stores
  // to the same address.
  SmallPtrSet<Value *, 16> UniformStores;

  for (StoreInst *ST : Stores) {
    Value *Ptr = ST->getPointerOperand();

    if (isInvariant(V: Ptr)) {
      // Record store instructions to loop invariant addresses
      StoresToInvariantAddresses.push_back(Elt: ST);
      // A second store to the same invariant address is a store-store
      // dependence involving a loop-invariant address.
      HasStoreStoreDependenceInvolvingLoopInvariantAddress |=
          !UniformStores.insert(Ptr).second;
    }

    // If we did *not* see this pointer before, insert it to  the read-write
    // list. At this phase it is only a 'write' list.
    Type *AccessTy = getLoadStoreType(I: ST);
    if (Seen.insert(V: {Ptr, AccessTy}).second) {
      ++NumReadWrites;

      MemoryLocation Loc = MemoryLocation::get(SI: ST);
      // The TBAA metadata could have a control dependency on the predication
      // condition, so we cannot rely on it when determining whether or not we
      // need runtime pointer checks.
      if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT))
        Loc.AATags.TBAA = nullptr;

      visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop,
                    AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) {
                      MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr);
                      Accesses.addStore(Loc: NewLoc, AccessTy);
                    });
    }
  }

  if (IsAnnotatedParallel) {
    LLVM_DEBUG(
        dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
               << "checks.\n");
    return true;
  }

  for (LoadInst *LD : Loads) {
    Value *Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x; Because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    bool IsReadOnlyPtr = false;
    Type *AccessTy = getLoadStoreType(I: LD);
    if (Seen.insert(V: {Ptr, AccessTy}).second ||
        !getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: false,
                      ShouldCheckWrap: true)) {
      ++NumReads;
      IsReadOnlyPtr = true;
    }

    // See if there is an unsafe dependency between a load to a uniform
    // address and store to the same uniform address.
    if (UniformStores.contains(Ptr)) {
      LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
                           "load and uniform store to the same address!\n");
      HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
    }

    MemoryLocation Loc = MemoryLocation::get(LI: LD);
    // The TBAA metadata could have a control dependency on the predication
    // condition, so we cannot rely on it when determining whether or not we
    // need runtime pointer checks.
    if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT))
      Loc.AATags.TBAA = nullptr;

    visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop,
                  AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
                    MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr);
                    Accesses.addLoad(Loc: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr);
                  });
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then is it safe to vectorize.
  if (NumReadWrites == 1 && NumReads == 0) {
    LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
    return true;
  }

  // Build dependence sets and check whether we need a runtime pointer bounds
  // check.
  Accesses.buildDependenceSets();

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  Value *UncomputablePtr = nullptr;
  HasCompletePtrRtChecking =
      Accesses.canCheckPtrAtRT(RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides,
                               UncomputablePtr, AllowPartial, DepChecker: getDepChecker());
  if (!HasCompletePtrRtChecking) {
    const auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr);
    recordAnalysis(RemarkName: "CantIdentifyArrayBounds", Instr: I)
        << "cannot identify array bounds";
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
                      << "the array bounds.\n");
    return false;
  }

  LLVM_DEBUG(
    dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");

  bool DepsAreSafe = true;
  if (Accesses.isDependencyCheckNeeded()) {
    LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
    DepsAreSafe =
        DepChecker->areDepsSafe(DepCands, CheckDeps: Accesses.getDependenciesToCheck());

    // If the dependence checker failed, retry with a full set of runtime
    // pointer checks, which may prove the accesses independent at runtime.
    if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeChecks()) {
      LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");

      PtrRtChecking->reset();
      PtrRtChecking->Need = true;

      UncomputablePtr = nullptr;
      HasCompletePtrRtChecking = Accesses.canCheckPtrAtRT(
          RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, UncomputablePtr,
          AllowPartial, DepChecker: getDepChecker());

      // Check that we found the bounds for the pointer.
      if (!HasCompletePtrRtChecking) {
        auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr);
        recordAnalysis(RemarkName: "CantCheckMemDepsAtRunTime", Instr: I)
            << "cannot check memory dependencies at runtime";
        LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
        return false;
      }

      // Clear the dependency checks. They are no longer needed.
      Accesses.resetDepChecks(DepChecker&: *DepChecker);

      DepsAreSafe = true;
    }
  }

  if (HasConvergentOp) {
    recordAnalysis(RemarkName: "CantInsertRuntimeCheckWithConvergent")
        << "cannot add control dependency to convergent operation";
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
                         "would be needed with a convergent operation\n");
    return false;
  }

  if (DepsAreSafe) {
    LLVM_DEBUG(
        dbgs() << "LAA: No unsafe dependent memory operations in loop.  We"
               << (PtrRtChecking->Need ? "" : " don't")
               << " need runtime memory checks.\n");
    return true;
  }

  emitUnsafeDependenceRemark();
  return false;
}
2855
// Emit an optimization remark describing the first recorded dependence that
// is not safe for vectorization, including its kind and (when available) the
// source location of the conflicting memory access.
void LoopAccessInfo::emitUnsafeDependenceRemark() {
  const auto *Deps = getDepChecker().getDependences();
  // Dependence recording may have been disabled after hitting MaxDependences;
  // in that case there is nothing to report on.
  if (!Deps)
    return;
  const auto *Found =
      llvm::find_if(Range: *Deps, P: [](const MemoryDepChecker::Dependence &D) {
        return MemoryDepChecker::Dependence::isSafeForVectorization(Type: D.Type) !=
               MemoryDepChecker::VectorizationSafetyStatus::Safe;
      });
  if (Found == Deps->end())
    return;
  MemoryDepChecker::Dependence Dep = *Found;

  LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");

  // Emit remark for first unsafe dependence
  // Only suggest loop distribution if the user has not already forced it on
  // via the llvm.loop.distribute.enable metadata.
  bool HasForcedDistribution = false;
  std::optional<const MDOperand *> Value =
      findStringMetadataForLoop(TheLoop, Name: "llvm.loop.distribute.enable");
  if (Value) {
    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    HasForcedDistribution = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue();
  }

  const std::string Info =
      HasForcedDistribution
          ? "unsafe dependent memory operations in loop."
          : "unsafe dependent memory operations in loop. Use "
            "#pragma clang loop distribute(enable) to allow loop distribution "
            "to attempt to isolate the offending operations into a separate "
            "loop";
  OptimizationRemarkAnalysis &R =
      recordAnalysis(RemarkName: "UnsafeDep", Instr: Dep.getDestination(DepChecker: getDepChecker())) << Info;

  // Append a description of the dependence kind to the remark.
  switch (Dep.Type) {
  case MemoryDepChecker::Dependence::NoDep:
  case MemoryDepChecker::Dependence::Forward:
  case MemoryDepChecker::Dependence::BackwardVectorizable:
    llvm_unreachable("Unexpected dependence");
  case MemoryDepChecker::Dependence::Backward:
    R << "\nBackward loop carried data dependence.";
    break;
  case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
    R << "\nForward loop carried data dependence that prevents "
         "store-to-load forwarding.";
    break;
  case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
    R << "\nBackward loop carried data dependence that prevents "
         "store-to-load forwarding.";
    break;
  case MemoryDepChecker::Dependence::IndirectUnsafe:
    R << "\nUnsafe indirect dependence.";
    break;
  case MemoryDepChecker::Dependence::Unknown:
    R << "\nUnknown data dependence.";
    break;
  }

  // Point at the other end of the dependence, preferring the debug location
  // of the address computation when the access itself has none.
  if (Instruction *I = Dep.getSource(DepChecker: getDepChecker())) {
    DebugLoc SourceLoc = I->getDebugLoc();
    if (auto *DD = dyn_cast_or_null<Instruction>(Val: getPointerOperand(V: I)))
      SourceLoc = DD->getDebugLoc();
    if (SourceLoc)
      R << " Memory location is the same as accessed at "
        << ore::NV("Location", SourceLoc);
  }
}
2924
2925bool LoopAccessInfo::blockNeedsPredication(const BasicBlock *BB,
2926 const Loop *TheLoop,
2927 const DominatorTree *DT) {
2928 assert(TheLoop->contains(BB) && "Unknown block used");
2929
2930 // Blocks that do not dominate the latch need predication.
2931 const BasicBlock *Latch = TheLoop->getLoopLatch();
2932 return !DT->dominates(A: BB, B: Latch);
2933}
2934
2935OptimizationRemarkAnalysis &
2936LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
2937 assert(!Report && "Multiple reports generated");
2938
2939 const BasicBlock *CodeRegion = TheLoop->getHeader();
2940 DebugLoc DL = TheLoop->getStartLoc();
2941
2942 if (I) {
2943 CodeRegion = I->getParent();
2944 // If there is no debug location attached to the instruction, revert back to
2945 // using the loop's.
2946 if (I->getDebugLoc())
2947 DL = I->getDebugLoc();
2948 }
2949
2950 Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, args&: RemarkName,
2951 args&: DL, args&: CodeRegion);
2952 return *Report;
2953}
2954
2955bool LoopAccessInfo::isInvariant(Value *V) const {
2956 auto *SE = PSE->getSE();
2957 if (TheLoop->isLoopInvariant(V))
2958 return true;
2959 if (!SE->isSCEVable(Ty: V->getType()))
2960 return false;
2961 const SCEV *S = SE->getSCEV(V);
2962 return SE->isLoopInvariant(S, L: TheLoop);
2963}
2964
2965/// If \p Ptr is a GEP, which has a loop-variant operand, return that operand.
2966/// Otherwise, return \p Ptr.
2967static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE,
2968 Loop *Lp) {
2969 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr);
2970 if (!GEP)
2971 return Ptr;
2972
2973 Value *V = Ptr;
2974 for (const Use &U : GEP->operands()) {
2975 if (!SE->isLoopInvariant(S: SE->getSCEV(V: U), L: Lp)) {
2976 if (V == Ptr)
2977 V = U;
2978 else
2979 // There must be exactly one loop-variant operand.
2980 return Ptr;
2981 }
2982 }
2983 return V;
2984}
2985
/// Get the stride of a pointer access in a loop. Looks for symbolic
/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
  auto *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType());
  if (!PtrTy)
    return nullptr;

  // Try to remove a gep instruction to make the pointer (actually index at this
  // point) easier analyzable. If OrigPtr is equal to Ptr we are analyzing the
  // pointer, otherwise, we are analyzing the index.
  Value *OrigPtr = Ptr;

  Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp);
  const SCEV *V = SE->getSCEV(V: Ptr);

  if (Ptr != OrigPtr)
    // Strip off casts.
    while (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V))
      V = C->getOperand();

  // The access must be an affine AddRec in this loop. Note that the match
  // re-binds V to the AddRec's step (the stride) on success.
  if (!match(S: V, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V), L: m_SpecificLoop(L: Lp))))
    return nullptr;

  // Note that the restriction after this loop invariant check are only
  // profitability restrictions.
  if (!SE->isLoopInvariant(S: V, L: Lp))
    return nullptr;

  // Look for the loop invariant symbolic value.
  if (isa<SCEVUnknown>(Val: V))
    return V;

  // Look through multiplies that scale a stride by a constant. On success the
  // match re-binds V to the non-constant multiplicand.
  match(S: V, P: m_scev_Mul(Op0: m_SCEVConstant(), Op1: m_SCEV(V)));
  if (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V))
    if (isa<SCEVUnknown>(Val: C->getOperand()))
      return V;

  return nullptr;
}
3026
// If \p MemAccess is a load/store whose stride is a loop-invariant symbolic
// value, record the stride in SymbolicStrides so the access can later be
// versioned under a "Stride == 1" predicate.
void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
  Value *Ptr = getLoadStorePointerOperand(V: MemAccess);
  if (!Ptr)
    return;

  // Note: getStrideFromPointer is a *profitability* heuristic.  We
  // could broaden the scope of values returned here - to anything
  // which happens to be loop invariant and contributes to the
  // computation of an interesting IV - but we chose not to as we
  // don't have a cost model here, and broadening the scope exposes
  // far too many unprofitable cases.
  const SCEV *StrideExpr = getStrideFromPointer(Ptr, SE: PSE->getSE(), Lp: TheLoop);
  if (!StrideExpr)
    return;

  // An undef/poison stride cannot be meaningfully versioned on.
  if (match(S: StrideExpr, P: m_scev_UndefOrPoison()))
    return;

  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
                       "versioning:");
  LLVM_DEBUG(dbgs() << "  Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");

  if (!SpeculateUnitStride) {
    LLVM_DEBUG(dbgs() << "  Chose not to due to -laa-speculate-unit-stride\n");
    return;
  }

  // Avoid adding the "Stride == 1" predicate when we know that
  // Stride >= Trip-Count. Such a predicate will effectively optimize a single
  // or zero iteration loop, as Trip-Count <= Stride == 1.
  //
  // TODO: We are currently not making a very informed decision on when it is
  // beneficial to apply stride versioning. It might make more sense that the
  // users of this analysis (such as the vectorizer) will trigger it, based on
  // their specific cost considerations; For example, in cases where stride
  // versioning does not help resolving memory accesses/dependences, the
  // vectorizer should evaluate the cost of the runtime test, and the benefit
  // of various possible stride specializations, considering the alternatives
  // of using gather/scatters (if available).

  const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount();

  // Match the types so we can compare the stride and the MaxBTC.
  // The Stride can be positive/negative, so we sign extend Stride;
  // The backedgeTakenCount is non-negative, so we zero extend MaxBTC.
  const DataLayout &DL = TheLoop->getHeader()->getDataLayout();
  uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(Ty: StrideExpr->getType());
  uint64_t BETypeSizeBits = DL.getTypeSizeInBits(Ty: MaxBTC->getType());
  const SCEV *CastedStride = StrideExpr;
  const SCEV *CastedBECount = MaxBTC;
  ScalarEvolution *SE = PSE->getSE();
  if (BETypeSizeBits >= StrideTypeSizeBits)
    CastedStride = SE->getNoopOrSignExtend(V: StrideExpr, Ty: MaxBTC->getType());
  else
    CastedBECount = SE->getZeroExtendExpr(Op: MaxBTC, Ty: StrideExpr->getType());
  const SCEV *StrideMinusBETaken = SE->getMinusSCEV(LHS: CastedStride, RHS: CastedBECount);
  // Since TripCount == BackEdgeTakenCount + 1, checking:
  // "Stride >= TripCount" is equivalent to checking:
  // Stride - MaxBTC > 0
  if (SE->isKnownPositive(S: StrideMinusBETaken)) {
    LLVM_DEBUG(
        dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
                  "Stride==1 predicate will imply that the loop executes "
                  "at most once.\n");
    return;
  }
  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");

  // Strip back off the integer cast, and check that our result is a
  // SCEVUnknown as we expect.
  const SCEV *StrideBase = StrideExpr;
  if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: StrideBase))
    StrideBase = C->getOperand();
  SymbolicStrides[Ptr] = cast<SCEVUnknown>(Val: StrideBase);
}
3102
// Construct the analysis result for loop \p L: set up the predicated SCEV
// wrapper, dependence checker and runtime-check builder, then run the full
// analysis if the loop's shape is analyzable.
LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
                               const TargetTransformInfo *TTI,
                               const TargetLibraryInfo *TLI, AAResults *AA,
                               DominatorTree *DT, LoopInfo *LI,
                               AssumptionCache *AC, bool AllowPartial)
    : PSE(std::make_unique<PredicatedScalarEvolution>(args&: *SE, args&: *L)),
      PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) {
  // Upper bound on vector width used by the dependence checker when deciding
  // whether a non-constant distance can still matter; unlimited when scalable
  // vectors are enabled (or no TTI is available).
  unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
  if (TTI && !TTI->enableScalableVectorization())
    // Scale the vector width by 2 as rough estimate to also consider
    // interleaving.
    MaxTargetVectorWidthInBits =
        TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) * 2;

  DepChecker = std::make_unique<MemoryDepChecker>(
      args&: *PSE, args&: AC, args&: DT, args&: L, args&: SymbolicStrides, args&: MaxTargetVectorWidthInBits, args&: LoopGuards);
  PtrRtChecking =
      std::make_unique<RuntimePointerChecking>(args&: *DepChecker, args&: SE, args&: LoopGuards);
  if (canAnalyzeLoop())
    CanVecMem = analyzeLoop(AA, LI, TLI, DT);
}
3124
3125void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
3126 if (CanVecMem) {
3127 OS.indent(NumSpaces: Depth) << "Memory dependences are safe";
3128 const MemoryDepChecker &DC = getDepChecker();
3129 if (!DC.isSafeForAnyVectorWidth())
3130 OS << " with a maximum safe vector width of "
3131 << DC.getMaxSafeVectorWidthInBits() << " bits";
3132 if (!DC.isSafeForAnyStoreLoadForwardDistances()) {
3133 uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits();
3134 OS << ", with a maximum safe store-load forward width of " << SLDist
3135 << " bits";
3136 }
3137 if (PtrRtChecking->Need)
3138 OS << " with run-time checks";
3139 OS << "\n";
3140 }
3141
3142 if (HasConvergentOp)
3143 OS.indent(NumSpaces: Depth) << "Has convergent operation in loop\n";
3144
3145 if (Report)
3146 OS.indent(NumSpaces: Depth) << "Report: " << Report->getMsg() << "\n";
3147
3148 if (auto *Dependences = DepChecker->getDependences()) {
3149 OS.indent(NumSpaces: Depth) << "Dependences:\n";
3150 for (const auto &Dep : *Dependences) {
3151 Dep.print(OS, Depth: Depth + 2, Instrs: DepChecker->getMemoryInstructions());
3152 OS << "\n";
3153 }
3154 } else
3155 OS.indent(NumSpaces: Depth) << "Too many dependences, not recorded\n";
3156
3157 // List the pair of accesses need run-time checks to prove independence.
3158 PtrRtChecking->print(OS, Depth);
3159 if (PtrRtChecking->Need && !HasCompletePtrRtChecking)
3160 OS.indent(NumSpaces: Depth) << "Generated run-time checks are incomplete\n";
3161 OS << "\n";
3162
3163 OS.indent(NumSpaces: Depth)
3164 << "Non vectorizable stores to invariant address were "
3165 << (HasStoreStoreDependenceInvolvingLoopInvariantAddress ||
3166 HasLoadStoreDependenceInvolvingLoopInvariantAddress
3167 ? ""
3168 : "not ")
3169 << "found in loop.\n";
3170
3171 OS.indent(NumSpaces: Depth) << "SCEV assumptions:\n";
3172 PSE->getPredicate().print(OS, Depth);
3173
3174 OS << "\n";
3175
3176 OS.indent(NumSpaces: Depth) << "Expressions re-written:\n";
3177 PSE->print(OS, Depth);
3178}
3179
3180const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L,
3181 bool AllowPartial) {
3182 const auto &[It, Inserted] = LoopAccessInfoMap.try_emplace(Key: &L);
3183
3184 // We need to create the LoopAccessInfo if either we don't already have one,
3185 // or if it was created with a different value of AllowPartial.
3186 if (Inserted || It->second->hasAllowPartial() != AllowPartial)
3187 It->second = std::make_unique<LoopAccessInfo>(args: &L, args: &SE, args&: TTI, args&: TLI, args: &AA, args: &DT,
3188 args: &LI, args&: AC, args&: AllowPartial);
3189
3190 return *It->second;
3191}
3192void LoopAccessInfoManager::clear() {
3193 // Collect LoopAccessInfo entries that may keep references to IR outside the
3194 // analyzed loop or SCEVs that may have been modified or invalidated. At the
3195 // moment, that is loops requiring memory or SCEV runtime checks, as those cache
3196 // SCEVs, e.g. for pointer expressions.
3197 for (const auto &[L, LAI] : LoopAccessInfoMap) {
3198 if (LAI->getRuntimePointerChecking()->getChecks().empty() &&
3199 LAI->getPSE().getPredicate().isAlwaysTrue())
3200 continue;
3201 LoopAccessInfoMap.erase(Val: L);
3202 }
3203}
3204
3205bool LoopAccessInfoManager::invalidate(
3206 Function &F, const PreservedAnalyses &PA,
3207 FunctionAnalysisManager::Invalidator &Inv) {
3208 // Check whether our analysis is preserved.
3209 auto PAC = PA.getChecker<LoopAccessAnalysis>();
3210 if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
3211 // If not, give up now.
3212 return true;
3213
3214 // Check whether the analyses we depend on became invalid for any reason.
3215 // Skip checking TargetLibraryAnalysis as it is immutable and can't become
3216 // invalid.
3217 return Inv.invalidate<AAManager>(IR&: F, PA) ||
3218 Inv.invalidate<ScalarEvolutionAnalysis>(IR&: F, PA) ||
3219 Inv.invalidate<LoopAnalysis>(IR&: F, PA) ||
3220 Inv.invalidate<DominatorTreeAnalysis>(IR&: F, PA);
3221}
3222
3223LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
3224 FunctionAnalysisManager &FAM) {
3225 auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(IR&: F);
3226 auto &AA = FAM.getResult<AAManager>(IR&: F);
3227 auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F);
3228 auto &LI = FAM.getResult<LoopAnalysis>(IR&: F);
3229 auto &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
3230 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F);
3231 auto &AC = FAM.getResult<AssumptionAnalysis>(IR&: F);
3232 return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC);
3233}
3234
// Out-of-line definition of the static key that uniquely identifies
// LoopAccessAnalysis in the new pass manager's analysis registry.
AnalysisKey LoopAccessAnalysis::Key;
3236