//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The implementation of the loop memory dependence analysis that was
// originally developed for the loop vectorizer.
//
//===----------------------------------------------------------------------===//

14#include "llvm/Analysis/LoopAccessAnalysis.h"
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/EquivalenceClasses.h"
18#include "llvm/ADT/PointerIntPair.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SetVector.h"
21#include "llvm/ADT/SmallPtrSet.h"
22#include "llvm/ADT/SmallSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/AliasAnalysis.h"
25#include "llvm/Analysis/AliasSetTracker.h"
26#include "llvm/Analysis/AssumeBundleQueries.h"
27#include "llvm/Analysis/AssumptionCache.h"
28#include "llvm/Analysis/LoopAnalysisManager.h"
29#include "llvm/Analysis/LoopInfo.h"
30#include "llvm/Analysis/LoopIterator.h"
31#include "llvm/Analysis/MemoryLocation.h"
32#include "llvm/Analysis/OptimizationRemarkEmitter.h"
33#include "llvm/Analysis/ScalarEvolution.h"
34#include "llvm/Analysis/ScalarEvolutionExpressions.h"
35#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
36#include "llvm/Analysis/TargetLibraryInfo.h"
37#include "llvm/Analysis/TargetTransformInfo.h"
38#include "llvm/Analysis/ValueTracking.h"
39#include "llvm/Analysis/VectorUtils.h"
40#include "llvm/IR/BasicBlock.h"
41#include "llvm/IR/Constants.h"
42#include "llvm/IR/DataLayout.h"
43#include "llvm/IR/DebugLoc.h"
44#include "llvm/IR/DerivedTypes.h"
45#include "llvm/IR/DiagnosticInfo.h"
46#include "llvm/IR/Dominators.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/InstrTypes.h"
49#include "llvm/IR/Instruction.h"
50#include "llvm/IR/Instructions.h"
51#include "llvm/IR/IntrinsicInst.h"
52#include "llvm/IR/PassManager.h"
53#include "llvm/IR/Type.h"
54#include "llvm/IR/Value.h"
55#include "llvm/IR/ValueHandle.h"
56#include "llvm/Support/Casting.h"
57#include "llvm/Support/CommandLine.h"
58#include "llvm/Support/Debug.h"
59#include "llvm/Support/ErrorHandling.h"
60#include "llvm/Support/raw_ostream.h"
61#include <algorithm>
62#include <cassert>
63#include <cstdint>
64#include <iterator>
65#include <utility>
66#include <variant>
67#include <vector>
68
69using namespace llvm;
70using namespace llvm::SCEVPatternMatch;
71
72#define DEBUG_TYPE "loop-accesses"
73
74static cl::opt<unsigned, true>
75VectorizationFactor("force-vector-width", cl::Hidden,
76 cl::desc("Sets the SIMD width. Zero is autoselect."),
77 cl::location(L&: VectorizerParams::VectorizationFactor));
78unsigned VectorizerParams::VectorizationFactor;
79
80static cl::opt<unsigned, true>
81VectorizationInterleave("force-vector-interleave", cl::Hidden,
82 cl::desc("Sets the vectorization interleave count. "
83 "Zero is autoselect."),
84 cl::location(
85 L&: VectorizerParams::VectorizationInterleave));
86unsigned VectorizerParams::VectorizationInterleave;
87
88static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
89 "runtime-memory-check-threshold", cl::Hidden,
90 cl::desc("When performing memory disambiguation checks at runtime do not "
91 "generate more than this number of comparisons (default = 8)."),
92 cl::location(L&: VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(Val: 8));
93unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
94
/// The maximum number of comparisons used when merging memory checks.
96static cl::opt<unsigned> MemoryCheckMergeThreshold(
97 "memory-check-merge-threshold", cl::Hidden,
98 cl::desc("Maximum number of comparisons done when trying to merge "
99 "runtime memory checks. (default = 100)"),
100 cl::init(Val: 100));
101
102/// Maximum SIMD width.
103const unsigned VectorizerParams::MaxVectorWidth = 64;
104
105/// We collect dependences up to this threshold.
106static cl::opt<unsigned>
107 MaxDependences("max-dependences", cl::Hidden,
108 cl::desc("Maximum number of dependences collected by "
109 "loop-access analysis (default = 100)"),
110 cl::init(Val: 100));
111
/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
///   for (i = 0; i < N; ++i)
///     A[i * Stride1] += B[i * Stride2] ...
///
/// This will be roughly translated to
///   if (Stride1 == 1 && Stride2 == 1) {
///     for (i = 0; i < N; i+=4)
///       A[i:i+3] += ...
///   } else
///     ...
123static cl::opt<bool> EnableMemAccessVersioning(
124 "enable-mem-access-versioning", cl::init(Val: true), cl::Hidden,
125 cl::desc("Enable symbolic stride memory access versioning"));
126
127/// Enable store-to-load forwarding conflict detection. This option can
128/// be disabled for correctness testing.
129static cl::opt<bool> EnableForwardingConflictDetection(
130 "store-to-load-forwarding-conflict-detection", cl::Hidden,
131 cl::desc("Enable conflict detection in loop-access analysis"),
132 cl::init(Val: true));
133
134static cl::opt<unsigned> MaxForkedSCEVDepth(
135 "max-forked-scev-depth", cl::Hidden,
136 cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
137 cl::init(Val: 5));
138
139static cl::opt<bool> SpeculateUnitStride(
140 "laa-speculate-unit-stride", cl::Hidden,
141 cl::desc("Speculate that non-constant strides are unit in LAA"),
142 cl::init(Val: true));
143
144static cl::opt<bool, true> HoistRuntimeChecks(
145 "hoist-runtime-checks", cl::Hidden,
146 cl::desc(
147 "Hoist inner loop runtime memory checks to outer loop if possible"),
148 cl::location(L&: VectorizerParams::HoistRuntimeChecks), cl::init(Val: true));
149bool VectorizerParams::HoistRuntimeChecks;
150
151bool VectorizerParams::isInterleaveForced() {
152 return ::VectorizationInterleave.getNumOccurrences() > 0;
153}
154
155const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
156 const DenseMap<Value *, const SCEV *> &PtrToStride,
157 Value *Ptr) {
158 const SCEV *OrigSCEV = PSE.getSCEV(V: Ptr);
159
  // If there is an entry in the map, return the SCEV of the pointer with the
  // symbolic stride replaced by one.
162 const SCEV *StrideSCEV = PtrToStride.lookup(Val: Ptr);
163 if (!StrideSCEV)
164 // For a non-symbolic stride, just return the original expression.
165 return OrigSCEV;
166
167 // Note: This assert is both overly strong and overly weak. The actual
168 // invariant here is that StrideSCEV should be loop invariant. The only
169 // such invariant strides we happen to speculate right now are unknowns
170 // and thus this is a reasonable proxy of the actual invariant.
171 assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");
172
173 ScalarEvolution *SE = PSE.getSE();
174 const SCEV *CT = SE->getOne(Ty: StrideSCEV->getType());
175 PSE.addPredicate(Pred: *SE->getEqualPredicate(LHS: StrideSCEV, RHS: CT));
176 const SCEV *Expr = PSE.getSCEV(V: Ptr);
177
178 LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
179 << " by: " << *Expr << "\n");
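  // For example, for an access A[i * Stride] with a symbolic Stride in
  // PtrToStride, a pointer SCEV of the form {%A,+,(4 * %Stride)} is rewritten
  // under the newly added predicate %Stride == 1 to {%A,+,4}.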
180 return Expr;
181}
182
183RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
184 unsigned Index, const RuntimePointerChecking &RtCheck)
185 : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
186 AddressSpace(RtCheck.Pointers[Index]
187 .PointerValue->getType()
188 ->getPointerAddressSpace()),
189 NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
190 Members.push_back(Elt: Index);
191}
192
193/// Returns \p A + \p B, if it is guaranteed not to unsigned wrap. Otherwise
194/// return nullptr. \p A and \p B must have the same type.
195static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B,
196 ScalarEvolution &SE) {
197 if (!SE.willNotOverflow(BinOp: Instruction::Add, /*IsSigned=*/Signed: false, LHS: A, RHS: B))
198 return nullptr;
199 return SE.getAddExpr(LHS: A, RHS: B);
200}
201
202/// Returns \p A * \p B, if it is guaranteed not to unsigned wrap. Otherwise
203/// return nullptr. \p A and \p B must have the same type.
204static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B,
205 ScalarEvolution &SE) {
206 if (!SE.willNotOverflow(BinOp: Instruction::Mul, /*IsSigned=*/Signed: false, LHS: A, RHS: B))
207 return nullptr;
208 return SE.getMulExpr(LHS: A, RHS: B);
209}
210
211/// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
212/// \p MaxBTC is guaranteed inbounds of the accessed object.
213static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
214 const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize,
215 ScalarEvolution &SE, const DataLayout &DL, DominatorTree *DT,
216 AssumptionCache *AC,
217 std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
218 auto *PointerBase = SE.getPointerBase(V: AR->getStart());
219 auto *StartPtr = dyn_cast<SCEVUnknown>(Val: PointerBase);
220 if (!StartPtr)
221 return false;
222 const Loop *L = AR->getLoop();
223 bool CheckForNonNull, CheckForFreed;
224 Value *StartPtrV = StartPtr->getValue();
225 uint64_t DerefBytes = StartPtrV->getPointerDereferenceableBytes(
226 DL, CanBeNull&: CheckForNonNull, CanBeFreed&: CheckForFreed);
227
228 if (DerefBytes && (CheckForNonNull || CheckForFreed))
229 return false;
230
231 const SCEV *Step = AR->getStepRecurrence(SE);
232 Type *WiderTy = SE.getWiderType(Ty1: MaxBTC->getType(), Ty2: Step->getType());
233 const SCEV *DerefBytesSCEV = SE.getConstant(Ty: WiderTy, V: DerefBytes);
234
  // Check if we have a suitable dereferenceable assumption we can use.
236 Instruction *CtxI = &*L->getHeader()->getFirstNonPHIIt();
237 if (BasicBlock *LoopPred = L->getLoopPredecessor()) {
238 if (isa<BranchInst>(Val: LoopPred->getTerminator()))
239 CtxI = LoopPred->getTerminator();
240 }
241 RetainedKnowledge DerefRK;
242 getKnowledgeForValue(V: StartPtrV, AttrKinds: {Attribute::Dereferenceable}, AC&: *AC,
243 Filter: [&](RetainedKnowledge RK, Instruction *Assume, auto) {
244 if (!isValidAssumeForContext(I: Assume, CxtI: CtxI, DT))
245 return false;
246 if (StartPtrV->canBeFreed() &&
247 !willNotFreeBetween(Assume, CtxI))
248 return false;
249 DerefRK = std::max(a: DerefRK, b: RK);
250 return true;
251 });
252 if (DerefRK) {
253 DerefBytesSCEV =
254 SE.getUMaxExpr(LHS: DerefBytesSCEV, RHS: SE.getSCEV(V: DerefRK.IRArgValue));
255 }
256
257 if (DerefBytesSCEV->isZero())
258 return false;
259
260 bool IsKnownNonNegative = SE.isKnownNonNegative(S: Step);
261 if (!IsKnownNonNegative && !SE.isKnownNegative(S: Step))
262 return false;
263
264 Step = SE.getNoopOrSignExtend(V: Step, Ty: WiderTy);
265 MaxBTC = SE.getNoopOrZeroExtend(V: MaxBTC, Ty: WiderTy);
266
267 // For the computations below, make sure they don't unsigned wrap.
268 if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_UGE, LHS: AR->getStart(), RHS: StartPtr))
269 return false;
270 const SCEV *StartOffset = SE.getNoopOrZeroExtend(
271 V: SE.getMinusSCEV(LHS: AR->getStart(), RHS: StartPtr), Ty: WiderTy);
272
273 if (!LoopGuards)
274 LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: AR->getLoop(), SE));
275 MaxBTC = SE.applyLoopGuards(Expr: MaxBTC, Guards: *LoopGuards);
276
277 const SCEV *OffsetAtLastIter =
278 mulSCEVOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE);
279 if (!OffsetAtLastIter) {
280 // Re-try with constant max backedge-taken count if using the symbolic one
281 // failed.
282 MaxBTC = SE.getConstantMaxBackedgeTakenCount(L: AR->getLoop());
283 if (isa<SCEVCouldNotCompute>(Val: MaxBTC))
284 return false;
285 MaxBTC = SE.getNoopOrZeroExtend(
286 V: MaxBTC, Ty: WiderTy);
287 OffsetAtLastIter =
288 mulSCEVOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE);
289 if (!OffsetAtLastIter)
290 return false;
291 }
292
293 const SCEV *OffsetEndBytes = addSCEVNoOverflow(
294 A: OffsetAtLastIter, B: SE.getNoopOrZeroExtend(V: EltSize, Ty: WiderTy), SE);
295 if (!OffsetEndBytes)
296 return false;
297
298 if (IsKnownNonNegative) {
299 // For positive steps, check if
300 // (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes,
301 // while making sure none of the computations unsigned wrap themselves.
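    // For example, assuming the start pointer is known dereferenceable for
    // 4096 bytes, AR = {%p,+,4}, MaxBTC = 1023 and EltSize = 4: StartOffset is
    // 0 and 0 + 1023 * 4 + 4 = 4096 <= 4096, so the access at MaxBTC stays
    // within the object.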
302 const SCEV *EndBytes = addSCEVNoOverflow(A: StartOffset, B: OffsetEndBytes, SE);
303 if (!EndBytes)
304 return false;
305
306 DerefBytesSCEV = SE.applyLoopGuards(Expr: DerefBytesSCEV, Guards: *LoopGuards);
307 return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: EndBytes, RHS: DerefBytesSCEV);
308 }
309
310 // For negative steps check if
311 // * StartOffset >= (MaxBTC * Step + EltSize)
312 // * StartOffset <= DerefBytes.
313 assert(SE.isKnownNegative(Step) && "must be known negative");
314 return SE.isKnownPredicate(Pred: CmpInst::ICMP_SGE, LHS: StartOffset, RHS: OffsetEndBytes) &&
315 SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: StartOffset, RHS: DerefBytesSCEV);
316}
317
318std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
319 const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC,
320 const SCEV *MaxBTC, ScalarEvolution *SE,
321 DenseMap<std::pair<const SCEV *, Type *>,
322 std::pair<const SCEV *, const SCEV *>> *PointerBounds,
323 DominatorTree *DT, AssumptionCache *AC,
324 std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
325 std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
326 if (PointerBounds) {
327 auto [Iter, Ins] = PointerBounds->insert(
328 KV: {{PtrExpr, AccessTy},
329 {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
330 if (!Ins)
331 return Iter->second;
332 PtrBoundsPair = &Iter->second;
333 }
334
335 const SCEV *ScStart;
336 const SCEV *ScEnd;
337
338 auto &DL = Lp->getHeader()->getDataLayout();
339 Type *IdxTy = DL.getIndexType(PtrTy: PtrExpr->getType());
340 const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IntTy: IdxTy, StoreTy: AccessTy);
341 if (SE->isLoopInvariant(S: PtrExpr, L: Lp)) {
342 ScStart = ScEnd = PtrExpr;
343 } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrExpr)) {
344 ScStart = AR->getStart();
345 if (!isa<SCEVCouldNotCompute>(Val: BTC))
346 // Evaluating AR at an exact BTC is safe: LAA separately checks that
347 // accesses cannot wrap in the loop. If evaluating AR at BTC wraps, then
348 // the loop either triggers UB when executing a memory access with a
349 // poison pointer or the wrapping/poisoned pointer is not used.
350 ScEnd = AR->evaluateAtIteration(It: BTC, SE&: *SE);
351 else {
      // Evaluating AR at MaxBTC may wrap and create an expression that is less
      // than the start of the AddRec due to wrapping (for example consider
      // MaxBTC = -2). If that's the case, set ScEnd to -(EltSize + 1). ScEnd
      // will get incremented by EltSize before returning, so this effectively
      // sets ScEnd to the maximum unsigned value for the type. Note that LAA
      // separately checks that accesses cannot wrap, so unsigned max
      // represents an upper bound.
359 if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSize: EltSizeSCEV, SE&: *SE, DL,
360 DT, AC, LoopGuards)) {
361 ScEnd = AR->evaluateAtIteration(It: MaxBTC, SE&: *SE);
362 } else {
363 ScEnd = SE->getAddExpr(
364 LHS: SE->getNegativeSCEV(V: EltSizeSCEV),
365 RHS: SE->getSCEV(V: ConstantExpr::getIntToPtr(
366 C: ConstantInt::getAllOnesValue(Ty: EltSizeSCEV->getType()),
367 Ty: AR->getType())));
368 }
369 }
370 const SCEV *Step = AR->getStepRecurrence(SE&: *SE);
371
372 // For expressions with negative step, the upper bound is ScStart and the
373 // lower bound is ScEnd.
374 if (const auto *CStep = dyn_cast<SCEVConstant>(Val: Step)) {
375 if (CStep->getValue()->isNegative())
376 std::swap(a&: ScStart, b&: ScEnd);
377 } else {
378 // Fallback case: the step is not constant, but we can still
379 // get the upper and lower bounds of the interval by using min/max
380 // expressions.
381 ScStart = SE->getUMinExpr(LHS: ScStart, RHS: ScEnd);
382 ScEnd = SE->getUMaxExpr(LHS: AR->getStart(), RHS: ScEnd);
383 }
384 } else
385 return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};
386
387 assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
388 assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
389
390 // Add the size of the pointed element to ScEnd.
391 ScEnd = SE->getAddExpr(LHS: ScEnd, RHS: EltSizeSCEV);
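  // For example, for an i32 access whose pointer SCEV is {%a,+,4} and an exact
  // backedge-taken count of n - 1 (i.e. n iterations), the resulting range is
  // [%a, %a + 4 * n).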
392
393 std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
394 if (PointerBounds)
395 *PtrBoundsPair = Res;
396 return Res;
397}
398
399/// Calculate Start and End points of memory access using
400/// getStartAndEndForAccess.
401void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
402 Type *AccessTy, bool WritePtr,
403 unsigned DepSetId, unsigned ASId,
404 PredicatedScalarEvolution &PSE,
405 bool NeedsFreeze) {
406 const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
407 const SCEV *BTC = PSE.getBackedgeTakenCount();
408 const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
409 Lp, PtrExpr, AccessTy, BTC, MaxBTC: SymbolicMaxBTC, SE: PSE.getSE(),
410 PointerBounds: &DC.getPointerBounds(), DT: DC.getDT(), AC: DC.getAC(), LoopGuards);
411 assert(!isa<SCEVCouldNotCompute>(ScStart) &&
412 !isa<SCEVCouldNotCompute>(ScEnd) &&
413 "must be able to compute both start and end expressions");
414 Pointers.emplace_back(Args&: Ptr, Args: ScStart, Args: ScEnd, Args&: WritePtr, Args&: DepSetId, Args&: ASId, Args&: PtrExpr,
415 Args&: NeedsFreeze);
416}
417
418bool RuntimePointerChecking::tryToCreateDiffCheck(
419 const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
420 // If either group contains multiple different pointers, bail out.
421 // TODO: Support multiple pointers by using the minimum or maximum pointer,
422 // depending on src & sink.
423 if (CGI.Members.size() != 1 || CGJ.Members.size() != 1)
424 return false;
425
426 const PointerInfo *Src = &Pointers[CGI.Members[0]];
427 const PointerInfo *Sink = &Pointers[CGJ.Members[0]];
428
429 // If either pointer is read and written, multiple checks may be needed. Bail
430 // out.
431 if (!DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: !Src->IsWritePtr).empty() ||
432 !DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: !Sink->IsWritePtr).empty())
433 return false;
434
435 ArrayRef<unsigned> AccSrc =
436 DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: Src->IsWritePtr);
437 ArrayRef<unsigned> AccSink =
438 DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: Sink->IsWritePtr);
439 // If either pointer is accessed multiple times, there may not be a clear
440 // src/sink relation. Bail out for now.
441 if (AccSrc.size() != 1 || AccSink.size() != 1)
442 return false;
443
444 // If the sink is accessed before src, swap src/sink.
445 if (AccSink[0] < AccSrc[0])
446 std::swap(a&: Src, b&: Sink);
447
448 const SCEVConstant *Step;
449 const SCEV *SrcStart;
450 const SCEV *SinkStart;
451 const Loop *InnerLoop = DC.getInnermostLoop();
452 if (!match(S: Src->Expr,
453 P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SrcStart), Op1: m_SCEVConstant(V&: Step),
454 L: m_SpecificLoop(L: InnerLoop))) ||
455 !match(S: Sink->Expr,
456 P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SinkStart), Op1: m_scev_Specific(S: Step),
457 L: m_SpecificLoop(L: InnerLoop))))
458 return false;
459
460 SmallVector<Instruction *, 4> SrcInsts =
461 DC.getInstructionsForAccess(Ptr: Src->PointerValue, isWrite: Src->IsWritePtr);
462 SmallVector<Instruction *, 4> SinkInsts =
463 DC.getInstructionsForAccess(Ptr: Sink->PointerValue, isWrite: Sink->IsWritePtr);
464 Type *SrcTy = getLoadStoreType(I: SrcInsts[0]);
465 Type *DstTy = getLoadStoreType(I: SinkInsts[0]);
466 if (isa<ScalableVectorType>(Val: SrcTy) || isa<ScalableVectorType>(Val: DstTy))
467 return false;
468
469 const DataLayout &DL = InnerLoop->getHeader()->getDataLayout();
470 unsigned AllocSize =
471 std::max(a: DL.getTypeAllocSize(Ty: SrcTy), b: DL.getTypeAllocSize(Ty: DstTy));
472
  // Only constant steps matching the AllocSize are supported at the moment.
  // This simplifies the difference computation. It can be extended in the
  // future.
476 if (Step->getAPInt().abs() != AllocSize)
477 return false;
478
479 IntegerType *IntTy =
480 IntegerType::get(C&: Src->PointerValue->getContext(),
481 NumBits: DL.getPointerSizeInBits(AS: CGI.AddressSpace));
482
483 // When counting down, the dependence distance needs to be swapped.
484 if (Step->getValue()->isNegative())
485 std::swap(a&: SinkStart, b&: SrcStart);
486
487 const SCEV *SinkStartInt = SE->getPtrToIntExpr(Op: SinkStart, Ty: IntTy);
488 const SCEV *SrcStartInt = SE->getPtrToIntExpr(Op: SrcStart, Ty: IntTy);
489 if (isa<SCEVCouldNotCompute>(Val: SinkStartInt) ||
490 isa<SCEVCouldNotCompute>(Val: SrcStartInt))
491 return false;
492
493 // If the start values for both Src and Sink also vary according to an outer
494 // loop, then it's probably better to avoid creating diff checks because
495 // they may not be hoisted. We should instead let llvm::addRuntimeChecks
496 // do the expanded full range overlap checks, which can be hoisted.
497 if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
498 isa<SCEVAddRecExpr>(Val: SinkStartInt) && isa<SCEVAddRecExpr>(Val: SrcStartInt)) {
499 auto *SrcStartAR = cast<SCEVAddRecExpr>(Val: SrcStartInt);
500 auto *SinkStartAR = cast<SCEVAddRecExpr>(Val: SinkStartInt);
501 const Loop *StartARLoop = SrcStartAR->getLoop();
502 if (StartARLoop == SinkStartAR->getLoop() &&
503 StartARLoop == InnerLoop->getParentLoop() &&
504 // If the diff check would already be loop invariant (due to the
505 // recurrences being the same), then we prefer to keep the diff checks
506 // because they are cheaper.
507 SrcStartAR->getStepRecurrence(SE&: *SE) !=
508 SinkStartAR->getStepRecurrence(SE&: *SE)) {
509 LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
510 "cannot be hoisted out of the outer loop\n");
511 return false;
512 }
513 }
514
515 LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
516 << "SrcStart: " << *SrcStartInt << '\n'
517 << "SinkStartInt: " << *SinkStartInt << '\n');
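  // The recorded entry is later expanded (see llvm::addDiffRuntimeChecks) into
  // roughly a single comparison of SinkStartInt - SrcStartInt against the
  // per-vector-iteration access footprint, which is cheaper than comparing the
  // Low/High bounds of both groups.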
518 DiffChecks.emplace_back(Args&: SrcStartInt, Args&: SinkStartInt, Args&: AllocSize,
519 Args: Src->NeedsFreeze || Sink->NeedsFreeze);
520 return true;
521}
522
523SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
524 SmallVector<RuntimePointerCheck, 4> Checks;
525
526 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
527 for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
528 const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
529 const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];
530
531 if (needsChecking(M: CGI, N: CGJ)) {
532 CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ);
533 Checks.emplace_back(Args: &CGI, Args: &CGJ);
534 }
535 }
536 }
537 return Checks;
538}
539
540void RuntimePointerChecking::generateChecks(
541 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
542 assert(Checks.empty() && "Checks is not empty");
543 groupChecks(DepCands, UseDependencies);
544 Checks = generateChecks();
545}
546
547bool RuntimePointerChecking::needsChecking(
548 const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
549 for (const auto &I : M.Members)
550 for (const auto &J : N.Members)
551 if (needsChecking(I, J))
552 return true;
553 return false;
554}
555
556/// Compare \p I and \p J and return the minimum.
557/// Return nullptr in case we couldn't find an answer.
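/// For example, given I = %a + 24 and J = %a + 8, this returns %a + 8; if the
/// difference is not a compile-time constant (e.g. %a vs. %b), this returns
/// nullptr.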
558static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
559 ScalarEvolution *SE) {
560 std::optional<APInt> Diff = SE->computeConstantDifference(LHS: J, RHS: I);
561 if (!Diff)
562 return nullptr;
563 return Diff->isNegative() ? J : I;
564}
565
566bool RuntimeCheckingPtrGroup::addPointer(
567 unsigned Index, const RuntimePointerChecking &RtCheck) {
568 return addPointer(
569 Index, Start: RtCheck.Pointers[Index].Start, End: RtCheck.Pointers[Index].End,
570 AS: RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
571 NeedsFreeze: RtCheck.Pointers[Index].NeedsFreeze, SE&: *RtCheck.SE);
572}
573
574bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
575 const SCEV *End, unsigned AS,
576 bool NeedsFreeze,
577 ScalarEvolution &SE) {
578 assert(AddressSpace == AS &&
579 "all pointers in a checking group must be in the same address space");
580
581 // Compare the starts and ends with the known minimum and maximum
582 // of this set. We need to know how we compare against the min/max
583 // of the set in order to be able to emit memchecks.
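  // For example, if the group currently spans [%a, %a + 32) and the new
  // pointer spans [%a + 4, %a + 36), the group is widened to [%a, %a + 36). If
  // either difference cannot be computed as a constant, the pointer is not
  // added.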
584 const SCEV *Min0 = getMinFromExprs(I: Start, J: Low, SE: &SE);
585 if (!Min0)
586 return false;
587
588 const SCEV *Min1 = getMinFromExprs(I: End, J: High, SE: &SE);
589 if (!Min1)
590 return false;
591
592 // Update the low bound expression if we've found a new min value.
593 if (Min0 == Start)
594 Low = Start;
595
596 // Update the high bound expression if we've found a new max value.
597 if (Min1 != End)
598 High = End;
599
600 Members.push_back(Elt: Index);
601 this->NeedsFreeze |= NeedsFreeze;
602 return true;
603}
604
605void RuntimePointerChecking::groupChecks(
606 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
607 // We build the groups from dependency candidates equivalence classes
608 // because:
609 // - We know that pointers in the same equivalence class share
610 // the same underlying object and therefore there is a chance
611 // that we can compare pointers
612 // - We wouldn't be able to merge two pointers for which we need
613 // to emit a memcheck. The classes in DepCands are already
614 // conveniently built such that no two pointers in the same
615 // class need checking against each other.
616
617 // We use the following (greedy) algorithm to construct the groups
618 // For every pointer in the equivalence class:
619 // For each existing group:
620 // - if the difference between this pointer and the min/max bounds
621 // of the group is a constant, then make the pointer part of the
622 // group and update the min/max bounds of that group as required.
623
624 CheckingGroups.clear();
625
626 // If we need to check two pointers to the same underlying object
627 // with a non-constant difference, we shouldn't perform any pointer
628 // grouping with those pointers. This is because we can easily get
629 // into cases where the resulting check would return false, even when
630 // the accesses are safe.
631 //
632 // The following example shows this:
633 // for (i = 0; i < 1000; ++i)
634 // a[5000 + i * m] = a[i] + a[i + 9000]
635 //
636 // Here grouping gives a check of (5000, 5000 + 1000 * m) against
637 // (0, 10000) which is always false. However, if m is 1, there is no
638 // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
639 // us to perform an accurate check in this case.
640 //
641 // In the above case, we have a non-constant distance and an Unknown
642 // dependence between accesses to the same underlying object, and could retry
643 // with runtime checks. Therefore UseDependencies is false. In this case we
644 // will use the fallback path and create separate checking groups for all
645 // pointers.
646
647 // If we don't have the dependency partitions, construct a new
648 // checking pointer group for each pointer. This is also required
649 // for correctness, because in this case we can have checking between
650 // pointers to the same underlying object.
651 if (!UseDependencies) {
652 for (unsigned I = 0; I < Pointers.size(); ++I)
653 CheckingGroups.emplace_back(Args&: I, Args&: *this);
654 return;
655 }
656
657 unsigned TotalComparisons = 0;
658
659 DenseMap<Value *, SmallVector<unsigned>> PositionMap;
660 for (unsigned Index = 0; Index < Pointers.size(); ++Index)
661 PositionMap[Pointers[Index].PointerValue].push_back(Elt: Index);
662
663 // We need to keep track of what pointers we've already seen so we
664 // don't process them twice.
665 SmallSet<unsigned, 2> Seen;
666
667 // Go through all equivalence classes, get the "pointer check groups"
668 // and add them to the overall solution. We use the order in which accesses
669 // appear in 'Pointers' to enforce determinism.
670 for (unsigned I = 0; I < Pointers.size(); ++I) {
671 // We've seen this pointer before, and therefore already processed
672 // its equivalence class.
673 if (Seen.contains(V: I))
674 continue;
675
676 MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
677 Pointers[I].IsWritePtr);
678
679 SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
680
681 // Because DepCands is constructed by visiting accesses in the order in
682 // which they appear in alias sets (which is deterministic) and the
683 // iteration order within an equivalence class member is only dependent on
684 // the order in which unions and insertions are performed on the
685 // equivalence class, the iteration order is deterministic.
686 for (auto M : DepCands.members(V: Access)) {
687 auto PointerI = PositionMap.find(Val: M.getPointer());
688 // If we can't find the pointer in PositionMap that means we can't
689 // generate a memcheck for it.
690 if (PointerI == PositionMap.end())
691 continue;
692 for (unsigned Pointer : PointerI->second) {
693 bool Merged = false;
694 // Mark this pointer as seen.
695 Seen.insert(V: Pointer);
696
697 // Go through all the existing sets and see if we can find one
698 // which can include this pointer.
699 for (RuntimeCheckingPtrGroup &Group : Groups) {
700 // Don't perform more than a certain amount of comparisons.
701 // This should limit the cost of grouping the pointers to something
702 // reasonable. If we do end up hitting this threshold, the algorithm
703 // will create separate groups for all remaining pointers.
704 if (TotalComparisons > MemoryCheckMergeThreshold)
705 break;
706
707 TotalComparisons++;
708
709 if (Group.addPointer(Index: Pointer, RtCheck: *this)) {
710 Merged = true;
711 break;
712 }
713 }
714
715 if (!Merged)
716 // We couldn't add this pointer to any existing set or the threshold
717 // for the number of comparisons has been reached. Create a new group
718 // to hold the current pointer.
719 Groups.emplace_back(Args&: Pointer, Args&: *this);
720 }
721 }
722
723 // We've computed the grouped checks for this partition.
724 // Save the results and continue with the next one.
725 llvm::append_range(C&: CheckingGroups, R&: Groups);
726 }
727}
728
729bool RuntimePointerChecking::arePointersInSamePartition(
730 const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
731 unsigned PtrIdx2) {
732 return (PtrToPartition[PtrIdx1] != -1 &&
733 PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
734}
735
736bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
737 const PointerInfo &PointerI = Pointers[I];
738 const PointerInfo &PointerJ = Pointers[J];
739
740 // No need to check if two readonly pointers intersect.
741 if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
742 return false;
743
744 // Only need to check pointers between two different dependency sets.
745 if (PointerI.DependencySetId == PointerJ.DependencySetId)
746 return false;
747
748 // Only need to check pointers in the same alias set.
749 return PointerI.AliasSetId == PointerJ.AliasSetId;
750}
751
752/// Assign each RuntimeCheckingPtrGroup pointer an index for stable UTC output.
753static DenseMap<const RuntimeCheckingPtrGroup *, unsigned>
754getPtrToIdxMap(ArrayRef<RuntimeCheckingPtrGroup> CheckingGroups) {
755 DenseMap<const RuntimeCheckingPtrGroup *, unsigned> PtrIndices;
756 for (const auto &[Idx, CG] : enumerate(First&: CheckingGroups))
757 PtrIndices[&CG] = Idx;
758 return PtrIndices;
759}
760
761void RuntimePointerChecking::printChecks(
762 raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
763 unsigned Depth) const {
764 unsigned N = 0;
765 auto PtrIndices = getPtrToIdxMap(CheckingGroups);
766 for (const auto &[Check1, Check2] : Checks) {
767 const auto &First = Check1->Members, &Second = Check2->Members;
768 OS.indent(NumSpaces: Depth) << "Check " << N++ << ":\n";
769 OS.indent(NumSpaces: Depth + 2) << "Comparing group GRP" << PtrIndices.at(Val: Check1)
770 << ":\n";
771 for (unsigned K : First)
772 OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n";
773 OS.indent(NumSpaces: Depth + 2) << "Against group GRP" << PtrIndices.at(Val: Check2)
774 << ":\n";
775 for (unsigned K : Second)
776 OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n";
777 }
778}
779
780void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
781
782 OS.indent(NumSpaces: Depth) << "Run-time memory checks:\n";
783 printChecks(OS, Checks, Depth);
784
785 OS.indent(NumSpaces: Depth) << "Grouped accesses:\n";
786 auto PtrIndices = getPtrToIdxMap(CheckingGroups);
787 for (const auto &CG : CheckingGroups) {
788 OS.indent(NumSpaces: Depth + 2) << "Group GRP" << PtrIndices.at(Val: &CG) << ":\n";
789 OS.indent(NumSpaces: Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
790 << ")\n";
791 for (unsigned Member : CG.Members) {
792 OS.indent(NumSpaces: Depth + 6) << "Member: " << *Pointers[Member].Expr << "\n";
793 }
794 }
795}
796
797namespace {
798
799/// Analyses memory accesses in a loop.
800///
801/// Checks whether run time pointer checks are needed and builds sets for data
802/// dependence checking.
803class AccessAnalysis {
804public:
805 /// Read or write access location.
806 typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
807 typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
808
809 AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
810 DominatorTree &DT, MemoryDepChecker::DepCandidates &DA,
811 PredicatedScalarEvolution &PSE,
812 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
813 : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA),
814 PSE(PSE), LoopAliasScopes(LoopAliasScopes) {
815 // We're analyzing dependences across loop iterations.
816 BAA.enableCrossIterationMode();
817 }
818
819 /// Register a load and whether it is only read from.
820 void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
821 Value *Ptr = const_cast<Value *>(Loc.Ptr);
822 AST.add(Loc: adjustLoc(Loc));
823 Accesses[MemAccessInfo(Ptr, false)].insert(X: AccessTy);
824 if (IsReadOnly)
825 ReadOnlyPtr.insert(Ptr);
826 }
827
828 /// Register a store.
829 void addStore(const MemoryLocation &Loc, Type *AccessTy) {
830 Value *Ptr = const_cast<Value *>(Loc.Ptr);
831 AST.add(Loc: adjustLoc(Loc));
832 Accesses[MemAccessInfo(Ptr, true)].insert(X: AccessTy);
833 }
834
  /// Check if we can emit a run-time no-alias check for \p Access.
  ///
  /// Returns true if we can emit a run-time no alias check for \p Access.
  /// If we can check this access, this also adds it to a dependence set and
  /// adds a run-time check for it to \p RtCheck. If \p Assume is true,
  /// we will attempt to use additional run-time checks in order to get
  /// the bounds of the pointer.
842 bool createCheckForAccess(RuntimePointerChecking &RtCheck,
843 MemAccessInfo Access, Type *AccessTy,
844 const DenseMap<Value *, const SCEV *> &Strides,
845 DenseMap<Value *, unsigned> &DepSetId,
846 Loop *TheLoop, unsigned &RunningDepId,
847 unsigned ASId, bool Assume);
848
849 /// Check whether we can check the pointers at runtime for
850 /// non-intersection.
851 ///
852 /// Returns true if we need no check or if we do and we can generate them
853 /// (i.e. the pointers have computable bounds). A return value of false means
854 /// we couldn't analyze and generate runtime checks for all pointers in the
855 /// loop, but if \p AllowPartial is set then we will have checks for those
856 /// pointers we could analyze.
857 bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, Loop *TheLoop,
858 const DenseMap<Value *, const SCEV *> &Strides,
859 Value *&UncomputablePtr, bool AllowPartial);
860
861 /// Goes over all memory accesses, checks whether a RT check is needed
862 /// and builds sets of dependent accesses.
863 void buildDependenceSets() {
864 processMemAccesses();
865 }
866
867 /// Initial processing of memory accesses determined that we need to
868 /// perform dependency checking.
869 ///
870 /// Note that this can later be cleared if we retry memcheck analysis without
871 /// dependency checking (i.e. ShouldRetryWithRuntimeChecks).
872 bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); }
873
874 /// We decided that no dependence analysis would be used. Reset the state.
875 void resetDepChecks(MemoryDepChecker &DepChecker) {
876 CheckDeps.clear();
877 DepChecker.clearDependences();
878 }
879
880 const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; }
881
882private:
883 typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
884
885 /// Adjust the MemoryLocation so that it represents accesses to this
886 /// location across all iterations, rather than a single one.
887 MemoryLocation adjustLoc(MemoryLocation Loc) const {
888 // The accessed location varies within the loop, but remains within the
889 // underlying object.
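    // For example, a 4-byte access to %p in a single iteration is treated here
    // as an access of unknown size starting anywhere relative to %p, so alias
    // analysis conservatively covers every iteration.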
890 Loc.Size = LocationSize::beforeOrAfterPointer();
891 Loc.AATags.Scope = adjustAliasScopeList(ScopeList: Loc.AATags.Scope);
892 Loc.AATags.NoAlias = adjustAliasScopeList(ScopeList: Loc.AATags.NoAlias);
893 return Loc;
894 }
895
896 /// Drop alias scopes that are only valid within a single loop iteration.
897 MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
898 if (!ScopeList)
899 return nullptr;
900
901 // For the sake of simplicity, drop the whole scope list if any scope is
902 // iteration-local.
903 if (any_of(Range: ScopeList->operands(), P: [&](Metadata *Scope) {
904 return LoopAliasScopes.contains(Ptr: cast<MDNode>(Val: Scope));
905 }))
906 return nullptr;
907
908 return ScopeList;
909 }
910
  /// Go over all memory accesses and check whether runtime pointer checks
  /// are needed, and build sets of dependency check candidates.
913 void processMemAccesses();
914
915 /// Map of all accesses. Values are the types used to access memory pointed to
916 /// by the pointer.
917 PtrAccessMap Accesses;
918
919 /// The loop being checked.
920 const Loop *TheLoop;
921
922 /// List of accesses that need a further dependence check.
923 MemAccessInfoList CheckDeps;
924
925 /// Set of pointers that are read only.
926 SmallPtrSet<Value*, 16> ReadOnlyPtr;
927
928 /// Batched alias analysis results.
929 BatchAAResults BAA;
930
  /// An alias set tracker to partition the access set by underlying object and
  /// intrinsic property (such as TBAA metadata).
933 AliasSetTracker AST;
934
935 /// The LoopInfo of the loop being checked.
936 const LoopInfo *LI;
937
938 /// The dominator tree of the function.
939 DominatorTree &DT;
940
  /// Sets of potentially dependent accesses - members of one set share an
  /// underlying pointer. The set "CheckDeps" identifies which sets really need
  /// a dependence check.
944 MemoryDepChecker::DepCandidates &DepCands;
945
946 /// Initial processing of memory accesses determined that we may need
947 /// to add memchecks. Perform the analysis to determine the necessary checks.
948 ///
949 /// Note that, this is different from isDependencyCheckNeeded. When we retry
950 /// memcheck analysis without dependency checking
951 /// (i.e. ShouldRetryWithRuntimeChecks), isDependencyCheckNeeded is
952 /// cleared while this remains set if we have potentially dependent accesses.
953 bool IsRTCheckAnalysisNeeded = false;
954
955 /// The SCEV predicate containing all the SCEV-related assumptions.
956 PredicatedScalarEvolution &PSE;
957
958 DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;
959
960 /// Alias scopes that are declared inside the loop, and as such not valid
961 /// across iterations.
962 SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
963};
964
965} // end anonymous namespace
966
967/// Try to compute a constant stride for \p AR. Used by getPtrStride and
968/// isNoWrap.
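/// For example, {%a,+,8} with an i64 access type gives stride 1, {%a,+,-4}
/// with an i32 access type gives stride -1, and {%a,+,6} with an i32 access
/// type gives std::nullopt because the step is not a multiple of the access
/// size.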
969static std::optional<int64_t>
970getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
971 Value *Ptr, PredicatedScalarEvolution &PSE) {
972 if (isa<ScalableVectorType>(Val: AccessTy)) {
973 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
974 << "\n");
975 return std::nullopt;
976 }
977
978 // The access function must stride over the innermost loop.
979 if (Lp != AR->getLoop()) {
980 LLVM_DEBUG({
981 dbgs() << "LAA: Bad stride - Not striding over innermost loop ";
982 if (Ptr)
983 dbgs() << *Ptr << " ";
984
985 dbgs() << "SCEV: " << *AR << "\n";
986 });
987 return std::nullopt;
988 }
989
990 // Check the step is constant.
991 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
992
993 // Calculate the pointer stride and check if it is constant.
994 const APInt *APStepVal;
995 if (!match(S: Step, P: m_scev_APInt(C&: APStepVal))) {
996 LLVM_DEBUG({
997 dbgs() << "LAA: Bad stride - Not a constant strided ";
998 if (Ptr)
999 dbgs() << *Ptr << " ";
1000 dbgs() << "SCEV: " << *AR << "\n";
1001 });
1002 return std::nullopt;
1003 }
1004
1005 const auto &DL = Lp->getHeader()->getDataLayout();
1006 TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy);
1007 int64_t Size = AllocSize.getFixedValue();
1008
1009 // Huge step value - give up.
1010 std::optional<int64_t> StepVal = APStepVal->trySExtValue();
1011 if (!StepVal)
1012 return std::nullopt;
1013
1014 // Strided access.
1015 return *StepVal % Size ? std::nullopt : std::make_optional(t: *StepVal / Size);
1016}
1017
/// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use
/// information from the IR pointer value to determine no-wrap.
1020static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
1021 Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
1022 const DominatorTree &DT,
1023 std::optional<int64_t> Stride = std::nullopt) {
1024 // FIXME: This should probably only return true for NUW.
1025 if (AR->getNoWrapFlags(Mask: SCEV::NoWrapMask))
1026 return true;
1027
1028 if (Ptr && PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW))
1029 return true;
1030
1031 // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap,
1032 // the distance between the previously accessed location and the wrapped
1033 // location will be larger than half the pointer index type space. In that
1034 // case, the GEP would be poison and any memory access dependent on it would
1035 // be immediate UB when executed.
1036 if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Val: Ptr);
1037 GEP && GEP->hasNoUnsignedSignedWrap()) {
1038 // For the above reasoning to apply, the pointer must be dereferenced in
1039 // every iteration.
1040 if (L->getHeader() == L->getLoopLatch() ||
1041 any_of(Range: GEP->users(), P: [L, &DT, GEP](User *U) {
1042 if (getLoadStorePointerOperand(V: U) != GEP)
1043 return false;
1044 BasicBlock *UserBB = cast<Instruction>(Val: U)->getParent();
1045 if (!L->contains(BB: UserBB))
1046 return false;
1047 return !LoopAccessInfo::blockNeedsPredication(BB: UserBB, TheLoop: L, DT: &DT);
1048 }))
1049 return true;
1050 }
1051
1052 if (!Stride)
1053 Stride = getStrideFromAddRec(AR, Lp: L, AccessTy, Ptr, PSE);
1054 if (Stride) {
    // If the null pointer is undefined, then an access sequence which would
    // otherwise access it can be assumed not to unsigned wrap. Note that this
    // assumes the object in memory is aligned to the natural alignment.
1058 unsigned AddrSpace = AR->getType()->getPointerAddressSpace();
1059 if (!NullPointerIsDefined(F: L->getHeader()->getParent(), AS: AddrSpace) &&
1060 (Stride == 1 || Stride == -1))
1061 return true;
1062 }
1063
1064 if (Ptr && Assume) {
1065 PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW);
1066 LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n"
1067 << "LAA: Pointer: " << *Ptr << "\n"
1068 << "LAA: SCEV: " << *AR << "\n"
1069 << "LAA: Added an overflow assumption\n");
1070 return true;
1071 }
1072
1073 return false;
1074}
1075
1076static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
1077 function_ref<void(Value *)> AddPointer) {
1078 SmallPtrSet<Value *, 8> Visited;
1079 SmallVector<Value *> WorkList;
1080 WorkList.push_back(Elt: StartPtr);
1081
1082 while (!WorkList.empty()) {
1083 Value *Ptr = WorkList.pop_back_val();
1084 if (!Visited.insert(Ptr).second)
1085 continue;
1086 auto *PN = dyn_cast<PHINode>(Val: Ptr);
1087 // SCEV does not look through non-header PHIs inside the loop. Such phis
1088 // can be analyzed by adding separate accesses for each incoming pointer
1089 // value.
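    // For example, a non-header phi such as
    //   %ptr = phi ptr [ %a.gep, %then ], [ %b.gep, %else ]
    // results in %a.gep and %b.gep being added as two separate pointers.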
1090 if (PN && InnermostLoop.contains(BB: PN->getParent()) &&
1091 PN->getParent() != InnermostLoop.getHeader()) {
1092 llvm::append_range(C&: WorkList, R: PN->incoming_values());
1093 } else
1094 AddPointer(Ptr);
1095 }
1096}
1097
1098// Walk back through the IR for a pointer, looking for a select like the
1099// following:
1100//
1101// %offset = select i1 %cmp, i64 %a, i64 %b
1102// %addr = getelementptr double, double* %base, i64 %offset
1103// %ld = load double, double* %addr, align 8
1104//
1105// We won't be able to form a single SCEVAddRecExpr from this since the
1106// address for each loop iteration depends on %cmp. We could potentially
1107// produce multiple valid SCEVAddRecExprs, though, and check all of them for
1108// memory safety/aliasing if needed.
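//
// For the example above (double elements of size 8), the two candidate SCEVs
// produced for %addr would be roughly (%base + 8 * %a) and (%base + 8 * %b).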
1109//
1110// If we encounter some IR we don't yet handle, or something obviously fine
1111// like a constant, then we just add the SCEV for that term to the list passed
1112// in by the caller. If we have a node that may potentially yield a valid
1113// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
1114// ourselves before adding to the list.
1115static void findForkedSCEVs(
1116 ScalarEvolution *SE, const Loop *L, Value *Ptr,
1117 SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
1118 unsigned Depth) {
1119 // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
1120 // we've exceeded our limit on recursion, just return whatever we have
1121 // regardless of whether it can be used for a forked pointer or not, along
1122 // with an indication of whether it might be a poison or undef value.
1123 const SCEV *Scev = SE->getSCEV(V: Ptr);
1124 if (isa<SCEVAddRecExpr>(Val: Scev) || L->isLoopInvariant(V: Ptr) ||
1125 !isa<Instruction>(Val: Ptr) || Depth == 0) {
1126 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1127 return;
1128 }
1129
1130 Depth--;
1131
1132 auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
1133 return get<1>(Pair: S);
1134 };
1135
1136 auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
1137 switch (Opcode) {
1138 case Instruction::Add:
1139 return SE->getAddExpr(LHS: L, RHS: R);
1140 case Instruction::Sub:
1141 return SE->getMinusSCEV(LHS: L, RHS: R);
1142 default:
1143 llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
1144 }
1145 };
1146
1147 Instruction *I = cast<Instruction>(Val: Ptr);
1148 unsigned Opcode = I->getOpcode();
1149 switch (Opcode) {
1150 case Instruction::GetElementPtr: {
1151 auto *GEP = cast<GetElementPtrInst>(Val: I);
1152 Type *SourceTy = GEP->getSourceElementType();
1153 // We only handle base + single offset GEPs here for now.
1154 // Not dealing with preexisting gathers yet, so no vectors.
1155 if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
1156 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: GEP));
1157 break;
1158 }
1159 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
1160 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
1161 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: BaseScevs, Depth);
1162 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: OffsetScevs, Depth);
1163
1164 // See if we need to freeze our fork...
1165 bool NeedsFreeze = any_of(Range&: BaseScevs, P: UndefPoisonCheck) ||
1166 any_of(Range&: OffsetScevs, P: UndefPoisonCheck);
1167
1168 // Check that we only have a single fork, on either the base or the offset.
1169 // Copy the SCEV across for the one without a fork in order to generate
1170 // the full SCEV for both sides of the GEP.
1171 if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
1172 BaseScevs.push_back(Elt: BaseScevs[0]);
1173 else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
1174 OffsetScevs.push_back(Elt: OffsetScevs[0]);
1175 else {
1176 ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze);
1177 break;
1178 }
1179
1180 Type *IntPtrTy = SE->getEffectiveSCEVType(Ty: GEP->getPointerOperandType());
1181
1182 // Find the size of the type being pointed to. We only have a single
1183 // index term (guarded above) so we don't need to index into arrays or
1184 // structures, just get the size of the scalar value.
1185 const SCEV *Size = SE->getSizeOfExpr(IntTy: IntPtrTy, AllocTy: SourceTy);
1186
1187 for (auto [B, O] : zip(t&: BaseScevs, u&: OffsetScevs)) {
1188 const SCEV *Base = get<0>(Pair: B);
1189 const SCEV *Offset = get<0>(Pair: O);
1190
1191 // Scale up the offsets by the size of the type, then add to the bases.
1192 const SCEV *Scaled =
1193 SE->getMulExpr(LHS: Size, RHS: SE->getTruncateOrSignExtend(V: Offset, Ty: IntPtrTy));
1194 ScevList.emplace_back(Args: SE->getAddExpr(LHS: Base, RHS: Scaled), Args&: NeedsFreeze);
1195 }
1196 break;
1197 }
1198 case Instruction::Select: {
1199 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
1200 // A select means we've found a forked pointer, but we currently only
1201 // support a single select per pointer so if there's another behind this
1202 // then we just bail out and return the generic SCEV.
1203 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth);
1204 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 2), ScevList&: ChildScevs, Depth);
1205 if (ChildScevs.size() == 2)
1206 append_range(C&: ScevList, R&: ChildScevs);
1207 else
1208 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1209 break;
1210 }
1211 case Instruction::PHI: {
1212 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
1213 // A phi means we've found a forked pointer, but we currently only
1214 // support a single phi per pointer so if there's another behind this
1215 // then we just bail out and return the generic SCEV.
1216 if (I->getNumOperands() == 2) {
1217 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: ChildScevs, Depth);
1218 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth);
1219 }
1220 if (ChildScevs.size() == 2)
1221 append_range(C&: ScevList, R&: ChildScevs);
1222 else
1223 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1224 break;
1225 }
1226 case Instruction::Add:
1227 case Instruction::Sub: {
1228 SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs;
1229 SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs;
1230 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: LScevs, Depth);
1231 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: RScevs, Depth);
1232
1233 // See if we need to freeze our fork...
1234 bool NeedsFreeze =
1235 any_of(Range&: LScevs, P: UndefPoisonCheck) || any_of(Range&: RScevs, P: UndefPoisonCheck);
1236
1237 // Check that we only have a single fork, on either the left or right side.
1238 // Copy the SCEV across for the one without a fork in order to generate
1239 // the full SCEV for both sides of the BinOp.
1240 if (LScevs.size() == 2 && RScevs.size() == 1)
1241 RScevs.push_back(Elt: RScevs[0]);
1242 else if (RScevs.size() == 2 && LScevs.size() == 1)
1243 LScevs.push_back(Elt: LScevs[0]);
1244 else {
1245 ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze);
1246 break;
1247 }
1248
1249 for (auto [L, R] : zip(t&: LScevs, u&: RScevs))
1250 ScevList.emplace_back(Args: GetBinOpExpr(Opcode, get<0>(Pair: L), get<0>(Pair: R)),
1251 Args&: NeedsFreeze);
1252 break;
1253 }
1254 default:
1255 // Just return the current SCEV if we haven't handled the instruction yet.
1256 LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
1257 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1258 break;
1259 }
1260}
1261
1262bool AccessAnalysis::createCheckForAccess(
1263 RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy,
1264 const DenseMap<Value *, const SCEV *> &StridesMap,
1265 DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop,
1266 unsigned &RunningDepId, unsigned ASId, bool Assume) {
1267 Value *Ptr = Access.getPointer();
1268 ScalarEvolution *SE = PSE.getSE();
1269 assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
1270
1271 SmallVector<PointerIntPair<const SCEV *, 1, bool>> RTCheckPtrs;
1272 findForkedSCEVs(SE, L: TheLoop, Ptr, ScevList&: RTCheckPtrs, Depth: MaxForkedSCEVDepth);
1273 assert(!RTCheckPtrs.empty() &&
1274 "Must have some runtime-check pointer candidates");
1275
  // RTCheckPtrs must have size 2 if there are forked pointers. Otherwise,
  // there are no forked pointers; fall back to replaceSymbolicStrideSCEV in
  // that case.
1278 auto IsLoopInvariantOrAR =
1279 [&SE, &TheLoop](const PointerIntPair<const SCEV *, 1, bool> &P) {
1280 return SE->isLoopInvariant(S: P.getPointer(), L: TheLoop) ||
1281 isa<SCEVAddRecExpr>(Val: P.getPointer());
1282 };
1283 if (RTCheckPtrs.size() == 2 && all_of(Range&: RTCheckPtrs, P: IsLoopInvariantOrAR)) {
1284 LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n";
1285 for (const auto &[Idx, Q] : enumerate(RTCheckPtrs)) dbgs()
1286 << "\t(" << Idx << ") " << *Q.getPointer() << "\n");
1287 } else {
1288 RTCheckPtrs = {{replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr), false}};
1289 }
1290
1291 /// Check whether all pointers can participate in a runtime bounds check. They
1292 /// must either be invariant or non-wrapping affine AddRecs.
1293 for (auto &P : RTCheckPtrs) {
1294 // The bounds for loop-invariant pointer is trivial.
1295 if (SE->isLoopInvariant(S: P.getPointer(), L: TheLoop))
1296 continue;
1297
1298 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: P.getPointer());
1299 if (!AR && Assume)
1300 AR = PSE.getAsAddRec(V: Ptr);
1301 if (!AR || !AR->isAffine())
1302 return false;
1303
1304 // If there's only one option for Ptr, look it up after bounds and wrap
1305 // checking, because assumptions might have been added to PSE.
1306 if (RTCheckPtrs.size() == 1) {
1307 AR =
1308 cast<SCEVAddRecExpr>(Val: replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr));
1309 P.setPointer(AR);
1310 }
1311
1312 if (!isNoWrap(PSE, AR, Ptr: RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
1313 L: TheLoop, Assume, DT))
1314 return false;
1315 }
1316
1317 for (const auto &[PtrExpr, NeedsFreeze] : RTCheckPtrs) {
1318 // The id of the dependence set.
1319 unsigned DepId;
1320
1321 if (isDependencyCheckNeeded()) {
1322 Value *Leader = DepCands.getLeaderValue(V: Access).getPointer();
1323 unsigned &LeaderId = DepSetId[Leader];
1324 if (!LeaderId)
1325 LeaderId = RunningDepId++;
1326 DepId = LeaderId;
1327 } else
1328 // Each access has its own dependence set.
1329 DepId = RunningDepId++;
1330
1331 bool IsWrite = Access.getInt();
1332 RtCheck.insert(Lp: TheLoop, Ptr, PtrExpr, AccessTy, WritePtr: IsWrite, DepSetId: DepId, ASId, PSE,
1333 NeedsFreeze);
1334 LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
1335 }
1336
1337 return true;
1338}
1339
1340bool AccessAnalysis::canCheckPtrAtRT(
1341 RuntimePointerChecking &RtCheck, Loop *TheLoop,
1342 const DenseMap<Value *, const SCEV *> &StridesMap, Value *&UncomputablePtr,
1343 bool AllowPartial) {
1344 // Find pointers with computable bounds. We are going to use this information
1345 // to place a runtime bound check.
1346 bool CanDoRT = true;
1347
1348 bool MayNeedRTCheck = false;
1349 if (!IsRTCheckAnalysisNeeded) return true;
1350
1351 bool IsDepCheckNeeded = isDependencyCheckNeeded();
1352
1353 // We assign a consecutive id to access from different alias sets.
1354 // Accesses between different groups doesn't need to be checked.
1355 unsigned ASId = 0;
1356 for (const auto &AS : AST) {
1357 int NumReadPtrChecks = 0;
1358 int NumWritePtrChecks = 0;
1359 bool CanDoAliasSetRT = true;
1360 ++ASId;
1361 auto ASPointers = AS.getPointers();
1362
1363 // We assign consecutive id to access from different dependence sets.
1364 // Accesses within the same set don't need a runtime check.
1365 unsigned RunningDepId = 1;
1366 DenseMap<Value *, unsigned> DepSetId;
1367
1368 SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;
1369
1370 // First, count how many write and read accesses are in the alias set. Also
1371 // collect MemAccessInfos for later.
1372 SmallVector<MemAccessInfo, 4> AccessInfos;
1373 for (const Value *ConstPtr : ASPointers) {
1374 Value *Ptr = const_cast<Value *>(ConstPtr);
1375 bool IsWrite = Accesses.contains(Key: MemAccessInfo(Ptr, true));
1376 if (IsWrite)
1377 ++NumWritePtrChecks;
1378 else
1379 ++NumReadPtrChecks;
1380 AccessInfos.emplace_back(Args&: Ptr, Args&: IsWrite);
1381 }
1382
1383 // We do not need runtime checks for this alias set if there are no writes,
1384 // or if there is a single write and no reads.
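    // For instance, an alias set that contains only loads (say, reads of A[i]
    // and A[i + 1]), or a single store and no loads at all, cannot form a
    // write/write or read/write conflict, so it is skipped here.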
1385 if (NumWritePtrChecks == 0 ||
1386 (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
1387 assert((ASPointers.size() <= 1 ||
1388 all_of(ASPointers,
1389 [this](const Value *Ptr) {
1390 MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
1391 true);
1392 return !DepCands.contains(AccessWrite);
1393 })) &&
1394 "Can only skip updating CanDoRT below, if all entries in AS "
1395 "are reads or there is at most 1 entry");
1396 continue;
1397 }
1398
1399 for (auto &Access : AccessInfos) {
1400 for (const auto &AccessTy : Accesses[Access]) {
1401 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1402 DepSetId, TheLoop, RunningDepId, ASId,
1403 Assume: false)) {
1404 LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
1405 << *Access.getPointer() << '\n');
1406 Retries.emplace_back(Args&: Access, Args: AccessTy);
1407 CanDoAliasSetRT = false;
1408 }
1409 }
1410 }
1411
1412 // Note that this function computes CanDoRT and MayNeedRTCheck
1413 // independently. For example, CanDoRT=false, MayNeedRTCheck=false means that
1414 // we have a pointer for which we couldn't find the bounds, but we don't
1415 // actually need to emit any checks, so it does not matter.
1416 //
1417 // We need runtime checks for this alias set, if there are at least 2
1418 // dependence sets (in which case RunningDepId > 2) or if we need to re-try
1419 // any bound checks (because in that case the number of dependence sets is
1420 // incomplete).
1421 bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
1422
1423 // We need to perform run-time alias checks, but some pointers had bounds
1424 // that couldn't be checked.
1425 if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
1426 // Reset the CanDoSetRt flag and retry all accesses that have failed.
1427 // We know that we need these checks, so we can now be more aggressive
1428 // and add further checks if required (overflow checks).
1429 CanDoAliasSetRT = true;
1430 for (const auto &[Access, AccessTy] : Retries) {
1431 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1432 DepSetId, TheLoop, RunningDepId, ASId,
1433 /*Assume=*/true)) {
1434 CanDoAliasSetRT = false;
1435 UncomputablePtr = Access.getPointer();
1436 if (!AllowPartial)
1437 break;
1438 }
1439 }
1440 }
1441
1442 CanDoRT &= CanDoAliasSetRT;
1443 MayNeedRTCheck |= NeedsAliasSetRTCheck;
1445 }
1446
1447 // If the pointers that we would use for the bounds comparison have different
1448 // address spaces, assume the values aren't directly comparable, so we can't
1449 // use them for the runtime check. We also have to assume they could
1450 // overlap. In the future there should be metadata for whether address spaces
1451 // are disjoint.
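  // E.g. (illustrative): comparing a pointer in addrspace(1) against one in
  // addrspace(3) is not meaningful, so the bounds-based overlap check is
  // abandoned for the whole loop below.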
1452 unsigned NumPointers = RtCheck.Pointers.size();
1453 for (unsigned i = 0; i < NumPointers; ++i) {
1454 for (unsigned j = i + 1; j < NumPointers; ++j) {
1455 // Only need to check pointers between two different dependency sets.
1456 if (RtCheck.Pointers[i].DependencySetId ==
1457 RtCheck.Pointers[j].DependencySetId)
1458 continue;
1459 // Only need to check pointers in the same alias set.
1460 if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
1461 continue;
1462
1463 Value *PtrI = RtCheck.Pointers[i].PointerValue;
1464 Value *PtrJ = RtCheck.Pointers[j].PointerValue;
1465
1466 unsigned ASi = PtrI->getType()->getPointerAddressSpace();
1467 unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
1468 if (ASi != ASj) {
1469 LLVM_DEBUG(
1470 dbgs() << "LAA: Runtime check would require comparison between"
1471 " different address spaces\n");
1472 return false;
1473 }
1474 }
1475 }
1476
1477 if (MayNeedRTCheck && (CanDoRT || AllowPartial))
1478 RtCheck.generateChecks(DepCands, UseDependencies: IsDepCheckNeeded);
1479
1480 LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
1481 << " pointer comparisons.\n");
1482
1483 // If we can do run-time checks, but there are no checks, no runtime checks
1484 // are needed. This can happen when all pointers point to the same underlying
1485 // object for example.
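  // For instance (roughly): in "for (i) A[i] = A[i + 1];" both pointers share
  // the underlying object A, end up in the same dependence set, and thus no
  // pointer pair is emitted by generateChecks above.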
1486 RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
1487
1488 bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
1489 assert(CanDoRTIfNeeded == (CanDoRT || !MayNeedRTCheck) &&
1490 "CanDoRTIfNeeded depends on RtCheck.Need");
1491 if (!CanDoRTIfNeeded && !AllowPartial)
1492 RtCheck.reset();
1493 return CanDoRTIfNeeded;
1494}
1495
1496void AccessAnalysis::processMemAccesses() {
1497 // We process the set twice: first we process read-write pointers, last we
1498 // process read-only pointers. This allows us to skip dependence tests for
1499 // read-only pointers.
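  // Illustrative example: in "for (i) A[i] = B[i] + C[i];" the store to A is
  // handled in the first round; B and C (assuming they are recognized as
  // read-only) are deferred to the second round, where they are only checked
  // against the writes that have already been seen.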
1500
1501 LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
1502 LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
1503 LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
1504 LLVM_DEBUG({
1505 for (const auto &[A, _] : Accesses)
1506 dbgs() << "\t" << *A.getPointer() << " ("
1507 << (A.getInt()
1508 ? "write"
1509 : (ReadOnlyPtr.contains(A.getPointer()) ? "read-only"
1510 : "read"))
1511 << ")\n";
1512 });
1513
1514 // The AliasSetTracker has nicely partitioned our pointers by metadata
1515 // compatibility and potential for underlying-object overlap. As a result, we
1516 // only need to check for potential pointer dependencies within each alias
1517 // set.
1518 for (const auto &AS : AST) {
1519 // Note that both the alias-set tracker and the alias sets themselves use
1520 // ordered collections internally, so the iteration order here is
1521 // deterministic.
1522 auto ASPointers = AS.getPointers();
1523
1524 bool SetHasWrite = false;
1525
1526 // Map of (pointer to underlying objects, accessed address space) to last
1527 // access encountered.
1528 typedef DenseMap<std::pair<const Value *, unsigned>, MemAccessInfo>
1529 UnderlyingObjToAccessMap;
1530 UnderlyingObjToAccessMap ObjToLastAccess;
1531
1532 // Set of accesses to check after all writes have been processed.
1533 PtrAccessMap DeferredAccesses;
1534
1535 // Iterate over each alias set twice, once to process read/write pointers,
1536 // and then to process read-only pointers.
1537 for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
1538 bool UseDeferred = SetIteration > 0;
1539 PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
1540
1541 for (const Value *ConstPtr : ASPointers) {
1542 Value *Ptr = const_cast<Value *>(ConstPtr);
1543
1544 // For a single memory access in AliasSetTracker, Accesses may contain
1545 // both read and write, and they both need to be handled for CheckDeps.
1546 for (const auto &[AC, _] : S) {
1547 if (AC.getPointer() != Ptr)
1548 continue;
1549
1550 bool IsWrite = AC.getInt();
1551
1552 // If we're using the deferred access set, then it contains only
1553 // reads.
1554 bool IsReadOnlyPtr = ReadOnlyPtr.contains(Ptr) && !IsWrite;
1555 if (UseDeferred && !IsReadOnlyPtr)
1556 continue;
1557 // Otherwise, the pointer must be in the PtrAccessSet, either as a
1558 // read or a write.
1559 assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
1560 S.contains(MemAccessInfo(Ptr, false))) &&
1561 "Alias-set pointer not in the access set?");
1562
1563 MemAccessInfo Access(Ptr, IsWrite);
1564 DepCands.insert(Data: Access);
1565
1566 // Memorize read-only pointers for later processing and skip them in
1567 // the first round (they need to be checked after we have seen all
1568 // write pointers). Note: we also mark pointers that are not
1569 // consecutive as "read-only" pointers (so that we check
1570 // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1571 if (!UseDeferred && IsReadOnlyPtr) {
1572 // We only use the pointer keys; the type-vector values don't
1573 // matter.
1574 DeferredAccesses.insert(KV: {Access, {}});
1575 continue;
1576 }
1577
1578 // If this is a write, check other reads and writes for conflicts. If
1579 // this is a read, only check other writes for conflicts (but only if
1580 // there is no other write to the ptr - this is an optimization to
1581 // catch "a[i] = a[i] + " without having to do a dependence check).
1582 if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
1583 CheckDeps.push_back(Elt: Access);
1584 IsRTCheckAnalysisNeeded = true;
1585 }
1586
1587 if (IsWrite)
1588 SetHasWrite = true;
1589
1590 // Create sets of pointers connected by a shared alias set and
1591 // underlying object.
1592 SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
1593 UOs = {};
1594 ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI);
1595 LLVM_DEBUG(dbgs()
1596 << "Underlying objects for pointer " << *Ptr << "\n");
1597 for (const Value *UnderlyingObj : UOs) {
1598 // nullptr never aliases; don't join sets for pointers that have "null"
1599 // in their UnderlyingObjects list.
1600 if (isa<ConstantPointerNull>(Val: UnderlyingObj) &&
1601 !NullPointerIsDefined(
1602 F: TheLoop->getHeader()->getParent(),
1603 AS: UnderlyingObj->getType()->getPointerAddressSpace()))
1604 continue;
1605
1606 auto [It, Inserted] = ObjToLastAccess.try_emplace(
1607 Key: {UnderlyingObj,
1608 cast<PointerType>(Val: Ptr->getType())->getAddressSpace()},
1609 Args&: Access);
1610 if (!Inserted) {
1611 DepCands.unionSets(V1: Access, V2: It->second);
1612 It->second = Access;
1613 }
1614
1615 LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
1616 }
1617 }
1618 }
1619 }
1620 }
1621}
1622
1623/// Check whether the access through \p Ptr has a constant stride.
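/// For example (illustrative), for "for (i = 0; i < n; ++i) A[2 * i] = 0;"
/// with a 4-byte \p AccessTy the pointer SCEV is {A,+,8}, and the returned
/// stride is 2 (in units of \p AccessTy); a loop-invariant pointer yields 0,
/// and a pointer that is not an affine AddRec yields std::nullopt.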
1624std::optional<int64_t>
1625llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
1626 const Loop *Lp, const DominatorTree &DT,
1627 const DenseMap<Value *, const SCEV *> &StridesMap,
1628 bool Assume, bool ShouldCheckWrap) {
1629 const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr);
1630 if (PSE.getSE()->isLoopInvariant(S: PtrScev, L: Lp))
1631 return 0;
1632
1633 assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
1634
1635 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev);
1636 if (Assume && !AR)
1637 AR = PSE.getAsAddRec(V: Ptr);
1638
1639 if (!AR) {
1640 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
1641 << " SCEV: " << *PtrScev << "\n");
1642 return std::nullopt;
1643 }
1644
1645 std::optional<int64_t> Stride =
1646 getStrideFromAddRec(AR, Lp, AccessTy, Ptr, PSE);
1647 if (!ShouldCheckWrap || !Stride)
1648 return Stride;
1649
1650 if (isNoWrap(PSE, AR, Ptr, AccessTy, L: Lp, Assume, DT, Stride))
1651 return Stride;
1652
1653 LLVM_DEBUG(
1654 dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
1655 << *Ptr << " SCEV: " << *AR << "\n");
1656 return std::nullopt;
1657}
1658
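// Illustrative note on getPointersDiff: for PtrA = A and PtrB = &A[3] with a
// 4-byte element type, the byte offset is 12 and the returned distance is 3
// elements; with StrictCheck the byte distance must additionally be an exact
// multiple of the element size.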
1659std::optional<int64_t> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
1660 Type *ElemTyB, Value *PtrB,
1661 const DataLayout &DL,
1662 ScalarEvolution &SE,
1663 bool StrictCheck, bool CheckType) {
1664 assert(PtrA && PtrB && "Expected non-nullptr pointers.");
1665
1666 // Make sure that A and B are different pointers.
1667 if (PtrA == PtrB)
1668 return 0;
1669
1670 // Make sure that the element types are the same if required.
1671 if (CheckType && ElemTyA != ElemTyB)
1672 return std::nullopt;
1673
1674 unsigned ASA = PtrA->getType()->getPointerAddressSpace();
1675 unsigned ASB = PtrB->getType()->getPointerAddressSpace();
1676
1677 // Check that the address spaces match.
1678 if (ASA != ASB)
1679 return std::nullopt;
1680 unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA);
1681
1682 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
1683 const Value *PtrA1 = PtrA->stripAndAccumulateConstantOffsets(
1684 DL, Offset&: OffsetA, /*AllowNonInbounds=*/true);
1685 const Value *PtrB1 = PtrB->stripAndAccumulateConstantOffsets(
1686 DL, Offset&: OffsetB, /*AllowNonInbounds=*/true);
1687
1688 std::optional<int64_t> Val;
1689 if (PtrA1 == PtrB1) {
1690 // Retrieve the address space again as pointer stripping now tracks through
1691 // `addrspacecast`.
1692 ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace();
1693 ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace();
1694 // Check that the address spaces match and that the pointers are valid.
1695 if (ASA != ASB)
1696 return std::nullopt;
1697
1698 IdxWidth = DL.getIndexSizeInBits(AS: ASA);
1699 OffsetA = OffsetA.sextOrTrunc(width: IdxWidth);
1700 OffsetB = OffsetB.sextOrTrunc(width: IdxWidth);
1701
1702 OffsetB -= OffsetA;
1703 Val = OffsetB.trySExtValue();
1704 } else {
1705 // Otherwise compute the distance with SCEV between the base pointers.
1706 const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA);
1707 const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB);
1708 std::optional<APInt> Diff =
1709 SE.computeConstantDifference(LHS: PtrSCEVB, RHS: PtrSCEVA);
1710 if (!Diff)
1711 return std::nullopt;
1712 Val = Diff->trySExtValue();
1713 }
1714
1715 if (!Val)
1716 return std::nullopt;
1717
1718 int64_t Size = DL.getTypeStoreSize(Ty: ElemTyA);
1719 int64_t Dist = *Val / Size;
1720
1721 // Ensure that the calculated distance matches the type-based one after all
1722 // bitcasts have been removed from the provided pointers.
1723 if (!StrictCheck || Dist * Size == Val)
1724 return Dist;
1725 return std::nullopt;
1726}
1727
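// Illustrative sketch of sortPtrAccesses: given VL = {A+8, A, A+4} (byte
// offsets past A) with a 4-byte ElemTy, the offsets relative to VL[0] are
// {0, -2, -1} elements; the pointers are not already in increasing-address
// order, so SortedIndices is filled with {1, 2, 0}, i.e. the order A, A+4, A+8.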
1728bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
1729 const DataLayout &DL, ScalarEvolution &SE,
1730 SmallVectorImpl<unsigned> &SortedIndices) {
1731 assert(llvm::all_of(
1732 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1733 "Expected list of pointer operands.");
1734 // Walk over the pointers, and map each of them to an offset relative to
1735 // the first pointer in the array.
1736 Value *Ptr0 = VL[0];
1737
1738 using DistOrdPair = std::pair<int64_t, unsigned>;
1739 auto Compare = llvm::less_first();
1740 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
1741 Offsets.emplace(args: 0, args: 0);
1742 bool IsConsecutive = true;
1743 for (auto [Idx, Ptr] : drop_begin(RangeOrContainer: enumerate(First&: VL))) {
1744 std::optional<int64_t> Diff =
1745 getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE,
1746 /*StrictCheck=*/true);
1747 if (!Diff)
1748 return false;
1749
1750 // Check whether a pointer with the same offset has already been seen.
1751 int64_t Offset = *Diff;
1752 auto [It, IsInserted] = Offsets.emplace(args&: Offset, args&: Idx);
1753 if (!IsInserted)
1754 return false;
1755 // Consecutive order if the inserted element is the last one.
1756 IsConsecutive &= std::next(x: It) == Offsets.end();
1757 }
1758 SortedIndices.clear();
1759 if (!IsConsecutive) {
1760 // Fill SortedIndices array only if it is non-consecutive.
1761 SortedIndices.resize(N: VL.size());
1762 for (auto [Idx, Off] : enumerate(First&: Offsets))
1763 SortedIndices[Idx] = Off.second;
1764 }
1765 return true;
1766}
1767
1768/// Returns true if the memory operations \p A and \p B are consecutive.
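/// For example (illustrative), two i32 accesses through &A[i] and &A[i + 1]
/// have a pointer difference of exactly one element and are reported as
/// consecutive; a difference of 0 or 2 elements is not.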
1769bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
1770 ScalarEvolution &SE, bool CheckType) {
1771 Value *PtrA = getLoadStorePointerOperand(V: A);
1772 Value *PtrB = getLoadStorePointerOperand(V: B);
1773 if (!PtrA || !PtrB)
1774 return false;
1775 Type *ElemTyA = getLoadStoreType(I: A);
1776 Type *ElemTyB = getLoadStoreType(I: B);
1777 std::optional<int64_t> Diff =
1778 getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
1779 /*StrictCheck=*/true, CheckType);
1780 return Diff == 1;
1781}
1782
1783void MemoryDepChecker::addAccess(StoreInst *SI) {
1784 visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop,
1785 AddPointer: [this, SI](Value *Ptr) {
1786 Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx);
1787 InstMap.push_back(Elt: SI);
1788 ++AccessIdx;
1789 });
1790}
1791
1792void MemoryDepChecker::addAccess(LoadInst *LI) {
1793 visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop,
1794 AddPointer: [this, LI](Value *Ptr) {
1795 Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx);
1796 InstMap.push_back(Elt: LI);
1797 ++AccessIdx;
1798 });
1799}
1800
1801MemoryDepChecker::VectorizationSafetyStatus
1802MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
1803 switch (Type) {
1804 case NoDep:
1805 case Forward:
1806 case BackwardVectorizable:
1807 return VectorizationSafetyStatus::Safe;
1808
1809 case Unknown:
1810 return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
1811 case ForwardButPreventsForwarding:
1812 case Backward:
1813 case BackwardVectorizableButPreventsForwarding:
1814 case IndirectUnsafe:
1815 return VectorizationSafetyStatus::Unsafe;
1816 }
1817 llvm_unreachable("unexpected DepType!");
1818}
1819
1820bool MemoryDepChecker::Dependence::isBackward() const {
1821 switch (Type) {
1822 case NoDep:
1823 case Forward:
1824 case ForwardButPreventsForwarding:
1825 case Unknown:
1826 case IndirectUnsafe:
1827 return false;
1828
1829 case BackwardVectorizable:
1830 case Backward:
1831 case BackwardVectorizableButPreventsForwarding:
1832 return true;
1833 }
1834 llvm_unreachable("unexpected DepType!");
1835}
1836
1837bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
1838 return isBackward() || Type == Unknown || Type == IndirectUnsafe;
1839}
1840
1841bool MemoryDepChecker::Dependence::isForward() const {
1842 switch (Type) {
1843 case Forward:
1844 case ForwardButPreventsForwarding:
1845 return true;
1846
1847 case NoDep:
1848 case Unknown:
1849 case BackwardVectorizable:
1850 case Backward:
1851 case BackwardVectorizableButPreventsForwarding:
1852 case IndirectUnsafe:
1853 return false;
1854 }
1855 llvm_unreachable("unexpected DepType!");
1856}
1857
1858bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
1859 uint64_t TypeByteSize,
1860 unsigned CommonStride) {
1861 // If loads occur at a distance that is not a multiple of a feasible vector
1862 // factor, store-load forwarding does not take place.
1863 // Positive dependences might cause trouble because vectorizing them might
1864 // prevent store-load forwarding, making vectorized code run a lot slower.
1865 // a[i] = a[i-3] ^ a[i-8];
1866 // The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
1867 // hence on a typical architecture store-load forwarding does not take
1868 // place. Vectorizing in such cases does not make sense.
1869 // Store-load forwarding distance.
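  // Worked example (illustrative, assuming the caps computed below are larger
  // than 8 bytes): with TypeByteSize = 4 and Distance = 12, the first
  // candidate VF is 2 * 4 = 8 bytes; 12 is not a multiple of 8 and 12 / 8 is
  // far below the 8 * 4 = 32 iteration threshold, so the maximum safe VF drops
  // below 2 * TypeByteSize and a potential forwarding conflict is reported.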
1870
1871 // After this many iterations store-to-load forwarding conflicts should not
1872 // cause any slowdowns.
1873 const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
1874 // Maximum vector factor.
1875 uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
1876 std::min(a: VectorizerParams::MaxVectorWidth * TypeByteSize,
1877 b: MaxStoreLoadForwardSafeDistanceInBits);
1878
1879 // Compute the smallest VF at which the store and load would be misaligned.
1880 for (uint64_t VF = 2 * TypeByteSize;
1881 VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
1882 // If the number of vector iterations between the store and the load is
1883 // small, we could incur conflicts.
1884 if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
1885 MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
1886 break;
1887 }
1888 }
1889
1890 if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) {
1891 LLVM_DEBUG(
1892 dbgs() << "LAA: Distance " << Distance
1893 << " that could cause a store-load forwarding conflict\n");
1894 return true;
1895 }
1896
1897 if (CommonStride &&
1898 MaxVFWithoutSLForwardIssuesPowerOf2 <
1899 MaxStoreLoadForwardSafeDistanceInBits &&
1900 MaxVFWithoutSLForwardIssuesPowerOf2 !=
1901 VectorizerParams::MaxVectorWidth * TypeByteSize) {
1902 uint64_t MaxVF =
1903 bit_floor(Value: MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride);
1904 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
1905 MaxStoreLoadForwardSafeDistanceInBits =
1906 std::min(a: MaxStoreLoadForwardSafeDistanceInBits, b: MaxVFInBits);
1907 }
1908 return false;
1909}
1910
1911void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1912 if (Status < S)
1913 Status = S;
1914}
1915
1916/// Given a dependence-distance \p Dist between two memory accesses that have
1917/// strides in the same direction, with the maximum absolute stride given in
1918/// \p MaxStride, in a loop whose maximum backedge taken count is \p
1919/// MaxBTC, check if it is possible to prove statically that the dependence
1920/// distance is larger than the range that the accesses will travel through the
1921/// execution of the loop. If so, return true; false otherwise. This is useful
1922/// for example in loops such as the following (PR31098):
1923///
1924/// for (i = 0; i < D; ++i) {
1925/// = out[i];
1926/// out[i+D] =
1927/// }
1928static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
1929 const SCEV &MaxBTC, const SCEV &Dist,
1930 uint64_t MaxStride) {
1931
1932 // If we can prove that
1933 // (**) |Dist| > MaxBTC * Step
1934 // where Step is the absolute stride of the memory accesses in bytes,
1935 // then there is no dependence.
1936 //
1937 // Rationale:
1938 // We basically want to check if the absolute distance (|Dist/Step|)
1939 // is >= the loop iteration count (or > MaxBTC).
1940 // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
1941 // Section 4.2.1); Note, that for vectorization it is sufficient to prove
1942 // that the dependence distance is >= VF; This is checked elsewhere.
1943 // But in some cases we can prune dependence distances early, and
1944 // even before selecting the VF, and without a runtime test, by comparing
1945 // the distance against the loop iteration count. Since the vectorized code
1946 // will be executed only if LoopCount >= VF, proving distance >= LoopCount
1947 // also guarantees that distance >= VF.
1948 //
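  // Worked example (illustrative): with MaxStride = 4 bytes and MaxBTC = 7,
  // the accesses sweep at most 7 * 4 = 28 bytes, so a known distance of, say,
  // 64 bytes satisfies (**) and the accesses are independent for any VF.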
1949 const SCEV *Step = SE.getConstant(Ty: MaxBTC.getType(), V: MaxStride);
1950 const SCEV *Product = SE.getMulExpr(LHS: &MaxBTC, RHS: Step);
1951
1952 const SCEV *CastedDist = &Dist;
1953 const SCEV *CastedProduct = Product;
1954 uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType());
1955 uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType());
1956
1957 // The dependence distance can be positive/negative, so we sign extend Dist;
1958 // The multiplication of the absolute stride in bytes and the
1959 // backedgeTakenCount is non-negative, so we zero extend Product.
1960 if (DistTypeSizeBits > ProductTypeSizeBits)
1961 CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType());
1962 else
1963 CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType());
1964
1965 // Is Dist - (MaxBTC * Step) > 0 ?
1966 // (If so, then we have proven (**) because |Dist| >= Dist)
1967 const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct);
1968 if (SE.isKnownPositive(S: Minus))
1969 return true;
1970
1971 // Second try: Is -Dist - (MaxBTC * Step) > 0 ?
1972 // (If so, then we have proven (**) because |Dist| >= -1*Dist)
1973 const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist);
1974 Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct);
1975 return SE.isKnownPositive(S: Minus);
1976}
1977
1978/// Check the dependence for two accesses with the same stride \p Stride.
1979/// \p Distance is the positive distance in bytes, and \p TypeByteSize is type
1980/// size in bytes.
1981///
1982/// \returns true if they are independent.
1983static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
1984 uint64_t TypeByteSize) {
1985 assert(Stride > 1 && "The stride must be greater than 1");
1986 assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
1987 assert(Distance > 0 && "The distance must be non-zero");
1988
1989 // Skip if the distance is not multiple of type byte size.
1990 if (Distance % TypeByteSize)
1991 return false;
1992
1993 // No dependence if the distance is not multiple of the stride.
1994 // E.g.
1995 // for (i = 0; i < 1024 ; i += 4)
1996 // A[i+2] = A[i] + 1;
1997 //
1998 // Two accesses in memory (distance is 2, stride is 4):
1999 // | A[0] | | | | A[4] | | | |
2000 // | | | A[2] | | | | A[6] | |
2001 //
2002 // E.g.
2003 // for (i = 0; i < 1024 ; i += 3)
2004 // A[i+4] = A[i] + 1;
2005 //
2006 // Two accesses in memory (distance is 4, stride is 3):
2007 // | A[0] | | | A[3] | | | A[6] | | |
2008 // | | | | | A[4] | | | A[7] | |
2009 return Distance % Stride;
2010}
2011
2012bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src,
2013 Type *SrcTy,
2014 const SCEV *Sink,
2015 Type *SinkTy) {
2016 const SCEV *BTC = PSE.getBackedgeTakenCount();
2017 const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
2018 ScalarEvolution &SE = *PSE.getSE();
2019 const auto &[SrcStart_, SrcEnd_] =
2020 getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Src, AccessTy: SrcTy, BTC, MaxBTC: SymbolicMaxBTC,
2021 SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards);
2022 if (isa<SCEVCouldNotCompute>(Val: SrcStart_) || isa<SCEVCouldNotCompute>(Val: SrcEnd_))
2023 return false;
2024
2025 const auto &[SinkStart_, SinkEnd_] =
2026 getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Sink, AccessTy: SinkTy, BTC, MaxBTC: SymbolicMaxBTC,
2027 SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards);
2028 if (isa<SCEVCouldNotCompute>(Val: SinkStart_) ||
2029 isa<SCEVCouldNotCompute>(Val: SinkEnd_))
2030 return false;
2031
2032 if (!LoopGuards)
2033 LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE));
2034
2035 auto SrcEnd = SE.applyLoopGuards(Expr: SrcEnd_, Guards: *LoopGuards);
2036 auto SinkStart = SE.applyLoopGuards(Expr: SinkStart_, Guards: *LoopGuards);
2037 if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SrcEnd, RHS: SinkStart))
2038 return true;
2039
2040 auto SinkEnd = SE.applyLoopGuards(Expr: SinkEnd_, Guards: *LoopGuards);
2041 auto SrcStart = SE.applyLoopGuards(Expr: SrcStart_, Guards: *LoopGuards);
2042 return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SinkEnd, RHS: SrcStart);
2043}
2044
2045std::variant<MemoryDepChecker::Dependence::DepType,
2046 MemoryDepChecker::DepDistanceStrideAndSizeInfo>
2047MemoryDepChecker::getDependenceDistanceStrideAndSize(
2048 const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
2049 const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) {
2050 const auto &DL = InnermostLoop->getHeader()->getDataLayout();
2051 auto &SE = *PSE.getSE();
2052 const auto &[APtr, AIsWrite] = A;
2053 const auto &[BPtr, BIsWrite] = B;
2054
2055 // Two reads are independent.
2056 if (!AIsWrite && !BIsWrite)
2057 return MemoryDepChecker::Dependence::NoDep;
2058
2059 Type *ATy = getLoadStoreType(I: AInst);
2060 Type *BTy = getLoadStoreType(I: BInst);
2061
2062 // We cannot check pointers in different address spaces.
2063 if (APtr->getType()->getPointerAddressSpace() !=
2064 BPtr->getType()->getPointerAddressSpace())
2065 return MemoryDepChecker::Dependence::Unknown;
2066
2067 std::optional<int64_t> StrideAPtr = getPtrStride(
2068 PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true);
2069 std::optional<int64_t> StrideBPtr = getPtrStride(
2070 PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true);
2071
2072 const SCEV *Src = PSE.getSCEV(V: APtr);
2073 const SCEV *Sink = PSE.getSCEV(V: BPtr);
2074
2075 // If the induction step is negative we have to invert source and sink of the
2076 // dependence when measuring the distance between them. We should not swap
2077 // AIsWrite with BIsWrite, as their uses expect them in program order.
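  // E.g. (illustrative): in "for (i = n; i > 0; --i) A[i - 1] = A[i];" both
  // pointer strides are negative; swapping Src and Sink measures the distance
  // in the direction the accesses actually move through memory, while the
  // write/read roles stay attached to the original program-order accesses.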
2078 if (StrideAPtr && *StrideAPtr < 0) {
2079 std::swap(a&: Src, b&: Sink);
2080 std::swap(a&: AInst, b&: BInst);
2081 std::swap(a&: ATy, b&: BTy);
2082 std::swap(lhs&: StrideAPtr, rhs&: StrideBPtr);
2083 }
2084
2085 const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src);
2086
2087 LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
2088 << "\n");
2089 LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
2090 << ": " << *Dist << "\n");
2091
2092 // Need accesses with constant strides and the same direction for further
2093 // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and
2094 // similar code or pointer arithmetic that could wrap in the address space.
2095
2096 // If either Src or Sink are not strided (i.e. not a non-wrapping AddRec) and
2097 // not loop-invariant (stride will be 0 in that case), we cannot analyze the
2098 // dependence further and also cannot generate runtime checks.
2099 if (!StrideAPtr || !StrideBPtr) {
2100 LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
2101 return MemoryDepChecker::Dependence::IndirectUnsafe;
2102 }
2103
2104 int64_t StrideAPtrInt = *StrideAPtr;
2105 int64_t StrideBPtrInt = *StrideBPtr;
2106 LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
2107 << " Sink induction step: " << StrideBPtrInt << "\n");
2108 // At least one of Src or Sink is loop invariant and the other is strided or
2109 // invariant. We can generate a runtime check to disambiguate the accesses.
2110 if (!StrideAPtrInt || !StrideBPtrInt)
2111 return MemoryDepChecker::Dependence::Unknown;
2112
2113 // Both Src and Sink have a constant stride, check if they are in the same
2114 // direction.
2115 if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) {
2116 LLVM_DEBUG(
2117 dbgs() << "Pointer access with strides in different directions\n");
2118 return MemoryDepChecker::Dependence::Unknown;
2119 }
2120
2121 TypeSize AStoreSz = DL.getTypeStoreSize(Ty: ATy);
2122 TypeSize BStoreSz = DL.getTypeStoreSize(Ty: BTy);
2123
2124 // If store sizes are not the same, set TypeByteSize to zero, so we can check
2125 // it in the caller isDependent.
2126 uint64_t ASz = DL.getTypeAllocSize(Ty: ATy);
2127 uint64_t BSz = DL.getTypeAllocSize(Ty: BTy);
2128 uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0;
2129
2130 uint64_t StrideAScaled = std::abs(i: StrideAPtrInt) * ASz;
2131 uint64_t StrideBScaled = std::abs(i: StrideBPtrInt) * BSz;
2132
2133 uint64_t MaxStride = std::max(a: StrideAScaled, b: StrideBScaled);
2134
2135 std::optional<uint64_t> CommonStride;
2136 if (StrideAScaled == StrideBScaled)
2137 CommonStride = StrideAScaled;
2138
2139 // TODO: Historically, we didn't retry with runtime checks when (unscaled)
2140 // strides were different but there is no inherent reason to.
2141 if (!isa<SCEVConstant>(Val: Dist))
2142 ShouldRetryWithRuntimeChecks |= StrideAPtrInt == StrideBPtrInt;
2143
2144 // If distance is a SCEVCouldNotCompute, return Unknown immediately.
2145 if (isa<SCEVCouldNotCompute>(Val: Dist)) {
2146 LLVM_DEBUG(dbgs() << "LAA: Uncomputable distance.\n");
2147 return Dependence::Unknown;
2148 }
2149
2150 return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
2151 TypeByteSize, AIsWrite, BIsWrite);
2152}
2153
2154MemoryDepChecker::Dependence::DepType
2155MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2156 const MemAccessInfo &B, unsigned BIdx) {
2157 assert(AIdx < BIdx && "Must pass arguments in program order");
2158
2159 // Check if we can prove that Sink only accesses memory after Src's end or
2160 // vice versa. The helper is used to perform the checks only on the exit paths
2161 // where it helps to improve the analysis result.
2162 auto CheckCompletelyBeforeOrAfter = [&]() {
2163 auto *APtr = A.getPointer();
2164 auto *BPtr = B.getPointer();
2165 Type *ATy = getLoadStoreType(I: InstMap[AIdx]);
2166 Type *BTy = getLoadStoreType(I: InstMap[BIdx]);
2167 const SCEV *Src = PSE.getSCEV(V: APtr);
2168 const SCEV *Sink = PSE.getSCEV(V: BPtr);
2169 return areAccessesCompletelyBeforeOrAfter(Src, SrcTy: ATy, Sink, SinkTy: BTy);
2170 };
2171
2172 // Get the dependence distance, stride, type size and what access writes for
2173 // the dependence between A and B.
2174 auto Res =
2175 getDependenceDistanceStrideAndSize(A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx]);
2176 if (std::holds_alternative<Dependence::DepType>(v: Res)) {
2177 if (std::get<Dependence::DepType>(v&: Res) == Dependence::Unknown &&
2178 CheckCompletelyBeforeOrAfter())
2179 return Dependence::NoDep;
2180 return std::get<Dependence::DepType>(v&: Res);
2181 }
2182
2183 auto &[Dist, MaxStride, CommonStride, TypeByteSize, AIsWrite, BIsWrite] =
2184 std::get<DepDistanceStrideAndSizeInfo>(v&: Res);
2185 bool HasSameSize = TypeByteSize > 0;
2186
2187 ScalarEvolution &SE = *PSE.getSE();
2188 auto &DL = InnermostLoop->getHeader()->getDataLayout();
2189
2190 // If the distance between the accesses is larger than their maximum absolute
2191 // stride multiplied by the symbolic maximum backedge taken count (which is an
2192 // upper bound on the number of iterations), the accesses are independent, i.e.
2193 // they are far enough apart that they won't access the same location
2194 // across all loop iterations.
2195 if (HasSameSize &&
2196 isSafeDependenceDistance(
2197 DL, SE, MaxBTC: *(PSE.getSymbolicMaxBackedgeTakenCount()), Dist: *Dist, MaxStride))
2198 return Dependence::NoDep;
2199
2200 // The rest of this function relies on ConstDist being at most 64-bits, which
2201 // is checked earlier. Will assert if the calling code changes.
2202 const APInt *APDist = nullptr;
2203 uint64_t ConstDist =
2204 match(S: Dist, P: m_scev_APInt(C&: APDist)) ? APDist->abs().getZExtValue() : 0;
2205
2206 // Attempt to prove strided accesses independent.
2207 if (APDist) {
2208 // If the distance between accesses and their strides are known constants,
2209 // check whether the accesses interlace each other.
2210 if (ConstDist > 0 && CommonStride && CommonStride > 1 && HasSameSize &&
2211 areStridedAccessesIndependent(Distance: ConstDist, Stride: *CommonStride, TypeByteSize)) {
2212 LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
2213 return Dependence::NoDep;
2214 }
2215 } else {
2216 if (!LoopGuards)
2217 LoopGuards.emplace(
2218 args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE));
2219 Dist = SE.applyLoopGuards(Expr: Dist, Guards: *LoopGuards);
2220 }
2221
2222 // Negative distances are not plausible dependencies.
2223 if (SE.isKnownNonPositive(S: Dist)) {
2224 if (SE.isKnownNonNegative(S: Dist)) {
2225 if (HasSameSize) {
2226 // Write to the same location with the same size.
2227 return Dependence::Forward;
2228 }
2229 LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
2230 "different type sizes\n");
2231 return Dependence::Unknown;
2232 }
2233
2234 bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2235 // Check if the first access writes to a location that is read in a later
2236 // iteration, where the distance between them is not a multiple of a vector
2237 // factor and relatively small.
2238 //
2239 // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to
2240 // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a
2241 // forward dependency will allow vectorization using any width.
2242
2243 if (IsTrueDataDependence && EnableForwardingConflictDetection) {
2244 if (!ConstDist) {
2245 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2246 : Dependence::Unknown;
2247 }
2248 if (!HasSameSize ||
2249 couldPreventStoreLoadForward(Distance: ConstDist, TypeByteSize)) {
2250 LLVM_DEBUG(
2251 dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
2252 return Dependence::ForwardButPreventsForwarding;
2253 }
2254 }
2255
2256 LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
2257 return Dependence::Forward;
2258 }
2259
2260 int64_t MinDistance = SE.getSignedRangeMin(S: Dist).getSExtValue();
2261 // Below we only handle strictly positive distances.
2262 if (MinDistance <= 0) {
2263 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2264 : Dependence::Unknown;
2265 }
2266
2267 if (!HasSameSize) {
2268 if (CheckCompletelyBeforeOrAfter())
2269 return Dependence::NoDep;
2270 LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2271 "different type sizes\n");
2272 return Dependence::Unknown;
2273 }
2274 // Bail out early if passed-in parameters make vectorization not feasible.
2275 unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
2276 VectorizerParams::VectorizationFactor : 1);
2277 unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
2278 VectorizerParams::VectorizationInterleave : 1);
2279 // The minimum number of iterations for a vectorized/unrolled version.
2280 unsigned MinNumIter = std::max(a: ForcedFactor * ForcedUnroll, b: 2U);
2281
2282 // It's not vectorizable if the distance is smaller than the minimum distance
2283 // needed for a vectorized/unrolled version. Vectorizing one iteration in
2284 // front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
2285 // (No need to add the last gap distance).
2286 //
2287 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2288 // foo(int *A) {
2289 // int *B = (int *)((char *)A + 14);
2290 // for (i = 0 ; i < 1024 ; i += 2)
2291 // B[i] = A[i] + 1;
2292 // }
2293 //
2294 // Two accesses in memory (stride is 4 * 2):
2295 // | A[0] | | A[2] | | A[4] | | A[6] | |
2296 // | B[0] | | B[2] | | B[4] |
2297 //
2298 // MinDistance needed for vectorizing all iterations except the last:
2299 // 4 * 2 * (MinNumIter - 1). MinDistance needed for the last iteration: 4.
2300 // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
2301 //
2302 // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
2303 // 12, which is less than distance.
2304 //
2305 // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
2306 // the minimum distance needed is 28, which is greater than distance. It is
2307 // not safe to do vectorization.
2308 //
2309 // We use MaxStride (maximum of src and sink strides) to get a conservative
2310 // lower bound on the MinDistanceNeeded in case of different strides.
2311
2312 // We know that Dist is positive, but it may not be constant. Use the signed
2313 // minimum for computations below, as this ensures we compute the closest
2314 // possible dependence distance.
2315 uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
2316 if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
2317 if (!ConstDist) {
2318 // For non-constant distances, we checked the lower bound of the
2319 // dependence distance and the distance may be larger at runtime (and safe
2320 // for vectorization). Classify it as Unknown, so we re-try with runtime
2321 // checks, unless we can prove both accesses cannot overlap.
2322 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2323 : Dependence::Unknown;
2324 }
2325 LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance "
2326 << MinDistance << '\n');
2327 return Dependence::Backward;
2328 }
2329
2330 // Unsafe if the minimum distance needed is greater than the smallest
2331 // dependence distance.
2332 if (MinDistanceNeeded > MinDepDistBytes) {
2333 LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
2334 << MinDistanceNeeded << " size in bytes\n");
2335 return Dependence::Backward;
2336 }
2337
2338 MinDepDistBytes =
2339 std::min(a: static_cast<uint64_t>(MinDistance), b: MinDepDistBytes);
2340
2341 bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
2342 if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist &&
2343 couldPreventStoreLoadForward(Distance: MinDistance, TypeByteSize, CommonStride: *CommonStride))
2344 return Dependence::BackwardVectorizableButPreventsForwarding;
2345
2346 uint64_t MaxVF = MinDepDistBytes / MaxStride;
2347 LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
2348 << " with max VF = " << MaxVF << '\n');
2349
2350 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2351 if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) {
2352 // For non-constant distances, we checked the lower bound of the dependence
2353 // distance and the distance may be larger at runtime (and safe for
2354 // vectorization). Classify it as Unknown, so we re-try with runtime checks,
2355 // unless we can prove both accesses cannot overlap.
2356 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2357 : Dependence::Unknown;
2358 }
2359
2360 if (CheckCompletelyBeforeOrAfter())
2361 return Dependence::NoDep;
2362
2363 MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits);
2364 return Dependence::BackwardVectorizable;
2365}
2366
2367bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands,
2368 const MemAccessInfoList &CheckDeps) {
2369
2370 MinDepDistBytes = -1;
2371 SmallPtrSet<MemAccessInfo, 8> Visited;
2372 for (MemAccessInfo CurAccess : CheckDeps) {
2373 if (Visited.contains(Ptr: CurAccess))
2374 continue;
2375
2376 // Check accesses within this set.
2377 EquivalenceClasses<MemAccessInfo>::member_iterator AI =
2378 DepCands.findLeader(V: CurAccess);
2379 EquivalenceClasses<MemAccessInfo>::member_iterator AE =
2380 DepCands.member_end();
2381
2382 // Check every access pair.
2383 while (AI != AE) {
2384 Visited.insert(Ptr: *AI);
2385 bool AIIsWrite = AI->getInt();
2386 // Check loads only against the next equivalence class, but stores also
2387 // against other stores in the same equivalence class - to the same address.
2388 EquivalenceClasses<MemAccessInfo>::member_iterator OI =
2389 (AIIsWrite ? AI : std::next(x: AI));
2390 while (OI != AE) {
2391 // Check every accessing instruction pair in program order.
2392 auto &Acc = Accesses[*AI];
2393 for (std::vector<unsigned>::iterator I1 = Acc.begin(), I1E = Acc.end();
2394 I1 != I1E; ++I1)
2395 // Scan all accesses of another equivalence class, but only the next
2396 // accesses of the same equivalence class.
2397 for (std::vector<unsigned>::iterator
2398 I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()),
2399 I2E = (OI == AI ? I1E : Accesses[*OI].end());
2400 I2 != I2E; ++I2) {
2401 auto A = std::make_pair(x: &*AI, y&: *I1);
2402 auto B = std::make_pair(x: &*OI, y&: *I2);
2403
2404 assert(*I1 != *I2);
2405 if (*I1 > *I2)
2406 std::swap(x&: A, y&: B);
2407
2408 Dependence::DepType Type =
2409 isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second);
2410 mergeInStatus(S: Dependence::isSafeForVectorization(Type));
2411
2412 // Gather dependences unless we accumulated MaxDependences
2413 // dependences. In that case return as soon as we find the first
2414 // unsafe dependence. This puts a limit on this quadratic
2415 // algorithm.
2416 if (RecordDependences) {
2417 if (Type != Dependence::NoDep)
2418 Dependences.emplace_back(Args&: A.second, Args&: B.second, Args&: Type);
2419
2420 if (Dependences.size() >= MaxDependences) {
2421 RecordDependences = false;
2422 Dependences.clear();
2423 LLVM_DEBUG(dbgs()
2424 << "Too many dependences, stopped recording\n");
2425 }
2426 }
2427 if (!RecordDependences && !isSafeForVectorization())
2428 return false;
2429 }
2430 ++OI;
2431 }
2432 ++AI;
2433 }
2434 }
2435
2436 LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
2437 return isSafeForVectorization();
2438}
2439
2440SmallVector<Instruction *, 4>
2441MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const {
2442 MemAccessInfo Access(Ptr, IsWrite);
2443 auto I = Accesses.find(Val: Access);
2444 SmallVector<Instruction *, 4> Insts;
2445 if (I != Accesses.end()) {
2446 transform(Range: I->second, d_first: std::back_inserter(x&: Insts),
2447 F: [&](unsigned Idx) { return this->InstMap[Idx]; });
2448 }
2449
2450 return Insts;
2451}
2452
2453const char *MemoryDepChecker::Dependence::DepName[] = {
2454 "NoDep",
2455 "Unknown",
2456 "IndirectUnsafe",
2457 "Forward",
2458 "ForwardButPreventsForwarding",
2459 "Backward",
2460 "BackwardVectorizable",
2461 "BackwardVectorizableButPreventsForwarding"};
2462
2463void MemoryDepChecker::Dependence::print(
2464 raw_ostream &OS, unsigned Depth,
2465 const SmallVectorImpl<Instruction *> &Instrs) const {
2466 OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n";
2467 OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n";
2468 OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n";
2469}
2470
2471bool LoopAccessInfo::canAnalyzeLoop() {
2472 // We need to have a loop header.
2473 LLVM_DEBUG(dbgs() << "\nLAA: Checking a loop in '"
2474 << TheLoop->getHeader()->getParent()->getName() << "' from "
2475 << TheLoop->getLocStr() << "\n");
2476
2477 // We can only analyze innermost loops.
2478 if (!TheLoop->isInnermost()) {
2479 LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
2480 recordAnalysis(RemarkName: "NotInnerMostLoop") << "loop is not the innermost loop";
2481 return false;
2482 }
2483
2484 // We must have a single backedge.
2485 if (TheLoop->getNumBackEdges() != 1) {
2486 LLVM_DEBUG(
2487 dbgs() << "LAA: loop control flow is not understood by analyzer\n");
2488 recordAnalysis(RemarkName: "CFGNotUnderstood")
2489 << "loop control flow is not understood by analyzer";
2490 return false;
2491 }
2492
2493 // ScalarEvolution needs to be able to find the symbolic max backedge taken
2494 // count, which is an upper bound on the number of loop iterations. The loop
2495 // may execute fewer iterations, if it exits via an uncountable exit.
2496 const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount();
2497 if (isa<SCEVCouldNotCompute>(Val: ExitCount)) {
2498 recordAnalysis(RemarkName: "CantComputeNumberOfIterations")
2499 << "could not determine number of loop iterations";
2500 LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
2501 return false;
2502 }
2503
2504 LLVM_DEBUG(dbgs() << "LAA: Found an analyzable loop: "
2505 << TheLoop->getHeader()->getName() << "\n");
2506 return true;
2507}
2508
2509bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
2510 const TargetLibraryInfo *TLI,
2511 DominatorTree *DT) {
2512 // Holds the Load and Store instructions.
2513 SmallVector<LoadInst *, 16> Loads;
2514 SmallVector<StoreInst *, 16> Stores;
2515 SmallPtrSet<MDNode *, 8> LoopAliasScopes;
2516
2517 // Holds all the different accesses in the loop.
2518 unsigned NumReads = 0;
2519 unsigned NumReadWrites = 0;
2520
2521 bool HasComplexMemInst = false;
2522
2523 // A runtime check is only legal to insert if there are no convergent calls.
2524 HasConvergentOp = false;
2525
2526 PtrRtChecking->Pointers.clear();
2527 PtrRtChecking->Need = false;
2528
2529 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
2530
2531 const bool EnableMemAccessVersioningOfLoop =
2532 EnableMemAccessVersioning &&
2533 !TheLoop->getHeader()->getParent()->hasOptSize();
2534
2535 // Traverse blocks in fixed RPOT order, regardless of their storage in the
2536 // loop info, as it may be arbitrary.
2537 LoopBlocksRPO RPOT(TheLoop);
2538 RPOT.perform(LI);
2539 for (BasicBlock *BB : RPOT) {
2540 // Scan the BB and collect legal loads and stores. Also detect any
2541 // convergent instructions.
2542 for (Instruction &I : *BB) {
2543 if (auto *Call = dyn_cast<CallBase>(Val: &I)) {
2544 if (Call->isConvergent())
2545 HasConvergentOp = true;
2546 }
2547
2548 // With both a non-vectorizable memory instruction and a convergent
2549 // operation found in this loop, there is no reason to continue the search.
2550 if (HasComplexMemInst && HasConvergentOp)
2551 return false;
2552
2553 // Avoid hitting recordAnalysis multiple times.
2554 if (HasComplexMemInst)
2555 continue;
2556
2557 // Record alias scopes defined inside the loop.
2558 if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I))
2559 for (Metadata *Op : Decl->getScopeList()->operands())
2560 LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op));
2561
2562 // Many math library functions read the rounding mode. We will only
2563 // vectorize a loop if it contains known function calls that don't set
2564 // the flag. Therefore, it is safe to ignore this read from memory.
2565 auto *Call = dyn_cast<CallInst>(Val: &I);
2566 if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI))
2567 continue;
2568
2569 // If this is a load, save it. If this instruction can read from memory
2570 // but is not a load, we only allow it if it's a call to a function with a
2571 // vector mapping and no pointer arguments.
2572 if (I.mayReadFromMemory()) {
2573 auto hasPointerArgs = [](CallBase *CB) {
2574 return any_of(Range: CB->args(), P: [](Value const *Arg) {
2575 return Arg->getType()->isPointerTy();
2576 });
2577 };
2578
2579 // If the function has an explicit vectorized counterpart, and does not
2580 // take output/input pointers, we can safely assume that it can be
2581 // vectorized.
2582 if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
2583 !hasPointerArgs(Call) && !VFDatabase::getMappings(CI: *Call).empty())
2584 continue;
2585
2586 auto *Ld = dyn_cast<LoadInst>(Val: &I);
2587 if (!Ld) {
2588 recordAnalysis(RemarkName: "CantVectorizeInstruction", Instr: Ld)
2589 << "instruction cannot be vectorized";
2590 HasComplexMemInst = true;
2591 continue;
2592 }
2593 if (!Ld->isSimple() && !IsAnnotatedParallel) {
2594 recordAnalysis(RemarkName: "NonSimpleLoad", Instr: Ld)
2595 << "read with atomic ordering or volatile read";
2596 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
2597 HasComplexMemInst = true;
2598 continue;
2599 }
2600 NumLoads++;
2601 Loads.push_back(Elt: Ld);
2602 DepChecker->addAccess(LI: Ld);
2603 if (EnableMemAccessVersioningOfLoop)
2604 collectStridedAccess(LoadOrStoreInst: Ld);
2605 continue;
2606 }
2607
2608 // Save 'store' instructions. Abort if other instructions write to memory.
2609 if (I.mayWriteToMemory()) {
2610 auto *St = dyn_cast<StoreInst>(Val: &I);
2611 if (!St) {
2612 recordAnalysis(RemarkName: "CantVectorizeInstruction", Instr: St)
2613 << "instruction cannot be vectorized";
2614 HasComplexMemInst = true;
2615 continue;
2616 }
2617 if (!St->isSimple() && !IsAnnotatedParallel) {
2618 recordAnalysis(RemarkName: "NonSimpleStore", Instr: St)
2619 << "write with atomic ordering or volatile write";
2620 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
2621 HasComplexMemInst = true;
2622 continue;
2623 }
2624 NumStores++;
2625 Stores.push_back(Elt: St);
2626 DepChecker->addAccess(SI: St);
2627 if (EnableMemAccessVersioningOfLoop)
2628 collectStridedAccess(LoadOrStoreInst: St);
2629 }
2630 } // Next instr.
2631 } // Next block.
2632
2633 if (HasComplexMemInst)
2634 return false;
2635
2636 // Now we have two lists that hold the loads and the stores.
2637 // Next, we find the pointers that they use.
2638
2639 // Check if we see any stores. If there are no stores, then we don't
2640 // care if the pointers are *restrict*.
2641 if (!Stores.size()) {
2642 LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
2643 return true;
2644 }
2645
2646 MemoryDepChecker::DepCandidates DepCands;
2647 AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE,
2648 LoopAliasScopes);
2649
2650 // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
2651 // multiple times on the same object. If the ptr is accessed twice, once
2652 // for read and once for write, it will only appear once (on the write
2653 // list). This is okay, since we are going to check for conflicts between
2654 // writes and between reads and writes, but not between reads and reads.
2655 SmallSet<std::pair<Value *, Type *>, 16> Seen;
2656
2657 // Record uniform store addresses to identify if we have multiple stores
2658 // to the same address.
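  // E.g. (illustrative): in "for (i) { *p = A[i]; *p = B[i]; }" with p loop
  // invariant, the second insertion of p below fails and flags a store-store
  // dependence involving a loop-invariant address.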
2659 SmallPtrSet<Value *, 16> UniformStores;
2660
2661 for (StoreInst *ST : Stores) {
2662 Value *Ptr = ST->getPointerOperand();
2663
2664 if (isInvariant(V: Ptr)) {
2665 // Record store instructions to loop invariant addresses
2666 StoresToInvariantAddresses.push_back(Elt: ST);
2667 HasStoreStoreDependenceInvolvingLoopInvariantAddress |=
2668 !UniformStores.insert(Ptr).second;
2669 }
2670
2671 // If we did *not* see this pointer before, insert it to the read-write
2672 // list. At this phase it is only a 'write' list.
2673 Type *AccessTy = getLoadStoreType(I: ST);
2674 if (Seen.insert(V: {Ptr, AccessTy}).second) {
2675 ++NumReadWrites;
2676
2677 MemoryLocation Loc = MemoryLocation::get(SI: ST);
2678 // The TBAA metadata could have a control dependency on the predication
2679 // condition, so we cannot rely on it when determining whether or not we
2680 // need runtime pointer checks.
2681 if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT))
2682 Loc.AATags.TBAA = nullptr;
2683
2684 visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop,
2685 AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) {
2686 MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr);
2687 Accesses.addStore(Loc: NewLoc, AccessTy);
2688 });
2689 }
2690 }
2691
2692 if (IsAnnotatedParallel) {
2693 LLVM_DEBUG(
2694 dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
2695 << "checks.\n");
2696 return true;
2697 }
2698
2699 for (LoadInst *LD : Loads) {
2700 Value *Ptr = LD->getPointerOperand();
2701 // If we did *not* see this pointer before, insert it to the
2702 // read list. If we *did* see it before, then it is already in
2703 // the read-write list. This allows us to vectorize expressions
2704 // such as A[i] += x; Because the address of A[i] is a read-write
2705 // pointer. This only works if the index of A[i] is consecutive.
2706 // If the address of i is unknown (for example A[B[i]]) then we may
2707 // read a few words, modify, and write a few words, and some of the
2708 // words may be written to the same address.
2709 bool IsReadOnlyPtr = false;
2710 Type *AccessTy = getLoadStoreType(I: LD);
2711 if (Seen.insert(V: {Ptr, AccessTy}).second ||
2712 !getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: false,
2713 ShouldCheckWrap: true)) {
2714 ++NumReads;
2715 IsReadOnlyPtr = true;
2716 }
2717
2718 // See if there is an unsafe dependency between a load from a uniform
2719 // address and a store to the same uniform address.
2720 if (UniformStores.contains(Ptr)) {
2721 LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
2722 "load and uniform store to the same address!\n");
2723 HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
2724 }
2725
2726 MemoryLocation Loc = MemoryLocation::get(LI: LD);
2727 // The TBAA metadata could have a control dependency on the predication
2728 // condition, so we cannot rely on it when determining whether or not we
2729 // need runtime pointer checks.
2730 if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT))
2731 Loc.AATags.TBAA = nullptr;
2732
2733 visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop,
2734 AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
2735 MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr);
2736 Accesses.addLoad(Loc: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr);
2737 });
2738 }
2739
2740 // If we write (or read-write) to a single destination and there are no
2741 // other reads in this loop then it is safe to vectorize.
2742 if (NumReadWrites == 1 && NumReads == 0) {
2743 LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
2744 return true;
2745 }
2746
2747 // Build dependence sets and check whether we need a runtime pointer bounds
2748 // check.
2749 Accesses.buildDependenceSets();
2750
2751 // Find pointers with computable bounds. We are going to use this information
2752 // to place a runtime bound check.
2753 Value *UncomputablePtr = nullptr;
2754 HasCompletePtrRtChecking = Accesses.canCheckPtrAtRT(
2755 RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, UncomputablePtr, AllowPartial);
2756 if (!HasCompletePtrRtChecking) {
2757 const auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr);
2758 recordAnalysis(RemarkName: "CantIdentifyArrayBounds", Instr: I)
2759 << "cannot identify array bounds";
2760 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
2761 << "the array bounds.\n");
2762 return false;
2763 }
2764
2765 LLVM_DEBUG(
2766 dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
2767
2768 bool DepsAreSafe = true;
2769 if (Accesses.isDependencyCheckNeeded()) {
2770 LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
2771 DepsAreSafe =
2772 DepChecker->areDepsSafe(DepCands, CheckDeps: Accesses.getDependenciesToCheck());
2773
2774 if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeChecks()) {
2775 LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
2776
2777 // Clear the dependency checks. We assume they are not needed.
2778 Accesses.resetDepChecks(DepChecker&: *DepChecker);
2779
2780 PtrRtChecking->reset();
2781 PtrRtChecking->Need = true;
2782
2783 UncomputablePtr = nullptr;
2784 HasCompletePtrRtChecking =
2785 Accesses.canCheckPtrAtRT(RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides,
2786 UncomputablePtr, AllowPartial);
2787
2788 // Check that we found the bounds for the pointer.
2789 if (!HasCompletePtrRtChecking) {
2790 auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr);
2791 recordAnalysis(RemarkName: "CantCheckMemDepsAtRunTime", Instr: I)
2792 << "cannot check memory dependencies at runtime";
2793 LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
2794 return false;
2795 }
2796 DepsAreSafe = true;
2797 }
2798 }
2799
2800 if (HasConvergentOp) {
2801 recordAnalysis(RemarkName: "CantInsertRuntimeCheckWithConvergent")
2802 << "cannot add control dependency to convergent operation";
2803 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
2804 "would be needed with a convergent operation\n");
2805 return false;
2806 }
2807
2808 if (DepsAreSafe) {
2809 LLVM_DEBUG(
2810 dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
2811 << (PtrRtChecking->Need ? "" : " don't")
2812 << " need runtime memory checks.\n");
2813 return true;
2814 }
2815
2816 emitUnsafeDependenceRemark();
2817 return false;
2818}
2819
2820void LoopAccessInfo::emitUnsafeDependenceRemark() {
2821 const auto *Deps = getDepChecker().getDependences();
2822 if (!Deps)
2823 return;
2824 const auto *Found =
2825 llvm::find_if(Range: *Deps, P: [](const MemoryDepChecker::Dependence &D) {
2826 return MemoryDepChecker::Dependence::isSafeForVectorization(Type: D.Type) !=
2827 MemoryDepChecker::VectorizationSafetyStatus::Safe;
2828 });
2829 if (Found == Deps->end())
2830 return;
2831 MemoryDepChecker::Dependence Dep = *Found;
2832
2833 LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
2834
2835  // Emit a remark for the first unsafe dependence.
2836 bool HasForcedDistribution = false;
2837 std::optional<const MDOperand *> Value =
2838 findStringMetadataForLoop(TheLoop, Name: "llvm.loop.distribute.enable");
2839 if (Value) {
2840 const MDOperand *Op = *Value;
2841 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
2842 HasForcedDistribution = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue();
2843 }
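  // For illustration (assumed clang lowering, not checked here):
  // "#pragma clang loop distribute(enable)" is expected to show up as loop
  // metadata of the form !{!"llvm.loop.distribute.enable", i1 true}, which is
  // what findStringMetadataForLoop inspects above.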
2844
2845 const std::string Info =
2846 HasForcedDistribution
2847 ? "unsafe dependent memory operations in loop."
2848 : "unsafe dependent memory operations in loop. Use "
2849 "#pragma clang loop distribute(enable) to allow loop distribution "
2850 "to attempt to isolate the offending operations into a separate "
2851 "loop";
2852 OptimizationRemarkAnalysis &R =
2853 recordAnalysis(RemarkName: "UnsafeDep", Instr: Dep.getDestination(DepChecker: getDepChecker())) << Info;
2854
2855 switch (Dep.Type) {
2856 case MemoryDepChecker::Dependence::NoDep:
2857 case MemoryDepChecker::Dependence::Forward:
2858 case MemoryDepChecker::Dependence::BackwardVectorizable:
2859 llvm_unreachable("Unexpected dependence");
2860 case MemoryDepChecker::Dependence::Backward:
2861 R << "\nBackward loop carried data dependence.";
2862 break;
2863 case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
2864 R << "\nForward loop carried data dependence that prevents "
2865 "store-to-load forwarding.";
2866 break;
2867 case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
2868 R << "\nBackward loop carried data dependence that prevents "
2869 "store-to-load forwarding.";
2870 break;
2871 case MemoryDepChecker::Dependence::IndirectUnsafe:
2872 R << "\nUnsafe indirect dependence.";
2873 break;
2874 case MemoryDepChecker::Dependence::Unknown:
2875 R << "\nUnknown data dependence.";
2876 break;
2877 }
2878
2879 if (Instruction *I = Dep.getSource(DepChecker: getDepChecker())) {
2880 DebugLoc SourceLoc = I->getDebugLoc();
2881 if (auto *DD = dyn_cast_or_null<Instruction>(Val: getPointerOperand(V: I)))
2882 SourceLoc = DD->getDebugLoc();
2883 if (SourceLoc)
2884 R << " Memory location is the same as accessed at "
2885 << ore::NV("Location", SourceLoc);
2886 }
2887}
2888
2889bool LoopAccessInfo::blockNeedsPredication(const BasicBlock *BB,
2890 const Loop *TheLoop,
2891 const DominatorTree *DT) {
2892 assert(TheLoop->contains(BB) && "Unknown block used");
2893
2894 // Blocks that do not dominate the latch need predication.
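  // For example (illustration only): in
  //   for (...) { if (c) A[i] = 0; }
  // the store's block does not dominate the latch, so it executes only
  // conditionally and needs predication.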
2895 const BasicBlock *Latch = TheLoop->getLoopLatch();
2896 return !DT->dominates(A: BB, B: Latch);
2897}
2898
2899OptimizationRemarkAnalysis &
2900LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
2901 assert(!Report && "Multiple reports generated");
2902
2903 const BasicBlock *CodeRegion = TheLoop->getHeader();
2904 DebugLoc DL = TheLoop->getStartLoc();
2905
2906 if (I) {
2907 CodeRegion = I->getParent();
2908    // If there is no debug location attached to the instruction, fall back
2909    // to using the loop's start location.
2910 if (I->getDebugLoc())
2911 DL = I->getDebugLoc();
2912 }
2913
2914 Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, args&: RemarkName,
2915 args&: DL, args&: CodeRegion);
2916 return *Report;
2917}
2918
2919bool LoopAccessInfo::isInvariant(Value *V) const {
2920 auto *SE = PSE->getSE();
2921 if (TheLoop->isLoopInvariant(V))
2922 return true;
2923 if (!SE->isSCEVable(Ty: V->getType()))
2924 return false;
2925 const SCEV *S = SE->getSCEV(V);
2926 return SE->isLoopInvariant(S, L: TheLoop);
2927}
2928
2929/// If \p Ptr is a GEP with exactly one loop-variant operand, return that
2930/// operand. Otherwise, return \p Ptr.
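///
/// Illustrative example (not from a specific test): given
///   %gep = getelementptr i32, ptr %A, i64 %i
/// where %A is invariant in \p Lp and %i varies with the loop, %i is returned.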
2931static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE,
2932 Loop *Lp) {
2933 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr);
2934 if (!GEP)
2935 return Ptr;
2936
2937 Value *V = Ptr;
2938 for (const Use &U : GEP->operands()) {
2939 if (!SE->isLoopInvariant(S: SE->getSCEV(V: U), L: Lp)) {
2940 if (V == Ptr)
2941 V = U;
2942 else
2943 // There must be exactly one loop-variant operand.
2944 return Ptr;
2945 }
2946 }
2947 return V;
2948}
2949
2950/// Get the stride of a pointer access in a loop. Looks for symbolic
2951/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
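///
/// Illustrative source pattern (example only):
///   for (i = 0; i < n; ++i)
///     a[i * stride] += 1;
/// where "stride" is loop invariant; that symbolic stride is what is returned.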
2952static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
2953 auto *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType());
2954 if (!PtrTy)
2955 return nullptr;
2956
2957  // Try to look through a GEP instruction to make the pointer (actually, the
2958  // index at this point) easier to analyze. If OrigPtr is equal to Ptr, we
2959  // are analyzing the pointer; otherwise, we are analyzing the index.
2960 Value *OrigPtr = Ptr;
2961
2962 Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp);
2963 const SCEV *V = SE->getSCEV(V: Ptr);
2964
2965 if (Ptr != OrigPtr)
2966 // Strip off casts.
2967 while (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V))
2968 V = C->getOperand();
2969
2970 if (!match(S: V, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V), L: m_SpecificLoop(L: Lp))))
2971 return nullptr;
2972
2973  // Note that the restrictions after this loop-invariant check are only
2974  // profitability restrictions.
2975 if (!SE->isLoopInvariant(S: V, L: Lp))
2976 return nullptr;
2977
2978 // Look for the loop invariant symbolic value.
2979 if (isa<SCEVUnknown>(Val: V))
2980 return V;
2981
2982 if (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V))
2983 if (isa<SCEVUnknown>(Val: C->getOperand()))
2984 return V;
2985
2986 return nullptr;
2987}
2988
2989void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
2990 Value *Ptr = getLoadStorePointerOperand(V: MemAccess);
2991 if (!Ptr)
2992 return;
2993
2994 // Note: getStrideFromPointer is a *profitability* heuristic. We
2995 // could broaden the scope of values returned here - to anything
2996 // which happens to be loop invariant and contributes to the
2997 // computation of an interesting IV - but we chose not to as we
2998 // don't have a cost model here, and broadening the scope exposes
2999 // far too many unprofitable cases.
3000 const SCEV *StrideExpr = getStrideFromPointer(Ptr, SE: PSE->getSE(), Lp: TheLoop);
3001 if (!StrideExpr)
3002 return;
3003
3004 if (match(S: StrideExpr, P: m_scev_UndefOrPoison()))
3005 return;
3006
3007 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
3008 "versioning:");
3009 LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");
3010
3011 if (!SpeculateUnitStride) {
3012 LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n");
3013 return;
3014 }
3015
3016  // Avoid adding the "Stride == 1" predicate when we know that
3017  // Stride >= Trip-Count. Such a predicate would effectively optimize a
3018  // single- or zero-iteration loop, as Trip-Count <= Stride == 1.
3019 //
3020 // TODO: We are currently not making a very informed decision on when it is
3021 // beneficial to apply stride versioning. It might make more sense that the
3022 // users of this analysis (such as the vectorizer) will trigger it, based on
3023  // their specific cost considerations; for example, in cases where stride
3024 // versioning does not help resolving memory accesses/dependences, the
3025 // vectorizer should evaluate the cost of the runtime test, and the benefit
3026 // of various possible stride specializations, considering the alternatives
3027 // of using gather/scatters (if available).
3028
3029 const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount();
3030
3031 // Match the types so we can compare the stride and the MaxBTC.
3032 // The Stride can be positive/negative, so we sign extend Stride;
3033 // The backedgeTakenCount is non-negative, so we zero extend MaxBTC.
3034 const DataLayout &DL = TheLoop->getHeader()->getDataLayout();
3035 uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(Ty: StrideExpr->getType());
3036 uint64_t BETypeSizeBits = DL.getTypeSizeInBits(Ty: MaxBTC->getType());
3037 const SCEV *CastedStride = StrideExpr;
3038 const SCEV *CastedBECount = MaxBTC;
3039 ScalarEvolution *SE = PSE->getSE();
3040 if (BETypeSizeBits >= StrideTypeSizeBits)
3041 CastedStride = SE->getNoopOrSignExtend(V: StrideExpr, Ty: MaxBTC->getType());
3042 else
3043 CastedBECount = SE->getZeroExtendExpr(Op: MaxBTC, Ty: StrideExpr->getType());
3044 const SCEV *StrideMinusBETaken = SE->getMinusSCEV(LHS: CastedStride, RHS: CastedBECount);
3045 // Since TripCount == BackEdgeTakenCount + 1, checking:
3046 // "Stride >= TripCount" is equivalent to checking:
3047  //    Stride - MaxBTC > 0
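  // Worked example (illustration only): suppose MaxBTC is 7, so the trip
  // count is at most 8. If Stride - MaxBTC > 0 holds, then Stride >= 8 >=
  // TripCount; adding the "Stride == 1" predicate would therefore restrict
  // the versioned loop to TripCount <= Stride == 1, i.e. at most one
  // iteration, which is not worth versioning for.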
3048 if (SE->isKnownPositive(S: StrideMinusBETaken)) {
3049 LLVM_DEBUG(
3050 dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
3051 "Stride==1 predicate will imply that the loop executes "
3052 "at most once.\n");
3053 return;
3054 }
3055 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");
3056
3057 // Strip back off the integer cast, and check that our result is a
3058 // SCEVUnknown as we expect.
3059 const SCEV *StrideBase = StrideExpr;
3060 if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: StrideBase))
3061 StrideBase = C->getOperand();
3062 SymbolicStrides[Ptr] = cast<SCEVUnknown>(Val: StrideBase);
3063}
3064
3065LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
3066 const TargetTransformInfo *TTI,
3067 const TargetLibraryInfo *TLI, AAResults *AA,
3068 DominatorTree *DT, LoopInfo *LI,
3069 AssumptionCache *AC, bool AllowPartial)
3070 : PSE(std::make_unique<PredicatedScalarEvolution>(args&: *SE, args&: *L)),
3071 PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) {
3072 unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
3073 if (TTI && !TTI->enableScalableVectorization())
3074    // Scale the vector width by 2 as a rough estimate to also account for
3075    // interleaving.
3076 MaxTargetVectorWidthInBits =
3077 TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) * 2;
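  // For instance (illustration only): a target reporting 256-bit fixed-width
  // vector registers yields MaxTargetVectorWidthInBits == 512 here.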
3078
3079 DepChecker = std::make_unique<MemoryDepChecker>(
3080 args&: *PSE, args&: AC, args&: DT, args&: L, args&: SymbolicStrides, args&: MaxTargetVectorWidthInBits, args&: LoopGuards);
3081 PtrRtChecking =
3082 std::make_unique<RuntimePointerChecking>(args&: *DepChecker, args&: SE, args&: LoopGuards);
3083 if (canAnalyzeLoop())
3084 CanVecMem = analyzeLoop(AA, LI, TLI, DT);
3085}
3086
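// Example of the summary printed below (illustration only, assembled from the
// string literals in this function):
//   Memory dependences are safe with run-time checks
//   Dependences:
//   SCEV assumptions: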
3087void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
3088 if (CanVecMem) {
3089 OS.indent(NumSpaces: Depth) << "Memory dependences are safe";
3090 const MemoryDepChecker &DC = getDepChecker();
3091 if (!DC.isSafeForAnyVectorWidth())
3092 OS << " with a maximum safe vector width of "
3093 << DC.getMaxSafeVectorWidthInBits() << " bits";
3094 if (!DC.isSafeForAnyStoreLoadForwardDistances()) {
3095 uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits();
3096 OS << ", with a maximum safe store-load forward width of " << SLDist
3097 << " bits";
3098 }
3099 if (PtrRtChecking->Need)
3100 OS << " with run-time checks";
3101 OS << "\n";
3102 }
3103
3104 if (HasConvergentOp)
3105 OS.indent(NumSpaces: Depth) << "Has convergent operation in loop\n";
3106
3107 if (Report)
3108 OS.indent(NumSpaces: Depth) << "Report: " << Report->getMsg() << "\n";
3109
3110 if (auto *Dependences = DepChecker->getDependences()) {
3111 OS.indent(NumSpaces: Depth) << "Dependences:\n";
3112 for (const auto &Dep : *Dependences) {
3113 Dep.print(OS, Depth: Depth + 2, Instrs: DepChecker->getMemoryInstructions());
3114 OS << "\n";
3115 }
3116 } else
3117 OS.indent(NumSpaces: Depth) << "Too many dependences, not recorded\n";
3118
3119  // List the pairs of accesses that need run-time checks to prove independence.
3120 PtrRtChecking->print(OS, Depth);
3121 if (PtrRtChecking->Need && !HasCompletePtrRtChecking)
3122 OS.indent(NumSpaces: Depth) << "Generated run-time checks are incomplete\n";
3123 OS << "\n";
3124
3125 OS.indent(NumSpaces: Depth)
3126 << "Non vectorizable stores to invariant address were "
3127 << (HasStoreStoreDependenceInvolvingLoopInvariantAddress ||
3128 HasLoadStoreDependenceInvolvingLoopInvariantAddress
3129 ? ""
3130 : "not ")
3131 << "found in loop.\n";
3132
3133 OS.indent(NumSpaces: Depth) << "SCEV assumptions:\n";
3134 PSE->getPredicate().print(OS, Depth);
3135
3136 OS << "\n";
3137
3138 OS.indent(NumSpaces: Depth) << "Expressions re-written:\n";
3139 PSE->print(OS, Depth);
3140}
3141
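// Typical use of getInfo (illustrative sketch; the calling code is
// hypothetical):
//   const LoopAccessInfo &LAI = LAIs.getInfo(*L);
//   if (LAI.canVectorizeMemory())
//     ...;  // e.g. a transform that relies on the computed dependence info.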
3142const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L,
3143 bool AllowPartial) {
3144 const auto &[It, Inserted] = LoopAccessInfoMap.try_emplace(Key: &L);
3145
3146  // We need to create the LoopAccessInfo if we don't already have one, or if
3147  // the existing one was created with a different value of AllowPartial.
3148 if (Inserted || It->second->hasAllowPartial() != AllowPartial)
3149 It->second = std::make_unique<LoopAccessInfo>(args: &L, args: &SE, args&: TTI, args&: TLI, args: &AA, args: &DT,
3150 args: &LI, args&: AC, args&: AllowPartial);
3151
3152 return *It->second;
3153}
3154void LoopAccessInfoManager::clear() {
3155  // Remove LoopAccessInfo entries that may keep references to IR outside the
3156  // analyzed loop or to SCEVs that may have been modified or invalidated. At
3157  // the moment, those are loops requiring memory or SCEV runtime checks, as
3158  // such checks cache SCEVs, e.g. for pointer expressions.
3159 for (const auto &[L, LAI] : LoopAccessInfoMap) {
3160 if (LAI->getRuntimePointerChecking()->getChecks().empty() &&
3161 LAI->getPSE().getPredicate().isAlwaysTrue())
3162 continue;
3163 LoopAccessInfoMap.erase(Val: L);
3164 }
3165}
3166
3167bool LoopAccessInfoManager::invalidate(
3168 Function &F, const PreservedAnalyses &PA,
3169 FunctionAnalysisManager::Invalidator &Inv) {
3170 // Check whether our analysis is preserved.
3171 auto PAC = PA.getChecker<LoopAccessAnalysis>();
3172 if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
3173 // If not, give up now.
3174 return true;
3175
3176 // Check whether the analyses we depend on became invalid for any reason.
3177 // Skip checking TargetLibraryAnalysis as it is immutable and can't become
3178 // invalid.
3179 return Inv.invalidate<AAManager>(IR&: F, PA) ||
3180 Inv.invalidate<ScalarEvolutionAnalysis>(IR&: F, PA) ||
3181 Inv.invalidate<LoopAnalysis>(IR&: F, PA) ||
3182 Inv.invalidate<DominatorTreeAnalysis>(IR&: F, PA);
3183}
3184
3185LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
3186 FunctionAnalysisManager &FAM) {
3187 auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(IR&: F);
3188 auto &AA = FAM.getResult<AAManager>(IR&: F);
3189 auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F);
3190 auto &LI = FAM.getResult<LoopAnalysis>(IR&: F);
3191 auto &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
3192 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F);
3193 auto &AC = FAM.getResult<AssumptionAnalysis>(IR&: F);
3194 return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC);
3195}
3196
3197AnalysisKey LoopAccessAnalysis::Key;
3198