1//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The implementation for the loop memory dependence that was originally
10// developed for the loop vectorizer.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Analysis/LoopAccessAnalysis.h"
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/EquivalenceClasses.h"
18#include "llvm/ADT/PointerIntPair.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SetVector.h"
21#include "llvm/ADT/SmallPtrSet.h"
22#include "llvm/ADT/SmallSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/AliasAnalysis.h"
25#include "llvm/Analysis/AliasSetTracker.h"
26#include "llvm/Analysis/AssumeBundleQueries.h"
27#include "llvm/Analysis/AssumptionCache.h"
28#include "llvm/Analysis/LoopAnalysisManager.h"
29#include "llvm/Analysis/LoopInfo.h"
30#include "llvm/Analysis/LoopIterator.h"
31#include "llvm/Analysis/MemoryLocation.h"
32#include "llvm/Analysis/OptimizationRemarkEmitter.h"
33#include "llvm/Analysis/ScalarEvolution.h"
34#include "llvm/Analysis/ScalarEvolutionExpressions.h"
35#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
36#include "llvm/Analysis/TargetLibraryInfo.h"
37#include "llvm/Analysis/TargetTransformInfo.h"
38#include "llvm/Analysis/ValueTracking.h"
39#include "llvm/Analysis/VectorUtils.h"
40#include "llvm/IR/BasicBlock.h"
41#include "llvm/IR/Constants.h"
42#include "llvm/IR/DataLayout.h"
43#include "llvm/IR/DebugLoc.h"
44#include "llvm/IR/DerivedTypes.h"
45#include "llvm/IR/DiagnosticInfo.h"
46#include "llvm/IR/Dominators.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/InstrTypes.h"
49#include "llvm/IR/Instruction.h"
50#include "llvm/IR/Instructions.h"
51#include "llvm/IR/IntrinsicInst.h"
52#include "llvm/IR/PassManager.h"
53#include "llvm/IR/Type.h"
54#include "llvm/IR/Value.h"
55#include "llvm/IR/ValueHandle.h"
56#include "llvm/Support/Casting.h"
57#include "llvm/Support/CommandLine.h"
58#include "llvm/Support/Debug.h"
59#include "llvm/Support/ErrorHandling.h"
60#include "llvm/Support/raw_ostream.h"
61#include <algorithm>
62#include <cassert>
63#include <cstdint>
64#include <iterator>
65#include <utility>
66#include <variant>
67#include <vector>
68
69using namespace llvm;
70using namespace llvm::SCEVPatternMatch;
71
72#define DEBUG_TYPE "loop-accesses"
73
74static cl::opt<unsigned, true>
75VectorizationFactor("force-vector-width", cl::Hidden,
76 cl::desc("Sets the SIMD width. Zero is autoselect."),
77 cl::location(L&: VectorizerParams::VectorizationFactor));
78unsigned VectorizerParams::VectorizationFactor;
79
80static cl::opt<unsigned, true>
81VectorizationInterleave("force-vector-interleave", cl::Hidden,
82 cl::desc("Sets the vectorization interleave count. "
83 "Zero is autoselect."),
84 cl::location(
85 L&: VectorizerParams::VectorizationInterleave));
86unsigned VectorizerParams::VectorizationInterleave;
87
88static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
89 "runtime-memory-check-threshold", cl::Hidden,
90 cl::desc("When performing memory disambiguation checks at runtime do not "
91 "generate more than this number of comparisons (default = 8)."),
92 cl::location(L&: VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(Val: 8));
93unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
94
95/// The maximum iterations used to merge memory checks
96static cl::opt<unsigned> MemoryCheckMergeThreshold(
97 "memory-check-merge-threshold", cl::Hidden,
98 cl::desc("Maximum number of comparisons done when trying to merge "
99 "runtime memory checks. (default = 100)"),
100 cl::init(Val: 100));
101
102/// Maximum SIMD width.
103const unsigned VectorizerParams::MaxVectorWidth = 64;
104
105/// We collect dependences up to this threshold.
106static cl::opt<unsigned>
107 MaxDependences("max-dependences", cl::Hidden,
108 cl::desc("Maximum number of dependences collected by "
109 "loop-access analysis (default = 100)"),
110 cl::init(Val: 100));
111
112/// This enables versioning on the strides of symbolically striding memory
113/// accesses in code like the following.
114/// for (i = 0; i < N; ++i)
115/// A[i * Stride1] += B[i * Stride2] ...
116///
117/// Will be roughly translated to
118/// if (Stride1 == 1 && Stride2 == 1) {
119/// for (i = 0; i < N; i+=4)
120/// A[i:i+3] += ...
121/// } else
122/// ...
123static cl::opt<bool> EnableMemAccessVersioning(
124 "enable-mem-access-versioning", cl::init(Val: true), cl::Hidden,
125 cl::desc("Enable symbolic stride memory access versioning"));
126
127/// Enable store-to-load forwarding conflict detection. This option can
128/// be disabled for correctness testing.
129static cl::opt<bool> EnableForwardingConflictDetection(
130 "store-to-load-forwarding-conflict-detection", cl::Hidden,
131 cl::desc("Enable conflict detection in loop-access analysis"),
132 cl::init(Val: true));
133
134static cl::opt<unsigned> MaxForkedSCEVDepth(
135 "max-forked-scev-depth", cl::Hidden,
136 cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
137 cl::init(Val: 5));
138
139static cl::opt<bool> SpeculateUnitStride(
140 "laa-speculate-unit-stride", cl::Hidden,
141 cl::desc("Speculate that non-constant strides are unit in LAA"),
142 cl::init(Val: true));
143
144static cl::opt<bool, true> HoistRuntimeChecks(
145 "hoist-runtime-checks", cl::Hidden,
146 cl::desc(
147 "Hoist inner loop runtime memory checks to outer loop if possible"),
148 cl::location(L&: VectorizerParams::HoistRuntimeChecks), cl::init(Val: true));
149bool VectorizerParams::HoistRuntimeChecks;
150
151bool VectorizerParams::isInterleaveForced() {
152 return ::VectorizationInterleave.getNumOccurrences() > 0;
153}
154
155const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
156 const DenseMap<Value *, const SCEV *> &PtrToStride,
157 Value *Ptr) {
158 const SCEV *OrigSCEV = PSE.getSCEV(V: Ptr);
159
160 // If there is an entry in the map return the SCEV of the pointer with the
161 // symbolic stride replaced by one.
162 const SCEV *StrideSCEV = PtrToStride.lookup(Val: Ptr);
163 if (!StrideSCEV)
164 // For a non-symbolic stride, just return the original expression.
165 return OrigSCEV;
166
167 // Note: This assert is both overly strong and overly weak. The actual
168 // invariant here is that StrideSCEV should be loop invariant. The only
169 // such invariant strides we happen to speculate right now are unknowns
170 // and thus this is a reasonable proxy of the actual invariant.
171 assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");
172
173 ScalarEvolution *SE = PSE.getSE();
174 const SCEV *CT = SE->getOne(Ty: StrideSCEV->getType());
175 PSE.addPredicate(Pred: *SE->getEqualPredicate(LHS: StrideSCEV, RHS: CT));
176 const SCEV *Expr = PSE.getSCEV(V: Ptr);
177
178 LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
179 << " by: " << *Expr << "\n");
180 return Expr;
181}
182
183RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
184 unsigned Index, const RuntimePointerChecking &RtCheck)
185 : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
186 AddressSpace(RtCheck.Pointers[Index]
187 .PointerValue->getType()
188 ->getPointerAddressSpace()),
189 NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
190 Members.push_back(Elt: Index);
191}
192
193/// Returns \p A + \p B, if it is guaranteed not to unsigned wrap. Otherwise
194/// return nullptr. \p A and \p B must have the same type.
195static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B,
196 ScalarEvolution &SE) {
197 if (!SE.willNotOverflow(BinOp: Instruction::Add, /*IsSigned=*/Signed: false, LHS: A, RHS: B))
198 return nullptr;
199 return SE.getAddExpr(LHS: A, RHS: B);
200}
201
202/// Returns \p A * \p B, if it is guaranteed not to unsigned wrap. Otherwise
203/// return nullptr. \p A and \p B must have the same type.
204static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B,
205 ScalarEvolution &SE) {
206 if (!SE.willNotOverflow(BinOp: Instruction::Mul, /*IsSigned=*/Signed: false, LHS: A, RHS: B))
207 return nullptr;
208 return SE.getMulExpr(LHS: A, RHS: B);
209}
210
211/// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
212/// \p MaxBTC is guaranteed inbounds of the accessed object.
213static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
214 const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize,
215 ScalarEvolution &SE, const DataLayout &DL, DominatorTree *DT,
216 AssumptionCache *AC,
217 std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
218 auto *PointerBase = SE.getPointerBase(V: AR->getStart());
219 auto *StartPtr = dyn_cast<SCEVUnknown>(Val: PointerBase);
220 if (!StartPtr)
221 return false;
222 const Loop *L = AR->getLoop();
223 bool CheckForNonNull, CheckForFreed;
224 Value *StartPtrV = StartPtr->getValue();
225 uint64_t DerefBytes = StartPtrV->getPointerDereferenceableBytes(
226 DL, CanBeNull&: CheckForNonNull, CanBeFreed&: CheckForFreed);
227
228 if (DerefBytes && (CheckForNonNull || CheckForFreed))
229 return false;
230
231 const SCEV *Step = AR->getStepRecurrence(SE);
232 Type *WiderTy = SE.getWiderType(Ty1: MaxBTC->getType(), Ty2: Step->getType());
233 const SCEV *DerefBytesSCEV = SE.getConstant(Ty: WiderTy, V: DerefBytes);
234
235 // Check if we have a suitable dereferencable assumption we can use.
236 Instruction *CtxI = &*L->getHeader()->getFirstNonPHIIt();
237 if (BasicBlock *LoopPred = L->getLoopPredecessor()) {
238 if (isa<BranchInst>(Val: LoopPred->getTerminator()))
239 CtxI = LoopPred->getTerminator();
240 }
241 RetainedKnowledge DerefRK;
242 getKnowledgeForValue(V: StartPtrV, AttrKinds: {Attribute::Dereferenceable}, AC&: *AC,
243 Filter: [&](RetainedKnowledge RK, Instruction *Assume, auto) {
244 if (!isValidAssumeForContext(I: Assume, CxtI: CtxI, DT))
245 return false;
246 if (StartPtrV->canBeFreed() &&
247 !willNotFreeBetween(Assume, CtxI))
248 return false;
249 DerefRK = std::max(a: DerefRK, b: RK);
250 return true;
251 });
252 if (DerefRK) {
253 DerefBytesSCEV =
254 SE.getUMaxExpr(LHS: DerefBytesSCEV, RHS: SE.getSCEV(V: DerefRK.IRArgValue));
255 }
256
257 if (DerefBytesSCEV->isZero())
258 return false;
259
260 bool IsKnownNonNegative = SE.isKnownNonNegative(S: Step);
261 if (!IsKnownNonNegative && !SE.isKnownNegative(S: Step))
262 return false;
263
264 Step = SE.getNoopOrSignExtend(V: Step, Ty: WiderTy);
265 MaxBTC = SE.getNoopOrZeroExtend(V: MaxBTC, Ty: WiderTy);
266
267 // For the computations below, make sure they don't unsigned wrap.
268 if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_UGE, LHS: AR->getStart(), RHS: StartPtr))
269 return false;
270 const SCEV *StartOffset = SE.getNoopOrZeroExtend(
271 V: SE.getMinusSCEV(LHS: AR->getStart(), RHS: StartPtr), Ty: WiderTy);
272
273 if (!LoopGuards)
274 LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: AR->getLoop(), SE));
275 MaxBTC = SE.applyLoopGuards(Expr: MaxBTC, Guards: *LoopGuards);
276
277 const SCEV *OffsetAtLastIter =
278 mulSCEVOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE);
279 if (!OffsetAtLastIter) {
280 // Re-try with constant max backedge-taken count if using the symbolic one
281 // failed.
282 MaxBTC = SE.getConstantMaxBackedgeTakenCount(L: AR->getLoop());
283 if (isa<SCEVCouldNotCompute>(Val: MaxBTC))
284 return false;
285 MaxBTC = SE.getNoopOrZeroExtend(
286 V: MaxBTC, Ty: WiderTy);
287 OffsetAtLastIter =
288 mulSCEVOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE);
289 if (!OffsetAtLastIter)
290 return false;
291 }
292
293 const SCEV *OffsetEndBytes = addSCEVNoOverflow(
294 A: OffsetAtLastIter, B: SE.getNoopOrZeroExtend(V: EltSize, Ty: WiderTy), SE);
295 if (!OffsetEndBytes)
296 return false;
297
298 if (IsKnownNonNegative) {
299 // For positive steps, check if
300 // (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes,
301 // while making sure none of the computations unsigned wrap themselves.
302 const SCEV *EndBytes = addSCEVNoOverflow(A: StartOffset, B: OffsetEndBytes, SE);
303 if (!EndBytes)
304 return false;
305
306 DerefBytesSCEV = SE.applyLoopGuards(Expr: DerefBytesSCEV, Guards: *LoopGuards);
307 return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: EndBytes, RHS: DerefBytesSCEV);
308 }
309
310 // For negative steps check if
311 // * StartOffset >= (MaxBTC * Step + EltSize)
312 // * StartOffset <= DerefBytes.
313 assert(SE.isKnownNegative(Step) && "must be known negative");
314 return SE.isKnownPredicate(Pred: CmpInst::ICMP_SGE, LHS: StartOffset, RHS: OffsetEndBytes) &&
315 SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: StartOffset, RHS: DerefBytesSCEV);
316}
317
318std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
319 const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC,
320 const SCEV *MaxBTC, ScalarEvolution *SE,
321 DenseMap<std::pair<const SCEV *, Type *>,
322 std::pair<const SCEV *, const SCEV *>> *PointerBounds,
323 DominatorTree *DT, AssumptionCache *AC,
324 std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
325 std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
326 if (PointerBounds) {
327 auto [Iter, Ins] = PointerBounds->insert(
328 KV: {{PtrExpr, AccessTy},
329 {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
330 if (!Ins)
331 return Iter->second;
332 PtrBoundsPair = &Iter->second;
333 }
334
335 const SCEV *ScStart;
336 const SCEV *ScEnd;
337
338 auto &DL = Lp->getHeader()->getDataLayout();
339 Type *IdxTy = DL.getIndexType(PtrTy: PtrExpr->getType());
340 const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IntTy: IdxTy, StoreTy: AccessTy);
341 if (SE->isLoopInvariant(S: PtrExpr, L: Lp)) {
342 ScStart = ScEnd = PtrExpr;
343 } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrExpr)) {
344 ScStart = AR->getStart();
345 if (!isa<SCEVCouldNotCompute>(Val: BTC))
346 // Evaluating AR at an exact BTC is safe: LAA separately checks that
347 // accesses cannot wrap in the loop. If evaluating AR at BTC wraps, then
348 // the loop either triggers UB when executing a memory access with a
349 // poison pointer or the wrapping/poisoned pointer is not used.
350 ScEnd = AR->evaluateAtIteration(It: BTC, SE&: *SE);
351 else {
352 // Evaluating AR at MaxBTC may wrap and create an expression that is less
353 // than the start of the AddRec due to wrapping (for example consider
354 // MaxBTC = -2). If that's the case, set ScEnd to -(EltSize + 1). ScEnd
355 // will get incremented by EltSize before returning, so this effectively
356 // sets ScEnd to the maximum unsigned value for the type. Note that LAA
357 // separately checks that accesses cannot not wrap, so unsigned max
358 // represents an upper bound.
359 if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSize: EltSizeSCEV, SE&: *SE, DL,
360 DT, AC, LoopGuards)) {
361 ScEnd = AR->evaluateAtIteration(It: MaxBTC, SE&: *SE);
362 } else {
363 ScEnd = SE->getAddExpr(
364 LHS: SE->getNegativeSCEV(V: EltSizeSCEV),
365 RHS: SE->getSCEV(V: ConstantExpr::getIntToPtr(
366 C: ConstantInt::getAllOnesValue(Ty: EltSizeSCEV->getType()),
367 Ty: AR->getType())));
368 }
369 }
370 const SCEV *Step = AR->getStepRecurrence(SE&: *SE);
371
372 // For expressions with negative step, the upper bound is ScStart and the
373 // lower bound is ScEnd.
374 if (const auto *CStep = dyn_cast<SCEVConstant>(Val: Step)) {
375 if (CStep->getValue()->isNegative())
376 std::swap(a&: ScStart, b&: ScEnd);
377 } else {
378 // Fallback case: the step is not constant, but we can still
379 // get the upper and lower bounds of the interval by using min/max
380 // expressions.
381 ScStart = SE->getUMinExpr(LHS: ScStart, RHS: ScEnd);
382 ScEnd = SE->getUMaxExpr(LHS: AR->getStart(), RHS: ScEnd);
383 }
384 } else
385 return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};
386
387 assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
388 assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
389
390 // Add the size of the pointed element to ScEnd.
391 ScEnd = SE->getAddExpr(LHS: ScEnd, RHS: EltSizeSCEV);
392
393 std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
394 if (PointerBounds)
395 *PtrBoundsPair = Res;
396 return Res;
397}
398
399/// Calculate Start and End points of memory access using
400/// getStartAndEndForAccess.
401void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
402 Type *AccessTy, bool WritePtr,
403 unsigned DepSetId, unsigned ASId,
404 PredicatedScalarEvolution &PSE,
405 bool NeedsFreeze) {
406 const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
407 const SCEV *BTC = PSE.getBackedgeTakenCount();
408 const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
409 Lp, PtrExpr, AccessTy, BTC, MaxBTC: SymbolicMaxBTC, SE: PSE.getSE(),
410 PointerBounds: &DC.getPointerBounds(), DT: DC.getDT(), AC: DC.getAC(), LoopGuards);
411 assert(!isa<SCEVCouldNotCompute>(ScStart) &&
412 !isa<SCEVCouldNotCompute>(ScEnd) &&
413 "must be able to compute both start and end expressions");
414 Pointers.emplace_back(Args&: Ptr, Args: ScStart, Args: ScEnd, Args&: WritePtr, Args&: DepSetId, Args&: ASId, Args&: PtrExpr,
415 Args&: NeedsFreeze);
416}
417
418bool RuntimePointerChecking::tryToCreateDiffCheck(
419 const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
420 // If either group contains multiple different pointers, bail out.
421 // TODO: Support multiple pointers by using the minimum or maximum pointer,
422 // depending on src & sink.
423 if (CGI.Members.size() != 1 || CGJ.Members.size() != 1)
424 return false;
425
426 const PointerInfo *Src = &Pointers[CGI.Members[0]];
427 const PointerInfo *Sink = &Pointers[CGJ.Members[0]];
428
429 // If either pointer is read and written, multiple checks may be needed. Bail
430 // out.
431 if (!DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: !Src->IsWritePtr).empty() ||
432 !DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: !Sink->IsWritePtr).empty())
433 return false;
434
435 ArrayRef<unsigned> AccSrc =
436 DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: Src->IsWritePtr);
437 ArrayRef<unsigned> AccSink =
438 DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: Sink->IsWritePtr);
439 // If either pointer is accessed multiple times, there may not be a clear
440 // src/sink relation. Bail out for now.
441 if (AccSrc.size() != 1 || AccSink.size() != 1)
442 return false;
443
444 // If the sink is accessed before src, swap src/sink.
445 if (AccSink[0] < AccSrc[0])
446 std::swap(a&: Src, b&: Sink);
447
448 const SCEVConstant *Step;
449 const SCEV *SrcStart;
450 const SCEV *SinkStart;
451 const Loop *InnerLoop = DC.getInnermostLoop();
452 if (!match(S: Src->Expr,
453 P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SrcStart), Op1: m_SCEVConstant(V&: Step),
454 L: m_SpecificLoop(L: InnerLoop))) ||
455 !match(S: Sink->Expr,
456 P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SinkStart), Op1: m_scev_Specific(S: Step),
457 L: m_SpecificLoop(L: InnerLoop))))
458 return false;
459
460 SmallVector<Instruction *, 4> SrcInsts =
461 DC.getInstructionsForAccess(Ptr: Src->PointerValue, isWrite: Src->IsWritePtr);
462 SmallVector<Instruction *, 4> SinkInsts =
463 DC.getInstructionsForAccess(Ptr: Sink->PointerValue, isWrite: Sink->IsWritePtr);
464 Type *SrcTy = getLoadStoreType(I: SrcInsts[0]);
465 Type *DstTy = getLoadStoreType(I: SinkInsts[0]);
466 if (isa<ScalableVectorType>(Val: SrcTy) || isa<ScalableVectorType>(Val: DstTy))
467 return false;
468
469 const DataLayout &DL = InnerLoop->getHeader()->getDataLayout();
470 unsigned AllocSize =
471 std::max(a: DL.getTypeAllocSize(Ty: SrcTy), b: DL.getTypeAllocSize(Ty: DstTy));
472
473 // Only matching constant steps matching the AllocSize are supported at the
474 // moment. This simplifies the difference computation. Can be extended in the
475 // future.
476 if (Step->getAPInt().abs() != AllocSize)
477 return false;
478
479 // When counting down, the dependence distance needs to be swapped.
480 if (Step->getValue()->isNegative())
481 std::swap(a&: SinkStart, b&: SrcStart);
482
483 const SCEV *SinkStartInt = SE->getPtrToAddrExpr(Op: SinkStart);
484 const SCEV *SrcStartInt = SE->getPtrToAddrExpr(Op: SrcStart);
485 if (isa<SCEVCouldNotCompute>(Val: SinkStartInt) ||
486 isa<SCEVCouldNotCompute>(Val: SrcStartInt))
487 return false;
488
489 // If the start values for both Src and Sink also vary according to an outer
490 // loop, then it's probably better to avoid creating diff checks because
491 // they may not be hoisted. We should instead let llvm::addRuntimeChecks
492 // do the expanded full range overlap checks, which can be hoisted.
493 if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
494 isa<SCEVAddRecExpr>(Val: SinkStartInt) && isa<SCEVAddRecExpr>(Val: SrcStartInt)) {
495 auto *SrcStartAR = cast<SCEVAddRecExpr>(Val: SrcStartInt);
496 auto *SinkStartAR = cast<SCEVAddRecExpr>(Val: SinkStartInt);
497 const Loop *StartARLoop = SrcStartAR->getLoop();
498 if (StartARLoop == SinkStartAR->getLoop() &&
499 StartARLoop == InnerLoop->getParentLoop() &&
500 // If the diff check would already be loop invariant (due to the
501 // recurrences being the same), then we prefer to keep the diff checks
502 // because they are cheaper.
503 SrcStartAR->getStepRecurrence(SE&: *SE) !=
504 SinkStartAR->getStepRecurrence(SE&: *SE)) {
505 LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
506 "cannot be hoisted out of the outer loop\n");
507 return false;
508 }
509 }
510
511 LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
512 << "SrcStart: " << *SrcStartInt << '\n'
513 << "SinkStartInt: " << *SinkStartInt << '\n');
514 DiffChecks.emplace_back(Args&: SrcStartInt, Args&: SinkStartInt, Args&: AllocSize,
515 Args: Src->NeedsFreeze || Sink->NeedsFreeze);
516 return true;
517}
518
519SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
520 SmallVector<RuntimePointerCheck, 4> Checks;
521
522 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
523 for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
524 const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
525 const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];
526
527 if (needsChecking(M: CGI, N: CGJ)) {
528 CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ);
529 Checks.emplace_back(Args: &CGI, Args: &CGJ);
530 }
531 }
532 }
533 return Checks;
534}
535
536void RuntimePointerChecking::generateChecks(
537 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
538 assert(Checks.empty() && "Checks is not empty");
539 groupChecks(DepCands, UseDependencies);
540 Checks = generateChecks();
541}
542
543bool RuntimePointerChecking::needsChecking(
544 const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
545 for (const auto &I : M.Members)
546 for (const auto &J : N.Members)
547 if (needsChecking(I, J))
548 return true;
549 return false;
550}
551
552/// Compare \p I and \p J and return the minimum.
553/// Return nullptr in case we couldn't find an answer.
554static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
555 ScalarEvolution *SE) {
556 std::optional<APInt> Diff = SE->computeConstantDifference(LHS: J, RHS: I);
557 if (!Diff)
558 return nullptr;
559 return Diff->isNegative() ? J : I;
560}
561
562bool RuntimeCheckingPtrGroup::addPointer(
563 unsigned Index, const RuntimePointerChecking &RtCheck) {
564 return addPointer(
565 Index, Start: RtCheck.Pointers[Index].Start, End: RtCheck.Pointers[Index].End,
566 AS: RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
567 NeedsFreeze: RtCheck.Pointers[Index].NeedsFreeze, SE&: *RtCheck.SE);
568}
569
570bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
571 const SCEV *End, unsigned AS,
572 bool NeedsFreeze,
573 ScalarEvolution &SE) {
574 assert(AddressSpace == AS &&
575 "all pointers in a checking group must be in the same address space");
576
577 // Compare the starts and ends with the known minimum and maximum
578 // of this set. We need to know how we compare against the min/max
579 // of the set in order to be able to emit memchecks.
580 const SCEV *Min0 = getMinFromExprs(I: Start, J: Low, SE: &SE);
581 if (!Min0)
582 return false;
583
584 const SCEV *Min1 = getMinFromExprs(I: End, J: High, SE: &SE);
585 if (!Min1)
586 return false;
587
588 // Update the low bound expression if we've found a new min value.
589 if (Min0 == Start)
590 Low = Start;
591
592 // Update the high bound expression if we've found a new max value.
593 if (Min1 != End)
594 High = End;
595
596 Members.push_back(Elt: Index);
597 this->NeedsFreeze |= NeedsFreeze;
598 return true;
599}
600
601void RuntimePointerChecking::groupChecks(
602 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
603 // We build the groups from dependency candidates equivalence classes
604 // because:
605 // - We know that pointers in the same equivalence class share
606 // the same underlying object and therefore there is a chance
607 // that we can compare pointers
608 // - We wouldn't be able to merge two pointers for which we need
609 // to emit a memcheck. The classes in DepCands are already
610 // conveniently built such that no two pointers in the same
611 // class need checking against each other.
612
613 // We use the following (greedy) algorithm to construct the groups
614 // For every pointer in the equivalence class:
615 // For each existing group:
616 // - if the difference between this pointer and the min/max bounds
617 // of the group is a constant, then make the pointer part of the
618 // group and update the min/max bounds of that group as required.
619
620 CheckingGroups.clear();
621
622 // If we need to check two pointers to the same underlying object
623 // with a non-constant difference, we shouldn't perform any pointer
624 // grouping with those pointers. This is because we can easily get
625 // into cases where the resulting check would return false, even when
626 // the accesses are safe.
627 //
628 // The following example shows this:
629 // for (i = 0; i < 1000; ++i)
630 // a[5000 + i * m] = a[i] + a[i + 9000]
631 //
632 // Here grouping gives a check of (5000, 5000 + 1000 * m) against
633 // (0, 10000) which is always false. However, if m is 1, there is no
634 // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
635 // us to perform an accurate check in this case.
636 //
637 // In the above case, we have a non-constant distance and an Unknown
638 // dependence between accesses to the same underlying object, and could retry
639 // with runtime checks. Therefore UseDependencies is false. In this case we
640 // will use the fallback path and create separate checking groups for all
641 // pointers.
642
643 // If we don't have the dependency partitions, construct a new
644 // checking pointer group for each pointer. This is also required
645 // for correctness, because in this case we can have checking between
646 // pointers to the same underlying object.
647 if (!UseDependencies) {
648 for (unsigned I = 0; I < Pointers.size(); ++I)
649 CheckingGroups.emplace_back(Args&: I, Args&: *this);
650 return;
651 }
652
653 unsigned TotalComparisons = 0;
654
655 DenseMap<Value *, SmallVector<unsigned>> PositionMap;
656 for (unsigned Index = 0; Index < Pointers.size(); ++Index)
657 PositionMap[Pointers[Index].PointerValue].push_back(Elt: Index);
658
659 // We need to keep track of what pointers we've already seen so we
660 // don't process them twice.
661 SmallSet<unsigned, 2> Seen;
662
663 // Go through all equivalence classes, get the "pointer check groups"
664 // and add them to the overall solution. We use the order in which accesses
665 // appear in 'Pointers' to enforce determinism.
666 for (unsigned I = 0; I < Pointers.size(); ++I) {
667 // We've seen this pointer before, and therefore already processed
668 // its equivalence class.
669 if (Seen.contains(V: I))
670 continue;
671
672 MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
673 Pointers[I].IsWritePtr);
674
675 SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
676
677 // Because DepCands is constructed by visiting accesses in the order in
678 // which they appear in alias sets (which is deterministic) and the
679 // iteration order within an equivalence class member is only dependent on
680 // the order in which unions and insertions are performed on the
681 // equivalence class, the iteration order is deterministic.
682 for (auto M : DepCands.members(V: Access)) {
683 auto PointerI = PositionMap.find(Val: M.getPointer());
684 // If we can't find the pointer in PositionMap that means we can't
685 // generate a memcheck for it.
686 if (PointerI == PositionMap.end())
687 continue;
688 for (unsigned Pointer : PointerI->second) {
689 bool Merged = false;
690 // Mark this pointer as seen.
691 Seen.insert(V: Pointer);
692
693 // Go through all the existing sets and see if we can find one
694 // which can include this pointer.
695 for (RuntimeCheckingPtrGroup &Group : Groups) {
696 // Don't perform more than a certain amount of comparisons.
697 // This should limit the cost of grouping the pointers to something
698 // reasonable. If we do end up hitting this threshold, the algorithm
699 // will create separate groups for all remaining pointers.
700 if (TotalComparisons > MemoryCheckMergeThreshold)
701 break;
702
703 TotalComparisons++;
704
705 if (Group.addPointer(Index: Pointer, RtCheck: *this)) {
706 Merged = true;
707 break;
708 }
709 }
710
711 if (!Merged)
712 // We couldn't add this pointer to any existing set or the threshold
713 // for the number of comparisons has been reached. Create a new group
714 // to hold the current pointer.
715 Groups.emplace_back(Args&: Pointer, Args&: *this);
716 }
717 }
718
719 // We've computed the grouped checks for this partition.
720 // Save the results and continue with the next one.
721 llvm::append_range(C&: CheckingGroups, R&: Groups);
722 }
723}
724
725bool RuntimePointerChecking::arePointersInSamePartition(
726 const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
727 unsigned PtrIdx2) {
728 return (PtrToPartition[PtrIdx1] != -1 &&
729 PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
730}
731
732bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
733 const PointerInfo &PointerI = Pointers[I];
734 const PointerInfo &PointerJ = Pointers[J];
735
736 // No need to check if two readonly pointers intersect.
737 if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
738 return false;
739
740 // Only need to check pointers between two different dependency sets.
741 if (PointerI.DependencySetId == PointerJ.DependencySetId)
742 return false;
743
744 // Only need to check pointers in the same alias set.
745 return PointerI.AliasSetId == PointerJ.AliasSetId;
746}
747
748/// Assign each RuntimeCheckingPtrGroup pointer an index for stable UTC output.
749static DenseMap<const RuntimeCheckingPtrGroup *, unsigned>
750getPtrToIdxMap(ArrayRef<RuntimeCheckingPtrGroup> CheckingGroups) {
751 DenseMap<const RuntimeCheckingPtrGroup *, unsigned> PtrIndices;
752 for (const auto &[Idx, CG] : enumerate(First&: CheckingGroups))
753 PtrIndices[&CG] = Idx;
754 return PtrIndices;
755}
756
757void RuntimePointerChecking::printChecks(
758 raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
759 unsigned Depth) const {
760 unsigned N = 0;
761 auto PtrIndices = getPtrToIdxMap(CheckingGroups);
762 for (const auto &[Check1, Check2] : Checks) {
763 const auto &First = Check1->Members, &Second = Check2->Members;
764 OS.indent(NumSpaces: Depth) << "Check " << N++ << ":\n";
765 OS.indent(NumSpaces: Depth + 2) << "Comparing group GRP" << PtrIndices.at(Val: Check1)
766 << ":\n";
767 for (unsigned K : First)
768 OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n";
769 OS.indent(NumSpaces: Depth + 2) << "Against group GRP" << PtrIndices.at(Val: Check2)
770 << ":\n";
771 for (unsigned K : Second)
772 OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n";
773 }
774}
775
776void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
777
778 OS.indent(NumSpaces: Depth) << "Run-time memory checks:\n";
779 printChecks(OS, Checks, Depth);
780
781 OS.indent(NumSpaces: Depth) << "Grouped accesses:\n";
782 auto PtrIndices = getPtrToIdxMap(CheckingGroups);
783 for (const auto &CG : CheckingGroups) {
784 OS.indent(NumSpaces: Depth + 2) << "Group GRP" << PtrIndices.at(Val: &CG) << ":\n";
785 OS.indent(NumSpaces: Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
786 << ")\n";
787 for (unsigned Member : CG.Members) {
788 OS.indent(NumSpaces: Depth + 6) << "Member: " << *Pointers[Member].Expr << "\n";
789 }
790 }
791}
792
793namespace {
794
795/// Analyses memory accesses in a loop.
796///
797/// Checks whether run time pointer checks are needed and builds sets for data
798/// dependence checking.
799class AccessAnalysis {
800public:
801 /// Read or write access location.
802 typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
803 typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
804
805 AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
806 DominatorTree &DT, MemoryDepChecker::DepCandidates &DA,
807 PredicatedScalarEvolution &PSE,
808 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
809 : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA),
810 PSE(PSE), LoopAliasScopes(LoopAliasScopes) {
811 // We're analyzing dependences across loop iterations.
812 BAA.enableCrossIterationMode();
813 }
814
815 /// Register a load and whether it is only read from.
816 void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
817 Value *Ptr = const_cast<Value *>(Loc.Ptr);
818 AST.add(Loc: adjustLoc(Loc));
819 Accesses[MemAccessInfo(Ptr, false)].insert(X: AccessTy);
820 if (IsReadOnly)
821 ReadOnlyPtr.insert(Ptr);
822 }
823
824 /// Register a store.
825 void addStore(const MemoryLocation &Loc, Type *AccessTy) {
826 Value *Ptr = const_cast<Value *>(Loc.Ptr);
827 AST.add(Loc: adjustLoc(Loc));
828 Accesses[MemAccessInfo(Ptr, true)].insert(X: AccessTy);
829 }
830
831 /// Check if we can emit a run-time no-alias check for \p Access.
832 ///
833 /// Returns true if we can emit a run-time no alias check for \p Access.
834 /// If we can check this access, this also adds it to a dependence set and
835 /// adds a run-time to check for it to \p RtCheck. If \p Assume is true,
836 /// we will attempt to use additional run-time checks in order to get
837 /// the bounds of the pointer.
838 bool createCheckForAccess(RuntimePointerChecking &RtCheck,
839 MemAccessInfo Access, Type *AccessTy,
840 const DenseMap<Value *, const SCEV *> &Strides,
841 DenseMap<Value *, unsigned> &DepSetId,
842 Loop *TheLoop, unsigned &RunningDepId,
843 unsigned ASId, bool Assume);
844
845 /// Check whether we can check the pointers at runtime for
846 /// non-intersection.
847 ///
848 /// Returns true if we need no check or if we do and we can generate them
849 /// (i.e. the pointers have computable bounds). A return value of false means
850 /// we couldn't analyze and generate runtime checks for all pointers in the
851 /// loop, but if \p AllowPartial is set then we will have checks for those
852 /// pointers we could analyze.
853 bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, Loop *TheLoop,
854 const DenseMap<Value *, const SCEV *> &Strides,
855 Value *&UncomputablePtr, bool AllowPartial);
856
857 /// Goes over all memory accesses, checks whether a RT check is needed
858 /// and builds sets of dependent accesses.
859 void buildDependenceSets() {
860 processMemAccesses();
861 }
862
863 /// Initial processing of memory accesses determined that we need to
864 /// perform dependency checking.
865 ///
866 /// Note that this can later be cleared if we retry memcheck analysis without
867 /// dependency checking (i.e. ShouldRetryWithRuntimeChecks).
868 bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); }
869
870 /// We decided that no dependence analysis would be used. Reset the state.
871 void resetDepChecks(MemoryDepChecker &DepChecker) {
872 CheckDeps.clear();
873 DepChecker.clearDependences();
874 }
875
876 const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; }
877
878private:
879 typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
880
881 /// Adjust the MemoryLocation so that it represents accesses to this
882 /// location across all iterations, rather than a single one.
883 MemoryLocation adjustLoc(MemoryLocation Loc) const {
884 // The accessed location varies within the loop, but remains within the
885 // underlying object.
886 Loc.Size = LocationSize::beforeOrAfterPointer();
887 Loc.AATags.Scope = adjustAliasScopeList(ScopeList: Loc.AATags.Scope);
888 Loc.AATags.NoAlias = adjustAliasScopeList(ScopeList: Loc.AATags.NoAlias);
889 return Loc;
890 }
891
892 /// Drop alias scopes that are only valid within a single loop iteration.
893 MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
894 if (!ScopeList)
895 return nullptr;
896
897 // For the sake of simplicity, drop the whole scope list if any scope is
898 // iteration-local.
899 if (any_of(Range: ScopeList->operands(), P: [&](Metadata *Scope) {
900 return LoopAliasScopes.contains(Ptr: cast<MDNode>(Val: Scope));
901 }))
902 return nullptr;
903
904 return ScopeList;
905 }
906
907 /// Go over all memory access and check whether runtime pointer checks
908 /// are needed and build sets of dependency check candidates.
909 void processMemAccesses();
910
911 /// Map of all accesses. Values are the types used to access memory pointed to
912 /// by the pointer.
913 PtrAccessMap Accesses;
914
915 /// The loop being checked.
916 const Loop *TheLoop;
917
918 /// List of accesses that need a further dependence check.
919 MemAccessInfoList CheckDeps;
920
921 /// Set of pointers that are read only.
922 SmallPtrSet<Value*, 16> ReadOnlyPtr;
923
924 /// Batched alias analysis results.
925 BatchAAResults BAA;
926
927 /// An alias set tracker to partition the access set by underlying object and
928 //intrinsic property (such as TBAA metadata).
929 AliasSetTracker AST;
930
931 /// The LoopInfo of the loop being checked.
932 const LoopInfo *LI;
933
934 /// The dominator tree of the function.
935 DominatorTree &DT;
936
937 /// Sets of potentially dependent accesses - members of one set share an
938 /// underlying pointer. The set "CheckDeps" identfies which sets really need a
939 /// dependence check.
940 MemoryDepChecker::DepCandidates &DepCands;
941
942 /// Initial processing of memory accesses determined that we may need
943 /// to add memchecks. Perform the analysis to determine the necessary checks.
944 ///
945 /// Note that, this is different from isDependencyCheckNeeded. When we retry
946 /// memcheck analysis without dependency checking
947 /// (i.e. ShouldRetryWithRuntimeChecks), isDependencyCheckNeeded is
948 /// cleared while this remains set if we have potentially dependent accesses.
949 bool IsRTCheckAnalysisNeeded = false;
950
951 /// The SCEV predicate containing all the SCEV-related assumptions.
952 PredicatedScalarEvolution &PSE;
953
954 DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;
955
956 /// Alias scopes that are declared inside the loop, and as such not valid
957 /// across iterations.
958 SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
959};
960
961} // end anonymous namespace
962
963/// Try to compute a constant stride for \p AR. Used by getPtrStride and
964/// isNoWrap.
965static std::optional<int64_t>
966getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
967 Value *Ptr, PredicatedScalarEvolution &PSE) {
968 if (isa<ScalableVectorType>(Val: AccessTy)) {
969 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
970 << "\n");
971 return std::nullopt;
972 }
973
974 // The access function must stride over the innermost loop.
975 if (Lp != AR->getLoop()) {
976 LLVM_DEBUG({
977 dbgs() << "LAA: Bad stride - Not striding over innermost loop ";
978 if (Ptr)
979 dbgs() << *Ptr << " ";
980
981 dbgs() << "SCEV: " << *AR << "\n";
982 });
983 return std::nullopt;
984 }
985
986 // Check the step is constant.
987 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
988
989 // Calculate the pointer stride and check if it is constant.
990 const APInt *APStepVal;
991 if (!match(S: Step, P: m_scev_APInt(C&: APStepVal))) {
992 LLVM_DEBUG({
993 dbgs() << "LAA: Bad stride - Not a constant strided ";
994 if (Ptr)
995 dbgs() << *Ptr << " ";
996 dbgs() << "SCEV: " << *AR << "\n";
997 });
998 return std::nullopt;
999 }
1000
1001 const auto &DL = Lp->getHeader()->getDataLayout();
1002 TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy);
1003 int64_t Size = AllocSize.getFixedValue();
1004
1005 // Huge step value - give up.
1006 std::optional<int64_t> StepVal = APStepVal->trySExtValue();
1007 if (!StepVal)
1008 return std::nullopt;
1009
1010 // Strided access.
1011 return *StepVal % Size ? std::nullopt : std::make_optional(t: *StepVal / Size);
1012}
1013
1014/// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use
1015/// informating from the IR pointer value to determine no-wrap.
1016static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
1017 Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
1018 const DominatorTree &DT,
1019 std::optional<int64_t> Stride = std::nullopt) {
1020 // FIXME: This should probably only return true for NUW.
1021 if (AR->getNoWrapFlags(Mask: SCEV::NoWrapMask))
1022 return true;
1023
1024 if (Ptr && PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW))
1025 return true;
1026
1027 // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap,
1028 // the distance between the previously accessed location and the wrapped
1029 // location will be larger than half the pointer index type space. In that
1030 // case, the GEP would be poison and any memory access dependent on it would
1031 // be immediate UB when executed.
1032 if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Val: Ptr);
1033 GEP && GEP->hasNoUnsignedSignedWrap()) {
1034 // For the above reasoning to apply, the pointer must be dereferenced in
1035 // every iteration.
1036 if (L->getHeader() == L->getLoopLatch() ||
1037 any_of(Range: GEP->users(), P: [L, &DT, GEP](User *U) {
1038 if (getLoadStorePointerOperand(V: U) != GEP)
1039 return false;
1040 BasicBlock *UserBB = cast<Instruction>(Val: U)->getParent();
1041 if (!L->contains(BB: UserBB))
1042 return false;
1043 return !LoopAccessInfo::blockNeedsPredication(BB: UserBB, TheLoop: L, DT: &DT);
1044 }))
1045 return true;
1046 }
1047
1048 if (!Stride)
1049 Stride = getStrideFromAddRec(AR, Lp: L, AccessTy, Ptr, PSE);
1050 if (Stride) {
1051 // If the null pointer is undefined, then a access sequence which would
1052 // otherwise access it can be assumed not to unsigned wrap. Note that this
1053 // assumes the object in memory is aligned to the natural alignment.
1054 unsigned AddrSpace = AR->getType()->getPointerAddressSpace();
1055 if (!NullPointerIsDefined(F: L->getHeader()->getParent(), AS: AddrSpace) &&
1056 (Stride == 1 || Stride == -1))
1057 return true;
1058 }
1059
1060 if (Ptr && Assume) {
1061 PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW);
1062 LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n"
1063 << "LAA: Pointer: " << *Ptr << "\n"
1064 << "LAA: SCEV: " << *AR << "\n"
1065 << "LAA: Added an overflow assumption\n");
1066 return true;
1067 }
1068
1069 return false;
1070}
1071
1072static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
1073 function_ref<void(Value *)> AddPointer) {
1074 SmallPtrSet<Value *, 8> Visited;
1075 SmallVector<Value *> WorkList;
1076 WorkList.push_back(Elt: StartPtr);
1077
1078 while (!WorkList.empty()) {
1079 Value *Ptr = WorkList.pop_back_val();
1080 if (!Visited.insert(Ptr).second)
1081 continue;
1082 auto *PN = dyn_cast<PHINode>(Val: Ptr);
1083 // SCEV does not look through non-header PHIs inside the loop. Such phis
1084 // can be analyzed by adding separate accesses for each incoming pointer
1085 // value.
1086 if (PN && InnermostLoop.contains(BB: PN->getParent()) &&
1087 PN->getParent() != InnermostLoop.getHeader()) {
1088 llvm::append_range(C&: WorkList, R: PN->incoming_values());
1089 } else
1090 AddPointer(Ptr);
1091 }
1092}
1093
1094// Walk back through the IR for a pointer, looking for a select like the
1095// following:
1096//
1097// %offset = select i1 %cmp, i64 %a, i64 %b
1098// %addr = getelementptr double, double* %base, i64 %offset
1099// %ld = load double, double* %addr, align 8
1100//
1101// We won't be able to form a single SCEVAddRecExpr from this since the
1102// address for each loop iteration depends on %cmp. We could potentially
1103// produce multiple valid SCEVAddRecExprs, though, and check all of them for
1104// memory safety/aliasing if needed.
1105//
1106// If we encounter some IR we don't yet handle, or something obviously fine
1107// like a constant, then we just add the SCEV for that term to the list passed
1108// in by the caller. If we have a node that may potentially yield a valid
1109// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
1110// ourselves before adding to the list.
1111static void findForkedSCEVs(
1112 ScalarEvolution *SE, const Loop *L, Value *Ptr,
1113 SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
1114 unsigned Depth) {
1115 // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
1116 // we've exceeded our limit on recursion, just return whatever we have
1117 // regardless of whether it can be used for a forked pointer or not, along
1118 // with an indication of whether it might be a poison or undef value.
1119 const SCEV *Scev = SE->getSCEV(V: Ptr);
1120 if (isa<SCEVAddRecExpr>(Val: Scev) || L->isLoopInvariant(V: Ptr) ||
1121 !isa<Instruction>(Val: Ptr) || Depth == 0) {
1122 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1123 return;
1124 }
1125
1126 Depth--;
1127
1128 auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
1129 return get<1>(Pair: S);
1130 };
1131
1132 auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
1133 switch (Opcode) {
1134 case Instruction::Add:
1135 return SE->getAddExpr(LHS: L, RHS: R);
1136 case Instruction::Sub:
1137 return SE->getMinusSCEV(LHS: L, RHS: R);
1138 default:
1139 llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
1140 }
1141 };
1142
1143 Instruction *I = cast<Instruction>(Val: Ptr);
1144 unsigned Opcode = I->getOpcode();
1145 switch (Opcode) {
1146 case Instruction::GetElementPtr: {
1147 auto *GEP = cast<GetElementPtrInst>(Val: I);
1148 Type *SourceTy = GEP->getSourceElementType();
1149 // We only handle base + single offset GEPs here for now.
1150 // Not dealing with preexisting gathers yet, so no vectors.
1151 if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
1152 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: GEP));
1153 break;
1154 }
1155 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
1156 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
1157 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: BaseScevs, Depth);
1158 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: OffsetScevs, Depth);
1159
1160 // See if we need to freeze our fork...
1161 bool NeedsFreeze = any_of(Range&: BaseScevs, P: UndefPoisonCheck) ||
1162 any_of(Range&: OffsetScevs, P: UndefPoisonCheck);
1163
1164 // Check that we only have a single fork, on either the base or the offset.
1165 // Copy the SCEV across for the one without a fork in order to generate
1166 // the full SCEV for both sides of the GEP.
1167 if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
1168 BaseScevs.push_back(Elt: BaseScevs[0]);
1169 else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
1170 OffsetScevs.push_back(Elt: OffsetScevs[0]);
1171 else {
1172 ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze);
1173 break;
1174 }
1175
1176 Type *IntPtrTy = SE->getEffectiveSCEVType(Ty: GEP->getPointerOperandType());
1177
1178 // Find the size of the type being pointed to. We only have a single
1179 // index term (guarded above) so we don't need to index into arrays or
1180 // structures, just get the size of the scalar value.
1181 const SCEV *Size = SE->getSizeOfExpr(IntTy: IntPtrTy, AllocTy: SourceTy);
1182
1183 for (auto [B, O] : zip(t&: BaseScevs, u&: OffsetScevs)) {
1184 const SCEV *Base = get<0>(Pair: B);
1185 const SCEV *Offset = get<0>(Pair: O);
1186
1187 // Scale up the offsets by the size of the type, then add to the bases.
1188 const SCEV *Scaled =
1189 SE->getMulExpr(LHS: Size, RHS: SE->getTruncateOrSignExtend(V: Offset, Ty: IntPtrTy));
1190 ScevList.emplace_back(Args: SE->getAddExpr(LHS: Base, RHS: Scaled), Args&: NeedsFreeze);
1191 }
1192 break;
1193 }
1194 case Instruction::Select: {
1195 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
1196 // A select means we've found a forked pointer, but we currently only
1197 // support a single select per pointer so if there's another behind this
1198 // then we just bail out and return the generic SCEV.
1199 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth);
1200 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 2), ScevList&: ChildScevs, Depth);
1201 if (ChildScevs.size() == 2)
1202 append_range(C&: ScevList, R&: ChildScevs);
1203 else
1204 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1205 break;
1206 }
1207 case Instruction::PHI: {
1208 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
1209 // A phi means we've found a forked pointer, but we currently only
1210 // support a single phi per pointer so if there's another behind this
1211 // then we just bail out and return the generic SCEV.
1212 if (I->getNumOperands() == 2) {
1213 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: ChildScevs, Depth);
1214 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth);
1215 }
1216 if (ChildScevs.size() == 2)
1217 append_range(C&: ScevList, R&: ChildScevs);
1218 else
1219 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1220 break;
1221 }
1222 case Instruction::Add:
1223 case Instruction::Sub: {
1224 SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs;
1225 SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs;
1226 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: LScevs, Depth);
1227 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: RScevs, Depth);
1228
1229 // See if we need to freeze our fork...
1230 bool NeedsFreeze =
1231 any_of(Range&: LScevs, P: UndefPoisonCheck) || any_of(Range&: RScevs, P: UndefPoisonCheck);
1232
1233 // Check that we only have a single fork, on either the left or right side.
1234 // Copy the SCEV across for the one without a fork in order to generate
1235 // the full SCEV for both sides of the BinOp.
1236 if (LScevs.size() == 2 && RScevs.size() == 1)
1237 RScevs.push_back(Elt: RScevs[0]);
1238 else if (RScevs.size() == 2 && LScevs.size() == 1)
1239 LScevs.push_back(Elt: LScevs[0]);
1240 else {
1241 ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze);
1242 break;
1243 }
1244
1245 for (auto [L, R] : zip(t&: LScevs, u&: RScevs))
1246 ScevList.emplace_back(Args: GetBinOpExpr(Opcode, get<0>(Pair: L), get<0>(Pair: R)),
1247 Args&: NeedsFreeze);
1248 break;
1249 }
1250 default:
1251 // Just return the current SCEV if we haven't handled the instruction yet.
1252 LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
1253 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1254 break;
1255 }
1256}
1257
1258bool AccessAnalysis::createCheckForAccess(
1259 RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy,
1260 const DenseMap<Value *, const SCEV *> &StridesMap,
1261 DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop,
1262 unsigned &RunningDepId, unsigned ASId, bool Assume) {
1263 Value *Ptr = Access.getPointer();
1264 ScalarEvolution *SE = PSE.getSE();
1265 assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
1266
1267 SmallVector<PointerIntPair<const SCEV *, 1, bool>> RTCheckPtrs;
1268 findForkedSCEVs(SE, L: TheLoop, Ptr, ScevList&: RTCheckPtrs, Depth: MaxForkedSCEVDepth);
1269 assert(!RTCheckPtrs.empty() &&
1270 "Must have some runtime-check pointer candidates");
1271
1272 // RTCheckPtrs must have size 2 if there are forked pointers. Otherwise, there
1273 // are no forked pointers; replaceSymbolicStridesSCEV in this case.
1274 auto IsLoopInvariantOrAR =
1275 [&SE, &TheLoop](const PointerIntPair<const SCEV *, 1, bool> &P) {
1276 return SE->isLoopInvariant(S: P.getPointer(), L: TheLoop) ||
1277 isa<SCEVAddRecExpr>(Val: P.getPointer());
1278 };
1279 if (RTCheckPtrs.size() == 2 && all_of(Range&: RTCheckPtrs, P: IsLoopInvariantOrAR)) {
1280 LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n";
1281 for (const auto &[Idx, Q] : enumerate(RTCheckPtrs)) dbgs()
1282 << "\t(" << Idx << ") " << *Q.getPointer() << "\n");
1283 } else {
1284 RTCheckPtrs = {{replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr), false}};
1285 }
1286
1287 /// Check whether all pointers can participate in a runtime bounds check. They
1288 /// must either be invariant or non-wrapping affine AddRecs.
1289 for (auto &P : RTCheckPtrs) {
1290 // The bounds for loop-invariant pointer is trivial.
1291 if (SE->isLoopInvariant(S: P.getPointer(), L: TheLoop))
1292 continue;
1293
1294 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: P.getPointer());
1295 if (!AR && Assume)
1296 AR = PSE.getAsAddRec(V: Ptr);
1297 if (!AR || !AR->isAffine())
1298 return false;
1299
1300 // If there's only one option for Ptr, look it up after bounds and wrap
1301 // checking, because assumptions might have been added to PSE.
1302 if (RTCheckPtrs.size() == 1) {
1303 AR =
1304 cast<SCEVAddRecExpr>(Val: replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr));
1305 P.setPointer(AR);
1306 }
1307
1308 if (!isNoWrap(PSE, AR, Ptr: RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
1309 L: TheLoop, Assume, DT))
1310 return false;
1311 }
1312
1313 for (const auto &[PtrExpr, NeedsFreeze] : RTCheckPtrs) {
1314 // The id of the dependence set.
1315 unsigned DepId;
1316
1317 if (isDependencyCheckNeeded()) {
1318 Value *Leader = DepCands.getLeaderValue(V: Access).getPointer();
1319 unsigned &LeaderId = DepSetId[Leader];
1320 if (!LeaderId)
1321 LeaderId = RunningDepId++;
1322 DepId = LeaderId;
1323 } else
1324 // Each access has its own dependence set.
1325 DepId = RunningDepId++;
1326
1327 bool IsWrite = Access.getInt();
1328 RtCheck.insert(Lp: TheLoop, Ptr, PtrExpr, AccessTy, WritePtr: IsWrite, DepSetId: DepId, ASId, PSE,
1329 NeedsFreeze);
1330 LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
1331 }
1332
1333 return true;
1334}
1335
1336bool AccessAnalysis::canCheckPtrAtRT(
1337 RuntimePointerChecking &RtCheck, Loop *TheLoop,
1338 const DenseMap<Value *, const SCEV *> &StridesMap, Value *&UncomputablePtr,
1339 bool AllowPartial) {
1340 // Find pointers with computable bounds. We are going to use this information
1341 // to place a runtime bound check.
1342 bool CanDoRT = true;
1343
1344 bool MayNeedRTCheck = false;
1345 if (!IsRTCheckAnalysisNeeded) return true;
1346
1347 bool IsDepCheckNeeded = isDependencyCheckNeeded();
1348
1349 // We assign a consecutive id to access from different alias sets.
1350 // Accesses between different groups doesn't need to be checked.
1351 unsigned ASId = 0;
1352 for (const auto &AS : AST) {
1353 int NumReadPtrChecks = 0;
1354 int NumWritePtrChecks = 0;
1355 bool CanDoAliasSetRT = true;
1356 ++ASId;
1357 auto ASPointers = AS.getPointers();
1358
1359 // We assign consecutive id to access from different dependence sets.
1360 // Accesses within the same set don't need a runtime check.
1361 unsigned RunningDepId = 1;
1362 DenseMap<Value *, unsigned> DepSetId;
1363
1364 SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;
1365
1366 // First, count how many write and read accesses are in the alias set. Also
1367 // collect MemAccessInfos for later.
1368 SmallVector<MemAccessInfo, 4> AccessInfos;
1369 for (const Value *ConstPtr : ASPointers) {
1370 Value *Ptr = const_cast<Value *>(ConstPtr);
1371 bool IsWrite = Accesses.contains(Key: MemAccessInfo(Ptr, true));
1372 if (IsWrite)
1373 ++NumWritePtrChecks;
1374 else
1375 ++NumReadPtrChecks;
1376 AccessInfos.emplace_back(Args&: Ptr, Args&: IsWrite);
1377 }
1378
1379 // We do not need runtime checks for this alias set, if there are no writes
1380 // or a single write and no reads.
1381 if (NumWritePtrChecks == 0 ||
1382 (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
1383 assert((ASPointers.size() <= 1 ||
1384 all_of(ASPointers,
1385 [this](const Value *Ptr) {
1386 MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
1387 true);
1388 return !DepCands.contains(AccessWrite);
1389 })) &&
1390 "Can only skip updating CanDoRT below, if all entries in AS "
1391 "are reads or there is at most 1 entry");
1392 continue;
1393 }
1394
1395 for (auto &Access : AccessInfos) {
1396 for (const auto &AccessTy : Accesses[Access]) {
1397 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1398 DepSetId, TheLoop, RunningDepId, ASId,
1399 Assume: false)) {
1400 LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
1401 << *Access.getPointer() << '\n');
1402 Retries.emplace_back(Args&: Access, Args: AccessTy);
1403 CanDoAliasSetRT = false;
1404 }
1405 }
1406 }
1407
1408 // Note that this function computes CanDoRT and MayNeedRTCheck
1409 // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
1410 // we have a pointer for which we couldn't find the bounds but we don't
1411 // actually need to emit any checks so it does not matter.
1412 //
1413 // We need runtime checks for this alias set, if there are at least 2
1414 // dependence sets (in which case RunningDepId > 2) or if we need to re-try
1415 // any bound checks (because in that case the number of dependence sets is
1416 // incomplete).
1417 bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
1418
1419 // We need to perform run-time alias checks, but some pointers had bounds
1420 // that couldn't be checked.
1421 if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
1422 // Reset the CanDoSetRt flag and retry all accesses that have failed.
1423 // We know that we need these checks, so we can now be more aggressive
1424 // and add further checks if required (overflow checks).
1425 CanDoAliasSetRT = true;
1426 for (const auto &[Access, AccessTy] : Retries) {
1427 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1428 DepSetId, TheLoop, RunningDepId, ASId,
1429 /*Assume=*/true)) {
1430 CanDoAliasSetRT = false;
1431 UncomputablePtr = Access.getPointer();
1432 if (!AllowPartial)
1433 break;
1434 }
1435 }
1436 }
1437
1438 CanDoRT &= CanDoAliasSetRT;
1439 MayNeedRTCheck |= NeedsAliasSetRTCheck;
1440 ++ASId;
1441 }
1442
1443 // If the pointers that we would use for the bounds comparison have different
1444 // address spaces, assume the values aren't directly comparable, so we can't
1445 // use them for the runtime check. We also have to assume they could
1446 // overlap. In the future there should be metadata for whether address spaces
1447 // are disjoint.
1448 unsigned NumPointers = RtCheck.Pointers.size();
1449 for (unsigned i = 0; i < NumPointers; ++i) {
1450 for (unsigned j = i + 1; j < NumPointers; ++j) {
1451 // Only need to check pointers between two different dependency sets.
1452 if (RtCheck.Pointers[i].DependencySetId ==
1453 RtCheck.Pointers[j].DependencySetId)
1454 continue;
1455 // Only need to check pointers in the same alias set.
1456 if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
1457 continue;
1458
1459 Value *PtrI = RtCheck.Pointers[i].PointerValue;
1460 Value *PtrJ = RtCheck.Pointers[j].PointerValue;
1461
1462 unsigned ASi = PtrI->getType()->getPointerAddressSpace();
1463 unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
1464 if (ASi != ASj) {
1465 LLVM_DEBUG(
1466 dbgs() << "LAA: Runtime check would require comparison between"
1467 " different address spaces\n");
1468 return false;
1469 }
1470 }
1471 }
1472
1473 if (MayNeedRTCheck && (CanDoRT || AllowPartial))
1474 RtCheck.generateChecks(DepCands, UseDependencies: IsDepCheckNeeded);
1475
1476 LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
1477 << " pointer comparisons.\n");
1478
1479 // If we can do run-time checks, but there are no checks, no runtime checks
1480 // are needed. This can happen when all pointers point to the same underlying
1481 // object for example.
1482 RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
1483
1484 bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
1485 assert(CanDoRTIfNeeded == (CanDoRT || !MayNeedRTCheck) &&
1486 "CanDoRTIfNeeded depends on RtCheck.Need");
1487 if (!CanDoRTIfNeeded && !AllowPartial)
1488 RtCheck.reset();
1489 return CanDoRTIfNeeded;
1490}
1491
1492void AccessAnalysis::processMemAccesses() {
1493 // We process the set twice: first we process read-write pointers, last we
1494 // process read-only pointers. This allows us to skip dependence tests for
1495 // read-only pointers.
1496
1497 LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
1498 LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
1499 LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
1500 LLVM_DEBUG({
1501 for (const auto &[A, _] : Accesses)
1502 dbgs() << "\t" << *A.getPointer() << " ("
1503 << (A.getInt()
1504 ? "write"
1505 : (ReadOnlyPtr.contains(A.getPointer()) ? "read-only"
1506 : "read"))
1507 << ")\n";
1508 });
1509
1510 // The AliasSetTracker has nicely partitioned our pointers by metadata
1511 // compatibility and potential for underlying-object overlap. As a result, we
1512 // only need to check for potential pointer dependencies within each alias
1513 // set.
1514 for (const auto &AS : AST) {
1515 // Note that both the alias-set tracker and the alias sets themselves used
1516 // ordered collections internally and so the iteration order here is
1517 // deterministic.
1518 auto ASPointers = AS.getPointers();
1519
1520 bool SetHasWrite = false;
1521
1522 // Map of (pointer to underlying objects, accessed address space) to last
1523 // access encountered.
1524 typedef DenseMap<std::pair<const Value *, unsigned>, MemAccessInfo>
1525 UnderlyingObjToAccessMap;
1526 UnderlyingObjToAccessMap ObjToLastAccess;
1527
1528 // Set of access to check after all writes have been processed.
1529 PtrAccessMap DeferredAccesses;
1530
1531 // Iterate over each alias set twice, once to process read/write pointers,
1532 // and then to process read-only pointers.
1533 for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
1534 bool UseDeferred = SetIteration > 0;
1535 PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
1536
1537 for (const Value *ConstPtr : ASPointers) {
1538 Value *Ptr = const_cast<Value *>(ConstPtr);
1539
1540 // For a single memory access in AliasSetTracker, Accesses may contain
1541 // both read and write, and they both need to be handled for CheckDeps.
1542 for (const auto &[AC, _] : S) {
1543 if (AC.getPointer() != Ptr)
1544 continue;
1545
1546 bool IsWrite = AC.getInt();
1547
1548 // If we're using the deferred access set, then it contains only
1549 // reads.
1550 bool IsReadOnlyPtr = ReadOnlyPtr.contains(Ptr) && !IsWrite;
1551 if (UseDeferred && !IsReadOnlyPtr)
1552 continue;
1553 // Otherwise, the pointer must be in the PtrAccessSet, either as a
1554 // read or a write.
1555 assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
1556 S.contains(MemAccessInfo(Ptr, false))) &&
1557 "Alias-set pointer not in the access set?");
1558
1559 MemAccessInfo Access(Ptr, IsWrite);
1560 DepCands.insert(Data: Access);
1561
1562 // Memorize read-only pointers for later processing and skip them in
1563 // the first round (they need to be checked after we have seen all
1564 // write pointers). Note: we also mark pointer that are not
1565 // consecutive as "read-only" pointers (so that we check
1566 // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1567 if (!UseDeferred && IsReadOnlyPtr) {
1568 // We only use the pointer keys, the types vector values don't
1569 // matter.
1570 DeferredAccesses.insert(KV: {Access, {}});
1571 continue;
1572 }
1573
1574 // If this is a write - check other reads and writes for conflicts. If
1575 // this is a read only check other writes for conflicts (but only if
1576 // there is no other write to the ptr - this is an optimization to
1577 // catch "a[i] = a[i] + " without having to do a dependence check).
1578 if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
1579 CheckDeps.push_back(Elt: Access);
1580 IsRTCheckAnalysisNeeded = true;
1581 }
1582
1583 if (IsWrite)
1584 SetHasWrite = true;
1585
1586 // Create sets of pointers connected by a shared alias set and
1587 // underlying object.
1588 SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
1589 UOs = {};
1590 ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI);
1591 LLVM_DEBUG(dbgs()
1592 << "Underlying objects for pointer " << *Ptr << "\n");
1593 for (const Value *UnderlyingObj : UOs) {
1594 // nullptr never alias, don't join sets for pointer that have "null"
1595 // in their UnderlyingObjects list.
1596 if (isa<ConstantPointerNull>(Val: UnderlyingObj) &&
1597 !NullPointerIsDefined(
1598 F: TheLoop->getHeader()->getParent(),
1599 AS: UnderlyingObj->getType()->getPointerAddressSpace()))
1600 continue;
1601
1602 auto [It, Inserted] = ObjToLastAccess.try_emplace(
1603 Key: {UnderlyingObj,
1604 cast<PointerType>(Val: Ptr->getType())->getAddressSpace()},
1605 Args&: Access);
1606 if (!Inserted) {
1607 DepCands.unionSets(V1: Access, V2: It->second);
1608 It->second = Access;
1609 }
1610
1611 LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
1612 }
1613 }
1614 }
1615 }
1616 }
1617}
1618
1619/// Check whether the access through \p Ptr has a constant stride.
1620std::optional<int64_t>
1621llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
1622 const Loop *Lp, const DominatorTree &DT,
1623 const DenseMap<Value *, const SCEV *> &StridesMap,
1624 bool Assume, bool ShouldCheckWrap) {
1625 const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr);
1626 if (PSE.getSE()->isLoopInvariant(S: PtrScev, L: Lp))
1627 return 0;
1628
1629 assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
1630
1631 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev);
1632 if (Assume && !AR)
1633 AR = PSE.getAsAddRec(V: Ptr);
1634
1635 if (!AR) {
1636 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
1637 << " SCEV: " << *PtrScev << "\n");
1638 return std::nullopt;
1639 }
1640
1641 std::optional<int64_t> Stride =
1642 getStrideFromAddRec(AR, Lp, AccessTy, Ptr, PSE);
1643 if (!ShouldCheckWrap || !Stride)
1644 return Stride;
1645
1646 if (isNoWrap(PSE, AR, Ptr, AccessTy, L: Lp, Assume, DT, Stride))
1647 return Stride;
1648
1649 LLVM_DEBUG(
1650 dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
1651 << *Ptr << " SCEV: " << *AR << "\n");
1652 return std::nullopt;
1653}
1654
1655std::optional<int64_t> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
1656 Type *ElemTyB, Value *PtrB,
1657 const DataLayout &DL,
1658 ScalarEvolution &SE,
1659 bool StrictCheck, bool CheckType) {
1660 assert(PtrA && PtrB && "Expected non-nullptr pointers.");
1661
1662 // Make sure that A and B are different pointers.
1663 if (PtrA == PtrB)
1664 return 0;
1665
1666 // Make sure that the element types are the same if required.
1667 if (CheckType && ElemTyA != ElemTyB)
1668 return std::nullopt;
1669
1670 unsigned ASA = PtrA->getType()->getPointerAddressSpace();
1671 unsigned ASB = PtrB->getType()->getPointerAddressSpace();
1672
1673 // Check that the address spaces match.
1674 if (ASA != ASB)
1675 return std::nullopt;
1676 unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA);
1677
1678 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
1679 const Value *PtrA1 = PtrA->stripAndAccumulateConstantOffsets(
1680 DL, Offset&: OffsetA, /*AllowNonInbounds=*/true);
1681 const Value *PtrB1 = PtrB->stripAndAccumulateConstantOffsets(
1682 DL, Offset&: OffsetB, /*AllowNonInbounds=*/true);
1683
1684 std::optional<int64_t> Val;
1685 if (PtrA1 == PtrB1) {
1686 // Retrieve the address space again as pointer stripping now tracks through
1687 // `addrspacecast`.
1688 ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace();
1689 ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace();
1690 // Check that the address spaces match and that the pointers are valid.
1691 if (ASA != ASB)
1692 return std::nullopt;
1693
1694 IdxWidth = DL.getIndexSizeInBits(AS: ASA);
1695 OffsetA = OffsetA.sextOrTrunc(width: IdxWidth);
1696 OffsetB = OffsetB.sextOrTrunc(width: IdxWidth);
1697
1698 OffsetB -= OffsetA;
1699 Val = OffsetB.trySExtValue();
1700 } else {
1701 // Otherwise compute the distance with SCEV between the base pointers.
1702 const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA);
1703 const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB);
1704 std::optional<APInt> Diff =
1705 SE.computeConstantDifference(LHS: PtrSCEVB, RHS: PtrSCEVA);
1706 if (!Diff)
1707 return std::nullopt;
1708 Val = Diff->trySExtValue();
1709 }
1710
1711 if (!Val)
1712 return std::nullopt;
1713
1714 int64_t Size = DL.getTypeStoreSize(Ty: ElemTyA);
1715 int64_t Dist = *Val / Size;
1716
1717 // Ensure that the calculated distance matches the type-based one after all
1718 // the bitcasts removal in the provided pointers.
1719 if (!StrictCheck || Dist * Size == Val)
1720 return Dist;
1721 return std::nullopt;
1722}
1723
1724bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
1725 const DataLayout &DL, ScalarEvolution &SE,
1726 SmallVectorImpl<unsigned> &SortedIndices) {
1727 assert(llvm::all_of(
1728 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1729 "Expected list of pointer operands.");
1730 // Walk over the pointers, and map each of them to an offset relative to
1731 // first pointer in the array.
1732 Value *Ptr0 = VL[0];
1733
1734 using DistOrdPair = std::pair<int64_t, unsigned>;
1735 auto Compare = llvm::less_first();
1736 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
1737 Offsets.emplace(args: 0, args: 0);
1738 bool IsConsecutive = true;
1739 for (auto [Idx, Ptr] : drop_begin(RangeOrContainer: enumerate(First&: VL))) {
1740 std::optional<int64_t> Diff =
1741 getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE,
1742 /*StrictCheck=*/true);
1743 if (!Diff)
1744 return false;
1745
1746 // Check if the pointer with the same offset is found.
1747 int64_t Offset = *Diff;
1748 auto [It, IsInserted] = Offsets.emplace(args&: Offset, args&: Idx);
1749 if (!IsInserted)
1750 return false;
1751 // Consecutive order if the inserted element is the last one.
1752 IsConsecutive &= std::next(x: It) == Offsets.end();
1753 }
1754 SortedIndices.clear();
1755 if (!IsConsecutive) {
1756 // Fill SortedIndices array only if it is non-consecutive.
1757 SortedIndices.resize(N: VL.size());
1758 for (auto [Idx, Off] : enumerate(First&: Offsets))
1759 SortedIndices[Idx] = Off.second;
1760 }
1761 return true;
1762}
1763
1764/// Returns true if the memory operations \p A and \p B are consecutive.
1765bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
1766 ScalarEvolution &SE, bool CheckType) {
1767 Value *PtrA = getLoadStorePointerOperand(V: A);
1768 Value *PtrB = getLoadStorePointerOperand(V: B);
1769 if (!PtrA || !PtrB)
1770 return false;
1771 Type *ElemTyA = getLoadStoreType(I: A);
1772 Type *ElemTyB = getLoadStoreType(I: B);
1773 std::optional<int64_t> Diff =
1774 getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
1775 /*StrictCheck=*/true, CheckType);
1776 return Diff == 1;
1777}
1778
1779void MemoryDepChecker::addAccess(StoreInst *SI) {
1780 visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop,
1781 AddPointer: [this, SI](Value *Ptr) {
1782 Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx);
1783 InstMap.push_back(Elt: SI);
1784 ++AccessIdx;
1785 });
1786}
1787
1788void MemoryDepChecker::addAccess(LoadInst *LI) {
1789 visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop,
1790 AddPointer: [this, LI](Value *Ptr) {
1791 Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx);
1792 InstMap.push_back(Elt: LI);
1793 ++AccessIdx;
1794 });
1795}
1796
1797MemoryDepChecker::VectorizationSafetyStatus
1798MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
1799 switch (Type) {
1800 case NoDep:
1801 case Forward:
1802 case BackwardVectorizable:
1803 return VectorizationSafetyStatus::Safe;
1804
1805 case Unknown:
1806 return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
1807 case ForwardButPreventsForwarding:
1808 case Backward:
1809 case BackwardVectorizableButPreventsForwarding:
1810 case IndirectUnsafe:
1811 return VectorizationSafetyStatus::Unsafe;
1812 }
1813 llvm_unreachable("unexpected DepType!");
1814}
1815
1816bool MemoryDepChecker::Dependence::isBackward() const {
1817 switch (Type) {
1818 case NoDep:
1819 case Forward:
1820 case ForwardButPreventsForwarding:
1821 case Unknown:
1822 case IndirectUnsafe:
1823 return false;
1824
1825 case BackwardVectorizable:
1826 case Backward:
1827 case BackwardVectorizableButPreventsForwarding:
1828 return true;
1829 }
1830 llvm_unreachable("unexpected DepType!");
1831}
1832
1833bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
1834 return isBackward() || Type == Unknown || Type == IndirectUnsafe;
1835}
1836
1837bool MemoryDepChecker::Dependence::isForward() const {
1838 switch (Type) {
1839 case Forward:
1840 case ForwardButPreventsForwarding:
1841 return true;
1842
1843 case NoDep:
1844 case Unknown:
1845 case BackwardVectorizable:
1846 case Backward:
1847 case BackwardVectorizableButPreventsForwarding:
1848 case IndirectUnsafe:
1849 return false;
1850 }
1851 llvm_unreachable("unexpected DepType!");
1852}
1853
1854bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
1855 uint64_t TypeByteSize,
1856 unsigned CommonStride) {
1857 // If loads occur at a distance that is not a multiple of a feasible vector
1858 // factor store-load forwarding does not take place.
1859 // Positive dependences might cause troubles because vectorizing them might
1860 // prevent store-load forwarding making vectorized code run a lot slower.
1861 // a[i] = a[i-3] ^ a[i-8];
1862 // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
1863 // hence on your typical architecture store-load forwarding does not take
1864 // place. Vectorizing in such cases does not make sense.
1865 // Store-load forwarding distance.
1866
1867 // After this many iterations store-to-load forwarding conflicts should not
1868 // cause any slowdowns.
1869 const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
1870 // Maximum vector factor.
1871 uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
1872 std::min(a: VectorizerParams::MaxVectorWidth * TypeByteSize,
1873 b: MaxStoreLoadForwardSafeDistanceInBits);
1874
1875 // Compute the smallest VF at which the store and load would be misaligned.
1876 for (uint64_t VF = 2 * TypeByteSize;
1877 VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
1878 // If the number of vector iteration between the store and the load are
1879 // small we could incur conflicts.
1880 if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
1881 MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
1882 break;
1883 }
1884 }
1885
1886 if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) {
1887 LLVM_DEBUG(
1888 dbgs() << "LAA: Distance " << Distance
1889 << " that could cause a store-load forwarding conflict\n");
1890 return true;
1891 }
1892
1893 if (CommonStride &&
1894 MaxVFWithoutSLForwardIssuesPowerOf2 <
1895 MaxStoreLoadForwardSafeDistanceInBits &&
1896 MaxVFWithoutSLForwardIssuesPowerOf2 !=
1897 VectorizerParams::MaxVectorWidth * TypeByteSize) {
1898 uint64_t MaxVF =
1899 bit_floor(Value: MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride);
1900 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
1901 MaxStoreLoadForwardSafeDistanceInBits =
1902 std::min(a: MaxStoreLoadForwardSafeDistanceInBits, b: MaxVFInBits);
1903 }
1904 return false;
1905}
1906
1907void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1908 if (Status < S)
1909 Status = S;
1910}
1911
1912/// Given a dependence-distance \p Dist between two memory accesses, that have
1913/// strides in the same direction whose absolute value of the maximum stride is
1914/// given in \p MaxStride, in a loop whose maximum backedge taken count is \p
1915/// MaxBTC, check if it is possible to prove statically that the dependence
1916/// distance is larger than the range that the accesses will travel through the
1917/// execution of the loop. If so, return true; false otherwise. This is useful
1918/// for example in loops such as the following (PR31098):
1919///
1920/// for (i = 0; i < D; ++i) {
1921/// = out[i];
1922/// out[i+D] =
1923/// }
1924static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
1925 const SCEV &MaxBTC, const SCEV &Dist,
1926 uint64_t MaxStride) {
1927
1928 // If we can prove that
1929 // (**) |Dist| > MaxBTC * Step
1930 // where Step is the absolute stride of the memory accesses in bytes,
1931 // then there is no dependence.
1932 //
1933 // Rationale:
1934 // We basically want to check if the absolute distance (|Dist/Step|)
1935 // is >= the loop iteration count (or > MaxBTC).
1936 // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
1937 // Section 4.2.1); Note, that for vectorization it is sufficient to prove
1938 // that the dependence distance is >= VF; This is checked elsewhere.
1939 // But in some cases we can prune dependence distances early, and
1940 // even before selecting the VF, and without a runtime test, by comparing
1941 // the distance against the loop iteration count. Since the vectorized code
1942 // will be executed only if LoopCount >= VF, proving distance >= LoopCount
1943 // also guarantees that distance >= VF.
1944 //
1945 const SCEV *Step = SE.getConstant(Ty: MaxBTC.getType(), V: MaxStride);
1946 const SCEV *Product = SE.getMulExpr(LHS: &MaxBTC, RHS: Step);
1947
1948 const SCEV *CastedDist = &Dist;
1949 const SCEV *CastedProduct = Product;
1950 uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType());
1951 uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType());
1952
1953 // The dependence distance can be positive/negative, so we sign extend Dist;
1954 // The multiplication of the absolute stride in bytes and the
1955 // backedgeTakenCount is non-negative, so we zero extend Product.
1956 if (DistTypeSizeBits > ProductTypeSizeBits)
1957 CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType());
1958 else
1959 CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType());
1960
1961 // Is Dist - (MaxBTC * Step) > 0 ?
1962 // (If so, then we have proven (**) because |Dist| >= Dist)
1963 const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct);
1964 if (SE.isKnownPositive(S: Minus))
1965 return true;
1966
1967 // Second try: Is -Dist - (MaxBTC * Step) > 0 ?
1968 // (If so, then we have proven (**) because |Dist| >= -1*Dist)
1969 const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist);
1970 Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct);
1971 return SE.isKnownPositive(S: Minus);
1972}
1973
1974/// Check the dependence for two accesses with the same stride \p Stride.
1975/// \p Distance is the positive distance in bytes, and \p TypeByteSize is type
1976/// size in bytes.
1977///
1978/// \returns true if they are independent.
1979static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
1980 uint64_t TypeByteSize) {
1981 assert(Stride > 1 && "The stride must be greater than 1");
1982 assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
1983 assert(Distance > 0 && "The distance must be non-zero");
1984
1985 // Skip if the distance is not multiple of type byte size.
1986 if (Distance % TypeByteSize)
1987 return false;
1988
1989 // No dependence if the distance is not multiple of the stride.
1990 // E.g.
1991 // for (i = 0; i < 1024 ; i += 4)
1992 // A[i+2] = A[i] + 1;
1993 //
1994 // Two accesses in memory (distance is 2, stride is 4):
1995 // | A[0] | | | | A[4] | | | |
1996 // | | | A[2] | | | | A[6] | |
1997 //
1998 // E.g.
1999 // for (i = 0; i < 1024 ; i += 3)
2000 // A[i+4] = A[i] + 1;
2001 //
2002 // Two accesses in memory (distance is 4, stride is 3):
2003 // | A[0] | | | A[3] | | | A[6] | | |
2004 // | | | | | A[4] | | | A[7] | |
2005 return Distance % Stride;
2006}
2007
2008bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src,
2009 Type *SrcTy,
2010 const SCEV *Sink,
2011 Type *SinkTy) {
2012 const SCEV *BTC = PSE.getBackedgeTakenCount();
2013 const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
2014 ScalarEvolution &SE = *PSE.getSE();
2015 const auto &[SrcStart_, SrcEnd_] =
2016 getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Src, AccessTy: SrcTy, BTC, MaxBTC: SymbolicMaxBTC,
2017 SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards);
2018 if (isa<SCEVCouldNotCompute>(Val: SrcStart_) || isa<SCEVCouldNotCompute>(Val: SrcEnd_))
2019 return false;
2020
2021 const auto &[SinkStart_, SinkEnd_] =
2022 getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Sink, AccessTy: SinkTy, BTC, MaxBTC: SymbolicMaxBTC,
2023 SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards);
2024 if (isa<SCEVCouldNotCompute>(Val: SinkStart_) ||
2025 isa<SCEVCouldNotCompute>(Val: SinkEnd_))
2026 return false;
2027
2028 if (!LoopGuards)
2029 LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE));
2030
2031 auto SrcEnd = SE.applyLoopGuards(Expr: SrcEnd_, Guards: *LoopGuards);
2032 auto SinkStart = SE.applyLoopGuards(Expr: SinkStart_, Guards: *LoopGuards);
2033 if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SrcEnd, RHS: SinkStart))
2034 return true;
2035
2036 auto SinkEnd = SE.applyLoopGuards(Expr: SinkEnd_, Guards: *LoopGuards);
2037 auto SrcStart = SE.applyLoopGuards(Expr: SrcStart_, Guards: *LoopGuards);
2038 return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SinkEnd, RHS: SrcStart);
2039}
2040
2041std::variant<MemoryDepChecker::Dependence::DepType,
2042 MemoryDepChecker::DepDistanceStrideAndSizeInfo>
2043MemoryDepChecker::getDependenceDistanceStrideAndSize(
2044 const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
2045 const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) {
2046 const auto &DL = InnermostLoop->getHeader()->getDataLayout();
2047 auto &SE = *PSE.getSE();
2048 const auto &[APtr, AIsWrite] = A;
2049 const auto &[BPtr, BIsWrite] = B;
2050
2051 // Two reads are independent.
2052 if (!AIsWrite && !BIsWrite)
2053 return MemoryDepChecker::Dependence::NoDep;
2054
2055 Type *ATy = getLoadStoreType(I: AInst);
2056 Type *BTy = getLoadStoreType(I: BInst);
2057
2058 // We cannot check pointers in different address spaces.
2059 if (APtr->getType()->getPointerAddressSpace() !=
2060 BPtr->getType()->getPointerAddressSpace())
2061 return MemoryDepChecker::Dependence::Unknown;
2062
2063 std::optional<int64_t> StrideAPtr = getPtrStride(
2064 PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true);
2065 std::optional<int64_t> StrideBPtr = getPtrStride(
2066 PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true);
2067
2068 const SCEV *Src = PSE.getSCEV(V: APtr);
2069 const SCEV *Sink = PSE.getSCEV(V: BPtr);
2070
2071 // If the induction step is negative we have to invert source and sink of the
2072 // dependence when measuring the distance between them. We should not swap
2073 // AIsWrite with BIsWrite, as their uses expect them in program order.
2074 if (StrideAPtr && *StrideAPtr < 0) {
2075 std::swap(a&: Src, b&: Sink);
2076 std::swap(a&: AInst, b&: BInst);
2077 std::swap(a&: ATy, b&: BTy);
2078 std::swap(lhs&: StrideAPtr, rhs&: StrideBPtr);
2079 }
2080
2081 const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src);
2082
2083 LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
2084 << "\n");
2085 LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
2086 << ": " << *Dist << "\n");
2087
2088 // Need accesses with constant strides and the same direction for further
2089 // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and
2090 // similar code or pointer arithmetic that could wrap in the address space.
2091
2092 // If either Src or Sink are not strided (i.e. not a non-wrapping AddRec) and
2093 // not loop-invariant (stride will be 0 in that case), we cannot analyze the
2094 // dependence further and also cannot generate runtime checks.
2095 if (!StrideAPtr || !StrideBPtr) {
2096 LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
2097 return MemoryDepChecker::Dependence::IndirectUnsafe;
2098 }
2099
2100 int64_t StrideAPtrInt = *StrideAPtr;
2101 int64_t StrideBPtrInt = *StrideBPtr;
2102 LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
2103 << " Sink induction step: " << StrideBPtrInt << "\n");
2104 // At least Src or Sink are loop invariant and the other is strided or
2105 // invariant. We can generate a runtime check to disambiguate the accesses.
2106 if (!StrideAPtrInt || !StrideBPtrInt)
2107 return MemoryDepChecker::Dependence::Unknown;
2108
2109 // Both Src and Sink have a constant stride, check if they are in the same
2110 // direction.
2111 if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) {
2112 LLVM_DEBUG(
2113 dbgs() << "Pointer access with strides in different directions\n");
2114 return MemoryDepChecker::Dependence::Unknown;
2115 }
2116
2117 TypeSize AStoreSz = DL.getTypeStoreSize(Ty: ATy);
2118 TypeSize BStoreSz = DL.getTypeStoreSize(Ty: BTy);
2119
2120 // If store sizes are not the same, set TypeByteSize to zero, so we can check
2121 // it in the caller isDependent.
2122 uint64_t ASz = DL.getTypeAllocSize(Ty: ATy);
2123 uint64_t BSz = DL.getTypeAllocSize(Ty: BTy);
2124 uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0;
2125
2126 uint64_t StrideAScaled = std::abs(i: StrideAPtrInt) * ASz;
2127 uint64_t StrideBScaled = std::abs(i: StrideBPtrInt) * BSz;
2128
2129 uint64_t MaxStride = std::max(a: StrideAScaled, b: StrideBScaled);
2130
2131 std::optional<uint64_t> CommonStride;
2132 if (StrideAScaled == StrideBScaled)
2133 CommonStride = StrideAScaled;
2134
2135 // TODO: Historically, we didn't retry with runtime checks when (unscaled)
2136 // strides were different but there is no inherent reason to.
2137 if (!isa<SCEVConstant>(Val: Dist))
2138 ShouldRetryWithRuntimeChecks |= StrideAPtrInt == StrideBPtrInt;
2139
2140 // If distance is a SCEVCouldNotCompute, return Unknown immediately.
2141 if (isa<SCEVCouldNotCompute>(Val: Dist)) {
2142 LLVM_DEBUG(dbgs() << "LAA: Uncomputable distance.\n");
2143 return Dependence::Unknown;
2144 }
2145
2146 return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
2147 TypeByteSize, AIsWrite, BIsWrite);
2148}
2149
2150MemoryDepChecker::Dependence::DepType
2151MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2152 const MemAccessInfo &B, unsigned BIdx) {
2153 assert(AIdx < BIdx && "Must pass arguments in program order");
2154
2155 // Check if we can prove that Sink only accesses memory after Src's end or
2156 // vice versa. The helper is used to perform the checks only on the exit paths
2157 // where it helps to improve the analysis result.
2158 auto CheckCompletelyBeforeOrAfter = [&]() {
2159 auto *APtr = A.getPointer();
2160 auto *BPtr = B.getPointer();
2161 Type *ATy = getLoadStoreType(I: InstMap[AIdx]);
2162 Type *BTy = getLoadStoreType(I: InstMap[BIdx]);
2163 const SCEV *Src = PSE.getSCEV(V: APtr);
2164 const SCEV *Sink = PSE.getSCEV(V: BPtr);
2165 return areAccessesCompletelyBeforeOrAfter(Src, SrcTy: ATy, Sink, SinkTy: BTy);
2166 };
2167
2168 // Get the dependence distance, stride, type size and what access writes for
2169 // the dependence between A and B.
2170 auto Res =
2171 getDependenceDistanceStrideAndSize(A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx]);
2172 if (std::holds_alternative<Dependence::DepType>(v: Res)) {
2173 if (std::get<Dependence::DepType>(v&: Res) == Dependence::Unknown &&
2174 CheckCompletelyBeforeOrAfter())
2175 return Dependence::NoDep;
2176 return std::get<Dependence::DepType>(v&: Res);
2177 }
2178
2179 auto &[Dist, MaxStride, CommonStride, TypeByteSize, AIsWrite, BIsWrite] =
2180 std::get<DepDistanceStrideAndSizeInfo>(v&: Res);
2181 bool HasSameSize = TypeByteSize > 0;
2182
2183 ScalarEvolution &SE = *PSE.getSE();
2184 auto &DL = InnermostLoop->getHeader()->getDataLayout();
2185
2186 // If the distance between the acecsses is larger than their maximum absolute
2187 // stride multiplied by the symbolic maximum backedge taken count (which is an
2188 // upper bound of the number of iterations), the accesses are independet, i.e.
2189 // they are far enough appart that accesses won't access the same location
2190 // across all loop ierations.
2191 if (HasSameSize &&
2192 isSafeDependenceDistance(
2193 DL, SE, MaxBTC: *(PSE.getSymbolicMaxBackedgeTakenCount()), Dist: *Dist, MaxStride))
2194 return Dependence::NoDep;
2195
2196 // The rest of this function relies on ConstDist being at most 64-bits, which
2197 // is checked earlier. Will assert if the calling code changes.
2198 const APInt *APDist = nullptr;
2199 uint64_t ConstDist =
2200 match(S: Dist, P: m_scev_APInt(C&: APDist)) ? APDist->abs().getZExtValue() : 0;
2201
2202 // Attempt to prove strided accesses independent.
2203 if (APDist) {
2204 // If the distance between accesses and their strides are known constants,
2205 // check whether the accesses interlace each other.
2206 if (ConstDist > 0 && CommonStride && CommonStride > 1 && HasSameSize &&
2207 areStridedAccessesIndependent(Distance: ConstDist, Stride: *CommonStride, TypeByteSize)) {
2208 LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
2209 return Dependence::NoDep;
2210 }
2211 } else {
2212 if (!LoopGuards)
2213 LoopGuards.emplace(
2214 args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE));
2215 Dist = SE.applyLoopGuards(Expr: Dist, Guards: *LoopGuards);
2216 }
2217
2218 // Negative distances are not plausible dependencies.
2219 if (SE.isKnownNonPositive(S: Dist)) {
2220 if (SE.isKnownNonNegative(S: Dist)) {
2221 if (HasSameSize) {
2222 // Write to the same location with the same size.
2223 return Dependence::Forward;
2224 }
2225 LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
2226 "different type sizes\n");
2227 return Dependence::Unknown;
2228 }
2229
2230 bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2231 // Check if the first access writes to a location that is read in a later
2232 // iteration, where the distance between them is not a multiple of a vector
2233 // factor and relatively small.
2234 //
2235 // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to
2236 // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a
2237 // forward dependency will allow vectorization using any width.
2238
2239 if (IsTrueDataDependence && EnableForwardingConflictDetection) {
2240 if (!ConstDist) {
2241 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2242 : Dependence::Unknown;
2243 }
2244 if (!HasSameSize ||
2245 couldPreventStoreLoadForward(Distance: ConstDist, TypeByteSize)) {
2246 LLVM_DEBUG(
2247 dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
2248 return Dependence::ForwardButPreventsForwarding;
2249 }
2250 }
2251
2252 LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
2253 return Dependence::Forward;
2254 }
2255
2256 int64_t MinDistance = SE.getSignedRangeMin(S: Dist).getSExtValue();
2257 // Below we only handle strictly positive distances.
2258 if (MinDistance <= 0) {
2259 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2260 : Dependence::Unknown;
2261 }
2262
2263 if (!HasSameSize) {
2264 if (CheckCompletelyBeforeOrAfter())
2265 return Dependence::NoDep;
2266 LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2267 "different type sizes\n");
2268 return Dependence::Unknown;
2269 }
2270 // Bail out early if passed-in parameters make vectorization not feasible.
2271 unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
2272 VectorizerParams::VectorizationFactor : 1);
2273 unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
2274 VectorizerParams::VectorizationInterleave : 1);
2275 // The minimum number of iterations for a vectorized/unrolled version.
2276 unsigned MinNumIter = std::max(a: ForcedFactor * ForcedUnroll, b: 2U);
2277
2278 // It's not vectorizable if the distance is smaller than the minimum distance
2279 // needed for a vectroized/unrolled version. Vectorizing one iteration in
2280 // front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
2281 // (No need to plus the last gap distance).
2282 //
2283 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2284 // foo(int *A) {
2285 // int *B = (int *)((char *)A + 14);
2286 // for (i = 0 ; i < 1024 ; i += 2)
2287 // B[i] = A[i] + 1;
2288 // }
2289 //
2290 // Two accesses in memory (stride is 4 * 2):
2291 // | A[0] | | A[2] | | A[4] | | A[6] | |
2292 // | B[0] | | B[2] | | B[4] |
2293 //
2294 // MinDistance needs for vectorizing iterations except the last iteration:
2295 // 4 * 2 * (MinNumIter - 1). MinDistance needs for the last iteration: 4.
2296 // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
2297 //
2298 // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
2299 // 12, which is less than distance.
2300 //
2301 // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
2302 // the minimum distance needed is 28, which is greater than distance. It is
2303 // not safe to do vectorization.
2304 //
2305 // We use MaxStride (maximum of src and sink strides) to get a conservative
2306 // lower bound on the MinDistanceNeeded in case of different strides.
2307
2308 // We know that Dist is positive, but it may not be constant. Use the signed
2309 // minimum for computations below, as this ensures we compute the closest
2310 // possible dependence distance.
2311 uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
2312 if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
2313 if (!ConstDist) {
2314 // For non-constant distances, we checked the lower bound of the
2315 // dependence distance and the distance may be larger at runtime (and safe
2316 // for vectorization). Classify it as Unknown, so we re-try with runtime
2317 // checks, unless we can prove both accesses cannot overlap.
2318 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2319 : Dependence::Unknown;
2320 }
2321 LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance "
2322 << MinDistance << '\n');
2323 return Dependence::Backward;
2324 }
2325
2326 // Unsafe if the minimum distance needed is greater than smallest dependence
2327 // distance distance.
2328 if (MinDistanceNeeded > MinDepDistBytes) {
2329 LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
2330 << MinDistanceNeeded << " size in bytes\n");
2331 return Dependence::Backward;
2332 }
2333
2334 MinDepDistBytes =
2335 std::min(a: static_cast<uint64_t>(MinDistance), b: MinDepDistBytes);
2336
2337 bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
2338 if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist &&
2339 couldPreventStoreLoadForward(Distance: MinDistance, TypeByteSize, CommonStride: *CommonStride))
2340 return Dependence::BackwardVectorizableButPreventsForwarding;
2341
2342 uint64_t MaxVF = MinDepDistBytes / MaxStride;
2343 LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
2344 << " with max VF = " << MaxVF << '\n');
2345
2346 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2347 if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) {
2348 // For non-constant distances, we checked the lower bound of the dependence
2349 // distance and the distance may be larger at runtime (and safe for
2350 // vectorization). Classify it as Unknown, so we re-try with runtime checks,
2351 // unless we can prove both accesses cannot overlap.
2352 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2353 : Dependence::Unknown;
2354 }
2355
2356 if (CheckCompletelyBeforeOrAfter())
2357 return Dependence::NoDep;
2358
2359 MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits);
2360 return Dependence::BackwardVectorizable;
2361}
2362
2363bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands,
2364 const MemAccessInfoList &CheckDeps) {
2365
2366 MinDepDistBytes = -1;
2367 SmallPtrSet<MemAccessInfo, 8> Visited;
2368 for (MemAccessInfo CurAccess : CheckDeps) {
2369 if (Visited.contains(Ptr: CurAccess))
2370 continue;
2371
2372 // Check accesses within this set.
2373 EquivalenceClasses<MemAccessInfo>::member_iterator AI =
2374 DepCands.findLeader(V: CurAccess);
2375 EquivalenceClasses<MemAccessInfo>::member_iterator AE =
2376 DepCands.member_end();
2377
2378 // Check every access pair.
2379 while (AI != AE) {
2380 Visited.insert(Ptr: *AI);
2381 bool AIIsWrite = AI->getInt();
2382 // Check loads only against next equivalent class, but stores also against
2383 // other stores in the same equivalence class - to the same address.
2384 EquivalenceClasses<MemAccessInfo>::member_iterator OI =
2385 (AIIsWrite ? AI : std::next(x: AI));
2386 while (OI != AE) {
2387 // Check every accessing instruction pair in program order.
2388 auto &Acc = Accesses[*AI];
2389 for (std::vector<unsigned>::iterator I1 = Acc.begin(), I1E = Acc.end();
2390 I1 != I1E; ++I1)
2391 // Scan all accesses of another equivalence class, but only the next
2392 // accesses of the same equivalent class.
2393 for (std::vector<unsigned>::iterator
2394 I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()),
2395 I2E = (OI == AI ? I1E : Accesses[*OI].end());
2396 I2 != I2E; ++I2) {
2397 auto A = std::make_pair(x: &*AI, y&: *I1);
2398 auto B = std::make_pair(x: &*OI, y&: *I2);
2399
2400 assert(*I1 != *I2);
2401 if (*I1 > *I2)
2402 std::swap(x&: A, y&: B);
2403
2404 Dependence::DepType Type =
2405 isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second);
2406 mergeInStatus(S: Dependence::isSafeForVectorization(Type));
2407
2408 // Gather dependences unless we accumulated MaxDependences
2409 // dependences. In that case return as soon as we find the first
2410 // unsafe dependence. This puts a limit on this quadratic
2411 // algorithm.
2412 if (RecordDependences) {
2413 if (Type != Dependence::NoDep)
2414 Dependences.emplace_back(Args&: A.second, Args&: B.second, Args&: Type);
2415
2416 if (Dependences.size() >= MaxDependences) {
2417 RecordDependences = false;
2418 Dependences.clear();
2419 LLVM_DEBUG(dbgs()
2420 << "Too many dependences, stopped recording\n");
2421 }
2422 }
2423 if (!RecordDependences && !isSafeForVectorization())
2424 return false;
2425 }
2426 ++OI;
2427 }
2428 ++AI;
2429 }
2430 }
2431
2432 LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
2433 return isSafeForVectorization();
2434}
2435
2436SmallVector<Instruction *, 4>
2437MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const {
2438 MemAccessInfo Access(Ptr, IsWrite);
2439 auto I = Accesses.find(Val: Access);
2440 SmallVector<Instruction *, 4> Insts;
2441 if (I != Accesses.end()) {
2442 transform(Range: I->second, d_first: std::back_inserter(x&: Insts),
2443 F: [&](unsigned Idx) { return this->InstMap[Idx]; });
2444 }
2445
2446 return Insts;
2447}
2448
2449const char *MemoryDepChecker::Dependence::DepName[] = {
2450 "NoDep",
2451 "Unknown",
2452 "IndirectUnsafe",
2453 "Forward",
2454 "ForwardButPreventsForwarding",
2455 "Backward",
2456 "BackwardVectorizable",
2457 "BackwardVectorizableButPreventsForwarding"};
2458
2459void MemoryDepChecker::Dependence::print(
2460 raw_ostream &OS, unsigned Depth,
2461 const SmallVectorImpl<Instruction *> &Instrs) const {
2462 OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n";
2463 OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n";
2464 OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n";
2465}
2466
2467bool LoopAccessInfo::canAnalyzeLoop() {
2468 // We need to have a loop header.
2469 LLVM_DEBUG(dbgs() << "\nLAA: Checking a loop in '"
2470 << TheLoop->getHeader()->getParent()->getName() << "' from "
2471 << TheLoop->getLocStr() << "\n");
2472
2473 // We can only analyze innermost loops.
2474 if (!TheLoop->isInnermost()) {
2475 LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
2476 recordAnalysis(RemarkName: "NotInnerMostLoop") << "loop is not the innermost loop";
2477 return false;
2478 }
2479
2480 // We must have a single backedge.
2481 if (TheLoop->getNumBackEdges() != 1) {
2482 LLVM_DEBUG(
2483 dbgs() << "LAA: loop control flow is not understood by analyzer\n");
2484 recordAnalysis(RemarkName: "CFGNotUnderstood")
2485 << "loop control flow is not understood by analyzer";
2486 return false;
2487 }
2488
2489 // ScalarEvolution needs to be able to find the symbolic max backedge taken
2490 // count, which is an upper bound on the number of loop iterations. The loop
2491 // may execute fewer iterations, if it exits via an uncountable exit.
2492 const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount();
2493 if (isa<SCEVCouldNotCompute>(Val: ExitCount)) {
2494 recordAnalysis(RemarkName: "CantComputeNumberOfIterations")
2495 << "could not determine number of loop iterations";
2496 LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
2497 return false;
2498 }
2499
2500 LLVM_DEBUG(dbgs() << "LAA: Found an analyzable loop: "
2501 << TheLoop->getHeader()->getName() << "\n");
2502 return true;
2503}
2504
2505bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
2506 const TargetLibraryInfo *TLI,
2507 DominatorTree *DT) {
2508 // Holds the Load and Store instructions.
2509 SmallVector<LoadInst *, 16> Loads;
2510 SmallVector<StoreInst *, 16> Stores;
2511 SmallPtrSet<MDNode *, 8> LoopAliasScopes;
2512
2513 // Holds all the different accesses in the loop.
2514 unsigned NumReads = 0;
2515 unsigned NumReadWrites = 0;
2516
2517 bool HasComplexMemInst = false;
2518
2519 // A runtime check is only legal to insert if there are no convergent calls.
2520 HasConvergentOp = false;
2521
2522 PtrRtChecking->Pointers.clear();
2523 PtrRtChecking->Need = false;
2524
2525 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
2526
2527 const bool EnableMemAccessVersioningOfLoop =
2528 EnableMemAccessVersioning &&
2529 !TheLoop->getHeader()->getParent()->hasOptSize();
2530
2531 // Traverse blocks in fixed RPOT order, regardless of their storage in the
2532 // loop info, as it may be arbitrary.
2533 LoopBlocksRPO RPOT(TheLoop);
2534 RPOT.perform(LI);
2535 for (BasicBlock *BB : RPOT) {
2536 // Scan the BB and collect legal loads and stores. Also detect any
2537 // convergent instructions.
2538 for (Instruction &I : *BB) {
2539 if (auto *Call = dyn_cast<CallBase>(Val: &I)) {
2540 if (Call->isConvergent())
2541 HasConvergentOp = true;
2542 }
2543
2544 // With both a non-vectorizable memory instruction and a convergent
2545 // operation, found in this loop, no reason to continue the search.
2546 if (HasComplexMemInst && HasConvergentOp)
2547 return false;
2548
2549 // Avoid hitting recordAnalysis multiple times.
2550 if (HasComplexMemInst)
2551 continue;
2552
2553 // Record alias scopes defined inside the loop.
2554 if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I))
2555 for (Metadata *Op : Decl->getScopeList()->operands())
2556 LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op));
2557
2558 // Many math library functions read the rounding mode. We will only
2559 // vectorize a loop if it contains known function calls that don't set
2560 // the flag. Therefore, it is safe to ignore this read from memory.
2561 auto *Call = dyn_cast<CallInst>(Val: &I);
2562 if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI))
2563 continue;
2564
2565 // If this is a load, save it. If this instruction can read from memory
2566 // but is not a load, we only allow it if it's a call to a function with a
2567 // vector mapping and no pointer arguments.
2568 if (I.mayReadFromMemory()) {
2569 auto hasPointerArgs = [](CallBase *CB) {
2570 return any_of(Range: CB->args(), P: [](Value const *Arg) {
2571 return Arg->getType()->isPointerTy();
2572 });
2573 };
2574
2575 // If the function has an explicit vectorized counterpart, and does not
2576 // take output/input pointers, we can safely assume that it can be
2577 // vectorized.
2578 if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
2579 !hasPointerArgs(Call) && !VFDatabase::getMappings(CI: *Call).empty())
2580 continue;
2581
2582 auto *Ld = dyn_cast<LoadInst>(Val: &I);
2583 if (!Ld) {
2584 recordAnalysis(RemarkName: "CantVectorizeInstruction", Instr: Ld)
2585 << "instruction cannot be vectorized";
2586 HasComplexMemInst = true;
2587 continue;
2588 }
2589 if (!Ld->isSimple() && !IsAnnotatedParallel) {
2590 recordAnalysis(RemarkName: "NonSimpleLoad", Instr: Ld)
2591 << "read with atomic ordering or volatile read";
2592 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
2593 HasComplexMemInst = true;
2594 continue;
2595 }
2596 NumLoads++;
2597 Loads.push_back(Elt: Ld);
2598 DepChecker->addAccess(LI: Ld);
2599 if (EnableMemAccessVersioningOfLoop)
2600 collectStridedAccess(LoadOrStoreInst: Ld);
2601 continue;
2602 }
2603
2604 // Save 'store' instructions. Abort if other instructions write to memory.
2605 if (I.mayWriteToMemory()) {
2606 auto *St = dyn_cast<StoreInst>(Val: &I);
2607 if (!St) {
2608 recordAnalysis(RemarkName: "CantVectorizeInstruction", Instr: St)
2609 << "instruction cannot be vectorized";
2610 HasComplexMemInst = true;
2611 continue;
2612 }
2613 if (!St->isSimple() && !IsAnnotatedParallel) {
2614 recordAnalysis(RemarkName: "NonSimpleStore", Instr: St)
2615 << "write with atomic ordering or volatile write";
2616 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
2617 HasComplexMemInst = true;
2618 continue;
2619 }
2620 NumStores++;
2621 Stores.push_back(Elt: St);
2622 DepChecker->addAccess(SI: St);
2623 if (EnableMemAccessVersioningOfLoop)
2624 collectStridedAccess(LoadOrStoreInst: St);
2625 }
2626 } // Next instr.
2627 } // Next block.
2628
2629 if (HasComplexMemInst)
2630 return false;
2631
2632 // Now we have two lists that hold the loads and the stores.
2633 // Next, we find the pointers that they use.
2634
2635 // Check if we see any stores. If there are no stores, then we don't
2636 // care if the pointers are *restrict*.
2637 if (!Stores.size()) {
2638 LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
2639 return true;
2640 }
2641
2642 MemoryDepChecker::DepCandidates DepCands;
2643 AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE,
2644 LoopAliasScopes);
2645
2646 // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
2647 // multiple times on the same object. If the ptr is accessed twice, once
2648 // for read and once for write, it will only appear once (on the write
2649 // list). This is okay, since we are going to check for conflicts between
2650 // writes and between reads and writes, but not between reads and reads.
2651 SmallSet<std::pair<Value *, Type *>, 16> Seen;
2652
2653 // Record uniform store addresses to identify if we have multiple stores
2654 // to the same address.
2655 SmallPtrSet<Value *, 16> UniformStores;
2656
2657 for (StoreInst *ST : Stores) {
2658 Value *Ptr = ST->getPointerOperand();
2659
2660 if (isInvariant(V: Ptr)) {
2661 // Record store instructions to loop invariant addresses
2662 StoresToInvariantAddresses.push_back(Elt: ST);
2663 HasStoreStoreDependenceInvolvingLoopInvariantAddress |=
2664 !UniformStores.insert(Ptr).second;
2665 }
2666
2667 // If we did *not* see this pointer before, insert it to the read-write
2668 // list. At this phase it is only a 'write' list.
2669 Type *AccessTy = getLoadStoreType(I: ST);
2670 if (Seen.insert(V: {Ptr, AccessTy}).second) {
2671 ++NumReadWrites;
2672
2673 MemoryLocation Loc = MemoryLocation::get(SI: ST);
2674 // The TBAA metadata could have a control dependency on the predication
2675 // condition, so we cannot rely on it when determining whether or not we
2676 // need runtime pointer checks.
2677 if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT))
2678 Loc.AATags.TBAA = nullptr;
2679
2680 visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop,
2681 AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) {
2682 MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr);
2683 Accesses.addStore(Loc: NewLoc, AccessTy);
2684 });
2685 }
2686 }
2687
2688 if (IsAnnotatedParallel) {
2689 LLVM_DEBUG(
2690 dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
2691 << "checks.\n");
2692 return true;
2693 }
2694
2695 for (LoadInst *LD : Loads) {
2696 Value *Ptr = LD->getPointerOperand();
2697 // If we did *not* see this pointer before, insert it to the
2698 // read list. If we *did* see it before, then it is already in
2699 // the read-write list. This allows us to vectorize expressions
2700 // such as A[i] += x; Because the address of A[i] is a read-write
2701 // pointer. This only works if the index of A[i] is consecutive.
2702 // If the address of i is unknown (for example A[B[i]]) then we may
2703 // read a few words, modify, and write a few words, and some of the
2704 // words may be written to the same address.
2705 bool IsReadOnlyPtr = false;
2706 Type *AccessTy = getLoadStoreType(I: LD);
2707 if (Seen.insert(V: {Ptr, AccessTy}).second ||
2708 !getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: false,
2709 ShouldCheckWrap: true)) {
2710 ++NumReads;
2711 IsReadOnlyPtr = true;
2712 }
2713
2714 // See if there is an unsafe dependency between a load to a uniform address and
2715 // store to the same uniform address.
2716 if (UniformStores.contains(Ptr)) {
2717 LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
2718 "load and uniform store to the same address!\n");
2719 HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
2720 }
2721
2722 MemoryLocation Loc = MemoryLocation::get(LI: LD);
2723 // The TBAA metadata could have a control dependency on the predication
2724 // condition, so we cannot rely on it when determining whether or not we
2725 // need runtime pointer checks.
2726 if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT))
2727 Loc.AATags.TBAA = nullptr;
2728
2729 visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop,
2730 AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
2731 MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr);
2732 Accesses.addLoad(Loc: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr);
2733 });
2734 }
2735
2736 // If we write (or read-write) to a single destination and there are no
2737 // other reads in this loop then is it safe to vectorize.
2738 if (NumReadWrites == 1 && NumReads == 0) {
2739 LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
2740 return true;
2741 }
2742
2743 // Build dependence sets and check whether we need a runtime pointer bounds
2744 // check.
2745 Accesses.buildDependenceSets();
2746
2747 // Find pointers with computable bounds. We are going to use this information
2748 // to place a runtime bound check.
2749 Value *UncomputablePtr = nullptr;
2750 HasCompletePtrRtChecking = Accesses.canCheckPtrAtRT(
2751 RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, UncomputablePtr, AllowPartial);
2752 if (!HasCompletePtrRtChecking) {
2753 const auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr);
2754 recordAnalysis(RemarkName: "CantIdentifyArrayBounds", Instr: I)
2755 << "cannot identify array bounds";
2756 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
2757 << "the array bounds.\n");
2758 return false;
2759 }
2760
2761 LLVM_DEBUG(
2762 dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
2763
2764 bool DepsAreSafe = true;
2765 if (Accesses.isDependencyCheckNeeded()) {
2766 LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
2767 DepsAreSafe =
2768 DepChecker->areDepsSafe(DepCands, CheckDeps: Accesses.getDependenciesToCheck());
2769
2770 if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeChecks()) {
2771 LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
2772
2773 // Clear the dependency checks. We assume they are not needed.
2774 Accesses.resetDepChecks(DepChecker&: *DepChecker);
2775
2776 PtrRtChecking->reset();
2777 PtrRtChecking->Need = true;
2778
2779 UncomputablePtr = nullptr;
2780 HasCompletePtrRtChecking =
2781 Accesses.canCheckPtrAtRT(RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides,
2782 UncomputablePtr, AllowPartial);
2783
2784 // Check that we found the bounds for the pointer.
2785 if (!HasCompletePtrRtChecking) {
2786 auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr);
2787 recordAnalysis(RemarkName: "CantCheckMemDepsAtRunTime", Instr: I)
2788 << "cannot check memory dependencies at runtime";
2789 LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
2790 return false;
2791 }
2792 DepsAreSafe = true;
2793 }
2794 }
2795
2796 if (HasConvergentOp) {
2797 recordAnalysis(RemarkName: "CantInsertRuntimeCheckWithConvergent")
2798 << "cannot add control dependency to convergent operation";
2799 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
2800 "would be needed with a convergent operation\n");
2801 return false;
2802 }
2803
2804 if (DepsAreSafe) {
2805 LLVM_DEBUG(
2806 dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
2807 << (PtrRtChecking->Need ? "" : " don't")
2808 << " need runtime memory checks.\n");
2809 return true;
2810 }
2811
2812 emitUnsafeDependenceRemark();
2813 return false;
2814}
2815
2816void LoopAccessInfo::emitUnsafeDependenceRemark() {
2817 const auto *Deps = getDepChecker().getDependences();
2818 if (!Deps)
2819 return;
2820 const auto *Found =
2821 llvm::find_if(Range: *Deps, P: [](const MemoryDepChecker::Dependence &D) {
2822 return MemoryDepChecker::Dependence::isSafeForVectorization(Type: D.Type) !=
2823 MemoryDepChecker::VectorizationSafetyStatus::Safe;
2824 });
2825 if (Found == Deps->end())
2826 return;
2827 MemoryDepChecker::Dependence Dep = *Found;
2828
2829 LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
2830
2831 // Emit remark for first unsafe dependence
2832 bool HasForcedDistribution = false;
2833 std::optional<const MDOperand *> Value =
2834 findStringMetadataForLoop(TheLoop, Name: "llvm.loop.distribute.enable");
2835 if (Value) {
2836 const MDOperand *Op = *Value;
2837 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
2838 HasForcedDistribution = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue();
2839 }
2840
2841 const std::string Info =
2842 HasForcedDistribution
2843 ? "unsafe dependent memory operations in loop."
2844 : "unsafe dependent memory operations in loop. Use "
2845 "#pragma clang loop distribute(enable) to allow loop distribution "
2846 "to attempt to isolate the offending operations into a separate "
2847 "loop";
2848 OptimizationRemarkAnalysis &R =
2849 recordAnalysis(RemarkName: "UnsafeDep", Instr: Dep.getDestination(DepChecker: getDepChecker())) << Info;
2850
2851 switch (Dep.Type) {
2852 case MemoryDepChecker::Dependence::NoDep:
2853 case MemoryDepChecker::Dependence::Forward:
2854 case MemoryDepChecker::Dependence::BackwardVectorizable:
2855 llvm_unreachable("Unexpected dependence");
2856 case MemoryDepChecker::Dependence::Backward:
2857 R << "\nBackward loop carried data dependence.";
2858 break;
2859 case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
2860 R << "\nForward loop carried data dependence that prevents "
2861 "store-to-load forwarding.";
2862 break;
2863 case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
2864 R << "\nBackward loop carried data dependence that prevents "
2865 "store-to-load forwarding.";
2866 break;
2867 case MemoryDepChecker::Dependence::IndirectUnsafe:
2868 R << "\nUnsafe indirect dependence.";
2869 break;
2870 case MemoryDepChecker::Dependence::Unknown:
2871 R << "\nUnknown data dependence.";
2872 break;
2873 }
2874
2875 if (Instruction *I = Dep.getSource(DepChecker: getDepChecker())) {
2876 DebugLoc SourceLoc = I->getDebugLoc();
2877 if (auto *DD = dyn_cast_or_null<Instruction>(Val: getPointerOperand(V: I)))
2878 SourceLoc = DD->getDebugLoc();
2879 if (SourceLoc)
2880 R << " Memory location is the same as accessed at "
2881 << ore::NV("Location", SourceLoc);
2882 }
2883}
2884
2885bool LoopAccessInfo::blockNeedsPredication(const BasicBlock *BB,
2886 const Loop *TheLoop,
2887 const DominatorTree *DT) {
2888 assert(TheLoop->contains(BB) && "Unknown block used");
2889
2890 // Blocks that do not dominate the latch need predication.
2891 const BasicBlock *Latch = TheLoop->getLoopLatch();
2892 return !DT->dominates(A: BB, B: Latch);
2893}
2894
2895OptimizationRemarkAnalysis &
2896LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
2897 assert(!Report && "Multiple reports generated");
2898
2899 const BasicBlock *CodeRegion = TheLoop->getHeader();
2900 DebugLoc DL = TheLoop->getStartLoc();
2901
2902 if (I) {
2903 CodeRegion = I->getParent();
2904 // If there is no debug location attached to the instruction, revert back to
2905 // using the loop's.
2906 if (I->getDebugLoc())
2907 DL = I->getDebugLoc();
2908 }
2909
2910 Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, args&: RemarkName,
2911 args&: DL, args&: CodeRegion);
2912 return *Report;
2913}
2914
2915bool LoopAccessInfo::isInvariant(Value *V) const {
2916 auto *SE = PSE->getSE();
2917 if (TheLoop->isLoopInvariant(V))
2918 return true;
2919 if (!SE->isSCEVable(Ty: V->getType()))
2920 return false;
2921 const SCEV *S = SE->getSCEV(V);
2922 return SE->isLoopInvariant(S, L: TheLoop);
2923}
2924
2925/// If \p Ptr is a GEP, which has a loop-variant operand, return that operand.
2926/// Otherwise, return \p Ptr.
2927static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE,
2928 Loop *Lp) {
2929 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr);
2930 if (!GEP)
2931 return Ptr;
2932
2933 Value *V = Ptr;
2934 for (const Use &U : GEP->operands()) {
2935 if (!SE->isLoopInvariant(S: SE->getSCEV(V: U), L: Lp)) {
2936 if (V == Ptr)
2937 V = U;
2938 else
2939 // There must be exactly one loop-variant operand.
2940 return Ptr;
2941 }
2942 }
2943 return V;
2944}
2945
2946/// Get the stride of a pointer access in a loop. Looks for symbolic
2947/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
2948static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
2949 auto *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType());
2950 if (!PtrTy)
2951 return nullptr;
2952
2953 // Try to remove a gep instruction to make the pointer (actually index at this
2954 // point) easier analyzable. If OrigPtr is equal to Ptr we are analyzing the
2955 // pointer, otherwise, we are analyzing the index.
2956 Value *OrigPtr = Ptr;
2957
2958 Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp);
2959 const SCEV *V = SE->getSCEV(V: Ptr);
2960
2961 if (Ptr != OrigPtr)
2962 // Strip off casts.
2963 while (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V))
2964 V = C->getOperand();
2965
2966 if (!match(S: V, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V), L: m_SpecificLoop(L: Lp))))
2967 return nullptr;
2968
2969 // Note that the restriction after this loop invariant check are only
2970 // profitability restrictions.
2971 if (!SE->isLoopInvariant(S: V, L: Lp))
2972 return nullptr;
2973
2974 // Look for the loop invariant symbolic value.
2975 if (isa<SCEVUnknown>(Val: V))
2976 return V;
2977
2978 // Look through multiplies that scale a stride by a constant.
2979 match(S: V, P: m_scev_Mul(Op0: m_SCEVConstant(), Op1: m_SCEV(V)));
2980 if (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V))
2981 if (isa<SCEVUnknown>(Val: C->getOperand()))
2982 return V;
2983
2984 return nullptr;
2985}
2986
2987void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
2988 Value *Ptr = getLoadStorePointerOperand(V: MemAccess);
2989 if (!Ptr)
2990 return;
2991
2992 // Note: getStrideFromPointer is a *profitability* heuristic. We
2993 // could broaden the scope of values returned here - to anything
2994 // which happens to be loop invariant and contributes to the
2995 // computation of an interesting IV - but we chose not to as we
2996 // don't have a cost model here, and broadening the scope exposes
2997 // far too many unprofitable cases.
2998 const SCEV *StrideExpr = getStrideFromPointer(Ptr, SE: PSE->getSE(), Lp: TheLoop);
2999 if (!StrideExpr)
3000 return;
3001
3002 if (match(S: StrideExpr, P: m_scev_UndefOrPoison()))
3003 return;
3004
3005 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
3006 "versioning:");
3007 LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");
3008
3009 if (!SpeculateUnitStride) {
3010 LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n");
3011 return;
3012 }
3013
3014 // Avoid adding the "Stride == 1" predicate when we know that
3015 // Stride >= Trip-Count. Such a predicate will effectively optimize a single
3016 // or zero iteration loop, as Trip-Count <= Stride == 1.
3017 //
3018 // TODO: We are currently not making a very informed decision on when it is
3019 // beneficial to apply stride versioning. It might make more sense that the
3020 // users of this analysis (such as the vectorizer) will trigger it, based on
3021 // their specific cost considerations; For example, in cases where stride
3022 // versioning does not help resolving memory accesses/dependences, the
3023 // vectorizer should evaluate the cost of the runtime test, and the benefit
3024 // of various possible stride specializations, considering the alternatives
3025 // of using gather/scatters (if available).
3026
3027 const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount();
3028
3029 // Match the types so we can compare the stride and the MaxBTC.
3030 // The Stride can be positive/negative, so we sign extend Stride;
3031 // The backedgeTakenCount is non-negative, so we zero extend MaxBTC.
3032 const DataLayout &DL = TheLoop->getHeader()->getDataLayout();
3033 uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(Ty: StrideExpr->getType());
3034 uint64_t BETypeSizeBits = DL.getTypeSizeInBits(Ty: MaxBTC->getType());
3035 const SCEV *CastedStride = StrideExpr;
3036 const SCEV *CastedBECount = MaxBTC;
3037 ScalarEvolution *SE = PSE->getSE();
3038 if (BETypeSizeBits >= StrideTypeSizeBits)
3039 CastedStride = SE->getNoopOrSignExtend(V: StrideExpr, Ty: MaxBTC->getType());
3040 else
3041 CastedBECount = SE->getZeroExtendExpr(Op: MaxBTC, Ty: StrideExpr->getType());
3042 const SCEV *StrideMinusBETaken = SE->getMinusSCEV(LHS: CastedStride, RHS: CastedBECount);
3043 // Since TripCount == BackEdgeTakenCount + 1, checking:
3044 // "Stride >= TripCount" is equivalent to checking:
3045 // Stride - MaxBTC> 0
3046 if (SE->isKnownPositive(S: StrideMinusBETaken)) {
3047 LLVM_DEBUG(
3048 dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
3049 "Stride==1 predicate will imply that the loop executes "
3050 "at most once.\n");
3051 return;
3052 }
3053 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");
3054
3055 // Strip back off the integer cast, and check that our result is a
3056 // SCEVUnknown as we expect.
3057 const SCEV *StrideBase = StrideExpr;
3058 if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: StrideBase))
3059 StrideBase = C->getOperand();
3060 SymbolicStrides[Ptr] = cast<SCEVUnknown>(Val: StrideBase);
3061}
3062
3063LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
3064 const TargetTransformInfo *TTI,
3065 const TargetLibraryInfo *TLI, AAResults *AA,
3066 DominatorTree *DT, LoopInfo *LI,
3067 AssumptionCache *AC, bool AllowPartial)
3068 : PSE(std::make_unique<PredicatedScalarEvolution>(args&: *SE, args&: *L)),
3069 PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) {
3070 unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
3071 if (TTI && !TTI->enableScalableVectorization())
3072 // Scale the vector width by 2 as rough estimate to also consider
3073 // interleaving.
3074 MaxTargetVectorWidthInBits =
3075 TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) * 2;
3076
3077 DepChecker = std::make_unique<MemoryDepChecker>(
3078 args&: *PSE, args&: AC, args&: DT, args&: L, args&: SymbolicStrides, args&: MaxTargetVectorWidthInBits, args&: LoopGuards);
3079 PtrRtChecking =
3080 std::make_unique<RuntimePointerChecking>(args&: *DepChecker, args&: SE, args&: LoopGuards);
3081 if (canAnalyzeLoop())
3082 CanVecMem = analyzeLoop(AA, LI, TLI, DT);
3083}
3084
3085void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
3086 if (CanVecMem) {
3087 OS.indent(NumSpaces: Depth) << "Memory dependences are safe";
3088 const MemoryDepChecker &DC = getDepChecker();
3089 if (!DC.isSafeForAnyVectorWidth())
3090 OS << " with a maximum safe vector width of "
3091 << DC.getMaxSafeVectorWidthInBits() << " bits";
3092 if (!DC.isSafeForAnyStoreLoadForwardDistances()) {
3093 uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits();
3094 OS << ", with a maximum safe store-load forward width of " << SLDist
3095 << " bits";
3096 }
3097 if (PtrRtChecking->Need)
3098 OS << " with run-time checks";
3099 OS << "\n";
3100 }
3101
3102 if (HasConvergentOp)
3103 OS.indent(NumSpaces: Depth) << "Has convergent operation in loop\n";
3104
3105 if (Report)
3106 OS.indent(NumSpaces: Depth) << "Report: " << Report->getMsg() << "\n";
3107
3108 if (auto *Dependences = DepChecker->getDependences()) {
3109 OS.indent(NumSpaces: Depth) << "Dependences:\n";
3110 for (const auto &Dep : *Dependences) {
3111 Dep.print(OS, Depth: Depth + 2, Instrs: DepChecker->getMemoryInstructions());
3112 OS << "\n";
3113 }
3114 } else
3115 OS.indent(NumSpaces: Depth) << "Too many dependences, not recorded\n";
3116
3117 // List the pair of accesses need run-time checks to prove independence.
3118 PtrRtChecking->print(OS, Depth);
3119 if (PtrRtChecking->Need && !HasCompletePtrRtChecking)
3120 OS.indent(NumSpaces: Depth) << "Generated run-time checks are incomplete\n";
3121 OS << "\n";
3122
3123 OS.indent(NumSpaces: Depth)
3124 << "Non vectorizable stores to invariant address were "
3125 << (HasStoreStoreDependenceInvolvingLoopInvariantAddress ||
3126 HasLoadStoreDependenceInvolvingLoopInvariantAddress
3127 ? ""
3128 : "not ")
3129 << "found in loop.\n";
3130
3131 OS.indent(NumSpaces: Depth) << "SCEV assumptions:\n";
3132 PSE->getPredicate().print(OS, Depth);
3133
3134 OS << "\n";
3135
3136 OS.indent(NumSpaces: Depth) << "Expressions re-written:\n";
3137 PSE->print(OS, Depth);
3138}
3139
3140const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L,
3141 bool AllowPartial) {
3142 const auto &[It, Inserted] = LoopAccessInfoMap.try_emplace(Key: &L);
3143
3144 // We need to create the LoopAccessInfo if either we don't already have one,
3145 // or if it was created with a different value of AllowPartial.
3146 if (Inserted || It->second->hasAllowPartial() != AllowPartial)
3147 It->second = std::make_unique<LoopAccessInfo>(args: &L, args: &SE, args&: TTI, args&: TLI, args: &AA, args: &DT,
3148 args: &LI, args&: AC, args&: AllowPartial);
3149
3150 return *It->second;
3151}
3152void LoopAccessInfoManager::clear() {
3153 // Collect LoopAccessInfo entries that may keep references to IR outside the
3154 // analyzed loop or SCEVs that may have been modified or invalidated. At the
3155 // moment, that is loops requiring memory or SCEV runtime checks, as those cache
3156 // SCEVs, e.g. for pointer expressions.
3157 for (const auto &[L, LAI] : LoopAccessInfoMap) {
3158 if (LAI->getRuntimePointerChecking()->getChecks().empty() &&
3159 LAI->getPSE().getPredicate().isAlwaysTrue())
3160 continue;
3161 LoopAccessInfoMap.erase(Val: L);
3162 }
3163}
3164
3165bool LoopAccessInfoManager::invalidate(
3166 Function &F, const PreservedAnalyses &PA,
3167 FunctionAnalysisManager::Invalidator &Inv) {
3168 // Check whether our analysis is preserved.
3169 auto PAC = PA.getChecker<LoopAccessAnalysis>();
3170 if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
3171 // If not, give up now.
3172 return true;
3173
3174 // Check whether the analyses we depend on became invalid for any reason.
3175 // Skip checking TargetLibraryAnalysis as it is immutable and can't become
3176 // invalid.
3177 return Inv.invalidate<AAManager>(IR&: F, PA) ||
3178 Inv.invalidate<ScalarEvolutionAnalysis>(IR&: F, PA) ||
3179 Inv.invalidate<LoopAnalysis>(IR&: F, PA) ||
3180 Inv.invalidate<DominatorTreeAnalysis>(IR&: F, PA);
3181}
3182
3183LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
3184 FunctionAnalysisManager &FAM) {
3185 auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(IR&: F);
3186 auto &AA = FAM.getResult<AAManager>(IR&: F);
3187 auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F);
3188 auto &LI = FAM.getResult<LoopAnalysis>(IR&: F);
3189 auto &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
3190 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F);
3191 auto &AC = FAM.getResult<AssumptionAnalysis>(IR&: F);
3192 return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC);
3193}
3194
3195AnalysisKey LoopAccessAnalysis::Key;
3196