1//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The implementation for the loop memory dependence that was originally
10// developed for the loop vectorizer.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Analysis/LoopAccessAnalysis.h"
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/EquivalenceClasses.h"
18#include "llvm/ADT/PointerIntPair.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SetVector.h"
21#include "llvm/ADT/SmallPtrSet.h"
22#include "llvm/ADT/SmallSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/AliasAnalysis.h"
25#include "llvm/Analysis/AliasSetTracker.h"
26#include "llvm/Analysis/AssumeBundleQueries.h"
27#include "llvm/Analysis/AssumptionCache.h"
28#include "llvm/Analysis/LoopAnalysisManager.h"
29#include "llvm/Analysis/LoopInfo.h"
30#include "llvm/Analysis/LoopIterator.h"
31#include "llvm/Analysis/MemoryLocation.h"
32#include "llvm/Analysis/OptimizationRemarkEmitter.h"
33#include "llvm/Analysis/ScalarEvolution.h"
34#include "llvm/Analysis/ScalarEvolutionExpressions.h"
35#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
36#include "llvm/Analysis/TargetLibraryInfo.h"
37#include "llvm/Analysis/TargetTransformInfo.h"
38#include "llvm/Analysis/ValueTracking.h"
39#include "llvm/Analysis/VectorUtils.h"
40#include "llvm/IR/BasicBlock.h"
41#include "llvm/IR/Constants.h"
42#include "llvm/IR/DataLayout.h"
43#include "llvm/IR/DebugLoc.h"
44#include "llvm/IR/DerivedTypes.h"
45#include "llvm/IR/DiagnosticInfo.h"
46#include "llvm/IR/Dominators.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/InstrTypes.h"
49#include "llvm/IR/Instruction.h"
50#include "llvm/IR/Instructions.h"
51#include "llvm/IR/IntrinsicInst.h"
52#include "llvm/IR/PassManager.h"
53#include "llvm/IR/Type.h"
54#include "llvm/IR/Value.h"
55#include "llvm/IR/ValueHandle.h"
56#include "llvm/Support/Casting.h"
57#include "llvm/Support/CommandLine.h"
58#include "llvm/Support/Debug.h"
59#include "llvm/Support/ErrorHandling.h"
60#include "llvm/Support/raw_ostream.h"
61#include <algorithm>
62#include <cassert>
63#include <cstdint>
64#include <iterator>
65#include <utility>
66#include <variant>
67#include <vector>
68
69using namespace llvm;
70using namespace llvm::SCEVPatternMatch;
71
72#define DEBUG_TYPE "loop-accesses"
73
74static cl::opt<unsigned, true>
75VectorizationFactor("force-vector-width", cl::Hidden,
76 cl::desc("Sets the SIMD width. Zero is autoselect."),
77 cl::location(L&: VectorizerParams::VectorizationFactor));
78unsigned VectorizerParams::VectorizationFactor;
79
80static cl::opt<unsigned, true>
81VectorizationInterleave("force-vector-interleave", cl::Hidden,
82 cl::desc("Sets the vectorization interleave count. "
83 "Zero is autoselect."),
84 cl::location(
85 L&: VectorizerParams::VectorizationInterleave));
86unsigned VectorizerParams::VectorizationInterleave;
87
88static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
89 "runtime-memory-check-threshold", cl::Hidden,
90 cl::desc("When performing memory disambiguation checks at runtime do not "
91 "generate more than this number of comparisons (default = 8)."),
92 cl::location(L&: VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(Val: 8));
93unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
94
95/// The maximum iterations used to merge memory checks
96static cl::opt<unsigned> MemoryCheckMergeThreshold(
97 "memory-check-merge-threshold", cl::Hidden,
98 cl::desc("Maximum number of comparisons done when trying to merge "
99 "runtime memory checks. (default = 100)"),
100 cl::init(Val: 100));
101
102/// Maximum SIMD width.
103const unsigned VectorizerParams::MaxVectorWidth = 64;
104
105/// We collect dependences up to this threshold.
106static cl::opt<unsigned>
107 MaxDependences("max-dependences", cl::Hidden,
108 cl::desc("Maximum number of dependences collected by "
109 "loop-access analysis (default = 100)"),
110 cl::init(Val: 100));
111
112/// This enables versioning on the strides of symbolically striding memory
113/// accesses in code like the following.
114/// for (i = 0; i < N; ++i)
115/// A[i * Stride1] += B[i * Stride2] ...
116///
117/// Will be roughly translated to
118/// if (Stride1 == 1 && Stride2 == 1) {
119/// for (i = 0; i < N; i+=4)
120/// A[i:i+3] += ...
121/// } else
122/// ...
123static cl::opt<bool> EnableMemAccessVersioning(
124 "enable-mem-access-versioning", cl::init(Val: true), cl::Hidden,
125 cl::desc("Enable symbolic stride memory access versioning"));
126
127/// Enable store-to-load forwarding conflict detection. This option can
128/// be disabled for correctness testing.
129static cl::opt<bool> EnableForwardingConflictDetection(
130 "store-to-load-forwarding-conflict-detection", cl::Hidden,
131 cl::desc("Enable conflict detection in loop-access analysis"),
132 cl::init(Val: true));
133
134static cl::opt<unsigned> MaxForkedSCEVDepth(
135 "max-forked-scev-depth", cl::Hidden,
136 cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
137 cl::init(Val: 5));
138
139static cl::opt<bool> SpeculateUnitStride(
140 "laa-speculate-unit-stride", cl::Hidden,
141 cl::desc("Speculate that non-constant strides are unit in LAA"),
142 cl::init(Val: true));
143
144static cl::opt<bool, true> HoistRuntimeChecks(
145 "hoist-runtime-checks", cl::Hidden,
146 cl::desc(
147 "Hoist inner loop runtime memory checks to outer loop if possible"),
148 cl::location(L&: VectorizerParams::HoistRuntimeChecks), cl::init(Val: true));
149bool VectorizerParams::HoistRuntimeChecks;
150
151bool VectorizerParams::isInterleaveForced() {
152 return ::VectorizationInterleave.getNumOccurrences() > 0;
153}
154
155const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
156 const DenseMap<Value *, const SCEV *> &PtrToStride,
157 Value *Ptr) {
158 const SCEV *OrigSCEV = PSE.getSCEV(V: Ptr);
159
160 // If there is an entry in the map return the SCEV of the pointer with the
161 // symbolic stride replaced by one.
162 const SCEV *StrideSCEV = PtrToStride.lookup(Val: Ptr);
163 if (!StrideSCEV)
164 // For a non-symbolic stride, just return the original expression.
165 return OrigSCEV;
166
167 // Note: This assert is both overly strong and overly weak. The actual
168 // invariant here is that StrideSCEV should be loop invariant. The only
169 // such invariant strides we happen to speculate right now are unknowns
170 // and thus this is a reasonable proxy of the actual invariant.
171 assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");
172
173 ScalarEvolution *SE = PSE.getSE();
174 const SCEV *CT = SE->getOne(Ty: StrideSCEV->getType());
175 PSE.addPredicate(Pred: *SE->getEqualPredicate(LHS: StrideSCEV, RHS: CT));
176 const SCEV *Expr = PSE.getSCEV(V: Ptr);
177
178 LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
179 << " by: " << *Expr << "\n");
180 return Expr;
181}
182
183RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
184 unsigned Index, const RuntimePointerChecking &RtCheck)
185 : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
186 AddressSpace(RtCheck.Pointers[Index]
187 .PointerValue->getType()
188 ->getPointerAddressSpace()),
189 NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
190 Members.push_back(Elt: Index);
191}
192
193/// Returns \p A + \p B, if it is guaranteed not to unsigned wrap. Otherwise
194/// return nullptr. \p A and \p B must have the same type.
195static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B,
196 ScalarEvolution &SE) {
197 if (!SE.willNotOverflow(BinOp: Instruction::Add, /*IsSigned=*/Signed: false, LHS: A, RHS: B))
198 return nullptr;
199 return SE.getAddExpr(LHS: A, RHS: B);
200}
201
202/// Returns \p A * \p B, if it is guaranteed not to unsigned wrap. Otherwise
203/// return nullptr. \p A and \p B must have the same type.
204static const SCEV *mulSCEVNoOverflow(const SCEV *A, const SCEV *B,
205 ScalarEvolution &SE) {
206 if (!SE.willNotOverflow(BinOp: Instruction::Mul, /*IsSigned=*/Signed: false, LHS: A, RHS: B))
207 return nullptr;
208 return SE.getMulExpr(LHS: A, RHS: B);
209}
210
211/// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
212/// \p MaxBTC is guaranteed inbounds of the accessed object.
213static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
214 const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize,
215 ScalarEvolution &SE, const DataLayout &DL, DominatorTree *DT,
216 AssumptionCache *AC,
217 std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
218 auto *PointerBase = SE.getPointerBase(V: AR->getStart());
219 auto *StartPtr = dyn_cast<SCEVUnknown>(Val: PointerBase);
220 if (!StartPtr)
221 return false;
222 const Loop *L = AR->getLoop();
223 bool CheckForNonNull, CheckForFreed;
224 Value *StartPtrV = StartPtr->getValue();
225 uint64_t DerefBytes = StartPtrV->getPointerDereferenceableBytes(
226 DL, CanBeNull&: CheckForNonNull, CanBeFreed&: CheckForFreed);
227
228 if (DerefBytes && (CheckForNonNull || CheckForFreed))
229 return false;
230
231 const SCEV *Step = AR->getStepRecurrence(SE);
232 Type *WiderTy = SE.getWiderType(Ty1: MaxBTC->getType(), Ty2: Step->getType());
233 const SCEV *DerefBytesSCEV = SE.getConstant(Ty: WiderTy, V: DerefBytes);
234
235 // Check if we have a suitable dereferencable assumption we can use.
236 Instruction *CtxI = &*L->getHeader()->getFirstNonPHIIt();
237 if (BasicBlock *LoopPred = L->getLoopPredecessor()) {
238 if (isa<UncondBrInst, CondBrInst>(Val: LoopPred->getTerminator()))
239 CtxI = LoopPred->getTerminator();
240 }
241 RetainedKnowledge DerefRK;
242 getKnowledgeForValue(V: StartPtrV, AttrKinds: {Attribute::Dereferenceable}, AC&: *AC,
243 Filter: [&](RetainedKnowledge RK, Instruction *Assume, auto) {
244 if (!isValidAssumeForContext(I: Assume, CxtI: CtxI, DT))
245 return false;
246 if (StartPtrV->canBeFreed() &&
247 !willNotFreeBetween(Assume, CtxI))
248 return false;
249 DerefRK = std::max(a: DerefRK, b: RK);
250 return true;
251 });
252 if (DerefRK) {
253 const SCEV *DerefRKSCEV = SE.getSCEV(V: DerefRK.IRArgValue);
254 Type *CommonTy =
255 SE.getWiderType(Ty1: DerefBytesSCEV->getType(), Ty2: DerefRKSCEV->getType());
256 DerefBytesSCEV = SE.getNoopOrZeroExtend(V: DerefBytesSCEV, Ty: CommonTy);
257 DerefRKSCEV = SE.getNoopOrZeroExtend(V: DerefRKSCEV, Ty: CommonTy);
258 DerefBytesSCEV = SE.getUMaxExpr(LHS: DerefBytesSCEV, RHS: DerefRKSCEV);
259 }
260
261 if (DerefBytesSCEV->isZero())
262 return false;
263
264 bool IsKnownNonNegative = SE.isKnownNonNegative(S: Step);
265 if (!IsKnownNonNegative && !SE.isKnownNegative(S: Step))
266 return false;
267
268 Step = SE.getNoopOrSignExtend(V: Step, Ty: WiderTy);
269 MaxBTC = SE.getNoopOrZeroExtend(V: MaxBTC, Ty: WiderTy);
270
271 // For the computations below, make sure they don't unsigned wrap.
272 if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_UGE, LHS: AR->getStart(), RHS: StartPtr))
273 return false;
274 const SCEV *StartOffset = SE.getNoopOrZeroExtend(
275 V: SE.getMinusSCEV(LHS: AR->getStart(), RHS: StartPtr), Ty: WiderTy);
276
277 if (!LoopGuards)
278 LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: AR->getLoop(), SE));
279 MaxBTC = SE.applyLoopGuards(Expr: MaxBTC, Guards: *LoopGuards);
280
281 const SCEV *OffsetAtLastIter =
282 mulSCEVNoOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE);
283 if (!OffsetAtLastIter) {
284 // Re-try with constant max backedge-taken count if using the symbolic one
285 // failed.
286 MaxBTC = SE.getConstantMaxBackedgeTakenCount(L: AR->getLoop());
287 if (isa<SCEVCouldNotCompute>(Val: MaxBTC))
288 return false;
289 MaxBTC = SE.getNoopOrZeroExtend(
290 V: MaxBTC, Ty: WiderTy);
291 OffsetAtLastIter =
292 mulSCEVNoOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE);
293 if (!OffsetAtLastIter)
294 return false;
295 }
296
297 const SCEV *OffsetEndBytes = addSCEVNoOverflow(
298 A: OffsetAtLastIter, B: SE.getNoopOrZeroExtend(V: EltSize, Ty: WiderTy), SE);
299 if (!OffsetEndBytes)
300 return false;
301
302 if (IsKnownNonNegative) {
303 // For positive steps, check if
304 // (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes,
305 // while making sure none of the computations unsigned wrap themselves.
306 const SCEV *EndBytes = addSCEVNoOverflow(A: StartOffset, B: OffsetEndBytes, SE);
307 if (!EndBytes)
308 return false;
309
310 DerefBytesSCEV = SE.applyLoopGuards(Expr: DerefBytesSCEV, Guards: *LoopGuards);
311 return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: EndBytes, RHS: DerefBytesSCEV);
312 }
313
314 // For negative steps check if
315 // * StartOffset >= (MaxBTC * Step + EltSize)
316 // * StartOffset <= DerefBytes.
317 assert(SE.isKnownNegative(Step) && "must be known negative");
318 return SE.isKnownPredicate(Pred: CmpInst::ICMP_SGE, LHS: StartOffset, RHS: OffsetEndBytes) &&
319 SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: StartOffset, RHS: DerefBytesSCEV);
320}
321
322std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
323 const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC,
324 const SCEV *MaxBTC, ScalarEvolution *SE,
325 DenseMap<std::pair<const SCEV *, const SCEV *>,
326 std::pair<const SCEV *, const SCEV *>> *PointerBounds,
327 DominatorTree *DT, AssumptionCache *AC,
328 std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
329 auto &DL = Lp->getHeader()->getDataLayout();
330 Type *IdxTy = DL.getIndexType(PtrTy: PtrExpr->getType());
331 const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IntTy: IdxTy, StoreTy: AccessTy);
332
333 // Delegate to the SCEV-based overload, passing through the cache.
334 return getStartAndEndForAccess(Lp, PtrExpr, EltSizeSCEV, BTC, MaxBTC, SE,
335 PointerBounds, DT, AC, LoopGuards);
336}
337
338std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
339 const Loop *Lp, const SCEV *PtrExpr, const SCEV *EltSizeSCEV,
340 const SCEV *BTC, const SCEV *MaxBTC, ScalarEvolution *SE,
341 DenseMap<std::pair<const SCEV *, const SCEV *>,
342 std::pair<const SCEV *, const SCEV *>> *PointerBounds,
343 DominatorTree *DT, AssumptionCache *AC,
344 std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
345 std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
346 if (PointerBounds) {
347 auto [Iter, Ins] = PointerBounds->insert(
348 KV: {{PtrExpr, EltSizeSCEV},
349 {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
350 if (!Ins)
351 return Iter->second;
352 PtrBoundsPair = &Iter->second;
353 }
354
355 const SCEV *ScStart;
356 const SCEV *ScEnd;
357
358 auto &DL = Lp->getHeader()->getDataLayout();
359 if (SE->isLoopInvariant(S: PtrExpr, L: Lp)) {
360 ScStart = ScEnd = PtrExpr;
361 } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrExpr)) {
362 ScStart = AR->getStart();
363 if (!isa<SCEVCouldNotCompute>(Val: BTC))
364 // Evaluating AR at an exact BTC is safe: LAA separately checks that
365 // accesses cannot wrap in the loop. If evaluating AR at BTC wraps, then
366 // the loop either triggers UB when executing a memory access with a
367 // poison pointer or the wrapping/poisoned pointer is not used.
368 ScEnd = AR->evaluateAtIteration(It: BTC, SE&: *SE);
369 else {
370 // Evaluating AR at MaxBTC may wrap and create an expression that is less
371 // than the start of the AddRec due to wrapping (for example consider
372 // MaxBTC = -2). If that's the case, set ScEnd to -(EltSize + 1). ScEnd
373 // will get incremented by EltSize before returning, so this effectively
374 // sets ScEnd to the maximum unsigned value for the type. Note that LAA
375 // separately checks that accesses cannot not wrap, so unsigned max
376 // represents an upper bound.
377 if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSize: EltSizeSCEV, SE&: *SE, DL,
378 DT, AC, LoopGuards)) {
379 ScEnd = AR->evaluateAtIteration(It: MaxBTC, SE&: *SE);
380 } else {
381 ScEnd = SE->getAddExpr(
382 LHS: SE->getNegativeSCEV(V: EltSizeSCEV),
383 RHS: SE->getSCEV(V: ConstantExpr::getIntToPtr(
384 C: ConstantInt::getAllOnesValue(Ty: EltSizeSCEV->getType()),
385 Ty: AR->getType())));
386 }
387 }
388 const SCEV *Step = AR->getStepRecurrence(SE&: *SE);
389
390 // For expressions with negative step, the upper bound is ScStart and the
391 // lower bound is ScEnd.
392 if (const auto *CStep = dyn_cast<SCEVConstant>(Val: Step)) {
393 if (CStep->getValue()->isNegative())
394 std::swap(a&: ScStart, b&: ScEnd);
395 } else {
396 // Fallback case: the step is not constant, but we can still
397 // get the upper and lower bounds of the interval by using min/max
398 // expressions.
399 ScStart = SE->getUMinExpr(LHS: ScStart, RHS: ScEnd);
400 ScEnd = SE->getUMaxExpr(LHS: AR->getStart(), RHS: ScEnd);
401 }
402 } else
403 return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};
404
405 assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
406 assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
407
408 // Add the size of the pointed element to ScEnd.
409 ScEnd = SE->getAddExpr(LHS: ScEnd, RHS: EltSizeSCEV);
410
411 std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
412 if (PointerBounds)
413 *PtrBoundsPair = Res;
414 return Res;
415}
416
417/// Calculate Start and End points of memory access using
418/// getStartAndEndForAccess.
419void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
420 Type *AccessTy, bool WritePtr,
421 unsigned DepSetId, unsigned ASId,
422 PredicatedScalarEvolution &PSE,
423 bool NeedsFreeze) {
424 const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
425 const SCEV *BTC = PSE.getBackedgeTakenCount();
426 const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
427 Lp, PtrExpr, AccessTy, BTC, MaxBTC: SymbolicMaxBTC, SE: PSE.getSE(),
428 PointerBounds: &DC.getPointerBounds(), DT: DC.getDT(), AC: DC.getAC(), LoopGuards);
429 assert(!isa<SCEVCouldNotCompute>(ScStart) &&
430 !isa<SCEVCouldNotCompute>(ScEnd) &&
431 "must be able to compute both start and end expressions");
432 Pointers.emplace_back(Args&: Ptr, Args: ScStart, Args: ScEnd, Args&: WritePtr, Args&: DepSetId, Args&: ASId, Args&: PtrExpr,
433 Args&: NeedsFreeze);
434}
435
436bool RuntimePointerChecking::tryToCreateDiffCheck(
437 const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
438 // If either group contains multiple different pointers, bail out.
439 // TODO: Support multiple pointers by using the minimum or maximum pointer,
440 // depending on src & sink.
441 if (CGI.Members.size() != 1 || CGJ.Members.size() != 1)
442 return false;
443
444 const PointerInfo *Src = &Pointers[CGI.Members[0]];
445 const PointerInfo *Sink = &Pointers[CGJ.Members[0]];
446
447 // If either pointer is read and written, multiple checks may be needed. Bail
448 // out.
449 if (!DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: !Src->IsWritePtr).empty() ||
450 !DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: !Sink->IsWritePtr).empty())
451 return false;
452
453 ArrayRef<unsigned> AccSrc =
454 DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: Src->IsWritePtr);
455 ArrayRef<unsigned> AccSink =
456 DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: Sink->IsWritePtr);
457 // If either pointer is accessed multiple times, there may not be a clear
458 // src/sink relation. Bail out for now.
459 if (AccSrc.size() != 1 || AccSink.size() != 1)
460 return false;
461
462 // If the sink is accessed before src, swap src/sink.
463 if (AccSink[0] < AccSrc[0])
464 std::swap(a&: Src, b&: Sink);
465
466 const SCEVConstant *Step;
467 const SCEV *SrcStart;
468 const SCEV *SinkStart;
469 const Loop *InnerLoop = DC.getInnermostLoop();
470 if (!match(S: Src->Expr,
471 P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SrcStart), Op1: m_SCEVConstant(V&: Step),
472 L: m_SpecificLoop(L: InnerLoop))) ||
473 !match(S: Sink->Expr,
474 P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SinkStart), Op1: m_scev_Specific(S: Step),
475 L: m_SpecificLoop(L: InnerLoop))))
476 return false;
477
478 SmallVector<Instruction *, 4> SrcInsts =
479 DC.getInstructionsForAccess(Ptr: Src->PointerValue, isWrite: Src->IsWritePtr);
480 SmallVector<Instruction *, 4> SinkInsts =
481 DC.getInstructionsForAccess(Ptr: Sink->PointerValue, isWrite: Sink->IsWritePtr);
482 Type *SrcTy = getLoadStoreType(I: SrcInsts[0]);
483 Type *DstTy = getLoadStoreType(I: SinkInsts[0]);
484 if (isa<ScalableVectorType>(Val: SrcTy) || isa<ScalableVectorType>(Val: DstTy))
485 return false;
486
487 const DataLayout &DL = InnerLoop->getHeader()->getDataLayout();
488 unsigned AllocSize =
489 std::max(a: DL.getTypeAllocSize(Ty: SrcTy), b: DL.getTypeAllocSize(Ty: DstTy));
490
491 // Only matching constant steps matching the AllocSize are supported at the
492 // moment. This simplifies the difference computation. Can be extended in the
493 // future.
494 if (Step->getAPInt().abs() != AllocSize)
495 return false;
496
497 // When counting down, the dependence distance needs to be swapped.
498 if (Step->getValue()->isNegative())
499 std::swap(a&: SinkStart, b&: SrcStart);
500
501 const SCEV *SinkStartInt = SE->getPtrToAddrExpr(Op: SinkStart);
502 const SCEV *SrcStartInt = SE->getPtrToAddrExpr(Op: SrcStart);
503 if (isa<SCEVCouldNotCompute>(Val: SinkStartInt) ||
504 isa<SCEVCouldNotCompute>(Val: SrcStartInt))
505 return false;
506
507 // If the start values for both Src and Sink also vary according to an outer
508 // loop, then it's probably better to avoid creating diff checks because
509 // they may not be hoisted. We should instead let llvm::addRuntimeChecks
510 // do the expanded full range overlap checks, which can be hoisted.
511 if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
512 isa<SCEVAddRecExpr>(Val: SinkStartInt) && isa<SCEVAddRecExpr>(Val: SrcStartInt)) {
513 auto *SrcStartAR = cast<SCEVAddRecExpr>(Val: SrcStartInt);
514 auto *SinkStartAR = cast<SCEVAddRecExpr>(Val: SinkStartInt);
515 const Loop *StartARLoop = SrcStartAR->getLoop();
516 if (StartARLoop == SinkStartAR->getLoop() &&
517 StartARLoop == InnerLoop->getParentLoop() &&
518 // If the diff check would already be loop invariant (due to the
519 // recurrences being the same), then we prefer to keep the diff checks
520 // because they are cheaper.
521 SrcStartAR->getStepRecurrence(SE&: *SE) !=
522 SinkStartAR->getStepRecurrence(SE&: *SE)) {
523 LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
524 "cannot be hoisted out of the outer loop\n");
525 return false;
526 }
527 }
528
529 LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
530 << "SrcStart: " << *SrcStartInt << '\n'
531 << "SinkStartInt: " << *SinkStartInt << '\n');
532 DiffChecks.emplace_back(Args&: SrcStartInt, Args&: SinkStartInt, Args&: AllocSize,
533 Args: Src->NeedsFreeze || Sink->NeedsFreeze);
534 return true;
535}
536
537SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
538 SmallVector<RuntimePointerCheck, 4> Checks;
539
540 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
541 for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
542 const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
543 const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];
544
545 if (needsChecking(M: CGI, N: CGJ)) {
546 CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ);
547 Checks.emplace_back(Args: &CGI, Args: &CGJ);
548 }
549 }
550 }
551 return Checks;
552}
553
554void RuntimePointerChecking::generateChecks(
555 MemoryDepChecker::DepCandidates &DepCands) {
556 assert(Checks.empty() && "Checks is not empty");
557 groupChecks(DepCands);
558 Checks = generateChecks();
559}
560
561bool RuntimePointerChecking::needsChecking(
562 const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
563 for (const auto &I : M.Members)
564 for (const auto &J : N.Members)
565 if (needsChecking(I, J))
566 return true;
567 return false;
568}
569
570/// Compare \p I and \p J and return the minimum.
571/// Return nullptr in case we couldn't find an answer.
572static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
573 ScalarEvolution *SE) {
574 std::optional<APInt> Diff = SE->computeConstantDifference(LHS: J, RHS: I);
575 if (!Diff)
576 return nullptr;
577 return Diff->isNegative() ? J : I;
578}
579
580bool RuntimeCheckingPtrGroup::addPointer(
581 unsigned Index, const RuntimePointerChecking &RtCheck) {
582 return addPointer(
583 Index, Start: RtCheck.Pointers[Index].Start, End: RtCheck.Pointers[Index].End,
584 AS: RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
585 NeedsFreeze: RtCheck.Pointers[Index].NeedsFreeze, SE&: *RtCheck.SE);
586}
587
588bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
589 const SCEV *End, unsigned AS,
590 bool NeedsFreeze,
591 ScalarEvolution &SE) {
592 assert(AddressSpace == AS &&
593 "all pointers in a checking group must be in the same address space");
594
595 // Compare the starts and ends with the known minimum and maximum
596 // of this set. We need to know how we compare against the min/max
597 // of the set in order to be able to emit memchecks.
598 const SCEV *Min0 = getMinFromExprs(I: Start, J: Low, SE: &SE);
599 if (!Min0)
600 return false;
601
602 const SCEV *Min1 = getMinFromExprs(I: End, J: High, SE: &SE);
603 if (!Min1)
604 return false;
605
606 // Update the low bound expression if we've found a new min value.
607 if (Min0 == Start)
608 Low = Start;
609
610 // Update the high bound expression if we've found a new max value.
611 if (Min1 != End)
612 High = End;
613
614 Members.push_back(Elt: Index);
615 this->NeedsFreeze |= NeedsFreeze;
616 return true;
617}
618
619void RuntimePointerChecking::groupChecks(
620 MemoryDepChecker::DepCandidates &DepCands) {
621 // We build the groups from dependency candidates equivalence classes
622 // because:
623 // - We know that pointers in the same equivalence class share
624 // the same underlying object and therefore there is a chance
625 // that we can compare pointers
626 // - We wouldn't be able to merge two pointers for which we need
627 // to emit a memcheck. The classes in DepCands are already
628 // conveniently built such that no two pointers in the same
629 // class need checking against each other.
630
631 // We use the following (greedy) algorithm to construct the groups
632 // For every pointer in the equivalence class:
633 // For each existing group:
634 // - if the difference between this pointer and the min/max bounds
635 // of the group is a constant, then make the pointer part of the
636 // group and update the min/max bounds of that group as required.
637
638 CheckingGroups.clear();
639
640 // If we need to check two pointers to the same underlying object
641 // with a non-constant difference, we shouldn't perform any pointer
642 // grouping with those pointers. This is because we can easily get
643 // into cases where the resulting check would return false, even when
644 // the accesses are safe.
645 //
646 // The following example shows this:
647 // for (i = 0; i < 1000; ++i)
648 // a[5000 + i * m] = a[i] + a[i + 9000]
649 //
650 // Here grouping gives a check of (5000, 5000 + 1000 * m) against
651 // (0, 10000) which is always false. However, if m is 1, there is no
652 // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
653 // us to perform an accurate check in this case.
654 //
655 // In the above case, we have a non-constant distance and an Unknown
656 // dependence between accesses to the same underlying object, and could retry
657 // with runtime checks without dependency information being available. In this
658 // case we will use the fallback path and create separate checking groups for
659 // accesses not present in DepCands.
660
661 unsigned TotalComparisons = 0;
662
663 DenseMap<Value *, SmallVector<unsigned>> PositionMap;
664 for (unsigned Index = 0; Index < Pointers.size(); ++Index)
665 PositionMap[Pointers[Index].PointerValue].push_back(Elt: Index);
666
667 // We need to keep track of what pointers we've already seen so we
668 // don't process them twice.
669 SmallSet<unsigned, 2> Seen;
670
671 // Go through all equivalence classes, get the "pointer check groups"
672 // and add them to the overall solution. We use the order in which accesses
673 // appear in 'Pointers' to enforce determinism.
674 for (unsigned I = 0; I < Pointers.size(); ++I) {
675 // We've seen this pointer before, and therefore already processed
676 // its equivalence class.
677 if (Seen.contains(V: I))
678 continue;
679
680 MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
681 Pointers[I].IsWritePtr);
682
683 // If there is no entry in the dependency partition, there are no potential
684 // accesses to merge; simply add a new pointer checking group.
685 if (!DepCands.contains(V: Access)) {
686 CheckingGroups.push_back(Elt: RuntimeCheckingPtrGroup(I, *this));
687 continue;
688 }
689
690 SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
691
692 // Because DepCands is constructed by visiting accesses in the order in
693 // which they appear in alias sets (which is deterministic) and the
694 // iteration order within an equivalence class member is only dependent on
695 // the order in which unions and insertions are performed on the
696 // equivalence class, the iteration order is deterministic.
697 for (auto M : DepCands.members(V: Access)) {
698 auto PointerI = PositionMap.find(Val: M.getPointer());
699 // If we can't find the pointer in PositionMap that means we can't
700 // generate a memcheck for it.
701 if (PointerI == PositionMap.end())
702 continue;
703 for (unsigned Pointer : PointerI->second) {
704 bool Merged = false;
705 // Mark this pointer as seen.
706 Seen.insert(V: Pointer);
707
708 // Go through all the existing sets and see if we can find one
709 // which can include this pointer.
710 for (RuntimeCheckingPtrGroup &Group : Groups) {
711 // Don't perform more than a certain amount of comparisons.
712 // This should limit the cost of grouping the pointers to something
713 // reasonable. If we do end up hitting this threshold, the algorithm
714 // will create separate groups for all remaining pointers.
715 if (TotalComparisons > MemoryCheckMergeThreshold)
716 break;
717
718 TotalComparisons++;
719
720 if (Group.addPointer(Index: Pointer, RtCheck: *this)) {
721 Merged = true;
722 break;
723 }
724 }
725
726 if (!Merged)
727 // We couldn't add this pointer to any existing set or the threshold
728 // for the number of comparisons has been reached. Create a new group
729 // to hold the current pointer.
730 Groups.emplace_back(Args&: Pointer, Args&: *this);
731 }
732 }
733
734 // We've computed the grouped checks for this partition.
735 // Save the results and continue with the next one.
736 llvm::append_range(C&: CheckingGroups, R&: Groups);
737 }
738}
739
740bool RuntimePointerChecking::arePointersInSamePartition(
741 const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
742 unsigned PtrIdx2) {
743 return (PtrToPartition[PtrIdx1] != -1 &&
744 PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
745}
746
747bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
748 const PointerInfo &PointerI = Pointers[I];
749 const PointerInfo &PointerJ = Pointers[J];
750
751 // No need to check if two readonly pointers intersect.
752 if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
753 return false;
754
755 // Only need to check pointers between two different dependency sets.
756 if (PointerI.DependencySetId == PointerJ.DependencySetId)
757 return false;
758
759 // Only need to check pointers in the same alias set.
760 return PointerI.AliasSetId == PointerJ.AliasSetId;
761}
762
763/// Assign each RuntimeCheckingPtrGroup pointer an index for stable UTC output.
764static DenseMap<const RuntimeCheckingPtrGroup *, unsigned>
765getPtrToIdxMap(ArrayRef<RuntimeCheckingPtrGroup> CheckingGroups) {
766 DenseMap<const RuntimeCheckingPtrGroup *, unsigned> PtrIndices;
767 for (const auto &[Idx, CG] : enumerate(First&: CheckingGroups))
768 PtrIndices[&CG] = Idx;
769 return PtrIndices;
770}
771
772void RuntimePointerChecking::printChecks(
773 raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
774 unsigned Depth) const {
775 unsigned N = 0;
776 auto PtrIndices = getPtrToIdxMap(CheckingGroups);
777 for (const auto &[Check1, Check2] : Checks) {
778 const auto &First = Check1->Members, &Second = Check2->Members;
779 OS.indent(NumSpaces: Depth) << "Check " << N++ << ":\n";
780 OS.indent(NumSpaces: Depth + 2) << "Comparing group GRP" << PtrIndices.at(Val: Check1)
781 << ":\n";
782 for (unsigned K : First)
783 OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n";
784 OS.indent(NumSpaces: Depth + 2) << "Against group GRP" << PtrIndices.at(Val: Check2)
785 << ":\n";
786 for (unsigned K : Second)
787 OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n";
788 }
789}
790
791void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
792
793 OS.indent(NumSpaces: Depth) << "Run-time memory checks:\n";
794 printChecks(OS, Checks, Depth);
795
796 OS.indent(NumSpaces: Depth) << "Grouped accesses:\n";
797 auto PtrIndices = getPtrToIdxMap(CheckingGroups);
798 for (const auto &CG : CheckingGroups) {
799 OS.indent(NumSpaces: Depth + 2) << "Group GRP" << PtrIndices.at(Val: &CG) << ":\n";
800 OS.indent(NumSpaces: Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
801 << ")\n";
802 for (unsigned Member : CG.Members) {
803 OS.indent(NumSpaces: Depth + 6) << "Member: " << *Pointers[Member].Expr << "\n";
804 }
805 }
806}
807
808namespace {
809
810/// Analyses memory accesses in a loop.
811///
812/// Checks whether run time pointer checks are needed and builds sets for data
813/// dependence checking.
814class AccessAnalysis {
815public:
816 using MemAccessInfo =
817 PointerIntPair<Value * /* AccessPtr */, 1, bool /* IsWrite */>;
818
819 AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
820 DominatorTree &DT, MemoryDepChecker::DepCandidates &DA,
821 PredicatedScalarEvolution &PSE,
822 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
823 : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA),
824 PSE(PSE), LoopAliasScopes(LoopAliasScopes) {
825 // We're analyzing dependences across loop iterations.
826 BAA.enableCrossIterationMode();
827 }
828
829 /// Register a load and whether it is only read from.
830 void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
831 Value *Ptr = const_cast<Value *>(Loc.Ptr);
832 AST.add(Loc: adjustLoc(Loc));
833 Accesses[MemAccessInfo(Ptr, false)].insert(X: AccessTy);
834 if (IsReadOnly)
835 ReadOnlyPtr.insert(Ptr);
836 }
837
838 /// Register a store.
839 void addStore(const MemoryLocation &Loc, Type *AccessTy) {
840 Value *Ptr = const_cast<Value *>(Loc.Ptr);
841 AST.add(Loc: adjustLoc(Loc));
842 Accesses[MemAccessInfo(Ptr, true)].insert(X: AccessTy);
843 }
844
845 /// Check if we can emit a run-time no-alias check for \p Access.
846 ///
847 /// Returns true if we can emit a run-time no alias check for \p Access.
848 /// If we can check this access, this also adds it to a dependence set and
849 /// adds a run-time to check for it to \p RtCheck. If \p Assume is true,
850 /// we will attempt to use additional run-time checks in order to get
851 /// the bounds of the pointer.
852 bool createCheckForAccess(RuntimePointerChecking &RtCheck,
853 MemAccessInfo Access, Type *AccessTy,
854 const DenseMap<Value *, const SCEV *> &Strides,
855 DenseMap<Value *, unsigned> &DepSetId,
856 Loop *TheLoop, unsigned &RunningDepId,
857 unsigned ASId, bool Assume);
858
859 /// Check whether we can check the pointers at runtime for
860 /// non-intersection.
861 ///
862 /// Returns true if we need no check or if we do and we can generate them
863 /// (i.e. the pointers have computable bounds). A return value of false means
864 /// we couldn't analyze and generate runtime checks for all pointers in the
865 /// loop, but if \p AllowPartial is set then we will have checks for those
866 /// pointers we could analyze. \p DepChecker is used to remove unknown
867 /// dependences from DepCands.
868 bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, Loop *TheLoop,
869 const DenseMap<Value *, const SCEV *> &Strides,
870 Value *&UncomputablePtr, bool AllowPartial,
871 const MemoryDepChecker &DepChecker);
872
873 /// Goes over all memory accesses, checks whether a RT check is needed
874 /// and builds sets of dependent accesses.
875 void buildDependenceSets();
876
877 /// Initial processing of memory accesses determined that we need to
878 /// perform dependency checking.
879 ///
880 /// Note that this can later be cleared if we retry memcheck analysis without
881 /// dependency checking (i.e. ShouldRetryWithRuntimeChecks).
882 bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); }
883
884 /// We decided that no dependence analysis would be used. Reset the state.
885 void resetDepChecks(MemoryDepChecker &DepChecker) {
886 CheckDeps.clear();
887 DepChecker.clearDependences();
888 }
889
890 ArrayRef<MemAccessInfo> getDependenciesToCheck() const { return CheckDeps; }
891
892private:
893 using PtrAccessMap = MapVector<MemAccessInfo, SmallSetVector<Type *, 1>>;
894
895 /// Adjust the MemoryLocation so that it represents accesses to this
896 /// location across all iterations, rather than a single one.
897 MemoryLocation adjustLoc(MemoryLocation Loc) const {
898 // The accessed location varies within the loop, but remains within the
899 // underlying object.
900 Loc.Size = LocationSize::beforeOrAfterPointer();
901 Loc.AATags.Scope = adjustAliasScopeList(ScopeList: Loc.AATags.Scope);
902 Loc.AATags.NoAlias = adjustAliasScopeList(ScopeList: Loc.AATags.NoAlias);
903 return Loc;
904 }
905
906 /// Drop alias scopes that are only valid within a single loop iteration.
907 MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
908 if (!ScopeList)
909 return nullptr;
910
911 // For the sake of simplicity, drop the whole scope list if any scope is
912 // iteration-local.
913 if (any_of(Range: ScopeList->operands(), P: [&](Metadata *Scope) {
914 return LoopAliasScopes.contains(Ptr: cast<MDNode>(Val: Scope));
915 }))
916 return nullptr;
917
918 return ScopeList;
919 }
920
921 /// Map of all accesses. Values are the types used to access memory pointed to
922 /// by the pointer.
923 PtrAccessMap Accesses;
924
925 /// The loop being checked.
926 const Loop *TheLoop;
927
928 /// List of accesses that need a further dependence check.
929 SmallVector<MemAccessInfo, 8> CheckDeps;
930
931 /// Set of pointers that are read only.
932 SmallPtrSet<Value*, 16> ReadOnlyPtr;
933
934 /// Batched alias analysis results.
935 BatchAAResults BAA;
936
937 /// An alias set tracker to partition the access set by underlying object and
938 //intrinsic property (such as TBAA metadata).
939 AliasSetTracker AST;
940
941 /// The LoopInfo of the loop being checked.
942 const LoopInfo *LI;
943
944 /// The dominator tree of the function.
945 DominatorTree &DT;
946
947 /// Sets of potentially dependent accesses - members of one set share an
948 /// underlying pointer. The set "CheckDeps" identfies which sets really need a
949 /// dependence check.
950 MemoryDepChecker::DepCandidates &DepCands;
951
952 /// Initial processing of memory accesses determined that we may need
953 /// to add memchecks. Perform the analysis to determine the necessary checks.
954 ///
955 /// Note that, this is different from isDependencyCheckNeeded. When we retry
956 /// memcheck analysis without dependency checking
957 /// (i.e. ShouldRetryWithRuntimeChecks), isDependencyCheckNeeded is
958 /// cleared while this remains set if we have potentially dependent accesses.
959 bool IsRTCheckAnalysisNeeded = false;
960
961 /// The SCEV predicate containing all the SCEV-related assumptions.
962 PredicatedScalarEvolution &PSE;
963
964 DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;
965
966 /// Alias scopes that are declared inside the loop, and as such not valid
967 /// across iterations.
968 SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
969};
970
971} // end anonymous namespace
972
973/// Try to compute a constant stride for \p AR. Used by getPtrStride and
974/// isNoWrap.
975static std::optional<int64_t>
976getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
977 Value *Ptr, PredicatedScalarEvolution &PSE) {
978 if (isa<ScalableVectorType>(Val: AccessTy)) {
979 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
980 << "\n");
981 return std::nullopt;
982 }
983
984 // The access function must stride over the innermost loop.
985 if (Lp != AR->getLoop()) {
986 LLVM_DEBUG({
987 dbgs() << "LAA: Bad stride - Not striding over innermost loop ";
988 if (Ptr)
989 dbgs() << *Ptr << " ";
990
991 dbgs() << "SCEV: " << *AR << "\n";
992 });
993 return std::nullopt;
994 }
995
996 // Check the step is constant.
997 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
998
999 // Calculate the pointer stride and check if it is constant.
1000 const APInt *APStepVal;
1001 if (!match(S: Step, P: m_scev_APInt(C&: APStepVal))) {
1002 LLVM_DEBUG({
1003 dbgs() << "LAA: Bad stride - Not a constant strided ";
1004 if (Ptr)
1005 dbgs() << *Ptr << " ";
1006 dbgs() << "SCEV: " << *AR << "\n";
1007 });
1008 return std::nullopt;
1009 }
1010
1011 const auto &DL = Lp->getHeader()->getDataLayout();
1012 TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy);
1013 int64_t Size = AllocSize.getFixedValue();
1014
1015 // Huge step value - give up.
1016 std::optional<int64_t> StepVal = APStepVal->trySExtValue();
1017 if (!StepVal)
1018 return std::nullopt;
1019
1020 // Strided access.
1021 return *StepVal % Size ? std::nullopt : std::make_optional(t: *StepVal / Size);
1022}
1023
1024/// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use
1025/// informating from the IR pointer value to determine no-wrap.
1026static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
1027 Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
1028 const DominatorTree &DT,
1029 std::optional<int64_t> Stride = std::nullopt) {
1030 // FIXME: This should probably only return true for NUW.
1031 if (any(Val: AR->getNoWrapFlags(Mask: SCEV::NoWrapMask)))
1032 return true;
1033
1034 if (Ptr && PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW))
1035 return true;
1036
1037 // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap,
1038 // the distance between the previously accessed location and the wrapped
1039 // location will be larger than half the pointer index type space. In that
1040 // case, the GEP would be poison and any memory access dependent on it would
1041 // be immediate UB when executed.
1042 if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Val: Ptr);
1043 GEP && GEP->hasNoUnsignedSignedWrap()) {
1044 // For the above reasoning to apply, the pointer must be dereferenced in
1045 // every iteration.
1046 if (L->getHeader() == L->getLoopLatch() ||
1047 any_of(Range: GEP->users(), P: [L, &DT, GEP](User *U) {
1048 if (getLoadStorePointerOperand(V: U) != GEP)
1049 return false;
1050 BasicBlock *UserBB = cast<Instruction>(Val: U)->getParent();
1051 if (!L->contains(BB: UserBB))
1052 return false;
1053 return !LoopAccessInfo::blockNeedsPredication(BB: UserBB, TheLoop: L, DT: &DT);
1054 }))
1055 return true;
1056 }
1057
1058 if (!Stride)
1059 Stride = getStrideFromAddRec(AR, Lp: L, AccessTy, Ptr, PSE);
1060 if (Stride) {
1061 // If the null pointer is undefined, then a access sequence which would
1062 // otherwise access it can be assumed not to unsigned wrap. Note that this
1063 // assumes the object in memory is aligned to the natural alignment.
1064 unsigned AddrSpace = AR->getType()->getPointerAddressSpace();
1065 if (!NullPointerIsDefined(F: L->getHeader()->getParent(), AS: AddrSpace) &&
1066 (Stride == 1 || Stride == -1))
1067 return true;
1068 }
1069
1070 if (Ptr && Assume) {
1071 PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW);
1072 LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n"
1073 << "LAA: Pointer: " << *Ptr << "\n"
1074 << "LAA: SCEV: " << *AR << "\n"
1075 << "LAA: Added an overflow assumption\n");
1076 return true;
1077 }
1078
1079 return false;
1080}
1081
1082static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
1083 function_ref<void(Value *)> AddPointer) {
1084 SmallPtrSet<Value *, 8> Visited;
1085 SmallVector<Value *> WorkList;
1086 WorkList.push_back(Elt: StartPtr);
1087
1088 while (!WorkList.empty()) {
1089 Value *Ptr = WorkList.pop_back_val();
1090 if (!Visited.insert(Ptr).second)
1091 continue;
1092 auto *PN = dyn_cast<PHINode>(Val: Ptr);
1093 // SCEV does not look through non-header PHIs inside the loop. Such phis
1094 // can be analyzed by adding separate accesses for each incoming pointer
1095 // value.
1096 if (PN && InnermostLoop.contains(BB: PN->getParent()) &&
1097 PN->getParent() != InnermostLoop.getHeader()) {
1098 llvm::append_range(C&: WorkList, R: PN->incoming_values());
1099 } else
1100 AddPointer(Ptr);
1101 }
1102}
1103
1104// Walk back through the IR for a pointer, looking for a select like the
1105// following:
1106//
1107// %offset = select i1 %cmp, i64 %a, i64 %b
1108// %addr = getelementptr double, double* %base, i64 %offset
1109// %ld = load double, double* %addr, align 8
1110//
1111// We won't be able to form a single SCEVAddRecExpr from this since the
1112// address for each loop iteration depends on %cmp. We could potentially
1113// produce multiple valid SCEVAddRecExprs, though, and check all of them for
1114// memory safety/aliasing if needed.
1115//
1116// If we encounter some IR we don't yet handle, or something obviously fine
1117// like a constant, then we just add the SCEV for that term to the list passed
1118// in by the caller. If we have a node that may potentially yield a valid
1119// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
1120// ourselves before adding to the list.
1121static void findForkedSCEVs(
1122 ScalarEvolution *SE, const Loop *L, Value *Ptr,
1123 SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
1124 unsigned Depth) {
1125 // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
1126 // we've exceeded our limit on recursion, just return whatever we have
1127 // regardless of whether it can be used for a forked pointer or not, along
1128 // with an indication of whether it might be a poison or undef value.
1129 const SCEV *Scev = SE->getSCEV(V: Ptr);
1130 if (isa<SCEVAddRecExpr>(Val: Scev) || L->isLoopInvariant(V: Ptr) ||
1131 !isa<Instruction>(Val: Ptr) || Depth == 0) {
1132 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1133 return;
1134 }
1135
1136 Depth--;
1137
1138 auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
1139 return get<1>(Pair: S);
1140 };
1141
1142 auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
1143 switch (Opcode) {
1144 case Instruction::Add:
1145 return SE->getAddExpr(LHS: L, RHS: R);
1146 case Instruction::Sub:
1147 return SE->getMinusSCEV(LHS: L, RHS: R);
1148 default:
1149 llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
1150 }
1151 };
1152
1153 Instruction *I = cast<Instruction>(Val: Ptr);
1154 unsigned Opcode = I->getOpcode();
1155 switch (Opcode) {
1156 case Instruction::GetElementPtr: {
1157 auto *GEP = cast<GetElementPtrInst>(Val: I);
1158 Type *SourceTy = GEP->getSourceElementType();
1159 // We only handle base + single offset GEPs here for now.
1160 // Not dealing with preexisting gathers yet, so no vectors.
1161 if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
1162 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: GEP));
1163 break;
1164 }
1165 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
1166 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
1167 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: BaseScevs, Depth);
1168 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: OffsetScevs, Depth);
1169
1170 // See if we need to freeze our fork...
1171 bool NeedsFreeze = any_of(Range&: BaseScevs, P: UndefPoisonCheck) ||
1172 any_of(Range&: OffsetScevs, P: UndefPoisonCheck);
1173
1174 // Check that we only have a single fork, on either the base or the offset.
1175 // Copy the SCEV across for the one without a fork in order to generate
1176 // the full SCEV for both sides of the GEP.
1177 if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
1178 BaseScevs.push_back(Elt: BaseScevs[0]);
1179 else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
1180 OffsetScevs.push_back(Elt: OffsetScevs[0]);
1181 else {
1182 ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze);
1183 break;
1184 }
1185
1186 Type *IntPtrTy = SE->getEffectiveSCEVType(Ty: GEP->getPointerOperandType());
1187
1188 // Find the size of the type being pointed to. We only have a single
1189 // index term (guarded above) so we don't need to index into arrays or
1190 // structures, just get the size of the scalar value.
1191 const SCEV *Size = SE->getSizeOfExpr(IntTy: IntPtrTy, AllocTy: SourceTy);
1192
1193 for (auto [B, O] : zip(t&: BaseScevs, u&: OffsetScevs)) {
1194 const SCEV *Base = get<0>(Pair: B);
1195 const SCEV *Offset = get<0>(Pair: O);
1196
1197 // Scale up the offsets by the size of the type, then add to the bases.
1198 const SCEV *Scaled =
1199 SE->getMulExpr(LHS: Size, RHS: SE->getTruncateOrSignExtend(V: Offset, Ty: IntPtrTy));
1200 ScevList.emplace_back(Args: SE->getAddExpr(LHS: Base, RHS: Scaled), Args&: NeedsFreeze);
1201 }
1202 break;
1203 }
1204 case Instruction::Select: {
1205 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
1206 // A select means we've found a forked pointer, but we currently only
1207 // support a single select per pointer so if there's another behind this
1208 // then we just bail out and return the generic SCEV.
1209 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth);
1210 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 2), ScevList&: ChildScevs, Depth);
1211 if (ChildScevs.size() == 2)
1212 append_range(C&: ScevList, R&: ChildScevs);
1213 else
1214 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1215 break;
1216 }
1217 case Instruction::PHI: {
1218 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
1219 // A phi means we've found a forked pointer, but we currently only
1220 // support a single phi per pointer so if there's another behind this
1221 // then we just bail out and return the generic SCEV.
1222 if (I->getNumOperands() == 2) {
1223 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: ChildScevs, Depth);
1224 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth);
1225 }
1226 if (ChildScevs.size() == 2)
1227 append_range(C&: ScevList, R&: ChildScevs);
1228 else
1229 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1230 break;
1231 }
1232 case Instruction::Add:
1233 case Instruction::Sub: {
1234 SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs;
1235 SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs;
1236 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: LScevs, Depth);
1237 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: RScevs, Depth);
1238
1239 // See if we need to freeze our fork...
1240 bool NeedsFreeze =
1241 any_of(Range&: LScevs, P: UndefPoisonCheck) || any_of(Range&: RScevs, P: UndefPoisonCheck);
1242
1243 // Check that we only have a single fork, on either the left or right side.
1244 // Copy the SCEV across for the one without a fork in order to generate
1245 // the full SCEV for both sides of the BinOp.
1246 if (LScevs.size() == 2 && RScevs.size() == 1)
1247 RScevs.push_back(Elt: RScevs[0]);
1248 else if (RScevs.size() == 2 && LScevs.size() == 1)
1249 LScevs.push_back(Elt: LScevs[0]);
1250 else {
1251 ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze);
1252 break;
1253 }
1254
1255 for (auto [L, R] : zip(t&: LScevs, u&: RScevs))
1256 ScevList.emplace_back(Args: GetBinOpExpr(Opcode, get<0>(Pair: L), get<0>(Pair: R)),
1257 Args&: NeedsFreeze);
1258 break;
1259 }
1260 default:
1261 // Just return the current SCEV if we haven't handled the instruction yet.
1262 LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
1263 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1264 break;
1265 }
1266}
1267
1268bool AccessAnalysis::createCheckForAccess(
1269 RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy,
1270 const DenseMap<Value *, const SCEV *> &StridesMap,
1271 DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop,
1272 unsigned &RunningDepId, unsigned ASId, bool Assume) {
1273 Value *Ptr = Access.getPointer();
1274 ScalarEvolution *SE = PSE.getSE();
1275 assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
1276
1277 SmallVector<PointerIntPair<const SCEV *, 1, bool>> RTCheckPtrs;
1278 findForkedSCEVs(SE, L: TheLoop, Ptr, ScevList&: RTCheckPtrs, Depth: MaxForkedSCEVDepth);
1279 assert(!RTCheckPtrs.empty() &&
1280 "Must have some runtime-check pointer candidates");
1281
1282 // RTCheckPtrs must have size 2 if there are forked pointers. Otherwise, there
1283 // are no forked pointers; replaceSymbolicStridesSCEV in this case.
1284 auto IsLoopInvariantOrAR =
1285 [&SE, &TheLoop](const PointerIntPair<const SCEV *, 1, bool> &P) {
1286 return SE->isLoopInvariant(S: P.getPointer(), L: TheLoop) ||
1287 isa<SCEVAddRecExpr>(Val: P.getPointer());
1288 };
1289 if (RTCheckPtrs.size() == 2 && all_of(Range&: RTCheckPtrs, P: IsLoopInvariantOrAR)) {
1290 LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n";
1291 for (const auto &[Idx, Q] : enumerate(RTCheckPtrs)) dbgs()
1292 << "\t(" << Idx << ") " << *Q.getPointer() << "\n");
1293 } else {
1294 RTCheckPtrs = {{replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr), false}};
1295 }
1296
1297 /// Check whether all pointers can participate in a runtime bounds check. They
1298 /// must either be invariant or non-wrapping affine AddRecs.
1299 for (auto &P : RTCheckPtrs) {
1300 // The bounds for loop-invariant pointer is trivial.
1301 if (SE->isLoopInvariant(S: P.getPointer(), L: TheLoop))
1302 continue;
1303
1304 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: P.getPointer());
1305 if (!AR && Assume)
1306 AR = PSE.getAsAddRec(V: Ptr);
1307 if (!AR || !AR->isAffine())
1308 return false;
1309
1310 // If there's only one option for Ptr, look it up after bounds and wrap
1311 // checking, because assumptions might have been added to PSE.
1312 if (RTCheckPtrs.size() == 1) {
1313 AR =
1314 cast<SCEVAddRecExpr>(Val: replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr));
1315 P.setPointer(AR);
1316 }
1317
1318 if (!isNoWrap(PSE, AR, Ptr: RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
1319 L: TheLoop, Assume, DT))
1320 return false;
1321 }
1322
1323 for (const auto &[PtrExpr, NeedsFreeze] : RTCheckPtrs) {
1324 // The id of the dependence set.
1325 unsigned DepId;
1326
1327 if (DepCands.contains(V: Access)) {
1328 Value *Leader = DepCands.getLeaderValue(V: Access).getPointer();
1329 unsigned &LeaderId = DepSetId[Leader];
1330 if (!LeaderId)
1331 LeaderId = RunningDepId++;
1332 DepId = LeaderId;
1333 } else
1334 // Each access has its own dependence set.
1335 DepId = RunningDepId++;
1336
1337 bool IsWrite = Access.getInt();
1338 RtCheck.insert(Lp: TheLoop, Ptr, PtrExpr, AccessTy, WritePtr: IsWrite, DepSetId: DepId, ASId, PSE,
1339 NeedsFreeze);
1340 LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
1341 }
1342
1343 return true;
1344}
1345
1346bool AccessAnalysis::canCheckPtrAtRT(
1347 RuntimePointerChecking &RtCheck, Loop *TheLoop,
1348 const DenseMap<Value *, const SCEV *> &StridesMap, Value *&UncomputablePtr,
1349 bool AllowPartial, const MemoryDepChecker &DepChecker) {
1350 // Find pointers with computable bounds. We are going to use this information
1351 // to place a runtime bound check.
1352 bool CanDoRT = true;
1353
1354 bool MayNeedRTCheck = false;
1355 if (!IsRTCheckAnalysisNeeded) return true;
1356
1357 if (auto *Deps = DepChecker.getDependences()) {
1358 // If there are unknown dependences, this means runtime checks are needed to
1359 // ensure there's no overlap between accesses to the same underlying object.
1360 // Remove the equivalence classes containing both source and destination
1361 // accesses from DepCands. This ensures runtime checks will be generated
1362 // between those accesses and prevents them from being grouped together.
1363 for (const auto &Dep : *Deps) {
1364 if (Dep.Type != MemoryDepChecker::Dependence::Unknown) {
1365 assert(MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) ==
1366 MemoryDepChecker::VectorizationSafetyStatus::Safe &&
1367 "Should only skip safe dependences");
1368 continue;
1369 }
1370 Instruction *Src = Dep.getSource(DepChecker);
1371 Instruction *Dst = Dep.getDestination(DepChecker);
1372 DepCands.eraseClass(V: {getPointerOperand(V: Src), Src->mayWriteToMemory()});
1373 DepCands.eraseClass(V: {getPointerOperand(V: Dst), Dst->mayWriteToMemory()});
1374 }
1375 } else {
1376 CheckDeps.clear();
1377 DepCands = {};
1378 }
1379
1380 // We assign a consecutive id to access from different alias sets.
1381 // Accesses between different groups doesn't need to be checked.
1382 unsigned ASId = 0;
1383 for (const auto &AS : AST) {
1384 int NumReadPtrChecks = 0;
1385 int NumWritePtrChecks = 0;
1386 bool CanDoAliasSetRT = true;
1387 ++ASId;
1388 auto ASPointers = AS.getPointers();
1389
1390 // We assign consecutive id to access from different dependence sets.
1391 // Accesses within the same set don't need a runtime check.
1392 unsigned RunningDepId = 1;
1393 DenseMap<Value *, unsigned> DepSetId;
1394
1395 SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;
1396
1397 // First, count how many write and read accesses are in the alias set. Also
1398 // collect MemAccessInfos for later.
1399 SmallVector<MemAccessInfo, 4> AccessInfos;
1400 for (const Value *ConstPtr : ASPointers) {
1401 Value *Ptr = const_cast<Value *>(ConstPtr);
1402 bool IsWrite = Accesses.contains(Key: MemAccessInfo(Ptr, true));
1403 if (IsWrite)
1404 ++NumWritePtrChecks;
1405 else
1406 ++NumReadPtrChecks;
1407 AccessInfos.emplace_back(Args&: Ptr, Args&: IsWrite);
1408 }
1409
1410 // We do not need runtime checks for this alias set, if there are no writes
1411 // or a single write and no reads.
1412 if (NumWritePtrChecks == 0 ||
1413 (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
1414 assert((ASPointers.size() <= 1 ||
1415 all_of(ASPointers,
1416 [this](const Value *Ptr) {
1417 MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
1418 true);
1419 return !DepCands.contains(AccessWrite);
1420 })) &&
1421 "Can only skip updating CanDoRT below, if all entries in AS "
1422 "are reads or there is at most 1 entry");
1423 continue;
1424 }
1425
1426 for (auto &Access : AccessInfos) {
1427 for (const auto &AccessTy : Accesses[Access]) {
1428 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1429 DepSetId, TheLoop, RunningDepId, ASId,
1430 Assume: false)) {
1431 LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
1432 << *Access.getPointer() << '\n');
1433 Retries.emplace_back(Args&: Access, Args: AccessTy);
1434 CanDoAliasSetRT = false;
1435 }
1436 }
1437 }
1438
1439 // Note that this function computes CanDoRT and MayNeedRTCheck
1440 // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
1441 // we have a pointer for which we couldn't find the bounds but we don't
1442 // actually need to emit any checks so it does not matter.
1443 //
1444 // We need runtime checks for this alias set, if there are at least 2
1445 // dependence sets (in which case RunningDepId > 2) or if we need to re-try
1446 // any bound checks (because in that case the number of dependence sets is
1447 // incomplete).
1448 bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
1449
1450 // We need to perform run-time alias checks, but some pointers had bounds
1451 // that couldn't be checked.
1452 if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
1453 // Reset the CanDoSetRt flag and retry all accesses that have failed.
1454 // We know that we need these checks, so we can now be more aggressive
1455 // and add further checks if required (overflow checks).
1456 CanDoAliasSetRT = true;
1457 for (const auto &[Access, AccessTy] : Retries) {
1458 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1459 DepSetId, TheLoop, RunningDepId, ASId,
1460 /*Assume=*/true)) {
1461 CanDoAliasSetRT = false;
1462 UncomputablePtr = Access.getPointer();
1463 if (!AllowPartial)
1464 break;
1465 }
1466 }
1467 }
1468
1469 CanDoRT &= CanDoAliasSetRT;
1470 MayNeedRTCheck |= NeedsAliasSetRTCheck;
1471 ++ASId;
1472 }
1473
1474 // If the pointers that we would use for the bounds comparison have different
1475 // address spaces, assume the values aren't directly comparable, so we can't
1476 // use them for the runtime check. We also have to assume they could
1477 // overlap. In the future there should be metadata for whether address spaces
1478 // are disjoint.
1479 unsigned NumPointers = RtCheck.Pointers.size();
1480 for (unsigned i = 0; i < NumPointers; ++i) {
1481 for (unsigned j = i + 1; j < NumPointers; ++j) {
1482 // Only need to check pointers between two different dependency sets.
1483 if (RtCheck.Pointers[i].DependencySetId ==
1484 RtCheck.Pointers[j].DependencySetId)
1485 continue;
1486 // Only need to check pointers in the same alias set.
1487 if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
1488 continue;
1489
1490 Value *PtrI = RtCheck.Pointers[i].PointerValue;
1491 Value *PtrJ = RtCheck.Pointers[j].PointerValue;
1492
1493 unsigned ASi = PtrI->getType()->getPointerAddressSpace();
1494 unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
1495 if (ASi != ASj) {
1496 LLVM_DEBUG(
1497 dbgs() << "LAA: Runtime check would require comparison between"
1498 " different address spaces\n");
1499 return false;
1500 }
1501 }
1502 }
1503
1504 if (MayNeedRTCheck && (CanDoRT || AllowPartial))
1505 RtCheck.generateChecks(DepCands);
1506
1507 LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
1508 << " pointer comparisons.\n");
1509
1510 // If we can do run-time checks, but there are no checks, no runtime checks
1511 // are needed. This can happen when all pointers point to the same underlying
1512 // object for example.
1513 RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
1514
1515 bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
1516 assert(CanDoRTIfNeeded == (CanDoRT || !MayNeedRTCheck) &&
1517 "CanDoRTIfNeeded depends on RtCheck.Need");
1518 if (!CanDoRTIfNeeded && !AllowPartial)
1519 RtCheck.reset();
1520 return CanDoRTIfNeeded;
1521}
1522
1523void AccessAnalysis::buildDependenceSets() {
1524 // We process the set twice: first we process read-write pointers, last we
1525 // process read-only pointers. This allows us to skip dependence tests for
1526 // read-only pointers.
1527
1528 LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
1529 LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
1530 LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
1531 LLVM_DEBUG({
1532 for (const auto &[A, _] : Accesses)
1533 dbgs() << "\t" << *A.getPointer() << " ("
1534 << (A.getInt()
1535 ? "write"
1536 : (ReadOnlyPtr.contains(A.getPointer()) ? "read-only"
1537 : "read"))
1538 << ")\n";
1539 });
1540
1541 // The AliasSetTracker has nicely partitioned our pointers by metadata
1542 // compatibility and potential for underlying-object overlap. As a result, we
1543 // only need to check for potential pointer dependencies within each alias
1544 // set.
1545 for (const auto &AS : AST) {
1546 bool AliasSetHasWrite = false;
1547
1548 // Map of (pointer to underlying objects, accessed address space) to last
1549 // access encountered.
1550 using UnderlyingObjToAccessMap =
1551 DenseMap<std::pair<const Value *, unsigned>, MemAccessInfo>;
1552 UnderlyingObjToAccessMap ObjToLastAccess;
1553
1554 // Set of access to check after all writes have been processed.
1555 PtrAccessMap DeferredAccesses;
1556
1557 // Iterate over each alias set twice, once to process read/write pointers,
1558 // and then to process read-only pointers.
1559
1560 auto ProcessAccesses = [&](bool UseDeferred) {
1561 PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
1562
1563 // Note that both the alias-set tracker and the alias sets themselves used
1564 // ordered collections internally and so the iteration order here is
1565 // deterministic.
1566 for (const Value *ConstPtr : AS.getPointers()) {
1567 Value *Ptr = const_cast<Value *>(ConstPtr);
1568
1569 // For a single memory access in AliasSetTracker, Accesses may contain
1570 // both read and write, and they both need to be handled for CheckDeps.
1571 for (auto [AccessPtr, IsWrite] : S.keys()) {
1572 if (AccessPtr != Ptr)
1573 continue;
1574
1575 // If we're using the deferred access set, then it contains only
1576 // reads.
1577 bool IsReadOnlyPtr = ReadOnlyPtr.contains(Ptr) && !IsWrite;
1578 if (UseDeferred && !IsReadOnlyPtr)
1579 continue;
1580 // Otherwise, the pointer must be in the PtrAccessSet, either as a
1581 // read or a write.
1582 assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
1583 S.contains(MemAccessInfo(Ptr, false))) &&
1584 "Alias-set pointer not in the access set?");
1585
1586 MemAccessInfo Access(Ptr, IsWrite);
1587 DepCands.insert(Data: Access);
1588
1589 // Memorize read-only pointers for later processing and skip them in
1590 // the first round (they need to be checked after we have seen all
1591 // write pointers). Note: we also mark pointer that are not
1592 // consecutive as "read-only" pointers (so that we check
1593 // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1594 if (!UseDeferred && IsReadOnlyPtr) {
1595 // We only use the pointer keys, the types vector values don't
1596 // matter.
1597 DeferredAccesses.insert(KV: {Access, {}});
1598 continue;
1599 }
1600
1601 // If this is a write - check other reads and writes for conflicts. If
1602 // this is a read only check other writes for conflicts (but only if
1603 // there is no other write to the ptr - this is an optimization to
1604 // catch "a[i] = a[i] + " without having to do a dependence check).
1605 if ((IsWrite || IsReadOnlyPtr) && AliasSetHasWrite) {
1606 CheckDeps.push_back(Elt: Access);
1607 IsRTCheckAnalysisNeeded = true;
1608 }
1609
1610 if (IsWrite)
1611 AliasSetHasWrite = true;
1612
1613 // Create sets of pointers connected by a shared alias set and
1614 // underlying object.
1615 SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
1616 UOs = {};
1617 ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI);
1618 LLVM_DEBUG(dbgs()
1619 << "Underlying objects for pointer " << *Ptr << "\n");
1620 for (const Value *UnderlyingObj : UOs) {
1621 // nullptr never alias, don't join sets for pointer that have "null"
1622 // in their UnderlyingObjects list.
1623 if (isa<ConstantPointerNull>(Val: UnderlyingObj) &&
1624 !NullPointerIsDefined(
1625 F: TheLoop->getHeader()->getParent(),
1626 AS: UnderlyingObj->getType()->getPointerAddressSpace()))
1627 continue;
1628
1629 auto [It, Inserted] = ObjToLastAccess.try_emplace(
1630 Key: {UnderlyingObj,
1631 cast<PointerType>(Val: Ptr->getType())->getAddressSpace()},
1632 Args&: Access);
1633 if (!Inserted) {
1634 DepCands.unionSets(V1: Access, V2: It->second);
1635 It->second = Access;
1636 }
1637
1638 LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
1639 }
1640 }
1641 }
1642 };
1643
1644 ProcessAccesses(false);
1645 ProcessAccesses(true);
1646 }
1647}
1648
1649/// Check whether the access through \p Ptr has a constant stride.
1650std::optional<int64_t>
1651llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
1652 const Loop *Lp, const DominatorTree &DT,
1653 const DenseMap<Value *, const SCEV *> &StridesMap,
1654 bool Assume, bool ShouldCheckWrap) {
1655 const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr);
1656 if (PSE.getSE()->isLoopInvariant(S: PtrScev, L: Lp))
1657 return 0;
1658
1659 assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
1660
1661 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev);
1662 if (Assume && !AR)
1663 AR = PSE.getAsAddRec(V: Ptr);
1664
1665 if (!AR) {
1666 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
1667 << " SCEV: " << *PtrScev << "\n");
1668 return std::nullopt;
1669 }
1670
1671 std::optional<int64_t> Stride =
1672 getStrideFromAddRec(AR, Lp, AccessTy, Ptr, PSE);
1673 if (!ShouldCheckWrap || !Stride)
1674 return Stride;
1675
1676 if (isNoWrap(PSE, AR, Ptr, AccessTy, L: Lp, Assume, DT, Stride))
1677 return Stride;
1678
1679 LLVM_DEBUG(
1680 dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
1681 << *Ptr << " SCEV: " << *AR << "\n");
1682 return std::nullopt;
1683}
1684
1685std::optional<int64_t> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
1686 Type *ElemTyB, Value *PtrB,
1687 const DataLayout &DL,
1688 ScalarEvolution &SE,
1689 bool StrictCheck, bool CheckType) {
1690 assert(PtrA && PtrB && "Expected non-nullptr pointers.");
1691
1692 // Make sure that A and B are different pointers.
1693 if (PtrA == PtrB)
1694 return 0;
1695
1696 // Make sure that the element types are the same if required.
1697 if (CheckType && ElemTyA != ElemTyB)
1698 return std::nullopt;
1699
1700 unsigned ASA = PtrA->getType()->getPointerAddressSpace();
1701 unsigned ASB = PtrB->getType()->getPointerAddressSpace();
1702
1703 // Check that the address spaces match.
1704 if (ASA != ASB)
1705 return std::nullopt;
1706 unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA);
1707
1708 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
1709 const Value *PtrA1 = PtrA->stripAndAccumulateConstantOffsets(
1710 DL, Offset&: OffsetA, /*AllowNonInbounds=*/true);
1711 const Value *PtrB1 = PtrB->stripAndAccumulateConstantOffsets(
1712 DL, Offset&: OffsetB, /*AllowNonInbounds=*/true);
1713
1714 std::optional<int64_t> Val;
1715 if (PtrA1 == PtrB1) {
1716 // Retrieve the address space again as pointer stripping now tracks through
1717 // `addrspacecast`.
1718 ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace();
1719 ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace();
1720 // Check that the address spaces match and that the pointers are valid.
1721 if (ASA != ASB)
1722 return std::nullopt;
1723
1724 IdxWidth = DL.getIndexSizeInBits(AS: ASA);
1725 OffsetA = OffsetA.sextOrTrunc(width: IdxWidth);
1726 OffsetB = OffsetB.sextOrTrunc(width: IdxWidth);
1727
1728 OffsetB -= OffsetA;
1729 Val = OffsetB.trySExtValue();
1730 } else {
1731 // Otherwise compute the distance with SCEV between the base pointers.
1732 const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA);
1733 const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB);
1734 std::optional<APInt> Diff =
1735 SE.computeConstantDifference(LHS: PtrSCEVB, RHS: PtrSCEVA);
1736 if (!Diff)
1737 return std::nullopt;
1738 Val = Diff->trySExtValue();
1739 }
1740
1741 if (!Val)
1742 return std::nullopt;
1743
1744 int64_t Size = DL.getTypeStoreSize(Ty: ElemTyA);
1745 int64_t Dist = *Val / Size;
1746
1747 // Ensure that the calculated distance matches the type-based one after all
1748 // the bitcasts removal in the provided pointers.
1749 if (!StrictCheck || Dist * Size == Val)
1750 return Dist;
1751 return std::nullopt;
1752}
1753
1754bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
1755 const DataLayout &DL, ScalarEvolution &SE,
1756 SmallVectorImpl<unsigned> &SortedIndices) {
1757 assert(llvm::all_of(
1758 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1759 "Expected list of pointer operands.");
1760 // Walk over the pointers, and map each of them to an offset relative to
1761 // first pointer in the array.
1762 Value *Ptr0 = VL[0];
1763
1764 using DistOrdPair = std::pair<int64_t, unsigned>;
1765 auto Compare = llvm::less_first();
1766 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
1767 Offsets.emplace(args: 0, args: 0);
1768 bool IsConsecutive = true;
1769 for (auto [Idx, Ptr] : drop_begin(RangeOrContainer: enumerate(First&: VL))) {
1770 std::optional<int64_t> Diff =
1771 getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE,
1772 /*StrictCheck=*/true);
1773 if (!Diff)
1774 return false;
1775
1776 // Check if the pointer with the same offset is found.
1777 int64_t Offset = *Diff;
1778 auto [It, IsInserted] = Offsets.emplace(args&: Offset, args&: Idx);
1779 if (!IsInserted)
1780 return false;
1781 // Consecutive order if the inserted element is the last one.
1782 IsConsecutive &= std::next(x: It) == Offsets.end();
1783 }
1784 SortedIndices.clear();
1785 if (!IsConsecutive) {
1786 // Fill SortedIndices array only if it is non-consecutive.
1787 SortedIndices.resize(N: VL.size());
1788 for (auto [Idx, Off] : enumerate(First&: Offsets))
1789 SortedIndices[Idx] = Off.second;
1790 }
1791 return true;
1792}
1793
1794/// Returns true if the memory operations \p A and \p B are consecutive.
1795bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
1796 ScalarEvolution &SE, bool CheckType) {
1797 Value *PtrA = getLoadStorePointerOperand(V: A);
1798 Value *PtrB = getLoadStorePointerOperand(V: B);
1799 if (!PtrA || !PtrB)
1800 return false;
1801 Type *ElemTyA = getLoadStoreType(I: A);
1802 Type *ElemTyB = getLoadStoreType(I: B);
1803 std::optional<int64_t> Diff =
1804 getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
1805 /*StrictCheck=*/true, CheckType);
1806 return Diff == 1;
1807}
1808
1809void MemoryDepChecker::addAccess(StoreInst *SI) {
1810 visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop,
1811 AddPointer: [this, SI](Value *Ptr) {
1812 Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx);
1813 InstMap.push_back(Elt: SI);
1814 ++AccessIdx;
1815 });
1816}
1817
1818void MemoryDepChecker::addAccess(LoadInst *LI) {
1819 visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop,
1820 AddPointer: [this, LI](Value *Ptr) {
1821 Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx);
1822 InstMap.push_back(Elt: LI);
1823 ++AccessIdx;
1824 });
1825}
1826
1827MemoryDepChecker::VectorizationSafetyStatus
1828MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
1829 switch (Type) {
1830 case NoDep:
1831 case Forward:
1832 case BackwardVectorizable:
1833 return VectorizationSafetyStatus::Safe;
1834
1835 case Unknown:
1836 return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
1837 case ForwardButPreventsForwarding:
1838 case Backward:
1839 case BackwardVectorizableButPreventsForwarding:
1840 case IndirectUnsafe:
1841 case InvariantUnsafe:
1842 return VectorizationSafetyStatus::Unsafe;
1843 }
1844 llvm_unreachable("unexpected DepType!");
1845}
1846
1847bool MemoryDepChecker::Dependence::isBackward() const {
1848 switch (Type) {
1849 case NoDep:
1850 case Forward:
1851 case ForwardButPreventsForwarding:
1852 case Unknown:
1853 case IndirectUnsafe:
1854 case InvariantUnsafe:
1855 return false;
1856
1857 case BackwardVectorizable:
1858 case Backward:
1859 case BackwardVectorizableButPreventsForwarding:
1860 return true;
1861 }
1862 llvm_unreachable("unexpected DepType!");
1863}
1864
1865bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
1866 return isBackward() || Type == Unknown || Type == IndirectUnsafe ||
1867 Type == InvariantUnsafe;
1868}
1869
1870bool MemoryDepChecker::Dependence::isForward() const {
1871 switch (Type) {
1872 case Forward:
1873 case ForwardButPreventsForwarding:
1874 return true;
1875
1876 case NoDep:
1877 case Unknown:
1878 case BackwardVectorizable:
1879 case Backward:
1880 case BackwardVectorizableButPreventsForwarding:
1881 case IndirectUnsafe:
1882 case InvariantUnsafe:
1883 return false;
1884 }
1885 llvm_unreachable("unexpected DepType!");
1886}
1887
1888bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
1889 uint64_t TypeByteSize,
1890 unsigned CommonStride) {
1891 // If loads occur at a distance that is not a multiple of a feasible vector
1892 // factor store-load forwarding does not take place.
1893 // Positive dependences might cause troubles because vectorizing them might
1894 // prevent store-load forwarding making vectorized code run a lot slower.
1895 // a[i] = a[i-3] ^ a[i-8];
1896 // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
1897 // hence on your typical architecture store-load forwarding does not take
1898 // place. Vectorizing in such cases does not make sense.
1899 // Store-load forwarding distance.
1900
1901 // After this many iterations store-to-load forwarding conflicts should not
1902 // cause any slowdowns.
1903 const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
1904 // Maximum vector factor.
1905 uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
1906 std::min(a: VectorizerParams::MaxVectorWidth * TypeByteSize,
1907 b: MaxStoreLoadForwardSafeDistanceInBits);
1908
1909 // Compute the smallest VF at which the store and load would be misaligned.
1910 for (uint64_t VF = 2 * TypeByteSize;
1911 VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
1912 // If the number of vector iteration between the store and the load are
1913 // small we could incur conflicts.
1914 if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
1915 MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
1916 break;
1917 }
1918 }
1919
1920 if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) {
1921 LLVM_DEBUG(
1922 dbgs() << "LAA: Distance " << Distance
1923 << " that could cause a store-load forwarding conflict\n");
1924 return true;
1925 }
1926
1927 if (CommonStride &&
1928 MaxVFWithoutSLForwardIssuesPowerOf2 <
1929 MaxStoreLoadForwardSafeDistanceInBits &&
1930 MaxVFWithoutSLForwardIssuesPowerOf2 !=
1931 VectorizerParams::MaxVectorWidth * TypeByteSize) {
1932 uint64_t MaxVF =
1933 bit_floor(Value: MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride);
1934 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
1935 MaxStoreLoadForwardSafeDistanceInBits =
1936 std::min(a: MaxStoreLoadForwardSafeDistanceInBits, b: MaxVFInBits);
1937 }
1938 return false;
1939}
1940
1941void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1942 if (Status < S)
1943 Status = S;
1944}
1945
1946/// Given a dependence-distance \p Dist between two memory accesses, that have
1947/// strides in the same direction whose absolute value of the maximum stride is
1948/// given in \p MaxStride, in a loop whose maximum backedge taken count is \p
1949/// MaxBTC, check if it is possible to prove statically that the dependence
1950/// distance is larger than the range that the accesses will travel through the
1951/// execution of the loop. If so, return true; false otherwise. This is useful
1952/// for example in loops such as the following (PR31098):
1953///
1954/// for (i = 0; i < D; ++i) {
1955/// = out[i];
1956/// out[i+D] =
1957/// }
1958static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
1959 const SCEV &MaxBTC, const SCEV &Dist,
1960 uint64_t MaxStride) {
1961
1962 // If we can prove that
1963 // (**) |Dist| > MaxBTC * Step
1964 // where Step is the absolute stride of the memory accesses in bytes,
1965 // then there is no dependence.
1966 //
1967 // Rationale:
1968 // We basically want to check if the absolute distance (|Dist/Step|)
1969 // is >= the loop iteration count (or > MaxBTC).
1970 // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
1971 // Section 4.2.1); Note, that for vectorization it is sufficient to prove
1972 // that the dependence distance is >= VF; This is checked elsewhere.
1973 // But in some cases we can prune dependence distances early, and
1974 // even before selecting the VF, and without a runtime test, by comparing
1975 // the distance against the loop iteration count. Since the vectorized code
1976 // will be executed only if LoopCount >= VF, proving distance >= LoopCount
1977 // also guarantees that distance >= VF.
1978 //
1979 const SCEV *Step = SE.getConstant(Ty: MaxBTC.getType(), V: MaxStride);
1980 const SCEV *Product = SE.getMulExpr(LHS: &MaxBTC, RHS: Step);
1981
1982 const SCEV *CastedDist = &Dist;
1983 const SCEV *CastedProduct = Product;
1984 uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType());
1985 uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType());
1986
1987 // The dependence distance can be positive/negative, so we sign extend Dist;
1988 // The multiplication of the absolute stride in bytes and the
1989 // backedgeTakenCount is non-negative, so we zero extend Product.
1990 if (DistTypeSizeBits > ProductTypeSizeBits)
1991 CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType());
1992 else
1993 CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType());
1994
1995 // Is Dist - (MaxBTC * Step) > 0 ?
1996 // (If so, then we have proven (**) because |Dist| >= Dist)
1997 const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct);
1998 if (SE.isKnownPositive(S: Minus))
1999 return true;
2000
2001 // Second try: Is -Dist - (MaxBTC * Step) > 0 ?
2002 // (If so, then we have proven (**) because |Dist| >= -1*Dist)
2003 const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist);
2004 Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct);
2005 return SE.isKnownPositive(S: Minus);
2006}
2007
2008/// Check the dependence for two accesses with the same stride \p Stride.
2009/// \p Distance is the positive distance in bytes, and \p TypeByteSize is type
2010/// size in bytes.
2011///
2012/// \returns true if they are independent.
2013static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
2014 uint64_t TypeByteSize) {
2015 assert(Stride > 1 && "The stride must be greater than 1");
2016 assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
2017 assert(Distance > 0 && "The distance must be non-zero");
2018
2019 // Skip if the distance is not multiple of type byte size.
2020 if (Distance % TypeByteSize)
2021 return false;
2022
2023 // No dependence if the distance is not multiple of the stride.
2024 // E.g.
2025 // for (i = 0; i < 1024 ; i += 4)
2026 // A[i+2] = A[i] + 1;
2027 //
2028 // Two accesses in memory (distance is 2, stride is 4):
2029 // | A[0] | | | | A[4] | | | |
2030 // | | | A[2] | | | | A[6] | |
2031 //
2032 // E.g.
2033 // for (i = 0; i < 1024 ; i += 3)
2034 // A[i+4] = A[i] + 1;
2035 //
2036 // Two accesses in memory (distance is 4, stride is 3):
2037 // | A[0] | | | A[3] | | | A[6] | | |
2038 // | | | | | A[4] | | | A[7] | |
2039 return Distance % Stride;
2040}
2041
2042bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src,
2043 Type *SrcTy,
2044 const SCEV *Sink,
2045 Type *SinkTy) {
2046 const SCEV *BTC = PSE.getBackedgeTakenCount();
2047 const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
2048 ScalarEvolution &SE = *PSE.getSE();
2049 const auto &[SrcStart_, SrcEnd_] =
2050 getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Src, AccessTy: SrcTy, BTC, MaxBTC: SymbolicMaxBTC,
2051 SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards);
2052 if (isa<SCEVCouldNotCompute>(Val: SrcStart_) || isa<SCEVCouldNotCompute>(Val: SrcEnd_))
2053 return false;
2054
2055 const auto &[SinkStart_, SinkEnd_] =
2056 getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Sink, AccessTy: SinkTy, BTC, MaxBTC: SymbolicMaxBTC,
2057 SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards);
2058 if (isa<SCEVCouldNotCompute>(Val: SinkStart_) ||
2059 isa<SCEVCouldNotCompute>(Val: SinkEnd_))
2060 return false;
2061
2062 if (!LoopGuards)
2063 LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE));
2064
2065 auto SrcEnd = SE.applyLoopGuards(Expr: SrcEnd_, Guards: *LoopGuards);
2066 auto SinkStart = SE.applyLoopGuards(Expr: SinkStart_, Guards: *LoopGuards);
2067 if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SrcEnd, RHS: SinkStart))
2068 return true;
2069
2070 auto SinkEnd = SE.applyLoopGuards(Expr: SinkEnd_, Guards: *LoopGuards);
2071 auto SrcStart = SE.applyLoopGuards(Expr: SrcStart_, Guards: *LoopGuards);
2072 return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SinkEnd, RHS: SrcStart);
2073}
2074
2075std::variant<MemoryDepChecker::Dependence::DepType,
2076 MemoryDepChecker::DepDistanceStrideAndSizeInfo>
2077MemoryDepChecker::getDependenceDistanceStrideAndSize(
2078 const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
2079 const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) {
2080 const auto &DL = InnermostLoop->getHeader()->getDataLayout();
2081 auto &SE = *PSE.getSE();
2082 const auto &[APtr, AIsWrite] = A;
2083 const auto &[BPtr, BIsWrite] = B;
2084
2085 // Two reads are independent.
2086 if (!AIsWrite && !BIsWrite)
2087 return MemoryDepChecker::Dependence::NoDep;
2088
2089 Type *ATy = getLoadStoreType(I: AInst);
2090 Type *BTy = getLoadStoreType(I: BInst);
2091
2092 // We cannot check pointers in different address spaces.
2093 if (APtr->getType()->getPointerAddressSpace() !=
2094 BPtr->getType()->getPointerAddressSpace())
2095 return MemoryDepChecker::Dependence::Unknown;
2096
2097 std::optional<int64_t> StrideAPtr = getPtrStride(
2098 PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true);
2099 std::optional<int64_t> StrideBPtr = getPtrStride(
2100 PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true);
2101
2102 const SCEV *Src = PSE.getSCEV(V: APtr);
2103 const SCEV *Sink = PSE.getSCEV(V: BPtr);
2104
2105 // If the induction step is negative we have to invert source and sink of the
2106 // dependence when measuring the distance between them. We should not swap
2107 // AIsWrite with BIsWrite, as their uses expect them in program order.
2108 if (StrideAPtr && *StrideAPtr < 0) {
2109 std::swap(a&: Src, b&: Sink);
2110 std::swap(a&: AInst, b&: BInst);
2111 std::swap(a&: ATy, b&: BTy);
2112 std::swap(lhs&: StrideAPtr, rhs&: StrideBPtr);
2113 }
2114
2115 const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src);
2116
2117 LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
2118 << "\n");
2119 LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
2120 << ": " << *Dist << "\n");
2121
2122 // Need accesses with constant strides and the same direction for further
2123 // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and
2124 // similar code or pointer arithmetic that could wrap in the address space.
2125
2126 // If either Src or Sink are not strided (i.e. not a non-wrapping AddRec) and
2127 // not loop-invariant (stride will be 0 in that case), we cannot analyze the
2128 // dependence further and also cannot generate runtime checks.
2129 if (!StrideAPtr || !StrideBPtr) {
2130 LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
2131 return MemoryDepChecker::Dependence::IndirectUnsafe;
2132 }
2133
2134 int64_t StrideAPtrInt = *StrideAPtr;
2135 int64_t StrideBPtrInt = *StrideBPtr;
2136 LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
2137 << " Sink induction step: " << StrideBPtrInt << "\n");
2138 // At least Src or Sink are loop invariant and the other is strided or
2139 // invariant.
2140 if (!StrideAPtrInt || !StrideBPtrInt) {
2141 // If both are loop-invariant and access the same location, we cannot
2142 // vectorize.
2143 if (!StrideAPtrInt && !StrideBPtrInt && Dist->isZero())
2144 return MemoryDepChecker::Dependence::InvariantUnsafe;
2145 // Otherwise, we can generate a runtime check to disambiguate the accesses.
2146 return MemoryDepChecker::Dependence::Unknown;
2147 }
2148
2149 // Both Src and Sink have a constant stride, check if they are in the same
2150 // direction.
2151 if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) {
2152 LLVM_DEBUG(
2153 dbgs() << "Pointer access with strides in different directions\n");
2154 return MemoryDepChecker::Dependence::Unknown;
2155 }
2156
2157 TypeSize AStoreSz = DL.getTypeStoreSize(Ty: ATy);
2158 TypeSize BStoreSz = DL.getTypeStoreSize(Ty: BTy);
2159
2160 // If store sizes are not the same, set TypeByteSize to zero, so we can check
2161 // it in the caller isDependent.
2162 uint64_t ASz = DL.getTypeAllocSize(Ty: ATy);
2163 uint64_t BSz = DL.getTypeAllocSize(Ty: BTy);
2164 uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0;
2165
2166 uint64_t StrideAScaled = std::abs(i: StrideAPtrInt) * ASz;
2167 uint64_t StrideBScaled = std::abs(i: StrideBPtrInt) * BSz;
2168
2169 uint64_t MaxStride = std::max(a: StrideAScaled, b: StrideBScaled);
2170
2171 std::optional<uint64_t> CommonStride;
2172 if (StrideAScaled == StrideBScaled)
2173 CommonStride = StrideAScaled;
2174
2175 // TODO: Historically, we didn't retry with runtime checks when (unscaled)
2176 // strides were different but there is no inherent reason to.
2177 if (!isa<SCEVConstant>(Val: Dist))
2178 ShouldRetryWithRuntimeChecks |= StrideAPtrInt == StrideBPtrInt;
2179
2180 // If distance is a SCEVCouldNotCompute, return Unknown immediately.
2181 if (isa<SCEVCouldNotCompute>(Val: Dist)) {
2182 LLVM_DEBUG(dbgs() << "LAA: Uncomputable distance.\n");
2183 return Dependence::Unknown;
2184 }
2185
2186 return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
2187 TypeByteSize, AIsWrite, BIsWrite);
2188}
2189
2190MemoryDepChecker::Dependence::DepType
2191MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2192 const MemAccessInfo &B, unsigned BIdx) {
2193 assert(AIdx < BIdx && "Must pass arguments in program order");
2194
2195 // Check if we can prove that Sink only accesses memory after Src's end or
2196 // vice versa. The helper is used to perform the checks only on the exit paths
2197 // where it helps to improve the analysis result.
2198 auto CheckCompletelyBeforeOrAfter = [&]() {
2199 auto *APtr = A.getPointer();
2200 auto *BPtr = B.getPointer();
2201 Type *ATy = getLoadStoreType(I: InstMap[AIdx]);
2202 Type *BTy = getLoadStoreType(I: InstMap[BIdx]);
2203 const SCEV *Src = PSE.getSCEV(V: APtr);
2204 const SCEV *Sink = PSE.getSCEV(V: BPtr);
2205 return areAccessesCompletelyBeforeOrAfter(Src, SrcTy: ATy, Sink, SinkTy: BTy);
2206 };
2207
2208 // Get the dependence distance, stride, type size and what access writes for
2209 // the dependence between A and B.
2210 auto Res =
2211 getDependenceDistanceStrideAndSize(A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx]);
2212 if (std::holds_alternative<Dependence::DepType>(v: Res)) {
2213 if (std::get<Dependence::DepType>(v&: Res) == Dependence::Unknown &&
2214 CheckCompletelyBeforeOrAfter())
2215 return Dependence::NoDep;
2216 return std::get<Dependence::DepType>(v&: Res);
2217 }
2218
2219 auto &[Dist, MaxStride, CommonStride, TypeByteSize, AIsWrite, BIsWrite] =
2220 std::get<DepDistanceStrideAndSizeInfo>(v&: Res);
2221 bool HasSameSize = TypeByteSize > 0;
2222
2223 ScalarEvolution &SE = *PSE.getSE();
2224 auto &DL = InnermostLoop->getHeader()->getDataLayout();
2225
2226 // If the distance between the acecsses is larger than their maximum absolute
2227 // stride multiplied by the symbolic maximum backedge taken count (which is an
2228 // upper bound of the number of iterations), the accesses are independet, i.e.
2229 // they are far enough appart that accesses won't access the same location
2230 // across all loop ierations.
2231 if (HasSameSize &&
2232 isSafeDependenceDistance(
2233 DL, SE, MaxBTC: *(PSE.getSymbolicMaxBackedgeTakenCount()), Dist: *Dist, MaxStride))
2234 return Dependence::NoDep;
2235
2236 // The rest of this function relies on ConstDist being at most 64-bits, which
2237 // is checked earlier. Will assert if the calling code changes.
2238 const APInt *APDist = nullptr;
2239 uint64_t ConstDist =
2240 match(S: Dist, P: m_scev_APInt(C&: APDist)) ? APDist->abs().getZExtValue() : 0;
2241
2242 // Attempt to prove strided accesses independent.
2243 if (APDist) {
2244 // If the distance between accesses and their strides are known constants,
2245 // check whether the accesses interlace each other.
2246 if (ConstDist > 0 && CommonStride && CommonStride > 1 && HasSameSize &&
2247 areStridedAccessesIndependent(Distance: ConstDist, Stride: *CommonStride, TypeByteSize)) {
2248 LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
2249 return Dependence::NoDep;
2250 }
2251 } else {
2252 if (!LoopGuards)
2253 LoopGuards.emplace(
2254 args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE));
2255 Dist = SE.applyLoopGuards(Expr: Dist, Guards: *LoopGuards);
2256 }
2257
2258 // Negative distances are not plausible dependencies.
2259 if (SE.isKnownNonPositive(S: Dist)) {
2260 if (SE.isKnownNonNegative(S: Dist)) {
2261 if (HasSameSize) {
2262 // Write to the same location with the same size.
2263 return Dependence::Forward;
2264 }
2265 LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
2266 "different type sizes\n");
2267 return Dependence::Unknown;
2268 }
2269
2270 bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2271 // Check if the first access writes to a location that is read in a later
2272 // iteration, where the distance between them is not a multiple of a vector
2273 // factor and relatively small.
2274 //
2275 // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to
2276 // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a
2277 // forward dependency will allow vectorization using any width.
2278
2279 if (IsTrueDataDependence && EnableForwardingConflictDetection) {
2280 if (!ConstDist) {
2281 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2282 : Dependence::Unknown;
2283 }
2284 if (!HasSameSize ||
2285 couldPreventStoreLoadForward(Distance: ConstDist, TypeByteSize)) {
2286 LLVM_DEBUG(
2287 dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
2288 return Dependence::ForwardButPreventsForwarding;
2289 }
2290 }
2291
2292 LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
2293 return Dependence::Forward;
2294 }
2295
2296 int64_t MinDistance = SE.getSignedRangeMin(S: Dist).getSExtValue();
2297 // Below we only handle strictly positive distances.
2298 if (MinDistance <= 0) {
2299 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2300 : Dependence::Unknown;
2301 }
2302
2303 if (!HasSameSize) {
2304 if (CheckCompletelyBeforeOrAfter())
2305 return Dependence::NoDep;
2306 LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2307 "different type sizes\n");
2308 return Dependence::Unknown;
2309 }
2310 // Bail out early if passed-in parameters make vectorization not feasible.
2311 unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
2312 VectorizerParams::VectorizationFactor : 1);
2313 unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
2314 VectorizerParams::VectorizationInterleave : 1);
2315 // The minimum number of iterations for a vectorized/unrolled version.
2316 unsigned MinNumIter = std::max(a: ForcedFactor * ForcedUnroll, b: 2U);
2317
2318 // It's not vectorizable if the distance is smaller than the minimum distance
2319 // needed for a vectroized/unrolled version. Vectorizing one iteration in
2320 // front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
2321 // (No need to plus the last gap distance).
2322 //
2323 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2324 // foo(int *A) {
2325 // int *B = (int *)((char *)A + 14);
2326 // for (i = 0 ; i < 1024 ; i += 2)
2327 // B[i] = A[i] + 1;
2328 // }
2329 //
2330 // Two accesses in memory (stride is 4 * 2):
2331 // | A[0] | | A[2] | | A[4] | | A[6] | |
2332 // | B[0] | | B[2] | | B[4] |
2333 //
2334 // MinDistance needs for vectorizing iterations except the last iteration:
2335 // 4 * 2 * (MinNumIter - 1). MinDistance needs for the last iteration: 4.
2336 // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
2337 //
2338 // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
2339 // 12, which is less than distance.
2340 //
2341 // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
2342 // the minimum distance needed is 28, which is greater than distance. It is
2343 // not safe to do vectorization.
2344 //
2345 // We use MaxStride (maximum of src and sink strides) to get a conservative
2346 // lower bound on the MinDistanceNeeded in case of different strides.
2347
2348 // We know that Dist is positive, but it may not be constant. Use the signed
2349 // minimum for computations below, as this ensures we compute the closest
2350 // possible dependence distance.
2351 uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
2352 if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
2353 if (!ConstDist) {
2354 // For non-constant distances, we checked the lower bound of the
2355 // dependence distance and the distance may be larger at runtime (and safe
2356 // for vectorization). Classify it as Unknown, so we re-try with runtime
2357 // checks, unless we can prove both accesses cannot overlap.
2358 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2359 : Dependence::Unknown;
2360 }
2361 LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance "
2362 << MinDistance << '\n');
2363 return Dependence::Backward;
2364 }
2365
2366 // Unsafe if the minimum distance needed is greater than smallest dependence
2367 // distance distance.
2368 if (MinDistanceNeeded > MinDepDistBytes) {
2369 LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
2370 << MinDistanceNeeded << " size in bytes\n");
2371 return Dependence::Backward;
2372 }
2373
2374 MinDepDistBytes =
2375 std::min(a: static_cast<uint64_t>(MinDistance), b: MinDepDistBytes);
2376
2377 bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
2378 if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist &&
2379 couldPreventStoreLoadForward(Distance: MinDistance, TypeByteSize, CommonStride: *CommonStride))
2380 return Dependence::BackwardVectorizableButPreventsForwarding;
2381
2382 uint64_t MaxVF = MinDepDistBytes / MaxStride;
2383 LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
2384 << " with max VF = " << MaxVF << '\n');
2385
2386 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2387 if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) {
2388 // For non-constant distances, we checked the lower bound of the dependence
2389 // distance and the distance may be larger at runtime (and safe for
2390 // vectorization). Classify it as Unknown, so we re-try with runtime checks,
2391 // unless we can prove both accesses cannot overlap.
2392 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2393 : Dependence::Unknown;
2394 }
2395
2396 if (CheckCompletelyBeforeOrAfter())
2397 return Dependence::NoDep;
2398
2399 MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits);
2400 return Dependence::BackwardVectorizable;
2401}
2402
2403bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands,
2404 ArrayRef<MemAccessInfo> CheckDeps) {
2405
2406 MinDepDistBytes = -1;
2407 SmallPtrSet<MemAccessInfo, 8> Visited;
2408 for (MemAccessInfo CurAccess : CheckDeps) {
2409 if (Visited.contains(Ptr: CurAccess))
2410 continue;
2411
2412 // Check accesses within this set.
2413 EquivalenceClasses<MemAccessInfo>::member_iterator AI =
2414 DepCands.findLeader(V: CurAccess);
2415 EquivalenceClasses<MemAccessInfo>::member_iterator AE =
2416 DepCands.member_end();
2417
2418 // Check every access pair.
2419 while (AI != AE) {
2420 Visited.insert(Ptr: *AI);
2421 bool AIIsWrite = AI->getInt();
2422 // Reads from the same pointer don't create extra hazards, but multiple
2423 // stores do (WAW), so start from AI for writes and next(AI) for reads.
2424 EquivalenceClasses<MemAccessInfo>::member_iterator OI =
2425 (AIIsWrite ? AI : std::next(x: AI));
2426 while (OI != AE) {
2427 // Check every accessing instruction pair in program order.
2428 auto &Acc = Accesses[*AI];
2429 for (std::vector<unsigned>::iterator I1 = Acc.begin(), I1E = Acc.end();
2430 I1 != I1E; ++I1)
2431 // When checking for WAW (OI == AI) caused by multiple writes to the
2432 // same pointer, start I2 at the next access past I1 to avoid
2433 // self-comparison.
2434 for (std::vector<unsigned>::iterator
2435 I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()),
2436 I2E = (OI == AI ? I1E : Accesses[*OI].end());
2437 I2 != I2E; ++I2) {
2438 auto A = std::make_pair(x: &*AI, y&: *I1);
2439 auto B = std::make_pair(x: &*OI, y&: *I2);
2440
2441 assert(*I1 != *I2);
2442 if (*I1 > *I2)
2443 std::swap(x&: A, y&: B);
2444
2445 Dependence::DepType Type =
2446 isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second);
2447 mergeInStatus(S: Dependence::isSafeForVectorization(Type));
2448
2449 // Gather dependences unless we accumulated MaxDependences
2450 // dependences. In that case return as soon as we find the first
2451 // unsafe dependence. This puts a limit on this quadratic
2452 // algorithm.
2453 if (RecordDependences) {
2454 if (Type != Dependence::NoDep)
2455 Dependences.emplace_back(Args&: A.second, Args&: B.second, Args&: Type);
2456
2457 if (Dependences.size() >= MaxDependences) {
2458 RecordDependences = false;
2459 Dependences.clear();
2460 LLVM_DEBUG(dbgs()
2461 << "Too many dependences, stopped recording\n");
2462 }
2463 }
2464 if (!RecordDependences && !isSafeForVectorization())
2465 return false;
2466 }
2467 ++OI;
2468 }
2469 ++AI;
2470 }
2471 }
2472
2473 LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
2474 return isSafeForVectorization();
2475}
2476
2477SmallVector<Instruction *, 4>
2478MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const {
2479 MemAccessInfo Access(Ptr, IsWrite);
2480 auto I = Accesses.find(Val: Access);
2481 SmallVector<Instruction *, 4> Insts;
2482 if (I != Accesses.end()) {
2483 transform(Range: I->second, d_first: std::back_inserter(x&: Insts),
2484 F: [&](unsigned Idx) { return this->InstMap[Idx]; });
2485 }
2486
2487 return Insts;
2488}
2489
2490const char *MemoryDepChecker::Dependence::DepName[] = {
2491 "NoDep",
2492 "Unknown",
2493 "IndirectUnsafe",
2494 "InvariantUnsafe",
2495 "Forward",
2496 "ForwardButPreventsForwarding",
2497 "Backward",
2498 "BackwardVectorizable",
2499 "BackwardVectorizableButPreventsForwarding"};
2500
2501void MemoryDepChecker::Dependence::print(
2502 raw_ostream &OS, unsigned Depth,
2503 const SmallVectorImpl<Instruction *> &Instrs) const {
2504 OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n";
2505 OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n";
2506 OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n";
2507}
2508
2509bool LoopAccessInfo::canAnalyzeLoop() {
2510 // We need to have a loop header.
2511 LLVM_DEBUG(dbgs() << "\nLAA: Checking a loop in '"
2512 << TheLoop->getHeader()->getParent()->getName() << "' from "
2513 << TheLoop->getLocStr() << "\n");
2514
2515 // We can only analyze innermost loops.
2516 if (!TheLoop->isInnermost()) {
2517 LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
2518 recordAnalysis(RemarkName: "NotInnerMostLoop") << "loop is not the innermost loop";
2519 return false;
2520 }
2521
2522 // We must have a single backedge.
2523 if (TheLoop->getNumBackEdges() != 1) {
2524 LLVM_DEBUG(
2525 dbgs() << "LAA: loop control flow is not understood by analyzer\n");
2526 recordAnalysis(RemarkName: "CFGNotUnderstood")
2527 << "loop control flow is not understood by analyzer";
2528 return false;
2529 }
2530
2531 // ScalarEvolution needs to be able to find the symbolic max backedge taken
2532 // count, which is an upper bound on the number of loop iterations. The loop
2533 // may execute fewer iterations, if it exits via an uncountable exit.
2534 const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount();
2535 if (isa<SCEVCouldNotCompute>(Val: ExitCount)) {
2536 recordAnalysis(RemarkName: "CantComputeNumberOfIterations")
2537 << "could not determine number of loop iterations";
2538 LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
2539 return false;
2540 }
2541
2542 LLVM_DEBUG(dbgs() << "LAA: Found an analyzable loop: "
2543 << TheLoop->getHeader()->getName() << "\n");
2544 return true;
2545}
2546
2547bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
2548 const TargetLibraryInfo *TLI,
2549 DominatorTree *DT) {
2550 // Holds the Load and Store instructions.
2551 SmallVector<LoadInst *, 16> Loads;
2552 SmallVector<StoreInst *, 16> Stores;
2553 SmallPtrSet<MDNode *, 8> LoopAliasScopes;
2554
2555 // Holds all the different accesses in the loop.
2556 unsigned NumReads = 0;
2557 unsigned NumReadWrites = 0;
2558
2559 bool HasComplexMemInst = false;
2560
2561 // A runtime check is only legal to insert if there are no convergent calls.
2562 HasConvergentOp = false;
2563
2564 PtrRtChecking->Pointers.clear();
2565 PtrRtChecking->Need = false;
2566
2567 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
2568
2569 const bool EnableMemAccessVersioningOfLoop =
2570 EnableMemAccessVersioning &&
2571 !TheLoop->getHeader()->getParent()->hasOptSize();
2572
2573 // Traverse blocks in fixed RPOT order, regardless of their storage in the
2574 // loop info, as it may be arbitrary.
2575 LoopBlocksRPO RPOT(TheLoop);
2576 RPOT.perform(LI);
2577
2578 // Don't return early as soon as we found a memory access that cannot be
2579 // vectorize - HasConvergentOp must still be computed as it is part of LAI's
2580 // public API (used by LoopDistribute).
2581 for (BasicBlock *BB : RPOT) {
2582 // Scan the BB and collect legal loads and stores. Also detect any
2583 // convergent instructions.
2584 for (Instruction &I : *BB) {
2585 if (auto *Call = dyn_cast<CallBase>(Val: &I)) {
2586 if (Call->isConvergent())
2587 HasConvergentOp = true;
2588 }
2589
2590 // Unsafe to vectorize and we already found a convergent operation, can
2591 // early return now.
2592 if (HasComplexMemInst && HasConvergentOp)
2593 return false;
2594
2595 // Already unsafe to vectorize; keep scanning for convergent ops.
2596 if (HasComplexMemInst)
2597 continue;
2598
2599 // Record alias scopes defined inside the loop.
2600 if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I))
2601 for (Metadata *Op : Decl->getScopeList()->operands())
2602 LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op));
2603
2604 // Many math library functions read the rounding mode. We will only
2605 // vectorize a loop if it contains known function calls that don't set
2606 // the flag. Therefore, it is safe to ignore this read from memory.
2607 auto *Call = dyn_cast<CallInst>(Val: &I);
2608 if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI))
2609 continue;
2610
2611 // If this is a load, save it. If this instruction can read from memory
2612 // but is not a load, we only allow it if it's a call to a function with a
2613 // vector mapping and no pointer arguments.
2614 if (I.mayReadFromMemory()) {
2615 auto hasPointerArgs = [](CallBase *CB) {
2616 return any_of(Range: CB->args(), P: [](Value const *Arg) {
2617 return Arg->getType()->isPointerTy();
2618 });
2619 };
2620
2621 // If the function has an explicit vectorized counterpart, and does not
2622 // take output/input pointers, we can safely assume that it can be
2623 // vectorized.
2624 if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
2625 !hasPointerArgs(Call) && !VFDatabase::getMappings(CI: *Call).empty())
2626 continue;
2627
2628 auto *Ld = dyn_cast<LoadInst>(Val: &I);
2629 if (!Ld) {
2630 recordAnalysis(RemarkName: "CantVectorizeInstruction", Instr: &I)
2631 << "instruction cannot be vectorized";
2632 HasComplexMemInst = true;
2633 continue;
2634 }
2635 if (!Ld->isSimple() && !IsAnnotatedParallel) {
2636 recordAnalysis(RemarkName: "NonSimpleLoad", Instr: Ld)
2637 << "read with atomic ordering or volatile read";
2638 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
2639 HasComplexMemInst = true;
2640 continue;
2641 }
2642 NumLoads++;
2643 Loads.push_back(Elt: Ld);
2644 DepChecker->addAccess(LI: Ld);
2645 if (EnableMemAccessVersioningOfLoop)
2646 collectStridedAccess(LoadOrStoreInst: Ld);
2647 continue;
2648 }
2649
2650 // Save 'store' instructions. Abort if other instructions write to memory.
2651 if (I.mayWriteToMemory()) {
2652 auto *St = dyn_cast<StoreInst>(Val: &I);
2653 if (!St) {
2654 recordAnalysis(RemarkName: "CantVectorizeInstruction", Instr: &I)
2655 << "instruction cannot be vectorized";
2656 HasComplexMemInst = true;
2657 continue;
2658 }
2659 if (!St->isSimple() && !IsAnnotatedParallel) {
2660 recordAnalysis(RemarkName: "NonSimpleStore", Instr: St)
2661 << "write with atomic ordering or volatile write";
2662 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
2663 HasComplexMemInst = true;
2664 continue;
2665 }
2666 NumStores++;
2667 Stores.push_back(Elt: St);
2668 DepChecker->addAccess(SI: St);
2669 if (EnableMemAccessVersioningOfLoop)
2670 collectStridedAccess(LoadOrStoreInst: St);
2671 }
2672 } // Next instr.
2673 } // Next block.
2674
2675 if (HasComplexMemInst)
2676 return false;
2677
2678 // Now we have two lists that hold the loads and the stores.
2679 // Next, we find the pointers that they use.
2680
2681 // Check if we see any stores. If there are no stores, then we don't
2682 // care if the pointers are *restrict*.
2683 if (!Stores.size()) {
2684 LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
2685 return true;
2686 }
2687
2688 MemoryDepChecker::DepCandidates DepCands;
2689 AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE,
2690 LoopAliasScopes);
2691
2692 // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
2693 // multiple times on the same object. If the ptr is accessed twice, once
2694 // for read and once for write, it will only appear once (on the write
2695 // list). This is okay, since we are going to check for conflicts between
2696 // writes and between reads and writes, but not between reads and reads.
2697 SmallSet<std::pair<Value *, Type *>, 16> Seen;
2698
2699 // Record uniform store addresses to identify if we have multiple stores
2700 // to the same address.
2701 SmallPtrSet<Value *, 16> UniformStores;
2702
2703 for (StoreInst *ST : Stores) {
2704 Value *Ptr = ST->getPointerOperand();
2705
2706 if (isInvariant(V: Ptr)) {
2707 // Record store instructions to loop invariant addresses
2708 StoresToInvariantAddresses.push_back(Elt: ST);
2709 HasStoreStoreDependenceInvolvingLoopInvariantAddress |=
2710 !UniformStores.insert(Ptr).second;
2711 }
2712
2713 // If we did *not* see this pointer before, insert it to the read-write
2714 // list. At this phase it is only a 'write' list.
2715 Type *AccessTy = getLoadStoreType(I: ST);
2716 if (Seen.insert(V: {Ptr, AccessTy}).second) {
2717 ++NumReadWrites;
2718
2719 MemoryLocation Loc = MemoryLocation::get(SI: ST);
2720 // The TBAA metadata could have a control dependency on the predication
2721 // condition, so we cannot rely on it when determining whether or not we
2722 // need runtime pointer checks.
2723 if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT))
2724 Loc.AATags.TBAA = nullptr;
2725
2726 // Expand forked pointers (i.e., a phi of multiple strided pointers) into
2727 // all alternatives.
2728 visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop,
2729 AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) {
2730 MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr);
2731 Accesses.addStore(Loc: NewLoc, AccessTy);
2732 });
2733 }
2734 }
2735
2736 if (IsAnnotatedParallel) {
2737 LLVM_DEBUG(
2738 dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
2739 << "checks.\n");
2740 return true;
2741 }
2742
2743 for (LoadInst *LD : Loads) {
2744 Value *Ptr = LD->getPointerOperand();
2745 // If we did *not* see this pointer before, insert it to the read list. If
2746 // we *did* see it before, then it is already in the read-write list. This
2747 // allows us to vectorize expressions such as A[i] += x; Because the address
2748 // of A[i] is a read-write pointer. This only works if the index of A[i] is
2749 // strictly monotonic, which we approximate (conservatively) via
2750 // getPtrStride. If the address is unknown (e.g. A[B[i]]) then we may read,
2751 // modify, and write overlapping words. Note that "zero stride" is unsafe
2752 // and is being handled below.
2753 bool IsReadOnlyPtr = false;
2754 Type *AccessTy = getLoadStoreType(I: LD);
2755 if (Seen.insert(V: {Ptr, AccessTy}).second ||
2756 !getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: false,
2757 ShouldCheckWrap: true)) {
2758 ++NumReads;
2759 IsReadOnlyPtr = true;
2760 }
2761
2762 // See if there is an unsafe dependency between a load to a uniform address and
2763 // store to the same uniform address.
2764 if (UniformStores.contains(Ptr)) {
2765 LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
2766 "load and uniform store to the same address!\n");
2767 HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
2768 }
2769
2770 MemoryLocation Loc = MemoryLocation::get(LI: LD);
2771 // The TBAA metadata could have a control dependency on the predication
2772 // condition, so we cannot rely on it when determining whether or not we
2773 // need runtime pointer checks.
2774 if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT))
2775 Loc.AATags.TBAA = nullptr;
2776
2777 // Expand forked pointers (i.e., a phi of multiple strided pointers) into
2778 // all alternatives.
2779 visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop,
2780 AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
2781 MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr);
2782 Accesses.addLoad(Loc: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr);
2783 });
2784 }
2785
2786 // If we write (or read-write) to a single destination and there are no other
2787 // reads in this loop then is it safe to vectorize: the vectorized stores
2788 // preserve ordering via replication or order-preserving @llvm.masked.scatter.
2789 if (NumReadWrites == 1 && NumReads == 0) {
2790 LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
2791 return true;
2792 }
2793
2794 // Build dependence sets and check whether we need a runtime pointer bounds
2795 // check.
2796 Accesses.buildDependenceSets();
2797
2798 // Find pointers with computable bounds. We are going to use this information
2799 // to place a runtime bound check.
2800 Value *UncomputablePtr = nullptr;
2801 HasCompletePtrRtChecking =
2802 Accesses.canCheckPtrAtRT(RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides,
2803 UncomputablePtr, AllowPartial, DepChecker: getDepChecker());
2804 if (!HasCompletePtrRtChecking) {
2805 const auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr);
2806 recordAnalysis(RemarkName: "CantIdentifyArrayBounds", Instr: I)
2807 << "cannot identify array bounds";
2808 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
2809 << "the array bounds.\n");
2810 return false;
2811 }
2812
2813 LLVM_DEBUG(
2814 dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
2815
2816 bool DepsAreSafe = true;
2817 if (Accesses.isDependencyCheckNeeded()) {
2818 LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
2819 DepsAreSafe =
2820 DepChecker->areDepsSafe(DepCands, CheckDeps: Accesses.getDependenciesToCheck());
2821
2822 if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeChecks()) {
2823 LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
2824
2825 PtrRtChecking->reset();
2826 PtrRtChecking->Need = true;
2827
2828 UncomputablePtr = nullptr;
2829 HasCompletePtrRtChecking = Accesses.canCheckPtrAtRT(
2830 RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, UncomputablePtr,
2831 AllowPartial, DepChecker: getDepChecker());
2832
2833 // Check that we found the bounds for the pointer.
2834 if (!HasCompletePtrRtChecking) {
2835 auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr);
2836 recordAnalysis(RemarkName: "CantCheckMemDepsAtRunTime", Instr: I)
2837 << "cannot check memory dependencies at runtime";
2838 LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
2839 return false;
2840 }
2841
2842 // Clear the dependency checks. They are no longer needed.
2843 Accesses.resetDepChecks(DepChecker&: *DepChecker);
2844
2845 DepsAreSafe = true;
2846 }
2847 }
2848
2849 // Update the invariant address dependence flags based on dependences found
2850 // by the dep checker. Even if dependences were not recorded (too many to
2851 // track), any InvariantUnsafe dep would still have set the status to Unsafe
2852 if (const auto *Deps = DepChecker->getDependences()) {
2853 for (const auto &Dep : *Deps) {
2854 if (Dep.Type != MemoryDepChecker::Dependence::InvariantUnsafe)
2855 continue;
2856 Instruction *Src = Dep.getSource(DepChecker: *DepChecker);
2857 Instruction *Dst = Dep.getDestination(DepChecker: *DepChecker);
2858 if (isa<LoadInst>(Val: Src) != isa<LoadInst>(Val: Dst)) {
2859 HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
2860 } else {
2861 assert(isa<StoreInst>(Src) && isa<StoreInst>(Dst) &&
2862 "Expected both to be stores");
2863 HasStoreStoreDependenceInvolvingLoopInvariantAddress = true;
2864 }
2865 }
2866 }
2867
2868 if (HasConvergentOp) {
2869 recordAnalysis(RemarkName: "CantInsertRuntimeCheckWithConvergent")
2870 << "cannot add control dependency to convergent operation";
2871 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
2872 "would be needed with a convergent operation\n");
2873 return false;
2874 }
2875
2876 if (DepsAreSafe) {
2877 LLVM_DEBUG(
2878 dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
2879 << (PtrRtChecking->Need ? "" : " don't")
2880 << " need runtime memory checks.\n");
2881 return true;
2882 }
2883
2884 emitUnsafeDependenceRemark();
2885 return false;
2886}
2887
2888void LoopAccessInfo::emitUnsafeDependenceRemark() {
2889 const auto *Deps = getDepChecker().getDependences();
2890 if (!Deps)
2891 return;
2892 const auto *Found =
2893 llvm::find_if(Range: *Deps, P: [](const MemoryDepChecker::Dependence &D) {
2894 return MemoryDepChecker::Dependence::isSafeForVectorization(Type: D.Type) !=
2895 MemoryDepChecker::VectorizationSafetyStatus::Safe;
2896 });
2897 if (Found == Deps->end())
2898 return;
2899 MemoryDepChecker::Dependence Dep = *Found;
2900
2901 LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
2902
2903 // Emit remark for first unsafe dependence
2904 bool HasForcedDistribution = false;
2905 std::optional<const MDOperand *> Value =
2906 findStringMetadataForLoop(TheLoop, Name: "llvm.loop.distribute.enable");
2907 if (Value) {
2908 const MDOperand *Op = *Value;
2909 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
2910 HasForcedDistribution = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue();
2911 }
2912
2913 const std::string Info =
2914 HasForcedDistribution
2915 ? "unsafe dependent memory operations in loop."
2916 : "unsafe dependent memory operations in loop. Use "
2917 "#pragma clang loop distribute(enable) to allow loop distribution "
2918 "to attempt to isolate the offending operations into a separate "
2919 "loop";
2920 OptimizationRemarkAnalysis &R =
2921 recordAnalysis(RemarkName: "UnsafeDep", Instr: Dep.getDestination(DepChecker: getDepChecker())) << Info;
2922
2923 switch (Dep.Type) {
2924 case MemoryDepChecker::Dependence::NoDep:
2925 case MemoryDepChecker::Dependence::Forward:
2926 case MemoryDepChecker::Dependence::BackwardVectorizable:
2927 llvm_unreachable("Unexpected dependence");
2928 case MemoryDepChecker::Dependence::Backward:
2929 R << "\nBackward loop carried data dependence.";
2930 break;
2931 case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
2932 R << "\nForward loop carried data dependence that prevents "
2933 "store-to-load forwarding.";
2934 break;
2935 case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
2936 R << "\nBackward loop carried data dependence that prevents "
2937 "store-to-load forwarding.";
2938 break;
2939 case MemoryDepChecker::Dependence::IndirectUnsafe:
2940 R << "\nUnsafe indirect dependence.";
2941 break;
2942 case MemoryDepChecker::Dependence::InvariantUnsafe:
2943 R << "\nUnsafe dependence on loop-invariant address.";
2944 break;
2945 case MemoryDepChecker::Dependence::Unknown:
2946 R << "\nUnknown data dependence.";
2947 break;
2948 }
2949
2950 if (Instruction *I = Dep.getSource(DepChecker: getDepChecker())) {
2951 DebugLoc SourceLoc = I->getDebugLoc();
2952 if (auto *DD = dyn_cast_or_null<Instruction>(Val: getPointerOperand(V: I)))
2953 SourceLoc = DD->getDebugLoc();
2954 if (SourceLoc)
2955 R << " Memory location is the same as accessed at "
2956 << ore::NV("Location", SourceLoc);
2957 }
2958}
2959
2960bool LoopAccessInfo::blockNeedsPredication(const BasicBlock *BB,
2961 const Loop *TheLoop,
2962 const DominatorTree *DT) {
2963 assert(TheLoop->contains(BB) && "Unknown block used");
2964
2965 // Blocks that do not dominate the latch need predication.
2966 const BasicBlock *Latch = TheLoop->getLoopLatch();
2967 return !DT->dominates(A: BB, B: Latch);
2968}
2969
2970OptimizationRemarkAnalysis &
2971LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
2972 assert(!Report && "Multiple reports generated");
2973
2974 const BasicBlock *CodeRegion = TheLoop->getHeader();
2975 DebugLoc DL = TheLoop->getStartLoc();
2976
2977 if (I) {
2978 CodeRegion = I->getParent();
2979 // If there is no debug location attached to the instruction, revert back to
2980 // using the loop's.
2981 if (I->getDebugLoc())
2982 DL = I->getDebugLoc();
2983 }
2984
2985 Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, args&: RemarkName,
2986 args&: DL, args&: CodeRegion);
2987 return *Report;
2988}
2989
2990bool LoopAccessInfo::isInvariant(Value *V) const {
2991 auto *SE = PSE->getSE();
2992 if (TheLoop->isLoopInvariant(V))
2993 return true;
2994 if (!SE->isSCEVable(Ty: V->getType()))
2995 return false;
2996 const SCEV *S = SE->getSCEV(V);
2997 return SE->isLoopInvariant(S, L: TheLoop);
2998}
2999
3000/// If \p Ptr is a GEP, which has a loop-variant operand, return that operand.
3001/// Otherwise, return \p Ptr.
3002static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE,
3003 Loop *Lp) {
3004 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr);
3005 if (!GEP)
3006 return Ptr;
3007
3008 Value *V = Ptr;
3009 for (const Use &U : GEP->operands()) {
3010 if (!SE->isLoopInvariant(S: SE->getSCEV(V: U), L: Lp)) {
3011 if (V == Ptr)
3012 V = U;
3013 else
3014 // There must be exactly one loop-variant operand.
3015 return Ptr;
3016 }
3017 }
3018 return V;
3019}
3020
3021/// Get the stride of a pointer access in a loop. Looks for symbolic
3022/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
3023static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
3024 auto *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType());
3025 if (!PtrTy)
3026 return nullptr;
3027
3028 // Try to remove a gep instruction to make the pointer (actually index at this
3029 // point) easier analyzable. If OrigPtr is equal to Ptr we are analyzing the
3030 // pointer, otherwise, we are analyzing the index.
3031 Value *OrigPtr = Ptr;
3032
3033 Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp);
3034 const SCEV *V = SE->getSCEV(V: Ptr);
3035
3036 if (Ptr != OrigPtr)
3037 // Strip off casts.
3038 while (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V))
3039 V = C->getOperand();
3040
3041 if (!match(S: V, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V), L: m_SpecificLoop(L: Lp))))
3042 return nullptr;
3043
3044 // Note that the restriction after this loop invariant check are only
3045 // profitability restrictions.
3046 if (!SE->isLoopInvariant(S: V, L: Lp))
3047 return nullptr;
3048
3049 // Look for the loop invariant symbolic value.
3050 if (isa<SCEVUnknown>(Val: V))
3051 return V;
3052
3053 // Look through multiplies that scale a stride by a constant.
3054 match(S: V, P: m_scev_Mul(Op0: m_SCEVConstant(), Op1: m_SCEV(V)));
3055 if (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V))
3056 if (isa<SCEVUnknown>(Val: C->getOperand()))
3057 return V;
3058
3059 return nullptr;
3060}
3061
3062void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
3063 Value *Ptr = getLoadStorePointerOperand(V: MemAccess);
3064 if (!Ptr)
3065 return;
3066
3067 // Note: getStrideFromPointer is a *profitability* heuristic. We
3068 // could broaden the scope of values returned here - to anything
3069 // which happens to be loop invariant and contributes to the
3070 // computation of an interesting IV - but we chose not to as we
3071 // don't have a cost model here, and broadening the scope exposes
3072 // far too many unprofitable cases.
3073 const SCEV *StrideExpr = getStrideFromPointer(Ptr, SE: PSE->getSE(), Lp: TheLoop);
3074 if (!StrideExpr)
3075 return;
3076
3077 if (match(S: StrideExpr, P: m_scev_UndefOrPoison()))
3078 return;
3079
3080 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
3081 "versioning:");
3082 LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");
3083
3084 if (!SpeculateUnitStride) {
3085 LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n");
3086 return;
3087 }
3088
3089 // Avoid adding the "Stride == 1" predicate when we know that
3090 // Stride >= Trip-Count. Such a predicate will effectively optimize a single
3091 // or zero iteration loop, as Trip-Count <= Stride == 1.
3092 //
3093 // TODO: We are currently not making a very informed decision on when it is
3094 // beneficial to apply stride versioning. It might make more sense that the
3095 // users of this analysis (such as the vectorizer) will trigger it, based on
3096 // their specific cost considerations; For example, in cases where stride
3097 // versioning does not help resolving memory accesses/dependences, the
3098 // vectorizer should evaluate the cost of the runtime test, and the benefit
3099 // of various possible stride specializations, considering the alternatives
3100 // of using gather/scatters (if available).
3101
3102 const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount();
3103
3104 // Match the types so we can compare the stride and the MaxBTC.
3105 // The Stride can be positive/negative, so we sign extend Stride;
3106 // The backedgeTakenCount is non-negative, so we zero extend MaxBTC.
3107 const DataLayout &DL = TheLoop->getHeader()->getDataLayout();
3108 uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(Ty: StrideExpr->getType());
3109 uint64_t BETypeSizeBits = DL.getTypeSizeInBits(Ty: MaxBTC->getType());
3110 const SCEV *CastedStride = StrideExpr;
3111 const SCEV *CastedBECount = MaxBTC;
3112 ScalarEvolution *SE = PSE->getSE();
3113 if (BETypeSizeBits >= StrideTypeSizeBits)
3114 CastedStride = SE->getNoopOrSignExtend(V: StrideExpr, Ty: MaxBTC->getType());
3115 else
3116 CastedBECount = SE->getZeroExtendExpr(Op: MaxBTC, Ty: StrideExpr->getType());
3117 const SCEV *StrideMinusBETaken = SE->getMinusSCEV(LHS: CastedStride, RHS: CastedBECount);
3118 // Since TripCount == BackEdgeTakenCount + 1, checking:
3119 // "Stride >= TripCount" is equivalent to checking:
3120 // Stride - MaxBTC> 0
3121 if (SE->isKnownPositive(S: StrideMinusBETaken)) {
3122 LLVM_DEBUG(
3123 dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
3124 "Stride==1 predicate will imply that the loop executes "
3125 "at most once.\n");
3126 return;
3127 }
3128 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");
3129
3130 // Strip back off the integer cast, and check that our result is a
3131 // SCEVUnknown as we expect.
3132 const SCEV *StrideBase = StrideExpr;
3133 if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: StrideBase))
3134 StrideBase = C->getOperand();
3135 SymbolicStrides[Ptr] = cast<SCEVUnknown>(Val: StrideBase);
3136}
3137
3138LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
3139 const TargetTransformInfo *TTI,
3140 const TargetLibraryInfo *TLI, AAResults *AA,
3141 DominatorTree *DT, LoopInfo *LI,
3142 AssumptionCache *AC, bool AllowPartial)
3143 : PSE(std::make_unique<PredicatedScalarEvolution>(args&: *SE, args&: *L)),
3144 PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) {
3145 unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
3146 if (TTI && !TTI->enableScalableVectorization())
3147 // Scale the vector width by 2 as rough estimate to also consider
3148 // interleaving.
3149 MaxTargetVectorWidthInBits =
3150 TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) * 2;
3151
3152 DepChecker = std::make_unique<MemoryDepChecker>(
3153 args&: *PSE, args&: AC, args&: DT, args&: L, args&: SymbolicStrides, args&: MaxTargetVectorWidthInBits, args&: LoopGuards);
3154 PtrRtChecking =
3155 std::make_unique<RuntimePointerChecking>(args&: *DepChecker, args&: SE, args&: LoopGuards);
3156 if (canAnalyzeLoop())
3157 CanVecMem = analyzeLoop(AA, LI, TLI, DT);
3158}
3159
3160void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
3161 if (CanVecMem) {
3162 OS.indent(NumSpaces: Depth) << "Memory dependences are safe";
3163 const MemoryDepChecker &DC = getDepChecker();
3164 if (!DC.isSafeForAnyVectorWidth())
3165 OS << " with a maximum safe vector width of "
3166 << DC.getMaxSafeVectorWidthInBits() << " bits";
3167 if (!DC.isSafeForAnyStoreLoadForwardDistances()) {
3168 uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits();
3169 OS << ", with a maximum safe store-load forward width of " << SLDist
3170 << " bits";
3171 }
3172 if (PtrRtChecking->Need)
3173 OS << " with run-time checks";
3174 OS << "\n";
3175 }
3176
3177 if (HasConvergentOp)
3178 OS.indent(NumSpaces: Depth) << "Has convergent operation in loop\n";
3179
3180 if (Report)
3181 OS.indent(NumSpaces: Depth) << "Report: " << Report->getMsg() << "\n";
3182
3183 if (auto *Dependences = DepChecker->getDependences()) {
3184 OS.indent(NumSpaces: Depth) << "Dependences:\n";
3185 for (const auto &Dep : *Dependences) {
3186 Dep.print(OS, Depth: Depth + 2, Instrs: DepChecker->getMemoryInstructions());
3187 OS << "\n";
3188 }
3189 } else
3190 OS.indent(NumSpaces: Depth) << "Too many dependences, not recorded\n";
3191
3192 // List the pair of accesses need run-time checks to prove independence.
3193 PtrRtChecking->print(OS, Depth);
3194 if (PtrRtChecking->Need && !HasCompletePtrRtChecking)
3195 OS.indent(NumSpaces: Depth) << "Generated run-time checks are incomplete\n";
3196 OS << "\n";
3197
3198 OS.indent(NumSpaces: Depth)
3199 << "Non vectorizable stores to invariant address were "
3200 << (HasStoreStoreDependenceInvolvingLoopInvariantAddress ||
3201 HasLoadStoreDependenceInvolvingLoopInvariantAddress
3202 ? ""
3203 : "not ")
3204 << "found in loop.\n";
3205
3206 OS.indent(NumSpaces: Depth) << "SCEV assumptions:\n";
3207 PSE->getPredicate().print(OS, Depth);
3208
3209 OS << "\n";
3210
3211 OS.indent(NumSpaces: Depth) << "Expressions re-written:\n";
3212 PSE->print(OS, Depth);
3213}
3214
3215const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L,
3216 bool AllowPartial) {
3217 const auto &[It, Inserted] = LoopAccessInfoMap.try_emplace(Key: &L);
3218
3219 // We need to create the LoopAccessInfo if either we don't already have one,
3220 // or if it was created with a different value of AllowPartial.
3221 if (Inserted || It->second->hasAllowPartial() != AllowPartial)
3222 It->second = std::make_unique<LoopAccessInfo>(args: &L, args: &SE, args&: TTI, args&: TLI, args: &AA, args: &DT,
3223 args: &LI, args&: AC, args&: AllowPartial);
3224
3225 return *It->second;
3226}
3227void LoopAccessInfoManager::clear() {
3228 // Collect LoopAccessInfo entries that may keep references to IR outside the
3229 // analyzed loop or SCEVs that may have been modified or invalidated. At the
3230 // moment, that is loops requiring memory or SCEV runtime checks, as those cache
3231 // SCEVs, e.g. for pointer expressions.
3232 for (const auto &[L, LAI] : LoopAccessInfoMap) {
3233 if (LAI->getRuntimePointerChecking()->getChecks().empty() &&
3234 LAI->getPSE().getPredicate().isAlwaysTrue())
3235 continue;
3236 LoopAccessInfoMap.erase(Val: L);
3237 }
3238}
3239
3240bool LoopAccessInfoManager::invalidate(
3241 Function &F, const PreservedAnalyses &PA,
3242 FunctionAnalysisManager::Invalidator &Inv) {
3243 // Check whether our analysis is preserved.
3244 auto PAC = PA.getChecker<LoopAccessAnalysis>();
3245 if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
3246 // If not, give up now.
3247 return true;
3248
3249 // Check whether the analyses we depend on became invalid for any reason.
3250 // Skip checking TargetLibraryAnalysis as it is immutable and can't become
3251 // invalid.
3252 return Inv.invalidate<AAManager>(IR&: F, PA) ||
3253 Inv.invalidate<ScalarEvolutionAnalysis>(IR&: F, PA) ||
3254 Inv.invalidate<LoopAnalysis>(IR&: F, PA) ||
3255 Inv.invalidate<DominatorTreeAnalysis>(IR&: F, PA);
3256}
3257
3258LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
3259 FunctionAnalysisManager &FAM) {
3260 auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(IR&: F);
3261 auto &AA = FAM.getResult<AAManager>(IR&: F);
3262 auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F);
3263 auto &LI = FAM.getResult<LoopAnalysis>(IR&: F);
3264 auto &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
3265 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F);
3266 auto &AC = FAM.getResult<AssumptionAnalysis>(IR&: F);
3267 return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC);
3268}
3269
3270AnalysisKey LoopAccessAnalysis::Key;
3271