1//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The implementation for the loop memory dependence that was originally
10// developed for the loop vectorizer.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Analysis/LoopAccessAnalysis.h"
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/EquivalenceClasses.h"
18#include "llvm/ADT/PointerIntPair.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SetVector.h"
21#include "llvm/ADT/SmallPtrSet.h"
22#include "llvm/ADT/SmallSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/AliasAnalysis.h"
25#include "llvm/Analysis/AliasSetTracker.h"
26#include "llvm/Analysis/AssumeBundleQueries.h"
27#include "llvm/Analysis/AssumptionCache.h"
28#include "llvm/Analysis/LoopAnalysisManager.h"
29#include "llvm/Analysis/LoopInfo.h"
30#include "llvm/Analysis/LoopIterator.h"
31#include "llvm/Analysis/MemoryLocation.h"
32#include "llvm/Analysis/OptimizationRemarkEmitter.h"
33#include "llvm/Analysis/ScalarEvolution.h"
34#include "llvm/Analysis/ScalarEvolutionExpressions.h"
35#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
36#include "llvm/Analysis/TargetLibraryInfo.h"
37#include "llvm/Analysis/TargetTransformInfo.h"
38#include "llvm/Analysis/ValueTracking.h"
39#include "llvm/Analysis/VectorUtils.h"
40#include "llvm/IR/BasicBlock.h"
41#include "llvm/IR/Constants.h"
42#include "llvm/IR/DataLayout.h"
43#include "llvm/IR/DebugLoc.h"
44#include "llvm/IR/DerivedTypes.h"
45#include "llvm/IR/DiagnosticInfo.h"
46#include "llvm/IR/Dominators.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/InstrTypes.h"
49#include "llvm/IR/Instruction.h"
50#include "llvm/IR/Instructions.h"
51#include "llvm/IR/IntrinsicInst.h"
52#include "llvm/IR/PassManager.h"
53#include "llvm/IR/Type.h"
54#include "llvm/IR/Value.h"
55#include "llvm/IR/ValueHandle.h"
56#include "llvm/Support/Casting.h"
57#include "llvm/Support/CommandLine.h"
58#include "llvm/Support/Debug.h"
59#include "llvm/Support/ErrorHandling.h"
60#include "llvm/Support/raw_ostream.h"
61#include <algorithm>
62#include <cassert>
63#include <cstdint>
64#include <iterator>
65#include <utility>
66#include <variant>
67#include <vector>
68
69using namespace llvm;
70using namespace llvm::SCEVPatternMatch;
71
72#define DEBUG_TYPE "loop-accesses"
73
/// Command-line override for the SIMD width; writes through to
/// VectorizerParams::VectorizationFactor (0 = autoselect).
static cl::opt<unsigned, true>
VectorizationFactor("force-vector-width", cl::Hidden,
                    cl::desc("Sets the SIMD width. Zero is autoselect."),
                    cl::location(L&: VectorizerParams::VectorizationFactor));
/// Storage for the externally visible parameter backing the option above.
unsigned VectorizerParams::VectorizationFactor;

/// Command-line override for the interleave count; writes through to
/// VectorizerParams::VectorizationInterleave (0 = autoselect).
static cl::opt<unsigned, true>
VectorizationInterleave("force-vector-interleave", cl::Hidden,
                        cl::desc("Sets the vectorization interleave count. "
                                 "Zero is autoselect."),
                        cl::location(
                            L&: VectorizerParams::VectorizationInterleave));
unsigned VectorizerParams::VectorizationInterleave;

/// Upper bound on the number of runtime memory-disambiguation comparisons;
/// writes through to VectorizerParams::RuntimeMemoryCheckThreshold.
static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
    "runtime-memory-check-threshold", cl::Hidden,
    cl::desc("When performing memory disambiguation checks at runtime do not "
             "generate more than this number of comparisons (default = 8)."),
    cl::location(L&: VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(Val: 8));
unsigned VectorizerParams::RuntimeMemoryCheckThreshold;

/// The maximum iterations used to merge memory checks
static cl::opt<unsigned> MemoryCheckMergeThreshold(
    "memory-check-merge-threshold", cl::Hidden,
    cl::desc("Maximum number of comparisons done when trying to merge "
             "runtime memory checks. (default = 100)"),
    cl::init(Val: 100));

/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;

/// We collect dependences up to this threshold.
static cl::opt<unsigned>
    MaxDependences("max-dependences", cl::Hidden,
                   cl::desc("Maximum number of dependences collected by "
                            "loop-access analysis (default = 100)"),
                   cl::init(Val: 100));

/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
///   for (i = 0; i < N; ++i)
///     A[i * Stride1] += B[i * Stride2] ...
///
/// Will be roughly translated to
///    if (Stride1 == 1 && Stride2 == 1) {
///      for (i = 0; i < N; i+=4)
///       A[i:i+3] += ...
///    } else
///      ...
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));

/// Enable store-to-load forwarding conflict detection. This option can
/// be disabled for correctness testing.
static cl::opt<bool> EnableForwardingConflictDetection(
    "store-to-load-forwarding-conflict-detection", cl::Hidden,
    cl::desc("Enable conflict detection in loop-access analysis"),
    cl::init(Val: true));

/// Recursion limit when decomposing "forked" (select/phi-based) pointer SCEVs.
static cl::opt<unsigned> MaxForkedSCEVDepth(
    "max-forked-scev-depth", cl::Hidden,
    cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
    cl::init(Val: 5));

/// When set, LAA may assume (and version on) unit stride for non-constant
/// strides.
static cl::opt<bool> SpeculateUnitStride(
    "laa-speculate-unit-stride", cl::Hidden,
    cl::desc("Speculate that non-constant strides are unit in LAA"),
    cl::init(Val: true));

/// Controls hoisting of inner-loop runtime checks to the outer loop; writes
/// through to VectorizerParams::HoistRuntimeChecks.
static cl::opt<bool, true> HoistRuntimeChecks(
    "hoist-runtime-checks", cl::Hidden,
    cl::desc(
        "Hoist inner loop runtime memory checks to outer loop if possible"),
    cl::location(L&: VectorizerParams::HoistRuntimeChecks), cl::init(Val: true));
bool VectorizerParams::HoistRuntimeChecks;
150
151bool VectorizerParams::isInterleaveForced() {
152 return ::VectorizationInterleave.getNumOccurrences() > 0;
153}
154
/// Return the SCEV for \p Ptr, versioning any symbolic stride recorded in
/// \p PtrToStride to the constant 1. If \p Ptr has no entry in the map, the
/// unmodified SCEV is returned. NOTE: on a hit this mutates \p PSE by adding
/// an equality predicate "Stride == 1".
const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
                                            const DenseMap<Value *, const SCEV *> &PtrToStride,
                                            Value *Ptr) {
  const SCEV *OrigSCEV = PSE.getSCEV(V: Ptr);

  // If there is an entry in the map return the SCEV of the pointer with the
  // symbolic stride replaced by one.
  const SCEV *StrideSCEV = PtrToStride.lookup(Val: Ptr);
  if (!StrideSCEV)
    // For a non-symbolic stride, just return the original expression.
    return OrigSCEV;

  // Note: This assert is both overly strong and overly weak.  The actual
  // invariant here is that StrideSCEV should be loop invariant.  The only
  // such invariant strides we happen to speculate right now are unknowns
  // and thus this is a reasonable proxy of the actual invariant.
  assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");

  // Record the predicate "Stride == 1" in PSE; re-querying the SCEV for Ptr
  // afterwards yields the expression rewritten under that assumption.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *CT = SE->getOne(Ty: StrideSCEV->getType());
  PSE.addPredicate(Pred: *SE->getEqualPredicate(LHS: StrideSCEV, RHS: CT));
  const SCEV *Expr = PSE.getSCEV(V: Ptr);

  LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
                    << " by: " << *Expr << "\n");
  return Expr;
}
182
/// Construct a singleton checking group from the pointer at \p Index: the
/// group's Low/High bounds, address space and freeze requirement are taken
/// directly from that pointer's entry in \p RtCheck, and the pointer becomes
/// the group's sole member.
RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
    unsigned Index, const RuntimePointerChecking &RtCheck)
    : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
      AddressSpace(RtCheck.Pointers[Index]
                       .PointerValue->getType()
                       ->getPointerAddressSpace()),
      NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
  Members.push_back(Elt: Index);
}
192
193/// Returns \p A + \p B, if it is guaranteed not to unsigned wrap. Otherwise
194/// return nullptr. \p A and \p B must have the same type.
195static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B,
196 ScalarEvolution &SE) {
197 if (!SE.willNotOverflow(BinOp: Instruction::Add, /*IsSigned=*/Signed: false, LHS: A, RHS: B))
198 return nullptr;
199 return SE.getAddExpr(LHS: A, RHS: B);
200}
201
202/// Returns \p A * \p B, if it is guaranteed not to unsigned wrap. Otherwise
203/// return nullptr. \p A and \p B must have the same type.
204static const SCEV *mulSCEVNoOverflow(const SCEV *A, const SCEV *B,
205 ScalarEvolution &SE) {
206 if (!SE.willNotOverflow(BinOp: Instruction::Mul, /*IsSigned=*/Signed: false, LHS: A, RHS: B))
207 return nullptr;
208 return SE.getMulExpr(LHS: A, RHS: B);
209}
210
/// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
/// \p MaxBTC is guaranteed inbounds of the accessed object. The proof is based
/// on the number of dereferenceable bytes of the underlying object, combining
/// the pointer's dereferenceable attribute with any applicable
/// llvm.assume-carried dereferenceable knowledge.
static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
    const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize,
    ScalarEvolution &SE, const DataLayout &DL, DominatorTree *DT,
    AssumptionCache *AC,
    std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
  // The reasoning below requires a single identifiable base pointer.
  auto *PointerBase = SE.getPointerBase(V: AR->getStart());
  auto *StartPtr = dyn_cast<SCEVUnknown>(Val: PointerBase);
  if (!StartPtr)
    return false;
  const Loop *L = AR->getLoop();
  bool CheckForNonNull, CheckForFreed;
  Value *StartPtrV = StartPtr->getValue();
  uint64_t DerefBytes = StartPtrV->getPointerDereferenceableBytes(
      DL, CanBeNull&: CheckForNonNull, CanBeFreed&: CheckForFreed);

  // Bail out if the dereferenceability guarantee is conditional on the
  // pointer being non-null or not having been freed.
  if (DerefBytes && (CheckForNonNull || CheckForFreed))
    return false;

  const SCEV *Step = AR->getStepRecurrence(SE);
  // All byte-offset arithmetic below is done in a type wide enough for both
  // the trip count and the step.
  Type *WiderTy = SE.getWiderType(Ty1: MaxBTC->getType(), Ty2: Step->getType());
  const SCEV *DerefBytesSCEV = SE.getConstant(Ty: WiderTy, V: DerefBytes);

  // Check if we have a suitable dereferencable assumption we can use.
  // Prefer the loop predecessor's branch as context, if available, so
  // assumptions dominating the loop entry qualify.
  Instruction *CtxI = &*L->getHeader()->getFirstNonPHIIt();
  if (BasicBlock *LoopPred = L->getLoopPredecessor()) {
    if (isa<UncondBrInst, CondBrInst>(Val: LoopPred->getTerminator()))
      CtxI = LoopPred->getTerminator();
  }
  RetainedKnowledge DerefRK;
  getKnowledgeForValue(V: StartPtrV, AttrKinds: {Attribute::Dereferenceable}, AC&: *AC,
                       Filter: [&](RetainedKnowledge RK, Instruction *Assume, auto) {
                         if (!isValidAssumeForContext(I: Assume, CxtI: CtxI, DT))
                           return false;
                         // A freeable pointer invalidates the assumption
                         // unless nothing can free it before the context.
                         if (StartPtrV->canBeFreed() &&
                             !willNotFreeBetween(Assume, CtxI))
                           return false;
                         DerefRK = std::max(a: DerefRK, b: RK);
                         return true;
                       });
  if (DerefRK) {
    // Combine attribute- and assumption-derived byte counts, keeping the
    // larger of the two (after widening to a common type).
    const SCEV *DerefRKSCEV = SE.getSCEV(V: DerefRK.IRArgValue);
    Type *CommonTy =
        SE.getWiderType(Ty1: DerefBytesSCEV->getType(), Ty2: DerefRKSCEV->getType());
    DerefBytesSCEV = SE.getNoopOrZeroExtend(V: DerefBytesSCEV, Ty: CommonTy);
    DerefRKSCEV = SE.getNoopOrZeroExtend(V: DerefRKSCEV, Ty: CommonTy);
    DerefBytesSCEV = SE.getUMaxExpr(LHS: DerefBytesSCEV, RHS: DerefRKSCEV);
  }

  // No dereferenceable bytes known at all: nothing to prove against.
  if (DerefBytesSCEV->isZero())
    return false;

  // The step's sign must be known to pick the right bound check below.
  bool IsKnownNonNegative = SE.isKnownNonNegative(S: Step);
  if (!IsKnownNonNegative && !SE.isKnownNegative(S: Step))
    return false;

  Step = SE.getNoopOrSignExtend(V: Step, Ty: WiderTy);
  MaxBTC = SE.getNoopOrZeroExtend(V: MaxBTC, Ty: WiderTy);

  // For the computations below, make sure they don't unsigned wrap.
  if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_UGE, LHS: AR->getStart(), RHS: StartPtr))
    return false;
  const SCEV *StartOffset = SE.getNoopOrZeroExtend(
      V: SE.getMinusSCEV(LHS: AR->getStart(), RHS: StartPtr), Ty: WiderTy);

  // Lazily collect and apply loop guards to tighten MaxBTC.
  if (!LoopGuards)
    LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: AR->getLoop(), SE));
  MaxBTC = SE.applyLoopGuards(Expr: MaxBTC, Guards: *LoopGuards);

  const SCEV *OffsetAtLastIter =
      mulSCEVNoOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE);
  if (!OffsetAtLastIter) {
    // Re-try with constant max backedge-taken count if using the symbolic one
    // failed.
    MaxBTC = SE.getConstantMaxBackedgeTakenCount(L: AR->getLoop());
    if (isa<SCEVCouldNotCompute>(Val: MaxBTC))
      return false;
    MaxBTC = SE.getNoopOrZeroExtend(
        V: MaxBTC, Ty: WiderTy);
    OffsetAtLastIter =
        mulSCEVNoOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE);
    if (!OffsetAtLastIter)
      return false;
  }

  // Total extent of the access range in bytes, beyond the start offset.
  const SCEV *OffsetEndBytes = addSCEVNoOverflow(
      A: OffsetAtLastIter, B: SE.getNoopOrZeroExtend(V: EltSize, Ty: WiderTy), SE);
  if (!OffsetEndBytes)
    return false;

  if (IsKnownNonNegative) {
    // For positive steps, check if
    //  (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes,
    // while making sure none of the computations unsigned wrap themselves.
    const SCEV *EndBytes = addSCEVNoOverflow(A: StartOffset, B: OffsetEndBytes, SE);
    if (!EndBytes)
      return false;

    DerefBytesSCEV = SE.applyLoopGuards(Expr: DerefBytesSCEV, Guards: *LoopGuards);
    return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: EndBytes, RHS: DerefBytesSCEV);
  }

  // For negative steps check if
  //  * StartOffset >= (MaxBTC * Step + EltSize)
  //  * StartOffset <= DerefBytes.
  assert(SE.isKnownNegative(Step) && "must be known negative");
  return SE.isKnownPredicate(Pred: CmpInst::ICMP_SGE, LHS: StartOffset, RHS: OffsetEndBytes) &&
         SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: StartOffset, RHS: DerefBytesSCEV);
}
321
322std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
323 const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC,
324 const SCEV *MaxBTC, ScalarEvolution *SE,
325 DenseMap<std::pair<const SCEV *, const SCEV *>,
326 std::pair<const SCEV *, const SCEV *>> *PointerBounds,
327 DominatorTree *DT, AssumptionCache *AC,
328 std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
329 auto &DL = Lp->getHeader()->getDataLayout();
330 Type *IdxTy = DL.getIndexType(PtrTy: PtrExpr->getType());
331 const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IntTy: IdxTy, StoreTy: AccessTy);
332
333 // Delegate to the SCEV-based overload, passing through the cache.
334 return getStartAndEndForAccess(Lp, PtrExpr, EltSizeSCEV, BTC, MaxBTC, SE,
335 PointerBounds, DT, AC, LoopGuards);
336}
337
/// Compute loop-invariant [ScStart, ScEnd) bounds covering all locations the
/// access through \p PtrExpr may touch during \p Lp. Results are memoized in
/// \p PointerBounds (keyed by pointer expression and element size) when a
/// cache is provided; {CouldNotCompute, CouldNotCompute} is returned for
/// pointers that are neither loop-invariant nor affine add-recs.
std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
    const Loop *Lp, const SCEV *PtrExpr, const SCEV *EltSizeSCEV,
    const SCEV *BTC, const SCEV *MaxBTC, ScalarEvolution *SE,
    DenseMap<std::pair<const SCEV *, const SCEV *>,
             std::pair<const SCEV *, const SCEV *>> *PointerBounds,
    DominatorTree *DT, AssumptionCache *AC,
    std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
  // Reserve/lookup a cache slot first; a prior result is returned as-is.
  std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
  if (PointerBounds) {
    auto [Iter, Ins] = PointerBounds->insert(
        KV: {{PtrExpr, EltSizeSCEV},
         {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
    if (!Ins)
      return Iter->second;
    PtrBoundsPair = &Iter->second;
  }

  const SCEV *ScStart;
  const SCEV *ScEnd;

  auto &DL = Lp->getHeader()->getDataLayout();
  if (SE->isLoopInvariant(S: PtrExpr, L: Lp)) {
    // Invariant pointer: a single location is accessed every iteration.
    ScStart = ScEnd = PtrExpr;
  } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrExpr)) {
    ScStart = AR->getStart();
    if (!isa<SCEVCouldNotCompute>(Val: BTC))
      // Evaluating AR at an exact BTC is safe: LAA separately checks that
      // accesses cannot wrap in the loop. If evaluating AR at BTC wraps, then
      // the loop either triggers UB when executing a memory access with a
      // poison pointer or the wrapping/poisoned pointer is not used.
      ScEnd = AR->evaluateAtIteration(It: BTC, SE&: *SE);
    else {
      // Evaluating AR at MaxBTC may wrap and create an expression that is less
      // than the start of the AddRec due to wrapping (for example consider
      // MaxBTC = -2). If that's the case, set ScEnd to -(EltSize + 1). ScEnd
      // will get incremented by EltSize before returning, so this effectively
      // sets ScEnd to the maximum unsigned value for the type. Note that LAA
      // separately checks that accesses cannot not wrap, so unsigned max
      // represents an upper bound.
      if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSize: EltSizeSCEV, SE&: *SE, DL,
                                               DT, AC, LoopGuards)) {
        ScEnd = AR->evaluateAtIteration(It: MaxBTC, SE&: *SE);
      } else {
        ScEnd = SE->getAddExpr(
            LHS: SE->getNegativeSCEV(V: EltSizeSCEV),
            RHS: SE->getSCEV(V: ConstantExpr::getIntToPtr(
                C: ConstantInt::getAllOnesValue(Ty: EltSizeSCEV->getType()),
                Ty: AR->getType())));
      }
    }
    const SCEV *Step = AR->getStepRecurrence(SE&: *SE);

    // For expressions with negative step, the upper bound is ScStart and the
    // lower bound is ScEnd.
    if (const auto *CStep = dyn_cast<SCEVConstant>(Val: Step)) {
      if (CStep->getValue()->isNegative())
        std::swap(a&: ScStart, b&: ScEnd);
    } else {
      // Fallback case: the step is not constant, but we can still
      // get the upper and lower bounds of the interval by using min/max
      // expressions.
      ScStart = SE->getUMinExpr(LHS: ScStart, RHS: ScEnd);
      ScEnd = SE->getUMaxExpr(LHS: AR->getStart(), RHS: ScEnd);
    }
  } else
    return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};

  assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");

  // Add the size of the pointed element to ScEnd.
  ScEnd = SE->getAddExpr(LHS: ScEnd, RHS: EltSizeSCEV);

  // Store the result in the cache slot (if caching) and return it.
  std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
  if (PointerBounds)
    *PtrBoundsPair = Res;
  return Res;
}
416
/// Calculate Start and End points of memory access using
/// getStartAndEndForAccess, and record the pointer (with its dependency-set
/// and alias-set ids) for runtime checking.
void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                    Type *AccessTy, bool WritePtr,
                                    unsigned DepSetId, unsigned ASId,
                                    PredicatedScalarEvolution &PSE,
                                    bool NeedsFreeze) {
  const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
  const SCEV *BTC = PSE.getBackedgeTakenCount();
  const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
      Lp, PtrExpr, AccessTy, BTC, MaxBTC: SymbolicMaxBTC, SE: PSE.getSE(),
      PointerBounds: &DC.getPointerBounds(), DT: DC.getDT(), AC: DC.getAC(), LoopGuards);
  // Callers are expected to only insert pointers with computable bounds.
  assert(!isa<SCEVCouldNotCompute>(ScStart) &&
         !isa<SCEVCouldNotCompute>(ScEnd) &&
         "must be able to compute both start and end expressions");
  Pointers.emplace_back(Args&: Ptr, Args: ScStart, Args: ScEnd, Args&: WritePtr, Args&: DepSetId, Args&: ASId, Args&: PtrExpr,
                        Args&: NeedsFreeze);
}
435
/// Try to replace a full overlap check between the (single-member) groups
/// \p CGI and \p CGJ with a cheaper pointer-difference check. On success the
/// check is appended to DiffChecks and true is returned; any bail-out returns
/// false, meaning the regular range-overlap check must be used instead.
bool RuntimePointerChecking::tryToCreateDiffCheck(
    const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
  // If either group contains multiple different pointers, bail out.
  // TODO: Support multiple pointers by using the minimum or maximum pointer,
  // depending on src & sink.
  if (CGI.Members.size() != 1 || CGJ.Members.size() != 1)
    return false;

  const PointerInfo *Src = &Pointers[CGI.Members[0]];
  const PointerInfo *Sink = &Pointers[CGJ.Members[0]];

  // If either pointer is read and written, multiple checks may be needed. Bail
  // out.
  if (!DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: !Src->IsWritePtr).empty() ||
      !DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: !Sink->IsWritePtr).empty())
    return false;

  ArrayRef<unsigned> AccSrc =
      DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: Src->IsWritePtr);
  ArrayRef<unsigned> AccSink =
      DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: Sink->IsWritePtr);
  // If either pointer is accessed multiple times, there may not be a clear
  // src/sink relation. Bail out for now.
  if (AccSrc.size() != 1 || AccSink.size() != 1)
    return false;

  // If the sink is accessed before src, swap src/sink.
  if (AccSink[0] < AccSrc[0])
    std::swap(a&: Src, b&: Sink);

  // Both accesses must be affine add-recs over the innermost loop with the
  // same constant step for the difference check to be meaningful.
  const SCEVConstant *Step;
  const SCEV *SrcStart;
  const SCEV *SinkStart;
  const Loop *InnerLoop = DC.getInnermostLoop();
  if (!match(S: Src->Expr,
             P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SrcStart), Op1: m_SCEVConstant(V&: Step),
                                  L: m_SpecificLoop(L: InnerLoop))) ||
      !match(S: Sink->Expr,
             P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SinkStart), Op1: m_scev_Specific(S: Step),
                                  L: m_SpecificLoop(L: InnerLoop))))
    return false;

  // Scalable access types have no fixed alloc size; bail out.
  SmallVector<Instruction *, 4> SrcInsts =
      DC.getInstructionsForAccess(Ptr: Src->PointerValue, isWrite: Src->IsWritePtr);
  SmallVector<Instruction *, 4> SinkInsts =
      DC.getInstructionsForAccess(Ptr: Sink->PointerValue, isWrite: Sink->IsWritePtr);
  Type *SrcTy = getLoadStoreType(I: SrcInsts[0]);
  Type *DstTy = getLoadStoreType(I: SinkInsts[0]);
  if (isa<ScalableVectorType>(Val: SrcTy) || isa<ScalableVectorType>(Val: DstTy))
    return false;

  const DataLayout &DL = InnerLoop->getHeader()->getDataLayout();
  unsigned AllocSize =
      std::max(a: DL.getTypeAllocSize(Ty: SrcTy), b: DL.getTypeAllocSize(Ty: DstTy));

  // Only matching constant steps matching the AllocSize are supported at the
  // moment. This simplifies the difference computation. Can be extended in the
  // future.
  if (Step->getAPInt().abs() != AllocSize)
    return false;

  // When counting down, the dependence distance needs to be swapped.
  if (Step->getValue()->isNegative())
    std::swap(a&: SinkStart, b&: SrcStart);

  // The difference check operates on integer addresses.
  const SCEV *SinkStartInt = SE->getPtrToAddrExpr(Op: SinkStart);
  const SCEV *SrcStartInt = SE->getPtrToAddrExpr(Op: SrcStart);
  if (isa<SCEVCouldNotCompute>(Val: SinkStartInt) ||
      isa<SCEVCouldNotCompute>(Val: SrcStartInt))
    return false;

  // If the start values for both Src and Sink also vary according to an outer
  // loop, then it's probably better to avoid creating diff checks because
  // they may not be hoisted. We should instead let llvm::addRuntimeChecks
  // do the expanded full range overlap checks, which can be hoisted.
  if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
      isa<SCEVAddRecExpr>(Val: SinkStartInt) && isa<SCEVAddRecExpr>(Val: SrcStartInt)) {
    auto *SrcStartAR = cast<SCEVAddRecExpr>(Val: SrcStartInt);
    auto *SinkStartAR = cast<SCEVAddRecExpr>(Val: SinkStartInt);
    const Loop *StartARLoop = SrcStartAR->getLoop();
    if (StartARLoop == SinkStartAR->getLoop() &&
        StartARLoop == InnerLoop->getParentLoop() &&
        // If the diff check would already be loop invariant (due to the
        // recurrences being the same), then we prefer to keep the diff checks
        // because they are cheaper.
        SrcStartAR->getStepRecurrence(SE&: *SE) !=
            SinkStartAR->getStepRecurrence(SE&: *SE)) {
      LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
                           "cannot be hoisted out of the outer loop\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
                    << "SrcStart: " << *SrcStartInt << '\n'
                    << "SinkStartInt: " << *SinkStartInt << '\n');
  DiffChecks.emplace_back(Args&: SrcStartInt, Args&: SinkStartInt, Args&: AllocSize,
                          Args: Src->NeedsFreeze || Sink->NeedsFreeze);
  return true;
}
536
537SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
538 SmallVector<RuntimePointerCheck, 4> Checks;
539
540 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
541 for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
542 const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
543 const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];
544
545 if (needsChecking(M: CGI, N: CGJ)) {
546 CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ);
547 Checks.emplace_back(Args: &CGI, Args: &CGJ);
548 }
549 }
550 }
551 return Checks;
552}
553
/// Group the registered pointers using the dependency equivalence classes in
/// \p DepCands, then populate the member list of runtime checks. Must only be
/// called once (Checks must still be empty).
void RuntimePointerChecking::generateChecks(
    MemoryDepChecker::DepCandidates &DepCands) {
  assert(Checks.empty() && "Checks is not empty");
  groupChecks(DepCands);
  Checks = generateChecks();
}
560
561bool RuntimePointerChecking::needsChecking(
562 const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
563 for (const auto &I : M.Members)
564 for (const auto &J : N.Members)
565 if (needsChecking(I, J))
566 return true;
567 return false;
568}
569
570/// Compare \p I and \p J and return the minimum.
571/// Return nullptr in case we couldn't find an answer.
572static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
573 ScalarEvolution *SE) {
574 std::optional<APInt> Diff = SE->computeConstantDifference(LHS: J, RHS: I);
575 if (!Diff)
576 return nullptr;
577 return Diff->isNegative() ? J : I;
578}
579
580bool RuntimeCheckingPtrGroup::addPointer(
581 unsigned Index, const RuntimePointerChecking &RtCheck) {
582 return addPointer(
583 Index, Start: RtCheck.Pointers[Index].Start, End: RtCheck.Pointers[Index].End,
584 AS: RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
585 NeedsFreeze: RtCheck.Pointers[Index].NeedsFreeze, SE&: *RtCheck.SE);
586}
587
588bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
589 const SCEV *End, unsigned AS,
590 bool NeedsFreeze,
591 ScalarEvolution &SE) {
592 assert(AddressSpace == AS &&
593 "all pointers in a checking group must be in the same address space");
594
595 // Compare the starts and ends with the known minimum and maximum
596 // of this set. We need to know how we compare against the min/max
597 // of the set in order to be able to emit memchecks.
598 const SCEV *Min0 = getMinFromExprs(I: Start, J: Low, SE: &SE);
599 if (!Min0)
600 return false;
601
602 const SCEV *Min1 = getMinFromExprs(I: End, J: High, SE: &SE);
603 if (!Min1)
604 return false;
605
606 // Update the low bound expression if we've found a new min value.
607 if (Min0 == Start)
608 Low = Start;
609
610 // Update the high bound expression if we've found a new max value.
611 if (Min1 != End)
612 High = End;
613
614 Members.push_back(Elt: Index);
615 this->NeedsFreeze |= NeedsFreeze;
616 return true;
617}
618
/// Partition the registered pointers into checking groups, greedily merging
/// pointers from the same dependency equivalence class when their bounds have
/// a constant distance. Populates CheckingGroups.
void RuntimePointerChecking::groupChecks(
    MemoryDepChecker::DepCandidates &DepCands) {
  // We build the groups from dependency candidates equivalence classes
  // because:
  //    - We know that pointers in the same equivalence class share
  //      the same underlying object and therefore there is a chance
  //      that we can compare pointers
  //    - We wouldn't be able to merge two pointers for which we need
  //      to emit a memcheck. The classes in DepCands are already
  //      conveniently built such that no two pointers in the same
  //      class need checking against each other.

  // We use the following (greedy) algorithm to construct the groups
  // For every pointer in the equivalence class:
  //   For each existing group:
  //   - if the difference between this pointer and the min/max bounds
  //     of the group is a constant, then make the pointer part of the
  //     group and update the min/max bounds of that group as required.

  CheckingGroups.clear();

  // If we need to check two pointers to the same underlying object
  // with a non-constant difference, we shouldn't perform any pointer
  // grouping with those pointers. This is because we can easily get
  // into cases where the resulting check would return false, even when
  // the accesses are safe.
  //
  // The following example shows this:
  // for (i = 0; i < 1000; ++i)
  //   a[5000 + i * m] = a[i] + a[i + 9000]
  //
  // Here grouping gives a check of (5000, 5000 + 1000 * m) against
  // (0, 10000) which is always false. However, if m is 1, there is no
  // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
  // us to perform an accurate check in this case.
  //
  // In the above case, we have a non-constant distance and an Unknown
  // dependence between accesses to the same underlying object, and could retry
  // with runtime checks without dependency information being available. In this
  // case we will use the fallback path and create separate checking groups for
  // accesses not present in DepCands.

  unsigned TotalComparisons = 0;

  // Map each pointer value to the indices of its (possibly multiple) entries
  // in Pointers.
  DenseMap<Value *, SmallVector<unsigned>> PositionMap;
  for (unsigned Index = 0; Index < Pointers.size(); ++Index)
    PositionMap[Pointers[Index].PointerValue].push_back(Elt: Index);

  // We need to keep track of what pointers we've already seen so we
  // don't process them twice.
  SmallSet<unsigned, 2> Seen;

  // Go through all equivalence classes, get the "pointer check groups"
  // and add them to the overall solution. We use the order in which accesses
  // appear in 'Pointers' to enforce determinism.
  for (unsigned I = 0; I < Pointers.size(); ++I) {
    // We've seen this pointer before, and therefore already processed
    // its equivalence class.
    if (Seen.contains(V: I))
      continue;

    MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
                                           Pointers[I].IsWritePtr);

    // If there is no entry in the dependency partition, there are no potential
    // accesses to merge; simply add a new pointer checking group.
    if (!DepCands.contains(V: Access)) {
      CheckingGroups.push_back(Elt: RuntimeCheckingPtrGroup(I, *this));
      continue;
    }

    SmallVector<RuntimeCheckingPtrGroup, 2> Groups;

    // Because DepCands is constructed by visiting accesses in the order in
    // which they appear in alias sets (which is deterministic) and the
    // iteration order within an equivalence class member is only dependent on
    // the order in which unions and insertions are performed on the
    // equivalence class, the iteration order is deterministic.
    for (auto M : DepCands.members(V: Access)) {
      auto PointerI = PositionMap.find(Val: M.getPointer());
      // If we can't find the pointer in PositionMap that means we can't
      // generate a memcheck for it.
      if (PointerI == PositionMap.end())
        continue;
      for (unsigned Pointer : PointerI->second) {
        bool Merged = false;
        // Mark this pointer as seen.
        Seen.insert(V: Pointer);

        // Go through all the existing sets and see if we can find one
        // which can include this pointer.
        for (RuntimeCheckingPtrGroup &Group : Groups) {
          // Don't perform more than a certain amount of comparisons.
          // This should limit the cost of grouping the pointers to something
          // reasonable.  If we do end up hitting this threshold, the algorithm
          // will create separate groups for all remaining pointers.
          if (TotalComparisons > MemoryCheckMergeThreshold)
            break;

          TotalComparisons++;

          if (Group.addPointer(Index: Pointer, RtCheck: *this)) {
            Merged = true;
            break;
          }
        }

        if (!Merged)
          // We couldn't add this pointer to any existing set or the threshold
          // for the number of comparisons has been reached. Create a new group
          // to hold the current pointer.
          Groups.emplace_back(Args&: Pointer, Args&: *this);
      }
    }

    // We've computed the grouped checks for this partition.
    // Save the results and continue with the next one.
    llvm::append_range(C&: CheckingGroups, R&: Groups);
  }
}
739
740bool RuntimePointerChecking::arePointersInSamePartition(
741 const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
742 unsigned PtrIdx2) {
743 return (PtrToPartition[PtrIdx1] != -1 &&
744 PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
745}
746
747bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
748 const PointerInfo &PointerI = Pointers[I];
749 const PointerInfo &PointerJ = Pointers[J];
750
751 // No need to check if two readonly pointers intersect.
752 if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
753 return false;
754
755 // Only need to check pointers between two different dependency sets.
756 if (PointerI.DependencySetId == PointerJ.DependencySetId)
757 return false;
758
759 // Only need to check pointers in the same alias set.
760 return PointerI.AliasSetId == PointerJ.AliasSetId;
761}
762
763/// Assign each RuntimeCheckingPtrGroup pointer an index for stable UTC output.
764static DenseMap<const RuntimeCheckingPtrGroup *, unsigned>
765getPtrToIdxMap(ArrayRef<RuntimeCheckingPtrGroup> CheckingGroups) {
766 DenseMap<const RuntimeCheckingPtrGroup *, unsigned> PtrIndices;
767 for (const auto &[Idx, CG] : enumerate(First&: CheckingGroups))
768 PtrIndices[&CG] = Idx;
769 return PtrIndices;
770}
771
772void RuntimePointerChecking::printChecks(
773 raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
774 unsigned Depth) const {
775 unsigned N = 0;
776 auto PtrIndices = getPtrToIdxMap(CheckingGroups);
777 for (const auto &[Check1, Check2] : Checks) {
778 const auto &First = Check1->Members, &Second = Check2->Members;
779 OS.indent(NumSpaces: Depth) << "Check " << N++ << ":\n";
780 OS.indent(NumSpaces: Depth + 2) << "Comparing group GRP" << PtrIndices.at(Val: Check1)
781 << ":\n";
782 for (unsigned K : First)
783 OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n";
784 OS.indent(NumSpaces: Depth + 2) << "Against group GRP" << PtrIndices.at(Val: Check2)
785 << ":\n";
786 for (unsigned K : Second)
787 OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n";
788 }
789}
790
791void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
792
793 OS.indent(NumSpaces: Depth) << "Run-time memory checks:\n";
794 printChecks(OS, Checks, Depth);
795
796 OS.indent(NumSpaces: Depth) << "Grouped accesses:\n";
797 auto PtrIndices = getPtrToIdxMap(CheckingGroups);
798 for (const auto &CG : CheckingGroups) {
799 OS.indent(NumSpaces: Depth + 2) << "Group GRP" << PtrIndices.at(Val: &CG) << ":\n";
800 OS.indent(NumSpaces: Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
801 << ")\n";
802 for (unsigned Member : CG.Members) {
803 OS.indent(NumSpaces: Depth + 6) << "Member: " << *Pointers[Member].Expr << "\n";
804 }
805 }
806}
807
namespace {

/// Analyses memory accesses in a loop.
///
/// Checks whether run time pointer checks are needed and builds sets for data
/// dependence checking.
class AccessAnalysis {
public:
  /// Read or write access location. The pointer int encodes whether the
  /// access is a write (true) or a read (false).
  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;

  AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
                 DominatorTree &DT, MemoryDepChecker::DepCandidates &DA,
                 PredicatedScalarEvolution &PSE,
                 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
      : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA),
        PSE(PSE), LoopAliasScopes(LoopAliasScopes) {
    // We're analyzing dependences across loop iterations.
    BAA.enableCrossIterationMode();
  }

  /// Register a load and whether it is only read from.
  void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
    Value *Ptr = const_cast<Value *>(Loc.Ptr);
    // Widen the location to cover all iterations before handing it to AST.
    AST.add(Loc: adjustLoc(Loc));
    Accesses[MemAccessInfo(Ptr, false)].insert(X: AccessTy);
    if (IsReadOnly)
      ReadOnlyPtr.insert(Ptr);
  }

  /// Register a store.
  void addStore(const MemoryLocation &Loc, Type *AccessTy) {
    Value *Ptr = const_cast<Value *>(Loc.Ptr);
    // Widen the location to cover all iterations before handing it to AST.
    AST.add(Loc: adjustLoc(Loc));
    Accesses[MemAccessInfo(Ptr, true)].insert(X: AccessTy);
  }

  /// Check if we can emit a run-time no-alias check for \p Access.
  ///
  /// Returns true if we can emit a run-time no alias check for \p Access.
  /// If we can check this access, this also adds it to a dependence set and
  /// adds a run-time to check for it to \p RtCheck. If \p Assume is true,
  /// we will attempt to use additional run-time checks in order to get
  /// the bounds of the pointer.
  bool createCheckForAccess(RuntimePointerChecking &RtCheck,
                            MemAccessInfo Access, Type *AccessTy,
                            const DenseMap<Value *, const SCEV *> &Strides,
                            DenseMap<Value *, unsigned> &DepSetId,
                            Loop *TheLoop, unsigned &RunningDepId,
                            unsigned ASId, bool Assume);

  /// Check whether we can check the pointers at runtime for
  /// non-intersection.
  ///
  /// Returns true if we need no check or if we do and we can generate them
  /// (i.e. the pointers have computable bounds). A return value of false means
  /// we couldn't analyze and generate runtime checks for all pointers in the
  /// loop, but if \p AllowPartial is set then we will have checks for those
  /// pointers we could analyze. \p DepChecker is used to remove unknown
  /// dependences from DepCands.
  bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, Loop *TheLoop,
                       const DenseMap<Value *, const SCEV *> &Strides,
                       Value *&UncomputablePtr, bool AllowPartial,
                       const MemoryDepChecker &DepChecker);

  /// Goes over all memory accesses, checks whether a RT check is needed
  /// and builds sets of dependent accesses.
  void buildDependenceSets() {
    processMemAccesses();
  }

  /// Initial processing of memory accesses determined that we need to
  /// perform dependency checking.
  ///
  /// Note that this can later be cleared if we retry memcheck analysis without
  /// dependency checking (i.e. ShouldRetryWithRuntimeChecks).
  bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); }

  /// We decided that no dependence analysis would be used. Reset the state.
  void resetDepChecks(MemoryDepChecker &DepChecker) {
    CheckDeps.clear();
    DepChecker.clearDependences();
  }

  const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; }

private:
  typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;

  /// Adjust the MemoryLocation so that it represents accesses to this
  /// location across all iterations, rather than a single one.
  MemoryLocation adjustLoc(MemoryLocation Loc) const {
    // The accessed location varies within the loop, but remains within the
    // underlying object.
    Loc.Size = LocationSize::beforeOrAfterPointer();
    Loc.AATags.Scope = adjustAliasScopeList(ScopeList: Loc.AATags.Scope);
    Loc.AATags.NoAlias = adjustAliasScopeList(ScopeList: Loc.AATags.NoAlias);
    return Loc;
  }

  /// Drop alias scopes that are only valid within a single loop iteration.
  MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
    if (!ScopeList)
      return nullptr;

    // For the sake of simplicity, drop the whole scope list if any scope is
    // iteration-local.
    if (any_of(Range: ScopeList->operands(), P: [&](Metadata *Scope) {
          return LoopAliasScopes.contains(Ptr: cast<MDNode>(Val: Scope));
        }))
      return nullptr;

    return ScopeList;
  }

  /// Go over all memory access and check whether runtime pointer checks
  /// are needed and build sets of dependency check candidates.
  void processMemAccesses();

  /// Map of all accesses. Values are the types used to access memory pointed to
  /// by the pointer.
  PtrAccessMap Accesses;

  /// The loop being checked.
  const Loop *TheLoop;

  /// List of accesses that need a further dependence check.
  MemAccessInfoList CheckDeps;

  /// Set of pointers that are read only.
  SmallPtrSet<Value*, 16> ReadOnlyPtr;

  /// Batched alias analysis results.
  BatchAAResults BAA;

  /// An alias set tracker to partition the access set by underlying object and
  /// intrinsic property (such as TBAA metadata).
  AliasSetTracker AST;

  /// The LoopInfo of the loop being checked.
  const LoopInfo *LI;

  /// The dominator tree of the function.
  DominatorTree &DT;

  /// Sets of potentially dependent accesses - members of one set share an
  /// underlying pointer. The set "CheckDeps" identifies which sets really need
  /// a dependence check.
  MemoryDepChecker::DepCandidates &DepCands;

  /// Initial processing of memory accesses determined that we may need
  /// to add memchecks. Perform the analysis to determine the necessary checks.
  ///
  /// Note that this is different from isDependencyCheckNeeded. When we retry
  /// memcheck analysis without dependency checking
  /// (i.e. ShouldRetryWithRuntimeChecks), isDependencyCheckNeeded is
  /// cleared while this remains set if we have potentially dependent accesses.
  bool IsRTCheckAnalysisNeeded = false;

  /// The SCEV predicate containing all the SCEV-related assumptions.
  PredicatedScalarEvolution &PSE;

  /// Maps pointer values to their underlying objects. NOTE(review): populated
  /// and consumed outside this excerpt; semantics inferred from the name —
  /// confirm at the use sites.
  DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;

  /// Alias scopes that are declared inside the loop, and as such not valid
  /// across iterations.
  SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
};

} // end anonymous namespace
979
980/// Try to compute a constant stride for \p AR. Used by getPtrStride and
981/// isNoWrap.
982static std::optional<int64_t>
983getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
984 Value *Ptr, PredicatedScalarEvolution &PSE) {
985 if (isa<ScalableVectorType>(Val: AccessTy)) {
986 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
987 << "\n");
988 return std::nullopt;
989 }
990
991 // The access function must stride over the innermost loop.
992 if (Lp != AR->getLoop()) {
993 LLVM_DEBUG({
994 dbgs() << "LAA: Bad stride - Not striding over innermost loop ";
995 if (Ptr)
996 dbgs() << *Ptr << " ";
997
998 dbgs() << "SCEV: " << *AR << "\n";
999 });
1000 return std::nullopt;
1001 }
1002
1003 // Check the step is constant.
1004 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
1005
1006 // Calculate the pointer stride and check if it is constant.
1007 const APInt *APStepVal;
1008 if (!match(S: Step, P: m_scev_APInt(C&: APStepVal))) {
1009 LLVM_DEBUG({
1010 dbgs() << "LAA: Bad stride - Not a constant strided ";
1011 if (Ptr)
1012 dbgs() << *Ptr << " ";
1013 dbgs() << "SCEV: " << *AR << "\n";
1014 });
1015 return std::nullopt;
1016 }
1017
1018 const auto &DL = Lp->getHeader()->getDataLayout();
1019 TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy);
1020 int64_t Size = AllocSize.getFixedValue();
1021
1022 // Huge step value - give up.
1023 std::optional<int64_t> StepVal = APStepVal->trySExtValue();
1024 if (!StepVal)
1025 return std::nullopt;
1026
1027 // Strided access.
1028 return *StepVal % Size ? std::nullopt : std::make_optional(t: *StepVal / Size);
1029}
1030
/// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use
/// information from the IR pointer value to determine no-wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
                     Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
                     const DominatorTree &DT,
                     std::optional<int64_t> Stride = std::nullopt) {
  // Cheapest case first: the AddRec already carries a no-wrap flag.
  // FIXME: This should probably only return true for NUW.
  if (AR->getNoWrapFlags(Mask: SCEV::NoWrapMask))
    return true;

  // A previously recorded PSE no-overflow predicate for this pointer also
  // suffices.
  if (Ptr && PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW))
    return true;

  // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap,
  // the distance between the previously accessed location and the wrapped
  // location will be larger than half the pointer index type space. In that
  // case, the GEP would be poison and any memory access dependent on it would
  // be immediate UB when executed.
  if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Val: Ptr);
      GEP && GEP->hasNoUnsignedSignedWrap()) {
    // For the above reasoning to apply, the pointer must be dereferenced in
    // every iteration. That holds trivially for a single-block loop, otherwise
    // require some load/store user of the GEP inside the loop in a block that
    // needs no predication (i.e. executes unconditionally every iteration).
    if (L->getHeader() == L->getLoopLatch() ||
        any_of(Range: GEP->users(), P: [L, &DT, GEP](User *U) {
          if (getLoadStorePointerOperand(V: U) != GEP)
            return false;
          BasicBlock *UserBB = cast<Instruction>(Val: U)->getParent();
          if (!L->contains(BB: UserBB))
            return false;
          return !LoopAccessInfo::blockNeedsPredication(BB: UserBB, TheLoop: L, DT: &DT);
        }))
      return true;
  }

  // Compute the stride lazily if the caller did not supply one.
  if (!Stride)
    Stride = getStrideFromAddRec(AR, Lp: L, AccessTy, Ptr, PSE);
  if (Stride) {
    // If the null pointer is undefined, then a access sequence which would
    // otherwise access it can be assumed not to unsigned wrap. Note that this
    // assumes the object in memory is aligned to the natural alignment.
    unsigned AddrSpace = AR->getType()->getPointerAddressSpace();
    if (!NullPointerIsDefined(F: L->getHeader()->getParent(), AS: AddrSpace) &&
        (Stride == 1 || Stride == -1))
      return true;
  }

  // Last resort: if allowed, record a run-time no-overflow assumption in PSE
  // and report no-wrap.
  if (Ptr && Assume) {
    PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW);
    LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n"
                      << "LAA: Pointer: " << *Ptr << "\n"
                      << "LAA: SCEV: " << *AR << "\n"
                      << "LAA: Added an overflow assumption\n");
    return true;
  }

  return false;
}
1088
1089static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
1090 function_ref<void(Value *)> AddPointer) {
1091 SmallPtrSet<Value *, 8> Visited;
1092 SmallVector<Value *> WorkList;
1093 WorkList.push_back(Elt: StartPtr);
1094
1095 while (!WorkList.empty()) {
1096 Value *Ptr = WorkList.pop_back_val();
1097 if (!Visited.insert(Ptr).second)
1098 continue;
1099 auto *PN = dyn_cast<PHINode>(Val: Ptr);
1100 // SCEV does not look through non-header PHIs inside the loop. Such phis
1101 // can be analyzed by adding separate accesses for each incoming pointer
1102 // value.
1103 if (PN && InnermostLoop.contains(BB: PN->getParent()) &&
1104 PN->getParent() != InnermostLoop.getHeader()) {
1105 llvm::append_range(C&: WorkList, R: PN->incoming_values());
1106 } else
1107 AddPointer(Ptr);
1108 }
1109}
1110
// Walk back through the IR for a pointer, looking for a select like the
// following:
//
//  %offset = select i1 %cmp, i64 %a, i64 %b
//  %addr = getelementptr double, double* %base, i64 %offset
//  %ld = load double, double* %addr, align 8
//
// We won't be able to form a single SCEVAddRecExpr from this since the
// address for each loop iteration depends on %cmp. We could potentially
// produce multiple valid SCEVAddRecExprs, though, and check all of them for
// memory safety/aliasing if needed.
//
// If we encounter some IR we don't yet handle, or something obviously fine
// like a constant, then we just add the SCEV for that term to the list passed
// in by the caller. If we have a node that may potentially yield a valid
// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
// ourselves before adding to the list.
static void findForkedSCEVs(
    ScalarEvolution *SE, const Loop *L, Value *Ptr,
    SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
    unsigned Depth) {
  // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
  // we've exceeded our limit on recursion, just return whatever we have
  // regardless of whether it can be used for a forked pointer or not, along
  // with an indication of whether it might be a poison or undef value.
  const SCEV *Scev = SE->getSCEV(V: Ptr);
  if (isa<SCEVAddRecExpr>(Val: Scev) || L->isLoopInvariant(V: Ptr) ||
      !isa<Instruction>(Val: Ptr) || Depth == 0) {
    ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
    return;
  }

  // All recursive calls below descend one level.
  Depth--;

  // Extracts the "may be undef/poison" bit stored alongside each SCEV.
  auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
    return get<1>(Pair: S);
  };

  // Builds the SCEV for a two-operand add/sub from its operand SCEVs.
  auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
    switch (Opcode) {
    case Instruction::Add:
      return SE->getAddExpr(LHS: L, RHS: R);
    case Instruction::Sub:
      return SE->getMinusSCEV(LHS: L, RHS: R);
    default:
      llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
    }
  };

  Instruction *I = cast<Instruction>(Val: Ptr);
  unsigned Opcode = I->getOpcode();
  switch (Opcode) {
  case Instruction::GetElementPtr: {
    auto *GEP = cast<GetElementPtrInst>(Val: I);
    Type *SourceTy = GEP->getSourceElementType();
    // We only handle base + single offset GEPs here for now.
    // Not dealing with preexisting gathers yet, so no vectors.
    if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
      ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: GEP));
      break;
    }
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
    findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: BaseScevs, Depth);
    findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: OffsetScevs, Depth);

    // See if we need to freeze our fork...
    bool NeedsFreeze = any_of(Range&: BaseScevs, P: UndefPoisonCheck) ||
                       any_of(Range&: OffsetScevs, P: UndefPoisonCheck);

    // Check that we only have a single fork, on either the base or the offset.
    // Copy the SCEV across for the one without a fork in order to generate
    // the full SCEV for both sides of the GEP.
    if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
      BaseScevs.push_back(Elt: BaseScevs[0]);
    else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
      OffsetScevs.push_back(Elt: OffsetScevs[0]);
    else {
      // Zero forks or more than one fork: give up and emit the plain SCEV.
      ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze);
      break;
    }

    Type *IntPtrTy = SE->getEffectiveSCEVType(Ty: GEP->getPointerOperandType());

    // Find the size of the type being pointed to. We only have a single
    // index term (guarded above) so we don't need to index into arrays or
    // structures, just get the size of the scalar value.
    const SCEV *Size = SE->getSizeOfExpr(IntTy: IntPtrTy, AllocTy: SourceTy);

    for (auto [B, O] : zip(t&: BaseScevs, u&: OffsetScevs)) {
      const SCEV *Base = get<0>(Pair: B);
      const SCEV *Offset = get<0>(Pair: O);

      // Scale up the offsets by the size of the type, then add to the bases.
      const SCEV *Scaled =
          SE->getMulExpr(LHS: Size, RHS: SE->getTruncateOrSignExtend(V: Offset, Ty: IntPtrTy));
      ScevList.emplace_back(Args: SE->getAddExpr(LHS: Base, RHS: Scaled), Args&: NeedsFreeze);
    }
    break;
  }
  case Instruction::Select: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A select means we've found a forked pointer, but we currently only
    // support a single select per pointer so if there's another behind this
    // then we just bail out and return the generic SCEV.
    findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth);
    findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 2), ScevList&: ChildScevs, Depth);
    if (ChildScevs.size() == 2)
      append_range(C&: ScevList, R&: ChildScevs);
    else
      ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
    break;
  }
  case Instruction::PHI: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A phi means we've found a forked pointer, but we currently only
    // support a single phi per pointer so if there's another behind this
    // then we just bail out and return the generic SCEV.
    if (I->getNumOperands() == 2) {
      findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: ChildScevs, Depth);
      findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth);
    }
    if (ChildScevs.size() == 2)
      append_range(C&: ScevList, R&: ChildScevs);
    else
      ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
    break;
  }
  case Instruction::Add:
  case Instruction::Sub: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs;
    findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: LScevs, Depth);
    findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: RScevs, Depth);

    // See if we need to freeze our fork...
    bool NeedsFreeze =
        any_of(Range&: LScevs, P: UndefPoisonCheck) || any_of(Range&: RScevs, P: UndefPoisonCheck);

    // Check that we only have a single fork, on either the left or right side.
    // Copy the SCEV across for the one without a fork in order to generate
    // the full SCEV for both sides of the BinOp.
    if (LScevs.size() == 2 && RScevs.size() == 1)
      RScevs.push_back(Elt: RScevs[0]);
    else if (RScevs.size() == 2 && LScevs.size() == 1)
      LScevs.push_back(Elt: LScevs[0]);
    else {
      // Zero forks or more than one fork: give up and emit the plain SCEV.
      ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze);
      break;
    }

    for (auto [L, R] : zip(t&: LScevs, u&: RScevs))
      ScevList.emplace_back(Args: GetBinOpExpr(Opcode, get<0>(Pair: L), get<0>(Pair: R)),
                            Args&: NeedsFreeze);
    break;
  }
  default:
    // Just return the current SCEV if we haven't handled the instruction yet.
    LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
    ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
    break;
  }
}
1274
bool AccessAnalysis::createCheckForAccess(
    RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy,
    const DenseMap<Value *, const SCEV *> &StridesMap,
    DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop,
    unsigned &RunningDepId, unsigned ASId, bool Assume) {
  Value *Ptr = Access.getPointer();
  ScalarEvolution *SE = PSE.getSE();
  assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");

  // Decompose the pointer: forked pointers (select/phi of two addresses)
  // yield two candidate SCEVs, anything else yields one.
  SmallVector<PointerIntPair<const SCEV *, 1, bool>> RTCheckPtrs;
  findForkedSCEVs(SE, L: TheLoop, Ptr, ScevList&: RTCheckPtrs, Depth: MaxForkedSCEVDepth);
  assert(!RTCheckPtrs.empty() &&
         "Must have some runtime-check pointer candidates");

  // RTCheckPtrs must have size 2 if there are forked pointers. Otherwise, there
  // are no forked pointers; replaceSymbolicStridesSCEV in this case.
  auto IsLoopInvariantOrAR =
      [&SE, &TheLoop](const PointerIntPair<const SCEV *, 1, bool> &P) {
        return SE->isLoopInvariant(S: P.getPointer(), L: TheLoop) ||
               isa<SCEVAddRecExpr>(Val: P.getPointer());
      };
  if (RTCheckPtrs.size() == 2 && all_of(Range&: RTCheckPtrs, P: IsLoopInvariantOrAR)) {
    LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n";
               for (const auto &[Idx, Q] : enumerate(RTCheckPtrs)) dbgs()
               << "\t(" << Idx << ") " << *Q.getPointer() << "\n");
  } else {
    // Unforked (or unusable fork): fall back to the single symbolic-stride
    // rewritten SCEV for Ptr. The 'false' bit means no freeze is needed.
    RTCheckPtrs = {{replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr), false}};
  }

  /// Check whether all pointers can participate in a runtime bounds check. They
  /// must either be invariant or non-wrapping affine AddRecs.
  for (auto &P : RTCheckPtrs) {
    // The bounds for loop-invariant pointer is trivial.
    if (SE->isLoopInvariant(S: P.getPointer(), L: TheLoop))
      continue;

    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: P.getPointer());
    // With Assume set, PSE may add predicates to coerce Ptr into an AddRec.
    if (!AR && Assume)
      AR = PSE.getAsAddRec(V: Ptr);
    if (!AR || !AR->isAffine())
      return false;

    // If there's only one option for Ptr, look it up after bounds and wrap
    // checking, because assumptions might have been added to PSE.
    if (RTCheckPtrs.size() == 1) {
      AR =
          cast<SCEVAddRecExpr>(Val: replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr));
      P.setPointer(AR);
    }

    // Wrapping pointers cannot be used for runtime bounds; isNoWrap may also
    // record new PSE assumptions when Assume is set.
    if (!isNoWrap(PSE, AR, Ptr: RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
                  L: TheLoop, Assume, DT))
      return false;
  }

  // All candidates are usable: assign each a dependence set id (shared with
  // the equivalence-class leader when one exists) and register the runtime
  // check.
  for (const auto &[PtrExpr, NeedsFreeze] : RTCheckPtrs) {
    // The id of the dependence set.
    unsigned DepId;

    if (DepCands.contains(V: Access)) {
      Value *Leader = DepCands.getLeaderValue(V: Access).getPointer();
      unsigned &LeaderId = DepSetId[Leader];
      if (!LeaderId)
        LeaderId = RunningDepId++;
      DepId = LeaderId;
    } else
      // Each access has its own dependence set.
      DepId = RunningDepId++;

    bool IsWrite = Access.getInt();
    RtCheck.insert(Lp: TheLoop, Ptr, PtrExpr, AccessTy, WritePtr: IsWrite, DepSetId: DepId, ASId, PSE,
                   NeedsFreeze);
    LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
  }

  return true;
}
1352
1353bool AccessAnalysis::canCheckPtrAtRT(
1354 RuntimePointerChecking &RtCheck, Loop *TheLoop,
1355 const DenseMap<Value *, const SCEV *> &StridesMap, Value *&UncomputablePtr,
1356 bool AllowPartial, const MemoryDepChecker &DepChecker) {
1357 // Find pointers with computable bounds. We are going to use this information
1358 // to place a runtime bound check.
1359 bool CanDoRT = true;
1360
1361 bool MayNeedRTCheck = false;
1362 if (!IsRTCheckAnalysisNeeded) return true;
1363
1364 if (auto *Deps = DepChecker.getDependences()) {
1365 // If there are unknown dependences, this means runtime checks are needed to
1366 // ensure there's no overlap between accesses to the same underlying object.
1367 // Remove the equivalence classes containing both source and destination
1368 // accesses from DepCands. This ensures runtime checks will be generated
1369 // between those accesses and prevents them from being grouped together.
1370 for (const auto &Dep : *Deps) {
1371 if (Dep.Type != MemoryDepChecker::Dependence::Unknown) {
1372 assert(MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) ==
1373 MemoryDepChecker::VectorizationSafetyStatus::Safe &&
1374 "Should only skip safe dependences");
1375 continue;
1376 }
1377 Instruction *Src = Dep.getSource(DepChecker);
1378 Instruction *Dst = Dep.getDestination(DepChecker);
1379 DepCands.eraseClass(V: {getPointerOperand(V: Src), Src->mayWriteToMemory()});
1380 DepCands.eraseClass(V: {getPointerOperand(V: Dst), Dst->mayWriteToMemory()});
1381 }
1382 } else {
1383 CheckDeps.clear();
1384 DepCands = {};
1385 }
1386
1387 // We assign a consecutive id to access from different alias sets.
1388 // Accesses between different groups doesn't need to be checked.
1389 unsigned ASId = 0;
1390 for (const auto &AS : AST) {
1391 int NumReadPtrChecks = 0;
1392 int NumWritePtrChecks = 0;
1393 bool CanDoAliasSetRT = true;
1394 ++ASId;
1395 auto ASPointers = AS.getPointers();
1396
1397 // We assign consecutive id to access from different dependence sets.
1398 // Accesses within the same set don't need a runtime check.
1399 unsigned RunningDepId = 1;
1400 DenseMap<Value *, unsigned> DepSetId;
1401
1402 SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;
1403
1404 // First, count how many write and read accesses are in the alias set. Also
1405 // collect MemAccessInfos for later.
1406 SmallVector<MemAccessInfo, 4> AccessInfos;
1407 for (const Value *ConstPtr : ASPointers) {
1408 Value *Ptr = const_cast<Value *>(ConstPtr);
1409 bool IsWrite = Accesses.contains(Key: MemAccessInfo(Ptr, true));
1410 if (IsWrite)
1411 ++NumWritePtrChecks;
1412 else
1413 ++NumReadPtrChecks;
1414 AccessInfos.emplace_back(Args&: Ptr, Args&: IsWrite);
1415 }
1416
1417 // We do not need runtime checks for this alias set, if there are no writes
1418 // or a single write and no reads.
1419 if (NumWritePtrChecks == 0 ||
1420 (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
1421 assert((ASPointers.size() <= 1 ||
1422 all_of(ASPointers,
1423 [this](const Value *Ptr) {
1424 MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
1425 true);
1426 return !DepCands.contains(AccessWrite);
1427 })) &&
1428 "Can only skip updating CanDoRT below, if all entries in AS "
1429 "are reads or there is at most 1 entry");
1430 continue;
1431 }
1432
1433 for (auto &Access : AccessInfos) {
1434 for (const auto &AccessTy : Accesses[Access]) {
1435 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1436 DepSetId, TheLoop, RunningDepId, ASId,
1437 Assume: false)) {
1438 LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
1439 << *Access.getPointer() << '\n');
1440 Retries.emplace_back(Args&: Access, Args: AccessTy);
1441 CanDoAliasSetRT = false;
1442 }
1443 }
1444 }
1445
1446 // Note that this function computes CanDoRT and MayNeedRTCheck
1447 // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
1448 // we have a pointer for which we couldn't find the bounds but we don't
1449 // actually need to emit any checks so it does not matter.
1450 //
1451 // We need runtime checks for this alias set, if there are at least 2
1452 // dependence sets (in which case RunningDepId > 2) or if we need to re-try
1453 // any bound checks (because in that case the number of dependence sets is
1454 // incomplete).
1455 bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
1456
1457 // We need to perform run-time alias checks, but some pointers had bounds
1458 // that couldn't be checked.
1459 if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
1460 // Reset the CanDoSetRt flag and retry all accesses that have failed.
1461 // We know that we need these checks, so we can now be more aggressive
1462 // and add further checks if required (overflow checks).
1463 CanDoAliasSetRT = true;
1464 for (const auto &[Access, AccessTy] : Retries) {
1465 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1466 DepSetId, TheLoop, RunningDepId, ASId,
1467 /*Assume=*/true)) {
1468 CanDoAliasSetRT = false;
1469 UncomputablePtr = Access.getPointer();
1470 if (!AllowPartial)
1471 break;
1472 }
1473 }
1474 }
1475
1476 CanDoRT &= CanDoAliasSetRT;
1477 MayNeedRTCheck |= NeedsAliasSetRTCheck;
1478 ++ASId;
1479 }
1480
1481 // If the pointers that we would use for the bounds comparison have different
1482 // address spaces, assume the values aren't directly comparable, so we can't
1483 // use them for the runtime check. We also have to assume they could
1484 // overlap. In the future there should be metadata for whether address spaces
1485 // are disjoint.
1486 unsigned NumPointers = RtCheck.Pointers.size();
1487 for (unsigned i = 0; i < NumPointers; ++i) {
1488 for (unsigned j = i + 1; j < NumPointers; ++j) {
1489 // Only need to check pointers between two different dependency sets.
1490 if (RtCheck.Pointers[i].DependencySetId ==
1491 RtCheck.Pointers[j].DependencySetId)
1492 continue;
1493 // Only need to check pointers in the same alias set.
1494 if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
1495 continue;
1496
1497 Value *PtrI = RtCheck.Pointers[i].PointerValue;
1498 Value *PtrJ = RtCheck.Pointers[j].PointerValue;
1499
1500 unsigned ASi = PtrI->getType()->getPointerAddressSpace();
1501 unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
1502 if (ASi != ASj) {
1503 LLVM_DEBUG(
1504 dbgs() << "LAA: Runtime check would require comparison between"
1505 " different address spaces\n");
1506 return false;
1507 }
1508 }
1509 }
1510
1511 if (MayNeedRTCheck && (CanDoRT || AllowPartial))
1512 RtCheck.generateChecks(DepCands);
1513
1514 LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
1515 << " pointer comparisons.\n");
1516
1517 // If we can do run-time checks, but there are no checks, no runtime checks
1518 // are needed. This can happen when all pointers point to the same underlying
1519 // object for example.
1520 RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
1521
1522 bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
1523 assert(CanDoRTIfNeeded == (CanDoRT || !MayNeedRTCheck) &&
1524 "CanDoRTIfNeeded depends on RtCheck.Need");
1525 if (!CanDoRTIfNeeded && !AllowPartial)
1526 RtCheck.reset();
1527 return CanDoRTIfNeeded;
1528}
1529
/// Partition the recorded accesses into dependency-candidate equivalence
/// classes and collect the accesses that need a dependence check.
void AccessAnalysis::processMemAccesses() {
  // We process the set twice: first we process read-write pointers, last we
  // process read-only pointers. This allows us to skip dependence tests for
  // read-only pointers.

  LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
  LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
  LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
  LLVM_DEBUG({
    for (const auto &[A, _] : Accesses)
      dbgs() << "\t" << *A.getPointer() << " ("
             << (A.getInt()
                     ? "write"
                     : (ReadOnlyPtr.contains(A.getPointer()) ? "read-only"
                                                             : "read"))
             << ")\n";
  });

  // The AliasSetTracker has nicely partitioned our pointers by metadata
  // compatibility and potential for underlying-object overlap. As a result, we
  // only need to check for potential pointer dependencies within each alias
  // set.
  for (const auto &AS : AST) {
    // Note that both the alias-set tracker and the alias sets themselves used
    // ordered collections internally and so the iteration order here is
    // deterministic.
    auto ASPointers = AS.getPointers();

    // Whether a write to any pointer in this alias set has been seen yet;
    // reads encountered before the first write need no dependence check.
    bool SetHasWrite = false;

    // Map of (pointer to underlying objects, accessed address space) to last
    // access encountered.
    typedef DenseMap<std::pair<const Value *, unsigned>, MemAccessInfo>
        UnderlyingObjToAccessMap;
    UnderlyingObjToAccessMap ObjToLastAccess;

    // Set of access to check after all writes have been processed.
    PtrAccessMap DeferredAccesses;

    // Iterate over each alias set twice, once to process read/write pointers,
    // and then to process read-only pointers.
    for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
      bool UseDeferred = SetIteration > 0;
      // The second iteration only revisits accesses deferred by the first.
      PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;

      for (const Value *ConstPtr : ASPointers) {
        Value *Ptr = const_cast<Value *>(ConstPtr);

        // For a single memory access in AliasSetTracker, Accesses may contain
        // both read and write, and they both need to be handled for CheckDeps.
        for (const auto &[AC, _] : S) {
          if (AC.getPointer() != Ptr)
            continue;

          bool IsWrite = AC.getInt();

          // If we're using the deferred access set, then it contains only
          // reads.
          bool IsReadOnlyPtr = ReadOnlyPtr.contains(Ptr) && !IsWrite;
          if (UseDeferred && !IsReadOnlyPtr)
            continue;
          // Otherwise, the pointer must be in the PtrAccessSet, either as a
          // read or a write.
          assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
                  S.contains(MemAccessInfo(Ptr, false))) &&
                 "Alias-set pointer not in the access set?");

          MemAccessInfo Access(Ptr, IsWrite);
          // Every access starts in its own dependency-candidate class; the
          // classes are unioned below via shared underlying objects.
          DepCands.insert(Data: Access);

          // Memorize read-only pointers for later processing and skip them in
          // the first round (they need to be checked after we have seen all
          // write pointers). Note: we also mark pointer that are not
          // consecutive as "read-only" pointers (so that we check
          // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
          if (!UseDeferred && IsReadOnlyPtr) {
            // We only use the pointer keys, the types vector values don't
            // matter.
            DeferredAccesses.insert(KV: {Access, {}});
            continue;
          }

          // If this is a write - check other reads and writes for conflicts. If
          // this is a read only check other writes for conflicts (but only if
          // there is no other write to the ptr - this is an optimization to
          // catch "a[i] = a[i] + " without having to do a dependence check).
          if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
            CheckDeps.push_back(Elt: Access);
            IsRTCheckAnalysisNeeded = true;
          }

          if (IsWrite)
            SetHasWrite = true;

          // Create sets of pointers connected by a shared alias set and
          // underlying object.
          SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
          // Reset any objects cached for Ptr by a previous computation.
          UOs = {};
          ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI);
          LLVM_DEBUG(dbgs()
                     << "Underlying objects for pointer " << *Ptr << "\n");
          for (const Value *UnderlyingObj : UOs) {
            // nullptr never alias, don't join sets for pointer that have "null"
            // in their UnderlyingObjects list.
            if (isa<ConstantPointerNull>(Val: UnderlyingObj) &&
                !NullPointerIsDefined(
                    F: TheLoop->getHeader()->getParent(),
                    AS: UnderlyingObj->getType()->getPointerAddressSpace()))
              continue;

            auto [It, Inserted] = ObjToLastAccess.try_emplace(
                Key: {UnderlyingObj,
                     cast<PointerType>(Val: Ptr->getType())->getAddressSpace()},
                Args&: Access);
            if (!Inserted) {
              // Chain accesses that share an underlying object (and address
              // space) into a single dependency-candidate class.
              DepCands.unionSets(V1: Access, V2: It->second);
              It->second = Access;
            }

            LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
          }
        }
      }
    }
  }
}
1656
/// Check whether the access through \p Ptr has a constant stride.
///
/// Returns 0 if \p Ptr is loop-invariant in \p Lp, the constant stride
/// computed from the pointer's AddRec if one exists (and, when
/// \p ShouldCheckWrap is set, the pointer provably does not wrap), or
/// std::nullopt otherwise.
std::optional<int64_t>
llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
                   const Loop *Lp, const DominatorTree &DT,
                   const DenseMap<Value *, const SCEV *> &StridesMap,
                   bool Assume, bool ShouldCheckWrap) {
  // First rewrite symbolic strides from StridesMap into the SCEV so pointers
  // with a symbolic stride still get an analyzable form.
  const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr);
  if (PSE.getSE()->isLoopInvariant(S: PtrScev, L: Lp))
    return 0;

  assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");

  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev);
  // If allowed, retry with runtime assumptions (predicates) added to PSE.
  if (Assume && !AR)
    AR = PSE.getAsAddRec(V: Ptr);

  if (!AR) {
    LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
                      << " SCEV: " << *PtrScev << "\n");
    return std::nullopt;
  }

  std::optional<int64_t> Stride =
      getStrideFromAddRec(AR, Lp, AccessTy, Ptr, PSE);
  if (!ShouldCheckWrap || !Stride)
    return Stride;

  // A stride is only usable if the pointer cannot wrap around the address
  // space; otherwise distances derived from it are meaningless.
  if (isNoWrap(PSE, AR, Ptr, AccessTy, L: Lp, Assume, DT, Stride))
    return Stride;

  LLVM_DEBUG(
      dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
             << *Ptr << " SCEV: " << *AR << "\n");
  return std::nullopt;
}
1692
/// Compute the distance from \p PtrA to \p PtrB (i.e. PtrB - PtrA), in units
/// of \p ElemTyA's store size. Returns std::nullopt when the distance cannot
/// be determined as a constant, when address spaces mismatch, or (with
/// \p StrictCheck) when the byte distance is not an exact multiple of the
/// element size.
std::optional<int64_t> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
                                             Type *ElemTyB, Value *PtrB,
                                             const DataLayout &DL,
                                             ScalarEvolution &SE,
                                             bool StrictCheck, bool CheckType) {
  assert(PtrA && PtrB && "Expected non-nullptr pointers.");

  // Make sure that A and B are different pointers.
  if (PtrA == PtrB)
    return 0;

  // Make sure that the element types are the same if required.
  if (CheckType && ElemTyA != ElemTyB)
    return std::nullopt;

  unsigned ASA = PtrA->getType()->getPointerAddressSpace();
  unsigned ASB = PtrB->getType()->getPointerAddressSpace();

  // Check that the address spaces match.
  if (ASA != ASB)
    return std::nullopt;
  unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA);

  // Strip constant offsets from both pointers to try to reach a common base.
  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
  const Value *PtrA1 = PtrA->stripAndAccumulateConstantOffsets(
      DL, Offset&: OffsetA, /*AllowNonInbounds=*/true);
  const Value *PtrB1 = PtrB->stripAndAccumulateConstantOffsets(
      DL, Offset&: OffsetB, /*AllowNonInbounds=*/true);

  std::optional<int64_t> Val;
  if (PtrA1 == PtrB1) {
    // Same base pointer: the distance is the difference of the accumulated
    // constant offsets.
    // Retrieve the address space again as pointer stripping now tracks through
    // `addrspacecast`.
    ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace();
    ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace();
    // Check that the address spaces match and that the pointers are valid.
    if (ASA != ASB)
      return std::nullopt;

    IdxWidth = DL.getIndexSizeInBits(AS: ASA);
    OffsetA = OffsetA.sextOrTrunc(width: IdxWidth);
    OffsetB = OffsetB.sextOrTrunc(width: IdxWidth);

    OffsetB -= OffsetA;
    Val = OffsetB.trySExtValue();
  } else {
    // Otherwise compute the distance with SCEV between the base pointers.
    const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA);
    const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB);
    std::optional<APInt> Diff =
        SE.computeConstantDifference(LHS: PtrSCEVB, RHS: PtrSCEVA);
    if (!Diff)
      return std::nullopt;
    Val = Diff->trySExtValue();
  }

  if (!Val)
    return std::nullopt;

  // Convert the byte distance into a distance in elements of ElemTyA.
  int64_t Size = DL.getTypeStoreSize(Ty: ElemTyA);
  int64_t Dist = *Val / Size;

  // Ensure that the calculated distance matches the type-based one after all
  // the bitcasts removal in the provided pointers.
  if (!StrictCheck || Dist * Size == Val)
    return Dist;
  return std::nullopt;
}
1761
1762bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
1763 const DataLayout &DL, ScalarEvolution &SE,
1764 SmallVectorImpl<unsigned> &SortedIndices) {
1765 assert(llvm::all_of(
1766 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1767 "Expected list of pointer operands.");
1768 // Walk over the pointers, and map each of them to an offset relative to
1769 // first pointer in the array.
1770 Value *Ptr0 = VL[0];
1771
1772 using DistOrdPair = std::pair<int64_t, unsigned>;
1773 auto Compare = llvm::less_first();
1774 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
1775 Offsets.emplace(args: 0, args: 0);
1776 bool IsConsecutive = true;
1777 for (auto [Idx, Ptr] : drop_begin(RangeOrContainer: enumerate(First&: VL))) {
1778 std::optional<int64_t> Diff =
1779 getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE,
1780 /*StrictCheck=*/true);
1781 if (!Diff)
1782 return false;
1783
1784 // Check if the pointer with the same offset is found.
1785 int64_t Offset = *Diff;
1786 auto [It, IsInserted] = Offsets.emplace(args&: Offset, args&: Idx);
1787 if (!IsInserted)
1788 return false;
1789 // Consecutive order if the inserted element is the last one.
1790 IsConsecutive &= std::next(x: It) == Offsets.end();
1791 }
1792 SortedIndices.clear();
1793 if (!IsConsecutive) {
1794 // Fill SortedIndices array only if it is non-consecutive.
1795 SortedIndices.resize(N: VL.size());
1796 for (auto [Idx, Off] : enumerate(First&: Offsets))
1797 SortedIndices[Idx] = Off.second;
1798 }
1799 return true;
1800}
1801
1802/// Returns true if the memory operations \p A and \p B are consecutive.
1803bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
1804 ScalarEvolution &SE, bool CheckType) {
1805 Value *PtrA = getLoadStorePointerOperand(V: A);
1806 Value *PtrB = getLoadStorePointerOperand(V: B);
1807 if (!PtrA || !PtrB)
1808 return false;
1809 Type *ElemTyA = getLoadStoreType(I: A);
1810 Type *ElemTyB = getLoadStoreType(I: B);
1811 std::optional<int64_t> Diff =
1812 getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
1813 /*StrictCheck=*/true, CheckType);
1814 return Diff == 1;
1815}
1816
/// Record the store \p SI: visitPointers invokes the callback for each
/// pointer reached from the store's pointer operand, registering each as a
/// write at the current access index.
void MemoryDepChecker::addAccess(StoreInst *SI) {
  visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop,
                AddPointer: [this, SI](Value *Ptr) {
                  // Keyed by (pointer, is-write); InstMap records the
                  // instruction for each access index in program order.
                  Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx);
                  InstMap.push_back(Elt: SI);
                  ++AccessIdx;
                });
}
1825
/// Record the load \p LI: visitPointers invokes the callback for each
/// pointer reached from the load's pointer operand, registering each as a
/// read at the current access index.
void MemoryDepChecker::addAccess(LoadInst *LI) {
  visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop,
                AddPointer: [this, LI](Value *Ptr) {
                  // Keyed by (pointer, is-write); InstMap records the
                  // instruction for each access index in program order.
                  Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx);
                  InstMap.push_back(Elt: LI);
                  ++AccessIdx;
                });
}
1834
/// Map a dependence type onto the three-way vectorization safety verdict.
MemoryDepChecker::VectorizationSafetyStatus
MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
  switch (Type) {
  // These dependence kinds never block vectorization.
  case NoDep:
  case Forward:
  case BackwardVectorizable:
    return VectorizationSafetyStatus::Safe;

  // Unknown dependences may still be disambiguated with runtime checks.
  case Unknown:
    return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
  // All remaining kinds make vectorization unsafe outright.
  case ForwardButPreventsForwarding:
  case Backward:
  case BackwardVectorizableButPreventsForwarding:
  case IndirectUnsafe:
    return VectorizationSafetyStatus::Unsafe;
  }
  llvm_unreachable("unexpected DepType!");
}
1853
/// Return true if this dependence is of one of the backward kinds (the sink
/// accesses memory the earlier source will touch in a later iteration).
bool MemoryDepChecker::Dependence::isBackward() const {
  switch (Type) {
  case NoDep:
  case Forward:
  case ForwardButPreventsForwarding:
  case Unknown:
  case IndirectUnsafe:
    return false;

  case BackwardVectorizable:
  case Backward:
  case BackwardVectorizableButPreventsForwarding:
    return true;
  }
  llvm_unreachable("unexpected DepType!");
}
1870
/// Conservative superset of isBackward(): also treat dependences whose
/// direction could not be determined (Unknown, IndirectUnsafe) as possibly
/// backward.
bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
  return isBackward() || Type == Unknown || Type == IndirectUnsafe;
}
1874
/// Return true if this dependence is of one of the forward kinds.
bool MemoryDepChecker::Dependence::isForward() const {
  switch (Type) {
  case Forward:
  case ForwardButPreventsForwarding:
    return true;

  case NoDep:
  case Unknown:
  case BackwardVectorizable:
  case Backward:
  case BackwardVectorizableButPreventsForwarding:
  case IndirectUnsafe:
    return false;
  }
  llvm_unreachable("unexpected DepType!");
}
1891
/// Return true if a dependence at byte distance \p Distance could stall
/// store-to-load forwarding for every feasible power-of-two vector factor.
/// As a side effect, may tighten MaxStoreLoadForwardSafeDistanceInBits when
/// only some VFs are problematic.
bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
                                                    uint64_t TypeByteSize,
                                                    unsigned CommonStride) {
  // If loads occur at a distance that is not a multiple of a feasible vector
  // factor store-load forwarding does not take place.
  // Positive dependences might cause troubles because vectorizing them might
  // prevent store-load forwarding making vectorized code run a lot slower.
  //   a[i] = a[i-3] ^ a[i-8];
  // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
  // hence on your typical architecture store-load forwarding does not take
  // place. Vectorizing in such cases does not make sense.
  // Store-load forwarding distance.

  // After this many iterations store-to-load forwarding conflicts should not
  // cause any slowdowns.
  const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
  // Maximum vector factor.
  uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
      std::min(a: VectorizerParams::MaxVectorWidth * TypeByteSize,
               b: MaxStoreLoadForwardSafeDistanceInBits);

  // Compute the smallest VF at which the store and load would be misaligned.
  for (uint64_t VF = 2 * TypeByteSize;
       VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
    // If the number of vector iteration between the store and the load are
    // small we could incur conflicts.
    if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
      // Record the largest conflict-free VF and stop searching.
      MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
      break;
    }
  }

  // Even the smallest candidate VF (two elements) conflicts: report failure.
  if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) {
    LLVM_DEBUG(
        dbgs() << "LAA: Distance " << Distance
               << " that could cause a store-load forwarding conflict\n");
    return true;
  }

  // Only some VFs conflict: remember the tightened safe distance so later
  // queries can cap the vector factor accordingly.
  if (CommonStride &&
      MaxVFWithoutSLForwardIssuesPowerOf2 <
          MaxStoreLoadForwardSafeDistanceInBits &&
      MaxVFWithoutSLForwardIssuesPowerOf2 !=
          VectorizerParams::MaxVectorWidth * TypeByteSize) {
    uint64_t MaxVF =
        bit_floor(Value: MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride);
    uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
    MaxStoreLoadForwardSafeDistanceInBits =
        std::min(a: MaxStoreLoadForwardSafeDistanceInBits, b: MaxVFInBits);
  }
  return false;
}
1944
1945void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1946 if (Status < S)
1947 Status = S;
1948}
1949
/// Given a dependence-distance \p Dist between two memory accesses, that have
/// strides in the same direction whose absolute value of the maximum stride is
/// given in \p MaxStride, in a loop whose maximum backedge taken count is \p
/// MaxBTC, check if it is possible to prove statically that the dependence
/// distance is larger than the range that the accesses will travel through the
/// execution of the loop. If so, return true; false otherwise. This is useful
/// for example in loops such as the following (PR31098):
///
///   for (i = 0; i < D; ++i) {
///     = out[i];
///     out[i+D] =
///   }
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
                                     const SCEV &MaxBTC, const SCEV &Dist,
                                     uint64_t MaxStride) {

  // If we can prove that
  //      (**) |Dist| > MaxBTC * Step
  // where Step is the absolute stride of the memory accesses in bytes,
  // then there is no dependence.
  //
  // Rationale:
  // We basically want to check if the absolute distance (|Dist/Step|)
  // is >= the loop iteration count (or > MaxBTC).
  // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
  // Section 4.2.1); Note, that for vectorization it is sufficient to prove
  // that the dependence distance is >= VF; This is checked elsewhere.
  // But in some cases we can prune dependence distances early, and
  // even before selecting the VF, and without a runtime test, by comparing
  // the distance against the loop iteration count. Since the vectorized code
  // will be executed only if LoopCount >= VF, proving distance >= LoopCount
  // also guarantees that distance >= VF.
  //
  const SCEV *Step = SE.getConstant(Ty: MaxBTC.getType(), V: MaxStride);
  const SCEV *Product = SE.getMulExpr(LHS: &MaxBTC, RHS: Step);

  // Widen the narrower of Dist and Product to a common type before comparing.
  const SCEV *CastedDist = &Dist;
  const SCEV *CastedProduct = Product;
  uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType());
  uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType());

  // The dependence distance can be positive/negative, so we sign extend Dist;
  // The multiplication of the absolute stride in bytes and the
  // backedgeTakenCount is non-negative, so we zero extend Product.
  if (DistTypeSizeBits > ProductTypeSizeBits)
    CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType());
  else
    CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType());

  // Is  Dist - (MaxBTC * Step) > 0 ?
  // (If so, then we have proven (**) because |Dist| >= Dist)
  const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct);
  if (SE.isKnownPositive(S: Minus))
    return true;

  // Second try: Is  -Dist - (MaxBTC * Step) > 0 ?
  // (If so, then we have proven (**) because |Dist| >= -1*Dist)
  const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist);
  Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct);
  return SE.isKnownPositive(S: Minus);
}
2011
2012/// Check the dependence for two accesses with the same stride \p Stride.
2013/// \p Distance is the positive distance in bytes, and \p TypeByteSize is type
2014/// size in bytes.
2015///
2016/// \returns true if they are independent.
2017static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
2018 uint64_t TypeByteSize) {
2019 assert(Stride > 1 && "The stride must be greater than 1");
2020 assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
2021 assert(Distance > 0 && "The distance must be non-zero");
2022
2023 // Skip if the distance is not multiple of type byte size.
2024 if (Distance % TypeByteSize)
2025 return false;
2026
2027 // No dependence if the distance is not multiple of the stride.
2028 // E.g.
2029 // for (i = 0; i < 1024 ; i += 4)
2030 // A[i+2] = A[i] + 1;
2031 //
2032 // Two accesses in memory (distance is 2, stride is 4):
2033 // | A[0] | | | | A[4] | | | |
2034 // | | | A[2] | | | | A[6] | |
2035 //
2036 // E.g.
2037 // for (i = 0; i < 1024 ; i += 3)
2038 // A[i+4] = A[i] + 1;
2039 //
2040 // Two accesses in memory (distance is 4, stride is 3):
2041 // | A[0] | | | A[3] | | | A[6] | | |
2042 // | | | | | A[4] | | | A[7] | |
2043 return Distance % Stride;
2044}
2045
/// Return true if we can statically prove that the address ranges accessed
/// through \p Src and \p Sink over the whole loop do not overlap, i.e. one
/// access's range ends at or before the other's begins.
bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src,
                                                          Type *SrcTy,
                                                          const SCEV *Sink,
                                                          Type *SinkTy) {
  const SCEV *BTC = PSE.getBackedgeTakenCount();
  const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
  ScalarEvolution &SE = *PSE.getSE();
  // Compute the start and end bounds of each access across all loop
  // iterations; bail out if either bound cannot be computed.
  const auto &[SrcStart_, SrcEnd_] =
      getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Src, AccessTy: SrcTy, BTC, MaxBTC: SymbolicMaxBTC,
                              SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards);
  if (isa<SCEVCouldNotCompute>(Val: SrcStart_) || isa<SCEVCouldNotCompute>(Val: SrcEnd_))
    return false;

  const auto &[SinkStart_, SinkEnd_] =
      getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Sink, AccessTy: SinkTy, BTC, MaxBTC: SymbolicMaxBTC,
                              SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards);
  if (isa<SCEVCouldNotCompute>(Val: SinkStart_) ||
      isa<SCEVCouldNotCompute>(Val: SinkEnd_))
    return false;

  // Collect loop guards lazily; they may refine the bound comparisons below.
  if (!LoopGuards)
    LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE));

  // Is Src's range entirely before Sink's?
  auto SrcEnd = SE.applyLoopGuards(Expr: SrcEnd_, Guards: *LoopGuards);
  auto SinkStart = SE.applyLoopGuards(Expr: SinkStart_, Guards: *LoopGuards);
  if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SrcEnd, RHS: SinkStart))
    return true;

  // Otherwise, is Sink's range entirely before Src's?
  auto SinkEnd = SE.applyLoopGuards(Expr: SinkEnd_, Guards: *LoopGuards);
  auto SrcStart = SE.applyLoopGuards(Expr: SrcStart_, Guards: *LoopGuards);
  return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SinkEnd, RHS: SrcStart);
}
2078
/// Compute the dependence distance (Sink - Src), the scaled strides and the
/// access size for the dependence between \p A and \p B, or return an early
/// DepType verdict when no distance-based analysis is possible or needed.
std::variant<MemoryDepChecker::Dependence::DepType,
             MemoryDepChecker::DepDistanceStrideAndSizeInfo>
MemoryDepChecker::getDependenceDistanceStrideAndSize(
    const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
    const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) {
  const auto &DL = InnermostLoop->getHeader()->getDataLayout();
  auto &SE = *PSE.getSE();
  const auto &[APtr, AIsWrite] = A;
  const auto &[BPtr, BIsWrite] = B;

  // Two reads are independent.
  if (!AIsWrite && !BIsWrite)
    return MemoryDepChecker::Dependence::NoDep;

  Type *ATy = getLoadStoreType(I: AInst);
  Type *BTy = getLoadStoreType(I: BInst);

  // We cannot check pointers in different address spaces.
  if (APtr->getType()->getPointerAddressSpace() !=
      BPtr->getType()->getPointerAddressSpace())
    return MemoryDepChecker::Dependence::Unknown;

  std::optional<int64_t> StrideAPtr = getPtrStride(
      PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true);
  std::optional<int64_t> StrideBPtr = getPtrStride(
      PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true);

  const SCEV *Src = PSE.getSCEV(V: APtr);
  const SCEV *Sink = PSE.getSCEV(V: BPtr);

  // If the induction step is negative we have to invert source and sink of the
  // dependence when measuring the distance between them. We should not swap
  // AIsWrite with BIsWrite, as their uses expect them in program order.
  if (StrideAPtr && *StrideAPtr < 0) {
    std::swap(a&: Src, b&: Sink);
    std::swap(a&: AInst, b&: BInst);
    std::swap(a&: ATy, b&: BTy);
    std::swap(lhs&: StrideAPtr, rhs&: StrideBPtr);
  }

  const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src);

  LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
                    << "\n");
  LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
                    << ": " << *Dist << "\n");

  // Need accesses with constant strides and the same direction for further
  // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and
  // similar code or pointer arithmetic that could wrap in the address space.

  // If either Src or Sink are not strided (i.e. not a non-wrapping AddRec) and
  // not loop-invariant (stride will be 0 in that case), we cannot analyze the
  // dependence further and also cannot generate runtime checks.
  if (!StrideAPtr || !StrideBPtr) {
    LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
    return MemoryDepChecker::Dependence::IndirectUnsafe;
  }

  int64_t StrideAPtrInt = *StrideAPtr;
  int64_t StrideBPtrInt = *StrideBPtr;
  LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
                    << " Sink induction step: " << StrideBPtrInt << "\n");
  // At least Src or Sink are loop invariant and the other is strided or
  // invariant. We can generate a runtime check to disambiguate the accesses.
  if (!StrideAPtrInt || !StrideBPtrInt)
    return MemoryDepChecker::Dependence::Unknown;

  // Both Src and Sink have a constant stride, check if they are in the same
  // direction.
  if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) {
    LLVM_DEBUG(
        dbgs() << "Pointer access with strides in different directions\n");
    return MemoryDepChecker::Dependence::Unknown;
  }

  TypeSize AStoreSz = DL.getTypeStoreSize(Ty: ATy);
  TypeSize BStoreSz = DL.getTypeStoreSize(Ty: BTy);

  // If store sizes are not the same, set TypeByteSize to zero, so we can check
  // it in the caller isDependent.
  uint64_t ASz = DL.getTypeAllocSize(Ty: ATy);
  uint64_t BSz = DL.getTypeAllocSize(Ty: BTy);
  uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0;

  // Scale the element strides by the element sizes to get byte strides.
  uint64_t StrideAScaled = std::abs(i: StrideAPtrInt) * ASz;
  uint64_t StrideBScaled = std::abs(i: StrideBPtrInt) * BSz;

  uint64_t MaxStride = std::max(a: StrideAScaled, b: StrideBScaled);

  // CommonStride is only set when both scaled strides agree.
  std::optional<uint64_t> CommonStride;
  if (StrideAScaled == StrideBScaled)
    CommonStride = StrideAScaled;

  // TODO: Historically, we didn't retry with runtime checks when (unscaled)
  // strides were different but there is no inherent reason to.
  if (!isa<SCEVConstant>(Val: Dist))
    ShouldRetryWithRuntimeChecks |= StrideAPtrInt == StrideBPtrInt;

  // If distance is a SCEVCouldNotCompute, return Unknown immediately.
  if (isa<SCEVCouldNotCompute>(Val: Dist)) {
    LLVM_DEBUG(dbgs() << "LAA: Uncomputable distance.\n");
    return Dependence::Unknown;
  }

  return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
                                      TypeByteSize, AIsWrite, BIsWrite);
}
2187
2188MemoryDepChecker::Dependence::DepType
2189MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2190 const MemAccessInfo &B, unsigned BIdx) {
2191 assert(AIdx < BIdx && "Must pass arguments in program order");
2192
2193 // Check if we can prove that Sink only accesses memory after Src's end or
2194 // vice versa. The helper is used to perform the checks only on the exit paths
2195 // where it helps to improve the analysis result.
2196 auto CheckCompletelyBeforeOrAfter = [&]() {
2197 auto *APtr = A.getPointer();
2198 auto *BPtr = B.getPointer();
2199 Type *ATy = getLoadStoreType(I: InstMap[AIdx]);
2200 Type *BTy = getLoadStoreType(I: InstMap[BIdx]);
2201 const SCEV *Src = PSE.getSCEV(V: APtr);
2202 const SCEV *Sink = PSE.getSCEV(V: BPtr);
2203 return areAccessesCompletelyBeforeOrAfter(Src, SrcTy: ATy, Sink, SinkTy: BTy);
2204 };
2205
2206 // Get the dependence distance, stride, type size and what access writes for
2207 // the dependence between A and B.
2208 auto Res =
2209 getDependenceDistanceStrideAndSize(A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx]);
2210 if (std::holds_alternative<Dependence::DepType>(v: Res)) {
2211 if (std::get<Dependence::DepType>(v&: Res) == Dependence::Unknown &&
2212 CheckCompletelyBeforeOrAfter())
2213 return Dependence::NoDep;
2214 return std::get<Dependence::DepType>(v&: Res);
2215 }
2216
2217 auto &[Dist, MaxStride, CommonStride, TypeByteSize, AIsWrite, BIsWrite] =
2218 std::get<DepDistanceStrideAndSizeInfo>(v&: Res);
2219 bool HasSameSize = TypeByteSize > 0;
2220
2221 ScalarEvolution &SE = *PSE.getSE();
2222 auto &DL = InnermostLoop->getHeader()->getDataLayout();
2223
2224 // If the distance between the acecsses is larger than their maximum absolute
2225 // stride multiplied by the symbolic maximum backedge taken count (which is an
2226 // upper bound of the number of iterations), the accesses are independet, i.e.
2227 // they are far enough appart that accesses won't access the same location
2228 // across all loop ierations.
2229 if (HasSameSize &&
2230 isSafeDependenceDistance(
2231 DL, SE, MaxBTC: *(PSE.getSymbolicMaxBackedgeTakenCount()), Dist: *Dist, MaxStride))
2232 return Dependence::NoDep;
2233
2234 // The rest of this function relies on ConstDist being at most 64-bits, which
2235 // is checked earlier. Will assert if the calling code changes.
2236 const APInt *APDist = nullptr;
2237 uint64_t ConstDist =
2238 match(S: Dist, P: m_scev_APInt(C&: APDist)) ? APDist->abs().getZExtValue() : 0;
2239
2240 // Attempt to prove strided accesses independent.
2241 if (APDist) {
2242 // If the distance between accesses and their strides are known constants,
2243 // check whether the accesses interlace each other.
2244 if (ConstDist > 0 && CommonStride && CommonStride > 1 && HasSameSize &&
2245 areStridedAccessesIndependent(Distance: ConstDist, Stride: *CommonStride, TypeByteSize)) {
2246 LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
2247 return Dependence::NoDep;
2248 }
2249 } else {
2250 if (!LoopGuards)
2251 LoopGuards.emplace(
2252 args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE));
2253 Dist = SE.applyLoopGuards(Expr: Dist, Guards: *LoopGuards);
2254 }
2255
2256 // Negative distances are not plausible dependencies.
2257 if (SE.isKnownNonPositive(S: Dist)) {
2258 if (SE.isKnownNonNegative(S: Dist)) {
2259 if (HasSameSize) {
2260 // Write to the same location with the same size.
2261 return Dependence::Forward;
2262 }
2263 LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
2264 "different type sizes\n");
2265 return Dependence::Unknown;
2266 }
2267
2268 bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2269 // Check if the first access writes to a location that is read in a later
2270 // iteration, where the distance between them is not a multiple of a vector
2271 // factor and relatively small.
2272 //
2273 // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to
2274 // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a
2275 // forward dependency will allow vectorization using any width.
2276
2277 if (IsTrueDataDependence && EnableForwardingConflictDetection) {
2278 if (!ConstDist) {
2279 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2280 : Dependence::Unknown;
2281 }
2282 if (!HasSameSize ||
2283 couldPreventStoreLoadForward(Distance: ConstDist, TypeByteSize)) {
2284 LLVM_DEBUG(
2285 dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
2286 return Dependence::ForwardButPreventsForwarding;
2287 }
2288 }
2289
2290 LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
2291 return Dependence::Forward;
2292 }
2293
2294 int64_t MinDistance = SE.getSignedRangeMin(S: Dist).getSExtValue();
2295 // Below we only handle strictly positive distances.
2296 if (MinDistance <= 0) {
2297 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2298 : Dependence::Unknown;
2299 }
2300
2301 if (!HasSameSize) {
2302 if (CheckCompletelyBeforeOrAfter())
2303 return Dependence::NoDep;
2304 LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2305 "different type sizes\n");
2306 return Dependence::Unknown;
2307 }
2308 // Bail out early if passed-in parameters make vectorization not feasible.
2309 unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
2310 VectorizerParams::VectorizationFactor : 1);
2311 unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
2312 VectorizerParams::VectorizationInterleave : 1);
2313 // The minimum number of iterations for a vectorized/unrolled version.
2314 unsigned MinNumIter = std::max(a: ForcedFactor * ForcedUnroll, b: 2U);
2315
2316 // It's not vectorizable if the distance is smaller than the minimum distance
2317 // needed for a vectroized/unrolled version. Vectorizing one iteration in
2318 // front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
2319 // (No need to plus the last gap distance).
2320 //
2321 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2322 // foo(int *A) {
2323 // int *B = (int *)((char *)A + 14);
2324 // for (i = 0 ; i < 1024 ; i += 2)
2325 // B[i] = A[i] + 1;
2326 // }
2327 //
2328 // Two accesses in memory (stride is 4 * 2):
2329 // | A[0] | | A[2] | | A[4] | | A[6] | |
2330 // | B[0] | | B[2] | | B[4] |
2331 //
2332 // MinDistance needs for vectorizing iterations except the last iteration:
2333 // 4 * 2 * (MinNumIter - 1). MinDistance needs for the last iteration: 4.
2334 // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
2335 //
2336 // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
2337 // 12, which is less than distance.
2338 //
2339 // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
2340 // the minimum distance needed is 28, which is greater than distance. It is
2341 // not safe to do vectorization.
2342 //
2343 // We use MaxStride (maximum of src and sink strides) to get a conservative
2344 // lower bound on the MinDistanceNeeded in case of different strides.
2345
2346 // We know that Dist is positive, but it may not be constant. Use the signed
2347 // minimum for computations below, as this ensures we compute the closest
2348 // possible dependence distance.
2349 uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
2350 if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
2351 if (!ConstDist) {
2352 // For non-constant distances, we checked the lower bound of the
2353 // dependence distance and the distance may be larger at runtime (and safe
2354 // for vectorization). Classify it as Unknown, so we re-try with runtime
2355 // checks, unless we can prove both accesses cannot overlap.
2356 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2357 : Dependence::Unknown;
2358 }
2359 LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance "
2360 << MinDistance << '\n');
2361 return Dependence::Backward;
2362 }
2363
2364 // Unsafe if the minimum distance needed is greater than smallest dependence
// distance.
2366 if (MinDistanceNeeded > MinDepDistBytes) {
2367 LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
2368 << MinDistanceNeeded << " size in bytes\n");
2369 return Dependence::Backward;
2370 }
2371
2372 MinDepDistBytes =
2373 std::min(a: static_cast<uint64_t>(MinDistance), b: MinDepDistBytes);
2374
2375 bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
2376 if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist &&
2377 couldPreventStoreLoadForward(Distance: MinDistance, TypeByteSize, CommonStride: *CommonStride))
2378 return Dependence::BackwardVectorizableButPreventsForwarding;
2379
2380 uint64_t MaxVF = MinDepDistBytes / MaxStride;
2381 LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
2382 << " with max VF = " << MaxVF << '\n');
2383
2384 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2385 if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) {
2386 // For non-constant distances, we checked the lower bound of the dependence
2387 // distance and the distance may be larger at runtime (and safe for
2388 // vectorization). Classify it as Unknown, so we re-try with runtime checks,
2389 // unless we can prove both accesses cannot overlap.
2390 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2391 : Dependence::Unknown;
2392 }
2393
2394 if (CheckCompletelyBeforeOrAfter())
2395 return Dependence::NoDep;
2396
2397 MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits);
2398 return Dependence::BackwardVectorizable;
2399}
2400
// Check every pair of possibly-aliasing accesses (grouped into equivalence
// classes in \p DepCands) for unsafe memory dependences, recording the
// dependences found (up to MaxDependences). Returns true if all dependences
// are safe for vectorization.
bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands,
                                   const MemAccessInfoList &CheckDeps) {

  MinDepDistBytes = -1;
  SmallPtrSet<MemAccessInfo, 8> Visited;
  for (MemAccessInfo CurAccess : CheckDeps) {
    // Each equivalence class is fully processed the first time one of its
    // members is encountered; skip later members.
    if (Visited.contains(Ptr: CurAccess))
      continue;

    // Check accesses within this set.
    EquivalenceClasses<MemAccessInfo>::member_iterator AI =
        DepCands.findLeader(V: CurAccess);
    EquivalenceClasses<MemAccessInfo>::member_iterator AE =
        DepCands.member_end();

    // Check every access pair.
    while (AI != AE) {
      Visited.insert(Ptr: *AI);
      bool AIIsWrite = AI->getInt();
      // Check loads only against next equivalent class, but stores also against
      // other stores in the same equivalence class - to the same address.
      EquivalenceClasses<MemAccessInfo>::member_iterator OI =
          (AIIsWrite ? AI : std::next(x: AI));
      while (OI != AE) {
        // Check every accessing instruction pair in program order.
        auto &Acc = Accesses[*AI];
        for (std::vector<unsigned>::iterator I1 = Acc.begin(), I1E = Acc.end();
             I1 != I1E; ++I1)
          // Scan all accesses of another equivalence class, but only the next
          // accesses of the same equivalent class.
          for (std::vector<unsigned>::iterator
                   I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()),
                   I2E = (OI == AI ? I1E : Accesses[*OI].end());
               I2 != I2E; ++I2) {
            auto A = std::make_pair(x: &*AI, y&: *I1);
            auto B = std::make_pair(x: &*OI, y&: *I2);

            // Normalize the pair so A is always the earlier access in
            // program order (indices are program-order positions).
            assert(*I1 != *I2);
            if (*I1 > *I2)
              std::swap(x&: A, y&: B);

            Dependence::DepType Type =
                isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second);
            mergeInStatus(S: Dependence::isSafeForVectorization(Type));

            // Gather dependences unless we accumulated MaxDependences
            // dependences. In that case return as soon as we find the first
            // unsafe dependence. This puts a limit on this quadratic
            // algorithm.
            if (RecordDependences) {
              if (Type != Dependence::NoDep)
                Dependences.emplace_back(Args&: A.second, Args&: B.second, Args&: Type);

              if (Dependences.size() >= MaxDependences) {
                RecordDependences = false;
                Dependences.clear();
                LLVM_DEBUG(dbgs()
                           << "Too many dependences, stopped recording\n");
              }
            }
            if (!RecordDependences && !isSafeForVectorization())
              return false;
          }
        ++OI;
      }
      ++AI;
    }
  }

  LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
  return isSafeForVectorization();
}
2473
2474SmallVector<Instruction *, 4>
2475MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const {
2476 MemAccessInfo Access(Ptr, IsWrite);
2477 auto I = Accesses.find(Val: Access);
2478 SmallVector<Instruction *, 4> Insts;
2479 if (I != Accesses.end()) {
2480 transform(Range: I->second, d_first: std::back_inserter(x&: Insts),
2481 F: [&](unsigned Idx) { return this->InstMap[Idx]; });
2482 }
2483
2484 return Insts;
2485}
2486
// Human-readable names for Dependence::DepType values, used by
// Dependence::print. The array is indexed directly by DepType, so the order
// here must stay in sync with the enum declaration.
const char *MemoryDepChecker::Dependence::DepName[] = {
    "NoDep",
    "Unknown",
    "IndirectUnsafe",
    "Forward",
    "ForwardButPreventsForwarding",
    "Backward",
    "BackwardVectorizable",
    "BackwardVectorizableButPreventsForwarding"};
2496
2497void MemoryDepChecker::Dependence::print(
2498 raw_ostream &OS, unsigned Depth,
2499 const SmallVectorImpl<Instruction *> &Instrs) const {
2500 OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n";
2501 OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n";
2502 OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n";
2503}
2504
// Check that the loop has a shape this analysis can handle: it must be an
// innermost loop with a single backedge and a computable symbolic-max
// backedge-taken count. Emits an analysis remark on failure.
bool LoopAccessInfo::canAnalyzeLoop() {
  // We need to have a loop header.
  LLVM_DEBUG(dbgs() << "\nLAA: Checking a loop in '"
                    << TheLoop->getHeader()->getParent()->getName() << "' from "
                    << TheLoop->getLocStr() << "\n");

  // We can only analyze innermost loops.
  if (!TheLoop->isInnermost()) {
    LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
    recordAnalysis(RemarkName: "NotInnerMostLoop") << "loop is not the innermost loop";
    return false;
  }

  // We must have a single backedge.
  if (TheLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(
        dbgs() << "LAA: loop control flow is not understood by analyzer\n");
    recordAnalysis(RemarkName: "CFGNotUnderstood")
        << "loop control flow is not understood by analyzer";
    return false;
  }

  // ScalarEvolution needs to be able to find the symbolic max backedge taken
  // count, which is an upper bound on the number of loop iterations. The loop
  // may execute fewer iterations, if it exits via an uncountable exit.
  const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount();
  if (isa<SCEVCouldNotCompute>(Val: ExitCount)) {
    recordAnalysis(RemarkName: "CantComputeNumberOfIterations")
        << "could not determine number of loop iterations";
    LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "LAA: Found an analyzable loop: "
                    << TheLoop->getHeader()->getName() << "\n");
  return true;
}
2542
// Collect all loads and stores in the loop, build the alias/dependence sets,
// and decide whether the loop's memory accesses are safe for vectorization —
// either unconditionally, or guarded by runtime pointer checks. Returns true
// if vectorization is (conditionally) safe; emits an analysis remark and
// returns false otherwise.
bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
                                 const TargetLibraryInfo *TLI,
                                 DominatorTree *DT) {
  // Holds the Load and Store instructions.
  SmallVector<LoadInst *, 16> Loads;
  SmallVector<StoreInst *, 16> Stores;
  // Alias scopes declared by llvm.experimental.noalias.scope.decl inside the
  // loop; passed to AccessAnalysis below.
  SmallPtrSet<MDNode *, 8> LoopAliasScopes;

  // Holds all the different accesses in the loop.
  unsigned NumReads = 0;
  unsigned NumReadWrites = 0;

  // Set when we see a memory instruction we cannot analyze (non-simple or
  // neither a plain load nor a plain store).
  bool HasComplexMemInst = false;

  // A runtime check is only legal to insert if there are no convergent calls.
  HasConvergentOp = false;

  PtrRtChecking->Pointers.clear();
  PtrRtChecking->Need = false;

  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();

  const bool EnableMemAccessVersioningOfLoop =
      EnableMemAccessVersioning &&
      !TheLoop->getHeader()->getParent()->hasOptSize();

  // Traverse blocks in fixed RPOT order, regardless of their storage in the
  // loop info, as it may be arbitrary.
  LoopBlocksRPO RPOT(TheLoop);
  RPOT.perform(LI);
  for (BasicBlock *BB : RPOT) {
    // Scan the BB and collect legal loads and stores. Also detect any
    // convergent instructions.
    for (Instruction &I : *BB) {
      if (auto *Call = dyn_cast<CallBase>(Val: &I)) {
        if (Call->isConvergent())
          HasConvergentOp = true;
      }

      // With both a non-vectorizable memory instruction and a convergent
      // operation, found in this loop, no reason to continue the search.
      if (HasComplexMemInst && HasConvergentOp)
        return false;

      // Avoid hitting recordAnalysis multiple times.
      if (HasComplexMemInst)
        continue;

      // Record alias scopes defined inside the loop.
      if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I))
        for (Metadata *Op : Decl->getScopeList()->operands())
          LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op));

      // Many math library functions read the rounding mode. We will only
      // vectorize a loop if it contains known function calls that don't set
      // the flag. Therefore, it is safe to ignore this read from memory.
      auto *Call = dyn_cast<CallInst>(Val: &I);
      if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI))
        continue;

      // If this is a load, save it. If this instruction can read from memory
      // but is not a load, we only allow it if it's a call to a function with a
      // vector mapping and no pointer arguments.
      if (I.mayReadFromMemory()) {
        auto hasPointerArgs = [](CallBase *CB) {
          return any_of(Range: CB->args(), P: [](Value const *Arg) {
            return Arg->getType()->isPointerTy();
          });
        };

        // If the function has an explicit vectorized counterpart, and does not
        // take output/input pointers, we can safely assume that it can be
        // vectorized.
        if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
            !hasPointerArgs(Call) && !VFDatabase::getMappings(CI: *Call).empty())
          continue;

        auto *Ld = dyn_cast<LoadInst>(Val: &I);
        if (!Ld) {
          recordAnalysis(RemarkName: "CantVectorizeInstruction", Instr: &I)
              << "instruction cannot be vectorized";
          HasComplexMemInst = true;
          continue;
        }
        if (!Ld->isSimple() && !IsAnnotatedParallel) {
          recordAnalysis(RemarkName: "NonSimpleLoad", Instr: Ld)
              << "read with atomic ordering or volatile read";
          LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
          HasComplexMemInst = true;
          continue;
        }
        NumLoads++;
        Loads.push_back(Elt: Ld);
        DepChecker->addAccess(LI: Ld);
        if (EnableMemAccessVersioningOfLoop)
          collectStridedAccess(LoadOrStoreInst: Ld);
        continue;
      }

      // Save 'store' instructions. Abort if other instructions write to memory.
      if (I.mayWriteToMemory()) {
        auto *St = dyn_cast<StoreInst>(Val: &I);
        if (!St) {
          recordAnalysis(RemarkName: "CantVectorizeInstruction", Instr: &I)
              << "instruction cannot be vectorized";
          HasComplexMemInst = true;
          continue;
        }
        if (!St->isSimple() && !IsAnnotatedParallel) {
          recordAnalysis(RemarkName: "NonSimpleStore", Instr: St)
              << "write with atomic ordering or volatile write";
          LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
          HasComplexMemInst = true;
          continue;
        }
        NumStores++;
        Stores.push_back(Elt: St);
        DepChecker->addAccess(SI: St);
        if (EnableMemAccessVersioningOfLoop)
          collectStridedAccess(LoadOrStoreInst: St);
      }
    } // Next instr.
  } // Next block.

  if (HasComplexMemInst)
    return false;

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
    return true;
  }

  MemoryDepChecker::DepCandidates DepCands;
  AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE,
                          LoopAliasScopes);

  // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  SmallSet<std::pair<Value *, Type *>, 16> Seen;

  // Record uniform store addresses to identify if we have multiple stores
  // to the same address.
  SmallPtrSet<Value *, 16> UniformStores;

  for (StoreInst *ST : Stores) {
    Value *Ptr = ST->getPointerOperand();

    if (isInvariant(V: Ptr)) {
      // Record store instructions to loop invariant addresses
      StoresToInvariantAddresses.push_back(Elt: ST);
      // A second store to the same invariant address is a store-store
      // dependence involving a loop-invariant address.
      HasStoreStoreDependenceInvolvingLoopInvariantAddress |=
          !UniformStores.insert(Ptr).second;
    }

    // If we did *not* see this pointer before, insert it to  the read-write
    // list. At this phase it is only a 'write' list.
    Type *AccessTy = getLoadStoreType(I: ST);
    if (Seen.insert(V: {Ptr, AccessTy}).second) {
      ++NumReadWrites;

      MemoryLocation Loc = MemoryLocation::get(SI: ST);
      // The TBAA metadata could have a control dependency on the predication
      // condition, so we cannot rely on it when determining whether or not we
      // need runtime pointer checks.
      if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT))
        Loc.AATags.TBAA = nullptr;

      visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop,
                    AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) {
                      MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr);
                      Accesses.addStore(Loc: NewLoc, AccessTy);
                    });
    }
  }

  if (IsAnnotatedParallel) {
    LLVM_DEBUG(
        dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
               << "checks.\n");
    return true;
  }

  for (LoadInst *LD : Loads) {
    Value *Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x; Because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    bool IsReadOnlyPtr = false;
    Type *AccessTy = getLoadStoreType(I: LD);
    if (Seen.insert(V: {Ptr, AccessTy}).second ||
        !getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: false,
                      ShouldCheckWrap: true)) {
      ++NumReads;
      IsReadOnlyPtr = true;
    }

    // See if there is an unsafe dependency between a load to a uniform
    // address and store to the same uniform address.
    if (UniformStores.contains(Ptr)) {
      LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
                           "load and uniform store to the same address!\n");
      HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
    }

    MemoryLocation Loc = MemoryLocation::get(LI: LD);
    // The TBAA metadata could have a control dependency on the predication
    // condition, so we cannot rely on it when determining whether or not we
    // need runtime pointer checks.
    if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT))
      Loc.AATags.TBAA = nullptr;

    visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop,
                  AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
                    MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr);
                    Accesses.addLoad(Loc: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr);
                  });
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then is it safe to vectorize.
  if (NumReadWrites == 1 && NumReads == 0) {
    LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
    return true;
  }

  // Build dependence sets and check whether we need a runtime pointer bounds
  // check.
  Accesses.buildDependenceSets();

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  Value *UncomputablePtr = nullptr;
  HasCompletePtrRtChecking =
      Accesses.canCheckPtrAtRT(RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides,
                               UncomputablePtr, AllowPartial, DepChecker: getDepChecker());
  if (!HasCompletePtrRtChecking) {
    const auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr);
    recordAnalysis(RemarkName: "CantIdentifyArrayBounds", Instr: I)
        << "cannot identify array bounds";
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
                      << "the array bounds.\n");
    return false;
  }

  LLVM_DEBUG(
    dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");

  bool DepsAreSafe = true;
  if (Accesses.isDependencyCheckNeeded()) {
    LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
    DepsAreSafe =
        DepChecker->areDepsSafe(DepCands, CheckDeps: Accesses.getDependenciesToCheck());

    // If the dependence checker failed, retry with a full set of runtime
    // pointer checks, which may prove the accesses independent at runtime.
    if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeChecks()) {
      LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");

      PtrRtChecking->reset();
      PtrRtChecking->Need = true;

      UncomputablePtr = nullptr;
      HasCompletePtrRtChecking = Accesses.canCheckPtrAtRT(
          RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, UncomputablePtr,
          AllowPartial, DepChecker: getDepChecker());

      // Check that we found the bounds for the pointer.
      if (!HasCompletePtrRtChecking) {
        auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr);
        recordAnalysis(RemarkName: "CantCheckMemDepsAtRunTime", Instr: I)
            << "cannot check memory dependencies at runtime";
        LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
        return false;
      }

      // Clear the dependency checks. They are no longer needed.
      Accesses.resetDepChecks(DepChecker&: *DepChecker);

      DepsAreSafe = true;
    }
  }

  if (HasConvergentOp) {
    recordAnalysis(RemarkName: "CantInsertRuntimeCheckWithConvergent")
        << "cannot add control dependency to convergent operation";
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
                         "would be needed with a convergent operation\n");
    return false;
  }

  if (DepsAreSafe) {
    LLVM_DEBUG(
        dbgs() << "LAA: No unsafe dependent memory operations in loop.  We"
               << (PtrRtChecking->Need ? "" : " don't")
               << " need runtime memory checks.\n");
    return true;
  }

  emitUnsafeDependenceRemark();
  return false;
}
2855
// Emit an optimization remark describing the first recorded dependence that
// is not safe for vectorization, including its kind and (when available) the
// source location of the conflicting memory access.
void LoopAccessInfo::emitUnsafeDependenceRemark() {
  const auto *Deps = getDepChecker().getDependences();
  // Dependence recording may have been disabled after hitting MaxDependences;
  // in that case there is nothing to report on.
  if (!Deps)
    return;
  const auto *Found =
      llvm::find_if(Range: *Deps, P: [](const MemoryDepChecker::Dependence &D) {
        return MemoryDepChecker::Dependence::isSafeForVectorization(Type: D.Type) !=
               MemoryDepChecker::VectorizationSafetyStatus::Safe;
      });
  if (Found == Deps->end())
    return;
  MemoryDepChecker::Dependence Dep = *Found;

  LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");

  // Emit remark for first unsafe dependence
  // Only suggest loop distribution if the user has not already forced it on
  // via the llvm.loop.distribute.enable metadata.
  bool HasForcedDistribution = false;
  std::optional<const MDOperand *> Value =
      findStringMetadataForLoop(TheLoop, Name: "llvm.loop.distribute.enable");
  if (Value) {
    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    HasForcedDistribution = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue();
  }

  const std::string Info =
      HasForcedDistribution
          ? "unsafe dependent memory operations in loop."
          : "unsafe dependent memory operations in loop. Use "
            "#pragma clang loop distribute(enable) to allow loop distribution "
            "to attempt to isolate the offending operations into a separate "
            "loop";
  OptimizationRemarkAnalysis &R =
      recordAnalysis(RemarkName: "UnsafeDep", Instr: Dep.getDestination(DepChecker: getDepChecker())) << Info;

  // Append a description of the dependence kind to the remark.
  switch (Dep.Type) {
  case MemoryDepChecker::Dependence::NoDep:
  case MemoryDepChecker::Dependence::Forward:
  case MemoryDepChecker::Dependence::BackwardVectorizable:
    llvm_unreachable("Unexpected dependence");
  case MemoryDepChecker::Dependence::Backward:
    R << "\nBackward loop carried data dependence.";
    break;
  case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
    R << "\nForward loop carried data dependence that prevents "
         "store-to-load forwarding.";
    break;
  case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
    R << "\nBackward loop carried data dependence that prevents "
         "store-to-load forwarding.";
    break;
  case MemoryDepChecker::Dependence::IndirectUnsafe:
    R << "\nUnsafe indirect dependence.";
    break;
  case MemoryDepChecker::Dependence::Unknown:
    R << "\nUnknown data dependence.";
    break;
  }

  // Point at the other end of the dependence, preferring the debug location
  // of the address computation when the access itself has none.
  if (Instruction *I = Dep.getSource(DepChecker: getDepChecker())) {
    DebugLoc SourceLoc = I->getDebugLoc();
    if (auto *DD = dyn_cast_or_null<Instruction>(Val: getPointerOperand(V: I)))
      SourceLoc = DD->getDebugLoc();
    if (SourceLoc)
      R << " Memory location is the same as accessed at "
        << ore::NV("Location", SourceLoc);
  }
}
2924
2925bool LoopAccessInfo::blockNeedsPredication(const BasicBlock *BB,
2926 const Loop *TheLoop,
2927 const DominatorTree *DT) {
2928 assert(TheLoop->contains(BB) && "Unknown block used");
2929
2930 // Blocks that do not dominate the latch need predication.
2931 const BasicBlock *Latch = TheLoop->getLoopLatch();
2932 return !DT->dominates(A: BB, B: Latch);
2933}
2934
2935OptimizationRemarkAnalysis &
2936LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
2937 assert(!Report && "Multiple reports generated");
2938
2939 const BasicBlock *CodeRegion = TheLoop->getHeader();
2940 DebugLoc DL = TheLoop->getStartLoc();
2941
2942 if (I) {
2943 CodeRegion = I->getParent();
2944 // If there is no debug location attached to the instruction, revert back to
2945 // using the loop's.
2946 if (I->getDebugLoc())
2947 DL = I->getDebugLoc();
2948 }
2949
2950 Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, args&: RemarkName,
2951 args&: DL, args&: CodeRegion);
2952 return *Report;
2953}
2954
2955bool LoopAccessInfo::isInvariant(Value *V) const {
2956 auto *SE = PSE->getSE();
2957 if (TheLoop->isLoopInvariant(V))
2958 return true;
2959 if (!SE->isSCEVable(Ty: V->getType()))
2960 return false;
2961 const SCEV *S = SE->getSCEV(V);
2962 return SE->isLoopInvariant(S, L: TheLoop);
2963}
2964
2965/// If \p Ptr is a GEP, which has a loop-variant operand, return that operand.
2966/// Otherwise, return \p Ptr.
2967static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE,
2968 Loop *Lp) {
2969 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr);
2970 if (!GEP)
2971 return Ptr;
2972
2973 Value *V = Ptr;
2974 for (const Use &U : GEP->operands()) {
2975 if (!SE->isLoopInvariant(S: SE->getSCEV(V: U), L: Lp)) {
2976 if (V == Ptr)
2977 V = U;
2978 else
2979 // There must be exactly one loop-variant operand.
2980 return Ptr;
2981 }
2982 }
2983 return V;
2984}
2985
/// Get the stride of a pointer access in a loop. Looks for symbolic
/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
  auto *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType());
  if (!PtrTy)
    return nullptr;

  // Try to remove a gep instruction to make the pointer (actually index at this
  // point) easier analyzable. If OrigPtr is equal to Ptr we are analyzing the
  // pointer, otherwise, we are analyzing the index.
  Value *OrigPtr = Ptr;

  Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp);
  const SCEV *V = SE->getSCEV(V: Ptr);

  if (Ptr != OrigPtr)
    // Strip off casts.
    while (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V))
      V = C->getOperand();

  // The access must be an affine AddRec in this loop. Note that the match
  // re-binds V to the AddRec's step (the stride) on success.
  if (!match(S: V, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V), L: m_SpecificLoop(L: Lp))))
    return nullptr;

  // Note that the restriction after this loop invariant check are only
  // profitability restrictions.
  if (!SE->isLoopInvariant(S: V, L: Lp))
    return nullptr;

  // Look for the loop invariant symbolic value.
  if (isa<SCEVUnknown>(Val: V))
    return V;

  // Look through multiplies that scale a stride by a constant. On success the
  // match re-binds V to the non-constant multiplicand.
  match(S: V, P: m_scev_Mul(Op0: m_SCEVConstant(), Op1: m_SCEV(V)));
  if (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V))
    if (isa<SCEVUnknown>(Val: C->getOperand()))
      return V;

  return nullptr;
}
3026
// If \p MemAccess is a load/store whose stride is a loop-invariant symbolic
// value, record the stride in SymbolicStrides so the access can later be
// versioned under a "Stride == 1" predicate.
void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
  Value *Ptr = getLoadStorePointerOperand(V: MemAccess);
  if (!Ptr)
    return;

  // Note: getStrideFromPointer is a *profitability* heuristic.  We
  // could broaden the scope of values returned here - to anything
  // which happens to be loop invariant and contributes to the
  // computation of an interesting IV - but we chose not to as we
  // don't have a cost model here, and broadening the scope exposes
  // far too many unprofitable cases.
  const SCEV *StrideExpr = getStrideFromPointer(Ptr, SE: PSE->getSE(), Lp: TheLoop);
  if (!StrideExpr)
    return;

  // An undef/poison stride cannot be meaningfully versioned on.
  if (match(S: StrideExpr, P: m_scev_UndefOrPoison()))
    return;

  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
                       "versioning:");
  LLVM_DEBUG(dbgs() << "  Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");

  if (!SpeculateUnitStride) {
    LLVM_DEBUG(dbgs() << "  Chose not to due to -laa-speculate-unit-stride\n");
    return;
  }

  // Avoid adding the "Stride == 1" predicate when we know that
  // Stride >= Trip-Count. Such a predicate will effectively optimize a single
  // or zero iteration loop, as Trip-Count <= Stride == 1.
  //
  // TODO: We are currently not making a very informed decision on when it is
  // beneficial to apply stride versioning. It might make more sense that the
  // users of this analysis (such as the vectorizer) will trigger it, based on
  // their specific cost considerations; For example, in cases where stride
  // versioning does not help resolving memory accesses/dependences, the
  // vectorizer should evaluate the cost of the runtime test, and the benefit
  // of various possible stride specializations, considering the alternatives
  // of using gather/scatters (if available).

  const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount();

  // Match the types so we can compare the stride and the MaxBTC.
  // The Stride can be positive/negative, so we sign extend Stride;
  // The backedgeTakenCount is non-negative, so we zero extend MaxBTC.
  const DataLayout &DL = TheLoop->getHeader()->getDataLayout();
  uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(Ty: StrideExpr->getType());
  uint64_t BETypeSizeBits = DL.getTypeSizeInBits(Ty: MaxBTC->getType());
  const SCEV *CastedStride = StrideExpr;
  const SCEV *CastedBECount = MaxBTC;
  ScalarEvolution *SE = PSE->getSE();
  if (BETypeSizeBits >= StrideTypeSizeBits)
    CastedStride = SE->getNoopOrSignExtend(V: StrideExpr, Ty: MaxBTC->getType());
  else
    CastedBECount = SE->getZeroExtendExpr(Op: MaxBTC, Ty: StrideExpr->getType());
  const SCEV *StrideMinusBETaken = SE->getMinusSCEV(LHS: CastedStride, RHS: CastedBECount);
  // Since TripCount == BackEdgeTakenCount + 1, checking:
  // "Stride >= TripCount" is equivalent to checking:
  // Stride - MaxBTC > 0
  if (SE->isKnownPositive(S: StrideMinusBETaken)) {
    LLVM_DEBUG(
        dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
                  "Stride==1 predicate will imply that the loop executes "
                  "at most once.\n");
    return;
  }
  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");

  // Strip back off the integer cast, and check that our result is a
  // SCEVUnknown as we expect.
  const SCEV *StrideBase = StrideExpr;
  if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: StrideBase))
    StrideBase = C->getOperand();
  SymbolicStrides[Ptr] = cast<SCEVUnknown>(Val: StrideBase);
}
3102
// Construct the analysis result for loop \p L: set up the predicated SCEV
// wrapper, dependence checker and runtime-check builder, then run the full
// analysis if the loop's shape is analyzable.
LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
                               const TargetTransformInfo *TTI,
                               const TargetLibraryInfo *TLI, AAResults *AA,
                               DominatorTree *DT, LoopInfo *LI,
                               AssumptionCache *AC, bool AllowPartial)
    : PSE(std::make_unique<PredicatedScalarEvolution>(args&: *SE, args&: *L)),
      PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) {
  // Upper bound on vector width used by the dependence checker when deciding
  // whether a non-constant distance can still matter; unlimited when scalable
  // vectors are enabled (or no TTI is available).
  unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
  if (TTI && !TTI->enableScalableVectorization())
    // Scale the vector width by 2 as rough estimate to also consider
    // interleaving.
    MaxTargetVectorWidthInBits =
        TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) * 2;

  DepChecker = std::make_unique<MemoryDepChecker>(
      args&: *PSE, args&: AC, args&: DT, args&: L, args&: SymbolicStrides, args&: MaxTargetVectorWidthInBits, args&: LoopGuards);
  PtrRtChecking =
      std::make_unique<RuntimePointerChecking>(args&: *DepChecker, args&: SE, args&: LoopGuards);
  if (canAnalyzeLoop())
    CanVecMem = analyzeLoop(AA, LI, TLI, DT);
}
3124
3125void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
3126 if (CanVecMem) {
3127 OS.indent(NumSpaces: Depth) << "Memory dependences are safe";
3128 const MemoryDepChecker &DC = getDepChecker();
3129 if (!DC.isSafeForAnyVectorWidth())
3130 OS << " with a maximum safe vector width of "
3131 << DC.getMaxSafeVectorWidthInBits() << " bits";
3132 if (!DC.isSafeForAnyStoreLoadForwardDistances()) {
3133 uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits();
3134 OS << ", with a maximum safe store-load forward width of " << SLDist
3135 << " bits";
3136 }
3137 if (PtrRtChecking->Need)
3138 OS << " with run-time checks";
3139 OS << "\n";
3140 }
3141
3142 if (HasConvergentOp)
3143 OS.indent(NumSpaces: Depth) << "Has convergent operation in loop\n";
3144
3145 if (Report)
3146 OS.indent(NumSpaces: Depth) << "Report: " << Report->getMsg() << "\n";
3147
3148 if (auto *Dependences = DepChecker->getDependences()) {
3149 OS.indent(NumSpaces: Depth) << "Dependences:\n";
3150 for (const auto &Dep : *Dependences) {
3151 Dep.print(OS, Depth: Depth + 2, Instrs: DepChecker->getMemoryInstructions());
3152 OS << "\n";
3153 }
3154 } else
3155 OS.indent(NumSpaces: Depth) << "Too many dependences, not recorded\n";
3156
3157 // List the pair of accesses need run-time checks to prove independence.
3158 PtrRtChecking->print(OS, Depth);
3159 if (PtrRtChecking->Need && !HasCompletePtrRtChecking)
3160 OS.indent(NumSpaces: Depth) << "Generated run-time checks are incomplete\n";
3161 OS << "\n";
3162
3163 OS.indent(NumSpaces: Depth)
3164 << "Non vectorizable stores to invariant address were "
3165 << (HasStoreStoreDependenceInvolvingLoopInvariantAddress ||
3166 HasLoadStoreDependenceInvolvingLoopInvariantAddress
3167 ? ""
3168 : "not ")
3169 << "found in loop.\n";
3170
3171 OS.indent(NumSpaces: Depth) << "SCEV assumptions:\n";
3172 PSE->getPredicate().print(OS, Depth);
3173
3174 OS << "\n";
3175
3176 OS.indent(NumSpaces: Depth) << "Expressions re-written:\n";
3177 PSE->print(OS, Depth);
3178}
3179
3180const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L,
3181 bool AllowPartial) {
3182 const auto &[It, Inserted] = LoopAccessInfoMap.try_emplace(Key: &L);
3183
3184 // We need to create the LoopAccessInfo if either we don't already have one,
3185 // or if it was created with a different value of AllowPartial.
3186 if (Inserted || It->second->hasAllowPartial() != AllowPartial)
3187 It->second = std::make_unique<LoopAccessInfo>(args: &L, args: &SE, args&: TTI, args&: TLI, args: &AA, args: &DT,
3188 args: &LI, args&: AC, args&: AllowPartial);
3189
3190 return *It->second;
3191}
3192void LoopAccessInfoManager::clear() {
3193 // Collect LoopAccessInfo entries that may keep references to IR outside the
3194 // analyzed loop or SCEVs that may have been modified or invalidated. At the
3195 // moment, that is loops requiring memory or SCEV runtime checks, as those cache
3196 // SCEVs, e.g. for pointer expressions.
3197 for (const auto &[L, LAI] : LoopAccessInfoMap) {
3198 if (LAI->getRuntimePointerChecking()->getChecks().empty() &&
3199 LAI->getPSE().getPredicate().isAlwaysTrue())
3200 continue;
3201 LoopAccessInfoMap.erase(Val: L);
3202 }
3203}
3204
3205bool LoopAccessInfoManager::invalidate(
3206 Function &F, const PreservedAnalyses &PA,
3207 FunctionAnalysisManager::Invalidator &Inv) {
3208 // Check whether our analysis is preserved.
3209 auto PAC = PA.getChecker<LoopAccessAnalysis>();
3210 if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
3211 // If not, give up now.
3212 return true;
3213
3214 // Check whether the analyses we depend on became invalid for any reason.
3215 // Skip checking TargetLibraryAnalysis as it is immutable and can't become
3216 // invalid.
3217 return Inv.invalidate<AAManager>(IR&: F, PA) ||
3218 Inv.invalidate<ScalarEvolutionAnalysis>(IR&: F, PA) ||
3219 Inv.invalidate<LoopAnalysis>(IR&: F, PA) ||
3220 Inv.invalidate<DominatorTreeAnalysis>(IR&: F, PA);
3221}
3222
3223LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
3224 FunctionAnalysisManager &FAM) {
3225 auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(IR&: F);
3226 auto &AA = FAM.getResult<AAManager>(IR&: F);
3227 auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F);
3228 auto &LI = FAM.getResult<LoopAnalysis>(IR&: F);
3229 auto &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
3230 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F);
3231 auto &AC = FAM.getResult<AssumptionAnalysis>(IR&: F);
3232 return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC);
3233}
3234
// Out-of-line definition of the static key that uniquely identifies
// LoopAccessAnalysis in the new pass manager's analysis registry.
AnalysisKey LoopAccessAnalysis::Key;
3236