1//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The implementation for the loop memory dependence that was originally
10// developed for the loop vectorizer.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Analysis/LoopAccessAnalysis.h"
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/EquivalenceClasses.h"
18#include "llvm/ADT/PointerIntPair.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SetVector.h"
21#include "llvm/ADT/SmallPtrSet.h"
22#include "llvm/ADT/SmallSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/AliasAnalysis.h"
25#include "llvm/Analysis/AliasSetTracker.h"
26#include "llvm/Analysis/AssumeBundleQueries.h"
27#include "llvm/Analysis/AssumptionCache.h"
28#include "llvm/Analysis/LoopAnalysisManager.h"
29#include "llvm/Analysis/LoopInfo.h"
30#include "llvm/Analysis/LoopIterator.h"
31#include "llvm/Analysis/MemoryLocation.h"
32#include "llvm/Analysis/OptimizationRemarkEmitter.h"
33#include "llvm/Analysis/ScalarEvolution.h"
34#include "llvm/Analysis/ScalarEvolutionExpressions.h"
35#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
36#include "llvm/Analysis/TargetLibraryInfo.h"
37#include "llvm/Analysis/TargetTransformInfo.h"
38#include "llvm/Analysis/ValueTracking.h"
39#include "llvm/Analysis/VectorUtils.h"
40#include "llvm/IR/BasicBlock.h"
41#include "llvm/IR/Constants.h"
42#include "llvm/IR/DataLayout.h"
43#include "llvm/IR/DebugLoc.h"
44#include "llvm/IR/DerivedTypes.h"
45#include "llvm/IR/DiagnosticInfo.h"
46#include "llvm/IR/Dominators.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/InstrTypes.h"
49#include "llvm/IR/Instruction.h"
50#include "llvm/IR/Instructions.h"
51#include "llvm/IR/IntrinsicInst.h"
52#include "llvm/IR/PassManager.h"
53#include "llvm/IR/Type.h"
54#include "llvm/IR/Value.h"
55#include "llvm/IR/ValueHandle.h"
56#include "llvm/Support/Casting.h"
57#include "llvm/Support/CommandLine.h"
58#include "llvm/Support/Debug.h"
59#include "llvm/Support/ErrorHandling.h"
60#include "llvm/Support/raw_ostream.h"
61#include <algorithm>
62#include <cassert>
63#include <cstdint>
64#include <iterator>
65#include <utility>
66#include <variant>
67#include <vector>
68
69using namespace llvm;
70using namespace llvm::SCEVPatternMatch;
71
72#define DEBUG_TYPE "loop-accesses"
73
74static cl::opt<ElementCount, true>
75 VectorizationFactor("force-vector-width", cl::Hidden,
76 cl::desc("Sets the SIMD width. Zero is autoselect."),
77 cl::location(L&: VectorizerParams::VectorizationFactor));
78ElementCount VectorizerParams::VectorizationFactor;
79
80static cl::opt<unsigned, true>
81VectorizationInterleave("force-vector-interleave", cl::Hidden,
82 cl::desc("Sets the vectorization interleave count. "
83 "Zero is autoselect."),
84 cl::location(
85 L&: VectorizerParams::VectorizationInterleave));
86unsigned VectorizerParams::VectorizationInterleave;
87
88static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
89 "runtime-memory-check-threshold", cl::Hidden,
90 cl::desc("When performing memory disambiguation checks at runtime do not "
91 "generate more than this number of comparisons (default = 8)."),
92 cl::location(L&: VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(Val: 8));
93unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
94
95/// The maximum iterations used to merge memory checks
96static cl::opt<unsigned> MemoryCheckMergeThreshold(
97 "memory-check-merge-threshold", cl::Hidden,
98 cl::desc("Maximum number of comparisons done when trying to merge "
99 "runtime memory checks. (default = 100)"),
100 cl::init(Val: 100));
101
102/// Maximum SIMD width.
103const unsigned VectorizerParams::MaxVectorWidth = 64;
104
105/// We collect dependences up to this threshold.
106static cl::opt<unsigned>
107 MaxDependences("max-dependences", cl::Hidden,
108 cl::desc("Maximum number of dependences collected by "
109 "loop-access analysis (default = 100)"),
110 cl::init(Val: 100));
111
112/// This enables versioning on the strides of symbolically striding memory
113/// accesses in code like the following.
114/// for (i = 0; i < N; ++i)
115/// A[i * Stride1] += B[i * Stride2] ...
116///
117/// Will be roughly translated to
118/// if (Stride1 == 1 && Stride2 == 1) {
119/// for (i = 0; i < N; i+=4)
120/// A[i:i+3] += ...
121/// } else
122/// ...
123static cl::opt<bool> EnableMemAccessVersioning(
124 "enable-mem-access-versioning", cl::init(Val: true), cl::Hidden,
125 cl::desc("Enable symbolic stride memory access versioning"));
126
127/// Enable store-to-load forwarding conflict detection. This option can
128/// be disabled for correctness testing.
129static cl::opt<bool> EnableForwardingConflictDetection(
130 "store-to-load-forwarding-conflict-detection", cl::Hidden,
131 cl::desc("Enable conflict detection in loop-access analysis"),
132 cl::init(Val: true));
133
134static cl::opt<unsigned> MaxForkedSCEVDepth(
135 "max-forked-scev-depth", cl::Hidden,
136 cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
137 cl::init(Val: 5));
138
139static cl::opt<bool> SpeculateUnitStride(
140 "laa-speculate-unit-stride", cl::Hidden,
141 cl::desc("Speculate that non-constant strides are unit in LAA"),
142 cl::init(Val: true));
143
144static cl::opt<bool, true> HoistRuntimeChecks(
145 "hoist-runtime-checks", cl::Hidden,
146 cl::desc(
147 "Hoist inner loop runtime memory checks to outer loop if possible"),
148 cl::location(L&: VectorizerParams::HoistRuntimeChecks), cl::init(Val: true));
149bool VectorizerParams::HoistRuntimeChecks;
150
151bool VectorizerParams::isInterleaveForced() {
152 return ::VectorizationInterleave.getNumOccurrences() > 0;
153}
154
155const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
156 const DenseMap<Value *, const SCEV *> &PtrToStride,
157 Value *Ptr) {
158 const SCEV *OrigSCEV = PSE.getSCEV(V: Ptr);
159
160 // If there is an entry in the map return the SCEV of the pointer with the
161 // symbolic stride replaced by one.
162 const SCEV *StrideSCEV = PtrToStride.lookup(Val: Ptr);
163 if (!StrideSCEV)
164 // For a non-symbolic stride, just return the original expression.
165 return OrigSCEV;
166
167 // Note: This assert is both overly strong and overly weak. The actual
168 // invariant here is that StrideSCEV should be loop invariant. The only
169 // such invariant strides we happen to speculate right now are unknowns
170 // and thus this is a reasonable proxy of the actual invariant.
171 assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");
172
173 ScalarEvolution *SE = PSE.getSE();
174 const SCEV *CT = SE->getOne(Ty: StrideSCEV->getType());
175 PSE.addPredicate(Pred: *SE->getEqualPredicate(LHS: StrideSCEV, RHS: CT));
176 const SCEV *Expr = PSE.getSCEV(V: Ptr);
177
178 LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
179 << " by: " << *Expr << "\n");
180 return Expr;
181}
182
183RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
184 unsigned Index, const RuntimePointerChecking &RtCheck)
185 : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
186 AddressSpace(RtCheck.Pointers[Index]
187 .PointerValue->getType()
188 ->getPointerAddressSpace()),
189 NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
190 Members.push_back(Elt: Index);
191}
192
193/// Returns \p A + \p B, if it is guaranteed not to unsigned wrap. Otherwise
194/// return nullptr. \p A and \p B must have the same type.
195static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B,
196 ScalarEvolution &SE) {
197 if (!SE.willNotOverflow(BinOp: Instruction::Add, /*IsSigned=*/Signed: false, LHS: A, RHS: B))
198 return nullptr;
199 return SE.getAddExpr(LHS: A, RHS: B);
200}
201
202/// Returns \p A * \p B, if it is guaranteed not to unsigned wrap. Otherwise
203/// return nullptr. \p A and \p B must have the same type.
204static const SCEV *mulSCEVNoOverflow(const SCEV *A, const SCEV *B,
205 ScalarEvolution &SE) {
206 if (!SE.willNotOverflow(BinOp: Instruction::Mul, /*IsSigned=*/Signed: false, LHS: A, RHS: B))
207 return nullptr;
208 return SE.getMulExpr(LHS: A, RHS: B);
209}
210
211/// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
212/// \p MaxBTC is guaranteed inbounds of the accessed object.
213static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
214 const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize,
215 ScalarEvolution &SE, const DataLayout &DL, DominatorTree *DT,
216 AssumptionCache *AC,
217 std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
218 auto *PointerBase = SE.getPointerBase(V: AR->getStart());
219 auto *StartPtr = dyn_cast<SCEVUnknown>(Val: PointerBase);
220 if (!StartPtr)
221 return false;
222 const Loop *L = AR->getLoop();
223 bool CheckForNonNull;
224 Value *StartPtrV = StartPtr->getValue();
225 // We can ignore frees, as the fact that an object of a certain size existed
226 // at the location *at some point* is sufficient to derive the nowrap fact.
227 uint64_t DerefBytes = StartPtrV->getPointerDereferenceableBytes(
228 DL, CanBeNull&: CheckForNonNull, /*CanBeFreed=*/nullptr);
229
230 if (DerefBytes && CheckForNonNull)
231 return false;
232
233 const SCEV *Step = AR->getStepRecurrence(SE);
234 Type *WiderTy = SE.getWiderType(Ty1: MaxBTC->getType(), Ty2: Step->getType());
235 const SCEV *DerefBytesSCEV = SE.getConstant(Ty: WiderTy, V: DerefBytes);
236
237 // Check if we have a suitable dereferencable assumption we can use.
238 Instruction *CtxI = &*L->getHeader()->getFirstNonPHIIt();
239 if (BasicBlock *LoopPred = L->getLoopPredecessor()) {
240 if (isa<UncondBrInst, CondBrInst>(Val: LoopPred->getTerminator()))
241 CtxI = LoopPred->getTerminator();
242 }
243 getKnowledgeForValue(
244 V: StartPtrV, AttrKinds: Attribute::Dereferenceable, AC&: *AC,
245 Filter: [&](RetainedKnowledge RK, Instruction *Assume, auto) {
246 if (!isValidAssumeForContext(I: Assume, CxtI: CtxI, DT))
247 return false;
248 const SCEV *DerefRKSCEV = SE.getSCEV(V: RK.IRArgValue);
249 Type *CommonTy =
250 SE.getWiderType(Ty1: DerefBytesSCEV->getType(), Ty2: DerefRKSCEV->getType());
251 DerefBytesSCEV = SE.getNoopOrZeroExtend(V: DerefBytesSCEV, Ty: CommonTy);
252 DerefRKSCEV = SE.getNoopOrZeroExtend(V: DerefRKSCEV, Ty: CommonTy);
253 DerefBytesSCEV = SE.getUMaxExpr(LHS: DerefBytesSCEV, RHS: DerefRKSCEV);
254 // Continue with other assumptions.
255 return false;
256 });
257
258 if (DerefBytesSCEV->isZero())
259 return false;
260
261 bool IsKnownNonNegative = SE.isKnownNonNegative(S: Step);
262 if (!IsKnownNonNegative && !SE.isKnownNegative(S: Step))
263 return false;
264
265 Step = SE.getNoopOrSignExtend(V: Step, Ty: WiderTy);
266 MaxBTC = SE.getNoopOrZeroExtend(V: MaxBTC, Ty: WiderTy);
267
268 // For the computations below, make sure they don't unsigned wrap.
269 if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_UGE, LHS: AR->getStart(), RHS: StartPtr))
270 return false;
271 const SCEV *StartOffset = SE.getNoopOrZeroExtend(
272 V: SE.getMinusSCEV(LHS: AR->getStart(), RHS: StartPtr), Ty: WiderTy);
273
274 if (!LoopGuards)
275 LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: AR->getLoop(), SE));
276 MaxBTC = SE.applyLoopGuards(Expr: MaxBTC, Guards: *LoopGuards);
277
278 const SCEV *OffsetAtLastIter =
279 mulSCEVNoOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE);
280 if (!OffsetAtLastIter) {
281 // Re-try with constant max backedge-taken count if using the symbolic one
282 // failed.
283 MaxBTC = SE.getConstantMaxBackedgeTakenCount(L: AR->getLoop());
284 if (isa<SCEVCouldNotCompute>(Val: MaxBTC))
285 return false;
286 MaxBTC = SE.getNoopOrZeroExtend(
287 V: MaxBTC, Ty: WiderTy);
288 OffsetAtLastIter =
289 mulSCEVNoOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE);
290 if (!OffsetAtLastIter)
291 return false;
292 }
293
294 const SCEV *OffsetEndBytes = addSCEVNoOverflow(
295 A: OffsetAtLastIter, B: SE.getNoopOrZeroExtend(V: EltSize, Ty: WiderTy), SE);
296 if (!OffsetEndBytes)
297 return false;
298
299 if (IsKnownNonNegative) {
300 // For positive steps, check if
301 // (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes,
302 // while making sure none of the computations unsigned wrap themselves.
303 const SCEV *EndBytes = addSCEVNoOverflow(A: StartOffset, B: OffsetEndBytes, SE);
304 if (!EndBytes)
305 return false;
306
307 DerefBytesSCEV = SE.applyLoopGuards(Expr: DerefBytesSCEV, Guards: *LoopGuards);
308 return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: EndBytes, RHS: DerefBytesSCEV);
309 }
310
311 // For negative steps check if
312 // * StartOffset >= (MaxBTC * Step + EltSize)
313 // * StartOffset <= DerefBytes.
314 assert(SE.isKnownNegative(Step) && "must be known negative");
315 return SE.isKnownPredicate(Pred: CmpInst::ICMP_SGE, LHS: StartOffset, RHS: OffsetEndBytes) &&
316 SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: StartOffset, RHS: DerefBytesSCEV);
317}
318
319std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
320 const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC,
321 const SCEV *MaxBTC, ScalarEvolution *SE,
322 DenseMap<std::pair<const SCEV *, const SCEV *>,
323 std::pair<const SCEV *, const SCEV *>> *PointerBounds,
324 DominatorTree *DT, AssumptionCache *AC,
325 std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
326 auto &DL = Lp->getHeader()->getDataLayout();
327 Type *IdxTy = DL.getIndexType(PtrTy: PtrExpr->getType());
328 const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IntTy: IdxTy, StoreTy: AccessTy);
329
330 // Delegate to the SCEV-based overload, passing through the cache.
331 return getStartAndEndForAccess(Lp, PtrExpr, EltSizeSCEV, BTC, MaxBTC, SE,
332 PointerBounds, DT, AC, LoopGuards);
333}
334
335std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
336 const Loop *Lp, const SCEV *PtrExpr, const SCEV *EltSizeSCEV,
337 const SCEV *BTC, const SCEV *MaxBTC, ScalarEvolution *SE,
338 DenseMap<std::pair<const SCEV *, const SCEV *>,
339 std::pair<const SCEV *, const SCEV *>> *PointerBounds,
340 DominatorTree *DT, AssumptionCache *AC,
341 std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
342 std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
343 if (PointerBounds) {
344 auto [Iter, Ins] = PointerBounds->insert(
345 KV: {{PtrExpr, EltSizeSCEV},
346 {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
347 if (!Ins)
348 return Iter->second;
349 PtrBoundsPair = &Iter->second;
350 }
351
352 const SCEV *ScStart;
353 const SCEV *ScEnd;
354
355 auto &DL = Lp->getHeader()->getDataLayout();
356 if (SE->isLoopInvariant(S: PtrExpr, L: Lp)) {
357 ScStart = ScEnd = PtrExpr;
358 } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrExpr)) {
359 ScStart = AR->getStart();
360 if (!isa<SCEVCouldNotCompute>(Val: BTC))
361 // Evaluating AR at an exact BTC is safe: LAA separately checks that
362 // accesses cannot wrap in the loop. If evaluating AR at BTC wraps, then
363 // the loop either triggers UB when executing a memory access with a
364 // poison pointer or the wrapping/poisoned pointer is not used.
365 ScEnd = AR->evaluateAtIteration(It: BTC, SE&: *SE);
366 else {
367 // Evaluating AR at MaxBTC may wrap and create an expression that is less
368 // than the start of the AddRec due to wrapping (for example consider
369 // MaxBTC = -2). If that's the case, set ScEnd to -(EltSize + 1). ScEnd
370 // will get incremented by EltSize before returning, so this effectively
371 // sets ScEnd to the maximum unsigned value for the type. Note that LAA
372 // separately checks that accesses cannot not wrap, so unsigned max
373 // represents an upper bound.
374 if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSize: EltSizeSCEV, SE&: *SE, DL,
375 DT, AC, LoopGuards)) {
376 ScEnd = AR->evaluateAtIteration(It: MaxBTC, SE&: *SE);
377 } else {
378 ScEnd = SE->getAddExpr(
379 LHS: SE->getNegativeSCEV(V: EltSizeSCEV),
380 RHS: SE->getSCEV(V: ConstantExpr::getIntToPtr(
381 C: ConstantInt::getAllOnesValue(Ty: EltSizeSCEV->getType()),
382 Ty: AR->getType())));
383 }
384 }
385 const SCEV *Step = AR->getStepRecurrence(SE&: *SE);
386
387 // For expressions with negative step, the upper bound is ScStart and the
388 // lower bound is ScEnd.
389 if (const auto *CStep = dyn_cast<SCEVConstant>(Val: Step)) {
390 if (CStep->getValue()->isNegative())
391 std::swap(a&: ScStart, b&: ScEnd);
392 } else {
393 // Fallback case: the step is not constant, but we can still
394 // get the upper and lower bounds of the interval by using min/max
395 // expressions.
396 ScStart = SE->getUMinExpr(LHS: ScStart, RHS: ScEnd);
397 ScEnd = SE->getUMaxExpr(LHS: AR->getStart(), RHS: ScEnd);
398 }
399 } else
400 return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};
401
402 assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
403 assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
404
405 // Add the size of the pointed element to ScEnd.
406 ScEnd = SE->getAddExpr(LHS: ScEnd, RHS: EltSizeSCEV);
407
408 std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
409 if (PointerBounds)
410 *PtrBoundsPair = Res;
411 return Res;
412}
413
414/// Calculate Start and End points of memory access using
415/// getStartAndEndForAccess.
416void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
417 Type *AccessTy, bool WritePtr,
418 unsigned DepSetId, unsigned ASId,
419 PredicatedScalarEvolution &PSE,
420 bool NeedsFreeze) {
421 const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
422 const SCEV *BTC = PSE.getBackedgeTakenCount();
423 const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
424 Lp, PtrExpr, AccessTy, BTC, MaxBTC: SymbolicMaxBTC, SE: PSE.getSE(),
425 PointerBounds: &DC.getPointerBounds(), DT: DC.getDT(), AC: DC.getAC(), LoopGuards);
426 assert(!isa<SCEVCouldNotCompute>(ScStart) &&
427 !isa<SCEVCouldNotCompute>(ScEnd) &&
428 "must be able to compute both start and end expressions");
429 Pointers.emplace_back(Args&: Ptr, Args: ScStart, Args: ScEnd, Args&: WritePtr, Args&: DepSetId, Args&: ASId, Args&: PtrExpr,
430 Args&: NeedsFreeze);
431}
432
433bool RuntimePointerChecking::tryToCreateDiffCheck(
434 const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
435 // If either group contains multiple different pointers, bail out.
436 // TODO: Support multiple pointers by using the minimum or maximum pointer,
437 // depending on src & sink.
438 if (CGI.Members.size() != 1 || CGJ.Members.size() != 1)
439 return false;
440
441 const PointerInfo *Src = &Pointers[CGI.Members[0]];
442 const PointerInfo *Sink = &Pointers[CGJ.Members[0]];
443
444 // If either pointer is read and written, multiple checks may be needed. Bail
445 // out.
446 if (!DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: !Src->IsWritePtr).empty() ||
447 !DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: !Sink->IsWritePtr).empty())
448 return false;
449
450 ArrayRef<unsigned> AccSrc =
451 DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: Src->IsWritePtr);
452 ArrayRef<unsigned> AccSink =
453 DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: Sink->IsWritePtr);
454 // If either pointer is accessed multiple times, there may not be a clear
455 // src/sink relation. Bail out for now.
456 if (AccSrc.size() != 1 || AccSink.size() != 1)
457 return false;
458
459 // If the sink is accessed before src, swap src/sink.
460 if (AccSink[0] < AccSrc[0])
461 std::swap(a&: Src, b&: Sink);
462
463 const SCEVConstant *Step;
464 const SCEV *SrcStart;
465 const SCEV *SinkStart;
466 const Loop *InnerLoop = DC.getInnermostLoop();
467 if (!match(S: Src->Expr,
468 P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SrcStart), Op1: m_SCEVConstant(V&: Step),
469 L: m_SpecificLoop(L: InnerLoop))) ||
470 !match(S: Sink->Expr,
471 P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SinkStart), Op1: m_scev_Specific(S: Step),
472 L: m_SpecificLoop(L: InnerLoop))))
473 return false;
474
475 SmallVector<Instruction *, 4> SrcInsts =
476 DC.getInstructionsForAccess(Ptr: Src->PointerValue, isWrite: Src->IsWritePtr);
477 SmallVector<Instruction *, 4> SinkInsts =
478 DC.getInstructionsForAccess(Ptr: Sink->PointerValue, isWrite: Sink->IsWritePtr);
479 Type *SrcTy = getLoadStoreType(I: SrcInsts[0]);
480 Type *DstTy = getLoadStoreType(I: SinkInsts[0]);
481 if (isa<ScalableVectorType>(Val: SrcTy) || isa<ScalableVectorType>(Val: DstTy))
482 return false;
483
484 const DataLayout &DL = InnerLoop->getHeader()->getDataLayout();
485 unsigned AllocSize =
486 std::max(a: DL.getTypeAllocSize(Ty: SrcTy), b: DL.getTypeAllocSize(Ty: DstTy));
487
488 // Only matching constant steps matching the AllocSize are supported at the
489 // moment. This simplifies the difference computation. Can be extended in the
490 // future.
491 if (Step->getAPInt().abs() != AllocSize)
492 return false;
493
494 // When counting down, the dependence distance needs to be swapped.
495 if (Step->getValue()->isNegative())
496 std::swap(a&: SinkStart, b&: SrcStart);
497
498 const SCEV *SinkStartInt = SE->getPtrToAddrExpr(Op: SinkStart);
499 const SCEV *SrcStartInt = SE->getPtrToAddrExpr(Op: SrcStart);
500 if (isa<SCEVCouldNotCompute>(Val: SinkStartInt) ||
501 isa<SCEVCouldNotCompute>(Val: SrcStartInt))
502 return false;
503
504 // If the start values for both Src and Sink also vary according to an outer
505 // loop, then it's probably better to avoid creating diff checks because
506 // they may not be hoisted. We should instead let llvm::addRuntimeChecks
507 // do the expanded full range overlap checks, which can be hoisted.
508 if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
509 isa<SCEVAddRecExpr>(Val: SinkStartInt) && isa<SCEVAddRecExpr>(Val: SrcStartInt)) {
510 auto *SrcStartAR = cast<SCEVAddRecExpr>(Val: SrcStartInt);
511 auto *SinkStartAR = cast<SCEVAddRecExpr>(Val: SinkStartInt);
512 const Loop *StartARLoop = SrcStartAR->getLoop();
513 if (StartARLoop == SinkStartAR->getLoop() &&
514 StartARLoop == InnerLoop->getParentLoop() &&
515 // If the diff check would already be loop invariant (due to the
516 // recurrences being the same), then we prefer to keep the diff checks
517 // because they are cheaper.
518 SrcStartAR->getStepRecurrence(SE&: *SE) !=
519 SinkStartAR->getStepRecurrence(SE&: *SE)) {
520 LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
521 "cannot be hoisted out of the outer loop\n");
522 return false;
523 }
524 }
525
526 LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
527 << "SrcStart: " << *SrcStartInt << '\n'
528 << "SinkStartInt: " << *SinkStartInt << '\n');
529 DiffChecks.emplace_back(Args&: SrcStartInt, Args&: SinkStartInt, Args&: AllocSize,
530 Args: Src->NeedsFreeze || Sink->NeedsFreeze);
531 return true;
532}
533
534SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
535 SmallVector<RuntimePointerCheck, 4> Checks;
536
537 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
538 for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
539 const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
540 const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];
541
542 if (needsChecking(M: CGI, N: CGJ)) {
543 CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ);
544 Checks.emplace_back(Args: &CGI, Args: &CGJ);
545 }
546 }
547 }
548 return Checks;
549}
550
551void RuntimePointerChecking::generateChecks(
552 MemoryDepChecker::DepCandidates &DepCands) {
553 assert(Checks.empty() && "Checks is not empty");
554 groupChecks(DepCands);
555 Checks = generateChecks();
556}
557
558bool RuntimePointerChecking::needsChecking(
559 const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
560 for (const auto &I : M.Members)
561 for (const auto &J : N.Members)
562 if (needsChecking(I, J))
563 return true;
564 return false;
565}
566
567/// Compare \p I and \p J and return the minimum.
568/// Return nullptr in case we couldn't find an answer.
569static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
570 ScalarEvolution *SE) {
571 std::optional<APInt> Diff = SE->computeConstantDifference(LHS: J, RHS: I);
572 if (!Diff)
573 return nullptr;
574 return Diff->isNegative() ? J : I;
575}
576
577bool RuntimeCheckingPtrGroup::addPointer(
578 unsigned Index, const RuntimePointerChecking &RtCheck) {
579 return addPointer(
580 Index, Start: RtCheck.Pointers[Index].Start, End: RtCheck.Pointers[Index].End,
581 AS: RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
582 NeedsFreeze: RtCheck.Pointers[Index].NeedsFreeze, SE&: *RtCheck.SE);
583}
584
585bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
586 const SCEV *End, unsigned AS,
587 bool NeedsFreeze,
588 ScalarEvolution &SE) {
589 assert(AddressSpace == AS &&
590 "all pointers in a checking group must be in the same address space");
591
592 // Compare the starts and ends with the known minimum and maximum
593 // of this set. We need to know how we compare against the min/max
594 // of the set in order to be able to emit memchecks.
595 const SCEV *Min0 = getMinFromExprs(I: Start, J: Low, SE: &SE);
596 if (!Min0)
597 return false;
598
599 const SCEV *Min1 = getMinFromExprs(I: End, J: High, SE: &SE);
600 if (!Min1)
601 return false;
602
603 // Update the low bound expression if we've found a new min value.
604 if (Min0 == Start)
605 Low = Start;
606
607 // Update the high bound expression if we've found a new max value.
608 if (Min1 != End)
609 High = End;
610
611 Members.push_back(Elt: Index);
612 this->NeedsFreeze |= NeedsFreeze;
613 return true;
614}
615
616void RuntimePointerChecking::groupChecks(
617 MemoryDepChecker::DepCandidates &DepCands) {
618 // We build the groups from dependency candidates equivalence classes
619 // because:
620 // - We know that pointers in the same equivalence class share
621 // the same underlying object and therefore there is a chance
622 // that we can compare pointers
623 // - We wouldn't be able to merge two pointers for which we need
624 // to emit a memcheck. The classes in DepCands are already
625 // conveniently built such that no two pointers in the same
626 // class need checking against each other.
627
628 // We use the following (greedy) algorithm to construct the groups
629 // For every pointer in the equivalence class:
630 // For each existing group:
631 // - if the difference between this pointer and the min/max bounds
632 // of the group is a constant, then make the pointer part of the
633 // group and update the min/max bounds of that group as required.
634
635 CheckingGroups.clear();
636
637 // If we need to check two pointers to the same underlying object
638 // with a non-constant difference, we shouldn't perform any pointer
639 // grouping with those pointers. This is because we can easily get
640 // into cases where the resulting check would return false, even when
641 // the accesses are safe.
642 //
643 // The following example shows this:
644 // for (i = 0; i < 1000; ++i)
645 // a[5000 + i * m] = a[i] + a[i + 9000]
646 //
647 // Here grouping gives a check of (5000, 5000 + 1000 * m) against
648 // (0, 10000) which is always false. However, if m is 1, there is no
649 // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
650 // us to perform an accurate check in this case.
651 //
652 // In the above case, we have a non-constant distance and an Unknown
653 // dependence between accesses to the same underlying object, and could retry
654 // with runtime checks without dependency information being available. In this
655 // case we will use the fallback path and create separate checking groups for
656 // accesses not present in DepCands.
657
658 unsigned TotalComparisons = 0;
659
660 DenseMap<Value *, SmallVector<unsigned>> PositionMap;
661 for (unsigned Index = 0; Index < Pointers.size(); ++Index)
662 PositionMap[Pointers[Index].PointerValue].push_back(Elt: Index);
663
664 // We need to keep track of what pointers we've already seen so we
665 // don't process them twice.
666 SmallSet<unsigned, 2> Seen;
667
668 // Go through all equivalence classes, get the "pointer check groups"
669 // and add them to the overall solution. We use the order in which accesses
670 // appear in 'Pointers' to enforce determinism.
671 for (unsigned I = 0; I < Pointers.size(); ++I) {
672 // We've seen this pointer before, and therefore already processed
673 // its equivalence class.
674 if (Seen.contains(V: I))
675 continue;
676
677 MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
678 Pointers[I].IsWritePtr);
679
680 // If there is no entry in the dependency partition, there are no potential
681 // accesses to merge; simply add a new pointer checking group.
682 if (!DepCands.contains(V: Access)) {
683 CheckingGroups.push_back(Elt: RuntimeCheckingPtrGroup(I, *this));
684 continue;
685 }
686
687 SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
688
689 // Because DepCands is constructed by visiting accesses in the order in
690 // which they appear in alias sets (which is deterministic) and the
691 // iteration order within an equivalence class member is only dependent on
692 // the order in which unions and insertions are performed on the
693 // equivalence class, the iteration order is deterministic.
694 for (auto M : DepCands.members(V: Access)) {
695 auto PointerI = PositionMap.find(Val: M.getPointer());
696 // If we can't find the pointer in PositionMap that means we can't
697 // generate a memcheck for it.
698 if (PointerI == PositionMap.end())
699 continue;
700 for (unsigned Pointer : PointerI->second) {
701 bool Merged = false;
702 // Mark this pointer as seen.
703 Seen.insert(V: Pointer);
704
705 // Go through all the existing sets and see if we can find one
706 // which can include this pointer.
707 for (RuntimeCheckingPtrGroup &Group : Groups) {
708 // Don't perform more than a certain amount of comparisons.
709 // This should limit the cost of grouping the pointers to something
710 // reasonable. If we do end up hitting this threshold, the algorithm
711 // will create separate groups for all remaining pointers.
712 if (TotalComparisons > MemoryCheckMergeThreshold)
713 break;
714
715 TotalComparisons++;
716
717 if (Group.addPointer(Index: Pointer, RtCheck: *this)) {
718 Merged = true;
719 break;
720 }
721 }
722
723 if (!Merged)
724 // We couldn't add this pointer to any existing set or the threshold
725 // for the number of comparisons has been reached. Create a new group
726 // to hold the current pointer.
727 Groups.emplace_back(Args&: Pointer, Args&: *this);
728 }
729 }
730
731 // We've computed the grouped checks for this partition.
732 // Save the results and continue with the next one.
733 llvm::append_range(C&: CheckingGroups, R&: Groups);
734 }
735}
736
737bool RuntimePointerChecking::arePointersInSamePartition(
738 const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
739 unsigned PtrIdx2) {
740 return (PtrToPartition[PtrIdx1] != -1 &&
741 PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
742}
743
744bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
745 const PointerInfo &PointerI = Pointers[I];
746 const PointerInfo &PointerJ = Pointers[J];
747
748 // No need to check if two readonly pointers intersect.
749 if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
750 return false;
751
752 // Only need to check pointers between two different dependency sets.
753 if (PointerI.DependencySetId == PointerJ.DependencySetId)
754 return false;
755
756 // Only need to check pointers in the same alias set.
757 return PointerI.AliasSetId == PointerJ.AliasSetId;
758}
759
760/// Assign each RuntimeCheckingPtrGroup pointer an index for stable UTC output.
761static DenseMap<const RuntimeCheckingPtrGroup *, unsigned>
762getPtrToIdxMap(ArrayRef<RuntimeCheckingPtrGroup> CheckingGroups) {
763 DenseMap<const RuntimeCheckingPtrGroup *, unsigned> PtrIndices;
764 for (const auto &[Idx, CG] : enumerate(First&: CheckingGroups))
765 PtrIndices[&CG] = Idx;
766 return PtrIndices;
767}
768
769void RuntimePointerChecking::printChecks(
770 raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
771 unsigned Depth) const {
772 unsigned N = 0;
773 auto PtrIndices = getPtrToIdxMap(CheckingGroups);
774 for (const auto &[Check1, Check2] : Checks) {
775 const auto &First = Check1->Members, &Second = Check2->Members;
776 OS.indent(NumSpaces: Depth) << "Check " << N++ << ":\n";
777 OS.indent(NumSpaces: Depth + 2) << "Comparing group GRP" << PtrIndices.at(Val: Check1)
778 << ":\n";
779 for (unsigned K : First)
780 OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n";
781 OS.indent(NumSpaces: Depth + 2) << "Against group GRP" << PtrIndices.at(Val: Check2)
782 << ":\n";
783 for (unsigned K : Second)
784 OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n";
785 }
786}
787
788void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
789
790 OS.indent(NumSpaces: Depth) << "Run-time memory checks:\n";
791 printChecks(OS, Checks, Depth);
792
793 OS.indent(NumSpaces: Depth) << "Grouped accesses:\n";
794 auto PtrIndices = getPtrToIdxMap(CheckingGroups);
795 for (const auto &CG : CheckingGroups) {
796 OS.indent(NumSpaces: Depth + 2) << "Group GRP" << PtrIndices.at(Val: &CG) << ":\n";
797 OS.indent(NumSpaces: Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
798 << ")\n";
799 for (unsigned Member : CG.Members) {
800 OS.indent(NumSpaces: Depth + 6) << "Member: " << *Pointers[Member].Expr << "\n";
801 }
802 }
803}
804
805namespace {
806
807/// Analyses memory accesses in a loop.
808///
809/// Checks whether run time pointer checks are needed and builds sets for data
810/// dependence checking.
811class AccessAnalysis {
812public:
813 using MemAccessInfo =
814 PointerIntPair<Value * /* AccessPtr */, 1, bool /* IsWrite */>;
815
816 AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
817 DominatorTree &DT, MemoryDepChecker::DepCandidates &DA,
818 PredicatedScalarEvolution &PSE,
819 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
820 : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA),
821 PSE(PSE), LoopAliasScopes(LoopAliasScopes) {
822 // We're analyzing dependences across loop iterations.
823 BAA.enableCrossIterationMode();
824 }
825
826 /// Register a load and whether it is only read from.
827 void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
828 Value *Ptr = const_cast<Value *>(Loc.Ptr);
829 AST.add(Loc: adjustLoc(Loc));
830 Accesses[MemAccessInfo(Ptr, false)].insert(X: AccessTy);
831 if (IsReadOnly)
832 ReadOnlyPtr.insert(Ptr);
833 }
834
835 /// Register a store.
836 void addStore(const MemoryLocation &Loc, Type *AccessTy) {
837 Value *Ptr = const_cast<Value *>(Loc.Ptr);
838 AST.add(Loc: adjustLoc(Loc));
839 Accesses[MemAccessInfo(Ptr, true)].insert(X: AccessTy);
840 }
841
842 /// Check if we can emit a run-time no-alias check for \p Access.
843 ///
844 /// Returns true if we can emit a run-time no alias check for \p Access.
845 /// If we can check this access, this also adds it to a dependence set and
846 /// adds a run-time to check for it to \p RtCheck. If \p Assume is true,
847 /// we will attempt to use additional run-time checks in order to get
848 /// the bounds of the pointer.
849 bool createCheckForAccess(RuntimePointerChecking &RtCheck,
850 MemAccessInfo Access, Type *AccessTy,
851 const DenseMap<Value *, const SCEV *> &Strides,
852 DenseMap<Value *, unsigned> &DepSetId,
853 Loop *TheLoop, unsigned &RunningDepId,
854 unsigned ASId, bool Assume);
855
856 /// Check whether we can check the pointers at runtime for
857 /// non-intersection.
858 ///
859 /// Returns true if we need no check or if we do and we can generate them
860 /// (i.e. the pointers have computable bounds). A return value of false means
861 /// we couldn't analyze and generate runtime checks for all pointers in the
862 /// loop, but if \p AllowPartial is set then we will have checks for those
863 /// pointers we could analyze. \p DepChecker is used to remove unknown
864 /// dependences from DepCands.
865 bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, Loop *TheLoop,
866 const DenseMap<Value *, const SCEV *> &Strides,
867 Value *&UncomputablePtr, bool AllowPartial,
868 const MemoryDepChecker &DepChecker);
869
870 /// Goes over all memory accesses, checks whether a RT check is needed
871 /// and builds sets of dependent accesses.
872 void buildDependenceSets();
873
874 /// Initial processing of memory accesses determined that we need to
875 /// perform dependency checking.
876 ///
877 /// Note that this can later be cleared if we retry memcheck analysis without
878 /// dependency checking (i.e. ShouldRetryWithRuntimeChecks).
879 bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); }
880
881 /// We decided that no dependence analysis would be used. Reset the state.
882 void resetDepChecks(MemoryDepChecker &DepChecker) {
883 CheckDeps.clear();
884 DepChecker.clearDependences();
885 }
886
887 ArrayRef<MemAccessInfo> getDependenciesToCheck() const { return CheckDeps; }
888
889private:
890 using PtrAccessMap = MapVector<MemAccessInfo, SmallSetVector<Type *, 1>>;
891
892 /// Adjust the MemoryLocation so that it represents accesses to this
893 /// location across all iterations, rather than a single one.
894 MemoryLocation adjustLoc(MemoryLocation Loc) const {
895 // The accessed location varies within the loop, but remains within the
896 // underlying object.
897 Loc.Size = LocationSize::beforeOrAfterPointer();
898 Loc.AATags.Scope = adjustAliasScopeList(ScopeList: Loc.AATags.Scope);
899 Loc.AATags.NoAlias = adjustAliasScopeList(ScopeList: Loc.AATags.NoAlias);
900 return Loc;
901 }
902
903 /// Drop alias scopes that are only valid within a single loop iteration.
904 MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
905 if (!ScopeList)
906 return nullptr;
907
908 // For the sake of simplicity, drop the whole scope list if any scope is
909 // iteration-local.
910 if (any_of(Range: ScopeList->operands(), P: [&](Metadata *Scope) {
911 return LoopAliasScopes.contains(Ptr: cast<MDNode>(Val: Scope));
912 }))
913 return nullptr;
914
915 return ScopeList;
916 }
917
918 /// Map of all accesses. Values are the types used to access memory pointed to
919 /// by the pointer.
920 PtrAccessMap Accesses;
921
922 /// The loop being checked.
923 const Loop *TheLoop;
924
925 /// List of accesses that need a further dependence check.
926 SmallVector<MemAccessInfo, 8> CheckDeps;
927
928 /// Set of pointers that are read only.
929 SmallPtrSet<Value*, 16> ReadOnlyPtr;
930
931 /// Batched alias analysis results.
932 BatchAAResults BAA;
933
934 /// An alias set tracker to partition the access set by underlying object and
935 //intrinsic property (such as TBAA metadata).
936 AliasSetTracker AST;
937
938 /// The LoopInfo of the loop being checked.
939 const LoopInfo *LI;
940
941 /// The dominator tree of the function.
942 DominatorTree &DT;
943
944 /// Sets of potentially dependent accesses - members of one set share an
945 /// underlying pointer. The set "CheckDeps" identfies which sets really need a
946 /// dependence check.
947 MemoryDepChecker::DepCandidates &DepCands;
948
949 /// Initial processing of memory accesses determined that we may need
950 /// to add memchecks. Perform the analysis to determine the necessary checks.
951 ///
952 /// Note that, this is different from isDependencyCheckNeeded. When we retry
953 /// memcheck analysis without dependency checking
954 /// (i.e. ShouldRetryWithRuntimeChecks), isDependencyCheckNeeded is
955 /// cleared while this remains set if we have potentially dependent accesses.
956 bool IsRTCheckAnalysisNeeded = false;
957
958 /// The SCEV predicate containing all the SCEV-related assumptions.
959 PredicatedScalarEvolution &PSE;
960
961 DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;
962
963 /// Alias scopes that are declared inside the loop, and as such not valid
964 /// across iterations.
965 SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
966};
967
968} // end anonymous namespace
969
970/// Try to compute a constant stride for \p AR. Used by getPtrStride and
971/// isNoWrap.
972static std::optional<int64_t>
973getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
974 Value *Ptr, PredicatedScalarEvolution &PSE) {
975 if (isa<ScalableVectorType>(Val: AccessTy)) {
976 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
977 << "\n");
978 return std::nullopt;
979 }
980
981 // The access function must stride over the innermost loop.
982 if (Lp != AR->getLoop()) {
983 LLVM_DEBUG({
984 dbgs() << "LAA: Bad stride - Not striding over innermost loop ";
985 if (Ptr)
986 dbgs() << *Ptr << " ";
987
988 dbgs() << "SCEV: " << *AR << "\n";
989 });
990 return std::nullopt;
991 }
992
993 // Check the step is constant.
994 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
995
996 // Calculate the pointer stride and check if it is constant.
997 const APInt *APStepVal;
998 if (!match(S: Step, P: m_scev_APInt(C&: APStepVal))) {
999 LLVM_DEBUG({
1000 dbgs() << "LAA: Bad stride - Not a constant strided ";
1001 if (Ptr)
1002 dbgs() << *Ptr << " ";
1003 dbgs() << "SCEV: " << *AR << "\n";
1004 });
1005 return std::nullopt;
1006 }
1007
1008 const auto &DL = Lp->getHeader()->getDataLayout();
1009 TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy);
1010 int64_t Size = AllocSize.getFixedValue();
1011
1012 // Huge step value - give up.
1013 std::optional<int64_t> StepVal = APStepVal->trySExtValue();
1014 if (!StepVal)
1015 return std::nullopt;
1016
1017 // Strided access.
1018 return *StepVal % Size ? std::nullopt : std::make_optional(t: *StepVal / Size);
1019}
1020
1021/// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use
1022/// information from the IR pointer value to determine no-wrap. If \p Predicates
1023/// is not nullptr add no-wrap assumptions if needed.
1024static bool
1025isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr,
1026 Type *AccessTy, const Loop *L, const DominatorTree &DT,
1027 std::optional<int64_t> Stride = std::nullopt,
1028 SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr) {
1029 // FIXME: This should probably only return true for NUW.
1030 if (any(Val: AR->getNoWrapFlags(Mask: SCEV::NoWrapMask)))
1031 return true;
1032
1033 if (Ptr && PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW))
1034 return true;
1035
1036 // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap,
1037 // the distance between the previously accessed location and the wrapped
1038 // location will be larger than half the pointer index type space. In that
1039 // case, the GEP would be poison and any memory access dependent on it would
1040 // be immediate UB when executed.
1041 if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Val: Ptr);
1042 GEP && GEP->hasNoUnsignedSignedWrap()) {
1043 // For the above reasoning to apply, the pointer must be dereferenced in
1044 // every iteration.
1045 if (L->getHeader() == L->getLoopLatch() ||
1046 any_of(Range: GEP->users(), P: [L, &DT, GEP](User *U) {
1047 if (getLoadStorePointerOperand(V: U) != GEP)
1048 return false;
1049 BasicBlock *UserBB = cast<Instruction>(Val: U)->getParent();
1050 if (!L->contains(BB: UserBB))
1051 return false;
1052 return !LoopAccessInfo::blockNeedsPredication(BB: UserBB, TheLoop: L, DT: &DT);
1053 }))
1054 return true;
1055 }
1056
1057 if (!Stride)
1058 Stride = getStrideFromAddRec(AR, Lp: L, AccessTy, Ptr, PSE);
1059 if (Stride) {
1060 // If the null pointer is undefined, then a access sequence which would
1061 // otherwise access it can be assumed not to unsigned wrap. Note that this
1062 // assumes the object in memory is aligned to the natural alignment.
1063 unsigned AddrSpace = AR->getType()->getPointerAddressSpace();
1064 if (!NullPointerIsDefined(F: L->getHeader()->getParent(), AS: AddrSpace) &&
1065 (Stride == 1 || Stride == -1))
1066 return true;
1067 }
1068
1069 if (Ptr && Predicates) {
1070 ScalarEvolution &SE = *PSE.getSE();
1071 SCEVWrapPredicate::IncrementWrapFlags Flags = SCEVWrapPredicate::clearFlags(
1072 Flags: SCEVWrapPredicate::IncrementNUSW,
1073 OffFlags: SCEVWrapPredicate::getImpliedFlags(AR, SE));
1074 Predicates->push_back(Elt: SE.getWrapPredicate(AR, AddedFlags: Flags));
1075 LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n"
1076 << "LAA: Pointer: " << *Ptr << "\n"
1077 << "LAA: SCEV: " << *AR << "\n"
1078 << "LAA: Added an overflow assumption\n");
1079 return true;
1080 }
1081
1082 return false;
1083}
1084
1085static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
1086 function_ref<void(Value *)> AddPointer) {
1087 SmallPtrSet<Value *, 8> Visited;
1088 SmallVector<Value *> WorkList;
1089 WorkList.push_back(Elt: StartPtr);
1090
1091 while (!WorkList.empty()) {
1092 Value *Ptr = WorkList.pop_back_val();
1093 if (!Visited.insert(Ptr).second)
1094 continue;
1095 auto *PN = dyn_cast<PHINode>(Val: Ptr);
1096 // SCEV does not look through non-header PHIs inside the loop. Such phis
1097 // can be analyzed by adding separate accesses for each incoming pointer
1098 // value.
1099 if (PN && InnermostLoop.contains(BB: PN->getParent()) &&
1100 PN->getParent() != InnermostLoop.getHeader()) {
1101 llvm::append_range(C&: WorkList, R: PN->incoming_values());
1102 } else
1103 AddPointer(Ptr);
1104 }
1105}
1106
1107// Walk back through the IR for a pointer, looking for a select like the
1108// following:
1109//
1110// %offset = select i1 %cmp, i64 %a, i64 %b
1111// %addr = getelementptr double, double* %base, i64 %offset
1112// %ld = load double, double* %addr, align 8
1113//
1114// We won't be able to form a single SCEVAddRecExpr from this since the
1115// address for each loop iteration depends on %cmp. We could potentially
1116// produce multiple valid SCEVAddRecExprs, though, and check all of them for
1117// memory safety/aliasing if needed.
1118//
1119// If we encounter some IR we don't yet handle, or something obviously fine
1120// like a constant, then we just add the SCEV for that term to the list passed
1121// in by the caller. If we have a node that may potentially yield a valid
1122// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
1123// ourselves before adding to the list.
1124static void findForkedSCEVs(
1125 ScalarEvolution *SE, const Loop *L, Value *Ptr,
1126 SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
1127 unsigned Depth) {
1128 // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
1129 // we've exceeded our limit on recursion, just return whatever we have
1130 // regardless of whether it can be used for a forked pointer or not, along
1131 // with an indication of whether it might be a poison or undef value.
1132 const SCEV *Scev = SE->getSCEV(V: Ptr);
1133 if (isa<SCEVAddRecExpr>(Val: Scev) || L->isLoopInvariant(V: Ptr) ||
1134 !isa<Instruction>(Val: Ptr) || Depth == 0) {
1135 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1136 return;
1137 }
1138
1139 Depth--;
1140
1141 auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
1142 return get<1>(Pair: S);
1143 };
1144
1145 auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
1146 switch (Opcode) {
1147 case Instruction::Add:
1148 return SE->getAddExpr(LHS: L, RHS: R);
1149 case Instruction::Sub:
1150 return SE->getMinusSCEV(LHS: L, RHS: R);
1151 default:
1152 llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
1153 }
1154 };
1155
1156 Instruction *I = cast<Instruction>(Val: Ptr);
1157 unsigned Opcode = I->getOpcode();
1158 switch (Opcode) {
1159 case Instruction::GetElementPtr: {
1160 auto *GEP = cast<GetElementPtrInst>(Val: I);
1161 Type *SourceTy = GEP->getSourceElementType();
1162 // We only handle base + single offset GEPs here for now.
1163 // Not dealing with preexisting gathers yet, so no vectors.
1164 if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
1165 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: GEP));
1166 break;
1167 }
1168 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
1169 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
1170 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: BaseScevs, Depth);
1171 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: OffsetScevs, Depth);
1172
1173 // See if we need to freeze our fork...
1174 bool NeedsFreeze = any_of(Range&: BaseScevs, P: UndefPoisonCheck) ||
1175 any_of(Range&: OffsetScevs, P: UndefPoisonCheck);
1176
1177 // Check that we only have a single fork, on either the base or the offset.
1178 // Copy the SCEV across for the one without a fork in order to generate
1179 // the full SCEV for both sides of the GEP.
1180 if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
1181 BaseScevs.push_back(Elt: BaseScevs[0]);
1182 else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
1183 OffsetScevs.push_back(Elt: OffsetScevs[0]);
1184 else {
1185 ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze);
1186 break;
1187 }
1188
1189 Type *IntPtrTy = SE->getEffectiveSCEVType(Ty: GEP->getPointerOperandType());
1190
1191 // Find the size of the type being pointed to. We only have a single
1192 // index term (guarded above) so we don't need to index into arrays or
1193 // structures, just get the size of the scalar value.
1194 const SCEV *Size = SE->getSizeOfExpr(IntTy: IntPtrTy, AllocTy: SourceTy);
1195
1196 for (auto [B, O] : zip(t&: BaseScevs, u&: OffsetScevs)) {
1197 const SCEV *Base = get<0>(Pair: B);
1198 const SCEV *Offset = get<0>(Pair: O);
1199
1200 // Scale up the offsets by the size of the type, then add to the bases.
1201 const SCEV *Scaled =
1202 SE->getMulExpr(LHS: Size, RHS: SE->getTruncateOrSignExtend(V: Offset, Ty: IntPtrTy));
1203 ScevList.emplace_back(Args: SE->getAddExpr(LHS: Base, RHS: Scaled), Args&: NeedsFreeze);
1204 }
1205 break;
1206 }
1207 case Instruction::Select: {
1208 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
1209 // A select means we've found a forked pointer, but we currently only
1210 // support a single select per pointer so if there's another behind this
1211 // then we just bail out and return the generic SCEV.
1212 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth);
1213 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 2), ScevList&: ChildScevs, Depth);
1214 if (ChildScevs.size() == 2)
1215 append_range(C&: ScevList, R&: ChildScevs);
1216 else
1217 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1218 break;
1219 }
1220 case Instruction::PHI: {
1221 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
1222 // A phi means we've found a forked pointer, but we currently only
1223 // support a single phi per pointer so if there's another behind this
1224 // then we just bail out and return the generic SCEV.
1225 if (I->getNumOperands() == 2) {
1226 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: ChildScevs, Depth);
1227 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth);
1228 }
1229 if (ChildScevs.size() == 2)
1230 append_range(C&: ScevList, R&: ChildScevs);
1231 else
1232 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1233 break;
1234 }
1235 case Instruction::Add:
1236 case Instruction::Sub: {
1237 SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs;
1238 SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs;
1239 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: LScevs, Depth);
1240 findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: RScevs, Depth);
1241
1242 // See if we need to freeze our fork...
1243 bool NeedsFreeze =
1244 any_of(Range&: LScevs, P: UndefPoisonCheck) || any_of(Range&: RScevs, P: UndefPoisonCheck);
1245
1246 // Check that we only have a single fork, on either the left or right side.
1247 // Copy the SCEV across for the one without a fork in order to generate
1248 // the full SCEV for both sides of the BinOp.
1249 if (LScevs.size() == 2 && RScevs.size() == 1)
1250 RScevs.push_back(Elt: RScevs[0]);
1251 else if (RScevs.size() == 2 && LScevs.size() == 1)
1252 LScevs.push_back(Elt: LScevs[0]);
1253 else {
1254 ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze);
1255 break;
1256 }
1257
1258 for (auto [L, R] : zip(t&: LScevs, u&: RScevs))
1259 ScevList.emplace_back(Args: GetBinOpExpr(Opcode, get<0>(Pair: L), get<0>(Pair: R)),
1260 Args&: NeedsFreeze);
1261 break;
1262 }
1263 default:
1264 // Just return the current SCEV if we haven't handled the instruction yet.
1265 LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
1266 ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr));
1267 break;
1268 }
1269}
1270
1271bool AccessAnalysis::createCheckForAccess(
1272 RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy,
1273 const DenseMap<Value *, const SCEV *> &StridesMap,
1274 DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop,
1275 unsigned &RunningDepId, unsigned ASId, bool Assume) {
1276 Value *Ptr = Access.getPointer();
1277 ScalarEvolution *SE = PSE.getSE();
1278 assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
1279
1280 SmallVector<PointerIntPair<const SCEV *, 1, bool>> RTCheckPtrs;
1281 findForkedSCEVs(SE, L: TheLoop, Ptr, ScevList&: RTCheckPtrs, Depth: MaxForkedSCEVDepth);
1282 assert(!RTCheckPtrs.empty() &&
1283 "Must have some runtime-check pointer candidates");
1284
1285 // RTCheckPtrs must have size 2 if there are forked pointers. Otherwise, there
1286 // are no forked pointers; replaceSymbolicStridesSCEV in this case.
1287 auto IsLoopInvariantOrAR =
1288 [&SE, &TheLoop](const PointerIntPair<const SCEV *, 1, bool> &P) {
1289 return SE->isLoopInvariant(S: P.getPointer(), L: TheLoop) ||
1290 isa<SCEVAddRecExpr>(Val: P.getPointer());
1291 };
1292 if (RTCheckPtrs.size() == 2 && all_of(Range&: RTCheckPtrs, P: IsLoopInvariantOrAR)) {
1293 LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n";
1294 for (const auto &[Idx, Q] : enumerate(RTCheckPtrs)) dbgs()
1295 << "\t(" << Idx << ") " << *Q.getPointer() << "\n");
1296 } else {
1297 RTCheckPtrs = {{replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr), false}};
1298 }
1299
1300 /// Check whether all pointers can participate in a runtime bounds check. They
1301 /// must either be invariant or non-wrapping affine AddRecs.
1302 SmallVector<const SCEVPredicate *> Predicates;
1303 for (auto &P : RTCheckPtrs) {
1304 // The bounds for loop-invariant pointer is trivial.
1305 if (SE->isLoopInvariant(S: P.getPointer(), L: TheLoop))
1306 continue;
1307
1308 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: P.getPointer());
1309 if (!AR && Assume)
1310 AR = PSE.getAsAddRec(V: Ptr, WrapPredsAdded: &Predicates);
1311 if (!AR || !AR->isAffine())
1312 return false;
1313
1314 // If there's only one option for Ptr, commit the predicates collected by
1315 // getAsAddRec and look Ptr up again afterwards: the lookup below reads the
1316 // assumptions back from PSE, so they need to be committed first.
1317 if (RTCheckPtrs.size() == 1) {
1318 PSE.addPredicates(Preds: Predicates);
1319 Predicates.clear();
1320 AR =
1321 cast<SCEVAddRecExpr>(Val: replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr));
1322 P.setPointer(AR);
1323 }
1324
1325 if (!isNoWrap(PSE, AR, Ptr: RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
1326 L: TheLoop, DT, /*Stride=*/std::nullopt,
1327 Predicates: Assume ? &Predicates : nullptr))
1328 return false;
1329 }
1330 PSE.addPredicates(Preds: Predicates);
1331
1332 for (const auto &[PtrExpr, NeedsFreeze] : RTCheckPtrs) {
1333 // The id of the dependence set.
1334 unsigned DepId;
1335
1336 if (DepCands.contains(V: Access)) {
1337 Value *Leader = DepCands.getLeaderValue(V: Access).getPointer();
1338 unsigned &LeaderId = DepSetId[Leader];
1339 if (!LeaderId)
1340 LeaderId = RunningDepId++;
1341 DepId = LeaderId;
1342 } else
1343 // Each access has its own dependence set.
1344 DepId = RunningDepId++;
1345
1346 bool IsWrite = Access.getInt();
1347 RtCheck.insert(Lp: TheLoop, Ptr, PtrExpr, AccessTy, WritePtr: IsWrite, DepSetId: DepId, ASId, PSE,
1348 NeedsFreeze);
1349 LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
1350 }
1351
1352 return true;
1353}
1354
1355bool AccessAnalysis::canCheckPtrAtRT(
1356 RuntimePointerChecking &RtCheck, Loop *TheLoop,
1357 const DenseMap<Value *, const SCEV *> &StridesMap, Value *&UncomputablePtr,
1358 bool AllowPartial, const MemoryDepChecker &DepChecker) {
1359 // Find pointers with computable bounds. We are going to use this information
1360 // to place a runtime bound check.
1361 bool CanDoRT = true;
1362
1363 bool MayNeedRTCheck = false;
1364 if (!IsRTCheckAnalysisNeeded) return true;
1365
1366 if (auto *Deps = DepChecker.getDependences()) {
1367 // If there are unknown dependences, this means runtime checks are needed to
1368 // ensure there's no overlap between accesses to the same underlying object.
1369 // Remove the equivalence classes containing both source and destination
1370 // accesses from DepCands. This ensures runtime checks will be generated
1371 // between those accesses and prevents them from being grouped together.
1372 for (const auto &Dep : *Deps) {
1373 if (Dep.Type != MemoryDepChecker::Dependence::Unknown) {
1374 assert(MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) ==
1375 MemoryDepChecker::VectorizationSafetyStatus::Safe &&
1376 "Should only skip safe dependences");
1377 continue;
1378 }
1379 Instruction *Src = Dep.getSource(DepChecker);
1380 Instruction *Dst = Dep.getDestination(DepChecker);
1381 DepCands.eraseClass(V: {getPointerOperand(V: Src), Src->mayWriteToMemory()});
1382 DepCands.eraseClass(V: {getPointerOperand(V: Dst), Dst->mayWriteToMemory()});
1383 }
1384 } else {
1385 CheckDeps.clear();
1386 DepCands = {};
1387 }
1388
1389 // We assign a consecutive id to access from different alias sets.
1390 // Accesses between different groups doesn't need to be checked.
1391 unsigned ASId = 0;
1392 for (const auto &AS : AST) {
1393 int NumReadPtrChecks = 0;
1394 int NumWritePtrChecks = 0;
1395 bool CanDoAliasSetRT = true;
1396 ++ASId;
1397 auto ASPointers = AS.getPointers();
1398
1399 // We assign consecutive id to access from different dependence sets.
1400 // Accesses within the same set don't need a runtime check.
1401 unsigned RunningDepId = 1;
1402 DenseMap<Value *, unsigned> DepSetId;
1403
1404 SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;
1405
1406 // First, count how many write and read accesses are in the alias set. Also
1407 // collect MemAccessInfos for later.
1408 SmallVector<MemAccessInfo, 4> AccessInfos;
1409 for (const Value *ConstPtr : ASPointers) {
1410 Value *Ptr = const_cast<Value *>(ConstPtr);
1411 bool IsWrite = Accesses.contains(Key: MemAccessInfo(Ptr, true));
1412 if (IsWrite)
1413 ++NumWritePtrChecks;
1414 else
1415 ++NumReadPtrChecks;
1416 AccessInfos.emplace_back(Args&: Ptr, Args&: IsWrite);
1417 }
1418
1419 // We do not need runtime checks for this alias set, if there are no writes
1420 // or a single write and no reads.
1421 if (NumWritePtrChecks == 0 ||
1422 (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
1423 assert((ASPointers.size() <= 1 ||
1424 all_of(ASPointers,
1425 [this](const Value *Ptr) {
1426 MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
1427 true);
1428 return !DepCands.contains(AccessWrite);
1429 })) &&
1430 "Can only skip updating CanDoRT below, if all entries in AS "
1431 "are reads or there is at most 1 entry");
1432 continue;
1433 }
1434
1435 for (auto &Access : AccessInfos) {
1436 for (const auto &AccessTy : Accesses[Access]) {
1437 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1438 DepSetId, TheLoop, RunningDepId, ASId,
1439 Assume: false)) {
1440 LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
1441 << *Access.getPointer() << '\n');
1442 Retries.emplace_back(Args&: Access, Args: AccessTy);
1443 CanDoAliasSetRT = false;
1444 }
1445 }
1446 }
1447
1448 // Note that this function computes CanDoRT and MayNeedRTCheck
1449 // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
1450 // we have a pointer for which we couldn't find the bounds but we don't
1451 // actually need to emit any checks so it does not matter.
1452 //
1453 // We need runtime checks for this alias set, if there are at least 2
1454 // dependence sets (in which case RunningDepId > 2) or if we need to re-try
1455 // any bound checks (because in that case the number of dependence sets is
1456 // incomplete).
1457 bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
1458
1459 // We need to perform run-time alias checks, but some pointers had bounds
1460 // that couldn't be checked.
1461 if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
1462 // Reset the CanDoSetRt flag and retry all accesses that have failed.
1463 // We know that we need these checks, so we can now be more aggressive
1464 // and add further checks if required (overflow checks).
1465 CanDoAliasSetRT = true;
1466 for (const auto &[Access, AccessTy] : Retries) {
1467 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1468 DepSetId, TheLoop, RunningDepId, ASId,
1469 /*Assume=*/true)) {
1470 CanDoAliasSetRT = false;
1471 UncomputablePtr = Access.getPointer();
1472 if (!AllowPartial)
1473 break;
1474 }
1475 }
1476 }
1477
1478 CanDoRT &= CanDoAliasSetRT;
1479 MayNeedRTCheck |= NeedsAliasSetRTCheck;
1480 ++ASId;
1481 }
1482
1483 // If the pointers that we would use for the bounds comparison have different
1484 // address spaces, assume the values aren't directly comparable, so we can't
1485 // use them for the runtime check. We also have to assume they could
1486 // overlap. In the future there should be metadata for whether address spaces
1487 // are disjoint.
1488 unsigned NumPointers = RtCheck.Pointers.size();
1489 for (unsigned i = 0; i < NumPointers; ++i) {
1490 for (unsigned j = i + 1; j < NumPointers; ++j) {
1491 // Only need to check pointers between two different dependency sets.
1492 if (RtCheck.Pointers[i].DependencySetId ==
1493 RtCheck.Pointers[j].DependencySetId)
1494 continue;
1495 // Only need to check pointers in the same alias set.
1496 if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
1497 continue;
1498
1499 Value *PtrI = RtCheck.Pointers[i].PointerValue;
1500 Value *PtrJ = RtCheck.Pointers[j].PointerValue;
1501
1502 unsigned ASi = PtrI->getType()->getPointerAddressSpace();
1503 unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
1504 if (ASi != ASj) {
1505 LLVM_DEBUG(
1506 dbgs() << "LAA: Runtime check would require comparison between"
1507 " different address spaces\n");
1508 return false;
1509 }
1510 }
1511 }
1512
1513 if (MayNeedRTCheck && (CanDoRT || AllowPartial))
1514 RtCheck.generateChecks(DepCands);
1515
1516 LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
1517 << " pointer comparisons.\n");
1518
1519 // If we can do run-time checks, but there are no checks, no runtime checks
1520 // are needed. This can happen when all pointers point to the same underlying
1521 // object for example.
1522 RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
1523
1524 bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
1525 assert(CanDoRTIfNeeded == (CanDoRT || !MayNeedRTCheck) &&
1526 "CanDoRTIfNeeded depends on RtCheck.Need");
1527 if (!CanDoRTIfNeeded && !AllowPartial)
1528 RtCheck.reset();
1529 return CanDoRTIfNeeded;
1530}
1531
1532void AccessAnalysis::buildDependenceSets() {
1533 // We process the set twice: first we process read-write pointers, last we
1534 // process read-only pointers. This allows us to skip dependence tests for
1535 // read-only pointers.
1536
1537 LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
1538 LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
1539 LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
1540 LLVM_DEBUG({
1541 for (const auto &[A, _] : Accesses)
1542 dbgs() << "\t" << *A.getPointer() << " ("
1543 << (A.getInt()
1544 ? "write"
1545 : (ReadOnlyPtr.contains(A.getPointer()) ? "read-only"
1546 : "read"))
1547 << ")\n";
1548 });
1549
1550 // The AliasSetTracker has nicely partitioned our pointers by metadata
1551 // compatibility and potential for underlying-object overlap. As a result, we
1552 // only need to check for potential pointer dependencies within each alias
1553 // set.
1554 for (const auto &AS : AST) {
1555 bool AliasSetHasWrite = false;
1556
1557 // Map of (pointer to underlying objects, accessed address space) to last
1558 // access encountered.
1559 using UnderlyingObjToAccessMap =
1560 DenseMap<std::pair<const Value *, unsigned>, MemAccessInfo>;
1561 UnderlyingObjToAccessMap ObjToLastAccess;
1562
1563 // Set of access to check after all writes have been processed.
1564 PtrAccessMap DeferredAccesses;
1565
1566 // Iterate over each alias set twice, once to process read/write pointers,
1567 // and then to process read-only pointers.
1568
1569 auto ProcessAccesses = [&](bool UseDeferred) {
1570 PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
1571
1572 // Note that both the alias-set tracker and the alias sets themselves used
1573 // ordered collections internally and so the iteration order here is
1574 // deterministic.
1575 for (const Value *ConstPtr : AS.getPointers()) {
1576 Value *Ptr = const_cast<Value *>(ConstPtr);
1577
1578 // For a single memory access in AliasSetTracker, Accesses may contain
1579 // both read and write, and they both need to be handled for CheckDeps.
1580 for (auto [AccessPtr, IsWrite] : S.keys()) {
1581 if (AccessPtr != Ptr)
1582 continue;
1583
1584 // If we're using the deferred access set, then it contains only
1585 // reads.
1586 bool IsReadOnlyPtr = ReadOnlyPtr.contains(Ptr) && !IsWrite;
1587 if (UseDeferred && !IsReadOnlyPtr)
1588 continue;
1589 // Otherwise, the pointer must be in the PtrAccessSet, either as a
1590 // read or a write.
1591 assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
1592 S.contains(MemAccessInfo(Ptr, false))) &&
1593 "Alias-set pointer not in the access set?");
1594
1595 MemAccessInfo Access(Ptr, IsWrite);
1596 DepCands.insert(Data: Access);
1597
1598 // Memorize read-only pointers for later processing and skip them in
1599 // the first round (they need to be checked after we have seen all
1600 // write pointers). Note: we also mark pointer that are not
1601 // consecutive as "read-only" pointers (so that we check
1602 // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1603 if (!UseDeferred && IsReadOnlyPtr) {
1604 // We only use the pointer keys, the types vector values don't
1605 // matter.
1606 DeferredAccesses.insert(KV: {Access, {}});
1607 continue;
1608 }
1609
1610 // If this is a write - check other reads and writes for conflicts. If
1611 // this is a read only check other writes for conflicts (but only if
1612 // there is no other write to the ptr - this is an optimization to
1613 // catch "a[i] = a[i] + " without having to do a dependence check).
1614 if ((IsWrite || IsReadOnlyPtr) && AliasSetHasWrite) {
1615 CheckDeps.push_back(Elt: Access);
1616 IsRTCheckAnalysisNeeded = true;
1617 }
1618
1619 if (IsWrite)
1620 AliasSetHasWrite = true;
1621
1622 // Create sets of pointers connected by a shared alias set and
1623 // underlying object.
1624 SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
1625 UOs = {};
1626 ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI);
1627 LLVM_DEBUG(dbgs()
1628 << "Underlying objects for pointer " << *Ptr << "\n");
1629 for (const Value *UnderlyingObj : UOs) {
1630 // nullptr never alias, don't join sets for pointer that have "null"
1631 // in their UnderlyingObjects list.
1632 if (isa<ConstantPointerNull>(Val: UnderlyingObj) &&
1633 !NullPointerIsDefined(
1634 F: TheLoop->getHeader()->getParent(),
1635 AS: UnderlyingObj->getType()->getPointerAddressSpace()))
1636 continue;
1637
1638 auto [It, Inserted] = ObjToLastAccess.try_emplace(
1639 Key: {UnderlyingObj,
1640 cast<PointerType>(Val: Ptr->getType())->getAddressSpace()},
1641 Args&: Access);
1642 if (!Inserted) {
1643 DepCands.unionSets(V1: Access, V2: It->second);
1644 It->second = Access;
1645 }
1646
1647 LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
1648 }
1649 }
1650 }
1651 };
1652
1653 ProcessAccesses(false);
1654 ProcessAccesses(true);
1655 }
1656}
1657
1658/// Check whether the access through \p Ptr has a constant stride.
1659std::optional<int64_t> llvm::getPtrStride(
1660 PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp,
1661 const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap,
1662 bool ShouldCheckWrap, SmallVectorImpl<const SCEVPredicate *> *Predicates) {
1663 const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr);
1664 if (PSE.getSE()->isLoopInvariant(S: PtrScev, L: Lp))
1665 return 0;
1666
1667 assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
1668
1669 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev);
1670 if (Predicates && !AR) {
1671 AR = PSE.getSE()->convertSCEVToAddRecWithPredicates(S: PtrScev, L: Lp,
1672 Preds&: *Predicates);
1673 }
1674
1675 if (!AR) {
1676 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
1677 << " SCEV: " << *PtrScev << "\n");
1678 return std::nullopt;
1679 }
1680
1681 std::optional<int64_t> Stride =
1682 getStrideFromAddRec(AR, Lp, AccessTy, Ptr, PSE);
1683 if (!ShouldCheckWrap || !Stride)
1684 return Stride;
1685
1686 if (isNoWrap(PSE, AR, Ptr, AccessTy, L: Lp, DT, Stride, Predicates))
1687 return Stride;
1688
1689 LLVM_DEBUG(
1690 dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
1691 << *Ptr << " SCEV: " << *AR << "\n");
1692 return std::nullopt;
1693}
1694
1695/// Check whether the access through \p Ptr has a constant stride.
1696std::optional<int64_t>
1697llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
1698 const Loop *Lp, const DominatorTree &DT,
1699 const DenseMap<Value *, const SCEV *> &StridesMap,
1700 bool Assume, bool ShouldCheckWrap) {
1701 SmallVector<const SCEVPredicate *> Predicates;
1702 std::optional<int64_t> Stride =
1703 getPtrStride(PSE, AccessTy, Ptr, Lp, DT, StridesMap, ShouldCheckWrap,
1704 Predicates: Assume ? &Predicates : nullptr);
1705 PSE.addPredicates(Preds: Predicates);
1706 return Stride;
1707}
1708
1709std::optional<int64_t> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
1710 Type *ElemTyB, Value *PtrB,
1711 const DataLayout &DL,
1712 ScalarEvolution &SE,
1713 bool StrictCheck, bool CheckType) {
1714 assert(PtrA && PtrB && "Expected non-nullptr pointers.");
1715
1716 // Make sure that A and B are different pointers.
1717 if (PtrA == PtrB)
1718 return 0;
1719
1720 // Make sure that the element types are the same if required.
1721 if (CheckType && ElemTyA != ElemTyB)
1722 return std::nullopt;
1723
1724 unsigned ASA = PtrA->getType()->getPointerAddressSpace();
1725 unsigned ASB = PtrB->getType()->getPointerAddressSpace();
1726
1727 // Check that the address spaces match.
1728 if (ASA != ASB)
1729 return std::nullopt;
1730 unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA);
1731
1732 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
1733 const Value *PtrA1 = PtrA->stripAndAccumulateConstantOffsets(
1734 DL, Offset&: OffsetA, /*AllowNonInbounds=*/true);
1735 const Value *PtrB1 = PtrB->stripAndAccumulateConstantOffsets(
1736 DL, Offset&: OffsetB, /*AllowNonInbounds=*/true);
1737
1738 std::optional<int64_t> Val;
1739 if (PtrA1 == PtrB1) {
1740 // Retrieve the address space again as pointer stripping now tracks through
1741 // `addrspacecast`.
1742 ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace();
1743 ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace();
1744 // Check that the address spaces match and that the pointers are valid.
1745 if (ASA != ASB)
1746 return std::nullopt;
1747
1748 IdxWidth = DL.getIndexSizeInBits(AS: ASA);
1749 OffsetA = OffsetA.sextOrTrunc(width: IdxWidth);
1750 OffsetB = OffsetB.sextOrTrunc(width: IdxWidth);
1751
1752 OffsetB -= OffsetA;
1753 Val = OffsetB.trySExtValue();
1754 } else {
1755 // Otherwise compute the distance with SCEV between the base pointers.
1756 const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA);
1757 const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB);
1758 std::optional<APInt> Diff =
1759 SE.computeConstantDifference(LHS: PtrSCEVB, RHS: PtrSCEVA);
1760 if (!Diff)
1761 return std::nullopt;
1762 Val = Diff->trySExtValue();
1763 }
1764
1765 if (!Val)
1766 return std::nullopt;
1767
1768 int64_t Size = DL.getTypeStoreSize(Ty: ElemTyA);
1769 int64_t Dist = *Val / Size;
1770
1771 // Ensure that the calculated distance matches the type-based one after all
1772 // the bitcasts removal in the provided pointers.
1773 if (!StrictCheck || Dist * Size == Val)
1774 return Dist;
1775 return std::nullopt;
1776}
1777
1778bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
1779 const DataLayout &DL, ScalarEvolution &SE,
1780 SmallVectorImpl<unsigned> &SortedIndices) {
1781 assert(llvm::all_of(
1782 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1783 "Expected list of pointer operands.");
1784 // Walk over the pointers, and map each of them to an offset relative to
1785 // first pointer in the array.
1786 Value *Ptr0 = VL[0];
1787
1788 using DistOrdPair = std::pair<int64_t, unsigned>;
1789 auto Compare = llvm::less_first();
1790 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
1791 Offsets.emplace(args: 0, args: 0);
1792 bool IsConsecutive = true;
1793 for (auto [Idx, Ptr] : drop_begin(RangeOrContainer: enumerate(First&: VL))) {
1794 std::optional<int64_t> Diff =
1795 getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE,
1796 /*StrictCheck=*/true);
1797 if (!Diff)
1798 return false;
1799
1800 // Check if the pointer with the same offset is found.
1801 int64_t Offset = *Diff;
1802 auto [It, IsInserted] = Offsets.emplace(args&: Offset, args&: Idx);
1803 if (!IsInserted)
1804 return false;
1805 // Consecutive order if the inserted element is the last one.
1806 IsConsecutive &= std::next(x: It) == Offsets.end();
1807 }
1808 SortedIndices.clear();
1809 if (!IsConsecutive) {
1810 // Fill SortedIndices array only if it is non-consecutive.
1811 SortedIndices.resize(N: VL.size());
1812 for (auto [Idx, Off] : enumerate(First&: Offsets))
1813 SortedIndices[Idx] = Off.second;
1814 }
1815 return true;
1816}
1817
1818/// Returns true if the memory operations \p A and \p B are consecutive.
1819bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
1820 ScalarEvolution &SE, bool CheckType) {
1821 Value *PtrA = getLoadStorePointerOperand(V: A);
1822 Value *PtrB = getLoadStorePointerOperand(V: B);
1823 if (!PtrA || !PtrB)
1824 return false;
1825 Type *ElemTyA = getLoadStoreType(I: A);
1826 Type *ElemTyB = getLoadStoreType(I: B);
1827 std::optional<int64_t> Diff =
1828 getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
1829 /*StrictCheck=*/true, CheckType);
1830 return Diff == 1;
1831}
1832
1833void MemoryDepChecker::addAccess(StoreInst *SI) {
1834 visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop,
1835 AddPointer: [this, SI](Value *Ptr) {
1836 Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx);
1837 InstMap.push_back(Elt: SI);
1838 ++AccessIdx;
1839 });
1840}
1841
1842void MemoryDepChecker::addAccess(LoadInst *LI) {
1843 visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop,
1844 AddPointer: [this, LI](Value *Ptr) {
1845 Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx);
1846 InstMap.push_back(Elt: LI);
1847 ++AccessIdx;
1848 });
1849}
1850
1851MemoryDepChecker::VectorizationSafetyStatus
1852MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
1853 switch (Type) {
1854 case NoDep:
1855 case Forward:
1856 case BackwardVectorizable:
1857 return VectorizationSafetyStatus::Safe;
1858
1859 case Unknown:
1860 return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
1861 case ForwardButPreventsForwarding:
1862 case Backward:
1863 case BackwardVectorizableButPreventsForwarding:
1864 case IndirectUnsafe:
1865 case InvariantUnsafe:
1866 return VectorizationSafetyStatus::Unsafe;
1867 }
1868 llvm_unreachable("unexpected DepType!");
1869}
1870
1871bool MemoryDepChecker::Dependence::isBackward() const {
1872 switch (Type) {
1873 case NoDep:
1874 case Forward:
1875 case ForwardButPreventsForwarding:
1876 case Unknown:
1877 case IndirectUnsafe:
1878 case InvariantUnsafe:
1879 return false;
1880
1881 case BackwardVectorizable:
1882 case Backward:
1883 case BackwardVectorizableButPreventsForwarding:
1884 return true;
1885 }
1886 llvm_unreachable("unexpected DepType!");
1887}
1888
1889bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
1890 return isBackward() || Type == Unknown || Type == IndirectUnsafe ||
1891 Type == InvariantUnsafe;
1892}
1893
1894bool MemoryDepChecker::Dependence::isForward() const {
1895 switch (Type) {
1896 case Forward:
1897 case ForwardButPreventsForwarding:
1898 return true;
1899
1900 case NoDep:
1901 case Unknown:
1902 case BackwardVectorizable:
1903 case Backward:
1904 case BackwardVectorizableButPreventsForwarding:
1905 case IndirectUnsafe:
1906 case InvariantUnsafe:
1907 return false;
1908 }
1909 llvm_unreachable("unexpected DepType!");
1910}
1911
1912bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
1913 uint64_t TypeByteSize,
1914 unsigned CommonStride) {
1915 // If loads occur at a distance that is not a multiple of a feasible vector
1916 // factor store-load forwarding does not take place.
1917 // Positive dependences might cause troubles because vectorizing them might
1918 // prevent store-load forwarding making vectorized code run a lot slower.
1919 // a[i] = a[i-3] ^ a[i-8];
1920 // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
1921 // hence on your typical architecture store-load forwarding does not take
1922 // place. Vectorizing in such cases does not make sense.
1923 // Store-load forwarding distance.
1924
1925 // After this many iterations store-to-load forwarding conflicts should not
1926 // cause any slowdowns.
1927 const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
1928 // Maximum vector factor.
1929 uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
1930 std::min(a: VectorizerParams::MaxVectorWidth * TypeByteSize,
1931 b: MaxStoreLoadForwardSafeDistanceInBits);
1932
1933 // Compute the smallest VF at which the store and load would be misaligned.
1934 for (uint64_t VF = 2 * TypeByteSize;
1935 VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
1936 // If the number of vector iteration between the store and the load are
1937 // small we could incur conflicts.
1938 if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
1939 MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
1940 break;
1941 }
1942 }
1943
1944 if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) {
1945 LLVM_DEBUG(
1946 dbgs() << "LAA: Distance " << Distance
1947 << " that could cause a store-load forwarding conflict\n");
1948 return true;
1949 }
1950
1951 if (CommonStride &&
1952 MaxVFWithoutSLForwardIssuesPowerOf2 <
1953 MaxStoreLoadForwardSafeDistanceInBits &&
1954 MaxVFWithoutSLForwardIssuesPowerOf2 !=
1955 VectorizerParams::MaxVectorWidth * TypeByteSize) {
1956 uint64_t MaxVF =
1957 bit_floor(Value: MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride);
1958 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
1959 MaxStoreLoadForwardSafeDistanceInBits =
1960 std::min(a: MaxStoreLoadForwardSafeDistanceInBits, b: MaxVFInBits);
1961 }
1962 return false;
1963}
1964
1965void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1966 if (Status < S)
1967 Status = S;
1968}
1969
1970/// Given a dependence-distance \p Dist between two memory accesses, that have
1971/// strides in the same direction whose absolute value of the maximum stride is
1972/// given in \p MaxStride, in a loop whose maximum backedge taken count is \p
1973/// MaxBTC, check if it is possible to prove statically that the dependence
1974/// distance is larger than the range that the accesses will travel through the
1975/// execution of the loop. If so, return true; false otherwise. This is useful
1976/// for example in loops such as the following (PR31098):
1977///
1978/// for (i = 0; i < D; ++i) {
1979/// = out[i];
1980/// out[i+D] =
1981/// }
1982static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
1983 const SCEV &MaxBTC, const SCEV &Dist,
1984 uint64_t MaxStride) {
1985
1986 // If we can prove that
1987 // (**) |Dist| > MaxBTC * Step
1988 // where Step is the absolute stride of the memory accesses in bytes,
1989 // then there is no dependence.
1990 //
1991 // Rationale:
1992 // We basically want to check if the absolute distance (|Dist/Step|)
1993 // is >= the loop iteration count (or > MaxBTC).
1994 // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
1995 // Section 4.2.1); Note, that for vectorization it is sufficient to prove
1996 // that the dependence distance is >= VF; This is checked elsewhere.
1997 // But in some cases we can prune dependence distances early, and
1998 // even before selecting the VF, and without a runtime test, by comparing
1999 // the distance against the loop iteration count. Since the vectorized code
2000 // will be executed only if LoopCount >= VF, proving distance >= LoopCount
2001 // also guarantees that distance >= VF.
2002 //
2003 const SCEV *Step = SE.getConstant(Ty: MaxBTC.getType(), V: MaxStride);
2004 const SCEV *Product = SE.getMulExpr(LHS: &MaxBTC, RHS: Step);
2005
2006 const SCEV *CastedDist = &Dist;
2007 const SCEV *CastedProduct = Product;
2008 uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType());
2009 uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType());
2010
2011 // The dependence distance can be positive/negative, so we sign extend Dist;
2012 // The multiplication of the absolute stride in bytes and the
2013 // backedgeTakenCount is non-negative, so we zero extend Product.
2014 if (DistTypeSizeBits > ProductTypeSizeBits)
2015 CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType());
2016 else
2017 CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType());
2018
2019 // Is Dist - (MaxBTC * Step) > 0 ?
2020 // (If so, then we have proven (**) because |Dist| >= Dist)
2021 const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct);
2022 if (SE.isKnownPositive(S: Minus))
2023 return true;
2024
2025 // Second try: Is -Dist - (MaxBTC * Step) > 0 ?
2026 // (If so, then we have proven (**) because |Dist| >= -1*Dist)
2027 const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist);
2028 Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct);
2029 return SE.isKnownPositive(S: Minus);
2030}
2031
2032/// Check the dependence for two accesses with the same stride \p Stride.
2033/// \p Distance is the positive distance in bytes, and \p TypeByteSize is type
2034/// size in bytes.
2035///
2036/// \returns true if they are independent.
2037static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
2038 uint64_t TypeByteSize) {
2039 assert(Stride > 1 && "The stride must be greater than 1");
2040 assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
2041 assert(Distance > 0 && "The distance must be non-zero");
2042
2043 // Skip if the distance is not multiple of type byte size.
2044 if (Distance % TypeByteSize)
2045 return false;
2046
2047 // No dependence if the distance is not multiple of the stride.
2048 // E.g.
2049 // for (i = 0; i < 1024 ; i += 4)
2050 // A[i+2] = A[i] + 1;
2051 //
2052 // Two accesses in memory (distance is 2, stride is 4):
2053 // | A[0] | | | | A[4] | | | |
2054 // | | | A[2] | | | | A[6] | |
2055 //
2056 // E.g.
2057 // for (i = 0; i < 1024 ; i += 3)
2058 // A[i+4] = A[i] + 1;
2059 //
2060 // Two accesses in memory (distance is 4, stride is 3):
2061 // | A[0] | | | A[3] | | | A[6] | | |
2062 // | | | | | A[4] | | | A[7] | |
2063 return Distance % Stride;
2064}
2065
2066bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src,
2067 Type *SrcTy,
2068 const SCEV *Sink,
2069 Type *SinkTy) {
2070 const SCEV *BTC = PSE.getBackedgeTakenCount();
2071 const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
2072 ScalarEvolution &SE = *PSE.getSE();
2073 const auto &[SrcStart_, SrcEnd_] =
2074 getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Src, AccessTy: SrcTy, BTC, MaxBTC: SymbolicMaxBTC,
2075 SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards);
2076 if (isa<SCEVCouldNotCompute>(Val: SrcStart_) || isa<SCEVCouldNotCompute>(Val: SrcEnd_))
2077 return false;
2078
2079 const auto &[SinkStart_, SinkEnd_] =
2080 getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Sink, AccessTy: SinkTy, BTC, MaxBTC: SymbolicMaxBTC,
2081 SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards);
2082 if (isa<SCEVCouldNotCompute>(Val: SinkStart_) ||
2083 isa<SCEVCouldNotCompute>(Val: SinkEnd_))
2084 return false;
2085
2086 if (!LoopGuards)
2087 LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE));
2088
2089 auto SrcEnd = SE.applyLoopGuards(Expr: SrcEnd_, Guards: *LoopGuards);
2090 auto SinkStart = SE.applyLoopGuards(Expr: SinkStart_, Guards: *LoopGuards);
2091 if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SrcEnd, RHS: SinkStart))
2092 return true;
2093
2094 auto SinkEnd = SE.applyLoopGuards(Expr: SinkEnd_, Guards: *LoopGuards);
2095 auto SrcStart = SE.applyLoopGuards(Expr: SrcStart_, Guards: *LoopGuards);
2096 return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SinkEnd, RHS: SrcStart);
2097}
2098
2099std::variant<MemoryDepChecker::Dependence::DepType,
2100 MemoryDepChecker::DepDistanceStrideAndSizeInfo>
2101MemoryDepChecker::getDependenceDistanceStrideAndSize(
2102 const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
2103 const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) {
2104 const auto &DL = InnermostLoop->getHeader()->getDataLayout();
2105 auto &SE = *PSE.getSE();
2106 const auto &[APtr, AIsWrite] = A;
2107 const auto &[BPtr, BIsWrite] = B;
2108
2109 // Two reads are independent.
2110 if (!AIsWrite && !BIsWrite)
2111 return MemoryDepChecker::Dependence::NoDep;
2112
2113 Type *ATy = getLoadStoreType(I: AInst);
2114 Type *BTy = getLoadStoreType(I: BInst);
2115
2116 // We cannot check pointers in different address spaces.
2117 if (APtr->getType()->getPointerAddressSpace() !=
2118 BPtr->getType()->getPointerAddressSpace())
2119 return MemoryDepChecker::Dependence::Unknown;
2120
2121 SmallVector<const SCEVPredicate *> Predicates;
2122 std::optional<int64_t> StrideAPtr =
2123 getPtrStride(PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides,
2124 /*ShouldCheckWrap=*/true, Predicates: &Predicates);
2125 std::optional<int64_t> StrideBPtr =
2126 getPtrStride(PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides,
2127 /*ShouldCheckWrap=*/true, Predicates: &Predicates);
2128 PSE.addPredicates(Preds: Predicates);
2129
2130 const SCEV *Src = PSE.getSCEV(V: APtr);
2131 const SCEV *Sink = PSE.getSCEV(V: BPtr);
2132
2133 // If the induction step is negative we have to invert source and sink of the
2134 // dependence when measuring the distance between them. We should not swap
2135 // AIsWrite with BIsWrite, as their uses expect them in program order.
2136 if (StrideAPtr && *StrideAPtr < 0) {
2137 std::swap(a&: Src, b&: Sink);
2138 std::swap(a&: AInst, b&: BInst);
2139 std::swap(a&: ATy, b&: BTy);
2140 std::swap(lhs&: StrideAPtr, rhs&: StrideBPtr);
2141 }
2142
2143 const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src);
2144
2145 LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
2146 << "\n");
2147 LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
2148 << ": " << *Dist << "\n");
2149
2150 // Need accesses with constant strides and the same direction for further
2151 // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and
2152 // similar code or pointer arithmetic that could wrap in the address space.
2153
2154 // If either Src or Sink are not strided (i.e. not a non-wrapping AddRec) and
2155 // not loop-invariant (stride will be 0 in that case), we cannot analyze the
2156 // dependence further and also cannot generate runtime checks.
2157 if (!StrideAPtr || !StrideBPtr) {
2158 LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
2159 return MemoryDepChecker::Dependence::IndirectUnsafe;
2160 }
2161
2162 int64_t StrideAPtrInt = *StrideAPtr;
2163 int64_t StrideBPtrInt = *StrideBPtr;
2164 LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
2165 << " Sink induction step: " << StrideBPtrInt << "\n");
2166 // At least Src or Sink are loop invariant and the other is strided or
2167 // invariant.
2168 if (!StrideAPtrInt || !StrideBPtrInt) {
2169 // If both are loop-invariant and access the same location, we cannot
2170 // vectorize.
2171 if (!StrideAPtrInt && !StrideBPtrInt && Dist->isZero())
2172 return MemoryDepChecker::Dependence::InvariantUnsafe;
2173 // Otherwise, we can generate a runtime check to disambiguate the accesses.
2174 return MemoryDepChecker::Dependence::Unknown;
2175 }
2176
2177 // Both Src and Sink have a constant stride, check if they are in the same
2178 // direction.
2179 if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) {
2180 LLVM_DEBUG(
2181 dbgs() << "Pointer access with strides in different directions\n");
2182 return MemoryDepChecker::Dependence::Unknown;
2183 }
2184
2185 TypeSize AStoreSz = DL.getTypeStoreSize(Ty: ATy);
2186 TypeSize BStoreSz = DL.getTypeStoreSize(Ty: BTy);
2187
2188 // If store sizes are not the same, set TypeByteSize to zero, so we can check
2189 // it in the caller isDependent.
2190 uint64_t ASz = DL.getTypeAllocSize(Ty: ATy);
2191 uint64_t BSz = DL.getTypeAllocSize(Ty: BTy);
2192 uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0;
2193
2194 uint64_t StrideAScaled = std::abs(i: StrideAPtrInt) * ASz;
2195 uint64_t StrideBScaled = std::abs(i: StrideBPtrInt) * BSz;
2196
2197 uint64_t MaxStride = std::max(a: StrideAScaled, b: StrideBScaled);
2198
2199 std::optional<uint64_t> CommonStride;
2200 if (StrideAScaled == StrideBScaled)
2201 CommonStride = StrideAScaled;
2202
2203 // TODO: Historically, we didn't retry with runtime checks when (unscaled)
2204 // strides were different but there is no inherent reason to.
2205 if (!isa<SCEVConstant>(Val: Dist))
2206 ShouldRetryWithRuntimeChecks |= StrideAPtrInt == StrideBPtrInt;
2207
2208 // If distance is a SCEVCouldNotCompute, return Unknown immediately.
2209 if (isa<SCEVCouldNotCompute>(Val: Dist)) {
2210 LLVM_DEBUG(dbgs() << "LAA: Uncomputable distance.\n");
2211 return Dependence::Unknown;
2212 }
2213
2214 return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
2215 TypeByteSize, AIsWrite, BIsWrite);
2216}
2217
2218MemoryDepChecker::Dependence::DepType
2219MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2220 const MemAccessInfo &B, unsigned BIdx) {
2221 assert(AIdx < BIdx && "Must pass arguments in program order");
2222
2223 // Check if we can prove that Sink only accesses memory after Src's end or
2224 // vice versa. The helper is used to perform the checks only on the exit paths
2225 // where it helps to improve the analysis result.
2226 auto CheckCompletelyBeforeOrAfter = [&]() {
2227 auto *APtr = A.getPointer();
2228 auto *BPtr = B.getPointer();
2229 Type *ATy = getLoadStoreType(I: InstMap[AIdx]);
2230 Type *BTy = getLoadStoreType(I: InstMap[BIdx]);
2231 const SCEV *Src = PSE.getSCEV(V: APtr);
2232 const SCEV *Sink = PSE.getSCEV(V: BPtr);
2233 return areAccessesCompletelyBeforeOrAfter(Src, SrcTy: ATy, Sink, SinkTy: BTy);
2234 };
2235
2236 // Get the dependence distance, stride, type size and what access writes for
2237 // the dependence between A and B.
2238 auto Res =
2239 getDependenceDistanceStrideAndSize(A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx]);
2240 if (std::holds_alternative<Dependence::DepType>(v: Res)) {
2241 if (std::get<Dependence::DepType>(v&: Res) == Dependence::Unknown &&
2242 CheckCompletelyBeforeOrAfter())
2243 return Dependence::NoDep;
2244 return std::get<Dependence::DepType>(v&: Res);
2245 }
2246
2247 auto &[Dist, MaxStride, CommonStride, TypeByteSize, AIsWrite, BIsWrite] =
2248 std::get<DepDistanceStrideAndSizeInfo>(v&: Res);
2249 bool HasSameSize = TypeByteSize > 0;
2250
2251 ScalarEvolution &SE = *PSE.getSE();
2252 auto &DL = InnermostLoop->getHeader()->getDataLayout();
2253
2254 // If the distance between the acecsses is larger than their maximum absolute
2255 // stride multiplied by the symbolic maximum backedge taken count (which is an
2256 // upper bound of the number of iterations), the accesses are independet, i.e.
2257 // they are far enough appart that accesses won't access the same location
2258 // across all loop ierations.
2259 if (HasSameSize &&
2260 isSafeDependenceDistance(
2261 DL, SE, MaxBTC: *(PSE.getSymbolicMaxBackedgeTakenCount()), Dist: *Dist, MaxStride))
2262 return Dependence::NoDep;
2263
2264 // The rest of this function relies on ConstDist being at most 64-bits, which
2265 // is checked earlier. Will assert if the calling code changes.
2266 const APInt *APDist = nullptr;
2267 uint64_t ConstDist =
2268 match(S: Dist, P: m_scev_APInt(C&: APDist)) ? APDist->abs().getZExtValue() : 0;
2269
2270 // Attempt to prove strided accesses independent.
2271 if (APDist) {
2272 // If the distance between accesses and their strides are known constants,
2273 // check whether the accesses interlace each other.
2274 if (ConstDist > 0 && CommonStride && CommonStride > 1 && HasSameSize &&
2275 areStridedAccessesIndependent(Distance: ConstDist, Stride: *CommonStride, TypeByteSize)) {
2276 LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
2277 return Dependence::NoDep;
2278 }
2279 } else {
2280 if (!LoopGuards)
2281 LoopGuards.emplace(
2282 args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE));
2283 Dist = SE.applyLoopGuards(Expr: Dist, Guards: *LoopGuards);
2284 }
2285
2286 // Negative distances are not plausible dependencies.
2287 if (SE.isKnownNonPositive(S: Dist)) {
2288 if (SE.isKnownNonNegative(S: Dist)) {
2289 if (HasSameSize) {
2290 // Write to the same location with the same size.
2291 return Dependence::Forward;
2292 }
2293 LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
2294 "different type sizes\n");
2295 return Dependence::Unknown;
2296 }
2297
2298 bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2299 // Check if the first access writes to a location that is read in a later
2300 // iteration, where the distance between them is not a multiple of a vector
2301 // factor and relatively small.
2302 //
2303 // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to
2304 // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a
2305 // forward dependency will allow vectorization using any width.
2306
2307 if (IsTrueDataDependence && EnableForwardingConflictDetection) {
2308 if (!ConstDist) {
2309 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2310 : Dependence::Unknown;
2311 }
2312 if (!HasSameSize ||
2313 couldPreventStoreLoadForward(Distance: ConstDist, TypeByteSize)) {
2314 LLVM_DEBUG(
2315 dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
2316 return Dependence::ForwardButPreventsForwarding;
2317 }
2318 }
2319
2320 LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
2321 return Dependence::Forward;
2322 }
2323
2324 int64_t MinDistance = SE.getSignedRangeMin(S: Dist).getSExtValue();
2325 // Below we only handle strictly positive distances.
2326 if (MinDistance <= 0) {
2327 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2328 : Dependence::Unknown;
2329 }
2330
2331 if (!HasSameSize) {
2332 if (CheckCompletelyBeforeOrAfter())
2333 return Dependence::NoDep;
2334 LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2335 "different type sizes\n");
2336 return Dependence::Unknown;
2337 }
2338 // Bail out early if passed-in parameters make vectorization not feasible.
2339 unsigned MinForcedFactor =
2340 std::max(a: 1U, b: VectorizerParams::VectorizationFactor.getKnownMinValue());
2341 unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
2342 VectorizerParams::VectorizationInterleave : 1);
2343 // The minimum number of iterations for a vectorized/unrolled version.
2344 unsigned MinNumIter = std::max(a: MinForcedFactor * ForcedUnroll, b: 2U);
2345
2346 // It's not vectorizable if the distance is smaller than the minimum distance
2347 // needed for a vectroized/unrolled version. Vectorizing one iteration in
2348 // front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
2349 // (No need to plus the last gap distance).
2350 //
2351 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2352 // foo(int *A) {
2353 // int *B = (int *)((char *)A + 14);
2354 // for (i = 0 ; i < 1024 ; i += 2)
2355 // B[i] = A[i] + 1;
2356 // }
2357 //
2358 // Two accesses in memory (stride is 4 * 2):
2359 // | A[0] | | A[2] | | A[4] | | A[6] | |
2360 // | B[0] | | B[2] | | B[4] |
2361 //
2362 // MinDistance needs for vectorizing iterations except the last iteration:
2363 // 4 * 2 * (MinNumIter - 1). MinDistance needs for the last iteration: 4.
2364 // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
2365 //
2366 // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
2367 // 12, which is less than distance.
2368 //
2369 // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
2370 // the minimum distance needed is 28, which is greater than distance. It is
2371 // not safe to do vectorization.
2372 //
2373 // We use MaxStride (maximum of src and sink strides) to get a conservative
2374 // lower bound on the MinDistanceNeeded in case of different strides.
2375
2376 // We know that Dist is positive, but it may not be constant. Use the signed
2377 // minimum for computations below, as this ensures we compute the closest
2378 // possible dependence distance.
2379 uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
2380 if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
2381 if (!ConstDist) {
2382 // For non-constant distances, we checked the lower bound of the
2383 // dependence distance and the distance may be larger at runtime (and safe
2384 // for vectorization). Classify it as Unknown, so we re-try with runtime
2385 // checks, unless we can prove both accesses cannot overlap.
2386 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2387 : Dependence::Unknown;
2388 }
2389 LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance "
2390 << MinDistance << '\n');
2391 return Dependence::Backward;
2392 }
2393
2394 // Unsafe if the minimum distance needed is greater than smallest dependence
2395 // distance distance.
2396 if (MinDistanceNeeded > MinDepDistBytes) {
2397 LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
2398 << MinDistanceNeeded << " size in bytes\n");
2399 return Dependence::Backward;
2400 }
2401
2402 MinDepDistBytes =
2403 std::min(a: static_cast<uint64_t>(MinDistance), b: MinDepDistBytes);
2404
2405 bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
2406 if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist &&
2407 couldPreventStoreLoadForward(Distance: MinDistance, TypeByteSize, CommonStride: *CommonStride))
2408 return Dependence::BackwardVectorizableButPreventsForwarding;
2409
2410 uint64_t MaxVF = MinDepDistBytes / MaxStride;
2411 LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
2412 << " with max VF = " << MaxVF << '\n');
2413
2414 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2415 if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) {
2416 // For non-constant distances, we checked the lower bound of the dependence
2417 // distance and the distance may be larger at runtime (and safe for
2418 // vectorization). Classify it as Unknown, so we re-try with runtime checks,
2419 // unless we can prove both accesses cannot overlap.
2420 return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
2421 : Dependence::Unknown;
2422 }
2423
2424 if (CheckCompletelyBeforeOrAfter())
2425 return Dependence::NoDep;
2426
2427 MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits);
2428 return Dependence::BackwardVectorizable;
2429}
2430
2431bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands,
2432 ArrayRef<MemAccessInfo> CheckDeps) {
2433
2434 MinDepDistBytes = -1;
2435 SmallPtrSet<MemAccessInfo, 8> Visited;
2436 for (MemAccessInfo CurAccess : CheckDeps) {
2437 if (Visited.contains(Ptr: CurAccess))
2438 continue;
2439
2440 // Check accesses within this set.
2441 EquivalenceClasses<MemAccessInfo>::member_iterator AI =
2442 DepCands.findLeader(V: CurAccess);
2443 EquivalenceClasses<MemAccessInfo>::member_iterator AE =
2444 DepCands.member_end();
2445
2446 // Check every access pair.
2447 while (AI != AE) {
2448 Visited.insert(Ptr: *AI);
2449 bool AIIsWrite = AI->getInt();
2450 // Reads from the same pointer don't create extra hazards, but multiple
2451 // stores do (WAW), so start from AI for writes and next(AI) for reads.
2452 EquivalenceClasses<MemAccessInfo>::member_iterator OI =
2453 (AIIsWrite ? AI : std::next(x: AI));
2454 while (OI != AE) {
2455 // Check every accessing instruction pair in program order.
2456 auto &Acc = Accesses[*AI];
2457 for (std::vector<unsigned>::iterator I1 = Acc.begin(), I1E = Acc.end();
2458 I1 != I1E; ++I1)
2459 // When checking for WAW (OI == AI) caused by multiple writes to the
2460 // same pointer, start I2 at the next access past I1 to avoid
2461 // self-comparison.
2462 for (std::vector<unsigned>::iterator
2463 I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()),
2464 I2E = (OI == AI ? I1E : Accesses[*OI].end());
2465 I2 != I2E; ++I2) {
2466 auto A = std::make_pair(x: &*AI, y&: *I1);
2467 auto B = std::make_pair(x: &*OI, y&: *I2);
2468
2469 assert(*I1 != *I2);
2470 if (*I1 > *I2)
2471 std::swap(x&: A, y&: B);
2472
2473 Dependence::DepType Type =
2474 isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second);
2475 mergeInStatus(S: Dependence::isSafeForVectorization(Type));
2476
2477 // Gather dependences unless we accumulated MaxDependences
2478 // dependences. In that case return as soon as we find the first
2479 // unsafe dependence. This puts a limit on this quadratic
2480 // algorithm.
2481 if (RecordDependences) {
2482 if (Type != Dependence::NoDep)
2483 Dependences.emplace_back(Args&: A.second, Args&: B.second, Args&: Type);
2484
2485 if (Dependences.size() >= MaxDependences) {
2486 RecordDependences = false;
2487 Dependences.clear();
2488 LLVM_DEBUG(dbgs()
2489 << "Too many dependences, stopped recording\n");
2490 }
2491 }
2492 if (!RecordDependences && !isSafeForVectorization())
2493 return false;
2494 }
2495 ++OI;
2496 }
2497 ++AI;
2498 }
2499 }
2500
2501 LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
2502 return isSafeForVectorization();
2503}
2504
2505SmallVector<Instruction *, 4>
2506MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const {
2507 MemAccessInfo Access(Ptr, IsWrite);
2508 auto I = Accesses.find(Val: Access);
2509 SmallVector<Instruction *, 4> Insts;
2510 if (I != Accesses.end()) {
2511 transform(Range: I->second, d_first: std::back_inserter(x&: Insts),
2512 F: [&](unsigned Idx) { return this->InstMap[Idx]; });
2513 }
2514
2515 return Insts;
2516}
2517
2518const char *MemoryDepChecker::Dependence::DepName[] = {
2519 "NoDep",
2520 "Unknown",
2521 "IndirectUnsafe",
2522 "InvariantUnsafe",
2523 "Forward",
2524 "ForwardButPreventsForwarding",
2525 "Backward",
2526 "BackwardVectorizable",
2527 "BackwardVectorizableButPreventsForwarding"};
2528
2529void MemoryDepChecker::Dependence::print(
2530 raw_ostream &OS, unsigned Depth,
2531 const SmallVectorImpl<Instruction *> &Instrs) const {
2532 OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n";
2533 OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n";
2534 OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n";
2535}
2536
2537bool LoopAccessInfo::canAnalyzeLoop() {
2538 // We need to have a loop header.
2539 LLVM_DEBUG(dbgs() << "\nLAA: Checking a loop in '"
2540 << TheLoop->getHeader()->getParent()->getName() << "' from "
2541 << TheLoop->getLocStr() << "\n");
2542
2543 // We can only analyze innermost loops.
2544 if (!TheLoop->isInnermost()) {
2545 LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
2546 recordAnalysis(RemarkName: "NotInnerMostLoop") << "loop is not the innermost loop";
2547 return false;
2548 }
2549
2550 // We must have a single backedge.
2551 if (TheLoop->getNumBackEdges() != 1) {
2552 LLVM_DEBUG(
2553 dbgs() << "LAA: loop control flow is not understood by analyzer\n");
2554 recordAnalysis(RemarkName: "CFGNotUnderstood")
2555 << "loop control flow is not understood by analyzer";
2556 return false;
2557 }
2558
2559 // ScalarEvolution needs to be able to find the symbolic max backedge taken
2560 // count, which is an upper bound on the number of loop iterations. The loop
2561 // may execute fewer iterations, if it exits via an uncountable exit.
2562 const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount();
2563 if (isa<SCEVCouldNotCompute>(Val: ExitCount)) {
2564 recordAnalysis(RemarkName: "CantComputeNumberOfIterations")
2565 << "could not determine number of loop iterations";
2566 LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
2567 return false;
2568 }
2569
2570 LLVM_DEBUG(dbgs() << "LAA: Found an analyzable loop: "
2571 << TheLoop->getHeader()->getName() << "\n");
2572 return true;
2573}
2574
2575bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
2576 const TargetLibraryInfo *TLI,
2577 DominatorTree *DT) {
2578 // Holds the Load and Store instructions.
2579 SmallVector<LoadInst *, 16> Loads;
2580 SmallVector<StoreInst *, 16> Stores;
2581 SmallPtrSet<MDNode *, 8> LoopAliasScopes;
2582
2583 // Holds all the different accesses in the loop.
2584 unsigned NumReads = 0;
2585 unsigned NumReadWrites = 0;
2586
2587 bool HasComplexMemInst = false;
2588
2589 // A runtime check is only legal to insert if there are no convergent calls.
2590 HasConvergentOp = false;
2591
2592 PtrRtChecking->Pointers.clear();
2593 PtrRtChecking->Need = false;
2594
2595 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
2596
2597 const bool EnableMemAccessVersioningOfLoop =
2598 EnableMemAccessVersioning &&
2599 !TheLoop->getHeader()->getParent()->hasOptSize();
2600
2601 // Traverse blocks in fixed RPOT order, regardless of their storage in the
2602 // loop info, as it may be arbitrary.
2603 LoopBlocksRPO RPOT(TheLoop);
2604 RPOT.perform(LI);
2605
2606 // Don't return early as soon as we found a memory access that cannot be
2607 // vectorize - HasConvergentOp must still be computed as it is part of LAI's
2608 // public API (used by LoopDistribute).
2609 for (BasicBlock *BB : RPOT) {
2610 // Scan the BB and collect legal loads and stores. Also detect any
2611 // convergent instructions.
2612 for (Instruction &I : *BB) {
2613 if (auto *Call = dyn_cast<CallBase>(Val: &I)) {
2614 if (Call->isConvergent())
2615 HasConvergentOp = true;
2616 }
2617
2618 // Unsafe to vectorize and we already found a convergent operation, can
2619 // early return now.
2620 if (HasComplexMemInst && HasConvergentOp)
2621 return false;
2622
2623 // Already unsafe to vectorize; keep scanning for convergent ops.
2624 if (HasComplexMemInst)
2625 continue;
2626
2627 // Record alias scopes defined inside the loop.
2628 if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I))
2629 for (Metadata *Op : Decl->getScopeList()->operands())
2630 LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op));
2631
2632 // Many math library functions read the rounding mode. We will only
2633 // vectorize a loop if it contains known function calls that don't set
2634 // the flag. Therefore, it is safe to ignore this read from memory.
2635 auto *Call = dyn_cast<CallInst>(Val: &I);
2636 if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI))
2637 continue;
2638
2639 // If this is a load, save it. If this instruction can read from memory
2640 // but is not a load, we only allow it if it's a call to a function with a
2641 // vector mapping and no pointer arguments.
2642 if (I.mayReadFromMemory()) {
2643 auto hasPointerArgs = [](CallBase *CB) {
2644 return any_of(Range: CB->args(), P: [](Value const *Arg) {
2645 return Arg->getType()->isPointerTy();
2646 });
2647 };
2648
2649 // If the function has an explicit vectorized counterpart, and does not
2650 // take output/input pointers, we can safely assume that it can be
2651 // vectorized.
2652 if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
2653 !hasPointerArgs(Call) && !VFDatabase::getMappings(CI: *Call).empty())
2654 continue;
2655
2656 auto *Ld = dyn_cast<LoadInst>(Val: &I);
2657 if (!Ld) {
2658 recordAnalysis(RemarkName: "CantVectorizeInstruction", Instr: &I)
2659 << "instruction cannot be vectorized";
2660 HasComplexMemInst = true;
2661 continue;
2662 }
2663 if (!Ld->isSimple() && !IsAnnotatedParallel) {
2664 recordAnalysis(RemarkName: "NonSimpleLoad", Instr: Ld)
2665 << "read with atomic ordering or volatile read";
2666 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
2667 HasComplexMemInst = true;
2668 continue;
2669 }
2670 NumLoads++;
2671 Loads.push_back(Elt: Ld);
2672 DepChecker->addAccess(LI: Ld);
2673 if (EnableMemAccessVersioningOfLoop)
2674 collectStridedAccess(LoadOrStoreInst: Ld);
2675 continue;
2676 }
2677
2678 // Save 'store' instructions. Abort if other instructions write to memory.
2679 if (I.mayWriteToMemory()) {
2680 auto *St = dyn_cast<StoreInst>(Val: &I);
2681 if (!St) {
2682 recordAnalysis(RemarkName: "CantVectorizeInstruction", Instr: &I)
2683 << "instruction cannot be vectorized";
2684 HasComplexMemInst = true;
2685 continue;
2686 }
2687 if (!St->isSimple() && !IsAnnotatedParallel) {
2688 recordAnalysis(RemarkName: "NonSimpleStore", Instr: St)
2689 << "write with atomic ordering or volatile write";
2690 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
2691 HasComplexMemInst = true;
2692 continue;
2693 }
2694 NumStores++;
2695 Stores.push_back(Elt: St);
2696 DepChecker->addAccess(SI: St);
2697 if (EnableMemAccessVersioningOfLoop)
2698 collectStridedAccess(LoadOrStoreInst: St);
2699 }
2700 } // Next instr.
2701 } // Next block.
2702
2703 if (HasComplexMemInst)
2704 return false;
2705
2706 // Now we have two lists that hold the loads and the stores.
2707 // Next, we find the pointers that they use.
2708
2709 // Check if we see any stores. If there are no stores, then we don't
2710 // care if the pointers are *restrict*.
2711 if (!Stores.size()) {
2712 LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
2713 return true;
2714 }
2715
2716 MemoryDepChecker::DepCandidates DepCands;
2717 AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE,
2718 LoopAliasScopes);
2719
2720 // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
2721 // multiple times on the same object. If the ptr is accessed twice, once
2722 // for read and once for write, it will only appear once (on the write
2723 // list). This is okay, since we are going to check for conflicts between
2724 // writes and between reads and writes, but not between reads and reads.
2725 SmallSet<std::pair<Value *, Type *>, 16> Seen;
2726
2727 // Record uniform store addresses to identify if we have multiple stores
2728 // to the same address.
2729 SmallPtrSet<Value *, 16> UniformStores;
2730
2731 for (StoreInst *ST : Stores) {
2732 Value *Ptr = ST->getPointerOperand();
2733
2734 if (isInvariant(V: Ptr)) {
2735 // Record store instructions to loop invariant addresses
2736 StoresToInvariantAddresses.push_back(Elt: ST);
2737 HasStoreStoreDependenceInvolvingLoopInvariantAddress |=
2738 !UniformStores.insert(Ptr).second;
2739 }
2740
2741 // If we did *not* see this pointer before, insert it to the read-write
2742 // list. At this phase it is only a 'write' list.
2743 Type *AccessTy = getLoadStoreType(I: ST);
2744 if (Seen.insert(V: {Ptr, AccessTy}).second) {
2745 ++NumReadWrites;
2746
2747 MemoryLocation Loc = MemoryLocation::get(SI: ST);
2748 // The TBAA metadata could have a control dependency on the predication
2749 // condition, so we cannot rely on it when determining whether or not we
2750 // need runtime pointer checks.
2751 if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT))
2752 Loc.AATags.TBAA = nullptr;
2753
2754 // Expand forked pointers (i.e., a phi of multiple strided pointers) into
2755 // all alternatives.
2756 visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop,
2757 AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) {
2758 MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr);
2759 Accesses.addStore(Loc: NewLoc, AccessTy);
2760 });
2761 }
2762 }
2763
2764 if (IsAnnotatedParallel) {
2765 LLVM_DEBUG(
2766 dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
2767 << "checks.\n");
2768 return true;
2769 }
2770
2771 for (LoadInst *LD : Loads) {
2772 Value *Ptr = LD->getPointerOperand();
2773 // If we did *not* see this pointer before, insert it to the read list. If
2774 // we *did* see it before, then it is already in the read-write list. This
2775 // allows us to vectorize expressions such as A[i] += x; Because the address
2776 // of A[i] is a read-write pointer. This only works if the index of A[i] is
2777 // strictly monotonic, which we approximate (conservatively) via
2778 // getPtrStride. If the address is unknown (e.g. A[B[i]]) then we may read,
2779 // modify, and write overlapping words. Note that "zero stride" is unsafe
2780 // and is being handled below.
2781 bool IsReadOnlyPtr = false;
2782 Type *AccessTy = getLoadStoreType(I: LD);
2783 if (Seen.insert(V: {Ptr, AccessTy}).second ||
2784 !getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: false,
2785 ShouldCheckWrap: true)) {
2786 ++NumReads;
2787 IsReadOnlyPtr = true;
2788 }
2789
2790 // See if there is an unsafe dependency between a load to a uniform address and
2791 // store to the same uniform address.
2792 if (UniformStores.contains(Ptr)) {
2793 LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
2794 "load and uniform store to the same address!\n");
2795 HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
2796 }
2797
2798 MemoryLocation Loc = MemoryLocation::get(LI: LD);
2799 // The TBAA metadata could have a control dependency on the predication
2800 // condition, so we cannot rely on it when determining whether or not we
2801 // need runtime pointer checks.
2802 if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT))
2803 Loc.AATags.TBAA = nullptr;
2804
2805 // Expand forked pointers (i.e., a phi of multiple strided pointers) into
2806 // all alternatives.
2807 visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop,
2808 AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
2809 MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr);
2810 Accesses.addLoad(Loc: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr);
2811 });
2812 }
2813
2814 // If we write (or read-write) to a single destination and there are no other
2815 // reads in this loop then is it safe to vectorize: the vectorized stores
2816 // preserve ordering via replication or order-preserving @llvm.masked.scatter.
2817 if (NumReadWrites == 1 && NumReads == 0) {
2818 LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
2819 return true;
2820 }
2821
2822 // Build dependence sets and check whether we need a runtime pointer bounds
2823 // check.
2824 Accesses.buildDependenceSets();
2825
2826 // Find pointers with computable bounds. We are going to use this information
2827 // to place a runtime bound check.
2828 Value *UncomputablePtr = nullptr;
2829 HasCompletePtrRtChecking =
2830 Accesses.canCheckPtrAtRT(RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides,
2831 UncomputablePtr, AllowPartial, DepChecker: getDepChecker());
2832 if (!HasCompletePtrRtChecking) {
2833 const auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr);
2834 recordAnalysis(RemarkName: "CantIdentifyArrayBounds", Instr: I)
2835 << "cannot identify array bounds";
2836 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
2837 << "the array bounds.\n");
2838 return false;
2839 }
2840
2841 LLVM_DEBUG(
2842 dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
2843
2844 bool DepsAreSafe = true;
2845 if (Accesses.isDependencyCheckNeeded()) {
2846 LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
2847 DepsAreSafe =
2848 DepChecker->areDepsSafe(DepCands, CheckDeps: Accesses.getDependenciesToCheck());
2849
2850 if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeChecks()) {
2851 LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
2852
2853 PtrRtChecking->reset();
2854 PtrRtChecking->Need = true;
2855
2856 UncomputablePtr = nullptr;
2857 HasCompletePtrRtChecking = Accesses.canCheckPtrAtRT(
2858 RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, UncomputablePtr,
2859 AllowPartial, DepChecker: getDepChecker());
2860
2861 // Check that we found the bounds for the pointer.
2862 if (!HasCompletePtrRtChecking) {
2863 auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr);
2864 recordAnalysis(RemarkName: "CantCheckMemDepsAtRunTime", Instr: I)
2865 << "cannot check memory dependencies at runtime";
2866 LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
2867 return false;
2868 }
2869
2870 // Clear the dependency checks. They are no longer needed.
2871 Accesses.resetDepChecks(DepChecker&: *DepChecker);
2872
2873 DepsAreSafe = true;
2874 }
2875 }
2876
2877 // Update the invariant address dependence flags based on dependences found
2878 // by the dep checker. Even if dependences were not recorded (too many to
2879 // track), any InvariantUnsafe dep would still have set the status to Unsafe
2880 if (const auto *Deps = DepChecker->getDependences()) {
2881 for (const auto &Dep : *Deps) {
2882 if (Dep.Type != MemoryDepChecker::Dependence::InvariantUnsafe)
2883 continue;
2884 Instruction *Src = Dep.getSource(DepChecker: *DepChecker);
2885 Instruction *Dst = Dep.getDestination(DepChecker: *DepChecker);
2886 if (isa<LoadInst>(Val: Src) != isa<LoadInst>(Val: Dst)) {
2887 HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
2888 } else {
2889 assert(isa<StoreInst>(Src) && isa<StoreInst>(Dst) &&
2890 "Expected both to be stores");
2891 HasStoreStoreDependenceInvolvingLoopInvariantAddress = true;
2892 }
2893 }
2894 }
2895
2896 if (HasConvergentOp) {
2897 recordAnalysis(RemarkName: "CantInsertRuntimeCheckWithConvergent")
2898 << "cannot add control dependency to convergent operation";
2899 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
2900 "would be needed with a convergent operation\n");
2901 return false;
2902 }
2903
2904 if (DepsAreSafe) {
2905 LLVM_DEBUG(
2906 dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
2907 << (PtrRtChecking->Need ? "" : " don't")
2908 << " need runtime memory checks.\n");
2909 return true;
2910 }
2911
2912 emitUnsafeDependenceRemark();
2913 return false;
2914}
2915
2916void LoopAccessInfo::emitUnsafeDependenceRemark() {
2917 const auto *Deps = getDepChecker().getDependences();
2918 if (!Deps)
2919 return;
2920 const auto *Found =
2921 llvm::find_if(Range: *Deps, P: [](const MemoryDepChecker::Dependence &D) {
2922 return MemoryDepChecker::Dependence::isSafeForVectorization(Type: D.Type) !=
2923 MemoryDepChecker::VectorizationSafetyStatus::Safe;
2924 });
2925 if (Found == Deps->end())
2926 return;
2927 MemoryDepChecker::Dependence Dep = *Found;
2928
2929 LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
2930
2931 // Emit remark for first unsafe dependence
2932 bool HasForcedDistribution = false;
2933 std::optional<const MDOperand *> Value =
2934 findStringMetadataForLoop(TheLoop, Name: "llvm.loop.distribute.enable");
2935 if (Value) {
2936 const MDOperand *Op = *Value;
2937 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
2938 HasForcedDistribution = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue();
2939 }
2940
2941 const std::string Info =
2942 HasForcedDistribution
2943 ? "unsafe dependent memory operations in loop."
2944 : "unsafe dependent memory operations in loop. Use "
2945 "#pragma clang loop distribute(enable) to allow loop distribution "
2946 "to attempt to isolate the offending operations into a separate "
2947 "loop";
2948 OptimizationRemarkAnalysis &R =
2949 recordAnalysis(RemarkName: "UnsafeDep", Instr: Dep.getDestination(DepChecker: getDepChecker())) << Info;
2950
2951 switch (Dep.Type) {
2952 case MemoryDepChecker::Dependence::NoDep:
2953 case MemoryDepChecker::Dependence::Forward:
2954 case MemoryDepChecker::Dependence::BackwardVectorizable:
2955 llvm_unreachable("Unexpected dependence");
2956 case MemoryDepChecker::Dependence::Backward:
2957 R << "\nBackward loop carried data dependence.";
2958 break;
2959 case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
2960 R << "\nForward loop carried data dependence that prevents "
2961 "store-to-load forwarding.";
2962 break;
2963 case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
2964 R << "\nBackward loop carried data dependence that prevents "
2965 "store-to-load forwarding.";
2966 break;
2967 case MemoryDepChecker::Dependence::IndirectUnsafe:
2968 R << "\nUnsafe indirect dependence.";
2969 break;
2970 case MemoryDepChecker::Dependence::InvariantUnsafe:
2971 R << "\nUnsafe dependence on loop-invariant address.";
2972 break;
2973 case MemoryDepChecker::Dependence::Unknown:
2974 R << "\nUnknown data dependence.";
2975 break;
2976 }
2977
2978 if (Instruction *I = Dep.getSource(DepChecker: getDepChecker())) {
2979 DebugLoc SourceLoc = I->getDebugLoc();
2980 if (auto *DD = dyn_cast_or_null<Instruction>(Val: getPointerOperand(V: I)))
2981 SourceLoc = DD->getDebugLoc();
2982 if (SourceLoc)
2983 R << " Memory location is the same as accessed at "
2984 << ore::NV("Location", SourceLoc);
2985 }
2986}
2987
2988bool LoopAccessInfo::blockNeedsPredication(const BasicBlock *BB,
2989 const Loop *TheLoop,
2990 const DominatorTree *DT) {
2991 assert(TheLoop->contains(BB) && "Unknown block used");
2992
2993 // Blocks that do not dominate the latch need predication.
2994 const BasicBlock *Latch = TheLoop->getLoopLatch();
2995 assert(Latch && "Loop expected to have a single latch.");
2996 return !DT->dominates(A: BB, B: Latch);
2997}
2998
2999OptimizationRemarkAnalysis &
3000LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
3001 assert(!Report && "Multiple reports generated");
3002
3003 const BasicBlock *CodeRegion = TheLoop->getHeader();
3004 DebugLoc DL = TheLoop->getStartLoc();
3005
3006 if (I) {
3007 CodeRegion = I->getParent();
3008 // If there is no debug location attached to the instruction, revert back to
3009 // using the loop's.
3010 if (I->getDebugLoc())
3011 DL = I->getDebugLoc();
3012 }
3013
3014 Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, args&: RemarkName,
3015 args&: DL, args&: CodeRegion);
3016 return *Report;
3017}
3018
3019bool LoopAccessInfo::isInvariant(Value *V) const {
3020 auto *SE = PSE->getSE();
3021 if (TheLoop->isLoopInvariant(V))
3022 return true;
3023 if (!SE->isSCEVable(Ty: V->getType()))
3024 return false;
3025 const SCEV *S = SE->getSCEV(V);
3026 return SE->isLoopInvariant(S, L: TheLoop);
3027}
3028
3029/// If \p Ptr is a GEP, which has a loop-variant operand, return that operand.
3030/// Otherwise, return \p Ptr.
3031static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE,
3032 Loop *Lp) {
3033 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr);
3034 if (!GEP)
3035 return Ptr;
3036
3037 Value *V = Ptr;
3038 for (const Use &U : GEP->operands()) {
3039 if (!SE->isLoopInvariant(S: SE->getSCEV(V: U), L: Lp)) {
3040 if (V == Ptr)
3041 V = U;
3042 else
3043 // There must be exactly one loop-variant operand.
3044 return Ptr;
3045 }
3046 }
3047 return V;
3048}
3049
3050/// Get the stride of a pointer access in a loop. Looks for symbolic
3051/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
3052static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
3053 auto *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType());
3054 if (!PtrTy)
3055 return nullptr;
3056
3057 // Try to remove a gep instruction to make the pointer (actually index at this
3058 // point) easier analyzable. If OrigPtr is equal to Ptr we are analyzing the
3059 // pointer, otherwise, we are analyzing the index.
3060 Value *OrigPtr = Ptr;
3061
3062 Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp);
3063 const SCEV *V = SE->getSCEV(V: Ptr);
3064
3065 if (Ptr != OrigPtr)
3066 // Strip off casts.
3067 while (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V))
3068 V = C->getOperand();
3069
3070 if (!match(S: V, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V), L: m_SpecificLoop(L: Lp))))
3071 return nullptr;
3072
3073 // Note that the restriction after this loop invariant check are only
3074 // profitability restrictions.
3075 if (!SE->isLoopInvariant(S: V, L: Lp))
3076 return nullptr;
3077
3078 // Look for the loop invariant symbolic value.
3079 if (isa<SCEVUnknown>(Val: V))
3080 return V;
3081
3082 // Look through multiplies that scale a stride by a constant.
3083 match(S: V, P: m_scev_Mul(Op0: m_SCEVConstant(), Op1: m_SCEV(V)));
3084 if (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V))
3085 if (isa<SCEVUnknown>(Val: C->getOperand()))
3086 return V;
3087
3088 return nullptr;
3089}
3090
3091void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
3092 Value *Ptr = getLoadStorePointerOperand(V: MemAccess);
3093 if (!Ptr)
3094 return;
3095
3096 // Note: getStrideFromPointer is a *profitability* heuristic. We
3097 // could broaden the scope of values returned here - to anything
3098 // which happens to be loop invariant and contributes to the
3099 // computation of an interesting IV - but we chose not to as we
3100 // don't have a cost model here, and broadening the scope exposes
3101 // far too many unprofitable cases.
3102 const SCEV *StrideExpr = getStrideFromPointer(Ptr, SE: PSE->getSE(), Lp: TheLoop);
3103 if (!StrideExpr)
3104 return;
3105
3106 if (match(S: StrideExpr, P: m_scev_UndefOrPoison()))
3107 return;
3108
3109 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
3110 "versioning:");
3111 LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");
3112
3113 if (!SpeculateUnitStride) {
3114 LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n");
3115 return;
3116 }
3117
3118 // Avoid adding the "Stride == 1" predicate when we know that
3119 // Stride >= Trip-Count. Such a predicate will effectively optimize a single
3120 // or zero iteration loop, as Trip-Count <= Stride == 1.
3121 //
3122 // TODO: We are currently not making a very informed decision on when it is
3123 // beneficial to apply stride versioning. It might make more sense that the
3124 // users of this analysis (such as the vectorizer) will trigger it, based on
3125 // their specific cost considerations; For example, in cases where stride
3126 // versioning does not help resolving memory accesses/dependences, the
3127 // vectorizer should evaluate the cost of the runtime test, and the benefit
3128 // of various possible stride specializations, considering the alternatives
3129 // of using gather/scatters (if available).
3130
3131 const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount();
3132
3133 // Match the types so we can compare the stride and the MaxBTC.
3134 // The Stride can be positive/negative, so we sign extend Stride;
3135 // The backedgeTakenCount is non-negative, so we zero extend MaxBTC.
3136 const DataLayout &DL = TheLoop->getHeader()->getDataLayout();
3137 uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(Ty: StrideExpr->getType());
3138 uint64_t BETypeSizeBits = DL.getTypeSizeInBits(Ty: MaxBTC->getType());
3139 const SCEV *CastedStride = StrideExpr;
3140 const SCEV *CastedBECount = MaxBTC;
3141 ScalarEvolution *SE = PSE->getSE();
3142 if (BETypeSizeBits >= StrideTypeSizeBits)
3143 CastedStride = SE->getNoopOrSignExtend(V: StrideExpr, Ty: MaxBTC->getType());
3144 else
3145 CastedBECount = SE->getZeroExtendExpr(Op: MaxBTC, Ty: StrideExpr->getType());
3146 const SCEV *StrideMinusBETaken = SE->getMinusSCEV(LHS: CastedStride, RHS: CastedBECount);
3147 // Since TripCount == BackEdgeTakenCount + 1, checking:
3148 // "Stride >= TripCount" is equivalent to checking:
3149 // Stride - MaxBTC> 0
3150 if (SE->isKnownPositive(S: StrideMinusBETaken)) {
3151 LLVM_DEBUG(
3152 dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
3153 "Stride==1 predicate will imply that the loop executes "
3154 "at most once.\n");
3155 return;
3156 }
3157 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");
3158
3159 // Strip back off the integer cast, and check that our result is a
3160 // SCEVUnknown as we expect.
3161 const SCEV *StrideBase = StrideExpr;
3162 if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: StrideBase))
3163 StrideBase = C->getOperand();
3164 SymbolicStrides[Ptr] = cast<SCEVUnknown>(Val: StrideBase);
3165}
3166
3167LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
3168 const TargetTransformInfo *TTI,
3169 const TargetLibraryInfo *TLI, AAResults *AA,
3170 DominatorTree *DT, LoopInfo *LI,
3171 AssumptionCache *AC, bool AllowPartial)
3172 : PSE(std::make_unique<PredicatedScalarEvolution>(args&: *SE, args&: *L)),
3173 PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) {
3174 unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
3175 if (TTI && !TTI->enableScalableVectorization())
3176 // Scale the vector width by 2 as rough estimate to also consider
3177 // interleaving.
3178 MaxTargetVectorWidthInBits =
3179 TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) * 2;
3180
3181 DepChecker = std::make_unique<MemoryDepChecker>(
3182 args&: *PSE, args&: AC, args&: DT, args&: L, args&: SymbolicStrides, args&: MaxTargetVectorWidthInBits, args&: LoopGuards);
3183 PtrRtChecking =
3184 std::make_unique<RuntimePointerChecking>(args&: *DepChecker, args&: SE, args&: LoopGuards);
3185 if (canAnalyzeLoop())
3186 CanVecMem = analyzeLoop(AA, LI, TLI, DT);
3187}
3188
3189void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
3190 if (CanVecMem) {
3191 OS.indent(NumSpaces: Depth) << "Memory dependences are safe";
3192 const MemoryDepChecker &DC = getDepChecker();
3193 if (!DC.isSafeForAnyVectorWidth())
3194 OS << " with a maximum safe vector width of "
3195 << DC.getMaxSafeVectorWidthInBits() << " bits";
3196 if (!DC.isSafeForAnyStoreLoadForwardDistances()) {
3197 uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits();
3198 OS << ", with a maximum safe store-load forward width of " << SLDist
3199 << " bits";
3200 }
3201 if (PtrRtChecking->Need)
3202 OS << " with run-time checks";
3203 OS << "\n";
3204 }
3205
3206 if (HasConvergentOp)
3207 OS.indent(NumSpaces: Depth) << "Has convergent operation in loop\n";
3208
3209 if (Report)
3210 OS.indent(NumSpaces: Depth) << "Report: " << Report->getMsg() << "\n";
3211
3212 if (auto *Dependences = DepChecker->getDependences()) {
3213 OS.indent(NumSpaces: Depth) << "Dependences:\n";
3214 for (const auto &Dep : *Dependences) {
3215 Dep.print(OS, Depth: Depth + 2, Instrs: DepChecker->getMemoryInstructions());
3216 OS << "\n";
3217 }
3218 } else
3219 OS.indent(NumSpaces: Depth) << "Too many dependences, not recorded\n";
3220
3221 // List the pair of accesses need run-time checks to prove independence.
3222 PtrRtChecking->print(OS, Depth);
3223 if (PtrRtChecking->Need && !HasCompletePtrRtChecking)
3224 OS.indent(NumSpaces: Depth) << "Generated run-time checks are incomplete\n";
3225 OS << "\n";
3226
3227 OS.indent(NumSpaces: Depth)
3228 << "Non vectorizable stores to invariant address were "
3229 << (HasStoreStoreDependenceInvolvingLoopInvariantAddress ||
3230 HasLoadStoreDependenceInvolvingLoopInvariantAddress
3231 ? ""
3232 : "not ")
3233 << "found in loop.\n";
3234
3235 OS.indent(NumSpaces: Depth) << "SCEV assumptions:\n";
3236 PSE->getPredicate().print(OS, Depth);
3237
3238 OS << "\n";
3239
3240 OS.indent(NumSpaces: Depth) << "Expressions re-written:\n";
3241 PSE->print(OS, Depth);
3242}
3243
3244const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L,
3245 bool AllowPartial) {
3246 const auto &[It, Inserted] = LoopAccessInfoMap.try_emplace(Key: &L);
3247
3248 // We need to create the LoopAccessInfo if either we don't already have one,
3249 // or if it was created with a different value of AllowPartial.
3250 if (Inserted || It->second->hasAllowPartial() != AllowPartial)
3251 It->second = std::make_unique<LoopAccessInfo>(args: &L, args: &SE, args&: TTI, args&: TLI, args: &AA, args: &DT,
3252 args: &LI, args&: AC, args&: AllowPartial);
3253
3254 return *It->second;
3255}
3256void LoopAccessInfoManager::clear() {
3257 // Collect LoopAccessInfo entries that may keep references to IR outside the
3258 // analyzed loop or SCEVs that may have been modified or invalidated. At the
3259 // moment, that is loops requiring memory or SCEV runtime checks, as those cache
3260 // SCEVs, e.g. for pointer expressions.
3261 LoopAccessInfoMap.remove_if(Pred: [](const auto &Entry) {
3262 const auto &LAI = Entry.second;
3263 return !(LAI->getRuntimePointerChecking()->getChecks().empty() &&
3264 LAI->getPSE().getPredicate().isAlwaysTrue());
3265 });
3266}
3267
3268bool LoopAccessInfoManager::invalidate(
3269 Function &F, const PreservedAnalyses &PA,
3270 FunctionAnalysisManager::Invalidator &Inv) {
3271 // Check whether our analysis is preserved.
3272 auto PAC = PA.getChecker<LoopAccessAnalysis>();
3273 if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
3274 // If not, give up now.
3275 return true;
3276
3277 // Check whether the analyses we depend on became invalid for any reason.
3278 // Skip checking TargetLibraryAnalysis as it is immutable and can't become
3279 // invalid.
3280 return Inv.invalidate<AAManager>(IR&: F, PA) ||
3281 Inv.invalidate<ScalarEvolutionAnalysis>(IR&: F, PA) ||
3282 Inv.invalidate<LoopAnalysis>(IR&: F, PA) ||
3283 Inv.invalidate<DominatorTreeAnalysis>(IR&: F, PA);
3284}
3285
3286LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
3287 FunctionAnalysisManager &FAM) {
3288 auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(IR&: F);
3289 auto &AA = FAM.getResult<AAManager>(IR&: F);
3290 auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F);
3291 auto &LI = FAM.getResult<LoopAnalysis>(IR&: F);
3292 auto &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
3293 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F);
3294 auto &AC = FAM.getResult<AssumptionAnalysis>(IR&: F);
3295 return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC);
3296}
3297
3298AnalysisKey LoopAccessAnalysis::Key;
3299