1 | //===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // The implementation of the loop memory dependence analysis that was |
10 | // originally developed for the loop vectorizer. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
15 | #include "llvm/ADT/APInt.h" |
16 | #include "llvm/ADT/DenseMap.h" |
17 | #include "llvm/ADT/EquivalenceClasses.h" |
18 | #include "llvm/ADT/PointerIntPair.h" |
19 | #include "llvm/ADT/STLExtras.h" |
20 | #include "llvm/ADT/SetVector.h" |
21 | #include "llvm/ADT/SmallPtrSet.h" |
22 | #include "llvm/ADT/SmallSet.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/Analysis/AliasAnalysis.h" |
25 | #include "llvm/Analysis/AliasSetTracker.h" |
26 | #include "llvm/Analysis/LoopAnalysisManager.h" |
27 | #include "llvm/Analysis/LoopInfo.h" |
28 | #include "llvm/Analysis/LoopIterator.h" |
29 | #include "llvm/Analysis/MemoryLocation.h" |
30 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
31 | #include "llvm/Analysis/ScalarEvolution.h" |
32 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
33 | #include "llvm/Analysis/ScalarEvolutionPatternMatch.h" |
34 | #include "llvm/Analysis/TargetLibraryInfo.h" |
35 | #include "llvm/Analysis/TargetTransformInfo.h" |
36 | #include "llvm/Analysis/ValueTracking.h" |
37 | #include "llvm/Analysis/VectorUtils.h" |
38 | #include "llvm/IR/BasicBlock.h" |
39 | #include "llvm/IR/Constants.h" |
40 | #include "llvm/IR/DataLayout.h" |
41 | #include "llvm/IR/DebugLoc.h" |
42 | #include "llvm/IR/DerivedTypes.h" |
43 | #include "llvm/IR/DiagnosticInfo.h" |
44 | #include "llvm/IR/Dominators.h" |
45 | #include "llvm/IR/Function.h" |
46 | #include "llvm/IR/InstrTypes.h" |
47 | #include "llvm/IR/Instruction.h" |
48 | #include "llvm/IR/Instructions.h" |
49 | #include "llvm/IR/IntrinsicInst.h" |
50 | #include "llvm/IR/PassManager.h" |
51 | #include "llvm/IR/Type.h" |
52 | #include "llvm/IR/Value.h" |
53 | #include "llvm/IR/ValueHandle.h" |
54 | #include "llvm/Support/Casting.h" |
55 | #include "llvm/Support/CommandLine.h" |
56 | #include "llvm/Support/Debug.h" |
57 | #include "llvm/Support/ErrorHandling.h" |
58 | #include "llvm/Support/raw_ostream.h" |
59 | #include <algorithm> |
60 | #include <cassert> |
61 | #include <cstdint> |
62 | #include <iterator> |
63 | #include <utility> |
64 | #include <variant> |
65 | #include <vector> |
66 | |
67 | using namespace llvm; |
68 | using namespace llvm::SCEVPatternMatch; |
69 | |
70 | #define DEBUG_TYPE "loop-accesses" |
71 | |
72 | static cl::opt<unsigned, true> |
73 | VectorizationFactor("force-vector-width" , cl::Hidden, |
74 | cl::desc("Sets the SIMD width. Zero is autoselect." ), |
75 | cl::location(L&: VectorizerParams::VectorizationFactor)); |
76 | unsigned VectorizerParams::VectorizationFactor; |
77 | |
78 | static cl::opt<unsigned, true> |
79 | VectorizationInterleave("force-vector-interleave" , cl::Hidden, |
80 | cl::desc("Sets the vectorization interleave count. " |
81 | "Zero is autoselect." ), |
82 | cl::location( |
83 | L&: VectorizerParams::VectorizationInterleave)); |
84 | unsigned VectorizerParams::VectorizationInterleave; |
85 | |
86 | static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold( |
87 | "runtime-memory-check-threshold" , cl::Hidden, |
88 | cl::desc("When performing memory disambiguation checks at runtime do not " |
89 | "generate more than this number of comparisons (default = 8)." ), |
90 | cl::location(L&: VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(Val: 8)); |
91 | unsigned VectorizerParams::RuntimeMemoryCheckThreshold; |
92 | |
93 | /// The maximum number of comparisons allowed when merging runtime memory checks. |
94 | static cl::opt<unsigned> MemoryCheckMergeThreshold( |
95 | "memory-check-merge-threshold" , cl::Hidden, |
96 | cl::desc("Maximum number of comparisons done when trying to merge " |
97 | "runtime memory checks. (default = 100)" ), |
98 | cl::init(Val: 100)); |
99 | |
100 | /// Maximum SIMD width. |
101 | const unsigned VectorizerParams::MaxVectorWidth = 64; |
102 | |
103 | /// We collect dependences up to this threshold. |
104 | static cl::opt<unsigned> |
105 | MaxDependences("max-dependences" , cl::Hidden, |
106 | cl::desc("Maximum number of dependences collected by " |
107 | "loop-access analysis (default = 100)" ), |
108 | cl::init(Val: 100)); |
109 | |
110 | /// This enables versioning on the strides of symbolically striding memory |
111 | /// accesses in code like the following. |
112 | /// for (i = 0; i < N; ++i) |
113 | /// A[i * Stride1] += B[i * Stride2] ... |
114 | /// |
115 | /// Will be roughly translated to |
116 | /// if (Stride1 == 1 && Stride2 == 1) { |
117 | /// for (i = 0; i < N; i+=4) |
118 | /// A[i:i+3] += ... |
119 | /// } else |
120 | /// ... |
121 | static cl::opt<bool> EnableMemAccessVersioning( |
122 | "enable-mem-access-versioning" , cl::init(Val: true), cl::Hidden, |
123 | cl::desc("Enable symbolic stride memory access versioning" )); |
124 | |
125 | /// Enable store-to-load forwarding conflict detection. This option can |
126 | /// be disabled for correctness testing. |
127 | static cl::opt<bool> EnableForwardingConflictDetection( |
128 | "store-to-load-forwarding-conflict-detection" , cl::Hidden, |
129 | cl::desc("Enable conflict detection in loop-access analysis" ), |
130 | cl::init(Val: true)); |
131 | |
132 | static cl::opt<unsigned> MaxForkedSCEVDepth( |
133 | "max-forked-scev-depth" , cl::Hidden, |
134 | cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)" ), |
135 | cl::init(Val: 5)); |
136 | |
137 | static cl::opt<bool> SpeculateUnitStride( |
138 | "laa-speculate-unit-stride" , cl::Hidden, |
139 | cl::desc("Speculate that non-constant strides are unit in LAA" ), |
140 | cl::init(Val: true)); |
141 | |
142 | static cl::opt<bool, true> HoistRuntimeChecks( |
143 | "hoist-runtime-checks" , cl::Hidden, |
144 | cl::desc( |
145 | "Hoist inner loop runtime memory checks to outer loop if possible" ), |
146 | cl::location(L&: VectorizerParams::HoistRuntimeChecks), cl::init(Val: true)); |
147 | bool VectorizerParams::HoistRuntimeChecks; |
148 | |
149 | bool VectorizerParams::isInterleaveForced() { |
150 | return ::VectorizationInterleave.getNumOccurrences() > 0; |
151 | } |
152 | |
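    | // For example, for a load of A[i * Stride] whose symbolic, loop-invariant |
    | // Stride is recorded in PtrToStride, the pointer SCEV {%A,+,(4 * %Stride)} is |
    | // rewritten under the added predicate "%Stride == 1" to {%A,+,4}, which the |
    | // rest of the analysis can reason about. (Illustrative only; the element size |
    | // of 4 is just an example.) |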
153 | const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, |
154 | const DenseMap<Value *, const SCEV *> &PtrToStride, |
155 | Value *Ptr) { |
156 | const SCEV *OrigSCEV = PSE.getSCEV(V: Ptr); |
157 | |
158 | // If there is an entry in the map return the SCEV of the pointer with the |
159 | // symbolic stride replaced by one. |
160 | const SCEV *StrideSCEV = PtrToStride.lookup(Val: Ptr); |
161 | if (!StrideSCEV) |
162 | // For a non-symbolic stride, just return the original expression. |
163 | return OrigSCEV; |
164 | |
165 | // Note: This assert is both overly strong and overly weak. The actual |
166 | // invariant here is that StrideSCEV should be loop invariant. The only |
167 | // such invariant strides we happen to speculate right now are unknowns |
168 | // and thus this is a reasonable proxy of the actual invariant. |
169 | assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map" ); |
170 | |
171 | ScalarEvolution *SE = PSE.getSE(); |
172 | const SCEV *CT = SE->getOne(Ty: StrideSCEV->getType()); |
173 | PSE.addPredicate(Pred: *SE->getEqualPredicate(LHS: StrideSCEV, RHS: CT)); |
174 | const SCEV *Expr = PSE.getSCEV(V: Ptr); |
175 | |
176 | LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV |
177 | << " by: " << *Expr << "\n" ); |
178 | return Expr; |
179 | } |
180 | |
181 | RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup( |
182 | unsigned Index, const RuntimePointerChecking &RtCheck) |
183 | : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start), |
184 | AddressSpace(RtCheck.Pointers[Index] |
185 | .PointerValue->getType() |
186 | ->getPointerAddressSpace()), |
187 | NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) { |
188 | Members.push_back(Elt: Index); |
189 | } |
190 | |
191 | /// Returns \p A + \p B if it is guaranteed not to unsigned wrap. Otherwise |
192 | /// returns nullptr. \p A and \p B must have the same type. |
193 | static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B, |
194 | ScalarEvolution &SE) { |
195 | if (!SE.willNotOverflow(BinOp: Instruction::Add, /*IsSigned=*/Signed: false, LHS: A, RHS: B)) |
196 | return nullptr; |
197 | return SE.getAddExpr(LHS: A, RHS: B); |
198 | } |
199 | |
200 | /// Returns \p A * \p B if it is guaranteed not to unsigned wrap. Otherwise |
201 | /// returns nullptr. \p A and \p B must have the same type. |
202 | static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B, |
203 | ScalarEvolution &SE) { |
204 | if (!SE.willNotOverflow(BinOp: Instruction::Mul, /*IsSigned=*/Signed: false, LHS: A, RHS: B)) |
205 | return nullptr; |
206 | return SE.getMulExpr(LHS: A, RHS: B); |
207 | } |
208 | |
209 | /// Return true if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at |
210 | /// \p MaxBTC is guaranteed to be in bounds of the accessed object. |
211 | static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, |
212 | const SCEV *MaxBTC, |
213 | const SCEV *EltSize, |
214 | ScalarEvolution &SE, |
215 | const DataLayout &DL) { |
216 | auto *PointerBase = SE.getPointerBase(V: AR->getStart()); |
217 | auto *StartPtr = dyn_cast<SCEVUnknown>(Val: PointerBase); |
218 | if (!StartPtr) |
219 | return false; |
220 | bool CheckForNonNull, CheckForFreed; |
221 | uint64_t DerefBytes = StartPtr->getValue()->getPointerDereferenceableBytes( |
222 | DL, CanBeNull&: CheckForNonNull, CanBeFreed&: CheckForFreed); |
223 | |
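    | // Only continue if the object is known dereferenceable unconditionally (not |
    | // just when non-null) and cannot be freed while the loop executes. |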
224 | if (CheckForNonNull || CheckForFreed) |
225 | return false; |
226 | |
227 | const SCEV *Step = AR->getStepRecurrence(SE); |
228 | bool IsKnownNonNegative = SE.isKnownNonNegative(S: Step); |
229 | if (!IsKnownNonNegative && !SE.isKnownNegative(S: Step)) |
230 | return false; |
231 | |
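    | // Widen Step and MaxBTC to a common type so they can be combined in the |
    | // computations below. |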
232 | Type *WiderTy = SE.getWiderType(Ty1: MaxBTC->getType(), Ty2: Step->getType()); |
233 | Step = SE.getNoopOrSignExtend(V: Step, Ty: WiderTy); |
234 | MaxBTC = SE.getNoopOrZeroExtend(V: MaxBTC, Ty: WiderTy); |
235 | |
236 | // For the computations below, make sure they don't unsigned wrap. |
237 | if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_UGE, LHS: AR->getStart(), RHS: StartPtr)) |
238 | return false; |
239 | const SCEV *StartOffset = SE.getNoopOrZeroExtend( |
240 | V: SE.getMinusSCEV(LHS: AR->getStart(), RHS: StartPtr), Ty: WiderTy); |
241 | |
242 | const SCEV *OffsetAtLastIter = |
243 | mulSCEVOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE); |
244 | if (!OffsetAtLastIter) |
245 | return false; |
246 | |
247 | const SCEV *OffsetEndBytes = addSCEVNoOverflow( |
248 | A: OffsetAtLastIter, B: SE.getNoopOrZeroExtend(V: EltSize, Ty: WiderTy), SE); |
249 | if (!OffsetEndBytes) |
250 | return false; |
251 | |
252 | if (IsKnownNonNegative) { |
253 | // For positive steps, check if |
254 | // (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes, |
255 | // while making sure none of the computations unsigned wrap themselves. |
256 | const SCEV *EndBytes = addSCEVNoOverflow(A: StartOffset, B: OffsetEndBytes, SE); |
257 | if (!EndBytes) |
258 | return false; |
259 | return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: EndBytes, |
260 | RHS: SE.getConstant(Ty: WiderTy, V: DerefBytes)); |
261 | } |
262 | |
263 | // For negative steps check if |
264 | // * StartOffset >= (MaxBTC * Step + EltSize) |
265 | // * StartOffset <= DerefBytes. |
266 | assert(SE.isKnownNegative(Step) && "must be known negative" ); |
267 | return SE.isKnownPredicate(Pred: CmpInst::ICMP_SGE, LHS: StartOffset, RHS: OffsetEndBytes) && |
268 | SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: StartOffset, |
269 | RHS: SE.getConstant(Ty: WiderTy, V: DerefBytes)); |
270 | } |
271 | |
272 | std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess( |
273 | const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC, |
274 | const SCEV *MaxBTC, ScalarEvolution *SE, |
275 | DenseMap<std::pair<const SCEV *, Type *>, |
276 | std::pair<const SCEV *, const SCEV *>> *PointerBounds) { |
277 | std::pair<const SCEV *, const SCEV *> *PtrBoundsPair; |
278 | if (PointerBounds) { |
279 | auto [Iter, Ins] = PointerBounds->insert( |
280 | KV: {{PtrExpr, AccessTy}, |
281 | {SE->getCouldNotCompute(), SE->getCouldNotCompute()}}); |
282 | if (!Ins) |
283 | return Iter->second; |
284 | PtrBoundsPair = &Iter->second; |
285 | } |
286 | |
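    | // ScStart and ScEnd will hold the lowest and highest address accessed |
    | // through PtrExpr across the loop's iterations; ScEnd is made exclusive by |
    | // adding EltSize at the end. |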
287 | const SCEV *ScStart; |
288 | const SCEV *ScEnd; |
289 | |
290 | auto &DL = Lp->getHeader()->getDataLayout(); |
291 | Type *IdxTy = DL.getIndexType(PtrTy: PtrExpr->getType()); |
292 | const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IntTy: IdxTy, StoreTy: AccessTy); |
293 | if (SE->isLoopInvariant(S: PtrExpr, L: Lp)) { |
294 | ScStart = ScEnd = PtrExpr; |
295 | } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrExpr)) { |
296 | ScStart = AR->getStart(); |
297 | if (!isa<SCEVCouldNotCompute>(Val: BTC)) |
298 | // Evaluating AR at an exact BTC is safe: LAA separately checks that |
299 | // accesses cannot wrap in the loop. If evaluating AR at BTC wraps, then |
300 | // the loop either triggers UB when executing a memory access with a |
301 | // poison pointer or the wrapping/poisoned pointer is not used. |
302 | ScEnd = AR->evaluateAtIteration(It: BTC, SE&: *SE); |
303 | else { |
304 | // Evaluating AR at MaxBTC may wrap and create an expression that is less |
305 | // than the start of the AddRec due to wrapping (for example consider |
306 | // MaxBTC = -2). If that's the case, set ScEnd to -(EltSize + 1). ScEnd |
307 | // will get incremented by EltSize before returning, so this effectively |
308 | // sets ScEnd to the maximum unsigned value for the type. Note that LAA |
309 | // separately checks that accesses cannot wrap, so unsigned max |
310 | // represents an upper bound. |
311 | if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSize: EltSizeSCEV, SE&: *SE, |
312 | DL)) { |
313 | ScEnd = AR->evaluateAtIteration(It: MaxBTC, SE&: *SE); |
314 | } else { |
315 | ScEnd = SE->getAddExpr( |
316 | LHS: SE->getNegativeSCEV(V: EltSizeSCEV), |
317 | RHS: SE->getSCEV(V: ConstantExpr::getIntToPtr( |
318 | C: ConstantInt::get(Ty: EltSizeSCEV->getType(), V: -1), Ty: AR->getType()))); |
319 | } |
320 | } |
321 | const SCEV *Step = AR->getStepRecurrence(SE&: *SE); |
322 | |
323 | // For expressions with negative step, the upper bound is ScStart and the |
324 | // lower bound is ScEnd. |
325 | if (const auto *CStep = dyn_cast<SCEVConstant>(Val: Step)) { |
326 | if (CStep->getValue()->isNegative()) |
327 | std::swap(a&: ScStart, b&: ScEnd); |
328 | } else { |
329 | // Fallback case: the step is not constant, but we can still |
330 | // get the upper and lower bounds of the interval by using min/max |
331 | // expressions. |
332 | ScStart = SE->getUMinExpr(LHS: ScStart, RHS: ScEnd); |
333 | ScEnd = SE->getUMaxExpr(LHS: AR->getStart(), RHS: ScEnd); |
334 | } |
335 | } else |
336 | return {SE->getCouldNotCompute(), SE->getCouldNotCompute()}; |
337 | |
338 | assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant" ); |
339 | assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant" ); |
340 | |
341 | // Add the size of the pointed element to ScEnd. |
342 | ScEnd = SE->getAddExpr(LHS: ScEnd, RHS: EltSizeSCEV); |
343 | |
344 | std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd}; |
345 | if (PointerBounds) |
346 | *PtrBoundsPair = Res; |
347 | return Res; |
348 | } |
349 | |
350 | /// Calculate Start and End points of memory access using |
351 | /// getStartAndEndForAccess. |
352 | void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, |
353 | Type *AccessTy, bool WritePtr, |
354 | unsigned DepSetId, unsigned ASId, |
355 | PredicatedScalarEvolution &PSE, |
356 | bool NeedsFreeze) { |
357 | const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); |
358 | const SCEV *BTC = PSE.getBackedgeTakenCount(); |
359 | const auto &[ScStart, ScEnd] = |
360 | getStartAndEndForAccess(Lp, PtrExpr, AccessTy, BTC, MaxBTC: SymbolicMaxBTC, |
361 | SE: PSE.getSE(), PointerBounds: &DC.getPointerBounds()); |
362 | assert(!isa<SCEVCouldNotCompute>(ScStart) && |
363 | !isa<SCEVCouldNotCompute>(ScEnd) && |
364 | "must be able to compute both start and end expressions" ); |
365 | Pointers.emplace_back(Args&: Ptr, Args: ScStart, Args: ScEnd, Args&: WritePtr, Args&: DepSetId, Args&: ASId, Args&: PtrExpr, |
366 | Args&: NeedsFreeze); |
367 | } |
368 | |
369 | bool RuntimePointerChecking::tryToCreateDiffCheck( |
370 | const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) { |
371 | // If either group contains multiple different pointers, bail out. |
372 | // TODO: Support multiple pointers by using the minimum or maximum pointer, |
373 | // depending on src & sink. |
374 | if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) |
375 | return false; |
376 | |
377 | const PointerInfo *Src = &Pointers[CGI.Members[0]]; |
378 | const PointerInfo *Sink = &Pointers[CGJ.Members[0]]; |
379 | |
380 | // If either pointer is read and written, multiple checks may be needed. Bail |
381 | // out. |
382 | if (!DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: !Src->IsWritePtr).empty() || |
383 | !DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: !Sink->IsWritePtr).empty()) |
384 | return false; |
385 | |
386 | ArrayRef<unsigned> AccSrc = |
387 | DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: Src->IsWritePtr); |
388 | ArrayRef<unsigned> AccSink = |
389 | DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: Sink->IsWritePtr); |
390 | // If either pointer is accessed multiple times, there may not be a clear |
391 | // src/sink relation. Bail out for now. |
392 | if (AccSrc.size() != 1 || AccSink.size() != 1) |
393 | return false; |
394 | |
395 | // If the sink is accessed before src, swap src/sink. |
396 | if (AccSink[0] < AccSrc[0]) |
397 | std::swap(a&: Src, b&: Sink); |
398 | |
399 | const SCEVConstant *Step; |
400 | const SCEV *SrcStart; |
401 | const SCEV *SinkStart; |
402 | const Loop *InnerLoop = DC.getInnermostLoop(); |
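    | // Both pointers must be affine AddRecs over the innermost loop with the same |
    | // constant step; otherwise a single start-distance check is not sufficient. |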
403 | if (!match(S: Src->Expr, |
404 | P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SrcStart), Op1: m_SCEVConstant(V&: Step), |
405 | L: m_SpecificLoop(L: InnerLoop))) || |
406 | !match(S: Sink->Expr, |
407 | P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SinkStart), Op1: m_scev_Specific(S: Step), |
408 | L: m_SpecificLoop(L: InnerLoop)))) |
409 | return false; |
410 | |
411 | SmallVector<Instruction *, 4> SrcInsts = |
412 | DC.getInstructionsForAccess(Ptr: Src->PointerValue, isWrite: Src->IsWritePtr); |
413 | SmallVector<Instruction *, 4> SinkInsts = |
414 | DC.getInstructionsForAccess(Ptr: Sink->PointerValue, isWrite: Sink->IsWritePtr); |
415 | Type *SrcTy = getLoadStoreType(I: SrcInsts[0]); |
416 | Type *DstTy = getLoadStoreType(I: SinkInsts[0]); |
417 | if (isa<ScalableVectorType>(Val: SrcTy) || isa<ScalableVectorType>(Val: DstTy)) |
418 | return false; |
419 | |
420 | const DataLayout &DL = InnerLoop->getHeader()->getDataLayout(); |
421 | unsigned AllocSize = |
422 | std::max(a: DL.getTypeAllocSize(Ty: SrcTy), b: DL.getTypeAllocSize(Ty: DstTy)); |
423 | |
424 | // Only constant steps matching the AllocSize are supported at the |
425 | // moment. This simplifies the difference computation. Can be extended in the |
426 | // future. |
427 | if (Step->getAPInt().abs() != AllocSize) |
428 | return false; |
429 | |
430 | IntegerType *IntTy = |
431 | IntegerType::get(C&: Src->PointerValue->getContext(), |
432 | NumBits: DL.getPointerSizeInBits(AS: CGI.AddressSpace)); |
433 | |
434 | // When counting down, the dependence distance needs to be swapped. |
435 | if (Step->getValue()->isNegative()) |
436 | std::swap(a&: SinkStart, b&: SrcStart); |
437 | |
438 | const SCEV *SinkStartInt = SE->getPtrToIntExpr(Op: SinkStart, Ty: IntTy); |
439 | const SCEV *SrcStartInt = SE->getPtrToIntExpr(Op: SrcStart, Ty: IntTy); |
440 | if (isa<SCEVCouldNotCompute>(Val: SinkStartInt) || |
441 | isa<SCEVCouldNotCompute>(Val: SrcStartInt)) |
442 | return false; |
443 | |
444 | // If the start values for both Src and Sink also vary according to an outer |
445 | // loop, then it's probably better to avoid creating diff checks because |
446 | // they may not be hoisted. We should instead let llvm::addRuntimeChecks |
447 | // do the expanded full range overlap checks, which can be hoisted. |
448 | if (HoistRuntimeChecks && InnerLoop->getParentLoop() && |
449 | isa<SCEVAddRecExpr>(Val: SinkStartInt) && isa<SCEVAddRecExpr>(Val: SrcStartInt)) { |
450 | auto *SrcStartAR = cast<SCEVAddRecExpr>(Val: SrcStartInt); |
451 | auto *SinkStartAR = cast<SCEVAddRecExpr>(Val: SinkStartInt); |
452 | const Loop *StartARLoop = SrcStartAR->getLoop(); |
453 | if (StartARLoop == SinkStartAR->getLoop() && |
454 | StartARLoop == InnerLoop->getParentLoop() && |
455 | // If the diff check would already be loop invariant (due to the |
456 | // recurrences being the same), then we prefer to keep the diff checks |
457 | // because they are cheaper. |
458 | SrcStartAR->getStepRecurrence(SE&: *SE) != |
459 | SinkStartAR->getStepRecurrence(SE&: *SE)) { |
460 | LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these " |
461 | "cannot be hoisted out of the outer loop\n" ); |
462 | return false; |
463 | } |
464 | } |
465 | |
466 | LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n" |
467 | << "SrcStart: " << *SrcStartInt << '\n' |
468 | << "SinkStartInt: " << *SinkStartInt << '\n'); |
469 | DiffChecks.emplace_back(Args&: SrcStartInt, Args&: SinkStartInt, Args&: AllocSize, |
470 | Args: Src->NeedsFreeze || Sink->NeedsFreeze); |
471 | return true; |
472 | } |
473 | |
474 | SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() { |
475 | SmallVector<RuntimePointerCheck, 4> Checks; |
476 | |
477 | for (unsigned I = 0; I < CheckingGroups.size(); ++I) { |
478 | for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) { |
479 | const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I]; |
480 | const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J]; |
481 | |
482 | if (needsChecking(M: CGI, N: CGJ)) { |
483 | CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ); |
484 | Checks.emplace_back(Args: &CGI, Args: &CGJ); |
485 | } |
486 | } |
487 | } |
488 | return Checks; |
489 | } |
490 | |
491 | void RuntimePointerChecking::generateChecks( |
492 | MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) { |
493 | assert(Checks.empty() && "Checks is not empty" ); |
494 | groupChecks(DepCands, UseDependencies); |
495 | Checks = generateChecks(); |
496 | } |
497 | |
498 | bool RuntimePointerChecking::needsChecking( |
499 | const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const { |
500 | for (const auto &I : M.Members) |
501 | for (const auto &J : N.Members) |
502 | if (needsChecking(I, J)) |
503 | return true; |
504 | return false; |
505 | } |
506 | |
507 | /// Compare \p I and \p J and return the minimum. |
508 | /// Return nullptr in case we couldn't find an answer. |
509 | static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J, |
510 | ScalarEvolution *SE) { |
511 | std::optional<APInt> Diff = SE->computeConstantDifference(LHS: J, RHS: I); |
512 | if (!Diff) |
513 | return nullptr; |
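    | // computeConstantDifference returns J - I; a negative difference means J is |
    | // the smaller expression and therefore the minimum. |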
514 | return Diff->isNegative() ? J : I; |
515 | } |
516 | |
517 | bool RuntimeCheckingPtrGroup::addPointer( |
518 | unsigned Index, const RuntimePointerChecking &RtCheck) { |
519 | return addPointer( |
520 | Index, Start: RtCheck.Pointers[Index].Start, End: RtCheck.Pointers[Index].End, |
521 | AS: RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(), |
522 | NeedsFreeze: RtCheck.Pointers[Index].NeedsFreeze, SE&: *RtCheck.SE); |
523 | } |
524 | |
525 | bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start, |
526 | const SCEV *End, unsigned AS, |
527 | bool NeedsFreeze, |
528 | ScalarEvolution &SE) { |
529 | assert(AddressSpace == AS && |
530 | "all pointers in a checking group must be in the same address space" ); |
531 | |
532 | // Compare the starts and ends with the known minimum and maximum |
533 | // of this set. We need to know how we compare against the min/max |
534 | // of the set in order to be able to emit memchecks. |
535 | const SCEV *Min0 = getMinFromExprs(I: Start, J: Low, SE: &SE); |
536 | if (!Min0) |
537 | return false; |
538 | |
539 | const SCEV *Min1 = getMinFromExprs(I: End, J: High, SE: &SE); |
540 | if (!Min1) |
541 | return false; |
542 | |
543 | // Update the low bound expression if we've found a new min value. |
544 | if (Min0 == Start) |
545 | Low = Start; |
546 | |
547 | // Update the high bound expression if we've found a new max value. |
548 | if (Min1 != End) |
549 | High = End; |
550 | |
551 | Members.push_back(Elt: Index); |
552 | this->NeedsFreeze |= NeedsFreeze; |
553 | return true; |
554 | } |
555 | |
556 | void RuntimePointerChecking::groupChecks( |
557 | MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) { |
558 | // We build the groups from dependency candidates equivalence classes |
559 | // because: |
560 | // - We know that pointers in the same equivalence class share |
561 | // the same underlying object and therefore there is a chance |
562 | // that we can compare pointers |
563 | // - We wouldn't be able to merge two pointers for which we need |
564 | // to emit a memcheck. The classes in DepCands are already |
565 | // conveniently built such that no two pointers in the same |
566 | // class need checking against each other. |
567 | |
568 | // We use the following (greedy) algorithm to construct the groups |
569 | // For every pointer in the equivalence class: |
570 | // For each existing group: |
571 | // - if the difference between this pointer and the min/max bounds |
572 | // of the group is a constant, then make the pointer part of the |
573 | // group and update the min/max bounds of that group as required. |
574 | |
575 | CheckingGroups.clear(); |
576 | |
577 | // If we need to check two pointers to the same underlying object |
578 | // with a non-constant difference, we shouldn't perform any pointer |
579 | // grouping with those pointers. This is because we can easily get |
580 | // into cases where the resulting check would return false, even when |
581 | // the accesses are safe. |
582 | // |
583 | // The following example shows this: |
584 | // for (i = 0; i < 1000; ++i) |
585 | // a[5000 + i * m] = a[i] + a[i + 9000] |
586 | // |
587 | // Here grouping gives a check of (5000, 5000 + 1000 * m) against |
588 | // (0, 10000) which is always false. However, if m is 1, there is no |
589 | // dependence. Not grouping the checks for a[i] and a[i + 9000] allows |
590 | // us to perform an accurate check in this case. |
591 | // |
592 | // The above case requires that we have an UnknownDependence between |
593 | // accesses to the same underlying object. This cannot happen unless |
594 | // FoundNonConstantDistanceDependence is set, and therefore UseDependencies |
595 | // is also false. In this case we will use the fallback path and create |
596 | // separate checking groups for all pointers. |
597 | |
598 | // If we don't have the dependency partitions, construct a new |
599 | // checking pointer group for each pointer. This is also required |
600 | // for correctness, because in this case we can have checking between |
601 | // pointers to the same underlying object. |
602 | if (!UseDependencies) { |
603 | for (unsigned I = 0; I < Pointers.size(); ++I) |
604 | CheckingGroups.emplace_back(Args&: I, Args&: *this); |
605 | return; |
606 | } |
607 | |
608 | unsigned TotalComparisons = 0; |
609 | |
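    | // Index the runtime-check pointers by their IR value. The same value can |
    | // occur more than once, e.g. for forked pointers, hence the vector of |
    | // positions. |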
610 | DenseMap<Value *, SmallVector<unsigned>> PositionMap; |
611 | for (unsigned Index = 0; Index < Pointers.size(); ++Index) |
612 | PositionMap[Pointers[Index].PointerValue].push_back(Elt: Index); |
613 | |
614 | // We need to keep track of what pointers we've already seen so we |
615 | // don't process them twice. |
616 | SmallSet<unsigned, 2> Seen; |
617 | |
618 | // Go through all equivalence classes, get the "pointer check groups" |
619 | // and add them to the overall solution. We use the order in which accesses |
620 | // appear in 'Pointers' to enforce determinism. |
621 | for (unsigned I = 0; I < Pointers.size(); ++I) { |
622 | // We've seen this pointer before, and therefore already processed |
623 | // its equivalence class. |
624 | if (Seen.contains(V: I)) |
625 | continue; |
626 | |
627 | MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue, |
628 | Pointers[I].IsWritePtr); |
629 | |
630 | SmallVector<RuntimeCheckingPtrGroup, 2> Groups; |
631 | |
632 | // Because DepCands is constructed by visiting accesses in the order in |
633 | // which they appear in alias sets (which is deterministic) and the |
634 | // iteration order within an equivalence class member is only dependent on |
635 | // the order in which unions and insertions are performed on the |
636 | // equivalence class, the iteration order is deterministic. |
637 | for (auto M : DepCands.members(V: Access)) { |
638 | auto PointerI = PositionMap.find(Val: M.getPointer()); |
639 | // If we can't find the pointer in PositionMap that means we can't |
640 | // generate a memcheck for it. |
641 | if (PointerI == PositionMap.end()) |
642 | continue; |
643 | for (unsigned Pointer : PointerI->second) { |
644 | bool Merged = false; |
645 | // Mark this pointer as seen. |
646 | Seen.insert(V: Pointer); |
647 | |
648 | // Go through all the existing sets and see if we can find one |
649 | // which can include this pointer. |
650 | for (RuntimeCheckingPtrGroup &Group : Groups) { |
651 | // Don't perform more than a certain amount of comparisons. |
652 | // This should limit the cost of grouping the pointers to something |
653 | // reasonable. If we do end up hitting this threshold, the algorithm |
654 | // will create separate groups for all remaining pointers. |
655 | if (TotalComparisons > MemoryCheckMergeThreshold) |
656 | break; |
657 | |
658 | TotalComparisons++; |
659 | |
660 | if (Group.addPointer(Index: Pointer, RtCheck: *this)) { |
661 | Merged = true; |
662 | break; |
663 | } |
664 | } |
665 | |
666 | if (!Merged) |
667 | // We couldn't add this pointer to any existing set or the threshold |
668 | // for the number of comparisons has been reached. Create a new group |
669 | // to hold the current pointer. |
670 | Groups.emplace_back(Args&: Pointer, Args&: *this); |
671 | } |
672 | } |
673 | |
674 | // We've computed the grouped checks for this partition. |
675 | // Save the results and continue with the next one. |
676 | llvm::append_range(C&: CheckingGroups, R&: Groups); |
677 | } |
678 | } |
679 | |
680 | bool RuntimePointerChecking::arePointersInSamePartition( |
681 | const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1, |
682 | unsigned PtrIdx2) { |
683 | return (PtrToPartition[PtrIdx1] != -1 && |
684 | PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]); |
685 | } |
686 | |
687 | bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const { |
688 | const PointerInfo &PointerI = Pointers[I]; |
689 | const PointerInfo &PointerJ = Pointers[J]; |
690 | |
691 | // No need to check if two readonly pointers intersect. |
692 | if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr) |
693 | return false; |
694 | |
695 | // Only need to check pointers between two different dependency sets. |
696 | if (PointerI.DependencySetId == PointerJ.DependencySetId) |
697 | return false; |
698 | |
699 | // Only need to check pointers in the same alias set. |
700 | return PointerI.AliasSetId == PointerJ.AliasSetId; |
701 | } |
702 | |
703 | /// Assign each RuntimeCheckingPtrGroup pointer an index for stable UTC output. |
704 | static DenseMap<const RuntimeCheckingPtrGroup *, unsigned> |
705 | getPtrToIdxMap(ArrayRef<RuntimeCheckingPtrGroup> CheckingGroups) { |
706 | DenseMap<const RuntimeCheckingPtrGroup *, unsigned> PtrIndices; |
707 | for (const auto &[Idx, CG] : enumerate(First&: CheckingGroups)) |
708 | PtrIndices[&CG] = Idx; |
709 | return PtrIndices; |
710 | } |
711 | |
712 | void RuntimePointerChecking::printChecks( |
713 | raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks, |
714 | unsigned Depth) const { |
715 | unsigned N = 0; |
716 | auto PtrIndices = getPtrToIdxMap(CheckingGroups); |
717 | for (const auto &[Check1, Check2] : Checks) { |
718 | const auto &First = Check1->Members, &Second = Check2->Members; |
719 | OS.indent(NumSpaces: Depth) << "Check " << N++ << ":\n" ; |
720 | OS.indent(NumSpaces: Depth + 2) << "Comparing group GRP" << PtrIndices.at(Val: Check1) |
721 | << ":\n" ; |
722 | for (unsigned K : First) |
723 | OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n" ; |
724 | OS.indent(NumSpaces: Depth + 2) << "Against group GRP" << PtrIndices.at(Val: Check2) |
725 | << ":\n" ; |
726 | for (unsigned K : Second) |
727 | OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n" ; |
728 | } |
729 | } |
730 | |
731 | void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const { |
732 | |
733 | OS.indent(NumSpaces: Depth) << "Run-time memory checks:\n" ; |
734 | printChecks(OS, Checks, Depth); |
735 | |
736 | OS.indent(NumSpaces: Depth) << "Grouped accesses:\n" ; |
737 | auto PtrIndices = getPtrToIdxMap(CheckingGroups); |
738 | for (const auto &CG : CheckingGroups) { |
739 | OS.indent(NumSpaces: Depth + 2) << "Group GRP" << PtrIndices.at(Val: &CG) << ":\n" ; |
740 | OS.indent(NumSpaces: Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High |
741 | << ")\n" ; |
742 | for (unsigned Member : CG.Members) { |
743 | OS.indent(NumSpaces: Depth + 6) << "Member: " << *Pointers[Member].Expr << "\n" ; |
744 | } |
745 | } |
746 | } |
747 | |
748 | namespace { |
749 | |
750 | /// Analyses memory accesses in a loop. |
751 | /// |
752 | /// Checks whether run time pointer checks are needed and builds sets for data |
753 | /// dependence checking. |
754 | class AccessAnalysis { |
755 | public: |
756 | /// Read or write access location. |
757 | typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; |
758 | typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList; |
759 | |
760 | AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI, |
761 | MemoryDepChecker::DepCandidates &DA, |
762 | PredicatedScalarEvolution &PSE, |
763 | SmallPtrSetImpl<MDNode *> &LoopAliasScopes) |
764 | : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), |
765 | LoopAliasScopes(LoopAliasScopes) { |
766 | // We're analyzing dependences across loop iterations. |
767 | BAA.enableCrossIterationMode(); |
768 | } |
769 | |
770 | /// Register a load and whether it is only read from. |
771 | void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) { |
772 | Value *Ptr = const_cast<Value *>(Loc.Ptr); |
773 | AST.add(Loc: adjustLoc(Loc)); |
774 | Accesses[MemAccessInfo(Ptr, false)].insert(X: AccessTy); |
775 | if (IsReadOnly) |
776 | ReadOnlyPtr.insert(Ptr); |
777 | } |
778 | |
779 | /// Register a store. |
780 | void addStore(const MemoryLocation &Loc, Type *AccessTy) { |
781 | Value *Ptr = const_cast<Value *>(Loc.Ptr); |
782 | AST.add(Loc: adjustLoc(Loc)); |
783 | Accesses[MemAccessInfo(Ptr, true)].insert(X: AccessTy); |
784 | } |
785 | |
786 | /// Check if we can emit a run-time no-alias check for \p Access. |
787 | /// |
788 | /// Returns true if we can emit a run-time no alias check for \p Access. |
789 | /// If we can check this access, this also adds it to a dependence set and |
790 | /// adds a run-time check for it to \p RtCheck. If \p Assume is true, |
791 | /// we will attempt to use additional run-time checks in order to get |
792 | /// the bounds of the pointer. |
793 | bool createCheckForAccess(RuntimePointerChecking &RtCheck, |
794 | MemAccessInfo Access, Type *AccessTy, |
795 | const DenseMap<Value *, const SCEV *> &Strides, |
796 | DenseMap<Value *, unsigned> &DepSetId, |
797 | Loop *TheLoop, unsigned &RunningDepId, |
798 | unsigned ASId, bool Assume); |
799 | |
800 | /// Check whether we can check the pointers at runtime for |
801 | /// non-intersection. |
802 | /// |
803 | /// Returns true if we need no check or if we do and we can generate them |
804 | /// (i.e. the pointers have computable bounds). A return value of false means |
805 | /// we couldn't analyze and generate runtime checks for all pointers in the |
806 | /// loop, but if \p AllowPartial is set then we will have checks for those |
807 | /// pointers we could analyze. |
808 | bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, Loop *TheLoop, |
809 | const DenseMap<Value *, const SCEV *> &Strides, |
810 | Value *&UncomputablePtr, bool AllowPartial); |
811 | |
812 | /// Goes over all memory accesses, checks whether a RT check is needed |
813 | /// and builds sets of dependent accesses. |
814 | void buildDependenceSets() { |
815 | processMemAccesses(); |
816 | } |
817 | |
818 | /// Initial processing of memory accesses determined that we need to |
819 | /// perform dependency checking. |
820 | /// |
821 | /// Note that this can later be cleared if we retry memcheck analysis without |
822 | /// dependency checking (i.e. FoundNonConstantDistanceDependence). |
823 | bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); } |
824 | |
825 | /// We decided that no dependence analysis would be used. Reset the state. |
826 | void resetDepChecks(MemoryDepChecker &DepChecker) { |
827 | CheckDeps.clear(); |
828 | DepChecker.clearDependences(); |
829 | } |
830 | |
831 | const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; } |
832 | |
833 | private: |
834 | typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap; |
835 | |
836 | /// Adjust the MemoryLocation so that it represents accesses to this |
837 | /// location across all iterations, rather than a single one. |
838 | MemoryLocation adjustLoc(MemoryLocation Loc) const { |
839 | // The accessed location varies within the loop, but remains within the |
840 | // underlying object. |
841 | Loc.Size = LocationSize::beforeOrAfterPointer(); |
842 | Loc.AATags.Scope = adjustAliasScopeList(ScopeList: Loc.AATags.Scope); |
843 | Loc.AATags.NoAlias = adjustAliasScopeList(ScopeList: Loc.AATags.NoAlias); |
844 | return Loc; |
845 | } |
846 | |
847 | /// Drop alias scopes that are only valid within a single loop iteration. |
848 | MDNode *adjustAliasScopeList(MDNode *ScopeList) const { |
849 | if (!ScopeList) |
850 | return nullptr; |
851 | |
852 | // For the sake of simplicity, drop the whole scope list if any scope is |
853 | // iteration-local. |
854 | if (any_of(Range: ScopeList->operands(), P: [&](Metadata *Scope) { |
855 | return LoopAliasScopes.contains(Ptr: cast<MDNode>(Val: Scope)); |
856 | })) |
857 | return nullptr; |
858 | |
859 | return ScopeList; |
860 | } |
861 | |
862 | /// Go over all memory accesses and check whether runtime pointer checks |
863 | /// are needed and build sets of dependency check candidates. |
864 | void processMemAccesses(); |
865 | |
866 | /// Map of all accesses. Values are the types used to access memory pointed to |
867 | /// by the pointer. |
868 | PtrAccessMap Accesses; |
869 | |
870 | /// The loop being checked. |
871 | const Loop *TheLoop; |
872 | |
873 | /// List of accesses that need a further dependence check. |
874 | MemAccessInfoList CheckDeps; |
875 | |
876 | /// Set of pointers that are read only. |
877 | SmallPtrSet<Value*, 16> ReadOnlyPtr; |
878 | |
879 | /// Batched alias analysis results. |
880 | BatchAAResults BAA; |
881 | |
882 | /// An alias set tracker to partition the access set by underlying object and |
883 | /// intrinsic property (such as TBAA metadata). |
884 | AliasSetTracker AST; |
885 | |
886 | /// The LoopInfo of the loop being checked. |
887 | const LoopInfo *LI; |
888 | |
889 | /// Sets of potentially dependent accesses - members of one set share an |
890 | /// underlying pointer. The set "CheckDeps" identifies which sets really need a |
891 | /// dependence check. |
892 | MemoryDepChecker::DepCandidates &DepCands; |
893 | |
894 | /// Initial processing of memory accesses determined that we may need |
895 | /// to add memchecks. Perform the analysis to determine the necessary checks. |
896 | /// |
897 | /// Note that this is different from isDependencyCheckNeeded. When we retry |
898 | /// memcheck analysis without dependency checking |
899 | /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is |
900 | /// cleared while this remains set if we have potentially dependent accesses. |
901 | bool IsRTCheckAnalysisNeeded = false; |
902 | |
903 | /// The SCEV predicate containing all the SCEV-related assumptions. |
904 | PredicatedScalarEvolution &PSE; |
905 | |
906 | DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects; |
907 | |
908 | /// Alias scopes that are declared inside the loop, and as such not valid |
909 | /// across iterations. |
910 | SmallPtrSetImpl<MDNode *> &LoopAliasScopes; |
911 | }; |
912 | |
913 | } // end anonymous namespace |
914 | |
915 | /// Try to compute a constant stride for \p AR. Used by getPtrStride and |
916 | /// isNoWrap. |
917 | static std::optional<int64_t> |
918 | getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, |
919 | Value *Ptr, PredicatedScalarEvolution &PSE) { |
920 | // The access function must stride over the innermost loop. |
921 | if (Lp != AR->getLoop()) { |
922 | LLVM_DEBUG({ |
923 | dbgs() << "LAA: Bad stride - Not striding over innermost loop " ; |
924 | if (Ptr) |
925 | dbgs() << *Ptr << " " ; |
926 | |
927 | dbgs() << "SCEV: " << *AR << "\n" ; |
928 | }); |
929 | return std::nullopt; |
930 | } |
931 | |
932 | // Check the step is constant. |
933 | const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE()); |
934 | |
935 | // Calculate the pointer stride and check if it is constant. |
936 | const APInt *APStepVal; |
937 | if (!match(S: Step, P: m_scev_APInt(C&: APStepVal))) { |
938 | LLVM_DEBUG({ |
939 | dbgs() << "LAA: Bad stride - Not a constant strided " ; |
940 | if (Ptr) |
941 | dbgs() << *Ptr << " " ; |
942 | dbgs() << "SCEV: " << *AR << "\n" ; |
943 | }); |
944 | return std::nullopt; |
945 | } |
946 | |
947 | const auto &DL = Lp->getHeader()->getDataLayout(); |
948 | TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy); |
949 | int64_t Size = AllocSize.getFixedValue(); |
950 | |
951 | // Huge step value - give up. |
952 | std::optional<int64_t> StepVal = APStepVal->trySExtValue(); |
953 | if (!StepVal) |
954 | return std::nullopt; |
955 | |
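    | // The returned stride is expressed in units of AccessTy; a step that is not |
    | // a multiple of the access size is not a simple strided access. |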
956 | // Strided access. |
957 | return *StepVal % Size ? std::nullopt : std::make_optional(t: *StepVal / Size); |
958 | } |
959 | |
960 | /// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use |
961 | /// information from the IR pointer value to determine no-wrap. |
962 | static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, |
963 | Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, |
964 | std::optional<int64_t> Stride = std::nullopt) { |
965 | // FIXME: This should probably only return true for NUW. |
966 | if (AR->getNoWrapFlags(Mask: SCEV::NoWrapMask)) |
967 | return true; |
968 | |
969 | if (Ptr && PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW)) |
970 | return true; |
971 | |
972 | // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap, |
973 | // the distance between the previously accessed location and the wrapped |
974 | // location will be larger than half the pointer index type space. In that |
975 | // case, the GEP would be poison and any memory access dependent on it would |
976 | // be immediate UB when executed. |
977 | if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Val: Ptr); |
978 | GEP && GEP->hasNoUnsignedSignedWrap()) |
979 | return true; |
980 | |
981 | if (!Stride) |
982 | Stride = getStrideFromAddRec(AR, Lp: L, AccessTy, Ptr, PSE); |
983 | if (Stride) { |
984 | // If the null pointer is undefined, then an access sequence which would |
985 | // otherwise access it can be assumed not to unsigned wrap. Note that this |
986 | // assumes the object in memory is aligned to the natural alignment. |
987 | unsigned AddrSpace = AR->getType()->getPointerAddressSpace(); |
988 | if (!NullPointerIsDefined(F: L->getHeader()->getParent(), AS: AddrSpace) && |
989 | (Stride == 1 || Stride == -1)) |
990 | return true; |
991 | } |
992 | |
993 | if (Ptr && Assume) { |
994 | PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW); |
995 | LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n" |
996 | << "LAA: Pointer: " << *Ptr << "\n" |
997 | << "LAA: SCEV: " << *AR << "\n" |
998 | << "LAA: Added an overflow assumption\n" ); |
999 | return true; |
1000 | } |
1001 | |
1002 | return false; |
1003 | } |
1004 | |
1005 | static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, |
1006 | function_ref<void(Value *)> AddPointer) { |
1007 | SmallPtrSet<Value *, 8> Visited; |
1008 | SmallVector<Value *> WorkList; |
1009 | WorkList.push_back(Elt: StartPtr); |
1010 | |
1011 | while (!WorkList.empty()) { |
1012 | Value *Ptr = WorkList.pop_back_val(); |
1013 | if (!Visited.insert(Ptr).second) |
1014 | continue; |
1015 | auto *PN = dyn_cast<PHINode>(Val: Ptr); |
1016 | // SCEV does not look through non-header PHIs inside the loop. Such phis |
1017 | // can be analyzed by adding separate accesses for each incoming pointer |
1018 | // value. |
1019 | if (PN && InnermostLoop.contains(BB: PN->getParent()) && |
1020 | PN->getParent() != InnermostLoop.getHeader()) { |
1021 | llvm::append_range(C&: WorkList, R: PN->incoming_values()); |
1022 | } else |
1023 | AddPointer(Ptr); |
1024 | } |
1025 | } |
1026 | |
1027 | // Walk back through the IR for a pointer, looking for a select like the |
1028 | // following: |
1029 | // |
1030 | // %offset = select i1 %cmp, i64 %a, i64 %b |
1031 | // %addr = getelementptr double, double* %base, i64 %offset |
1032 | // %ld = load double, double* %addr, align 8 |
1033 | // |
1034 | // We won't be able to form a single SCEVAddRecExpr from this since the |
1035 | // address for each loop iteration depends on %cmp. We could potentially |
1036 | // produce multiple valid SCEVAddRecExprs, though, and check all of them for |
1037 | // memory safety/aliasing if needed. |
1038 | // |
1039 | // If we encounter some IR we don't yet handle, or something obviously fine |
1040 | // like a constant, then we just add the SCEV for that term to the list passed |
1041 | // in by the caller. If we have a node that may potentially yield a valid |
1042 | // SCEVAddRecExpr then we decompose it into parts and build the SCEV terms |
1043 | // ourselves before adding to the list. |
1044 | static void findForkedSCEVs( |
1045 | ScalarEvolution *SE, const Loop *L, Value *Ptr, |
1046 | SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList, |
1047 | unsigned Depth) { |
1048 | // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or |
1049 | // we've exceeded our limit on recursion, just return whatever we have |
1050 | // regardless of whether it can be used for a forked pointer or not, along |
1051 | // with an indication of whether it might be a poison or undef value. |
1052 | const SCEV *Scev = SE->getSCEV(V: Ptr); |
1053 | if (isa<SCEVAddRecExpr>(Val: Scev) || L->isLoopInvariant(V: Ptr) || |
1054 | !isa<Instruction>(Val: Ptr) || Depth == 0) { |
1055 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
1056 | return; |
1057 | } |
1058 | |
1059 | Depth--; |
1060 | |
1061 | auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) { |
1062 | return get<1>(Pair: S); |
1063 | }; |
1064 | |
1065 | auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) { |
1066 | switch (Opcode) { |
1067 | case Instruction::Add: |
1068 | return SE->getAddExpr(LHS: L, RHS: R); |
1069 | case Instruction::Sub: |
1070 | return SE->getMinusSCEV(LHS: L, RHS: R); |
1071 | default: |
1072 | llvm_unreachable("Unexpected binary operator when walking ForkedPtrs" ); |
1073 | } |
1074 | }; |
1075 | |
1076 | Instruction *I = cast<Instruction>(Val: Ptr); |
1077 | unsigned Opcode = I->getOpcode(); |
1078 | switch (Opcode) { |
1079 | case Instruction::GetElementPtr: { |
1080 | auto *GEP = cast<GetElementPtrInst>(Val: I); |
1081 | Type *SourceTy = GEP->getSourceElementType(); |
1082 | // We only handle base + single offset GEPs here for now. |
1083 | // Not dealing with preexisting gathers yet, so no vectors. |
1084 | if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) { |
1085 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: GEP)); |
1086 | break; |
1087 | } |
1088 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs; |
1089 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs; |
1090 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: BaseScevs, Depth); |
1091 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: OffsetScevs, Depth); |
1092 | |
1093 | // See if we need to freeze our fork... |
1094 | bool NeedsFreeze = any_of(Range&: BaseScevs, P: UndefPoisonCheck) || |
1095 | any_of(Range&: OffsetScevs, P: UndefPoisonCheck); |
1096 | |
1097 | // Check that we only have a single fork, on either the base or the offset. |
1098 | // Copy the SCEV across for the one without a fork in order to generate |
1099 | // the full SCEV for both sides of the GEP. |
1100 | if (OffsetScevs.size() == 2 && BaseScevs.size() == 1) |
1101 | BaseScevs.push_back(Elt: BaseScevs[0]); |
1102 | else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1) |
1103 | OffsetScevs.push_back(Elt: OffsetScevs[0]); |
1104 | else { |
1105 | ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze); |
1106 | break; |
1107 | } |
1108 | |
1109 | Type *IntPtrTy = SE->getEffectiveSCEVType(Ty: GEP->getPointerOperandType()); |
1110 | |
1111 | // Find the size of the type being pointed to. We only have a single |
1112 | // index term (guarded above) so we don't need to index into arrays or |
1113 | // structures, just get the size of the scalar value. |
1114 | const SCEV *Size = SE->getSizeOfExpr(IntTy: IntPtrTy, AllocTy: SourceTy); |
1115 | |
1116 | for (auto [B, O] : zip(t&: BaseScevs, u&: OffsetScevs)) { |
1117 | const SCEV *Base = get<0>(Pair: B); |
1118 | const SCEV *Offset = get<0>(Pair: O); |
1119 | |
1120 | // Scale up the offsets by the size of the type, then add to the bases. |
1121 | const SCEV *Scaled = |
1122 | SE->getMulExpr(LHS: Size, RHS: SE->getTruncateOrSignExtend(V: Offset, Ty: IntPtrTy)); |
1123 | ScevList.emplace_back(Args: SE->getAddExpr(LHS: Base, RHS: Scaled), Args&: NeedsFreeze); |
1124 | } |
1125 | break; |
1126 | } |
1127 | case Instruction::Select: { |
1128 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs; |
1129 | // A select means we've found a forked pointer, but we currently only |
1130 | // support a single select per pointer so if there's another behind this |
1131 | // then we just bail out and return the generic SCEV. |
1132 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth); |
1133 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 2), ScevList&: ChildScevs, Depth); |
1134 | if (ChildScevs.size() == 2) |
1135 | append_range(C&: ScevList, R&: ChildScevs); |
1136 | else |
1137 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
1138 | break; |
1139 | } |
1140 | case Instruction::PHI: { |
1141 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs; |
1142 | // A phi means we've found a forked pointer, but we currently only |
1143 | // support a single phi per pointer so if there's another behind this |
1144 | // then we just bail out and return the generic SCEV. |
1145 | if (I->getNumOperands() == 2) { |
1146 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: ChildScevs, Depth); |
1147 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth); |
1148 | } |
1149 | if (ChildScevs.size() == 2) |
1150 | append_range(C&: ScevList, R&: ChildScevs); |
1151 | else |
1152 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
1153 | break; |
1154 | } |
1155 | case Instruction::Add: |
1156 | case Instruction::Sub: { |
1157 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs; |
1158 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs; |
1159 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: LScevs, Depth); |
1160 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: RScevs, Depth); |
1161 | |
1162 | // See if we need to freeze our fork... |
1163 | bool NeedsFreeze = |
1164 | any_of(Range&: LScevs, P: UndefPoisonCheck) || any_of(Range&: RScevs, P: UndefPoisonCheck); |
1165 | |
1166 | // Check that we only have a single fork, on either the left or right side. |
1167 | // Copy the SCEV across for the one without a fork in order to generate |
1168 | // the full SCEV for both sides of the BinOp. |
1169 | if (LScevs.size() == 2 && RScevs.size() == 1) |
1170 | RScevs.push_back(Elt: RScevs[0]); |
1171 | else if (RScevs.size() == 2 && LScevs.size() == 1) |
1172 | LScevs.push_back(Elt: LScevs[0]); |
1173 | else { |
1174 | ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze); |
1175 | break; |
1176 | } |
1177 | |
1178 | for (auto [L, R] : zip(t&: LScevs, u&: RScevs)) |
1179 | ScevList.emplace_back(Args: GetBinOpExpr(Opcode, get<0>(Pair: L), get<0>(Pair: R)), |
1180 | Args&: NeedsFreeze); |
1181 | break; |
1182 | } |
1183 | default: |
1184 | // Just return the current SCEV if we haven't handled the instruction yet. |
1185 | LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n" ); |
1186 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
1187 | break; |
1188 | } |
1189 | } |
1190 | |
1191 | static SmallVector<PointerIntPair<const SCEV *, 1, bool>> |
1192 | findForkedPointer(PredicatedScalarEvolution &PSE, |
1193 | const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr, |
1194 | const Loop *L) { |
1195 | ScalarEvolution *SE = PSE.getSE(); |
1196 | assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!" ); |
1197 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs; |
1198 | findForkedSCEVs(SE, L, Ptr, ScevList&: Scevs, Depth: MaxForkedSCEVDepth); |
1199 | |
1200 | // For now, we will only accept a forked pointer with two possible SCEVs |
1201 | // that are either SCEVAddRecExprs or loop invariant. |
1202 | if (Scevs.size() == 2 && |
1203 | (isa<SCEVAddRecExpr>(Val: get<0>(Pair: Scevs[0])) || |
1204 | SE->isLoopInvariant(S: get<0>(Pair: Scevs[0]), L)) && |
1205 | (isa<SCEVAddRecExpr>(Val: get<0>(Pair: Scevs[1])) || |
1206 | SE->isLoopInvariant(S: get<0>(Pair: Scevs[1]), L))) { |
1207 | LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n" ); |
1208 | LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n" ); |
1209 | LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n" ); |
1210 | return Scevs; |
1211 | } |
1212 | |
1213 | return {{replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr), false}}; |
1214 | } |
1215 | |
1216 | bool AccessAnalysis::createCheckForAccess( |
1217 | RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy, |
1218 | const DenseMap<Value *, const SCEV *> &StridesMap, |
1219 | DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop, |
1220 | unsigned &RunningDepId, unsigned ASId, bool Assume) { |
1221 | Value *Ptr = Access.getPointer(); |
1222 | |
1223 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs = |
1224 | findForkedPointer(PSE, StridesMap, Ptr, L: TheLoop); |
1225 | assert(!TranslatedPtrs.empty() && "must have some translated pointers" ); |
1226 | |
1227 | // Check whether all pointers can participate in a runtime bounds check. They |
1228 | // must either be invariant or non-wrapping AddRecs. If \p Assume is true, |
1229 | // missing no-wrap facts may be established by adding run-time SCEV predicates. |
1230 | for (auto &P : TranslatedPtrs) { |
1231 | // The bounds for a loop-invariant pointer are trivial. |
1232 | if (PSE.getSE()->isLoopInvariant(S: P.getPointer(), L: TheLoop)) |
1233 | continue; |
1234 | |
1235 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: P.getPointer()); |
1236 | if (!AR && Assume) |
1237 | AR = PSE.getAsAddRec(V: Ptr); |
1238 | if (!AR || !AR->isAffine()) |
1239 | return false; |
1240 | |
1241 | // If there's only one option for Ptr, look it up after bounds and wrap |
1242 | // checking, because assumptions might have been added to PSE. |
1243 | if (TranslatedPtrs.size() == 1) { |
1244 | AR = |
1245 | cast<SCEVAddRecExpr>(Val: replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr)); |
1246 | P.setPointer(AR); |
1247 | } |
1248 | |
1249 | // When we run after a failing dependency check we have to make sure |
1250 | // we don't have wrapping pointers. |
1251 | if (!isNoWrap(PSE, AR, Ptr: TranslatedPtrs.size() == 1 ? Ptr : nullptr, AccessTy, |
1252 | L: TheLoop, Assume)) { |
1253 | return false; |
1254 | } |
1255 | } |
1256 | |
1257 | for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) { |
1258 | // The id of the dependence set. |
1259 | unsigned DepId; |
1260 | |
1261 | if (isDependencyCheckNeeded()) { |
1262 | Value *Leader = DepCands.getLeaderValue(V: Access).getPointer(); |
1263 | unsigned &LeaderId = DepSetId[Leader]; |
1264 | if (!LeaderId) |
1265 | LeaderId = RunningDepId++; |
1266 | DepId = LeaderId; |
1267 | } else |
1268 | // Each access has its own dependence set. |
1269 | DepId = RunningDepId++; |
1270 | |
1271 | bool IsWrite = Access.getInt(); |
1272 | RtCheck.insert(Lp: TheLoop, Ptr, PtrExpr, AccessTy, WritePtr: IsWrite, DepSetId: DepId, ASId, PSE, |
1273 | NeedsFreeze); |
1274 | LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); |
1275 | } |
1276 | |
1277 | return true; |
1278 | } |
1279 | |
1280 | bool AccessAnalysis::canCheckPtrAtRT( |
1281 | RuntimePointerChecking &RtCheck, Loop *TheLoop, |
1282 | const DenseMap<Value *, const SCEV *> &StridesMap, Value *&UncomputablePtr, |
1283 | bool AllowPartial) { |
1284 | // Find pointers with computable bounds. We are going to use this information |
1285 | // to place a runtime bound check. |
1286 | bool CanDoRT = true; |
1287 | |
1288 | bool MayNeedRTCheck = false; |
1289 | if (!IsRTCheckAnalysisNeeded) return true; |
1290 | |
1291 | bool IsDepCheckNeeded = isDependencyCheckNeeded(); |
1292 | |
// We assign a consecutive id to accesses from different alias sets.
// Accesses between different groups don't need to be checked.
1295 | unsigned ASId = 0; |
1296 | for (const auto &AS : AST) { |
1297 | int NumReadPtrChecks = 0; |
1298 | int NumWritePtrChecks = 0; |
1299 | bool CanDoAliasSetRT = true; |
1300 | ++ASId; |
1301 | auto ASPointers = AS.getPointers(); |
1302 | |
// We assign consecutive ids to accesses from different dependence sets.
// Accesses within the same set don't need a runtime check.
1305 | unsigned RunningDepId = 1; |
1306 | DenseMap<Value *, unsigned> DepSetId; |
1307 | |
1308 | SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries; |
1309 | |
1310 | // First, count how many write and read accesses are in the alias set. Also |
1311 | // collect MemAccessInfos for later. |
1312 | SmallVector<MemAccessInfo, 4> AccessInfos; |
1313 | for (const Value *ConstPtr : ASPointers) { |
1314 | Value *Ptr = const_cast<Value *>(ConstPtr); |
1315 | bool IsWrite = Accesses.contains(Key: MemAccessInfo(Ptr, true)); |
1316 | if (IsWrite) |
1317 | ++NumWritePtrChecks; |
1318 | else |
1319 | ++NumReadPtrChecks; |
1320 | AccessInfos.emplace_back(Args&: Ptr, Args&: IsWrite); |
1321 | } |
1322 | |
// We do not need runtime checks for this alias set if there are no writes,
// or there is a single write and no reads.
1325 | if (NumWritePtrChecks == 0 || |
1326 | (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) { |
1327 | assert((ASPointers.size() <= 1 || |
1328 | all_of(ASPointers, |
1329 | [this](const Value *Ptr) { |
1330 | MemAccessInfo AccessWrite(const_cast<Value *>(Ptr), |
1331 | true); |
1332 | return !DepCands.contains(AccessWrite); |
1333 | })) && |
1334 | "Can only skip updating CanDoRT below, if all entries in AS " |
1335 | "are reads or there is at most 1 entry" ); |
1336 | continue; |
1337 | } |
1338 | |
1339 | for (auto &Access : AccessInfos) { |
1340 | for (const auto &AccessTy : Accesses[Access]) { |
1341 | if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, |
1342 | DepSetId, TheLoop, RunningDepId, ASId, |
1343 | Assume: false)) { |
1344 | LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" |
1345 | << *Access.getPointer() << '\n'); |
1346 | Retries.emplace_back(Args&: Access, Args: AccessTy); |
1347 | CanDoAliasSetRT = false; |
1348 | } |
1349 | } |
1350 | } |
1351 | |
1352 | // Note that this function computes CanDoRT and MayNeedRTCheck |
1353 | // independently. For example CanDoRT=false, MayNeedRTCheck=false means that |
1354 | // we have a pointer for which we couldn't find the bounds but we don't |
1355 | // actually need to emit any checks so it does not matter. |
1356 | // |
1357 | // We need runtime checks for this alias set, if there are at least 2 |
1358 | // dependence sets (in which case RunningDepId > 2) or if we need to re-try |
1359 | // any bound checks (because in that case the number of dependence sets is |
1360 | // incomplete). |
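// E.g. (illustrative): with two pointers that end up in different dependence
// sets, RunningDepId goes 1 -> 2 -> 3 above, so RunningDepId > 2 and checks
// between the two sets are required.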
1361 | bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty(); |
1362 | |
1363 | // We need to perform run-time alias checks, but some pointers had bounds |
1364 | // that couldn't be checked. |
1365 | if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) { |
1366 | // Reset the CanDoSetRt flag and retry all accesses that have failed. |
1367 | // We know that we need these checks, so we can now be more aggressive |
1368 | // and add further checks if required (overflow checks). |
1369 | CanDoAliasSetRT = true; |
1370 | for (const auto &[Access, AccessTy] : Retries) { |
1371 | if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, |
1372 | DepSetId, TheLoop, RunningDepId, ASId, |
1373 | /*Assume=*/true)) { |
1374 | CanDoAliasSetRT = false; |
1375 | UncomputablePtr = Access.getPointer(); |
1376 | if (!AllowPartial) |
1377 | break; |
1378 | } |
1379 | } |
1380 | } |
1381 | |
1382 | CanDoRT &= CanDoAliasSetRT; |
1383 | MayNeedRTCheck |= NeedsAliasSetRTCheck; |
1385 | } |
1386 | |
1387 | // If the pointers that we would use for the bounds comparison have different |
1388 | // address spaces, assume the values aren't directly comparable, so we can't |
1389 | // use them for the runtime check. We also have to assume they could |
1390 | // overlap. In the future there should be metadata for whether address spaces |
1391 | // are disjoint. |
1392 | unsigned NumPointers = RtCheck.Pointers.size(); |
1393 | for (unsigned i = 0; i < NumPointers; ++i) { |
1394 | for (unsigned j = i + 1; j < NumPointers; ++j) { |
1395 | // Only need to check pointers between two different dependency sets. |
1396 | if (RtCheck.Pointers[i].DependencySetId == |
1397 | RtCheck.Pointers[j].DependencySetId) |
1398 | continue; |
1399 | // Only need to check pointers in the same alias set. |
1400 | if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId) |
1401 | continue; |
1402 | |
1403 | Value *PtrI = RtCheck.Pointers[i].PointerValue; |
1404 | Value *PtrJ = RtCheck.Pointers[j].PointerValue; |
1405 | |
1406 | unsigned ASi = PtrI->getType()->getPointerAddressSpace(); |
1407 | unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); |
1408 | if (ASi != ASj) { |
1409 | LLVM_DEBUG( |
1410 | dbgs() << "LAA: Runtime check would require comparison between" |
1411 | " different address spaces\n" ); |
1412 | return false; |
1413 | } |
1414 | } |
1415 | } |
1416 | |
1417 | if (MayNeedRTCheck && (CanDoRT || AllowPartial)) |
1418 | RtCheck.generateChecks(DepCands, UseDependencies: IsDepCheckNeeded); |
1419 | |
1420 | LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks() |
1421 | << " pointer comparisons.\n" ); |
1422 | |
// If we can do run-time checks but no checks were generated, no runtime checks
// are needed. This can happen, for example, when all pointers point to the
// same underlying object.
1426 | RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck; |
1427 | |
1428 | bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT; |
1429 | assert(CanDoRTIfNeeded == (CanDoRT || !MayNeedRTCheck) && |
1430 | "CanDoRTIfNeeded depends on RtCheck.Need" ); |
1431 | if (!CanDoRTIfNeeded && !AllowPartial) |
1432 | RtCheck.reset(); |
1433 | return CanDoRTIfNeeded; |
1434 | } |
1435 | |
1436 | void AccessAnalysis::processMemAccesses() { |
1437 | // We process the set twice: first we process read-write pointers, last we |
1438 | // process read-only pointers. This allows us to skip dependence tests for |
1439 | // read-only pointers. |
1440 | |
1441 | LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n" ); |
1442 | LLVM_DEBUG(dbgs() << " AST: " ; AST.dump()); |
1443 | LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n" ); |
1444 | LLVM_DEBUG({ |
1445 | for (const auto &[A, _] : Accesses) |
1446 | dbgs() << "\t" << *A.getPointer() << " (" |
1447 | << (A.getInt() |
1448 | ? "write" |
1449 | : (ReadOnlyPtr.contains(A.getPointer()) ? "read-only" |
1450 | : "read" )) |
1451 | << ")\n" ; |
1452 | }); |
1453 | |
1454 | // The AliasSetTracker has nicely partitioned our pointers by metadata |
1455 | // compatibility and potential for underlying-object overlap. As a result, we |
1456 | // only need to check for potential pointer dependencies within each alias |
1457 | // set. |
1458 | for (const auto &AS : AST) { |
// Note that both the alias-set tracker and the alias sets themselves use
// ordered collections internally and so the iteration order here is
// deterministic.
1462 | auto ASPointers = AS.getPointers(); |
1463 | |
1464 | bool SetHasWrite = false; |
1465 | |
1466 | // Map of (pointer to underlying objects, accessed address space) to last |
1467 | // access encountered. |
1468 | typedef DenseMap<std::pair<const Value *, unsigned>, MemAccessInfo> |
1469 | UnderlyingObjToAccessMap; |
1470 | UnderlyingObjToAccessMap ObjToLastAccess; |
1471 | |
// Set of accesses to check after all writes have been processed.
1473 | PtrAccessMap DeferredAccesses; |
1474 | |
1475 | // Iterate over each alias set twice, once to process read/write pointers, |
1476 | // and then to process read-only pointers. |
1477 | for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { |
1478 | bool UseDeferred = SetIteration > 0; |
1479 | PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses; |
1480 | |
1481 | for (const Value *ConstPtr : ASPointers) { |
1482 | Value *Ptr = const_cast<Value *>(ConstPtr); |
1483 | |
1484 | // For a single memory access in AliasSetTracker, Accesses may contain |
1485 | // both read and write, and they both need to be handled for CheckDeps. |
1486 | for (const auto &[AC, _] : S) { |
1487 | if (AC.getPointer() != Ptr) |
1488 | continue; |
1489 | |
1490 | bool IsWrite = AC.getInt(); |
1491 | |
1492 | // If we're using the deferred access set, then it contains only |
1493 | // reads. |
1494 | bool IsReadOnlyPtr = ReadOnlyPtr.contains(Ptr) && !IsWrite; |
1495 | if (UseDeferred && !IsReadOnlyPtr) |
1496 | continue; |
1497 | // Otherwise, the pointer must be in the PtrAccessSet, either as a |
1498 | // read or a write. |
1499 | assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || |
1500 | S.contains(MemAccessInfo(Ptr, false))) && |
1501 | "Alias-set pointer not in the access set?" ); |
1502 | |
1503 | MemAccessInfo Access(Ptr, IsWrite); |
1504 | DepCands.insert(Data: Access); |
1505 | |
// Memorize read-only pointers for later processing and skip them in
// the first round (they need to be checked after we have seen all
// write pointers). Note: we also mark pointers that are not
// consecutive as "read-only" pointers (so that we check
// "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1511 | if (!UseDeferred && IsReadOnlyPtr) { |
1512 | // We only use the pointer keys, the types vector values don't |
1513 | // matter. |
1514 | DeferredAccesses.insert(KV: {Access, {}}); |
1515 | continue; |
1516 | } |
1517 | |
// If this is a write, check other reads and writes for conflicts. If
// this is a read, only check other writes for conflicts (but only if
// there is no other write to the ptr - this is an optimization to
// catch "a[i] = a[i] + " without having to do a dependence check).
1522 | if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { |
1523 | CheckDeps.push_back(Elt: Access); |
1524 | IsRTCheckAnalysisNeeded = true; |
1525 | } |
1526 | |
1527 | if (IsWrite) |
1528 | SetHasWrite = true; |
1529 | |
1530 | // Create sets of pointers connected by a shared alias set and |
1531 | // underlying object. |
1532 | SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr]; |
1533 | UOs = {}; |
1534 | ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI); |
1535 | LLVM_DEBUG(dbgs() |
1536 | << "Underlying objects for pointer " << *Ptr << "\n" ); |
1537 | for (const Value *UnderlyingObj : UOs) { |
// nullptr never aliases; don't join sets for pointers that have "null"
// in their UnderlyingObjects list.
1540 | if (isa<ConstantPointerNull>(Val: UnderlyingObj) && |
1541 | !NullPointerIsDefined( |
1542 | F: TheLoop->getHeader()->getParent(), |
1543 | AS: UnderlyingObj->getType()->getPointerAddressSpace())) |
1544 | continue; |
1545 | |
1546 | auto [It, Inserted] = ObjToLastAccess.try_emplace( |
1547 | Key: {UnderlyingObj, |
1548 | cast<PointerType>(Val: Ptr->getType())->getAddressSpace()}, |
1549 | Args&: Access); |
1550 | if (!Inserted) { |
1551 | DepCands.unionSets(V1: Access, V2: It->second); |
1552 | It->second = Access; |
1553 | } |
1554 | |
1555 | LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n" ); |
1556 | } |
1557 | } |
1558 | } |
1559 | } |
1560 | } |
1561 | } |
1562 | |
1563 | /// Check whether the access through \p Ptr has a constant stride. |
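/// For example (illustrative): a consecutive i32 access A[i] has the pointer
/// SCEV {A,+,4}<L> and a stride of 1 (in units of AccessTy), while a
/// loop-invariant pointer is reported with a stride of 0.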
1564 | std::optional<int64_t> |
1565 | llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, |
1566 | const Loop *Lp, |
1567 | const DenseMap<Value *, const SCEV *> &StridesMap, |
1568 | bool Assume, bool ShouldCheckWrap) { |
1569 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr); |
1570 | if (PSE.getSE()->isLoopInvariant(S: PtrScev, L: Lp)) |
1571 | return 0; |
1572 | |
1573 | assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr" ); |
1574 | if (isa<ScalableVectorType>(Val: AccessTy)) { |
1575 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy |
1576 | << "\n" ); |
1577 | return std::nullopt; |
1578 | } |
1579 | |
1580 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev); |
1581 | if (Assume && !AR) |
1582 | AR = PSE.getAsAddRec(V: Ptr); |
1583 | |
1584 | if (!AR) { |
1585 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr |
1586 | << " SCEV: " << *PtrScev << "\n" ); |
1587 | return std::nullopt; |
1588 | } |
1589 | |
1590 | std::optional<int64_t> Stride = |
1591 | getStrideFromAddRec(AR, Lp, AccessTy, Ptr, PSE); |
1592 | if (!ShouldCheckWrap || !Stride) |
1593 | return Stride; |
1594 | |
1595 | if (isNoWrap(PSE, AR, Ptr, AccessTy, L: Lp, Assume, Stride)) |
1596 | return Stride; |
1597 | |
1598 | LLVM_DEBUG( |
1599 | dbgs() << "LAA: Bad stride - Pointer may wrap in the address space " |
1600 | << *Ptr << " SCEV: " << *AR << "\n" ); |
1601 | return std::nullopt; |
1602 | } |
1603 | |
1604 | std::optional<int64_t> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, |
1605 | Type *ElemTyB, Value *PtrB, |
1606 | const DataLayout &DL, |
1607 | ScalarEvolution &SE, |
1608 | bool StrictCheck, bool CheckType) { |
1609 | assert(PtrA && PtrB && "Expected non-nullptr pointers." ); |
1610 | |
1611 | // Make sure that A and B are different pointers. |
1612 | if (PtrA == PtrB) |
1613 | return 0; |
1614 | |
1615 | // Make sure that the element types are the same if required. |
1616 | if (CheckType && ElemTyA != ElemTyB) |
1617 | return std::nullopt; |
1618 | |
1619 | unsigned ASA = PtrA->getType()->getPointerAddressSpace(); |
1620 | unsigned ASB = PtrB->getType()->getPointerAddressSpace(); |
1621 | |
1622 | // Check that the address spaces match. |
1623 | if (ASA != ASB) |
1624 | return std::nullopt; |
1625 | unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
1626 | |
1627 | APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0); |
1628 | const Value *PtrA1 = PtrA->stripAndAccumulateConstantOffsets( |
1629 | DL, Offset&: OffsetA, /*AllowNonInbounds=*/true); |
1630 | const Value *PtrB1 = PtrB->stripAndAccumulateConstantOffsets( |
1631 | DL, Offset&: OffsetB, /*AllowNonInbounds=*/true); |
1632 | |
1633 | std::optional<int64_t> Val; |
1634 | if (PtrA1 == PtrB1) { |
1635 | // Retrieve the address space again as pointer stripping now tracks through |
1636 | // `addrspacecast`. |
1637 | ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace(); |
1638 | ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace(); |
1639 | // Check that the address spaces match and that the pointers are valid. |
1640 | if (ASA != ASB) |
1641 | return std::nullopt; |
1642 | |
1643 | IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
1644 | OffsetA = OffsetA.sextOrTrunc(width: IdxWidth); |
1645 | OffsetB = OffsetB.sextOrTrunc(width: IdxWidth); |
1646 | |
1647 | OffsetB -= OffsetA; |
1648 | Val = OffsetB.trySExtValue(); |
1649 | } else { |
1650 | // Otherwise compute the distance with SCEV between the base pointers. |
1651 | const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA); |
1652 | const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB); |
1653 | std::optional<APInt> Diff = |
1654 | SE.computeConstantDifference(LHS: PtrSCEVB, RHS: PtrSCEVA); |
1655 | if (!Diff) |
1656 | return std::nullopt; |
1657 | Val = Diff->trySExtValue(); |
1658 | } |
1659 | |
1660 | if (!Val) |
1661 | return std::nullopt; |
1662 | |
1663 | int64_t Size = DL.getTypeStoreSize(Ty: ElemTyA); |
1664 | int64_t Dist = *Val / Size; |
1665 | |
// Ensure that the calculated distance matches the type-based one after all
// the bitcasts have been removed from the provided pointers.
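// E.g. (illustrative): with i32 elements and a byte distance of 6, Dist is
// 6 / 4 = 1 but 1 * 4 != 6, so under StrictCheck we return std::nullopt.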
1668 | if (!StrictCheck || Dist * Size == Val) |
1669 | return Dist; |
1670 | return std::nullopt; |
1671 | } |
1672 | |
1673 | bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, |
1674 | const DataLayout &DL, ScalarEvolution &SE, |
1675 | SmallVectorImpl<unsigned> &SortedIndices) { |
1676 | assert(llvm::all_of( |
1677 | VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && |
1678 | "Expected list of pointer operands." ); |
// Walk over the pointers, and map each of them to an offset relative to the
// first pointer in the array.
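// E.g. (illustrative): for VL = {&A[1], &A[0], &A[2]} over i32 elements the
// offsets relative to VL[0] are {0, -1, 1}; the given order is not
// consecutive, so SortedIndices becomes {1, 0, 2}, i.e. &A[0], &A[1], &A[2].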
1681 | Value *Ptr0 = VL[0]; |
1682 | |
1683 | using DistOrdPair = std::pair<int64_t, unsigned>; |
1684 | auto Compare = llvm::less_first(); |
1685 | std::set<DistOrdPair, decltype(Compare)> Offsets(Compare); |
1686 | Offsets.emplace(args: 0, args: 0); |
1687 | bool IsConsecutive = true; |
1688 | for (auto [Idx, Ptr] : drop_begin(RangeOrContainer: enumerate(First&: VL))) { |
1689 | std::optional<int64_t> Diff = |
1690 | getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE, |
1691 | /*StrictCheck=*/true); |
1692 | if (!Diff) |
1693 | return false; |
1694 | |
// Check if a pointer with the same offset has already been found.
1696 | int64_t Offset = *Diff; |
1697 | auto [It, IsInserted] = Offsets.emplace(args&: Offset, args&: Idx); |
1698 | if (!IsInserted) |
1699 | return false; |
1700 | // Consecutive order if the inserted element is the last one. |
1701 | IsConsecutive &= std::next(x: It) == Offsets.end(); |
1702 | } |
1703 | SortedIndices.clear(); |
1704 | if (!IsConsecutive) { |
1705 | // Fill SortedIndices array only if it is non-consecutive. |
1706 | SortedIndices.resize(N: VL.size()); |
1707 | for (auto [Idx, Off] : enumerate(First&: Offsets)) |
1708 | SortedIndices[Idx] = Off.second; |
1709 | } |
1710 | return true; |
1711 | } |
1712 | |
1713 | /// Returns true if the memory operations \p A and \p B are consecutive. |
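/// For example (illustrative): two i32 loads from A[i] and A[i+1] are
/// consecutive (their pointer difference is exactly one element), while loads
/// from A[i] and A[i+2] are not.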
1714 | bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, |
1715 | ScalarEvolution &SE, bool CheckType) { |
1716 | Value *PtrA = getLoadStorePointerOperand(V: A); |
1717 | Value *PtrB = getLoadStorePointerOperand(V: B); |
1718 | if (!PtrA || !PtrB) |
1719 | return false; |
1720 | Type *ElemTyA = getLoadStoreType(I: A); |
1721 | Type *ElemTyB = getLoadStoreType(I: B); |
1722 | std::optional<int64_t> Diff = |
1723 | getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE, |
1724 | /*StrictCheck=*/true, CheckType); |
1725 | return Diff == 1; |
1726 | } |
1727 | |
1728 | void MemoryDepChecker::addAccess(StoreInst *SI) { |
1729 | visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
1730 | AddPointer: [this, SI](Value *Ptr) { |
1731 | Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx); |
1732 | InstMap.push_back(Elt: SI); |
1733 | ++AccessIdx; |
1734 | }); |
1735 | } |
1736 | |
1737 | void MemoryDepChecker::addAccess(LoadInst *LI) { |
1738 | visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
1739 | AddPointer: [this, LI](Value *Ptr) { |
1740 | Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx); |
1741 | InstMap.push_back(Elt: LI); |
1742 | ++AccessIdx; |
1743 | }); |
1744 | } |
1745 | |
1746 | MemoryDepChecker::VectorizationSafetyStatus |
1747 | MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) { |
1748 | switch (Type) { |
1749 | case NoDep: |
1750 | case Forward: |
1751 | case BackwardVectorizable: |
1752 | return VectorizationSafetyStatus::Safe; |
1753 | |
1754 | case Unknown: |
1755 | return VectorizationSafetyStatus::PossiblySafeWithRtChecks; |
1756 | case ForwardButPreventsForwarding: |
1757 | case Backward: |
1758 | case BackwardVectorizableButPreventsForwarding: |
1759 | case IndirectUnsafe: |
1760 | return VectorizationSafetyStatus::Unsafe; |
1761 | } |
1762 | llvm_unreachable("unexpected DepType!" ); |
1763 | } |
1764 | |
1765 | bool MemoryDepChecker::Dependence::isBackward() const { |
1766 | switch (Type) { |
1767 | case NoDep: |
1768 | case Forward: |
1769 | case ForwardButPreventsForwarding: |
1770 | case Unknown: |
1771 | case IndirectUnsafe: |
1772 | return false; |
1773 | |
1774 | case BackwardVectorizable: |
1775 | case Backward: |
1776 | case BackwardVectorizableButPreventsForwarding: |
1777 | return true; |
1778 | } |
1779 | llvm_unreachable("unexpected DepType!" ); |
1780 | } |
1781 | |
1782 | bool MemoryDepChecker::Dependence::isPossiblyBackward() const { |
1783 | return isBackward() || Type == Unknown || Type == IndirectUnsafe; |
1784 | } |
1785 | |
1786 | bool MemoryDepChecker::Dependence::isForward() const { |
1787 | switch (Type) { |
1788 | case Forward: |
1789 | case ForwardButPreventsForwarding: |
1790 | return true; |
1791 | |
1792 | case NoDep: |
1793 | case Unknown: |
1794 | case BackwardVectorizable: |
1795 | case Backward: |
1796 | case BackwardVectorizableButPreventsForwarding: |
1797 | case IndirectUnsafe: |
1798 | return false; |
1799 | } |
1800 | llvm_unreachable("unexpected DepType!" ); |
1801 | } |
1802 | |
1803 | bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance, |
1804 | uint64_t TypeByteSize, |
1805 | unsigned CommonStride) { |
// If loads occur at a distance that is not a multiple of a feasible vector
// factor, store-load forwarding does not take place.
// Positive dependences might cause trouble because vectorizing them might
// prevent store-load forwarding, making vectorized code run a lot slower.
//   a[i] = a[i-3] ^ a[i-8];
// The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
// hence on your typical architecture store-load forwarding does not take
// place. Vectorizing in such cases does not make sense.
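// E.g. (illustrative): with TypeByteSize = 4 and Distance = 12 bytes, the
// first candidate VF of 8 bytes already conflicts (12 % 8 != 0 and
// 12 / 8 = 1 is below the 8 * TypeByteSize iteration threshold used below),
// so the largest conflict-free power-of-2 VF is 4 bytes, which is less than
// 2 * TypeByteSize, and we report a possible forwarding conflict.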
1815 | |
1816 | // After this many iterations store-to-load forwarding conflicts should not |
1817 | // cause any slowdowns. |
1818 | const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize; |
1819 | // Maximum vector factor. |
1820 | uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 = |
1821 | std::min(a: VectorizerParams::MaxVectorWidth * TypeByteSize, |
1822 | b: MaxStoreLoadForwardSafeDistanceInBits); |
1823 | |
1824 | // Compute the smallest VF at which the store and load would be misaligned. |
1825 | for (uint64_t VF = 2 * TypeByteSize; |
1826 | VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) { |
// If the number of vector iterations between the store and the load is
// small we could incur conflicts.
1829 | if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) { |
1830 | MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1); |
1831 | break; |
1832 | } |
1833 | } |
1834 | |
1835 | if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) { |
1836 | LLVM_DEBUG( |
1837 | dbgs() << "LAA: Distance " << Distance |
1838 | << " that could cause a store-load forwarding conflict\n" ); |
1839 | return true; |
1840 | } |
1841 | |
1842 | if (CommonStride && |
1843 | MaxVFWithoutSLForwardIssuesPowerOf2 < |
1844 | MaxStoreLoadForwardSafeDistanceInBits && |
1845 | MaxVFWithoutSLForwardIssuesPowerOf2 != |
1846 | VectorizerParams::MaxVectorWidth * TypeByteSize) { |
1847 | uint64_t MaxVF = |
1848 | bit_floor(Value: MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride); |
1849 | uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; |
1850 | MaxStoreLoadForwardSafeDistanceInBits = |
1851 | std::min(a: MaxStoreLoadForwardSafeDistanceInBits, b: MaxVFInBits); |
1852 | } |
1853 | return false; |
1854 | } |
1855 | |
1856 | void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) { |
1857 | if (Status < S) |
1858 | Status = S; |
1859 | } |
1860 | |
/// Given a dependence-distance \p Dist between two memory accesses that have
/// strides in the same direction, with \p MaxStride being the maximum absolute
/// stride, in a loop whose maximum backedge taken count is \p MaxBTC, check if
/// it is possible to prove statically that the dependence distance is larger
/// than the range that the accesses will travel through the execution of the
/// loop. If so, return true; false otherwise. This is useful
1867 | /// for example in loops such as the following (PR31098): |
1868 | /// |
1869 | /// for (i = 0; i < D; ++i) { |
1870 | /// = out[i]; |
1871 | /// out[i+D] = |
1872 | /// } |
1873 | static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, |
1874 | const SCEV &MaxBTC, const SCEV &Dist, |
1875 | uint64_t MaxStride) { |
1876 | |
1877 | // If we can prove that |
1878 | // (**) |Dist| > MaxBTC * Step |
1879 | // where Step is the absolute stride of the memory accesses in bytes, |
1880 | // then there is no dependence. |
1881 | // |
1882 | // Rationale: |
1883 | // We basically want to check if the absolute distance (|Dist/Step|) |
1884 | // is >= the loop iteration count (or > MaxBTC). |
1885 | // This is equivalent to the Strong SIV Test (Practical Dependence Testing, |
1886 | // Section 4.2.1); Note, that for vectorization it is sufficient to prove |
1887 | // that the dependence distance is >= VF; This is checked elsewhere. |
1888 | // But in some cases we can prune dependence distances early, and |
1889 | // even before selecting the VF, and without a runtime test, by comparing |
1890 | // the distance against the loop iteration count. Since the vectorized code |
1891 | // will be executed only if LoopCount >= VF, proving distance >= LoopCount |
1892 | // also guarantees that distance >= VF. |
1893 | // |
1894 | const SCEV *Step = SE.getConstant(Ty: MaxBTC.getType(), V: MaxStride); |
1895 | const SCEV *Product = SE.getMulExpr(LHS: &MaxBTC, RHS: Step); |
1896 | |
1897 | const SCEV *CastedDist = &Dist; |
1898 | const SCEV *CastedProduct = Product; |
1899 | uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType()); |
1900 | uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType()); |
1901 | |
1902 | // The dependence distance can be positive/negative, so we sign extend Dist; |
1903 | // The multiplication of the absolute stride in bytes and the |
1904 | // backedgeTakenCount is non-negative, so we zero extend Product. |
1905 | if (DistTypeSizeBits > ProductTypeSizeBits) |
1906 | CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType()); |
1907 | else |
1908 | CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType()); |
1909 | |
1910 | // Is Dist - (MaxBTC * Step) > 0 ? |
1911 | // (If so, then we have proven (**) because |Dist| >= Dist) |
1912 | const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct); |
1913 | if (SE.isKnownPositive(S: Minus)) |
1914 | return true; |
1915 | |
1916 | // Second try: Is -Dist - (MaxBTC * Step) > 0 ? |
1917 | // (If so, then we have proven (**) because |Dist| >= -1*Dist) |
1918 | const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist); |
1919 | Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct); |
1920 | return SE.isKnownPositive(S: Minus); |
1921 | } |
1922 | |
/// Check the dependence for two accesses with the same stride \p Stride.
/// \p Distance is the positive distance in bytes, and \p TypeByteSize is the
/// type size in bytes.
1926 | /// |
1927 | /// \returns true if they are independent. |
1928 | static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, |
1929 | uint64_t TypeByteSize) { |
1930 | assert(Stride > 1 && "The stride must be greater than 1" ); |
1931 | assert(TypeByteSize > 0 && "The type size in byte must be non-zero" ); |
1932 | assert(Distance > 0 && "The distance must be non-zero" ); |
1933 | |
// Skip if the distance is not a multiple of the type byte size.
1935 | if (Distance % TypeByteSize) |
1936 | return false; |
1937 | |
// No dependence if the distance is not a multiple of the stride.
1939 | // E.g. |
1940 | // for (i = 0; i < 1024 ; i += 4) |
1941 | // A[i+2] = A[i] + 1; |
1942 | // |
1943 | // Two accesses in memory (distance is 2, stride is 4): |
1944 | // | A[0] | | | | A[4] | | | | |
1945 | // | | | A[2] | | | | A[6] | | |
1946 | // |
1947 | // E.g. |
1948 | // for (i = 0; i < 1024 ; i += 3) |
1949 | // A[i+4] = A[i] + 1; |
1950 | // |
1951 | // Two accesses in memory (distance is 4, stride is 3): |
1952 | // | A[0] | | | A[3] | | | A[6] | | | |
1953 | // | | | | | A[4] | | | A[7] | | |
1954 | return Distance % Stride; |
1955 | } |
1956 | |
1957 | std::variant<MemoryDepChecker::Dependence::DepType, |
1958 | MemoryDepChecker::DepDistanceStrideAndSizeInfo> |
1959 | MemoryDepChecker::getDependenceDistanceStrideAndSize( |
1960 | const AccessAnalysis::MemAccessInfo &A, Instruction *AInst, |
1961 | const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) { |
1962 | const auto &DL = InnermostLoop->getHeader()->getDataLayout(); |
1963 | auto &SE = *PSE.getSE(); |
1964 | const auto &[APtr, AIsWrite] = A; |
1965 | const auto &[BPtr, BIsWrite] = B; |
1966 | |
1967 | // Two reads are independent. |
1968 | if (!AIsWrite && !BIsWrite) |
1969 | return MemoryDepChecker::Dependence::NoDep; |
1970 | |
1971 | Type *ATy = getLoadStoreType(I: AInst); |
1972 | Type *BTy = getLoadStoreType(I: BInst); |
1973 | |
1974 | // We cannot check pointers in different address spaces. |
1975 | if (APtr->getType()->getPointerAddressSpace() != |
1976 | BPtr->getType()->getPointerAddressSpace()) |
1977 | return MemoryDepChecker::Dependence::Unknown; |
1978 | |
1979 | std::optional<int64_t> StrideAPtr = |
1980 | getPtrStride(PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true); |
1981 | std::optional<int64_t> StrideBPtr = |
1982 | getPtrStride(PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true); |
1983 | |
1984 | const SCEV *Src = PSE.getSCEV(V: APtr); |
1985 | const SCEV *Sink = PSE.getSCEV(V: BPtr); |
1986 | |
1987 | // If the induction step is negative we have to invert source and sink of the |
1988 | // dependence when measuring the distance between them. We should not swap |
1989 | // AIsWrite with BIsWrite, as their uses expect them in program order. |
1990 | if (StrideAPtr && *StrideAPtr < 0) { |
1991 | std::swap(a&: Src, b&: Sink); |
1992 | std::swap(a&: AInst, b&: BInst); |
1993 | std::swap(a&: ATy, b&: BTy); |
1994 | std::swap(lhs&: StrideAPtr, rhs&: StrideBPtr); |
1995 | } |
1996 | |
1997 | const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src); |
1998 | |
1999 | LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink |
2000 | << "\n" ); |
2001 | LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst |
2002 | << ": " << *Dist << "\n" ); |
2003 | |
2004 | // Check if we can prove that Sink only accesses memory after Src's end or |
2005 | // vice versa. At the moment this is limited to cases where either source or |
2006 | // sink are loop invariant to avoid compile-time increases. This is not |
2007 | // required for correctness. |
2008 | if (SE.isLoopInvariant(S: Src, L: InnermostLoop) || |
2009 | SE.isLoopInvariant(S: Sink, L: InnermostLoop)) { |
2010 | const SCEV *BTC = PSE.getBackedgeTakenCount(); |
2011 | const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); |
2012 | const auto &[SrcStart_, SrcEnd_] = |
2013 | getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Src, AccessTy: ATy, BTC, MaxBTC: SymbolicMaxBTC, |
2014 | SE: PSE.getSE(), PointerBounds: &PointerBounds); |
2015 | const auto &[SinkStart_, SinkEnd_] = |
2016 | getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Sink, AccessTy: BTy, BTC, MaxBTC: SymbolicMaxBTC, |
2017 | SE: PSE.getSE(), PointerBounds: &PointerBounds); |
2018 | if (!isa<SCEVCouldNotCompute>(Val: SrcStart_) && |
2019 | !isa<SCEVCouldNotCompute>(Val: SrcEnd_) && |
2020 | !isa<SCEVCouldNotCompute>(Val: SinkStart_) && |
2021 | !isa<SCEVCouldNotCompute>(Val: SinkEnd_)) { |
2022 | if (!LoopGuards) |
2023 | LoopGuards.emplace( |
2024 | args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE)); |
2025 | auto SrcEnd = SE.applyLoopGuards(Expr: SrcEnd_, Guards: *LoopGuards); |
2026 | auto SinkStart = SE.applyLoopGuards(Expr: SinkStart_, Guards: *LoopGuards); |
2027 | if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SrcEnd, RHS: SinkStart)) |
2028 | return MemoryDepChecker::Dependence::NoDep; |
2029 | |
2030 | auto SinkEnd = SE.applyLoopGuards(Expr: SinkEnd_, Guards: *LoopGuards); |
2031 | auto SrcStart = SE.applyLoopGuards(Expr: SrcStart_, Guards: *LoopGuards); |
2032 | if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SinkEnd, RHS: SrcStart)) |
2033 | return MemoryDepChecker::Dependence::NoDep; |
2034 | } |
2035 | } |
2036 | |
2037 | // Need accesses with constant strides and the same direction for further |
2038 | // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and |
2039 | // similar code or pointer arithmetic that could wrap in the address space. |
2040 | |
// If either Src or Sink is not strided (i.e. not a non-wrapping AddRec) and
// not loop-invariant (stride will be 0 in that case), we cannot analyze the
// dependence further and also cannot generate runtime checks.
2044 | if (!StrideAPtr || !StrideBPtr) { |
2045 | LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n" ); |
2046 | return MemoryDepChecker::Dependence::IndirectUnsafe; |
2047 | } |
2048 | |
2049 | int64_t StrideAPtrInt = *StrideAPtr; |
2050 | int64_t StrideBPtrInt = *StrideBPtr; |
2051 | LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt |
2052 | << " Sink induction step: " << StrideBPtrInt << "\n" ); |
// At least one of Src or Sink is loop invariant and the other is strided or
// invariant. We can generate a runtime check to disambiguate the accesses.
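// E.g. (illustrative): a loop-invariant store *q = 0 against a strided access
// A[i] can be disambiguated by checking at runtime that q does not fall
// within the byte range accessed through A in this loop.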
2055 | if (!StrideAPtrInt || !StrideBPtrInt) |
2056 | return MemoryDepChecker::Dependence::Unknown; |
2057 | |
2058 | // Both Src and Sink have a constant stride, check if they are in the same |
2059 | // direction. |
2060 | if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) { |
2061 | LLVM_DEBUG( |
2062 | dbgs() << "Pointer access with strides in different directions\n" ); |
2063 | return MemoryDepChecker::Dependence::Unknown; |
2064 | } |
2065 | |
2066 | TypeSize AStoreSz = DL.getTypeStoreSize(Ty: ATy); |
2067 | TypeSize BStoreSz = DL.getTypeStoreSize(Ty: BTy); |
2068 | |
2069 | // If store sizes are not the same, set TypeByteSize to zero, so we can check |
2070 | // it in the caller isDependent. |
2071 | uint64_t ASz = DL.getTypeAllocSize(Ty: ATy); |
2072 | uint64_t BSz = DL.getTypeAllocSize(Ty: BTy); |
2073 | uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0; |
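// E.g. (illustrative): an i32 access against an i64 access at related
// addresses has store sizes 4 and 8, so TypeByteSize is 0 and isDependent
// treats the pair conservatively.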
2074 | |
2075 | uint64_t StrideAScaled = std::abs(i: StrideAPtrInt) * ASz; |
2076 | uint64_t StrideBScaled = std::abs(i: StrideBPtrInt) * BSz; |
2077 | |
2078 | uint64_t MaxStride = std::max(a: StrideAScaled, b: StrideBScaled); |
2079 | |
2080 | std::optional<uint64_t> CommonStride; |
2081 | if (StrideAScaled == StrideBScaled) |
2082 | CommonStride = StrideAScaled; |
2083 | |
2084 | // TODO: Historically, we don't retry with runtime checks unless the |
2085 | // (unscaled) strides are the same. Fix this once the condition for runtime |
2086 | // checks in isDependent is fixed. |
2087 | bool ShouldRetryWithRuntimeCheck = StrideAPtrInt == StrideBPtrInt; |
2088 | |
2089 | return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride, |
2090 | ShouldRetryWithRuntimeCheck, TypeByteSize, |
2091 | AIsWrite, BIsWrite); |
2092 | } |
2093 | |
2094 | MemoryDepChecker::Dependence::DepType |
2095 | MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, |
2096 | const MemAccessInfo &B, unsigned BIdx) { |
2097 | assert(AIdx < BIdx && "Must pass arguments in program order" ); |
2098 | |
// Get the dependence distance, strides, type size and which accesses write,
// for the dependence between A and B.
2101 | auto Res = |
2102 | getDependenceDistanceStrideAndSize(A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx]); |
2103 | if (std::holds_alternative<Dependence::DepType>(v: Res)) |
2104 | return std::get<Dependence::DepType>(v&: Res); |
2105 | |
2106 | auto &[Dist, MaxStride, CommonStride, ShouldRetryWithRuntimeCheck, |
2107 | TypeByteSize, AIsWrite, BIsWrite] = |
2108 | std::get<DepDistanceStrideAndSizeInfo>(v&: Res); |
2109 | bool HasSameSize = TypeByteSize > 0; |
2110 | |
2111 | if (isa<SCEVCouldNotCompute>(Val: Dist)) { |
2112 | // TODO: Relax requirement that there is a common unscaled stride to retry |
2113 | // with non-constant distance dependencies. |
2114 | FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; |
2115 | LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n" ); |
2116 | return Dependence::Unknown; |
2117 | } |
2118 | |
2119 | ScalarEvolution &SE = *PSE.getSE(); |
2120 | auto &DL = InnermostLoop->getHeader()->getDataLayout(); |
2121 | |
// If the distance between the accesses is larger than their maximum absolute
// stride multiplied by the symbolic maximum backedge taken count (which is an
// upper bound of the number of iterations), the accesses are independent, i.e.
// they are far enough apart that they won't access the same location across
// all loop iterations.
2127 | if (HasSameSize && |
2128 | isSafeDependenceDistance( |
2129 | DL, SE, MaxBTC: *(PSE.getSymbolicMaxBackedgeTakenCount()), Dist: *Dist, MaxStride)) |
2130 | return Dependence::NoDep; |
2131 | |
2132 | // The rest of this function relies on ConstDist being at most 64-bits, which |
2133 | // is checked earlier. Will assert if the calling code changes. |
2134 | const APInt *APDist = nullptr; |
2135 | uint64_t ConstDist = |
2136 | match(S: Dist, P: m_scev_APInt(C&: APDist)) ? APDist->abs().getZExtValue() : 0; |
2137 | |
2138 | // Attempt to prove strided accesses independent. |
2139 | if (APDist) { |
2140 | // If the distance between accesses and their strides are known constants, |
2141 | // check whether the accesses interlace each other. |
2142 | if (ConstDist > 0 && CommonStride && CommonStride > 1 && HasSameSize && |
2143 | areStridedAccessesIndependent(Distance: ConstDist, Stride: *CommonStride, TypeByteSize)) { |
2144 | LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n" ); |
2145 | return Dependence::NoDep; |
2146 | } |
2147 | } else { |
2148 | if (!LoopGuards) |
2149 | LoopGuards.emplace( |
2150 | args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE)); |
2151 | Dist = SE.applyLoopGuards(Expr: Dist, Guards: *LoopGuards); |
2152 | } |
2153 | |
2154 | // Negative distances are not plausible dependencies. |
2155 | if (SE.isKnownNonPositive(S: Dist)) { |
2156 | if (SE.isKnownNonNegative(S: Dist)) { |
2157 | if (HasSameSize) { |
2158 | // Write to the same location with the same size. |
2159 | return Dependence::Forward; |
2160 | } |
2161 | LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but " |
2162 | "different type sizes\n" ); |
2163 | return Dependence::Unknown; |
2164 | } |
2165 | |
2166 | bool IsTrueDataDependence = (AIsWrite && !BIsWrite); |
2167 | // Check if the first access writes to a location that is read in a later |
2168 | // iteration, where the distance between them is not a multiple of a vector |
2169 | // factor and relatively small. |
2170 | // |
2171 | // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to |
2172 | // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a |
2173 | // forward dependency will allow vectorization using any width. |
2174 | |
2175 | if (IsTrueDataDependence && EnableForwardingConflictDetection) { |
2176 | if (!ConstDist) { |
2177 | // TODO: FoundNonConstantDistanceDependence is used as a necessary |
2178 | // condition to consider retrying with runtime checks. Historically, we |
2179 | // did not set it when strides were different but there is no inherent |
2180 | // reason to. |
2181 | FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; |
2182 | return Dependence::Unknown; |
2183 | } |
2184 | if (!HasSameSize || |
2185 | couldPreventStoreLoadForward(Distance: ConstDist, TypeByteSize)) { |
2186 | LLVM_DEBUG( |
2187 | dbgs() << "LAA: Forward but may prevent st->ld forwarding\n" ); |
2188 | return Dependence::ForwardButPreventsForwarding; |
2189 | } |
2190 | } |
2191 | |
2192 | LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n" ); |
2193 | return Dependence::Forward; |
2194 | } |
2195 | |
2196 | int64_t MinDistance = SE.getSignedRangeMin(S: Dist).getSExtValue(); |
2197 | // Below we only handle strictly positive distances. |
2198 | if (MinDistance <= 0) { |
2199 | FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; |
2200 | return Dependence::Unknown; |
2201 | } |
2202 | |
2203 | if (!ConstDist) { |
2204 | // Previously this case would be treated as Unknown, possibly setting |
2205 | // FoundNonConstantDistanceDependence to force re-trying with runtime |
2206 | // checks. Until the TODO below is addressed, set it here to preserve |
2207 | // original behavior w.r.t. re-trying with runtime checks. |
2208 | // TODO: FoundNonConstantDistanceDependence is used as a necessary |
2209 | // condition to consider retrying with runtime checks. Historically, we |
2210 | // did not set it when strides were different but there is no inherent |
2211 | // reason to. |
2212 | FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; |
2213 | } |
2214 | |
2215 | if (!HasSameSize) { |
2216 | LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with " |
2217 | "different type sizes\n" ); |
2218 | return Dependence::Unknown; |
2219 | } |
2220 | // Bail out early if passed-in parameters make vectorization not feasible. |
2221 | unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? |
2222 | VectorizerParams::VectorizationFactor : 1); |
2223 | unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? |
2224 | VectorizerParams::VectorizationInterleave : 1); |
2225 | // The minimum number of iterations for a vectorized/unrolled version. |
2226 | unsigned MinNumIter = std::max(a: ForcedFactor * ForcedUnroll, b: 2U); |
2227 | |
// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectorized/unrolled version. Vectorizing one iteration in
// front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
// (There is no need to add the last gap distance.)
2232 | // |
2233 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
2234 | // foo(int *A) { |
2235 | // int *B = (int *)((char *)A + 14); |
2236 | // for (i = 0 ; i < 1024 ; i += 2) |
2237 | // B[i] = A[i] + 1; |
2238 | // } |
2239 | // |
2240 | // Two accesses in memory (stride is 4 * 2): |
2241 | // | A[0] | | A[2] | | A[4] | | A[6] | | |
2242 | // | B[0] | | B[2] | | B[4] | |
2243 | // |
// The distance needed for vectorizing iterations except the last iteration:
// 4 * 2 * (MinNumIter - 1). The distance needed for the last iteration: 4.
2246 | // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4. |
2247 | // |
2248 | // If MinNumIter is 2, it is vectorizable as the minimum distance needed is |
2249 | // 12, which is less than distance. |
2250 | // |
2251 | // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4), |
2252 | // the minimum distance needed is 28, which is greater than distance. It is |
2253 | // not safe to do vectorization. |
2254 | // |
2255 | // We use MaxStride (maximum of src and sink strides) to get a conservative |
2256 | // lower bound on the MinDistanceNeeded in case of different strides. |
2257 | |
2258 | // We know that Dist is positive, but it may not be constant. Use the signed |
2259 | // minimum for computations below, as this ensures we compute the closest |
2260 | // possible dependence distance. |
2261 | uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize; |
2262 | if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) { |
2263 | if (!ConstDist) { |
2264 | // For non-constant distances, we checked the lower bound of the |
2265 | // dependence distance and the distance may be larger at runtime (and safe |
2266 | // for vectorization). Classify it as Unknown, so we re-try with runtime |
2267 | // checks. |
2268 | return Dependence::Unknown; |
2269 | } |
2270 | LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance " |
2271 | << MinDistance << '\n'); |
2272 | return Dependence::Backward; |
2273 | } |
2274 | |
// Unsafe if the minimum distance needed is greater than the smallest
// dependence distance.
2277 | if (MinDistanceNeeded > MinDepDistBytes) { |
2278 | LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least " |
2279 | << MinDistanceNeeded << " size in bytes\n" ); |
2280 | return Dependence::Backward; |
2281 | } |
2282 | |
2283 | // Positive distance bigger than max vectorization factor. |
// FIXME: Should use max factor instead of max distance in bytes, which
// cannot handle different types.
2286 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
2287 | // void foo (int *A, char *B) { |
2288 | // for (unsigned i = 0; i < 1024; i++) { |
2289 | // A[i+2] = A[i] + 1; |
2290 | // B[i+2] = B[i] + 1; |
2291 | // } |
2292 | // } |
2293 | // |
2294 | // This case is currently unsafe according to the max safe distance. If we |
2295 | // analyze the two accesses on array B, the max safe dependence distance |
// is 2. Then we analyze the accesses on array A, where the minimum distance
// needed is 8, which is greater than 2 and thus forbids vectorization. But
// actually both A and B could be vectorized with a VF of 2.
2299 | MinDepDistBytes = |
2300 | std::min(a: static_cast<uint64_t>(MinDistance), b: MinDepDistBytes); |
2301 | |
2302 | bool IsTrueDataDependence = (!AIsWrite && BIsWrite); |
2303 | if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist && |
2304 | couldPreventStoreLoadForward(Distance: MinDistance, TypeByteSize, CommonStride: *CommonStride)) |
2305 | return Dependence::BackwardVectorizableButPreventsForwarding; |
2306 | |
2307 | uint64_t MaxVF = MinDepDistBytes / MaxStride; |
2308 | LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance |
2309 | << " with max VF = " << MaxVF << '\n'); |
2310 | |
2311 | uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; |
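// E.g. (illustrative): a backward dependence with MinDepDistBytes = 16 over
// i32 accesses with a common stride of 1 (MaxStride = 4 bytes) gives
// MaxVF = 4 lanes, i.e. MaxVFInBits = 128.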
2312 | if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) { |
2313 | // For non-constant distances, we checked the lower bound of the dependence |
2314 | // distance and the distance may be larger at runtime (and safe for |
2315 | // vectorization). Classify it as Unknown, so we re-try with runtime checks. |
2316 | return Dependence::Unknown; |
2317 | } |
2318 | |
2319 | MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits); |
2320 | return Dependence::BackwardVectorizable; |
2321 | } |
2322 | |
2323 | bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands, |
2324 | const MemAccessInfoList &CheckDeps) { |
2325 | |
2326 | MinDepDistBytes = -1; |
2327 | SmallPtrSet<MemAccessInfo, 8> Visited; |
2328 | for (MemAccessInfo CurAccess : CheckDeps) { |
2329 | if (Visited.contains(Ptr: CurAccess)) |
2330 | continue; |
2331 | |
2332 | // Check accesses within this set. |
2333 | EquivalenceClasses<MemAccessInfo>::member_iterator AI = |
2334 | DepCands.findLeader(V: CurAccess); |
2335 | EquivalenceClasses<MemAccessInfo>::member_iterator AE = |
2336 | DepCands.member_end(); |
2337 | |
2338 | // Check every access pair. |
2339 | while (AI != AE) { |
2340 | Visited.insert(Ptr: *AI); |
2341 | bool AIIsWrite = AI->getInt(); |
2342 | // Check loads only against next equivalent class, but stores also against |
2343 | // other stores in the same equivalence class - to the same address. |
2344 | EquivalenceClasses<MemAccessInfo>::member_iterator OI = |
2345 | (AIIsWrite ? AI : std::next(x: AI)); |
2346 | while (OI != AE) { |
2347 | // Check every accessing instruction pair in program order. |
2348 | auto &Acc = Accesses[*AI]; |
2349 | for (std::vector<unsigned>::iterator I1 = Acc.begin(), I1E = Acc.end(); |
2350 | I1 != I1E; ++I1) |
2351 | // Scan all accesses of another equivalence class, but only the next |
2352 | // accesses of the same equivalent class. |
2353 | for (std::vector<unsigned>::iterator |
2354 | I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()), |
2355 | I2E = (OI == AI ? I1E : Accesses[*OI].end()); |
2356 | I2 != I2E; ++I2) { |
2357 | auto A = std::make_pair(x: &*AI, y&: *I1); |
2358 | auto B = std::make_pair(x: &*OI, y&: *I2); |
2359 | |
2360 | assert(*I1 != *I2); |
2361 | if (*I1 > *I2) |
2362 | std::swap(x&: A, y&: B); |
2363 | |
2364 | Dependence::DepType Type = |
2365 | isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second); |
2366 | mergeInStatus(S: Dependence::isSafeForVectorization(Type)); |
2367 | |
2368 | // Gather dependences unless we accumulated MaxDependences |
2369 | // dependences. In that case return as soon as we find the first |
2370 | // unsafe dependence. This puts a limit on this quadratic |
2371 | // algorithm. |
2372 | if (RecordDependences) { |
2373 | if (Type != Dependence::NoDep) |
2374 | Dependences.emplace_back(Args&: A.second, Args&: B.second, Args&: Type); |
2375 | |
2376 | if (Dependences.size() >= MaxDependences) { |
2377 | RecordDependences = false; |
2378 | Dependences.clear(); |
2379 | LLVM_DEBUG(dbgs() |
2380 | << "Too many dependences, stopped recording\n" ); |
2381 | } |
2382 | } |
2383 | if (!RecordDependences && !isSafeForVectorization()) |
2384 | return false; |
2385 | } |
2386 | ++OI; |
2387 | } |
2388 | ++AI; |
2389 | } |
2390 | } |
2391 | |
2392 | LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n" ); |
2393 | return isSafeForVectorization(); |
2394 | } |
2395 | |
2396 | SmallVector<Instruction *, 4> |
2397 | MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const { |
2398 | MemAccessInfo Access(Ptr, IsWrite); |
2399 | auto &IndexVector = Accesses.find(Val: Access)->second; |
2400 | |
2401 | SmallVector<Instruction *, 4> Insts; |
2402 | transform(Range: IndexVector, |
2403 | d_first: std::back_inserter(x&: Insts), |
2404 | F: [&](unsigned Idx) { return this->InstMap[Idx]; }); |
2405 | return Insts; |
2406 | } |
2407 | |
2408 | const char *MemoryDepChecker::Dependence::DepName[] = { |
2409 | "NoDep" , |
2410 | "Unknown" , |
2411 | "IndirectUnsafe" , |
2412 | "Forward" , |
2413 | "ForwardButPreventsForwarding" , |
2414 | "Backward" , |
2415 | "BackwardVectorizable" , |
2416 | "BackwardVectorizableButPreventsForwarding" }; |
2417 | |
2418 | void MemoryDepChecker::Dependence::print( |
2419 | raw_ostream &OS, unsigned Depth, |
2420 | const SmallVectorImpl<Instruction *> &Instrs) const { |
2421 | OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n" ; |
2422 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n" ; |
2423 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n" ; |
2424 | } |
2425 | |
2426 | bool LoopAccessInfo::canAnalyzeLoop() { |
2427 | // We need to have a loop header. |
2428 | LLVM_DEBUG(dbgs() << "\nLAA: Checking a loop in '" |
2429 | << TheLoop->getHeader()->getParent()->getName() << "' from " |
2430 | << TheLoop->getLocStr() << "\n" ); |
2431 | |
2432 | // We can only analyze innermost loops. |
2433 | if (!TheLoop->isInnermost()) { |
2434 | LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n" ); |
2435 | recordAnalysis(RemarkName: "NotInnerMostLoop" ) << "loop is not the innermost loop" ; |
2436 | return false; |
2437 | } |
2438 | |
2439 | // We must have a single backedge. |
2440 | if (TheLoop->getNumBackEdges() != 1) { |
2441 | LLVM_DEBUG( |
2442 | dbgs() << "LAA: loop control flow is not understood by analyzer\n" ); |
2443 | recordAnalysis(RemarkName: "CFGNotUnderstood" ) |
2444 | << "loop control flow is not understood by analyzer" ; |
2445 | return false; |
2446 | } |
2447 | |
2448 | // ScalarEvolution needs to be able to find the symbolic max backedge taken |
2449 | // count, which is an upper bound on the number of loop iterations. The loop |
2450 | // may execute fewer iterations, if it exits via an uncountable exit. |
2451 | const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount(); |
2452 | if (isa<SCEVCouldNotCompute>(Val: ExitCount)) { |
2453 | recordAnalysis(RemarkName: "CantComputeNumberOfIterations" ) |
2454 | << "could not determine number of loop iterations" ; |
2455 | LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n" ); |
2456 | return false; |
2457 | } |
2458 | |
2459 | LLVM_DEBUG(dbgs() << "LAA: Found an analyzable loop: " |
2460 | << TheLoop->getHeader()->getName() << "\n" ); |
2461 | return true; |
2462 | } |
2463 | |
2464 | bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, |
2465 | const TargetLibraryInfo *TLI, |
2466 | DominatorTree *DT) { |
2467 | // Holds the Load and Store instructions. |
2468 | SmallVector<LoadInst *, 16> Loads; |
2469 | SmallVector<StoreInst *, 16> Stores; |
2470 | SmallPtrSet<MDNode *, 8> LoopAliasScopes; |
2471 | |
2472 | // Holds all the different accesses in the loop. |
2473 | unsigned NumReads = 0; |
2474 | unsigned NumReadWrites = 0; |
2475 | |
2476 | bool HasComplexMemInst = false; |
2477 | |
2478 | // A runtime check is only legal to insert if there are no convergent calls. |
2479 | HasConvergentOp = false; |
2480 | |
2481 | PtrRtChecking->Pointers.clear(); |
2482 | PtrRtChecking->Need = false; |
2483 | |
2484 | const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); |
2485 | |
2486 | const bool EnableMemAccessVersioningOfLoop = |
2487 | EnableMemAccessVersioning && |
2488 | !TheLoop->getHeader()->getParent()->hasOptSize(); |
2489 | |
2490 | // Traverse blocks in fixed RPOT order, regardless of their storage in the |
2491 | // loop info, as it may be arbitrary. |
2492 | LoopBlocksRPO RPOT(TheLoop); |
2493 | RPOT.perform(LI); |
2494 | for (BasicBlock *BB : RPOT) { |
2495 | // Scan the BB and collect legal loads and stores. Also detect any |
2496 | // convergent instructions. |
2497 | for (Instruction &I : *BB) { |
2498 | if (auto *Call = dyn_cast<CallBase>(Val: &I)) { |
2499 | if (Call->isConvergent()) |
2500 | HasConvergentOp = true; |
2501 | } |
2502 | |
// If we have found both a non-vectorizable memory instruction and a
// convergent operation in this loop, there is no reason to continue the
// search.
2505 | if (HasComplexMemInst && HasConvergentOp) |
2506 | return false; |
2507 | |
2508 | // Avoid hitting recordAnalysis multiple times. |
2509 | if (HasComplexMemInst) |
2510 | continue; |
2511 | |
2512 | // Record alias scopes defined inside the loop. |
2513 | if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I)) |
2514 | for (Metadata *Op : Decl->getScopeList()->operands()) |
2515 | LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op)); |
2516 | |
2517 | // Many math library functions read the rounding mode. We will only |
2518 | // vectorize a loop if it contains known function calls that don't set |
2519 | // the flag. Therefore, it is safe to ignore this read from memory. |
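// Illustrative example (assumed IR): a libm call such as
//   %r = call float @sinf(float %x)
// may be recognized through TLI as the llvm.sin intrinsic, so it is skipped
// here even though it is conservatively marked as reading memory.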
2520 | auto *Call = dyn_cast<CallInst>(Val: &I); |
2521 | if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI)) |
2522 | continue; |
2523 | |
2524 | // If this is a load, save it. If this instruction can read from memory |
2525 | // but is not a load, we only allow it if it's a call to a function with a |
2526 | // vector mapping and no pointer arguments. |
2527 | if (I.mayReadFromMemory()) { |
2528 | auto hasPointerArgs = [](CallBase *CB) { |
2529 | return any_of(Range: CB->args(), P: [](Value const *Arg) { |
2530 | return Arg->getType()->isPointerTy(); |
2531 | }); |
2532 | }; |
2533 | |
2534 | // If the function has an explicit vectorized counterpart, and does not |
2535 | // take output/input pointers, we can safely assume that it can be |
2536 | // vectorized. |
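// Illustrative sketch (assumed function name): a call
//   %r = call float @foo(float %x)
// where @foo carries a "vector-function-abi-variant" mapping (e.g. from
// "#pragma omp declare simd") and has no pointer arguments is accepted here
// instead of being treated as a complex memory instruction.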
2537 | if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() && |
2538 | !hasPointerArgs(Call) && !VFDatabase::getMappings(CI: *Call).empty()) |
2539 | continue; |
2540 | |
2541 | auto *Ld = dyn_cast<LoadInst>(Val: &I); |
2542 | if (!Ld) { |
2543 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: Ld) |
2544 | << "instruction cannot be vectorized" ; |
2545 | HasComplexMemInst = true; |
2546 | continue; |
2547 | } |
2548 | if (!Ld->isSimple() && !IsAnnotatedParallel) { |
2549 | recordAnalysis(RemarkName: "NonSimpleLoad" , Instr: Ld) |
2550 | << "read with atomic ordering or volatile read" ; |
2551 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n" ); |
2552 | HasComplexMemInst = true; |
2553 | continue; |
2554 | } |
2555 | NumLoads++; |
2556 | Loads.push_back(Elt: Ld); |
2557 | DepChecker->addAccess(LI: Ld); |
2558 | if (EnableMemAccessVersioningOfLoop) |
2559 | collectStridedAccess(LoadOrStoreInst: Ld); |
2560 | continue; |
2561 | } |
2562 | |
2563 | // Save 'store' instructions. Abort if other instructions write to memory. |
2564 | if (I.mayWriteToMemory()) { |
2565 | auto *St = dyn_cast<StoreInst>(Val: &I); |
2566 | if (!St) { |
2567 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: St) |
2568 | << "instruction cannot be vectorized" ; |
2569 | HasComplexMemInst = true; |
2570 | continue; |
2571 | } |
2572 | if (!St->isSimple() && !IsAnnotatedParallel) { |
2573 | recordAnalysis(RemarkName: "NonSimpleStore" , Instr: St) |
2574 | << "write with atomic ordering or volatile write" ; |
2575 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n" ); |
2576 | HasComplexMemInst = true; |
2577 | continue; |
2578 | } |
2579 | NumStores++; |
2580 | Stores.push_back(Elt: St); |
2581 | DepChecker->addAccess(SI: St); |
2582 | if (EnableMemAccessVersioningOfLoop) |
2583 | collectStridedAccess(LoadOrStoreInst: St); |
2584 | } |
2585 | } // Next instr. |
2586 | } // Next block. |
2587 | |
2588 | if (HasComplexMemInst) |
2589 | return false; |
2590 | |
2591 | // Now we have two lists that hold the loads and the stores. |
2592 | // Next, we find the pointers that they use. |
2593 | |
2594 | // Check if we see any stores. If there are no stores, then we don't |
2595 | // care if the pointers are *restrict*. |
2596 | if (!Stores.size()) { |
2597 | LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n" ); |
2598 | return true; |
2599 | } |
2600 | |
2601 | MemoryDepChecker::DepCandidates DepCands; |
2602 | AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes); |
2603 | |
2604 | // Holds the analyzed pointers. We don't want to call getUnderlyingObjects |
2605 | // multiple times on the same object. If the ptr is accessed twice, once |
2606 | // for read and once for write, it will only appear once (on the write |
2607 | // list). This is okay, since we are going to check for conflicts between |
2608 | // writes and between reads and writes, but not between reads and reads. |
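// Illustrative example: if the loop contains both
//   store i32 %v, ptr %p   and   %x = load i32, ptr %p
// then {%p, i32} is recorded once via the store, so the load is later not
// classified as read-only (assuming its stride is computable).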
2609 | SmallSet<std::pair<Value *, Type *>, 16> Seen; |
2610 | |
2611 | // Record uniform store addresses to identify if we have multiple stores |
2612 | // to the same address. |
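// A small sketch (not from the source): in
//   for (i = 0; i < n; ++i) { *q = a[i]; *q = b[i]; }
// the second store to the invariant address q fails the insert in the store
// loop below, which flags a store-store dependence involving a loop-invariant
// address.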
2613 | SmallPtrSet<Value *, 16> UniformStores; |
2614 | |
2615 | for (StoreInst *ST : Stores) { |
2616 | Value *Ptr = ST->getPointerOperand(); |
2617 | |
2618 | if (isInvariant(V: Ptr)) { |
2619 | // Record store instructions to loop invariant addresses |
2620 | StoresToInvariantAddresses.push_back(Elt: ST); |
2621 | HasStoreStoreDependenceInvolvingLoopInvariantAddress |= |
2622 | !UniformStores.insert(Ptr).second; |
2623 | } |
2624 | |
2625 | // If we did *not* see this pointer before, insert it to the read-write |
2626 | // list. At this phase it is only a 'write' list. |
2627 | Type *AccessTy = getLoadStoreType(I: ST); |
2628 | if (Seen.insert(V: {Ptr, AccessTy}).second) { |
2629 | ++NumReadWrites; |
2630 | |
2631 | MemoryLocation Loc = MemoryLocation::get(SI: ST); |
2632 | // The TBAA metadata could have a control dependency on the predication |
2633 | // condition, so we cannot rely on it when determining whether or not we |
2634 | // need runtime pointer checks. |
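// Illustrative example: for a predicated store such as
//   for (i...) { if (c[i]) p[i] = x; }   // store carries !tbaa
// the store's block is predicated, so its TBAA tag is not trusted when
// deciding whether runtime pointer checks are required.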
2635 | if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT)) |
2636 | Loc.AATags.TBAA = nullptr; |
2637 | |
2638 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
2639 | AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) { |
2640 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
2641 | Accesses.addStore(Loc: NewLoc, AccessTy); |
2642 | }); |
2643 | } |
2644 | } |
2645 | |
2646 | if (IsAnnotatedParallel) { |
2647 | LLVM_DEBUG( |
2648 | dbgs() << "LAA: A loop annotated parallel, ignore memory dependency " |
2649 | << "checks.\n" ); |
2650 | return true; |
2651 | } |
2652 | |
2653 | for (LoadInst *LD : Loads) { |
2654 | Value *Ptr = LD->getPointerOperand(); |
2655 | // If we did *not* see this pointer before, insert it into the
2656 | // read list. If we *did* see it before, then it is already in
2657 | // the read-write list. This allows us to vectorize expressions
2658 | // such as A[i] += x, because the address of A[i] is a read-write
2659 | // pointer. This only works if the index of A[i] is consecutive.
2660 | // If the accessed address is unknown (for example A[B[i]]), then we may
2661 | // read a few words, modify, and write a few words, and some of the
2662 | // words may be written to the same address.
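// Illustrative contrast (not from the source):
//   for (i...) A[i] += x;      // consecutive read-write pointer: analyzable
//   for (i...) A[B[i]] += x;   // indirect address: reads and writes may overlap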
2663 | bool IsReadOnlyPtr = false; |
2664 | Type *AccessTy = getLoadStoreType(I: LD); |
2665 | if (Seen.insert(V: {Ptr, AccessTy}).second || |
2666 | !getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, StridesMap: SymbolicStrides)) { |
2667 | ++NumReads; |
2668 | IsReadOnlyPtr = true; |
2669 | } |
2670 | |
2671 | // See if there is an unsafe dependency between a load from a uniform address
2672 | // and a store to the same uniform address.
2673 | if (UniformStores.contains(Ptr)) { |
2674 | LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform " |
2675 | "load and uniform store to the same address!\n" ); |
2676 | HasLoadStoreDependenceInvolvingLoopInvariantAddress = true; |
2677 | } |
2678 | |
2679 | MemoryLocation Loc = MemoryLocation::get(LI: LD); |
2680 | // The TBAA metadata could have a control dependency on the predication |
2681 | // condition, so we cannot rely on it when determining whether or not we |
2682 | // need runtime pointer checks. |
2683 | if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT)) |
2684 | Loc.AATags.TBAA = nullptr; |
2685 | |
2686 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
2687 | AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) { |
2688 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
2689 | Accesses.addLoad(Loc: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr); |
2690 | }); |
2691 | } |
2692 | |
2693 | // If we write (or read-write) to a single destination and there are no
2694 | // other reads in this loop then it is safe to vectorize.
2695 | if (NumReadWrites == 1 && NumReads == 0) { |
2696 | LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n" ); |
2697 | return true; |
2698 | } |
2699 | |
2700 | // Build dependence sets and check whether we need a runtime pointer bounds |
2701 | // check. |
2702 | Accesses.buildDependenceSets(); |
2703 | |
2704 | // Find pointers with computable bounds. We are going to use this information |
2705 | // to place a runtime bound check. |
2706 | Value *UncomputablePtr = nullptr; |
2707 | HasCompletePtrRtChecking = Accesses.canCheckPtrAtRT( |
2708 | RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, UncomputablePtr, AllowPartial); |
2709 | if (!HasCompletePtrRtChecking) { |
2710 | const auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr); |
2711 | recordAnalysis(RemarkName: "CantIdentifyArrayBounds" , Instr: I) |
2712 | << "cannot identify array bounds" ; |
2713 | LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " |
2714 | << "the array bounds.\n" ); |
2715 | return false; |
2716 | } |
2717 | |
2718 | LLVM_DEBUG( |
2719 | dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n" ); |
2720 | |
2721 | bool DepsAreSafe = true; |
2722 | if (Accesses.isDependencyCheckNeeded()) { |
2723 | LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n" ); |
2724 | DepsAreSafe = |
2725 | DepChecker->areDepsSafe(DepCands, CheckDeps: Accesses.getDependenciesToCheck()); |
2726 | |
2727 | if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) { |
2728 | LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n" ); |
2729 | |
2730 | // Clear the dependency checks. We assume they are not needed. |
2731 | Accesses.resetDepChecks(DepChecker&: *DepChecker); |
2732 | |
2733 | PtrRtChecking->reset(); |
2734 | PtrRtChecking->Need = true; |
2735 | |
2736 | UncomputablePtr = nullptr; |
2737 | HasCompletePtrRtChecking = |
2738 | Accesses.canCheckPtrAtRT(RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, |
2739 | UncomputablePtr, AllowPartial); |
2740 | |
2741 | // Check that we found the bounds for the pointer. |
2742 | if (!HasCompletePtrRtChecking) { |
2743 | auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr); |
2744 | recordAnalysis(RemarkName: "CantCheckMemDepsAtRunTime" , Instr: I) |
2745 | << "cannot check memory dependencies at runtime" ; |
2746 | LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n" ); |
2747 | return false; |
2748 | } |
2749 | DepsAreSafe = true; |
2750 | } |
2751 | } |
2752 | |
2753 | if (HasConvergentOp) { |
2754 | recordAnalysis(RemarkName: "CantInsertRuntimeCheckWithConvergent" ) |
2755 | << "cannot add control dependency to convergent operation" ; |
2756 | LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check " |
2757 | "would be needed with a convergent operation\n" ); |
2758 | return false; |
2759 | } |
2760 | |
2761 | if (DepsAreSafe) { |
2762 | LLVM_DEBUG( |
2763 | dbgs() << "LAA: No unsafe dependent memory operations in loop. We" |
2764 | << (PtrRtChecking->Need ? "" : " don't" ) |
2765 | << " need runtime memory checks.\n" ); |
2766 | return true; |
2767 | } |
2768 | |
2769 | emitUnsafeDependenceRemark(); |
2770 | return false; |
2771 | } |
2772 | |
2773 | void LoopAccessInfo::emitUnsafeDependenceRemark() {
2774 | const auto *Deps = getDepChecker().getDependences(); |
2775 | if (!Deps) |
2776 | return; |
2777 | const auto *Found = |
2778 | llvm::find_if(Range: *Deps, P: [](const MemoryDepChecker::Dependence &D) { |
2779 | return MemoryDepChecker::Dependence::isSafeForVectorization(Type: D.Type) != |
2780 | MemoryDepChecker::VectorizationSafetyStatus::Safe; |
2781 | }); |
2782 | if (Found == Deps->end()) |
2783 | return; |
2784 | MemoryDepChecker::Dependence Dep = *Found; |
2785 | |
2786 | LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n" ); |
2787 | |
2788 | // Emit remark for first unsafe dependence |
2789 | bool HasForcedDistribution = false; |
2790 | std::optional<const MDOperand *> Value = |
2791 | findStringMetadataForLoop(TheLoop, Name: "llvm.loop.distribute.enable" ); |
2792 | if (Value) { |
2793 | const MDOperand *Op = *Value; |
2794 | assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata" ); |
2795 | HasForcedDistribution = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue(); |
2796 | } |
2797 | |
2798 | const std::string Info = |
2799 | HasForcedDistribution |
2800 | ? "unsafe dependent memory operations in loop." |
2801 | : "unsafe dependent memory operations in loop. Use " |
2802 | "#pragma clang loop distribute(enable) to allow loop distribution " |
2803 | "to attempt to isolate the offending operations into a separate " |
2804 | "loop" ; |
2805 | OptimizationRemarkAnalysis &R = |
2806 | recordAnalysis(RemarkName: "UnsafeDep" , Instr: Dep.getDestination(DepChecker: getDepChecker())) << Info; |
2807 | |
2808 | switch (Dep.Type) { |
2809 | case MemoryDepChecker::Dependence::NoDep: |
2810 | case MemoryDepChecker::Dependence::Forward: |
2811 | case MemoryDepChecker::Dependence::BackwardVectorizable: |
2812 | llvm_unreachable("Unexpected dependence" ); |
2813 | case MemoryDepChecker::Dependence::Backward: |
2814 | R << "\nBackward loop carried data dependence." ; |
2815 | break; |
2816 | case MemoryDepChecker::Dependence::ForwardButPreventsForwarding: |
2817 | R << "\nForward loop carried data dependence that prevents " |
2818 | "store-to-load forwarding." ; |
2819 | break; |
2820 | case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding: |
2821 | R << "\nBackward loop carried data dependence that prevents " |
2822 | "store-to-load forwarding." ; |
2823 | break; |
2824 | case MemoryDepChecker::Dependence::IndirectUnsafe: |
2825 | R << "\nUnsafe indirect dependence." ; |
2826 | break; |
2827 | case MemoryDepChecker::Dependence::Unknown: |
2828 | R << "\nUnknown data dependence." ; |
2829 | break; |
2830 | } |
2831 | |
2832 | if (Instruction *I = Dep.getSource(DepChecker: getDepChecker())) { |
2833 | DebugLoc SourceLoc = I->getDebugLoc(); |
2834 | if (auto *DD = dyn_cast_or_null<Instruction>(Val: getPointerOperand(V: I))) |
2835 | SourceLoc = DD->getDebugLoc(); |
2836 | if (SourceLoc) |
2837 | R << " Memory location is the same as accessed at " |
2838 | << ore::NV("Location" , SourceLoc); |
2839 | } |
2840 | } |
2841 | |
2842 | bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, |
2843 | DominatorTree *DT) { |
2844 | assert(TheLoop->contains(BB) && "Unknown block used" ); |
2845 | |
2846 | // Blocks that do not dominate the latch need predication. |
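// Illustrative example: in "for (...) { if (c) { S; } T; }", S's block does
// not dominate the latch and needs predication, whereas T's block does
// dominate it and does not.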
2847 | const BasicBlock *Latch = TheLoop->getLoopLatch(); |
2848 | return !DT->dominates(A: BB, B: Latch); |
2849 | } |
2850 | |
2851 | OptimizationRemarkAnalysis & |
2852 | LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
2853 | assert(!Report && "Multiple reports generated" ); |
2854 | |
2855 | const BasicBlock *CodeRegion = TheLoop->getHeader(); |
2856 | DebugLoc DL = TheLoop->getStartLoc(); |
2857 | |
2858 | if (I) { |
2859 | CodeRegion = I->getParent(); |
2860 | // If there is no debug location attached to the instruction, fall back to
2861 | // the loop's start location.
2862 | if (I->getDebugLoc()) |
2863 | DL = I->getDebugLoc(); |
2864 | } |
2865 | |
2866 | Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, args&: RemarkName, |
2867 | args&: DL, args&: CodeRegion); |
2868 | return *Report; |
2869 | } |
2870 | |
2871 | bool LoopAccessInfo::isInvariant(Value *V) const { |
2872 | auto *SE = PSE->getSE(); |
2873 | if (TheLoop->isLoopInvariant(V)) |
2874 | return true; |
2875 | if (!SE->isSCEVable(Ty: V->getType())) |
2876 | return false; |
2877 | const SCEV *S = SE->getSCEV(V); |
2878 | return SE->isLoopInvariant(S, L: TheLoop); |
2879 | } |
2880 | |
2881 | /// If \p Ptr is a GEP, which has a loop-variant operand, return that operand. |
2882 | /// Otherwise, return \p Ptr. |
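/// Illustrative example: for "%gep = getelementptr i32, ptr %A, i64 %iv" with
/// a loop-invariant %A and loop-variant %iv, this returns %iv; if more than
/// one operand is loop-variant, \p Ptr is returned unchanged.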
2883 | static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE, |
2884 | Loop *Lp) { |
2885 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
2886 | if (!GEP) |
2887 | return Ptr; |
2888 | |
2889 | Value *V = Ptr; |
2890 | for (const Use &U : GEP->operands()) { |
2891 | if (!SE->isLoopInvariant(S: SE->getSCEV(V: U), L: Lp)) { |
2892 | if (V == Ptr) |
2893 | V = U; |
2894 | else |
2895 | // There must be exactly one loop-variant operand. |
2896 | return Ptr; |
2897 | } |
2898 | } |
2899 | return V; |
2900 | } |
2901 | |
2902 | /// Get the stride of a pointer access in a loop. Looks for symbolic |
2903 | /// strides "a[i*stride]". Returns the symbolic stride, or null otherwise. |
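/// Illustrative example: for an access "A[i * Stride]" with a loop-invariant,
/// symbolic Stride, the loop-variant index has the SCEV {0,+,Stride}, and this
/// returns the SCEV for Stride (possibly behind an integral cast) so that
/// callers can version the loop on "Stride == 1".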
2904 | static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { |
2905 | auto *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType()); |
2906 | if (!PtrTy) |
2907 | return nullptr; |
2908 | |
2909 | // Try to strip off a GEP instruction to make the pointer (actually the index
2910 | // at this point) easier to analyze. If OrigPtr is equal to Ptr we are
2911 | // analyzing the pointer, otherwise we are analyzing the index.
2912 | Value *OrigPtr = Ptr; |
2913 | |
2914 | Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp); |
2915 | const SCEV *V = SE->getSCEV(V: Ptr); |
2916 | |
2917 | if (Ptr != OrigPtr) |
2918 | // Strip off casts. |
2919 | while (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V)) |
2920 | V = C->getOperand(); |
2921 | |
2922 | if (!match(S: V, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V), L: m_SpecificLoop(L: Lp)))) |
2923 | return nullptr; |
2924 | |
2925 | // Note that the restrictions after this loop-invariant check are only
2926 | // profitability restrictions.
2927 | if (!SE->isLoopInvariant(S: V, L: Lp)) |
2928 | return nullptr; |
2929 | |
2930 | // Look for the loop invariant symbolic value. |
2931 | if (isa<SCEVUnknown>(Val: V)) |
2932 | return V; |
2933 | |
2934 | if (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V)) |
2935 | if (isa<SCEVUnknown>(Val: C->getOperand())) |
2936 | return V; |
2937 | |
2938 | return nullptr; |
2939 | } |
2940 | |
2941 | void LoopAccessInfo::collectStridedAccess(Value *MemAccess) { |
2942 | Value *Ptr = getLoadStorePointerOperand(V: MemAccess); |
2943 | if (!Ptr) |
2944 | return; |
2945 | |
2946 | // Note: getStrideFromPointer is a *profitability* heuristic. We |
2947 | // could broaden the scope of values returned here - to anything |
2948 | // which happens to be loop invariant and contributes to the |
2949 | // computation of an interesting IV - but we chose not to as we |
2950 | // don't have a cost model here, and broadening the scope exposes |
2951 | // far too many unprofitable cases. |
2952 | const SCEV *StrideExpr = getStrideFromPointer(Ptr, SE: PSE->getSE(), Lp: TheLoop); |
2953 | if (!StrideExpr) |
2954 | return; |
2955 | |
2956 | LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for " |
2957 | "versioning:" ); |
2958 | LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n" ); |
2959 | |
2960 | if (!SpeculateUnitStride) { |
2961 | LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n" ); |
2962 | return; |
2963 | } |
2964 | |
2965 | // Avoid adding the "Stride == 1" predicate when we know that |
2966 | // Stride >= Trip-Count. Such a predicate will effectively optimize a single |
2967 | // or zero iteration loop, as Trip-Count <= Stride == 1. |
2968 | // |
2969 | // TODO: We are currently not making a very informed decision on when it is |
2970 | // beneficial to apply stride versioning. It might make more sense that the |
2971 | // users of this analysis (such as the vectorizer) will trigger it, based on |
2972 | // their specific cost considerations; For example, in cases where stride |
2973 | // versioning does not help resolving memory accesses/dependences, the |
2974 | // vectorizer should evaluate the cost of the runtime test, and the benefit |
2975 | // of various possible stride specializations, considering the alternatives |
2976 | // of using gather/scatters (if available). |
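// Spelled out (illustrative): the check below forms Stride - MaxBTC and, if it
// is known positive, concludes Stride >= TripCount; combined with a
// "Stride == 1" predicate this would force TripCount <= 1, so versioning is
// not worthwhile and is skipped.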
2977 | |
2978 | const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount(); |
2979 | |
2980 | // Match the types so we can compare the stride and the MaxBTC. |
2981 | // The Stride can be positive/negative, so we sign extend Stride; |
2982 | // The backedgeTakenCount is non-negative, so we zero extend MaxBTC. |
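// For example (illustrative): with an i32 Stride and an i64 MaxBTC, Stride is
// sign-extended to i64 before forming Stride - MaxBTC; in the opposite case
// MaxBTC would be zero-extended to the stride's type instead.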
2983 | const DataLayout &DL = TheLoop->getHeader()->getDataLayout(); |
2984 | uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(Ty: StrideExpr->getType()); |
2985 | uint64_t BETypeSizeBits = DL.getTypeSizeInBits(Ty: MaxBTC->getType()); |
2986 | const SCEV *CastedStride = StrideExpr; |
2987 | const SCEV *CastedBECount = MaxBTC; |
2988 | ScalarEvolution *SE = PSE->getSE(); |
2989 | if (BETypeSizeBits >= StrideTypeSizeBits) |
2990 | CastedStride = SE->getNoopOrSignExtend(V: StrideExpr, Ty: MaxBTC->getType()); |
2991 | else |
2992 | CastedBECount = SE->getZeroExtendExpr(Op: MaxBTC, Ty: StrideExpr->getType()); |
2993 | const SCEV *StrideMinusBETaken = SE->getMinusSCEV(LHS: CastedStride, RHS: CastedBECount); |
2994 | // Since TripCount == BackEdgeTakenCount + 1, checking: |
2995 | // "Stride >= TripCount" is equivalent to checking: |
2996 | // Stride - MaxBTC > 0
2997 | if (SE->isKnownPositive(S: StrideMinusBETaken)) { |
2998 | LLVM_DEBUG( |
2999 | dbgs() << "LAA: Stride>=TripCount; No point in versioning as the " |
3000 | "Stride==1 predicate will imply that the loop executes " |
3001 | "at most once.\n" ); |
3002 | return; |
3003 | } |
3004 | LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n" ); |
3005 | |
3006 | // Strip back off the integer cast, and check that our result is a |
3007 | // SCEVUnknown as we expect. |
3008 | const SCEV *StrideBase = StrideExpr; |
3009 | if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: StrideBase)) |
3010 | StrideBase = C->getOperand(); |
3011 | SymbolicStrides[Ptr] = cast<SCEVUnknown>(Val: StrideBase); |
3012 | } |
3013 | |
3014 | LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, |
3015 | const TargetTransformInfo *TTI, |
3016 | const TargetLibraryInfo *TLI, AAResults *AA, |
3017 | DominatorTree *DT, LoopInfo *LI, |
3018 | bool AllowPartial) |
3019 | : PSE(std::make_unique<PredicatedScalarEvolution>(args&: *SE, args&: *L)), |
3020 | PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) { |
3021 | unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max(); |
3022 | if (TTI && !TTI->enableScalableVectorization()) |
3023 | // Scale the vector width by 2 as a rough estimate to also consider
3024 | // interleaving.
3025 | MaxTargetVectorWidthInBits = |
3026 | TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) * 2; |
3027 | |
3028 | DepChecker = std::make_unique<MemoryDepChecker>(args&: *PSE, args&: L, args&: SymbolicStrides, |
3029 | args&: MaxTargetVectorWidthInBits); |
3030 | PtrRtChecking = std::make_unique<RuntimePointerChecking>(args&: *DepChecker, args&: SE); |
3031 | if (canAnalyzeLoop()) |
3032 | CanVecMem = analyzeLoop(AA, LI, TLI, DT); |
3033 | } |
3034 | |
3035 | void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { |
3036 | if (CanVecMem) { |
3037 | OS.indent(NumSpaces: Depth) << "Memory dependences are safe" ; |
3038 | const MemoryDepChecker &DC = getDepChecker(); |
3039 | if (!DC.isSafeForAnyVectorWidth()) |
3040 | OS << " with a maximum safe vector width of " |
3041 | << DC.getMaxSafeVectorWidthInBits() << " bits" ; |
3042 | if (!DC.isSafeForAnyStoreLoadForwardDistances()) { |
3043 | uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits(); |
3044 | OS << ", with a maximum safe store-load forward width of " << SLDist |
3045 | << " bits" ; |
3046 | } |
3047 | if (PtrRtChecking->Need) |
3048 | OS << " with run-time checks" ; |
3049 | OS << "\n" ; |
3050 | } |
3051 | |
3052 | if (HasConvergentOp) |
3053 | OS.indent(NumSpaces: Depth) << "Has convergent operation in loop\n" ; |
3054 | |
3055 | if (Report) |
3056 | OS.indent(NumSpaces: Depth) << "Report: " << Report->getMsg() << "\n" ; |
3057 | |
3058 | if (auto *Dependences = DepChecker->getDependences()) { |
3059 | OS.indent(NumSpaces: Depth) << "Dependences:\n" ; |
3060 | for (const auto &Dep : *Dependences) { |
3061 | Dep.print(OS, Depth: Depth + 2, Instrs: DepChecker->getMemoryInstructions()); |
3062 | OS << "\n" ; |
3063 | } |
3064 | } else |
3065 | OS.indent(NumSpaces: Depth) << "Too many dependences, not recorded\n" ; |
3066 | |
3067 | // List the pairs of accesses that need run-time checks to prove independence.
3068 | PtrRtChecking->print(OS, Depth); |
3069 | if (PtrRtChecking->Need && !HasCompletePtrRtChecking) |
3070 | OS.indent(NumSpaces: Depth) << "Generated run-time checks are incomplete\n" ; |
3071 | OS << "\n" ; |
3072 | |
3073 | OS.indent(NumSpaces: Depth) |
3074 | << "Non vectorizable stores to invariant address were " |
3075 | << (HasStoreStoreDependenceInvolvingLoopInvariantAddress || |
3076 | HasLoadStoreDependenceInvolvingLoopInvariantAddress |
3077 | ? "" |
3078 | : "not " ) |
3079 | << "found in loop.\n" ; |
3080 | |
3081 | OS.indent(NumSpaces: Depth) << "SCEV assumptions:\n" ; |
3082 | PSE->getPredicate().print(OS, Depth); |
3083 | |
3084 | OS << "\n" ; |
3085 | |
3086 | OS.indent(NumSpaces: Depth) << "Expressions re-written:\n" ; |
3087 | PSE->print(OS, Depth); |
3088 | } |
3089 | |
3090 | const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L, |
3091 | bool AllowPartial) { |
3092 | const auto &[It, Inserted] = LoopAccessInfoMap.try_emplace(Key: &L); |
3093 | |
3094 | // We need to create the LoopAccessInfo if either we don't already have one, |
3095 | // or if it was created with a different value of AllowPartial. |
3096 | if (Inserted || It->second->hasAllowPartial() != AllowPartial) |
3097 | It->second = std::make_unique<LoopAccessInfo>(args: &L, args: &SE, args&: TTI, args&: TLI, args: &AA, args: &DT, |
3098 | args: &LI, args&: AllowPartial); |
3099 | |
3100 | return *It->second; |
3101 | } |
3102 | void LoopAccessInfoManager::clear() { |
3103 | // Collect LoopAccessInfo entries that may keep references to IR outside the |
3104 | // analyzed loop or SCEVs that may have been modified or invalidated. At the |
3105 | // moment, that is loops requiring memory or SCEV runtime checks, as those cache |
3106 | // SCEVs, e.g. for pointer expressions. |
3107 | for (const auto &[L, LAI] : LoopAccessInfoMap) { |
3108 | if (LAI->getRuntimePointerChecking()->getChecks().empty() && |
3109 | LAI->getPSE().getPredicate().isAlwaysTrue()) |
3110 | continue; |
3111 | LoopAccessInfoMap.erase(Val: L); |
3112 | } |
3113 | } |
3114 | |
3115 | bool LoopAccessInfoManager::invalidate( |
3116 | Function &F, const PreservedAnalyses &PA, |
3117 | FunctionAnalysisManager::Invalidator &Inv) { |
3118 | // Check whether our analysis is preserved. |
3119 | auto PAC = PA.getChecker<LoopAccessAnalysis>(); |
3120 | if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>()) |
3121 | // If not, give up now. |
3122 | return true; |
3123 | |
3124 | // Check whether the analyses we depend on became invalid for any reason. |
3125 | // Skip checking TargetLibraryAnalysis as it is immutable and can't become |
3126 | // invalid. |
3127 | return Inv.invalidate<AAManager>(IR&: F, PA) || |
3128 | Inv.invalidate<ScalarEvolutionAnalysis>(IR&: F, PA) || |
3129 | Inv.invalidate<LoopAnalysis>(IR&: F, PA) || |
3130 | Inv.invalidate<DominatorTreeAnalysis>(IR&: F, PA); |
3131 | } |
3132 | |
3133 | LoopAccessInfoManager LoopAccessAnalysis::run(Function &F, |
3134 | FunctionAnalysisManager &FAM) { |
3135 | auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
3136 | auto &AA = FAM.getResult<AAManager>(IR&: F); |
3137 | auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F); |
3138 | auto &LI = FAM.getResult<LoopAnalysis>(IR&: F); |
3139 | auto &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F); |
3140 | auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
3141 | return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI); |
3142 | } |
3143 | |
3144 | AnalysisKey LoopAccessAnalysis::Key; |
3145 | |