//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation -------===//
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
// The implementation of the loop memory dependence analysis that was
// originally developed for the loop vectorizer.
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
15 | #include "llvm/ADT/APInt.h" |
16 | #include "llvm/ADT/DenseMap.h" |
17 | #include "llvm/ADT/EquivalenceClasses.h" |
18 | #include "llvm/ADT/PointerIntPair.h" |
19 | #include "llvm/ADT/STLExtras.h" |
20 | #include "llvm/ADT/SetVector.h" |
21 | #include "llvm/ADT/SmallPtrSet.h" |
22 | #include "llvm/ADT/SmallSet.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/Analysis/AliasAnalysis.h" |
25 | #include "llvm/Analysis/AliasSetTracker.h" |
26 | #include "llvm/Analysis/LoopAnalysisManager.h" |
27 | #include "llvm/Analysis/LoopInfo.h" |
28 | #include "llvm/Analysis/LoopIterator.h" |
29 | #include "llvm/Analysis/MemoryLocation.h" |
30 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
31 | #include "llvm/Analysis/ScalarEvolution.h" |
32 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
33 | #include "llvm/Analysis/TargetLibraryInfo.h" |
34 | #include "llvm/Analysis/TargetTransformInfo.h" |
35 | #include "llvm/Analysis/ValueTracking.h" |
36 | #include "llvm/Analysis/VectorUtils.h" |
37 | #include "llvm/IR/BasicBlock.h" |
38 | #include "llvm/IR/Constants.h" |
39 | #include "llvm/IR/DataLayout.h" |
40 | #include "llvm/IR/DebugLoc.h" |
41 | #include "llvm/IR/DerivedTypes.h" |
42 | #include "llvm/IR/DiagnosticInfo.h" |
43 | #include "llvm/IR/Dominators.h" |
44 | #include "llvm/IR/Function.h" |
45 | #include "llvm/IR/GetElementPtrTypeIterator.h" |
46 | #include "llvm/IR/InstrTypes.h" |
47 | #include "llvm/IR/Instruction.h" |
48 | #include "llvm/IR/Instructions.h" |
49 | #include "llvm/IR/Operator.h" |
50 | #include "llvm/IR/PassManager.h" |
51 | #include "llvm/IR/PatternMatch.h" |
52 | #include "llvm/IR/Type.h" |
53 | #include "llvm/IR/Value.h" |
54 | #include "llvm/IR/ValueHandle.h" |
55 | #include "llvm/Support/Casting.h" |
56 | #include "llvm/Support/CommandLine.h" |
57 | #include "llvm/Support/Debug.h" |
58 | #include "llvm/Support/ErrorHandling.h" |
59 | #include "llvm/Support/raw_ostream.h" |
60 | #include <algorithm> |
61 | #include <cassert> |
62 | #include <cstdint> |
63 | #include <iterator> |
64 | #include <utility> |
65 | #include <variant> |
66 | #include <vector> |
67 | |
68 | using namespace llvm; |
69 | using namespace llvm::PatternMatch; |
70 | |
71 | #define DEBUG_TYPE "loop-accesses" |
72 | |
static cl::opt<unsigned, true>
    VectorizationFactor("force-vector-width", cl::Hidden,
                        cl::desc("Sets the SIMD width. Zero is autoselect."),
                        cl::location(VectorizerParams::VectorizationFactor));
unsigned VectorizerParams::VectorizationFactor;

static cl::opt<unsigned, true>
    VectorizationInterleave("force-vector-interleave", cl::Hidden,
                            cl::desc("Sets the vectorization interleave count. "
                                     "Zero is autoselect."),
                            cl::location(
                                VectorizerParams::VectorizationInterleave));
unsigned VectorizerParams::VectorizationInterleave;

static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
    "runtime-memory-check-threshold", cl::Hidden,
    cl::desc("When performing memory disambiguation checks at runtime do not "
             "generate more than this number of comparisons (default = 8)."),
    cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
93 | |
/// The maximum number of iterations used to merge memory checks.
static cl::opt<unsigned> MemoryCheckMergeThreshold(
    "memory-check-merge-threshold", cl::Hidden,
    cl::desc("Maximum number of comparisons done when trying to merge "
             "runtime memory checks. (default = 100)"),
    cl::init(100));
100 | |
101 | /// Maximum SIMD width. |
102 | const unsigned VectorizerParams::MaxVectorWidth = 64; |
103 | |
/// We collect dependences up to this threshold.
static cl::opt<unsigned>
    MaxDependences("max-dependences", cl::Hidden,
                   cl::desc("Maximum number of dependences collected by "
                            "loop-access analysis (default = 100)"),
                   cl::init(100));
110 | |
111 | /// This enables versioning on the strides of symbolically striding memory |
112 | /// accesses in code like the following. |
113 | /// for (i = 0; i < N; ++i) |
114 | /// A[i * Stride1] += B[i * Stride2] ... |
115 | /// |
116 | /// Will be roughly translated to |
117 | /// if (Stride1 == 1 && Stride2 == 1) { |
118 | /// for (i = 0; i < N; i+=4) |
119 | /// A[i:i+3] += ... |
120 | /// } else |
121 | /// ... |
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));

/// Enable store-to-load forwarding conflict detection. This option can
/// be disabled for correctness testing.
static cl::opt<bool> EnableForwardingConflictDetection(
    "store-to-load-forwarding-conflict-detection", cl::Hidden,
    cl::desc("Enable conflict detection in loop-access analysis"),
    cl::init(true));

static cl::opt<unsigned> MaxForkedSCEVDepth(
    "max-forked-scev-depth", cl::Hidden,
    cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
    cl::init(5));

static cl::opt<bool> SpeculateUnitStride(
    "laa-speculate-unit-stride", cl::Hidden,
    cl::desc("Speculate that non-constant strides are unit in LAA"),
    cl::init(true));

static cl::opt<bool, true> HoistRuntimeChecks(
    "hoist-runtime-checks", cl::Hidden,
    cl::desc(
        "Hoist inner loop runtime memory checks to outer loop if possible"),
    cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(true));
148 | bool VectorizerParams::HoistRuntimeChecks; |
149 | |
150 | bool VectorizerParams::isInterleaveForced() { |
151 | return ::VectorizationInterleave.getNumOccurrences() > 0; |
152 | } |
153 | |
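// As an illustration (using made-up value names): if the SCEV of Ptr is
// {%base,+,%stride} and PtrToStride maps Ptr to the SCEV of %stride, the
// predicate "%stride == 1" is added to PSE and the rewritten expression,
// roughly {%base,+,1}, is returned.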
const SCEV *llvm::replaceSymbolicStrideSCEV(
    PredicatedScalarEvolution &PSE,
    const DenseMap<Value *, const SCEV *> &PtrToStride, Value *Ptr) {
  const SCEV *OrigSCEV = PSE.getSCEV(Ptr);

  // If there is an entry in the map return the SCEV of the pointer with the
  // symbolic stride replaced by one.
  DenseMap<Value *, const SCEV *>::const_iterator SI = PtrToStride.find(Ptr);
  if (SI == PtrToStride.end())
    // For a non-symbolic stride, just return the original expression.
    return OrigSCEV;

  const SCEV *StrideSCEV = SI->second;
  // Note: This assert is both overly strong and overly weak. The actual
  // invariant here is that StrideSCEV should be loop invariant. The only
  // such invariant strides we happen to speculate right now are unknowns
  // and thus this is a reasonable proxy of the actual invariant.
  assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");

  ScalarEvolution *SE = PSE.getSE();
  const auto *CT = SE->getOne(StrideSCEV->getType());
  PSE.addPredicate(*SE->getEqualPredicate(StrideSCEV, CT));
  auto *Expr = PSE.getSCEV(Ptr);

  LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
                    << " by: " << *Expr << "\n");
  return Expr;
}
182 | |
183 | RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup( |
184 | unsigned Index, RuntimePointerChecking &RtCheck) |
185 | : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start), |
186 | AddressSpace(RtCheck.Pointers[Index] |
187 | .PointerValue->getType() |
188 | ->getPointerAddressSpace()), |
189 | NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) { |
190 | Members.push_back(Elt: Index); |
191 | } |
192 | |
193 | /// Calculate Start and End points of memory access. |
/// Let's assume A is the first access and B is a memory access on the N-th
/// loop iteration. Then B is calculated as:
///   B = A + Step*N.
/// Step value may be positive or negative.
/// N is a calculated back-edge taken count:
///     N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
200 | /// Start and End points are calculated in the following way: |
201 | /// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt, |
202 | /// where SizeOfElt is the size of single memory access in bytes. |
203 | /// |
204 | /// There is no conflict when the intervals are disjoint: |
205 | /// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End) |
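///
/// For example (illustrative values), for an access whose SCEV is {%base,+,4}
/// with 4-byte elements in a loop whose backedge-taken count is 7, this gives
/// Start = %base and End = %base + 7*4 + 4 = %base + 32, i.e. the checked
/// interval [%base, %base + 32).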
206 | static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess( |
207 | const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, |
208 | PredicatedScalarEvolution &PSE, |
209 | DenseMap<std::pair<const SCEV *, Type *>, |
210 | std::pair<const SCEV *, const SCEV *>> &PointerBounds) { |
211 | ScalarEvolution *SE = PSE.getSE(); |
212 | |
213 | auto [Iter, Ins] = PointerBounds.insert( |
214 | KV: {{PtrExpr, AccessTy}, |
215 | {SE->getCouldNotCompute(), SE->getCouldNotCompute()}}); |
216 | if (!Ins) |
217 | return Iter->second; |
218 | |
219 | const SCEV *ScStart; |
220 | const SCEV *ScEnd; |
221 | |
222 | if (SE->isLoopInvariant(S: PtrExpr, L: Lp)) { |
223 | ScStart = ScEnd = PtrExpr; |
224 | } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrExpr)) { |
225 | const SCEV *Ex = PSE.getSymbolicMaxBackedgeTakenCount(); |
226 | |
227 | ScStart = AR->getStart(); |
228 | ScEnd = AR->evaluateAtIteration(It: Ex, SE&: *SE); |
229 | const SCEV *Step = AR->getStepRecurrence(SE&: *SE); |
230 | |
231 | // For expressions with negative step, the upper bound is ScStart and the |
232 | // lower bound is ScEnd. |
233 | if (const auto *CStep = dyn_cast<SCEVConstant>(Val: Step)) { |
234 | if (CStep->getValue()->isNegative()) |
235 | std::swap(a&: ScStart, b&: ScEnd); |
236 | } else { |
237 | // Fallback case: the step is not constant, but we can still |
238 | // get the upper and lower bounds of the interval by using min/max |
239 | // expressions. |
240 | ScStart = SE->getUMinExpr(LHS: ScStart, RHS: ScEnd); |
241 | ScEnd = SE->getUMaxExpr(LHS: AR->getStart(), RHS: ScEnd); |
242 | } |
243 | } else |
244 | return {SE->getCouldNotCompute(), SE->getCouldNotCompute()}; |
245 | |
  assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
248 | |
249 | // Add the size of the pointed element to ScEnd. |
250 | auto &DL = Lp->getHeader()->getDataLayout(); |
251 | Type *IdxTy = DL.getIndexType(PtrTy: PtrExpr->getType()); |
252 | const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IntTy: IdxTy, StoreTy: AccessTy); |
253 | ScEnd = SE->getAddExpr(LHS: ScEnd, RHS: EltSizeSCEV); |
254 | |
255 | Iter->second = {ScStart, ScEnd}; |
256 | return Iter->second; |
257 | } |
258 | |
259 | /// Calculate Start and End points of memory access using |
260 | /// getStartAndEndForAccess. |
261 | void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, |
262 | Type *AccessTy, bool WritePtr, |
263 | unsigned DepSetId, unsigned ASId, |
264 | PredicatedScalarEvolution &PSE, |
265 | bool NeedsFreeze) { |
266 | const auto &[ScStart, ScEnd] = getStartAndEndForAccess( |
267 | Lp, PtrExpr, AccessTy, PSE, PointerBounds&: DC.getPointerBounds()); |
268 | assert(!isa<SCEVCouldNotCompute>(ScStart) && |
269 | !isa<SCEVCouldNotCompute>(ScEnd) && |
270 | "must be able to compute both start and end expressions" ); |
271 | Pointers.emplace_back(Args&: Ptr, Args: ScStart, Args: ScEnd, Args&: WritePtr, Args&: DepSetId, Args&: ASId, Args&: PtrExpr, |
272 | Args&: NeedsFreeze); |
273 | } |
274 | |
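// Try to replace the interval-overlap check between the two groups with a
// cheaper single-subtraction check. Conceptually, instead of testing
// "Start1 >= End2 || Start2 >= End1", the recorded DiffCheck only compares
// the distance SinkStart - SrcStart against the span accessed by the
// vectorized loop; the actual expansion happens later, when the runtime
// checks are materialized.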
275 | bool RuntimePointerChecking::tryToCreateDiffCheck( |
276 | const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) { |
277 | // If either group contains multiple different pointers, bail out. |
278 | // TODO: Support multiple pointers by using the minimum or maximum pointer, |
279 | // depending on src & sink. |
280 | if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) |
281 | return false; |
282 | |
283 | PointerInfo *Src = &Pointers[CGI.Members[0]]; |
284 | PointerInfo *Sink = &Pointers[CGJ.Members[0]]; |
285 | |
286 | // If either pointer is read and written, multiple checks may be needed. Bail |
287 | // out. |
288 | if (!DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: !Src->IsWritePtr).empty() || |
289 | !DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: !Sink->IsWritePtr).empty()) |
290 | return false; |
291 | |
292 | ArrayRef<unsigned> AccSrc = |
293 | DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: Src->IsWritePtr); |
294 | ArrayRef<unsigned> AccSink = |
295 | DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: Sink->IsWritePtr); |
296 | // If either pointer is accessed multiple times, there may not be a clear |
297 | // src/sink relation. Bail out for now. |
298 | if (AccSrc.size() != 1 || AccSink.size() != 1) |
299 | return false; |
300 | |
301 | // If the sink is accessed before src, swap src/sink. |
302 | if (AccSink[0] < AccSrc[0]) |
303 | std::swap(a&: Src, b&: Sink); |
304 | |
305 | auto *SrcAR = dyn_cast<SCEVAddRecExpr>(Val: Src->Expr); |
306 | auto *SinkAR = dyn_cast<SCEVAddRecExpr>(Val: Sink->Expr); |
307 | if (!SrcAR || !SinkAR || SrcAR->getLoop() != DC.getInnermostLoop() || |
308 | SinkAR->getLoop() != DC.getInnermostLoop()) |
309 | return false; |
310 | |
311 | SmallVector<Instruction *, 4> SrcInsts = |
312 | DC.getInstructionsForAccess(Ptr: Src->PointerValue, isWrite: Src->IsWritePtr); |
313 | SmallVector<Instruction *, 4> SinkInsts = |
314 | DC.getInstructionsForAccess(Ptr: Sink->PointerValue, isWrite: Sink->IsWritePtr); |
315 | Type *SrcTy = getLoadStoreType(I: SrcInsts[0]); |
316 | Type *DstTy = getLoadStoreType(I: SinkInsts[0]); |
317 | if (isa<ScalableVectorType>(Val: SrcTy) || isa<ScalableVectorType>(Val: DstTy)) |
318 | return false; |
319 | |
320 | const DataLayout &DL = |
321 | SinkAR->getLoop()->getHeader()->getDataLayout(); |
322 | unsigned AllocSize = |
323 | std::max(a: DL.getTypeAllocSize(Ty: SrcTy), b: DL.getTypeAllocSize(Ty: DstTy)); |
324 | |
  // Only constant steps matching the AllocSize are supported at the moment.
  // This simplifies the difference computation. Can be extended in the
  // future.
328 | auto *Step = dyn_cast<SCEVConstant>(Val: SinkAR->getStepRecurrence(SE&: *SE)); |
329 | if (!Step || Step != SrcAR->getStepRecurrence(SE&: *SE) || |
330 | Step->getAPInt().abs() != AllocSize) |
331 | return false; |
332 | |
333 | IntegerType *IntTy = |
334 | IntegerType::get(C&: Src->PointerValue->getContext(), |
335 | NumBits: DL.getPointerSizeInBits(AS: CGI.AddressSpace)); |
336 | |
337 | // When counting down, the dependence distance needs to be swapped. |
338 | if (Step->getValue()->isNegative()) |
339 | std::swap(a&: SinkAR, b&: SrcAR); |
340 | |
341 | const SCEV *SinkStartInt = SE->getPtrToIntExpr(Op: SinkAR->getStart(), Ty: IntTy); |
342 | const SCEV *SrcStartInt = SE->getPtrToIntExpr(Op: SrcAR->getStart(), Ty: IntTy); |
343 | if (isa<SCEVCouldNotCompute>(Val: SinkStartInt) || |
344 | isa<SCEVCouldNotCompute>(Val: SrcStartInt)) |
345 | return false; |
346 | |
347 | const Loop *InnerLoop = SrcAR->getLoop(); |
348 | // If the start values for both Src and Sink also vary according to an outer |
349 | // loop, then it's probably better to avoid creating diff checks because |
350 | // they may not be hoisted. We should instead let llvm::addRuntimeChecks |
351 | // do the expanded full range overlap checks, which can be hoisted. |
352 | if (HoistRuntimeChecks && InnerLoop->getParentLoop() && |
353 | isa<SCEVAddRecExpr>(Val: SinkStartInt) && isa<SCEVAddRecExpr>(Val: SrcStartInt)) { |
354 | auto *SrcStartAR = cast<SCEVAddRecExpr>(Val: SrcStartInt); |
355 | auto *SinkStartAR = cast<SCEVAddRecExpr>(Val: SinkStartInt); |
356 | const Loop *StartARLoop = SrcStartAR->getLoop(); |
357 | if (StartARLoop == SinkStartAR->getLoop() && |
358 | StartARLoop == InnerLoop->getParentLoop() && |
359 | // If the diff check would already be loop invariant (due to the |
360 | // recurrences being the same), then we prefer to keep the diff checks |
361 | // because they are cheaper. |
362 | SrcStartAR->getStepRecurrence(SE&: *SE) != |
363 | SinkStartAR->getStepRecurrence(SE&: *SE)) { |
364 | LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these " |
365 | "cannot be hoisted out of the outer loop\n" ); |
366 | return false; |
367 | } |
368 | } |
369 | |
370 | LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n" |
371 | << "SrcStart: " << *SrcStartInt << '\n' |
372 | << "SinkStartInt: " << *SinkStartInt << '\n'); |
373 | DiffChecks.emplace_back(Args&: SrcStartInt, Args&: SinkStartInt, Args&: AllocSize, |
374 | Args: Src->NeedsFreeze || Sink->NeedsFreeze); |
375 | return true; |
376 | } |
377 | |
378 | SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() { |
379 | SmallVector<RuntimePointerCheck, 4> Checks; |
380 | |
381 | for (unsigned I = 0; I < CheckingGroups.size(); ++I) { |
382 | for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) { |
383 | const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I]; |
384 | const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J]; |
385 | |
386 | if (needsChecking(M: CGI, N: CGJ)) { |
387 | CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ); |
388 | Checks.push_back(Elt: std::make_pair(x: &CGI, y: &CGJ)); |
389 | } |
390 | } |
391 | } |
392 | return Checks; |
393 | } |
394 | |
395 | void RuntimePointerChecking::generateChecks( |
396 | MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) { |
397 | assert(Checks.empty() && "Checks is not empty" ); |
398 | groupChecks(DepCands, UseDependencies); |
399 | Checks = generateChecks(); |
400 | } |
401 | |
402 | bool RuntimePointerChecking::needsChecking( |
403 | const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const { |
404 | for (const auto &I : M.Members) |
405 | for (const auto &J : N.Members) |
406 | if (needsChecking(I, J)) |
407 | return true; |
408 | return false; |
409 | } |
410 | |
411 | /// Compare \p I and \p J and return the minimum. |
412 | /// Return nullptr in case we couldn't find an answer. |
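/// For example, if I is %a and J is %a + 4, then Diff is the non-negative
/// constant 4 and I is returned; if the difference is not a constant, nullptr
/// is returned.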
413 | static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J, |
414 | ScalarEvolution *SE) { |
415 | const SCEV *Diff = SE->getMinusSCEV(LHS: J, RHS: I); |
416 | const SCEVConstant *C = dyn_cast<const SCEVConstant>(Val: Diff); |
417 | |
418 | if (!C) |
419 | return nullptr; |
420 | return C->getValue()->isNegative() ? J : I; |
421 | } |
422 | |
423 | bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, |
424 | RuntimePointerChecking &RtCheck) { |
425 | return addPointer( |
426 | Index, Start: RtCheck.Pointers[Index].Start, End: RtCheck.Pointers[Index].End, |
427 | AS: RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(), |
428 | NeedsFreeze: RtCheck.Pointers[Index].NeedsFreeze, SE&: *RtCheck.SE); |
429 | } |
430 | |
431 | bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start, |
432 | const SCEV *End, unsigned AS, |
433 | bool NeedsFreeze, |
434 | ScalarEvolution &SE) { |
435 | assert(AddressSpace == AS && |
436 | "all pointers in a checking group must be in the same address space" ); |
437 | |
438 | // Compare the starts and ends with the known minimum and maximum |
439 | // of this set. We need to know how we compare against the min/max |
440 | // of the set in order to be able to emit memchecks. |
441 | const SCEV *Min0 = getMinFromExprs(I: Start, J: Low, SE: &SE); |
442 | if (!Min0) |
443 | return false; |
444 | |
445 | const SCEV *Min1 = getMinFromExprs(I: End, J: High, SE: &SE); |
446 | if (!Min1) |
447 | return false; |
448 | |
449 | // Update the low bound expression if we've found a new min value. |
450 | if (Min0 == Start) |
451 | Low = Start; |
452 | |
453 | // Update the high bound expression if we've found a new max value. |
454 | if (Min1 != End) |
455 | High = End; |
456 | |
457 | Members.push_back(Elt: Index); |
458 | this->NeedsFreeze |= NeedsFreeze; |
459 | return true; |
460 | } |
461 | |
462 | void RuntimePointerChecking::groupChecks( |
463 | MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) { |
464 | // We build the groups from dependency candidates equivalence classes |
465 | // because: |
466 | // - We know that pointers in the same equivalence class share |
467 | // the same underlying object and therefore there is a chance |
468 | // that we can compare pointers |
469 | // - We wouldn't be able to merge two pointers for which we need |
470 | // to emit a memcheck. The classes in DepCands are already |
471 | // conveniently built such that no two pointers in the same |
472 | // class need checking against each other. |
473 | |
474 | // We use the following (greedy) algorithm to construct the groups |
475 | // For every pointer in the equivalence class: |
476 | // For each existing group: |
477 | // - if the difference between this pointer and the min/max bounds |
478 | // of the group is a constant, then make the pointer part of the |
479 | // group and update the min/max bounds of that group as required. |
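  //
  // As an illustration, given accesses A[i], A[i+1] and B[i] (where A and B
  // may alias), the two A accesses are merged into one group with a combined
  // [Low, High) interval, so a single A-group vs. B-group check suffices
  // instead of two separate checks against B[i].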
480 | |
481 | CheckingGroups.clear(); |
482 | |
483 | // If we need to check two pointers to the same underlying object |
484 | // with a non-constant difference, we shouldn't perform any pointer |
485 | // grouping with those pointers. This is because we can easily get |
486 | // into cases where the resulting check would return false, even when |
487 | // the accesses are safe. |
488 | // |
489 | // The following example shows this: |
490 | // for (i = 0; i < 1000; ++i) |
491 | // a[5000 + i * m] = a[i] + a[i + 9000] |
492 | // |
493 | // Here grouping gives a check of (5000, 5000 + 1000 * m) against |
494 | // (0, 10000) which is always false. However, if m is 1, there is no |
495 | // dependence. Not grouping the checks for a[i] and a[i + 9000] allows |
496 | // us to perform an accurate check in this case. |
497 | // |
498 | // The above case requires that we have an UnknownDependence between |
499 | // accesses to the same underlying object. This cannot happen unless |
500 | // FoundNonConstantDistanceDependence is set, and therefore UseDependencies |
501 | // is also false. In this case we will use the fallback path and create |
502 | // separate checking groups for all pointers. |
503 | |
504 | // If we don't have the dependency partitions, construct a new |
505 | // checking pointer group for each pointer. This is also required |
506 | // for correctness, because in this case we can have checking between |
507 | // pointers to the same underlying object. |
508 | if (!UseDependencies) { |
509 | for (unsigned I = 0; I < Pointers.size(); ++I) |
510 | CheckingGroups.push_back(Elt: RuntimeCheckingPtrGroup(I, *this)); |
511 | return; |
512 | } |
513 | |
514 | unsigned TotalComparisons = 0; |
515 | |
516 | DenseMap<Value *, SmallVector<unsigned>> PositionMap; |
517 | for (unsigned Index = 0; Index < Pointers.size(); ++Index) { |
518 | auto [It, _] = PositionMap.insert(KV: {Pointers[Index].PointerValue, {}}); |
519 | It->second.push_back(Elt: Index); |
520 | } |
521 | |
522 | // We need to keep track of what pointers we've already seen so we |
523 | // don't process them twice. |
524 | SmallSet<unsigned, 2> Seen; |
525 | |
526 | // Go through all equivalence classes, get the "pointer check groups" |
527 | // and add them to the overall solution. We use the order in which accesses |
528 | // appear in 'Pointers' to enforce determinism. |
529 | for (unsigned I = 0; I < Pointers.size(); ++I) { |
530 | // We've seen this pointer before, and therefore already processed |
531 | // its equivalence class. |
532 | if (Seen.count(V: I)) |
533 | continue; |
534 | |
535 | MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue, |
536 | Pointers[I].IsWritePtr); |
537 | |
538 | SmallVector<RuntimeCheckingPtrGroup, 2> Groups; |
539 | auto LeaderI = DepCands.findValue(V: DepCands.getLeaderValue(V: Access)); |
540 | |
541 | // Because DepCands is constructed by visiting accesses in the order in |
542 | // which they appear in alias sets (which is deterministic) and the |
543 | // iteration order within an equivalence class member is only dependent on |
544 | // the order in which unions and insertions are performed on the |
545 | // equivalence class, the iteration order is deterministic. |
546 | for (auto MI = DepCands.member_begin(I: LeaderI), ME = DepCands.member_end(); |
547 | MI != ME; ++MI) { |
548 | auto PointerI = PositionMap.find(Val: MI->getPointer()); |
549 | assert(PointerI != PositionMap.end() && |
550 | "pointer in equivalence class not found in PositionMap" ); |
551 | for (unsigned Pointer : PointerI->second) { |
552 | bool Merged = false; |
553 | // Mark this pointer as seen. |
554 | Seen.insert(V: Pointer); |
555 | |
556 | // Go through all the existing sets and see if we can find one |
557 | // which can include this pointer. |
558 | for (RuntimeCheckingPtrGroup &Group : Groups) { |
559 | // Don't perform more than a certain amount of comparisons. |
560 | // This should limit the cost of grouping the pointers to something |
561 | // reasonable. If we do end up hitting this threshold, the algorithm |
562 | // will create separate groups for all remaining pointers. |
563 | if (TotalComparisons > MemoryCheckMergeThreshold) |
564 | break; |
565 | |
566 | TotalComparisons++; |
567 | |
568 | if (Group.addPointer(Index: Pointer, RtCheck&: *this)) { |
569 | Merged = true; |
570 | break; |
571 | } |
572 | } |
573 | |
574 | if (!Merged) |
575 | // We couldn't add this pointer to any existing set or the threshold |
576 | // for the number of comparisons has been reached. Create a new group |
577 | // to hold the current pointer. |
578 | Groups.push_back(Elt: RuntimeCheckingPtrGroup(Pointer, *this)); |
579 | } |
580 | } |
581 | |
582 | // We've computed the grouped checks for this partition. |
583 | // Save the results and continue with the next one. |
584 | llvm::copy(Range&: Groups, Out: std::back_inserter(x&: CheckingGroups)); |
585 | } |
586 | } |
587 | |
588 | bool RuntimePointerChecking::arePointersInSamePartition( |
589 | const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1, |
590 | unsigned PtrIdx2) { |
591 | return (PtrToPartition[PtrIdx1] != -1 && |
592 | PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]); |
593 | } |
594 | |
595 | bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const { |
596 | const PointerInfo &PointerI = Pointers[I]; |
597 | const PointerInfo &PointerJ = Pointers[J]; |
598 | |
599 | // No need to check if two readonly pointers intersect. |
600 | if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr) |
601 | return false; |
602 | |
603 | // Only need to check pointers between two different dependency sets. |
604 | if (PointerI.DependencySetId == PointerJ.DependencySetId) |
605 | return false; |
606 | |
607 | // Only need to check pointers in the same alias set. |
608 | if (PointerI.AliasSetId != PointerJ.AliasSetId) |
609 | return false; |
610 | |
611 | return true; |
612 | } |
613 | |
614 | void RuntimePointerChecking::printChecks( |
615 | raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks, |
616 | unsigned Depth) const { |
617 | unsigned N = 0; |
618 | for (const auto &[Check1, Check2] : Checks) { |
619 | const auto &First = Check1->Members, &Second = Check2->Members; |
620 | |
621 | OS.indent(NumSpaces: Depth) << "Check " << N++ << ":\n" ; |
622 | |
623 | OS.indent(NumSpaces: Depth + 2) << "Comparing group (" << Check1 << "):\n" ; |
624 | for (unsigned K : First) |
625 | OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n" ; |
626 | |
627 | OS.indent(NumSpaces: Depth + 2) << "Against group (" << Check2 << "):\n" ; |
628 | for (unsigned K : Second) |
629 | OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n" ; |
630 | } |
631 | } |
632 | |
633 | void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const { |
634 | |
635 | OS.indent(NumSpaces: Depth) << "Run-time memory checks:\n" ; |
636 | printChecks(OS, Checks, Depth); |
637 | |
638 | OS.indent(NumSpaces: Depth) << "Grouped accesses:\n" ; |
639 | for (const auto &CG : CheckingGroups) { |
640 | OS.indent(NumSpaces: Depth + 2) << "Group " << &CG << ":\n" ; |
641 | OS.indent(NumSpaces: Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High |
642 | << ")\n" ; |
643 | for (unsigned Member : CG.Members) { |
644 | OS.indent(NumSpaces: Depth + 6) << "Member: " << *Pointers[Member].Expr << "\n" ; |
645 | } |
646 | } |
647 | } |
648 | |
649 | namespace { |
650 | |
651 | /// Analyses memory accesses in a loop. |
652 | /// |
653 | /// Checks whether run time pointer checks are needed and builds sets for data |
654 | /// dependence checking. |
655 | class AccessAnalysis { |
656 | public: |
657 | /// Read or write access location. |
658 | typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; |
659 | typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList; |
660 | |
661 | AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI, |
662 | MemoryDepChecker::DepCandidates &DA, |
663 | PredicatedScalarEvolution &PSE, |
664 | SmallPtrSetImpl<MDNode *> &LoopAliasScopes) |
665 | : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), |
666 | LoopAliasScopes(LoopAliasScopes) { |
667 | // We're analyzing dependences across loop iterations. |
668 | BAA.enableCrossIterationMode(); |
669 | } |
670 | |
671 | /// Register a load and whether it is only read from. |
672 | void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) { |
673 | Value *Ptr = const_cast<Value *>(Loc.Ptr); |
674 | AST.add(Loc: adjustLoc(Loc)); |
675 | Accesses[MemAccessInfo(Ptr, false)].insert(X: AccessTy); |
676 | if (IsReadOnly) |
677 | ReadOnlyPtr.insert(Ptr); |
678 | } |
679 | |
680 | /// Register a store. |
681 | void addStore(MemoryLocation &Loc, Type *AccessTy) { |
682 | Value *Ptr = const_cast<Value *>(Loc.Ptr); |
683 | AST.add(Loc: adjustLoc(Loc)); |
684 | Accesses[MemAccessInfo(Ptr, true)].insert(X: AccessTy); |
685 | } |
686 | |
687 | /// Check if we can emit a run-time no-alias check for \p Access. |
688 | /// |
689 | /// Returns true if we can emit a run-time no alias check for \p Access. |
690 | /// If we can check this access, this also adds it to a dependence set and |
  /// adds a run-time check for it to \p RtCheck. If \p Assume is true,
692 | /// we will attempt to use additional run-time checks in order to get |
693 | /// the bounds of the pointer. |
694 | bool createCheckForAccess(RuntimePointerChecking &RtCheck, |
695 | MemAccessInfo Access, Type *AccessTy, |
696 | const DenseMap<Value *, const SCEV *> &Strides, |
697 | DenseMap<Value *, unsigned> &DepSetId, |
698 | Loop *TheLoop, unsigned &RunningDepId, |
699 | unsigned ASId, bool ShouldCheckStride, bool Assume); |
700 | |
701 | /// Check whether we can check the pointers at runtime for |
702 | /// non-intersection. |
703 | /// |
704 | /// Returns true if we need no check or if we do and we can generate them |
705 | /// (i.e. the pointers have computable bounds). |
706 | bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE, |
707 | Loop *TheLoop, const DenseMap<Value *, const SCEV *> &Strides, |
708 | Value *&UncomputablePtr, bool ShouldCheckWrap = false); |
709 | |
710 | /// Goes over all memory accesses, checks whether a RT check is needed |
711 | /// and builds sets of dependent accesses. |
712 | void buildDependenceSets() { |
713 | processMemAccesses(); |
714 | } |
715 | |
716 | /// Initial processing of memory accesses determined that we need to |
717 | /// perform dependency checking. |
718 | /// |
719 | /// Note that this can later be cleared if we retry memcheck analysis without |
720 | /// dependency checking (i.e. FoundNonConstantDistanceDependence). |
721 | bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } |
722 | |
723 | /// We decided that no dependence analysis would be used. Reset the state. |
724 | void resetDepChecks(MemoryDepChecker &DepChecker) { |
725 | CheckDeps.clear(); |
726 | DepChecker.clearDependences(); |
727 | } |
728 | |
729 | MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; } |
730 | |
731 | private: |
732 | typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap; |
733 | |
734 | /// Adjust the MemoryLocation so that it represents accesses to this |
735 | /// location across all iterations, rather than a single one. |
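  /// For example, a 4-byte load from %p inside the loop is registered with
  /// LocationSize::beforeOrAfterPointer(), i.e. as possibly touching memory
  /// anywhere within %p's underlying object over the course of the loop.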
736 | MemoryLocation adjustLoc(MemoryLocation Loc) const { |
737 | // The accessed location varies within the loop, but remains within the |
738 | // underlying object. |
739 | Loc.Size = LocationSize::beforeOrAfterPointer(); |
740 | Loc.AATags.Scope = adjustAliasScopeList(ScopeList: Loc.AATags.Scope); |
741 | Loc.AATags.NoAlias = adjustAliasScopeList(ScopeList: Loc.AATags.NoAlias); |
742 | return Loc; |
743 | } |
744 | |
745 | /// Drop alias scopes that are only valid within a single loop iteration. |
746 | MDNode *adjustAliasScopeList(MDNode *ScopeList) const { |
747 | if (!ScopeList) |
748 | return nullptr; |
749 | |
750 | // For the sake of simplicity, drop the whole scope list if any scope is |
751 | // iteration-local. |
752 | if (any_of(Range: ScopeList->operands(), P: [&](Metadata *Scope) { |
753 | return LoopAliasScopes.contains(Ptr: cast<MDNode>(Val: Scope)); |
754 | })) |
755 | return nullptr; |
756 | |
757 | return ScopeList; |
758 | } |
759 | |
  /// Go over all memory accesses and check whether runtime pointer checks
761 | /// are needed and build sets of dependency check candidates. |
762 | void processMemAccesses(); |
763 | |
764 | /// Map of all accesses. Values are the types used to access memory pointed to |
765 | /// by the pointer. |
766 | PtrAccessMap Accesses; |
767 | |
768 | /// The loop being checked. |
769 | const Loop *TheLoop; |
770 | |
771 | /// List of accesses that need a further dependence check. |
772 | MemAccessInfoList CheckDeps; |
773 | |
774 | /// Set of pointers that are read only. |
775 | SmallPtrSet<Value*, 16> ReadOnlyPtr; |
776 | |
777 | /// Batched alias analysis results. |
778 | BatchAAResults BAA; |
779 | |
780 | /// An alias set tracker to partition the access set by underlying object and |
  /// intrinsic property (such as TBAA metadata).
782 | AliasSetTracker AST; |
783 | |
784 | LoopInfo *LI; |
785 | |
786 | /// Sets of potentially dependent accesses - members of one set share an |
  /// underlying pointer. The set "CheckDeps" identifies which sets really need a
788 | /// dependence check. |
789 | MemoryDepChecker::DepCandidates &DepCands; |
790 | |
791 | /// Initial processing of memory accesses determined that we may need |
792 | /// to add memchecks. Perform the analysis to determine the necessary checks. |
793 | /// |
  /// Note that this is different from isDependencyCheckNeeded. When we retry
795 | /// memcheck analysis without dependency checking |
796 | /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is |
797 | /// cleared while this remains set if we have potentially dependent accesses. |
798 | bool IsRTCheckAnalysisNeeded = false; |
799 | |
800 | /// The SCEV predicate containing all the SCEV-related assumptions. |
801 | PredicatedScalarEvolution &PSE; |
802 | |
803 | DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects; |
804 | |
805 | /// Alias scopes that are declared inside the loop, and as such not valid |
806 | /// across iterations. |
807 | SmallPtrSetImpl<MDNode *> &LoopAliasScopes; |
808 | }; |
809 | |
810 | } // end anonymous namespace |
811 | |
812 | /// Check whether a pointer can participate in a runtime bounds check. |
813 | /// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr |
814 | /// by adding run-time checks (overflow checks) if necessary. |
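/// For example, a loop-invariant pointer, or one whose SCEV is an affine
/// AddRec such as {%base,+,4}<%L>, has computable bounds; a pointer that is
/// neither loop-invariant nor (possibly after adding predicates) an affine
/// AddRec does not.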
815 | static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr, |
816 | const SCEV *PtrScev, Loop *L, bool Assume) { |
  // The bounds for a loop-invariant pointer are trivial.
818 | if (PSE.getSE()->isLoopInvariant(S: PtrScev, L)) |
819 | return true; |
820 | |
821 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev); |
822 | |
823 | if (!AR && Assume) |
824 | AR = PSE.getAsAddRec(V: Ptr); |
825 | |
826 | if (!AR) |
827 | return false; |
828 | |
829 | return AR->isAffine(); |
830 | } |
831 | |
832 | /// Check whether a pointer address cannot wrap. |
833 | static bool isNoWrap(PredicatedScalarEvolution &PSE, |
834 | const DenseMap<Value *, const SCEV *> &Strides, Value *Ptr, Type *AccessTy, |
835 | Loop *L) { |
836 | const SCEV *PtrScev = PSE.getSCEV(V: Ptr); |
837 | if (PSE.getSE()->isLoopInvariant(S: PtrScev, L)) |
838 | return true; |
839 | |
840 | int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, Lp: L, StridesMap: Strides).value_or(u: 0); |
841 | if (Stride == 1 || PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW)) |
842 | return true; |
843 | |
844 | return false; |
845 | } |
846 | |
847 | static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, |
848 | function_ref<void(Value *)> AddPointer) { |
849 | SmallPtrSet<Value *, 8> Visited; |
850 | SmallVector<Value *> WorkList; |
851 | WorkList.push_back(Elt: StartPtr); |
852 | |
853 | while (!WorkList.empty()) { |
854 | Value *Ptr = WorkList.pop_back_val(); |
855 | if (!Visited.insert(Ptr).second) |
856 | continue; |
857 | auto *PN = dyn_cast<PHINode>(Val: Ptr); |
858 | // SCEV does not look through non-header PHIs inside the loop. Such phis |
859 | // can be analyzed by adding separate accesses for each incoming pointer |
860 | // value. |
861 | if (PN && InnermostLoop.contains(BB: PN->getParent()) && |
862 | PN->getParent() != InnermostLoop.getHeader()) { |
863 | for (const Use &Inc : PN->incoming_values()) |
864 | WorkList.push_back(Elt: Inc); |
865 | } else |
866 | AddPointer(Ptr); |
867 | } |
868 | } |
869 | |
870 | // Walk back through the IR for a pointer, looking for a select like the |
871 | // following: |
872 | // |
873 | // %offset = select i1 %cmp, i64 %a, i64 %b |
874 | // %addr = getelementptr double, double* %base, i64 %offset |
875 | // %ld = load double, double* %addr, align 8 |
876 | // |
877 | // We won't be able to form a single SCEVAddRecExpr from this since the |
878 | // address for each loop iteration depends on %cmp. We could potentially |
879 | // produce multiple valid SCEVAddRecExprs, though, and check all of them for |
880 | // memory safety/aliasing if needed. |
881 | // |
882 | // If we encounter some IR we don't yet handle, or something obviously fine |
883 | // like a constant, then we just add the SCEV for that term to the list passed |
884 | // in by the caller. If we have a node that may potentially yield a valid |
885 | // SCEVAddRecExpr then we decompose it into parts and build the SCEV terms |
886 | // ourselves before adding to the list. |
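//
// For the IR above this produces two candidate SCEVs for %addr, roughly of
// the form "%base + 8 * %a" and "%base + 8 * %b" (offsets scaled by the size
// of double), each carrying a flag that records whether the value may be
// poison/undef and therefore needs a freeze before use in a runtime check.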
887 | static void findForkedSCEVs( |
888 | ScalarEvolution *SE, const Loop *L, Value *Ptr, |
889 | SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList, |
890 | unsigned Depth) { |
891 | // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or |
892 | // we've exceeded our limit on recursion, just return whatever we have |
893 | // regardless of whether it can be used for a forked pointer or not, along |
894 | // with an indication of whether it might be a poison or undef value. |
895 | const SCEV *Scev = SE->getSCEV(V: Ptr); |
896 | if (isa<SCEVAddRecExpr>(Val: Scev) || L->isLoopInvariant(V: Ptr) || |
897 | !isa<Instruction>(Val: Ptr) || Depth == 0) { |
898 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
899 | return; |
900 | } |
901 | |
902 | Depth--; |
903 | |
904 | auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) { |
905 | return get<1>(Pair: S); |
906 | }; |
907 | |
908 | auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) { |
909 | switch (Opcode) { |
910 | case Instruction::Add: |
911 | return SE->getAddExpr(LHS: L, RHS: R); |
912 | case Instruction::Sub: |
913 | return SE->getMinusSCEV(LHS: L, RHS: R); |
914 | default: |
915 | llvm_unreachable("Unexpected binary operator when walking ForkedPtrs" ); |
916 | } |
917 | }; |
918 | |
919 | Instruction *I = cast<Instruction>(Val: Ptr); |
920 | unsigned Opcode = I->getOpcode(); |
921 | switch (Opcode) { |
922 | case Instruction::GetElementPtr: { |
923 | GetElementPtrInst *GEP = cast<GetElementPtrInst>(Val: I); |
924 | Type *SourceTy = GEP->getSourceElementType(); |
925 | // We only handle base + single offset GEPs here for now. |
926 | // Not dealing with preexisting gathers yet, so no vectors. |
927 | if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) { |
928 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: GEP)); |
929 | break; |
930 | } |
931 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs; |
932 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs; |
933 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: BaseScevs, Depth); |
934 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: OffsetScevs, Depth); |
935 | |
936 | // See if we need to freeze our fork... |
937 | bool NeedsFreeze = any_of(Range&: BaseScevs, P: UndefPoisonCheck) || |
938 | any_of(Range&: OffsetScevs, P: UndefPoisonCheck); |
939 | |
940 | // Check that we only have a single fork, on either the base or the offset. |
941 | // Copy the SCEV across for the one without a fork in order to generate |
942 | // the full SCEV for both sides of the GEP. |
943 | if (OffsetScevs.size() == 2 && BaseScevs.size() == 1) |
944 | BaseScevs.push_back(Elt: BaseScevs[0]); |
945 | else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1) |
946 | OffsetScevs.push_back(Elt: OffsetScevs[0]); |
947 | else { |
948 | ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze); |
949 | break; |
950 | } |
951 | |
952 | // Find the pointer type we need to extend to. |
953 | Type *IntPtrTy = SE->getEffectiveSCEVType( |
954 | Ty: SE->getSCEV(V: GEP->getPointerOperand())->getType()); |
955 | |
956 | // Find the size of the type being pointed to. We only have a single |
957 | // index term (guarded above) so we don't need to index into arrays or |
958 | // structures, just get the size of the scalar value. |
959 | const SCEV *Size = SE->getSizeOfExpr(IntTy: IntPtrTy, AllocTy: SourceTy); |
960 | |
961 | // Scale up the offsets by the size of the type, then add to the bases. |
962 | const SCEV *Scaled1 = SE->getMulExpr( |
963 | LHS: Size, RHS: SE->getTruncateOrSignExtend(V: get<0>(Pair: OffsetScevs[0]), Ty: IntPtrTy)); |
964 | const SCEV *Scaled2 = SE->getMulExpr( |
965 | LHS: Size, RHS: SE->getTruncateOrSignExtend(V: get<0>(Pair: OffsetScevs[1]), Ty: IntPtrTy)); |
966 | ScevList.emplace_back(Args: SE->getAddExpr(LHS: get<0>(Pair: BaseScevs[0]), RHS: Scaled1), |
967 | Args&: NeedsFreeze); |
968 | ScevList.emplace_back(Args: SE->getAddExpr(LHS: get<0>(Pair: BaseScevs[1]), RHS: Scaled2), |
969 | Args&: NeedsFreeze); |
970 | break; |
971 | } |
972 | case Instruction::Select: { |
973 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs; |
974 | // A select means we've found a forked pointer, but we currently only |
975 | // support a single select per pointer so if there's another behind this |
976 | // then we just bail out and return the generic SCEV. |
977 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth); |
978 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 2), ScevList&: ChildScevs, Depth); |
979 | if (ChildScevs.size() == 2) { |
980 | ScevList.push_back(Elt: ChildScevs[0]); |
981 | ScevList.push_back(Elt: ChildScevs[1]); |
982 | } else |
983 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
984 | break; |
985 | } |
986 | case Instruction::PHI: { |
987 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs; |
988 | // A phi means we've found a forked pointer, but we currently only |
989 | // support a single phi per pointer so if there's another behind this |
990 | // then we just bail out and return the generic SCEV. |
991 | if (I->getNumOperands() == 2) { |
992 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: ChildScevs, Depth); |
993 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth); |
994 | } |
995 | if (ChildScevs.size() == 2) { |
996 | ScevList.push_back(Elt: ChildScevs[0]); |
997 | ScevList.push_back(Elt: ChildScevs[1]); |
998 | } else |
999 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
1000 | break; |
1001 | } |
1002 | case Instruction::Add: |
1003 | case Instruction::Sub: { |
1004 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs; |
1005 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs; |
1006 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: LScevs, Depth); |
1007 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: RScevs, Depth); |
1008 | |
1009 | // See if we need to freeze our fork... |
1010 | bool NeedsFreeze = |
1011 | any_of(Range&: LScevs, P: UndefPoisonCheck) || any_of(Range&: RScevs, P: UndefPoisonCheck); |
1012 | |
1013 | // Check that we only have a single fork, on either the left or right side. |
1014 | // Copy the SCEV across for the one without a fork in order to generate |
1015 | // the full SCEV for both sides of the BinOp. |
1016 | if (LScevs.size() == 2 && RScevs.size() == 1) |
1017 | RScevs.push_back(Elt: RScevs[0]); |
1018 | else if (RScevs.size() == 2 && LScevs.size() == 1) |
1019 | LScevs.push_back(Elt: LScevs[0]); |
1020 | else { |
1021 | ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze); |
1022 | break; |
1023 | } |
1024 | |
1025 | ScevList.emplace_back( |
1026 | Args: GetBinOpExpr(Opcode, get<0>(Pair: LScevs[0]), get<0>(Pair: RScevs[0])), |
1027 | Args&: NeedsFreeze); |
1028 | ScevList.emplace_back( |
1029 | Args: GetBinOpExpr(Opcode, get<0>(Pair: LScevs[1]), get<0>(Pair: RScevs[1])), |
1030 | Args&: NeedsFreeze); |
1031 | break; |
1032 | } |
1033 | default: |
1034 | // Just return the current SCEV if we haven't handled the instruction yet. |
1035 | LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n" ); |
1036 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
1037 | break; |
1038 | } |
1039 | } |
1040 | |
1041 | static SmallVector<PointerIntPair<const SCEV *, 1, bool>> |
1042 | findForkedPointer(PredicatedScalarEvolution &PSE, |
1043 | const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr, |
1044 | const Loop *L) { |
1045 | ScalarEvolution *SE = PSE.getSE(); |
1046 | assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!" ); |
1047 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs; |
1048 | findForkedSCEVs(SE, L, Ptr, ScevList&: Scevs, Depth: MaxForkedSCEVDepth); |
1049 | |
1050 | // For now, we will only accept a forked pointer with two possible SCEVs |
1051 | // that are either SCEVAddRecExprs or loop invariant. |
1052 | if (Scevs.size() == 2 && |
1053 | (isa<SCEVAddRecExpr>(Val: get<0>(Pair: Scevs[0])) || |
1054 | SE->isLoopInvariant(S: get<0>(Pair: Scevs[0]), L)) && |
1055 | (isa<SCEVAddRecExpr>(Val: get<0>(Pair: Scevs[1])) || |
1056 | SE->isLoopInvariant(S: get<0>(Pair: Scevs[1]), L))) { |
1057 | LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n" ); |
1058 | LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n" ); |
1059 | LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n" ); |
1060 | return Scevs; |
1061 | } |
1062 | |
1063 | return {{replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr), false}}; |
1064 | } |
1065 | |
1066 | bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, |
1067 | MemAccessInfo Access, Type *AccessTy, |
1068 | const DenseMap<Value *, const SCEV *> &StridesMap, |
1069 | DenseMap<Value *, unsigned> &DepSetId, |
1070 | Loop *TheLoop, unsigned &RunningDepId, |
1071 | unsigned ASId, bool ShouldCheckWrap, |
1072 | bool Assume) { |
1073 | Value *Ptr = Access.getPointer(); |
1074 | |
1075 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs = |
1076 | findForkedPointer(PSE, StridesMap, Ptr, L: TheLoop); |
1077 | |
1078 | for (auto &P : TranslatedPtrs) { |
1079 | const SCEV *PtrExpr = get<0>(Pair: P); |
1080 | if (!hasComputableBounds(PSE, Ptr, PtrScev: PtrExpr, L: TheLoop, Assume)) |
1081 | return false; |
1082 | |
1083 | // When we run after a failing dependency check we have to make sure |
1084 | // we don't have wrapping pointers. |
1085 | if (ShouldCheckWrap) { |
1086 | // Skip wrap checking when translating pointers. |
1087 | if (TranslatedPtrs.size() > 1) |
1088 | return false; |
1089 | |
1090 | if (!isNoWrap(PSE, Strides: StridesMap, Ptr, AccessTy, L: TheLoop)) { |
1091 | auto *Expr = PSE.getSCEV(V: Ptr); |
1092 | if (!Assume || !isa<SCEVAddRecExpr>(Val: Expr)) |
1093 | return false; |
1094 | PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW); |
1095 | } |
1096 | } |
1097 | // If there's only one option for Ptr, look it up after bounds and wrap |
1098 | // checking, because assumptions might have been added to PSE. |
1099 | if (TranslatedPtrs.size() == 1) |
1100 | TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr), |
1101 | false}; |
1102 | } |
1103 | |
1104 | for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) { |
1105 | // The id of the dependence set. |
1106 | unsigned DepId; |
1107 | |
1108 | if (isDependencyCheckNeeded()) { |
1109 | Value *Leader = DepCands.getLeaderValue(V: Access).getPointer(); |
1110 | unsigned &LeaderId = DepSetId[Leader]; |
1111 | if (!LeaderId) |
1112 | LeaderId = RunningDepId++; |
1113 | DepId = LeaderId; |
1114 | } else |
1115 | // Each access has its own dependence set. |
1116 | DepId = RunningDepId++; |
1117 | |
1118 | bool IsWrite = Access.getInt(); |
1119 | RtCheck.insert(Lp: TheLoop, Ptr, PtrExpr, AccessTy, WritePtr: IsWrite, DepSetId: DepId, ASId, PSE, |
1120 | NeedsFreeze); |
1121 | LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); |
1122 | } |
1123 | |
1124 | return true; |
1125 | } |
1126 | |
1127 | bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, |
1128 | ScalarEvolution *SE, Loop *TheLoop, |
1129 | const DenseMap<Value *, const SCEV *> &StridesMap, |
1130 | Value *&UncomputablePtr, bool ShouldCheckWrap) { |
1131 | // Find pointers with computable bounds. We are going to use this information |
1132 | // to place a runtime bound check. |
1133 | bool CanDoRT = true; |
1134 | |
1135 | bool MayNeedRTCheck = false; |
1136 | if (!IsRTCheckAnalysisNeeded) return true; |
1137 | |
1138 | bool IsDepCheckNeeded = isDependencyCheckNeeded(); |
1139 | |
  // We assign a consecutive id to accesses from different alias sets.
  // Accesses between different groups don't need to be checked.
1142 | unsigned ASId = 0; |
1143 | for (auto &AS : AST) { |
1144 | int NumReadPtrChecks = 0; |
1145 | int NumWritePtrChecks = 0; |
1146 | bool CanDoAliasSetRT = true; |
1147 | ++ASId; |
1148 | auto ASPointers = AS.getPointers(); |
1149 | |
    // We assign consecutive ids to accesses from different dependence sets.
    // Accesses within the same set don't need a runtime check.
1152 | unsigned RunningDepId = 1; |
1153 | DenseMap<Value *, unsigned> DepSetId; |
1154 | |
1155 | SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries; |
1156 | |
1157 | // First, count how many write and read accesses are in the alias set. Also |
1158 | // collect MemAccessInfos for later. |
1159 | SmallVector<MemAccessInfo, 4> AccessInfos; |
1160 | for (const Value *ConstPtr : ASPointers) { |
1161 | Value *Ptr = const_cast<Value *>(ConstPtr); |
1162 | bool IsWrite = Accesses.count(Key: MemAccessInfo(Ptr, true)); |
1163 | if (IsWrite) |
1164 | ++NumWritePtrChecks; |
1165 | else |
1166 | ++NumReadPtrChecks; |
1167 | AccessInfos.emplace_back(Args&: Ptr, Args&: IsWrite); |
1168 | } |
1169 | |
    // We do not need runtime checks for this alias set if there are no writes,
    // or only a single write and no reads.
1172 | if (NumWritePtrChecks == 0 || |
1173 | (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) { |
1174 | assert((ASPointers.size() <= 1 || |
1175 | all_of(ASPointers, |
1176 | [this](const Value *Ptr) { |
1177 | MemAccessInfo AccessWrite(const_cast<Value *>(Ptr), |
1178 | true); |
1179 | return DepCands.findValue(AccessWrite) == DepCands.end(); |
1180 | })) && |
1181 | "Can only skip updating CanDoRT below, if all entries in AS " |
1182 | "are reads or there is at most 1 entry" ); |
1183 | continue; |
1184 | } |
1185 | |
1186 | for (auto &Access : AccessInfos) { |
1187 | for (const auto &AccessTy : Accesses[Access]) { |
1188 | if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, |
1189 | DepSetId, TheLoop, RunningDepId, ASId, |
1190 | ShouldCheckWrap, Assume: false)) { |
1191 | LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" |
1192 | << *Access.getPointer() << '\n'); |
1193 | Retries.push_back(Elt: {Access, AccessTy}); |
1194 | CanDoAliasSetRT = false; |
1195 | } |
1196 | } |
1197 | } |
1198 | |
1199 | // Note that this function computes CanDoRT and MayNeedRTCheck |
1200 | // independently. For example CanDoRT=false, MayNeedRTCheck=false means that |
1201 | // we have a pointer for which we couldn't find the bounds but we don't |
1202 | // actually need to emit any checks so it does not matter. |
1203 | // |
1204 | // We need runtime checks for this alias set, if there are at least 2 |
1205 | // dependence sets (in which case RunningDepId > 2) or if we need to re-try |
1206 | // any bound checks (because in that case the number of dependence sets is |
1207 | // incomplete). |
1208 | bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty(); |
1209 | |
1210 | // We need to perform run-time alias checks, but some pointers had bounds |
1211 | // that couldn't be checked. |
1212 | if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) { |
1213 | // Reset the CanDoSetRt flag and retry all accesses that have failed. |
1214 | // We know that we need these checks, so we can now be more aggressive |
1215 | // and add further checks if required (overflow checks). |
1216 | CanDoAliasSetRT = true; |
1217 | for (const auto &[Access, AccessTy] : Retries) { |
1218 | if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, |
1219 | DepSetId, TheLoop, RunningDepId, ASId, |
1220 | ShouldCheckWrap, /*Assume=*/true)) { |
1221 | CanDoAliasSetRT = false; |
1222 | UncomputablePtr = Access.getPointer(); |
1223 | break; |
1224 | } |
1225 | } |
1226 | } |
1227 | |
1228 | CanDoRT &= CanDoAliasSetRT; |
1229 | MayNeedRTCheck |= NeedsAliasSetRTCheck; |
1230 | ++ASId; |
1231 | } |
1232 | |
1233 | // If the pointers that we would use for the bounds comparison have different |
1234 | // address spaces, assume the values aren't directly comparable, so we can't |
1235 | // use them for the runtime check. We also have to assume they could |
1236 | // overlap. In the future there should be metadata for whether address spaces |
1237 | // are disjoint. |
1238 | unsigned NumPointers = RtCheck.Pointers.size(); |
1239 | for (unsigned i = 0; i < NumPointers; ++i) { |
1240 | for (unsigned j = i + 1; j < NumPointers; ++j) { |
1241 | // Only need to check pointers between two different dependency sets. |
1242 | if (RtCheck.Pointers[i].DependencySetId == |
1243 | RtCheck.Pointers[j].DependencySetId) |
1244 | continue; |
1245 | // Only need to check pointers in the same alias set. |
1246 | if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId) |
1247 | continue; |
1248 | |
1249 | Value *PtrI = RtCheck.Pointers[i].PointerValue; |
1250 | Value *PtrJ = RtCheck.Pointers[j].PointerValue; |
1251 | |
1252 | unsigned ASi = PtrI->getType()->getPointerAddressSpace(); |
1253 | unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); |
1254 | if (ASi != ASj) { |
1255 | LLVM_DEBUG( |
1256 | dbgs() << "LAA: Runtime check would require comparison between" |
1257 | " different address spaces\n" ); |
1258 | return false; |
1259 | } |
1260 | } |
1261 | } |
1262 | |
1263 | if (MayNeedRTCheck && CanDoRT) |
1264 | RtCheck.generateChecks(DepCands, UseDependencies: IsDepCheckNeeded); |
1265 | |
1266 | LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks() |
1267 | << " pointer comparisons.\n" ); |
1268 | |
// If we can do run-time checks, but there are no checks, no runtime checks
// are needed. This can happen, for example, when all pointers point to the
// same underlying object.
1272 | RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck; |
1273 | |
1274 | bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT; |
1275 | if (!CanDoRTIfNeeded) |
1276 | RtCheck.reset(); |
1277 | return CanDoRTIfNeeded; |
1278 | } |
1279 | |
1280 | void AccessAnalysis::processMemAccesses() { |
// We process the set twice: first we process read-write pointers, then we
// process read-only pointers. This allows us to skip dependence tests for
// read-only pointers.
1284 | |
1285 | LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n" ); |
1286 | LLVM_DEBUG(dbgs() << " AST: " ; AST.dump()); |
1287 | LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n" ); |
1288 | LLVM_DEBUG({ |
1289 | for (const auto &[A, _] : Accesses) |
1290 | dbgs() << "\t" << *A.getPointer() << " (" |
1291 | << (A.getInt() ? "write" |
1292 | : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" |
1293 | : "read" )) |
1294 | << ")\n" ; |
1295 | }); |
1296 | |
1297 | // The AliasSetTracker has nicely partitioned our pointers by metadata |
1298 | // compatibility and potential for underlying-object overlap. As a result, we |
1299 | // only need to check for potential pointer dependencies within each alias |
1300 | // set. |
1301 | for (const auto &AS : AST) { |
// Note that both the alias-set tracker and the alias sets themselves use
// ordered collections internally, so the iteration order here is
// deterministic.
1305 | auto ASPointers = AS.getPointers(); |
1306 | |
1307 | bool SetHasWrite = false; |
1308 | |
// Map of underlying objects to the last access encountered.
1310 | typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap; |
1311 | UnderlyingObjToAccessMap ObjToLastAccess; |
1312 | |
// Set of accesses to check after all writes have been processed.
1314 | PtrAccessMap DeferredAccesses; |
1315 | |
1316 | // Iterate over each alias set twice, once to process read/write pointers, |
1317 | // and then to process read-only pointers. |
1318 | for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { |
1319 | bool UseDeferred = SetIteration > 0; |
1320 | PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses; |
1321 | |
1322 | for (const Value *ConstPtr : ASPointers) { |
1323 | Value *Ptr = const_cast<Value *>(ConstPtr); |
1324 | |
1325 | // For a single memory access in AliasSetTracker, Accesses may contain |
1326 | // both read and write, and they both need to be handled for CheckDeps. |
1327 | for (const auto &[AC, _] : S) { |
1328 | if (AC.getPointer() != Ptr) |
1329 | continue; |
1330 | |
1331 | bool IsWrite = AC.getInt(); |
1332 | |
1333 | // If we're using the deferred access set, then it contains only |
1334 | // reads. |
1335 | bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; |
1336 | if (UseDeferred && !IsReadOnlyPtr) |
1337 | continue; |
1338 | // Otherwise, the pointer must be in the PtrAccessSet, either as a |
1339 | // read or a write. |
1340 | assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || |
1341 | S.count(MemAccessInfo(Ptr, false))) && |
1342 | "Alias-set pointer not in the access set?" ); |
1343 | |
1344 | MemAccessInfo Access(Ptr, IsWrite); |
1345 | DepCands.insert(Data: Access); |
1346 | |
// Memorize read-only pointers for later processing and skip them in
// the first round (they need to be checked after we have seen all
// write pointers). Note: we also mark pointers that are not
// consecutive as "read-only" pointers (so that we check
// "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1352 | if (!UseDeferred && IsReadOnlyPtr) { |
1353 | // We only use the pointer keys, the types vector values don't |
1354 | // matter. |
1355 | DeferredAccesses.insert(KV: {Access, {}}); |
1356 | continue; |
1357 | } |
1358 | |
// If this is a write, check other reads and writes for conflicts. If
// this is a read, only check other writes for conflicts (but only if
// there is no other write to the ptr; this is an optimization to
// catch "a[i] = a[i] + ..." without having to do a dependence check).
1363 | if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { |
1364 | CheckDeps.push_back(Elt: Access); |
1365 | IsRTCheckAnalysisNeeded = true; |
1366 | } |
1367 | |
1368 | if (IsWrite) |
1369 | SetHasWrite = true; |
1370 | |
1371 | // Create sets of pointers connected by a shared alias set and |
1372 | // underlying object. |
1373 | typedef SmallVector<const Value *, 16> ValueVector; |
1374 | ValueVector TempObjects; |
1375 | |
1376 | UnderlyingObjects[Ptr] = {}; |
1377 | SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr]; |
1378 | ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI); |
1379 | LLVM_DEBUG(dbgs() |
1380 | << "Underlying objects for pointer " << *Ptr << "\n" ); |
1381 | for (const Value *UnderlyingObj : UOs) { |
// Null pointers never alias; don't join sets for pointers that have
// "null" in their UnderlyingObjects list.
1384 | if (isa<ConstantPointerNull>(Val: UnderlyingObj) && |
1385 | !NullPointerIsDefined( |
1386 | F: TheLoop->getHeader()->getParent(), |
1387 | AS: UnderlyingObj->getType()->getPointerAddressSpace())) |
1388 | continue; |
1389 | |
1390 | UnderlyingObjToAccessMap::iterator Prev = |
1391 | ObjToLastAccess.find(Val: UnderlyingObj); |
1392 | if (Prev != ObjToLastAccess.end()) |
1393 | DepCands.unionSets(V1: Access, V2: Prev->second); |
1394 | |
1395 | ObjToLastAccess[UnderlyingObj] = Access; |
1396 | LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n" ); |
1397 | } |
1398 | } |
1399 | } |
1400 | } |
1401 | } |
1402 | } |
1403 | |
1404 | /// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping, |
1405 | /// i.e. monotonically increasing/decreasing. |
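///
/// Illustrative sketch of the GEP pattern handled below (hypothetical IR):
///   %mul = mul nsw i64 %iv, 4
///   %gep = getelementptr inbounds i32, ptr %A, i64 %mul
/// where %iv is an NSW AddRec {0,+,1} in \p L. The single non-constant GEP
/// index is an NSW binary operator with a constant operand, so the AddRec
/// feeding it can be used to prove non-wrapping for this specific pointer.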
1406 | static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, |
1407 | PredicatedScalarEvolution &PSE, const Loop *L) { |
1408 | |
1409 | // FIXME: This should probably only return true for NUW. |
1410 | if (AR->getNoWrapFlags(Mask: SCEV::NoWrapMask)) |
1411 | return true; |
1412 | |
1413 | if (PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW)) |
1414 | return true; |
1415 | |
1416 | // Scalar evolution does not propagate the non-wrapping flags to values that |
1417 | // are derived from a non-wrapping induction variable because non-wrapping |
1418 | // could be flow-sensitive. |
1419 | // |
1420 | // Look through the potentially overflowing instruction to try to prove |
1421 | // non-wrapping for the *specific* value of Ptr. |
1422 | |
1423 | // The arithmetic implied by an inbounds GEP can't overflow. |
1424 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
1425 | if (!GEP || !GEP->isInBounds()) |
1426 | return false; |
1427 | |
1428 | // Make sure there is only one non-const index and analyze that. |
1429 | Value *NonConstIndex = nullptr; |
1430 | for (Value *Index : GEP->indices()) |
1431 | if (!isa<ConstantInt>(Val: Index)) { |
1432 | if (NonConstIndex) |
1433 | return false; |
1434 | NonConstIndex = Index; |
1435 | } |
1436 | if (!NonConstIndex) |
1437 | // The recurrence is on the pointer, ignore for now. |
1438 | return false; |
1439 | |
// The index in the GEP is signed. It is non-wrapping if it's derived from an
// NSW AddRec via an NSW operation.
1442 | if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(Val: NonConstIndex)) |
1443 | if (OBO->hasNoSignedWrap() && |
// Assume the other operand is constant so that the AddRec can be
// easily found.
1446 | isa<ConstantInt>(Val: OBO->getOperand(i_nocapture: 1))) { |
1447 | auto *OpScev = PSE.getSCEV(V: OBO->getOperand(i_nocapture: 0)); |
1448 | |
1449 | if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(Val: OpScev)) |
1450 | return OpAR->getLoop() == L && OpAR->getNoWrapFlags(Mask: SCEV::FlagNSW); |
1451 | } |
1452 | |
1453 | return false; |
1454 | } |
1455 | |
1456 | /// Check whether the access through \p Ptr has a constant stride. |
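/// The returned stride is in units of \p AccessTy elements. As an
/// illustrative example (not taken from an actual test): an access A[3*i]
/// with i32 elements whose pointer SCEV is {%A,+,12} yields a stride of
/// 12 / 4 == 3, while a loop-invariant pointer yields a stride of 0.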
1457 | std::optional<int64_t> |
1458 | llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, |
1459 | const Loop *Lp, |
1460 | const DenseMap<Value *, const SCEV *> &StridesMap, |
1461 | bool Assume, bool ShouldCheckWrap) { |
1462 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr); |
1463 | if (PSE.getSE()->isLoopInvariant(S: PtrScev, L: Lp)) |
1464 | return {0}; |
1465 | |
1466 | Type *Ty = Ptr->getType(); |
1467 | assert(Ty->isPointerTy() && "Unexpected non-ptr" ); |
1468 | if (isa<ScalableVectorType>(Val: AccessTy)) { |
1469 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy |
1470 | << "\n" ); |
1471 | return std::nullopt; |
1472 | } |
1473 | |
1474 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev); |
1475 | if (Assume && !AR) |
1476 | AR = PSE.getAsAddRec(V: Ptr); |
1477 | |
1478 | if (!AR) { |
1479 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr |
1480 | << " SCEV: " << *PtrScev << "\n" ); |
1481 | return std::nullopt; |
1482 | } |
1483 | |
1484 | // The access function must stride over the innermost loop. |
1485 | if (Lp != AR->getLoop()) { |
1486 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " |
1487 | << *Ptr << " SCEV: " << *AR << "\n" ); |
1488 | return std::nullopt; |
1489 | } |
1490 | |
1491 | // Check the step is constant. |
1492 | const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE()); |
1493 | |
1494 | // Calculate the pointer stride and check if it is constant. |
1495 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Val: Step); |
1496 | if (!C) { |
1497 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr |
1498 | << " SCEV: " << *AR << "\n" ); |
1499 | return std::nullopt; |
1500 | } |
1501 | |
1502 | auto &DL = Lp->getHeader()->getDataLayout(); |
1503 | TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy); |
1504 | int64_t Size = AllocSize.getFixedValue(); |
1505 | const APInt &APStepVal = C->getAPInt(); |
1506 | |
1507 | // Huge step value - give up. |
1508 | if (APStepVal.getBitWidth() > 64) |
1509 | return std::nullopt; |
1510 | |
1511 | int64_t StepVal = APStepVal.getSExtValue(); |
1512 | |
1513 | // Strided access. |
1514 | int64_t Stride = StepVal / Size; |
1515 | int64_t Rem = StepVal % Size; |
1516 | if (Rem) |
1517 | return std::nullopt; |
1518 | |
1519 | if (!ShouldCheckWrap) |
1520 | return Stride; |
1521 | |
1522 | // The address calculation must not wrap. Otherwise, a dependence could be |
1523 | // inverted. |
1524 | if (isNoWrapAddRec(Ptr, AR, PSE, L: Lp)) |
1525 | return Stride; |
1526 | |
// An inbounds getelementptr that is an AddRec with a unit stride
// cannot wrap by definition. If it did, the result would be poison
// and any memory access dependent on it would be immediate UB
// when executed.
1531 | if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
1532 | GEP && GEP->isInBounds() && (Stride == 1 || Stride == -1)) |
1533 | return Stride; |
1534 | |
// If the null pointer is undefined, then an access sequence which would
// otherwise access it can be assumed not to wrap in the unsigned sense. Note
// that this assumes the object in memory is aligned to the natural alignment.
1538 | unsigned AddrSpace = Ty->getPointerAddressSpace(); |
1539 | if (!NullPointerIsDefined(F: Lp->getHeader()->getParent(), AS: AddrSpace) && |
1540 | (Stride == 1 || Stride == -1)) |
1541 | return Stride; |
1542 | |
1543 | if (Assume) { |
1544 | PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW); |
1545 | LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n" |
1546 | << "LAA: Pointer: " << *Ptr << "\n" |
1547 | << "LAA: SCEV: " << *AR << "\n" |
1548 | << "LAA: Added an overflow assumption\n" ); |
1549 | return Stride; |
1550 | } |
1551 | LLVM_DEBUG( |
1552 | dbgs() << "LAA: Bad stride - Pointer may wrap in the address space " |
1553 | << *Ptr << " SCEV: " << *AR << "\n" ); |
1554 | return std::nullopt; |
1555 | } |
1556 | |
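/// Sketch of the behavior below: compute the distance, in units of
/// \p ElemTyA's store size, between \p PtrA and \p PtrB. Returns 0 for
/// identical pointers, and std::nullopt if the pointers live in different
/// address spaces, if \p CheckType is set and the element types differ, or if
/// the distance cannot be computed (or, with \p StrictCheck, is not a multiple
/// of the element size); otherwise the signed element distance from A to B.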
1557 | std::optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, |
1558 | Type *ElemTyB, Value *PtrB, |
1559 | const DataLayout &DL, |
1560 | ScalarEvolution &SE, bool StrictCheck, |
1561 | bool CheckType) { |
1562 | assert(PtrA && PtrB && "Expected non-nullptr pointers." ); |
1563 | |
// If A and B are the same pointer, the distance between them is trivially 0.
1565 | if (PtrA == PtrB) |
1566 | return 0; |
1567 | |
1568 | // Make sure that the element types are the same if required. |
1569 | if (CheckType && ElemTyA != ElemTyB) |
1570 | return std::nullopt; |
1571 | |
1572 | unsigned ASA = PtrA->getType()->getPointerAddressSpace(); |
1573 | unsigned ASB = PtrB->getType()->getPointerAddressSpace(); |
1574 | |
1575 | // Check that the address spaces match. |
1576 | if (ASA != ASB) |
1577 | return std::nullopt; |
1578 | unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
1579 | |
1580 | APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0); |
1581 | Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetA); |
1582 | Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetB); |
1583 | |
1584 | int Val; |
1585 | if (PtrA1 == PtrB1) { |
1586 | // Retrieve the address space again as pointer stripping now tracks through |
1587 | // `addrspacecast`. |
1588 | ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace(); |
1589 | ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace(); |
1590 | // Check that the address spaces match and that the pointers are valid. |
1591 | if (ASA != ASB) |
1592 | return std::nullopt; |
1593 | |
1594 | IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
1595 | OffsetA = OffsetA.sextOrTrunc(width: IdxWidth); |
1596 | OffsetB = OffsetB.sextOrTrunc(width: IdxWidth); |
1597 | |
1598 | OffsetB -= OffsetA; |
1599 | Val = OffsetB.getSExtValue(); |
1600 | } else { |
1601 | // Otherwise compute the distance with SCEV between the base pointers. |
1602 | const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA); |
1603 | const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB); |
1604 | const auto *Diff = |
1605 | dyn_cast<SCEVConstant>(Val: SE.getMinusSCEV(LHS: PtrSCEVB, RHS: PtrSCEVA)); |
1606 | if (!Diff) |
1607 | return std::nullopt; |
1608 | Val = Diff->getAPInt().getSExtValue(); |
1609 | } |
1610 | int Size = DL.getTypeStoreSize(Ty: ElemTyA); |
1611 | int Dist = Val / Size; |
1612 | |
// Ensure that the calculated distance matches the type-based one after
// stripping all bitcasts from the provided pointers.
1615 | if (!StrictCheck || Dist * Size == Val) |
1616 | return Dist; |
1617 | return std::nullopt; |
1618 | } |
1619 | |
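/// Sketch of the behavior below: compute, for each pointer in \p VL, its
/// element offset relative to VL[0] and record the permutation that sorts the
/// pointers by offset in \p SortedIndices (left empty if they are already in
/// consecutive order). Returns false if any offset cannot be computed or two
/// pointers share the same offset.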
1620 | bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, |
1621 | const DataLayout &DL, ScalarEvolution &SE, |
1622 | SmallVectorImpl<unsigned> &SortedIndices) { |
1623 | assert(llvm::all_of( |
1624 | VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && |
1625 | "Expected list of pointer operands." ); |
// Walk over the pointers, and map each of them to an offset relative to the
// first pointer in the array.
1628 | Value *Ptr0 = VL[0]; |
1629 | |
1630 | using DistOrdPair = std::pair<int64_t, int>; |
1631 | auto Compare = llvm::less_first(); |
1632 | std::set<DistOrdPair, decltype(Compare)> Offsets(Compare); |
1633 | Offsets.emplace(args: 0, args: 0); |
1634 | bool IsConsecutive = true; |
1635 | for (auto [Idx, Ptr] : drop_begin(RangeOrContainer: enumerate(First&: VL))) { |
1636 | std::optional<int> Diff = getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE, |
1637 | /*StrictCheck=*/true); |
1638 | if (!Diff) |
1639 | return false; |
1640 | |
// Bail out if a pointer with the same offset has already been seen.
1642 | int64_t Offset = *Diff; |
1643 | auto [It, IsInserted] = Offsets.emplace(args&: Offset, args&: Idx); |
1644 | if (!IsInserted) |
1645 | return false; |
// The order is consecutive only if each newly inserted element is the
// largest offset seen so far, i.e. it was inserted at the end of the set.
1647 | IsConsecutive &= std::next(x: It) == Offsets.end(); |
1648 | } |
1649 | SortedIndices.clear(); |
1650 | if (!IsConsecutive) { |
// Fill the SortedIndices array only if the pointers are not already in
// consecutive order.
1652 | SortedIndices.resize(N: VL.size()); |
1653 | for (auto [Idx, Off] : enumerate(First&: Offsets)) |
1654 | SortedIndices[Idx] = Off.second; |
1655 | } |
1656 | return true; |
1657 | } |
1658 | |
1659 | /// Returns true if the memory operations \p A and \p B are consecutive. |
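/// For example (illustrative), two i32 loads from A[i] and A[i+1] are
/// consecutive, while loads from A[i] and A[i+2] (distance 2) or from A[i+1]
/// and A[i] (distance -1) are not.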
1660 | bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, |
1661 | ScalarEvolution &SE, bool CheckType) { |
1662 | Value *PtrA = getLoadStorePointerOperand(V: A); |
1663 | Value *PtrB = getLoadStorePointerOperand(V: B); |
1664 | if (!PtrA || !PtrB) |
1665 | return false; |
1666 | Type *ElemTyA = getLoadStoreType(I: A); |
1667 | Type *ElemTyB = getLoadStoreType(I: B); |
1668 | std::optional<int> Diff = |
1669 | getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE, |
1670 | /*StrictCheck=*/true, CheckType); |
1671 | return Diff && *Diff == 1; |
1672 | } |
1673 | |
1674 | void MemoryDepChecker::addAccess(StoreInst *SI) { |
1675 | visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
1676 | AddPointer: [this, SI](Value *Ptr) { |
1677 | Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx); |
1678 | InstMap.push_back(Elt: SI); |
1679 | ++AccessIdx; |
1680 | }); |
1681 | } |
1682 | |
1683 | void MemoryDepChecker::addAccess(LoadInst *LI) { |
1684 | visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
1685 | AddPointer: [this, LI](Value *Ptr) { |
1686 | Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx); |
1687 | InstMap.push_back(Elt: LI); |
1688 | ++AccessIdx; |
1689 | }); |
1690 | } |
1691 | |
1692 | MemoryDepChecker::VectorizationSafetyStatus |
1693 | MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) { |
1694 | switch (Type) { |
1695 | case NoDep: |
1696 | case Forward: |
1697 | case BackwardVectorizable: |
1698 | return VectorizationSafetyStatus::Safe; |
1699 | |
1700 | case Unknown: |
1701 | return VectorizationSafetyStatus::PossiblySafeWithRtChecks; |
1702 | case ForwardButPreventsForwarding: |
1703 | case Backward: |
1704 | case BackwardVectorizableButPreventsForwarding: |
1705 | case IndirectUnsafe: |
1706 | return VectorizationSafetyStatus::Unsafe; |
1707 | } |
1708 | llvm_unreachable("unexpected DepType!" ); |
1709 | } |
1710 | |
1711 | bool MemoryDepChecker::Dependence::isBackward() const { |
1712 | switch (Type) { |
1713 | case NoDep: |
1714 | case Forward: |
1715 | case ForwardButPreventsForwarding: |
1716 | case Unknown: |
1717 | case IndirectUnsafe: |
1718 | return false; |
1719 | |
1720 | case BackwardVectorizable: |
1721 | case Backward: |
1722 | case BackwardVectorizableButPreventsForwarding: |
1723 | return true; |
1724 | } |
1725 | llvm_unreachable("unexpected DepType!" ); |
1726 | } |
1727 | |
1728 | bool MemoryDepChecker::Dependence::isPossiblyBackward() const { |
1729 | return isBackward() || Type == Unknown || Type == IndirectUnsafe; |
1730 | } |
1731 | |
1732 | bool MemoryDepChecker::Dependence::isForward() const { |
1733 | switch (Type) { |
1734 | case Forward: |
1735 | case ForwardButPreventsForwarding: |
1736 | return true; |
1737 | |
1738 | case NoDep: |
1739 | case Unknown: |
1740 | case BackwardVectorizable: |
1741 | case Backward: |
1742 | case BackwardVectorizableButPreventsForwarding: |
1743 | case IndirectUnsafe: |
1744 | return false; |
1745 | } |
1746 | llvm_unreachable("unexpected DepType!" ); |
1747 | } |
1748 | |
1749 | bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance, |
1750 | uint64_t TypeByteSize) { |
// If loads occur at a distance that is not a multiple of a feasible vector
// factor, store-load forwarding does not take place.
// Positive dependences can cause trouble because vectorizing them might
// prevent store-load forwarding, making the vectorized code run a lot slower.
//   a[i] = a[i-3] ^ a[i-8];
// The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
// hence, on a typical architecture, store-load forwarding does not take
// place. Vectorizing in such cases does not make sense.
1759 | // Store-load forwarding distance. |
1760 | |
1761 | // After this many iterations store-to-load forwarding conflicts should not |
1762 | // cause any slowdowns. |
1763 | const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize; |
1764 | // Maximum vector factor. |
1765 | uint64_t MaxVFWithoutSLForwardIssues = std::min( |
1766 | a: VectorizerParams::MaxVectorWidth * TypeByteSize, b: MinDepDistBytes); |
1767 | |
1768 | // Compute the smallest VF at which the store and load would be misaligned. |
1769 | for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues; |
1770 | VF *= 2) { |
// If the number of vector iterations between the store and the load is
// small, we could incur conflicts.
1773 | if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) { |
1774 | MaxVFWithoutSLForwardIssues = (VF >> 1); |
1775 | break; |
1776 | } |
1777 | } |
1778 | |
1779 | if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) { |
1780 | LLVM_DEBUG( |
1781 | dbgs() << "LAA: Distance " << Distance |
1782 | << " that could cause a store-load forwarding conflict\n" ); |
1783 | return true; |
1784 | } |
1785 | |
1786 | if (MaxVFWithoutSLForwardIssues < MinDepDistBytes && |
1787 | MaxVFWithoutSLForwardIssues != |
1788 | VectorizerParams::MaxVectorWidth * TypeByteSize) |
1789 | MinDepDistBytes = MaxVFWithoutSLForwardIssues; |
1790 | return false; |
1791 | } |
1792 | |
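// Record the most pessimistic safety status seen so far; the enum values are
// assumed to be ordered from safe to unsafe, so keeping the maximum keeps the
// worst result.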
1793 | void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) { |
1794 | if (Status < S) |
1795 | Status = S; |
1796 | } |
1797 | |
/// Given a dependence-distance \p Dist between two
/// memory accesses that have strides in the same direction, whose maximum
/// absolute stride is given in \p MaxStride, and that have the same type size
/// \p TypeByteSize, in a loop whose maximum backedge taken count is
/// \p MaxBTC, check if it is possible to prove statically that the dependence
/// distance is larger than the range that the accesses will travel through
/// the execution of the loop. If so, return true; false otherwise. This is
/// useful for example in loops such as the following (PR31098):
1806 | /// for (i = 0; i < D; ++i) { |
1807 | /// = out[i]; |
1808 | /// out[i+D] = |
1809 | /// } |
1810 | static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, |
1811 | const SCEV &MaxBTC, const SCEV &Dist, |
1812 | uint64_t MaxStride, |
1813 | uint64_t TypeByteSize) { |
1814 | |
1815 | // If we can prove that |
1816 | // (**) |Dist| > MaxBTC * Step |
1817 | // where Step is the absolute stride of the memory accesses in bytes, |
1818 | // then there is no dependence. |
1819 | // |
1820 | // Rationale: |
1821 | // We basically want to check if the absolute distance (|Dist/Step|) |
1822 | // is >= the loop iteration count (or > MaxBTC). |
1823 | // This is equivalent to the Strong SIV Test (Practical Dependence Testing, |
1824 | // Section 4.2.1); Note, that for vectorization it is sufficient to prove |
1825 | // that the dependence distance is >= VF; This is checked elsewhere. |
1826 | // But in some cases we can prune dependence distances early, and |
1827 | // even before selecting the VF, and without a runtime test, by comparing |
1828 | // the distance against the loop iteration count. Since the vectorized code |
1829 | // will be executed only if LoopCount >= VF, proving distance >= LoopCount |
1830 | // also guarantees that distance >= VF. |
1831 | // |
1832 | const uint64_t ByteStride = MaxStride * TypeByteSize; |
1833 | const SCEV *Step = SE.getConstant(Ty: MaxBTC.getType(), V: ByteStride); |
1834 | const SCEV *Product = SE.getMulExpr(LHS: &MaxBTC, RHS: Step); |
1835 | |
1836 | const SCEV *CastedDist = &Dist; |
1837 | const SCEV *CastedProduct = Product; |
1838 | uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType()); |
1839 | uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType()); |
1840 | |
1841 | // The dependence distance can be positive/negative, so we sign extend Dist; |
1842 | // The multiplication of the absolute stride in bytes and the |
1843 | // backedgeTakenCount is non-negative, so we zero extend Product. |
1844 | if (DistTypeSizeBits > ProductTypeSizeBits) |
1845 | CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType()); |
1846 | else |
1847 | CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType()); |
1848 | |
1849 | // Is Dist - (MaxBTC * Step) > 0 ? |
1850 | // (If so, then we have proven (**) because |Dist| >= Dist) |
1851 | const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct); |
1852 | if (SE.isKnownPositive(S: Minus)) |
1853 | return true; |
1854 | |
1855 | // Second try: Is -Dist - (MaxBTC * Step) > 0 ? |
1856 | // (If so, then we have proven (**) because |Dist| >= -1*Dist) |
1857 | const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist); |
1858 | Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct); |
1859 | return SE.isKnownPositive(S: Minus); |
1860 | } |
1861 | |
/// Check the dependence for two accesses with the same stride \p Stride.
/// \p Distance is the positive distance and \p TypeByteSize is the type size
/// in bytes.
1865 | /// |
1866 | /// \returns true if they are independent. |
1867 | static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, |
1868 | uint64_t TypeByteSize) { |
1869 | assert(Stride > 1 && "The stride must be greater than 1" ); |
1870 | assert(TypeByteSize > 0 && "The type size in byte must be non-zero" ); |
1871 | assert(Distance > 0 && "The distance must be non-zero" ); |
1872 | |
// Skip if the distance is not a multiple of the type byte size.
1874 | if (Distance % TypeByteSize) |
1875 | return false; |
1876 | |
1877 | uint64_t ScaledDist = Distance / TypeByteSize; |
1878 | |
// No dependence if the scaled distance is not a multiple of the stride.
1880 | // E.g. |
1881 | // for (i = 0; i < 1024 ; i += 4) |
1882 | // A[i+2] = A[i] + 1; |
1883 | // |
1884 | // Two accesses in memory (scaled distance is 2, stride is 4): |
1885 | // | A[0] | | | | A[4] | | | | |
1886 | // | | | A[2] | | | | A[6] | | |
1887 | // |
1888 | // E.g. |
1889 | // for (i = 0; i < 1024 ; i += 3) |
1890 | // A[i+4] = A[i] + 1; |
1891 | // |
1892 | // Two accesses in memory (scaled distance is 4, stride is 3): |
1893 | // | A[0] | | | A[3] | | | A[6] | | | |
1894 | // | | | | | A[4] | | | A[7] | | |
1895 | return ScaledDist % Stride; |
1896 | } |
1897 | |
1898 | std::variant<MemoryDepChecker::Dependence::DepType, |
1899 | MemoryDepChecker::DepDistanceStrideAndSizeInfo> |
1900 | MemoryDepChecker::getDependenceDistanceStrideAndSize( |
1901 | const AccessAnalysis::MemAccessInfo &A, Instruction *AInst, |
1902 | const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) { |
1903 | const auto &DL = InnermostLoop->getHeader()->getDataLayout(); |
1904 | auto &SE = *PSE.getSE(); |
1905 | auto [APtr, AIsWrite] = A; |
1906 | auto [BPtr, BIsWrite] = B; |
1907 | |
1908 | // Two reads are independent. |
1909 | if (!AIsWrite && !BIsWrite) |
1910 | return MemoryDepChecker::Dependence::NoDep; |
1911 | |
1912 | Type *ATy = getLoadStoreType(I: AInst); |
1913 | Type *BTy = getLoadStoreType(I: BInst); |
1914 | |
1915 | // We cannot check pointers in different address spaces. |
1916 | if (APtr->getType()->getPointerAddressSpace() != |
1917 | BPtr->getType()->getPointerAddressSpace()) |
1918 | return MemoryDepChecker::Dependence::Unknown; |
1919 | |
1920 | std::optional<int64_t> StrideAPtr = |
1921 | getPtrStride(PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true); |
1922 | std::optional<int64_t> StrideBPtr = |
1923 | getPtrStride(PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true); |
1924 | |
1925 | const SCEV *Src = PSE.getSCEV(V: APtr); |
1926 | const SCEV *Sink = PSE.getSCEV(V: BPtr); |
1927 | |
// If the induction step is negative, we have to invert the source and sink of
// the dependence when measuring the distance between them. We should not swap
// AIsWrite with BIsWrite, as their uses expect them in program order.
1931 | if (StrideAPtr && *StrideAPtr < 0) { |
1932 | std::swap(a&: Src, b&: Sink); |
1933 | std::swap(a&: AInst, b&: BInst); |
1934 | std::swap(lhs&: StrideAPtr, rhs&: StrideBPtr); |
1935 | } |
1936 | |
1937 | const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src); |
1938 | |
1939 | LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink |
1940 | << "\n" ); |
1941 | LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst |
1942 | << ": " << *Dist << "\n" ); |
1943 | |
1944 | // Check if we can prove that Sink only accesses memory after Src's end or |
1945 | // vice versa. At the moment this is limited to cases where either source or |
1946 | // sink are loop invariant to avoid compile-time increases. This is not |
1947 | // required for correctness. |
1948 | if (SE.isLoopInvariant(S: Src, L: InnermostLoop) || |
1949 | SE.isLoopInvariant(S: Sink, L: InnermostLoop)) { |
1950 | const auto &[SrcStart, SrcEnd] = |
1951 | getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Src, AccessTy: ATy, PSE, PointerBounds); |
1952 | const auto &[SinkStart, SinkEnd] = |
1953 | getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Sink, AccessTy: BTy, PSE, PointerBounds); |
1954 | if (!isa<SCEVCouldNotCompute>(Val: SrcStart) && |
1955 | !isa<SCEVCouldNotCompute>(Val: SrcEnd) && |
1956 | !isa<SCEVCouldNotCompute>(Val: SinkStart) && |
1957 | !isa<SCEVCouldNotCompute>(Val: SinkEnd)) { |
1958 | if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SrcEnd, RHS: SinkStart)) |
1959 | return MemoryDepChecker::Dependence::NoDep; |
1960 | if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SinkEnd, RHS: SrcStart)) |
1961 | return MemoryDepChecker::Dependence::NoDep; |
1962 | } |
1963 | } |
1964 | |
1965 | // Need accesses with constant strides and the same direction for further |
1966 | // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and |
1967 | // similar code or pointer arithmetic that could wrap in the address space. |
1968 | |
1969 | // If either Src or Sink are not strided (i.e. not a non-wrapping AddRec) and |
1970 | // not loop-invariant (stride will be 0 in that case), we cannot analyze the |
1971 | // dependence further and also cannot generate runtime checks. |
1972 | if (!StrideAPtr || !StrideBPtr) { |
1973 | LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n" ); |
1974 | return MemoryDepChecker::Dependence::IndirectUnsafe; |
1975 | } |
1976 | |
1977 | int64_t StrideAPtrInt = *StrideAPtr; |
1978 | int64_t StrideBPtrInt = *StrideBPtr; |
1979 | LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt |
1980 | << " Sink induction step: " << StrideBPtrInt << "\n" ); |
// At least one of Src and Sink is loop invariant and the other is strided or
// invariant. We can generate a runtime check to disambiguate the accesses.
1983 | if (StrideAPtrInt == 0 || StrideBPtrInt == 0) |
1984 | return MemoryDepChecker::Dependence::Unknown; |
1985 | |
1986 | // Both Src and Sink have a constant stride, check if they are in the same |
1987 | // direction. |
1988 | if ((StrideAPtrInt > 0 && StrideBPtrInt < 0) || |
1989 | (StrideAPtrInt < 0 && StrideBPtrInt > 0)) { |
1990 | LLVM_DEBUG( |
1991 | dbgs() << "Pointer access with strides in different directions\n" ); |
1992 | return MemoryDepChecker::Dependence::Unknown; |
1993 | } |
1994 | |
1995 | uint64_t TypeByteSize = DL.getTypeAllocSize(Ty: ATy); |
1996 | bool HasSameSize = |
1997 | DL.getTypeStoreSizeInBits(Ty: ATy) == DL.getTypeStoreSizeInBits(Ty: BTy); |
1998 | if (!HasSameSize) |
1999 | TypeByteSize = 0; |
2000 | return DepDistanceStrideAndSizeInfo(Dist, std::abs(i: StrideAPtrInt), |
2001 | std::abs(i: StrideBPtrInt), TypeByteSize, |
2002 | AIsWrite, BIsWrite); |
2003 | } |
2004 | |
2005 | MemoryDepChecker::Dependence::DepType |
2006 | MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, |
2007 | const MemAccessInfo &B, unsigned BIdx) { |
2008 | assert(AIdx < BIdx && "Must pass arguments in program order" ); |
2009 | |
// Get the dependence distance, strides, type size and whether each access
// writes, for the dependence between A and B.
2012 | auto Res = |
2013 | getDependenceDistanceStrideAndSize(A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx]); |
2014 | if (std::holds_alternative<Dependence::DepType>(v: Res)) |
2015 | return std::get<Dependence::DepType>(v&: Res); |
2016 | |
2017 | auto &[Dist, StrideA, StrideB, TypeByteSize, AIsWrite, BIsWrite] = |
2018 | std::get<DepDistanceStrideAndSizeInfo>(v&: Res); |
2019 | bool HasSameSize = TypeByteSize > 0; |
2020 | |
2021 | std::optional<uint64_t> CommonStride = |
2022 | StrideA == StrideB ? std::make_optional(t&: StrideA) : std::nullopt; |
2023 | if (isa<SCEVCouldNotCompute>(Val: Dist)) { |
2024 | // TODO: Relax requirement that there is a common stride to retry with |
2025 | // non-constant distance dependencies. |
2026 | FoundNonConstantDistanceDependence |= CommonStride.has_value(); |
2027 | LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n" ); |
2028 | return Dependence::Unknown; |
2029 | } |
2030 | |
2031 | ScalarEvolution &SE = *PSE.getSE(); |
2032 | auto &DL = InnermostLoop->getHeader()->getDataLayout(); |
2033 | uint64_t MaxStride = std::max(a: StrideA, b: StrideB); |
2034 | |
// If the distance between the accesses is larger than their maximum absolute
// stride multiplied by the symbolic maximum backedge taken count (which is an
// upper bound of the number of iterations), the accesses are independent,
// i.e. they are far enough apart that they cannot access the same location
// across all loop iterations.
2040 | if (HasSameSize && isSafeDependenceDistance( |
2041 | DL, SE, MaxBTC: *(PSE.getSymbolicMaxBackedgeTakenCount()), |
2042 | Dist: *Dist, MaxStride, TypeByteSize)) |
2043 | return Dependence::NoDep; |
2044 | |
2045 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Val: Dist); |
2046 | |
2047 | // Attempt to prove strided accesses independent. |
2048 | if (C) { |
2049 | const APInt &Val = C->getAPInt(); |
2050 | int64_t Distance = Val.getSExtValue(); |
2051 | |
2052 | // If the distance between accesses and their strides are known constants, |
2053 | // check whether the accesses interlace each other. |
2054 | if (std::abs(i: Distance) > 0 && CommonStride && *CommonStride > 1 && |
2055 | HasSameSize && |
2056 | areStridedAccessesIndependent(Distance: std::abs(i: Distance), Stride: *CommonStride, |
2057 | TypeByteSize)) { |
2058 | LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n" ); |
2059 | return Dependence::NoDep; |
2060 | } |
2061 | } else |
2062 | Dist = SE.applyLoopGuards(Expr: Dist, L: InnermostLoop); |
2063 | |
2064 | // Negative distances are not plausible dependencies. |
2065 | if (SE.isKnownNonPositive(S: Dist)) { |
2066 | if (SE.isKnownNonNegative(S: Dist)) { |
2067 | if (HasSameSize) { |
2068 | // Write to the same location with the same size. |
2069 | return Dependence::Forward; |
2070 | } |
2071 | LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but " |
2072 | "different type sizes\n" ); |
2073 | return Dependence::Unknown; |
2074 | } |
2075 | |
2076 | bool IsTrueDataDependence = (AIsWrite && !BIsWrite); |
2077 | // Check if the first access writes to a location that is read in a later |
2078 | // iteration, where the distance between them is not a multiple of a vector |
2079 | // factor and relatively small. |
2080 | // |
2081 | // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to |
2082 | // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a |
2083 | // forward dependency will allow vectorization using any width. |
2084 | |
2085 | if (IsTrueDataDependence && EnableForwardingConflictDetection) { |
2086 | if (!C) { |
2087 | // TODO: FoundNonConstantDistanceDependence is used as a necessary |
2088 | // condition to consider retrying with runtime checks. Historically, we |
2089 | // did not set it when strides were different but there is no inherent |
2090 | // reason to. |
2091 | FoundNonConstantDistanceDependence |= CommonStride.has_value(); |
2092 | return Dependence::Unknown; |
2093 | } |
2094 | if (!HasSameSize || |
2095 | couldPreventStoreLoadForward(Distance: C->getAPInt().abs().getZExtValue(), |
2096 | TypeByteSize)) { |
2097 | LLVM_DEBUG( |
2098 | dbgs() << "LAA: Forward but may prevent st->ld forwarding\n" ); |
2099 | return Dependence::ForwardButPreventsForwarding; |
2100 | } |
2101 | } |
2102 | |
2103 | LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n" ); |
2104 | return Dependence::Forward; |
2105 | } |
2106 | |
2107 | int64_t MinDistance = SE.getSignedRangeMin(S: Dist).getSExtValue(); |
2108 | // Below we only handle strictly positive distances. |
2109 | if (MinDistance <= 0) { |
2110 | FoundNonConstantDistanceDependence |= CommonStride.has_value(); |
2111 | return Dependence::Unknown; |
2112 | } |
2113 | |
2114 | if (!isa<SCEVConstant>(Val: Dist)) { |
2115 | // Previously this case would be treated as Unknown, possibly setting |
2116 | // FoundNonConstantDistanceDependence to force re-trying with runtime |
2117 | // checks. Until the TODO below is addressed, set it here to preserve |
2118 | // original behavior w.r.t. re-trying with runtime checks. |
2119 | // TODO: FoundNonConstantDistanceDependence is used as a necessary |
2120 | // condition to consider retrying with runtime checks. Historically, we |
2121 | // did not set it when strides were different but there is no inherent |
2122 | // reason to. |
2123 | FoundNonConstantDistanceDependence |= CommonStride.has_value(); |
2124 | } |
2125 | |
2126 | if (!HasSameSize) { |
2127 | LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with " |
2128 | "different type sizes\n" ); |
2129 | return Dependence::Unknown; |
2130 | } |
2131 | |
2132 | if (!CommonStride) |
2133 | return Dependence::Unknown; |
2134 | |
2135 | // Bail out early if passed-in parameters make vectorization not feasible. |
2136 | unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? |
2137 | VectorizerParams::VectorizationFactor : 1); |
2138 | unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? |
2139 | VectorizerParams::VectorizationInterleave : 1); |
2140 | // The minimum number of iterations for a vectorized/unrolled version. |
2141 | unsigned MinNumIter = std::max(a: ForcedFactor * ForcedUnroll, b: 2U); |
2142 | |
// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectorized/unrolled version. Vectorizing one iteration in
// front needs TypeByteSize * Stride. Vectorizing the last iteration needs
// TypeByteSize (no need to add the last gap distance).
2147 | // |
2148 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
2149 | // foo(int *A) { |
2150 | // int *B = (int *)((char *)A + 14); |
2151 | // for (i = 0 ; i < 1024 ; i += 2) |
2152 | // B[i] = A[i] + 1; |
2153 | // } |
2154 | // |
2155 | // Two accesses in memory (stride is 2): |
2156 | // | A[0] | | A[2] | | A[4] | | A[6] | | |
2157 | // | B[0] | | B[2] | | B[4] | |
2158 | // |
// The minimum distance needed for the iterations before the last one:
// 4 * 2 * (MinNumIter - 1). The minimum distance needed for the last
// iteration: 4. So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
2162 | // |
// If MinNumIter is 2, it is vectorizable as the minimum distance needed is
// 12, which is less than the distance (14).
//
// If MinNumIter is 4 (say, if a user forces the vectorization factor to be
// 4), the minimum distance needed is 28, which is greater than the distance
// (14). It is not safe to vectorize in that case.
2169 | |
2170 | // We know that Dist is positive, but it may not be constant. Use the signed |
2171 | // minimum for computations below, as this ensures we compute the closest |
2172 | // possible dependence distance. |
2173 | uint64_t MinDistanceNeeded = |
2174 | TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize; |
2175 | if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) { |
2176 | if (!isa<SCEVConstant>(Val: Dist)) { |
2177 | // For non-constant distances, we checked the lower bound of the |
2178 | // dependence distance and the distance may be larger at runtime (and safe |
2179 | // for vectorization). Classify it as Unknown, so we re-try with runtime |
2180 | // checks. |
2181 | return Dependence::Unknown; |
2182 | } |
2183 | LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance " |
2184 | << MinDistance << '\n'); |
2185 | return Dependence::Backward; |
2186 | } |
2187 | |
// Unsafe if the minimum distance needed is greater than the smallest
// dependence distance seen so far.
2190 | if (MinDistanceNeeded > MinDepDistBytes) { |
2191 | LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least " |
2192 | << MinDistanceNeeded << " size in bytes\n" ); |
2193 | return Dependence::Backward; |
2194 | } |
2195 | |
2196 | // Positive distance bigger than max vectorization factor. |
2197 | // FIXME: Should use max factor instead of max distance in bytes, which could |
2198 | // not handle different types. |
2199 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
2200 | // void foo (int *A, char *B) { |
2201 | // for (unsigned i = 0; i < 1024; i++) { |
2202 | // A[i+2] = A[i] + 1; |
2203 | // B[i+2] = B[i] + 1; |
2204 | // } |
2205 | // } |
2206 | // |
// This case is currently unsafe according to the max safe distance. If we
// analyze the two accesses on array B first, the max safe dependence distance
// is 2. When we then analyze the accesses on array A, the minimum distance
// needed is 8, which is greater than 2, so vectorization is forbidden. But
// actually both A and B could be vectorized with a vectorization factor of 2.
2212 | MinDepDistBytes = |
2213 | std::min(a: static_cast<uint64_t>(MinDistance), b: MinDepDistBytes); |
2214 | |
2215 | bool IsTrueDataDependence = (!AIsWrite && BIsWrite); |
2216 | uint64_t MinDepDistBytesOld = MinDepDistBytes; |
2217 | if (IsTrueDataDependence && EnableForwardingConflictDetection && |
2218 | isa<SCEVConstant>(Val: Dist) && |
2219 | couldPreventStoreLoadForward(Distance: MinDistance, TypeByteSize)) { |
2220 | // Sanity check that we didn't update MinDepDistBytes when calling |
2221 | // couldPreventStoreLoadForward |
2222 | assert(MinDepDistBytes == MinDepDistBytesOld && |
2223 | "An update to MinDepDistBytes requires an update to " |
2224 | "MaxSafeVectorWidthInBits" ); |
2225 | (void)MinDepDistBytesOld; |
2226 | return Dependence::BackwardVectorizableButPreventsForwarding; |
2227 | } |
2228 | |
2229 | // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits |
2230 | // since there is a backwards dependency. |
2231 | uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride); |
2232 | LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance |
2233 | << " with max VF = " << MaxVF << '\n'); |
2234 | |
2235 | uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; |
2236 | if (!isa<SCEVConstant>(Val: Dist) && MaxVFInBits < MaxTargetVectorWidthInBits) { |
2237 | // For non-constant distances, we checked the lower bound of the dependence |
2238 | // distance and the distance may be larger at runtime (and safe for |
2239 | // vectorization). Classify it as Unknown, so we re-try with runtime checks. |
2240 | return Dependence::Unknown; |
2241 | } |
2242 | |
2243 | MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits); |
2244 | return Dependence::BackwardVectorizable; |
2245 | } |
2246 | |
2247 | bool MemoryDepChecker::areDepsSafe(const DepCandidates &AccessSets, |
2248 | const MemAccessInfoList &CheckDeps) { |
2249 | |
2250 | MinDepDistBytes = -1; |
2251 | SmallPtrSet<MemAccessInfo, 8> Visited; |
2252 | for (MemAccessInfo CurAccess : CheckDeps) { |
2253 | if (Visited.count(Ptr: CurAccess)) |
2254 | continue; |
2255 | |
2256 | // Get the relevant memory access set. |
2257 | EquivalenceClasses<MemAccessInfo>::iterator I = |
2258 | AccessSets.findValue(V: AccessSets.getLeaderValue(V: CurAccess)); |
2259 | |
2260 | // Check accesses within this set. |
2261 | EquivalenceClasses<MemAccessInfo>::member_iterator AI = |
2262 | AccessSets.member_begin(I); |
2263 | EquivalenceClasses<MemAccessInfo>::member_iterator AE = |
2264 | AccessSets.member_end(); |
2265 | |
2266 | // Check every access pair. |
2267 | while (AI != AE) { |
2268 | Visited.insert(Ptr: *AI); |
2269 | bool AIIsWrite = AI->getInt(); |
// Check loads only against the next equivalence class, but check stores also
// against other stores in the same equivalence class, i.e. to the same
// address.
2272 | EquivalenceClasses<MemAccessInfo>::member_iterator OI = |
2273 | (AIIsWrite ? AI : std::next(x: AI)); |
2274 | while (OI != AE) { |
2275 | // Check every accessing instruction pair in program order. |
2276 | for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(), |
2277 | I1E = Accesses[*AI].end(); I1 != I1E; ++I1) |
// Scan all accesses of another equivalence class, but only the next
// accesses of the same equivalence class.
2280 | for (std::vector<unsigned>::iterator |
2281 | I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()), |
2282 | I2E = (OI == AI ? I1E : Accesses[*OI].end()); |
2283 | I2 != I2E; ++I2) { |
2284 | auto A = std::make_pair(x: &*AI, y&: *I1); |
2285 | auto B = std::make_pair(x: &*OI, y&: *I2); |
2286 | |
2287 | assert(*I1 != *I2); |
2288 | if (*I1 > *I2) |
2289 | std::swap(x&: A, y&: B); |
2290 | |
2291 | Dependence::DepType Type = |
2292 | isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second); |
2293 | mergeInStatus(S: Dependence::isSafeForVectorization(Type)); |
2294 | |
2295 | // Gather dependences unless we accumulated MaxDependences |
2296 | // dependences. In that case return as soon as we find the first |
2297 | // unsafe dependence. This puts a limit on this quadratic |
2298 | // algorithm. |
2299 | if (RecordDependences) { |
2300 | if (Type != Dependence::NoDep) |
2301 | Dependences.push_back(Elt: Dependence(A.second, B.second, Type)); |
2302 | |
2303 | if (Dependences.size() >= MaxDependences) { |
2304 | RecordDependences = false; |
2305 | Dependences.clear(); |
2306 | LLVM_DEBUG(dbgs() |
2307 | << "Too many dependences, stopped recording\n" ); |
2308 | } |
2309 | } |
2310 | if (!RecordDependences && !isSafeForVectorization()) |
2311 | return false; |
2312 | } |
2313 | ++OI; |
2314 | } |
2315 | ++AI; |
2316 | } |
2317 | } |
2318 | |
2319 | LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n" ); |
2320 | return isSafeForVectorization(); |
2321 | } |
2322 | |
2323 | SmallVector<Instruction *, 4> |
2324 | MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const { |
2325 | MemAccessInfo Access(Ptr, IsWrite); |
2326 | auto &IndexVector = Accesses.find(Val: Access)->second; |
2327 | |
2328 | SmallVector<Instruction *, 4> Insts; |
2329 | transform(Range: IndexVector, |
2330 | d_first: std::back_inserter(x&: Insts), |
2331 | F: [&](unsigned Idx) { return this->InstMap[Idx]; }); |
2332 | return Insts; |
2333 | } |
2334 | |
2335 | const char *MemoryDepChecker::Dependence::DepName[] = { |
2336 | "NoDep" , |
2337 | "Unknown" , |
2338 | "IndirectUnsafe" , |
2339 | "Forward" , |
2340 | "ForwardButPreventsForwarding" , |
2341 | "Backward" , |
2342 | "BackwardVectorizable" , |
2343 | "BackwardVectorizableButPreventsForwarding" }; |
2344 | |
2345 | void MemoryDepChecker::Dependence::print( |
2346 | raw_ostream &OS, unsigned Depth, |
2347 | const SmallVectorImpl<Instruction *> &Instrs) const { |
2348 | OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n" ; |
2349 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n" ; |
2350 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n" ; |
2351 | } |
2352 | |
2353 | bool LoopAccessInfo::canAnalyzeLoop() { |
2354 | // We need to have a loop header. |
2355 | LLVM_DEBUG(dbgs() << "\nLAA: Checking a loop in '" |
2356 | << TheLoop->getHeader()->getParent()->getName() << "' from " |
2357 | << TheLoop->getLocStr() << "\n" ); |
2358 | |
2359 | // We can only analyze innermost loops. |
2360 | if (!TheLoop->isInnermost()) { |
2361 | LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n" ); |
2362 | recordAnalysis(RemarkName: "NotInnerMostLoop" ) << "loop is not the innermost loop" ; |
2363 | return false; |
2364 | } |
2365 | |
2366 | // We must have a single backedge. |
2367 | if (TheLoop->getNumBackEdges() != 1) { |
2368 | LLVM_DEBUG( |
2369 | dbgs() << "LAA: loop control flow is not understood by analyzer\n" ); |
2370 | recordAnalysis(RemarkName: "CFGNotUnderstood" ) |
2371 | << "loop control flow is not understood by analyzer" ; |
2372 | return false; |
2373 | } |
2374 | |
2375 | // ScalarEvolution needs to be able to find the symbolic max backedge taken |
2376 | // count, which is an upper bound on the number of loop iterations. The loop |
2377 | // may execute fewer iterations, if it exits via an uncountable exit. |
2378 | const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount(); |
2379 | if (isa<SCEVCouldNotCompute>(Val: ExitCount)) { |
2380 | recordAnalysis(RemarkName: "CantComputeNumberOfIterations" ) |
2381 | << "could not determine number of loop iterations" ; |
2382 | LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n" ); |
2383 | return false; |
2384 | } |
2385 | |
2386 | LLVM_DEBUG(dbgs() << "LAA: Found an analyzable loop: " |
2387 | << TheLoop->getHeader()->getName() << "\n" ); |
2388 | return true; |
2389 | } |
2390 | |
2391 | bool LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, |
2392 | const TargetLibraryInfo *TLI, |
2393 | DominatorTree *DT) { |
2394 | // Holds the Load and Store instructions. |
2395 | SmallVector<LoadInst *, 16> Loads; |
2396 | SmallVector<StoreInst *, 16> Stores; |
2397 | SmallPtrSet<MDNode *, 8> LoopAliasScopes; |
2398 | |
2399 | // Holds all the different accesses in the loop. |
2400 | unsigned NumReads = 0; |
2401 | unsigned NumReadWrites = 0; |
2402 | |
2403 | bool HasComplexMemInst = false; |
2404 | |
2405 | // A runtime check is only legal to insert if there are no convergent calls. |
2406 | HasConvergentOp = false; |
2407 | |
2408 | PtrRtChecking->Pointers.clear(); |
2409 | PtrRtChecking->Need = false; |
2410 | |
2411 | const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); |
2412 | |
2413 | const bool EnableMemAccessVersioningOfLoop = |
2414 | EnableMemAccessVersioning && |
2415 | !TheLoop->getHeader()->getParent()->hasOptSize(); |
2416 | |
2417 | // Traverse blocks in fixed RPOT order, regardless of their storage in the |
2418 | // loop info, as it may be arbitrary. |
2419 | LoopBlocksRPO RPOT(TheLoop); |
2420 | RPOT.perform(LI); |
2421 | for (BasicBlock *BB : RPOT) { |
2422 | // Scan the BB and collect legal loads and stores. Also detect any |
2423 | // convergent instructions. |
2424 | for (Instruction &I : *BB) { |
2425 | if (auto *Call = dyn_cast<CallBase>(Val: &I)) { |
2426 | if (Call->isConvergent()) |
2427 | HasConvergentOp = true; |
2428 | } |
2429 | |
// If we have found both a non-vectorizable memory instruction and a
// convergent operation in this loop, there is no reason to continue the
// search.
2432 | if (HasComplexMemInst && HasConvergentOp) |
2433 | return false; |
2434 | |
2435 | // Avoid hitting recordAnalysis multiple times. |
2436 | if (HasComplexMemInst) |
2437 | continue; |
2438 | |
2439 | // Record alias scopes defined inside the loop. |
2440 | if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I)) |
2441 | for (Metadata *Op : Decl->getScopeList()->operands()) |
2442 | LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op)); |
2443 | |
2444 | // Many math library functions read the rounding mode. We will only |
2445 | // vectorize a loop if it contains known function calls that don't set |
2446 | // the flag. Therefore, it is safe to ignore this read from memory. |
2447 | auto *Call = dyn_cast<CallInst>(Val: &I); |
2448 | if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI)) |
2449 | continue; |
2450 | |
2451 | // If this is a load, save it. If this instruction can read from memory |
2452 | // but is not a load, then we quit. Notice that we don't handle function |
2453 | // calls that read or write. |
2454 | if (I.mayReadFromMemory()) { |
2455 | // If the function has an explicit vectorized counterpart, we can safely |
2456 | // assume that it can be vectorized. |
2457 | if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() && |
2458 | !VFDatabase::getMappings(CI: *Call).empty()) |
2459 | continue; |
2460 | |
2461 | auto *Ld = dyn_cast<LoadInst>(Val: &I); |
2462 | if (!Ld) { |
2463 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: Ld) |
2464 | << "instruction cannot be vectorized" ; |
2465 | HasComplexMemInst = true; |
2466 | continue; |
2467 | } |
2468 | if (!Ld->isSimple() && !IsAnnotatedParallel) { |
2469 | recordAnalysis(RemarkName: "NonSimpleLoad" , Instr: Ld) |
2470 | << "read with atomic ordering or volatile read" ; |
2471 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n" ); |
2472 | HasComplexMemInst = true; |
2473 | continue; |
2474 | } |
2475 | NumLoads++; |
2476 | Loads.push_back(Elt: Ld); |
2477 | DepChecker->addAccess(LI: Ld); |
2478 | if (EnableMemAccessVersioningOfLoop) |
2479 | collectStridedAccess(LoadOrStoreInst: Ld); |
2480 | continue; |
2481 | } |
2482 | |
2483 | // Save 'store' instructions. Abort if other instructions write to memory. |
2484 | if (I.mayWriteToMemory()) { |
2485 | auto *St = dyn_cast<StoreInst>(Val: &I); |
2486 | if (!St) { |
2487 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: St) |
2488 | << "instruction cannot be vectorized" ; |
2489 | HasComplexMemInst = true; |
2490 | continue; |
2491 | } |
2492 | if (!St->isSimple() && !IsAnnotatedParallel) { |
2493 | recordAnalysis(RemarkName: "NonSimpleStore" , Instr: St) |
2494 | << "write with atomic ordering or volatile write" ; |
2495 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n" ); |
2496 | HasComplexMemInst = true; |
2497 | continue; |
2498 | } |
2499 | NumStores++; |
2500 | Stores.push_back(Elt: St); |
2501 | DepChecker->addAccess(SI: St); |
2502 | if (EnableMemAccessVersioningOfLoop) |
2503 | collectStridedAccess(LoadOrStoreInst: St); |
2504 | } |
2505 | } // Next instr. |
2506 | } // Next block. |
2507 | |
2508 | if (HasComplexMemInst) |
2509 | return false; |
2510 | |
2511 | // Now we have two lists that hold the loads and the stores. |
2512 | // Next, we find the pointers that they use. |
2513 | |
2514 | // Check if we see any stores. If there are no stores, then we don't |
2515 | // care if the pointers are *restrict*. |
2516 | if (!Stores.size()) { |
2517 | LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n" ); |
2518 | return true; |
2519 | } |
2520 | |
2521 | MemoryDepChecker::DepCandidates DependentAccesses; |
2522 | AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE, |
2523 | LoopAliasScopes); |
2524 | |
2525 | // Holds the analyzed pointers. We don't want to call getUnderlyingObjects |
2526 | // multiple times on the same object. If the ptr is accessed twice, once |
2527 | // for read and once for write, it will only appear once (on the write |
2528 | // list). This is okay, since we are going to check for conflicts between |
2529 | // writes and between reads and writes, but not between reads and reads. |
2530 | SmallSet<std::pair<Value *, Type *>, 16> Seen; |
2531 | |
2532 | // Record uniform store addresses to identify if we have multiple stores |
2533 | // to the same address. |
2534 | SmallPtrSet<Value *, 16> UniformStores; |
2535 | |
2536 | for (StoreInst *ST : Stores) { |
2537 | Value *Ptr = ST->getPointerOperand(); |
2538 | |
2539 | if (isInvariant(V: Ptr)) { |
2540 | // Record store instructions to loop invariant addresses |
2541 | StoresToInvariantAddresses.push_back(Elt: ST); |
2542 | HasStoreStoreDependenceInvolvingLoopInvariantAddress |= |
2543 | !UniformStores.insert(Ptr).second; |
2544 | } |
2545 | |
// If we did *not* see this pointer before, insert it into the read-write
// list. At this phase it is only a 'write' list.
2548 | Type *AccessTy = getLoadStoreType(I: ST); |
2549 | if (Seen.insert(V: {Ptr, AccessTy}).second) { |
2550 | ++NumReadWrites; |
2551 | |
2552 | MemoryLocation Loc = MemoryLocation::get(SI: ST); |
2553 | // The TBAA metadata could have a control dependency on the predication |
2554 | // condition, so we cannot rely on it when determining whether or not we |
2555 | // need runtime pointer checks. |
2556 | if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT)) |
2557 | Loc.AATags.TBAA = nullptr; |
2558 | |
2559 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
2560 | AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) { |
2561 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
2562 | Accesses.addStore(Loc&: NewLoc, AccessTy); |
2563 | }); |
2564 | } |
2565 | } |
2566 | |
2567 | if (IsAnnotatedParallel) { |
2568 | LLVM_DEBUG( |
2569 | dbgs() << "LAA: A loop annotated parallel, ignore memory dependency " |
2570 | << "checks.\n" ); |
2571 | return true; |
2572 | } |
2573 | |
2574 | for (LoadInst *LD : Loads) { |
2575 | Value *Ptr = LD->getPointerOperand(); |
// If we did *not* see this pointer before, insert it into the
// read list. If we *did* see it before, then it is already in
// the read-write list. This allows us to vectorize expressions
// such as A[i] += x, because the address of A[i] is a read-write
// pointer. This only works if the index of A[i] is consecutive.
// If the index is unknown (for example A[B[i]]) then we may
// read a few words, modify, and write a few words, and some of the
// words may be written to the same address.
2584 | bool IsReadOnlyPtr = false; |
2585 | Type *AccessTy = getLoadStoreType(I: LD); |
2586 | if (Seen.insert(V: {Ptr, AccessTy}).second || |
2587 | !getPtrStride(PSE&: *PSE, AccessTy: LD->getType(), Ptr, Lp: TheLoop, StridesMap: SymbolicStrides).value_or(u: 0)) { |
2588 | ++NumReads; |
2589 | IsReadOnlyPtr = true; |
2590 | } |
2591 | |
// Check whether there is an unsafe dependency between a load from a uniform
// address and a store to the same uniform address.
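// Illustrative example (assumed): with a loop-invariant pointer p,
//   for (i = 0; i < n; ++i) { t = *p; *p = t + A[i]; }
// the load of *p in iteration i+1 depends on the store in iteration i, a
// dependence that runtime pointer checks cannot remove.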
2594 | if (UniformStores.count(Ptr)) { |
2595 | LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform " |
2596 | "load and uniform store to the same address!\n" ); |
2597 | HasLoadStoreDependenceInvolvingLoopInvariantAddress = true; |
2598 | } |
2599 | |
2600 | MemoryLocation Loc = MemoryLocation::get(LI: LD); |
2601 | // The TBAA metadata could have a control dependency on the predication |
2602 | // condition, so we cannot rely on it when determining whether or not we |
2603 | // need runtime pointer checks. |
2604 | if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT)) |
2605 | Loc.AATags.TBAA = nullptr; |
2606 | |
2607 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
2608 | AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) { |
2609 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
2610 | Accesses.addLoad(Loc&: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr); |
2611 | }); |
2612 | } |
2613 | |
// If we write (or read-write) to a single destination and there are no
// other reads in this loop then it is safe to vectorize.
2616 | if (NumReadWrites == 1 && NumReads == 0) { |
2617 | LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n" ); |
2618 | return true; |
2619 | } |
2620 | |
2621 | // Build dependence sets and check whether we need a runtime pointer bounds |
2622 | // check. |
2623 | Accesses.buildDependenceSets(); |
2624 | |
2625 | // Find pointers with computable bounds. We are going to use this information |
2626 | // to place a runtime bound check. |
2627 | Value *UncomputablePtr = nullptr; |
2628 | bool CanDoRTIfNeeded = |
2629 | Accesses.canCheckPtrAtRT(RtCheck&: *PtrRtChecking, SE: PSE->getSE(), TheLoop, |
2630 | StridesMap: SymbolicStrides, UncomputablePtr, ShouldCheckWrap: false); |
2631 | if (!CanDoRTIfNeeded) { |
2632 | auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr); |
2633 | recordAnalysis(RemarkName: "CantIdentifyArrayBounds" , Instr: I) |
2634 | << "cannot identify array bounds" ; |
2635 | LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " |
2636 | << "the array bounds.\n" ); |
2637 | return false; |
2638 | } |
2639 | |
2640 | LLVM_DEBUG( |
2641 | dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n" ); |
2642 | |
2643 | bool DepsAreSafe = true; |
2644 | if (Accesses.isDependencyCheckNeeded()) { |
2645 | LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n" ); |
2646 | DepsAreSafe = DepChecker->areDepsSafe(AccessSets: DependentAccesses, |
2647 | CheckDeps: Accesses.getDependenciesToCheck()); |
2648 | |
2649 | if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) { |
2650 | LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n" ); |
2651 | |
2652 | // Clear the dependency checks. We assume they are not needed. |
2653 | Accesses.resetDepChecks(DepChecker&: *DepChecker); |
2654 | |
2655 | PtrRtChecking->reset(); |
2656 | PtrRtChecking->Need = true; |
2657 | |
2658 | auto *SE = PSE->getSE(); |
2659 | UncomputablePtr = nullptr; |
2660 | CanDoRTIfNeeded = Accesses.canCheckPtrAtRT( |
2661 | RtCheck&: *PtrRtChecking, SE, TheLoop, StridesMap: SymbolicStrides, UncomputablePtr, ShouldCheckWrap: true); |
2662 | |
2663 | // Check that we found the bounds for the pointer. |
2664 | if (!CanDoRTIfNeeded) { |
2665 | auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr); |
2666 | recordAnalysis(RemarkName: "CantCheckMemDepsAtRunTime" , Instr: I) |
2667 | << "cannot check memory dependencies at runtime" ; |
2668 | LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n" ); |
2669 | return false; |
2670 | } |
2671 | DepsAreSafe = true; |
2672 | } |
2673 | } |
2674 | |
2675 | if (HasConvergentOp) { |
2676 | recordAnalysis(RemarkName: "CantInsertRuntimeCheckWithConvergent" ) |
2677 | << "cannot add control dependency to convergent operation" ; |
2678 | LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check " |
2679 | "would be needed with a convergent operation\n" ); |
2680 | return false; |
2681 | } |
2682 | |
2683 | if (DepsAreSafe) { |
2684 | LLVM_DEBUG( |
2685 | dbgs() << "LAA: No unsafe dependent memory operations in loop. We" |
2686 | << (PtrRtChecking->Need ? "" : " don't" ) |
2687 | << " need runtime memory checks.\n" ); |
2688 | return true; |
2689 | } |
2690 | |
2691 | emitUnsafeDependenceRemark(); |
2692 | return false; |
2693 | } |
2694 | |
void LoopAccessInfo::emitUnsafeDependenceRemark() {
2696 | const auto *Deps = getDepChecker().getDependences(); |
2697 | if (!Deps) |
2698 | return; |
2699 | const auto *Found = |
2700 | llvm::find_if(Range: *Deps, P: [](const MemoryDepChecker::Dependence &D) { |
2701 | return MemoryDepChecker::Dependence::isSafeForVectorization(Type: D.Type) != |
2702 | MemoryDepChecker::VectorizationSafetyStatus::Safe; |
2703 | }); |
2704 | if (Found == Deps->end()) |
2705 | return; |
2706 | MemoryDepChecker::Dependence Dep = *Found; |
2707 | |
2708 | LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n" ); |
2709 | |
2710 | // Emit remark for first unsafe dependence |
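// Illustrative metadata shape (assumed) for a forced-distribution hint:
//   br i1 %c, label %exit, label %loop, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.distribute.enable", i1 true}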
2711 | bool HasForcedDistribution = false; |
2712 | std::optional<const MDOperand *> Value = |
2713 | findStringMetadataForLoop(TheLoop, Name: "llvm.loop.distribute.enable" ); |
2714 | if (Value) { |
2715 | const MDOperand *Op = *Value; |
2716 | assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata" ); |
2717 | HasForcedDistribution = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue(); |
2718 | } |
2719 | |
2720 | const std::string Info = |
2721 | HasForcedDistribution |
2722 | ? "unsafe dependent memory operations in loop." |
2723 | : "unsafe dependent memory operations in loop. Use " |
2724 | "#pragma clang loop distribute(enable) to allow loop distribution " |
2725 | "to attempt to isolate the offending operations into a separate " |
2726 | "loop" ; |
2727 | OptimizationRemarkAnalysis &R = |
2728 | recordAnalysis(RemarkName: "UnsafeDep" , Instr: Dep.getDestination(DepChecker: getDepChecker())) << Info; |
2729 | |
2730 | switch (Dep.Type) { |
2731 | case MemoryDepChecker::Dependence::NoDep: |
2732 | case MemoryDepChecker::Dependence::Forward: |
2733 | case MemoryDepChecker::Dependence::BackwardVectorizable: |
2734 | llvm_unreachable("Unexpected dependence" ); |
2735 | case MemoryDepChecker::Dependence::Backward: |
2736 | R << "\nBackward loop carried data dependence." ; |
2737 | break; |
2738 | case MemoryDepChecker::Dependence::ForwardButPreventsForwarding: |
2739 | R << "\nForward loop carried data dependence that prevents " |
2740 | "store-to-load forwarding." ; |
2741 | break; |
2742 | case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding: |
2743 | R << "\nBackward loop carried data dependence that prevents " |
2744 | "store-to-load forwarding." ; |
2745 | break; |
2746 | case MemoryDepChecker::Dependence::IndirectUnsafe: |
2747 | R << "\nUnsafe indirect dependence." ; |
2748 | break; |
2749 | case MemoryDepChecker::Dependence::Unknown: |
2750 | R << "\nUnknown data dependence." ; |
2751 | break; |
2752 | } |
2753 | |
2754 | if (Instruction *I = Dep.getSource(DepChecker: getDepChecker())) { |
2755 | DebugLoc SourceLoc = I->getDebugLoc(); |
2756 | if (auto *DD = dyn_cast_or_null<Instruction>(Val: getPointerOperand(V: I))) |
2757 | SourceLoc = DD->getDebugLoc(); |
2758 | if (SourceLoc) |
2759 | R << " Memory location is the same as accessed at " |
2760 | << ore::NV("Location" , SourceLoc); |
2761 | } |
2762 | } |
2763 | |
2764 | bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, |
2765 | DominatorTree *DT) { |
2766 | assert(TheLoop->contains(BB) && "Unknown block used" ); |
2767 | |
2768 | // Blocks that do not dominate the latch need predication. |
BasicBlock *Latch = TheLoop->getLoopLatch();
2770 | return !DT->dominates(A: BB, B: Latch); |
2771 | } |
2772 | |
OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
2774 | Instruction *I) { |
2775 | assert(!Report && "Multiple reports generated" ); |
2776 | |
2777 | Value *CodeRegion = TheLoop->getHeader(); |
2778 | DebugLoc DL = TheLoop->getStartLoc(); |
2779 | |
2780 | if (I) { |
2781 | CodeRegion = I->getParent(); |
// If there is no debug location attached to the instruction, fall back to
// using the loop's.
2784 | if (I->getDebugLoc()) |
2785 | DL = I->getDebugLoc(); |
2786 | } |
2787 | |
2788 | Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, args&: RemarkName, args&: DL, |
2789 | args&: CodeRegion); |
2790 | return *Report; |
2791 | } |
2792 | |
2793 | bool LoopAccessInfo::isInvariant(Value *V) const { |
2794 | auto *SE = PSE->getSE(); |
2795 | // TODO: Is this really what we want? Even without FP SCEV, we may want some |
2796 | // trivially loop-invariant FP values to be considered invariant. |
2797 | if (!SE->isSCEVable(Ty: V->getType())) |
2798 | return false; |
2799 | const SCEV *S = SE->getSCEV(V); |
2800 | return SE->isLoopInvariant(S, L: TheLoop); |
2801 | } |
2802 | |
2803 | /// Find the operand of the GEP that should be checked for consecutive |
2804 | /// stores. This ignores trailing indices that have no effect on the final |
2805 | /// pointer. |
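/// Illustrative example (assumed IR shape): in
///   %p = getelementptr { i32 }, ptr %A, i64 %i, i32 0
/// stepping %i advances the address by sizeof({ i32 }) == sizeof(i32), so the
/// trailing zero index can be peeled off and operand %i is the one to check.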
2806 | static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) { |
2807 | const DataLayout &DL = Gep->getDataLayout(); |
2808 | unsigned LastOperand = Gep->getNumOperands() - 1; |
2809 | TypeSize GEPAllocSize = DL.getTypeAllocSize(Ty: Gep->getResultElementType()); |
2810 | |
2811 | // Walk backwards and try to peel off zeros. |
2812 | while (LastOperand > 1 && match(V: Gep->getOperand(i_nocapture: LastOperand), P: m_Zero())) { |
2813 | // Find the type we're currently indexing into. |
2814 | gep_type_iterator GEPTI = gep_type_begin(GEP: Gep); |
2815 | std::advance(i&: GEPTI, n: LastOperand - 2); |
2816 | |
2817 | // If it's a type with the same allocation size as the result of the GEP we |
2818 | // can peel off the zero index. |
2819 | TypeSize ElemSize = GEPTI.isStruct() |
2820 | ? DL.getTypeAllocSize(Ty: GEPTI.getIndexedType()) |
2821 | : GEPTI.getSequentialElementStride(DL); |
2822 | if (ElemSize != GEPAllocSize) |
2823 | break; |
2824 | --LastOperand; |
2825 | } |
2826 | |
2827 | return LastOperand; |
2828 | } |
2829 | |
/// If the argument is a GEP, then returns the operand identified by
/// getGEPInductionOperand. However, if any other operand is not loop
/// invariant, the original pointer is returned unchanged.
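/// Illustrative example (assumed): for
///   %p = getelementptr i32, ptr %A, i64 %idx
/// with a loop-invariant %A, this returns %idx; if any other operand also
/// varied with the loop, the original pointer would be returned unchanged.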
2833 | static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { |
2834 | GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
2835 | if (!GEP) |
2836 | return Ptr; |
2837 | |
2838 | unsigned InductionOperand = getGEPInductionOperand(Gep: GEP); |
2839 | |
2840 | // Check that all of the gep indices are uniform except for our induction |
2841 | // operand. |
2842 | for (unsigned I = 0, E = GEP->getNumOperands(); I != E; ++I) |
2843 | if (I != InductionOperand && |
2844 | !SE->isLoopInvariant(S: SE->getSCEV(V: GEP->getOperand(i_nocapture: I)), L: Lp)) |
2845 | return Ptr; |
2846 | return GEP->getOperand(i_nocapture: InductionOperand); |
2847 | } |
2848 | |
2849 | /// Get the stride of a pointer access in a loop. Looks for symbolic |
2850 | /// strides "a[i*stride]". Returns the symbolic stride, or null otherwise. |
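/// Illustrative example (assumed): for an access A[i * Stride] the stripped
/// index typically has the SCEV {0,+,%Stride}<%loop>, so the step %Stride is
/// the loop-invariant SCEVUnknown (possibly behind an integral cast) that is
/// returned here.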
2851 | static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { |
2852 | auto *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType()); |
2853 | if (!PtrTy || PtrTy->isAggregateType()) |
2854 | return nullptr; |
2855 | |
// Try to remove a GEP instruction to make the pointer (actually the index at
// this point) easier to analyze. If OrigPtr is equal to Ptr we are analyzing
// the pointer, otherwise we are analyzing the index.
2859 | Value *OrigPtr = Ptr; |
2860 | |
2861 | // The size of the pointer access. |
2862 | int64_t PtrAccessSize = 1; |
2863 | |
2864 | Ptr = stripGetElementPtr(Ptr, SE, Lp); |
2865 | const SCEV *V = SE->getSCEV(V: Ptr); |
2866 | |
2867 | if (Ptr != OrigPtr) |
2868 | // Strip off casts. |
2869 | while (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(Val: V)) |
2870 | V = C->getOperand(); |
2871 | |
2872 | const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(Val: V); |
2873 | if (!S) |
2874 | return nullptr; |
2875 | |
2876 | // If the pointer is invariant then there is no stride and it makes no |
2877 | // sense to add it here. |
2878 | if (Lp != S->getLoop()) |
2879 | return nullptr; |
2880 | |
2881 | V = S->getStepRecurrence(SE&: *SE); |
2882 | if (!V) |
2883 | return nullptr; |
2884 | |
2885 | // Strip off the size of access multiplication if we are still analyzing the |
2886 | // pointer. |
2887 | if (OrigPtr == Ptr) { |
2888 | if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(Val: V)) { |
2889 | if (M->getOperand(i: 0)->getSCEVType() != scConstant) |
2890 | return nullptr; |
2891 | |
2892 | const APInt &APStepVal = cast<SCEVConstant>(Val: M->getOperand(i: 0))->getAPInt(); |
2893 | |
2894 | // Huge step value - give up. |
2895 | if (APStepVal.getBitWidth() > 64) |
2896 | return nullptr; |
2897 | |
2898 | int64_t StepVal = APStepVal.getSExtValue(); |
2899 | if (PtrAccessSize != StepVal) |
2900 | return nullptr; |
2901 | V = M->getOperand(i: 1); |
2902 | } |
2903 | } |
2904 | |
// Note that the restrictions after this loop invariant check are only
// profitability restrictions.
2907 | if (!SE->isLoopInvariant(S: V, L: Lp)) |
2908 | return nullptr; |
2909 | |
2910 | // Look for the loop invariant symbolic value. |
2911 | if (isa<SCEVUnknown>(Val: V)) |
2912 | return V; |
2913 | |
2914 | if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V)) |
2915 | if (isa<SCEVUnknown>(Val: C->getOperand())) |
2916 | return V; |
2917 | |
2918 | return nullptr; |
2919 | } |
2920 | |
2921 | void LoopAccessInfo::collectStridedAccess(Value *MemAccess) { |
2922 | Value *Ptr = getLoadStorePointerOperand(V: MemAccess); |
2923 | if (!Ptr) |
2924 | return; |
2925 | |
2926 | // Note: getStrideFromPointer is a *profitability* heuristic. We |
2927 | // could broaden the scope of values returned here - to anything |
2928 | // which happens to be loop invariant and contributes to the |
2929 | // computation of an interesting IV - but we chose not to as we |
2930 | // don't have a cost model here, and broadening the scope exposes |
2931 | // far too many unprofitable cases. |
2932 | const SCEV *StrideExpr = getStrideFromPointer(Ptr, SE: PSE->getSE(), Lp: TheLoop); |
2933 | if (!StrideExpr) |
2934 | return; |
2935 | |
2936 | LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for " |
2937 | "versioning:" ); |
2938 | LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n" ); |
2939 | |
2940 | if (!SpeculateUnitStride) { |
2941 | LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n" ); |
2942 | return; |
2943 | } |
2944 | |
// Avoid adding the "Stride == 1" predicate when we know that
// Stride >= Trip-Count. Such a predicate would effectively only cover a
// single- or zero-iteration loop, since Trip-Count <= Stride == 1.
2948 | // |
2949 | // TODO: We are currently not making a very informed decision on when it is |
2950 | // beneficial to apply stride versioning. It might make more sense that the |
2951 | // users of this analysis (such as the vectorizer) will trigger it, based on |
// their specific cost considerations; for example, in cases where stride
2953 | // versioning does not help resolving memory accesses/dependences, the |
2954 | // vectorizer should evaluate the cost of the runtime test, and the benefit |
2955 | // of various possible stride specializations, considering the alternatives |
2956 | // of using gather/scatters (if available). |
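// Illustrative example (assumed): for
//   for (i = 0; i < 4; ++i) A[i * Stride] = B[i];
// if SCEV can already prove Stride >= 4 (the trip count), specializing the
// loop on "Stride == 1" would only cover an at-most-one-iteration loop, so
// the predicate is not added below.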
2957 | |
2958 | const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount(); |
2959 | |
2960 | // Match the types so we can compare the stride and the MaxBTC. |
2961 | // The Stride can be positive/negative, so we sign extend Stride; |
2962 | // The backedgeTakenCount is non-negative, so we zero extend MaxBTC. |
2963 | const DataLayout &DL = TheLoop->getHeader()->getDataLayout(); |
2964 | uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(Ty: StrideExpr->getType()); |
2965 | uint64_t BETypeSizeBits = DL.getTypeSizeInBits(Ty: MaxBTC->getType()); |
2966 | const SCEV *CastedStride = StrideExpr; |
2967 | const SCEV *CastedBECount = MaxBTC; |
2968 | ScalarEvolution *SE = PSE->getSE(); |
2969 | if (BETypeSizeBits >= StrideTypeSizeBits) |
2970 | CastedStride = SE->getNoopOrSignExtend(V: StrideExpr, Ty: MaxBTC->getType()); |
2971 | else |
2972 | CastedBECount = SE->getZeroExtendExpr(Op: MaxBTC, Ty: StrideExpr->getType()); |
2973 | const SCEV *StrideMinusBETaken = SE->getMinusSCEV(LHS: CastedStride, RHS: CastedBECount); |
2974 | // Since TripCount == BackEdgeTakenCount + 1, checking: |
2975 | // "Stride >= TripCount" is equivalent to checking: |
// Stride - MaxBTC > 0
2977 | if (SE->isKnownPositive(S: StrideMinusBETaken)) { |
2978 | LLVM_DEBUG( |
2979 | dbgs() << "LAA: Stride>=TripCount; No point in versioning as the " |
2980 | "Stride==1 predicate will imply that the loop executes " |
2981 | "at most once.\n" ); |
2982 | return; |
2983 | } |
2984 | LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n" ); |
2985 | |
2986 | // Strip back off the integer cast, and check that our result is a |
2987 | // SCEVUnknown as we expect. |
2988 | const SCEV *StrideBase = StrideExpr; |
2989 | if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: StrideBase)) |
2990 | StrideBase = C->getOperand(); |
2991 | SymbolicStrides[Ptr] = cast<SCEVUnknown>(Val: StrideBase); |
2992 | } |
2993 | |
2994 | LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, |
2995 | const TargetTransformInfo *TTI, |
2996 | const TargetLibraryInfo *TLI, AAResults *AA, |
2997 | DominatorTree *DT, LoopInfo *LI) |
2998 | : PSE(std::make_unique<PredicatedScalarEvolution>(args&: *SE, args&: *L)), |
2999 | PtrRtChecking(nullptr), TheLoop(L) { |
3000 | unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max(); |
3001 | if (TTI) { |
3002 | TypeSize FixedWidth = |
3003 | TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector); |
3004 | if (FixedWidth.isNonZero()) { |
// Scale the vector width by 2 as a rough estimate to also account for
// interleaving.
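// For example (assumed target): 256-bit fixed-width vector registers would
// be treated as a 512-bit maximum access width here.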
3007 | MaxTargetVectorWidthInBits = FixedWidth.getFixedValue() * 2; |
3008 | } |
3009 | |
3010 | TypeSize ScalableWidth = |
3011 | TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_ScalableVector); |
3012 | if (ScalableWidth.isNonZero()) |
3013 | MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max(); |
3014 | } |
3015 | DepChecker = std::make_unique<MemoryDepChecker>(args&: *PSE, args&: L, args&: SymbolicStrides, |
3016 | args&: MaxTargetVectorWidthInBits); |
3017 | PtrRtChecking = std::make_unique<RuntimePointerChecking>(args&: *DepChecker, args&: SE); |
3018 | if (canAnalyzeLoop()) |
3019 | CanVecMem = analyzeLoop(AA, LI, TLI, DT); |
3020 | } |
3021 | |
3022 | void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { |
3023 | if (CanVecMem) { |
3024 | OS.indent(NumSpaces: Depth) << "Memory dependences are safe" ; |
3025 | const MemoryDepChecker &DC = getDepChecker(); |
3026 | if (!DC.isSafeForAnyVectorWidth()) |
3027 | OS << " with a maximum safe vector width of " |
3028 | << DC.getMaxSafeVectorWidthInBits() << " bits" ; |
3029 | if (PtrRtChecking->Need) |
3030 | OS << " with run-time checks" ; |
3031 | OS << "\n" ; |
3032 | } |
3033 | |
3034 | if (HasConvergentOp) |
3035 | OS.indent(NumSpaces: Depth) << "Has convergent operation in loop\n" ; |
3036 | |
3037 | if (Report) |
3038 | OS.indent(NumSpaces: Depth) << "Report: " << Report->getMsg() << "\n" ; |
3039 | |
3040 | if (auto *Dependences = DepChecker->getDependences()) { |
3041 | OS.indent(NumSpaces: Depth) << "Dependences:\n" ; |
3042 | for (const auto &Dep : *Dependences) { |
3043 | Dep.print(OS, Depth: Depth + 2, Instrs: DepChecker->getMemoryInstructions()); |
3044 | OS << "\n" ; |
3045 | } |
3046 | } else |
3047 | OS.indent(NumSpaces: Depth) << "Too many dependences, not recorded\n" ; |
3048 | |
// List the pairs of accesses that need run-time checks to prove independence.
3050 | PtrRtChecking->print(OS, Depth); |
3051 | OS << "\n" ; |
3052 | |
3053 | OS.indent(NumSpaces: Depth) |
3054 | << "Non vectorizable stores to invariant address were " |
3055 | << (HasStoreStoreDependenceInvolvingLoopInvariantAddress || |
3056 | HasLoadStoreDependenceInvolvingLoopInvariantAddress |
3057 | ? "" |
3058 | : "not " ) |
3059 | << "found in loop.\n" ; |
3060 | |
3061 | OS.indent(NumSpaces: Depth) << "SCEV assumptions:\n" ; |
3062 | PSE->getPredicate().print(OS, Depth); |
3063 | |
3064 | OS << "\n" ; |
3065 | |
3066 | OS.indent(NumSpaces: Depth) << "Expressions re-written:\n" ; |
3067 | PSE->print(OS, Depth); |
3068 | } |
3069 | |
3070 | const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) { |
3071 | auto [It, Inserted] = LoopAccessInfoMap.insert(KV: {&L, nullptr}); |
3072 | |
3073 | if (Inserted) |
3074 | It->second = |
3075 | std::make_unique<LoopAccessInfo>(args: &L, args: &SE, args&: TTI, args&: TLI, args: &AA, args: &DT, args: &LI); |
3076 | |
3077 | return *It->second; |
3078 | } |
3079 | void LoopAccessInfoManager::clear() { |
3080 | SmallVector<Loop *> ToRemove; |
3081 | // Collect LoopAccessInfo entries that may keep references to IR outside the |
3082 | // analyzed loop or SCEVs that may have been modified or invalidated. At the |
3083 | // moment, that is loops requiring memory or SCEV runtime checks, as those cache |
3084 | // SCEVs, e.g. for pointer expressions. |
3085 | for (const auto &[L, LAI] : LoopAccessInfoMap) { |
3086 | if (LAI->getRuntimePointerChecking()->getChecks().empty() && |
3087 | LAI->getPSE().getPredicate().isAlwaysTrue()) |
3088 | continue; |
3089 | ToRemove.push_back(Elt: L); |
3090 | } |
3091 | |
3092 | for (Loop *L : ToRemove) |
3093 | LoopAccessInfoMap.erase(Val: L); |
3094 | } |
3095 | |
3096 | bool LoopAccessInfoManager::invalidate( |
3097 | Function &F, const PreservedAnalyses &PA, |
3098 | FunctionAnalysisManager::Invalidator &Inv) { |
3099 | // Check whether our analysis is preserved. |
3100 | auto PAC = PA.getChecker<LoopAccessAnalysis>(); |
3101 | if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>()) |
3102 | // If not, give up now. |
3103 | return true; |
3104 | |
3105 | // Check whether the analyses we depend on became invalid for any reason. |
3106 | // Skip checking TargetLibraryAnalysis as it is immutable and can't become |
3107 | // invalid. |
3108 | return Inv.invalidate<AAManager>(IR&: F, PA) || |
3109 | Inv.invalidate<ScalarEvolutionAnalysis>(IR&: F, PA) || |
3110 | Inv.invalidate<LoopAnalysis>(IR&: F, PA) || |
3111 | Inv.invalidate<DominatorTreeAnalysis>(IR&: F, PA); |
3112 | } |
3113 | |
3114 | LoopAccessInfoManager LoopAccessAnalysis::run(Function &F, |
3115 | FunctionAnalysisManager &FAM) { |
3116 | auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
3117 | auto &AA = FAM.getResult<AAManager>(IR&: F); |
3118 | auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F); |
3119 | auto &LI = FAM.getResult<LoopAnalysis>(IR&: F); |
3120 | auto &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F); |
3121 | auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
3122 | return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI); |
3123 | } |
3124 | |
3125 | AnalysisKey LoopAccessAnalysis::Key; |
3126 | |