//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation ------===//
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
// The implementation of the loop memory dependence analysis that was
// originally developed for the loop vectorizer.
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
| 15 | #include "llvm/ADT/APInt.h" |
| 16 | #include "llvm/ADT/DenseMap.h" |
| 17 | #include "llvm/ADT/EquivalenceClasses.h" |
| 18 | #include "llvm/ADT/PointerIntPair.h" |
| 19 | #include "llvm/ADT/STLExtras.h" |
| 20 | #include "llvm/ADT/SetVector.h" |
| 21 | #include "llvm/ADT/SmallPtrSet.h" |
| 22 | #include "llvm/ADT/SmallSet.h" |
| 23 | #include "llvm/ADT/SmallVector.h" |
| 24 | #include "llvm/Analysis/AliasAnalysis.h" |
| 25 | #include "llvm/Analysis/AliasSetTracker.h" |
| 26 | #include "llvm/Analysis/LoopAnalysisManager.h" |
| 27 | #include "llvm/Analysis/LoopInfo.h" |
| 28 | #include "llvm/Analysis/LoopIterator.h" |
| 29 | #include "llvm/Analysis/MemoryLocation.h" |
| 30 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 31 | #include "llvm/Analysis/ScalarEvolution.h" |
| 32 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| 33 | #include "llvm/Analysis/ScalarEvolutionPatternMatch.h" |
| 34 | #include "llvm/Analysis/TargetLibraryInfo.h" |
| 35 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 36 | #include "llvm/Analysis/ValueTracking.h" |
| 37 | #include "llvm/Analysis/VectorUtils.h" |
| 38 | #include "llvm/IR/BasicBlock.h" |
| 39 | #include "llvm/IR/Constants.h" |
| 40 | #include "llvm/IR/DataLayout.h" |
| 41 | #include "llvm/IR/DebugLoc.h" |
| 42 | #include "llvm/IR/DerivedTypes.h" |
| 43 | #include "llvm/IR/DiagnosticInfo.h" |
| 44 | #include "llvm/IR/Dominators.h" |
| 45 | #include "llvm/IR/Function.h" |
| 46 | #include "llvm/IR/InstrTypes.h" |
| 47 | #include "llvm/IR/Instruction.h" |
| 48 | #include "llvm/IR/Instructions.h" |
| 49 | #include "llvm/IR/IntrinsicInst.h" |
| 50 | #include "llvm/IR/PassManager.h" |
| 51 | #include "llvm/IR/Type.h" |
| 52 | #include "llvm/IR/Value.h" |
| 53 | #include "llvm/IR/ValueHandle.h" |
| 54 | #include "llvm/Support/Casting.h" |
| 55 | #include "llvm/Support/CommandLine.h" |
| 56 | #include "llvm/Support/Debug.h" |
| 57 | #include "llvm/Support/ErrorHandling.h" |
| 58 | #include "llvm/Support/raw_ostream.h" |
| 59 | #include <algorithm> |
| 60 | #include <cassert> |
| 61 | #include <cstdint> |
| 62 | #include <iterator> |
| 63 | #include <utility> |
| 64 | #include <variant> |
| 65 | #include <vector> |
| 66 | |
| 67 | using namespace llvm; |
| 68 | using namespace llvm::SCEVPatternMatch; |
| 69 | |
| 70 | #define DEBUG_TYPE "loop-accesses" |
| 71 | |
| 72 | static cl::opt<unsigned, true> |
| 73 | VectorizationFactor("force-vector-width" , cl::Hidden, |
| 74 | cl::desc("Sets the SIMD width. Zero is autoselect." ), |
| 75 | cl::location(L&: VectorizerParams::VectorizationFactor)); |
| 76 | unsigned VectorizerParams::VectorizationFactor; |
| 77 | |
| 78 | static cl::opt<unsigned, true> |
| 79 | VectorizationInterleave("force-vector-interleave" , cl::Hidden, |
| 80 | cl::desc("Sets the vectorization interleave count. " |
| 81 | "Zero is autoselect." ), |
| 82 | cl::location( |
| 83 | L&: VectorizerParams::VectorizationInterleave)); |
| 84 | unsigned VectorizerParams::VectorizationInterleave; |
| 85 | |
| 86 | static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold( |
| 87 | "runtime-memory-check-threshold" , cl::Hidden, |
| 88 | cl::desc("When performing memory disambiguation checks at runtime do not " |
| 89 | "generate more than this number of comparisons (default = 8)." ), |
| 90 | cl::location(L&: VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(Val: 8)); |
| 91 | unsigned VectorizerParams::RuntimeMemoryCheckThreshold; |
| 92 | |
/// The maximum number of comparisons allowed while merging runtime memory
/// checks.
| 94 | static cl::opt<unsigned> MemoryCheckMergeThreshold( |
| 95 | "memory-check-merge-threshold" , cl::Hidden, |
| 96 | cl::desc("Maximum number of comparisons done when trying to merge " |
| 97 | "runtime memory checks. (default = 100)" ), |
| 98 | cl::init(Val: 100)); |
| 99 | |
| 100 | /// Maximum SIMD width. |
| 101 | const unsigned VectorizerParams::MaxVectorWidth = 64; |
| 102 | |
| 103 | /// We collect dependences up to this threshold. |
| 104 | static cl::opt<unsigned> |
| 105 | MaxDependences("max-dependences" , cl::Hidden, |
| 106 | cl::desc("Maximum number of dependences collected by " |
| 107 | "loop-access analysis (default = 100)" ), |
| 108 | cl::init(Val: 100)); |
| 109 | |
| 110 | /// This enables versioning on the strides of symbolically striding memory |
| 111 | /// accesses in code like the following. |
| 112 | /// for (i = 0; i < N; ++i) |
| 113 | /// A[i * Stride1] += B[i * Stride2] ... |
| 114 | /// |
| 115 | /// Will be roughly translated to |
| 116 | /// if (Stride1 == 1 && Stride2 == 1) { |
| 117 | /// for (i = 0; i < N; i+=4) |
| 118 | /// A[i:i+3] += ... |
| 119 | /// } else |
| 120 | /// ... |
| 121 | static cl::opt<bool> EnableMemAccessVersioning( |
| 122 | "enable-mem-access-versioning" , cl::init(Val: true), cl::Hidden, |
| 123 | cl::desc("Enable symbolic stride memory access versioning" )); |
| 124 | |
| 125 | /// Enable store-to-load forwarding conflict detection. This option can |
| 126 | /// be disabled for correctness testing. |
| 127 | static cl::opt<bool> EnableForwardingConflictDetection( |
| 128 | "store-to-load-forwarding-conflict-detection" , cl::Hidden, |
| 129 | cl::desc("Enable conflict detection in loop-access analysis" ), |
| 130 | cl::init(Val: true)); |
| 131 | |
| 132 | static cl::opt<unsigned> MaxForkedSCEVDepth( |
| 133 | "max-forked-scev-depth" , cl::Hidden, |
| 134 | cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)" ), |
| 135 | cl::init(Val: 5)); |
| 136 | |
| 137 | static cl::opt<bool> SpeculateUnitStride( |
| 138 | "laa-speculate-unit-stride" , cl::Hidden, |
| 139 | cl::desc("Speculate that non-constant strides are unit in LAA" ), |
| 140 | cl::init(Val: true)); |
| 141 | |
| 142 | static cl::opt<bool, true> HoistRuntimeChecks( |
| 143 | "hoist-runtime-checks" , cl::Hidden, |
| 144 | cl::desc( |
| 145 | "Hoist inner loop runtime memory checks to outer loop if possible" ), |
| 146 | cl::location(L&: VectorizerParams::HoistRuntimeChecks), cl::init(Val: true)); |
| 147 | bool VectorizerParams::HoistRuntimeChecks; |
| 148 | |
| 149 | bool VectorizerParams::isInterleaveForced() { |
| 150 | return ::VectorizationInterleave.getNumOccurrences() > 0; |
| 151 | } |
| 152 | |
| 153 | const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, |
| 154 | const DenseMap<Value *, const SCEV *> &PtrToStride, |
| 155 | Value *Ptr) { |
| 156 | const SCEV *OrigSCEV = PSE.getSCEV(V: Ptr); |
| 157 | |
| 158 | // If there is an entry in the map return the SCEV of the pointer with the |
| 159 | // symbolic stride replaced by one. |
| 160 | const SCEV *StrideSCEV = PtrToStride.lookup(Val: Ptr); |
| 161 | if (!StrideSCEV) |
| 162 | // For a non-symbolic stride, just return the original expression. |
| 163 | return OrigSCEV; |
| 164 | |
| 165 | // Note: This assert is both overly strong and overly weak. The actual |
| 166 | // invariant here is that StrideSCEV should be loop invariant. The only |
| 167 | // such invariant strides we happen to speculate right now are unknowns |
| 168 | // and thus this is a reasonable proxy of the actual invariant. |
| 169 | assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map" ); |
| 170 | |
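// For example, if %Stride is the symbolic stride mapped to this pointer, a
// pointer SCEV such as {%A,+,(4 * %Stride)}<%loop> folds to {%A,+,4}<%loop>
// once the predicate %Stride == 1 is added below (illustrative SCEVs only).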
| 171 | ScalarEvolution *SE = PSE.getSE(); |
| 172 | const SCEV *CT = SE->getOne(Ty: StrideSCEV->getType()); |
| 173 | PSE.addPredicate(Pred: *SE->getEqualPredicate(LHS: StrideSCEV, RHS: CT)); |
| 174 | const SCEV *Expr = PSE.getSCEV(V: Ptr); |
| 175 | |
| 176 | LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV |
| 177 | << " by: " << *Expr << "\n" ); |
| 178 | return Expr; |
| 179 | } |
| 180 | |
| 181 | RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup( |
| 182 | unsigned Index, const RuntimePointerChecking &RtCheck) |
| 183 | : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start), |
| 184 | AddressSpace(RtCheck.Pointers[Index] |
| 185 | .PointerValue->getType() |
| 186 | ->getPointerAddressSpace()), |
| 187 | NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) { |
| 188 | Members.push_back(Elt: Index); |
| 189 | } |
| 190 | |
| 191 | /// Returns \p A + \p B, if it is guaranteed not to unsigned wrap. Otherwise |
| 192 | /// return nullptr. \p A and \p B must have the same type. |
| 193 | static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B, |
| 194 | ScalarEvolution &SE) { |
| 195 | if (!SE.willNotOverflow(BinOp: Instruction::Add, /*IsSigned=*/Signed: false, LHS: A, RHS: B)) |
| 196 | return nullptr; |
| 197 | return SE.getAddExpr(LHS: A, RHS: B); |
| 198 | } |
| 199 | |
| 200 | /// Returns \p A * \p B, if it is guaranteed not to unsigned wrap. Otherwise |
| 201 | /// return nullptr. \p A and \p B must have the same type. |
| 202 | static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B, |
| 203 | ScalarEvolution &SE) { |
| 204 | if (!SE.willNotOverflow(BinOp: Instruction::Mul, /*IsSigned=*/Signed: false, LHS: A, RHS: B)) |
| 205 | return nullptr; |
| 206 | return SE.getMulExpr(LHS: A, RHS: B); |
| 207 | } |
| 208 | |
| 209 | /// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at |
| 210 | /// \p MaxBTC is guaranteed inbounds of the accessed object. |
| 211 | static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, |
| 212 | const SCEV *MaxBTC, |
| 213 | const SCEV *EltSize, |
| 214 | ScalarEvolution &SE, |
| 215 | const DataLayout &DL) { |
| 216 | auto *PointerBase = SE.getPointerBase(V: AR->getStart()); |
| 217 | auto *StartPtr = dyn_cast<SCEVUnknown>(Val: PointerBase); |
| 218 | if (!StartPtr) |
| 219 | return false; |
| 220 | bool CheckForNonNull, CheckForFreed; |
| 221 | uint64_t DerefBytes = StartPtr->getValue()->getPointerDereferenceableBytes( |
| 222 | DL, CanBeNull&: CheckForNonNull, CanBeFreed&: CheckForFreed); |
| 223 | |
| 224 | if (CheckForNonNull || CheckForFreed) |
| 225 | return false; |
| 226 | |
| 227 | const SCEV *Step = AR->getStepRecurrence(SE); |
| 228 | bool IsKnownNonNegative = SE.isKnownNonNegative(S: Step); |
| 229 | if (!IsKnownNonNegative && !SE.isKnownNegative(S: Step)) |
| 230 | return false; |
| 231 | |
| 232 | Type *WiderTy = SE.getWiderType(Ty1: MaxBTC->getType(), Ty2: Step->getType()); |
| 233 | Step = SE.getNoopOrSignExtend(V: Step, Ty: WiderTy); |
| 234 | MaxBTC = SE.getNoopOrZeroExtend(V: MaxBTC, Ty: WiderTy); |
| 235 | |
| 236 | // For the computations below, make sure they don't unsigned wrap. |
| 237 | if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_UGE, LHS: AR->getStart(), RHS: StartPtr)) |
| 238 | return false; |
| 239 | const SCEV *StartOffset = SE.getNoopOrZeroExtend( |
| 240 | V: SE.getMinusSCEV(LHS: AR->getStart(), RHS: StartPtr), Ty: WiderTy); |
| 241 | |
| 242 | const SCEV *OffsetAtLastIter = |
| 243 | mulSCEVOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE); |
| 244 | if (!OffsetAtLastIter) |
| 245 | return false; |
| 246 | |
| 247 | const SCEV *OffsetEndBytes = addSCEVNoOverflow( |
| 248 | A: OffsetAtLastIter, B: SE.getNoopOrZeroExtend(V: EltSize, Ty: WiderTy), SE); |
| 249 | if (!OffsetEndBytes) |
| 250 | return false; |
| 251 | |
| 252 | if (IsKnownNonNegative) { |
| 253 | // For positive steps, check if |
| 254 | // (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes, |
| 255 | // while making sure none of the computations unsigned wrap themselves. |
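// For example, with AR->getStart() == StartPtr, Step == 4, MaxBTC == 7 and
// EltSize == 4, EndBytes is 0 + 7 * 4 + 4 = 32, which is in bounds iff at
// least 32 dereferenceable bytes are known for the underlying object.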
| 256 | const SCEV *EndBytes = addSCEVNoOverflow(A: StartOffset, B: OffsetEndBytes, SE); |
| 257 | if (!EndBytes) |
| 258 | return false; |
| 259 | return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: EndBytes, |
| 260 | RHS: SE.getConstant(Ty: WiderTy, V: DerefBytes)); |
| 261 | } |
| 262 | |
| 263 | // For negative steps check if |
| 264 | // * StartOffset >= (MaxBTC * Step + EltSize) |
| 265 | // * StartOffset <= DerefBytes. |
| 266 | assert(SE.isKnownNegative(Step) && "must be known negative" ); |
| 267 | return SE.isKnownPredicate(Pred: CmpInst::ICMP_SGE, LHS: StartOffset, RHS: OffsetEndBytes) && |
| 268 | SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: StartOffset, |
| 269 | RHS: SE.getConstant(Ty: WiderTy, V: DerefBytes)); |
| 270 | } |
| 271 | |
| 272 | std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess( |
| 273 | const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC, |
| 274 | const SCEV *MaxBTC, ScalarEvolution *SE, |
| 275 | DenseMap<std::pair<const SCEV *, Type *>, |
| 276 | std::pair<const SCEV *, const SCEV *>> *PointerBounds) { |
| 277 | std::pair<const SCEV *, const SCEV *> *PtrBoundsPair; |
| 278 | if (PointerBounds) { |
| 279 | auto [Iter, Ins] = PointerBounds->insert( |
| 280 | KV: {{PtrExpr, AccessTy}, |
| 281 | {SE->getCouldNotCompute(), SE->getCouldNotCompute()}}); |
| 282 | if (!Ins) |
| 283 | return Iter->second; |
| 284 | PtrBoundsPair = &Iter->second; |
| 285 | } |
| 286 | |
| 287 | const SCEV *ScStart; |
| 288 | const SCEV *ScEnd; |
| 289 | |
| 290 | auto &DL = Lp->getHeader()->getDataLayout(); |
| 291 | Type *IdxTy = DL.getIndexType(PtrTy: PtrExpr->getType()); |
| 292 | const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IntTy: IdxTy, StoreTy: AccessTy); |
| 293 | if (SE->isLoopInvariant(S: PtrExpr, L: Lp)) { |
| 294 | ScStart = ScEnd = PtrExpr; |
| 295 | } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrExpr)) { |
| 296 | ScStart = AR->getStart(); |
| 297 | if (!isa<SCEVCouldNotCompute>(Val: BTC)) |
| 298 | // Evaluating AR at an exact BTC is safe: LAA separately checks that |
| 299 | // accesses cannot wrap in the loop. If evaluating AR at BTC wraps, then |
| 300 | // the loop either triggers UB when executing a memory access with a |
| 301 | // poison pointer or the wrapping/poisoned pointer is not used. |
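// For example, evaluating {%A,+,4}<%loop> at a backedge-taken count of n
// yields (%A + 4 * n), the address accessed in the final iteration.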
| 302 | ScEnd = AR->evaluateAtIteration(It: BTC, SE&: *SE); |
| 303 | else { |
| 304 | // Evaluating AR at MaxBTC may wrap and create an expression that is less |
| 305 | // than the start of the AddRec due to wrapping (for example consider |
| 306 | // MaxBTC = -2). If that's the case, set ScEnd to -(EltSize + 1). ScEnd |
| 307 | // will get incremented by EltSize before returning, so this effectively |
| 308 | // sets ScEnd to the maximum unsigned value for the type. Note that LAA |
// separately checks that accesses cannot wrap, so unsigned max
| 310 | // represents an upper bound. |
| 311 | if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSize: EltSizeSCEV, SE&: *SE, |
| 312 | DL)) { |
| 313 | ScEnd = AR->evaluateAtIteration(It: MaxBTC, SE&: *SE); |
| 314 | } else { |
| 315 | ScEnd = SE->getAddExpr( |
| 316 | LHS: SE->getNegativeSCEV(V: EltSizeSCEV), |
| 317 | RHS: SE->getSCEV(V: ConstantExpr::getIntToPtr( |
| 318 | C: ConstantInt::get(Ty: EltSizeSCEV->getType(), V: -1), Ty: AR->getType()))); |
| 319 | } |
| 320 | } |
| 321 | const SCEV *Step = AR->getStepRecurrence(SE&: *SE); |
| 322 | |
| 323 | // For expressions with negative step, the upper bound is ScStart and the |
| 324 | // lower bound is ScEnd. |
| 325 | if (const auto *CStep = dyn_cast<SCEVConstant>(Val: Step)) { |
| 326 | if (CStep->getValue()->isNegative()) |
| 327 | std::swap(a&: ScStart, b&: ScEnd); |
| 328 | } else { |
| 329 | // Fallback case: the step is not constant, but we can still |
| 330 | // get the upper and lower bounds of the interval by using min/max |
| 331 | // expressions. |
| 332 | ScStart = SE->getUMinExpr(LHS: ScStart, RHS: ScEnd); |
| 333 | ScEnd = SE->getUMaxExpr(LHS: AR->getStart(), RHS: ScEnd); |
| 334 | } |
| 335 | } else |
| 336 | return {SE->getCouldNotCompute(), SE->getCouldNotCompute()}; |
| 337 | |
| 338 | assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant" ); |
| 339 | assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant" ); |
| 340 | |
| 341 | // Add the size of the pointed element to ScEnd. |
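// For example, {%A,+,4}<%loop> over i32 with backedge-taken count n yields the
// range [%A, %A + 4 * n + 4), i.e. the end is one element past the last access.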
| 342 | ScEnd = SE->getAddExpr(LHS: ScEnd, RHS: EltSizeSCEV); |
| 343 | |
| 344 | std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd}; |
| 345 | if (PointerBounds) |
| 346 | *PtrBoundsPair = Res; |
| 347 | return Res; |
| 348 | } |
| 349 | |
| 350 | /// Calculate Start and End points of memory access using |
| 351 | /// getStartAndEndForAccess. |
| 352 | void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, |
| 353 | Type *AccessTy, bool WritePtr, |
| 354 | unsigned DepSetId, unsigned ASId, |
| 355 | PredicatedScalarEvolution &PSE, |
| 356 | bool NeedsFreeze) { |
| 357 | const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); |
| 358 | const SCEV *BTC = PSE.getBackedgeTakenCount(); |
| 359 | const auto &[ScStart, ScEnd] = |
| 360 | getStartAndEndForAccess(Lp, PtrExpr, AccessTy, BTC, MaxBTC: SymbolicMaxBTC, |
| 361 | SE: PSE.getSE(), PointerBounds: &DC.getPointerBounds()); |
| 362 | assert(!isa<SCEVCouldNotCompute>(ScStart) && |
| 363 | !isa<SCEVCouldNotCompute>(ScEnd) && |
| 364 | "must be able to compute both start and end expressions" ); |
| 365 | Pointers.emplace_back(Args&: Ptr, Args: ScStart, Args: ScEnd, Args&: WritePtr, Args&: DepSetId, Args&: ASId, Args&: PtrExpr, |
| 366 | Args&: NeedsFreeze); |
| 367 | } |
| 368 | |
| 369 | bool RuntimePointerChecking::tryToCreateDiffCheck( |
| 370 | const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) { |
| 371 | // If either group contains multiple different pointers, bail out. |
| 372 | // TODO: Support multiple pointers by using the minimum or maximum pointer, |
| 373 | // depending on src & sink. |
| 374 | if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) |
| 375 | return false; |
| 376 | |
| 377 | const PointerInfo *Src = &Pointers[CGI.Members[0]]; |
| 378 | const PointerInfo *Sink = &Pointers[CGJ.Members[0]]; |
| 379 | |
| 380 | // If either pointer is read and written, multiple checks may be needed. Bail |
| 381 | // out. |
| 382 | if (!DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: !Src->IsWritePtr).empty() || |
| 383 | !DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: !Sink->IsWritePtr).empty()) |
| 384 | return false; |
| 385 | |
| 386 | ArrayRef<unsigned> AccSrc = |
| 387 | DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: Src->IsWritePtr); |
| 388 | ArrayRef<unsigned> AccSink = |
| 389 | DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: Sink->IsWritePtr); |
| 390 | // If either pointer is accessed multiple times, there may not be a clear |
| 391 | // src/sink relation. Bail out for now. |
| 392 | if (AccSrc.size() != 1 || AccSink.size() != 1) |
| 393 | return false; |
| 394 | |
| 395 | // If the sink is accessed before src, swap src/sink. |
| 396 | if (AccSink[0] < AccSrc[0]) |
| 397 | std::swap(a&: Src, b&: Sink); |
| 398 | |
| 399 | const SCEVConstant *Step; |
| 400 | const SCEV *SrcStart; |
| 401 | const SCEV *SinkStart; |
| 402 | const Loop *InnerLoop = DC.getInnermostLoop(); |
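// Both pointers must be affine AddRecs in the innermost loop with the same
// constant step, e.g. {%a,+,8}<%loop> and {%b,+,8}<%loop>; otherwise we fall
// back to the regular overlap checks.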
| 403 | if (!match(S: Src->Expr, |
| 404 | P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SrcStart), Op1: m_SCEVConstant(V&: Step), |
| 405 | L: m_SpecificLoop(L: InnerLoop))) || |
| 406 | !match(S: Sink->Expr, |
| 407 | P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SinkStart), Op1: m_scev_Specific(S: Step), |
| 408 | L: m_SpecificLoop(L: InnerLoop)))) |
| 409 | return false; |
| 410 | |
| 411 | SmallVector<Instruction *, 4> SrcInsts = |
| 412 | DC.getInstructionsForAccess(Ptr: Src->PointerValue, isWrite: Src->IsWritePtr); |
| 413 | SmallVector<Instruction *, 4> SinkInsts = |
| 414 | DC.getInstructionsForAccess(Ptr: Sink->PointerValue, isWrite: Sink->IsWritePtr); |
| 415 | Type *SrcTy = getLoadStoreType(I: SrcInsts[0]); |
| 416 | Type *DstTy = getLoadStoreType(I: SinkInsts[0]); |
| 417 | if (isa<ScalableVectorType>(Val: SrcTy) || isa<ScalableVectorType>(Val: DstTy)) |
| 418 | return false; |
| 419 | |
| 420 | const DataLayout &DL = InnerLoop->getHeader()->getDataLayout(); |
| 421 | unsigned AllocSize = |
| 422 | std::max(a: DL.getTypeAllocSize(Ty: SrcTy), b: DL.getTypeAllocSize(Ty: DstTy)); |
| 423 | |
// Only constant steps matching the AllocSize are supported at the moment.
// This simplifies the difference computation and can be extended in the
// future.
| 427 | if (Step->getAPInt().abs() != AllocSize) |
| 428 | return false; |
| 429 | |
| 430 | IntegerType *IntTy = |
| 431 | IntegerType::get(C&: Src->PointerValue->getContext(), |
| 432 | NumBits: DL.getPointerSizeInBits(AS: CGI.AddressSpace)); |
| 433 | |
| 434 | // When counting down, the dependence distance needs to be swapped. |
| 435 | if (Step->getValue()->isNegative()) |
| 436 | std::swap(a&: SinkStart, b&: SrcStart); |
| 437 | |
| 438 | const SCEV *SinkStartInt = SE->getPtrToIntExpr(Op: SinkStart, Ty: IntTy); |
| 439 | const SCEV *SrcStartInt = SE->getPtrToIntExpr(Op: SrcStart, Ty: IntTy); |
| 440 | if (isa<SCEVCouldNotCompute>(Val: SinkStartInt) || |
| 441 | isa<SCEVCouldNotCompute>(Val: SrcStartInt)) |
| 442 | return false; |
| 443 | |
| 444 | // If the start values for both Src and Sink also vary according to an outer |
| 445 | // loop, then it's probably better to avoid creating diff checks because |
| 446 | // they may not be hoisted. We should instead let llvm::addRuntimeChecks |
| 447 | // do the expanded full range overlap checks, which can be hoisted. |
| 448 | if (HoistRuntimeChecks && InnerLoop->getParentLoop() && |
| 449 | isa<SCEVAddRecExpr>(Val: SinkStartInt) && isa<SCEVAddRecExpr>(Val: SrcStartInt)) { |
| 450 | auto *SrcStartAR = cast<SCEVAddRecExpr>(Val: SrcStartInt); |
| 451 | auto *SinkStartAR = cast<SCEVAddRecExpr>(Val: SinkStartInt); |
| 452 | const Loop *StartARLoop = SrcStartAR->getLoop(); |
| 453 | if (StartARLoop == SinkStartAR->getLoop() && |
| 454 | StartARLoop == InnerLoop->getParentLoop() && |
| 455 | // If the diff check would already be loop invariant (due to the |
| 456 | // recurrences being the same), then we prefer to keep the diff checks |
| 457 | // because they are cheaper. |
| 458 | SrcStartAR->getStepRecurrence(SE&: *SE) != |
| 459 | SinkStartAR->getStepRecurrence(SE&: *SE)) { |
| 460 | LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these " |
| 461 | "cannot be hoisted out of the outer loop\n" ); |
| 462 | return false; |
| 463 | } |
| 464 | } |
| 465 | |
| 466 | LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n" |
| 467 | << "SrcStart: " << *SrcStartInt << '\n' |
| 468 | << "SinkStartInt: " << *SinkStartInt << '\n'); |
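// Each DiffCheck entry is later expanded (e.g. by llvm::addDiffRuntimeChecks)
// into a single comparison on (SinkStartInt - SrcStartInt) rather than a full
// interval-overlap check.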
| 469 | DiffChecks.emplace_back(Args&: SrcStartInt, Args&: SinkStartInt, Args&: AllocSize, |
| 470 | Args: Src->NeedsFreeze || Sink->NeedsFreeze); |
| 471 | return true; |
| 472 | } |
| 473 | |
| 474 | SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() { |
| 475 | SmallVector<RuntimePointerCheck, 4> Checks; |
| 476 | |
| 477 | for (unsigned I = 0; I < CheckingGroups.size(); ++I) { |
| 478 | for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) { |
| 479 | const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I]; |
| 480 | const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J]; |
| 481 | |
| 482 | if (needsChecking(M: CGI, N: CGJ)) { |
| 483 | CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ); |
| 484 | Checks.emplace_back(Args: &CGI, Args: &CGJ); |
| 485 | } |
| 486 | } |
| 487 | } |
| 488 | return Checks; |
| 489 | } |
| 490 | |
| 491 | void RuntimePointerChecking::generateChecks( |
| 492 | MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) { |
| 493 | assert(Checks.empty() && "Checks is not empty" ); |
| 494 | groupChecks(DepCands, UseDependencies); |
| 495 | Checks = generateChecks(); |
| 496 | } |
| 497 | |
| 498 | bool RuntimePointerChecking::needsChecking( |
| 499 | const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const { |
| 500 | for (const auto &I : M.Members) |
| 501 | for (const auto &J : N.Members) |
| 502 | if (needsChecking(I, J)) |
| 503 | return true; |
| 504 | return false; |
| 505 | } |
| 506 | |
| 507 | /// Compare \p I and \p J and return the minimum. |
| 508 | /// Return nullptr in case we couldn't find an answer. |
| 509 | static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J, |
| 510 | ScalarEvolution *SE) { |
| 511 | std::optional<APInt> Diff = SE->computeConstantDifference(LHS: J, RHS: I); |
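// computeConstantDifference folds (J - I) to a constant when possible; a
// negative difference means J is the smaller, i.e. the minimum, expression.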
| 512 | if (!Diff) |
| 513 | return nullptr; |
| 514 | return Diff->isNegative() ? J : I; |
| 515 | } |
| 516 | |
| 517 | bool RuntimeCheckingPtrGroup::addPointer( |
| 518 | unsigned Index, const RuntimePointerChecking &RtCheck) { |
| 519 | return addPointer( |
| 520 | Index, Start: RtCheck.Pointers[Index].Start, End: RtCheck.Pointers[Index].End, |
| 521 | AS: RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(), |
| 522 | NeedsFreeze: RtCheck.Pointers[Index].NeedsFreeze, SE&: *RtCheck.SE); |
| 523 | } |
| 524 | |
| 525 | bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start, |
| 526 | const SCEV *End, unsigned AS, |
| 527 | bool NeedsFreeze, |
| 528 | ScalarEvolution &SE) { |
| 529 | assert(AddressSpace == AS && |
| 530 | "all pointers in a checking group must be in the same address space" ); |
| 531 | |
| 532 | // Compare the starts and ends with the known minimum and maximum |
| 533 | // of this set. We need to know how we compare against the min/max |
| 534 | // of the set in order to be able to emit memchecks. |
| 535 | const SCEV *Min0 = getMinFromExprs(I: Start, J: Low, SE: &SE); |
| 536 | if (!Min0) |
| 537 | return false; |
| 538 | |
| 539 | const SCEV *Min1 = getMinFromExprs(I: End, J: High, SE: &SE); |
| 540 | if (!Min1) |
| 541 | return false; |
| 542 | |
| 543 | // Update the low bound expression if we've found a new min value. |
| 544 | if (Min0 == Start) |
| 545 | Low = Start; |
| 546 | |
| 547 | // Update the high bound expression if we've found a new max value. |
| 548 | if (Min1 != End) |
| 549 | High = End; |
| 550 | |
| 551 | Members.push_back(Elt: Index); |
| 552 | this->NeedsFreeze |= NeedsFreeze; |
| 553 | return true; |
| 554 | } |
| 555 | |
| 556 | void RuntimePointerChecking::groupChecks( |
| 557 | MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) { |
| 558 | // We build the groups from dependency candidates equivalence classes |
| 559 | // because: |
| 560 | // - We know that pointers in the same equivalence class share |
| 561 | // the same underlying object and therefore there is a chance |
| 562 | // that we can compare pointers |
| 563 | // - We wouldn't be able to merge two pointers for which we need |
| 564 | // to emit a memcheck. The classes in DepCands are already |
| 565 | // conveniently built such that no two pointers in the same |
| 566 | // class need checking against each other. |
| 567 | |
| 568 | // We use the following (greedy) algorithm to construct the groups |
| 569 | // For every pointer in the equivalence class: |
| 570 | // For each existing group: |
| 571 | // - if the difference between this pointer and the min/max bounds |
| 572 | // of the group is a constant, then make the pointer part of the |
| 573 | // group and update the min/max bounds of that group as required. |
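//
// For example, accesses to %a, %a + 4 and %a + 8 (constant offsets from the
// same base) can typically be merged into a single group whose Low/High
// bounds cover all three.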
| 574 | |
| 575 | CheckingGroups.clear(); |
| 576 | |
| 577 | // If we need to check two pointers to the same underlying object |
| 578 | // with a non-constant difference, we shouldn't perform any pointer |
| 579 | // grouping with those pointers. This is because we can easily get |
| 580 | // into cases where the resulting check would return false, even when |
| 581 | // the accesses are safe. |
| 582 | // |
| 583 | // The following example shows this: |
| 584 | // for (i = 0; i < 1000; ++i) |
| 585 | // a[5000 + i * m] = a[i] + a[i + 9000] |
| 586 | // |
| 587 | // Here grouping gives a check of (5000, 5000 + 1000 * m) against |
| 588 | // (0, 10000) which is always false. However, if m is 1, there is no |
| 589 | // dependence. Not grouping the checks for a[i] and a[i + 9000] allows |
| 590 | // us to perform an accurate check in this case. |
| 591 | // |
| 592 | // The above case requires that we have an UnknownDependence between |
| 593 | // accesses to the same underlying object. This cannot happen unless |
| 594 | // FoundNonConstantDistanceDependence is set, and therefore UseDependencies |
| 595 | // is also false. In this case we will use the fallback path and create |
| 596 | // separate checking groups for all pointers. |
| 597 | |
| 598 | // If we don't have the dependency partitions, construct a new |
| 599 | // checking pointer group for each pointer. This is also required |
| 600 | // for correctness, because in this case we can have checking between |
| 601 | // pointers to the same underlying object. |
| 602 | if (!UseDependencies) { |
| 603 | for (unsigned I = 0; I < Pointers.size(); ++I) |
| 604 | CheckingGroups.emplace_back(Args&: I, Args&: *this); |
| 605 | return; |
| 606 | } |
| 607 | |
| 608 | unsigned TotalComparisons = 0; |
| 609 | |
| 610 | DenseMap<Value *, SmallVector<unsigned>> PositionMap; |
| 611 | for (unsigned Index = 0; Index < Pointers.size(); ++Index) |
| 612 | PositionMap[Pointers[Index].PointerValue].push_back(Elt: Index); |
| 613 | |
| 614 | // We need to keep track of what pointers we've already seen so we |
| 615 | // don't process them twice. |
| 616 | SmallSet<unsigned, 2> Seen; |
| 617 | |
| 618 | // Go through all equivalence classes, get the "pointer check groups" |
| 619 | // and add them to the overall solution. We use the order in which accesses |
| 620 | // appear in 'Pointers' to enforce determinism. |
| 621 | for (unsigned I = 0; I < Pointers.size(); ++I) { |
| 622 | // We've seen this pointer before, and therefore already processed |
| 623 | // its equivalence class. |
| 624 | if (Seen.contains(V: I)) |
| 625 | continue; |
| 626 | |
| 627 | MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue, |
| 628 | Pointers[I].IsWritePtr); |
| 629 | |
| 630 | SmallVector<RuntimeCheckingPtrGroup, 2> Groups; |
| 631 | |
| 632 | // Because DepCands is constructed by visiting accesses in the order in |
| 633 | // which they appear in alias sets (which is deterministic) and the |
| 634 | // iteration order within an equivalence class member is only dependent on |
| 635 | // the order in which unions and insertions are performed on the |
| 636 | // equivalence class, the iteration order is deterministic. |
| 637 | for (auto M : DepCands.members(V: Access)) { |
| 638 | auto PointerI = PositionMap.find(Val: M.getPointer()); |
| 639 | // If we can't find the pointer in PositionMap that means we can't |
| 640 | // generate a memcheck for it. |
| 641 | if (PointerI == PositionMap.end()) |
| 642 | continue; |
| 643 | for (unsigned Pointer : PointerI->second) { |
| 644 | bool Merged = false; |
| 645 | // Mark this pointer as seen. |
| 646 | Seen.insert(V: Pointer); |
| 647 | |
| 648 | // Go through all the existing sets and see if we can find one |
| 649 | // which can include this pointer. |
| 650 | for (RuntimeCheckingPtrGroup &Group : Groups) { |
| 651 | // Don't perform more than a certain amount of comparisons. |
| 652 | // This should limit the cost of grouping the pointers to something |
| 653 | // reasonable. If we do end up hitting this threshold, the algorithm |
| 654 | // will create separate groups for all remaining pointers. |
| 655 | if (TotalComparisons > MemoryCheckMergeThreshold) |
| 656 | break; |
| 657 | |
| 658 | TotalComparisons++; |
| 659 | |
| 660 | if (Group.addPointer(Index: Pointer, RtCheck: *this)) { |
| 661 | Merged = true; |
| 662 | break; |
| 663 | } |
| 664 | } |
| 665 | |
| 666 | if (!Merged) |
| 667 | // We couldn't add this pointer to any existing set or the threshold |
| 668 | // for the number of comparisons has been reached. Create a new group |
| 669 | // to hold the current pointer. |
| 670 | Groups.emplace_back(Args&: Pointer, Args&: *this); |
| 671 | } |
| 672 | } |
| 673 | |
| 674 | // We've computed the grouped checks for this partition. |
| 675 | // Save the results and continue with the next one. |
| 676 | llvm::append_range(C&: CheckingGroups, R&: Groups); |
| 677 | } |
| 678 | } |
| 679 | |
| 680 | bool RuntimePointerChecking::arePointersInSamePartition( |
| 681 | const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1, |
| 682 | unsigned PtrIdx2) { |
| 683 | return (PtrToPartition[PtrIdx1] != -1 && |
| 684 | PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]); |
| 685 | } |
| 686 | |
| 687 | bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const { |
| 688 | const PointerInfo &PointerI = Pointers[I]; |
| 689 | const PointerInfo &PointerJ = Pointers[J]; |
| 690 | |
| 691 | // No need to check if two readonly pointers intersect. |
| 692 | if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr) |
| 693 | return false; |
| 694 | |
| 695 | // Only need to check pointers between two different dependency sets. |
| 696 | if (PointerI.DependencySetId == PointerJ.DependencySetId) |
| 697 | return false; |
| 698 | |
| 699 | // Only need to check pointers in the same alias set. |
| 700 | return PointerI.AliasSetId == PointerJ.AliasSetId; |
| 701 | } |
| 702 | |
| 703 | /// Assign each RuntimeCheckingPtrGroup pointer an index for stable UTC output. |
| 704 | static DenseMap<const RuntimeCheckingPtrGroup *, unsigned> |
| 705 | getPtrToIdxMap(ArrayRef<RuntimeCheckingPtrGroup> CheckingGroups) { |
| 706 | DenseMap<const RuntimeCheckingPtrGroup *, unsigned> PtrIndices; |
| 707 | for (const auto &[Idx, CG] : enumerate(First&: CheckingGroups)) |
| 708 | PtrIndices[&CG] = Idx; |
| 709 | return PtrIndices; |
| 710 | } |
| 711 | |
| 712 | void RuntimePointerChecking::printChecks( |
| 713 | raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks, |
| 714 | unsigned Depth) const { |
| 715 | unsigned N = 0; |
| 716 | auto PtrIndices = getPtrToIdxMap(CheckingGroups); |
| 717 | for (const auto &[Check1, Check2] : Checks) { |
| 718 | const auto &First = Check1->Members, &Second = Check2->Members; |
| 719 | OS.indent(NumSpaces: Depth) << "Check " << N++ << ":\n" ; |
| 720 | OS.indent(NumSpaces: Depth + 2) << "Comparing group GRP" << PtrIndices.at(Val: Check1) |
| 721 | << ":\n" ; |
| 722 | for (unsigned K : First) |
| 723 | OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n" ; |
| 724 | OS.indent(NumSpaces: Depth + 2) << "Against group GRP" << PtrIndices.at(Val: Check2) |
| 725 | << ":\n" ; |
| 726 | for (unsigned K : Second) |
| 727 | OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n" ; |
| 728 | } |
| 729 | } |
| 730 | |
| 731 | void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const { |
| 732 | |
| 733 | OS.indent(NumSpaces: Depth) << "Run-time memory checks:\n" ; |
| 734 | printChecks(OS, Checks, Depth); |
| 735 | |
| 736 | OS.indent(NumSpaces: Depth) << "Grouped accesses:\n" ; |
| 737 | auto PtrIndices = getPtrToIdxMap(CheckingGroups); |
| 738 | for (const auto &CG : CheckingGroups) { |
| 739 | OS.indent(NumSpaces: Depth + 2) << "Group GRP" << PtrIndices.at(Val: &CG) << ":\n" ; |
| 740 | OS.indent(NumSpaces: Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High |
| 741 | << ")\n" ; |
| 742 | for (unsigned Member : CG.Members) { |
| 743 | OS.indent(NumSpaces: Depth + 6) << "Member: " << *Pointers[Member].Expr << "\n" ; |
| 744 | } |
| 745 | } |
| 746 | } |
| 747 | |
| 748 | namespace { |
| 749 | |
| 750 | /// Analyses memory accesses in a loop. |
| 751 | /// |
| 752 | /// Checks whether run time pointer checks are needed and builds sets for data |
| 753 | /// dependence checking. |
| 754 | class AccessAnalysis { |
| 755 | public: |
| 756 | /// Read or write access location. |
| 757 | typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; |
| 758 | typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList; |
| 759 | |
| 760 | AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI, |
| 761 | MemoryDepChecker::DepCandidates &DA, |
| 762 | PredicatedScalarEvolution &PSE, |
| 763 | SmallPtrSetImpl<MDNode *> &LoopAliasScopes) |
| 764 | : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), |
| 765 | LoopAliasScopes(LoopAliasScopes) { |
| 766 | // We're analyzing dependences across loop iterations. |
| 767 | BAA.enableCrossIterationMode(); |
| 768 | } |
| 769 | |
| 770 | /// Register a load and whether it is only read from. |
| 771 | void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) { |
| 772 | Value *Ptr = const_cast<Value *>(Loc.Ptr); |
| 773 | AST.add(Loc: adjustLoc(Loc)); |
| 774 | Accesses[MemAccessInfo(Ptr, false)].insert(X: AccessTy); |
| 775 | if (IsReadOnly) |
| 776 | ReadOnlyPtr.insert(Ptr); |
| 777 | } |
| 778 | |
| 779 | /// Register a store. |
| 780 | void addStore(const MemoryLocation &Loc, Type *AccessTy) { |
| 781 | Value *Ptr = const_cast<Value *>(Loc.Ptr); |
| 782 | AST.add(Loc: adjustLoc(Loc)); |
| 783 | Accesses[MemAccessInfo(Ptr, true)].insert(X: AccessTy); |
| 784 | } |
| 785 | |
| 786 | /// Check if we can emit a run-time no-alias check for \p Access. |
| 787 | /// |
| 788 | /// Returns true if we can emit a run-time no alias check for \p Access. |
| 789 | /// If we can check this access, this also adds it to a dependence set and |
/// adds a run-time check for it to \p RtCheck. If \p Assume is true,
| 791 | /// we will attempt to use additional run-time checks in order to get |
| 792 | /// the bounds of the pointer. |
| 793 | bool createCheckForAccess(RuntimePointerChecking &RtCheck, |
| 794 | MemAccessInfo Access, Type *AccessTy, |
| 795 | const DenseMap<Value *, const SCEV *> &Strides, |
| 796 | DenseMap<Value *, unsigned> &DepSetId, |
| 797 | Loop *TheLoop, unsigned &RunningDepId, |
| 798 | unsigned ASId, bool Assume); |
| 799 | |
| 800 | /// Check whether we can check the pointers at runtime for |
| 801 | /// non-intersection. |
| 802 | /// |
| 803 | /// Returns true if we need no check or if we do and we can generate them |
| 804 | /// (i.e. the pointers have computable bounds). A return value of false means |
| 805 | /// we couldn't analyze and generate runtime checks for all pointers in the |
| 806 | /// loop, but if \p AllowPartial is set then we will have checks for those |
| 807 | /// pointers we could analyze. |
| 808 | bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, Loop *TheLoop, |
| 809 | const DenseMap<Value *, const SCEV *> &Strides, |
| 810 | Value *&UncomputablePtr, bool AllowPartial); |
| 811 | |
| 812 | /// Goes over all memory accesses, checks whether a RT check is needed |
| 813 | /// and builds sets of dependent accesses. |
| 814 | void buildDependenceSets() { |
| 815 | processMemAccesses(); |
| 816 | } |
| 817 | |
| 818 | /// Initial processing of memory accesses determined that we need to |
| 819 | /// perform dependency checking. |
| 820 | /// |
| 821 | /// Note that this can later be cleared if we retry memcheck analysis without |
| 822 | /// dependency checking (i.e. FoundNonConstantDistanceDependence). |
| 823 | bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); } |
| 824 | |
| 825 | /// We decided that no dependence analysis would be used. Reset the state. |
| 826 | void resetDepChecks(MemoryDepChecker &DepChecker) { |
| 827 | CheckDeps.clear(); |
| 828 | DepChecker.clearDependences(); |
| 829 | } |
| 830 | |
| 831 | const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; } |
| 832 | |
| 833 | private: |
| 834 | typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap; |
| 835 | |
| 836 | /// Adjust the MemoryLocation so that it represents accesses to this |
| 837 | /// location across all iterations, rather than a single one. |
| 838 | MemoryLocation adjustLoc(MemoryLocation Loc) const { |
| 839 | // The accessed location varies within the loop, but remains within the |
| 840 | // underlying object. |
| 841 | Loc.Size = LocationSize::beforeOrAfterPointer(); |
| 842 | Loc.AATags.Scope = adjustAliasScopeList(ScopeList: Loc.AATags.Scope); |
| 843 | Loc.AATags.NoAlias = adjustAliasScopeList(ScopeList: Loc.AATags.NoAlias); |
| 844 | return Loc; |
| 845 | } |
| 846 | |
| 847 | /// Drop alias scopes that are only valid within a single loop iteration. |
| 848 | MDNode *adjustAliasScopeList(MDNode *ScopeList) const { |
| 849 | if (!ScopeList) |
| 850 | return nullptr; |
| 851 | |
| 852 | // For the sake of simplicity, drop the whole scope list if any scope is |
| 853 | // iteration-local. |
| 854 | if (any_of(Range: ScopeList->operands(), P: [&](Metadata *Scope) { |
| 855 | return LoopAliasScopes.contains(Ptr: cast<MDNode>(Val: Scope)); |
| 856 | })) |
| 857 | return nullptr; |
| 858 | |
| 859 | return ScopeList; |
| 860 | } |
| 861 | |
/// Go over all memory accesses and check whether runtime pointer checks
| 863 | /// are needed and build sets of dependency check candidates. |
| 864 | void processMemAccesses(); |
| 865 | |
| 866 | /// Map of all accesses. Values are the types used to access memory pointed to |
| 867 | /// by the pointer. |
| 868 | PtrAccessMap Accesses; |
| 869 | |
| 870 | /// The loop being checked. |
| 871 | const Loop *TheLoop; |
| 872 | |
| 873 | /// List of accesses that need a further dependence check. |
| 874 | MemAccessInfoList CheckDeps; |
| 875 | |
| 876 | /// Set of pointers that are read only. |
| 877 | SmallPtrSet<Value*, 16> ReadOnlyPtr; |
| 878 | |
| 879 | /// Batched alias analysis results. |
| 880 | BatchAAResults BAA; |
| 881 | |
| 882 | /// An alias set tracker to partition the access set by underlying object and |
/// intrinsic property (such as TBAA metadata).
| 884 | AliasSetTracker AST; |
| 885 | |
| 886 | /// The LoopInfo of the loop being checked. |
| 887 | const LoopInfo *LI; |
| 888 | |
| 889 | /// Sets of potentially dependent accesses - members of one set share an |
/// underlying pointer. The set "CheckDeps" identifies which sets really need a
| 891 | /// dependence check. |
| 892 | MemoryDepChecker::DepCandidates &DepCands; |
| 893 | |
| 894 | /// Initial processing of memory accesses determined that we may need |
| 895 | /// to add memchecks. Perform the analysis to determine the necessary checks. |
| 896 | /// |
/// Note that this is different from isDependencyCheckNeeded. When we retry
| 898 | /// memcheck analysis without dependency checking |
| 899 | /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is |
| 900 | /// cleared while this remains set if we have potentially dependent accesses. |
| 901 | bool IsRTCheckAnalysisNeeded = false; |
| 902 | |
| 903 | /// The SCEV predicate containing all the SCEV-related assumptions. |
| 904 | PredicatedScalarEvolution &PSE; |
| 905 | |
| 906 | DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects; |
| 907 | |
| 908 | /// Alias scopes that are declared inside the loop, and as such not valid |
| 909 | /// across iterations. |
| 910 | SmallPtrSetImpl<MDNode *> &LoopAliasScopes; |
| 911 | }; |
| 912 | |
| 913 | } // end anonymous namespace |
| 914 | |
| 915 | /// Try to compute a constant stride for \p AR. Used by getPtrStride and |
| 916 | /// isNoWrap. |
| 917 | static std::optional<int64_t> |
| 918 | getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, |
| 919 | Value *Ptr, PredicatedScalarEvolution &PSE) { |
| 920 | // The access function must stride over the innermost loop. |
| 921 | if (Lp != AR->getLoop()) { |
| 922 | LLVM_DEBUG({ |
| 923 | dbgs() << "LAA: Bad stride - Not striding over innermost loop " ; |
| 924 | if (Ptr) |
| 925 | dbgs() << *Ptr << " " ; |
| 926 | |
| 927 | dbgs() << "SCEV: " << *AR << "\n" ; |
| 928 | }); |
| 929 | return std::nullopt; |
| 930 | } |
| 931 | |
| 932 | // Check the step is constant. |
| 933 | const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE()); |
| 934 | |
| 935 | // Calculate the pointer stride and check if it is constant. |
| 936 | const APInt *APStepVal; |
| 937 | if (!match(S: Step, P: m_scev_APInt(C&: APStepVal))) { |
| 938 | LLVM_DEBUG({ |
| 939 | dbgs() << "LAA: Bad stride - Not a constant strided " ; |
| 940 | if (Ptr) |
| 941 | dbgs() << *Ptr << " " ; |
| 942 | dbgs() << "SCEV: " << *AR << "\n" ; |
| 943 | }); |
| 944 | return std::nullopt; |
| 945 | } |
| 946 | |
| 947 | const auto &DL = Lp->getHeader()->getDataLayout(); |
| 948 | TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy); |
| 949 | int64_t Size = AllocSize.getFixedValue(); |
| 950 | |
| 951 | // Huge step value - give up. |
| 952 | std::optional<int64_t> StepVal = APStepVal->trySExtValue(); |
| 953 | if (!StepVal) |
| 954 | return std::nullopt; |
| 955 | |
| 956 | // Strided access. |
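// E.g. a byte step of 8 over i32 elements (Size == 4) yields a stride of 2; a
// step that is not a multiple of the access size is rejected below.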
| 957 | return *StepVal % Size ? std::nullopt : std::make_optional(t: *StepVal / Size); |
| 958 | } |
| 959 | |
| 960 | /// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use |
/// information from the IR pointer value to determine no-wrap.
| 962 | static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, |
| 963 | Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, |
| 964 | std::optional<int64_t> Stride = std::nullopt) { |
| 965 | // FIXME: This should probably only return true for NUW. |
| 966 | if (AR->getNoWrapFlags(Mask: SCEV::NoWrapMask)) |
| 967 | return true; |
| 968 | |
| 969 | if (Ptr && PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW)) |
| 970 | return true; |
| 971 | |
| 972 | // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap, |
| 973 | // the distance between the previously accessed location and the wrapped |
| 974 | // location will be larger than half the pointer index type space. In that |
| 975 | // case, the GEP would be poison and any memory access dependent on it would |
| 976 | // be immediate UB when executed. |
| 977 | if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Val: Ptr); |
| 978 | GEP && GEP->hasNoUnsignedSignedWrap()) |
| 979 | return true; |
| 980 | |
| 981 | if (!Stride) |
| 982 | Stride = getStrideFromAddRec(AR, Lp: L, AccessTy, Ptr, PSE); |
| 983 | if (Stride) { |
// If the null pointer is undefined, then an access sequence which would
| 985 | // otherwise access it can be assumed not to unsigned wrap. Note that this |
| 986 | // assumes the object in memory is aligned to the natural alignment. |
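// (Intuitively, a unit-stride sequence that unsigned-wrapped would have to
// reach the region around address zero, which is assumed non-dereferenceable.)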
| 987 | unsigned AddrSpace = AR->getType()->getPointerAddressSpace(); |
| 988 | if (!NullPointerIsDefined(F: L->getHeader()->getParent(), AS: AddrSpace) && |
| 989 | (Stride == 1 || Stride == -1)) |
| 990 | return true; |
| 991 | } |
| 992 | |
| 993 | if (Ptr && Assume) { |
| 994 | PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW); |
| 995 | LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n" |
| 996 | << "LAA: Pointer: " << *Ptr << "\n" |
| 997 | << "LAA: SCEV: " << *AR << "\n" |
| 998 | << "LAA: Added an overflow assumption\n" ); |
| 999 | return true; |
| 1000 | } |
| 1001 | |
| 1002 | return false; |
| 1003 | } |
| 1004 | |
| 1005 | static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, |
| 1006 | function_ref<void(Value *)> AddPointer) { |
| 1007 | SmallPtrSet<Value *, 8> Visited; |
| 1008 | SmallVector<Value *> WorkList; |
| 1009 | WorkList.push_back(Elt: StartPtr); |
| 1010 | |
| 1011 | while (!WorkList.empty()) { |
| 1012 | Value *Ptr = WorkList.pop_back_val(); |
| 1013 | if (!Visited.insert(Ptr).second) |
| 1014 | continue; |
| 1015 | auto *PN = dyn_cast<PHINode>(Val: Ptr); |
| 1016 | // SCEV does not look through non-header PHIs inside the loop. Such phis |
| 1017 | // can be analyzed by adding separate accesses for each incoming pointer |
| 1018 | // value. |
| 1019 | if (PN && InnermostLoop.contains(BB: PN->getParent()) && |
| 1020 | PN->getParent() != InnermostLoop.getHeader()) { |
| 1021 | llvm::append_range(C&: WorkList, R: PN->incoming_values()); |
| 1022 | } else |
| 1023 | AddPointer(Ptr); |
| 1024 | } |
| 1025 | } |
| 1026 | |
| 1027 | // Walk back through the IR for a pointer, looking for a select like the |
| 1028 | // following: |
| 1029 | // |
| 1030 | // %offset = select i1 %cmp, i64 %a, i64 %b |
| 1031 | // %addr = getelementptr double, double* %base, i64 %offset |
| 1032 | // %ld = load double, double* %addr, align 8 |
| 1033 | // |
| 1034 | // We won't be able to form a single SCEVAddRecExpr from this since the |
| 1035 | // address for each loop iteration depends on %cmp. We could potentially |
| 1036 | // produce multiple valid SCEVAddRecExprs, though, and check all of them for |
| 1037 | // memory safety/aliasing if needed. |
| 1038 | // |
| 1039 | // If we encounter some IR we don't yet handle, or something obviously fine |
| 1040 | // like a constant, then we just add the SCEV for that term to the list passed |
| 1041 | // in by the caller. If we have a node that may potentially yield a valid |
| 1042 | // SCEVAddRecExpr then we decompose it into parts and build the SCEV terms |
| 1043 | // ourselves before adding to the list. |
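// For the select example above, the two candidate SCEVs would be
// (%base + 8 * %a) and (%base + 8 * %b): each select operand is decomposed
// separately, scaled by sizeof(double), and added to the common base.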
| 1044 | static void findForkedSCEVs( |
| 1045 | ScalarEvolution *SE, const Loop *L, Value *Ptr, |
| 1046 | SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList, |
| 1047 | unsigned Depth) { |
| 1048 | // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or |
| 1049 | // we've exceeded our limit on recursion, just return whatever we have |
| 1050 | // regardless of whether it can be used for a forked pointer or not, along |
| 1051 | // with an indication of whether it might be a poison or undef value. |
| 1052 | const SCEV *Scev = SE->getSCEV(V: Ptr); |
| 1053 | if (isa<SCEVAddRecExpr>(Val: Scev) || L->isLoopInvariant(V: Ptr) || |
| 1054 | !isa<Instruction>(Val: Ptr) || Depth == 0) { |
| 1055 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
| 1056 | return; |
| 1057 | } |
| 1058 | |
| 1059 | Depth--; |
| 1060 | |
| 1061 | auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) { |
| 1062 | return get<1>(Pair: S); |
| 1063 | }; |
| 1064 | |
| 1065 | auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) { |
| 1066 | switch (Opcode) { |
| 1067 | case Instruction::Add: |
| 1068 | return SE->getAddExpr(LHS: L, RHS: R); |
| 1069 | case Instruction::Sub: |
| 1070 | return SE->getMinusSCEV(LHS: L, RHS: R); |
| 1071 | default: |
| 1072 | llvm_unreachable("Unexpected binary operator when walking ForkedPtrs" ); |
| 1073 | } |
| 1074 | }; |
| 1075 | |
| 1076 | Instruction *I = cast<Instruction>(Val: Ptr); |
| 1077 | unsigned Opcode = I->getOpcode(); |
| 1078 | switch (Opcode) { |
| 1079 | case Instruction::GetElementPtr: { |
| 1080 | auto *GEP = cast<GetElementPtrInst>(Val: I); |
| 1081 | Type *SourceTy = GEP->getSourceElementType(); |
| 1082 | // We only handle base + single offset GEPs here for now. |
| 1083 | // Not dealing with preexisting gathers yet, so no vectors. |
| 1084 | if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) { |
| 1085 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: GEP)); |
| 1086 | break; |
| 1087 | } |
| 1088 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs; |
| 1089 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs; |
| 1090 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: BaseScevs, Depth); |
| 1091 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: OffsetScevs, Depth); |
| 1092 | |
| 1093 | // See if we need to freeze our fork... |
| 1094 | bool NeedsFreeze = any_of(Range&: BaseScevs, P: UndefPoisonCheck) || |
| 1095 | any_of(Range&: OffsetScevs, P: UndefPoisonCheck); |
| 1096 | |
| 1097 | // Check that we only have a single fork, on either the base or the offset. |
| 1098 | // Copy the SCEV across for the one without a fork in order to generate |
| 1099 | // the full SCEV for both sides of the GEP. |
| 1100 | if (OffsetScevs.size() == 2 && BaseScevs.size() == 1) |
| 1101 | BaseScevs.push_back(Elt: BaseScevs[0]); |
| 1102 | else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1) |
| 1103 | OffsetScevs.push_back(Elt: OffsetScevs[0]); |
| 1104 | else { |
| 1105 | ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze); |
| 1106 | break; |
| 1107 | } |
| 1108 | |
| 1109 | Type *IntPtrTy = SE->getEffectiveSCEVType(Ty: GEP->getPointerOperandType()); |
| 1110 | |
| 1111 | // Find the size of the type being pointed to. We only have a single |
| 1112 | // index term (guarded above) so we don't need to index into arrays or |
| 1113 | // structures, just get the size of the scalar value. |
| 1114 | const SCEV *Size = SE->getSizeOfExpr(IntTy: IntPtrTy, AllocTy: SourceTy); |
| 1115 | |
| 1116 | for (auto [B, O] : zip(t&: BaseScevs, u&: OffsetScevs)) { |
| 1117 | const SCEV *Base = get<0>(Pair: B); |
| 1118 | const SCEV *Offset = get<0>(Pair: O); |
| 1119 | |
| 1120 | // Scale up the offsets by the size of the type, then add to the bases. |
| 1121 | const SCEV *Scaled = |
| 1122 | SE->getMulExpr(LHS: Size, RHS: SE->getTruncateOrSignExtend(V: Offset, Ty: IntPtrTy)); |
| 1123 | ScevList.emplace_back(Args: SE->getAddExpr(LHS: Base, RHS: Scaled), Args&: NeedsFreeze); |
| 1124 | } |
| 1125 | break; |
| 1126 | } |
| 1127 | case Instruction::Select: { |
| 1128 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs; |
| 1129 | // A select means we've found a forked pointer, but we currently only |
| 1130 | // support a single select per pointer so if there's another behind this |
| 1131 | // then we just bail out and return the generic SCEV. |
| 1132 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth); |
| 1133 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 2), ScevList&: ChildScevs, Depth); |
| 1134 | if (ChildScevs.size() == 2) |
| 1135 | append_range(C&: ScevList, R&: ChildScevs); |
| 1136 | else |
| 1137 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
| 1138 | break; |
| 1139 | } |
| 1140 | case Instruction::PHI: { |
| 1141 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs; |
| 1142 | // A phi means we've found a forked pointer, but we currently only |
| 1143 | // support a single phi per pointer so if there's another behind this |
| 1144 | // then we just bail out and return the generic SCEV. |
| 1145 | if (I->getNumOperands() == 2) { |
| 1146 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: ChildScevs, Depth); |
| 1147 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth); |
| 1148 | } |
| 1149 | if (ChildScevs.size() == 2) |
| 1150 | append_range(C&: ScevList, R&: ChildScevs); |
| 1151 | else |
| 1152 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
| 1153 | break; |
| 1154 | } |
| 1155 | case Instruction::Add: |
| 1156 | case Instruction::Sub: { |
| 1157 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs; |
| 1158 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs; |
| 1159 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: LScevs, Depth); |
| 1160 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: RScevs, Depth); |
| 1161 | |
| 1162 | // See if we need to freeze our fork... |
| 1163 | bool NeedsFreeze = |
| 1164 | any_of(Range&: LScevs, P: UndefPoisonCheck) || any_of(Range&: RScevs, P: UndefPoisonCheck); |
| 1165 | |
| 1166 | // Check that we only have a single fork, on either the left or right side. |
| 1167 | // Copy the SCEV across for the one without a fork in order to generate |
| 1168 | // the full SCEV for both sides of the BinOp. |
| 1169 | if (LScevs.size() == 2 && RScevs.size() == 1) |
| 1170 | RScevs.push_back(Elt: RScevs[0]); |
| 1171 | else if (RScevs.size() == 2 && LScevs.size() == 1) |
| 1172 | LScevs.push_back(Elt: LScevs[0]); |
| 1173 | else { |
| 1174 | ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze); |
| 1175 | break; |
| 1176 | } |
| 1177 | |
| 1178 | for (auto [L, R] : zip(t&: LScevs, u&: RScevs)) |
| 1179 | ScevList.emplace_back(Args: GetBinOpExpr(Opcode, get<0>(Pair: L), get<0>(Pair: R)), |
| 1180 | Args&: NeedsFreeze); |
| 1181 | break; |
| 1182 | } |
| 1183 | default: |
| 1184 | // Just return the current SCEV if we haven't handled the instruction yet. |
| 1185 | LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n" ); |
| 1186 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
| 1187 | break; |
| 1188 | } |
| 1189 | } |
| 1190 | |
| 1191 | static SmallVector<PointerIntPair<const SCEV *, 1, bool>> |
| 1192 | findForkedPointer(PredicatedScalarEvolution &PSE, |
| 1193 | const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr, |
| 1194 | const Loop *L) { |
| 1195 | ScalarEvolution *SE = PSE.getSE(); |
| 1196 | assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!" ); |
| 1197 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs; |
| 1198 | findForkedSCEVs(SE, L, Ptr, ScevList&: Scevs, Depth: MaxForkedSCEVDepth); |
| 1199 | |
| 1200 | // For now, we will only accept a forked pointer with two possible SCEVs |
| 1201 | // that are either SCEVAddRecExprs or loop invariant. |
| 1202 | if (Scevs.size() == 2 && |
| 1203 | (isa<SCEVAddRecExpr>(Val: get<0>(Pair: Scevs[0])) || |
| 1204 | SE->isLoopInvariant(S: get<0>(Pair: Scevs[0]), L)) && |
| 1205 | (isa<SCEVAddRecExpr>(Val: get<0>(Pair: Scevs[1])) || |
| 1206 | SE->isLoopInvariant(S: get<0>(Pair: Scevs[1]), L))) { |
| 1207 | LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n" ); |
| 1208 | LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n" ); |
| 1209 | LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n" ); |
| 1210 | return Scevs; |
| 1211 | } |
| 1212 | |
| 1213 | return {{replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr), false}}; |
| 1214 | } |
| 1215 | |
| 1216 | bool AccessAnalysis::createCheckForAccess( |
| 1217 | RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy, |
| 1218 | const DenseMap<Value *, const SCEV *> &StridesMap, |
| 1219 | DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop, |
| 1220 | unsigned &RunningDepId, unsigned ASId, bool Assume) { |
| 1221 | Value *Ptr = Access.getPointer(); |
| 1222 | |
| 1223 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs = |
| 1224 | findForkedPointer(PSE, StridesMap, Ptr, L: TheLoop); |
| 1225 | assert(!TranslatedPtrs.empty() && "must have some translated pointers" ); |
| 1226 | |
| 1227 | /// Check whether all pointers can participate in a runtime bounds check. They |
| 1228 | /// must either be invariant or AddRecs. If ShouldCheckWrap is true, they also |
| 1229 | /// must not wrap. |
| 1230 | for (auto &P : TranslatedPtrs) { |
| 1231 | // The bounds for a loop-invariant pointer are trivial. |
| 1232 | if (PSE.getSE()->isLoopInvariant(S: P.getPointer(), L: TheLoop)) |
| 1233 | continue; |
| 1234 | |
| 1235 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: P.getPointer()); |
| 1236 | if (!AR && Assume) |
| 1237 | AR = PSE.getAsAddRec(V: Ptr); |
| 1238 | if (!AR || !AR->isAffine()) |
| 1239 | return false; |
| 1240 | |
| 1241 | // If there's only one option for Ptr, look it up after bounds and wrap |
| 1242 | // checking, because assumptions might have been added to PSE. |
| 1243 | if (TranslatedPtrs.size() == 1) { |
| 1244 | AR = |
| 1245 | cast<SCEVAddRecExpr>(Val: replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr)); |
| 1246 | P.setPointer(AR); |
| 1247 | } |
| 1248 | |
| 1249 | // When we run after a failing dependency check we have to make sure |
| 1250 | // we don't have wrapping pointers. |
| 1251 | if (!isNoWrap(PSE, AR, Ptr: TranslatedPtrs.size() == 1 ? Ptr : nullptr, AccessTy, |
| 1252 | L: TheLoop, Assume)) { |
| 1253 | return false; |
| 1254 | } |
| 1255 | } |
| 1256 | |
| 1257 | for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) { |
| 1258 | // The id of the dependence set. |
| 1259 | unsigned DepId; |
| 1260 | |
| 1261 | if (isDependencyCheckNeeded()) { |
| 1262 | Value *Leader = DepCands.getLeaderValue(V: Access).getPointer(); |
| 1263 | unsigned &LeaderId = DepSetId[Leader]; |
| 1264 | if (!LeaderId) |
| 1265 | LeaderId = RunningDepId++; |
| 1266 | DepId = LeaderId; |
| 1267 | } else |
| 1268 | // Each access has its own dependence set. |
| 1269 | DepId = RunningDepId++; |
| 1270 | |
| 1271 | bool IsWrite = Access.getInt(); |
| 1272 | RtCheck.insert(Lp: TheLoop, Ptr, PtrExpr, AccessTy, WritePtr: IsWrite, DepSetId: DepId, ASId, PSE, |
| 1273 | NeedsFreeze); |
| 1274 | LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); |
| 1275 | } |
| 1276 | |
| 1277 | return true; |
| 1278 | } |
| 1279 | |
| 1280 | bool AccessAnalysis::canCheckPtrAtRT( |
| 1281 | RuntimePointerChecking &RtCheck, Loop *TheLoop, |
| 1282 | const DenseMap<Value *, const SCEV *> &StridesMap, Value *&UncomputablePtr, |
| 1283 | bool AllowPartial) { |
| 1284 | // Find pointers with computable bounds. We are going to use this information |
| 1285 | // to place a runtime bound check. |
| 1286 | bool CanDoRT = true; |
| 1287 | |
| 1288 | bool MayNeedRTCheck = false; |
| 1289 | if (!IsRTCheckAnalysisNeeded) return true; |
| 1290 | |
| 1291 | bool IsDepCheckNeeded = isDependencyCheckNeeded(); |
| 1292 | |
| 1293 | // We assign a consecutive id to accesses from different alias sets. |
| 1294 | // Accesses between different alias sets don't need to be checked. |
| 1295 | unsigned ASId = 0; |
| 1296 | for (const auto &AS : AST) { |
| 1297 | int NumReadPtrChecks = 0; |
| 1298 | int NumWritePtrChecks = 0; |
| 1299 | bool CanDoAliasSetRT = true; |
| 1300 | ++ASId; |
| 1301 | auto ASPointers = AS.getPointers(); |
| 1302 | |
| 1303 | // We assign consecutive ids to accesses from different dependence sets. |
| 1304 | // Accesses within the same set don't need a runtime check. |
| 1305 | unsigned RunningDepId = 1; |
| 1306 | DenseMap<Value *, unsigned> DepSetId; |
| 1307 | |
| 1308 | SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries; |
| 1309 | |
| 1310 | // First, count how many write and read accesses are in the alias set. Also |
| 1311 | // collect MemAccessInfos for later. |
| 1312 | SmallVector<MemAccessInfo, 4> AccessInfos; |
| 1313 | for (const Value *ConstPtr : ASPointers) { |
| 1314 | Value *Ptr = const_cast<Value *>(ConstPtr); |
| 1315 | bool IsWrite = Accesses.contains(Key: MemAccessInfo(Ptr, true)); |
| 1316 | if (IsWrite) |
| 1317 | ++NumWritePtrChecks; |
| 1318 | else |
| 1319 | ++NumReadPtrChecks; |
| 1320 | AccessInfos.emplace_back(Args&: Ptr, Args&: IsWrite); |
| 1321 | } |
| 1322 | |
| 1323 | // We do not need runtime checks for this alias set if there are no writes, |
| 1324 | // or if there is a single write and no reads. |
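|      | // For example (illustrative), in a loop that only computes |
|      | //   sum += A[i] + B[i]; |
|      | // every pointer in the set is read-only, so no checks are required even |
|      | // if A and B alias. |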
| 1325 | if (NumWritePtrChecks == 0 || |
| 1326 | (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) { |
| 1327 | assert((ASPointers.size() <= 1 || |
| 1328 | all_of(ASPointers, |
| 1329 | [this](const Value *Ptr) { |
| 1330 | MemAccessInfo AccessWrite(const_cast<Value *>(Ptr), |
| 1331 | true); |
| 1332 | return !DepCands.contains(AccessWrite); |
| 1333 | })) && |
| 1334 | "Can only skip updating CanDoRT below, if all entries in AS " |
| 1335 | "are reads or there is at most 1 entry" ); |
| 1336 | continue; |
| 1337 | } |
| 1338 | |
| 1339 | for (auto &Access : AccessInfos) { |
| 1340 | for (const auto &AccessTy : Accesses[Access]) { |
| 1341 | if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, |
| 1342 | DepSetId, TheLoop, RunningDepId, ASId, |
| 1343 | Assume: false)) { |
| 1344 | LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" |
| 1345 | << *Access.getPointer() << '\n'); |
| 1346 | Retries.emplace_back(Args&: Access, Args: AccessTy); |
| 1347 | CanDoAliasSetRT = false; |
| 1348 | } |
| 1349 | } |
| 1350 | } |
| 1351 | |
| 1352 | // Note that this function computes CanDoRT and MayNeedRTCheck |
| 1353 | // independently. For example, CanDoRT=false and MayNeedRTCheck=false means |
| 1354 | // that we have a pointer for which we couldn't find the bounds, but we don't |
| 1355 | // actually need to emit any checks, so it does not matter. |
| 1356 | // |
| 1357 | // We need runtime checks for this alias set, if there are at least 2 |
| 1358 | // dependence sets (in which case RunningDepId > 2) or if we need to re-try |
| 1359 | // any bound checks (because in that case the number of dependence sets is |
| 1360 | // incomplete). |
| 1361 | bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty(); |
| 1362 | |
| 1363 | // We need to perform run-time alias checks, but some pointers had bounds |
| 1364 | // that couldn't be checked. |
| 1365 | if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) { |
| 1366 | // Reset the CanDoAliasSetRT flag and retry all accesses that have failed. |
| 1367 | // We know that we need these checks, so we can now be more aggressive |
| 1368 | // and add further checks if required (overflow checks). |
| 1369 | CanDoAliasSetRT = true; |
| 1370 | for (const auto &[Access, AccessTy] : Retries) { |
| 1371 | if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, |
| 1372 | DepSetId, TheLoop, RunningDepId, ASId, |
| 1373 | /*Assume=*/true)) { |
| 1374 | CanDoAliasSetRT = false; |
| 1375 | UncomputablePtr = Access.getPointer(); |
| 1376 | if (!AllowPartial) |
| 1377 | break; |
| 1378 | } |
| 1379 | } |
| 1380 | } |
| 1381 | |
| 1382 | CanDoRT &= CanDoAliasSetRT; |
| 1383 | MayNeedRTCheck |= NeedsAliasSetRTCheck; |
| 1384 | ++ASId; |
| 1385 | } |
| 1386 | |
| 1387 | // If the pointers that we would use for the bounds comparison have different |
| 1388 | // address spaces, assume the values aren't directly comparable, so we can't |
| 1389 | // use them for the runtime check. We also have to assume they could |
| 1390 | // overlap. In the future there should be metadata for whether address spaces |
| 1391 | // are disjoint. |
| 1392 | unsigned NumPointers = RtCheck.Pointers.size(); |
| 1393 | for (unsigned i = 0; i < NumPointers; ++i) { |
| 1394 | for (unsigned j = i + 1; j < NumPointers; ++j) { |
| 1395 | // Only need to check pointers between two different dependency sets. |
| 1396 | if (RtCheck.Pointers[i].DependencySetId == |
| 1397 | RtCheck.Pointers[j].DependencySetId) |
| 1398 | continue; |
| 1399 | // Only need to check pointers in the same alias set. |
| 1400 | if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId) |
| 1401 | continue; |
| 1402 | |
| 1403 | Value *PtrI = RtCheck.Pointers[i].PointerValue; |
| 1404 | Value *PtrJ = RtCheck.Pointers[j].PointerValue; |
| 1405 | |
| 1406 | unsigned ASi = PtrI->getType()->getPointerAddressSpace(); |
| 1407 | unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); |
| 1408 | if (ASi != ASj) { |
| 1409 | LLVM_DEBUG( |
| 1410 | dbgs() << "LAA: Runtime check would require comparison between" |
| 1411 | " different address spaces\n" ); |
| 1412 | return false; |
| 1413 | } |
| 1414 | } |
| 1415 | } |
| 1416 | |
| 1417 | if (MayNeedRTCheck && (CanDoRT || AllowPartial)) |
| 1418 | RtCheck.generateChecks(DepCands, UseDependencies: IsDepCheckNeeded); |
| 1419 | |
| 1420 | LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks() |
| 1421 | << " pointer comparisons.\n" ); |
| 1422 | |
| 1423 | // If we can do run-time checks but there are no checks to emit, no runtime |
| 1424 | // checks are needed. This can happen when, for example, all pointers point |
| 1425 | // to the same underlying object. |
| 1426 | RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck; |
| 1427 | |
| 1428 | bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT; |
| 1429 | assert(CanDoRTIfNeeded == (CanDoRT || !MayNeedRTCheck) && |
| 1430 | "CanDoRTIfNeeded depends on RtCheck.Need" ); |
| 1431 | if (!CanDoRTIfNeeded && !AllowPartial) |
| 1432 | RtCheck.reset(); |
| 1433 | return CanDoRTIfNeeded; |
| 1434 | } |
| 1435 | |
| 1436 | void AccessAnalysis::processMemAccesses() { |
| 1437 | // We process the set twice: first we process read-write pointers, then we |
| 1438 | // process read-only pointers. This allows us to skip dependence tests for |
| 1439 | // read-only pointers. |
| 1440 | |
| 1441 | LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n" ); |
| 1442 | LLVM_DEBUG(dbgs() << " AST: " ; AST.dump()); |
| 1443 | LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n" ); |
| 1444 | LLVM_DEBUG({ |
| 1445 | for (const auto &[A, _] : Accesses) |
| 1446 | dbgs() << "\t" << *A.getPointer() << " (" |
| 1447 | << (A.getInt() |
| 1448 | ? "write" |
| 1449 | : (ReadOnlyPtr.contains(A.getPointer()) ? "read-only" |
| 1450 | : "read" )) |
| 1451 | << ")\n" ; |
| 1452 | }); |
| 1453 | |
| 1454 | // The AliasSetTracker has nicely partitioned our pointers by metadata |
| 1455 | // compatibility and potential for underlying-object overlap. As a result, we |
| 1456 | // only need to check for potential pointer dependencies within each alias |
| 1457 | // set. |
| 1458 | for (const auto &AS : AST) { |
| 1459 | // Note that both the alias-set tracker and the alias sets themselves use |
| 1460 | // ordered collections internally, so the iteration order here is |
| 1461 | // deterministic. |
| 1462 | auto ASPointers = AS.getPointers(); |
| 1463 | |
| 1464 | bool SetHasWrite = false; |
| 1465 | |
| 1466 | // Map of (underlying object, accessed address space) to the last access |
| 1467 | // encountered. |
| 1468 | typedef DenseMap<std::pair<const Value *, unsigned>, MemAccessInfo> |
| 1469 | UnderlyingObjToAccessMap; |
| 1470 | UnderlyingObjToAccessMap ObjToLastAccess; |
| 1471 | |
| 1472 | // Set of accesses to check after all writes have been processed. |
| 1473 | PtrAccessMap DeferredAccesses; |
| 1474 | |
| 1475 | // Iterate over each alias set twice, once to process read/write pointers, |
| 1476 | // and then to process read-only pointers. |
| 1477 | for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { |
| 1478 | bool UseDeferred = SetIteration > 0; |
| 1479 | PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses; |
| 1480 | |
| 1481 | for (const Value *ConstPtr : ASPointers) { |
| 1482 | Value *Ptr = const_cast<Value *>(ConstPtr); |
| 1483 | |
| 1484 | // For a single memory access in AliasSetTracker, Accesses may contain |
| 1485 | // both read and write, and they both need to be handled for CheckDeps. |
| 1486 | for (const auto &[AC, _] : S) { |
| 1487 | if (AC.getPointer() != Ptr) |
| 1488 | continue; |
| 1489 | |
| 1490 | bool IsWrite = AC.getInt(); |
| 1491 | |
| 1492 | // If we're using the deferred access set, then it contains only |
| 1493 | // reads. |
| 1494 | bool IsReadOnlyPtr = ReadOnlyPtr.contains(Ptr) && !IsWrite; |
| 1495 | if (UseDeferred && !IsReadOnlyPtr) |
| 1496 | continue; |
| 1497 | // Otherwise, the pointer must be in the PtrAccessSet, either as a |
| 1498 | // read or a write. |
| 1499 | assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || |
| 1500 | S.contains(MemAccessInfo(Ptr, false))) && |
| 1501 | "Alias-set pointer not in the access set?" ); |
| 1502 | |
| 1503 | MemAccessInfo Access(Ptr, IsWrite); |
| 1504 | DepCands.insert(Data: Access); |
| 1505 | |
| 1506 | // Memorize read-only pointers for later processing and skip them in |
| 1507 | // the first round (they need to be checked after we have seen all |
| 1508 | // write pointers). Note: we also mark pointers that are not |
| 1509 | // consecutive as "read-only" pointers (so that we check |
| 1510 | // "a[b[i]] +="). Hence, we need the second check for "!IsWrite". |
| 1511 | if (!UseDeferred && IsReadOnlyPtr) { |
| 1512 | // We only use the pointer keys, the types vector values don't |
| 1513 | // matter. |
| 1514 | DeferredAccesses.insert(KV: {Access, {}}); |
| 1515 | continue; |
| 1516 | } |
| 1517 | |
| 1518 | // If this is a write, check other reads and writes for conflicts. If |
| 1519 | // this is a read, only check other writes for conflicts (but only if |
| 1520 | // there is no other write to the pointer - this is an optimization to |
| 1521 | // catch "a[i] = a[i] + ..." without having to do a dependence check). |
| 1522 | if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { |
| 1523 | CheckDeps.push_back(Elt: Access); |
| 1524 | IsRTCheckAnalysisNeeded = true; |
| 1525 | } |
| 1526 | |
| 1527 | if (IsWrite) |
| 1528 | SetHasWrite = true; |
| 1529 | |
| 1530 | // Create sets of pointers connected by a shared alias set and |
| 1531 | // underlying object. |
| 1532 | SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr]; |
| 1533 | UOs = {}; |
| 1534 | ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI); |
| 1535 | LLVM_DEBUG(dbgs() |
| 1536 | << "Underlying objects for pointer " << *Ptr << "\n" ); |
| 1537 | for (const Value *UnderlyingObj : UOs) { |
| 1538 | // nullptr never aliases; don't join sets for pointers that have "null" |
| 1539 | // in their UnderlyingObjects list. |
| 1540 | if (isa<ConstantPointerNull>(Val: UnderlyingObj) && |
| 1541 | !NullPointerIsDefined( |
| 1542 | F: TheLoop->getHeader()->getParent(), |
| 1543 | AS: UnderlyingObj->getType()->getPointerAddressSpace())) |
| 1544 | continue; |
| 1545 | |
| 1546 | auto [It, Inserted] = ObjToLastAccess.try_emplace( |
| 1547 | Key: {UnderlyingObj, |
| 1548 | cast<PointerType>(Val: Ptr->getType())->getAddressSpace()}, |
| 1549 | Args&: Access); |
| 1550 | if (!Inserted) { |
| 1551 | DepCands.unionSets(V1: Access, V2: It->second); |
| 1552 | It->second = Access; |
| 1553 | } |
| 1554 | |
| 1555 | LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n" ); |
| 1556 | } |
| 1557 | } |
| 1558 | } |
| 1559 | } |
| 1560 | } |
| 1561 | } |
| 1562 | |
| 1563 | /// Check whether the access through \p Ptr has a constant stride. |
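|      | /// The returned stride is expressed in units of the access type size; e.g., |
|      | /// assuming 4-byte ints (illustrative), an access A[3 * i] is expected to |
|      | /// yield a stride of 3, a loop-invariant pointer a stride of 0, and a |
|      | /// non-affine pointer std::nullopt. |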
| 1564 | std::optional<int64_t> |
| 1565 | llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, |
| 1566 | const Loop *Lp, |
| 1567 | const DenseMap<Value *, const SCEV *> &StridesMap, |
| 1568 | bool Assume, bool ShouldCheckWrap) { |
| 1569 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr); |
| 1570 | if (PSE.getSE()->isLoopInvariant(S: PtrScev, L: Lp)) |
| 1571 | return 0; |
| 1572 | |
| 1573 | assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr" ); |
| 1574 | if (isa<ScalableVectorType>(Val: AccessTy)) { |
| 1575 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy |
| 1576 | << "\n" ); |
| 1577 | return std::nullopt; |
| 1578 | } |
| 1579 | |
| 1580 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev); |
| 1581 | if (Assume && !AR) |
| 1582 | AR = PSE.getAsAddRec(V: Ptr); |
| 1583 | |
| 1584 | if (!AR) { |
| 1585 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr |
| 1586 | << " SCEV: " << *PtrScev << "\n" ); |
| 1587 | return std::nullopt; |
| 1588 | } |
| 1589 | |
| 1590 | std::optional<int64_t> Stride = |
| 1591 | getStrideFromAddRec(AR, Lp, AccessTy, Ptr, PSE); |
| 1592 | if (!ShouldCheckWrap || !Stride) |
| 1593 | return Stride; |
| 1594 | |
| 1595 | if (isNoWrap(PSE, AR, Ptr, AccessTy, L: Lp, Assume, Stride)) |
| 1596 | return Stride; |
| 1597 | |
| 1598 | LLVM_DEBUG( |
| 1599 | dbgs() << "LAA: Bad stride - Pointer may wrap in the address space " |
| 1600 | << *Ptr << " SCEV: " << *AR << "\n" ); |
| 1601 | return std::nullopt; |
| 1602 | } |
| 1603 | |
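|      | // Returns the distance from \p PtrA to \p PtrB in units of \p ElemTyA's |
|      | // store size. For example (illustrative), with 4-byte ints, PtrA = &A[1] |
|      | // and PtrB = &A[3] should give 2, while &A[3] vs. &A[1] should give -2. |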
| 1604 | std::optional<int64_t> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, |
| 1605 | Type *ElemTyB, Value *PtrB, |
| 1606 | const DataLayout &DL, |
| 1607 | ScalarEvolution &SE, |
| 1608 | bool StrictCheck, bool CheckType) { |
| 1609 | assert(PtrA && PtrB && "Expected non-nullptr pointers." ); |
| 1610 | |
| 1611 | // Make sure that A and B are different pointers. |
| 1612 | if (PtrA == PtrB) |
| 1613 | return 0; |
| 1614 | |
| 1615 | // Make sure that the element types are the same if required. |
| 1616 | if (CheckType && ElemTyA != ElemTyB) |
| 1617 | return std::nullopt; |
| 1618 | |
| 1619 | unsigned ASA = PtrA->getType()->getPointerAddressSpace(); |
| 1620 | unsigned ASB = PtrB->getType()->getPointerAddressSpace(); |
| 1621 | |
| 1622 | // Check that the address spaces match. |
| 1623 | if (ASA != ASB) |
| 1624 | return std::nullopt; |
| 1625 | unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
| 1626 | |
| 1627 | APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0); |
| 1628 | const Value *PtrA1 = PtrA->stripAndAccumulateConstantOffsets( |
| 1629 | DL, Offset&: OffsetA, /*AllowNonInbounds=*/true); |
| 1630 | const Value *PtrB1 = PtrB->stripAndAccumulateConstantOffsets( |
| 1631 | DL, Offset&: OffsetB, /*AllowNonInbounds=*/true); |
| 1632 | |
| 1633 | std::optional<int64_t> Val; |
| 1634 | if (PtrA1 == PtrB1) { |
| 1635 | // Retrieve the address space again as pointer stripping now tracks through |
| 1636 | // `addrspacecast`. |
| 1637 | ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace(); |
| 1638 | ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace(); |
| 1639 | // Check that the address spaces match and that the pointers are valid. |
| 1640 | if (ASA != ASB) |
| 1641 | return std::nullopt; |
| 1642 | |
| 1643 | IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
| 1644 | OffsetA = OffsetA.sextOrTrunc(width: IdxWidth); |
| 1645 | OffsetB = OffsetB.sextOrTrunc(width: IdxWidth); |
| 1646 | |
| 1647 | OffsetB -= OffsetA; |
| 1648 | Val = OffsetB.trySExtValue(); |
| 1649 | } else { |
| 1650 | // Otherwise compute the distance with SCEV between the base pointers. |
| 1651 | const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA); |
| 1652 | const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB); |
| 1653 | std::optional<APInt> Diff = |
| 1654 | SE.computeConstantDifference(LHS: PtrSCEVB, RHS: PtrSCEVA); |
| 1655 | if (!Diff) |
| 1656 | return std::nullopt; |
| 1657 | Val = Diff->trySExtValue(); |
| 1658 | } |
| 1659 | |
| 1660 | if (!Val) |
| 1661 | return std::nullopt; |
| 1662 | |
| 1663 | int64_t Size = DL.getTypeStoreSize(Ty: ElemTyA); |
| 1664 | int64_t Dist = *Val / Size; |
| 1665 | |
| 1666 | // Ensure that the calculated distance matches the type-based one after all |
| 1667 | // the bitcasts removal in the provided pointers. |
| 1668 | if (!StrictCheck || Dist * Size == Val) |
| 1669 | return Dist; |
| 1670 | return std::nullopt; |
| 1671 | } |
| 1672 | |
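|      | // A small worked example (illustrative): for VL = {&A[2], &A[0], &A[1]} |
|      | // with 4-byte ints, the offsets relative to VL[0] are {0, -2, -1}, so the |
|      | // sorted order is VL[1], VL[2], VL[0], SortedIndices becomes {1, 2, 0}, and |
|      | // the function returns true. For already-consecutive inputs SortedIndices |
|      | // is left empty. |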
| 1673 | bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, |
| 1674 | const DataLayout &DL, ScalarEvolution &SE, |
| 1675 | SmallVectorImpl<unsigned> &SortedIndices) { |
| 1676 | assert(llvm::all_of( |
| 1677 | VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && |
| 1678 | "Expected list of pointer operands." ); |
| 1679 | // Walk over the pointers, and map each of them to an offset relative to the |
| 1680 | // first pointer in the array. |
| 1681 | Value *Ptr0 = VL[0]; |
| 1682 | |
| 1683 | using DistOrdPair = std::pair<int64_t, unsigned>; |
| 1684 | auto Compare = llvm::less_first(); |
| 1685 | std::set<DistOrdPair, decltype(Compare)> Offsets(Compare); |
| 1686 | Offsets.emplace(args: 0, args: 0); |
| 1687 | bool IsConsecutive = true; |
| 1688 | for (auto [Idx, Ptr] : drop_begin(RangeOrContainer: enumerate(First&: VL))) { |
| 1689 | std::optional<int64_t> Diff = |
| 1690 | getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE, |
| 1691 | /*StrictCheck=*/true); |
| 1692 | if (!Diff) |
| 1693 | return false; |
| 1694 | |
| 1695 | // Check if the pointer with the same offset is found. |
| 1696 | int64_t Offset = *Diff; |
| 1697 | auto [It, IsInserted] = Offsets.emplace(args&: Offset, args&: Idx); |
| 1698 | if (!IsInserted) |
| 1699 | return false; |
| 1700 | // Consecutive order if the inserted element is the last one. |
| 1701 | IsConsecutive &= std::next(x: It) == Offsets.end(); |
| 1702 | } |
| 1703 | SortedIndices.clear(); |
| 1704 | if (!IsConsecutive) { |
| 1705 | // Fill SortedIndices array only if it is non-consecutive. |
| 1706 | SortedIndices.resize(N: VL.size()); |
| 1707 | for (auto [Idx, Off] : enumerate(First&: Offsets)) |
| 1708 | SortedIndices[Idx] = Off.second; |
| 1709 | } |
| 1710 | return true; |
| 1711 | } |
| 1712 | |
| 1713 | /// Returns true if the memory operations \p A and \p B are consecutive. |
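|      | /// That is, \p B accesses the memory location immediately following \p A; |
|      | /// e.g. (illustrative) a load of A[0] followed by a load of A[1] of the |
|      | /// same element type should return true, while A[0] and A[2] should not. |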
| 1714 | bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, |
| 1715 | ScalarEvolution &SE, bool CheckType) { |
| 1716 | Value *PtrA = getLoadStorePointerOperand(V: A); |
| 1717 | Value *PtrB = getLoadStorePointerOperand(V: B); |
| 1718 | if (!PtrA || !PtrB) |
| 1719 | return false; |
| 1720 | Type *ElemTyA = getLoadStoreType(I: A); |
| 1721 | Type *ElemTyB = getLoadStoreType(I: B); |
| 1722 | std::optional<int64_t> Diff = |
| 1723 | getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE, |
| 1724 | /*StrictCheck=*/true, CheckType); |
| 1725 | return Diff == 1; |
| 1726 | } |
| 1727 | |
| 1728 | void MemoryDepChecker::addAccess(StoreInst *SI) { |
| 1729 | visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
| 1730 | AddPointer: [this, SI](Value *Ptr) { |
| 1731 | Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx); |
| 1732 | InstMap.push_back(Elt: SI); |
| 1733 | ++AccessIdx; |
| 1734 | }); |
| 1735 | } |
| 1736 | |
| 1737 | void MemoryDepChecker::addAccess(LoadInst *LI) { |
| 1738 | visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
| 1739 | AddPointer: [this, LI](Value *Ptr) { |
| 1740 | Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx); |
| 1741 | InstMap.push_back(Elt: LI); |
| 1742 | ++AccessIdx; |
| 1743 | }); |
| 1744 | } |
| 1745 | |
| 1746 | MemoryDepChecker::VectorizationSafetyStatus |
| 1747 | MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) { |
| 1748 | switch (Type) { |
| 1749 | case NoDep: |
| 1750 | case Forward: |
| 1751 | case BackwardVectorizable: |
| 1752 | return VectorizationSafetyStatus::Safe; |
| 1753 | |
| 1754 | case Unknown: |
| 1755 | return VectorizationSafetyStatus::PossiblySafeWithRtChecks; |
| 1756 | case ForwardButPreventsForwarding: |
| 1757 | case Backward: |
| 1758 | case BackwardVectorizableButPreventsForwarding: |
| 1759 | case IndirectUnsafe: |
| 1760 | return VectorizationSafetyStatus::Unsafe; |
| 1761 | } |
| 1762 | llvm_unreachable("unexpected DepType!" ); |
| 1763 | } |
| 1764 | |
| 1765 | bool MemoryDepChecker::Dependence::isBackward() const { |
| 1766 | switch (Type) { |
| 1767 | case NoDep: |
| 1768 | case Forward: |
| 1769 | case ForwardButPreventsForwarding: |
| 1770 | case Unknown: |
| 1771 | case IndirectUnsafe: |
| 1772 | return false; |
| 1773 | |
| 1774 | case BackwardVectorizable: |
| 1775 | case Backward: |
| 1776 | case BackwardVectorizableButPreventsForwarding: |
| 1777 | return true; |
| 1778 | } |
| 1779 | llvm_unreachable("unexpected DepType!" ); |
| 1780 | } |
| 1781 | |
| 1782 | bool MemoryDepChecker::Dependence::isPossiblyBackward() const { |
| 1783 | return isBackward() || Type == Unknown || Type == IndirectUnsafe; |
| 1784 | } |
| 1785 | |
| 1786 | bool MemoryDepChecker::Dependence::isForward() const { |
| 1787 | switch (Type) { |
| 1788 | case Forward: |
| 1789 | case ForwardButPreventsForwarding: |
| 1790 | return true; |
| 1791 | |
| 1792 | case NoDep: |
| 1793 | case Unknown: |
| 1794 | case BackwardVectorizable: |
| 1795 | case Backward: |
| 1796 | case BackwardVectorizableButPreventsForwarding: |
| 1797 | case IndirectUnsafe: |
| 1798 | return false; |
| 1799 | } |
| 1800 | llvm_unreachable("unexpected DepType!" ); |
| 1801 | } |
| 1802 | |
| 1803 | bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance, |
| 1804 | uint64_t TypeByteSize, |
| 1805 | unsigned CommonStride) { |
| 1806 | // If loads occur at a distance that is not a multiple of a feasible vector |
| 1807 | // factor, store-load forwarding does not take place. |
| 1808 | // Positive dependences might cause trouble because vectorizing them might |
| 1809 | // prevent store-load forwarding, making vectorized code run a lot slower, e.g. |
| 1810 | //   a[i] = a[i-3] ^ a[i-8]; |
| 1811 | // The stores to a[i:i+1] don't align with the loads from a[i-3:i-2], and |
| 1812 | // hence on a typical architecture store-load forwarding does not take |
| 1813 | // place. Vectorizing such cases does not make sense. |
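|      | // For example (with illustrative numbers): with 4-byte elements, the store |
|      | // to a[i] and the load of a[i-3] above are 12 bytes apart; the first |
|      | // candidate VF of 8 bytes is misaligned (12 % 8 != 0) and only one vector |
|      | // iteration away, so the maximum safe VF collapses below two elements and |
|      | // a forwarding conflict is reported. |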
| 1814 | // Store-load forwarding distance. |
| 1815 | |
| 1816 | // After this many iterations store-to-load forwarding conflicts should not |
| 1817 | // cause any slowdowns. |
| 1818 | const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize; |
| 1819 | // Maximum vector factor. |
| 1820 | uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 = |
| 1821 | std::min(a: VectorizerParams::MaxVectorWidth * TypeByteSize, |
| 1822 | b: MaxStoreLoadForwardSafeDistanceInBits); |
| 1823 | |
| 1824 | // Compute the smallest VF at which the store and load would be misaligned. |
| 1825 | for (uint64_t VF = 2 * TypeByteSize; |
| 1826 | VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) { |
| 1827 | // If the number of vector iterations between the store and the load is |
| 1828 | // small, we could incur conflicts. |
| 1829 | if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) { |
| 1830 | MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1); |
| 1831 | break; |
| 1832 | } |
| 1833 | } |
| 1834 | |
| 1835 | if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) { |
| 1836 | LLVM_DEBUG( |
| 1837 | dbgs() << "LAA: Distance " << Distance |
| 1838 | << " that could cause a store-load forwarding conflict\n" ); |
| 1839 | return true; |
| 1840 | } |
| 1841 | |
| 1842 | if (CommonStride && |
| 1843 | MaxVFWithoutSLForwardIssuesPowerOf2 < |
| 1844 | MaxStoreLoadForwardSafeDistanceInBits && |
| 1845 | MaxVFWithoutSLForwardIssuesPowerOf2 != |
| 1846 | VectorizerParams::MaxVectorWidth * TypeByteSize) { |
| 1847 | uint64_t MaxVF = |
| 1848 | bit_floor(Value: MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride); |
| 1849 | uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; |
| 1850 | MaxStoreLoadForwardSafeDistanceInBits = |
| 1851 | std::min(a: MaxStoreLoadForwardSafeDistanceInBits, b: MaxVFInBits); |
| 1852 | } |
| 1853 | return false; |
| 1854 | } |
| 1855 | |
| 1856 | void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) { |
| 1857 | if (Status < S) |
| 1858 | Status = S; |
| 1859 | } |
| 1860 | |
| 1861 | /// Given a dependence-distance \p Dist between two memory accesses that have |
| 1862 | /// strides in the same direction, with the maximum absolute stride given in |
| 1863 | /// \p MaxStride, in a loop whose maximum backedge taken count is \p MaxBTC, |
| 1864 | /// check whether it is possible to prove statically that the dependence |
| 1865 | /// distance is larger than the range that the accesses will travel through |
| 1866 | /// the execution of the loop. If so, return true; otherwise return false. |
| 1867 | /// This is useful, for example, in loops such as the following (PR31098): |
| 1868 | /// |
| 1869 | /// for (i = 0; i < D; ++i) { |
| 1870 | /// = out[i]; |
| 1871 | /// out[i+D] = |
| 1872 | /// } |
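|      | /// |
|      | /// Here the dependence distance is D elements while each access advances by |
|      | /// at most MaxBTC = D - 1 elements over the loop's execution, so the |
|      | /// accessed ranges cannot overlap and the dependence can be discarded |
|      | /// without a runtime check. |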
| 1873 | static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, |
| 1874 | const SCEV &MaxBTC, const SCEV &Dist, |
| 1875 | uint64_t MaxStride) { |
| 1876 | |
| 1877 | // If we can prove that |
| 1878 | // (**) |Dist| > MaxBTC * Step |
| 1879 | // where Step is the absolute stride of the memory accesses in bytes, |
| 1880 | // then there is no dependence. |
| 1881 | // |
| 1882 | // Rationale: |
| 1883 | // We basically want to check if the absolute distance (|Dist/Step|) |
| 1884 | // is >= the loop iteration count (or > MaxBTC). |
| 1885 | // This is equivalent to the Strong SIV Test (Practical Dependence Testing, |
| 1886 | // Section 4.2.1); Note, that for vectorization it is sufficient to prove |
| 1887 | // that the dependence distance is >= VF; This is checked elsewhere. |
| 1888 | // But in some cases we can prune dependence distances early, and |
| 1889 | // even before selecting the VF, and without a runtime test, by comparing |
| 1890 | // the distance against the loop iteration count. Since the vectorized code |
| 1891 | // will be executed only if LoopCount >= VF, proving distance >= LoopCount |
| 1892 | // also guarantees that distance >= VF. |
| 1893 | // |
| 1894 | const SCEV *Step = SE.getConstant(Ty: MaxBTC.getType(), V: MaxStride); |
| 1895 | const SCEV *Product = SE.getMulExpr(LHS: &MaxBTC, RHS: Step); |
| 1896 | |
| 1897 | const SCEV *CastedDist = &Dist; |
| 1898 | const SCEV *CastedProduct = Product; |
| 1899 | uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType()); |
| 1900 | uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType()); |
| 1901 | |
| 1902 | // The dependence distance can be positive/negative, so we sign extend Dist; |
| 1903 | // The multiplication of the absolute stride in bytes and the |
| 1904 | // backedgeTakenCount is non-negative, so we zero extend Product. |
| 1905 | if (DistTypeSizeBits > ProductTypeSizeBits) |
| 1906 | CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType()); |
| 1907 | else |
| 1908 | CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType()); |
| 1909 | |
| 1910 | // Is Dist - (MaxBTC * Step) > 0 ? |
| 1911 | // (If so, then we have proven (**) because |Dist| >= Dist) |
| 1912 | const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct); |
| 1913 | if (SE.isKnownPositive(S: Minus)) |
| 1914 | return true; |
| 1915 | |
| 1916 | // Second try: Is -Dist - (MaxBTC * Step) > 0 ? |
| 1917 | // (If so, then we have proven (**) because |Dist| >= -1*Dist) |
| 1918 | const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist); |
| 1919 | Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct); |
| 1920 | return SE.isKnownPositive(S: Minus); |
| 1921 | } |
| 1922 | |
| 1923 | /// Check the dependence for two accesses with the same stride \p Stride. |
| 1924 | /// \p Distance is the positive distance in bytes, and \p TypeByteSize is type |
| 1925 | /// size in bytes. |
| 1926 | /// |
| 1927 | /// \returns true if they are independent. |
| 1928 | static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, |
| 1929 | uint64_t TypeByteSize) { |
| 1930 | assert(Stride > 1 && "The stride must be greater than 1" ); |
| 1931 | assert(TypeByteSize > 0 && "The type size in byte must be non-zero" ); |
| 1932 | assert(Distance > 0 && "The distance must be non-zero" ); |
| 1933 | |
| 1934 | // Skip if the distance is not a multiple of the type byte size. |
| 1935 | if (Distance % TypeByteSize) |
| 1936 | return false; |
| 1937 | |
| 1938 | // No dependence if the distance is not a multiple of the stride. |
| 1939 | // E.g. |
| 1940 | // for (i = 0; i < 1024 ; i += 4) |
| 1941 | // A[i+2] = A[i] + 1; |
| 1942 | // |
| 1943 | // Two accesses in memory (distance is 2, stride is 4): |
| 1944 | // | A[0] | | | | A[4] | | | | |
| 1945 | // | | | A[2] | | | | A[6] | | |
| 1946 | // |
| 1947 | // E.g. |
| 1948 | // for (i = 0; i < 1024 ; i += 3) |
| 1949 | // A[i+4] = A[i] + 1; |
| 1950 | // |
| 1951 | // Two accesses in memory (distance is 4, stride is 3): |
| 1952 | // | A[0] | | | A[3] | | | A[6] | | | |
| 1953 | // | | | | | A[4] | | | A[7] | | |
| 1954 | return Distance % Stride; |
| 1955 | } |
| 1956 | |
| 1957 | std::variant<MemoryDepChecker::Dependence::DepType, |
| 1958 | MemoryDepChecker::DepDistanceStrideAndSizeInfo> |
| 1959 | MemoryDepChecker::getDependenceDistanceStrideAndSize( |
| 1960 | const AccessAnalysis::MemAccessInfo &A, Instruction *AInst, |
| 1961 | const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) { |
| 1962 | const auto &DL = InnermostLoop->getHeader()->getDataLayout(); |
| 1963 | auto &SE = *PSE.getSE(); |
| 1964 | const auto &[APtr, AIsWrite] = A; |
| 1965 | const auto &[BPtr, BIsWrite] = B; |
| 1966 | |
| 1967 | // Two reads are independent. |
| 1968 | if (!AIsWrite && !BIsWrite) |
| 1969 | return MemoryDepChecker::Dependence::NoDep; |
| 1970 | |
| 1971 | Type *ATy = getLoadStoreType(I: AInst); |
| 1972 | Type *BTy = getLoadStoreType(I: BInst); |
| 1973 | |
| 1974 | // We cannot check pointers in different address spaces. |
| 1975 | if (APtr->getType()->getPointerAddressSpace() != |
| 1976 | BPtr->getType()->getPointerAddressSpace()) |
| 1977 | return MemoryDepChecker::Dependence::Unknown; |
| 1978 | |
| 1979 | std::optional<int64_t> StrideAPtr = |
| 1980 | getPtrStride(PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true); |
| 1981 | std::optional<int64_t> StrideBPtr = |
| 1982 | getPtrStride(PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true); |
| 1983 | |
| 1984 | const SCEV *Src = PSE.getSCEV(V: APtr); |
| 1985 | const SCEV *Sink = PSE.getSCEV(V: BPtr); |
| 1986 | |
| 1987 | // If the induction step is negative we have to invert source and sink of the |
| 1988 | // dependence when measuring the distance between them. We should not swap |
| 1989 | // AIsWrite with BIsWrite, as their uses expect them in program order. |
| 1990 | if (StrideAPtr && *StrideAPtr < 0) { |
| 1991 | std::swap(a&: Src, b&: Sink); |
| 1992 | std::swap(a&: AInst, b&: BInst); |
| 1993 | std::swap(a&: ATy, b&: BTy); |
| 1994 | std::swap(lhs&: StrideAPtr, rhs&: StrideBPtr); |
| 1995 | } |
| 1996 | |
| 1997 | const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src); |
| 1998 | |
| 1999 | LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink |
| 2000 | << "\n" ); |
| 2001 | LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst |
| 2002 | << ": " << *Dist << "\n" ); |
| 2003 | |
| 2004 | // Check if we can prove that Sink only accesses memory after Src's end or |
| 2005 | // vice versa. At the moment this is limited to cases where either source or |
| 2006 | // sink is loop invariant, to avoid compile-time increases. This is not |
| 2007 | // required for correctness. |
| 2008 | if (SE.isLoopInvariant(S: Src, L: InnermostLoop) || |
| 2009 | SE.isLoopInvariant(S: Sink, L: InnermostLoop)) { |
| 2010 | const SCEV *BTC = PSE.getBackedgeTakenCount(); |
| 2011 | const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); |
| 2012 | const auto &[SrcStart_, SrcEnd_] = |
| 2013 | getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Src, AccessTy: ATy, BTC, MaxBTC: SymbolicMaxBTC, |
| 2014 | SE: PSE.getSE(), PointerBounds: &PointerBounds); |
| 2015 | const auto &[SinkStart_, SinkEnd_] = |
| 2016 | getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Sink, AccessTy: BTy, BTC, MaxBTC: SymbolicMaxBTC, |
| 2017 | SE: PSE.getSE(), PointerBounds: &PointerBounds); |
| 2018 | if (!isa<SCEVCouldNotCompute>(Val: SrcStart_) && |
| 2019 | !isa<SCEVCouldNotCompute>(Val: SrcEnd_) && |
| 2020 | !isa<SCEVCouldNotCompute>(Val: SinkStart_) && |
| 2021 | !isa<SCEVCouldNotCompute>(Val: SinkEnd_)) { |
| 2022 | if (!LoopGuards) |
| 2023 | LoopGuards.emplace( |
| 2024 | args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE)); |
| 2025 | auto SrcEnd = SE.applyLoopGuards(Expr: SrcEnd_, Guards: *LoopGuards); |
| 2026 | auto SinkStart = SE.applyLoopGuards(Expr: SinkStart_, Guards: *LoopGuards); |
| 2027 | if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SrcEnd, RHS: SinkStart)) |
| 2028 | return MemoryDepChecker::Dependence::NoDep; |
| 2029 | |
| 2030 | auto SinkEnd = SE.applyLoopGuards(Expr: SinkEnd_, Guards: *LoopGuards); |
| 2031 | auto SrcStart = SE.applyLoopGuards(Expr: SrcStart_, Guards: *LoopGuards); |
| 2032 | if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SinkEnd, RHS: SrcStart)) |
| 2033 | return MemoryDepChecker::Dependence::NoDep; |
| 2034 | } |
| 2035 | } |
| 2036 | |
| 2037 | // Need accesses with constant strides and the same direction for further |
| 2038 | // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and |
| 2039 | // similar code or pointer arithmetic that could wrap in the address space. |
| 2040 | |
| 2041 | // If either Src or Sink are not strided (i.e. not a non-wrapping AddRec) and |
| 2042 | // not loop-invariant (stride will be 0 in that case), we cannot analyze the |
| 2043 | // dependence further and also cannot generate runtime checks. |
| 2044 | if (!StrideAPtr || !StrideBPtr) { |
| 2045 | LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n" ); |
| 2046 | return MemoryDepChecker::Dependence::IndirectUnsafe; |
| 2047 | } |
| 2048 | |
| 2049 | int64_t StrideAPtrInt = *StrideAPtr; |
| 2050 | int64_t StrideBPtrInt = *StrideBPtr; |
| 2051 | LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt |
| 2052 | << " Sink induction step: " << StrideBPtrInt << "\n" ); |
| 2053 | // At least one of Src or Sink is loop invariant and the other is strided or |
| 2054 | // invariant. We can generate a runtime check to disambiguate the accesses. |
| 2055 | if (!StrideAPtrInt || !StrideBPtrInt) |
| 2056 | return MemoryDepChecker::Dependence::Unknown; |
| 2057 | |
| 2058 | // Both Src and Sink have a constant stride, check if they are in the same |
| 2059 | // direction. |
| 2060 | if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) { |
| 2061 | LLVM_DEBUG( |
| 2062 | dbgs() << "Pointer access with strides in different directions\n" ); |
| 2063 | return MemoryDepChecker::Dependence::Unknown; |
| 2064 | } |
| 2065 | |
| 2066 | TypeSize AStoreSz = DL.getTypeStoreSize(Ty: ATy); |
| 2067 | TypeSize BStoreSz = DL.getTypeStoreSize(Ty: BTy); |
| 2068 | |
| 2069 | // If store sizes are not the same, set TypeByteSize to zero, so we can check |
| 2070 | // it in the caller isDependent. |
| 2071 | uint64_t ASz = DL.getTypeAllocSize(Ty: ATy); |
| 2072 | uint64_t BSz = DL.getTypeAllocSize(Ty: BTy); |
| 2073 | uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0; |
| 2074 | |
| 2075 | uint64_t StrideAScaled = std::abs(i: StrideAPtrInt) * ASz; |
| 2076 | uint64_t StrideBScaled = std::abs(i: StrideBPtrInt) * BSz; |
| 2077 | |
| 2078 | uint64_t MaxStride = std::max(a: StrideAScaled, b: StrideBScaled); |
| 2079 | |
| 2080 | std::optional<uint64_t> CommonStride; |
| 2081 | if (StrideAScaled == StrideBScaled) |
| 2082 | CommonStride = StrideAScaled; |
| 2083 | |
| 2084 | // TODO: Historically, we don't retry with runtime checks unless the |
| 2085 | // (unscaled) strides are the same. Fix this once the condition for runtime |
| 2086 | // checks in isDependent is fixed. |
| 2087 | bool ShouldRetryWithRuntimeCheck = StrideAPtrInt == StrideBPtrInt; |
| 2088 | |
| 2089 | return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride, |
| 2090 | ShouldRetryWithRuntimeCheck, TypeByteSize, |
| 2091 | AIsWrite, BIsWrite); |
| 2092 | } |
| 2093 | |
| 2094 | MemoryDepChecker::Dependence::DepType |
| 2095 | MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, |
| 2096 | const MemAccessInfo &B, unsigned BIdx) { |
| 2097 | assert(AIdx < BIdx && "Must pass arguments in program order" ); |
| 2098 | |
| 2099 | // Get the dependence distance, stride, type size and what access writes for |
| 2100 | // the dependence between A and B. |
| 2101 | auto Res = |
| 2102 | getDependenceDistanceStrideAndSize(A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx]); |
| 2103 | if (std::holds_alternative<Dependence::DepType>(v: Res)) |
| 2104 | return std::get<Dependence::DepType>(v&: Res); |
| 2105 | |
| 2106 | auto &[Dist, MaxStride, CommonStride, ShouldRetryWithRuntimeCheck, |
| 2107 | TypeByteSize, AIsWrite, BIsWrite] = |
| 2108 | std::get<DepDistanceStrideAndSizeInfo>(v&: Res); |
| 2109 | bool HasSameSize = TypeByteSize > 0; |
| 2110 | |
| 2111 | if (isa<SCEVCouldNotCompute>(Val: Dist)) { |
| 2112 | // TODO: Relax requirement that there is a common unscaled stride to retry |
| 2113 | // with non-constant distance dependencies. |
| 2114 | FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; |
| 2115 | LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n" ); |
| 2116 | return Dependence::Unknown; |
| 2117 | } |
| 2118 | |
| 2119 | ScalarEvolution &SE = *PSE.getSE(); |
| 2120 | auto &DL = InnermostLoop->getHeader()->getDataLayout(); |
| 2121 | |
| 2122 | // If the distance between the accesses is larger than their maximum absolute |
| 2123 | // stride multiplied by the symbolic maximum backedge taken count (which is an |
| 2124 | // upper bound on the number of iterations), the accesses are independent, |
| 2125 | // i.e. they are far enough apart that they won't access the same location |
| 2126 | // across all loop iterations. |
| 2127 | if (HasSameSize && |
| 2128 | isSafeDependenceDistance( |
| 2129 | DL, SE, MaxBTC: *(PSE.getSymbolicMaxBackedgeTakenCount()), Dist: *Dist, MaxStride)) |
| 2130 | return Dependence::NoDep; |
| 2131 | |
| 2132 | // The rest of this function relies on ConstDist being at most 64-bits, which |
| 2133 | // is checked earlier. Will assert if the calling code changes. |
| 2134 | const APInt *APDist = nullptr; |
| 2135 | uint64_t ConstDist = |
| 2136 | match(S: Dist, P: m_scev_APInt(C&: APDist)) ? APDist->abs().getZExtValue() : 0; |
| 2137 | |
| 2138 | // Attempt to prove strided accesses independent. |
| 2139 | if (APDist) { |
| 2140 | // If the distance between accesses and their strides are known constants, |
| 2141 | // check whether the accesses interlace each other. |
| 2142 | if (ConstDist > 0 && CommonStride && CommonStride > 1 && HasSameSize && |
| 2143 | areStridedAccessesIndependent(Distance: ConstDist, Stride: *CommonStride, TypeByteSize)) { |
| 2144 | LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n" ); |
| 2145 | return Dependence::NoDep; |
| 2146 | } |
| 2147 | } else { |
| 2148 | if (!LoopGuards) |
| 2149 | LoopGuards.emplace( |
| 2150 | args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE)); |
| 2151 | Dist = SE.applyLoopGuards(Expr: Dist, Guards: *LoopGuards); |
| 2152 | } |
| 2153 | |
| 2154 | // Negative distances are not plausible dependencies. |
| 2155 | if (SE.isKnownNonPositive(S: Dist)) { |
| 2156 | if (SE.isKnownNonNegative(S: Dist)) { |
| 2157 | if (HasSameSize) { |
| 2158 | // Write to the same location with the same size. |
| 2159 | return Dependence::Forward; |
| 2160 | } |
| 2161 | LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but " |
| 2162 | "different type sizes\n" ); |
| 2163 | return Dependence::Unknown; |
| 2164 | } |
| 2165 | |
| 2166 | bool IsTrueDataDependence = (AIsWrite && !BIsWrite); |
| 2167 | // Check if the first access writes to a location that is read in a later |
| 2168 | // iteration, where the distance between them is not a multiple of a vector |
| 2169 | // factor and relatively small. |
| 2170 | // |
| 2171 | // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to |
| 2172 | // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a |
| 2173 | // forward dependency will allow vectorization using any width. |
| 2174 | |
| 2175 | if (IsTrueDataDependence && EnableForwardingConflictDetection) { |
| 2176 | if (!ConstDist) { |
| 2177 | // TODO: FoundNonConstantDistanceDependence is used as a necessary |
| 2178 | // condition to consider retrying with runtime checks. Historically, we |
| 2179 | // did not set it when strides were different but there is no inherent |
| 2180 | // reason to. |
| 2181 | FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; |
| 2182 | return Dependence::Unknown; |
| 2183 | } |
| 2184 | if (!HasSameSize || |
| 2185 | couldPreventStoreLoadForward(Distance: ConstDist, TypeByteSize)) { |
| 2186 | LLVM_DEBUG( |
| 2187 | dbgs() << "LAA: Forward but may prevent st->ld forwarding\n" ); |
| 2188 | return Dependence::ForwardButPreventsForwarding; |
| 2189 | } |
| 2190 | } |
| 2191 | |
| 2192 | LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n" ); |
| 2193 | return Dependence::Forward; |
| 2194 | } |
| 2195 | |
| 2196 | int64_t MinDistance = SE.getSignedRangeMin(S: Dist).getSExtValue(); |
| 2197 | // Below we only handle strictly positive distances. |
| 2198 | if (MinDistance <= 0) { |
| 2199 | FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; |
| 2200 | return Dependence::Unknown; |
| 2201 | } |
| 2202 | |
| 2203 | if (!ConstDist) { |
| 2204 | // Previously this case would be treated as Unknown, possibly setting |
| 2205 | // FoundNonConstantDistanceDependence to force re-trying with runtime |
| 2206 | // checks. Until the TODO below is addressed, set it here to preserve |
| 2207 | // original behavior w.r.t. re-trying with runtime checks. |
| 2208 | // TODO: FoundNonConstantDistanceDependence is used as a necessary |
| 2209 | // condition to consider retrying with runtime checks. Historically, we |
| 2210 | // did not set it when strides were different but there is no inherent |
| 2211 | // reason to. |
| 2212 | FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; |
| 2213 | } |
| 2214 | |
| 2215 | if (!HasSameSize) { |
| 2216 | LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with " |
| 2217 | "different type sizes\n" ); |
| 2218 | return Dependence::Unknown; |
| 2219 | } |
| 2220 | // Bail out early if passed-in parameters make vectorization not feasible. |
| 2221 | unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? |
| 2222 | VectorizerParams::VectorizationFactor : 1); |
| 2223 | unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? |
| 2224 | VectorizerParams::VectorizationInterleave : 1); |
| 2225 | // The minimum number of iterations for a vectorized/unrolled version. |
| 2226 | unsigned MinNumIter = std::max(a: ForcedFactor * ForcedUnroll, b: 2U); |
| 2227 | |
| 2228 | // It's not vectorizable if the distance is smaller than the minimum distance |
| 2229 | // needed for a vectorized/unrolled version. Vectorizing one iteration in |
| 2230 | // front needs MaxStride; vectorizing the last iteration needs TypeByteSize. |
| 2231 | // (There is no need to add the last gap distance.) |
| 2232 | // |
| 2233 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
| 2234 | // foo(int *A) { |
| 2235 | // int *B = (int *)((char *)A + 14); |
| 2236 | // for (i = 0 ; i < 1024 ; i += 2) |
| 2237 | // B[i] = A[i] + 1; |
| 2238 | // } |
| 2239 | // |
| 2240 | // Two accesses in memory (stride is 4 * 2): |
| 2241 | // | A[0] | | A[2] | | A[4] | | A[6] | | |
| 2242 | // | B[0] | | B[2] | | B[4] | |
| 2243 | // |
| 2244 | // The minimum distance needed for the vectorized iterations except the last |
| 2245 | // one is 4 * 2 * (MinNumIter - 1); the last iteration needs 4. So the |
| 2246 | // minimum distance needed is 4 * 2 * (MinNumIter - 1) + 4. |
| 2247 | // |
| 2248 | // If MinNumIter is 2, it is vectorizable as the minimum distance needed is |
| 2249 | // 12, which is less than the distance of 14. |
| 2250 | // |
| 2251 | // If MinNumIter is 4 (say, if a user forces the vectorization factor to be |
| 2252 | // 4), the minimum distance needed is 28, which is greater than the distance |
| 2253 | // of 14. It is not safe to vectorize in that case. |
| 2254 | // |
| 2255 | // We use MaxStride (maximum of src and sink strides) to get a conservative |
| 2256 | // lower bound on the MinDistanceNeeded in case of different strides. |
| 2257 | |
| 2258 | // We know that Dist is positive, but it may not be constant. Use the signed |
| 2259 | // minimum for computations below, as this ensures we compute the closest |
| 2260 | // possible dependence distance. |
| 2261 | uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize; |
| 2262 | if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) { |
| 2263 | if (!ConstDist) { |
| 2264 | // For non-constant distances, we checked the lower bound of the |
| 2265 | // dependence distance and the distance may be larger at runtime (and safe |
| 2266 | // for vectorization). Classify it as Unknown, so we re-try with runtime |
| 2267 | // checks. |
| 2268 | return Dependence::Unknown; |
| 2269 | } |
| 2270 | LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance " |
| 2271 | << MinDistance << '\n'); |
| 2272 | return Dependence::Backward; |
| 2273 | } |
| 2274 | |
| 2275 | // Unsafe if the minimum distance needed is greater than the smallest |
| 2276 | // dependence distance. |
| 2277 | if (MinDistanceNeeded > MinDepDistBytes) { |
| 2278 | LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least " |
| 2279 | << MinDistanceNeeded << " size in bytes\n" ); |
| 2280 | return Dependence::Backward; |
| 2281 | } |
| 2282 | |
| 2283 | // Positive distance bigger than max vectorization factor. |
| 2284 | // FIXME: Should use max factor instead of max distance in bytes, which could |
| 2285 | // not handle different types. |
| 2286 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
| 2287 | // void foo (int *A, char *B) { |
| 2288 | // for (unsigned i = 0; i < 1024; i++) { |
| 2289 | // A[i+2] = A[i] + 1; |
| 2290 | // B[i+2] = B[i] + 1; |
| 2291 | // } |
| 2292 | // } |
| 2293 | // |
| 2294 | // This case is currently unsafe according to the max safe distance. If we |
| 2295 | // analyze the two accesses on array B, the max safe dependence distance |
| 2296 | // is 2. Then we analyze the accesses on array A: the minimum distance needed |
| 2297 | // is 8, which is greater than 2, so vectorization is forbidden. But actually |
| 2298 | // both A and B could be vectorized with a factor of 2. |
| 2299 | MinDepDistBytes = |
| 2300 | std::min(a: static_cast<uint64_t>(MinDistance), b: MinDepDistBytes); |
| 2301 | |
| 2302 | bool IsTrueDataDependence = (!AIsWrite && BIsWrite); |
| 2303 | if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist && |
| 2304 | couldPreventStoreLoadForward(Distance: MinDistance, TypeByteSize, CommonStride: *CommonStride)) |
| 2305 | return Dependence::BackwardVectorizableButPreventsForwarding; |
| 2306 | |
| 2307 | uint64_t MaxVF = MinDepDistBytes / MaxStride; |
| 2308 | LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance |
| 2309 | << " with max VF = " << MaxVF << '\n'); |
| 2310 | |
| 2311 | uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; |
| 2312 | if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) { |
| 2313 | // For non-constant distances, we checked the lower bound of the dependence |
| 2314 | // distance and the distance may be larger at runtime (and safe for |
| 2315 | // vectorization). Classify it as Unknown, so we re-try with runtime checks. |
| 2316 | return Dependence::Unknown; |
| 2317 | } |
| 2318 | |
| 2319 | MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits); |
| 2320 | return Dependence::BackwardVectorizable; |
| 2321 | } |
| 2322 | |
| 2323 | bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands, |
| 2324 | const MemAccessInfoList &CheckDeps) { |
| 2325 | |
| 2326 | MinDepDistBytes = -1; |
| 2327 | SmallPtrSet<MemAccessInfo, 8> Visited; |
| 2328 | for (MemAccessInfo CurAccess : CheckDeps) { |
| 2329 | if (Visited.contains(Ptr: CurAccess)) |
| 2330 | continue; |
| 2331 | |
| 2332 | // Check accesses within this set. |
| 2333 | EquivalenceClasses<MemAccessInfo>::member_iterator AI = |
| 2334 | DepCands.findLeader(V: CurAccess); |
| 2335 | EquivalenceClasses<MemAccessInfo>::member_iterator AE = |
| 2336 | DepCands.member_end(); |
| 2337 | |
| 2338 | // Check every access pair. |
| 2339 | while (AI != AE) { |
| 2340 | Visited.insert(Ptr: *AI); |
| 2341 | bool AIIsWrite = AI->getInt(); |
| 2342 | // Check loads only against subsequent members of the equivalence class, but |
| 2343 | // check stores also against other stores in the same class - to the same address. |
| 2344 | EquivalenceClasses<MemAccessInfo>::member_iterator OI = |
| 2345 | (AIIsWrite ? AI : std::next(x: AI)); |
| 2346 | while (OI != AE) { |
| 2347 | // Check every accessing instruction pair in program order. |
| 2348 | auto &Acc = Accesses[*AI]; |
| 2349 | for (std::vector<unsigned>::iterator I1 = Acc.begin(), I1E = Acc.end(); |
| 2350 | I1 != I1E; ++I1) |
| 2351 | // Scan all accesses of the other equivalence-class member, but only the |
| 2352 | // subsequent accesses of the same member. |
| 2353 | for (std::vector<unsigned>::iterator |
| 2354 | I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()), |
| 2355 | I2E = (OI == AI ? I1E : Accesses[*OI].end()); |
| 2356 | I2 != I2E; ++I2) { |
| 2357 | auto A = std::make_pair(x: &*AI, y&: *I1); |
| 2358 | auto B = std::make_pair(x: &*OI, y&: *I2); |
| 2359 | |
| 2360 | assert(*I1 != *I2); |
| 2361 | if (*I1 > *I2) |
| 2362 | std::swap(x&: A, y&: B); |
| 2363 | |
| 2364 | Dependence::DepType Type = |
| 2365 | isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second); |
| 2366 | mergeInStatus(S: Dependence::isSafeForVectorization(Type)); |
| 2367 | |
| 2368 | // Gather dependences unless we accumulated MaxDependences |
| 2369 | // dependences. In that case return as soon as we find the first |
| 2370 | // unsafe dependence. This puts a limit on this quadratic |
| 2371 | // algorithm. |
| 2372 | if (RecordDependences) { |
| 2373 | if (Type != Dependence::NoDep) |
| 2374 | Dependences.emplace_back(Args&: A.second, Args&: B.second, Args&: Type); |
| 2375 | |
| 2376 | if (Dependences.size() >= MaxDependences) { |
| 2377 | RecordDependences = false; |
| 2378 | Dependences.clear(); |
| 2379 | LLVM_DEBUG(dbgs() |
| 2380 | << "Too many dependences, stopped recording\n" ); |
| 2381 | } |
| 2382 | } |
| 2383 | if (!RecordDependences && !isSafeForVectorization()) |
| 2384 | return false; |
| 2385 | } |
| 2386 | ++OI; |
| 2387 | } |
| 2388 | ++AI; |
| 2389 | } |
| 2390 | } |
| 2391 | |
| 2392 | LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n" ); |
| 2393 | return isSafeForVectorization(); |
| 2394 | } |
| 2395 | |
| 2396 | SmallVector<Instruction *, 4> |
| 2397 | MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const { |
| 2398 | MemAccessInfo Access(Ptr, IsWrite); |
| 2399 | auto &IndexVector = Accesses.find(Val: Access)->second; |
| 2400 | |
| 2401 | SmallVector<Instruction *, 4> Insts; |
| 2402 | transform(Range: IndexVector, |
| 2403 | d_first: std::back_inserter(x&: Insts), |
| 2404 | F: [&](unsigned Idx) { return this->InstMap[Idx]; }); |
| 2405 | return Insts; |
| 2406 | } |
| 2407 | |
| 2408 | const char *MemoryDepChecker::Dependence::DepName[] = { |
| 2409 | "NoDep" , |
| 2410 | "Unknown" , |
| 2411 | "IndirectUnsafe" , |
| 2412 | "Forward" , |
| 2413 | "ForwardButPreventsForwarding" , |
| 2414 | "Backward" , |
| 2415 | "BackwardVectorizable" , |
| 2416 | "BackwardVectorizableButPreventsForwarding" }; |
| 2417 | |
| 2418 | void MemoryDepChecker::Dependence::print( |
| 2419 | raw_ostream &OS, unsigned Depth, |
| 2420 | const SmallVectorImpl<Instruction *> &Instrs) const { |
| 2421 | OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n" ; |
| 2422 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n" ; |
| 2423 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n" ; |
| 2424 | } |
| 2425 | |
| 2426 | bool LoopAccessInfo::canAnalyzeLoop() { |
| 2427 | // We need to have a loop header. |
| 2428 | LLVM_DEBUG(dbgs() << "\nLAA: Checking a loop in '" |
| 2429 | << TheLoop->getHeader()->getParent()->getName() << "' from " |
| 2430 | << TheLoop->getLocStr() << "\n" ); |
| 2431 | |
| 2432 | // We can only analyze innermost loops. |
| 2433 | if (!TheLoop->isInnermost()) { |
| 2434 | LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n" ); |
| 2435 | recordAnalysis(RemarkName: "NotInnerMostLoop" ) << "loop is not the innermost loop" ; |
| 2436 | return false; |
| 2437 | } |
| 2438 | |
| 2439 | // We must have a single backedge. |
| 2440 | if (TheLoop->getNumBackEdges() != 1) { |
| 2441 | LLVM_DEBUG( |
| 2442 | dbgs() << "LAA: loop control flow is not understood by analyzer\n" ); |
| 2443 | recordAnalysis(RemarkName: "CFGNotUnderstood" ) |
| 2444 | << "loop control flow is not understood by analyzer" ; |
| 2445 | return false; |
| 2446 | } |
| 2447 | |
| 2448 | // ScalarEvolution needs to be able to find the symbolic max backedge taken |
| 2449 | // count, which is an upper bound on the number of loop iterations. The loop |
| 2450 | // may execute fewer iterations, if it exits via an uncountable exit. |
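// For illustration (hypothetical example, not from the original source), a
// loop such as
//   for (i = 0; i < n; i++) {
//     if (A[i] == 0) break;   // uncountable early exit
//     B[i] = A[i];
//   }
// may leave before taking all backedges, but the symbolic maximum
// backedge-taken count is still a valid upper bound on the iteration count.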
| 2451 | const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount(); |
| 2452 | if (isa<SCEVCouldNotCompute>(Val: ExitCount)) { |
| 2453 | recordAnalysis(RemarkName: "CantComputeNumberOfIterations" ) |
| 2454 | << "could not determine number of loop iterations" ; |
| 2455 | LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n" ); |
| 2456 | return false; |
| 2457 | } |
| 2458 | |
| 2459 | LLVM_DEBUG(dbgs() << "LAA: Found an analyzable loop: " |
| 2460 | << TheLoop->getHeader()->getName() << "\n" ); |
| 2461 | return true; |
| 2462 | } |
| 2463 | |
| 2464 | bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, |
| 2465 | const TargetLibraryInfo *TLI, |
| 2466 | DominatorTree *DT) { |
| 2467 | // Holds the Load and Store instructions. |
| 2468 | SmallVector<LoadInst *, 16> Loads; |
| 2469 | SmallVector<StoreInst *, 16> Stores; |
| 2470 | SmallPtrSet<MDNode *, 8> LoopAliasScopes; |
| 2471 | |
| 2472 | // Holds all the different accesses in the loop. |
| 2473 | unsigned NumReads = 0; |
| 2474 | unsigned NumReadWrites = 0; |
| 2475 | |
| 2476 | bool HasComplexMemInst = false; |
| 2477 | |
| 2478 | // A runtime check is only legal to insert if there are no convergent calls. |
| 2479 | HasConvergentOp = false; |
| 2480 | |
| 2481 | PtrRtChecking->Pointers.clear(); |
| 2482 | PtrRtChecking->Need = false; |
| 2483 | |
| 2484 | const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); |
| 2485 | |
| 2486 | const bool EnableMemAccessVersioningOfLoop = |
| 2487 | EnableMemAccessVersioning && |
| 2488 | !TheLoop->getHeader()->getParent()->hasOptSize(); |
| 2489 | |
| 2490 | // Traverse blocks in fixed RPOT order, regardless of their storage in the |
| 2491 | // loop info, as it may be arbitrary. |
| 2492 | LoopBlocksRPO RPOT(TheLoop); |
| 2493 | RPOT.perform(LI); |
| 2494 | for (BasicBlock *BB : RPOT) { |
| 2495 | // Scan the BB and collect legal loads and stores. Also detect any |
| 2496 | // convergent instructions. |
| 2497 | for (Instruction &I : *BB) { |
| 2498 | if (auto *Call = dyn_cast<CallBase>(Val: &I)) { |
| 2499 | if (Call->isConvergent()) |
| 2500 | HasConvergentOp = true; |
| 2501 | } |
| 2502 | |
// If we have found both a non-vectorizable memory instruction and a
// convergent operation in this loop, there is no reason to continue the
// search.
| 2505 | if (HasComplexMemInst && HasConvergentOp) |
| 2506 | return false; |
| 2507 | |
| 2508 | // Avoid hitting recordAnalysis multiple times. |
| 2509 | if (HasComplexMemInst) |
| 2510 | continue; |
| 2511 | |
| 2512 | // Record alias scopes defined inside the loop. |
| 2513 | if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I)) |
| 2514 | for (Metadata *Op : Decl->getScopeList()->operands()) |
| 2515 | LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op)); |
| 2516 | |
| 2517 | // Many math library functions read the rounding mode. We will only |
| 2518 | // vectorize a loop if it contains known function calls that don't set |
| 2519 | // the flag. Therefore, it is safe to ignore this read from memory. |
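// For illustration (hypothetical example): a call such as
//   B[i] = sqrtf(A[i]);
// that TLI maps to a vectorizable intrinsic is skipped here instead of being
// treated as an arbitrary read from memory.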
| 2520 | auto *Call = dyn_cast<CallInst>(Val: &I); |
| 2521 | if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI)) |
| 2522 | continue; |
| 2523 | |
| 2524 | // If this is a load, save it. If this instruction can read from memory |
| 2525 | // but is not a load, we only allow it if it's a call to a function with a |
| 2526 | // vector mapping and no pointer arguments. |
| 2527 | if (I.mayReadFromMemory()) { |
| 2528 | auto hasPointerArgs = [](CallBase *CB) { |
| 2529 | return any_of(Range: CB->args(), P: [](Value const *Arg) { |
| 2530 | return Arg->getType()->isPointerTy(); |
| 2531 | }); |
| 2532 | }; |
| 2533 | |
| 2534 | // If the function has an explicit vectorized counterpart, and does not |
| 2535 | // take output/input pointers, we can safely assume that it can be |
| 2536 | // vectorized. |
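// For illustration (hypothetical example): a declaration like
//   declare double @foo(double)
// carrying a "vector-function-abi-variant" mapping and called as
//   B[i] = foo(A[i]);
// has no pointer arguments, so it is accepted here.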
| 2537 | if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() && |
| 2538 | !hasPointerArgs(Call) && !VFDatabase::getMappings(CI: *Call).empty()) |
| 2539 | continue; |
| 2540 | |
| 2541 | auto *Ld = dyn_cast<LoadInst>(Val: &I); |
| 2542 | if (!Ld) { |
| 2543 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: Ld) |
| 2544 | << "instruction cannot be vectorized" ; |
| 2545 | HasComplexMemInst = true; |
| 2546 | continue; |
| 2547 | } |
| 2548 | if (!Ld->isSimple() && !IsAnnotatedParallel) { |
| 2549 | recordAnalysis(RemarkName: "NonSimpleLoad" , Instr: Ld) |
| 2550 | << "read with atomic ordering or volatile read" ; |
| 2551 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n" ); |
| 2552 | HasComplexMemInst = true; |
| 2553 | continue; |
| 2554 | } |
| 2555 | NumLoads++; |
| 2556 | Loads.push_back(Elt: Ld); |
| 2557 | DepChecker->addAccess(LI: Ld); |
| 2558 | if (EnableMemAccessVersioningOfLoop) |
| 2559 | collectStridedAccess(LoadOrStoreInst: Ld); |
| 2560 | continue; |
| 2561 | } |
| 2562 | |
| 2563 | // Save 'store' instructions. Abort if other instructions write to memory. |
| 2564 | if (I.mayWriteToMemory()) { |
| 2565 | auto *St = dyn_cast<StoreInst>(Val: &I); |
| 2566 | if (!St) { |
| 2567 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: St) |
| 2568 | << "instruction cannot be vectorized" ; |
| 2569 | HasComplexMemInst = true; |
| 2570 | continue; |
| 2571 | } |
| 2572 | if (!St->isSimple() && !IsAnnotatedParallel) { |
| 2573 | recordAnalysis(RemarkName: "NonSimpleStore" , Instr: St) |
| 2574 | << "write with atomic ordering or volatile write" ; |
| 2575 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n" ); |
| 2576 | HasComplexMemInst = true; |
| 2577 | continue; |
| 2578 | } |
| 2579 | NumStores++; |
| 2580 | Stores.push_back(Elt: St); |
| 2581 | DepChecker->addAccess(SI: St); |
| 2582 | if (EnableMemAccessVersioningOfLoop) |
| 2583 | collectStridedAccess(LoadOrStoreInst: St); |
| 2584 | } |
| 2585 | } // Next instr. |
| 2586 | } // Next block. |
| 2587 | |
| 2588 | if (HasComplexMemInst) |
| 2589 | return false; |
| 2590 | |
| 2591 | // Now we have two lists that hold the loads and the stores. |
| 2592 | // Next, we find the pointers that they use. |
| 2593 | |
| 2594 | // Check if we see any stores. If there are no stores, then we don't |
| 2595 | // care if the pointers are *restrict*. |
| 2596 | if (!Stores.size()) { |
| 2597 | LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n" ); |
| 2598 | return true; |
| 2599 | } |
| 2600 | |
| 2601 | MemoryDepChecker::DepCandidates DepCands; |
| 2602 | AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes); |
| 2603 | |
| 2604 | // Holds the analyzed pointers. We don't want to call getUnderlyingObjects |
| 2605 | // multiple times on the same object. If the ptr is accessed twice, once |
| 2606 | // for read and once for write, it will only appear once (on the write |
| 2607 | // list). This is okay, since we are going to check for conflicts between |
| 2608 | // writes and between reads and writes, but not between reads and reads. |
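// For illustration (hypothetical example): in a loop containing
//   A[i] = A[i] + 1;
// the pointer to A[i] is seen first for the store and therefore lands on the
// read-write list only; the later load of the same pointer does not add a
// second entry.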
| 2609 | SmallSet<std::pair<Value *, Type *>, 16> Seen; |
| 2610 | |
| 2611 | // Record uniform store addresses to identify if we have multiple stores |
| 2612 | // to the same address. |
| 2613 | SmallPtrSet<Value *, 16> UniformStores; |
| 2614 | |
| 2615 | for (StoreInst *ST : Stores) { |
| 2616 | Value *Ptr = ST->getPointerOperand(); |
| 2617 | |
| 2618 | if (isInvariant(V: Ptr)) { |
| 2619 | // Record store instructions to loop invariant addresses |
| 2620 | StoresToInvariantAddresses.push_back(Elt: ST); |
| 2621 | HasStoreStoreDependenceInvolvingLoopInvariantAddress |= |
| 2622 | !UniformStores.insert(Ptr).second; |
| 2623 | } |
| 2624 | |
// If we did *not* see this pointer before, insert it into the read-write
// list. At this phase it is only a 'write' list.
| 2627 | Type *AccessTy = getLoadStoreType(I: ST); |
| 2628 | if (Seen.insert(V: {Ptr, AccessTy}).second) { |
| 2629 | ++NumReadWrites; |
| 2630 | |
| 2631 | MemoryLocation Loc = MemoryLocation::get(SI: ST); |
| 2632 | // The TBAA metadata could have a control dependency on the predication |
| 2633 | // condition, so we cannot rely on it when determining whether or not we |
| 2634 | // need runtime pointer checks. |
| 2635 | if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT)) |
| 2636 | Loc.AATags.TBAA = nullptr; |
| 2637 | |
| 2638 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
| 2639 | AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) { |
| 2640 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
| 2641 | Accesses.addStore(Loc: NewLoc, AccessTy); |
| 2642 | }); |
| 2643 | } |
| 2644 | } |
| 2645 | |
| 2646 | if (IsAnnotatedParallel) { |
| 2647 | LLVM_DEBUG( |
| 2648 | dbgs() << "LAA: A loop annotated parallel, ignore memory dependency " |
| 2649 | << "checks.\n" ); |
| 2650 | return true; |
| 2651 | } |
| 2652 | |
| 2653 | for (LoadInst *LD : Loads) { |
| 2654 | Value *Ptr = LD->getPointerOperand(); |
// If we did *not* see this pointer before, insert it into the
// read list. If we *did* see it before, then it is already in
// the read-write list. This allows us to vectorize expressions
// such as A[i] += x, because the address of A[i] is a read-write
// pointer. This only works if the index of A[i] is consecutive.
// If the index is unknown (for example A[B[i]]) then we may
// read a few words, modify, and write a few words, and some of the
// words may be written to the same address.
| 2663 | bool IsReadOnlyPtr = false; |
| 2664 | Type *AccessTy = getLoadStoreType(I: LD); |
| 2665 | if (Seen.insert(V: {Ptr, AccessTy}).second || |
| 2666 | !getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, StridesMap: SymbolicStrides)) { |
| 2667 | ++NumReads; |
| 2668 | IsReadOnlyPtr = true; |
| 2669 | } |
| 2670 | |
// See if there is an unsafe dependency between a load from a uniform address
// and a store to the same uniform address.
| 2673 | if (UniformStores.contains(Ptr)) { |
| 2674 | LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform " |
| 2675 | "load and uniform store to the same address!\n" ); |
| 2676 | HasLoadStoreDependenceInvolvingLoopInvariantAddress = true; |
| 2677 | } |
| 2678 | |
| 2679 | MemoryLocation Loc = MemoryLocation::get(LI: LD); |
| 2680 | // The TBAA metadata could have a control dependency on the predication |
| 2681 | // condition, so we cannot rely on it when determining whether or not we |
| 2682 | // need runtime pointer checks. |
| 2683 | if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT)) |
| 2684 | Loc.AATags.TBAA = nullptr; |
| 2685 | |
| 2686 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
| 2687 | AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) { |
| 2688 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
| 2689 | Accesses.addLoad(Loc: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr); |
| 2690 | }); |
| 2691 | } |
| 2692 | |
// If we write (or read-write) to a single destination and there are no
// other reads in this loop then it is safe to vectorize.
| 2695 | if (NumReadWrites == 1 && NumReads == 0) { |
| 2696 | LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n" ); |
| 2697 | return true; |
| 2698 | } |
| 2699 | |
| 2700 | // Build dependence sets and check whether we need a runtime pointer bounds |
| 2701 | // check. |
| 2702 | Accesses.buildDependenceSets(); |
| 2703 | |
| 2704 | // Find pointers with computable bounds. We are going to use this information |
| 2705 | // to place a runtime bound check. |
| 2706 | Value *UncomputablePtr = nullptr; |
| 2707 | HasCompletePtrRtChecking = Accesses.canCheckPtrAtRT( |
| 2708 | RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, UncomputablePtr, AllowPartial); |
| 2709 | if (!HasCompletePtrRtChecking) { |
| 2710 | const auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr); |
| 2711 | recordAnalysis(RemarkName: "CantIdentifyArrayBounds" , Instr: I) |
| 2712 | << "cannot identify array bounds" ; |
| 2713 | LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " |
| 2714 | << "the array bounds.\n" ); |
| 2715 | return false; |
| 2716 | } |
| 2717 | |
| 2718 | LLVM_DEBUG( |
| 2719 | dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n" ); |
| 2720 | |
| 2721 | bool DepsAreSafe = true; |
| 2722 | if (Accesses.isDependencyCheckNeeded()) { |
| 2723 | LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n" ); |
| 2724 | DepsAreSafe = |
| 2725 | DepChecker->areDepsSafe(DepCands, CheckDeps: Accesses.getDependenciesToCheck()); |
| 2726 | |
| 2727 | if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) { |
| 2728 | LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n" ); |
| 2729 | |
| 2730 | // Clear the dependency checks. We assume they are not needed. |
| 2731 | Accesses.resetDepChecks(DepChecker&: *DepChecker); |
| 2732 | |
| 2733 | PtrRtChecking->reset(); |
| 2734 | PtrRtChecking->Need = true; |
| 2735 | |
| 2736 | UncomputablePtr = nullptr; |
| 2737 | HasCompletePtrRtChecking = |
| 2738 | Accesses.canCheckPtrAtRT(RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, |
| 2739 | UncomputablePtr, AllowPartial); |
| 2740 | |
| 2741 | // Check that we found the bounds for the pointer. |
| 2742 | if (!HasCompletePtrRtChecking) { |
| 2743 | auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr); |
| 2744 | recordAnalysis(RemarkName: "CantCheckMemDepsAtRunTime" , Instr: I) |
| 2745 | << "cannot check memory dependencies at runtime" ; |
| 2746 | LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n" ); |
| 2747 | return false; |
| 2748 | } |
| 2749 | DepsAreSafe = true; |
| 2750 | } |
| 2751 | } |
| 2752 | |
| 2753 | if (HasConvergentOp) { |
| 2754 | recordAnalysis(RemarkName: "CantInsertRuntimeCheckWithConvergent" ) |
| 2755 | << "cannot add control dependency to convergent operation" ; |
| 2756 | LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check " |
| 2757 | "would be needed with a convergent operation\n" ); |
| 2758 | return false; |
| 2759 | } |
| 2760 | |
| 2761 | if (DepsAreSafe) { |
| 2762 | LLVM_DEBUG( |
| 2763 | dbgs() << "LAA: No unsafe dependent memory operations in loop. We" |
| 2764 | << (PtrRtChecking->Need ? "" : " don't" ) |
| 2765 | << " need runtime memory checks.\n" ); |
| 2766 | return true; |
| 2767 | } |
| 2768 | |
| 2769 | emitUnsafeDependenceRemark(); |
| 2770 | return false; |
| 2771 | } |
| 2772 | |
void LoopAccessInfo::emitUnsafeDependenceRemark() {
| 2774 | const auto *Deps = getDepChecker().getDependences(); |
| 2775 | if (!Deps) |
| 2776 | return; |
| 2777 | const auto *Found = |
| 2778 | llvm::find_if(Range: *Deps, P: [](const MemoryDepChecker::Dependence &D) { |
| 2779 | return MemoryDepChecker::Dependence::isSafeForVectorization(Type: D.Type) != |
| 2780 | MemoryDepChecker::VectorizationSafetyStatus::Safe; |
| 2781 | }); |
| 2782 | if (Found == Deps->end()) |
| 2783 | return; |
| 2784 | MemoryDepChecker::Dependence Dep = *Found; |
| 2785 | |
| 2786 | LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n" ); |
| 2787 | |
| 2788 | // Emit remark for first unsafe dependence |
| 2789 | bool HasForcedDistribution = false; |
| 2790 | std::optional<const MDOperand *> Value = |
| 2791 | findStringMetadataForLoop(TheLoop, Name: "llvm.loop.distribute.enable" ); |
| 2792 | if (Value) { |
| 2793 | const MDOperand *Op = *Value; |
| 2794 | assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata" ); |
| 2795 | HasForcedDistribution = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue(); |
| 2796 | } |
| 2797 | |
| 2798 | const std::string Info = |
| 2799 | HasForcedDistribution |
| 2800 | ? "unsafe dependent memory operations in loop." |
| 2801 | : "unsafe dependent memory operations in loop. Use " |
| 2802 | "#pragma clang loop distribute(enable) to allow loop distribution " |
| 2803 | "to attempt to isolate the offending operations into a separate " |
| 2804 | "loop" ; |
| 2805 | OptimizationRemarkAnalysis &R = |
| 2806 | recordAnalysis(RemarkName: "UnsafeDep" , Instr: Dep.getDestination(DepChecker: getDepChecker())) << Info; |
| 2807 | |
| 2808 | switch (Dep.Type) { |
| 2809 | case MemoryDepChecker::Dependence::NoDep: |
| 2810 | case MemoryDepChecker::Dependence::Forward: |
| 2811 | case MemoryDepChecker::Dependence::BackwardVectorizable: |
| 2812 | llvm_unreachable("Unexpected dependence" ); |
| 2813 | case MemoryDepChecker::Dependence::Backward: |
| 2814 | R << "\nBackward loop carried data dependence." ; |
| 2815 | break; |
| 2816 | case MemoryDepChecker::Dependence::ForwardButPreventsForwarding: |
| 2817 | R << "\nForward loop carried data dependence that prevents " |
| 2818 | "store-to-load forwarding." ; |
| 2819 | break; |
| 2820 | case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding: |
| 2821 | R << "\nBackward loop carried data dependence that prevents " |
| 2822 | "store-to-load forwarding." ; |
| 2823 | break; |
| 2824 | case MemoryDepChecker::Dependence::IndirectUnsafe: |
| 2825 | R << "\nUnsafe indirect dependence." ; |
| 2826 | break; |
| 2827 | case MemoryDepChecker::Dependence::Unknown: |
| 2828 | R << "\nUnknown data dependence." ; |
| 2829 | break; |
| 2830 | } |
| 2831 | |
| 2832 | if (Instruction *I = Dep.getSource(DepChecker: getDepChecker())) { |
| 2833 | DebugLoc SourceLoc = I->getDebugLoc(); |
| 2834 | if (auto *DD = dyn_cast_or_null<Instruction>(Val: getPointerOperand(V: I))) |
| 2835 | SourceLoc = DD->getDebugLoc(); |
| 2836 | if (SourceLoc) |
| 2837 | R << " Memory location is the same as accessed at " |
| 2838 | << ore::NV("Location" , SourceLoc); |
| 2839 | } |
| 2840 | } |
| 2841 | |
| 2842 | bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, |
| 2843 | DominatorTree *DT) { |
| 2844 | assert(TheLoop->contains(BB) && "Unknown block used" ); |
| 2845 | |
| 2846 | // Blocks that do not dominate the latch need predication. |
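// For illustration (hypothetical example):
//   for (i = 0; i < n; i++)
//     if (C[i])
//       A[i] = 0;   // this block does not dominate the latch, so the
//                   // guarded store needs predication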
| 2847 | const BasicBlock *Latch = TheLoop->getLoopLatch(); |
| 2848 | return !DT->dominates(A: BB, B: Latch); |
| 2849 | } |
| 2850 | |
| 2851 | OptimizationRemarkAnalysis & |
LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
| 2853 | assert(!Report && "Multiple reports generated" ); |
| 2854 | |
| 2855 | const BasicBlock *CodeRegion = TheLoop->getHeader(); |
| 2856 | DebugLoc DL = TheLoop->getStartLoc(); |
| 2857 | |
| 2858 | if (I) { |
| 2859 | CodeRegion = I->getParent(); |
// If there is no debug location attached to the instruction, fall back to
// using the loop's.
| 2862 | if (I->getDebugLoc()) |
| 2863 | DL = I->getDebugLoc(); |
| 2864 | } |
| 2865 | |
| 2866 | Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, args&: RemarkName, |
| 2867 | args&: DL, args&: CodeRegion); |
| 2868 | return *Report; |
| 2869 | } |
| 2870 | |
| 2871 | bool LoopAccessInfo::isInvariant(Value *V) const { |
| 2872 | auto *SE = PSE->getSE(); |
| 2873 | if (TheLoop->isLoopInvariant(V)) |
| 2874 | return true; |
| 2875 | if (!SE->isSCEVable(Ty: V->getType())) |
| 2876 | return false; |
| 2877 | const SCEV *S = SE->getSCEV(V); |
| 2878 | return SE->isLoopInvariant(S, L: TheLoop); |
| 2879 | } |
| 2880 | |
| 2881 | /// If \p Ptr is a GEP, which has a loop-variant operand, return that operand. |
| 2882 | /// Otherwise, return \p Ptr. |
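/// For illustration (hypothetical example): for
///   %gep = getelementptr double, ptr %A, i64 %iv
/// with loop-invariant %A and loop-variant %iv, the operand %iv is returned;
/// if more than one operand were loop-variant, \p Ptr itself is returned.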
| 2883 | static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE, |
| 2884 | Loop *Lp) { |
| 2885 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
| 2886 | if (!GEP) |
| 2887 | return Ptr; |
| 2888 | |
| 2889 | Value *V = Ptr; |
| 2890 | for (const Use &U : GEP->operands()) { |
| 2891 | if (!SE->isLoopInvariant(S: SE->getSCEV(V: U), L: Lp)) { |
| 2892 | if (V == Ptr) |
| 2893 | V = U; |
| 2894 | else |
| 2895 | // There must be exactly one loop-variant operand. |
| 2896 | return Ptr; |
| 2897 | } |
| 2898 | } |
| 2899 | return V; |
| 2900 | } |
| 2901 | |
| 2902 | /// Get the stride of a pointer access in a loop. Looks for symbolic |
| 2903 | /// strides "a[i*stride]". Returns the symbolic stride, or null otherwise. |
| 2904 | static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { |
| 2905 | auto *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType()); |
| 2906 | if (!PtrTy) |
| 2907 | return nullptr; |
| 2908 | |
// Try to remove a gep instruction to make the pointer (actually the index at
// this point) easier to analyze. If OrigPtr is equal to Ptr we are analyzing
// the pointer, otherwise we are analyzing the index.
| 2912 | Value *OrigPtr = Ptr; |
| 2913 | |
| 2914 | Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp); |
| 2915 | const SCEV *V = SE->getSCEV(V: Ptr); |
| 2916 | |
| 2917 | if (Ptr != OrigPtr) |
| 2918 | // Strip off casts. |
| 2919 | while (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V)) |
| 2920 | V = C->getOperand(); |
| 2921 | |
| 2922 | if (!match(S: V, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V), L: m_SpecificLoop(L: Lp)))) |
| 2923 | return nullptr; |
| 2924 | |
// Note that the restrictions after this loop invariant check are only
// profitability restrictions.
| 2927 | if (!SE->isLoopInvariant(S: V, L: Lp)) |
| 2928 | return nullptr; |
| 2929 | |
| 2930 | // Look for the loop invariant symbolic value. |
| 2931 | if (isa<SCEVUnknown>(Val: V)) |
| 2932 | return V; |
| 2933 | |
| 2934 | if (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V)) |
| 2935 | if (isa<SCEVUnknown>(Val: C->getOperand())) |
| 2936 | return V; |
| 2937 | |
| 2938 | return nullptr; |
| 2939 | } |
| 2940 | |
| 2941 | void LoopAccessInfo::collectStridedAccess(Value *MemAccess) { |
| 2942 | Value *Ptr = getLoadStorePointerOperand(V: MemAccess); |
| 2943 | if (!Ptr) |
| 2944 | return; |
| 2945 | |
| 2946 | // Note: getStrideFromPointer is a *profitability* heuristic. We |
| 2947 | // could broaden the scope of values returned here - to anything |
| 2948 | // which happens to be loop invariant and contributes to the |
| 2949 | // computation of an interesting IV - but we chose not to as we |
| 2950 | // don't have a cost model here, and broadening the scope exposes |
| 2951 | // far too many unprofitable cases. |
| 2952 | const SCEV *StrideExpr = getStrideFromPointer(Ptr, SE: PSE->getSE(), Lp: TheLoop); |
| 2953 | if (!StrideExpr) |
| 2954 | return; |
| 2955 | |
| 2956 | LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for " |
| 2957 | "versioning:" ); |
| 2958 | LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n" ); |
| 2959 | |
| 2960 | if (!SpeculateUnitStride) { |
| 2961 | LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n" ); |
| 2962 | return; |
| 2963 | } |
| 2964 | |
| 2965 | // Avoid adding the "Stride == 1" predicate when we know that |
| 2966 | // Stride >= Trip-Count. Such a predicate will effectively optimize a single |
| 2967 | // or zero iteration loop, as Trip-Count <= Stride == 1. |
| 2968 | // |
| 2969 | // TODO: We are currently not making a very informed decision on when it is |
| 2970 | // beneficial to apply stride versioning. It might make more sense that the |
| 2971 | // users of this analysis (such as the vectorizer) will trigger it, based on |
| 2972 | // their specific cost considerations; For example, in cases where stride |
| 2973 | // versioning does not help resolving memory accesses/dependences, the |
| 2974 | // vectorizer should evaluate the cost of the runtime test, and the benefit |
| 2975 | // of various possible stride specializations, considering the alternatives |
| 2976 | // of using gather/scatters (if available). |
| 2977 | |
| 2978 | const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount(); |
| 2979 | |
| 2980 | // Match the types so we can compare the stride and the MaxBTC. |
| 2981 | // The Stride can be positive/negative, so we sign extend Stride; |
| 2982 | // The backedgeTakenCount is non-negative, so we zero extend MaxBTC. |
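// For illustration (hypothetical types): an i32 stride compared against an
// i64 MaxBTC gets sign-extended to i64; an i64 stride compared against an
// i32 MaxBTC is left alone and MaxBTC is zero-extended to i64 instead.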
| 2983 | const DataLayout &DL = TheLoop->getHeader()->getDataLayout(); |
| 2984 | uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(Ty: StrideExpr->getType()); |
| 2985 | uint64_t BETypeSizeBits = DL.getTypeSizeInBits(Ty: MaxBTC->getType()); |
| 2986 | const SCEV *CastedStride = StrideExpr; |
| 2987 | const SCEV *CastedBECount = MaxBTC; |
| 2988 | ScalarEvolution *SE = PSE->getSE(); |
| 2989 | if (BETypeSizeBits >= StrideTypeSizeBits) |
| 2990 | CastedStride = SE->getNoopOrSignExtend(V: StrideExpr, Ty: MaxBTC->getType()); |
| 2991 | else |
| 2992 | CastedBECount = SE->getZeroExtendExpr(Op: MaxBTC, Ty: StrideExpr->getType()); |
| 2993 | const SCEV *StrideMinusBETaken = SE->getMinusSCEV(LHS: CastedStride, RHS: CastedBECount); |
// Since TripCount == BackEdgeTakenCount + 1, checking:
// "Stride >= TripCount" is equivalent to checking:
// Stride - MaxBTC > 0
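//
// For illustration (hypothetical numbers): with MaxBTC = 7 (a trip count of
// 8) and a stride known to be at least 8, Stride - MaxBTC is known positive,
// so a "Stride == 1" specialization could only cover a loop that runs at most
// once and is not worth versioning for.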
| 2997 | if (SE->isKnownPositive(S: StrideMinusBETaken)) { |
| 2998 | LLVM_DEBUG( |
| 2999 | dbgs() << "LAA: Stride>=TripCount; No point in versioning as the " |
| 3000 | "Stride==1 predicate will imply that the loop executes " |
| 3001 | "at most once.\n" ); |
| 3002 | return; |
| 3003 | } |
| 3004 | LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n" ); |
| 3005 | |
| 3006 | // Strip back off the integer cast, and check that our result is a |
| 3007 | // SCEVUnknown as we expect. |
| 3008 | const SCEV *StrideBase = StrideExpr; |
| 3009 | if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: StrideBase)) |
| 3010 | StrideBase = C->getOperand(); |
| 3011 | SymbolicStrides[Ptr] = cast<SCEVUnknown>(Val: StrideBase); |
| 3012 | } |
| 3013 | |
| 3014 | LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, |
| 3015 | const TargetTransformInfo *TTI, |
| 3016 | const TargetLibraryInfo *TLI, AAResults *AA, |
| 3017 | DominatorTree *DT, LoopInfo *LI, |
| 3018 | bool AllowPartial) |
| 3019 | : PSE(std::make_unique<PredicatedScalarEvolution>(args&: *SE, args&: *L)), |
| 3020 | PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) { |
| 3021 | unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max(); |
| 3022 | if (TTI && !TTI->enableScalableVectorization()) |
// Scale the vector width by 2 as a rough estimate to also consider
// interleaving.
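// For illustration (hypothetical target): 128-bit fixed-width vector
// registers would give a 256-bit budget here.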
| 3025 | MaxTargetVectorWidthInBits = |
| 3026 | TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) * 2; |
| 3027 | |
| 3028 | DepChecker = std::make_unique<MemoryDepChecker>(args&: *PSE, args&: L, args&: SymbolicStrides, |
| 3029 | args&: MaxTargetVectorWidthInBits); |
| 3030 | PtrRtChecking = std::make_unique<RuntimePointerChecking>(args&: *DepChecker, args&: SE); |
| 3031 | if (canAnalyzeLoop()) |
| 3032 | CanVecMem = analyzeLoop(AA, LI, TLI, DT); |
| 3033 | } |
| 3034 | |
| 3035 | void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { |
| 3036 | if (CanVecMem) { |
| 3037 | OS.indent(NumSpaces: Depth) << "Memory dependences are safe" ; |
| 3038 | const MemoryDepChecker &DC = getDepChecker(); |
| 3039 | if (!DC.isSafeForAnyVectorWidth()) |
| 3040 | OS << " with a maximum safe vector width of " |
| 3041 | << DC.getMaxSafeVectorWidthInBits() << " bits" ; |
| 3042 | if (!DC.isSafeForAnyStoreLoadForwardDistances()) { |
| 3043 | uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits(); |
| 3044 | OS << ", with a maximum safe store-load forward width of " << SLDist |
| 3045 | << " bits" ; |
| 3046 | } |
| 3047 | if (PtrRtChecking->Need) |
| 3048 | OS << " with run-time checks" ; |
| 3049 | OS << "\n" ; |
| 3050 | } |
| 3051 | |
| 3052 | if (HasConvergentOp) |
| 3053 | OS.indent(NumSpaces: Depth) << "Has convergent operation in loop\n" ; |
| 3054 | |
| 3055 | if (Report) |
| 3056 | OS.indent(NumSpaces: Depth) << "Report: " << Report->getMsg() << "\n" ; |
| 3057 | |
| 3058 | if (auto *Dependences = DepChecker->getDependences()) { |
| 3059 | OS.indent(NumSpaces: Depth) << "Dependences:\n" ; |
| 3060 | for (const auto &Dep : *Dependences) { |
| 3061 | Dep.print(OS, Depth: Depth + 2, Instrs: DepChecker->getMemoryInstructions()); |
| 3062 | OS << "\n" ; |
| 3063 | } |
| 3064 | } else |
| 3065 | OS.indent(NumSpaces: Depth) << "Too many dependences, not recorded\n" ; |
| 3066 | |
// List the pairs of accesses that need run-time checks to prove independence.
| 3068 | PtrRtChecking->print(OS, Depth); |
| 3069 | if (PtrRtChecking->Need && !HasCompletePtrRtChecking) |
| 3070 | OS.indent(NumSpaces: Depth) << "Generated run-time checks are incomplete\n" ; |
| 3071 | OS << "\n" ; |
| 3072 | |
| 3073 | OS.indent(NumSpaces: Depth) |
| 3074 | << "Non vectorizable stores to invariant address were " |
| 3075 | << (HasStoreStoreDependenceInvolvingLoopInvariantAddress || |
| 3076 | HasLoadStoreDependenceInvolvingLoopInvariantAddress |
| 3077 | ? "" |
| 3078 | : "not " ) |
| 3079 | << "found in loop.\n" ; |
| 3080 | |
| 3081 | OS.indent(NumSpaces: Depth) << "SCEV assumptions:\n" ; |
| 3082 | PSE->getPredicate().print(OS, Depth); |
| 3083 | |
| 3084 | OS << "\n" ; |
| 3085 | |
| 3086 | OS.indent(NumSpaces: Depth) << "Expressions re-written:\n" ; |
| 3087 | PSE->print(OS, Depth); |
| 3088 | } |
| 3089 | |
| 3090 | const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L, |
| 3091 | bool AllowPartial) { |
| 3092 | const auto &[It, Inserted] = LoopAccessInfoMap.try_emplace(Key: &L); |
| 3093 | |
| 3094 | // We need to create the LoopAccessInfo if either we don't already have one, |
| 3095 | // or if it was created with a different value of AllowPartial. |
| 3096 | if (Inserted || It->second->hasAllowPartial() != AllowPartial) |
| 3097 | It->second = std::make_unique<LoopAccessInfo>(args: &L, args: &SE, args&: TTI, args&: TLI, args: &AA, args: &DT, |
| 3098 | args: &LI, args&: AllowPartial); |
| 3099 | |
| 3100 | return *It->second; |
| 3101 | } |
| 3102 | void LoopAccessInfoManager::clear() { |
// Collect LoopAccessInfo entries that may keep references to IR outside the
// analyzed loop or SCEVs that may have been modified or invalidated. At the
// moment, these are loops requiring memory or SCEV runtime checks, as those
// cache SCEVs, e.g. for pointer expressions.
| 3107 | for (const auto &[L, LAI] : LoopAccessInfoMap) { |
| 3108 | if (LAI->getRuntimePointerChecking()->getChecks().empty() && |
| 3109 | LAI->getPSE().getPredicate().isAlwaysTrue()) |
| 3110 | continue; |
| 3111 | LoopAccessInfoMap.erase(Val: L); |
| 3112 | } |
| 3113 | } |
| 3114 | |
| 3115 | bool LoopAccessInfoManager::invalidate( |
| 3116 | Function &F, const PreservedAnalyses &PA, |
| 3117 | FunctionAnalysisManager::Invalidator &Inv) { |
| 3118 | // Check whether our analysis is preserved. |
| 3119 | auto PAC = PA.getChecker<LoopAccessAnalysis>(); |
| 3120 | if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>()) |
| 3121 | // If not, give up now. |
| 3122 | return true; |
| 3123 | |
| 3124 | // Check whether the analyses we depend on became invalid for any reason. |
| 3125 | // Skip checking TargetLibraryAnalysis as it is immutable and can't become |
| 3126 | // invalid. |
| 3127 | return Inv.invalidate<AAManager>(IR&: F, PA) || |
| 3128 | Inv.invalidate<ScalarEvolutionAnalysis>(IR&: F, PA) || |
| 3129 | Inv.invalidate<LoopAnalysis>(IR&: F, PA) || |
| 3130 | Inv.invalidate<DominatorTreeAnalysis>(IR&: F, PA); |
| 3131 | } |
| 3132 | |
| 3133 | LoopAccessInfoManager LoopAccessAnalysis::run(Function &F, |
| 3134 | FunctionAnalysisManager &FAM) { |
| 3135 | auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
| 3136 | auto &AA = FAM.getResult<AAManager>(IR&: F); |
| 3137 | auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F); |
| 3138 | auto &LI = FAM.getResult<LoopAnalysis>(IR&: F); |
| 3139 | auto &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F); |
| 3140 | auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
| 3141 | return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI); |
| 3142 | } |
| 3143 | |
| 3144 | AnalysisKey LoopAccessAnalysis::Key; |
| 3145 | |