| 1 | //===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation ------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // The implementation of the loop memory dependence analysis that was |
| 10 | // originally developed for the loop vectorizer. |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
| 15 | #include "llvm/ADT/APInt.h" |
| 16 | #include "llvm/ADT/DenseMap.h" |
| 17 | #include "llvm/ADT/EquivalenceClasses.h" |
| 18 | #include "llvm/ADT/PointerIntPair.h" |
| 19 | #include "llvm/ADT/STLExtras.h" |
| 20 | #include "llvm/ADT/SetVector.h" |
| 21 | #include "llvm/ADT/SmallPtrSet.h" |
| 22 | #include "llvm/ADT/SmallSet.h" |
| 23 | #include "llvm/ADT/SmallVector.h" |
| 24 | #include "llvm/Analysis/AliasAnalysis.h" |
| 25 | #include "llvm/Analysis/AliasSetTracker.h" |
| 26 | #include "llvm/Analysis/AssumeBundleQueries.h" |
| 27 | #include "llvm/Analysis/AssumptionCache.h" |
| 28 | #include "llvm/Analysis/LoopAnalysisManager.h" |
| 29 | #include "llvm/Analysis/LoopInfo.h" |
| 30 | #include "llvm/Analysis/LoopIterator.h" |
| 31 | #include "llvm/Analysis/MemoryLocation.h" |
| 32 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 33 | #include "llvm/Analysis/ScalarEvolution.h" |
| 34 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| 35 | #include "llvm/Analysis/ScalarEvolutionPatternMatch.h" |
| 36 | #include "llvm/Analysis/TargetLibraryInfo.h" |
| 37 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 38 | #include "llvm/Analysis/ValueTracking.h" |
| 39 | #include "llvm/Analysis/VectorUtils.h" |
| 40 | #include "llvm/IR/BasicBlock.h" |
| 41 | #include "llvm/IR/Constants.h" |
| 42 | #include "llvm/IR/DataLayout.h" |
| 43 | #include "llvm/IR/DebugLoc.h" |
| 44 | #include "llvm/IR/DerivedTypes.h" |
| 45 | #include "llvm/IR/DiagnosticInfo.h" |
| 46 | #include "llvm/IR/Dominators.h" |
| 47 | #include "llvm/IR/Function.h" |
| 48 | #include "llvm/IR/InstrTypes.h" |
| 49 | #include "llvm/IR/Instruction.h" |
| 50 | #include "llvm/IR/Instructions.h" |
| 51 | #include "llvm/IR/IntrinsicInst.h" |
| 52 | #include "llvm/IR/PassManager.h" |
| 53 | #include "llvm/IR/Type.h" |
| 54 | #include "llvm/IR/Value.h" |
| 55 | #include "llvm/IR/ValueHandle.h" |
| 56 | #include "llvm/Support/Casting.h" |
| 57 | #include "llvm/Support/CommandLine.h" |
| 58 | #include "llvm/Support/Debug.h" |
| 59 | #include "llvm/Support/ErrorHandling.h" |
| 60 | #include "llvm/Support/raw_ostream.h" |
| 61 | #include <algorithm> |
| 62 | #include <cassert> |
| 63 | #include <cstdint> |
| 64 | #include <iterator> |
| 65 | #include <utility> |
| 66 | #include <variant> |
| 67 | #include <vector> |
| 68 | |
| 69 | using namespace llvm; |
| 70 | using namespace llvm::SCEVPatternMatch; |
| 71 | |
| 72 | #define DEBUG_TYPE "loop-accesses" |
| 73 | |
| 74 | static cl::opt<unsigned, true> |
| 75 | VectorizationFactor("force-vector-width" , cl::Hidden, |
| 76 | cl::desc("Sets the SIMD width. Zero is autoselect." ), |
| 77 | cl::location(L&: VectorizerParams::VectorizationFactor)); |
| 78 | unsigned VectorizerParams::VectorizationFactor; |
| 79 | |
| 80 | static cl::opt<unsigned, true> |
| 81 | VectorizationInterleave("force-vector-interleave" , cl::Hidden, |
| 82 | cl::desc("Sets the vectorization interleave count. " |
| 83 | "Zero is autoselect." ), |
| 84 | cl::location( |
| 85 | L&: VectorizerParams::VectorizationInterleave)); |
| 86 | unsigned VectorizerParams::VectorizationInterleave; |
| 87 | |
| 88 | static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold( |
| 89 | "runtime-memory-check-threshold" , cl::Hidden, |
| 90 | cl::desc("When performing memory disambiguation checks at runtime do not " |
| 91 | "generate more than this number of comparisons (default = 8)." ), |
| 92 | cl::location(L&: VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(Val: 8)); |
| 93 | unsigned VectorizerParams::RuntimeMemoryCheckThreshold; |
| 94 | |
| 95 | /// The maximum number of comparisons allowed while merging memory checks. |
| 96 | static cl::opt<unsigned> MemoryCheckMergeThreshold( |
| 97 | "memory-check-merge-threshold" , cl::Hidden, |
| 98 | cl::desc("Maximum number of comparisons done when trying to merge " |
| 99 | "runtime memory checks. (default = 100)" ), |
| 100 | cl::init(Val: 100)); |
| 101 | |
| 102 | /// Maximum SIMD width. |
| 103 | const unsigned VectorizerParams::MaxVectorWidth = 64; |
| 104 | |
| 105 | /// We collect dependences up to this threshold. |
| 106 | static cl::opt<unsigned> |
| 107 | MaxDependences("max-dependences" , cl::Hidden, |
| 108 | cl::desc("Maximum number of dependences collected by " |
| 109 | "loop-access analysis (default = 100)" ), |
| 110 | cl::init(Val: 100)); |
| 111 | |
| 112 | /// This enables versioning on the strides of symbolically striding memory |
| 113 | /// accesses in code like the following. |
| 114 | /// for (i = 0; i < N; ++i) |
| 115 | /// A[i * Stride1] += B[i * Stride2] ... |
| 116 | /// |
| 117 | /// Will be roughly translated to |
| 118 | /// if (Stride1 == 1 && Stride2 == 1) { |
| 119 | /// for (i = 0; i < N; i+=4) |
| 120 | /// A[i:i+3] += ... |
| 121 | /// } else |
| 122 | /// ... |
| 123 | static cl::opt<bool> EnableMemAccessVersioning( |
| 124 | "enable-mem-access-versioning" , cl::init(Val: true), cl::Hidden, |
| 125 | cl::desc("Enable symbolic stride memory access versioning" )); |
| 126 | |
| 127 | /// Enable store-to-load forwarding conflict detection. This option can |
| 128 | /// be disabled for correctness testing. |
| 129 | static cl::opt<bool> EnableForwardingConflictDetection( |
| 130 | "store-to-load-forwarding-conflict-detection" , cl::Hidden, |
| 131 | cl::desc("Enable conflict detection in loop-access analysis" ), |
| 132 | cl::init(Val: true)); |
| 133 | |
| 134 | static cl::opt<unsigned> MaxForkedSCEVDepth( |
| 135 | "max-forked-scev-depth" , cl::Hidden, |
| 136 | cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)" ), |
| 137 | cl::init(Val: 5)); |
| 138 | |
| 139 | static cl::opt<bool> SpeculateUnitStride( |
| 140 | "laa-speculate-unit-stride" , cl::Hidden, |
| 141 | cl::desc("Speculate that non-constant strides are unit in LAA" ), |
| 142 | cl::init(Val: true)); |
| 143 | |
| 144 | static cl::opt<bool, true> HoistRuntimeChecks( |
| 145 | "hoist-runtime-checks" , cl::Hidden, |
| 146 | cl::desc( |
| 147 | "Hoist inner loop runtime memory checks to outer loop if possible" ), |
| 148 | cl::location(L&: VectorizerParams::HoistRuntimeChecks), cl::init(Val: true)); |
| 149 | bool VectorizerParams::HoistRuntimeChecks; |
| 150 | |
| 151 | bool VectorizerParams::isInterleaveForced() { |
| 152 | return ::VectorizationInterleave.getNumOccurrences() > 0; |
| 153 | } |
| 154 | |
| 155 | const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, |
| 156 | const DenseMap<Value *, const SCEV *> &PtrToStride, |
| 157 | Value *Ptr) { |
| 158 | const SCEV *OrigSCEV = PSE.getSCEV(V: Ptr); |
| 159 | |
| 160 | // If there is an entry in the map return the SCEV of the pointer with the |
| 161 | // symbolic stride replaced by one. |
| 162 | const SCEV *StrideSCEV = PtrToStride.lookup(Val: Ptr); |
| 163 | if (!StrideSCEV) |
| 164 | // For a non-symbolic stride, just return the original expression. |
| 165 | return OrigSCEV; |
| 166 | |
| 167 | // Note: This assert is both overly strong and overly weak. The actual |
| 168 | // invariant here is that StrideSCEV should be loop invariant. The only |
| 169 | // such invariant strides we happen to speculate right now are unknowns |
| 170 | // and thus this is a reasonable proxy of the actual invariant. |
| 171 | assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map" ); |
| 172 | |
| 173 | ScalarEvolution *SE = PSE.getSE(); |
| 174 | const SCEV *CT = SE->getOne(Ty: StrideSCEV->getType()); |
| 175 | PSE.addPredicate(Pred: *SE->getEqualPredicate(LHS: StrideSCEV, RHS: CT)); |
| 176 | const SCEV *Expr = PSE.getSCEV(V: Ptr); |
| 177 | |
| 178 | LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV |
| 179 | << " by: " << *Expr << "\n" ); |
| 180 | return Expr; |
| 181 | } |
| 182 | |
| 183 | RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup( |
| 184 | unsigned Index, const RuntimePointerChecking &RtCheck) |
| 185 | : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start), |
| 186 | AddressSpace(RtCheck.Pointers[Index] |
| 187 | .PointerValue->getType() |
| 188 | ->getPointerAddressSpace()), |
| 189 | NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) { |
| 190 | Members.push_back(Elt: Index); |
| 191 | } |
| 192 | |
| 193 | /// Returns \p A + \p B, if it is guaranteed not to unsigned wrap. Otherwise |
| 194 | /// return nullptr. \p A and \p B must have the same type. |
| 195 | static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B, |
| 196 | ScalarEvolution &SE) { |
| 197 | if (!SE.willNotOverflow(BinOp: Instruction::Add, /*IsSigned=*/Signed: false, LHS: A, RHS: B)) |
| 198 | return nullptr; |
| 199 | return SE.getAddExpr(LHS: A, RHS: B); |
| 200 | } |
| 201 | |
| 202 | /// Returns \p A * \p B, if it is guaranteed not to unsigned wrap. Otherwise |
| 203 | /// return nullptr. \p A and \p B must have the same type. |
| 204 | static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B, |
| 205 | ScalarEvolution &SE) { |
| 206 | if (!SE.willNotOverflow(BinOp: Instruction::Mul, /*IsSigned=*/Signed: false, LHS: A, RHS: B)) |
| 207 | return nullptr; |
| 208 | return SE.getMulExpr(LHS: A, RHS: B); |
| 209 | } |
| 210 | |
| 211 | /// Return true if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR |
| 212 | /// evaluated at \p MaxBTC is guaranteed inbounds of the accessed object. |
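|  | /// |
|  | /// Illustrative sketch (hypothetical values, not from a specific test): for |
|  | /// a pointer AddRec {%p,+,4} with a non-negative step and EltSize = 4, the |
|  | /// last access covers bytes [4 * MaxBTC, 4 * MaxBTC + 4) past %p, so it is |
|  | /// sufficient that 4 * (MaxBTC + 1) does not exceed the number of bytes |
|  | /// known to be dereferenceable from %p, provided none of the intermediate |
|  | /// computations wrap unsigned. |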
| 213 | static bool evaluatePtrAddRecAtMaxBTCWillNotWrap( |
| 214 | const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize, |
| 215 | ScalarEvolution &SE, const DataLayout &DL, DominatorTree *DT, |
| 216 | AssumptionCache *AC, |
| 217 | std::optional<ScalarEvolution::LoopGuards> &LoopGuards) { |
| 218 | auto *PointerBase = SE.getPointerBase(V: AR->getStart()); |
| 219 | auto *StartPtr = dyn_cast<SCEVUnknown>(Val: PointerBase); |
| 220 | if (!StartPtr) |
| 221 | return false; |
| 222 | const Loop *L = AR->getLoop(); |
| 223 | bool CheckForNonNull, CheckForFreed; |
| 224 | Value *StartPtrV = StartPtr->getValue(); |
| 225 | uint64_t DerefBytes = StartPtrV->getPointerDereferenceableBytes( |
| 226 | DL, CanBeNull&: CheckForNonNull, CanBeFreed&: CheckForFreed); |
| 227 | |
| 228 | if (DerefBytes && (CheckForNonNull || CheckForFreed)) |
| 229 | return false; |
| 230 | |
| 231 | const SCEV *Step = AR->getStepRecurrence(SE); |
| 232 | Type *WiderTy = SE.getWiderType(Ty1: MaxBTC->getType(), Ty2: Step->getType()); |
| 233 | const SCEV *DerefBytesSCEV = SE.getConstant(Ty: WiderTy, V: DerefBytes); |
| 234 | |
| 235 | // Check if we have a suitable dereferenceable assumption we can use. |
| 236 | Instruction *CtxI = &*L->getHeader()->getFirstNonPHIIt(); |
| 237 | if (BasicBlock *LoopPred = L->getLoopPredecessor()) { |
| 238 | if (isa<BranchInst>(Val: LoopPred->getTerminator())) |
| 239 | CtxI = LoopPred->getTerminator(); |
| 240 | } |
| 241 | RetainedKnowledge DerefRK; |
| 242 | getKnowledgeForValue(V: StartPtrV, AttrKinds: {Attribute::Dereferenceable}, AC&: *AC, |
| 243 | Filter: [&](RetainedKnowledge RK, Instruction *Assume, auto) { |
| 244 | if (!isValidAssumeForContext(I: Assume, CxtI: CtxI, DT)) |
| 245 | return false; |
| 246 | if (StartPtrV->canBeFreed() && |
| 247 | !willNotFreeBetween(Assume, CtxI)) |
| 248 | return false; |
| 249 | DerefRK = std::max(a: DerefRK, b: RK); |
| 250 | return true; |
| 251 | }); |
| 252 | if (DerefRK) { |
| 253 | DerefBytesSCEV = |
| 254 | SE.getUMaxExpr(LHS: DerefBytesSCEV, RHS: SE.getSCEV(V: DerefRK.IRArgValue)); |
| 255 | } |
| 256 | |
| 257 | if (DerefBytesSCEV->isZero()) |
| 258 | return false; |
| 259 | |
| 260 | bool IsKnownNonNegative = SE.isKnownNonNegative(S: Step); |
| 261 | if (!IsKnownNonNegative && !SE.isKnownNegative(S: Step)) |
| 262 | return false; |
| 263 | |
| 264 | Step = SE.getNoopOrSignExtend(V: Step, Ty: WiderTy); |
| 265 | MaxBTC = SE.getNoopOrZeroExtend(V: MaxBTC, Ty: WiderTy); |
| 266 | |
| 267 | // For the computations below, make sure they don't unsigned wrap. |
| 268 | if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_UGE, LHS: AR->getStart(), RHS: StartPtr)) |
| 269 | return false; |
| 270 | const SCEV *StartOffset = SE.getNoopOrZeroExtend( |
| 271 | V: SE.getMinusSCEV(LHS: AR->getStart(), RHS: StartPtr), Ty: WiderTy); |
| 272 | |
| 273 | if (!LoopGuards) |
| 274 | LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: AR->getLoop(), SE)); |
| 275 | MaxBTC = SE.applyLoopGuards(Expr: MaxBTC, Guards: *LoopGuards); |
| 276 | |
| 277 | const SCEV *OffsetAtLastIter = |
| 278 | mulSCEVOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE); |
| 279 | if (!OffsetAtLastIter) { |
| 280 | // Re-try with constant max backedge-taken count if using the symbolic one |
| 281 | // failed. |
| 282 | MaxBTC = SE.getConstantMaxBackedgeTakenCount(L: AR->getLoop()); |
| 283 | if (isa<SCEVCouldNotCompute>(Val: MaxBTC)) |
| 284 | return false; |
| 285 | MaxBTC = SE.getNoopOrZeroExtend( |
| 286 | V: MaxBTC, Ty: WiderTy); |
| 287 | OffsetAtLastIter = |
| 288 | mulSCEVOverflow(A: MaxBTC, B: SE.getAbsExpr(Op: Step, /*IsNSW=*/false), SE); |
| 289 | if (!OffsetAtLastIter) |
| 290 | return false; |
| 291 | } |
| 292 | |
| 293 | const SCEV *OffsetEndBytes = addSCEVNoOverflow( |
| 294 | A: OffsetAtLastIter, B: SE.getNoopOrZeroExtend(V: EltSize, Ty: WiderTy), SE); |
| 295 | if (!OffsetEndBytes) |
| 296 | return false; |
| 297 | |
| 298 | if (IsKnownNonNegative) { |
| 299 | // For positive steps, check if |
| 300 | // (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes, |
| 301 | // while making sure none of the computations unsigned wrap themselves. |
| 302 | const SCEV *EndBytes = addSCEVNoOverflow(A: StartOffset, B: OffsetEndBytes, SE); |
| 303 | if (!EndBytes) |
| 304 | return false; |
| 305 | |
| 306 | DerefBytesSCEV = SE.applyLoopGuards(Expr: DerefBytesSCEV, Guards: *LoopGuards); |
| 307 | return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: EndBytes, RHS: DerefBytesSCEV); |
| 308 | } |
| 309 | |
| 310 | // For negative steps check if |
| 311 | // * StartOffset >= (MaxBTC * Step + EltSize) |
| 312 | // * StartOffset <= DerefBytes. |
| 313 | assert(SE.isKnownNegative(Step) && "must be known negative" ); |
| 314 | return SE.isKnownPredicate(Pred: CmpInst::ICMP_SGE, LHS: StartOffset, RHS: OffsetEndBytes) && |
| 315 | SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: StartOffset, RHS: DerefBytesSCEV); |
| 316 | } |
| 317 | |
| 318 | std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess( |
| 319 | const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC, |
| 320 | const SCEV *MaxBTC, ScalarEvolution *SE, |
| 321 | DenseMap<std::pair<const SCEV *, Type *>, |
| 322 | std::pair<const SCEV *, const SCEV *>> *PointerBounds, |
| 323 | DominatorTree *DT, AssumptionCache *AC, |
| 324 | std::optional<ScalarEvolution::LoopGuards> &LoopGuards) { |
| 325 | std::pair<const SCEV *, const SCEV *> *PtrBoundsPair; |
| 326 | if (PointerBounds) { |
| 327 | auto [Iter, Ins] = PointerBounds->insert( |
| 328 | KV: {{PtrExpr, AccessTy}, |
| 329 | {SE->getCouldNotCompute(), SE->getCouldNotCompute()}}); |
| 330 | if (!Ins) |
| 331 | return Iter->second; |
| 332 | PtrBoundsPair = &Iter->second; |
| 333 | } |
| 334 | |
| 335 | const SCEV *ScStart; |
| 336 | const SCEV *ScEnd; |
| 337 | |
| 338 | auto &DL = Lp->getHeader()->getDataLayout(); |
| 339 | Type *IdxTy = DL.getIndexType(PtrTy: PtrExpr->getType()); |
| 340 | const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IntTy: IdxTy, StoreTy: AccessTy); |
| 341 | if (SE->isLoopInvariant(S: PtrExpr, L: Lp)) { |
| 342 | ScStart = ScEnd = PtrExpr; |
| 343 | } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrExpr)) { |
| 344 | ScStart = AR->getStart(); |
| 345 | if (!isa<SCEVCouldNotCompute>(Val: BTC)) |
| 346 | // Evaluating AR at an exact BTC is safe: LAA separately checks that |
| 347 | // accesses cannot wrap in the loop. If evaluating AR at BTC wraps, then |
| 348 | // the loop either triggers UB when executing a memory access with a |
| 349 | // poison pointer or the wrapping/poisoned pointer is not used. |
| 350 | ScEnd = AR->evaluateAtIteration(It: BTC, SE&: *SE); |
| 351 | else { |
| 352 | // Evaluating AR at MaxBTC may wrap and create an expression that is less |
| 353 | // than the start of the AddRec due to wrapping (for example consider |
| 354 | // MaxBTC = -2). If that's the case, set ScEnd to -(EltSize + 1). ScEnd |
| 355 | // will get incremented by EltSize before returning, so this effectively |
| 356 | // sets ScEnd to the maximum unsigned value for the type. Note that LAA |
| 357 | // separately checks that accesses cannot wrap, so unsigned max |
| 358 | // represents an upper bound. |
| 359 | if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSize: EltSizeSCEV, SE&: *SE, DL, |
| 360 | DT, AC, LoopGuards)) { |
| 361 | ScEnd = AR->evaluateAtIteration(It: MaxBTC, SE&: *SE); |
| 362 | } else { |
| 363 | ScEnd = SE->getAddExpr( |
| 364 | LHS: SE->getNegativeSCEV(V: EltSizeSCEV), |
| 365 | RHS: SE->getSCEV(V: ConstantExpr::getIntToPtr( |
| 366 | C: ConstantInt::getAllOnesValue(Ty: EltSizeSCEV->getType()), |
| 367 | Ty: AR->getType()))); |
| 368 | } |
| 369 | } |
| 370 | const SCEV *Step = AR->getStepRecurrence(SE&: *SE); |
| 371 | |
| 372 | // For expressions with negative step, the upper bound is ScStart and the |
| 373 | // lower bound is ScEnd. |
| 374 | if (const auto *CStep = dyn_cast<SCEVConstant>(Val: Step)) { |
| 375 | if (CStep->getValue()->isNegative()) |
| 376 | std::swap(a&: ScStart, b&: ScEnd); |
| 377 | } else { |
| 378 | // Fallback case: the step is not constant, but we can still |
| 379 | // get the upper and lower bounds of the interval by using min/max |
| 380 | // expressions. |
| 381 | ScStart = SE->getUMinExpr(LHS: ScStart, RHS: ScEnd); |
| 382 | ScEnd = SE->getUMaxExpr(LHS: AR->getStart(), RHS: ScEnd); |
| 383 | } |
| 384 | } else |
| 385 | return {SE->getCouldNotCompute(), SE->getCouldNotCompute()}; |
| 386 | |
| 387 | assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant" ); |
| 388 | assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant" ); |
| 389 | |
| 390 | // Add the size of the pointed element to ScEnd. |
| 391 | ScEnd = SE->getAddExpr(LHS: ScEnd, RHS: EltSizeSCEV); |
| 392 | |
| 393 | std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd}; |
| 394 | if (PointerBounds) |
| 395 | *PtrBoundsPair = Res; |
| 396 | return Res; |
| 397 | } |
| 398 | |
| 399 | /// Calculate Start and End points of memory access using |
| 400 | /// getStartAndEndForAccess. |
| 401 | void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, |
| 402 | Type *AccessTy, bool WritePtr, |
| 403 | unsigned DepSetId, unsigned ASId, |
| 404 | PredicatedScalarEvolution &PSE, |
| 405 | bool NeedsFreeze) { |
| 406 | const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); |
| 407 | const SCEV *BTC = PSE.getBackedgeTakenCount(); |
| 408 | const auto &[ScStart, ScEnd] = getStartAndEndForAccess( |
| 409 | Lp, PtrExpr, AccessTy, BTC, MaxBTC: SymbolicMaxBTC, SE: PSE.getSE(), |
| 410 | PointerBounds: &DC.getPointerBounds(), DT: DC.getDT(), AC: DC.getAC(), LoopGuards); |
| 411 | assert(!isa<SCEVCouldNotCompute>(ScStart) && |
| 412 | !isa<SCEVCouldNotCompute>(ScEnd) && |
| 413 | "must be able to compute both start and end expressions" ); |
| 414 | Pointers.emplace_back(Args&: Ptr, Args: ScStart, Args: ScEnd, Args&: WritePtr, Args&: DepSetId, Args&: ASId, Args&: PtrExpr, |
| 415 | Args&: NeedsFreeze); |
| 416 | } |
| 417 | |
| 418 | bool RuntimePointerChecking::tryToCreateDiffCheck( |
| 419 | const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) { |
| 420 | // If either group contains multiple different pointers, bail out. |
| 421 | // TODO: Support multiple pointers by using the minimum or maximum pointer, |
| 422 | // depending on src & sink. |
| 423 | if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) |
| 424 | return false; |
| 425 | |
| 426 | const PointerInfo *Src = &Pointers[CGI.Members[0]]; |
| 427 | const PointerInfo *Sink = &Pointers[CGJ.Members[0]]; |
| 428 | |
| 429 | // If either pointer is read and written, multiple checks may be needed. Bail |
| 430 | // out. |
| 431 | if (!DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: !Src->IsWritePtr).empty() || |
| 432 | !DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: !Sink->IsWritePtr).empty()) |
| 433 | return false; |
| 434 | |
| 435 | ArrayRef<unsigned> AccSrc = |
| 436 | DC.getOrderForAccess(Ptr: Src->PointerValue, IsWrite: Src->IsWritePtr); |
| 437 | ArrayRef<unsigned> AccSink = |
| 438 | DC.getOrderForAccess(Ptr: Sink->PointerValue, IsWrite: Sink->IsWritePtr); |
| 439 | // If either pointer is accessed multiple times, there may not be a clear |
| 440 | // src/sink relation. Bail out for now. |
| 441 | if (AccSrc.size() != 1 || AccSink.size() != 1) |
| 442 | return false; |
| 443 | |
| 444 | // If the sink is accessed before src, swap src/sink. |
| 445 | if (AccSink[0] < AccSrc[0]) |
| 446 | std::swap(a&: Src, b&: Sink); |
| 447 | |
| 448 | const SCEVConstant *Step; |
| 449 | const SCEV *SrcStart; |
| 450 | const SCEV *SinkStart; |
| 451 | const Loop *InnerLoop = DC.getInnermostLoop(); |
| 452 | if (!match(S: Src->Expr, |
| 453 | P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SrcStart), Op1: m_SCEVConstant(V&: Step), |
| 454 | L: m_SpecificLoop(L: InnerLoop))) || |
| 455 | !match(S: Sink->Expr, |
| 456 | P: m_scev_AffineAddRec(Op0: m_SCEV(V&: SinkStart), Op1: m_scev_Specific(S: Step), |
| 457 | L: m_SpecificLoop(L: InnerLoop)))) |
| 458 | return false; |
| 459 | |
| 460 | SmallVector<Instruction *, 4> SrcInsts = |
| 461 | DC.getInstructionsForAccess(Ptr: Src->PointerValue, isWrite: Src->IsWritePtr); |
| 462 | SmallVector<Instruction *, 4> SinkInsts = |
| 463 | DC.getInstructionsForAccess(Ptr: Sink->PointerValue, isWrite: Sink->IsWritePtr); |
| 464 | Type *SrcTy = getLoadStoreType(I: SrcInsts[0]); |
| 465 | Type *DstTy = getLoadStoreType(I: SinkInsts[0]); |
| 466 | if (isa<ScalableVectorType>(Val: SrcTy) || isa<ScalableVectorType>(Val: DstTy)) |
| 467 | return false; |
| 468 | |
| 469 | const DataLayout &DL = InnerLoop->getHeader()->getDataLayout(); |
| 470 | unsigned AllocSize = |
| 471 | std::max(a: DL.getTypeAllocSize(Ty: SrcTy), b: DL.getTypeAllocSize(Ty: DstTy)); |
| 472 | |
| 473 | // Only constant steps matching the AllocSize are supported at the moment. |
| 474 | // This simplifies the difference computation and can be extended in the |
| 475 | // future. |
| 476 | if (Step->getAPInt().abs() != AllocSize) |
| 477 | return false; |
| 478 | |
| 479 | IntegerType *IntTy = |
| 480 | IntegerType::get(C&: Src->PointerValue->getContext(), |
| 481 | NumBits: DL.getPointerSizeInBits(AS: CGI.AddressSpace)); |
| 482 | |
| 483 | // When counting down, the dependence distance needs to be swapped. |
| 484 | if (Step->getValue()->isNegative()) |
| 485 | std::swap(a&: SinkStart, b&: SrcStart); |
| 486 | |
| 487 | const SCEV *SinkStartInt = SE->getPtrToIntExpr(Op: SinkStart, Ty: IntTy); |
| 488 | const SCEV *SrcStartInt = SE->getPtrToIntExpr(Op: SrcStart, Ty: IntTy); |
| 489 | if (isa<SCEVCouldNotCompute>(Val: SinkStartInt) || |
| 490 | isa<SCEVCouldNotCompute>(Val: SrcStartInt)) |
| 491 | return false; |
| 492 | |
| 493 | // If the start values for both Src and Sink also vary according to an outer |
| 494 | // loop, then it's probably better to avoid creating diff checks because |
| 495 | // they may not be hoisted. We should instead let llvm::addRuntimeChecks |
| 496 | // do the expanded full range overlap checks, which can be hoisted. |
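|  | // Illustrative sketch (hypothetical nest, not from a specific test): in |
|  | //   for (i) { for (j) A[i*M + j] = B[i*N + j]; } |
|  | // both start values are AddRecs in the outer i-loop, so a diff check built |
|  | // from them would have to be re-evaluated on every outer iteration, while |
|  | // full Low/High overlap checks can be made invariant and hoisted above i. |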
| 497 | if (HoistRuntimeChecks && InnerLoop->getParentLoop() && |
| 498 | isa<SCEVAddRecExpr>(Val: SinkStartInt) && isa<SCEVAddRecExpr>(Val: SrcStartInt)) { |
| 499 | auto *SrcStartAR = cast<SCEVAddRecExpr>(Val: SrcStartInt); |
| 500 | auto *SinkStartAR = cast<SCEVAddRecExpr>(Val: SinkStartInt); |
| 501 | const Loop *StartARLoop = SrcStartAR->getLoop(); |
| 502 | if (StartARLoop == SinkStartAR->getLoop() && |
| 503 | StartARLoop == InnerLoop->getParentLoop() && |
| 504 | // If the diff check would already be loop invariant (due to the |
| 505 | // recurrences being the same), then we prefer to keep the diff checks |
| 506 | // because they are cheaper. |
| 507 | SrcStartAR->getStepRecurrence(SE&: *SE) != |
| 508 | SinkStartAR->getStepRecurrence(SE&: *SE)) { |
| 509 | LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these " |
| 510 | "cannot be hoisted out of the outer loop\n" ); |
| 511 | return false; |
| 512 | } |
| 513 | } |
| 514 | |
| 515 | LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n" |
| 516 | << "SrcStart: " << *SrcStartInt << '\n' |
| 517 | << "SinkStartInt: " << *SinkStartInt << '\n'); |
| 518 | DiffChecks.emplace_back(Args&: SrcStartInt, Args&: SinkStartInt, Args&: AllocSize, |
| 519 | Args: Src->NeedsFreeze || Sink->NeedsFreeze); |
| 520 | return true; |
| 521 | } |
| 522 | |
| 523 | SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() { |
| 524 | SmallVector<RuntimePointerCheck, 4> Checks; |
| 525 | |
| 526 | for (unsigned I = 0; I < CheckingGroups.size(); ++I) { |
| 527 | for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) { |
| 528 | const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I]; |
| 529 | const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J]; |
| 530 | |
| 531 | if (needsChecking(M: CGI, N: CGJ)) { |
| 532 | CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ); |
| 533 | Checks.emplace_back(Args: &CGI, Args: &CGJ); |
| 534 | } |
| 535 | } |
| 536 | } |
| 537 | return Checks; |
| 538 | } |
| 539 | |
| 540 | void RuntimePointerChecking::generateChecks( |
| 541 | MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) { |
| 542 | assert(Checks.empty() && "Checks is not empty" ); |
| 543 | groupChecks(DepCands, UseDependencies); |
| 544 | Checks = generateChecks(); |
| 545 | } |
| 546 | |
| 547 | bool RuntimePointerChecking::needsChecking( |
| 548 | const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const { |
| 549 | for (const auto &I : M.Members) |
| 550 | for (const auto &J : N.Members) |
| 551 | if (needsChecking(I, J)) |
| 552 | return true; |
| 553 | return false; |
| 554 | } |
| 555 | |
| 556 | /// Compare \p I and \p J and return the minimum. |
| 557 | /// Return nullptr in case we couldn't find an answer. |
| 558 | static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J, |
| 559 | ScalarEvolution *SE) { |
| 560 | std::optional<APInt> Diff = SE->computeConstantDifference(LHS: J, RHS: I); |
| 561 | if (!Diff) |
| 562 | return nullptr; |
| 563 | return Diff->isNegative() ? J : I; |
| 564 | } |
| 565 | |
| 566 | bool RuntimeCheckingPtrGroup::addPointer( |
| 567 | unsigned Index, const RuntimePointerChecking &RtCheck) { |
| 568 | return addPointer( |
| 569 | Index, Start: RtCheck.Pointers[Index].Start, End: RtCheck.Pointers[Index].End, |
| 570 | AS: RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(), |
| 571 | NeedsFreeze: RtCheck.Pointers[Index].NeedsFreeze, SE&: *RtCheck.SE); |
| 572 | } |
| 573 | |
| 574 | bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start, |
| 575 | const SCEV *End, unsigned AS, |
| 576 | bool NeedsFreeze, |
| 577 | ScalarEvolution &SE) { |
| 578 | assert(AddressSpace == AS && |
| 579 | "all pointers in a checking group must be in the same address space" ); |
| 580 | |
| 581 | // Compare the starts and ends with the known minimum and maximum |
| 582 | // of this set. We need to know how we compare against the min/max |
| 583 | // of the set in order to be able to emit memchecks. |
| 584 | const SCEV *Min0 = getMinFromExprs(I: Start, J: Low, SE: &SE); |
| 585 | if (!Min0) |
| 586 | return false; |
| 587 | |
| 588 | const SCEV *Min1 = getMinFromExprs(I: End, J: High, SE: &SE); |
| 589 | if (!Min1) |
| 590 | return false; |
| 591 | |
| 592 | // Update the low bound expression if we've found a new min value. |
| 593 | if (Min0 == Start) |
| 594 | Low = Start; |
| 595 | |
| 596 | // Update the high bound expression if we've found a new max value. |
| 597 | if (Min1 != End) |
| 598 | High = End; |
| 599 | |
| 600 | Members.push_back(Elt: Index); |
| 601 | this->NeedsFreeze |= NeedsFreeze; |
| 602 | return true; |
| 603 | } |
| 604 | |
| 605 | void RuntimePointerChecking::groupChecks( |
| 606 | MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) { |
| 607 | // We build the groups from dependency candidates equivalence classes |
| 608 | // because: |
| 609 | // - We know that pointers in the same equivalence class share |
| 610 | // the same underlying object and therefore there is a chance |
| 611 | // that we can compare pointers |
| 612 | // - We wouldn't be able to merge two pointers for which we need |
| 613 | // to emit a memcheck. The classes in DepCands are already |
| 614 | // conveniently built such that no two pointers in the same |
| 615 | // class need checking against each other. |
| 616 | |
| 617 | // We use the following (greedy) algorithm to construct the groups: |
| 618 | // For every pointer in the equivalence class: |
| 619 | // For each existing group: |
| 620 | // - if the difference between this pointer and the min/max bounds |
| 621 | // of the group is a constant, then make the pointer part of the |
| 622 | // group and update the min/max bounds of that group as required. |
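|  | // |
|  | // Illustrative sketch (hypothetical pointers): given &A[0], &A[4] and |
|  | // &A[100] with constant pairwise differences, the greedy pass can place |
|  | // all three in a single group with Low = &A[0] and High = &A[100] plus the |
|  | // element size, so one (Low, High) range participates in the runtime |
|  | // checks instead of three separate ranges. |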
| 623 | |
| 624 | CheckingGroups.clear(); |
| 625 | |
| 626 | // If we need to check two pointers to the same underlying object |
| 627 | // with a non-constant difference, we shouldn't perform any pointer |
| 628 | // grouping with those pointers. This is because we can easily get |
| 629 | // into cases where the resulting check would return false, even when |
| 630 | // the accesses are safe. |
| 631 | // |
| 632 | // The following example shows this: |
| 633 | // for (i = 0; i < 1000; ++i) |
| 634 | // a[5000 + i * m] = a[i] + a[i + 9000] |
| 635 | // |
| 636 | // Here grouping gives a check of (5000, 5000 + 1000 * m) against |
| 637 | // (0, 10000) which is always false. However, if m is 1, there is no |
| 638 | // dependence. Not grouping the checks for a[i] and a[i + 9000] allows |
| 639 | // us to perform an accurate check in this case. |
| 640 | // |
| 641 | // In the above case, we have a non-constant distance and an Unknown |
| 642 | // dependence between accesses to the same underlying object, and could retry |
| 643 | // with runtime checks. Therefore UseDependencies is false. In this case we |
| 644 | // will use the fallback path and create separate checking groups for all |
| 645 | // pointers. |
| 646 | |
| 647 | // If we don't have the dependency partitions, construct a new |
| 648 | // checking pointer group for each pointer. This is also required |
| 649 | // for correctness, because in this case we can have checking between |
| 650 | // pointers to the same underlying object. |
| 651 | if (!UseDependencies) { |
| 652 | for (unsigned I = 0; I < Pointers.size(); ++I) |
| 653 | CheckingGroups.emplace_back(Args&: I, Args&: *this); |
| 654 | return; |
| 655 | } |
| 656 | |
| 657 | unsigned TotalComparisons = 0; |
| 658 | |
| 659 | DenseMap<Value *, SmallVector<unsigned>> PositionMap; |
| 660 | for (unsigned Index = 0; Index < Pointers.size(); ++Index) |
| 661 | PositionMap[Pointers[Index].PointerValue].push_back(Elt: Index); |
| 662 | |
| 663 | // We need to keep track of what pointers we've already seen so we |
| 664 | // don't process them twice. |
| 665 | SmallSet<unsigned, 2> Seen; |
| 666 | |
| 667 | // Go through all equivalence classes, get the "pointer check groups" |
| 668 | // and add them to the overall solution. We use the order in which accesses |
| 669 | // appear in 'Pointers' to enforce determinism. |
| 670 | for (unsigned I = 0; I < Pointers.size(); ++I) { |
| 671 | // We've seen this pointer before, and therefore already processed |
| 672 | // its equivalence class. |
| 673 | if (Seen.contains(V: I)) |
| 674 | continue; |
| 675 | |
| 676 | MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue, |
| 677 | Pointers[I].IsWritePtr); |
| 678 | |
| 679 | SmallVector<RuntimeCheckingPtrGroup, 2> Groups; |
| 680 | |
| 681 | // Because DepCands is constructed by visiting accesses in the order in |
| 682 | // which they appear in alias sets (which is deterministic) and the |
| 683 | // iteration order within an equivalence class member is only dependent on |
| 684 | // the order in which unions and insertions are performed on the |
| 685 | // equivalence class, the iteration order is deterministic. |
| 686 | for (auto M : DepCands.members(V: Access)) { |
| 687 | auto PointerI = PositionMap.find(Val: M.getPointer()); |
| 688 | // If we can't find the pointer in PositionMap that means we can't |
| 689 | // generate a memcheck for it. |
| 690 | if (PointerI == PositionMap.end()) |
| 691 | continue; |
| 692 | for (unsigned Pointer : PointerI->second) { |
| 693 | bool Merged = false; |
| 694 | // Mark this pointer as seen. |
| 695 | Seen.insert(V: Pointer); |
| 696 | |
| 697 | // Go through all the existing sets and see if we can find one |
| 698 | // which can include this pointer. |
| 699 | for (RuntimeCheckingPtrGroup &Group : Groups) { |
| 700 | // Don't perform more than a certain amount of comparisons. |
| 701 | // This should limit the cost of grouping the pointers to something |
| 702 | // reasonable. If we do end up hitting this threshold, the algorithm |
| 703 | // will create separate groups for all remaining pointers. |
| 704 | if (TotalComparisons > MemoryCheckMergeThreshold) |
| 705 | break; |
| 706 | |
| 707 | TotalComparisons++; |
| 708 | |
| 709 | if (Group.addPointer(Index: Pointer, RtCheck: *this)) { |
| 710 | Merged = true; |
| 711 | break; |
| 712 | } |
| 713 | } |
| 714 | |
| 715 | if (!Merged) |
| 716 | // We couldn't add this pointer to any existing set or the threshold |
| 717 | // for the number of comparisons has been reached. Create a new group |
| 718 | // to hold the current pointer. |
| 719 | Groups.emplace_back(Args&: Pointer, Args&: *this); |
| 720 | } |
| 721 | } |
| 722 | |
| 723 | // We've computed the grouped checks for this partition. |
| 724 | // Save the results and continue with the next one. |
| 725 | llvm::append_range(C&: CheckingGroups, R&: Groups); |
| 726 | } |
| 727 | } |
| 728 | |
| 729 | bool RuntimePointerChecking::arePointersInSamePartition( |
| 730 | const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1, |
| 731 | unsigned PtrIdx2) { |
| 732 | return (PtrToPartition[PtrIdx1] != -1 && |
| 733 | PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]); |
| 734 | } |
| 735 | |
| 736 | bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const { |
| 737 | const PointerInfo &PointerI = Pointers[I]; |
| 738 | const PointerInfo &PointerJ = Pointers[J]; |
| 739 | |
| 740 | // No need to check if two readonly pointers intersect. |
| 741 | if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr) |
| 742 | return false; |
| 743 | |
| 744 | // Only need to check pointers between two different dependency sets. |
| 745 | if (PointerI.DependencySetId == PointerJ.DependencySetId) |
| 746 | return false; |
| 747 | |
| 748 | // Only need to check pointers in the same alias set. |
| 749 | return PointerI.AliasSetId == PointerJ.AliasSetId; |
| 750 | } |
| 751 | |
| 752 | /// Assign each RuntimeCheckingPtrGroup pointer an index for stable UTC output. |
| 753 | static DenseMap<const RuntimeCheckingPtrGroup *, unsigned> |
| 754 | getPtrToIdxMap(ArrayRef<RuntimeCheckingPtrGroup> CheckingGroups) { |
| 755 | DenseMap<const RuntimeCheckingPtrGroup *, unsigned> PtrIndices; |
| 756 | for (const auto &[Idx, CG] : enumerate(First&: CheckingGroups)) |
| 757 | PtrIndices[&CG] = Idx; |
| 758 | return PtrIndices; |
| 759 | } |
| 760 | |
| 761 | void RuntimePointerChecking::printChecks( |
| 762 | raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks, |
| 763 | unsigned Depth) const { |
| 764 | unsigned N = 0; |
| 765 | auto PtrIndices = getPtrToIdxMap(CheckingGroups); |
| 766 | for (const auto &[Check1, Check2] : Checks) { |
| 767 | const auto &First = Check1->Members, &Second = Check2->Members; |
| 768 | OS.indent(NumSpaces: Depth) << "Check " << N++ << ":\n" ; |
| 769 | OS.indent(NumSpaces: Depth + 2) << "Comparing group GRP" << PtrIndices.at(Val: Check1) |
| 770 | << ":\n" ; |
| 771 | for (unsigned K : First) |
| 772 | OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n" ; |
| 773 | OS.indent(NumSpaces: Depth + 2) << "Against group GRP" << PtrIndices.at(Val: Check2) |
| 774 | << ":\n" ; |
| 775 | for (unsigned K : Second) |
| 776 | OS.indent(NumSpaces: Depth + 2) << *Pointers[K].PointerValue << "\n" ; |
| 777 | } |
| 778 | } |
| 779 | |
| 780 | void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const { |
| 781 | |
| 782 | OS.indent(NumSpaces: Depth) << "Run-time memory checks:\n" ; |
| 783 | printChecks(OS, Checks, Depth); |
| 784 | |
| 785 | OS.indent(NumSpaces: Depth) << "Grouped accesses:\n" ; |
| 786 | auto PtrIndices = getPtrToIdxMap(CheckingGroups); |
| 787 | for (const auto &CG : CheckingGroups) { |
| 788 | OS.indent(NumSpaces: Depth + 2) << "Group GRP" << PtrIndices.at(Val: &CG) << ":\n" ; |
| 789 | OS.indent(NumSpaces: Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High |
| 790 | << ")\n" ; |
| 791 | for (unsigned Member : CG.Members) { |
| 792 | OS.indent(NumSpaces: Depth + 6) << "Member: " << *Pointers[Member].Expr << "\n" ; |
| 793 | } |
| 794 | } |
| 795 | } |
| 796 | |
| 797 | namespace { |
| 798 | |
| 799 | /// Analyses memory accesses in a loop. |
| 800 | /// |
| 801 | /// Checks whether run time pointer checks are needed and builds sets for data |
| 802 | /// dependence checking. |
| 803 | class AccessAnalysis { |
| 804 | public: |
| 805 | /// Read or write access location. |
| 806 | typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; |
| 807 | typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList; |
| 808 | |
| 809 | AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI, |
| 810 | DominatorTree &DT, MemoryDepChecker::DepCandidates &DA, |
| 811 | PredicatedScalarEvolution &PSE, |
| 812 | SmallPtrSetImpl<MDNode *> &LoopAliasScopes) |
| 813 | : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA), |
| 814 | PSE(PSE), LoopAliasScopes(LoopAliasScopes) { |
| 815 | // We're analyzing dependences across loop iterations. |
| 816 | BAA.enableCrossIterationMode(); |
| 817 | } |
| 818 | |
| 819 | /// Register a load and whether it is only read from. |
| 820 | void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) { |
| 821 | Value *Ptr = const_cast<Value *>(Loc.Ptr); |
| 822 | AST.add(Loc: adjustLoc(Loc)); |
| 823 | Accesses[MemAccessInfo(Ptr, false)].insert(X: AccessTy); |
| 824 | if (IsReadOnly) |
| 825 | ReadOnlyPtr.insert(Ptr); |
| 826 | } |
| 827 | |
| 828 | /// Register a store. |
| 829 | void addStore(const MemoryLocation &Loc, Type *AccessTy) { |
| 830 | Value *Ptr = const_cast<Value *>(Loc.Ptr); |
| 831 | AST.add(Loc: adjustLoc(Loc)); |
| 832 | Accesses[MemAccessInfo(Ptr, true)].insert(X: AccessTy); |
| 833 | } |
| 834 | |
| 835 | /// Check if we can emit a run-time no-alias check for \p Access. |
| 836 | /// |
| 837 | /// Returns true if we can emit a run-time no alias check for \p Access. |
| 838 | /// If we can check this access, this also adds it to a dependence set and |
| 839 | /// adds a run-time check for it to \p RtCheck. If \p Assume is true, |
| 840 | /// we will attempt to use additional run-time checks in order to get |
| 841 | /// the bounds of the pointer. |
| 842 | bool createCheckForAccess(RuntimePointerChecking &RtCheck, |
| 843 | MemAccessInfo Access, Type *AccessTy, |
| 844 | const DenseMap<Value *, const SCEV *> &Strides, |
| 845 | DenseMap<Value *, unsigned> &DepSetId, |
| 846 | Loop *TheLoop, unsigned &RunningDepId, |
| 847 | unsigned ASId, bool Assume); |
| 848 | |
| 849 | /// Check whether we can check the pointers at runtime for |
| 850 | /// non-intersection. |
| 851 | /// |
| 852 | /// Returns true if we need no check or if we do and we can generate them |
| 853 | /// (i.e. the pointers have computable bounds). A return value of false means |
| 854 | /// we couldn't analyze and generate runtime checks for all pointers in the |
| 855 | /// loop, but if \p AllowPartial is set then we will have checks for those |
| 856 | /// pointers we could analyze. |
| 857 | bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, Loop *TheLoop, |
| 858 | const DenseMap<Value *, const SCEV *> &Strides, |
| 859 | Value *&UncomputablePtr, bool AllowPartial); |
| 860 | |
| 861 | /// Goes over all memory accesses, checks whether a RT check is needed |
| 862 | /// and builds sets of dependent accesses. |
| 863 | void buildDependenceSets() { |
| 864 | processMemAccesses(); |
| 865 | } |
| 866 | |
| 867 | /// Initial processing of memory accesses determined that we need to |
| 868 | /// perform dependency checking. |
| 869 | /// |
| 870 | /// Note that this can later be cleared if we retry memcheck analysis without |
| 871 | /// dependency checking (i.e. ShouldRetryWithRuntimeChecks). |
| 872 | bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); } |
| 873 | |
| 874 | /// We decided that no dependence analysis would be used. Reset the state. |
| 875 | void resetDepChecks(MemoryDepChecker &DepChecker) { |
| 876 | CheckDeps.clear(); |
| 877 | DepChecker.clearDependences(); |
| 878 | } |
| 879 | |
| 880 | const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; } |
| 881 | |
| 882 | private: |
| 883 | typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap; |
| 884 | |
| 885 | /// Adjust the MemoryLocation so that it represents accesses to this |
| 886 | /// location across all iterations, rather than a single one. |
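|  | /// |
|  | /// Sketch: a load of A[i] inside the loop is conservatively treated as |
|  | /// touching an unknown-sized range around A (before/after the pointer), so |
|  | /// the alias set tracker may group it with stores to A[j] made by other |
|  | /// iterations. |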
| 887 | MemoryLocation adjustLoc(MemoryLocation Loc) const { |
| 888 | // The accessed location varies within the loop, but remains within the |
| 889 | // underlying object. |
| 890 | Loc.Size = LocationSize::beforeOrAfterPointer(); |
| 891 | Loc.AATags.Scope = adjustAliasScopeList(ScopeList: Loc.AATags.Scope); |
| 892 | Loc.AATags.NoAlias = adjustAliasScopeList(ScopeList: Loc.AATags.NoAlias); |
| 893 | return Loc; |
| 894 | } |
| 895 | |
| 896 | /// Drop alias scopes that are only valid within a single loop iteration. |
| 897 | MDNode *adjustAliasScopeList(MDNode *ScopeList) const { |
| 898 | if (!ScopeList) |
| 899 | return nullptr; |
| 900 | |
| 901 | // For the sake of simplicity, drop the whole scope list if any scope is |
| 902 | // iteration-local. |
| 903 | if (any_of(Range: ScopeList->operands(), P: [&](Metadata *Scope) { |
| 904 | return LoopAliasScopes.contains(Ptr: cast<MDNode>(Val: Scope)); |
| 905 | })) |
| 906 | return nullptr; |
| 907 | |
| 908 | return ScopeList; |
| 909 | } |
| 910 | |
| 911 | /// Go over all memory access and check whether runtime pointer checks |
| 912 | /// are needed and build sets of dependency check candidates. |
| 913 | void processMemAccesses(); |
| 914 | |
| 915 | /// Map of all accesses. Values are the types used to access memory pointed to |
| 916 | /// by the pointer. |
| 917 | PtrAccessMap Accesses; |
| 918 | |
| 919 | /// The loop being checked. |
| 920 | const Loop *TheLoop; |
| 921 | |
| 922 | /// List of accesses that need a further dependence check. |
| 923 | MemAccessInfoList CheckDeps; |
| 924 | |
| 925 | /// Set of pointers that are read only. |
| 926 | SmallPtrSet<Value*, 16> ReadOnlyPtr; |
| 927 | |
| 928 | /// Batched alias analysis results. |
| 929 | BatchAAResults BAA; |
| 930 | |
| 931 | /// An alias set tracker to partition the access set by underlying object and |
| 932 | /// intrinsic property (such as TBAA metadata). |
| 933 | AliasSetTracker AST; |
| 934 | |
| 935 | /// The LoopInfo of the loop being checked. |
| 936 | const LoopInfo *LI; |
| 937 | |
| 938 | /// The dominator tree of the function. |
| 939 | DominatorTree &DT; |
| 940 | |
| 941 | /// Sets of potentially dependent accesses - members of one set share an |
| 942 | /// underlying pointer. The set "CheckDeps" identifies which sets really need a |
| 943 | /// dependence check. |
| 944 | MemoryDepChecker::DepCandidates &DepCands; |
| 945 | |
| 946 | /// Initial processing of memory accesses determined that we may need |
| 947 | /// to add memchecks. Perform the analysis to determine the necessary checks. |
| 948 | /// |
| 949 | /// Note that this is different from isDependencyCheckNeeded. When we retry |
| 950 | /// memcheck analysis without dependency checking |
| 951 | /// (i.e. ShouldRetryWithRuntimeChecks), isDependencyCheckNeeded is |
| 952 | /// cleared while this remains set if we have potentially dependent accesses. |
| 953 | bool IsRTCheckAnalysisNeeded = false; |
| 954 | |
| 955 | /// The SCEV predicate containing all the SCEV-related assumptions. |
| 956 | PredicatedScalarEvolution &PSE; |
| 957 | |
| 958 | DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects; |
| 959 | |
| 960 | /// Alias scopes that are declared inside the loop, and as such not valid |
| 961 | /// across iterations. |
| 962 | SmallPtrSetImpl<MDNode *> &LoopAliasScopes; |
| 963 | }; |
| 964 | |
| 965 | } // end anonymous namespace |
| 966 | |
| 967 | /// Try to compute a constant stride for \p AR. Used by getPtrStride and |
| 968 | /// isNoWrap. |
| 969 | static std::optional<int64_t> |
| 970 | getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, |
| 971 | Value *Ptr, PredicatedScalarEvolution &PSE) { |
| 972 | if (isa<ScalableVectorType>(Val: AccessTy)) { |
| 973 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy |
| 974 | << "\n" ); |
| 975 | return std::nullopt; |
| 976 | } |
| 977 | |
| 978 | // The access function must stride over the innermost loop. |
| 979 | if (Lp != AR->getLoop()) { |
| 980 | LLVM_DEBUG({ |
| 981 | dbgs() << "LAA: Bad stride - Not striding over innermost loop " ; |
| 982 | if (Ptr) |
| 983 | dbgs() << *Ptr << " " ; |
| 984 | |
| 985 | dbgs() << "SCEV: " << *AR << "\n" ; |
| 986 | }); |
| 987 | return std::nullopt; |
| 988 | } |
| 989 | |
| 990 | // Check the step is constant. |
| 991 | const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE()); |
| 992 | |
| 993 | // Calculate the pointer stride and check if it is constant. |
| 994 | const APInt *APStepVal; |
| 995 | if (!match(S: Step, P: m_scev_APInt(C&: APStepVal))) { |
| 996 | LLVM_DEBUG({ |
| 997 | dbgs() << "LAA: Bad stride - Not a constant strided " ; |
| 998 | if (Ptr) |
| 999 | dbgs() << *Ptr << " " ; |
| 1000 | dbgs() << "SCEV: " << *AR << "\n" ; |
| 1001 | }); |
| 1002 | return std::nullopt; |
| 1003 | } |
| 1004 | |
| 1005 | const auto &DL = Lp->getHeader()->getDataLayout(); |
| 1006 | TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy); |
| 1007 | int64_t Size = AllocSize.getFixedValue(); |
| 1008 | |
| 1009 | // Huge step value - give up. |
| 1010 | std::optional<int64_t> StepVal = APStepVal->trySExtValue(); |
| 1011 | if (!StepVal) |
| 1012 | return std::nullopt; |
| 1013 | |
| 1014 | // Strided access. |
| 1015 | return *StepVal % Size ? std::nullopt : std::make_optional(t: *StepVal / Size); |
| 1016 | } |
| 1017 | |
| 1018 | /// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use |
| 1019 | /// information from the IR pointer value to determine no-wrap. |
| 1020 | static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, |
| 1021 | Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, |
| 1022 | const DominatorTree &DT, |
| 1023 | std::optional<int64_t> Stride = std::nullopt) { |
| 1024 | // FIXME: This should probably only return true for NUW. |
| 1025 | if (AR->getNoWrapFlags(Mask: SCEV::NoWrapMask)) |
| 1026 | return true; |
| 1027 | |
| 1028 | if (Ptr && PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW)) |
| 1029 | return true; |
| 1030 | |
| 1031 | // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap, |
| 1032 | // the distance between the previously accessed location and the wrapped |
| 1033 | // location will be larger than half the pointer index type space. In that |
| 1034 | // case, the GEP would be poison and any memory access dependent on it would |
| 1035 | // be immediate UB when executed. |
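|  | // Sketch of the argument (assuming a 64-bit pointer index type): wrapping |
|  | // between two consecutive accesses would require stepping by more than |
|  | // 2^63 bytes, which an nusw GEP cannot do without being poison, so any |
|  | // dependent load or store would be immediate UB when executed. |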
| 1036 | if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Val: Ptr); |
| 1037 | GEP && GEP->hasNoUnsignedSignedWrap()) { |
| 1038 | // For the above reasoning to apply, the pointer must be dereferenced in |
| 1039 | // every iteration. |
| 1040 | if (L->getHeader() == L->getLoopLatch() || |
| 1041 | any_of(Range: GEP->users(), P: [L, &DT, GEP](User *U) { |
| 1042 | if (getLoadStorePointerOperand(V: U) != GEP) |
| 1043 | return false; |
| 1044 | BasicBlock *UserBB = cast<Instruction>(Val: U)->getParent(); |
| 1045 | if (!L->contains(BB: UserBB)) |
| 1046 | return false; |
| 1047 | return !LoopAccessInfo::blockNeedsPredication(BB: UserBB, TheLoop: L, DT: &DT); |
| 1048 | })) |
| 1049 | return true; |
| 1050 | } |
| 1051 | |
| 1052 | if (!Stride) |
| 1053 | Stride = getStrideFromAddRec(AR, Lp: L, AccessTy, Ptr, PSE); |
| 1054 | if (Stride) { |
| 1055 | // If the null pointer is undefined, then an access sequence which would |
| 1056 | // otherwise access it can be assumed not to unsigned wrap. Note that this |
| 1057 | // assumes the object in memory is aligned to the natural alignment. |
| 1058 | unsigned AddrSpace = AR->getType()->getPointerAddressSpace(); |
| 1059 | if (!NullPointerIsDefined(F: L->getHeader()->getParent(), AS: AddrSpace) && |
| 1060 | (Stride == 1 || Stride == -1)) |
| 1061 | return true; |
| 1062 | } |
| 1063 | |
| 1064 | if (Ptr && Assume) { |
| 1065 | PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW); |
| 1066 | LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n" |
| 1067 | << "LAA: Pointer: " << *Ptr << "\n" |
| 1068 | << "LAA: SCEV: " << *AR << "\n" |
| 1069 | << "LAA: Added an overflow assumption\n" ); |
| 1070 | return true; |
| 1071 | } |
| 1072 | |
| 1073 | return false; |
| 1074 | } |
| 1075 | |
| 1076 | static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, |
| 1077 | function_ref<void(Value *)> AddPointer) { |
| 1078 | SmallPtrSet<Value *, 8> Visited; |
| 1079 | SmallVector<Value *> WorkList; |
| 1080 | WorkList.push_back(Elt: StartPtr); |
| 1081 | |
| 1082 | while (!WorkList.empty()) { |
| 1083 | Value *Ptr = WorkList.pop_back_val(); |
| 1084 | if (!Visited.insert(Ptr).second) |
| 1085 | continue; |
| 1086 | auto *PN = dyn_cast<PHINode>(Val: Ptr); |
| 1087 | // SCEV does not look through non-header PHIs inside the loop. Such phis |
| 1088 | // can be analyzed by adding separate accesses for each incoming pointer |
| 1089 | // value. |
| 1090 | if (PN && InnermostLoop.contains(BB: PN->getParent()) && |
| 1091 | PN->getParent() != InnermostLoop.getHeader()) { |
| 1092 | llvm::append_range(C&: WorkList, R: PN->incoming_values()); |
| 1093 | } else |
| 1094 | AddPointer(Ptr); |
| 1095 | } |
| 1096 | } |
| 1097 | |
| 1098 | // Walk back through the IR for a pointer, looking for a select like the |
| 1099 | // following: |
| 1100 | // |
| 1101 | // %offset = select i1 %cmp, i64 %a, i64 %b |
| 1102 | // %addr = getelementptr double, double* %base, i64 %offset |
| 1103 | // %ld = load double, double* %addr, align 8 |
| 1104 | // |
| 1105 | // We won't be able to form a single SCEVAddRecExpr from this since the |
| 1106 | // address for each loop iteration depends on %cmp. We could potentially |
| 1107 | // produce multiple valid SCEVAddRecExprs, though, and check all of them for |
| 1108 | // memory safety/aliasing if needed. |
| 1109 | // |
| 1110 | // If we encounter some IR we don't yet handle, or something obviously fine |
| 1111 | // like a constant, then we just add the SCEV for that term to the list passed |
| 1112 | // in by the caller. If we have a node that may potentially yield a valid |
| 1113 | // SCEVAddRecExpr then we decompose it into parts and build the SCEV terms |
| 1114 | // ourselves before adding to the list. |
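|  | // |
|  | // For the select example above, the decomposition would (roughly) yield two |
|  | // candidate SCEVs for %addr, one built from %a and one from %b, e.g. |
|  | // (%base + 8 * %a) and (%base + 8 * %b), each of which may then form a |
|  | // SCEVAddRecExpr in the surrounding loop if %a/%b are affine in it. |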
| 1115 | static void findForkedSCEVs( |
| 1116 | ScalarEvolution *SE, const Loop *L, Value *Ptr, |
| 1117 | SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList, |
| 1118 | unsigned Depth) { |
| 1119 | // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or |
| 1120 | // we've exceeded our limit on recursion, just return whatever we have |
| 1121 | // regardless of whether it can be used for a forked pointer or not, along |
| 1122 | // with an indication of whether it might be a poison or undef value. |
| 1123 | const SCEV *Scev = SE->getSCEV(V: Ptr); |
| 1124 | if (isa<SCEVAddRecExpr>(Val: Scev) || L->isLoopInvariant(V: Ptr) || |
| 1125 | !isa<Instruction>(Val: Ptr) || Depth == 0) { |
| 1126 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
| 1127 | return; |
| 1128 | } |
| 1129 | |
| 1130 | Depth--; |
| 1131 | |
| 1132 | auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) { |
| 1133 | return get<1>(Pair: S); |
| 1134 | }; |
| 1135 | |
| 1136 | auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) { |
| 1137 | switch (Opcode) { |
| 1138 | case Instruction::Add: |
| 1139 | return SE->getAddExpr(LHS: L, RHS: R); |
| 1140 | case Instruction::Sub: |
| 1141 | return SE->getMinusSCEV(LHS: L, RHS: R); |
| 1142 | default: |
| 1143 | llvm_unreachable("Unexpected binary operator when walking ForkedPtrs" ); |
| 1144 | } |
| 1145 | }; |
| 1146 | |
| 1147 | Instruction *I = cast<Instruction>(Val: Ptr); |
| 1148 | unsigned Opcode = I->getOpcode(); |
| 1149 | switch (Opcode) { |
| 1150 | case Instruction::GetElementPtr: { |
| 1151 | auto *GEP = cast<GetElementPtrInst>(Val: I); |
| 1152 | Type *SourceTy = GEP->getSourceElementType(); |
| 1153 | // We only handle base + single offset GEPs here for now. |
| 1154 | // Not dealing with preexisting gathers yet, so no vectors. |
| 1155 | if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) { |
| 1156 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: GEP)); |
| 1157 | break; |
| 1158 | } |
| 1159 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs; |
| 1160 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs; |
| 1161 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: BaseScevs, Depth); |
| 1162 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: OffsetScevs, Depth); |
| 1163 | |
| 1164 | // See if we need to freeze our fork... |
| 1165 | bool NeedsFreeze = any_of(Range&: BaseScevs, P: UndefPoisonCheck) || |
| 1166 | any_of(Range&: OffsetScevs, P: UndefPoisonCheck); |
| 1167 | |
| 1168 | // Check that we only have a single fork, on either the base or the offset. |
| 1169 | // Copy the SCEV across for the one without a fork in order to generate |
| 1170 | // the full SCEV for both sides of the GEP. |
| 1171 | if (OffsetScevs.size() == 2 && BaseScevs.size() == 1) |
| 1172 | BaseScevs.push_back(Elt: BaseScevs[0]); |
| 1173 | else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1) |
| 1174 | OffsetScevs.push_back(Elt: OffsetScevs[0]); |
| 1175 | else { |
| 1176 | ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze); |
| 1177 | break; |
| 1178 | } |
| 1179 | |
| 1180 | Type *IntPtrTy = SE->getEffectiveSCEVType(Ty: GEP->getPointerOperandType()); |
| 1181 | |
| 1182 | // Find the size of the type being pointed to. We only have a single |
| 1183 | // index term (guarded above) so we don't need to index into arrays or |
| 1184 | // structures, just get the size of the scalar value. |
| 1185 | const SCEV *Size = SE->getSizeOfExpr(IntTy: IntPtrTy, AllocTy: SourceTy); |
| 1186 | |
| 1187 | for (auto [B, O] : zip(t&: BaseScevs, u&: OffsetScevs)) { |
| 1188 | const SCEV *Base = get<0>(Pair: B); |
| 1189 | const SCEV *Offset = get<0>(Pair: O); |
| 1190 | |
| 1191 | // Scale up the offsets by the size of the type, then add to the bases. |
| 1192 | const SCEV *Scaled = |
| 1193 | SE->getMulExpr(LHS: Size, RHS: SE->getTruncateOrSignExtend(V: Offset, Ty: IntPtrTy)); |
| 1194 | ScevList.emplace_back(Args: SE->getAddExpr(LHS: Base, RHS: Scaled), Args&: NeedsFreeze); |
| 1195 | } |
| 1196 | break; |
| 1197 | } |
| 1198 | case Instruction::Select: { |
| 1199 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs; |
// A select means we've found a forked pointer, but we currently only
// support a single select per pointer, so if there's another behind this
// one we just bail out and return the generic SCEV.
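// For example (illustrative): "%p = select i1 %c, ptr %a, ptr %b" yields one
// child SCEV for %a and one for %b, which together form the fork.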
| 1203 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth); |
| 1204 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 2), ScevList&: ChildScevs, Depth); |
| 1205 | if (ChildScevs.size() == 2) |
| 1206 | append_range(C&: ScevList, R&: ChildScevs); |
| 1207 | else |
| 1208 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
| 1209 | break; |
| 1210 | } |
| 1211 | case Instruction::PHI: { |
| 1212 | SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs; |
// A phi means we've found a forked pointer, but we currently only
// support a single phi per pointer, so if there's another behind this
// one we just bail out and return the generic SCEV.
| 1216 | if (I->getNumOperands() == 2) { |
| 1217 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: ChildScevs, Depth); |
| 1218 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: ChildScevs, Depth); |
| 1219 | } |
| 1220 | if (ChildScevs.size() == 2) |
| 1221 | append_range(C&: ScevList, R&: ChildScevs); |
| 1222 | else |
| 1223 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
| 1224 | break; |
| 1225 | } |
| 1226 | case Instruction::Add: |
| 1227 | case Instruction::Sub: { |
| 1228 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs; |
| 1229 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs; |
| 1230 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 0), ScevList&: LScevs, Depth); |
| 1231 | findForkedSCEVs(SE, L, Ptr: I->getOperand(i: 1), ScevList&: RScevs, Depth); |
| 1232 | |
| 1233 | // See if we need to freeze our fork... |
| 1234 | bool NeedsFreeze = |
| 1235 | any_of(Range&: LScevs, P: UndefPoisonCheck) || any_of(Range&: RScevs, P: UndefPoisonCheck); |
| 1236 | |
| 1237 | // Check that we only have a single fork, on either the left or right side. |
| 1238 | // Copy the SCEV across for the one without a fork in order to generate |
| 1239 | // the full SCEV for both sides of the BinOp. |
| 1240 | if (LScevs.size() == 2 && RScevs.size() == 1) |
| 1241 | RScevs.push_back(Elt: RScevs[0]); |
| 1242 | else if (RScevs.size() == 2 && LScevs.size() == 1) |
| 1243 | LScevs.push_back(Elt: LScevs[0]); |
| 1244 | else { |
| 1245 | ScevList.emplace_back(Args&: Scev, Args&: NeedsFreeze); |
| 1246 | break; |
| 1247 | } |
| 1248 | |
| 1249 | for (auto [L, R] : zip(t&: LScevs, u&: RScevs)) |
| 1250 | ScevList.emplace_back(Args: GetBinOpExpr(Opcode, get<0>(Pair: L), get<0>(Pair: R)), |
| 1251 | Args&: NeedsFreeze); |
| 1252 | break; |
| 1253 | } |
| 1254 | default: |
| 1255 | // Just return the current SCEV if we haven't handled the instruction yet. |
| 1256 | LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n" ); |
| 1257 | ScevList.emplace_back(Args&: Scev, Args: !isGuaranteedNotToBeUndefOrPoison(V: Ptr)); |
| 1258 | break; |
| 1259 | } |
| 1260 | } |
| 1261 | |
| 1262 | bool AccessAnalysis::createCheckForAccess( |
| 1263 | RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy, |
| 1264 | const DenseMap<Value *, const SCEV *> &StridesMap, |
| 1265 | DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop, |
| 1266 | unsigned &RunningDepId, unsigned ASId, bool Assume) { |
| 1267 | Value *Ptr = Access.getPointer(); |
| 1268 | ScalarEvolution *SE = PSE.getSE(); |
| 1269 | assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!" ); |
| 1270 | |
| 1271 | SmallVector<PointerIntPair<const SCEV *, 1, bool>> RTCheckPtrs; |
| 1272 | findForkedSCEVs(SE, L: TheLoop, Ptr, ScevList&: RTCheckPtrs, Depth: MaxForkedSCEVDepth); |
| 1273 | assert(!RTCheckPtrs.empty() && |
| 1274 | "Must have some runtime-check pointer candidates" ); |
| 1275 | |
// RTCheckPtrs has two entries if there are forked pointers; otherwise there
// are no forked pointers, and the single SCEV produced by
// replaceSymbolicStrideSCEV is used instead.
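// For example (illustrative), for a store through "%p = select i1 %c, ptr
// %gep.a, ptr %gep.b" inside the loop, both the A-side and the B-side SCEVs
// become runtime-check pointer candidates, and a check is inserted for each.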
| 1278 | auto IsLoopInvariantOrAR = |
| 1279 | [&SE, &TheLoop](const PointerIntPair<const SCEV *, 1, bool> &P) { |
| 1280 | return SE->isLoopInvariant(S: P.getPointer(), L: TheLoop) || |
| 1281 | isa<SCEVAddRecExpr>(Val: P.getPointer()); |
| 1282 | }; |
| 1283 | if (RTCheckPtrs.size() == 2 && all_of(Range&: RTCheckPtrs, P: IsLoopInvariantOrAR)) { |
| 1284 | LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n" ; |
| 1285 | for (const auto &[Idx, Q] : enumerate(RTCheckPtrs)) dbgs() |
| 1286 | << "\t(" << Idx << ") " << *Q.getPointer() << "\n" ); |
| 1287 | } else { |
| 1288 | RTCheckPtrs = {{replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr), false}}; |
| 1289 | } |
| 1290 | |
// Check whether all pointers can participate in a runtime bounds check. They
// must either be invariant or non-wrapping affine AddRecs.
| 1293 | for (auto &P : RTCheckPtrs) { |
// The bounds for a loop-invariant pointer are trivial.
| 1295 | if (SE->isLoopInvariant(S: P.getPointer(), L: TheLoop)) |
| 1296 | continue; |
| 1297 | |
| 1298 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: P.getPointer()); |
| 1299 | if (!AR && Assume) |
| 1300 | AR = PSE.getAsAddRec(V: Ptr); |
| 1301 | if (!AR || !AR->isAffine()) |
| 1302 | return false; |
| 1303 | |
| 1304 | // If there's only one option for Ptr, look it up after bounds and wrap |
| 1305 | // checking, because assumptions might have been added to PSE. |
| 1306 | if (RTCheckPtrs.size() == 1) { |
| 1307 | AR = |
| 1308 | cast<SCEVAddRecExpr>(Val: replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr)); |
| 1309 | P.setPointer(AR); |
| 1310 | } |
| 1311 | |
| 1312 | if (!isNoWrap(PSE, AR, Ptr: RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy, |
| 1313 | L: TheLoop, Assume, DT)) |
| 1314 | return false; |
| 1315 | } |
| 1316 | |
| 1317 | for (const auto &[PtrExpr, NeedsFreeze] : RTCheckPtrs) { |
| 1318 | // The id of the dependence set. |
| 1319 | unsigned DepId; |
| 1320 | |
| 1321 | if (isDependencyCheckNeeded()) { |
| 1322 | Value *Leader = DepCands.getLeaderValue(V: Access).getPointer(); |
| 1323 | unsigned &LeaderId = DepSetId[Leader]; |
| 1324 | if (!LeaderId) |
| 1325 | LeaderId = RunningDepId++; |
| 1326 | DepId = LeaderId; |
| 1327 | } else |
| 1328 | // Each access has its own dependence set. |
| 1329 | DepId = RunningDepId++; |
| 1330 | |
| 1331 | bool IsWrite = Access.getInt(); |
| 1332 | RtCheck.insert(Lp: TheLoop, Ptr, PtrExpr, AccessTy, WritePtr: IsWrite, DepSetId: DepId, ASId, PSE, |
| 1333 | NeedsFreeze); |
| 1334 | LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); |
| 1335 | } |
| 1336 | |
| 1337 | return true; |
| 1338 | } |
| 1339 | |
| 1340 | bool AccessAnalysis::canCheckPtrAtRT( |
| 1341 | RuntimePointerChecking &RtCheck, Loop *TheLoop, |
| 1342 | const DenseMap<Value *, const SCEV *> &StridesMap, Value *&UncomputablePtr, |
| 1343 | bool AllowPartial) { |
| 1344 | // Find pointers with computable bounds. We are going to use this information |
| 1345 | // to place a runtime bound check. |
| 1346 | bool CanDoRT = true; |
| 1347 | |
| 1348 | bool MayNeedRTCheck = false; |
| 1349 | if (!IsRTCheckAnalysisNeeded) return true; |
| 1350 | |
| 1351 | bool IsDepCheckNeeded = isDependencyCheckNeeded(); |
| 1352 | |
// We assign a consecutive id to accesses from different alias sets.
// Accesses in different alias sets don't need to be checked against each
// other.
| 1355 | unsigned ASId = 0; |
| 1356 | for (const auto &AS : AST) { |
| 1357 | int NumReadPtrChecks = 0; |
| 1358 | int NumWritePtrChecks = 0; |
| 1359 | bool CanDoAliasSetRT = true; |
| 1360 | ++ASId; |
| 1361 | auto ASPointers = AS.getPointers(); |
| 1362 | |
// We assign a consecutive id to accesses from different dependence sets.
| 1364 | // Accesses within the same set don't need a runtime check. |
| 1365 | unsigned RunningDepId = 1; |
| 1366 | DenseMap<Value *, unsigned> DepSetId; |
| 1367 | |
| 1368 | SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries; |
| 1369 | |
| 1370 | // First, count how many write and read accesses are in the alias set. Also |
| 1371 | // collect MemAccessInfos for later. |
| 1372 | SmallVector<MemAccessInfo, 4> AccessInfos; |
| 1373 | for (const Value *ConstPtr : ASPointers) { |
| 1374 | Value *Ptr = const_cast<Value *>(ConstPtr); |
| 1375 | bool IsWrite = Accesses.contains(Key: MemAccessInfo(Ptr, true)); |
| 1376 | if (IsWrite) |
| 1377 | ++NumWritePtrChecks; |
| 1378 | else |
| 1379 | ++NumReadPtrChecks; |
| 1380 | AccessInfos.emplace_back(Args&: Ptr, Args&: IsWrite); |
| 1381 | } |
| 1382 | |
// We do not need runtime checks for this alias set if there are no writes,
// or only a single write and no reads.
| 1385 | if (NumWritePtrChecks == 0 || |
| 1386 | (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) { |
| 1387 | assert((ASPointers.size() <= 1 || |
| 1388 | all_of(ASPointers, |
| 1389 | [this](const Value *Ptr) { |
| 1390 | MemAccessInfo AccessWrite(const_cast<Value *>(Ptr), |
| 1391 | true); |
| 1392 | return !DepCands.contains(AccessWrite); |
| 1393 | })) && |
| 1394 | "Can only skip updating CanDoRT below, if all entries in AS " |
| 1395 | "are reads or there is at most 1 entry" ); |
| 1396 | continue; |
| 1397 | } |
| 1398 | |
| 1399 | for (auto &Access : AccessInfos) { |
| 1400 | for (const auto &AccessTy : Accesses[Access]) { |
| 1401 | if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, |
| 1402 | DepSetId, TheLoop, RunningDepId, ASId, |
| 1403 | Assume: false)) { |
| 1404 | LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" |
| 1405 | << *Access.getPointer() << '\n'); |
| 1406 | Retries.emplace_back(Args&: Access, Args: AccessTy); |
| 1407 | CanDoAliasSetRT = false; |
| 1408 | } |
| 1409 | } |
| 1410 | } |
| 1411 | |
| 1412 | // Note that this function computes CanDoRT and MayNeedRTCheck |
// independently. For example, CanDoRT=false and MayNeedRTCheck=false means
// that we have a pointer for which we couldn't find the bounds, but we don't
// actually need to emit any checks, so it does not matter.
| 1416 | // |
// We need runtime checks for this alias set if there are at least 2
| 1418 | // dependence sets (in which case RunningDepId > 2) or if we need to re-try |
| 1419 | // any bound checks (because in that case the number of dependence sets is |
| 1420 | // incomplete). |
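// For example (illustrative): if the alias set contains accesses to two
// distinct underlying objects A and B, they land in two dependence sets, so
// RunningDepId ends up as 3 (> 2) and a runtime check between the A and B
// pointers is required.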
| 1421 | bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty(); |
| 1422 | |
| 1423 | // We need to perform run-time alias checks, but some pointers had bounds |
| 1424 | // that couldn't be checked. |
| 1425 | if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) { |
| 1426 | // Reset the CanDoSetRt flag and retry all accesses that have failed. |
| 1427 | // We know that we need these checks, so we can now be more aggressive |
| 1428 | // and add further checks if required (overflow checks). |
| 1429 | CanDoAliasSetRT = true; |
| 1430 | for (const auto &[Access, AccessTy] : Retries) { |
| 1431 | if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, |
| 1432 | DepSetId, TheLoop, RunningDepId, ASId, |
| 1433 | /*Assume=*/true)) { |
| 1434 | CanDoAliasSetRT = false; |
| 1435 | UncomputablePtr = Access.getPointer(); |
| 1436 | if (!AllowPartial) |
| 1437 | break; |
| 1438 | } |
| 1439 | } |
| 1440 | } |
| 1441 | |
| 1442 | CanDoRT &= CanDoAliasSetRT; |
| 1443 | MayNeedRTCheck |= NeedsAliasSetRTCheck; |
| 1445 | } |
| 1446 | |
| 1447 | // If the pointers that we would use for the bounds comparison have different |
| 1448 | // address spaces, assume the values aren't directly comparable, so we can't |
| 1449 | // use them for the runtime check. We also have to assume they could |
| 1450 | // overlap. In the future there should be metadata for whether address spaces |
| 1451 | // are disjoint. |
| 1452 | unsigned NumPointers = RtCheck.Pointers.size(); |
| 1453 | for (unsigned i = 0; i < NumPointers; ++i) { |
| 1454 | for (unsigned j = i + 1; j < NumPointers; ++j) { |
| 1455 | // Only need to check pointers between two different dependency sets. |
| 1456 | if (RtCheck.Pointers[i].DependencySetId == |
| 1457 | RtCheck.Pointers[j].DependencySetId) |
| 1458 | continue; |
| 1459 | // Only need to check pointers in the same alias set. |
| 1460 | if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId) |
| 1461 | continue; |
| 1462 | |
| 1463 | Value *PtrI = RtCheck.Pointers[i].PointerValue; |
| 1464 | Value *PtrJ = RtCheck.Pointers[j].PointerValue; |
| 1465 | |
| 1466 | unsigned ASi = PtrI->getType()->getPointerAddressSpace(); |
| 1467 | unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); |
| 1468 | if (ASi != ASj) { |
| 1469 | LLVM_DEBUG( |
| 1470 | dbgs() << "LAA: Runtime check would require comparison between" |
| 1471 | " different address spaces\n" ); |
| 1472 | return false; |
| 1473 | } |
| 1474 | } |
| 1475 | } |
| 1476 | |
| 1477 | if (MayNeedRTCheck && (CanDoRT || AllowPartial)) |
| 1478 | RtCheck.generateChecks(DepCands, UseDependencies: IsDepCheckNeeded); |
| 1479 | |
| 1480 | LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks() |
| 1481 | << " pointer comparisons.\n" ); |
| 1482 | |
| 1483 | // If we can do run-time checks, but there are no checks, no runtime checks |
| 1484 | // are needed. This can happen when all pointers point to the same underlying |
| 1485 | // object for example. |
| 1486 | RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck; |
| 1487 | |
| 1488 | bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT; |
| 1489 | assert(CanDoRTIfNeeded == (CanDoRT || !MayNeedRTCheck) && |
| 1490 | "CanDoRTIfNeeded depends on RtCheck.Need" ); |
| 1491 | if (!CanDoRTIfNeeded && !AllowPartial) |
| 1492 | RtCheck.reset(); |
| 1493 | return CanDoRTIfNeeded; |
| 1494 | } |
| 1495 | |
| 1496 | void AccessAnalysis::processMemAccesses() { |
| 1497 | // We process the set twice: first we process read-write pointers, last we |
| 1498 | // process read-only pointers. This allows us to skip dependence tests for |
| 1499 | // read-only pointers. |
| 1500 | |
| 1501 | LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n" ); |
| 1502 | LLVM_DEBUG(dbgs() << " AST: " ; AST.dump()); |
| 1503 | LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n" ); |
| 1504 | LLVM_DEBUG({ |
| 1505 | for (const auto &[A, _] : Accesses) |
| 1506 | dbgs() << "\t" << *A.getPointer() << " (" |
| 1507 | << (A.getInt() |
| 1508 | ? "write" |
| 1509 | : (ReadOnlyPtr.contains(A.getPointer()) ? "read-only" |
| 1510 | : "read" )) |
| 1511 | << ")\n" ; |
| 1512 | }); |
| 1513 | |
| 1514 | // The AliasSetTracker has nicely partitioned our pointers by metadata |
| 1515 | // compatibility and potential for underlying-object overlap. As a result, we |
| 1516 | // only need to check for potential pointer dependencies within each alias |
| 1517 | // set. |
| 1518 | for (const auto &AS : AST) { |
// Note that both the alias-set tracker and the alias sets themselves use
// ordered collections internally, so the iteration order here is
| 1521 | // deterministic. |
| 1522 | auto ASPointers = AS.getPointers(); |
| 1523 | |
| 1524 | bool SetHasWrite = false; |
| 1525 | |
| 1526 | // Map of (pointer to underlying objects, accessed address space) to last |
| 1527 | // access encountered. |
| 1528 | typedef DenseMap<std::pair<const Value *, unsigned>, MemAccessInfo> |
| 1529 | UnderlyingObjToAccessMap; |
| 1530 | UnderlyingObjToAccessMap ObjToLastAccess; |
| 1531 | |
// Set of accesses to check after all writes have been processed.
| 1533 | PtrAccessMap DeferredAccesses; |
| 1534 | |
| 1535 | // Iterate over each alias set twice, once to process read/write pointers, |
| 1536 | // and then to process read-only pointers. |
| 1537 | for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { |
| 1538 | bool UseDeferred = SetIteration > 0; |
| 1539 | PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses; |
| 1540 | |
| 1541 | for (const Value *ConstPtr : ASPointers) { |
| 1542 | Value *Ptr = const_cast<Value *>(ConstPtr); |
| 1543 | |
| 1544 | // For a single memory access in AliasSetTracker, Accesses may contain |
| 1545 | // both read and write, and they both need to be handled for CheckDeps. |
| 1546 | for (const auto &[AC, _] : S) { |
| 1547 | if (AC.getPointer() != Ptr) |
| 1548 | continue; |
| 1549 | |
| 1550 | bool IsWrite = AC.getInt(); |
| 1551 | |
| 1552 | // If we're using the deferred access set, then it contains only |
| 1553 | // reads. |
| 1554 | bool IsReadOnlyPtr = ReadOnlyPtr.contains(Ptr) && !IsWrite; |
| 1555 | if (UseDeferred && !IsReadOnlyPtr) |
| 1556 | continue; |
| 1557 | // Otherwise, the pointer must be in the PtrAccessSet, either as a |
| 1558 | // read or a write. |
| 1559 | assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || |
| 1560 | S.contains(MemAccessInfo(Ptr, false))) && |
| 1561 | "Alias-set pointer not in the access set?" ); |
| 1562 | |
| 1563 | MemAccessInfo Access(Ptr, IsWrite); |
| 1564 | DepCands.insert(Data: Access); |
| 1565 | |
| 1566 | // Memorize read-only pointers for later processing and skip them in |
| 1567 | // the first round (they need to be checked after we have seen all |
// write pointers). Note: we also mark pointers that are not
| 1569 | // consecutive as "read-only" pointers (so that we check |
| 1570 | // "a[b[i]] +="). Hence, we need the second check for "!IsWrite". |
| 1571 | if (!UseDeferred && IsReadOnlyPtr) { |
| 1572 | // We only use the pointer keys, the types vector values don't |
| 1573 | // matter. |
| 1574 | DeferredAccesses.insert(KV: {Access, {}}); |
| 1575 | continue; |
| 1576 | } |
| 1577 | |
// If this is a write, check other reads and writes for conflicts. If
// this is a read, only check other writes for conflicts (but only if
| 1580 | // there is no other write to the ptr - this is an optimization to |
| 1581 | // catch "a[i] = a[i] + " without having to do a dependence check). |
| 1582 | if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { |
| 1583 | CheckDeps.push_back(Elt: Access); |
| 1584 | IsRTCheckAnalysisNeeded = true; |
| 1585 | } |
| 1586 | |
| 1587 | if (IsWrite) |
| 1588 | SetHasWrite = true; |
| 1589 | |
| 1590 | // Create sets of pointers connected by a shared alias set and |
| 1591 | // underlying object. |
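// For example (illustrative): two pointers derived from the same alloca share
// that alloca as underlying object, so their accesses are unioned into the
// same dependence set and need no runtime check against each other.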
| 1592 | SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr]; |
| 1593 | UOs = {}; |
| 1594 | ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI); |
| 1595 | LLVM_DEBUG(dbgs() |
| 1596 | << "Underlying objects for pointer " << *Ptr << "\n" ); |
| 1597 | for (const Value *UnderlyingObj : UOs) { |
// nullptr never aliases; don't join sets for pointers that have "null"
| 1599 | // in their UnderlyingObjects list. |
| 1600 | if (isa<ConstantPointerNull>(Val: UnderlyingObj) && |
| 1601 | !NullPointerIsDefined( |
| 1602 | F: TheLoop->getHeader()->getParent(), |
| 1603 | AS: UnderlyingObj->getType()->getPointerAddressSpace())) |
| 1604 | continue; |
| 1605 | |
| 1606 | auto [It, Inserted] = ObjToLastAccess.try_emplace( |
| 1607 | Key: {UnderlyingObj, |
| 1608 | cast<PointerType>(Val: Ptr->getType())->getAddressSpace()}, |
| 1609 | Args&: Access); |
| 1610 | if (!Inserted) { |
| 1611 | DepCands.unionSets(V1: Access, V2: It->second); |
| 1612 | It->second = Access; |
| 1613 | } |
| 1614 | |
| 1615 | LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n" ); |
| 1616 | } |
| 1617 | } |
| 1618 | } |
| 1619 | } |
| 1620 | } |
| 1621 | } |
| 1622 | |
| 1623 | /// Check whether the access through \p Ptr has a constant stride. |
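/// For example (illustrative), an access to "A[3 * i]" with i32 elements has a
/// constant stride of 3 (in units of the access type), while a loop-invariant
/// pointer yields a stride of 0.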
| 1624 | std::optional<int64_t> |
| 1625 | llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, |
| 1626 | const Loop *Lp, const DominatorTree &DT, |
| 1627 | const DenseMap<Value *, const SCEV *> &StridesMap, |
| 1628 | bool Assume, bool ShouldCheckWrap) { |
| 1629 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr); |
| 1630 | if (PSE.getSE()->isLoopInvariant(S: PtrScev, L: Lp)) |
| 1631 | return 0; |
| 1632 | |
| 1633 | assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr" ); |
| 1634 | |
| 1635 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev); |
| 1636 | if (Assume && !AR) |
| 1637 | AR = PSE.getAsAddRec(V: Ptr); |
| 1638 | |
| 1639 | if (!AR) { |
| 1640 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr |
| 1641 | << " SCEV: " << *PtrScev << "\n" ); |
| 1642 | return std::nullopt; |
| 1643 | } |
| 1644 | |
| 1645 | std::optional<int64_t> Stride = |
| 1646 | getStrideFromAddRec(AR, Lp, AccessTy, Ptr, PSE); |
| 1647 | if (!ShouldCheckWrap || !Stride) |
| 1648 | return Stride; |
| 1649 | |
| 1650 | if (isNoWrap(PSE, AR, Ptr, AccessTy, L: Lp, Assume, DT, Stride)) |
| 1651 | return Stride; |
| 1652 | |
| 1653 | LLVM_DEBUG( |
| 1654 | dbgs() << "LAA: Bad stride - Pointer may wrap in the address space " |
| 1655 | << *Ptr << " SCEV: " << *AR << "\n" ); |
| 1656 | return std::nullopt; |
| 1657 | } |
| 1658 | |
| 1659 | std::optional<int64_t> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, |
| 1660 | Type *ElemTyB, Value *PtrB, |
| 1661 | const DataLayout &DL, |
| 1662 | ScalarEvolution &SE, |
| 1663 | bool StrictCheck, bool CheckType) { |
| 1664 | assert(PtrA && PtrB && "Expected non-nullptr pointers." ); |
| 1665 | |
| 1666 | // Make sure that A and B are different pointers. |
| 1667 | if (PtrA == PtrB) |
| 1668 | return 0; |
| 1669 | |
| 1670 | // Make sure that the element types are the same if required. |
| 1671 | if (CheckType && ElemTyA != ElemTyB) |
| 1672 | return std::nullopt; |
| 1673 | |
| 1674 | unsigned ASA = PtrA->getType()->getPointerAddressSpace(); |
| 1675 | unsigned ASB = PtrB->getType()->getPointerAddressSpace(); |
| 1676 | |
| 1677 | // Check that the address spaces match. |
| 1678 | if (ASA != ASB) |
| 1679 | return std::nullopt; |
| 1680 | unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
| 1681 | |
| 1682 | APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0); |
| 1683 | const Value *PtrA1 = PtrA->stripAndAccumulateConstantOffsets( |
| 1684 | DL, Offset&: OffsetA, /*AllowNonInbounds=*/true); |
| 1685 | const Value *PtrB1 = PtrB->stripAndAccumulateConstantOffsets( |
| 1686 | DL, Offset&: OffsetB, /*AllowNonInbounds=*/true); |
| 1687 | |
| 1688 | std::optional<int64_t> Val; |
| 1689 | if (PtrA1 == PtrB1) { |
| 1690 | // Retrieve the address space again as pointer stripping now tracks through |
| 1691 | // `addrspacecast`. |
| 1692 | ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace(); |
| 1693 | ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace(); |
| 1694 | // Check that the address spaces match and that the pointers are valid. |
| 1695 | if (ASA != ASB) |
| 1696 | return std::nullopt; |
| 1697 | |
| 1698 | IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
| 1699 | OffsetA = OffsetA.sextOrTrunc(width: IdxWidth); |
| 1700 | OffsetB = OffsetB.sextOrTrunc(width: IdxWidth); |
| 1701 | |
| 1702 | OffsetB -= OffsetA; |
| 1703 | Val = OffsetB.trySExtValue(); |
| 1704 | } else { |
| 1705 | // Otherwise compute the distance with SCEV between the base pointers. |
| 1706 | const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA); |
| 1707 | const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB); |
| 1708 | std::optional<APInt> Diff = |
| 1709 | SE.computeConstantDifference(LHS: PtrSCEVB, RHS: PtrSCEVA); |
| 1710 | if (!Diff) |
| 1711 | return std::nullopt; |
| 1712 | Val = Diff->trySExtValue(); |
| 1713 | } |
| 1714 | |
| 1715 | if (!Val) |
| 1716 | return std::nullopt; |
| 1717 | |
| 1718 | int64_t Size = DL.getTypeStoreSize(Ty: ElemTyA); |
| 1719 | int64_t Dist = *Val / Size; |
| 1720 | |
| 1721 | // Ensure that the calculated distance matches the type-based one after all |
| 1722 | // the bitcasts removal in the provided pointers. |
| 1723 | if (!StrictCheck || Dist * Size == Val) |
| 1724 | return Dist; |
| 1725 | return std::nullopt; |
| 1726 | } |
| 1727 | |
| 1728 | bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, |
| 1729 | const DataLayout &DL, ScalarEvolution &SE, |
| 1730 | SmallVectorImpl<unsigned> &SortedIndices) { |
| 1731 | assert(llvm::all_of( |
| 1732 | VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && |
| 1733 | "Expected list of pointer operands." ); |
| 1734 | // Walk over the pointers, and map each of them to an offset relative to |
// the first pointer in the array.
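// For example (illustrative): for VL == {&A[1], &A[0], &A[2]} the offsets
// relative to VL[0] are {0, -1, +1}, the pointers are not consecutive in the
// original order, and SortedIndices becomes {1, 0, 2}.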
| 1736 | Value *Ptr0 = VL[0]; |
| 1737 | |
| 1738 | using DistOrdPair = std::pair<int64_t, unsigned>; |
| 1739 | auto Compare = llvm::less_first(); |
| 1740 | std::set<DistOrdPair, decltype(Compare)> Offsets(Compare); |
| 1741 | Offsets.emplace(args: 0, args: 0); |
| 1742 | bool IsConsecutive = true; |
| 1743 | for (auto [Idx, Ptr] : drop_begin(RangeOrContainer: enumerate(First&: VL))) { |
| 1744 | std::optional<int64_t> Diff = |
| 1745 | getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE, |
| 1746 | /*StrictCheck=*/true); |
| 1747 | if (!Diff) |
| 1748 | return false; |
| 1749 | |
| 1750 | // Check if the pointer with the same offset is found. |
| 1751 | int64_t Offset = *Diff; |
| 1752 | auto [It, IsInserted] = Offsets.emplace(args&: Offset, args&: Idx); |
| 1753 | if (!IsInserted) |
| 1754 | return false; |
| 1755 | // Consecutive order if the inserted element is the last one. |
| 1756 | IsConsecutive &= std::next(x: It) == Offsets.end(); |
| 1757 | } |
| 1758 | SortedIndices.clear(); |
| 1759 | if (!IsConsecutive) { |
| 1760 | // Fill SortedIndices array only if it is non-consecutive. |
| 1761 | SortedIndices.resize(N: VL.size()); |
| 1762 | for (auto [Idx, Off] : enumerate(First&: Offsets)) |
| 1763 | SortedIndices[Idx] = Off.second; |
| 1764 | } |
| 1765 | return true; |
| 1766 | } |
| 1767 | |
| 1768 | /// Returns true if the memory operations \p A and \p B are consecutive. |
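/// For example (illustrative), two i32 loads from &A[i] and &A[i + 1] are
/// consecutive (their element distance is 1), while loads from &A[i] and
/// &A[i + 2] are not.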
| 1769 | bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, |
| 1770 | ScalarEvolution &SE, bool CheckType) { |
| 1771 | Value *PtrA = getLoadStorePointerOperand(V: A); |
| 1772 | Value *PtrB = getLoadStorePointerOperand(V: B); |
| 1773 | if (!PtrA || !PtrB) |
| 1774 | return false; |
| 1775 | Type *ElemTyA = getLoadStoreType(I: A); |
| 1776 | Type *ElemTyB = getLoadStoreType(I: B); |
| 1777 | std::optional<int64_t> Diff = |
| 1778 | getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE, |
| 1779 | /*StrictCheck=*/true, CheckType); |
| 1780 | return Diff == 1; |
| 1781 | } |
| 1782 | |
| 1783 | void MemoryDepChecker::addAccess(StoreInst *SI) { |
| 1784 | visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
| 1785 | AddPointer: [this, SI](Value *Ptr) { |
| 1786 | Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx); |
| 1787 | InstMap.push_back(Elt: SI); |
| 1788 | ++AccessIdx; |
| 1789 | }); |
| 1790 | } |
| 1791 | |
| 1792 | void MemoryDepChecker::addAccess(LoadInst *LI) { |
| 1793 | visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
| 1794 | AddPointer: [this, LI](Value *Ptr) { |
| 1795 | Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx); |
| 1796 | InstMap.push_back(Elt: LI); |
| 1797 | ++AccessIdx; |
| 1798 | }); |
| 1799 | } |
| 1800 | |
| 1801 | MemoryDepChecker::VectorizationSafetyStatus |
| 1802 | MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) { |
| 1803 | switch (Type) { |
| 1804 | case NoDep: |
| 1805 | case Forward: |
| 1806 | case BackwardVectorizable: |
| 1807 | return VectorizationSafetyStatus::Safe; |
| 1808 | |
| 1809 | case Unknown: |
| 1810 | return VectorizationSafetyStatus::PossiblySafeWithRtChecks; |
| 1811 | case ForwardButPreventsForwarding: |
| 1812 | case Backward: |
| 1813 | case BackwardVectorizableButPreventsForwarding: |
| 1814 | case IndirectUnsafe: |
| 1815 | return VectorizationSafetyStatus::Unsafe; |
| 1816 | } |
| 1817 | llvm_unreachable("unexpected DepType!" ); |
| 1818 | } |
| 1819 | |
| 1820 | bool MemoryDepChecker::Dependence::isBackward() const { |
| 1821 | switch (Type) { |
| 1822 | case NoDep: |
| 1823 | case Forward: |
| 1824 | case ForwardButPreventsForwarding: |
| 1825 | case Unknown: |
| 1826 | case IndirectUnsafe: |
| 1827 | return false; |
| 1828 | |
| 1829 | case BackwardVectorizable: |
| 1830 | case Backward: |
| 1831 | case BackwardVectorizableButPreventsForwarding: |
| 1832 | return true; |
| 1833 | } |
| 1834 | llvm_unreachable("unexpected DepType!" ); |
| 1835 | } |
| 1836 | |
| 1837 | bool MemoryDepChecker::Dependence::isPossiblyBackward() const { |
| 1838 | return isBackward() || Type == Unknown || Type == IndirectUnsafe; |
| 1839 | } |
| 1840 | |
| 1841 | bool MemoryDepChecker::Dependence::isForward() const { |
| 1842 | switch (Type) { |
| 1843 | case Forward: |
| 1844 | case ForwardButPreventsForwarding: |
| 1845 | return true; |
| 1846 | |
| 1847 | case NoDep: |
| 1848 | case Unknown: |
| 1849 | case BackwardVectorizable: |
| 1850 | case Backward: |
| 1851 | case BackwardVectorizableButPreventsForwarding: |
| 1852 | case IndirectUnsafe: |
| 1853 | return false; |
| 1854 | } |
| 1855 | llvm_unreachable("unexpected DepType!" ); |
| 1856 | } |
| 1857 | |
| 1858 | bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance, |
| 1859 | uint64_t TypeByteSize, |
| 1860 | unsigned CommonStride) { |
// If loads occur at a distance that is not a multiple of a feasible vector
// factor, store-load forwarding does not take place.
// Positive dependences might cause trouble because vectorizing them might
// prevent store-load forwarding, making vectorized code run a lot slower.
//   a[i] = a[i-3] ^ a[i-8];
//   The stores to a[i:i+1] don't align with the loads from a[i-3:i-2], and
//   hence on a typical architecture store-load forwarding does not take
//   place. Vectorizing in such cases does not make sense.
// Below, compute a bound on the store-load forwarding distance.
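// For example (illustrative): with TypeByteSize == 4 and Distance == 12, the
// search below first tries VF == 8 bytes; since 12 % 8 != 0 and 12 / 8 == 1 is
// smaller than 8 * TypeByteSize == 32, the maximum safe VF drops to 4 bytes,
// which is below 2 * TypeByteSize, so a potential store-load forwarding
// conflict is reported.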
| 1870 | |
| 1871 | // After this many iterations store-to-load forwarding conflicts should not |
| 1872 | // cause any slowdowns. |
| 1873 | const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize; |
| 1874 | // Maximum vector factor. |
| 1875 | uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 = |
| 1876 | std::min(a: VectorizerParams::MaxVectorWidth * TypeByteSize, |
| 1877 | b: MaxStoreLoadForwardSafeDistanceInBits); |
| 1878 | |
| 1879 | // Compute the smallest VF at which the store and load would be misaligned. |
| 1880 | for (uint64_t VF = 2 * TypeByteSize; |
| 1881 | VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) { |
| 1882 | // If the number of vector iteration between the store and the load are |
| 1883 | // small we could incur conflicts. |
| 1884 | if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) { |
| 1885 | MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1); |
| 1886 | break; |
| 1887 | } |
| 1888 | } |
| 1889 | |
| 1890 | if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) { |
| 1891 | LLVM_DEBUG( |
| 1892 | dbgs() << "LAA: Distance " << Distance |
| 1893 | << " that could cause a store-load forwarding conflict\n" ); |
| 1894 | return true; |
| 1895 | } |
| 1896 | |
| 1897 | if (CommonStride && |
| 1898 | MaxVFWithoutSLForwardIssuesPowerOf2 < |
| 1899 | MaxStoreLoadForwardSafeDistanceInBits && |
| 1900 | MaxVFWithoutSLForwardIssuesPowerOf2 != |
| 1901 | VectorizerParams::MaxVectorWidth * TypeByteSize) { |
| 1902 | uint64_t MaxVF = |
| 1903 | bit_floor(Value: MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride); |
| 1904 | uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; |
| 1905 | MaxStoreLoadForwardSafeDistanceInBits = |
| 1906 | std::min(a: MaxStoreLoadForwardSafeDistanceInBits, b: MaxVFInBits); |
| 1907 | } |
| 1908 | return false; |
| 1909 | } |
| 1910 | |
| 1911 | void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) { |
| 1912 | if (Status < S) |
| 1913 | Status = S; |
| 1914 | } |
| 1915 | |
| 1916 | /// Given a dependence-distance \p Dist between two memory accesses, that have |
| 1917 | /// strides in the same direction whose absolute value of the maximum stride is |
| 1918 | /// given in \p MaxStride, in a loop whose maximum backedge taken count is \p |
| 1919 | /// MaxBTC, check if it is possible to prove statically that the dependence |
| 1920 | /// distance is larger than the range that the accesses will travel through the |
| 1921 | /// execution of the loop. If so, return true; false otherwise. This is useful |
| 1922 | /// for example in loops such as the following (PR31098): |
| 1923 | /// |
| 1924 | /// for (i = 0; i < D; ++i) { |
| 1925 | /// = out[i]; |
| 1926 | /// out[i+D] = |
| 1927 | /// } |
| 1928 | static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, |
| 1929 | const SCEV &MaxBTC, const SCEV &Dist, |
| 1930 | uint64_t MaxStride) { |
| 1931 | |
| 1932 | // If we can prove that |
| 1933 | // (**) |Dist| > MaxBTC * Step |
| 1934 | // where Step is the absolute stride of the memory accesses in bytes, |
| 1935 | // then there is no dependence. |
| 1936 | // |
| 1937 | // Rationale: |
| 1938 | // We basically want to check if the absolute distance (|Dist/Step|) |
| 1939 | // is >= the loop iteration count (or > MaxBTC). |
| 1940 | // This is equivalent to the Strong SIV Test (Practical Dependence Testing, |
| 1941 | // Section 4.2.1); Note, that for vectorization it is sufficient to prove |
| 1942 | // that the dependence distance is >= VF; This is checked elsewhere. |
| 1943 | // But in some cases we can prune dependence distances early, and |
| 1944 | // even before selecting the VF, and without a runtime test, by comparing |
| 1945 | // the distance against the loop iteration count. Since the vectorized code |
| 1946 | // will be executed only if LoopCount >= VF, proving distance >= LoopCount |
| 1947 | // also guarantees that distance >= VF. |
| 1948 | // |
| 1949 | const SCEV *Step = SE.getConstant(Ty: MaxBTC.getType(), V: MaxStride); |
| 1950 | const SCEV *Product = SE.getMulExpr(LHS: &MaxBTC, RHS: Step); |
| 1951 | |
| 1952 | const SCEV *CastedDist = &Dist; |
| 1953 | const SCEV *CastedProduct = Product; |
| 1954 | uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType()); |
| 1955 | uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType()); |
| 1956 | |
| 1957 | // The dependence distance can be positive/negative, so we sign extend Dist; |
| 1958 | // The multiplication of the absolute stride in bytes and the |
| 1959 | // backedgeTakenCount is non-negative, so we zero extend Product. |
| 1960 | if (DistTypeSizeBits > ProductTypeSizeBits) |
| 1961 | CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType()); |
| 1962 | else |
| 1963 | CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType()); |
| 1964 | |
| 1965 | // Is Dist - (MaxBTC * Step) > 0 ? |
| 1966 | // (If so, then we have proven (**) because |Dist| >= Dist) |
| 1967 | const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct); |
| 1968 | if (SE.isKnownPositive(S: Minus)) |
| 1969 | return true; |
| 1970 | |
| 1971 | // Second try: Is -Dist - (MaxBTC * Step) > 0 ? |
| 1972 | // (If so, then we have proven (**) because |Dist| >= -1*Dist) |
| 1973 | const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist); |
| 1974 | Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct); |
| 1975 | return SE.isKnownPositive(S: Minus); |
| 1976 | } |
| 1977 | |
| 1978 | /// Check the dependence for two accesses with the same stride \p Stride. |
| 1979 | /// \p Distance is the positive distance in bytes, and \p TypeByteSize is type |
| 1980 | /// size in bytes. |
| 1981 | /// |
| 1982 | /// \returns true if they are independent. |
| 1983 | static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, |
| 1984 | uint64_t TypeByteSize) { |
| 1985 | assert(Stride > 1 && "The stride must be greater than 1" ); |
| 1986 | assert(TypeByteSize > 0 && "The type size in byte must be non-zero" ); |
| 1987 | assert(Distance > 0 && "The distance must be non-zero" ); |
| 1988 | |
// Skip if the distance is not a multiple of the type byte size.
| 1990 | if (Distance % TypeByteSize) |
| 1991 | return false; |
| 1992 | |
// No dependence if the distance is not a multiple of the stride.
| 1994 | // E.g. |
| 1995 | // for (i = 0; i < 1024 ; i += 4) |
| 1996 | // A[i+2] = A[i] + 1; |
| 1997 | // |
| 1998 | // Two accesses in memory (distance is 2, stride is 4): |
| 1999 | // | A[0] | | | | A[4] | | | | |
| 2000 | // | | | A[2] | | | | A[6] | | |
| 2001 | // |
| 2002 | // E.g. |
| 2003 | // for (i = 0; i < 1024 ; i += 3) |
| 2004 | // A[i+4] = A[i] + 1; |
| 2005 | // |
| 2006 | // Two accesses in memory (distance is 4, stride is 3): |
| 2007 | // | A[0] | | | A[3] | | | A[6] | | | |
| 2008 | // | | | | | A[4] | | | A[7] | | |
| 2009 | return Distance % Stride; |
| 2010 | } |
| 2011 | |
| 2012 | bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src, |
| 2013 | Type *SrcTy, |
| 2014 | const SCEV *Sink, |
| 2015 | Type *SinkTy) { |
| 2016 | const SCEV *BTC = PSE.getBackedgeTakenCount(); |
| 2017 | const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); |
| 2018 | ScalarEvolution &SE = *PSE.getSE(); |
| 2019 | const auto &[SrcStart_, SrcEnd_] = |
| 2020 | getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Src, AccessTy: SrcTy, BTC, MaxBTC: SymbolicMaxBTC, |
| 2021 | SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards); |
| 2022 | if (isa<SCEVCouldNotCompute>(Val: SrcStart_) || isa<SCEVCouldNotCompute>(Val: SrcEnd_)) |
| 2023 | return false; |
| 2024 | |
| 2025 | const auto &[SinkStart_, SinkEnd_] = |
| 2026 | getStartAndEndForAccess(Lp: InnermostLoop, PtrExpr: Sink, AccessTy: SinkTy, BTC, MaxBTC: SymbolicMaxBTC, |
| 2027 | SE: &SE, PointerBounds: &PointerBounds, DT, AC, LoopGuards); |
| 2028 | if (isa<SCEVCouldNotCompute>(Val: SinkStart_) || |
| 2029 | isa<SCEVCouldNotCompute>(Val: SinkEnd_)) |
| 2030 | return false; |
| 2031 | |
| 2032 | if (!LoopGuards) |
| 2033 | LoopGuards.emplace(args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE)); |
| 2034 | |
| 2035 | auto SrcEnd = SE.applyLoopGuards(Expr: SrcEnd_, Guards: *LoopGuards); |
| 2036 | auto SinkStart = SE.applyLoopGuards(Expr: SinkStart_, Guards: *LoopGuards); |
| 2037 | if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SrcEnd, RHS: SinkStart)) |
| 2038 | return true; |
| 2039 | |
| 2040 | auto SinkEnd = SE.applyLoopGuards(Expr: SinkEnd_, Guards: *LoopGuards); |
| 2041 | auto SrcStart = SE.applyLoopGuards(Expr: SrcStart_, Guards: *LoopGuards); |
| 2042 | return SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: SinkEnd, RHS: SrcStart); |
| 2043 | } |
| 2044 | |
| 2045 | std::variant<MemoryDepChecker::Dependence::DepType, |
| 2046 | MemoryDepChecker::DepDistanceStrideAndSizeInfo> |
| 2047 | MemoryDepChecker::getDependenceDistanceStrideAndSize( |
| 2048 | const AccessAnalysis::MemAccessInfo &A, Instruction *AInst, |
| 2049 | const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) { |
| 2050 | const auto &DL = InnermostLoop->getHeader()->getDataLayout(); |
| 2051 | auto &SE = *PSE.getSE(); |
| 2052 | const auto &[APtr, AIsWrite] = A; |
| 2053 | const auto &[BPtr, BIsWrite] = B; |
| 2054 | |
| 2055 | // Two reads are independent. |
| 2056 | if (!AIsWrite && !BIsWrite) |
| 2057 | return MemoryDepChecker::Dependence::NoDep; |
| 2058 | |
| 2059 | Type *ATy = getLoadStoreType(I: AInst); |
| 2060 | Type *BTy = getLoadStoreType(I: BInst); |
| 2061 | |
| 2062 | // We cannot check pointers in different address spaces. |
| 2063 | if (APtr->getType()->getPointerAddressSpace() != |
| 2064 | BPtr->getType()->getPointerAddressSpace()) |
| 2065 | return MemoryDepChecker::Dependence::Unknown; |
| 2066 | |
| 2067 | std::optional<int64_t> StrideAPtr = getPtrStride( |
| 2068 | PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true); |
| 2069 | std::optional<int64_t> StrideBPtr = getPtrStride( |
| 2070 | PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: true, ShouldCheckWrap: true); |
| 2071 | |
| 2072 | const SCEV *Src = PSE.getSCEV(V: APtr); |
| 2073 | const SCEV *Sink = PSE.getSCEV(V: BPtr); |
| 2074 | |
| 2075 | // If the induction step is negative we have to invert source and sink of the |
| 2076 | // dependence when measuring the distance between them. We should not swap |
| 2077 | // AIsWrite with BIsWrite, as their uses expect them in program order. |
| 2078 | if (StrideAPtr && *StrideAPtr < 0) { |
| 2079 | std::swap(a&: Src, b&: Sink); |
| 2080 | std::swap(a&: AInst, b&: BInst); |
| 2081 | std::swap(a&: ATy, b&: BTy); |
| 2082 | std::swap(lhs&: StrideAPtr, rhs&: StrideBPtr); |
| 2083 | } |
| 2084 | |
| 2085 | const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src); |
| 2086 | |
| 2087 | LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink |
| 2088 | << "\n" ); |
| 2089 | LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst |
| 2090 | << ": " << *Dist << "\n" ); |
| 2091 | |
| 2092 | // Need accesses with constant strides and the same direction for further |
| 2093 | // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and |
| 2094 | // similar code or pointer arithmetic that could wrap in the address space. |
| 2095 | |
| 2096 | // If either Src or Sink are not strided (i.e. not a non-wrapping AddRec) and |
| 2097 | // not loop-invariant (stride will be 0 in that case), we cannot analyze the |
| 2098 | // dependence further and also cannot generate runtime checks. |
| 2099 | if (!StrideAPtr || !StrideBPtr) { |
| 2100 | LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n" ); |
| 2101 | return MemoryDepChecker::Dependence::IndirectUnsafe; |
| 2102 | } |
| 2103 | |
| 2104 | int64_t StrideAPtrInt = *StrideAPtr; |
| 2105 | int64_t StrideBPtrInt = *StrideBPtr; |
| 2106 | LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt |
| 2107 | << " Sink induction step: " << StrideBPtrInt << "\n" ); |
// At least one of Src and Sink is loop invariant and the other is strided or
| 2109 | // invariant. We can generate a runtime check to disambiguate the accesses. |
| 2110 | if (!StrideAPtrInt || !StrideBPtrInt) |
| 2111 | return MemoryDepChecker::Dependence::Unknown; |
| 2112 | |
| 2113 | // Both Src and Sink have a constant stride, check if they are in the same |
| 2114 | // direction. |
| 2115 | if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) { |
| 2116 | LLVM_DEBUG( |
| 2117 | dbgs() << "Pointer access with strides in different directions\n" ); |
| 2118 | return MemoryDepChecker::Dependence::Unknown; |
| 2119 | } |
| 2120 | |
| 2121 | TypeSize AStoreSz = DL.getTypeStoreSize(Ty: ATy); |
| 2122 | TypeSize BStoreSz = DL.getTypeStoreSize(Ty: BTy); |
| 2123 | |
| 2124 | // If store sizes are not the same, set TypeByteSize to zero, so we can check |
| 2125 | // it in the caller isDependent. |
| 2126 | uint64_t ASz = DL.getTypeAllocSize(Ty: ATy); |
| 2127 | uint64_t BSz = DL.getTypeAllocSize(Ty: BTy); |
| 2128 | uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0; |
| 2129 | |
| 2130 | uint64_t StrideAScaled = std::abs(i: StrideAPtrInt) * ASz; |
| 2131 | uint64_t StrideBScaled = std::abs(i: StrideBPtrInt) * BSz; |
| 2132 | |
| 2133 | uint64_t MaxStride = std::max(a: StrideAScaled, b: StrideBScaled); |
| 2134 | |
| 2135 | std::optional<uint64_t> CommonStride; |
| 2136 | if (StrideAScaled == StrideBScaled) |
| 2137 | CommonStride = StrideAScaled; |
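// For example (illustrative): i32 accesses to A[2 * i] and A[2 * i + 1] both
// have an element stride of 2, so StrideAScaled == StrideBScaled == 8 bytes
// and CommonStride == 8.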
| 2138 | |
// TODO: Historically, we didn't retry with runtime checks when the (unscaled)
// strides were different, but there is no inherent reason for that
// restriction.
| 2141 | if (!isa<SCEVConstant>(Val: Dist)) |
| 2142 | ShouldRetryWithRuntimeChecks |= StrideAPtrInt == StrideBPtrInt; |
| 2143 | |
| 2144 | // If distance is a SCEVCouldNotCompute, return Unknown immediately. |
| 2145 | if (isa<SCEVCouldNotCompute>(Val: Dist)) { |
| 2146 | LLVM_DEBUG(dbgs() << "LAA: Uncomputable distance.\n" ); |
| 2147 | return Dependence::Unknown; |
| 2148 | } |
| 2149 | |
| 2150 | return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride, |
| 2151 | TypeByteSize, AIsWrite, BIsWrite); |
| 2152 | } |
| 2153 | |
| 2154 | MemoryDepChecker::Dependence::DepType |
| 2155 | MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, |
| 2156 | const MemAccessInfo &B, unsigned BIdx) { |
| 2157 | assert(AIdx < BIdx && "Must pass arguments in program order" ); |
| 2158 | |
| 2159 | // Check if we can prove that Sink only accesses memory after Src's end or |
| 2160 | // vice versa. The helper is used to perform the checks only on the exit paths |
| 2161 | // where it helps to improve the analysis result. |
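// For example (illustrative): if the source only accesses A[0 .. N-1] and
// the sink only accesses A[N .. 2N-1] across the whole loop, the source's
// end does not exceed the sink's start, so the accesses can never overlap
// and the dependence can be discarded.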
| 2162 | auto CheckCompletelyBeforeOrAfter = [&]() { |
| 2163 | auto *APtr = A.getPointer(); |
| 2164 | auto *BPtr = B.getPointer(); |
| 2165 | Type *ATy = getLoadStoreType(I: InstMap[AIdx]); |
| 2166 | Type *BTy = getLoadStoreType(I: InstMap[BIdx]); |
| 2167 | const SCEV *Src = PSE.getSCEV(V: APtr); |
| 2168 | const SCEV *Sink = PSE.getSCEV(V: BPtr); |
| 2169 | return areAccessesCompletelyBeforeOrAfter(Src, SrcTy: ATy, Sink, SinkTy: BTy); |
| 2170 | }; |
| 2171 | |
| 2172 | // Get the dependence distance, stride, type size and what access writes for |
| 2173 | // the dependence between A and B. |
| 2174 | auto Res = |
| 2175 | getDependenceDistanceStrideAndSize(A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx]); |
| 2176 | if (std::holds_alternative<Dependence::DepType>(v: Res)) { |
| 2177 | if (std::get<Dependence::DepType>(v&: Res) == Dependence::Unknown && |
| 2178 | CheckCompletelyBeforeOrAfter()) |
| 2179 | return Dependence::NoDep; |
| 2180 | return std::get<Dependence::DepType>(v&: Res); |
| 2181 | } |
| 2182 | |
| 2183 | auto &[Dist, MaxStride, CommonStride, TypeByteSize, AIsWrite, BIsWrite] = |
| 2184 | std::get<DepDistanceStrideAndSizeInfo>(v&: Res); |
| 2185 | bool HasSameSize = TypeByteSize > 0; |
| 2186 | |
| 2187 | ScalarEvolution &SE = *PSE.getSE(); |
| 2188 | auto &DL = InnermostLoop->getHeader()->getDataLayout(); |
| 2189 | |
// If the distance between the accesses is larger than their maximum absolute
// stride multiplied by the symbolic maximum backedge taken count (which is an
// upper bound of the number of iterations), the accesses are independent, i.e.
// they are far enough apart that they won't access the same location
// across all loop iterations.
| 2195 | if (HasSameSize && |
| 2196 | isSafeDependenceDistance( |
| 2197 | DL, SE, MaxBTC: *(PSE.getSymbolicMaxBackedgeTakenCount()), Dist: *Dist, MaxStride)) |
| 2198 | return Dependence::NoDep; |
| 2199 | |
| 2200 | // The rest of this function relies on ConstDist being at most 64-bits, which |
| 2201 | // is checked earlier. Will assert if the calling code changes. |
| 2202 | const APInt *APDist = nullptr; |
| 2203 | uint64_t ConstDist = |
| 2204 | match(S: Dist, P: m_scev_APInt(C&: APDist)) ? APDist->abs().getZExtValue() : 0; |
| 2205 | |
| 2206 | // Attempt to prove strided accesses independent. |
| 2207 | if (APDist) { |
| 2208 | // If the distance between accesses and their strides are known constants, |
| 2209 | // check whether the accesses interlace each other. |
| 2210 | if (ConstDist > 0 && CommonStride && CommonStride > 1 && HasSameSize && |
| 2211 | areStridedAccessesIndependent(Distance: ConstDist, Stride: *CommonStride, TypeByteSize)) { |
| 2212 | LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n" ); |
| 2213 | return Dependence::NoDep; |
| 2214 | } |
| 2215 | } else { |
| 2216 | if (!LoopGuards) |
| 2217 | LoopGuards.emplace( |
| 2218 | args: ScalarEvolution::LoopGuards::collect(L: InnermostLoop, SE)); |
| 2219 | Dist = SE.applyLoopGuards(Expr: Dist, Guards: *LoopGuards); |
| 2220 | } |
| 2221 | |
| 2222 | // Negative distances are not plausible dependencies. |
| 2223 | if (SE.isKnownNonPositive(S: Dist)) { |
| 2224 | if (SE.isKnownNonNegative(S: Dist)) { |
| 2225 | if (HasSameSize) { |
| 2226 | // Write to the same location with the same size. |
| 2227 | return Dependence::Forward; |
| 2228 | } |
| 2229 | LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but " |
| 2230 | "different type sizes\n" ); |
| 2231 | return Dependence::Unknown; |
| 2232 | } |
| 2233 | |
| 2234 | bool IsTrueDataDependence = (AIsWrite && !BIsWrite); |
| 2235 | // Check if the first access writes to a location that is read in a later |
| 2236 | // iteration, where the distance between them is not a multiple of a vector |
| 2237 | // factor and relatively small. |
| 2238 | // |
| 2239 | // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to |
| 2240 | // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a |
| 2241 | // forward dependency will allow vectorization using any width. |
| 2242 | |
| 2243 | if (IsTrueDataDependence && EnableForwardingConflictDetection) { |
| 2244 | if (!ConstDist) { |
| 2245 | return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep |
| 2246 | : Dependence::Unknown; |
| 2247 | } |
| 2248 | if (!HasSameSize || |
| 2249 | couldPreventStoreLoadForward(Distance: ConstDist, TypeByteSize)) { |
| 2250 | LLVM_DEBUG( |
| 2251 | dbgs() << "LAA: Forward but may prevent st->ld forwarding\n" ); |
| 2252 | return Dependence::ForwardButPreventsForwarding; |
| 2253 | } |
| 2254 | } |
| 2255 | |
| 2256 | LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n" ); |
| 2257 | return Dependence::Forward; |
| 2258 | } |
| 2259 | |
| 2260 | int64_t MinDistance = SE.getSignedRangeMin(S: Dist).getSExtValue(); |
| 2261 | // Below we only handle strictly positive distances. |
| 2262 | if (MinDistance <= 0) { |
| 2263 | return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep |
| 2264 | : Dependence::Unknown; |
| 2265 | } |
| 2266 | |
| 2267 | if (!HasSameSize) { |
| 2268 | if (CheckCompletelyBeforeOrAfter()) |
| 2269 | return Dependence::NoDep; |
| 2270 | LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with " |
| 2271 | "different type sizes\n" ); |
| 2272 | return Dependence::Unknown; |
| 2273 | } |
| 2274 | // Bail out early if passed-in parameters make vectorization not feasible. |
| 2275 | unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? |
| 2276 | VectorizerParams::VectorizationFactor : 1); |
| 2277 | unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? |
| 2278 | VectorizerParams::VectorizationInterleave : 1); |
| 2279 | // The minimum number of iterations for a vectorized/unrolled version. |
| 2280 | unsigned MinNumIter = std::max(a: ForcedFactor * ForcedUnroll, b: 2U); |
| 2281 | |
| 2282 | // It's not vectorizable if the distance is smaller than the minimum distance |
// needed for a vectorized/unrolled version. Each of the first MinNumIter - 1
// iterations needs MaxStride bytes, and the last iteration needs TypeByteSize
// bytes. (There is no need to add the last gap distance.)
| 2286 | // |
| 2287 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
| 2288 | // foo(int *A) { |
| 2289 | // int *B = (int *)((char *)A + 14); |
| 2290 | // for (i = 0 ; i < 1024 ; i += 2) |
| 2291 | // B[i] = A[i] + 1; |
| 2292 | // } |
| 2293 | // |
| 2294 | // Two accesses in memory (stride is 4 * 2): |
| 2295 | // | A[0] | | A[2] | | A[4] | | A[6] | | |
| 2296 | // | B[0] | | B[2] | | B[4] | |
| 2297 | // |
// The minimum distance needed for all iterations except the last one is
// 4 * 2 * (MinNumIter - 1); the last iteration needs 4 more bytes.
| 2300 | // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4. |
| 2301 | // |
| 2302 | // If MinNumIter is 2, it is vectorizable as the minimum distance needed is |
| 2303 | // 12, which is less than distance. |
| 2304 | // |
| 2305 | // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4), |
| 2306 | // the minimum distance needed is 28, which is greater than distance. It is |
| 2307 | // not safe to do vectorization. |
| 2308 | // |
| 2309 | // We use MaxStride (maximum of src and sink strides) to get a conservative |
| 2310 | // lower bound on the MinDistanceNeeded in case of different strides. |
| 2311 | |
| 2312 | // We know that Dist is positive, but it may not be constant. Use the signed |
| 2313 | // minimum for computations below, as this ensures we compute the closest |
| 2314 | // possible dependence distance. |
| 2315 | uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize; |
| 2316 | if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) { |
| 2317 | if (!ConstDist) { |
| 2318 | // For non-constant distances, we checked the lower bound of the |
| 2319 | // dependence distance and the distance may be larger at runtime (and safe |
| 2320 | // for vectorization). Classify it as Unknown, so we re-try with runtime |
| 2321 | // checks, unless we can prove both accesses cannot overlap. |
| 2322 | return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep |
| 2323 | : Dependence::Unknown; |
| 2324 | } |
| 2325 | LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance " |
| 2326 | << MinDistance << '\n'); |
| 2327 | return Dependence::Backward; |
| 2328 | } |
| 2329 | |
// Unsafe if the minimum distance needed is greater than the smallest
// dependence distance.
| 2332 | if (MinDistanceNeeded > MinDepDistBytes) { |
| 2333 | LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least " |
| 2334 | << MinDistanceNeeded << " size in bytes\n" ); |
| 2335 | return Dependence::Backward; |
| 2336 | } |
| 2337 | |
| 2338 | MinDepDistBytes = |
| 2339 | std::min(a: static_cast<uint64_t>(MinDistance), b: MinDepDistBytes); |
| 2340 | |
| 2341 | bool IsTrueDataDependence = (!AIsWrite && BIsWrite); |
| 2342 | if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist && |
| 2343 | couldPreventStoreLoadForward(Distance: MinDistance, TypeByteSize, CommonStride: *CommonStride)) |
| 2344 | return Dependence::BackwardVectorizableButPreventsForwarding; |
| 2345 | |
| 2346 | uint64_t MaxVF = MinDepDistBytes / MaxStride; |
| 2347 | LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance |
| 2348 | << " with max VF = " << MaxVF << '\n'); |
| 2349 | |
| 2350 | uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; |
| 2351 | if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) { |
| 2352 | // For non-constant distances, we checked the lower bound of the dependence |
| 2353 | // distance and the distance may be larger at runtime (and safe for |
| 2354 | // vectorization). Classify it as Unknown, so we re-try with runtime checks, |
| 2355 | // unless we can prove both accesses cannot overlap. |
| 2356 | return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep |
| 2357 | : Dependence::Unknown; |
| 2358 | } |
| 2359 | |
| 2360 | if (CheckCompletelyBeforeOrAfter()) |
| 2361 | return Dependence::NoDep; |
| 2362 | |
| 2363 | MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits); |
| 2364 | return Dependence::BackwardVectorizable; |
| 2365 | } |
| 2366 | |
| 2367 | bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands, |
| 2368 | const MemAccessInfoList &CheckDeps) { |
| 2369 | |
| 2370 | MinDepDistBytes = -1; |
| 2371 | SmallPtrSet<MemAccessInfo, 8> Visited; |
| 2372 | for (MemAccessInfo CurAccess : CheckDeps) { |
| 2373 | if (Visited.contains(Ptr: CurAccess)) |
| 2374 | continue; |
| 2375 | |
| 2376 | // Check accesses within this set. |
| 2377 | EquivalenceClasses<MemAccessInfo>::member_iterator AI = |
| 2378 | DepCands.findLeader(V: CurAccess); |
| 2379 | EquivalenceClasses<MemAccessInfo>::member_iterator AE = |
| 2380 | DepCands.member_end(); |
| 2381 | |
| 2382 | // Check every access pair. |
| 2383 | while (AI != AE) { |
| 2384 | Visited.insert(Ptr: *AI); |
| 2385 | bool AIIsWrite = AI->getInt(); |
// Check loads only against subsequent members of the equivalence class, but
// check stores also against other stores to the same address in the same
// equivalence class.
| 2388 | EquivalenceClasses<MemAccessInfo>::member_iterator OI = |
| 2389 | (AIIsWrite ? AI : std::next(x: AI)); |
| 2390 | while (OI != AE) { |
| 2391 | // Check every accessing instruction pair in program order. |
| 2392 | auto &Acc = Accesses[*AI]; |
| 2393 | for (std::vector<unsigned>::iterator I1 = Acc.begin(), I1E = Acc.end(); |
| 2394 | I1 != I1E; ++I1) |
| 2395 | // Scan all accesses of another equivalence class, but only the next |
// accesses of the same equivalence class.
| 2397 | for (std::vector<unsigned>::iterator |
| 2398 | I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()), |
| 2399 | I2E = (OI == AI ? I1E : Accesses[*OI].end()); |
| 2400 | I2 != I2E; ++I2) { |
| 2401 | auto A = std::make_pair(x: &*AI, y&: *I1); |
| 2402 | auto B = std::make_pair(x: &*OI, y&: *I2); |
| 2403 | |
| 2404 | assert(*I1 != *I2); |
| 2405 | if (*I1 > *I2) |
| 2406 | std::swap(x&: A, y&: B); |
| 2407 | |
| 2408 | Dependence::DepType Type = |
| 2409 | isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second); |
| 2410 | mergeInStatus(S: Dependence::isSafeForVectorization(Type)); |
| 2411 | |
| 2412 | // Gather dependences unless we accumulated MaxDependences |
| 2413 | // dependences. In that case return as soon as we find the first |
| 2414 | // unsafe dependence. This puts a limit on this quadratic |
| 2415 | // algorithm. |
| 2416 | if (RecordDependences) { |
| 2417 | if (Type != Dependence::NoDep) |
| 2418 | Dependences.emplace_back(Args&: A.second, Args&: B.second, Args&: Type); |
| 2419 | |
| 2420 | if (Dependences.size() >= MaxDependences) { |
| 2421 | RecordDependences = false; |
| 2422 | Dependences.clear(); |
| 2423 | LLVM_DEBUG(dbgs() |
| 2424 | << "Too many dependences, stopped recording\n" ); |
| 2425 | } |
| 2426 | } |
| 2427 | if (!RecordDependences && !isSafeForVectorization()) |
| 2428 | return false; |
| 2429 | } |
| 2430 | ++OI; |
| 2431 | } |
| 2432 | ++AI; |
| 2433 | } |
| 2434 | } |
| 2435 | |
| 2436 | LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n" ); |
| 2437 | return isSafeForVectorization(); |
| 2438 | } |
| 2439 | |
| 2440 | SmallVector<Instruction *, 4> |
| 2441 | MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const { |
| 2442 | MemAccessInfo Access(Ptr, IsWrite); |
| 2443 | auto I = Accesses.find(Val: Access); |
| 2444 | SmallVector<Instruction *, 4> Insts; |
| 2445 | if (I != Accesses.end()) { |
| 2446 | transform(Range: I->second, d_first: std::back_inserter(x&: Insts), |
| 2447 | F: [&](unsigned Idx) { return this->InstMap[Idx]; }); |
| 2448 | } |
| 2449 | |
| 2450 | return Insts; |
| 2451 | } |
| 2452 | |
| 2453 | const char *MemoryDepChecker::Dependence::DepName[] = { |
| 2454 | "NoDep" , |
| 2455 | "Unknown" , |
| 2456 | "IndirectUnsafe" , |
| 2457 | "Forward" , |
| 2458 | "ForwardButPreventsForwarding" , |
| 2459 | "Backward" , |
| 2460 | "BackwardVectorizable" , |
| 2461 | "BackwardVectorizableButPreventsForwarding" }; |
| 2462 | |
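|      | // Roughly, print() below emits output of the following shape (illustrative
|      | // instructions only):
|      | //   Unknown:
|      | //       store i32 %v, ptr %a ->
|      | //       %l = load i32, ptr %b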
| 2463 | void MemoryDepChecker::Dependence::print( |
| 2464 | raw_ostream &OS, unsigned Depth, |
| 2465 | const SmallVectorImpl<Instruction *> &Instrs) const { |
| 2466 | OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n" ; |
| 2467 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n" ; |
| 2468 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n" ; |
| 2469 | } |
| 2470 | |
| 2471 | bool LoopAccessInfo::canAnalyzeLoop() { |
| 2472 | // We need to have a loop header. |
| 2473 | LLVM_DEBUG(dbgs() << "\nLAA: Checking a loop in '" |
| 2474 | << TheLoop->getHeader()->getParent()->getName() << "' from " |
| 2475 | << TheLoop->getLocStr() << "\n" ); |
| 2476 | |
| 2477 | // We can only analyze innermost loops. |
| 2478 | if (!TheLoop->isInnermost()) { |
| 2479 | LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n" ); |
| 2480 | recordAnalysis(RemarkName: "NotInnerMostLoop" ) << "loop is not the innermost loop" ; |
| 2481 | return false; |
| 2482 | } |
| 2483 | |
| 2484 | // We must have a single backedge. |
| 2485 | if (TheLoop->getNumBackEdges() != 1) { |
| 2486 | LLVM_DEBUG( |
| 2487 | dbgs() << "LAA: loop control flow is not understood by analyzer\n" ); |
| 2488 | recordAnalysis(RemarkName: "CFGNotUnderstood" ) |
| 2489 | << "loop control flow is not understood by analyzer" ; |
| 2490 | return false; |
| 2491 | } |
| 2492 | |
| 2493 | // ScalarEvolution needs to be able to find the symbolic max backedge taken |
| 2494 | // count, which is an upper bound on the number of loop iterations. The loop |
| 2495 | // may execute fewer iterations if it exits via an uncountable exit.
| 2496 | const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount(); |
| 2497 | if (isa<SCEVCouldNotCompute>(Val: ExitCount)) { |
| 2498 | recordAnalysis(RemarkName: "CantComputeNumberOfIterations" ) |
| 2499 | << "could not determine number of loop iterations" ; |
| 2500 | LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n" ); |
| 2501 | return false; |
| 2502 | } |
| 2503 | |
| 2504 | LLVM_DEBUG(dbgs() << "LAA: Found an analyzable loop: " |
| 2505 | << TheLoop->getHeader()->getName() << "\n" ); |
| 2506 | return true; |
| 2507 | } |
| 2508 | |
| 2509 | bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, |
| 2510 | const TargetLibraryInfo *TLI, |
| 2511 | DominatorTree *DT) { |
| 2512 | // Holds the Load and Store instructions. |
| 2513 | SmallVector<LoadInst *, 16> Loads; |
| 2514 | SmallVector<StoreInst *, 16> Stores; |
| 2515 | SmallPtrSet<MDNode *, 8> LoopAliasScopes; |
| 2516 | |
| 2517 | // Holds all the different accesses in the loop. |
| 2518 | unsigned NumReads = 0; |
| 2519 | unsigned NumReadWrites = 0; |
| 2520 | |
| 2521 | bool HasComplexMemInst = false; |
| 2522 | |
| 2523 | // A runtime check is only legal to insert if there are no convergent calls. |
| 2524 | HasConvergentOp = false; |
| 2525 | |
| 2526 | PtrRtChecking->Pointers.clear(); |
| 2527 | PtrRtChecking->Need = false; |
| 2528 | |
| 2529 | const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); |
| 2530 | |
| 2531 | const bool EnableMemAccessVersioningOfLoop = |
| 2532 | EnableMemAccessVersioning && |
| 2533 | !TheLoop->getHeader()->getParent()->hasOptSize(); |
| 2534 | |
| 2535 | // Traverse blocks in fixed RPOT order, regardless of their storage in the |
| 2536 | // loop info, as it may be arbitrary. |
| 2537 | LoopBlocksRPO RPOT(TheLoop); |
| 2538 | RPOT.perform(LI); |
| 2539 | for (BasicBlock *BB : RPOT) { |
| 2540 | // Scan the BB and collect legal loads and stores. Also detect any |
| 2541 | // convergent instructions. |
| 2542 | for (Instruction &I : *BB) { |
| 2543 | if (auto *Call = dyn_cast<CallBase>(Val: &I)) { |
| 2544 | if (Call->isConvergent()) |
| 2545 | HasConvergentOp = true; |
| 2546 | } |
| 2547 | |
| 2548 | // If the loop contains both a non-vectorizable memory instruction and a
| 2549 | // convergent operation, there is no reason to continue the search.
| 2550 | if (HasComplexMemInst && HasConvergentOp) |
| 2551 | return false; |
| 2552 | |
| 2553 | // Avoid hitting recordAnalysis multiple times. |
| 2554 | if (HasComplexMemInst) |
| 2555 | continue; |
| 2556 | |
| 2557 | // Record alias scopes defined inside the loop. |
| 2558 | if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I)) |
| 2559 | for (Metadata *Op : Decl->getScopeList()->operands()) |
| 2560 | LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op)); |
| 2561 | |
| 2562 | // Many math library functions read the rounding mode. We will only |
| 2563 | // vectorize a loop if it contains known function calls that don't set |
| 2564 | // the flag. Therefore, it is safe to ignore this read from memory. |
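|      | // For instance (illustrative), a call to sin() that TLI recognizes as having
|      | // a vectorizable form is skipped below even though it is conservatively
|      | // marked as reading memory.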
| 2565 | auto *Call = dyn_cast<CallInst>(Val: &I); |
| 2566 | if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI)) |
| 2567 | continue; |
| 2568 | |
| 2569 | // If this is a load, save it. If this instruction can read from memory |
| 2570 | // but is not a load, we only allow it if it's a call to a function with a |
| 2571 | // vector mapping and no pointer arguments. |
| 2572 | if (I.mayReadFromMemory()) { |
| 2573 | auto hasPointerArgs = [](CallBase *CB) { |
| 2574 | return any_of(Range: CB->args(), P: [](Value const *Arg) { |
| 2575 | return Arg->getType()->isPointerTy(); |
| 2576 | }); |
| 2577 | }; |
| 2578 | |
| 2579 | // If the function has an explicit vectorized counterpart, and does not |
| 2580 | // take output/input pointers, we can safely assume that it can be |
| 2581 | // vectorized. |
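|      | // E.g. (illustrative), a call y = foo(x) where foo carries vector-variant
|      | // mappings (say, from `#pragma omp declare simd`) and takes only scalar
|      | // arguments is accepted here without being treated as a plain load.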
| 2582 | if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() && |
| 2583 | !hasPointerArgs(Call) && !VFDatabase::getMappings(CI: *Call).empty()) |
| 2584 | continue; |
| 2585 | |
| 2586 | auto *Ld = dyn_cast<LoadInst>(Val: &I); |
| 2587 | if (!Ld) { |
| 2588 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: Ld) |
| 2589 | << "instruction cannot be vectorized" ; |
| 2590 | HasComplexMemInst = true; |
| 2591 | continue; |
| 2592 | } |
| 2593 | if (!Ld->isSimple() && !IsAnnotatedParallel) { |
| 2594 | recordAnalysis(RemarkName: "NonSimpleLoad" , Instr: Ld) |
| 2595 | << "read with atomic ordering or volatile read" ; |
| 2596 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n" ); |
| 2597 | HasComplexMemInst = true; |
| 2598 | continue; |
| 2599 | } |
| 2600 | NumLoads++; |
| 2601 | Loads.push_back(Elt: Ld); |
| 2602 | DepChecker->addAccess(LI: Ld); |
| 2603 | if (EnableMemAccessVersioningOfLoop) |
| 2604 | collectStridedAccess(LoadOrStoreInst: Ld); |
| 2605 | continue; |
| 2606 | } |
| 2607 | |
| 2608 | // Save 'store' instructions. Abort if other instructions write to memory. |
| 2609 | if (I.mayWriteToMemory()) { |
| 2610 | auto *St = dyn_cast<StoreInst>(Val: &I); |
| 2611 | if (!St) { |
| 2612 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: St) |
| 2613 | << "instruction cannot be vectorized" ; |
| 2614 | HasComplexMemInst = true; |
| 2615 | continue; |
| 2616 | } |
| 2617 | if (!St->isSimple() && !IsAnnotatedParallel) { |
| 2618 | recordAnalysis(RemarkName: "NonSimpleStore" , Instr: St) |
| 2619 | << "write with atomic ordering or volatile write" ; |
| 2620 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n" ); |
| 2621 | HasComplexMemInst = true; |
| 2622 | continue; |
| 2623 | } |
| 2624 | NumStores++; |
| 2625 | Stores.push_back(Elt: St); |
| 2626 | DepChecker->addAccess(SI: St); |
| 2627 | if (EnableMemAccessVersioningOfLoop) |
| 2628 | collectStridedAccess(LoadOrStoreInst: St); |
| 2629 | } |
| 2630 | } // Next instr. |
| 2631 | } // Next block. |
| 2632 | |
| 2633 | if (HasComplexMemInst) |
| 2634 | return false; |
| 2635 | |
| 2636 | // Now we have two lists that hold the loads and the stores. |
| 2637 | // Next, we find the pointers that they use. |
| 2638 | |
| 2639 | // Check if we see any stores. If there are no stores, then we don't |
| 2640 | // care if the pointers are *restrict*. |
| 2641 | if (!Stores.size()) { |
| 2642 | LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n" ); |
| 2643 | return true; |
| 2644 | } |
| 2645 | |
| 2646 | MemoryDepChecker::DepCandidates DepCands; |
| 2647 | AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE, |
| 2648 | LoopAliasScopes); |
| 2649 | |
| 2650 | // Holds the analyzed pointers. We don't want to call getUnderlyingObjects |
| 2651 | // multiple times on the same object. If the ptr is accessed twice, once |
| 2652 | // for read and once for write, it will only appear once (on the write |
| 2653 | // list). This is okay, since we are going to check for conflicts between |
| 2654 | // writes and between reads and writes, but not between reads and reads. |
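|      | // E.g. (illustrative): for A[i] += 1, the store records the pointer of A[i]
|      | // first (the stores below are processed before the loads), so the later load
|      | // of the same pointer is not counted as a separate read-only pointer.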
| 2655 | SmallSet<std::pair<Value *, Type *>, 16> Seen; |
| 2656 | |
| 2657 | // Record uniform store addresses to identify if we have multiple stores |
| 2658 | // to the same address. |
| 2659 | SmallPtrSet<Value *, 16> UniformStores; |
| 2660 | |
| 2661 | for (StoreInst *ST : Stores) { |
| 2662 | Value *Ptr = ST->getPointerOperand(); |
| 2663 | |
| 2664 | if (isInvariant(V: Ptr)) { |
| 2665 | // Record store instructions to loop invariant addresses |
| 2666 | StoresToInvariantAddresses.push_back(Elt: ST); |
| 2667 | HasStoreStoreDependenceInvolvingLoopInvariantAddress |= |
| 2668 | !UniformStores.insert(Ptr).second; |
| 2669 | } |
| 2670 | |
| 2671 | // If we did *not* see this pointer before, insert it to the read-write |
| 2672 | // list. At this phase it is only a 'write' list. |
| 2673 | Type *AccessTy = getLoadStoreType(I: ST); |
| 2674 | if (Seen.insert(V: {Ptr, AccessTy}).second) { |
| 2675 | ++NumReadWrites; |
| 2676 | |
| 2677 | MemoryLocation Loc = MemoryLocation::get(SI: ST); |
| 2678 | // The TBAA metadata could have a control dependency on the predication |
| 2679 | // condition, so we cannot rely on it when determining whether or not we |
| 2680 | // need runtime pointer checks. |
| 2681 | if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT)) |
| 2682 | Loc.AATags.TBAA = nullptr; |
| 2683 | |
| 2684 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
| 2685 | AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) { |
| 2686 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
| 2687 | Accesses.addStore(Loc: NewLoc, AccessTy); |
| 2688 | }); |
| 2689 | } |
| 2690 | } |
| 2691 | |
| 2692 | if (IsAnnotatedParallel) { |
| 2693 | LLVM_DEBUG( |
| 2694 | dbgs() << "LAA: A loop annotated parallel, ignore memory dependency " |
| 2695 | << "checks.\n" ); |
| 2696 | return true; |
| 2697 | } |
| 2698 | |
| 2699 | for (LoadInst *LD : Loads) { |
| 2700 | Value *Ptr = LD->getPointerOperand(); |
| 2701 | // If we did *not* see this pointer before, insert it into the
| 2702 | // read list. If we *did* see it before, then it is already in
| 2703 | // the read-write list. This allows us to vectorize expressions
| 2704 | // such as A[i] += x, because the address of A[i] is a read-write
| 2705 | // pointer. This only works if the index of A[i] is consecutive.
| 2706 | // If the index is unknown (for example A[B[i]]), then we may
| 2707 | // read a few words, modify, and write a few words, and some of the
| 2708 | // words may be written to the same address.
| 2709 | bool IsReadOnlyPtr = false; |
| 2710 | Type *AccessTy = getLoadStoreType(I: LD); |
| 2711 | if (Seen.insert(V: {Ptr, AccessTy}).second || |
| 2712 | !getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, DT: *DT, StridesMap: SymbolicStrides, Assume: false, |
| 2713 | ShouldCheckWrap: true)) { |
| 2714 | ++NumReads; |
| 2715 | IsReadOnlyPtr = true; |
| 2716 | } |
| 2717 | |
| 2718 | // See if there is an unsafe dependency between a load from a uniform
| 2719 | // address and a store to the same uniform address.
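|      | // Illustrative example of such a pattern (hypothetical C loop):
|      | //   for (i = 0; i < n; i++) { s = *p; *p = s + A[i]; }   // p loop-invariant
|      | // The load of *p depends on the store to *p from the previous iteration.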
| 2720 | if (UniformStores.contains(Ptr)) { |
| 2721 | LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform " |
| 2722 | "load and uniform store to the same address!\n" ); |
| 2723 | HasLoadStoreDependenceInvolvingLoopInvariantAddress = true; |
| 2724 | } |
| 2725 | |
| 2726 | MemoryLocation Loc = MemoryLocation::get(LI: LD); |
| 2727 | // The TBAA metadata could have a control dependency on the predication |
| 2728 | // condition, so we cannot rely on it when determining whether or not we |
| 2729 | // need runtime pointer checks. |
| 2730 | if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT)) |
| 2731 | Loc.AATags.TBAA = nullptr; |
| 2732 | |
| 2733 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
| 2734 | AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) { |
| 2735 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
| 2736 | Accesses.addLoad(Loc: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr); |
| 2737 | }); |
| 2738 | } |
| 2739 | |
| 2740 | // If we write (or read-write) to a single destination and there are no
| 2741 | // other reads in this loop, then it is safe to vectorize.
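|      | // E.g. (illustrative), a memset-like loop that only executes A[i] = 0 has a
|      | // single write-only pointer and no reads, so neither dependence nor runtime
|      | // checks are needed.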
| 2742 | if (NumReadWrites == 1 && NumReads == 0) { |
| 2743 | LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n" ); |
| 2744 | return true; |
| 2745 | } |
| 2746 | |
| 2747 | // Build dependence sets and check whether we need a runtime pointer bounds |
| 2748 | // check. |
| 2749 | Accesses.buildDependenceSets(); |
| 2750 | |
| 2751 | // Find pointers with computable bounds. We are going to use this information |
| 2752 | // to place a runtime bound check. |
| 2753 | Value *UncomputablePtr = nullptr; |
| 2754 | HasCompletePtrRtChecking = Accesses.canCheckPtrAtRT( |
| 2755 | RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, UncomputablePtr, AllowPartial); |
| 2756 | if (!HasCompletePtrRtChecking) { |
| 2757 | const auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr); |
| 2758 | recordAnalysis(RemarkName: "CantIdentifyArrayBounds" , Instr: I) |
| 2759 | << "cannot identify array bounds" ; |
| 2760 | LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " |
| 2761 | << "the array bounds.\n" ); |
| 2762 | return false; |
| 2763 | } |
| 2764 | |
| 2765 | LLVM_DEBUG( |
| 2766 | dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n" ); |
| 2767 | |
| 2768 | bool DepsAreSafe = true; |
| 2769 | if (Accesses.isDependencyCheckNeeded()) { |
| 2770 | LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n" ); |
| 2771 | DepsAreSafe = |
| 2772 | DepChecker->areDepsSafe(DepCands, CheckDeps: Accesses.getDependenciesToCheck()); |
| 2773 | |
| 2774 | if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeChecks()) { |
| 2775 | LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n" ); |
| 2776 | |
| 2777 | // Clear the dependency checks. We assume they are not needed. |
| 2778 | Accesses.resetDepChecks(DepChecker&: *DepChecker); |
| 2779 | |
| 2780 | PtrRtChecking->reset(); |
| 2781 | PtrRtChecking->Need = true; |
| 2782 | |
| 2783 | UncomputablePtr = nullptr; |
| 2784 | HasCompletePtrRtChecking = |
| 2785 | Accesses.canCheckPtrAtRT(RtCheck&: *PtrRtChecking, TheLoop, StridesMap: SymbolicStrides, |
| 2786 | UncomputablePtr, AllowPartial); |
| 2787 | |
| 2788 | // Check that we found the bounds for the pointer. |
| 2789 | if (!HasCompletePtrRtChecking) { |
| 2790 | auto *I = dyn_cast_or_null<Instruction>(Val: UncomputablePtr); |
| 2791 | recordAnalysis(RemarkName: "CantCheckMemDepsAtRunTime" , Instr: I) |
| 2792 | << "cannot check memory dependencies at runtime" ; |
| 2793 | LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n" ); |
| 2794 | return false; |
| 2795 | } |
| 2796 | DepsAreSafe = true; |
| 2797 | } |
| 2798 | } |
| 2799 | |
| 2800 | if (HasConvergentOp) { |
| 2801 | recordAnalysis(RemarkName: "CantInsertRuntimeCheckWithConvergent" ) |
| 2802 | << "cannot add control dependency to convergent operation" ; |
| 2803 | LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check " |
| 2804 | "would be needed with a convergent operation\n" ); |
| 2805 | return false; |
| 2806 | } |
| 2807 | |
| 2808 | if (DepsAreSafe) { |
| 2809 | LLVM_DEBUG( |
| 2810 | dbgs() << "LAA: No unsafe dependent memory operations in loop. We" |
| 2811 | << (PtrRtChecking->Need ? "" : " don't" ) |
| 2812 | << " need runtime memory checks.\n" ); |
| 2813 | return true; |
| 2814 | } |
| 2815 | |
| 2816 | emitUnsafeDependenceRemark(); |
| 2817 | return false; |
| 2818 | } |
| 2819 | |
| 2820 | void LoopAccessInfo::emitUnsafeDependenceRemark() {
| 2821 | const auto *Deps = getDepChecker().getDependences(); |
| 2822 | if (!Deps) |
| 2823 | return; |
| 2824 | const auto *Found = |
| 2825 | llvm::find_if(Range: *Deps, P: [](const MemoryDepChecker::Dependence &D) { |
| 2826 | return MemoryDepChecker::Dependence::isSafeForVectorization(Type: D.Type) != |
| 2827 | MemoryDepChecker::VectorizationSafetyStatus::Safe; |
| 2828 | }); |
| 2829 | if (Found == Deps->end()) |
| 2830 | return; |
| 2831 | MemoryDepChecker::Dependence Dep = *Found; |
| 2832 | |
| 2833 | LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n" ); |
| 2834 | |
| 2835 | // Emit remark for first unsafe dependence |
| 2836 | bool HasForcedDistribution = false; |
| 2837 | std::optional<const MDOperand *> Value = |
| 2838 | findStringMetadataForLoop(TheLoop, Name: "llvm.loop.distribute.enable" ); |
| 2839 | if (Value) { |
| 2840 | const MDOperand *Op = *Value; |
| 2841 | assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata" ); |
| 2842 | HasForcedDistribution = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue(); |
| 2843 | } |
| 2844 | |
| 2845 | const std::string Info = |
| 2846 | HasForcedDistribution |
| 2847 | ? "unsafe dependent memory operations in loop." |
| 2848 | : "unsafe dependent memory operations in loop. Use " |
| 2849 | "#pragma clang loop distribute(enable) to allow loop distribution " |
| 2850 | "to attempt to isolate the offending operations into a separate " |
| 2851 | "loop" ; |
| 2852 | OptimizationRemarkAnalysis &R = |
| 2853 | recordAnalysis(RemarkName: "UnsafeDep" , Instr: Dep.getDestination(DepChecker: getDepChecker())) << Info; |
| 2854 | |
| 2855 | switch (Dep.Type) { |
| 2856 | case MemoryDepChecker::Dependence::NoDep: |
| 2857 | case MemoryDepChecker::Dependence::Forward: |
| 2858 | case MemoryDepChecker::Dependence::BackwardVectorizable: |
| 2859 | llvm_unreachable("Unexpected dependence" ); |
| 2860 | case MemoryDepChecker::Dependence::Backward: |
| 2861 | R << "\nBackward loop carried data dependence." ; |
| 2862 | break; |
| 2863 | case MemoryDepChecker::Dependence::ForwardButPreventsForwarding: |
| 2864 | R << "\nForward loop carried data dependence that prevents " |
| 2865 | "store-to-load forwarding." ; |
| 2866 | break; |
| 2867 | case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding: |
| 2868 | R << "\nBackward loop carried data dependence that prevents " |
| 2869 | "store-to-load forwarding." ; |
| 2870 | break; |
| 2871 | case MemoryDepChecker::Dependence::IndirectUnsafe: |
| 2872 | R << "\nUnsafe indirect dependence." ; |
| 2873 | break; |
| 2874 | case MemoryDepChecker::Dependence::Unknown: |
| 2875 | R << "\nUnknown data dependence." ; |
| 2876 | break; |
| 2877 | } |
| 2878 | |
| 2879 | if (Instruction *I = Dep.getSource(DepChecker: getDepChecker())) { |
| 2880 | DebugLoc SourceLoc = I->getDebugLoc(); |
| 2881 | if (auto *DD = dyn_cast_or_null<Instruction>(Val: getPointerOperand(V: I))) |
| 2882 | SourceLoc = DD->getDebugLoc(); |
| 2883 | if (SourceLoc) |
| 2884 | R << " Memory location is the same as accessed at " |
| 2885 | << ore::NV("Location" , SourceLoc); |
| 2886 | } |
| 2887 | } |
| 2888 | |
| 2889 | bool LoopAccessInfo::blockNeedsPredication(const BasicBlock *BB, |
| 2890 | const Loop *TheLoop, |
| 2891 | const DominatorTree *DT) { |
| 2892 | assert(TheLoop->contains(BB) && "Unknown block used" ); |
| 2893 | |
| 2894 | // Blocks that do not dominate the latch need predication. |
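|      | // For example (illustrative), in a loop body containing
|      | //   if (c[i]) A[i] = 0;
|      | // the block performing the conditional store does not dominate the latch and
|      | // therefore needs predication when vectorizing.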
| 2895 | const BasicBlock *Latch = TheLoop->getLoopLatch(); |
| 2896 | return !DT->dominates(A: BB, B: Latch); |
| 2897 | } |
| 2898 | |
| 2899 | OptimizationRemarkAnalysis & |
| 2900 | LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
| 2901 | assert(!Report && "Multiple reports generated" ); |
| 2902 | |
| 2903 | const BasicBlock *CodeRegion = TheLoop->getHeader(); |
| 2904 | DebugLoc DL = TheLoop->getStartLoc(); |
| 2905 | |
| 2906 | if (I) { |
| 2907 | CodeRegion = I->getParent(); |
| 2908 | // If there is no debug location attached to the instruction, fall back to
| 2909 | // using the loop's.
| 2910 | if (I->getDebugLoc()) |
| 2911 | DL = I->getDebugLoc(); |
| 2912 | } |
| 2913 | |
| 2914 | Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, args&: RemarkName, |
| 2915 | args&: DL, args&: CodeRegion); |
| 2916 | return *Report; |
| 2917 | } |
| 2918 | |
| 2919 | bool LoopAccessInfo::isInvariant(Value *V) const { |
| 2920 | auto *SE = PSE->getSE(); |
| 2921 | if (TheLoop->isLoopInvariant(V)) |
| 2922 | return true; |
| 2923 | if (!SE->isSCEVable(Ty: V->getType())) |
| 2924 | return false; |
| 2925 | const SCEV *S = SE->getSCEV(V); |
| 2926 | return SE->isLoopInvariant(S, L: TheLoop); |
| 2927 | } |
| 2928 | |
| 2929 | /// If \p Ptr is a GEP with exactly one loop-variant operand, return that
| 2930 | /// operand. Otherwise, return \p Ptr.
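|      | /// For example (illustrative), for `getelementptr %A, %iv` with %A
|      | /// loop-invariant and %iv the induction variable, %iv is returned.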
| 2931 | static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE, |
| 2932 | Loop *Lp) { |
| 2933 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
| 2934 | if (!GEP) |
| 2935 | return Ptr; |
| 2936 | |
| 2937 | Value *V = Ptr; |
| 2938 | for (const Use &U : GEP->operands()) { |
| 2939 | if (!SE->isLoopInvariant(S: SE->getSCEV(V: U), L: Lp)) { |
| 2940 | if (V == Ptr) |
| 2941 | V = U; |
| 2942 | else |
| 2943 | // There must be exactly one loop-variant operand. |
| 2944 | return Ptr; |
| 2945 | } |
| 2946 | } |
| 2947 | return V; |
| 2948 | } |
| 2949 | |
| 2950 | /// Get the stride of a pointer access in a loop. Looks for symbolic |
| 2951 | /// strides "a[i*stride]". Returns the symbolic stride, or null otherwise. |
| 2952 | static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { |
| 2953 | auto *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType()); |
| 2954 | if (!PtrTy) |
| 2955 | return nullptr; |
| 2956 | |
| 2957 | // Try to strip a GEP instruction to make the pointer (actually the index at
| 2958 | // this point) easier to analyze. If OrigPtr is equal to Ptr, we are analyzing
| 2959 | // the pointer; otherwise, we are analyzing the index.
| 2960 | Value *OrigPtr = Ptr; |
| 2961 | |
| 2962 | Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp); |
| 2963 | const SCEV *V = SE->getSCEV(V: Ptr); |
| 2964 | |
| 2965 | if (Ptr != OrigPtr) |
| 2966 | // Strip off casts. |
| 2967 | while (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V)) |
| 2968 | V = C->getOperand(); |
| 2969 | |
| 2970 | if (!match(S: V, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V), L: m_SpecificLoop(L: Lp)))) |
| 2971 | return nullptr; |
| 2972 | |
| 2973 | // Note that the restrictions after this loop invariant check are only
| 2974 | // profitability restrictions.
| 2975 | if (!SE->isLoopInvariant(S: V, L: Lp)) |
| 2976 | return nullptr; |
| 2977 | |
| 2978 | // Look for the loop invariant symbolic value. |
| 2979 | if (isa<SCEVUnknown>(Val: V)) |
| 2980 | return V; |
| 2981 | |
| 2982 | if (auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: V)) |
| 2983 | if (isa<SCEVUnknown>(Val: C->getOperand())) |
| 2984 | return V; |
| 2985 | |
| 2986 | return nullptr; |
| 2987 | } |
| 2988 | |
| 2989 | void LoopAccessInfo::collectStridedAccess(Value *MemAccess) { |
| 2990 | Value *Ptr = getLoadStorePointerOperand(V: MemAccess); |
| 2991 | if (!Ptr) |
| 2992 | return; |
| 2993 | |
| 2994 | // Note: getStrideFromPointer is a *profitability* heuristic. We |
| 2995 | // could broaden the scope of values returned here - to anything |
| 2996 | // which happens to be loop invariant and contributes to the |
| 2997 | // computation of an interesting IV - but we chose not to as we |
| 2998 | // don't have a cost model here, and broadening the scope exposes |
| 2999 | // far too many unprofitable cases. |
| 3000 | const SCEV *StrideExpr = getStrideFromPointer(Ptr, SE: PSE->getSE(), Lp: TheLoop); |
| 3001 | if (!StrideExpr) |
| 3002 | return; |
| 3003 | |
| 3004 | if (match(S: StrideExpr, P: m_scev_UndefOrPoison())) |
| 3005 | return; |
| 3006 | |
| 3007 | LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for " |
| 3008 | "versioning:" ); |
| 3009 | LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n" ); |
| 3010 | |
| 3011 | if (!SpeculateUnitStride) { |
| 3012 | LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n" ); |
| 3013 | return; |
| 3014 | } |
| 3015 | |
| 3016 | // Avoid adding the "Stride == 1" predicate when we know that |
| 3017 | // Stride >= Trip-Count. Such a predicate will effectively optimize a single |
| 3018 | // or zero iteration loop, as Trip-Count <= Stride == 1. |
| 3019 | // |
| 3020 | // TODO: We are currently not making a very informed decision on when it is |
| 3021 | // beneficial to apply stride versioning. It might make more sense that the |
| 3022 | // users of this analysis (such as the vectorizer) will trigger it, based on |
| 3023 | // their specific cost considerations; For example, in cases where stride |
| 3024 | // versioning does not help resolving memory accesses/dependences, the |
| 3025 | // vectorizer should evaluate the cost of the runtime test, and the benefit |
| 3026 | // of various possible stride specializations, considering the alternatives |
| 3027 | // of using gather/scatters (if available). |
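|      | // Illustrative sketch of the guard below: if SCEV can prove
|      | // Stride >= TripCount (i.e. Stride - MaxBTC is known positive), then a loop
|      | // version specialized for Stride == 1 could execute at most one iteration,
|      | // so adding the runtime check is not worthwhile.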
| 3028 | |
| 3029 | const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount(); |
| 3030 | |
| 3031 | // Match the types so we can compare the stride and the MaxBTC. |
| 3032 | // The Stride can be positive/negative, so we sign extend Stride; |
| 3033 | // The backedgeTakenCount is non-negative, so we zero extend MaxBTC. |
| 3034 | const DataLayout &DL = TheLoop->getHeader()->getDataLayout(); |
| 3035 | uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(Ty: StrideExpr->getType()); |
| 3036 | uint64_t BETypeSizeBits = DL.getTypeSizeInBits(Ty: MaxBTC->getType()); |
| 3037 | const SCEV *CastedStride = StrideExpr; |
| 3038 | const SCEV *CastedBECount = MaxBTC; |
| 3039 | ScalarEvolution *SE = PSE->getSE(); |
| 3040 | if (BETypeSizeBits >= StrideTypeSizeBits) |
| 3041 | CastedStride = SE->getNoopOrSignExtend(V: StrideExpr, Ty: MaxBTC->getType()); |
| 3042 | else |
| 3043 | CastedBECount = SE->getZeroExtendExpr(Op: MaxBTC, Ty: StrideExpr->getType()); |
| 3044 | const SCEV *StrideMinusBETaken = SE->getMinusSCEV(LHS: CastedStride, RHS: CastedBECount); |
| 3045 | // Since TripCount == BackEdgeTakenCount + 1, checking: |
| 3046 | // "Stride >= TripCount" is equivalent to checking: |
| 3047 | // Stride - MaxBTC > 0.
| 3048 | if (SE->isKnownPositive(S: StrideMinusBETaken)) { |
| 3049 | LLVM_DEBUG( |
| 3050 | dbgs() << "LAA: Stride>=TripCount; No point in versioning as the " |
| 3051 | "Stride==1 predicate will imply that the loop executes " |
| 3052 | "at most once.\n" ); |
| 3053 | return; |
| 3054 | } |
| 3055 | LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n" ); |
| 3056 | |
| 3057 | // Strip back off the integer cast, and check that our result is a |
| 3058 | // SCEVUnknown as we expect. |
| 3059 | const SCEV *StrideBase = StrideExpr; |
| 3060 | if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(Val: StrideBase)) |
| 3061 | StrideBase = C->getOperand(); |
| 3062 | SymbolicStrides[Ptr] = cast<SCEVUnknown>(Val: StrideBase); |
| 3063 | } |
| 3064 | |
| 3065 | LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, |
| 3066 | const TargetTransformInfo *TTI, |
| 3067 | const TargetLibraryInfo *TLI, AAResults *AA, |
| 3068 | DominatorTree *DT, LoopInfo *LI, |
| 3069 | AssumptionCache *AC, bool AllowPartial) |
| 3070 | : PSE(std::make_unique<PredicatedScalarEvolution>(args&: *SE, args&: *L)), |
| 3071 | PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) { |
| 3072 | unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max(); |
| 3073 | if (TTI && !TTI->enableScalableVectorization()) |
| 3074 | // Scale the vector width by 2 as a rough estimate to also account for
| 3075 | // interleaving.
| 3076 | MaxTargetVectorWidthInBits = |
| 3077 | TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) * 2; |
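|      | // E.g. (illustrative), a target reporting 256-bit fixed-width vector
|      | // registers and no scalable vectorization yields a width of 512 bits here.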
| 3078 | |
| 3079 | DepChecker = std::make_unique<MemoryDepChecker>( |
| 3080 | args&: *PSE, args&: AC, args&: DT, args&: L, args&: SymbolicStrides, args&: MaxTargetVectorWidthInBits, args&: LoopGuards); |
| 3081 | PtrRtChecking = |
| 3082 | std::make_unique<RuntimePointerChecking>(args&: *DepChecker, args&: SE, args&: LoopGuards); |
| 3083 | if (canAnalyzeLoop()) |
| 3084 | CanVecMem = analyzeLoop(AA, LI, TLI, DT); |
| 3085 | } |
| 3086 | |
| 3087 | void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { |
| 3088 | if (CanVecMem) { |
| 3089 | OS.indent(NumSpaces: Depth) << "Memory dependences are safe" ; |
| 3090 | const MemoryDepChecker &DC = getDepChecker(); |
| 3091 | if (!DC.isSafeForAnyVectorWidth()) |
| 3092 | OS << " with a maximum safe vector width of " |
| 3093 | << DC.getMaxSafeVectorWidthInBits() << " bits" ; |
| 3094 | if (!DC.isSafeForAnyStoreLoadForwardDistances()) { |
| 3095 | uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits(); |
| 3096 | OS << ", with a maximum safe store-load forward width of " << SLDist |
| 3097 | << " bits" ; |
| 3098 | } |
| 3099 | if (PtrRtChecking->Need) |
| 3100 | OS << " with run-time checks" ; |
| 3101 | OS << "\n" ; |
| 3102 | } |
| 3103 | |
| 3104 | if (HasConvergentOp) |
| 3105 | OS.indent(NumSpaces: Depth) << "Has convergent operation in loop\n" ; |
| 3106 | |
| 3107 | if (Report) |
| 3108 | OS.indent(NumSpaces: Depth) << "Report: " << Report->getMsg() << "\n" ; |
| 3109 | |
| 3110 | if (auto *Dependences = DepChecker->getDependences()) { |
| 3111 | OS.indent(NumSpaces: Depth) << "Dependences:\n" ; |
| 3112 | for (const auto &Dep : *Dependences) { |
| 3113 | Dep.print(OS, Depth: Depth + 2, Instrs: DepChecker->getMemoryInstructions()); |
| 3114 | OS << "\n" ; |
| 3115 | } |
| 3116 | } else |
| 3117 | OS.indent(NumSpaces: Depth) << "Too many dependences, not recorded\n" ; |
| 3118 | |
| 3119 | // List the pairs of accesses that need run-time checks to prove independence.
| 3120 | PtrRtChecking->print(OS, Depth); |
| 3121 | if (PtrRtChecking->Need && !HasCompletePtrRtChecking) |
| 3122 | OS.indent(NumSpaces: Depth) << "Generated run-time checks are incomplete\n" ; |
| 3123 | OS << "\n" ; |
| 3124 | |
| 3125 | OS.indent(NumSpaces: Depth) |
| 3126 | << "Non vectorizable stores to invariant address were " |
| 3127 | << (HasStoreStoreDependenceInvolvingLoopInvariantAddress || |
| 3128 | HasLoadStoreDependenceInvolvingLoopInvariantAddress |
| 3129 | ? "" |
| 3130 | : "not " ) |
| 3131 | << "found in loop.\n" ; |
| 3132 | |
| 3133 | OS.indent(NumSpaces: Depth) << "SCEV assumptions:\n" ; |
| 3134 | PSE->getPredicate().print(OS, Depth); |
| 3135 | |
| 3136 | OS << "\n" ; |
| 3137 | |
| 3138 | OS.indent(NumSpaces: Depth) << "Expressions re-written:\n" ; |
| 3139 | PSE->print(OS, Depth); |
| 3140 | } |
| 3141 | |
| 3142 | const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L, |
| 3143 | bool AllowPartial) { |
| 3144 | const auto &[It, Inserted] = LoopAccessInfoMap.try_emplace(Key: &L); |
| 3145 | |
| 3146 | // We need to create the LoopAccessInfo if either we don't already have one, |
| 3147 | // or if it was created with a different value of AllowPartial. |
| 3148 | if (Inserted || It->second->hasAllowPartial() != AllowPartial) |
| 3149 | It->second = std::make_unique<LoopAccessInfo>(args: &L, args: &SE, args&: TTI, args&: TLI, args: &AA, args: &DT, |
| 3150 | args: &LI, args&: AC, args&: AllowPartial); |
| 3151 | |
| 3152 | return *It->second; |
| 3153 | } |
| 3154 | void LoopAccessInfoManager::clear() { |
| 3155 | // Collect LoopAccessInfo entries that may keep references to IR outside the |
| 3156 | // analyzed loop, or SCEVs that may have been modified or invalidated. At the
| 3157 | // moment, those are loops requiring memory or SCEV runtime checks, as they
| 3158 | // cache SCEVs, e.g. for pointer expressions.
| 3159 | for (const auto &[L, LAI] : LoopAccessInfoMap) { |
| 3160 | if (LAI->getRuntimePointerChecking()->getChecks().empty() && |
| 3161 | LAI->getPSE().getPredicate().isAlwaysTrue()) |
| 3162 | continue; |
| 3163 | LoopAccessInfoMap.erase(Val: L); |
| 3164 | } |
| 3165 | } |
| 3166 | |
| 3167 | bool LoopAccessInfoManager::invalidate( |
| 3168 | Function &F, const PreservedAnalyses &PA, |
| 3169 | FunctionAnalysisManager::Invalidator &Inv) { |
| 3170 | // Check whether our analysis is preserved. |
| 3171 | auto PAC = PA.getChecker<LoopAccessAnalysis>(); |
| 3172 | if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>()) |
| 3173 | // If not, give up now. |
| 3174 | return true; |
| 3175 | |
| 3176 | // Check whether the analyses we depend on became invalid for any reason. |
| 3177 | // Skip checking TargetLibraryAnalysis as it is immutable and can't become |
| 3178 | // invalid. |
| 3179 | return Inv.invalidate<AAManager>(IR&: F, PA) || |
| 3180 | Inv.invalidate<ScalarEvolutionAnalysis>(IR&: F, PA) || |
| 3181 | Inv.invalidate<LoopAnalysis>(IR&: F, PA) || |
| 3182 | Inv.invalidate<DominatorTreeAnalysis>(IR&: F, PA); |
| 3183 | } |
| 3184 | |
| 3185 | LoopAccessInfoManager LoopAccessAnalysis::run(Function &F, |
| 3186 | FunctionAnalysisManager &FAM) { |
| 3187 | auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
| 3188 | auto &AA = FAM.getResult<AAManager>(IR&: F); |
| 3189 | auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F); |
| 3190 | auto &LI = FAM.getResult<LoopAnalysis>(IR&: F); |
| 3191 | auto &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F); |
| 3192 | auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
| 3193 | auto &AC = FAM.getResult<AssumptionAnalysis>(IR&: F); |
| 3194 | return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC); |
| 3195 | } |
| 3196 | |
| 3197 | AnalysisKey LoopAccessAnalysis::Key; |
| 3198 | |