| 1 | //===-------- LoopIdiomVectorize.cpp - Loop idiom vectorization -----------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This pass implements a pass that recognizes certain loop idioms and |
| 10 | // transforms them into more optimized versions of the same loop. In cases |
| 11 | // where this happens, it can be a significant performance win. |
| 12 | // |
| 13 | // We currently support two loops: |
| 14 | // |
| 15 | // 1. A loop that finds the first mismatched byte in an array and returns the |
| 16 | // index, i.e. something like: |
| 17 | // |
| 18 | // while (++i != n) { |
| 19 | // if (a[i] != b[i]) |
| 20 | // break; |
| 21 | // } |
| 22 | // |
| 23 | // In this example we can actually vectorize the loop despite the early exit, |
| 24 | // although the loop vectorizer does not support it. It requires some extra |
| 25 | // checks to deal with the possibility of faulting loads when crossing page |
| 26 | // boundaries. However, even with these checks it is still profitable to do the |
| 27 | // transformation. |
| 28 | // |
| 29 | // TODO List: |
| 30 | // |
| 31 | // * Add support for the inverse case where we scan for a matching element. |
| 32 | // * Permit 64-bit induction variable types. |
| 33 | // * Recognize loops that increment the IV *after* comparing bytes. |
| 34 | // * Allow 32-bit sign-extends of the IV used by the GEP. |
| 35 | // |
| 36 | // 2. A loop that finds the first matching character in an array among a set of |
| 37 | // possible matches, e.g.: |
| 38 | // |
| 39 | // for (; first != last; ++first) |
| 40 | // for (s_it = s_first; s_it != s_last; ++s_it) |
| 41 | // if (*first == *s_it) |
| 42 | // return first; |
| 43 | // return last; |
| 44 | // |
| 45 | // This corresponds to std::find_first_of (for arrays of bytes) from the C++ |
| 46 | // standard library. This function can be implemented efficiently for targets |
| 47 | // that support @llvm.experimental.vector.match. For example, on AArch64 targets |
| 48 | // that implement SVE2, this lower to a MATCH instruction, which enables us to |
| 49 | // perform up to 16x16=256 comparisons in one go. This can lead to very |
| 50 | // significant speedups. |
| 51 | // |
| 52 | // TODO: |
| 53 | // |
| 54 | // * Add support for `find_first_not_of' loops (i.e. with not-equal comparison). |
| 55 | // * Make VF a configurable parameter (right now we assume 128-bit vectors). |
| 56 | // * Potentially adjust the cost model to let the transformation kick-in even if |
| 57 | // @llvm.experimental.vector.match doesn't have direct support in hardware. |
| 58 | // |
| 59 | //===----------------------------------------------------------------------===// |
| 60 | // |
| 61 | // NOTE: This Pass matches really specific loop patterns because it's only |
| 62 | // supposed to be a temporary solution until our LoopVectorizer is powerful |
| 63 | // enough to vectorize them automatically. |
| 64 | // |
| 65 | //===----------------------------------------------------------------------===// |
| 66 | |
| 67 | #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" |
| 68 | #include "llvm/Analysis/DomTreeUpdater.h" |
| 69 | #include "llvm/Analysis/LoopPass.h" |
| 70 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 71 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 72 | #include "llvm/IR/Dominators.h" |
| 73 | #include "llvm/IR/IRBuilder.h" |
| 74 | #include "llvm/IR/Intrinsics.h" |
| 75 | #include "llvm/IR/MDBuilder.h" |
| 76 | #include "llvm/IR/PatternMatch.h" |
| 77 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| 78 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
| 79 | |
| 80 | using namespace llvm; |
| 81 | using namespace PatternMatch; |
| 82 | |
| 83 | #define DEBUG_TYPE "loop-idiom-vectorize" |
| 84 | |
| 85 | static cl::opt<bool> DisableAll("disable-loop-idiom-vectorize-all" , cl::Hidden, |
| 86 | cl::init(Val: false), |
| 87 | cl::desc("Disable Loop Idiom Vectorize Pass." )); |
| 88 | |
| 89 | static cl::opt<LoopIdiomVectorizeStyle> |
| 90 | LITVecStyle("loop-idiom-vectorize-style" , cl::Hidden, |
| 91 | cl::desc("The vectorization style for loop idiom transform." ), |
| 92 | cl::values(clEnumValN(LoopIdiomVectorizeStyle::Masked, "masked" , |
| 93 | "Use masked vector intrinsics" ), |
| 94 | clEnumValN(LoopIdiomVectorizeStyle::Predicated, |
| 95 | "predicated" , "Use VP intrinsics" )), |
| 96 | cl::init(Val: LoopIdiomVectorizeStyle::Masked)); |
| 97 | |
| 98 | static cl::opt<bool> |
| 99 | DisableByteCmp("disable-loop-idiom-vectorize-bytecmp" , cl::Hidden, |
| 100 | cl::init(Val: false), |
| 101 | cl::desc("Proceed with Loop Idiom Vectorize Pass, but do " |
| 102 | "not convert byte-compare loop(s)." )); |
| 103 | |
| 104 | static cl::opt<unsigned> |
| 105 | ByteCmpVF("loop-idiom-vectorize-bytecmp-vf" , cl::Hidden, |
| 106 | cl::desc("The vectorization factor for byte-compare patterns." ), |
| 107 | cl::init(Val: 16)); |
| 108 | |
| 109 | static cl::opt<bool> |
| 110 | DisableFindFirstByte("disable-loop-idiom-vectorize-find-first-byte" , |
| 111 | cl::Hidden, cl::init(Val: false), |
| 112 | cl::desc("Do not convert find-first-byte loop(s)." )); |
| 113 | |
| 114 | static cl::opt<bool> |
| 115 | VerifyLoops("loop-idiom-vectorize-verify" , cl::Hidden, cl::init(Val: false), |
| 116 | cl::desc("Verify loops generated Loop Idiom Vectorize Pass." )); |
| 117 | |
| 118 | namespace { |
| 119 | class LoopIdiomVectorize { |
| 120 | LoopIdiomVectorizeStyle VectorizeStyle; |
| 121 | unsigned ByteCompareVF; |
| 122 | Loop *CurLoop = nullptr; |
| 123 | DominatorTree *DT; |
| 124 | LoopInfo *LI; |
| 125 | const TargetTransformInfo *TTI; |
| 126 | const DataLayout *DL; |
| 127 | |
| 128 | /// Interface to emit optimization remarks. |
| 129 | OptimizationRemarkEmitter &ORE; |
| 130 | |
| 131 | // Blocks that will be used for inserting vectorized code. |
| 132 | BasicBlock *EndBlock = nullptr; |
| 133 | BasicBlock * = nullptr; |
| 134 | BasicBlock *VectorLoopStartBlock = nullptr; |
| 135 | BasicBlock *VectorLoopMismatchBlock = nullptr; |
| 136 | BasicBlock *VectorLoopIncBlock = nullptr; |
| 137 | |
| 138 | public: |
| 139 | LoopIdiomVectorize(LoopIdiomVectorizeStyle S, unsigned VF, DominatorTree *DT, |
| 140 | LoopInfo *LI, const TargetTransformInfo *TTI, |
| 141 | const DataLayout *DL, OptimizationRemarkEmitter &ORE) |
| 142 | : VectorizeStyle(S), ByteCompareVF(VF), DT(DT), LI(LI), TTI(TTI), DL(DL), |
| 143 | ORE(ORE) {} |
| 144 | |
| 145 | bool run(Loop *L); |
| 146 | |
| 147 | private: |
| 148 | /// \name Countable Loop Idiom Handling |
| 149 | /// @{ |
| 150 | |
| 151 | bool runOnCountableLoop(); |
| 152 | bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, |
| 153 | SmallVectorImpl<BasicBlock *> &ExitBlocks); |
| 154 | |
| 155 | bool recognizeByteCompare(); |
| 156 | |
| 157 | Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU, |
| 158 | GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, |
| 159 | Instruction *Index, Value *Start, Value *MaxLen); |
| 160 | |
| 161 | Value *createMaskedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU, |
| 162 | GetElementPtrInst *GEPA, |
| 163 | GetElementPtrInst *GEPB, Value *ExtStart, |
| 164 | Value *ExtEnd); |
| 165 | Value *createPredicatedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU, |
| 166 | GetElementPtrInst *GEPA, |
| 167 | GetElementPtrInst *GEPB, Value *ExtStart, |
| 168 | Value *ExtEnd); |
| 169 | |
| 170 | void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, |
| 171 | PHINode *IndPhi, Value *MaxLen, Instruction *Index, |
| 172 | Value *Start, bool IncIdx, BasicBlock *FoundBB, |
| 173 | BasicBlock *EndBB); |
| 174 | |
| 175 | bool recognizeFindFirstByte(); |
| 176 | |
| 177 | Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU, |
| 178 | unsigned VF, Type *CharTy, Value *IndPhi, |
| 179 | BasicBlock *ExitSucc, BasicBlock *ExitFail, |
| 180 | Value *SearchStart, Value *SearchEnd, |
| 181 | Value *NeedleStart, Value *NeedleEnd); |
| 182 | |
| 183 | void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy, |
| 184 | BasicBlock *ExitSucc, BasicBlock *ExitFail, |
| 185 | Value *SearchStart, Value *SearchEnd, |
| 186 | Value *NeedleStart, Value *NeedleEnd); |
| 187 | /// @} |
| 188 | }; |
| 189 | } // anonymous namespace |
| 190 | |
| 191 | PreservedAnalyses LoopIdiomVectorizePass::run(Loop &L, LoopAnalysisManager &AM, |
| 192 | LoopStandardAnalysisResults &AR, |
| 193 | LPMUpdater &) { |
| 194 | if (DisableAll) |
| 195 | return PreservedAnalyses::all(); |
| 196 | |
| 197 | const auto *DL = &L.getHeader()->getDataLayout(); |
| 198 | |
| 199 | LoopIdiomVectorizeStyle VecStyle = VectorizeStyle; |
| 200 | if (LITVecStyle.getNumOccurrences()) |
| 201 | VecStyle = LITVecStyle; |
| 202 | |
| 203 | unsigned BCVF = ByteCompareVF; |
| 204 | if (ByteCmpVF.getNumOccurrences()) |
| 205 | BCVF = ByteCmpVF; |
| 206 | |
| 207 | Function &F = *L.getHeader()->getParent(); |
| 208 | auto &FAMP = AM.getResult<FunctionAnalysisManagerLoopProxy>(IR&: L, ExtraArgs&: AR); |
| 209 | auto *ORE = FAMP.getCachedResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
| 210 | |
| 211 | std::optional<OptimizationRemarkEmitter> ORELocal; |
| 212 | if (!ORE) { |
| 213 | ORELocal.emplace(args: &F); |
| 214 | ORE = &*ORELocal; |
| 215 | } |
| 216 | |
| 217 | LoopIdiomVectorize LIV(VecStyle, BCVF, &AR.DT, &AR.LI, &AR.TTI, DL, *ORE); |
| 218 | if (!LIV.run(L: &L)) |
| 219 | return PreservedAnalyses::all(); |
| 220 | |
| 221 | return PreservedAnalyses::none(); |
| 222 | } |
| 223 | |
| 224 | //===----------------------------------------------------------------------===// |
| 225 | // |
| 226 | // Implementation of LoopIdiomVectorize |
| 227 | // |
| 228 | //===----------------------------------------------------------------------===// |
| 229 | |
| 230 | bool LoopIdiomVectorize::run(Loop *L) { |
| 231 | CurLoop = L; |
| 232 | |
| 233 | Function &F = *L->getHeader()->getParent(); |
| 234 | if (DisableAll || F.hasOptSize()) |
| 235 | return false; |
| 236 | |
| 237 | // Bail if vectorization is disabled on loop. |
| 238 | LoopVectorizeHints Hints(L, /*InterleaveOnlyWhenForced=*/true, ORE); |
| 239 | if (!Hints.allowVectorization(F: &F, L, /*VectorizeOnlyWhenForced=*/false)) { |
| 240 | LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on " << L->getName() |
| 241 | << " due to vectorization hints\n" ); |
| 242 | return false; |
| 243 | } |
| 244 | |
| 245 | if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat)) { |
| 246 | LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on " << F.getName() |
| 247 | << " due to its NoImplicitFloat attribute" ); |
| 248 | return false; |
| 249 | } |
| 250 | |
| 251 | // If the loop could not be converted to canonical form, it must have an |
| 252 | // indirectbr in it, just give up. |
| 253 | if (!L->getLoopPreheader()) |
| 254 | return false; |
| 255 | |
| 256 | LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %" |
| 257 | << CurLoop->getHeader()->getName() << "\n" ); |
| 258 | |
| 259 | if (recognizeByteCompare()) |
| 260 | return true; |
| 261 | |
| 262 | if (recognizeFindFirstByte()) |
| 263 | return true; |
| 264 | |
| 265 | return false; |
| 266 | } |
| 267 | |
| 268 | static void fixSuccessorPhis(Loop *L, Value *ScalarRes, Value *VectorRes, |
| 269 | BasicBlock *SuccBB, BasicBlock *IncBB) { |
| 270 | for (PHINode &PN : SuccBB->phis()) { |
| 271 | // Look through the incoming values to find ScalarRes, meaning this is a |
| 272 | // PHI collecting the results of the transformation. |
| 273 | bool ResPhi = false; |
| 274 | for (Value *Op : PN.incoming_values()) |
| 275 | if (Op == ScalarRes) { |
| 276 | ResPhi = true; |
| 277 | break; |
| 278 | } |
| 279 | |
| 280 | // Any PHI that depended upon the result of the transformation needs a new |
| 281 | // incoming value from IncBB. |
| 282 | if (ResPhi) |
| 283 | PN.addIncoming(V: VectorRes, BB: IncBB); |
| 284 | else { |
| 285 | // There should be no other outside uses of other values in the |
| 286 | // original loop. Any incoming values should either: |
| 287 | // 1. Be for blocks outside the loop, which aren't interesting. Or .. |
| 288 | // 2. These are from blocks in the loop with values defined outside |
| 289 | // the loop. We should a similar incoming value from CmpBB. |
| 290 | for (BasicBlock *BB : PN.blocks()) |
| 291 | if (L->contains(BB)) { |
| 292 | PN.addIncoming(V: PN.getIncomingValueForBlock(BB), BB: IncBB); |
| 293 | break; |
| 294 | } |
| 295 | } |
| 296 | } |
| 297 | } |
| 298 | |
| 299 | bool LoopIdiomVectorize::recognizeByteCompare() { |
| 300 | // Currently the transformation only works on scalable vector types, although |
| 301 | // there is no fundamental reason why it cannot be made to work for fixed |
| 302 | // width too. |
| 303 | |
| 304 | // We also need to know the minimum page size for the target in order to |
| 305 | // generate runtime memory checks to ensure the vector version won't fault. |
| 306 | if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() || |
| 307 | DisableByteCmp) |
| 308 | return false; |
| 309 | |
| 310 | BasicBlock * = CurLoop->getHeader(); |
| 311 | |
| 312 | // In LoopIdiomVectorize::run we have already checked that the loop |
| 313 | // has a preheader so we can assume it's in a canonical form. |
| 314 | if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2) |
| 315 | return false; |
| 316 | |
| 317 | PHINode *PN = dyn_cast<PHINode>(Val: &Header->front()); |
| 318 | if (!PN || PN->getNumIncomingValues() != 2) |
| 319 | return false; |
| 320 | |
| 321 | auto LoopBlocks = CurLoop->getBlocks(); |
| 322 | // The first block in the loop should contain only 4 instructions, e.g. |
| 323 | // |
| 324 | // while.cond: |
| 325 | // %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ] |
| 326 | // %inc = add i32 %res.phi, 1 |
| 327 | // %cmp.not = icmp eq i32 %inc, %n |
| 328 | // br i1 %cmp.not, label %while.end, label %while.body |
| 329 | // |
| 330 | if (LoopBlocks[0]->sizeWithoutDebug() > 4) |
| 331 | return false; |
| 332 | |
| 333 | // The second block should contain 7 instructions, e.g. |
| 334 | // |
| 335 | // while.body: |
| 336 | // %idx = zext i32 %inc to i64 |
| 337 | // %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx |
| 338 | // %load.a = load i8, ptr %idx.a |
| 339 | // %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx |
| 340 | // %load.b = load i8, ptr %idx.b |
| 341 | // %cmp.not.ld = icmp eq i8 %load.a, %load.b |
| 342 | // br i1 %cmp.not.ld, label %while.cond, label %while.end |
| 343 | // |
| 344 | if (LoopBlocks[1]->sizeWithoutDebug() > 7) |
| 345 | return false; |
| 346 | |
| 347 | // The incoming value to the PHI node from the loop should be an add of 1. |
| 348 | Value *StartIdx = nullptr; |
| 349 | Instruction *Index = nullptr; |
| 350 | if (!CurLoop->contains(BB: PN->getIncomingBlock(i: 0))) { |
| 351 | StartIdx = PN->getIncomingValue(i: 0); |
| 352 | Index = dyn_cast<Instruction>(Val: PN->getIncomingValue(i: 1)); |
| 353 | } else { |
| 354 | StartIdx = PN->getIncomingValue(i: 1); |
| 355 | Index = dyn_cast<Instruction>(Val: PN->getIncomingValue(i: 0)); |
| 356 | } |
| 357 | |
| 358 | // Limit to 32-bit types for now |
| 359 | if (!Index || !Index->getType()->isIntegerTy(Bitwidth: 32) || |
| 360 | !match(V: Index, P: m_c_Add(L: m_Specific(V: PN), R: m_One()))) |
| 361 | return false; |
| 362 | |
| 363 | // If we match the pattern, PN and Index will be replaced with the result of |
| 364 | // the cttz.elts intrinsic. If any other instructions are used outside of |
| 365 | // the loop, we cannot replace it. |
| 366 | for (BasicBlock *BB : LoopBlocks) |
| 367 | for (Instruction &I : *BB) |
| 368 | if (&I != PN && &I != Index) |
| 369 | for (User *U : I.users()) |
| 370 | if (!CurLoop->contains(Inst: cast<Instruction>(Val: U))) |
| 371 | return false; |
| 372 | |
| 373 | // Match the branch instruction for the header |
| 374 | Value *MaxLen; |
| 375 | BasicBlock *EndBB, *WhileBB; |
| 376 | if (!match(V: Header->getTerminator(), |
| 377 | P: m_Br(C: m_SpecificICmp(MatchPred: ICmpInst::ICMP_EQ, L: m_Specific(V: Index), |
| 378 | R: m_Value(V&: MaxLen)), |
| 379 | T: m_BasicBlock(V&: EndBB), F: m_BasicBlock(V&: WhileBB))) || |
| 380 | !CurLoop->contains(BB: WhileBB)) |
| 381 | return false; |
| 382 | |
| 383 | // WhileBB should contain the pattern of load & compare instructions. Match |
| 384 | // the pattern and find the GEP instructions used by the loads. |
| 385 | BasicBlock *FoundBB; |
| 386 | BasicBlock *TrueBB; |
| 387 | Value *LoadA, *LoadB; |
| 388 | if (!match(V: WhileBB->getTerminator(), |
| 389 | P: m_Br(C: m_SpecificICmp(MatchPred: ICmpInst::ICMP_EQ, L: m_Value(V&: LoadA), |
| 390 | R: m_Value(V&: LoadB)), |
| 391 | T: m_BasicBlock(V&: TrueBB), F: m_BasicBlock(V&: FoundBB))) || |
| 392 | !CurLoop->contains(BB: TrueBB)) |
| 393 | return false; |
| 394 | |
| 395 | Value *A, *B; |
| 396 | if (!match(V: LoadA, P: m_Load(Op: m_Value(V&: A))) || !match(V: LoadB, P: m_Load(Op: m_Value(V&: B)))) |
| 397 | return false; |
| 398 | |
| 399 | LoadInst *LoadAI = cast<LoadInst>(Val: LoadA); |
| 400 | LoadInst *LoadBI = cast<LoadInst>(Val: LoadB); |
| 401 | if (!LoadAI->isSimple() || !LoadBI->isSimple()) |
| 402 | return false; |
| 403 | |
| 404 | GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(Val: A); |
| 405 | GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(Val: B); |
| 406 | |
| 407 | if (!GEPA || !GEPB) |
| 408 | return false; |
| 409 | |
| 410 | Value *PtrA = GEPA->getPointerOperand(); |
| 411 | Value *PtrB = GEPB->getPointerOperand(); |
| 412 | |
| 413 | // Check we are loading i8 values from two loop invariant pointers |
| 414 | if (!CurLoop->isLoopInvariant(V: PtrA) || !CurLoop->isLoopInvariant(V: PtrB) || |
| 415 | !GEPA->getResultElementType()->isIntegerTy(Bitwidth: 8) || |
| 416 | !GEPB->getResultElementType()->isIntegerTy(Bitwidth: 8) || |
| 417 | !LoadAI->getType()->isIntegerTy(Bitwidth: 8) || |
| 418 | !LoadBI->getType()->isIntegerTy(Bitwidth: 8) || PtrA == PtrB) |
| 419 | return false; |
| 420 | |
| 421 | // Check that the index to the GEPs is the index we found earlier |
| 422 | if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1) |
| 423 | return false; |
| 424 | |
| 425 | Value *IdxA = GEPA->getOperand(i_nocapture: GEPA->getNumIndices()); |
| 426 | Value *IdxB = GEPB->getOperand(i_nocapture: GEPB->getNumIndices()); |
| 427 | if (IdxA != IdxB || !match(V: IdxA, P: m_ZExt(Op: m_Specific(V: Index)))) |
| 428 | return false; |
| 429 | |
| 430 | // We only ever expect the pre-incremented index value to be used inside the |
| 431 | // loop. |
| 432 | if (!PN->hasOneUse()) |
| 433 | return false; |
| 434 | |
| 435 | // Ensure that when the Found and End blocks are identical the PHIs have the |
| 436 | // supported format. We don't currently allow cases like this: |
| 437 | // while.cond: |
| 438 | // ... |
| 439 | // br i1 %cmp.not, label %while.end, label %while.body |
| 440 | // |
| 441 | // while.body: |
| 442 | // ... |
| 443 | // br i1 %cmp.not2, label %while.cond, label %while.end |
| 444 | // |
| 445 | // while.end: |
| 446 | // %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ] |
| 447 | // |
| 448 | // Where the incoming values for %final_ptr are unique and from each of the |
| 449 | // loop blocks, but not actually defined in the loop. This requires extra |
| 450 | // work setting up the byte.compare block, i.e. by introducing a select to |
| 451 | // choose the correct value. |
| 452 | // TODO: We could add support for this in future. |
| 453 | if (FoundBB == EndBB) { |
| 454 | for (PHINode &EndPN : EndBB->phis()) { |
| 455 | Value *WhileCondVal = EndPN.getIncomingValueForBlock(BB: Header); |
| 456 | Value *WhileBodyVal = EndPN.getIncomingValueForBlock(BB: WhileBB); |
| 457 | |
| 458 | // The value of the index when leaving the while.cond block is always the |
| 459 | // same as the end value (MaxLen) so we permit either. The value when |
| 460 | // leaving the while.body block should only be the index. Otherwise for |
| 461 | // any other values we only allow ones that are same for both blocks. |
| 462 | if (WhileCondVal != WhileBodyVal && |
| 463 | ((WhileCondVal != Index && WhileCondVal != MaxLen) || |
| 464 | (WhileBodyVal != Index))) |
| 465 | return false; |
| 466 | } |
| 467 | } |
| 468 | |
| 469 | LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n" |
| 470 | << *(EndBB->getParent()) << "\n\n" ); |
| 471 | |
| 472 | // The index is incremented before the GEP/Load pair so we need to |
| 473 | // add 1 to the start value. |
| 474 | transformByteCompare(GEPA, GEPB, IndPhi: PN, MaxLen, Index, Start: StartIdx, /*IncIdx=*/true, |
| 475 | FoundBB, EndBB); |
| 476 | return true; |
| 477 | } |
| 478 | |
| 479 | Value *LoopIdiomVectorize::createMaskedFindMismatch( |
| 480 | IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, |
| 481 | GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) { |
| 482 | Type *I64Type = Builder.getInt64Ty(); |
| 483 | Type *ResType = Builder.getInt32Ty(); |
| 484 | Type *LoadType = Builder.getInt8Ty(); |
| 485 | Value *PtrA = GEPA->getPointerOperand(); |
| 486 | Value *PtrB = GEPB->getPointerOperand(); |
| 487 | |
| 488 | ScalableVectorType *PredVTy = |
| 489 | ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: ByteCompareVF); |
| 490 | |
| 491 | Value *InitialPred = Builder.CreateIntrinsic( |
| 492 | ID: Intrinsic::get_active_lane_mask, Types: {PredVTy, I64Type}, Args: {ExtStart, ExtEnd}); |
| 493 | |
| 494 | Value *VecLen = Builder.CreateVScale(Ty: I64Type); |
| 495 | VecLen = |
| 496 | Builder.CreateMul(LHS: VecLen, RHS: ConstantInt::get(Ty: I64Type, V: ByteCompareVF), Name: "" , |
| 497 | /*HasNUW=*/true, /*HasNSW=*/true); |
| 498 | |
| 499 | Value *PFalse = Builder.CreateVectorSplat(EC: PredVTy->getElementCount(), |
| 500 | V: Builder.getInt1(V: false)); |
| 501 | |
| 502 | BranchInst *JumpToVectorLoop = BranchInst::Create(IfTrue: VectorLoopStartBlock); |
| 503 | Builder.Insert(I: JumpToVectorLoop); |
| 504 | |
| 505 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, VectorLoopPreheaderBlock, |
| 506 | VectorLoopStartBlock}}); |
| 507 | |
| 508 | // Set up the first vector loop block by creating the PHIs, doing the vector |
| 509 | // loads and comparing the vectors. |
| 510 | Builder.SetInsertPoint(VectorLoopStartBlock); |
| 511 | PHINode *LoopPred = Builder.CreatePHI(Ty: PredVTy, NumReservedValues: 2, Name: "mismatch_vec_loop_pred" ); |
| 512 | LoopPred->addIncoming(V: InitialPred, BB: VectorLoopPreheaderBlock); |
| 513 | PHINode *VectorIndexPhi = Builder.CreatePHI(Ty: I64Type, NumReservedValues: 2, Name: "mismatch_vec_index" ); |
| 514 | VectorIndexPhi->addIncoming(V: ExtStart, BB: VectorLoopPreheaderBlock); |
| 515 | Type *VectorLoadType = |
| 516 | ScalableVectorType::get(ElementType: Builder.getInt8Ty(), MinNumElts: ByteCompareVF); |
| 517 | Value *Passthru = ConstantInt::getNullValue(Ty: VectorLoadType); |
| 518 | |
| 519 | Value *VectorLhsGep = |
| 520 | Builder.CreateGEP(Ty: LoadType, Ptr: PtrA, IdxList: VectorIndexPhi, Name: "" , NW: GEPA->isInBounds()); |
| 521 | Value *VectorLhsLoad = Builder.CreateMaskedLoad(Ty: VectorLoadType, Ptr: VectorLhsGep, |
| 522 | Alignment: Align(1), Mask: LoopPred, PassThru: Passthru); |
| 523 | |
| 524 | Value *VectorRhsGep = |
| 525 | Builder.CreateGEP(Ty: LoadType, Ptr: PtrB, IdxList: VectorIndexPhi, Name: "" , NW: GEPB->isInBounds()); |
| 526 | Value *VectorRhsLoad = Builder.CreateMaskedLoad(Ty: VectorLoadType, Ptr: VectorRhsGep, |
| 527 | Alignment: Align(1), Mask: LoopPred, PassThru: Passthru); |
| 528 | |
| 529 | Value *VectorMatchCmp = Builder.CreateICmpNE(LHS: VectorLhsLoad, RHS: VectorRhsLoad); |
| 530 | VectorMatchCmp = Builder.CreateSelect(C: LoopPred, True: VectorMatchCmp, False: PFalse); |
| 531 | Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(Src: VectorMatchCmp); |
| 532 | BranchInst *VectorEarlyExit = BranchInst::Create( |
| 533 | IfTrue: VectorLoopMismatchBlock, IfFalse: VectorLoopIncBlock, Cond: VectorMatchHasActiveLanes); |
| 534 | Builder.Insert(I: VectorEarlyExit); |
| 535 | |
| 536 | DTU.applyUpdates( |
| 537 | Updates: {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock}, |
| 538 | {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}}); |
| 539 | |
| 540 | // Increment the index counter and calculate the predicate for the next |
| 541 | // iteration of the loop. We branch back to the start of the loop if there |
| 542 | // is at least one active lane. |
| 543 | Builder.SetInsertPoint(VectorLoopIncBlock); |
| 544 | Value *NewVectorIndexPhi = |
| 545 | Builder.CreateAdd(LHS: VectorIndexPhi, RHS: VecLen, Name: "" , |
| 546 | /*HasNUW=*/true, /*HasNSW=*/true); |
| 547 | VectorIndexPhi->addIncoming(V: NewVectorIndexPhi, BB: VectorLoopIncBlock); |
| 548 | Value *NewPred = |
| 549 | Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask, |
| 550 | Types: {PredVTy, I64Type}, Args: {NewVectorIndexPhi, ExtEnd}); |
| 551 | LoopPred->addIncoming(V: NewPred, BB: VectorLoopIncBlock); |
| 552 | |
| 553 | Value *PredHasActiveLanes = |
| 554 | Builder.CreateExtractElement(Vec: NewPred, Idx: uint64_t(0)); |
| 555 | BranchInst *VectorLoopBranchBack = |
| 556 | BranchInst::Create(IfTrue: VectorLoopStartBlock, IfFalse: EndBlock, Cond: PredHasActiveLanes); |
| 557 | Builder.Insert(I: VectorLoopBranchBack); |
| 558 | |
| 559 | DTU.applyUpdates( |
| 560 | Updates: {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock}, |
| 561 | {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}}); |
| 562 | |
| 563 | // If we found a mismatch then we need to calculate which lane in the vector |
| 564 | // had a mismatch and add that on to the current loop index. |
| 565 | Builder.SetInsertPoint(VectorLoopMismatchBlock); |
| 566 | PHINode *FoundPred = Builder.CreatePHI(Ty: PredVTy, NumReservedValues: 1, Name: "mismatch_vec_found_pred" ); |
| 567 | FoundPred->addIncoming(V: VectorMatchCmp, BB: VectorLoopStartBlock); |
| 568 | PHINode *LastLoopPred = |
| 569 | Builder.CreatePHI(Ty: PredVTy, NumReservedValues: 1, Name: "mismatch_vec_last_loop_pred" ); |
| 570 | LastLoopPred->addIncoming(V: LoopPred, BB: VectorLoopStartBlock); |
| 571 | PHINode *VectorFoundIndex = |
| 572 | Builder.CreatePHI(Ty: I64Type, NumReservedValues: 1, Name: "mismatch_vec_found_index" ); |
| 573 | VectorFoundIndex->addIncoming(V: VectorIndexPhi, BB: VectorLoopStartBlock); |
| 574 | |
| 575 | Value *PredMatchCmp = Builder.CreateAnd(LHS: LastLoopPred, RHS: FoundPred); |
| 576 | Value *Ctz = Builder.CreateCountTrailingZeroElems(ResTy: ResType, Mask: PredMatchCmp); |
| 577 | Ctz = Builder.CreateZExt(V: Ctz, DestTy: I64Type); |
| 578 | Value *VectorLoopRes64 = Builder.CreateAdd(LHS: VectorFoundIndex, RHS: Ctz, Name: "" , |
| 579 | /*HasNUW=*/true, /*HasNSW=*/true); |
| 580 | return Builder.CreateTrunc(V: VectorLoopRes64, DestTy: ResType); |
| 581 | } |
| 582 | |
| 583 | Value *LoopIdiomVectorize::createPredicatedFindMismatch( |
| 584 | IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, |
| 585 | GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) { |
| 586 | Type *I64Type = Builder.getInt64Ty(); |
| 587 | Type *I32Type = Builder.getInt32Ty(); |
| 588 | Type *ResType = I32Type; |
| 589 | Type *LoadType = Builder.getInt8Ty(); |
| 590 | Value *PtrA = GEPA->getPointerOperand(); |
| 591 | Value *PtrB = GEPB->getPointerOperand(); |
| 592 | |
| 593 | auto *JumpToVectorLoop = BranchInst::Create(IfTrue: VectorLoopStartBlock); |
| 594 | Builder.Insert(I: JumpToVectorLoop); |
| 595 | |
| 596 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, VectorLoopPreheaderBlock, |
| 597 | VectorLoopStartBlock}}); |
| 598 | |
| 599 | // Set up the first Vector loop block by creating the PHIs, doing the vector |
| 600 | // loads and comparing the vectors. |
| 601 | Builder.SetInsertPoint(VectorLoopStartBlock); |
| 602 | auto *VectorIndexPhi = Builder.CreatePHI(Ty: I64Type, NumReservedValues: 2, Name: "mismatch_vector_index" ); |
| 603 | VectorIndexPhi->addIncoming(V: ExtStart, BB: VectorLoopPreheaderBlock); |
| 604 | |
| 605 | // Calculate AVL by subtracting the vector loop index from the trip count |
| 606 | Value *AVL = Builder.CreateSub(LHS: ExtEnd, RHS: VectorIndexPhi, Name: "avl" , /*HasNUW=*/true, |
| 607 | /*HasNSW=*/true); |
| 608 | |
| 609 | auto *VectorLoadType = ScalableVectorType::get(ElementType: LoadType, MinNumElts: ByteCompareVF); |
| 610 | auto *VF = ConstantInt::get(Ty: I32Type, V: ByteCompareVF); |
| 611 | |
| 612 | Value *VL = Builder.CreateIntrinsic(ID: Intrinsic::experimental_get_vector_length, |
| 613 | Types: {I64Type}, Args: {AVL, VF, Builder.getTrue()}); |
| 614 | Value *GepOffset = VectorIndexPhi; |
| 615 | |
| 616 | Value *VectorLhsGep = |
| 617 | Builder.CreateGEP(Ty: LoadType, Ptr: PtrA, IdxList: GepOffset, Name: "" , NW: GEPA->isInBounds()); |
| 618 | VectorType *TrueMaskTy = |
| 619 | VectorType::get(ElementType: Builder.getInt1Ty(), EC: VectorLoadType->getElementCount()); |
| 620 | Value *AllTrueMask = Constant::getAllOnesValue(Ty: TrueMaskTy); |
| 621 | Value *VectorLhsLoad = Builder.CreateIntrinsic( |
| 622 | ID: Intrinsic::vp_load, Types: {VectorLoadType, VectorLhsGep->getType()}, |
| 623 | Args: {VectorLhsGep, AllTrueMask, VL}, FMFSource: nullptr, Name: "lhs.load" ); |
| 624 | |
| 625 | Value *VectorRhsGep = |
| 626 | Builder.CreateGEP(Ty: LoadType, Ptr: PtrB, IdxList: GepOffset, Name: "" , NW: GEPB->isInBounds()); |
| 627 | Value *VectorRhsLoad = Builder.CreateIntrinsic( |
| 628 | ID: Intrinsic::vp_load, Types: {VectorLoadType, VectorLhsGep->getType()}, |
| 629 | Args: {VectorRhsGep, AllTrueMask, VL}, FMFSource: nullptr, Name: "rhs.load" ); |
| 630 | |
| 631 | Value *VectorMatchCmp = |
| 632 | Builder.CreateICmpNE(LHS: VectorLhsLoad, RHS: VectorRhsLoad, Name: "mismatch.cmp" ); |
| 633 | Value *CTZ = Builder.CreateIntrinsic( |
| 634 | ID: Intrinsic::vp_cttz_elts, Types: {ResType, VectorMatchCmp->getType()}, |
| 635 | Args: {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(V: false), AllTrueMask, |
| 636 | VL}); |
| 637 | Value *MismatchFound = Builder.CreateICmpNE(LHS: CTZ, RHS: VL); |
| 638 | auto *VectorEarlyExit = BranchInst::Create(IfTrue: VectorLoopMismatchBlock, |
| 639 | IfFalse: VectorLoopIncBlock, Cond: MismatchFound); |
| 640 | Builder.Insert(I: VectorEarlyExit); |
| 641 | |
| 642 | DTU.applyUpdates( |
| 643 | Updates: {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock}, |
| 644 | {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}}); |
| 645 | |
| 646 | // Increment the index counter and calculate the predicate for the next |
| 647 | // iteration of the loop. We branch back to the start of the loop if there |
| 648 | // is at least one active lane. |
| 649 | Builder.SetInsertPoint(VectorLoopIncBlock); |
| 650 | Value *VL64 = Builder.CreateZExt(V: VL, DestTy: I64Type); |
| 651 | Value *NewVectorIndexPhi = |
| 652 | Builder.CreateAdd(LHS: VectorIndexPhi, RHS: VL64, Name: "" , |
| 653 | /*HasNUW=*/true, /*HasNSW=*/true); |
| 654 | VectorIndexPhi->addIncoming(V: NewVectorIndexPhi, BB: VectorLoopIncBlock); |
| 655 | Value *ExitCond = Builder.CreateICmpNE(LHS: NewVectorIndexPhi, RHS: ExtEnd); |
| 656 | auto *VectorLoopBranchBack = |
| 657 | BranchInst::Create(IfTrue: VectorLoopStartBlock, IfFalse: EndBlock, Cond: ExitCond); |
| 658 | Builder.Insert(I: VectorLoopBranchBack); |
| 659 | |
| 660 | DTU.applyUpdates( |
| 661 | Updates: {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock}, |
| 662 | {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}}); |
| 663 | |
| 664 | // If we found a mismatch then we need to calculate which lane in the vector |
| 665 | // had a mismatch and add that on to the current loop index. |
| 666 | Builder.SetInsertPoint(VectorLoopMismatchBlock); |
| 667 | |
| 668 | // Add LCSSA phis for CTZ and VectorIndexPhi. |
| 669 | auto *CTZLCSSAPhi = Builder.CreatePHI(Ty: CTZ->getType(), NumReservedValues: 1, Name: "ctz" ); |
| 670 | CTZLCSSAPhi->addIncoming(V: CTZ, BB: VectorLoopStartBlock); |
| 671 | auto *VectorIndexLCSSAPhi = |
| 672 | Builder.CreatePHI(Ty: VectorIndexPhi->getType(), NumReservedValues: 1, Name: "mismatch_vector_index" ); |
| 673 | VectorIndexLCSSAPhi->addIncoming(V: VectorIndexPhi, BB: VectorLoopStartBlock); |
| 674 | |
| 675 | Value *CTZI64 = Builder.CreateZExt(V: CTZLCSSAPhi, DestTy: I64Type); |
| 676 | Value *VectorLoopRes64 = Builder.CreateAdd(LHS: VectorIndexLCSSAPhi, RHS: CTZI64, Name: "" , |
| 677 | /*HasNUW=*/true, /*HasNSW=*/true); |
| 678 | return Builder.CreateTrunc(V: VectorLoopRes64, DestTy: ResType); |
| 679 | } |
| 680 | |
| 681 | Value *LoopIdiomVectorize::expandFindMismatch( |
| 682 | IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, |
| 683 | GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) { |
| 684 | Value *PtrA = GEPA->getPointerOperand(); |
| 685 | Value *PtrB = GEPB->getPointerOperand(); |
| 686 | |
| 687 | // Get the arguments and types for the intrinsic. |
| 688 | BasicBlock * = CurLoop->getLoopPreheader(); |
| 689 | BranchInst *PHBranch = cast<BranchInst>(Val: Preheader->getTerminator()); |
| 690 | LLVMContext &Ctx = PHBranch->getContext(); |
| 691 | Type *LoadType = Type::getInt8Ty(C&: Ctx); |
| 692 | Type *ResType = Builder.getInt32Ty(); |
| 693 | |
| 694 | // Split block in the original loop preheader. |
| 695 | EndBlock = SplitBlock(Old: Preheader, SplitPt: PHBranch, DT, LI, MSSAU: nullptr, BBName: "mismatch_end" ); |
| 696 | |
| 697 | // Create the blocks that we're going to need: |
| 698 | // 1. A block for checking the zero-extended length exceeds 0 |
| 699 | // 2. A block to check that the start and end addresses of a given array |
| 700 | // lie on the same page. |
| 701 | // 3. The vector loop preheader. |
| 702 | // 4. The first vector loop block. |
| 703 | // 5. The vector loop increment block. |
| 704 | // 6. A block we can jump to from the vector loop when a mismatch is found. |
| 705 | // 7. The first block of the scalar loop itself, containing PHIs , loads |
| 706 | // and cmp. |
| 707 | // 8. A scalar loop increment block to increment the PHIs and go back |
| 708 | // around the loop. |
| 709 | |
| 710 | BasicBlock *MinItCheckBlock = BasicBlock::Create( |
| 711 | Context&: Ctx, Name: "mismatch_min_it_check" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
| 712 | |
| 713 | // Update the terminator added by SplitBlock to branch to the first block |
| 714 | Preheader->getTerminator()->setSuccessor(Idx: 0, BB: MinItCheckBlock); |
| 715 | |
| 716 | BasicBlock *MemCheckBlock = BasicBlock::Create( |
| 717 | Context&: Ctx, Name: "mismatch_mem_check" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
| 718 | |
| 719 | VectorLoopPreheaderBlock = BasicBlock::Create( |
| 720 | Context&: Ctx, Name: "mismatch_vec_loop_preheader" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
| 721 | |
| 722 | VectorLoopStartBlock = BasicBlock::Create(Context&: Ctx, Name: "mismatch_vec_loop" , |
| 723 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
| 724 | |
| 725 | VectorLoopIncBlock = BasicBlock::Create(Context&: Ctx, Name: "mismatch_vec_loop_inc" , |
| 726 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
| 727 | |
| 728 | VectorLoopMismatchBlock = BasicBlock::Create(Context&: Ctx, Name: "mismatch_vec_loop_found" , |
| 729 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
| 730 | |
| 731 | BasicBlock * = BasicBlock::Create( |
| 732 | Context&: Ctx, Name: "mismatch_loop_pre" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
| 733 | |
| 734 | BasicBlock *LoopStartBlock = |
| 735 | BasicBlock::Create(Context&: Ctx, Name: "mismatch_loop" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
| 736 | |
| 737 | BasicBlock *LoopIncBlock = BasicBlock::Create( |
| 738 | Context&: Ctx, Name: "mismatch_loop_inc" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
| 739 | |
| 740 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, Preheader, MinItCheckBlock}, |
| 741 | {DominatorTree::Delete, Preheader, EndBlock}}); |
| 742 | |
| 743 | // Update LoopInfo with the new vector & scalar loops. |
| 744 | auto VectorLoop = LI->AllocateLoop(); |
| 745 | auto ScalarLoop = LI->AllocateLoop(); |
| 746 | |
| 747 | if (CurLoop->getParentLoop()) { |
| 748 | CurLoop->getParentLoop()->addBasicBlockToLoop(NewBB: MinItCheckBlock, LI&: *LI); |
| 749 | CurLoop->getParentLoop()->addBasicBlockToLoop(NewBB: MemCheckBlock, LI&: *LI); |
| 750 | CurLoop->getParentLoop()->addBasicBlockToLoop(NewBB: VectorLoopPreheaderBlock, |
| 751 | LI&: *LI); |
| 752 | CurLoop->getParentLoop()->addChildLoop(NewChild: VectorLoop); |
| 753 | CurLoop->getParentLoop()->addBasicBlockToLoop(NewBB: VectorLoopMismatchBlock, LI&: *LI); |
| 754 | CurLoop->getParentLoop()->addBasicBlockToLoop(NewBB: LoopPreHeaderBlock, LI&: *LI); |
| 755 | CurLoop->getParentLoop()->addChildLoop(NewChild: ScalarLoop); |
| 756 | } else { |
| 757 | LI->addTopLevelLoop(New: VectorLoop); |
| 758 | LI->addTopLevelLoop(New: ScalarLoop); |
| 759 | } |
| 760 | |
| 761 | // Add the new basic blocks to their associated loops. |
| 762 | VectorLoop->addBasicBlockToLoop(NewBB: VectorLoopStartBlock, LI&: *LI); |
| 763 | VectorLoop->addBasicBlockToLoop(NewBB: VectorLoopIncBlock, LI&: *LI); |
| 764 | |
| 765 | ScalarLoop->addBasicBlockToLoop(NewBB: LoopStartBlock, LI&: *LI); |
| 766 | ScalarLoop->addBasicBlockToLoop(NewBB: LoopIncBlock, LI&: *LI); |
| 767 | |
| 768 | // Set up some types and constants that we intend to reuse. |
| 769 | Type *I64Type = Builder.getInt64Ty(); |
| 770 | |
| 771 | // Check the zero-extended iteration count > 0 |
| 772 | Builder.SetInsertPoint(MinItCheckBlock); |
| 773 | Value *ExtStart = Builder.CreateZExt(V: Start, DestTy: I64Type); |
| 774 | Value *ExtEnd = Builder.CreateZExt(V: MaxLen, DestTy: I64Type); |
| 775 | // This check doesn't really cost us very much. |
| 776 | |
| 777 | Value *LimitCheck = Builder.CreateICmpULE(LHS: Start, RHS: MaxLen); |
| 778 | BranchInst *MinItCheckBr = |
| 779 | BranchInst::Create(IfTrue: MemCheckBlock, IfFalse: LoopPreHeaderBlock, Cond: LimitCheck); |
| 780 | MinItCheckBr->setMetadata( |
| 781 | KindID: LLVMContext::MD_prof, |
| 782 | Node: MDBuilder(MinItCheckBr->getContext()).createBranchWeights(TrueWeight: 99, FalseWeight: 1)); |
| 783 | Builder.Insert(I: MinItCheckBr); |
| 784 | |
| 785 | DTU.applyUpdates( |
| 786 | Updates: {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock}, |
| 787 | {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}}); |
| 788 | |
| 789 | // For each of the arrays, check the start/end addresses are on the same |
| 790 | // page. |
| 791 | Builder.SetInsertPoint(MemCheckBlock); |
| 792 | |
| 793 | // The early exit in the original loop means that when performing vector |
| 794 | // loads we are potentially reading ahead of the early exit. So we could |
| 795 | // fault if crossing a page boundary. Therefore, we create runtime memory |
| 796 | // checks based on the minimum page size as follows: |
| 797 | // 1. Calculate the addresses of the first memory accesses in the loop, |
| 798 | // i.e. LhsStart and RhsStart. |
| 799 | // 2. Get the last accessed addresses in the loop, i.e. LhsEnd and RhsEnd. |
| 800 | // 3. Determine which pages correspond to all the memory accesses, i.e |
| 801 | // LhsStartPage, LhsEndPage, RhsStartPage, RhsEndPage. |
| 802 | // 4. If LhsStartPage == LhsEndPage and RhsStartPage == RhsEndPage, then |
| 803 | // we know we won't cross any page boundaries in the loop so we can |
| 804 | // enter the vector loop! Otherwise we fall back on the scalar loop. |
| 805 | Value *LhsStartGEP = Builder.CreateGEP(Ty: LoadType, Ptr: PtrA, IdxList: ExtStart); |
| 806 | Value *RhsStartGEP = Builder.CreateGEP(Ty: LoadType, Ptr: PtrB, IdxList: ExtStart); |
| 807 | Value *RhsStart = Builder.CreatePtrToInt(V: RhsStartGEP, DestTy: I64Type); |
| 808 | Value *LhsStart = Builder.CreatePtrToInt(V: LhsStartGEP, DestTy: I64Type); |
| 809 | Value *LhsEndGEP = Builder.CreateGEP(Ty: LoadType, Ptr: PtrA, IdxList: ExtEnd); |
| 810 | Value *RhsEndGEP = Builder.CreateGEP(Ty: LoadType, Ptr: PtrB, IdxList: ExtEnd); |
| 811 | Value *LhsEnd = Builder.CreatePtrToInt(V: LhsEndGEP, DestTy: I64Type); |
| 812 | Value *RhsEnd = Builder.CreatePtrToInt(V: RhsEndGEP, DestTy: I64Type); |
| 813 | |
| 814 | const uint64_t MinPageSize = TTI->getMinPageSize().value(); |
| 815 | const uint64_t AddrShiftAmt = llvm::Log2_64(Value: MinPageSize); |
| 816 | Value *LhsStartPage = Builder.CreateLShr(LHS: LhsStart, RHS: AddrShiftAmt); |
| 817 | Value *LhsEndPage = Builder.CreateLShr(LHS: LhsEnd, RHS: AddrShiftAmt); |
| 818 | Value *RhsStartPage = Builder.CreateLShr(LHS: RhsStart, RHS: AddrShiftAmt); |
| 819 | Value *RhsEndPage = Builder.CreateLShr(LHS: RhsEnd, RHS: AddrShiftAmt); |
| 820 | Value *LhsPageCmp = Builder.CreateICmpNE(LHS: LhsStartPage, RHS: LhsEndPage); |
| 821 | Value *RhsPageCmp = Builder.CreateICmpNE(LHS: RhsStartPage, RHS: RhsEndPage); |
| 822 | |
| 823 | Value *CombinedPageCmp = Builder.CreateOr(LHS: LhsPageCmp, RHS: RhsPageCmp); |
| 824 | BranchInst *CombinedPageCmpCmpBr = BranchInst::Create( |
| 825 | IfTrue: LoopPreHeaderBlock, IfFalse: VectorLoopPreheaderBlock, Cond: CombinedPageCmp); |
| 826 | CombinedPageCmpCmpBr->setMetadata( |
| 827 | KindID: LLVMContext::MD_prof, Node: MDBuilder(CombinedPageCmpCmpBr->getContext()) |
| 828 | .createBranchWeights(TrueWeight: 10, FalseWeight: 90)); |
| 829 | Builder.Insert(I: CombinedPageCmpCmpBr); |
| 830 | |
| 831 | DTU.applyUpdates( |
| 832 | Updates: {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock}, |
| 833 | {DominatorTree::Insert, MemCheckBlock, VectorLoopPreheaderBlock}}); |
| 834 | |
| 835 | // Set up the vector loop preheader, i.e. calculate initial loop predicate, |
| 836 | // zero-extend MaxLen to 64-bits, determine the number of vector elements |
| 837 | // processed in each iteration, etc. |
| 838 | Builder.SetInsertPoint(VectorLoopPreheaderBlock); |
| 839 | |
| 840 | // At this point we know two things must be true: |
| 841 | // 1. Start <= End |
| 842 | // 2. ExtMaxLen <= MinPageSize due to the page checks. |
| 843 | // Therefore, we know that we can use a 64-bit induction variable that |
| 844 | // starts from 0 -> ExtMaxLen and it will not overflow. |
| 845 | Value *VectorLoopRes = nullptr; |
| 846 | switch (VectorizeStyle) { |
| 847 | case LoopIdiomVectorizeStyle::Masked: |
| 848 | VectorLoopRes = |
| 849 | createMaskedFindMismatch(Builder, DTU, GEPA, GEPB, ExtStart, ExtEnd); |
| 850 | break; |
| 851 | case LoopIdiomVectorizeStyle::Predicated: |
| 852 | VectorLoopRes = createPredicatedFindMismatch(Builder, DTU, GEPA, GEPB, |
| 853 | ExtStart, ExtEnd); |
| 854 | break; |
| 855 | } |
| 856 | |
| 857 | Builder.Insert(I: BranchInst::Create(IfTrue: EndBlock)); |
| 858 | |
| 859 | DTU.applyUpdates( |
| 860 | Updates: {{DominatorTree::Insert, VectorLoopMismatchBlock, EndBlock}}); |
| 861 | |
| 862 | // Generate code for scalar loop. |
| 863 | Builder.SetInsertPoint(LoopPreHeaderBlock); |
| 864 | Builder.Insert(I: BranchInst::Create(IfTrue: LoopStartBlock)); |
| 865 | |
| 866 | DTU.applyUpdates( |
| 867 | Updates: {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}}); |
| 868 | |
| 869 | Builder.SetInsertPoint(LoopStartBlock); |
| 870 | PHINode *IndexPhi = Builder.CreatePHI(Ty: ResType, NumReservedValues: 2, Name: "mismatch_index" ); |
| 871 | IndexPhi->addIncoming(V: Start, BB: LoopPreHeaderBlock); |
| 872 | |
| 873 | // Otherwise compare the values |
| 874 | // Load bytes from each array and compare them. |
| 875 | Value *GepOffset = Builder.CreateZExt(V: IndexPhi, DestTy: I64Type); |
| 876 | |
| 877 | Value *LhsGep = |
| 878 | Builder.CreateGEP(Ty: LoadType, Ptr: PtrA, IdxList: GepOffset, Name: "" , NW: GEPA->isInBounds()); |
| 879 | Value *LhsLoad = Builder.CreateLoad(Ty: LoadType, Ptr: LhsGep); |
| 880 | |
| 881 | Value *RhsGep = |
| 882 | Builder.CreateGEP(Ty: LoadType, Ptr: PtrB, IdxList: GepOffset, Name: "" , NW: GEPB->isInBounds()); |
| 883 | Value *RhsLoad = Builder.CreateLoad(Ty: LoadType, Ptr: RhsGep); |
| 884 | |
| 885 | Value *MatchCmp = Builder.CreateICmpEQ(LHS: LhsLoad, RHS: RhsLoad); |
| 886 | // If we have a mismatch then exit the loop ... |
| 887 | BranchInst *MatchCmpBr = BranchInst::Create(IfTrue: LoopIncBlock, IfFalse: EndBlock, Cond: MatchCmp); |
| 888 | Builder.Insert(I: MatchCmpBr); |
| 889 | |
| 890 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, LoopStartBlock, LoopIncBlock}, |
| 891 | {DominatorTree::Insert, LoopStartBlock, EndBlock}}); |
| 892 | |
| 893 | // Have we reached the maximum permitted length for the loop? |
| 894 | Builder.SetInsertPoint(LoopIncBlock); |
| 895 | Value *PhiInc = Builder.CreateAdd(LHS: IndexPhi, RHS: ConstantInt::get(Ty: ResType, V: 1), Name: "" , |
| 896 | /*HasNUW=*/Index->hasNoUnsignedWrap(), |
| 897 | /*HasNSW=*/Index->hasNoSignedWrap()); |
| 898 | IndexPhi->addIncoming(V: PhiInc, BB: LoopIncBlock); |
| 899 | Value *IVCmp = Builder.CreateICmpEQ(LHS: PhiInc, RHS: MaxLen); |
| 900 | BranchInst *IVCmpBr = BranchInst::Create(IfTrue: EndBlock, IfFalse: LoopStartBlock, Cond: IVCmp); |
| 901 | Builder.Insert(I: IVCmpBr); |
| 902 | |
| 903 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, LoopIncBlock, EndBlock}, |
| 904 | {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}}); |
| 905 | |
| 906 | // In the end block we need to insert a PHI node to deal with three cases: |
| 907 | // 1. We didn't find a mismatch in the scalar loop, so we return MaxLen. |
| 908 | // 2. We exitted the scalar loop early due to a mismatch and need to return |
| 909 | // the index that we found. |
| 910 | // 3. We didn't find a mismatch in the vector loop, so we return MaxLen. |
| 911 | // 4. We exitted the vector loop early due to a mismatch and need to return |
| 912 | // the index that we found. |
| 913 | Builder.SetInsertPoint(TheBB: EndBlock, IP: EndBlock->getFirstInsertionPt()); |
| 914 | PHINode *ResPhi = Builder.CreatePHI(Ty: ResType, NumReservedValues: 4, Name: "mismatch_result" ); |
| 915 | ResPhi->addIncoming(V: MaxLen, BB: LoopIncBlock); |
| 916 | ResPhi->addIncoming(V: IndexPhi, BB: LoopStartBlock); |
| 917 | ResPhi->addIncoming(V: MaxLen, BB: VectorLoopIncBlock); |
| 918 | ResPhi->addIncoming(V: VectorLoopRes, BB: VectorLoopMismatchBlock); |
| 919 | |
| 920 | Value *FinalRes = Builder.CreateTrunc(V: ResPhi, DestTy: ResType); |
| 921 | |
| 922 | if (VerifyLoops) { |
| 923 | ScalarLoop->verifyLoop(); |
| 924 | VectorLoop->verifyLoop(); |
| 925 | if (!VectorLoop->isRecursivelyLCSSAForm(DT: *DT, LI: *LI)) |
| 926 | report_fatal_error(reason: "Loops must remain in LCSSA form!" ); |
| 927 | if (!ScalarLoop->isRecursivelyLCSSAForm(DT: *DT, LI: *LI)) |
| 928 | report_fatal_error(reason: "Loops must remain in LCSSA form!" ); |
| 929 | } |
| 930 | |
| 931 | return FinalRes; |
| 932 | } |
| 933 | |
| 934 | void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA, |
| 935 | GetElementPtrInst *GEPB, |
| 936 | PHINode *IndPhi, Value *MaxLen, |
| 937 | Instruction *Index, Value *Start, |
| 938 | bool IncIdx, BasicBlock *FoundBB, |
| 939 | BasicBlock *EndBB) { |
| 940 | |
| 941 | // Insert the byte compare code at the end of the preheader block |
| 942 | BasicBlock * = CurLoop->getLoopPreheader(); |
| 943 | BasicBlock * = CurLoop->getHeader(); |
| 944 | BranchInst *PHBranch = cast<BranchInst>(Val: Preheader->getTerminator()); |
| 945 | IRBuilder<> Builder(PHBranch); |
| 946 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); |
| 947 | Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); |
| 948 | |
| 949 | // Increment the pointer if this was done before the loads in the loop. |
| 950 | if (IncIdx) |
| 951 | Start = Builder.CreateAdd(LHS: Start, RHS: ConstantInt::get(Ty: Start->getType(), V: 1)); |
| 952 | |
| 953 | Value *ByteCmpRes = |
| 954 | expandFindMismatch(Builder, DTU, GEPA, GEPB, Index, Start, MaxLen); |
| 955 | |
| 956 | // Replaces uses of index & induction Phi with intrinsic (we already |
| 957 | // checked that the the first instruction of Header is the Phi above). |
| 958 | assert(IndPhi->hasOneUse() && "Index phi node has more than one use!" ); |
| 959 | Index->replaceAllUsesWith(V: ByteCmpRes); |
| 960 | |
| 961 | assert(PHBranch->isUnconditional() && |
| 962 | "Expected preheader to terminate with an unconditional branch." ); |
| 963 | |
| 964 | // If no mismatch was found, we can jump to the end block. Create a |
| 965 | // new basic block for the compare instruction. |
| 966 | auto *CmpBB = BasicBlock::Create(Context&: Preheader->getContext(), Name: "byte.compare" , |
| 967 | Parent: Preheader->getParent()); |
| 968 | CmpBB->moveBefore(MovePos: EndBB); |
| 969 | |
| 970 | // Replace the branch in the preheader with an always-true conditional branch. |
| 971 | // This ensures there is still a reference to the original loop. |
| 972 | Builder.CreateCondBr(Cond: Builder.getTrue(), True: CmpBB, False: Header); |
| 973 | PHBranch->eraseFromParent(); |
| 974 | |
| 975 | BasicBlock *MismatchEnd = cast<Instruction>(Val: ByteCmpRes)->getParent(); |
| 976 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, MismatchEnd, CmpBB}}); |
| 977 | |
| 978 | // Create the branch to either the end or found block depending on the value |
| 979 | // returned by the intrinsic. |
| 980 | Builder.SetInsertPoint(CmpBB); |
| 981 | if (FoundBB != EndBB) { |
| 982 | Value *FoundCmp = Builder.CreateICmpEQ(LHS: ByteCmpRes, RHS: MaxLen); |
| 983 | Builder.CreateCondBr(Cond: FoundCmp, True: EndBB, False: FoundBB); |
| 984 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, CmpBB, FoundBB}, |
| 985 | {DominatorTree::Insert, CmpBB, EndBB}}); |
| 986 | |
| 987 | } else { |
| 988 | Builder.CreateBr(Dest: FoundBB); |
| 989 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, CmpBB, FoundBB}}); |
| 990 | } |
| 991 | |
| 992 | // Ensure all Phis in the successors of CmpBB have an incoming value from it. |
| 993 | fixSuccessorPhis(L: CurLoop, ScalarRes: ByteCmpRes, VectorRes: ByteCmpRes, SuccBB: EndBB, IncBB: CmpBB); |
| 994 | if (EndBB != FoundBB) |
| 995 | fixSuccessorPhis(L: CurLoop, ScalarRes: ByteCmpRes, VectorRes: ByteCmpRes, SuccBB: FoundBB, IncBB: CmpBB); |
| 996 | |
| 997 | // The new CmpBB block isn't part of the loop, but will need to be added to |
| 998 | // the outer loop if there is one. |
| 999 | if (!CurLoop->isOutermost()) |
| 1000 | CurLoop->getParentLoop()->addBasicBlockToLoop(NewBB: CmpBB, LI&: *LI); |
| 1001 | |
| 1002 | if (VerifyLoops && CurLoop->getParentLoop()) { |
| 1003 | CurLoop->getParentLoop()->verifyLoop(); |
| 1004 | if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(DT: *DT, LI: *LI)) |
| 1005 | report_fatal_error(reason: "Loops must remain in LCSSA form!" ); |
| 1006 | } |
| 1007 | } |
| 1008 | |
| 1009 | bool LoopIdiomVectorize::recognizeFindFirstByte() { |
| 1010 | // Currently the transformation only works on scalable vector types, although |
| 1011 | // there is no fundamental reason why it cannot be made to work for fixed |
| 1012 | // vectors. We also need to know the target's minimum page size in order to |
| 1013 | // generate runtime memory checks to ensure the vector version won't fault. |
| 1014 | if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() || |
| 1015 | DisableFindFirstByte) |
| 1016 | return false; |
| 1017 | |
| 1018 | // Define some constants we need throughout. |
| 1019 | BasicBlock * = CurLoop->getHeader(); |
| 1020 | LLVMContext &Ctx = Header->getContext(); |
| 1021 | |
| 1022 | // We are expecting the four blocks defined below: Header, MatchBB, InnerBB, |
| 1023 | // and OuterBB. For now, we will bail our for almost anything else. The Four |
| 1024 | // blocks contain one nested loop. |
| 1025 | if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 4 || |
| 1026 | CurLoop->getSubLoops().size() != 1) |
| 1027 | return false; |
| 1028 | |
| 1029 | auto *InnerLoop = CurLoop->getSubLoops().front(); |
| 1030 | Function &F = *InnerLoop->getHeader()->getParent(); |
| 1031 | |
| 1032 | // Bail if vectorization is disabled on inner loop. |
| 1033 | LoopVectorizeHints Hints(InnerLoop, /*InterleaveOnlyWhenForced=*/true, ORE); |
| 1034 | if (!Hints.allowVectorization(F: &F, L: InnerLoop, |
| 1035 | /*VectorizeOnlyWhenForced=*/false)) { |
| 1036 | LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on inner loop " |
| 1037 | << InnerLoop->getName() |
| 1038 | << " due to vectorization hints\n" ); |
| 1039 | return false; |
| 1040 | } |
| 1041 | |
| 1042 | PHINode *IndPhi = dyn_cast<PHINode>(Val: &Header->front()); |
| 1043 | if (!IndPhi || IndPhi->getNumIncomingValues() != 2) |
| 1044 | return false; |
| 1045 | |
| 1046 | // Check instruction counts. |
| 1047 | auto LoopBlocks = CurLoop->getBlocks(); |
| 1048 | if (LoopBlocks[0]->sizeWithoutDebug() > 3 || |
| 1049 | LoopBlocks[1]->sizeWithoutDebug() > 4 || |
| 1050 | LoopBlocks[2]->sizeWithoutDebug() > 3 || |
| 1051 | LoopBlocks[3]->sizeWithoutDebug() > 3) |
| 1052 | return false; |
| 1053 | |
| 1054 | // Check that no instruction other than IndPhi has outside uses. |
| 1055 | for (BasicBlock *BB : LoopBlocks) |
| 1056 | for (Instruction &I : *BB) |
| 1057 | if (&I != IndPhi) |
| 1058 | for (User *U : I.users()) |
| 1059 | if (!CurLoop->contains(Inst: cast<Instruction>(Val: U))) |
| 1060 | return false; |
| 1061 | |
| 1062 | // Match the branch instruction in the header. We are expecting an |
| 1063 | // unconditional branch to the inner loop. |
| 1064 | // |
| 1065 | // Header: |
| 1066 | // %14 = phi ptr [ %24, %OuterBB ], [ %3, %Header.preheader ] |
| 1067 | // %15 = load i8, ptr %14, align 1 |
| 1068 | // br label %MatchBB |
| 1069 | BasicBlock *MatchBB; |
| 1070 | if (!match(V: Header->getTerminator(), P: m_UnconditionalBr(Succ&: MatchBB)) || |
| 1071 | !InnerLoop->contains(BB: MatchBB)) |
| 1072 | return false; |
| 1073 | |
| 1074 | // MatchBB should be the entrypoint into the inner loop containing the |
| 1075 | // comparison between a search element and a needle. |
| 1076 | // |
| 1077 | // MatchBB: |
| 1078 | // %20 = phi ptr [ %7, %Header ], [ %17, %InnerBB ] |
| 1079 | // %21 = load i8, ptr %20, align 1 |
| 1080 | // %22 = icmp eq i8 %15, %21 |
| 1081 | // br i1 %22, label %ExitSucc, label %InnerBB |
| 1082 | BasicBlock *ExitSucc, *InnerBB; |
| 1083 | Value *LoadSearch, *LoadNeedle; |
| 1084 | CmpPredicate MatchPred; |
| 1085 | if (!match(V: MatchBB->getTerminator(), |
| 1086 | P: m_Br(C: m_ICmp(Pred&: MatchPred, L: m_Value(V&: LoadSearch), R: m_Value(V&: LoadNeedle)), |
| 1087 | T: m_BasicBlock(V&: ExitSucc), F: m_BasicBlock(V&: InnerBB))) || |
| 1088 | MatchPred != ICmpInst::ICMP_EQ || !InnerLoop->contains(BB: InnerBB)) |
| 1089 | return false; |
| 1090 | |
| 1091 | // We expect outside uses of `IndPhi' in ExitSucc (and only there). |
| 1092 | for (User *U : IndPhi->users()) |
| 1093 | if (!CurLoop->contains(Inst: cast<Instruction>(Val: U))) { |
| 1094 | auto *PN = dyn_cast<PHINode>(Val: U); |
| 1095 | if (!PN || PN->getParent() != ExitSucc) |
| 1096 | return false; |
| 1097 | } |
| 1098 | |
| 1099 | // Match the loads and check they are simple. |
| 1100 | Value *Search, *Needle; |
| 1101 | if (!match(V: LoadSearch, P: m_Load(Op: m_Value(V&: Search))) || |
| 1102 | !match(V: LoadNeedle, P: m_Load(Op: m_Value(V&: Needle))) || |
| 1103 | !cast<LoadInst>(Val: LoadSearch)->isSimple() || |
| 1104 | !cast<LoadInst>(Val: LoadNeedle)->isSimple()) |
| 1105 | return false; |
| 1106 | |
| 1107 | // Check we are loading valid characters. |
| 1108 | Type *CharTy = LoadSearch->getType(); |
| 1109 | if (!CharTy->isIntegerTy() || LoadNeedle->getType() != CharTy) |
| 1110 | return false; |
| 1111 | |
| 1112 | // Pick the vectorisation factor based on CharTy, work out the cost of the |
| 1113 | // match intrinsic and decide if we should use it. |
| 1114 | // Note: For the time being we assume 128-bit vectors. |
| 1115 | unsigned VF = 128 / CharTy->getIntegerBitWidth(); |
| 1116 | SmallVector<Type *> Args = { |
| 1117 | ScalableVectorType::get(ElementType: CharTy, MinNumElts: VF), FixedVectorType::get(ElementType: CharTy, NumElts: VF), |
| 1118 | ScalableVectorType::get(ElementType: Type::getInt1Ty(C&: Ctx), MinNumElts: VF)}; |
| 1119 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_vector_match, Args[2], |
| 1120 | Args); |
| 1121 | if (TTI->getIntrinsicInstrCost(ICA: Attrs, CostKind: TTI::TCK_SizeAndLatency) > 4) |
| 1122 | return false; |
| 1123 | |
| 1124 | // The loads come from two PHIs, each with two incoming values. |
| 1125 | PHINode *PSearch = dyn_cast<PHINode>(Val: Search); |
| 1126 | PHINode *PNeedle = dyn_cast<PHINode>(Val: Needle); |
| 1127 | if (!PSearch || PSearch->getNumIncomingValues() != 2 || !PNeedle || |
| 1128 | PNeedle->getNumIncomingValues() != 2) |
| 1129 | return false; |
| 1130 | |
| 1131 | // One PHI comes from the outer loop (PSearch), the other one from the inner |
| 1132 | // loop (PNeedle). PSearch effectively corresponds to IndPhi. |
| 1133 | if (InnerLoop->contains(Inst: PSearch)) |
| 1134 | std::swap(a&: PSearch, b&: PNeedle); |
| 1135 | if (PSearch != &Header->front() || PNeedle != &MatchBB->front()) |
| 1136 | return false; |
| 1137 | |
| 1138 | // The incoming values of both PHI nodes should be a gep of 1. |
| 1139 | Value *SearchStart = PSearch->getIncomingValue(i: 0); |
| 1140 | Value *SearchIndex = PSearch->getIncomingValue(i: 1); |
| 1141 | if (CurLoop->contains(BB: PSearch->getIncomingBlock(i: 0))) |
| 1142 | std::swap(a&: SearchStart, b&: SearchIndex); |
| 1143 | |
| 1144 | Value *NeedleStart = PNeedle->getIncomingValue(i: 0); |
| 1145 | Value *NeedleIndex = PNeedle->getIncomingValue(i: 1); |
| 1146 | if (InnerLoop->contains(BB: PNeedle->getIncomingBlock(i: 0))) |
| 1147 | std::swap(a&: NeedleStart, b&: NeedleIndex); |
| 1148 | |
| 1149 | // Match the GEPs. |
| 1150 | if (!match(V: SearchIndex, P: m_GEP(Ops: m_Specific(V: PSearch), Ops: m_One())) || |
| 1151 | !match(V: NeedleIndex, P: m_GEP(Ops: m_Specific(V: PNeedle), Ops: m_One()))) |
| 1152 | return false; |
| 1153 | |
| 1154 | // Check the GEPs result type matches `CharTy'. |
| 1155 | GetElementPtrInst *GEPSearch = cast<GetElementPtrInst>(Val: SearchIndex); |
| 1156 | GetElementPtrInst *GEPNeedle = cast<GetElementPtrInst>(Val: NeedleIndex); |
| 1157 | if (GEPSearch->getResultElementType() != CharTy || |
| 1158 | GEPNeedle->getResultElementType() != CharTy) |
| 1159 | return false; |
| 1160 | |
| 1161 | // InnerBB should increment the address of the needle pointer. |
| 1162 | // |
| 1163 | // InnerBB: |
| 1164 | // %17 = getelementptr inbounds i8, ptr %20, i64 1 |
| 1165 | // %18 = icmp eq ptr %17, %10 |
| 1166 | // br i1 %18, label %OuterBB, label %MatchBB |
| 1167 | BasicBlock *OuterBB; |
| 1168 | Value *NeedleEnd; |
| 1169 | if (!match(V: InnerBB->getTerminator(), |
| 1170 | P: m_Br(C: m_SpecificICmp(MatchPred: ICmpInst::ICMP_EQ, L: m_Specific(V: GEPNeedle), |
| 1171 | R: m_Value(V&: NeedleEnd)), |
| 1172 | T: m_BasicBlock(V&: OuterBB), F: m_Specific(V: MatchBB))) || |
| 1173 | !CurLoop->contains(BB: OuterBB)) |
| 1174 | return false; |
| 1175 | |
| 1176 | // OuterBB should increment the address of the search element pointer. |
| 1177 | // |
| 1178 | // OuterBB: |
| 1179 | // %24 = getelementptr inbounds i8, ptr %14, i64 1 |
| 1180 | // %25 = icmp eq ptr %24, %6 |
| 1181 | // br i1 %25, label %ExitFail, label %Header |
| 1182 | BasicBlock *ExitFail; |
| 1183 | Value *SearchEnd; |
| 1184 | if (!match(V: OuterBB->getTerminator(), |
| 1185 | P: m_Br(C: m_SpecificICmp(MatchPred: ICmpInst::ICMP_EQ, L: m_Specific(V: GEPSearch), |
| 1186 | R: m_Value(V&: SearchEnd)), |
| 1187 | T: m_BasicBlock(V&: ExitFail), F: m_Specific(V: Header)))) |
| 1188 | return false; |
| 1189 | |
| 1190 | if (!CurLoop->isLoopInvariant(V: SearchStart) || |
| 1191 | !CurLoop->isLoopInvariant(V: SearchEnd) || |
| 1192 | !CurLoop->isLoopInvariant(V: NeedleStart) || |
| 1193 | !CurLoop->isLoopInvariant(V: NeedleEnd)) |
| 1194 | return false; |
| 1195 | |
| 1196 | LLVM_DEBUG(dbgs() << "Found idiom in loop: \n" << *CurLoop << "\n\n" ); |
| 1197 | |
| 1198 | transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, SearchStart, |
| 1199 | SearchEnd, NeedleStart, NeedleEnd); |
| 1200 | return true; |
| 1201 | } |
| 1202 | |
| 1203 | Value *LoopIdiomVectorize::expandFindFirstByte( |
| 1204 | IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy, |
| 1205 | Value *IndPhi, BasicBlock *ExitSucc, BasicBlock *ExitFail, |
| 1206 | Value *SearchStart, Value *SearchEnd, Value *NeedleStart, |
| 1207 | Value *NeedleEnd) { |
| 1208 | // Set up some types and constants that we intend to reuse. |
| 1209 | auto *PtrTy = Builder.getPtrTy(); |
| 1210 | auto *I64Ty = Builder.getInt64Ty(); |
| 1211 | auto *PredVTy = ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: VF); |
| 1212 | auto *CharVTy = ScalableVectorType::get(ElementType: CharTy, MinNumElts: VF); |
| 1213 | auto *ConstVF = ConstantInt::get(Ty: I64Ty, V: VF); |
| 1214 | |
| 1215 | // Other common arguments. |
| 1216 | BasicBlock * = CurLoop->getLoopPreheader(); |
| 1217 | LLVMContext &Ctx = Preheader->getContext(); |
| 1218 | Value *Passthru = ConstantInt::getNullValue(Ty: CharVTy); |
| 1219 | |
| 1220 | // Split block in the original loop preheader. |
| 1221 | // SPH is the new preheader to the old scalar loop. |
| 1222 | BasicBlock *SPH = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI, |
| 1223 | MSSAU: nullptr, BBName: "scalar_preheader" ); |
| 1224 | |
| 1225 | // Create the blocks that we're going to use. |
| 1226 | // |
| 1227 | // We will have the following loops: |
| 1228 | // (O) Outer loop where we iterate over the elements of the search array. |
| 1229 | // (I) Inner loop where we iterate over the elements of the needle array. |
| 1230 | // |
| 1231 | // Overall, the blocks do the following: |
| 1232 | // (0) Check if the arrays can't cross page boundaries. If so go to (1), |
| 1233 | // otherwise fall back to the original scalar loop. |
| 1234 | // (1) Load the search array. Go to (2). |
| 1235 | // (2) (a) Load the needle array. |
| 1236 | // (b) Splat the first element to the inactive lanes. |
| 1237 | // (c) Accumulate any matches found. If we haven't reached the end of the |
| 1238 | // needle array loop back to (2), otherwise go to (3). |
| 1239 | // (3) Test if we found any match. If so go to (4), otherwise go to (5). |
| 1240 | // (4) Compute the index of the first match and exit. |
| 1241 | // (5) Check if we've reached the end of the search array. If not loop back to |
| 1242 | // (1), otherwise exit. |
| 1243 | // Blocks (0,4) are not part of any loop. Blocks (1,3,5) and (2) belong to the |
| 1244 | // outer and inner loops, respectively. |
| 1245 | BasicBlock *BB0 = BasicBlock::Create(Context&: Ctx, Name: "mem_check" , Parent: SPH->getParent(), InsertBefore: SPH); |
| 1246 | BasicBlock *BB1 = |
| 1247 | BasicBlock::Create(Context&: Ctx, Name: "find_first_vec_header" , Parent: SPH->getParent(), InsertBefore: SPH); |
| 1248 | BasicBlock *BB2 = |
| 1249 | BasicBlock::Create(Context&: Ctx, Name: "needle_check_vec" , Parent: SPH->getParent(), InsertBefore: SPH); |
| 1250 | BasicBlock *BB3 = |
| 1251 | BasicBlock::Create(Context&: Ctx, Name: "match_check_vec" , Parent: SPH->getParent(), InsertBefore: SPH); |
| 1252 | BasicBlock *BB4 = |
| 1253 | BasicBlock::Create(Context&: Ctx, Name: "calculate_match" , Parent: SPH->getParent(), InsertBefore: SPH); |
| 1254 | BasicBlock *BB5 = |
| 1255 | BasicBlock::Create(Context&: Ctx, Name: "search_check_vec" , Parent: SPH->getParent(), InsertBefore: SPH); |
| 1256 | |
| 1257 | // Update LoopInfo with the new loops. |
| 1258 | auto OuterLoop = LI->AllocateLoop(); |
| 1259 | auto InnerLoop = LI->AllocateLoop(); |
| 1260 | |
| 1261 | if (auto ParentLoop = CurLoop->getParentLoop()) { |
| 1262 | ParentLoop->addBasicBlockToLoop(NewBB: BB0, LI&: *LI); |
| 1263 | ParentLoop->addChildLoop(NewChild: OuterLoop); |
| 1264 | ParentLoop->addBasicBlockToLoop(NewBB: BB4, LI&: *LI); |
| 1265 | } else { |
| 1266 | LI->addTopLevelLoop(New: OuterLoop); |
| 1267 | } |
| 1268 | |
| 1269 | // Add the inner loop to the outer. |
| 1270 | OuterLoop->addChildLoop(NewChild: InnerLoop); |
| 1271 | |
| 1272 | // Add the new basic blocks to the corresponding loops. |
| 1273 | OuterLoop->addBasicBlockToLoop(NewBB: BB1, LI&: *LI); |
| 1274 | OuterLoop->addBasicBlockToLoop(NewBB: BB3, LI&: *LI); |
| 1275 | OuterLoop->addBasicBlockToLoop(NewBB: BB5, LI&: *LI); |
| 1276 | InnerLoop->addBasicBlockToLoop(NewBB: BB2, LI&: *LI); |
| 1277 | |
| 1278 | // Update the terminator added by SplitBlock to branch to the first block. |
| 1279 | Preheader->getTerminator()->setSuccessor(Idx: 0, BB: BB0); |
| 1280 | DTU.applyUpdates(Updates: {{DominatorTree::Delete, Preheader, SPH}, |
| 1281 | {DominatorTree::Insert, Preheader, BB0}}); |
| 1282 | |
| 1283 | // (0) Check if we could be crossing a page boundary; if so, fallback to the |
| 1284 | // old scalar loops. Also create a predicate of VF elements to be used in the |
| 1285 | // vector loops. |
| 1286 | Builder.SetInsertPoint(BB0); |
| 1287 | Value *ISearchStart = |
| 1288 | Builder.CreatePtrToInt(V: SearchStart, DestTy: I64Ty, Name: "search_start_int" ); |
| 1289 | Value *ISearchEnd = |
| 1290 | Builder.CreatePtrToInt(V: SearchEnd, DestTy: I64Ty, Name: "search_end_int" ); |
| 1291 | Value *INeedleStart = |
| 1292 | Builder.CreatePtrToInt(V: NeedleStart, DestTy: I64Ty, Name: "needle_start_int" ); |
| 1293 | Value *INeedleEnd = |
| 1294 | Builder.CreatePtrToInt(V: NeedleEnd, DestTy: I64Ty, Name: "needle_end_int" ); |
| 1295 | Value *PredVF = |
| 1296 | Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask, Types: {PredVTy, I64Ty}, |
| 1297 | Args: {ConstantInt::get(Ty: I64Ty, V: 0), ConstVF}); |
| 1298 | |
| 1299 | const uint64_t MinPageSize = TTI->getMinPageSize().value(); |
| 1300 | const uint64_t AddrShiftAmt = llvm::Log2_64(Value: MinPageSize); |
| 1301 | Value *SearchStartPage = |
| 1302 | Builder.CreateLShr(LHS: ISearchStart, RHS: AddrShiftAmt, Name: "search_start_page" ); |
| 1303 | Value *SearchEndPage = |
| 1304 | Builder.CreateLShr(LHS: ISearchEnd, RHS: AddrShiftAmt, Name: "search_end_page" ); |
| 1305 | Value *NeedleStartPage = |
| 1306 | Builder.CreateLShr(LHS: INeedleStart, RHS: AddrShiftAmt, Name: "needle_start_page" ); |
| 1307 | Value *NeedleEndPage = |
| 1308 | Builder.CreateLShr(LHS: INeedleEnd, RHS: AddrShiftAmt, Name: "needle_end_page" ); |
| 1309 | Value *SearchPageCmp = |
| 1310 | Builder.CreateICmpNE(LHS: SearchStartPage, RHS: SearchEndPage, Name: "search_page_cmp" ); |
| 1311 | Value *NeedlePageCmp = |
| 1312 | Builder.CreateICmpNE(LHS: NeedleStartPage, RHS: NeedleEndPage, Name: "needle_page_cmp" ); |
| 1313 | |
| 1314 | Value *CombinedPageCmp = |
| 1315 | Builder.CreateOr(LHS: SearchPageCmp, RHS: NeedlePageCmp, Name: "combined_page_cmp" ); |
| 1316 | BranchInst *CombinedPageBr = Builder.CreateCondBr(Cond: CombinedPageCmp, True: SPH, False: BB1); |
| 1317 | CombinedPageBr->setMetadata(KindID: LLVMContext::MD_prof, |
| 1318 | Node: MDBuilder(Ctx).createBranchWeights(TrueWeight: 10, FalseWeight: 90)); |
| 1319 | DTU.applyUpdates( |
| 1320 | Updates: {{DominatorTree::Insert, BB0, SPH}, {DominatorTree::Insert, BB0, BB1}}); |
| 1321 | |
| 1322 | // (1) Load the search array and branch to the inner loop. |
| 1323 | Builder.SetInsertPoint(BB1); |
| 1324 | PHINode *Search = Builder.CreatePHI(Ty: PtrTy, NumReservedValues: 2, Name: "psearch" ); |
| 1325 | Value *PredSearch = Builder.CreateIntrinsic( |
| 1326 | ID: Intrinsic::get_active_lane_mask, Types: {PredVTy, I64Ty}, |
| 1327 | Args: {Builder.CreatePtrToInt(V: Search, DestTy: I64Ty), ISearchEnd}, FMFSource: nullptr, |
| 1328 | Name: "search_pred" ); |
| 1329 | PredSearch = Builder.CreateAnd(LHS: PredVF, RHS: PredSearch, Name: "search_masked" ); |
| 1330 | Value *LoadSearch = Builder.CreateMaskedLoad( |
| 1331 | Ty: CharVTy, Ptr: Search, Alignment: Align(1), Mask: PredSearch, PassThru: Passthru, Name: "search_load_vec" ); |
| 1332 | Value *MatchInit = Constant::getNullValue(Ty: PredVTy); |
| 1333 | Builder.CreateBr(Dest: BB2); |
| 1334 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, BB1, BB2}}); |
| 1335 | |
| 1336 | // (2) Inner loop. |
| 1337 | Builder.SetInsertPoint(BB2); |
| 1338 | PHINode *Needle = Builder.CreatePHI(Ty: PtrTy, NumReservedValues: 2, Name: "pneedle" ); |
| 1339 | PHINode *Match = Builder.CreatePHI(Ty: PredVTy, NumReservedValues: 2, Name: "pmatch" ); |
| 1340 | |
| 1341 | // (2.a) Load the needle array. |
| 1342 | Value *PredNeedle = Builder.CreateIntrinsic( |
| 1343 | ID: Intrinsic::get_active_lane_mask, Types: {PredVTy, I64Ty}, |
| 1344 | Args: {Builder.CreatePtrToInt(V: Needle, DestTy: I64Ty), INeedleEnd}, FMFSource: nullptr, |
| 1345 | Name: "needle_pred" ); |
| 1346 | PredNeedle = Builder.CreateAnd(LHS: PredVF, RHS: PredNeedle, Name: "needle_masked" ); |
| 1347 | Value *LoadNeedle = Builder.CreateMaskedLoad( |
| 1348 | Ty: CharVTy, Ptr: Needle, Alignment: Align(1), Mask: PredNeedle, PassThru: Passthru, Name: "needle_load_vec" ); |
| 1349 | |
| 1350 | // (2.b) Splat the first element to the inactive lanes. |
| 1351 | Value *Needle0 = |
| 1352 | Builder.CreateExtractElement(Vec: LoadNeedle, Idx: uint64_t(0), Name: "needle0" ); |
| 1353 | Value *Needle0Splat = Builder.CreateVectorSplat(EC: ElementCount::getScalable(MinVal: VF), |
| 1354 | V: Needle0, Name: "needle0" ); |
| 1355 | LoadNeedle = Builder.CreateSelect(C: PredNeedle, True: LoadNeedle, False: Needle0Splat, |
| 1356 | Name: "needle_splat" ); |
| 1357 | LoadNeedle = Builder.CreateExtractVector( |
| 1358 | DstType: FixedVectorType::get(ElementType: CharTy, NumElts: VF), SrcVec: LoadNeedle, Idx: uint64_t(0), Name: "needle_vec" ); |
| 1359 | |
| 1360 | // (2.c) Accumulate matches. |
| 1361 | Value *MatchSeg = Builder.CreateIntrinsic( |
| 1362 | ID: Intrinsic::experimental_vector_match, Types: {CharVTy, LoadNeedle->getType()}, |
| 1363 | Args: {LoadSearch, LoadNeedle, PredSearch}, FMFSource: nullptr, Name: "match_segment" ); |
| 1364 | Value *MatchAcc = Builder.CreateOr(LHS: Match, RHS: MatchSeg, Name: "match_accumulator" ); |
| 1365 | Value *NextNeedle = |
| 1366 | Builder.CreateGEP(Ty: CharTy, Ptr: Needle, IdxList: ConstVF, Name: "needle_next_vec" ); |
| 1367 | Builder.CreateCondBr(Cond: Builder.CreateICmpULT(LHS: NextNeedle, RHS: NeedleEnd), True: BB2, False: BB3); |
| 1368 | DTU.applyUpdates( |
| 1369 | Updates: {{DominatorTree::Insert, BB2, BB2}, {DominatorTree::Insert, BB2, BB3}}); |
| 1370 | |
| 1371 | // (3) Check if we found a match. |
| 1372 | Builder.SetInsertPoint(BB3); |
| 1373 | PHINode *MatchPredAccLCSSA = Builder.CreatePHI(Ty: PredVTy, NumReservedValues: 1, Name: "match_pred" ); |
| 1374 | Value *IfAnyMatch = Builder.CreateOrReduce(Src: MatchPredAccLCSSA); |
| 1375 | Builder.CreateCondBr(Cond: IfAnyMatch, True: BB4, False: BB5); |
| 1376 | DTU.applyUpdates( |
| 1377 | Updates: {{DominatorTree::Insert, BB3, BB4}, {DominatorTree::Insert, BB3, BB5}}); |
| 1378 | |
| 1379 | // (4) We found a match. Compute the index of its location and exit. |
| 1380 | Builder.SetInsertPoint(BB4); |
| 1381 | PHINode *MatchLCSSA = Builder.CreatePHI(Ty: PtrTy, NumReservedValues: 1, Name: "match_start" ); |
| 1382 | PHINode *MatchPredLCSSA = Builder.CreatePHI(Ty: PredVTy, NumReservedValues: 1, Name: "match_vec" ); |
| 1383 | Value *MatchCnt = Builder.CreateIntrinsic( |
| 1384 | ID: Intrinsic::experimental_cttz_elts, Types: {I64Ty, PredVTy}, |
| 1385 | Args: {MatchPredLCSSA, /*ZeroIsPoison=*/Builder.getInt1(V: true)}, FMFSource: nullptr, |
| 1386 | Name: "match_idx" ); |
| 1387 | Value *MatchVal = |
| 1388 | Builder.CreateGEP(Ty: CharTy, Ptr: MatchLCSSA, IdxList: MatchCnt, Name: "match_res" ); |
| 1389 | Builder.CreateBr(Dest: ExitSucc); |
| 1390 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, BB4, ExitSucc}}); |
| 1391 | |
| 1392 | // (5) Check if we've reached the end of the search array. |
| 1393 | Builder.SetInsertPoint(BB5); |
| 1394 | Value *NextSearch = |
| 1395 | Builder.CreateGEP(Ty: CharTy, Ptr: Search, IdxList: ConstVF, Name: "search_next_vec" ); |
| 1396 | Builder.CreateCondBr(Cond: Builder.CreateICmpULT(LHS: NextSearch, RHS: SearchEnd), True: BB1, |
| 1397 | False: ExitFail); |
| 1398 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, BB5, BB1}, |
| 1399 | {DominatorTree::Insert, BB5, ExitFail}}); |
| 1400 | |
| 1401 | // Set up the PHI nodes. |
| 1402 | Search->addIncoming(V: SearchStart, BB: BB0); |
| 1403 | Search->addIncoming(V: NextSearch, BB: BB5); |
| 1404 | Needle->addIncoming(V: NeedleStart, BB: BB1); |
| 1405 | Needle->addIncoming(V: NextNeedle, BB: BB2); |
| 1406 | Match->addIncoming(V: MatchInit, BB: BB1); |
| 1407 | Match->addIncoming(V: MatchAcc, BB: BB2); |
| 1408 | // These are needed to retain LCSSA form. |
| 1409 | MatchPredAccLCSSA->addIncoming(V: MatchAcc, BB: BB2); |
| 1410 | MatchLCSSA->addIncoming(V: Search, BB: BB3); |
| 1411 | MatchPredLCSSA->addIncoming(V: MatchPredAccLCSSA, BB: BB3); |
| 1412 | |
| 1413 | // Ensure all Phis in the successors of BB4/BB5 have an incoming value from |
| 1414 | // them. |
| 1415 | fixSuccessorPhis(L: CurLoop, ScalarRes: IndPhi, VectorRes: MatchVal, SuccBB: ExitSucc, IncBB: BB4); |
| 1416 | if (ExitSucc != ExitFail) |
| 1417 | fixSuccessorPhis(L: CurLoop, ScalarRes: IndPhi, VectorRes: MatchVal, SuccBB: ExitFail, IncBB: BB5); |
| 1418 | |
| 1419 | if (VerifyLoops) { |
| 1420 | OuterLoop->verifyLoop(); |
| 1421 | InnerLoop->verifyLoop(); |
| 1422 | if (!OuterLoop->isRecursivelyLCSSAForm(DT: *DT, LI: *LI)) |
| 1423 | report_fatal_error(reason: "Loops must remain in LCSSA form!" ); |
| 1424 | } |
| 1425 | |
| 1426 | return MatchVal; |
| 1427 | } |
| 1428 | |
| 1429 | void LoopIdiomVectorize::transformFindFirstByte( |
| 1430 | PHINode *IndPhi, unsigned VF, Type *CharTy, BasicBlock *ExitSucc, |
| 1431 | BasicBlock *ExitFail, Value *SearchStart, Value *SearchEnd, |
| 1432 | Value *NeedleStart, Value *NeedleEnd) { |
| 1433 | // Insert the find first byte code at the end of the preheader block. |
| 1434 | BasicBlock * = CurLoop->getLoopPreheader(); |
| 1435 | BranchInst *PHBranch = cast<BranchInst>(Val: Preheader->getTerminator()); |
| 1436 | IRBuilder<> Builder(PHBranch); |
| 1437 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); |
| 1438 | Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); |
| 1439 | |
| 1440 | expandFindFirstByte(Builder, DTU, VF, CharTy, IndPhi, ExitSucc, ExitFail, |
| 1441 | SearchStart, SearchEnd, NeedleStart, NeedleEnd); |
| 1442 | |
| 1443 | assert(PHBranch->isUnconditional() && |
| 1444 | "Expected preheader to terminate with an unconditional branch." ); |
| 1445 | |
| 1446 | if (VerifyLoops && CurLoop->getParentLoop()) { |
| 1447 | CurLoop->getParentLoop()->verifyLoop(); |
| 1448 | if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(DT: *DT, LI: *LI)) |
| 1449 | report_fatal_error(reason: "Loops must remain in LCSSA form!" ); |
| 1450 | } |
| 1451 | } |
| 1452 | |