1//===-- VPlanConstruction.cpp - Transforms for initial VPlan construction -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements transforms for initial VPlan construction.
11///
12//===----------------------------------------------------------------------===//
13
14#include "LoopVectorizationPlanner.h"
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanCFG.h"
18#include "VPlanDominatorTree.h"
19#include "VPlanHelpers.h"
20#include "VPlanPatternMatch.h"
21#include "VPlanTransforms.h"
22#include "VPlanUtils.h"
23#include "llvm/ADT/SmallVectorExtras.h"
24#include "llvm/Analysis/Loads.h"
25#include "llvm/Analysis/LoopInfo.h"
26#include "llvm/Analysis/LoopIterator.h"
27#include "llvm/Analysis/OptimizationRemarkEmitter.h"
28#include "llvm/Analysis/ScalarEvolution.h"
29#include "llvm/Analysis/ScalarEvolutionExpressions.h"
30#include "llvm/Analysis/TargetTransformInfo.h"
31#include "llvm/IR/InstrTypes.h"
32#include "llvm/IR/MDBuilder.h"
33#include "llvm/Support/Debug.h"
34#include "llvm/Transforms/Utils/LoopUtils.h"
35#include "llvm/Transforms/Utils/LoopVersioning.h"
36#include "llvm/Transforms/Vectorize/LoopVectorize.h"
37
38#define DEBUG_TYPE "vplan"
39
40using namespace llvm;
41using namespace LoopVectorizationUtils;
42using namespace VPlanPatternMatch;
43
44namespace {
45// Class that is used to build the plain CFG for the incoming IR.
46class PlainCFGBuilder {
47 // The outermost loop of the input loop nest considered for vectorization.
48 Loop *TheLoop;
49
50 // Loop Info analysis.
51 LoopInfo *LI;
52
53 // Loop versioning for alias metadata.
54 LoopVersioning *LVer;
55
56 // Vectorization plan that we are working on.
57 std::unique_ptr<VPlan> Plan;
58
59 // Builder of the VPlan instruction-level representation.
60 VPBuilder VPIRBuilder;
61
62 // NOTE: The following maps are intentionally destroyed after the plain CFG
63 // construction because subsequent VPlan-to-VPlan transformation may
64 // invalidate them.
65 // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
66 DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
67 // Map incoming Value definitions to their newly-created VPValues.
68 DenseMap<Value *, VPValue *> IRDef2VPValue;
69
70 // Hold phi node's that need to be fixed once the plain CFG has been built.
71 SmallVector<PHINode *, 8> PhisToFix;
72
73 // Utility functions.
74 void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
75 void fixHeaderPhis();
76 VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
77#ifndef NDEBUG
78 bool isExternalDef(Value *Val);
79#endif
80 VPValue *getOrCreateVPOperand(Value *IRVal);
81 void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
82
83public:
84 PlainCFGBuilder(Loop *Lp, LoopInfo *LI, LoopVersioning *LVer, Type *IdxTy)
85 : TheLoop(Lp), LI(LI), LVer(LVer),
86 Plan(std::make_unique<VPlan>(args&: Lp, args&: IdxTy)) {}
87
88 /// Build plain CFG for TheLoop and connect it to Plan's entry.
89 std::unique_ptr<VPlan> buildPlainCFG();
90};
91} // anonymous namespace
92
93// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
94// must have no predecessors.
95void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
96 // Collect VPBB predecessors.
97 SmallVector<VPBlockBase *, 2> VPBBPreds;
98 for (BasicBlock *Pred : predecessors(BB))
99 VPBBPreds.push_back(Elt: getOrCreateVPBB(BB: Pred));
100 VPBB->setPredecessors(VPBBPreds);
101}
102
103static bool isHeaderBB(BasicBlock *BB, Loop *L) {
104 return L && BB == L->getHeader();
105}
106
107// Add operands to VPInstructions representing phi nodes from the input IR.
108void PlainCFGBuilder::fixHeaderPhis() {
109 for (auto *Phi : PhisToFix) {
110 assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
111 VPValue *VPVal = IRDef2VPValue[Phi];
112 assert(isa<VPPhi>(VPVal) && "Expected VPPhi for phi node.");
113 auto *PhiR = cast<VPPhi>(Val: VPVal);
114 assert(PhiR->getNumOperands() == 0 && "Expected VPPhi with no operands.");
115 assert(isHeaderBB(Phi->getParent(), LI->getLoopFor(Phi->getParent())) &&
116 "Expected Phi in header block.");
117 assert(Phi->getNumOperands() == 2 &&
118 "header phi must have exactly 2 operands");
119 for (BasicBlock *Pred : predecessors(BB: Phi->getParent()))
120 PhiR->addIncoming(
121 IncomingV: getOrCreateVPOperand(IRVal: Phi->getIncomingValueForBlock(BB: Pred)));
122 }
123}
124
125// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
126// existing one if it was already created.
127VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
128 if (auto *VPBB = BB2VPBB.lookup(Val: BB)) {
129 // Retrieve existing VPBB.
130 return VPBB;
131 }
132
133 // Create new VPBB.
134 StringRef Name = BB->getName();
135 LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
136 VPBasicBlock *VPBB = Plan->createVPBasicBlock(Name);
137 BB2VPBB[BB] = VPBB;
138 return VPBB;
139}
140
141#ifndef NDEBUG
142// Return true if \p Val is considered an external definition. An external
143// definition is either:
144// 1. A Value that is not an Instruction. This will be refined in the future.
145// 2. An Instruction that is outside of the IR region represented in VPlan,
146// i.e., is not part of the loop nest.
147bool PlainCFGBuilder::isExternalDef(Value *Val) {
148 // All the Values that are not Instructions are considered external
149 // definitions for now.
150 Instruction *Inst = dyn_cast<Instruction>(Val);
151 if (!Inst)
152 return true;
153
154 // Check whether Instruction definition is in loop body.
155 return !TheLoop->contains(Inst);
156}
157#endif
158
159// Create a new VPValue or retrieve an existing one for the Instruction's
160// operand \p IRVal. This function must only be used to create/retrieve VPValues
161// for *Instruction's operands* and not to create regular VPInstruction's. For
162// the latter, please, look at 'createVPInstructionsForVPBB'.
163VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
164 auto VPValIt = IRDef2VPValue.find(Val: IRVal);
165 if (VPValIt != IRDef2VPValue.end())
166 // Operand has an associated VPInstruction or VPValue that was previously
167 // created.
168 return VPValIt->second;
169
170 // Operand doesn't have a previously created VPInstruction/VPValue. This
171 // means that operand is:
172 // A) a definition external to VPlan,
173 // B) any other Value without specific representation in VPlan.
174 // For now, we use VPValue to represent A and B and classify both as external
175 // definitions. We may introduce specific VPValue subclasses for them in the
176 // future.
177 assert(isExternalDef(IRVal) && "Expected external definition as operand.");
178
179 // A and B: Create VPValue and add it to the pool of external definitions and
180 // to the Value->VPValue map.
181 VPValue *NewVPVal = Plan->getOrAddLiveIn(V: IRVal);
182 IRDef2VPValue[IRVal] = NewVPVal;
183 return NewVPVal;
184}
185
186// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
187// counterpart. This function must be invoked in RPO so that the operands of a
188// VPInstruction in \p BB have been visited before (except for Phi nodes).
189void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
190 BasicBlock *BB) {
191 VPIRBuilder.setInsertPoint(VPBB);
192 // TODO: Model and preserve debug intrinsics in VPlan.
193 for (Instruction &InstRef : *BB) {
194 Instruction *Inst = &InstRef;
195
196 // There shouldn't be any VPValue for Inst at this point. Otherwise, we
197 // visited Inst when we shouldn't, breaking the RPO traversal order.
198 assert(!IRDef2VPValue.count(Inst) &&
199 "Instruction shouldn't have been visited.");
200
201 if (isa<UncondBrInst>(Val: Inst))
202 // Skip the rest of the Instruction processing for Branch instructions.
203 continue;
204
205 if (auto *Br = dyn_cast<CondBrInst>(Val: Inst)) {
206 // Conditional branch instruction are represented using BranchOnCond
207 // recipes.
208 VPValue *Cond = getOrCreateVPOperand(IRVal: Br->getCondition());
209 VPIRBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond}, Inst, Flags: {},
210 MD: VPIRMetadata(*Inst), DL: Inst->getDebugLoc());
211 continue;
212 }
213
214 if (auto *SI = dyn_cast<SwitchInst>(Val: Inst)) {
215 // Don't emit recipes for unconditional switch instructions.
216 if (SI->getNumCases() == 0)
217 continue;
218 SmallVector<VPValue *> Ops = {getOrCreateVPOperand(IRVal: SI->getCondition())};
219 for (auto Case : SI->cases())
220 Ops.push_back(Elt: getOrCreateVPOperand(IRVal: Case.getCaseValue()));
221 VPIRBuilder.createNaryOp(Opcode: Instruction::Switch, Operands: Ops, Inst, Flags: {},
222 MD: VPIRMetadata(*Inst), DL: Inst->getDebugLoc());
223 continue;
224 }
225
226 VPSingleDefRecipe *NewR;
227 if (auto *Phi = dyn_cast<PHINode>(Val: Inst)) {
228 // Phi node's operands may not have been visited at this point. We create
229 // an empty VPInstruction that we will fix once the whole plain CFG has
230 // been built.
231 NewR = VPIRBuilder.createScalarPhi(IncomingValues: {}, DL: Phi->getDebugLoc(), Name: "vec.phi",
232 Flags: *Phi, ResultTy: Phi->getType());
233 NewR->setUnderlyingValue(Phi);
234 if (isHeaderBB(BB: Phi->getParent(), L: LI->getLoopFor(BB: Phi->getParent()))) {
235 // Header phis need to be fixed after the VPBB for the latch has been
236 // created.
237 PhisToFix.push_back(Elt: Phi);
238 } else {
239 // Add operands for VPPhi in the order matching its predecessors in
240 // VPlan.
241 DenseMap<const VPBasicBlock *, VPValue *> VPPredToIncomingValue;
242 for (unsigned I = 0; I != Phi->getNumOperands(); ++I) {
243 VPPredToIncomingValue[BB2VPBB[Phi->getIncomingBlock(i: I)]] =
244 getOrCreateVPOperand(IRVal: Phi->getIncomingValue(i: I));
245 }
246 for (VPBlockBase *Pred : VPBB->getPredecessors())
247 cast<VPPhi>(Val: NewR)->addIncoming(
248 IncomingV: VPPredToIncomingValue.lookup(Val: Pred->getExitingBasicBlock()));
249 }
250 } else {
251 // Build VPIRMetadata from the instruction and add loop versioning
252 // metadata for loads and stores.
253 VPIRMetadata MD(*Inst);
254 if (isa<LoadInst, StoreInst>(Val: Inst) && LVer) {
255 const auto &[AliasScopeMD, NoAliasMD] =
256 LVer->getNoAliasMetadataFor(OrigInst: Inst);
257 if (AliasScopeMD)
258 MD.setMetadata(Kind: LLVMContext::MD_alias_scope, Node: AliasScopeMD);
259 if (NoAliasMD)
260 MD.setMetadata(Kind: LLVMContext::MD_noalias, Node: NoAliasMD);
261 }
262
263 // Translate LLVM-IR operands into VPValue operands and set them in the
264 // new VPInstruction.
265 SmallVector<VPValue *, 4> VPOperands;
266 for (Value *Op : Inst->operands())
267 VPOperands.push_back(Elt: getOrCreateVPOperand(IRVal: Op));
268
269 if (auto *CI = dyn_cast<CastInst>(Val: Inst)) {
270 NewR = VPIRBuilder.createScalarCast(Opcode: CI->getOpcode(), Op: VPOperands[0],
271 ResultTy: CI->getType(), DL: CI->getDebugLoc(),
272 Flags: VPIRFlags(*CI), Metadata: MD);
273 NewR->setUnderlyingValue(CI);
274 } else if (auto *LI = dyn_cast<LoadInst>(Val: Inst)) {
275 NewR = VPIRBuilder.createScalarLoad(ResultTy: LI->getType(), Addr: VPOperands[0],
276 DL: LI->getDebugLoc(), Metadata: MD);
277 NewR->setUnderlyingValue(LI);
278 } else {
279 // Build VPInstruction for any arbitrary Instruction without specific
280 // representation in VPlan.
281 NewR = VPIRBuilder.createNaryOp(
282 Opcode: Inst->getOpcode(), Operands: VPOperands, Inst, Flags: VPIRFlags(*Inst), MD,
283 DL: Inst->getDebugLoc(), Name: "", ResultTy: Inst->getType());
284 }
285 }
286
287 IRDef2VPValue[Inst] = NewR;
288 }
289}
290
291// Main interface to build the plain CFG.
292std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() {
293 VPIRBasicBlock *Entry = cast<VPIRBasicBlock>(Val: Plan->getEntry());
294 BB2VPBB[Entry->getIRBasicBlock()] = Entry;
295 for (VPIRBasicBlock *ExitVPBB : Plan->getExitBlocks())
296 BB2VPBB[ExitVPBB->getIRBasicBlock()] = ExitVPBB;
297
298 // 1. Scan the body of the loop in a topological order to visit each basic
299 // block after having visited its predecessor basic blocks. Create a VPBB for
300 // each BB and link it to its successor and predecessor VPBBs. Note that
301 // predecessors must be set in the same order as they are in the incomming IR.
302 // Otherwise, there might be problems with existing phi nodes and algorithm
303 // based on predecessors traversal.
304
305 // Loop PH needs to be explicitly visited since it's not taken into account by
306 // LoopBlocksDFS.
307 BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader();
308 assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
309 "Unexpected loop preheader");
310 for (auto &I : *ThePreheaderBB) {
311 if (I.getType()->isVoidTy())
312 continue;
313 IRDef2VPValue[&I] = Plan->getOrAddLiveIn(V: &I);
314 }
315
316 LoopBlocksRPO RPO(TheLoop);
317 RPO.perform(LI);
318
319 for (BasicBlock *BB : RPO) {
320 // Create or retrieve the VPBasicBlock for this BB.
321 VPBasicBlock *VPBB = getOrCreateVPBB(BB);
322 // Set VPBB predecessors in the same order as they are in the incoming BB.
323 setVPBBPredsFromBB(VPBB, BB);
324
325 // Create VPInstructions for BB.
326 createVPInstructionsForVPBB(VPBB, BB);
327
328 // Set VPBB successors. We create empty VPBBs for successors if they don't
329 // exist already. Recipes will be created when the successor is visited
330 // during the RPO traversal.
331 if (auto *SI = dyn_cast<SwitchInst>(Val: BB->getTerminator())) {
332 SmallVector<VPBlockBase *> Succs = {
333 getOrCreateVPBB(BB: SI->getDefaultDest())};
334 for (auto Case : SI->cases())
335 Succs.push_back(Elt: getOrCreateVPBB(BB: Case.getCaseSuccessor()));
336 VPBB->setSuccessors(Succs);
337 continue;
338 }
339 if (auto *BI = dyn_cast<UncondBrInst>(Val: BB->getTerminator())) {
340 VPBB->setOneSuccessor(getOrCreateVPBB(BB: BI->getSuccessor()));
341 continue;
342 }
343 auto *BI = cast<CondBrInst>(Val: BB->getTerminator());
344 BasicBlock *IRSucc0 = BI->getSuccessor(i: 0);
345 BasicBlock *IRSucc1 = BI->getSuccessor(i: 1);
346 VPBasicBlock *Successor0 = getOrCreateVPBB(BB: IRSucc0);
347 VPBasicBlock *Successor1 = getOrCreateVPBB(BB: IRSucc1);
348 VPBB->setTwoSuccessors(IfTrue: Successor0, IfFalse: Successor1);
349 }
350
351 for (auto *EB : Plan->getExitBlocks())
352 setVPBBPredsFromBB(VPBB: EB, BB: EB->getIRBasicBlock());
353
354 // 2. The whole CFG has been built at this point so all the input Values must
355 // have a VPlan counterpart. Fix VPlan header phi by adding their
356 // corresponding VPlan operands.
357 fixHeaderPhis();
358
359 Plan->getEntry()->setOneSuccessor(getOrCreateVPBB(BB: TheLoop->getHeader()));
360 Plan->getEntry()->setPlan(&*Plan);
361
362 // Fix VPlan loop-closed-ssa exit phi's by adding incoming operands to the
363 // VPIRInstructions wrapping them.
364 // // Note that the operand order corresponds to IR predecessor order, and may
365 // need adjusting when VPlan predecessors are added, if an exit block has
366 // multiple predecessor.
367 for (auto *EB : Plan->getExitBlocks()) {
368 for (VPRecipeBase &R : EB->phis()) {
369 auto *PhiR = cast<VPIRPhi>(Val: &R);
370 PHINode &Phi = PhiR->getIRPhi();
371 assert(PhiR->getNumOperands() == 0 &&
372 "no phi operands should be added yet");
373 for (BasicBlock *Pred : predecessors(BB: EB->getIRBasicBlock()))
374 PhiR->addIncoming(
375 IncomingV: getOrCreateVPOperand(IRVal: Phi.getIncomingValueForBlock(BB: Pred)));
376 }
377 }
378
379 LLVM_DEBUG(Plan->setName("Plain CFG\n"); dbgs() << *Plan);
380 return std::move(Plan);
381}
382
383/// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it
384/// has exactly 2 predecessors (preheader and latch), where the block
385/// dominates the latch and the preheader dominates the block. If it is a
386/// header block return true and canonicalize the predecessors of the header
387/// (making sure the preheader appears first and the latch second) and the
388/// successors of the latch (making sure the loop exit comes first). Otherwise
389/// return false.
390static bool canonicalHeaderAndLatch(VPBlockBase *HeaderVPB,
391 const VPDominatorTree &VPDT) {
392 ArrayRef<VPBlockBase *> Preds = HeaderVPB->getPredecessors();
393 if (Preds.size() != 2)
394 return false;
395
396 auto *PreheaderVPBB = Preds[0];
397 auto *LatchVPBB = Preds[1];
398 if (!VPDT.dominates(A: PreheaderVPBB, B: HeaderVPB) ||
399 !VPDT.dominates(A: HeaderVPB, B: LatchVPBB)) {
400 std::swap(a&: PreheaderVPBB, b&: LatchVPBB);
401
402 if (!VPDT.dominates(A: PreheaderVPBB, B: HeaderVPB) ||
403 !VPDT.dominates(A: HeaderVPB, B: LatchVPBB))
404 return false;
405
406 // Canonicalize predecessors of header so that preheader is first and
407 // latch second.
408 HeaderVPB->swapPredecessors();
409 for (VPRecipeBase &R : cast<VPBasicBlock>(Val: HeaderVPB)->phis())
410 R.swapOperands();
411 }
412
413 // The two successors of conditional branch match the condition, with the
414 // first successor corresponding to true and the second to false. We
415 // canonicalize the successors of the latch when introducing the region, such
416 // that the latch exits the region when its condition is true; invert the
417 // original condition if the original CFG branches to the header on true.
418 // Note that the exit edge is not yet connected for top-level loops.
419 if (LatchVPBB->getSingleSuccessor() ||
420 LatchVPBB->getSuccessors()[0] != HeaderVPB)
421 return true;
422
423 assert(LatchVPBB->getNumSuccessors() == 2 && "Must have 2 successors");
424 auto *Term = cast<VPBasicBlock>(Val: LatchVPBB)->getTerminator();
425 assert(cast<VPInstruction>(Term)->getOpcode() ==
426 VPInstruction::BranchOnCond &&
427 "terminator must be a BranchOnCond");
428 auto *Not = new VPInstruction(VPInstruction::Not, {Term->getOperand(N: 0)});
429 Not->insertBefore(InsertPos: Term);
430 Term->setOperand(I: 0, New: Not);
431 LatchVPBB->swapSuccessors();
432
433 return true;
434}
435
436/// Create a new VPRegionBlock for the loop starting at \p HeaderVPB. For the
437/// outermost loop adjust the regions exiting terminator to be based on the
438/// canonical IV.
439static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB, DebugLoc DL) {
440 auto *PreheaderVPBB = HeaderVPB->getPredecessors()[0];
441 auto *LatchVPBB = cast<VPBasicBlock>(Val: HeaderVPB->getPredecessors()[1]);
442 auto *OutermostHeaderVPBB =
443 VPBlockUtils::getPlainCFGHeaderAndLatch(Plan).first;
444
445 VPBlockUtils::disconnectBlocks(From: PreheaderVPBB, To: HeaderVPB);
446 VPBlockUtils::disconnectBlocks(From: LatchVPBB, To: HeaderVPB);
447
448 // Create an empty region first and insert it between PreheaderVPBB and
449 // the exit blocks, taking care to preserve the original predecessor &
450 // successor order of blocks. Set region entry and exiting after both
451 // HeaderVPB and LatchVPBB have been disconnected from their
452 // predecessors/successors. Only the outermost loop has a canonical IV. Nested
453 // loops are assigned a canonical IV of null type and unknown debug location.
454 bool IsOutermost = HeaderVPB == OutermostHeaderVPBB;
455 Type *CanIVTy = nullptr;
456 if (IsOutermost)
457 CanIVTy = Plan.getVectorTripCount().getType();
458 else
459 DL = DebugLoc::getUnknown();
460 auto *R = Plan.createLoopRegion(CanIVTy, DL);
461
462 // Transfer latch's successors to the region.
463 VPBlockUtils::transferSuccessors(Old: LatchVPBB, New: R);
464
465 VPBlockUtils::connectBlocks(From: PreheaderVPBB, To: R);
466 R->setEntry(HeaderVPB);
467 R->setExiting(LatchVPBB);
468
469 // All VPBB's reachable shallowly from HeaderVPB belong to the current region.
470 for (VPBlockBase *VPBB : vp_depth_first_shallow(G: HeaderVPB))
471 VPBB->setParent(R);
472
473 if (!IsOutermost)
474 return;
475
476 auto *LatchTerm = LatchVPBB->getTerminator();
477 VPBuilder Builder(LatchTerm);
478 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
479 // Initially the induction increment is guaranteed to not wrap, but that may
480 // change later, e.g. when tail-folding, when the flags need to be dropped.
481 auto *CanonicalIVIncrement = Builder.createAdd(
482 LHS: R->getCanonicalIV(), RHS: &Plan.getVFxUF(), DL, Name: "index.next", WrapFlags: {true, false});
483
484 if (match(V: LatchTerm, P: m_BranchOnTwoConds())) {
485 auto *IsLatchExitTaken = Builder.createICmp(
486 Pred: CmpInst::ICMP_EQ, A: CanonicalIVIncrement, B: &Plan.getVectorTripCount());
487 LatchTerm->setOperand(I: 1, New: IsLatchExitTaken);
488 } else {
489 // We are replacing the branch to exit the region. Remove the original
490 // BranchOnCond.
491 assert(match(LatchTerm, m_BranchOnCond()) && "Unexpected terminator");
492 DebugLoc LatchDL = LatchTerm->getDebugLoc();
493 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCount,
494 Operands: {CanonicalIVIncrement, &Plan.getVectorTripCount()},
495 DL: LatchDL);
496 LatchTerm->eraseFromParent();
497 }
498}
499
500/// Creates extracts for values in \p Plan defined in a loop region and used
501/// outside a loop region.
502static void createExtractsForLiveOuts(VPlan &Plan, VPBasicBlock *MiddleVPBB) {
503 VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
504 for (VPBasicBlock *EB : Plan.getExitBlocks()) {
505 if (!is_contained(Range: EB->predecessors(), Element: MiddleVPBB))
506 continue;
507
508 for (VPRecipeBase &R : EB->phis()) {
509 auto *ExitIRI = cast<VPIRPhi>(Val: &R);
510 VPValue *Exiting = ExitIRI->getIncomingValueForBlock(VPBB: MiddleVPBB);
511 if (isa<VPIRValue>(Val: Exiting))
512 continue;
513 Exiting = B.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: Exiting);
514 Exiting = B.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: Exiting);
515 ExitIRI->setIncomingValueForBlock(VPBB: MiddleVPBB, V: Exiting);
516 }
517 }
518}
519
520static void addInitialSkeleton(VPlan &Plan, Type *InductionTy,
521 PredicatedScalarEvolution &PSE, Loop *TheLoop) {
522 VPDominatorTree VPDT(Plan);
523
524 auto *HeaderVPBB = cast<VPBasicBlock>(Val: Plan.getEntry()->getSingleSuccessor());
525 canonicalHeaderAndLatch(HeaderVPB: HeaderVPBB, VPDT);
526 auto *LatchVPBB = cast<VPBasicBlock>(Val: HeaderVPBB->getPredecessors()[1]);
527
528 VPBasicBlock *VecPreheader = Plan.createVPBasicBlock(Name: "vector.ph");
529 VPBlockUtils::insertBlockAfter(NewBlock: VecPreheader, BlockPtr: Plan.getEntry());
530
531 VPBasicBlock *MiddleVPBB = Plan.createVPBasicBlock(Name: "middle.block");
532 // The canonical LatchVPBB has the header block as last successor. If it has
533 // another successor, this successor is an exit block - insert middle block on
534 // its edge. Otherwise, add middle block as another successor retaining header
535 // as last. In the latter case, the latch has no conditional terminator yet,
536 // so insert a placeholder BranchOnCond that always continues to the header.
537 // It will be canonicalized to a BranchOnCount later
538 if (LatchVPBB->getNumSuccessors() == 2) {
539 VPBlockBase *LatchExitVPB = LatchVPBB->getSuccessors()[0];
540 VPBlockUtils::insertOnEdge(From: LatchVPBB, To: LatchExitVPB, BlockPtr: MiddleVPBB);
541 } else {
542 VPBlockUtils::connectBlocks(From: LatchVPBB, To: MiddleVPBB);
543 LatchVPBB->swapSuccessors();
544 VPBuilder(LatchVPBB).createNaryOp(Opcode: VPInstruction::BranchOnCond,
545 Operands: {Plan.getFalse()});
546 }
547
548 // Create SCEV and VPValue for the trip count.
549 // We use the symbolic max backedge-taken-count, which works also when
550 // vectorizing loops with uncountable early exits.
551 const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
552 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
553 "Invalid backedge-taken count");
554 ScalarEvolution &SE = *PSE.getSE();
555 const SCEV *TripCount = SE.getTripCountFromExitCount(ExitCount: BackedgeTakenCountSCEV,
556 EvalTy: InductionTy, L: TheLoop);
557 Plan.setTripCount(vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: TripCount));
558
559 VPBasicBlock *ScalarPH = Plan.createVPBasicBlock(Name: "scalar.ph");
560 VPBlockUtils::connectBlocks(From: ScalarPH, To: Plan.getScalarHeader());
561
562 // The connection order corresponds to the operands of the conditional branch,
563 // with the middle block already connected to the exit block.
564 VPBlockUtils::connectBlocks(From: MiddleVPBB, To: ScalarPH);
565 // Also connect the entry block to the scalar preheader.
566 // TODO: Also introduce a branch recipe together with the minimum trip count
567 // check.
568 VPBlockUtils::connectBlocks(From: Plan.getEntry(), To: ScalarPH);
569 Plan.getEntry()->swapSuccessors();
570
571 createExtractsForLiveOuts(Plan, MiddleVPBB);
572
573 // Create resume phis in the scalar preheader for each phi in the scalar loop.
574 // Their incoming value from the vector loop will be the last lane of the
575 // corresponding vector loop header phi.
576 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
577 VPBuilder ScalarPHBuilder(ScalarPH);
578 assert(equal(ScalarPH->getPredecessors(),
579 ArrayRef<VPBlockBase *>({MiddleVPBB, Plan.getEntry()})) &&
580 "unexpected predecessor order of scalar ph");
581 for (const auto &[PhiR, ScalarPhiR] :
582 zip_equal(t: HeaderVPBB->phis(), u: Plan.getScalarHeader()->phis())) {
583 auto *VectorPhiR = cast<VPPhi>(Val: &PhiR);
584 VPValue *BackedgeVal = VectorPhiR->getOperand(N: 1);
585 VPValue *ResumeFromVectorLoop =
586 MiddleBuilder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: BackedgeVal);
587 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
588 Opcode: VPInstruction::ExtractLastLane, Operands: ResumeFromVectorLoop);
589 // Create scalar resume phi, with the first operand being the incoming value
590 // from the middle block and the second operand coming from the entry block.
591 auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
592 IncomingValues: {ResumeFromVectorLoop, VectorPhiR->getOperand(N: 0)},
593 DL: VectorPhiR->getDebugLoc());
594 cast<VPIRPhi>(Val: &ScalarPhiR)->addIncoming(IncomingV: ResumePhiR);
595 }
596}
597
598/// Check \p Plan's live-in and replace them with constants, if they can be
599/// simplified via SCEV.
600static void simplifyLiveInsWithSCEV(VPlan &Plan,
601 PredicatedScalarEvolution &PSE) {
602 auto GetSimplifiedLiveInViaSCEV = [&](VPValue *VPV) -> VPValue * {
603 const SCEV *Expr = vputils::getSCEVExprForVPValue(V: VPV, PSE);
604 if (auto *C = dyn_cast<SCEVConstant>(Val: Expr))
605 return Plan.getOrAddLiveIn(V: C->getValue());
606 return nullptr;
607 };
608
609 for (VPValue *LiveIn : to_vector(Range: Plan.getLiveIns())) {
610 if (VPValue *SimplifiedLiveIn = GetSimplifiedLiveInViaSCEV(LiveIn))
611 LiveIn->replaceAllUsesWith(New: SimplifiedLiveIn);
612 }
613}
614
615/// To make RUN_VPLAN_PASS print initial VPlan.
616static void printAfterInitialConstruction(VPlan &) {}
617
618std::unique_ptr<VPlan>
619VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy,
620 PredicatedScalarEvolution &PSE,
621 LoopVersioning *LVer) {
622 PlainCFGBuilder Builder(TheLoop, &LI, LVer, InductionTy);
623 std::unique_ptr<VPlan> VPlan0 = Builder.buildPlainCFG();
624 addInitialSkeleton(Plan&: *VPlan0, InductionTy, PSE, TheLoop);
625 simplifyLiveInsWithSCEV(Plan&: *VPlan0, PSE);
626
627 RUN_VPLAN_PASS_NO_VERIFY(printAfterInitialConstruction, *VPlan0);
628 return VPlan0;
629}
630
631/// Creates a VPWidenIntOrFpInductionRecipe or VPWidenPointerInductionRecipe
632/// for \p Phi based on \p IndDesc.
633static VPHeaderPHIRecipe *
634createWidenInductionRecipe(PHINode *Phi, VPPhi *PhiR, VPIRValue *Start,
635 const InductionDescriptor &IndDesc, VPlan &Plan,
636 PredicatedScalarEvolution &PSE, Loop &OrigLoop,
637 DebugLoc DL) {
638 [[maybe_unused]] ScalarEvolution &SE = *PSE.getSE();
639 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
640 "step must be loop invariant");
641 assert((Plan.getLiveIn(IndDesc.getStartValue()) == Start ||
642 (SE.isSCEVable(IndDesc.getStartValue()->getType()) &&
643 PSE.getSCEV(IndDesc.getStartValue()) ==
644 vputils::getSCEVExprForVPValue(Start, PSE))) &&
645 "Start VPValue must match IndDesc's start value");
646
647 VPValue *Step =
648 vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep());
649
650 VPValue *BackedgeVal = PhiR->getOperand(N: 1);
651 // Replace live-out extracts of WideIV's backedge value by ExitingIVValue
652 // recipes. optimizeInductionLiveOutUsers will later compute the proper
653 // DerivedIV.
654 auto ReplaceExtractsWithExitingIVValue = [&](VPHeaderPHIRecipe *WideIV) {
655 for (VPUser *U : to_vector(Range: BackedgeVal->users())) {
656 if (!match(U, P: m_ExtractLastPart(Op0: m_VPValue())))
657 continue;
658 auto *ExtractLastPart = cast<VPInstruction>(Val: U);
659 VPUser *ExtractLastPartUser = ExtractLastPart->getSingleUser();
660 assert(ExtractLastPartUser && "must have a single user");
661 if (!match(U: ExtractLastPartUser, P: m_ExtractLastLane(Op0: m_VPValue())))
662 continue;
663 auto *ExtractLastLane = cast<VPInstruction>(Val: ExtractLastPartUser);
664 assert(is_contained(ExtractLastLane->getParent()->successors(),
665 Plan.getScalarPreheader()) &&
666 "last lane must be extracted in the middle block");
667 VPBuilder Builder(ExtractLastLane);
668 ExtractLastLane->replaceAllUsesWith(
669 New: Builder.createNaryOp(Opcode: VPInstruction::ExitingIVValue, Operands: {WideIV}));
670 ExtractLastLane->eraseFromParent();
671 ExtractLastPart->eraseFromParent();
672 }
673 };
674
675 if (IndDesc.getKind() == InductionDescriptor::IK_PtrInduction) {
676 auto *WideIV = new VPWidenPointerInductionRecipe(
677 Phi, Start, Step, &Plan.getVFxUF(), IndDesc, DL);
678 ReplaceExtractsWithExitingIVValue(WideIV);
679 return WideIV;
680 }
681
682 assert((IndDesc.getKind() == InductionDescriptor::IK_IntInduction ||
683 IndDesc.getKind() == InductionDescriptor::IK_FpInduction) &&
684 "must have an integer or float induction at this point");
685
686 // Update wide induction increments to use the same step as the corresponding
687 // wide induction. This enables detecting induction increments directly in
688 // VPlan and removes redundant splats.
689 if (match(V: BackedgeVal, P: m_Add(Op0: m_Specific(VPV: PhiR), Op1: m_VPValue())))
690 BackedgeVal->getDefiningRecipe()->setOperand(I: 1, New: Step);
691
692 // It is always safe to copy over the NoWrap and FastMath flags. In
693 // particular, when folding tail by masking, the masked-off lanes are never
694 // used, so it is safe.
695 VPIRFlags Flags = vputils::getFlagsFromIndDesc(ID: IndDesc);
696
697 auto *WideIV = new VPWidenIntOrFpInductionRecipe(
698 Phi, Start, Step, &Plan.getVF(), IndDesc, Flags, DL);
699
700 ReplaceExtractsWithExitingIVValue(WideIV);
701 return WideIV;
702}
703
704/// Try to sink users of \p FOR after \p Previous. \returns true if sinking
705/// succeeded or was not necessary, and false otherwise.
706static bool
707sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
708 VPRecipeBase *Previous,
709 const VPDominatorTree &VPDT) {
710 // Collect recipes that need sinking.
711 SmallVector<VPRecipeBase *> WorkList;
712 SmallPtrSet<VPRecipeBase *, 8> Seen;
713 Seen.insert(Ptr: Previous);
714 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
715 // The previous value must not depend on the users of the recurrence phi.
716 // In that case, FOR is not a fixed order recurrence.
717 if (SinkCandidate == Previous)
718 return false;
719
720 if (isa<VPHeaderPHIRecipe>(Val: SinkCandidate) ||
721 !Seen.insert(Ptr: SinkCandidate).second ||
722 VPDT.properlyDominates(A: Previous, B: SinkCandidate))
723 return true;
724
725 if (vputils::cannotHoistOrSinkRecipe(R: *SinkCandidate, /*Sinking=*/true))
726 return false;
727
728 WorkList.push_back(Elt: SinkCandidate);
729 return true;
730 };
731
732 // Recursively sink users of FOR after Previous.
733 WorkList.push_back(Elt: FOR);
734 for (unsigned I = 0; I != WorkList.size(); ++I) {
735 VPRecipeBase *Current = WorkList[I];
736 assert(Current->getNumDefinedValues() == 1 &&
737 "only recipes with a single defined value expected");
738
739 for (VPUser *User : Current->getVPSingleValue()->users()) {
740 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(Val: User)))
741 return false;
742 }
743 }
744
745 // Keep recipes to sink ordered by dominance so earlier instructions are
746 // processed first.
747 sort(C&: WorkList, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
748 return VPDT.properlyDominates(A, B);
749 });
750
751 for (VPRecipeBase *SinkCandidate : WorkList) {
752 if (SinkCandidate == FOR)
753 continue;
754
755 SinkCandidate->moveAfter(MovePos: Previous);
756 Previous = SinkCandidate;
757 }
758 return true;
759}
760
761/// Try to hoist \p Previous and its operands before all users of \p FOR.
762/// \returns true if hoisting succeeded or was not necessary, and false
763/// otherwise.
764static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
765 VPRecipeBase *Previous,
766 const VPDominatorTree &VPDT) {
767 if (vputils::cannotHoistOrSinkRecipe(R: *Previous))
768 return false;
769
770 // Collect recipes that need hoisting.
771 SmallVector<VPRecipeBase *> HoistCandidates;
772 SmallPtrSet<VPRecipeBase *, 8> Visited;
773 // Find the closest hoist point by looking at all users of FOR and selecting
774 // the recipe dominating all other users.
775 VPRecipeBase *HoistPoint = nullptr;
776 for (VPUser *U : FOR->users()) {
777 auto *R = cast<VPRecipeBase>(Val: U);
778 if (!HoistPoint || VPDT.properlyDominates(A: R, B: HoistPoint))
779 HoistPoint = R;
780 }
781 assert(all_of(FOR->users(),
782 [&VPDT, HoistPoint](VPUser *U) {
783 auto *R = cast<VPRecipeBase>(U);
784 return HoistPoint == R ||
785 VPDT.properlyDominates(HoistPoint, R);
786 }) &&
787 "HoistPoint must dominate all users of FOR");
788
789 auto NeedsHoisting = [HoistPoint, &VPDT,
790 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
791 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
792 if (!HoistCandidate)
793 return nullptr;
794 // Hoist candidate was already visited, no need to hoist.
795 if (!Visited.insert(Ptr: HoistCandidate).second)
796 return nullptr;
797 // If we reached a recipe that dominates HoistPoint, we don't need to
798 // hoist the recipe.
799 if (VPDT.properlyDominates(A: HoistCandidate, B: HoistPoint))
800 return nullptr;
801 return HoistCandidate;
802 };
803
804 if (!NeedsHoisting(Previous->getVPSingleValue()))
805 return true;
806
807 // Recursively try to hoist Previous and its operands before all users of
808 // FOR.
809 HoistCandidates.push_back(Elt: Previous);
810
811 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
812 VPRecipeBase *Current = HoistCandidates[I];
813 assert(Current->getNumDefinedValues() == 1 &&
814 "only recipes with a single defined value expected");
815 if (vputils::cannotHoistOrSinkRecipe(R: *Current))
816 return false;
817
818 for (VPValue *Op : Current->operands()) {
819 // If we reach FOR, it means the original Previous depends on some other
820 // recurrence that in turn depends on FOR. If that is the case, we would
821 // also need to hoist recipes involving the other FOR, which may break
822 // dependencies.
823 if (Op == FOR)
824 return false;
825
826 if (auto *R = NeedsHoisting(Op)) {
827 // Bail out if the recipe defines multiple values.
828 // TODO: Hoisting such recipes requires additional handling.
829 if (R->getNumDefinedValues() != 1)
830 return false;
831 HoistCandidates.push_back(Elt: R);
832 }
833 }
834 }
835
836 // Order recipes to hoist by dominance so earlier instructions are processed
837 // first.
838 sort(C&: HoistCandidates, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
839 return VPDT.properlyDominates(A, B);
840 });
841
842 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
843 HoistCandidate->moveBefore(BB&: *HoistPoint->getParent(),
844 I: HoistPoint->getIterator());
845 }
846
847 return true;
848}
849
850/// Sink users of fixed-order recurrences past or hoist before the recipe
851/// defining the previous value, introduce FirstOrderRecurrenceSplice
852/// VPInstructions, and replace FOR uses. Returns false if hoisting or sinking
853/// fails.
854static bool tryToSinkOrHoistRecurrenceUsers(VPBasicBlock *HeaderVPBB,
855 const VPDominatorTree &VPDT) {
856 auto FORs =
857 map_to_vector(C: make_filter_range(Range: HeaderVPBB->phis(),
858 Pred: IsaPred<VPFirstOrderRecurrencePHIRecipe>),
859 F: [](VPRecipeBase &R) {
860 return cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R);
861 });
862 for (VPFirstOrderRecurrencePHIRecipe *FOR : FORs) {
863 // Follow through FOR phi chains to find the actual Previous recipe.
864 // Fixed-order recurrences do not contain cycles, so this loop is
865 // guaranteed to terminate.
866 SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
867 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
868 while (auto *PrevPhi =
869 dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Val: Previous)) {
870 assert(PrevPhi->getParent() == FOR->getParent() &&
871 "PrevPhi must be in same block as FOR");
872 assert(SeenPhis.insert(PrevPhi).second &&
873 "PrevPhi must not be visited multiple times");
874 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
875 }
876
877 VPBasicBlock *InsertBlock = FOR->getParent();
878 VPBasicBlock::iterator InsertPt = InsertBlock->getFirstNonPhi();
879 if (Previous) {
880 // Sink FOR users after Previous or hoist Previous before FOR users.
881 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
882 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
883 return false;
884 InsertBlock = Previous->getParent();
885 InsertPt = isa<VPHeaderPHIRecipe>(Val: Previous)
886 ? InsertBlock->getFirstNonPhi()
887 : std::next(x: Previous->getIterator());
888 }
889
890 // Create FirstOrderRecurrenceSplice and replace FOR uses.
891 VPBuilder LoopBuilder(InsertBlock, InsertPt);
892 auto *RecurSplice =
893 LoopBuilder.createNaryOp(Opcode: VPInstruction::FirstOrderRecurrenceSplice,
894 Operands: {FOR, FOR->getBackedgeValue()});
895 FOR->replaceUsesWithIf(New: RecurSplice, ShouldReplace: [RecurSplice](VPUser &U, unsigned) {
896 return &U != RecurSplice;
897 });
898 }
899
900 return true;
901}
902
903bool VPlanTransforms::createHeaderPhiRecipes(
904 VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &OrigLoop,
905 const VPDominatorTree &VPDT,
906 const MapVector<PHINode *, InductionDescriptor> &Inductions,
907 const MapVector<PHINode *, RecurrenceDescriptor> &Reductions,
908 const SmallPtrSetImpl<const PHINode *> &FixedOrderRecurrences,
909 const SmallPtrSetImpl<PHINode *> &InLoopReductions, bool AllowReordering) {
910 // Retrieve the header manually from the intial plain-CFG VPlan.
911 auto [HeaderVPBB, LatchVPBB] = VPBlockUtils::getPlainCFGHeaderAndLatch(Plan);
912 assert(VPDT.dominates(HeaderVPBB, LatchVPBB) &&
913 "header must dominate its latch");
914
915 auto CreateHeaderPhiRecipe = [&](VPPhi *PhiR) -> VPHeaderPHIRecipe * {
916 // TODO: Gradually replace uses of underlying instruction by analyses on
917 // VPlan.
918 auto *Phi = cast<PHINode>(Val: PhiR->getUnderlyingInstr());
919 assert(PhiR->getNumOperands() == 2 &&
920 "Must have 2 operands for header phis");
921
922 // Extract common values once.
923 VPIRValue *Start = cast<VPIRValue>(Val: PhiR->getOperand(N: 0));
924 VPValue *BackedgeValue = PhiR->getOperand(N: 1);
925
926 if (FixedOrderRecurrences.contains(Ptr: Phi)) {
927 // TODO: Currently fixed-order recurrences are modeled as chains of
928 // first-order recurrences. If there are no users of the intermediate
929 // recurrences in the chain, the fixed order recurrence should be
930 // modeled directly, enabling more efficient codegen.
931 return new VPFirstOrderRecurrencePHIRecipe(Phi, *Start, *BackedgeValue);
932 }
933
934 auto InductionIt = Inductions.find(Key: Phi);
935 if (InductionIt != Inductions.end())
936 return createWidenInductionRecipe(Phi, PhiR, Start, IndDesc: InductionIt->second,
937 Plan, PSE, OrigLoop,
938 DL: PhiR->getDebugLoc());
939
940 assert(Reductions.contains(Phi) && "only reductions are expected now");
941 const RecurrenceDescriptor &RdxDesc = Reductions.lookup(Key: Phi);
942 assert(RdxDesc.getRecurrenceStartValue() ==
943 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()) &&
944 "incoming value must match start value");
945 // Will be updated later to >1 if reduction is partial.
946 unsigned ScaleFactor = 1;
947 bool UseOrderedReductions = !AllowReordering && RdxDesc.isOrdered();
948 return new VPReductionPHIRecipe(
949 Phi, RdxDesc.getRecurrenceKind(), *Start, *BackedgeValue,
950 getReductionStyle(InLoop: InLoopReductions.contains(Ptr: Phi), Ordered: UseOrderedReductions,
951 ScaleFactor),
952 Phi->getType()->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
953 : VPIRFlags(),
954 RdxDesc.hasUsesOutsideReductionChain());
955 };
956
957 for (VPRecipeBase &R : make_early_inc_range(Range: HeaderVPBB->phis())) {
958 auto *PhiR = cast<VPPhi>(Val: &R);
959 VPHeaderPHIRecipe *HeaderPhiR = CreateHeaderPhiRecipe(PhiR);
960 HeaderPhiR->insertBefore(InsertPos: PhiR);
961 PhiR->replaceAllUsesWith(New: HeaderPhiR);
962 PhiR->eraseFromParent();
963 }
964
965 if (!tryToSinkOrHoistRecurrenceUsers(HeaderVPBB, VPDT))
966 return false;
967
968 // Skip renaming resume phi recipes, if any header phi has been removed.
969 if (range_size(Range: HeaderVPBB->phis()) !=
970 range_size(Range: Plan.getScalarPreheader()->phis()))
971 return true;
972 for (const auto &[HeaderPhiR, ScalarPhiR] :
973 zip_equal(t: HeaderVPBB->phis(), u: Plan.getScalarPreheader()->phis())) {
974 auto *ResumePhiR = cast<VPPhi>(Val: &ScalarPhiR);
975 if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhiR)) {
976 ResumePhiR->setName("scalar.recur.init");
977 auto *ExtractLastLane = cast<VPInstruction>(Val: ResumePhiR->getOperand(N: 0));
978 ExtractLastLane->setName("vector.recur.extract");
979 continue;
980 }
981 ResumePhiR->setName(isa<VPWidenInductionRecipe>(Val: HeaderPhiR)
982 ? "bc.resume.val"
983 : "bc.merge.rdx");
984 }
985 return true;
986}
987
988bool VPlanTransforms::finalizeSCEVPredicates(VPlan &Plan,
989 PredicatedScalarEvolution &PSE,
990 bool OptForSize,
991 unsigned SCEVCheckThreshold,
992 OptimizationRemarkEmitter *ORE,
993 Loop *TheLoop) {
994 // Collect which wide IVs have predicates and add them to PSE.
995 auto [HeaderVPBB, _] = VPBlockUtils::getPlainCFGHeaderAndLatch(Plan);
996 SmallPtrSet<VPWidenInductionRecipe *, 4> PredicatedIVs;
997 for (auto &R : HeaderVPBB->phis()) {
998 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: &R);
999 if (!WideIV || WideIV->getNoWrapPredicates().empty())
1000 continue;
1001 PredicatedIVs.insert(Ptr: WideIV);
1002 for (const auto *P : WideIV->getNoWrapPredicates())
1003 PSE.addPredicate(Pred: *P);
1004 }
1005
1006 // Bail out if exit phis use predicated IVs via ExitingIVValue, as the
1007 // predicated SCEV may not hold outside the loop (PR33706). Check each IV's
1008 // predicates directly, regardless of whether PSE was already non-trivial
1009 // from other sources (e.g., LAI predicates).
1010 // TODO: Overly conservative; the pre-computed exit values are not correct
1011 // outside the loop, but the exit values could be extracted from the vector
1012 // loop.
1013 if (!PredicatedIVs.empty())
1014 for (auto *EB : Plan.getExitBlocks())
1015 for (VPRecipeBase &R : EB->phis())
1016 for (VPValue *Op : R.operands()) {
1017 VPValue *Inner;
1018 if (!match(V: Op, P: m_ExitingIVValue(Op0: m_VPValue(V&: Inner))))
1019 continue;
1020 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Inner);
1021 if (!WideIV || !PredicatedIVs.contains(Ptr: WideIV))
1022 continue;
1023 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Predicated IV has "
1024 "outside-loop use via ExitingIVValue\n");
1025 return false;
1026 }
1027
1028 unsigned TotalComplexity = PSE.getPredicate().getComplexity();
1029 if (TotalComplexity && OptForSize) {
1030 LLVM_DEBUG(
1031 dbgs() << "LV: Not vectorizing: SCEV predicates needed for induction "
1032 "but optimizing for size\n");
1033 reportVectorizationFailure(
1034 DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
1035 OREMsg: "runtime SCEV checks needed but optimizing for size",
1036 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
1037 return false;
1038 }
1039
1040 if (TotalComplexity > SCEVCheckThreshold) {
1041 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Too many SCEV checks needed ("
1042 << TotalComplexity << " > " << SCEVCheckThreshold
1043 << ")\n");
1044 reportVectorizationFailure(
1045 DebugMsg: "Too many SCEV checks needed",
1046 OREMsg: "Too many SCEV assumptions need to be made and checked at runtime",
1047 ORETag: "TooManySCEVRunTimeChecks", ORE, TheLoop);
1048 return false;
1049 }
1050
1051 return true;
1052}
1053
1054void VPlanTransforms::createInLoopReductionRecipes(VPlan &Plan,
1055 ElementCount MinVF) {
1056 VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1057 SmallVector<VPRecipeBase *> ToDelete;
1058
1059 for (VPRecipeBase &R : Header->phis()) {
1060 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
1061 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
1062 continue;
1063
1064 RecurKind Kind = PhiR->getRecurrenceKind();
1065 assert(!RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) &&
1066 !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
1067 !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
1068 "AnyOf and Find reductions are not allowed for in-loop reductions");
1069
1070 bool IsFPRecurrence =
1071 RecurrenceDescriptor::isFloatingPointRecurrenceKind(Kind);
1072 FastMathFlags FMFs =
1073 IsFPRecurrence ? FastMathFlags::getFast() : FastMathFlags();
1074
1075 // Collect the chain of "link" recipes for the reduction starting at PhiR.
1076 SetVector<VPSingleDefRecipe *> Worklist;
1077 Worklist.insert(X: PhiR);
1078 for (unsigned I = 0; I != Worklist.size(); ++I) {
1079 VPSingleDefRecipe *Cur = Worklist[I];
1080 for (VPUser *U : Cur->users()) {
1081 auto *UserRecipe = cast<VPSingleDefRecipe>(Val: U);
1082 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
1083 assert((UserRecipe->getParent() == Plan.getMiddleBlock() ||
1084 UserRecipe->getParent() == Plan.getScalarPreheader()) &&
1085 "U must be either in the loop region, the middle block or the "
1086 "scalar preheader.");
1087 continue;
1088 }
1089
1090 // Stores using instructions will be sunk later.
1091 if (match(R: UserRecipe, P: m_VPInstruction<Instruction::Store>()))
1092 continue;
1093 Worklist.insert(X: UserRecipe);
1094 }
1095 }
1096
1097 // Visit operation "Links" along the reduction chain top-down starting from
1098 // the phi until LoopExitValue. We keep track of the previous item
1099 // (PreviousLink) to tell which of the two operands of a Link will remain
1100 // scalar and which will be reduced. For minmax by select(cmp), Link will be
1101 // the select instructions. Blend recipes of in-loop reduction phi's will
1102 // get folded to their non-phi operand, as the reduction recipe handles the
1103 // condition directly.
1104 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
1105 for (VPSingleDefRecipe *CurrentLink : drop_begin(RangeOrContainer&: Worklist)) {
1106 if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: CurrentLink)) {
1107 assert(Blend->getNumIncomingValues() == 2 &&
1108 "Blend must have 2 incoming values");
1109 unsigned PhiRIdx = Blend->getIncomingValue(Idx: 0) == PhiR ? 0 : 1;
1110 assert(Blend->getIncomingValue(PhiRIdx) == PhiR &&
1111 "PhiR must be an operand of the blend");
1112 Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 1 - PhiRIdx));
1113 continue;
1114 }
1115
1116 if (IsFPRecurrence) {
1117 FastMathFlags CurFMF =
1118 cast<VPRecipeWithIRFlags>(Val: CurrentLink)->getFastMathFlagsOrNone();
1119 if (match(R: CurrentLink, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())))
1120 CurFMF |= cast<VPRecipeWithIRFlags>(Val: CurrentLink->getOperand(N: 0))
1121 ->getFastMathFlagsOrNone();
1122 FMFs &= CurFMF;
1123 }
1124
1125 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
1126
1127 // Recognize a call to the llvm.fmuladd intrinsic.
1128 bool IsFMulAdd = Kind == RecurKind::FMulAdd;
1129 VPValue *VecOp;
1130 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
1131 if (IsFMulAdd) {
1132 assert(RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
1133 "Expected current VPInstruction to be a call to the "
1134 "llvm.fmuladd intrinsic");
1135 assert(CurrentLink->getOperand(2) == PreviousLink &&
1136 "expected a call where the previous link is the added operand");
1137
1138 // If the instruction is a call to the llvm.fmuladd intrinsic then we
1139 // need to create an fmul recipe (multiplying the first two operands of
1140 // the fmuladd together) to use as the vector operand for the fadd
1141 // reduction.
1142 auto *FMulRecipe = new VPInstruction(
1143 Instruction::FMul,
1144 {CurrentLink->getOperand(N: 0), CurrentLink->getOperand(N: 1)},
1145 CurrentLinkI->getFastMathFlags());
1146 LinkVPBB->insert(Recipe: FMulRecipe, InsertPt: CurrentLink->getIterator());
1147 VecOp = FMulRecipe;
1148 } else if (Kind == RecurKind::AddChainWithSubs &&
1149 match(R: CurrentLink, P: m_Sub(Op0: m_VPValue(), Op1: m_VPValue()))) {
1150 Type *PhiTy = PhiR->getScalarType();
1151 auto *Zero = Plan.getConstantInt(Ty: PhiTy, Val: 0);
1152 VPBuilder Builder(LinkVPBB, CurrentLink->getIterator());
1153 auto *Sub = Builder.createSub(LHS: Zero, RHS: CurrentLink->getOperand(N: 1),
1154 DL: CurrentLinkI->getDebugLoc());
1155 Sub->setUnderlyingValue(CurrentLinkI);
1156 VecOp = Sub;
1157 } else {
1158 // Index of the first operand which holds a non-mask vector operand.
1159 unsigned IndexOfFirstOperand = 0;
1160 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
1161 if (match(R: CurrentLink, P: m_Cmp(Op0: m_VPValue(), Op1: m_VPValue())))
1162 continue;
1163 assert(match(CurrentLink,
1164 m_Select(m_VPValue(), m_VPValue(), m_VPValue())) &&
1165 "must be a select recipe");
1166 IndexOfFirstOperand = 1;
1167 }
1168 // Note that for non-commutable operands (cmp-selects), the semantics of
1169 // the cmp-select are captured in the recurrence kind.
1170 unsigned VecOpId =
1171 CurrentLink->getOperand(N: IndexOfFirstOperand) == PreviousLink
1172 ? IndexOfFirstOperand + 1
1173 : IndexOfFirstOperand;
1174 VecOp = CurrentLink->getOperand(N: VecOpId);
1175 assert(
1176 VecOp != PreviousLink &&
1177 CurrentLink->getOperand(
1178 cast<VPInstruction>(CurrentLink)->getNumOperandsWithoutMask() -
1179 1 - (VecOpId - IndexOfFirstOperand)) == PreviousLink &&
1180 "PreviousLink must be the operand other than VecOp");
1181 }
1182
1183 assert(PhiR->getVFScaleFactor() == 1 &&
1184 "inloop reductions must be unscaled");
1185 VPValue *CondOp = cast<VPInstruction>(Val: CurrentLink)->getMask();
1186 auto *RedRecipe = new VPReductionRecipe(
1187 Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
1188 getReductionStyle(/*IsInLoop=*/InLoop: true, Ordered: PhiR->isOrdered(), ScaleFactor: 1),
1189 CurrentLinkI->getDebugLoc());
1190 // Append the recipe to the end of the VPBasicBlock because we need to
1191 // ensure that it comes after all of it's inputs, including CondOp.
1192 // Delete CurrentLink as it will be invalid if its operand is replaced
1193 // with a reduction defined at the bottom of the block in the next link.
1194 if (LinkVPBB->getNumSuccessors() == 0)
1195 RedRecipe->insertBefore(InsertPos: &*std::prev(x: std::prev(x: LinkVPBB->end())));
1196 else
1197 LinkVPBB->appendRecipe(Recipe: RedRecipe);
1198
1199 CurrentLink->replaceAllUsesWith(New: RedRecipe);
1200 // Move any store recipes using the RedRecipe that appear before it in the
1201 // same block to just after the RedRecipe.
1202 for (VPUser *U : make_early_inc_range(Range: RedRecipe->users())) {
1203 auto *UserR = dyn_cast<VPRecipeBase>(Val: U);
1204 if (!UserR || UserR->getParent() != LinkVPBB)
1205 continue;
1206 if (!match(V: UserR, P: m_VPInstruction<Instruction::Store>()))
1207 continue;
1208 UserR->moveAfter(MovePos: RedRecipe);
1209 }
1210 ToDelete.push_back(Elt: CurrentLink);
1211 PreviousLink = RedRecipe;
1212 }
1213 }
1214
1215 for (VPRecipeBase *R : ToDelete)
1216 R->eraseFromParent();
1217}
1218
1219/// Check if all loads in the loop are dereferenceable. Iterates over the
1220/// loop body blocks reachable from \p HeaderVPBB. Returns false if any
1221/// non-dereferenceable load is found.
1222static bool areAllLoadsDereferenceable(VPBasicBlock *HeaderVPBB, Loop *TheLoop,
1223 PredicatedScalarEvolution &PSE,
1224 DominatorTree &DT, AssumptionCache *AC) {
1225 ScalarEvolution &SE = *PSE.getSE();
1226 const DataLayout &DL = TheLoop->getHeader()->getDataLayout();
1227 for (VPBasicBlock *VPBB : vp_rpo_plain_cfg_loop_body(Header: HeaderVPBB)) {
1228 for (VPRecipeBase &R : *VPBB) {
1229 auto *VPI = dyn_cast<VPInstructionWithType>(Val: &R);
1230 if (!VPI || VPI->getOpcode() != Instruction::Load) {
1231 assert(!R.mayReadFromMemory() && "unexpected recipe reading memory");
1232 continue;
1233 }
1234
1235 // Get the pointer SCEV for dereferenceability checking.
1236 VPValue *Ptr = VPI->getOperand(N: 0);
1237 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L: TheLoop);
1238 if (isa<SCEVCouldNotCompute>(Val: PtrSCEV)) {
1239 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Found non-dereferenceable "
1240 "load with SCEVCouldNotCompute pointer\n");
1241 return false;
1242 }
1243
1244 // Check dereferenceability using the SCEV-based version.
1245 Type *LoadTy = VPI->getScalarType();
1246 const SCEV *SizeSCEV =
1247 SE.getStoreSizeOfExpr(IntTy: DL.getIndexType(PtrTy: PtrSCEV->getType()), StoreTy: LoadTy);
1248 auto *Load = cast<LoadInst>(Val: VPI->getUnderlyingValue());
1249 SmallVector<const SCEVPredicate *> Preds;
1250 if (isDereferenceableAndAlignedInLoop(PtrSCEV, Alignment: Load->getAlign(), EltSizeSCEV: SizeSCEV,
1251 L: TheLoop, SE, DT, AC, Predicates: &Preds))
1252 continue;
1253
1254 LLVM_DEBUG(
1255 dbgs() << "LV: Not vectorizing: Auto-vectorization of loops with "
1256 "potentially faulting load is not supported.\n");
1257 return false;
1258 }
1259 }
1260 return true;
1261}
1262
1263bool VPlanTransforms::handleEarlyExits(VPlan &Plan, UncountableExitStyle Style,
1264 Loop *TheLoop,
1265 PredicatedScalarEvolution &PSE,
1266 DominatorTree &DT, AssumptionCache *AC) {
1267 auto *MiddleVPBB = VPBlockUtils::getPlainCFGMiddleBlock(Plan);
1268 auto [HeaderVPBB, LatchVPBB] = VPBlockUtils::getPlainCFGHeaderAndLatch(Plan);
1269
1270 // TODO: We would like to detect uncountable exits and stores within loops
1271 // with such exits from the VPlan alone. Exit detection can be moved
1272 // here from handleUncountableEarlyExits, but we need to improve
1273 // detection of recipes which may write to memory.
1274 if (Style != UncountableExitStyle::NoUncountableExit) {
1275 // Dereferenceability is checked separately for uncountable exit loops with
1276 // stores, as only the loads contributing to the exit condition need to
1277 // be checked.
1278 if (Style == UncountableExitStyle::ReadOnly &&
1279 !areAllLoadsDereferenceable(HeaderVPBB, TheLoop, PSE, DT, AC))
1280 return false;
1281 // TODO: Check target preference for style.
1282 return handleUncountableEarlyExits(Plan, HeaderVPBB, LatchVPBB, MiddleVPBB,
1283 TheLoop, PSE, DT, AC, Style);
1284 }
1285
1286 // Disconnect countable early exits from the loop, leaving it with a single
1287 // exit from the latch. Countable early exits are left for a scalar epilog.
1288 for (VPIRBasicBlock *EB : Plan.getExitBlocks()) {
1289 for (VPBlockBase *Pred : to_vector(Range&: EB->getPredecessors())) {
1290 if (Pred == MiddleVPBB)
1291 continue;
1292
1293 // Remove phi operands for the early exiting block.
1294 for (VPRecipeBase &R : EB->phis())
1295 cast<VPIRPhi>(Val: &R)->removeIncomingValueFor(IncomingBlock: Pred);
1296 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Val: Pred);
1297 EarlyExitingVPBB->getTerminator()->eraseFromParent();
1298 VPBlockUtils::disconnectBlocks(From: Pred, To: EB);
1299 }
1300 }
1301 return true;
1302}
1303
1304void VPlanTransforms::addMiddleCheck(VPlan &Plan) {
1305 auto *MiddleVPBB = VPBlockUtils::getPlainCFGMiddleBlock(Plan);
1306 // If MiddleVPBB has a single successor then the original loop does not exit
1307 // via the latch and the single successor must be the scalar preheader.
1308 // There's no need to add a runtime check to MiddleVPBB.
1309 if (MiddleVPBB->getNumSuccessors() == 1) {
1310 assert(MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader() &&
1311 "must have ScalarPH as single successor");
1312 return;
1313 }
1314
1315 assert(MiddleVPBB->getNumSuccessors() == 2 && "must have 2 successors");
1316
1317 // Add a check in the middle block to see if we have completed all of the
1318 // iterations in the first vector loop.
1319 //
1320 // Three cases:
1321 // 1) If we require a scalar epilogue, the scalar ph must execute. Set the
1322 // condition to false.
1323 // 2) If (N - N%VF) == N, then we *don't* need to run the
1324 // remainder. Thus if tail is to be folded, we know we don't need to run
1325 // the remainder and we can set the condition to true.
1326 // 3) Otherwise, construct a runtime check.
1327
1328 // We use the same DebugLoc as the scalar loop latch terminator instead of
1329 // the corresponding compare because they may have ended up with different
1330 // line numbers and we want to avoid awkward line stepping while debugging.
1331 // E.g., if the compare has got a line number inside the loop.
1332 auto *LatchVPBB = cast<VPBasicBlock>(Val: MiddleVPBB->getSinglePredecessor());
1333 DebugLoc LatchDL = LatchVPBB->getTerminator()->getDebugLoc();
1334 VPBuilder Builder(MiddleVPBB);
1335 VPValue *Cmp =
1336 Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: Plan.getTripCount(),
1337 B: &Plan.getVectorTripCount(), DL: LatchDL, Name: "cmp.n");
1338 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cmp}, DL: LatchDL);
1339}
1340
1341void VPlanTransforms::createLoopRegions(VPlan &Plan, DebugLoc DL) {
1342 VPDominatorTree VPDT(Plan);
1343 PostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> POT(
1344 Plan.getEntry());
1345 for (VPBlockBase *HeaderVPB : POT)
1346 if (canonicalHeaderAndLatch(HeaderVPB, VPDT))
1347 createLoopRegion(Plan, HeaderVPB, DL);
1348
1349 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
1350 TopRegion->setName("vector loop");
1351 TopRegion->getEntryBasicBlock()->setName("vector.body");
1352}
1353
1354void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
1355 assert(Plan.getExitBlocks().size() == 1 &&
1356 "only a single-exit block is supported currently");
1357 assert(Plan.getExitBlocks().front()->getSinglePredecessor() ==
1358 Plan.getMiddleBlock() &&
1359 "the exit block must have middle block as single predecessor");
1360
1361 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1362 assert(LoopRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
1363 "The vector loop region must have the middle block as its single "
1364 "successor for now");
1365 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
1366
1367 Header->splitAt(SplitAt: Header->getFirstNonPhi());
1368
1369 // Create the header mask, insert it in the header and branch on it.
1370 auto *IV = new VPWidenCanonicalIVRecipe(
1371 LoopRegion->getCanonicalIV(),
1372 VPIRFlags::WrapFlagsTy(LoopRegion->hasCanonicalIVNUW(), false));
1373 VPBuilder Builder(Header, Header->getFirstNonPhi());
1374 Builder.insert(R: IV);
1375 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
1376 VPValue *HeaderMask = Builder.createICmp(Pred: CmpInst::ICMP_ULE, A: IV, B: BTC);
1377 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: HeaderMask);
1378
1379 VPBasicBlock *OrigLatch = LoopRegion->getExitingBasicBlock();
1380 VPValue *IVInc;
1381 [[maybe_unused]] bool TermBranchOnCount =
1382 match(V: OrigLatch->getTerminator(),
1383 P: m_BranchOnCount(Op0: m_VPValue(V&: IVInc),
1384 Op1: m_Specific(VPV: &Plan.getVectorTripCount())));
1385 assert(TermBranchOnCount &&
1386 match(IVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
1387 m_Specific(&Plan.getVFxUF()))) &&
1388 std::next(IVInc->getDefiningRecipe()->getIterator()) ==
1389 OrigLatch->getTerminator()->getIterator() &&
1390 "Unexpected canonical iv increment");
1391
1392 // Split the latch at the IV update, and branch to it from the header mask.
1393 VPBasicBlock *Latch =
1394 OrigLatch->splitAt(SplitAt: IVInc->getDefiningRecipe()->getIterator());
1395 Latch->setName("vector.latch");
1396 VPBlockUtils::connectBlocks(From: Header, To: Latch);
1397
1398 // Collect any values defined in the loop that need a phi. Currently this
1399 // includes header phi backedges and live-outs extracted in the middle block.
1400 // TODO: Handle early exits via Plan.getExitBlocks()
1401 MapVector<VPValue *, SmallVector<VPUser *>> NeedsPhi;
1402 for (VPRecipeBase &R : Header->phis())
1403 if (!isa<VPWidenInductionRecipe>(Val: R))
1404 NeedsPhi[cast<VPHeaderPHIRecipe>(Val&: R).getBackedgeValue()].push_back(Elt: &R);
1405
1406 VPValue *V;
1407 for (VPRecipeBase &R : *Plan.getMiddleBlock())
1408 if (match(V: &R, P: m_ExtractLastPart(Op0: m_VPValue(V))))
1409 NeedsPhi[V].push_back(Elt: &R);
1410
1411 // Insert phis for values coming past the end of the tail.
1412 Builder.setInsertPoint(TheBB: Latch, IP: Latch->begin());
1413 for (const auto &[V, Users] : NeedsPhi) {
1414 if (isa<VPIRValue>(Val: V))
1415 continue;
1416 VPValue *TailVal = Plan.getPoison(Ty: V->getScalarType());
1417 VPIRFlags Flags;
1418 assert(llvm::count_if(Users, IsaPred<VPReductionPHIRecipe>) <= 1 &&
1419 "Value used by more than two reduction phis?");
1420 auto *RedIt = find_if(Range: Users, P: IsaPred<VPReductionPHIRecipe>);
1421 auto *RdxPhi =
1422 RedIt != Users.end() ? cast<VPReductionPHIRecipe>(Val: *RedIt) : nullptr;
1423 if (RdxPhi && !RdxPhi->isInLoop()) {
1424 TailVal = RdxPhi;
1425 Flags = *RdxPhi;
1426 }
1427
1428 VPInstruction *Phi = Builder.createScalarPhi(IncomingValues: {V, TailVal}, DL: {}, Name: "", Flags);
1429 for (VPUser *U : Users)
1430 U->replaceUsesOfWith(From: V, To: Phi);
1431 }
1432
1433 // Any extract of the last element must be updated to extract from the last
1434 // active lane of the header mask instead (i.e., the lane corresponding to the
1435 // last active iteration).
1436 Builder.setInsertPoint(Plan.getMiddleBlock()->getTerminator());
1437 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
1438 VPValue *Op;
1439 if (!match(V: &R, P: m_ExtractLastLaneOfLastPart(Op0: m_VPValue(V&: Op))))
1440 continue;
1441
1442 // Compute the index of the last active lane.
1443 VPValue *LastActiveLane = Builder.createLastActiveLane(Masks: HeaderMask);
1444 auto *Ext =
1445 Builder.createNaryOp(Opcode: VPInstruction::ExtractLane, Operands: {LastActiveLane, Op});
1446 R.getVPSingleValue()->replaceAllUsesWith(New: Ext);
1447 }
1448
1449 // VectorTripCount now equals TripCount so simplify the MiddleVPBB branch.
1450 assert(match(Plan.getMiddleBlock()->getTerminator(),
1451 m_BranchOnCond(m_SpecificICmp(
1452 CmpInst::ICMP_EQ, m_Specific(Plan.getTripCount()),
1453 m_Specific(&Plan.getVectorTripCount())))) &&
1454 "Unexpected MiddleVPBB branch");
1455 Plan.getMiddleBlock()->getTerminator()->setOperand(I: 0, New: Plan.getTrue());
1456}
1457
1458/// Insert \p CheckBlockVPBB on the edge leading to the vector preheader,
1459/// connecting it to both vector and scalar preheaders. Updates scalar
1460/// preheader phis to account for the new predecessor.
1461static void insertCheckBlockBeforeVectorLoop(VPlan &Plan,
1462 VPBasicBlock *CheckBlockVPBB) {
1463 VPBlockBase *VectorPH = Plan.getVectorPreheader();
1464 auto *ScalarPH = cast<VPBasicBlock>(Val: Plan.getScalarPreheader());
1465 VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
1466 VPBlockUtils::insertOnEdge(From: PreVectorPH, To: VectorPH, BlockPtr: CheckBlockVPBB);
1467 VPBlockUtils::connectBlocks(From: CheckBlockVPBB, To: ScalarPH);
1468 CheckBlockVPBB->swapSuccessors();
1469 unsigned NumPreds = ScalarPH->getNumPredecessors();
1470 for (VPRecipeBase &R : ScalarPH->phis()) {
1471 auto *Phi = cast<VPPhi>(Val: &R);
1472 assert(Phi->getNumIncoming() == NumPreds - 1 &&
1473 "must have incoming values for all predecessors");
1474 Phi->addIncoming(IncomingV: Phi->getOperand(N: NumPreds - 2));
1475 }
1476}
1477
1478// Likelyhood of bypassing the vectorized loop due to a runtime check block,
1479// including memory overlap checks block and wrapping/unit-stride checks block.
1480static constexpr uint32_t CheckBypassWeights[] = {1, 127};
1481
1482/// Create a BranchOnCond terminator in \p CheckBlockVPBB. Optionally adds
1483/// branch weights.
1484static void addBypassBranch(VPlan &Plan, VPBasicBlock *CheckBlockVPBB,
1485 VPValue *Cond, bool AddBranchWeights) {
1486 DebugLoc DL = Plan.getVectorLoopRegion()->getCanonicalIV()->getDebugLoc();
1487 auto *Term = VPBuilder(CheckBlockVPBB)
1488 .createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond}, DL);
1489 if (AddBranchWeights) {
1490 MDBuilder MDB(Plan.getContext());
1491 MDNode *BranchWeights =
1492 MDB.createBranchWeights(Weights: CheckBypassWeights, /*IsExpected=*/false);
1493 Term->setMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
1494 }
1495}
1496
1497void VPlanTransforms::attachVPCheckBlock(VPlan &Plan, VPValue *Cond,
1498 VPBasicBlock *CheckBlock,
1499 bool AddBranchWeights) {
1500 insertCheckBlockBeforeVectorLoop(Plan, CheckBlockVPBB: CheckBlock);
1501 addBypassBranch(Plan, CheckBlockVPBB: CheckBlock, Cond, AddBranchWeights);
1502}
1503
1504void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
1505 BasicBlock *CheckBlock,
1506 bool AddBranchWeights) {
1507 VPValue *CondVPV = Plan.getOrAddLiveIn(V: Cond);
1508 VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(IRBB: CheckBlock);
1509 attachVPCheckBlock(Plan, Cond: CondVPV, CheckBlock: CheckBlockVPBB, AddBranchWeights);
1510}
1511
1512void VPlanTransforms::addMinimumIterationCheck(
1513 VPlan &Plan, ElementCount VF, unsigned UF,
1514 ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
1515 bool TailFolded, Loop *OrigLoop, const uint32_t *MinItersBypassWeights,
1516 DebugLoc DL, PredicatedScalarEvolution &PSE, VPBasicBlock *CheckBlock) {
1517 // Generate code to check if the loop's trip count is less than VF * UF, or
1518 // equal to it in case a scalar epilogue is required; this implies that the
1519 // vector trip count is zero. This check also covers the case where adding one
1520 // to the backedge-taken count overflowed leading to an incorrect trip count
1521 // of zero. In this case we will also jump to the scalar loop.
1522 CmpInst::Predicate CmpPred =
1523 RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
1524 // If tail is to be folded, vector loop takes care of all iterations.
1525 VPValue *TripCountVPV = Plan.getTripCount();
1526 const SCEV *TripCount = vputils::getSCEVExprForVPValue(V: TripCountVPV, PSE);
1527 Type *TripCountTy = TripCount->getType();
1528 ScalarEvolution &SE = *PSE.getSE();
1529 auto GetMinTripCount = [&]() -> const SCEV * {
1530 // Compute max(MinProfitableTripCount, UF * VF) and return it.
1531 const SCEV *VFxUF =
1532 SE.getElementCount(Ty: TripCountTy, EC: (VF * UF), Flags: SCEV::FlagNUW);
1533 if (UF * VF.getKnownMinValue() >=
1534 MinProfitableTripCount.getKnownMinValue()) {
1535 // TODO: SCEV should be able to simplify test.
1536 return VFxUF;
1537 }
1538 const SCEV *MinProfitableTripCountSCEV =
1539 SE.getElementCount(Ty: TripCountTy, EC: MinProfitableTripCount, Flags: SCEV::FlagNUW);
1540 return SE.getUMaxExpr(LHS: MinProfitableTripCountSCEV, RHS: VFxUF);
1541 };
1542
1543 VPBuilder Builder(CheckBlock);
1544 VPValue *TripCountCheck = Plan.getFalse();
1545 const SCEV *Step = GetMinTripCount();
1546 // TripCountCheck = false, folding tail implies positive vector trip
1547 // count.
1548 if (!TailFolded) {
1549 // TODO: Emit unconditional branch to vector preheader instead of
1550 // conditional branch with known condition.
1551 TripCount = SE.applyLoopGuards(Expr: TripCount, L: OrigLoop);
1552 // Check if the trip count is < the step.
1553 if (SE.isKnownPredicate(Pred: CmpPred, LHS: TripCount, RHS: Step)) {
1554 // TODO: Ensure step is at most the trip count when determining max VF and
1555 // UF, w/o tail folding.
1556 TripCountCheck = Plan.getTrue();
1557 } else if (!SE.isKnownPredicate(Pred: CmpInst::getInversePredicate(pred: CmpPred),
1558 LHS: TripCount, RHS: Step)) {
1559 // Generate the minimum iteration check only if we cannot prove the
1560 // check is known to be true, or known to be false.
1561 // Try to expand Step into VPInstructions in CheckBlock; otherwise fall
1562 // back to a VPExpandSCEV recipe in the plan's entry block.
1563 VPValue *MinTripCountVPV =
1564 VPSCEVExpander(Builder, *PSE.getSE(), DL).tryToExpand(S: Step);
1565 if (!MinTripCountVPV)
1566 MinTripCountVPV = VPBuilder(Plan.getEntry()).createExpandSCEV(Expr: Step);
1567 TripCountCheck = Builder.createICmp(
1568 Pred: CmpPred, A: TripCountVPV, B: MinTripCountVPV, DL, Name: "min.iters.check");
1569 } // else step known to be < trip count, use TripCountCheck preset to false.
1570 }
1571 VPInstruction *Term =
1572 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {TripCountCheck}, DL);
1573 if (MinItersBypassWeights) {
1574 MDBuilder MDB(Plan.getContext());
1575 MDNode *BranchWeights = MDB.createBranchWeights(
1576 Weights: ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false);
1577 Term->setMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
1578 }
1579}
1580
1581void VPlanTransforms::addIterationCountCheckBlock(
1582 VPlan &Plan, ElementCount VF, unsigned UF, bool RequiresScalarEpilogue,
1583 Loop *OrigLoop, const uint32_t *MinItersBypassWeights, DebugLoc DL,
1584 PredicatedScalarEvolution &PSE) {
1585 auto *CheckBlock = Plan.createVPBasicBlock(Name: "vector.main.loop.iter.check");
1586 insertCheckBlockBeforeVectorLoop(Plan, CheckBlockVPBB: CheckBlock);
1587 addMinimumIterationCheck(Plan, VF, UF, MinProfitableTripCount: ElementCount::getFixed(MinVal: 0),
1588 RequiresScalarEpilogue, /*TailFolded=*/false,
1589 OrigLoop, MinItersBypassWeights, DL, PSE,
1590 CheckBlock);
1591}
1592
1593void VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
1594 VPlan &Plan, Value *VectorTripCount, bool RequiresScalarEpilogue,
1595 ElementCount EpilogueVF, unsigned EpilogueUF, unsigned MainLoopStep,
1596 unsigned EpilogueLoopStep, ScalarEvolution &SE) {
1597 // Add the minimum iteration check for the epilogue vector loop.
1598 VPValue *TC = Plan.getTripCount();
1599 Value *TripCount = TC->getLiveInIRValue();
1600 VPBuilder Builder(cast<VPBasicBlock>(Val: Plan.getEntry()));
1601 VPValue *VFxUF = Builder.createExpandSCEV(Expr: SE.getElementCount(
1602 Ty: TripCount->getType(), EC: (EpilogueVF * EpilogueUF), Flags: SCEV::FlagNUW));
1603 VPValue *Count = Builder.createSub(LHS: TC, RHS: Plan.getOrAddLiveIn(V: VectorTripCount),
1604 DL: DebugLoc::getUnknown(), Name: "n.vec.remaining");
1605
1606 // Generate code to check if the loop's trip count is less than VF * UF of
1607 // the vector epilogue loop.
1608 auto P = RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
1609 auto *CheckMinIters = Builder.createICmp(
1610 Pred: P, A: Count, B: VFxUF, DL: DebugLoc::getUnknown(), Name: "min.epilog.iters.check");
1611 VPInstruction *Branch =
1612 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: CheckMinIters);
1613
1614 // We assume the remaining `Count` is equally distributed in
1615 // [0, MainLoopStep)
1616 // So the probability for `Count < EpilogueLoopStep` should be
1617 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
1618 // TODO: Improve the estimate by taking the estimated trip count into
1619 // consideration.
1620 unsigned EstimatedSkipCount = std::min(a: MainLoopStep, b: EpilogueLoopStep);
1621 const uint32_t Weights[] = {EstimatedSkipCount,
1622 MainLoopStep - EstimatedSkipCount};
1623 MDBuilder MDB(Plan.getContext());
1624 MDNode *BranchWeights =
1625 MDB.createBranchWeights(Weights, /*IsExpected=*/false);
1626 Branch->setMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
1627}
1628
1629/// Find and return the final select instruction of the FindIV result pattern
1630/// for the given \p BackedgeVal:
1631/// select(icmp ne ComputeReductionResult(ReducedIV), Sentinel),
1632/// ComputeReductionResult(ReducedIV), Start.
1633static VPInstruction *findFindIVSelect(VPValue *BackedgeVal) {
1634 return cast<VPInstruction>(
1635 Val: vputils::findRecipe(Start: BackedgeVal, Pred: [BackedgeVal](VPRecipeBase *R) {
1636 auto *VPI = dyn_cast<VPInstruction>(Val: R);
1637 return VPI &&
1638 matchFindIVResult(VPI, ReducedIV: m_Specific(VPV: BackedgeVal), Start: m_VPValue());
1639 }));
1640}
1641
1642bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
1643 auto GetMinOrMaxCompareValue =
1644 [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
1645 auto *MinOrMaxR =
1646 dyn_cast_or_null<VPRecipeWithIRFlags>(Val: RedPhiR->getBackedgeValue());
1647 if (!MinOrMaxR)
1648 return nullptr;
1649
1650 // Check that MinOrMaxR is a VPWidenIntrinsicRecipe or VPReplicateRecipe
1651 // with an intrinsic that matches the reduction kind.
1652 Intrinsic::ID ExpectedIntrinsicID =
1653 getMinMaxReductionIntrinsicOp(RK: RedPhiR->getRecurrenceKind());
1654 if (!match(V: MinOrMaxR, P: m_Intrinsic(IntrID: ExpectedIntrinsicID)))
1655 return nullptr;
1656
1657 if (MinOrMaxR->getOperand(N: 0) == RedPhiR)
1658 return MinOrMaxR->getOperand(N: 1);
1659
1660 assert(MinOrMaxR->getOperand(1) == RedPhiR &&
1661 "Reduction phi operand expected");
1662 return MinOrMaxR->getOperand(N: 0);
1663 };
1664
1665 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1666 SmallVector<std::pair<VPReductionPHIRecipe *, VPValue *>>
1667 MinOrMaxNumReductionsToHandle;
1668 bool HasUnsupportedPhi = false;
1669 for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
1670 if (isa<VPWidenIntOrFpInductionRecipe>(Val: &R))
1671 continue;
1672 auto *Cur = dyn_cast<VPReductionPHIRecipe>(Val: &R);
1673 if (!Cur) {
1674 // TODO: Also support fixed-order recurrence phis.
1675 HasUnsupportedPhi = true;
1676 continue;
1677 }
1678 if (!RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
1679 Kind: Cur->getRecurrenceKind())) {
1680 HasUnsupportedPhi = true;
1681 continue;
1682 }
1683
1684 VPValue *MinOrMaxOp = GetMinOrMaxCompareValue(Cur);
1685 if (!MinOrMaxOp)
1686 return false;
1687
1688 MinOrMaxNumReductionsToHandle.emplace_back(Args&: Cur, Args&: MinOrMaxOp);
1689 }
1690
1691 if (MinOrMaxNumReductionsToHandle.empty())
1692 return true;
1693
1694 // We won't be able to resume execution in the scalar tail, if there are
1695 // unsupported header phis or there is no scalar tail at all, due to
1696 // tail-folding.
1697 if (HasUnsupportedPhi || !Plan.hasScalarTail())
1698 return false;
1699
1700 /// Check if the vector loop of \p Plan can early exit and restart
1701 /// execution of last vector iteration in the scalar loop. This requires all
1702 /// recipes up to early exit point be side-effect free as they are
1703 /// re-executed. Currently we check that the loop is free of any recipe that
1704 /// may write to memory. Expected to operate on an early VPlan w/o nested
1705 /// regions.
1706 for (VPBlockBase *VPB : vp_depth_first_shallow(
1707 G: Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
1708 auto *VPBB = cast<VPBasicBlock>(Val: VPB);
1709 for (auto &R : *VPBB) {
1710 if (R.mayWriteToMemory() && !match(V: &R, P: m_BranchOnCount()))
1711 return false;
1712 }
1713 }
1714
1715 VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
1716 VPBuilder LatchBuilder(LatchVPBB->getTerminator());
1717 VPValue *AllNaNLanes = nullptr;
1718 SmallPtrSet<VPValue *, 2> RdxResults;
1719 for (const auto &[_, MinOrMaxOp] : MinOrMaxNumReductionsToHandle) {
1720 VPValue *RedNaNLanes =
1721 LatchBuilder.createFCmp(Pred: CmpInst::FCMP_UNO, A: MinOrMaxOp, B: MinOrMaxOp);
1722 AllNaNLanes = AllNaNLanes ? LatchBuilder.createOr(LHS: AllNaNLanes, RHS: RedNaNLanes)
1723 : RedNaNLanes;
1724 }
1725
1726 VPValue *AnyNaNLane =
1727 LatchBuilder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {AllNaNLanes});
1728 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1729 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->begin());
1730 for (const auto &[RedPhiR, _] : MinOrMaxNumReductionsToHandle) {
1731 assert(RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
1732 RedPhiR->getRecurrenceKind()) &&
1733 "unsupported reduction");
1734
1735 // If we exit early due to NaNs, compute the final reduction result based on
1736 // the reduction phi at the beginning of the last vector iteration.
1737 auto *RdxResult = vputils::findComputeReductionResult(PhiR: RedPhiR);
1738 assert(RdxResult && "must find a ComputeReductionResult");
1739
1740 auto *NewSel = MiddleBuilder.createSelect(Cond: AnyNaNLane, TrueVal: RedPhiR,
1741 FalseVal: RdxResult->getOperand(N: 0));
1742 RdxResult->setOperand(I: 0, New: NewSel);
1743 assert(!RdxResults.contains(RdxResult) && "RdxResult already used");
1744 RdxResults.insert(Ptr: RdxResult);
1745 }
1746
1747 auto *LatchExitingBranch = LatchVPBB->getTerminator();
1748 assert(match(LatchExitingBranch, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
1749 "Unexpected terminator");
1750 auto *IsLatchExitTaken = LatchBuilder.createICmp(
1751 Pred: CmpInst::ICMP_EQ, A: LatchExitingBranch->getOperand(N: 0),
1752 B: LatchExitingBranch->getOperand(N: 1));
1753 auto *AnyExitTaken = LatchBuilder.createOr(LHS: AnyNaNLane, RHS: IsLatchExitTaken);
1754 LatchBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: AnyExitTaken);
1755 LatchExitingBranch->eraseFromParent();
1756
1757 // Update resume phis for inductions in the scalar preheader. If AnyNaNLane is
1758 // true, the resume from the start of the last vector iteration via the
1759 // canonical IV, otherwise from the original value.
1760 auto IsTC = [&Plan](VPValue *V) {
1761 return V == &Plan.getVectorTripCount() || V == Plan.getTripCount();
1762 };
1763 for (auto &R : Plan.getScalarPreheader()->phis()) {
1764 auto *ResumeR = cast<VPPhi>(Val: &R);
1765 VPValue *VecV = ResumeR->getOperand(N: 0);
1766 if (RdxResults.contains(Ptr: VecV))
1767 continue;
1768 if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(Val: VecV)) {
1769 VPValue *DIVTC = DerivedIV->getOperand(N: 1);
1770 if (DerivedIV->hasOneUse() && IsTC(DIVTC)) {
1771 auto *NewSel = MiddleBuilder.createSelect(
1772 Cond: AnyNaNLane, TrueVal: LoopRegion->getCanonicalIV(), FalseVal: DIVTC);
1773 DerivedIV->moveAfter(MovePos: &*MiddleBuilder.getInsertPoint());
1774 DerivedIV->setOperand(I: 1, New: NewSel);
1775 continue;
1776 }
1777 }
1778 // Bail out and abandon the current, partially modified, VPlan if we
1779 // encounter resume phi that cannot be updated yet.
1780 if (!IsTC(VecV)) {
1781 LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with "
1782 "FMaxNum/FMinNum reduction.\n");
1783 return false;
1784 }
1785 auto *NewSel = MiddleBuilder.createSelect(
1786 Cond: AnyNaNLane, TrueVal: LoopRegion->getCanonicalIV(), FalseVal: VecV);
1787 ResumeR->setOperand(I: 0, New: NewSel);
1788 }
1789
1790 auto *MiddleTerm = MiddleVPBB->getTerminator();
1791 MiddleBuilder.setInsertPoint(MiddleTerm);
1792 VPValue *MiddleCond = MiddleTerm->getOperand(N: 0);
1793 VPValue *NewCond =
1794 MiddleBuilder.createAnd(LHS: MiddleCond, RHS: MiddleBuilder.createNot(Operand: AnyNaNLane));
1795 MiddleTerm->setOperand(I: 0, New: NewCond);
1796 return true;
1797}
1798
1799bool VPlanTransforms::handleFindLastReductions(VPlan &Plan) {
1800 if (Plan.hasScalarVFOnly())
1801 return false;
1802
1803 // We want to create the following nodes:
1804 // vector.body:
1805 // ...new WidenPHI recipe introduced to keep the mask value for the latest
1806 // iteration where any lane was active.
1807 // mask.phi = phi [ ir<false>, vector.ph ], [ vp<new.mask>, vector.body ]
1808 // ...data.phi (a VPReductionPHIRecipe for a FindLast reduction) already
1809 // exists, but needs updating to use 'new.data' for the backedge value.
1810 // data.phi = phi ir<default.val>, vp<new.data>
1811 //
1812 // ...'data' and 'compare' created by existing nodes...
1813 //
1814 // ...new recipes introduced to determine whether to update the reduction
1815 // values or keep the current one.
1816 // any.active = i1 any-of ir<compare>
1817 // new.mask = select vp<any.active>, ir<compare>, vp<mask.phi>
1818 // new.data = select vp<any.active>, ir<data>, ir<data.phi>
1819 //
1820 // middle.block:
1821 // ...extract-last-active replaces compute-reduction-result.
1822 // result = extract-last-active vp<new.data>, vp<new.mask>, ir<default.val>
1823
1824 SmallVector<VPReductionPHIRecipe *, 4> Phis;
1825 for (VPRecipeBase &Phi :
1826 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
1827 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &Phi);
1828 if (PhiR && RecurrenceDescriptor::isFindLastRecurrenceKind(
1829 Kind: PhiR->getRecurrenceKind()))
1830 Phis.push_back(Elt: PhiR);
1831 }
1832
1833 if (Phis.empty())
1834 return true;
1835
1836 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1837 for (VPReductionPHIRecipe *PhiR : Phis) {
1838 // Find the condition for the select/blend.
1839 VPValue *BackedgeSelect = PhiR->getBackedgeValue();
1840 VPValue *CondSelect = BackedgeSelect;
1841
1842 // If there's a header mask, the backedge select will not be the find-last
1843 // select.
1844 if (HeaderMask &&
1845 !match(V: BackedgeSelect,
1846 P: m_SelectLike(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: CondSelect),
1847 Op2: m_Specific(VPV: PhiR))))
1848 return false;
1849
1850 VPValue *Cond = nullptr, *Op1 = nullptr, *Op2 = nullptr;
1851
1852 // If we're matching a blend rather than a select, there should be one
1853 // incoming value which is the data, then all other incoming values should
1854 // be the phi.
1855 auto MatchBlend = [&](VPRecipeBase *R) {
1856 auto *Blend = dyn_cast<VPBlendRecipe>(Val: R);
1857 if (!Blend)
1858 return false;
1859 assert(!Blend->isNormalized() && "must run before blend normalizaion");
1860 unsigned NumIncomingDataValues = 0;
1861 for (unsigned I = 0; I < Blend->getNumIncomingValues(); ++I) {
1862 VPValue *Incoming = Blend->getIncomingValue(Idx: I);
1863 if (Incoming != PhiR) {
1864 ++NumIncomingDataValues;
1865 Cond = Blend->getMask(Idx: I);
1866 Op1 = Incoming;
1867 Op2 = PhiR;
1868 }
1869 }
1870 return NumIncomingDataValues == 1;
1871 };
1872
1873 VPSingleDefRecipe *SelectR =
1874 cast<VPSingleDefRecipe>(Val: CondSelect->getDefiningRecipe());
1875 if (!match(R: SelectR,
1876 P: m_Select(Op0: m_VPValue(V&: Cond), Op1: m_VPValue(V&: Op1), Op2: m_VPValue(V&: Op2))) &&
1877 !MatchBlend(SelectR))
1878 return false;
1879
1880 assert(Cond != HeaderMask && "Cond must not be HeaderMask");
1881
1882 // Find final reduction computation and replace it with an
1883 // extract.last.active intrinsic.
1884 auto *RdxResult =
1885 findUserOf<VPInstruction::ComputeReductionResult>(V: BackedgeSelect);
1886 assert(RdxResult && "Could not find reduction result");
1887
1888 // Add mask phi.
1889 VPBuilder Builder = VPBuilder::getToInsertAfter(R: PhiR);
1890 auto *MaskPHI = Builder.createWidenPhi(IncomingValues: Plan.getFalse());
1891
1892 // Add select for mask.
1893 Builder.setInsertPoint(SelectR);
1894
1895 if (Op1 == PhiR) {
1896 // Normalize to selecting the data operand when the condition is true by
1897 // swapping operands and negating the condition.
1898 std::swap(a&: Op1, b&: Op2);
1899 Cond = Builder.createNot(Operand: Cond);
1900 }
1901 assert(Op2 == PhiR && "data value must be selected if Cond is true");
1902
1903 if (HeaderMask)
1904 Cond = Builder.createLogicalAnd(LHS: HeaderMask, RHS: Cond);
1905
1906 VPValue *AnyOf = Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {Cond});
1907 VPValue *MaskSelect = Builder.createSelect(Cond: AnyOf, TrueVal: Cond, FalseVal: MaskPHI);
1908 MaskPHI->addIncoming(IncomingV: MaskSelect);
1909
1910 // Replace select for data.
1911 VPValue *DataSelect =
1912 Builder.createSelect(Cond: AnyOf, TrueVal: Op1, FalseVal: Op2, DL: SelectR->getDebugLoc());
1913 SelectR->replaceAllUsesWith(New: DataSelect);
1914 PhiR->setBackedgeValue(DataSelect);
1915 SelectR->eraseFromParent();
1916
1917 Builder.setInsertPoint(RdxResult);
1918 auto *ExtractLastActive =
1919 Builder.createNaryOp(Opcode: VPInstruction::ExtractLastActive,
1920 Operands: {PhiR->getStartValue(), DataSelect, MaskSelect},
1921 DL: RdxResult->getDebugLoc());
1922 RdxResult->replaceAllUsesWith(New: ExtractLastActive);
1923 RdxResult->eraseFromParent();
1924 }
1925
1926 return true;
1927}
1928
1929/// Given a first argmin/argmax pattern with strict predicate consisting of
1930/// 1) a MinOrMax reduction \p MinOrMaxPhiR producing \p MinOrMaxResult,
1931/// 2) a wide induction \p WideIV,
1932/// 3) a FindLastIV reduction \p FindLastIVPhiR using \p WideIV,
1933/// return the smallest index of the FindLastIV reduction result using UMin,
1934/// unless \p MinOrMaxResult equals the start value of its MinOrMax reduction.
1935/// In that case, return the start value of the FindLastIV reduction instead.
1936/// If \p WideIV is not canonical, a new canonical wide IV is added, and the
1937/// final result is scaled back to the non-canonical \p WideIV.
1938/// The final value of the FindLastIV reduction is originally computed using
1939/// \p FindIVSelect, \p FindIVCmp, and \p FindIVRdxResult, which are replaced
1940/// and removed.
1941/// Returns true if the pattern was handled successfully, false otherwise.
1942static bool handleFirstArgMinOrMax(
1943 VPlan &Plan, VPReductionPHIRecipe *MinOrMaxPhiR,
1944 VPReductionPHIRecipe *FindLastIVPhiR, VPWidenIntOrFpInductionRecipe *WideIV,
1945 VPInstruction *MinOrMaxResult, VPInstruction *FindIVSelect,
1946 VPRecipeBase *FindIVCmp, VPInstruction *FindIVRdxResult) {
1947 assert(!FindLastIVPhiR->isInLoop() && !FindLastIVPhiR->isOrdered() &&
1948 "inloop and ordered reductions not supported");
1949 assert(FindLastIVPhiR->getVFScaleFactor() == 1 &&
1950 "FindIV reduction must not be scaled");
1951
1952 Type *Ty = Plan.getVectorLoopRegion()->getCanonicalIVType();
1953 // TODO: Support non (i.e., narrower than) canonical IV types.
1954 // TODO: Emit remarks for failed transformations.
1955 if (Ty != WideIV->getScalarType())
1956 return false;
1957
1958 auto *FindIVSelectR = cast<VPSingleDefRecipe>(
1959 Val: FindLastIVPhiR->getBackedgeValue()->getDefiningRecipe());
1960 assert(
1961 match(FindIVSelectR, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) &&
1962 "backedge value must be a select");
1963 if (FindIVSelectR->getOperand(N: 1) != WideIV &&
1964 FindIVSelectR->getOperand(N: 2) != WideIV)
1965 return false;
1966
1967 // If the original wide IV is not canonical, create a new one. The canonical
1968 // wide IV is guaranteed to not wrap for all lanes that are active in the
1969 // vector loop.
1970 if (!WideIV->isCanonical()) {
1971 VPIRValue *Zero = Plan.getConstantInt(Ty, Val: 0);
1972 VPIRValue *One = Plan.getConstantInt(Ty, Val: 1);
1973 auto *WidenCanIV = new VPWidenIntOrFpInductionRecipe(
1974 nullptr, Zero, One, WideIV->getVFValue(),
1975 WideIV->getInductionDescriptor(),
1976 VPIRFlags::WrapFlagsTy(/*HasNUW=*/true, /*HasNSW=*/false),
1977 WideIV->getDebugLoc());
1978 WidenCanIV->insertBefore(InsertPos: WideIV);
1979
1980 // Update the select to use the wide canonical IV.
1981 FindIVSelectR->setOperand(I: FindIVSelectR->getOperand(N: 1) == WideIV ? 1 : 2,
1982 New: WidenCanIV);
1983 }
1984 FindLastIVPhiR->setOperand(I: 0, New: Plan.getPoison(Ty));
1985
1986 // The reduction using MinOrMaxPhiR needs adjusting to compute the correct
1987 // result:
1988 // 1. Find the first canonical indices corresponding to partial min/max
1989 // values, using loop reductions.
1990 // 2. Find which of the partial min/max values are equal to the overall
1991 // min/max value.
1992 // 3. Select among the canonical indices those corresponding to the overall
1993 // min/max value.
1994 // 4. Find the first canonical index of overall min/max and scale it back to
1995 // the original IV using VPDerivedIVRecipe.
1996 // 5. If the overall min/max equals the starting min/max, the condition in
1997 // the loop was always false, due to being strict; return the start value
1998 // of FindLastIVPhiR in that case.
1999 //
2000 // For example, we transforms two independent reduction result computations
2001 // for
2002 //
2003 // <x1> vector loop: {
2004 // vector.body:
2005 // ...
2006 // ir<%iv> = WIDEN-INDUCTION nuw nsw ir<10>, ir<1>, vp<%0>
2007 // WIDEN-REDUCTION-PHI ir<%min.idx> = phi ir<sentinel.min.start>,
2008 // ir<%min.idx.next>
2009 // WIDEN-REDUCTION-PHI ir<%min.val> = phi ir<100>, ir<%min.val.next>
2010 // ....
2011 // WIDEN-INTRINSIC ir<%min.val.next> = call llvm.umin(ir<%min.val>, ir<%l>)
2012 // WIDEN ir<%min.idx.next> = select ir<%cmp>, ir<%iv>, ir<%min.idx>
2013 // ...
2014 // }
2015 // Successor(s): middle.block
2016 //
2017 // middle.block:
2018 // vp<%iv.rdx> = compute-reduction-result (smax) vp<%min.idx.next>
2019 // vp<%min.result> = compute-reduction-result (umin) ir<%min.val.next>
2020 // vp<%cmp> = icmp ne vp<%iv.rdx>, ir<sentinel.min.start>
2021 // vp<%find.iv.result> = select vp<%cmp>, vp<%iv.rdx>, ir<10>
2022 //
2023 //
2024 // Into:
2025 //
2026 // vp<%reduced.min> = compute-reduction-result (umin) ir<%min.val.next>
2027 // vp<%reduced.mins.mask> = icmp eq ir<%min.val.next>, vp<%reduced.min>
2028 // vp<%idxs2reduce> = select vp<%reduced.mins.mask>, ir<%min.idx.next>,
2029 // ir<MaxUInt>
2030 // vp<%reduced.idx> = compute-reduction-result (umin) vp<%idxs2reduce>
2031 // vp<%scaled.idx> = DERIVED-IV ir<20> + vp<%reduced.idx> * ir<1>
2032 // vp<%always.false> = icmp eq vp<%reduced.min>, ir<100>
2033 // vp<%final.idx> = select vp<%always.false>, ir<10>,
2034 // vp<%scaled.idx>
2035
2036 VPBuilder Builder(FindIVRdxResult);
2037 VPValue *MinOrMaxExiting = MinOrMaxResult->getOperand(N: 0);
2038 auto *FinalMinOrMaxCmp =
2039 Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: MinOrMaxExiting, B: MinOrMaxResult);
2040 VPValue *LastIVExiting = FindIVRdxResult->getOperand(N: 0);
2041 VPValue *MaxIV =
2042 Plan.getConstantInt(Val: APInt::getMaxValue(numBits: Ty->getIntegerBitWidth()));
2043 auto *FinalIVSelect =
2044 Builder.createSelect(Cond: FinalMinOrMaxCmp, TrueVal: LastIVExiting, FalseVal: MaxIV);
2045 VPIRFlags RdxFlags(RecurKind::UMin, false, false, FastMathFlags());
2046 VPSingleDefRecipe *FinalCanIV = Builder.createNaryOp(
2047 Opcode: VPInstruction::ComputeReductionResult, Operands: {FinalIVSelect}, Flags: RdxFlags,
2048 DL: FindIVRdxResult->getDebugLoc());
2049
2050 // If we used a new wide canonical IV convert the reduction result back to the
2051 // original IV scale before the final select.
2052 if (!WideIV->isCanonical()) {
2053 auto *DerivedIVRecipe = new VPDerivedIVRecipe(
2054 InductionDescriptor::IK_IntInduction,
2055 nullptr, // No FPBinOp for integer induction
2056 WideIV->getStartValue(), FinalCanIV, WideIV->getStepValue());
2057 DerivedIVRecipe->insertBefore(InsertPos: &*Builder.getInsertPoint());
2058 FinalCanIV = DerivedIVRecipe;
2059 }
2060
2061 // If the final min/max value matches its start value, the condition in the
2062 // loop was always false, i.e. no induction value has been selected. If that's
2063 // the case, set the result of the IV reduction to its start value.
2064 VPValue *AlwaysFalse = Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: MinOrMaxResult,
2065 B: MinOrMaxPhiR->getStartValue());
2066 VPValue *FinalIV = Builder.createSelect(
2067 Cond: AlwaysFalse, TrueVal: FindIVSelect->getOperand(N: 2), FalseVal: FinalCanIV);
2068 FindIVSelect->replaceAllUsesWith(New: FinalIV);
2069
2070 // Erase the old FindIV result pattern which is now dead.
2071 FindIVSelect->eraseFromParent();
2072 FindIVCmp->eraseFromParent();
2073 FindIVRdxResult->eraseFromParent();
2074 return true;
2075}
2076
2077bool VPlanTransforms::handleMultiUseReductions(VPlan &Plan,
2078 OptimizationRemarkEmitter *ORE,
2079 Loop *TheLoop) {
2080 for (auto &PhiR : make_early_inc_range(
2081 Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis())) {
2082 auto *MinOrMaxPhiR = dyn_cast<VPReductionPHIRecipe>(Val: &PhiR);
2083 // TODO: check for multi-uses in VPlan directly.
2084 if (!MinOrMaxPhiR || !MinOrMaxPhiR->hasUsesOutsideReductionChain())
2085 continue;
2086
2087 // MinOrMaxPhiR has users outside the reduction cycle in the loop. Check if
2088 // the only other user is a FindLastIV reduction. MinOrMaxPhiR must have
2089 // exactly 2 users:
2090 // 1) the min/max operation of the reduction cycle, and
2091 // 2) the compare of a FindLastIV reduction cycle. This compare must match
2092 // the min/max operation - comparing MinOrMaxPhiR with the operand of the
2093 // min/max operation, and be used only by the select of the FindLastIV
2094 // reduction cycle.
2095 RecurKind RdxKind = MinOrMaxPhiR->getRecurrenceKind();
2096 assert(
2097 RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind) &&
2098 "only min/max recurrences support users outside the reduction chain");
2099
2100 auto *MinOrMaxOp =
2101 dyn_cast<VPRecipeWithIRFlags>(Val: MinOrMaxPhiR->getBackedgeValue());
2102 if (!MinOrMaxOp)
2103 return false;
2104
2105 // Check that MinOrMaxOp is a VPWidenIntrinsicRecipe or VPReplicateRecipe
2106 // with an intrinsic that matches the reduction kind.
2107 Intrinsic::ID ExpectedIntrinsicID = getMinMaxReductionIntrinsicOp(RK: RdxKind);
2108 if (!match(V: MinOrMaxOp, P: m_Intrinsic(IntrID: ExpectedIntrinsicID)))
2109 return false;
2110
2111 // MinOrMaxOp must have 2 users: 1) MinOrMaxPhiR and 2)
2112 // ComputeReductionResult.
2113 assert(MinOrMaxOp->getNumUsers() == 2 &&
2114 "MinOrMaxOp must have exactly 2 users");
2115 VPValue *MinOrMaxOpValue = MinOrMaxOp->getOperand(N: 0);
2116 if (MinOrMaxOpValue == MinOrMaxPhiR)
2117 MinOrMaxOpValue = MinOrMaxOp->getOperand(N: 1);
2118
2119 VPValue *CmpOpA;
2120 VPValue *CmpOpB;
2121 CmpPredicate Pred;
2122 auto *Cmp = dyn_cast_or_null<VPRecipeWithIRFlags>(Val: findUserOf(
2123 V: MinOrMaxPhiR, P: m_Cmp(Pred, Op0: m_VPValue(V&: CmpOpA), Op1: m_VPValue(V&: CmpOpB))));
2124 if (!Cmp || Cmp->getNumUsers() != 1 ||
2125 (CmpOpA != MinOrMaxOpValue && CmpOpB != MinOrMaxOpValue))
2126 return false;
2127
2128 if (MinOrMaxOpValue != CmpOpB)
2129 Pred = CmpInst::getSwappedPredicate(pred: Pred);
2130
2131 // MinOrMaxPhiR must have exactly 2 users:
2132 // * MinOrMaxOp,
2133 // * Cmp (that's part of a FindLastIV chain).
2134 if (MinOrMaxPhiR->getNumUsers() != 2)
2135 return false;
2136
2137 VPInstruction *MinOrMaxResult =
2138 findUserOf<VPInstruction::ComputeReductionResult>(V: MinOrMaxOp);
2139 assert(is_contained(MinOrMaxPhiR->users(), MinOrMaxOp) &&
2140 "one user must be MinOrMaxOp");
2141 assert(MinOrMaxResult && "MinOrMaxResult must be a user of MinOrMaxOp");
2142
2143 // Cmp must be used by the select of a FindLastIV chain.
2144 VPValue *Sel = dyn_cast<VPSingleDefRecipe>(Val: Cmp->getSingleUser());
2145 VPValue *IVOp, *FindIV;
2146 if (!Sel || Sel->getNumUsers() != 2 ||
2147 !match(V: Sel,
2148 P: m_Select(Op0: m_Specific(VPV: Cmp), Op1: m_VPValue(V&: IVOp), Op2: m_VPValue(V&: FindIV))))
2149 return false;
2150
2151 if (!isa<VPReductionPHIRecipe>(Val: FindIV)) {
2152 std::swap(a&: FindIV, b&: IVOp);
2153 Pred = CmpInst::getInversePredicate(pred: Pred);
2154 }
2155
2156 auto *FindIVPhiR = dyn_cast<VPReductionPHIRecipe>(Val: FindIV);
2157 if (!FindIVPhiR || !RecurrenceDescriptor::isFindIVRecurrenceKind(
2158 Kind: FindIVPhiR->getRecurrenceKind()))
2159 return false;
2160
2161 assert(!FindIVPhiR->isInLoop() && !FindIVPhiR->isOrdered() &&
2162 "cannot handle inloop/ordered reductions yet");
2163
2164 // Check if FindIVPhiR is a FindLast pattern by checking the MinMaxKind
2165 // on its ComputeReductionResult. SMax/UMax indicates FindLast.
2166 VPInstruction *FindIVResult =
2167 findUserOf<VPInstruction::ComputeReductionResult>(
2168 V: FindIVPhiR->getBackedgeValue());
2169 assert(FindIVResult &&
2170 "must be able to retrieve the FindIVResult VPInstruction");
2171 RecurKind FindIVMinMaxKind = FindIVResult->getRecurKind();
2172 if (FindIVMinMaxKind != RecurKind::SMax &&
2173 FindIVMinMaxKind != RecurKind::UMax)
2174 return false;
2175
2176 // TODO: Support cases where IVOp is the IV increment.
2177 if (!match(V: IVOp, P: m_TruncOrSelf(Op0: m_VPValue(V&: IVOp))) ||
2178 !isa<VPWidenIntOrFpInductionRecipe>(Val: IVOp))
2179 return false;
2180
2181 // Check if the predicate is compatible with the reduction kind.
2182 bool IsValidKindPred = [RdxKind, Pred]() {
2183 switch (RdxKind) {
2184 case RecurKind::UMin:
2185 return Pred == CmpInst::ICMP_UGE || Pred == CmpInst::ICMP_UGT;
2186 case RecurKind::UMax:
2187 return Pred == CmpInst::ICMP_ULE || Pred == CmpInst::ICMP_ULT;
2188 case RecurKind::SMax:
2189 return Pred == CmpInst::ICMP_SLE || Pred == CmpInst::ICMP_SLT;
2190 case RecurKind::SMin:
2191 return Pred == CmpInst::ICMP_SGE || Pred == CmpInst::ICMP_SGT;
2192 default:
2193 llvm_unreachable("unhandled recurrence kind");
2194 }
2195 }();
2196 if (!IsValidKindPred) {
2197 ORE->emit(RemarkBuilder: [&]() {
2198 return OptimizationRemarkMissed(
2199 DEBUG_TYPE, "VectorizationMultiUseReductionPredicate",
2200 TheLoop->getStartLoc(), TheLoop->getHeader())
2201 << "Multi-use reduction with predicate "
2202 << CmpInst::getPredicateName(P: Pred)
2203 << " incompatible with reduction kind";
2204 });
2205 return false;
2206 }
2207
2208 auto *FindIVSelect = findFindIVSelect(BackedgeVal: FindIVPhiR->getBackedgeValue());
2209 auto *FindIVCmp = FindIVSelect->getOperand(N: 0)->getDefiningRecipe();
2210 auto *FindIVRdxResult = cast<VPInstruction>(Val: FindIVCmp->getOperand(N: 0));
2211 assert(FindIVSelect->getParent() == MinOrMaxResult->getParent() &&
2212 "both results must be computed in the same block");
2213 // Reducing to a scalar min or max value is placed right before reducing to
2214 // its scalar iteration, in order to generate instructions that use both
2215 // their operands.
2216 MinOrMaxResult->moveBefore(BB&: *FindIVRdxResult->getParent(),
2217 I: FindIVRdxResult->getIterator());
2218
2219 bool IsStrictPredicate = ICmpInst::isLT(P: Pred) || ICmpInst::isGT(P: Pred);
2220 if (IsStrictPredicate) {
2221 if (!handleFirstArgMinOrMax(Plan, MinOrMaxPhiR, FindLastIVPhiR: FindIVPhiR,
2222 WideIV: cast<VPWidenIntOrFpInductionRecipe>(Val: IVOp),
2223 MinOrMaxResult, FindIVSelect, FindIVCmp,
2224 FindIVRdxResult))
2225 return false;
2226 continue;
2227 }
2228
2229 // The reduction using MinOrMaxPhiR needs adjusting to compute the correct
2230 // result:
2231 // 1. We need to find the last IV for which the condition based on the
2232 // min/max recurrence is true,
2233 // 2. Compare the partial min/max reduction result to its final value and,
2234 // 3. Select the lanes of the partial FindLastIV reductions which
2235 // correspond to the lanes matching the min/max reduction result.
2236 //
2237 // For example, this transforms
2238 // vp<%min.result> = compute-reduction-result ir<%min.val.next>
2239 // vp<%iv.rdx> = compute-reduction-result (smax) vp<%min.idx.next>
2240 // vp<%cmp> = icmp ne vp<%iv.rdx>, SENTINEL
2241 // vp<%find.iv.result> = select vp<%cmp>, vp<%iv.rdx>, ir<0>
2242 //
2243 // into:
2244 //
2245 // vp<min.result> = compute-reduction-result ir<%min.val.next>
2246 // vp<%final.min.cmp> = icmp eq ir<%min.val.next>, vp<min.result>
2247 // vp<%final.iv> = select vp<%final.min.cmp>, vp<%min.idx.next>, SENTINEL
2248 // vp<%iv.rdx> = compute-reduction-result (smax) vp<%final.iv>
2249 // vp<%cmp> = icmp ne vp<%iv.rdx>, SENTINEL
2250 // vp<%find.iv.result> = select vp<%cmp>, vp<%iv.rdx>, ir<0>
2251 //
2252 VPBuilder B(FindIVRdxResult);
2253 VPValue *MinOrMaxExiting = MinOrMaxResult->getOperand(N: 0);
2254 auto *FinalMinOrMaxCmp =
2255 B.createICmp(Pred: CmpInst::ICMP_EQ, A: MinOrMaxExiting, B: MinOrMaxResult);
2256 VPValue *Sentinel = FindIVCmp->getOperand(N: 1);
2257 VPValue *LastIVExiting = FindIVRdxResult->getOperand(N: 0);
2258 auto *FinalIVSelect =
2259 B.createSelect(Cond: FinalMinOrMaxCmp, TrueVal: LastIVExiting, FalseVal: Sentinel);
2260 FindIVRdxResult->setOperand(I: 0, New: FinalIVSelect);
2261 }
2262 return true;
2263}
2264