1//===-- VPlanConstruction.cpp - Transforms for initial VPlan construction -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements transforms for initial VPlan construction.
11///
12//===----------------------------------------------------------------------===//
13
14#include "LoopVectorizationPlanner.h"
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanCFG.h"
18#include "VPlanDominatorTree.h"
19#include "VPlanHelpers.h"
20#include "VPlanPatternMatch.h"
21#include "VPlanTransforms.h"
22#include "VPlanUtils.h"
23#include "llvm/Analysis/LoopInfo.h"
24#include "llvm/Analysis/LoopIterator.h"
25#include "llvm/Analysis/OptimizationRemarkEmitter.h"
26#include "llvm/Analysis/ScalarEvolution.h"
27#include "llvm/Analysis/ScalarEvolutionExpressions.h"
28#include "llvm/Analysis/TargetTransformInfo.h"
29#include "llvm/IR/InstrTypes.h"
30#include "llvm/IR/MDBuilder.h"
31#include "llvm/Transforms/Utils/LoopUtils.h"
32#include "llvm/Transforms/Utils/LoopVersioning.h"
33
34#define DEBUG_TYPE "vplan"
35
36using namespace llvm;
37using namespace VPlanPatternMatch;
38
namespace {
// Class that is used to build the plain CFG for the incoming IR: a 1:1
// translation of the loop nest's basic blocks and instructions into
// VPBasicBlocks and VPlan recipes, without introducing regions.
class PlainCFGBuilder {
  // The outermost loop of the input loop nest considered for vectorization.
  Loop *TheLoop;

  // Loop Info analysis.
  LoopInfo *LI;

  // Loop versioning for alias metadata. May be null when no runtime alias
  // checks / versioning metadata are used.
  LoopVersioning *LVer;

  // Vectorization plan that we are working on.
  std::unique_ptr<VPlan> Plan;

  // Builder of the VPlan instruction-level representation.
  VPBuilder VPIRBuilder;

  // NOTE: The following maps are intentionally destroyed after the plain CFG
  // construction because subsequent VPlan-to-VPlan transformation may
  // invalidate them.
  // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
  DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
  // Map incoming Value definitions to their newly-created VPValues.
  DenseMap<Value *, VPValue *> IRDef2VPValue;

  // Hold phi node's that need to be fixed once the plain CFG has been built.
  SmallVector<PHINode *, 8> PhisToFix;

  // Utility functions.
  // Set \p VPBB's predecessors to match the IR predecessor order of \p BB.
  void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
  // Add operands to the header phis collected in PhisToFix.
  void fixHeaderPhis();
  // Return the VPBasicBlock for \p BB, creating it on first request.
  VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
#ifndef NDEBUG
  // Return true if \p Val is defined outside the region modeled by the VPlan.
  bool isExternalDef(Value *Val);
#endif
  // Return the VPValue operand for \p IRVal, creating a live-in if needed.
  VPValue *getOrCreateVPOperand(Value *IRVal);
  // Translate all (non-debug) instructions of \p BB into recipes in \p VPBB.
  void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);

public:
  PlainCFGBuilder(Loop *Lp, LoopInfo *LI, LoopVersioning *LVer)
      : TheLoop(Lp), LI(LI), LVer(LVer), Plan(std::make_unique<VPlan>(args&: Lp)) {}

  /// Build plain CFG for TheLoop and connect it to Plan's entry.
  std::unique_ptr<VPlan> buildPlainCFG();
};
} // anonymous namespace
86
87// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
88// must have no predecessors.
89void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
90 // Collect VPBB predecessors.
91 SmallVector<VPBlockBase *, 2> VPBBPreds;
92 for (BasicBlock *Pred : predecessors(BB))
93 VPBBPreds.push_back(Elt: getOrCreateVPBB(BB: Pred));
94 VPBB->setPredecessors(VPBBPreds);
95}
96
97static bool isHeaderBB(BasicBlock *BB, Loop *L) {
98 return L && BB == L->getHeader();
99}
100
// Add operands to VPInstructions representing phi nodes from the input IR.
// Called after the whole plain CFG has been built, so every incoming value
// already has (or can be given) a VPValue counterpart.
void PlainCFGBuilder::fixHeaderPhis() {
  for (auto *Phi : PhisToFix) {
    assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
    VPValue *VPVal = IRDef2VPValue[Phi];
    assert(isa<VPPhi>(VPVal) && "Expected VPPhi for phi node.");
    auto *PhiR = cast<VPPhi>(Val: VPVal);
    assert(PhiR->getNumOperands() == 0 && "Expected VPPhi with no operands.");
    assert(isHeaderBB(Phi->getParent(), LI->getLoopFor(Phi->getParent())) &&
           "Expected Phi in header block.");
    assert(Phi->getNumOperands() == 2 &&
           "header phi must have exactly 2 operands");
    // Add one operand per IR predecessor, in IR predecessor order; this
    // matches the VPBB predecessor order set by setVPBBPredsFromBB.
    for (BasicBlock *Pred : predecessors(BB: Phi->getParent()))
      PhiR->addOperand(
          Operand: getOrCreateVPOperand(IRVal: Phi->getIncomingValueForBlock(BB: Pred)));
  }
}
118
119// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
120// existing one if it was already created.
121VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
122 if (auto *VPBB = BB2VPBB.lookup(Val: BB)) {
123 // Retrieve existing VPBB.
124 return VPBB;
125 }
126
127 // Create new VPBB.
128 StringRef Name = BB->getName();
129 LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
130 VPBasicBlock *VPBB = Plan->createVPBasicBlock(Name);
131 BB2VPBB[BB] = VPBB;
132 return VPBB;
133}
134
135#ifndef NDEBUG
136// Return true if \p Val is considered an external definition. An external
137// definition is either:
138// 1. A Value that is not an Instruction. This will be refined in the future.
139// 2. An Instruction that is outside of the IR region represented in VPlan,
140// i.e., is not part of the loop nest.
141bool PlainCFGBuilder::isExternalDef(Value *Val) {
142 // All the Values that are not Instructions are considered external
143 // definitions for now.
144 Instruction *Inst = dyn_cast<Instruction>(Val);
145 if (!Inst)
146 return true;
147
148 // Check whether Instruction definition is in loop body.
149 return !TheLoop->contains(Inst);
150}
151#endif
152
153// Create a new VPValue or retrieve an existing one for the Instruction's
154// operand \p IRVal. This function must only be used to create/retrieve VPValues
155// for *Instruction's operands* and not to create regular VPInstruction's. For
156// the latter, please, look at 'createVPInstructionsForVPBB'.
157VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
158 auto VPValIt = IRDef2VPValue.find(Val: IRVal);
159 if (VPValIt != IRDef2VPValue.end())
160 // Operand has an associated VPInstruction or VPValue that was previously
161 // created.
162 return VPValIt->second;
163
164 // Operand doesn't have a previously created VPInstruction/VPValue. This
165 // means that operand is:
166 // A) a definition external to VPlan,
167 // B) any other Value without specific representation in VPlan.
168 // For now, we use VPValue to represent A and B and classify both as external
169 // definitions. We may introduce specific VPValue subclasses for them in the
170 // future.
171 assert(isExternalDef(IRVal) && "Expected external definition as operand.");
172
173 // A and B: Create VPValue and add it to the pool of external definitions and
174 // to the Value->VPValue map.
175 VPValue *NewVPVal = Plan->getOrAddLiveIn(V: IRVal);
176 IRDef2VPValue[IRVal] = NewVPVal;
177 return NewVPVal;
178}
179
// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
// counterpart. This function must be invoked in RPO so that the operands of a
// VPInstruction in \p BB have been visited before (except for Phi nodes).
void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
                                                  BasicBlock *BB) {
  VPIRBuilder.setInsertPoint(VPBB);
  // TODO: Model and preserve debug intrinsics in VPlan.
  // Note: pseudo-ops are kept (SkipPseudoOp is false); only debug
  // instructions are skipped.
  for (Instruction &InstRef : BB->instructionsWithoutDebug(SkipPseudoOp: false)) {
    Instruction *Inst = &InstRef;

    // There shouldn't be any VPValue for Inst at this point. Otherwise, we
    // visited Inst when we shouldn't, breaking the RPO traversal order.
    assert(!IRDef2VPValue.count(Inst) &&
           "Instruction shouldn't have been visited.");

    if (isa<UncondBrInst>(Val: Inst))
      // Skip the rest of the Instruction processing for Branch instructions.
      // Successors are wired up separately in buildPlainCFG.
      continue;

    if (auto *Br = dyn_cast<CondBrInst>(Val: Inst)) {
      // Conditional branch instruction are represented using BranchOnCond
      // recipes.
      VPValue *Cond = getOrCreateVPOperand(IRVal: Br->getCondition());
      VPIRBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond}, Inst, Flags: {},
                               MD: VPIRMetadata(*Inst), DL: Inst->getDebugLoc());
      continue;
    }

    if (auto *SI = dyn_cast<SwitchInst>(Val: Inst)) {
      // Don't emit recipes for unconditional switch instructions.
      if (SI->getNumCases() == 0)
        continue;
      // Operands are the switch condition followed by one value per case.
      SmallVector<VPValue *> Ops = {getOrCreateVPOperand(IRVal: SI->getCondition())};
      for (auto Case : SI->cases())
        Ops.push_back(Elt: getOrCreateVPOperand(IRVal: Case.getCaseValue()));
      VPIRBuilder.createNaryOp(Opcode: Instruction::Switch, Operands: Ops, Inst, Flags: {},
                               MD: VPIRMetadata(*Inst), DL: Inst->getDebugLoc());
      continue;
    }

    VPSingleDefRecipe *NewR;
    if (auto *Phi = dyn_cast<PHINode>(Val: Inst)) {
      // Phi node's operands may not have been visited at this point. We create
      // an empty VPInstruction that we will fix once the whole plain CFG has
      // been built.
      NewR =
          VPIRBuilder.createScalarPhi(IncomingValues: {}, DL: Phi->getDebugLoc(), Name: "vec.phi", Flags: *Phi);
      NewR->setUnderlyingValue(Phi);
      if (isHeaderBB(BB: Phi->getParent(), L: LI->getLoopFor(BB: Phi->getParent()))) {
        // Header phis need to be fixed after the VPBB for the latch has been
        // created.
        PhisToFix.push_back(Elt: Phi);
      } else {
        // Add operands for VPPhi in the order matching its predecessors in
        // VPlan.
        DenseMap<const VPBasicBlock *, VPValue *> VPPredToIncomingValue;
        for (unsigned I = 0; I != Phi->getNumOperands(); ++I) {
          VPPredToIncomingValue[BB2VPBB[Phi->getIncomingBlock(i: I)]] =
              getOrCreateVPOperand(IRVal: Phi->getIncomingValue(i: I));
        }
        for (VPBlockBase *Pred : VPBB->getPredecessors())
          NewR->addOperand(
              Operand: VPPredToIncomingValue.lookup(Val: Pred->getExitingBasicBlock()));
      }
    } else {
      // Build VPIRMetadata from the instruction and add loop versioning
      // metadata for loads and stores.
      VPIRMetadata MD(*Inst);
      if (isa<LoadInst, StoreInst>(Val: Inst) && LVer) {
        const auto &[AliasScopeMD, NoAliasMD] =
            LVer->getNoAliasMetadataFor(OrigInst: Inst);
        if (AliasScopeMD)
          MD.setMetadata(Kind: LLVMContext::MD_alias_scope, Node: AliasScopeMD);
        if (NoAliasMD)
          MD.setMetadata(Kind: LLVMContext::MD_noalias, Node: NoAliasMD);
      }

      // Translate LLVM-IR operands into VPValue operands and set them in the
      // new VPInstruction.
      SmallVector<VPValue *, 4> VPOperands;
      for (Value *Op : Inst->operands())
        VPOperands.push_back(Elt: getOrCreateVPOperand(IRVal: Op));

      if (auto *CI = dyn_cast<CastInst>(Val: Inst)) {
        NewR = VPIRBuilder.createScalarCast(Opcode: CI->getOpcode(), Op: VPOperands[0],
                                            ResultTy: CI->getType(), DL: CI->getDebugLoc(),
                                            Flags: VPIRFlags(*CI), Metadata: MD);
        NewR->setUnderlyingValue(CI);
      } else if (auto *LI = dyn_cast<LoadInst>(Val: Inst)) {
        NewR = VPIRBuilder.createScalarLoad(ResultTy: LI->getType(), Addr: VPOperands[0],
                                            DL: LI->getDebugLoc(), Metadata: MD);
        NewR->setUnderlyingValue(LI);
      } else {
        // Build VPInstruction for any arbitrary Instruction without specific
        // representation in VPlan.
        NewR =
            VPIRBuilder.createNaryOp(Opcode: Inst->getOpcode(), Operands: VPOperands, Inst,
                                     Flags: VPIRFlags(*Inst), MD, DL: Inst->getDebugLoc());
      }
    }

    // Record the new recipe as the VPValue for Inst so later operand lookups
    // find it.
    IRDef2VPValue[Inst] = NewR;
  }
}
284
// Main interface to build the plain CFG.
std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() {
  // Pre-register the plan's entry and exit VPIRBasicBlocks so getOrCreateVPBB
  // returns them instead of creating fresh blocks for those IR blocks.
  VPIRBasicBlock *Entry = cast<VPIRBasicBlock>(Val: Plan->getEntry());
  BB2VPBB[Entry->getIRBasicBlock()] = Entry;
  for (VPIRBasicBlock *ExitVPBB : Plan->getExitBlocks())
    BB2VPBB[ExitVPBB->getIRBasicBlock()] = ExitVPBB;

  // 1. Scan the body of the loop in a topological order to visit each basic
  // block after having visited its predecessor basic blocks. Create a VPBB for
  // each BB and link it to its successor and predecessor VPBBs. Note that
  // predecessors must be set in the same order as they are in the incoming IR.
  // Otherwise, there might be problems with existing phi nodes and algorithm
  // based on predecessors traversal.

  // Loop PH needs to be explicitly visited since it's not taken into account by
  // LoopBlocksDFS.
  BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader();
  assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
         "Unexpected loop preheader");
  // Non-void values defined in the preheader are modeled as live-ins.
  for (auto &I : *ThePreheaderBB) {
    if (I.getType()->isVoidTy())
      continue;
    IRDef2VPValue[&I] = Plan->getOrAddLiveIn(V: &I);
  }

  LoopBlocksRPO RPO(TheLoop);
  RPO.perform(LI);

  for (BasicBlock *BB : RPO) {
    // Create or retrieve the VPBasicBlock for this BB.
    VPBasicBlock *VPBB = getOrCreateVPBB(BB);
    // Set VPBB predecessors in the same order as they are in the incoming BB.
    setVPBBPredsFromBB(VPBB, BB);

    // Create VPInstructions for BB.
    createVPInstructionsForVPBB(VPBB, BB);

    // Set VPBB successors. We create empty VPBBs for successors if they don't
    // exist already. Recipes will be created when the successor is visited
    // during the RPO traversal.
    if (auto *SI = dyn_cast<SwitchInst>(Val: BB->getTerminator())) {
      // Successor order: default destination first, then one per case, which
      // matches the operand order of the Switch recipe created above.
      SmallVector<VPBlockBase *> Succs = {
          getOrCreateVPBB(BB: SI->getDefaultDest())};
      for (auto Case : SI->cases())
        Succs.push_back(Elt: getOrCreateVPBB(BB: Case.getCaseSuccessor()));
      VPBB->setSuccessors(Succs);
      continue;
    }
    if (auto *BI = dyn_cast<UncondBrInst>(Val: BB->getTerminator())) {
      VPBB->setOneSuccessor(getOrCreateVPBB(BB: BI->getSuccessor()));
      continue;
    }
    auto *BI = cast<CondBrInst>(Val: BB->getTerminator());
    BasicBlock *IRSucc0 = BI->getSuccessor(i: 0);
    BasicBlock *IRSucc1 = BI->getSuccessor(i: 1);
    VPBasicBlock *Successor0 = getOrCreateVPBB(BB: IRSucc0);
    VPBasicBlock *Successor1 = getOrCreateVPBB(BB: IRSucc1);
    VPBB->setTwoSuccessors(IfTrue: Successor0, IfFalse: Successor1);
  }

  // Set the exit blocks' predecessors from their IR counterparts.
  for (auto *EB : Plan->getExitBlocks())
    setVPBBPredsFromBB(VPBB: EB, BB: EB->getIRBasicBlock());

  // 2. The whole CFG has been built at this point so all the input Values must
  // have a VPlan counterpart. Fix VPlan header phi by adding their
  // corresponding VPlan operands.
  fixHeaderPhis();

  Plan->getEntry()->setOneSuccessor(getOrCreateVPBB(BB: TheLoop->getHeader()));
  Plan->getEntry()->setPlan(&*Plan);

  // Fix VPlan loop-closed-ssa exit phi's by adding incoming operands to the
  // VPIRInstructions wrapping them.
  // Note that the operand order corresponds to IR predecessor order, and may
  // need adjusting when VPlan predecessors are added, if an exit block has
  // multiple predecessors.
  for (auto *EB : Plan->getExitBlocks()) {
    for (VPRecipeBase &R : EB->phis()) {
      auto *PhiR = cast<VPIRPhi>(Val: &R);
      PHINode &Phi = PhiR->getIRPhi();
      assert(PhiR->getNumOperands() == 0 &&
             "no phi operands should be added yet");
      for (BasicBlock *Pred : predecessors(BB: EB->getIRBasicBlock()))
        PhiR->addOperand(
            Operand: getOrCreateVPOperand(IRVal: Phi.getIncomingValueForBlock(BB: Pred)));
    }
  }

  LLVM_DEBUG(Plan->setName("Plain CFG\n"); dbgs() << *Plan);
  return std::move(Plan);
}
376
/// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it
/// has exactly 2 predecessors (preheader and latch), where the block
/// dominates the latch and the preheader dominates the block. If it is a
/// header block return true and canonicalize the predecessors of the header
/// (making sure the preheader appears first and the latch second) and the
/// successors of the latch (making sure the loop exit comes first). Otherwise
/// return false.
static bool canonicalHeaderAndLatch(VPBlockBase *HeaderVPB,
                                    const VPDominatorTree &VPDT) {
  ArrayRef<VPBlockBase *> Preds = HeaderVPB->getPredecessors();
  if (Preds.size() != 2)
    return false;

  // Try both assignments of the two predecessors to (preheader, latch): the
  // preheader must dominate the header and the header must dominate the latch.
  auto *PreheaderVPBB = Preds[0];
  auto *LatchVPBB = Preds[1];
  if (!VPDT.dominates(A: PreheaderVPBB, B: HeaderVPB) ||
      !VPDT.dominates(A: HeaderVPB, B: LatchVPBB)) {
    std::swap(a&: PreheaderVPBB, b&: LatchVPBB);

    if (!VPDT.dominates(A: PreheaderVPBB, B: HeaderVPB) ||
        !VPDT.dominates(A: HeaderVPB, B: LatchVPBB))
      return false;

    // Canonicalize predecessors of header so that preheader is first and
    // latch second. The header phis' incoming values must be swapped in
    // lockstep to stay consistent with the new predecessor order.
    HeaderVPB->swapPredecessors();
    for (VPRecipeBase &R : cast<VPBasicBlock>(Val: HeaderVPB)->phis())
      R.swapOperands();
  }

  // The two successors of conditional branch match the condition, with the
  // first successor corresponding to true and the second to false. We
  // canonicalize the successors of the latch when introducing the region, such
  // that the latch exits the region when its condition is true; invert the
  // original condition if the original CFG branches to the header on true.
  // Note that the exit edge is not yet connected for top-level loops.
  if (LatchVPBB->getSingleSuccessor() ||
      LatchVPBB->getSuccessors()[0] != HeaderVPB)
    return true;

  assert(LatchVPBB->getNumSuccessors() == 2 && "Must have 2 successors");
  auto *Term = cast<VPBasicBlock>(Val: LatchVPBB)->getTerminator();
  assert(cast<VPInstruction>(Term)->getOpcode() ==
             VPInstruction::BranchOnCond &&
         "terminator must be a BranchOnCond");
  // Negate the branch condition and swap the successors so the exit edge
  // becomes the first (true) successor.
  auto *Not = new VPInstruction(VPInstruction::Not, {Term->getOperand(N: 0)});
  Not->insertBefore(InsertPos: Term);
  Term->setOperand(I: 0, New: Not);
  LatchVPBB->swapSuccessors();

  return true;
}
429
/// Create a new VPRegionBlock for the loop starting at \p HeaderVPB.
/// \p HeaderVPB's predecessors must already be canonicalized (preheader
/// first, latch second; see canonicalHeaderAndLatch).
static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) {
  auto *PreheaderVPBB = HeaderVPB->getPredecessors()[0];
  auto *LatchVPBB = HeaderVPB->getPredecessors()[1];

  // Detach the loop's entry edges; the region will be spliced in their place.
  VPBlockUtils::disconnectBlocks(From: PreheaderVPBB, To: HeaderVPB);
  VPBlockUtils::disconnectBlocks(From: LatchVPBB, To: HeaderVPB);

  // Create an empty region first and insert it between PreheaderVPBB and
  // the exit blocks, taking care to preserve the original predecessor &
  // successor order of blocks. Set region entry and exiting after both
  // HeaderVPB and LatchVPBB have been disconnected from their
  // predecessors/successors.
  auto *R = Plan.createLoopRegion();

  // Transfer latch's successors to the region.
  VPBlockUtils::transferSuccessors(Old: LatchVPBB, New: R);

  VPBlockUtils::connectBlocks(From: PreheaderVPBB, To: R);
  R->setEntry(HeaderVPB);
  R->setExiting(LatchVPBB);

  // All VPBB's reachable shallowly from HeaderVPB belong to the current region.
  for (VPBlockBase *VPBB : vp_depth_first_shallow(G: HeaderVPB))
    VPBB->setParent(R);
}
456
// Add the necessary canonical IV and branch recipes required to control the
// loop: a canonical IV phi starting at 0 in \p HeaderVPBB, its increment by
// VF * UF in \p LatchVPBB and a BranchOnCount terminator comparing the
// incremented IV against the vector trip count.
static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
                                  VPBasicBlock *LatchVPBB, Type *IdxTy,
                                  DebugLoc DL) {
  auto *StartV = Plan.getConstantInt(Ty: IdxTy, Val: 0);

  // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
  auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
  HeaderVPBB->insert(Recipe: CanonicalIVPHI, InsertPt: HeaderVPBB->begin());

  // We are about to replace the branch to exit the region. Remove the original
  // BranchOnCond, if there is any, keeping its debug location for the new
  // terminator.
  DebugLoc LatchDL = DL;
  if (!LatchVPBB->empty() && match(V: &LatchVPBB->back(), P: m_BranchOnCond())) {
    LatchDL = LatchVPBB->getTerminator()->getDebugLoc();
    LatchVPBB->getTerminator()->eraseFromParent();
  }

  VPBuilder Builder(LatchVPBB);
  // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
  // Initially the induction increment is guaranteed to not wrap, but that may
  // change later, e.g. when tail-folding, when the flags need to be dropped.
  auto *CanonicalIVIncrement = Builder.createAdd(
      LHS: CanonicalIVPHI, RHS: &Plan.getVFxUF(), DL, Name: "index.next", WrapFlags: {true, false});
  // Close the phi cycle: the increment is the backedge value of the IV phi.
  CanonicalIVPHI->addOperand(Operand: CanonicalIVIncrement);

  // Add the BranchOnCount VPInstruction to the latch.
  Builder.createNaryOp(Opcode: VPInstruction::BranchOnCount,
                       Operands: {CanonicalIVIncrement, &Plan.getVectorTripCount()},
                       DL: LatchDL);
}
489
/// Creates extracts for values in \p Plan defined in a loop region and used
/// outside a loop region.
static void createExtractsForLiveOuts(VPlan &Plan, VPBasicBlock *MiddleVPBB) {
  VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
  for (VPBasicBlock *EB : Plan.getExitBlocks()) {
    // Only exit blocks reached via the middle block need extracts here.
    if (!is_contained(Range: EB->predecessors(), Element: MiddleVPBB))
      continue;

    for (VPRecipeBase &R : EB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(Val: &R);
      VPValue *Exiting = ExitIRI->getIncomingValueForBlock(VPBB: MiddleVPBB);
      // Values that are plain IR live-ins need no extraction.
      if (isa<VPIRValue>(Val: Exiting))
        continue;
      // Extract the last lane of the last part of the exiting value in the
      // middle block and route the phi through the extract.
      Exiting = B.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: Exiting);
      Exiting = B.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: Exiting);
      ExitIRI->setIncomingValueForBlock(VPBB: MiddleVPBB, V: Exiting);
    }
  }
}
509
510/// Return an iterator range to iterate over pairs of matching phi nodes in
511/// \p Header and \p ScalarHeader, skipping the canonical IV in the former.
512static auto getMatchingPhisForScalarLoop(VPBasicBlock *Header,
513 VPBasicBlock *ScalarHeader) {
514 return zip_equal(t: drop_begin(RangeOrContainer: Header->phis()), u: ScalarHeader->phis());
515}
516
/// Add the initial skeleton around the plain-CFG loop: vector preheader,
/// middle block and scalar preheader, the canonical IV and latch branch, the
/// plan's trip count, extracts for live-outs and resume phis wiring the
/// vector loop's results into the scalar loop.
static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL,
                               PredicatedScalarEvolution &PSE, Loop *TheLoop) {
  VPDominatorTree VPDT(Plan);

  // Canonicalize the top-level loop: preheader first / latch second as header
  // predecessors, exit edge first as latch successor (if connected).
  auto *HeaderVPBB = cast<VPBasicBlock>(Val: Plan.getEntry()->getSingleSuccessor());
  canonicalHeaderAndLatch(HeaderVPB: HeaderVPBB, VPDT);
  auto *LatchVPBB = cast<VPBasicBlock>(Val: HeaderVPBB->getPredecessors()[1]);

  VPBasicBlock *VecPreheader = Plan.createVPBasicBlock(Name: "vector.ph");
  VPBlockUtils::insertBlockAfter(NewBlock: VecPreheader, BlockPtr: Plan.getEntry());

  VPBasicBlock *MiddleVPBB = Plan.createVPBasicBlock(Name: "middle.block");
  // The canonical LatchVPBB has the header block as last successor. If it has
  // another successor, this successor is an exit block - insert middle block on
  // its edge. Otherwise, add middle block as another successor retaining header
  // as last.
  if (LatchVPBB->getNumSuccessors() == 2) {
    VPBlockBase *LatchExitVPB = LatchVPBB->getSuccessors()[0];
    VPBlockUtils::insertOnEdge(From: LatchVPBB, To: LatchExitVPB, BlockPtr: MiddleVPBB);
  } else {
    VPBlockUtils::connectBlocks(From: LatchVPBB, To: MiddleVPBB);
    LatchVPBB->swapSuccessors();
  }

  addCanonicalIVRecipes(Plan, HeaderVPBB, LatchVPBB, IdxTy: InductionTy, DL: IVDL);

  // Create SCEV and VPValue for the trip count.
  // We use the symbolic max backedge-taken-count, which works also when
  // vectorizing loops with uncountable early exits.
  const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
         "Invalid backedge-taken count");
  ScalarEvolution &SE = *PSE.getSE();
  const SCEV *TripCount = SE.getTripCountFromExitCount(ExitCount: BackedgeTakenCountSCEV,
                                                       EvalTy: InductionTy, L: TheLoop);
  Plan.setTripCount(vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: TripCount));

  VPBasicBlock *ScalarPH = Plan.createVPBasicBlock(Name: "scalar.ph");
  VPBlockUtils::connectBlocks(From: ScalarPH, To: Plan.getScalarHeader());

  // The connection order corresponds to the operands of the conditional branch,
  // with the middle block already connected to the exit block.
  VPBlockUtils::connectBlocks(From: MiddleVPBB, To: ScalarPH);
  // Also connect the entry block to the scalar preheader.
  // TODO: Also introduce a branch recipe together with the minimum trip count
  // check.
  VPBlockUtils::connectBlocks(From: Plan.getEntry(), To: ScalarPH);
  Plan.getEntry()->swapSuccessors();

  createExtractsForLiveOuts(Plan, MiddleVPBB);

  // Create resume phis in the scalar preheader for each phi in the scalar loop.
  // Their incoming value from the vector loop will be the last lane of the
  // corresponding vector loop header phi.
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
  VPBuilder ScalarPHBuilder(ScalarPH);
  assert(equal(ScalarPH->getPredecessors(),
               ArrayRef<VPBlockBase *>({MiddleVPBB, Plan.getEntry()})) &&
         "unexpected predecessor order of scalar ph");
  for (const auto &[PhiR, ScalarPhiR] :
       getMatchingPhisForScalarLoop(Header: HeaderVPBB, ScalarHeader: Plan.getScalarHeader())) {
    auto *VectorPhiR = cast<VPPhi>(Val: &PhiR);
    // Operand 1 of a canonicalized header phi is the backedge value.
    VPValue *BackedgeVal = VectorPhiR->getOperand(N: 1);
    VPValue *ResumeFromVectorLoop =
        MiddleBuilder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: BackedgeVal);
    ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
        Opcode: VPInstruction::ExtractLastLane, Operands: ResumeFromVectorLoop);
    // Create scalar resume phi, with the first operand being the incoming value
    // from the middle block and the second operand coming from the entry block.
    auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
        IncomingValues: {ResumeFromVectorLoop, VectorPhiR->getOperand(N: 0)},
        DL: VectorPhiR->getDebugLoc());
    cast<VPIRPhi>(Val: &ScalarPhiR)->addOperand(Operand: ResumePhiR);
  }
}
592
593/// Check \p Plan's live-in and replace them with constants, if they can be
594/// simplified via SCEV.
595static void simplifyLiveInsWithSCEV(VPlan &Plan,
596 PredicatedScalarEvolution &PSE) {
597 auto GetSimplifiedLiveInViaSCEV = [&](VPValue *VPV) -> VPValue * {
598 const SCEV *Expr = vputils::getSCEVExprForVPValue(V: VPV, PSE);
599 if (auto *C = dyn_cast<SCEVConstant>(Val: Expr))
600 return Plan.getOrAddLiveIn(V: C->getValue());
601 return nullptr;
602 };
603
604 for (VPValue *LiveIn : to_vector(Range: Plan.getLiveIns())) {
605 if (VPValue *SimplifiedLiveIn = GetSimplifiedLiveInViaSCEV(LiveIn))
606 LiveIn->replaceAllUsesWith(New: SimplifiedLiveIn);
607 }
608}
609
/// No-op pass callback; exists solely so RUN_VPLAN_PASS can print the initial
/// VPlan under the usual print-after machinery.
static void printAfterInitialConstruction(VPlan &) {}
612
/// Build the initial VPlan ("VPlan0") for \p TheLoop: translate the IR CFG to
/// a plain-CFG VPlan, add the initial skeleton (vector/scalar preheaders,
/// middle block, canonical IV, trip count) and fold SCEV-constant live-ins.
std::unique_ptr<VPlan>
VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy,
                             DebugLoc IVDL, PredicatedScalarEvolution &PSE,
                             LoopVersioning *LVer) {
  PlainCFGBuilder Builder(TheLoop, &LI, LVer);
  std::unique_ptr<VPlan> VPlan0 = Builder.buildPlainCFG();
  addInitialSkeleton(Plan&: *VPlan0, InductionTy, IVDL, PSE, TheLoop);
  simplifyLiveInsWithSCEV(Plan&: *VPlan0, PSE);

  RUN_VPLAN_PASS_NO_VERIFY(printAfterInitialConstruction, *VPlan0);
  return VPlan0;
}
625
/// Creates a VPWidenIntOrFpInductionRecipe or VPWidenPointerInductionRecipe
/// for \p Phi based on \p IndDesc.
static VPHeaderPHIRecipe *
createWidenInductionRecipe(PHINode *Phi, VPPhi *PhiR, VPIRValue *Start,
                           const InductionDescriptor &IndDesc, VPlan &Plan,
                           PredicatedScalarEvolution &PSE, Loop &OrigLoop,
                           DebugLoc DL) {
  [[maybe_unused]] ScalarEvolution &SE = *PSE.getSE();
  assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
         "step must be loop invariant");
  assert((Plan.getLiveIn(IndDesc.getStartValue()) == Start ||
          (SE.isSCEVable(IndDesc.getStartValue()->getType()) &&
           SE.getSCEV(IndDesc.getStartValue()) ==
               vputils::getSCEVExprForVPValue(Start, PSE))) &&
         "Start VPValue must match IndDesc's start value");

  VPValue *Step =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep());

  VPValue *BackedgeVal = PhiR->getOperand(N: 1);
  // Replace live-out extracts of WideIV's backedge value by ExitingIVValue
  // recipes. optimizeInductionLiveOutUsers will later compute the proper
  // DerivedIV.
  auto ReplaceExtractsWithExitingIVValue = [&](VPHeaderPHIRecipe *WideIV) {
    // Look for the ExtractLastPart -> ExtractLastLane chains created by
    // createExtractsForLiveOuts/addInitialSkeleton for this backedge value.
    for (VPUser *U : to_vector(Range: BackedgeVal->users())) {
      if (!match(U, P: m_ExtractLastPart(Op0: m_VPValue())))
        continue;
      auto *ExtractLastPart = cast<VPInstruction>(Val: U);
      VPUser *ExtractLastPartUser = ExtractLastPart->getSingleUser();
      assert(ExtractLastPartUser && "must have a single user");
      if (!match(U: ExtractLastPartUser, P: m_ExtractLastLane(Op0: m_VPValue())))
        continue;
      auto *ExtractLastLane = cast<VPInstruction>(Val: ExtractLastPartUser);
      assert(is_contained(ExtractLastLane->getParent()->successors(),
                          Plan.getScalarPreheader()) &&
             "last lane must be extracted in the middle block");
      // Replace the two-extract chain with a single ExitingIVValue recipe.
      VPBuilder Builder(ExtractLastLane);
      ExtractLastLane->replaceAllUsesWith(
          New: Builder.createNaryOp(Opcode: VPInstruction::ExitingIVValue, Operands: {WideIV}));
      ExtractLastLane->eraseFromParent();
      ExtractLastPart->eraseFromParent();
    }
  };

  if (IndDesc.getKind() == InductionDescriptor::IK_PtrInduction) {
    auto *WideIV = new VPWidenPointerInductionRecipe(
        Phi, Start, Step, &Plan.getVFxUF(), IndDesc, DL);
    ReplaceExtractsWithExitingIVValue(WideIV);
    return WideIV;
  }

  assert((IndDesc.getKind() == InductionDescriptor::IK_IntInduction ||
          IndDesc.getKind() == InductionDescriptor::IK_FpInduction) &&
         "must have an integer or float induction at this point");

  // Update wide induction increments to use the same step as the corresponding
  // wide induction. This enables detecting induction increments directly in
  // VPlan and removes redundant splats.
  if (match(V: BackedgeVal, P: m_Add(Op0: m_Specific(VPV: PhiR), Op1: m_VPValue())))
    BackedgeVal->getDefiningRecipe()->setOperand(I: 1, New: Step);

  // It is always safe to copy over the NoWrap and FastMath flags. In
  // particular, when folding tail by masking, the masked-off lanes are never
  // used, so it is safe.
  VPIRFlags Flags = vputils::getFlagsFromIndDesc(ID: IndDesc);

  auto *WideIV = new VPWidenIntOrFpInductionRecipe(
      Phi, Start, Step, &Plan.getVF(), IndDesc, Flags, DL);

  ReplaceExtractsWithExitingIVValue(WideIV);
  return WideIV;
}
698
/// Replace the plain VPPhi header phis in \p Plan's vector loop region with
/// dedicated header-phi recipes (first-order recurrence, widened induction or
/// reduction phis) based on the descriptors from legality analysis, and name
/// the matching resume phis in the scalar preheader.
void VPlanTransforms::createHeaderPhiRecipes(
    VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &OrigLoop,
    const MapVector<PHINode *, InductionDescriptor> &Inductions,
    const MapVector<PHINode *, RecurrenceDescriptor> &Reductions,
    const SmallPtrSetImpl<const PHINode *> &FixedOrderRecurrences,
    const SmallPtrSetImpl<PHINode *> &InLoopReductions, bool AllowReordering) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();

  // Create the dedicated header-phi recipe for a single VPPhi, dispatching on
  // which descriptor set its underlying PHINode belongs to.
  auto CreateHeaderPhiRecipe = [&](VPPhi *PhiR) -> VPHeaderPHIRecipe * {
    // TODO: Gradually replace uses of underlying instruction by analyses on
    // VPlan.
    auto *Phi = cast<PHINode>(Val: PhiR->getUnderlyingInstr());
    assert(PhiR->getNumOperands() == 2 &&
           "Must have 2 operands for header phis");

    // Extract common values once.
    VPIRValue *Start = cast<VPIRValue>(Val: PhiR->getOperand(N: 0));
    VPValue *BackedgeValue = PhiR->getOperand(N: 1);

    if (FixedOrderRecurrences.contains(Ptr: Phi)) {
      // TODO: Currently fixed-order recurrences are modeled as chains of
      // first-order recurrences. If there are no users of the intermediate
      // recurrences in the chain, the fixed order recurrence should be
      // modeled directly, enabling more efficient codegen.
      return new VPFirstOrderRecurrencePHIRecipe(Phi, *Start, *BackedgeValue);
    }

    auto InductionIt = Inductions.find(Key: Phi);
    if (InductionIt != Inductions.end())
      return createWidenInductionRecipe(Phi, PhiR, Start, IndDesc: InductionIt->second,
                                        Plan, PSE, OrigLoop,
                                        DL: PhiR->getDebugLoc());

    assert(Reductions.contains(Phi) && "only reductions are expected now");
    const RecurrenceDescriptor &RdxDesc = Reductions.lookup(Key: Phi);
    assert(RdxDesc.getRecurrenceStartValue() ==
               Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()) &&
           "incoming value must match start value");
    // Will be updated later to >1 if reduction is partial.
    unsigned ScaleFactor = 1;
    bool UseOrderedReductions = !AllowReordering && RdxDesc.isOrdered();
    return new VPReductionPHIRecipe(
        Phi, RdxDesc.getRecurrenceKind(), *Start, *BackedgeValue,
        getReductionStyle(InLoop: InLoopReductions.contains(Ptr: Phi), Ordered: UseOrderedReductions,
                          ScaleFactor),
        Phi->getType()->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
                                            : VPIRFlags(),
        RdxDesc.hasUsesOutsideReductionChain());
  };

  // Replace every header VPPhi (except the canonical IV in front) in place.
  assert(isa<VPCanonicalIVPHIRecipe>(HeaderVPBB->front()) &&
         "first recipe must be canonical IV phi");
  for (VPRecipeBase &R : make_early_inc_range(Range: drop_begin(RangeOrContainer: HeaderVPBB->phis()))) {
    auto *PhiR = cast<VPPhi>(Val: &R);
    VPHeaderPHIRecipe *HeaderPhiR = CreateHeaderPhiRecipe(PhiR);
    HeaderPhiR->insertBefore(InsertPos: PhiR);
    PhiR->replaceAllUsesWith(New: HeaderPhiR);
    PhiR->eraseFromParent();
  }

  // Give the resume phis in the scalar preheader the names the rest of the
  // vectorizer (and tests) expect, based on the new header-phi kind.
  for (const auto &[HeaderPhiR, ScalarPhiR] :
       getMatchingPhisForScalarLoop(Header: HeaderVPBB, ScalarHeader: Plan.getScalarPreheader())) {
    auto *ResumePhiR = cast<VPPhi>(Val: &ScalarPhiR);
    if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhiR)) {
      ResumePhiR->setName("scalar.recur.init");
      auto *ExtractLastLane = cast<VPInstruction>(Val: ResumePhiR->getOperand(N: 0));
      ExtractLastLane->setName("vector.recur.extract");
      continue;
    }
    ResumePhiR->setName(isa<VPWidenInductionRecipe>(Val: HeaderPhiR)
                            ? "bc.resume.val"
                            : "bc.merge.rdx");
  }
}
773
/// Rewrite the reduction chains of in-loop reduction phis into explicit
/// VPReductionRecipes. Each "link" (the per-iteration update of the
/// reduction) is replaced by a VPReductionRecipe that combines the previous
/// partial result with the vector operand, honoring predication masks.
void VPlanTransforms::createInLoopReductionRecipes(
    VPlan &Plan, const DenseSet<BasicBlock *> &BlocksNeedingPredication,
    ElementCount MinVF) {
  VPTypeAnalysis TypeInfo(Plan);
  VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  // Recipes superseded by reduction recipes; erased only after the loop so
  // iteration and later links stay valid.
  SmallVector<VPRecipeBase *> ToDelete;

  for (VPRecipeBase &R : Header->phis()) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
    // Only in-loop reductions are rewritten here; with a scalar-only VF the
    // transform is only needed for ordered reductions.
    if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
      continue;

    RecurKind Kind = PhiR->getRecurrenceKind();
    assert(!RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) &&
           !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
           !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
           "AnyOf and Find reductions are not allowed for in-loop reductions");

    // Start from the fastest flags; they are intersected with each link's
    // flags below so the final recipe is no stronger than any link.
    bool IsFPRecurrence =
        RecurrenceDescriptor::isFloatingPointRecurrenceKind(Kind);
    FastMathFlags FMFs =
        IsFPRecurrence ? FastMathFlags::getFast() : FastMathFlags();

    // Collect the chain of "link" recipes for the reduction starting at PhiR.
    // The SetVector is grown while being indexed, giving a BFS over users.
    SetVector<VPSingleDefRecipe *> Worklist;
    Worklist.insert(X: PhiR);
    for (unsigned I = 0; I != Worklist.size(); ++I) {
      VPSingleDefRecipe *Cur = Worklist[I];
      for (VPUser *U : Cur->users()) {
        auto *UserRecipe = cast<VPSingleDefRecipe>(Val: U);
        if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
          assert((UserRecipe->getParent() == Plan.getMiddleBlock() ||
                  UserRecipe->getParent() == Plan.getScalarPreheader()) &&
                 "U must be either in the loop region, the middle block or the "
                 "scalar preheader.");
          continue;
        }

        // Stores using instructions will be sunk later.
        if (match(R: UserRecipe, P: m_VPInstruction<Instruction::Store>()))
          continue;
        Worklist.insert(X: UserRecipe);
      }
    }

    // Visit operation "Links" along the reduction chain top-down starting from
    // the phi until LoopExitValue. We keep track of the previous item
    // (PreviousLink) to tell which of the two operands of a Link will remain
    // scalar and which will be reduced. For minmax by select(cmp), Link will be
    // the select instructions. Blend recipes of in-loop reduction phi's will
    // get folded to their non-phi operand, as the reduction recipe handles the
    // condition directly.
    VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
    for (VPSingleDefRecipe *CurrentLink : drop_begin(RangeOrContainer&: Worklist)) {
      // Fold blends of the reduction phi to their non-phi incoming value; the
      // blend condition is handled by the reduction recipe's mask instead.
      if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: CurrentLink)) {
        assert(Blend->getNumIncomingValues() == 2 &&
               "Blend must have 2 incoming values");
        unsigned PhiRIdx = Blend->getIncomingValue(Idx: 0) == PhiR ? 0 : 1;
        assert(Blend->getIncomingValue(PhiRIdx) == PhiR &&
               "PhiR must be an operand of the blend");
        Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 1 - PhiRIdx));
        continue;
      }

      // Intersect the accumulated FMF with this link's flags (and, for
      // selects, with the flags of the feeding compare).
      if (IsFPRecurrence) {
        FastMathFlags CurFMF =
            cast<VPRecipeWithIRFlags>(Val: CurrentLink)->getFastMathFlags();
        if (match(R: CurrentLink, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())))
          CurFMF |= cast<VPRecipeWithIRFlags>(Val: CurrentLink->getOperand(N: 0))
                        ->getFastMathFlags();
        FMFs &= CurFMF;
      }

      Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();

      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = Kind == RecurKind::FMulAdd;
      VPValue *VecOp;
      VPBasicBlock *LinkVPBB = CurrentLink->getParent();
      if (IsFMulAdd) {
        assert(RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
               "Expected current VPInstruction to be a call to the "
               "llvm.fmuladd intrinsic");
        assert(CurrentLink->getOperand(2) == PreviousLink &&
               "expected a call where the previous link is the added operand");

        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe (multiplying the first two operands of
        // the fmuladd together) to use as the vector operand for the fadd
        // reduction.
        auto *FMulRecipe = new VPInstruction(
            Instruction::FMul,
            {CurrentLink->getOperand(N: 0), CurrentLink->getOperand(N: 1)},
            CurrentLinkI->getFastMathFlags());
        LinkVPBB->insert(Recipe: FMulRecipe, InsertPt: CurrentLink->getIterator());
        VecOp = FMulRecipe;
      } else if (Kind == RecurKind::AddChainWithSubs &&
                 match(R: CurrentLink, P: m_Sub(Op0: m_VPValue(), Op1: m_VPValue()))) {
        // Canonicalize a - b in the chain to a + (0 - b) so every link is an
        // add reduction of a vector operand.
        Type *PhiTy = TypeInfo.inferScalarType(V: PhiR);
        auto *Zero = Plan.getConstantInt(Ty: PhiTy, Val: 0);
        VPBuilder Builder(LinkVPBB, CurrentLink->getIterator());
        auto *Sub = Builder.createSub(LHS: Zero, RHS: CurrentLink->getOperand(N: 1),
                                      DL: CurrentLinkI->getDebugLoc());
        Sub->setUnderlyingValue(CurrentLinkI);
        VecOp = Sub;
      } else {
        // Index of the first operand which holds a non-mask vector operand.
        unsigned IndexOfFirstOperand = 0;
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
          // The compare of a min/max pattern carries no reduction data; only
          // the select is turned into a reduction recipe.
          if (match(R: CurrentLink, P: m_Cmp(Op0: m_VPValue(), Op1: m_VPValue())))
            continue;
          assert(match(CurrentLink,
                       m_Select(m_VPValue(), m_VPValue(), m_VPValue())) &&
                 "must be a select recipe");
          IndexOfFirstOperand = 1;
        }
        // Note that for non-commutable operands (cmp-selects), the semantics of
        // the cmp-select are captured in the recurrence kind.
        unsigned VecOpId =
            CurrentLink->getOperand(N: IndexOfFirstOperand) == PreviousLink
                ? IndexOfFirstOperand + 1
                : IndexOfFirstOperand;
        VecOp = CurrentLink->getOperand(N: VecOpId);
        assert(
            VecOp != PreviousLink &&
            CurrentLink->getOperand(
                cast<VPInstruction>(CurrentLink)->getNumOperandsWithoutMask() -
                1 - (VecOpId - IndexOfFirstOperand)) == PreviousLink &&
            "PreviousLink must be the operand other than VecOp");
      }

      // Get block mask from CurrentLink, if it needs predication.
      VPValue *CondOp = nullptr;
      if (BlocksNeedingPredication.contains(V: CurrentLinkI->getParent()))
        CondOp = cast<VPInstruction>(Val: CurrentLink)->getMask();

      assert(PhiR->getVFScaleFactor() == 1 &&
             "inloop reductions must be unscaled");
      auto *RedRecipe = new VPReductionRecipe(
          Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
          getReductionStyle(/*IsInLoop=*/InLoop: true, Ordered: PhiR->isOrdered(), ScaleFactor: 1),
          CurrentLinkI->getDebugLoc());
      // Append the recipe to the end of the VPBasicBlock because we need to
      // ensure that it comes after all of it's inputs, including CondOp.
      // Delete CurrentLink as it will be invalid if its operand is replaced
      // with a reduction defined at the bottom of the block in the next link.
      if (LinkVPBB->getNumSuccessors() == 0)
        RedRecipe->insertBefore(InsertPos: &*std::prev(x: std::prev(x: LinkVPBB->end())));
      else
        LinkVPBB->appendRecipe(Recipe: RedRecipe);

      CurrentLink->replaceAllUsesWith(New: RedRecipe);
      ToDelete.push_back(Elt: CurrentLink);
      PreviousLink = RedRecipe;
    }
  }

  // Now that all chains are rewritten, remove the superseded recipes.
  for (VPRecipeBase *R : ToDelete)
    R->eraseFromParent();
}
934
935void VPlanTransforms::handleEarlyExits(VPlan &Plan,
936 bool HasUncountableEarlyExit) {
937 auto *MiddleVPBB = cast<VPBasicBlock>(
938 Val: Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]);
939 auto *LatchVPBB = cast<VPBasicBlock>(Val: MiddleVPBB->getSinglePredecessor());
940 VPBlockBase *HeaderVPB = cast<VPBasicBlock>(Val: LatchVPBB->getSuccessors()[1]);
941
942 if (HasUncountableEarlyExit) {
943 handleUncountableEarlyExits(Plan, HeaderVPBB: cast<VPBasicBlock>(Val: HeaderVPB), LatchVPBB,
944 MiddleVPBB);
945 return;
946 }
947
948 // Disconnect countable early exits from the loop, leaving it with a single
949 // exit from the latch. Countable early exits are left for a scalar epilog.
950 for (VPIRBasicBlock *EB : Plan.getExitBlocks()) {
951 for (VPBlockBase *Pred : to_vector(Range&: EB->getPredecessors())) {
952 if (Pred == MiddleVPBB)
953 continue;
954
955 // Remove phi operands for the early exiting block.
956 for (VPRecipeBase &R : EB->phis())
957 cast<VPIRPhi>(Val: &R)->removeIncomingValueFor(IncomingBlock: Pred);
958 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Val: Pred);
959 EarlyExitingVPBB->getTerminator()->eraseFromParent();
960 VPBlockUtils::disconnectBlocks(From: Pred, To: EB);
961 }
962 }
963}
964
965void VPlanTransforms::addMiddleCheck(VPlan &Plan, bool TailFolded) {
966 auto *MiddleVPBB = cast<VPBasicBlock>(
967 Val: Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]);
968 // If MiddleVPBB has a single successor then the original loop does not exit
969 // via the latch and the single successor must be the scalar preheader.
970 // There's no need to add a runtime check to MiddleVPBB.
971 if (MiddleVPBB->getNumSuccessors() == 1) {
972 assert(MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader() &&
973 "must have ScalarPH as single successor");
974 return;
975 }
976
977 assert(MiddleVPBB->getNumSuccessors() == 2 && "must have 2 successors");
978
979 // Add a check in the middle block to see if we have completed all of the
980 // iterations in the first vector loop.
981 //
982 // Three cases:
983 // 1) If we require a scalar epilogue, the scalar ph must execute. Set the
984 // condition to false.
985 // 2) If (N - N%VF) == N, then we *don't* need to run the
986 // remainder. Thus if tail is to be folded, we know we don't need to run
987 // the remainder and we can set the condition to true.
988 // 3) Otherwise, construct a runtime check.
989
990 // We use the same DebugLoc as the scalar loop latch terminator instead of
991 // the corresponding compare because they may have ended up with different
992 // line numbers and we want to avoid awkward line stepping while debugging.
993 // E.g., if the compare has got a line number inside the loop.
994 auto *LatchVPBB = cast<VPBasicBlock>(Val: MiddleVPBB->getSinglePredecessor());
995 DebugLoc LatchDL = LatchVPBB->getTerminator()->getDebugLoc();
996 VPBuilder Builder(MiddleVPBB);
997 VPValue *Cmp;
998 if (TailFolded)
999 Cmp = Plan.getTrue();
1000 else
1001 Cmp = Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: Plan.getTripCount(),
1002 B: &Plan.getVectorTripCount(), DL: LatchDL, Name: "cmp.n");
1003 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cmp}, DL: LatchDL);
1004}
1005
1006void VPlanTransforms::createLoopRegions(VPlan &Plan) {
1007 VPDominatorTree VPDT(Plan);
1008 for (VPBlockBase *HeaderVPB : vp_post_order_shallow(G: Plan.getEntry()))
1009 if (canonicalHeaderAndLatch(HeaderVPB, VPDT))
1010 createLoopRegion(Plan, HeaderVPB);
1011
1012 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
1013 TopRegion->setName("vector loop");
1014 TopRegion->getEntryBasicBlock()->setName("vector.body");
1015}
1016
/// Fold the scalar tail into the vector loop by masking: insert a header mask
/// comparing the widened canonical IV against the backedge-taken count,
/// branch on it inside the loop region, and patch up values that may be
/// poisoned in the masked-off final iteration.
void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
  assert(Plan.getExitBlocks().size() == 1 &&
         "only a single-exit block is supported currently");
  assert(Plan.getExitBlocks().front()->getSinglePredecessor() ==
             Plan.getMiddleBlock() &&
         "the exit block must have middle block as single predecessor");

  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  assert(LoopRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
         "The vector loop region must have the middle block as its single "
         "successor for now");
  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();

  // Separate the phis from the rest of the header; the mask branch goes
  // between them.
  Header->splitAt(SplitAt: Header->getFirstNonPhi());

  // Create the header mask, insert it in the header and branch on it.
  auto *IV =
      new VPWidenCanonicalIVRecipe(Header->getParent()->getCanonicalIV());
  VPBuilder Builder(Header, Header->getFirstNonPhi());
  Builder.insert(R: IV);
  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
  VPValue *HeaderMask = Builder.createICmp(Pred: CmpInst::ICMP_ULE, A: IV, B: BTC);
  Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: HeaderMask);

  // The latch must currently be terminated by BranchOnCount of the canonical
  // IV increment, with the increment immediately before the terminator.
  VPBasicBlock *OrigLatch = LoopRegion->getExitingBasicBlock();
  VPValue *IVInc;
  [[maybe_unused]] bool TermBranchOnCount =
      match(V: OrigLatch->getTerminator(),
            P: m_BranchOnCount(Op0: m_VPValue(V&: IVInc),
                             Op1: m_Specific(VPV: &Plan.getVectorTripCount())));
  assert(TermBranchOnCount &&
         match(IVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
                            m_Specific(&Plan.getVFxUF()))) &&
         std::next(IVInc->getDefiningRecipe()->getIterator()) ==
             OrigLatch->getTerminator()->getIterator() &&
         "Unexpected canonical iv increment");

  // Split the latch at the IV update, and branch to it from the header mask.
  VPBasicBlock *Latch =
      OrigLatch->splitAt(SplitAt: IVInc->getDefiningRecipe()->getIterator());
  Latch->setName("vector.latch");
  VPBlockUtils::connectBlocks(From: Header, To: Latch);

  // Collect any values defined in the loop that need a phi. Currently this
  // includes header phi backedges and live-outs extracted in the middle block.
  // TODO: Handle early exits via Plan.getExitBlocks()
  MapVector<VPValue *, SmallVector<VPUser *>> NeedsPhi;
  for (VPRecipeBase &R : Header->phis())
    if (!isa<VPCanonicalIVPHIRecipe, VPWidenInductionRecipe>(Val: R))
      NeedsPhi[cast<VPHeaderPHIRecipe>(Val&: R).getBackedgeValue()].push_back(Elt: &R);

  VPValue *V;
  for (VPRecipeBase &R : *Plan.getMiddleBlock())
    if (match(V: &R, P: m_ExtractLastPart(Op0: m_VPValue(V))))
      NeedsPhi[V].push_back(Elt: &R);

  // Insert phis with a poison incoming value for past the end of the tail.
  Builder.setInsertPoint(TheBB: Latch, IP: Latch->begin());
  VPTypeAnalysis TypeInfo(Plan);
  for (const auto &[V, Users] : NeedsPhi) {
    // Live-ins need no phi; they are defined outside the loop.
    if (isa<VPIRValue>(Val: V))
      continue;
    // TODO: For reduction phis, use phi value instead of poison so we can
    // remove the special casing for tail folding in
    // LoopVectorizationPlanner::addReductionResultComputation
    VPValue *Poison =
        Plan.getOrAddLiveIn(V: PoisonValue::get(T: TypeInfo.inferScalarType(V)));
    VPInstruction *Phi = Builder.createScalarPhi(IncomingValues: {V, Poison});
    for (VPUser *U : Users)
      U->replaceUsesOfWith(From: V, To: Phi);
  }

  // Any extract of the last element must be updated to extract from the last
  // active lane of the header mask instead (i.e., the lane corresponding to the
  // last active iteration).
  Builder.setInsertPoint(Plan.getMiddleBlock()->getTerminator());
  for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
    VPValue *Op;
    if (!match(V: &R, P: m_ExtractLastLaneOfLastPart(Op0: m_VPValue(V&: Op))))
      continue;

    // Compute the index of the last active lane.
    VPValue *LastActiveLane =
        Builder.createNaryOp(Opcode: VPInstruction::LastActiveLane, Operands: HeaderMask);
    auto *Ext =
        Builder.createNaryOp(Opcode: VPInstruction::ExtractLane, Operands: {LastActiveLane, Op});
    R.getVPSingleValue()->replaceAllUsesWith(New: Ext);
  }
}
1106
1107/// Insert \p CheckBlockVPBB on the edge leading to the vector preheader,
1108/// connecting it to both vector and scalar preheaders. Updates scalar
1109/// preheader phis to account for the new predecessor.
1110static void insertCheckBlockBeforeVectorLoop(VPlan &Plan,
1111 VPBasicBlock *CheckBlockVPBB) {
1112 VPBlockBase *VectorPH = Plan.getVectorPreheader();
1113 auto *ScalarPH = cast<VPBasicBlock>(Val: Plan.getScalarPreheader());
1114 VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
1115 VPBlockUtils::insertOnEdge(From: PreVectorPH, To: VectorPH, BlockPtr: CheckBlockVPBB);
1116 VPBlockUtils::connectBlocks(From: CheckBlockVPBB, To: ScalarPH);
1117 CheckBlockVPBB->swapSuccessors();
1118 unsigned NumPreds = ScalarPH->getNumPredecessors();
1119 for (VPRecipeBase &R : ScalarPH->phis()) {
1120 auto *Phi = cast<VPPhi>(Val: &R);
1121 assert(Phi->getNumIncoming() == NumPreds - 1 &&
1122 "must have incoming values for all predecessors");
1123 Phi->addOperand(Operand: Phi->getOperand(N: NumPreds - 2));
1124 }
1125}
1126
// Likelihood of bypassing the vectorized loop due to a runtime check block,
// including memory overlap checks block and wrapping/unit-stride checks block.
static constexpr uint32_t CheckBypassWeights[] = {1, 127};
1130
1131/// Create a BranchOnCond terminator in \p CheckBlockVPBB. Optionally adds
1132/// branch weights.
1133static void addBypassBranch(VPlan &Plan, VPBasicBlock *CheckBlockVPBB,
1134 VPValue *Cond, bool AddBranchWeights) {
1135 DebugLoc DL = Plan.getVectorLoopRegion()->getCanonicalIV()->getDebugLoc();
1136 auto *Term = VPBuilder(CheckBlockVPBB)
1137 .createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond}, DL);
1138 if (AddBranchWeights) {
1139 MDBuilder MDB(Plan.getContext());
1140 MDNode *BranchWeights =
1141 MDB.createBranchWeights(Weights: CheckBypassWeights, /*IsExpected=*/false);
1142 Term->setMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
1143 }
1144}
1145
1146void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
1147 BasicBlock *CheckBlock,
1148 bool AddBranchWeights) {
1149 VPValue *CondVPV = Plan.getOrAddLiveIn(V: Cond);
1150 VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(IRBB: CheckBlock);
1151 insertCheckBlockBeforeVectorLoop(Plan, CheckBlockVPBB);
1152 addBypassBranch(Plan, CheckBlockVPBB, Cond: CondVPV, AddBranchWeights);
1153}
1154
/// Emit the minimum-iteration check in the plan's entry block: branch to the
/// scalar loop when the trip count is too small for even one vector
/// iteration. Uses SCEV to fold the check to a constant where provable.
void VPlanTransforms::addMinimumIterationCheck(
    VPlan &Plan, ElementCount VF, unsigned UF,
    ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
    bool TailFolded, Loop *OrigLoop, const uint32_t *MinItersBypassWeights,
    DebugLoc DL, PredicatedScalarEvolution &PSE) {
  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  CmpInst::Predicate CmpPred =
      RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
  // If tail is to be folded, vector loop takes care of all iterations.
  VPValue *TripCountVPV = Plan.getTripCount();
  const SCEV *TripCount = vputils::getSCEVExprForVPValue(V: TripCountVPV, PSE);
  Type *TripCountTy = TripCount->getType();
  ScalarEvolution &SE = *PSE.getSE();
  auto GetMinTripCount = [&]() -> const SCEV * {
    // Compute max(MinProfitableTripCount, UF * VF) and return it.
    const SCEV *VFxUF =
        SE.getElementCount(Ty: TripCountTy, EC: (VF * UF), Flags: SCEV::FlagNUW);
    if (UF * VF.getKnownMinValue() >=
        MinProfitableTripCount.getKnownMinValue()) {
      // TODO: SCEV should be able to simplify test.
      return VFxUF;
    }
    const SCEV *MinProfitableTripCountSCEV =
        SE.getElementCount(Ty: TripCountTy, EC: MinProfitableTripCount, Flags: SCEV::FlagNUW);
    return SE.getUMaxExpr(LHS: MinProfitableTripCountSCEV, RHS: VFxUF);
  };

  VPBasicBlock *EntryVPBB = Plan.getEntry();
  VPBuilder Builder(EntryVPBB);
  VPValue *TripCountCheck = Plan.getFalse();
  const SCEV *Step = GetMinTripCount();
  // TripCountCheck = false, folding tail implies positive vector trip
  // count.
  if (!TailFolded) {
    // TODO: Emit unconditional branch to vector preheader instead of
    // conditional branch with known condition.
    TripCount = SE.applyLoopGuards(Expr: TripCount, L: OrigLoop);
    // Check if the trip count is < the step.
    if (SE.isKnownPredicate(Pred: CmpPred, LHS: TripCount, RHS: Step)) {
      // TODO: Ensure step is at most the trip count when determining max VF and
      // UF, w/o tail folding.
      TripCountCheck = Plan.getTrue();
    } else if (!SE.isKnownPredicate(Pred: CmpInst::getInversePredicate(pred: CmpPred),
                                    LHS: TripCount, RHS: Step)) {
      // Generate the minimum iteration check only if we cannot prove the
      // check is known to be true, or known to be false.
      VPValue *MinTripCountVPV = Builder.createExpandSCEV(Expr: Step);
      TripCountCheck = Builder.createICmp(
          Pred: CmpPred, A: TripCountVPV, B: MinTripCountVPV, DL, Name: "min.iters.check");
    } // else step known to be < trip count, use TripCountCheck preset to false.
  }
  VPInstruction *Term =
      Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {TripCountCheck}, DL);
  // Optionally attach profile metadata describing the bypass likelihood.
  if (MinItersBypassWeights) {
    MDBuilder MDB(Plan.getContext());
    MDNode *BranchWeights = MDB.createBranchWeights(
        Weights: ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false);
    Term->setMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
  }
}
1219
1220void VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
1221 VPlan &Plan, Value *TripCount, Value *VectorTripCount,
1222 bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF,
1223 unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE) {
1224 // Add the minimum iteration check for the epilogue vector loop.
1225 VPValue *TC = Plan.getOrAddLiveIn(V: TripCount);
1226 VPBuilder Builder(cast<VPBasicBlock>(Val: Plan.getEntry()));
1227 VPValue *VFxUF = Builder.createExpandSCEV(Expr: SE.getElementCount(
1228 Ty: TripCount->getType(), EC: (EpilogueVF * EpilogueUF), Flags: SCEV::FlagNUW));
1229 VPValue *Count = Builder.createSub(LHS: TC, RHS: Plan.getOrAddLiveIn(V: VectorTripCount),
1230 DL: DebugLoc::getUnknown(), Name: "n.vec.remaining");
1231
1232 // Generate code to check if the loop's trip count is less than VF * UF of
1233 // the vector epilogue loop.
1234 auto P = RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
1235 auto *CheckMinIters = Builder.createICmp(
1236 Pred: P, A: Count, B: VFxUF, DL: DebugLoc::getUnknown(), Name: "min.epilog.iters.check");
1237 VPInstruction *Branch =
1238 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: CheckMinIters);
1239
1240 // We assume the remaining `Count` is equally distributed in
1241 // [0, MainLoopStep)
1242 // So the probability for `Count < EpilogueLoopStep` should be
1243 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
1244 // TODO: Improve the estimate by taking the estimated trip count into
1245 // consideration.
1246 unsigned EstimatedSkipCount = std::min(a: MainLoopStep, b: EpilogueLoopStep);
1247 const uint32_t Weights[] = {EstimatedSkipCount,
1248 MainLoopStep - EstimatedSkipCount};
1249 MDBuilder MDB(Plan.getContext());
1250 MDNode *BranchWeights =
1251 MDB.createBranchWeights(Weights, /*IsExpected=*/false);
1252 Branch->setMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
1253}
1254
1255/// Find and return the final select instruction of the FindIV result pattern
1256/// for the given \p BackedgeVal:
1257/// select(icmp ne ComputeReductionResult(ReducedIV), Sentinel),
1258/// ComputeReductionResult(ReducedIV), Start.
1259static VPInstruction *findFindIVSelect(VPValue *BackedgeVal) {
1260 return cast<VPInstruction>(
1261 Val: vputils::findRecipe(Start: BackedgeVal, Pred: [BackedgeVal](VPRecipeBase *R) {
1262 auto *VPI = dyn_cast<VPInstruction>(Val: R);
1263 return VPI &&
1264 matchFindIVResult(VPI, ReducedIV: m_Specific(VPV: BackedgeVal), Start: m_VPValue());
1265 }));
1266}
1267
/// Handle FMaxNum/FMinNum reductions by adding an early exit when any NaN lane
/// is observed: the vector loop is restarted in the scalar loop for the last
/// vector iteration so NaN semantics are preserved. Returns false to abandon
/// the (possibly partially-modified) plan when the transform cannot be
/// applied; callers are expected to discard the plan in that case.
bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
  // Return the non-phi operand of the min/max intrinsic feeding the backedge
  // of \p RedPhiR, or nullptr if the backedge value is not such an intrinsic.
  auto GetMinOrMaxCompareValue =
      [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
    auto *MinOrMaxR =
        dyn_cast_or_null<VPRecipeWithIRFlags>(Val: RedPhiR->getBackedgeValue());
    if (!MinOrMaxR)
      return nullptr;

    // Check that MinOrMaxR is a VPWidenIntrinsicRecipe or VPReplicateRecipe
    // with an intrinsic that matches the reduction kind.
    Intrinsic::ID ExpectedIntrinsicID =
        getMinMaxReductionIntrinsicOp(RK: RedPhiR->getRecurrenceKind());
    if (!match(V: MinOrMaxR, P: m_Intrinsic(IntrID: ExpectedIntrinsicID)))
      return nullptr;

    if (MinOrMaxR->getOperand(N: 0) == RedPhiR)
      return MinOrMaxR->getOperand(N: 1);

    assert(MinOrMaxR->getOperand(1) == RedPhiR &&
           "Reduction phi operand expected");
    return MinOrMaxR->getOperand(N: 0);
  };

  // Collect all FMaxNum/FMinNum reduction phis with their compare operands,
  // noting whether any other (unsupported) header phi exists.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  SmallVector<std::pair<VPReductionPHIRecipe *, VPValue *>>
      MinOrMaxNumReductionsToHandle;
  bool HasUnsupportedPhi = false;
  for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
    if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(Val: &R))
      continue;
    auto *Cur = dyn_cast<VPReductionPHIRecipe>(Val: &R);
    if (!Cur) {
      // TODO: Also support fixed-order recurrence phis.
      HasUnsupportedPhi = true;
      continue;
    }
    if (!RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
            Kind: Cur->getRecurrenceKind())) {
      HasUnsupportedPhi = true;
      continue;
    }

    VPValue *MinOrMaxOp = GetMinOrMaxCompareValue(Cur);
    if (!MinOrMaxOp)
      return false;

    MinOrMaxNumReductionsToHandle.emplace_back(Args&: Cur, Args&: MinOrMaxOp);
  }

  // Nothing to handle: the plan is fine as-is.
  if (MinOrMaxNumReductionsToHandle.empty())
    return true;

  // We won't be able to resume execution in the scalar tail, if there are
  // unsupported header phis or there is no scalar tail at all, due to
  // tail-folding.
  if (HasUnsupportedPhi || !Plan.hasScalarTail())
    return false;

  /// Check if the vector loop of \p Plan can early exit and restart
  /// execution of last vector iteration in the scalar loop. This requires all
  /// recipes up to early exit point be side-effect free as they are
  /// re-executed. Currently we check that the loop is free of any recipe that
  /// may write to memory. Expected to operate on an early VPlan w/o nested
  /// regions.
  for (VPBlockBase *VPB : vp_depth_first_shallow(
           G: Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
    auto *VPBB = cast<VPBasicBlock>(Val: VPB);
    for (auto &R : *VPBB) {
      if (R.mayWriteToMemory() && !match(V: &R, P: m_BranchOnCount()))
        return false;
    }
  }

  // Build a combined "any lane is NaN" condition in the latch by OR-ing the
  // unordered self-compares of all compare operands.
  VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
  VPBuilder LatchBuilder(LatchVPBB->getTerminator());
  VPValue *AllNaNLanes = nullptr;
  SmallPtrSet<VPValue *, 2> RdxResults;
  for (const auto &[_, MinOrMaxOp] : MinOrMaxNumReductionsToHandle) {
    // fcmp uno x, x is true iff x is NaN.
    VPValue *RedNaNLanes =
        LatchBuilder.createFCmp(Pred: CmpInst::FCMP_UNO, A: MinOrMaxOp, B: MinOrMaxOp);
    AllNaNLanes = AllNaNLanes ? LatchBuilder.createOr(LHS: AllNaNLanes, RHS: RedNaNLanes)
                              : RedNaNLanes;
  }

  VPValue *AnyNaNLane =
      LatchBuilder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {AllNaNLanes});
  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->begin());
  for (const auto &[RedPhiR, _] : MinOrMaxNumReductionsToHandle) {
    assert(RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
               RedPhiR->getRecurrenceKind()) &&
           "unsupported reduction");

    // If we exit early due to NaNs, compute the final reduction result based on
    // the reduction phi at the beginning of the last vector iteration.
    auto *RdxResult = vputils::findComputeReductionResult(PhiR: RedPhiR);
    assert(RdxResult && "must find a ComputeReductionResult");

    auto *NewSel = MiddleBuilder.createSelect(Cond: AnyNaNLane, TrueVal: RedPhiR,
                                              FalseVal: RdxResult->getOperand(N: 0));
    RdxResult->setOperand(I: 0, New: NewSel);
    assert(!RdxResults.contains(RdxResult) && "RdxResult already used");
    RdxResults.insert(Ptr: RdxResult);
  }

  // Replace the BranchOnCount latch terminator with a BranchOnCond that exits
  // either when the trip count is reached or when any NaN lane was seen.
  auto *LatchExitingBranch = LatchVPBB->getTerminator();
  assert(match(LatchExitingBranch, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
         "Unexpected terminator");
  auto *IsLatchExitTaken = LatchBuilder.createICmp(
      Pred: CmpInst::ICMP_EQ, A: LatchExitingBranch->getOperand(N: 0),
      B: LatchExitingBranch->getOperand(N: 1));
  auto *AnyExitTaken = LatchBuilder.createOr(LHS: AnyNaNLane, RHS: IsLatchExitTaken);
  LatchBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: AnyExitTaken);
  LatchExitingBranch->eraseFromParent();

  // Update resume phis for inductions in the scalar preheader. If AnyNaNLane is
  // true, the resume from the start of the last vector iteration via the
  // canonical IV, otherwise from the original value.
  auto IsTC = [&Plan](VPValue *V) {
    return V == &Plan.getVectorTripCount() || V == Plan.getTripCount();
  };
  for (auto &R : Plan.getScalarPreheader()->phis()) {
    auto *ResumeR = cast<VPPhi>(Val: &R);
    VPValue *VecV = ResumeR->getOperand(N: 0);
    // Reduction resume values were already handled above.
    if (RdxResults.contains(Ptr: VecV))
      continue;
    if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(Val: VecV)) {
      VPValue *DIVTC = DerivedIV->getOperand(N: 1);
      if (DerivedIV->getNumUsers() == 1 && IsTC(DIVTC)) {
        // Redirect the derived IV's trip-count operand through a select so it
        // is derived from the canonical IV on NaN early exit.
        auto *NewSel = MiddleBuilder.createSelect(
            Cond: AnyNaNLane, TrueVal: LoopRegion->getCanonicalIV(), FalseVal: DIVTC);
        DerivedIV->moveAfter(MovePos: &*MiddleBuilder.getInsertPoint());
        DerivedIV->setOperand(I: 1, New: NewSel);
        continue;
      }
    }
    // Bail out and abandon the current, partially modified, VPlan if we
    // encounter resume phi that cannot be updated yet.
    if (!IsTC(VecV)) {
      LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with "
                           "FMaxNum/FMinNum reduction.\n");
      return false;
    }
    auto *NewSel = MiddleBuilder.createSelect(
        Cond: AnyNaNLane, TrueVal: LoopRegion->getCanonicalIV(), FalseVal: VecV);
    ResumeR->setOperand(I: 0, New: NewSel);
  }

  // On NaN early exit the scalar loop must run, so the middle block may only
  // branch to the exit when no NaN lane was seen.
  auto *MiddleTerm = MiddleVPBB->getTerminator();
  MiddleBuilder.setInsertPoint(MiddleTerm);
  VPValue *MiddleCond = MiddleTerm->getOperand(N: 0);
  VPValue *NewCond =
      MiddleBuilder.createAnd(LHS: MiddleCond, RHS: MiddleBuilder.createNot(Operand: AnyNaNLane));
  MiddleTerm->setOperand(I: 0, New: NewCond);
  return true;
}
1424
1425bool VPlanTransforms::handleFindLastReductions(VPlan &Plan) {
1426 if (Plan.hasScalarVFOnly())
1427 return false;
1428
1429 // We want to create the following nodes:
1430 // vector.body:
1431 // ...new WidenPHI recipe introduced to keep the mask value for the latest
1432 // iteration where any lane was active.
1433 // mask.phi = phi [ ir<false>, vector.ph ], [ vp<new.mask>, vector.body ]
1434 // ...data.phi (a VPReductionPHIRecipe for a FindLast reduction) already
1435 // exists, but needs updating to use 'new.data' for the backedge value.
1436 // data.phi = phi ir<default.val>, vp<new.data>
1437 //
1438 // ...'data' and 'compare' created by existing nodes...
1439 //
1440 // ...new recipes introduced to determine whether to update the reduction
1441 // values or keep the current one.
1442 // any.active = i1 any-of ir<compare>
1443 // new.mask = select vp<any.active>, ir<compare>, vp<mask.phi>
1444 // new.data = select vp<any.active>, ir<data>, ir<data.phi>
1445 //
1446 // middle.block:
1447 // ...extract-last-active replaces compute-reduction-result.
1448 // result = extract-last-active vp<new.data>, vp<new.mask>, ir<default.val>
1449
1450 SmallVector<VPReductionPHIRecipe *, 4> Phis;
1451 for (VPRecipeBase &Phi :
1452 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
1453 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &Phi);
1454 if (PhiR && RecurrenceDescriptor::isFindLastRecurrenceKind(
1455 Kind: PhiR->getRecurrenceKind()))
1456 Phis.push_back(Elt: PhiR);
1457 }
1458
1459 if (Phis.empty())
1460 return true;
1461
1462 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1463 for (VPReductionPHIRecipe *PhiR : Phis) {
1464 // Find the condition for the select/blend.
1465 VPValue *BackedgeSelect = PhiR->getBackedgeValue();
1466 VPValue *CondSelect = BackedgeSelect;
1467
1468 // If there's a header mask, the backedge select will not be the find-last
1469 // select.
1470 if (HeaderMask && !match(V: BackedgeSelect,
1471 P: m_Select(Op0: m_Specific(VPV: HeaderMask),
1472 Op1: m_VPValue(V&: CondSelect), Op2: m_Specific(VPV: PhiR))))
1473 llvm_unreachable("expected header mask select");
1474
1475 VPValue *Cond = nullptr, *Op1 = nullptr, *Op2 = nullptr;
1476
1477 // If we're matching a blend rather than a select, there should be one
1478 // incoming value which is the data, then all other incoming values should
1479 // be the phi.
1480 auto MatchBlend = [&](VPRecipeBase *R) {
1481 auto *Blend = dyn_cast<VPBlendRecipe>(Val: R);
1482 if (!Blend)
1483 return false;
1484 assert(!Blend->isNormalized() && "must run before blend normalizaion");
1485 unsigned NumIncomingDataValues = 0;
1486 for (unsigned I = 0; I < Blend->getNumIncomingValues(); ++I) {
1487 VPValue *Incoming = Blend->getIncomingValue(Idx: I);
1488 if (Incoming != PhiR) {
1489 ++NumIncomingDataValues;
1490 Cond = Blend->getMask(Idx: I);
1491 Op1 = Incoming;
1492 Op2 = PhiR;
1493 }
1494 }
1495 return NumIncomingDataValues == 1;
1496 };
1497
1498 VPSingleDefRecipe *SelectR =
1499 cast<VPSingleDefRecipe>(Val: CondSelect->getDefiningRecipe());
1500 if (!match(R: SelectR,
1501 P: m_Select(Op0: m_VPValue(V&: Cond), Op1: m_VPValue(V&: Op1), Op2: m_VPValue(V&: Op2))) &&
1502 !MatchBlend(SelectR))
1503 return false;
1504
1505 assert(Cond != HeaderMask && "Cond must not be HeaderMask");
1506
1507 // Find final reduction computation and replace it with an
1508 // extract.last.active intrinsic.
1509 auto *RdxResult =
1510 vputils::findUserOf<VPInstruction::ComputeReductionResult>(
1511 V: BackedgeSelect);
1512 assert(RdxResult && "Could not find reduction result");
1513
1514 // Add mask phi.
1515 VPBuilder Builder = VPBuilder::getToInsertAfter(R: PhiR);
1516 auto *MaskPHI = new VPWidenPHIRecipe(nullptr, /*Start=*/Plan.getFalse());
1517 Builder.insert(R: MaskPHI);
1518
1519 // Add select for mask.
1520 Builder.setInsertPoint(SelectR);
1521
1522 if (Op1 == PhiR) {
1523 // Normalize to selecting the data operand when the condition is true by
1524 // swapping operands and negating the condition.
1525 std::swap(a&: Op1, b&: Op2);
1526 Cond = Builder.createNot(Operand: Cond);
1527 }
1528 assert(Op2 == PhiR && "data value must be selected if Cond is true");
1529
1530 if (HeaderMask)
1531 Cond = Builder.createLogicalAnd(LHS: HeaderMask, RHS: Cond);
1532
1533 VPValue *AnyOf = Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {Cond});
1534 VPValue *MaskSelect = Builder.createSelect(Cond: AnyOf, TrueVal: Cond, FalseVal: MaskPHI);
1535 MaskPHI->addOperand(Operand: MaskSelect);
1536
1537 // Replace select for data.
1538 VPValue *DataSelect =
1539 Builder.createSelect(Cond: AnyOf, TrueVal: Op1, FalseVal: Op2, DL: SelectR->getDebugLoc());
1540 SelectR->replaceAllUsesWith(New: DataSelect);
1541 PhiR->setBackedgeValue(DataSelect);
1542 SelectR->eraseFromParent();
1543
1544 Builder.setInsertPoint(RdxResult);
1545 auto *ExtractLastActive =
1546 Builder.createNaryOp(Opcode: VPInstruction::ExtractLastActive,
1547 Operands: {PhiR->getStartValue(), DataSelect, MaskSelect},
1548 DL: RdxResult->getDebugLoc());
1549 RdxResult->replaceAllUsesWith(New: ExtractLastActive);
1550 RdxResult->eraseFromParent();
1551 }
1552
1553 return true;
1554}
1555
/// Given a first argmin/argmax pattern with strict predicate consisting of
/// 1) a MinOrMax reduction \p MinOrMaxPhiR producing \p MinOrMaxResult,
/// 2) a wide induction \p WideIV,
/// 3) a FindLastIV reduction \p FindLastIVPhiR using \p WideIV,
/// return the smallest index of the FindLastIV reduction result using UMin,
/// unless \p MinOrMaxResult equals the start value of its MinOrMax reduction.
/// In that case, return the start value of the FindLastIV reduction instead.
/// If \p WideIV is not canonical, a new canonical wide IV is added, and the
/// final result is scaled back to the non-canonical \p WideIV.
/// The final value of the FindLastIV reduction is originally computed using
/// \p FindIVSelect, \p FindIVCmp, and \p FindIVRdxResult, which are replaced
/// and removed.
/// Returns true if the pattern was handled successfully, false otherwise.
static bool handleFirstArgMinOrMax(
    VPlan &Plan, VPReductionPHIRecipe *MinOrMaxPhiR,
    VPReductionPHIRecipe *FindLastIVPhiR, VPWidenIntOrFpInductionRecipe *WideIV,
    VPInstruction *MinOrMaxResult, VPInstruction *FindIVSelect,
    VPRecipeBase *FindIVCmp, VPInstruction *FindIVRdxResult) {
  assert(!FindLastIVPhiR->isInLoop() && !FindLastIVPhiR->isOrdered() &&
         "inloop and ordered reductions not supported");
  assert(FindLastIVPhiR->getVFScaleFactor() == 1 &&
         "FindIV reduction must not be scaled");

  Type *Ty = Plan.getVectorLoopRegion()->getCanonicalIVType();
  // TODO: Support non (i.e., narrower than) canonical IV types.
  // TODO: Emit remarks for failed transformations.
  if (Ty != VPTypeAnalysis(Plan).inferScalarType(V: WideIV))
    return false;

  // The FindLastIV backedge must be a select feeding the IV value (WideIV)
  // through one of its two selected operands; bail out otherwise.
  auto *FindIVSelectR = cast<VPSingleDefRecipe>(
      Val: FindLastIVPhiR->getBackedgeValue()->getDefiningRecipe());
  assert(
      match(FindIVSelectR, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) &&
      "backedge value must be a select");
  if (FindIVSelectR->getOperand(N: 1) != WideIV &&
      FindIVSelectR->getOperand(N: 2) != WideIV)
    return false;

  // If the original wide IV is not canonical, create a new one. The canonical
  // wide IV is guaranteed to not wrap for all lanes that are active in the
  // vector loop.
  if (!WideIV->isCanonical()) {
    VPIRValue *Zero = Plan.getConstantInt(Ty, Val: 0);
    VPIRValue *One = Plan.getConstantInt(Ty, Val: 1);
    // Canonical IV: starts at 0, steps by 1; nuw since it cannot wrap for
    // active lanes.
    auto *WidenCanIV = new VPWidenIntOrFpInductionRecipe(
        nullptr, Zero, One, WideIV->getVFValue(),
        WideIV->getInductionDescriptor(),
        VPIRFlags::WrapFlagsTy(/*HasNUW=*/true, /*HasNSW=*/false),
        WideIV->getDebugLoc());
    WidenCanIV->insertBefore(InsertPos: WideIV);

    // Update the select to use the wide canonical IV.
    FindIVSelectR->setOperand(I: FindIVSelectR->getOperand(N: 1) == WideIV ? 1 : 2,
                              New: WidenCanIV);
  }
  // The start value of the FindLastIV phi is no longer meaningful (the final
  // result is recomputed below), so poison it.
  FindLastIVPhiR->setOperand(I: 0, New: Plan.getOrAddLiveIn(V: PoisonValue::get(T: Ty)));

  // The reduction using MinOrMaxPhiR needs adjusting to compute the correct
  // result:
  //  1. Find the first canonical indices corresponding to partial min/max
  //     values, using loop reductions.
  //  2. Find which of the partial min/max values are equal to the overall
  //     min/max value.
  //  3. Select among the canonical indices those corresponding to the overall
  //     min/max value.
  //  4. Find the first canonical index of overall min/max and scale it back to
  //     the original IV using VPDerivedIVRecipe.
  //  5. If the overall min/max equals the starting min/max, the condition in
  //     the loop was always false, due to being strict; return the start value
  //     of FindLastIVPhiR in that case.
  //
  // For example, we transform two independent reduction result computations
  // for
  //
  // <x1> vector loop: {
  //   vector.body:
  //     ...
  //     ir<%iv> = WIDEN-INDUCTION nuw nsw ir<10>, ir<1>, vp<%0>
  //     WIDEN-REDUCTION-PHI ir<%min.idx> = phi ir<sentinel.min.start>,
  //       ir<%min.idx.next>
  //     WIDEN-REDUCTION-PHI ir<%min.val> = phi ir<100>, ir<%min.val.next>
  //     ....
  //     WIDEN-INTRINSIC ir<%min.val.next> = call llvm.umin(ir<%min.val>, ir<%l>)
  //     WIDEN ir<%min.idx.next> = select ir<%cmp>, ir<%iv>, ir<%min.idx>
  //     ...
  // }
  // Successor(s): middle.block
  //
  // middle.block:
  //   vp<%iv.rdx> = compute-reduction-result (smax) vp<%min.idx.next>
  //   vp<%min.result> = compute-reduction-result (umin) ir<%min.val.next>
  //   vp<%cmp> = icmp ne vp<%iv.rdx>, ir<sentinel.min.start>
  //   vp<%find.iv.result> = select vp<%cmp>, vp<%iv.rdx>, ir<10>
  //
  //
  // Into:
  //
  //   vp<%reduced.min> = compute-reduction-result (umin) ir<%min.val.next>
  //   vp<%reduced.mins.mask> = icmp eq ir<%min.val.next>, vp<%reduced.min>
  //   vp<%idxs2reduce> = select vp<%reduced.mins.mask>, ir<%min.idx.next>,
  //     ir<MaxUInt>
  //   vp<%reduced.idx> = compute-reduction-result (umin) vp<%idxs2reduce>
  //   vp<%scaled.idx> = DERIVED-IV ir<20> + vp<%reduced.idx> * ir<1>
  //   vp<%always.false> = icmp eq vp<%reduced.min>, ir<100>
  //   vp<%final.idx> = select vp<%always.false>, ir<10>,
  //     vp<%scaled.idx>

  VPBuilder Builder(FindIVRdxResult);
  // Lanes whose partial min/max equals the overall result are candidates for
  // the first-index reduction; all other lanes get MaxUInt so UMin ignores
  // them.
  VPValue *MinOrMaxExiting = MinOrMaxResult->getOperand(N: 0);
  auto *FinalMinOrMaxCmp =
      Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: MinOrMaxExiting, B: MinOrMaxResult);
  VPValue *LastIVExiting = FindIVRdxResult->getOperand(N: 0);
  VPValue *MaxIV =
      Plan.getConstantInt(Val: APInt::getMaxValue(numBits: Ty->getIntegerBitWidth()));
  auto *FinalIVSelect =
      Builder.createSelect(Cond: FinalMinOrMaxCmp, TrueVal: LastIVExiting, FalseVal: MaxIV);
  // UMin picks the *first* (smallest) canonical index among candidates.
  VPIRFlags RdxFlags(RecurKind::UMin, false, false, FastMathFlags());
  VPSingleDefRecipe *FinalCanIV = Builder.createNaryOp(
      Opcode: VPInstruction::ComputeReductionResult, Operands: {FinalIVSelect}, Flags: RdxFlags,
      DL: FindIVRdxResult->getDebugLoc());

  // If we used a new wide canonical IV convert the reduction result back to the
  // original IV scale before the final select.
  if (!WideIV->isCanonical()) {
    auto *DerivedIVRecipe =
        new VPDerivedIVRecipe(InductionDescriptor::IK_IntInduction,
                              nullptr, // No FPBinOp for integer induction
                              WideIV->getStartValue(), FinalCanIV,
                              WideIV->getStepValue(), "derived.iv.result");
    DerivedIVRecipe->insertBefore(InsertPos: &*Builder.getInsertPoint());
    FinalCanIV = DerivedIVRecipe;
  }

  // If the final min/max value matches its start value, the condition in the
  // loop was always false, i.e. no induction value has been selected. If that's
  // the case, set the result of the IV reduction to its start value.
  VPValue *AlwaysFalse = Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: MinOrMaxResult,
                                            B: MinOrMaxPhiR->getStartValue());
  VPValue *FinalIV = Builder.createSelect(
      Cond: AlwaysFalse, TrueVal: FindIVSelect->getOperand(N: 2), FalseVal: FinalCanIV);
  FindIVSelect->replaceAllUsesWith(New: FinalIV);

  // Erase the old FindIV result pattern which is now dead.
  FindIVSelect->eraseFromParent();
  FindIVCmp->eraseFromParent();
  FindIVRdxResult->eraseFromParent();
  return true;
}
1704
/// Handle min/max reduction phis that have users outside their own reduction
/// chain. The only supported extra user is the compare of a FindLastIV
/// reduction (an argmin/argmax-style pattern): the FindLastIV result is then
/// recomputed from the min/max reduction result in the middle block. Strict
/// predicates (<, >) are delegated to handleFirstArgMinOrMax; non-strict
/// predicates are handled inline below by masking the FindLastIV input with
/// lanes equal to the overall min/max. Returns false (abandoning the plan) if
/// any such phi does not match a supported pattern.
bool VPlanTransforms::handleMultiUseReductions(VPlan &Plan,
                                               OptimizationRemarkEmitter *ORE,
                                               Loop *TheLoop) {
  // early_inc_range: handleFirstArgMinOrMax may insert recipes around the
  // current phi while we iterate.
  for (auto &PhiR : make_early_inc_range(
           Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis())) {
    auto *MinOrMaxPhiR = dyn_cast<VPReductionPHIRecipe>(Val: &PhiR);
    // TODO: check for multi-uses in VPlan directly.
    if (!MinOrMaxPhiR || !MinOrMaxPhiR->hasUsesOutsideReductionChain())
      continue;

    // MinOrMaxPhiR has users outside the reduction cycle in the loop. Check if
    // the only other user is a FindLastIV reduction. MinOrMaxPhiR must have
    // exactly 2 users:
    //  1) the min/max operation of the reduction cycle, and
    //  2) the compare of a FindLastIV reduction cycle. This compare must match
    //     the min/max operation - comparing MinOrMaxPhiR with the operand of
    //     the min/max operation, and be used only by the select of the
    //     FindLastIV reduction cycle.
    RecurKind RdxKind = MinOrMaxPhiR->getRecurrenceKind();
    assert(
        RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind) &&
        "only min/max recurrences support users outside the reduction chain");

    auto *MinOrMaxOp =
        dyn_cast<VPRecipeWithIRFlags>(Val: MinOrMaxPhiR->getBackedgeValue());
    if (!MinOrMaxOp)
      return false;

    // Check that MinOrMaxOp is a VPWidenIntrinsicRecipe or VPReplicateRecipe
    // with an intrinsic that matches the reduction kind.
    Intrinsic::ID ExpectedIntrinsicID = getMinMaxReductionIntrinsicOp(RK: RdxKind);
    if (!match(V: MinOrMaxOp, P: m_Intrinsic(IntrID: ExpectedIntrinsicID)))
      return false;

    // MinOrMaxOp must have 2 users: 1) MinOrMaxPhiR and 2)
    // ComputeReductionResult.
    assert(MinOrMaxOp->getNumUsers() == 2 &&
           "MinOrMaxOp must have exactly 2 users");
    // The non-phi operand of the min/max is the loaded data value.
    VPValue *MinOrMaxOpValue = MinOrMaxOp->getOperand(N: 0);
    if (MinOrMaxOpValue == MinOrMaxPhiR)
      MinOrMaxOpValue = MinOrMaxOp->getOperand(N: 1);

    // Find the single compare of MinOrMaxPhiR; one of its operands must be the
    // data value that also feeds the min/max intrinsic.
    VPValue *CmpOpA;
    VPValue *CmpOpB;
    CmpPredicate Pred;
    auto *Cmp = dyn_cast_or_null<VPRecipeWithIRFlags>(Val: vputils::findUserOf(
        V: MinOrMaxPhiR, P: m_Cmp(Pred, Op0: m_VPValue(V&: CmpOpA), Op1: m_VPValue(V&: CmpOpB))));
    if (!Cmp || Cmp->getNumUsers() != 1 ||
        (CmpOpA != MinOrMaxOpValue && CmpOpB != MinOrMaxOpValue))
      return false;

    // Canonicalize so that the data value is the RHS of the compare.
    if (MinOrMaxOpValue != CmpOpB)
      Pred = CmpInst::getSwappedPredicate(pred: Pred);

    // MinOrMaxPhiR must have exactly 2 users:
    //  * MinOrMaxOp,
    //  * Cmp (that's part of a FindLastIV chain).
    if (MinOrMaxPhiR->getNumUsers() != 2)
      return false;

    VPInstruction *MinOrMaxResult =
        vputils::findUserOf<VPInstruction::ComputeReductionResult>(V: MinOrMaxOp);
    assert(is_contained(MinOrMaxPhiR->users(), MinOrMaxOp) &&
           "one user must be MinOrMaxOp");
    assert(MinOrMaxResult && "MinOrMaxResult must be a user of MinOrMaxOp");

    // Cmp must be used by the select of a FindLastIV chain.
    VPValue *Sel = dyn_cast<VPSingleDefRecipe>(Val: Cmp->getSingleUser());
    VPValue *IVOp, *FindIV;
    if (!Sel || Sel->getNumUsers() != 2 ||
        !match(V: Sel,
               P: m_Select(Op0: m_Specific(VPV: Cmp), Op1: m_VPValue(V&: IVOp), Op2: m_VPValue(V&: FindIV))))
      return false;

    // Canonicalize so that FindIV is the false-operand of the select; the
    // predicate is inverted to compensate for the swap.
    if (!isa<VPReductionPHIRecipe>(Val: FindIV)) {
      std::swap(a&: FindIV, b&: IVOp);
      Pred = CmpInst::getInversePredicate(pred: Pred);
    }

    auto *FindIVPhiR = dyn_cast<VPReductionPHIRecipe>(Val: FindIV);
    if (!FindIVPhiR || !RecurrenceDescriptor::isFindIVRecurrenceKind(
                           Kind: FindIVPhiR->getRecurrenceKind()))
      return false;

    assert(!FindIVPhiR->isInLoop() && !FindIVPhiR->isOrdered() &&
           "cannot handle inloop/ordered reductions yet");

    // Check if FindIVPhiR is a FindLast pattern by checking the MinMaxKind
    // on its ComputeReductionResult. SMax/UMax indicates FindLast.
    VPInstruction *FindIVResult =
        vputils::findUserOf<VPInstruction::ComputeReductionResult>(
            V: FindIVPhiR->getBackedgeValue());
    assert(FindIVResult &&
           "must be able to retrieve the FindIVResult VPInstruction");
    RecurKind FindIVMinMaxKind = FindIVResult->getRecurKind();
    if (FindIVMinMaxKind != RecurKind::SMax &&
        FindIVMinMaxKind != RecurKind::UMax)
      return false;

    // TODO: Support cases where IVOp is the IV increment.
    // Strip an optional truncate; the select's IV operand must be a wide
    // int/FP induction.
    if (!match(V: IVOp, P: m_TruncOrSelf(Op0: m_VPValue(V&: IVOp))) ||
        !isa<VPWidenIntOrFpInductionRecipe>(Val: IVOp))
      return false;

    // Check if the predicate is compatible with the reduction kind, i.e. the
    // compare selects the new IV exactly when the min/max accepts the new
    // data value.
    bool IsValidKindPred = [RdxKind, Pred]() {
      switch (RdxKind) {
      case RecurKind::UMin:
        return Pred == CmpInst::ICMP_UGE || Pred == CmpInst::ICMP_UGT;
      case RecurKind::UMax:
        return Pred == CmpInst::ICMP_ULE || Pred == CmpInst::ICMP_ULT;
      case RecurKind::SMax:
        return Pred == CmpInst::ICMP_SLE || Pred == CmpInst::ICMP_SLT;
      case RecurKind::SMin:
        return Pred == CmpInst::ICMP_SGE || Pred == CmpInst::ICMP_SGT;
      default:
        llvm_unreachable("unhandled recurrence kind");
      }
    }();
    if (!IsValidKindPred) {
      // NOTE(review): assumes ORE is non-null here — verify all callers pass a
      // valid remark emitter.
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkMissed(
                   DEBUG_TYPE, "VectorizationMultiUseReductionPredicate",
                   TheLoop->getStartLoc(), TheLoop->getHeader())
               << "Multi-use reduction with predicate "
               << CmpInst::getPredicateName(P: Pred)
               << " incompatible with reduction kind";
      });
      return false;
    }

    // Locate the middle-block recipes computing the FindLastIV result:
    // select <- cmp <- compute-reduction-result.
    auto *FindIVSelect = findFindIVSelect(BackedgeVal: FindIVPhiR->getBackedgeValue());
    auto *FindIVCmp = FindIVSelect->getOperand(N: 0)->getDefiningRecipe();
    auto *FindIVRdxResult = cast<VPInstruction>(Val: FindIVCmp->getOperand(N: 0));
    assert(FindIVSelect->getParent() == MinOrMaxResult->getParent() &&
           "both results must be computed in the same block");
    // Reducing to a scalar min or max value is placed right before reducing to
    // its scalar iteration, in order to generate instructions that use both
    // their operands.
    MinOrMaxResult->moveBefore(BB&: *FindIVRdxResult->getParent(),
                               I: FindIVRdxResult->getIterator());

    // Strict predicates need the first-match (argmin/argmax) handling.
    bool IsStrictPredicate = ICmpInst::isLT(P: Pred) || ICmpInst::isGT(P: Pred);
    if (IsStrictPredicate) {
      if (!handleFirstArgMinOrMax(Plan, MinOrMaxPhiR, FindLastIVPhiR: FindIVPhiR,
                                  WideIV: cast<VPWidenIntOrFpInductionRecipe>(Val: IVOp),
                                  MinOrMaxResult, FindIVSelect, FindIVCmp,
                                  FindIVRdxResult))
        return false;
      continue;
    }

    // The reduction using MinOrMaxPhiR needs adjusting to compute the correct
    // result:
    //  1. We need to find the last IV for which the condition based on the
    //     min/max recurrence is true,
    //  2. Compare the partial min/max reduction result to its final value and,
    //  3. Select the lanes of the partial FindLastIV reductions which
    //     correspond to the lanes matching the min/max reduction result.
    //
    // For example, this transforms
    //   vp<%min.result> = compute-reduction-result ir<%min.val.next>
    //   vp<%iv.rdx> = compute-reduction-result (smax) vp<%min.idx.next>
    //   vp<%cmp> = icmp ne vp<%iv.rdx>, SENTINEL
    //   vp<%find.iv.result> = select vp<%cmp>, vp<%iv.rdx>, ir<0>
    //
    // into:
    //
    //   vp<min.result> = compute-reduction-result ir<%min.val.next>
    //   vp<%final.min.cmp> = icmp eq ir<%min.val.next>, vp<min.result>
    //   vp<%final.iv> = select vp<%final.min.cmp>, vp<%min.idx.next>, SENTINEL
    //   vp<%iv.rdx> = compute-reduction-result (smax) vp<%final.iv>
    //   vp<%cmp> = icmp ne vp<%iv.rdx>, SENTINEL
    //   vp<%find.iv.result> = select vp<%cmp>, vp<%iv.rdx>, ir<0>
    //
    VPBuilder B(FindIVRdxResult);
    VPValue *MinOrMaxExiting = MinOrMaxResult->getOperand(N: 0);
    auto *FinalMinOrMaxCmp =
        B.createICmp(Pred: CmpInst::ICMP_EQ, A: MinOrMaxExiting, B: MinOrMaxResult);
    // Lanes not matching the overall min/max get the sentinel so the SMax/UMax
    // reduction ignores them.
    VPValue *Sentinel = FindIVCmp->getOperand(N: 1);
    VPValue *LastIVExiting = FindIVRdxResult->getOperand(N: 0);
    auto *FinalIVSelect =
        B.createSelect(Cond: FinalMinOrMaxCmp, TrueVal: LastIVExiting, FalseVal: Sentinel);
    FindIVRdxResult->setOperand(I: 0, New: FinalIVSelect);
  }
  return true;
}
1892