1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/PostOrderIterator.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
29#include "llvm/Analysis/IVDescriptors.h"
30#include "llvm/Analysis/InstSimplifyFolder.h"
31#include "llvm/Analysis/LoopInfo.h"
32#include "llvm/Analysis/VectorUtils.h"
33#include "llvm/IR/Intrinsics.h"
34#include "llvm/IR/MDBuilder.h"
35#include "llvm/IR/PatternMatch.h"
36#include "llvm/Support/Casting.h"
37#include "llvm/Support/TypeSize.h"
38
39using namespace llvm;
40
41bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
42 VPlanPtr &Plan,
43 function_ref<const InductionDescriptor *(PHINode *)>
44 GetIntOrFpInductionDescriptor,
45 ScalarEvolution &SE, const TargetLibraryInfo &TLI) {
46
47 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
48 Plan->getVectorLoopRegion());
49 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
50 // Skip blocks outside region
51 if (!VPBB->getParent())
52 break;
53 VPRecipeBase *Term = VPBB->getTerminator();
54 auto EndIter = Term ? Term->getIterator() : VPBB->end();
55 // Introduce each ingredient into VPlan.
56 for (VPRecipeBase &Ingredient :
57 make_early_inc_range(Range: make_range(x: VPBB->begin(), y: EndIter))) {
58
59 VPValue *VPV = Ingredient.getVPSingleValue();
60 if (!VPV->getUnderlyingValue())
61 continue;
62
63 Instruction *Inst = cast<Instruction>(Val: VPV->getUnderlyingValue());
64
65 VPRecipeBase *NewRecipe = nullptr;
66 if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &Ingredient)) {
67 auto *Phi = cast<PHINode>(Val: VPPhi->getUnderlyingValue());
68 const auto *II = GetIntOrFpInductionDescriptor(Phi);
69 if (!II)
70 continue;
71
72 VPValue *Start = Plan->getOrAddLiveIn(V: II->getStartValue());
73 VPValue *Step =
74 vputils::getOrCreateVPValueForSCEVExpr(Plan&: *Plan, Expr: II->getStep(), SE);
75 NewRecipe = new VPWidenIntOrFpInductionRecipe(
76 Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
77 } else {
78 assert(isa<VPInstruction>(&Ingredient) &&
79 "only VPInstructions expected here");
80 assert(!isa<PHINode>(Inst) && "phis should be handled above");
81 // Create VPWidenMemoryRecipe for loads and stores.
82 if (LoadInst *Load = dyn_cast<LoadInst>(Val: Inst)) {
83 NewRecipe = new VPWidenLoadRecipe(
84 *Load, Ingredient.getOperand(N: 0), nullptr /*Mask*/,
85 false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
86 Ingredient.getDebugLoc());
87 } else if (StoreInst *Store = dyn_cast<StoreInst>(Val: Inst)) {
88 NewRecipe = new VPWidenStoreRecipe(
89 *Store, Ingredient.getOperand(N: 1), Ingredient.getOperand(N: 0),
90 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
91 VPIRMetadata(*Store), Ingredient.getDebugLoc());
92 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
93 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
94 } else if (CallInst *CI = dyn_cast<CallInst>(Val: Inst)) {
95 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, TLI: &TLI);
96 if (VectorID == Intrinsic::not_intrinsic)
97 return false;
98 NewRecipe = new VPWidenIntrinsicRecipe(
99 *CI, VectorID,
100 {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
101 CI->getDebugLoc());
102 } else if (SelectInst *SI = dyn_cast<SelectInst>(Val: Inst)) {
103 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
104 } else if (auto *CI = dyn_cast<CastInst>(Val: Inst)) {
105 NewRecipe = new VPWidenCastRecipe(
106 CI->getOpcode(), Ingredient.getOperand(N: 0), CI->getType(), *CI);
107 } else {
108 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
109 }
110 }
111
112 NewRecipe->insertBefore(InsertPos: &Ingredient);
113 if (NewRecipe->getNumDefinedValues() == 1)
114 VPV->replaceAllUsesWith(New: NewRecipe->getVPSingleValue());
115 else
116 assert(NewRecipe->getNumDefinedValues() == 0 &&
117 "Only recpies with zero or one defined values expected");
118 Ingredient.eraseFromParent();
119 }
120 }
121 return true;
122}
123
124static bool sinkScalarOperands(VPlan &Plan) {
125 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
126 bool Changed = false;
127 // First, collect the operands of all recipes in replicate blocks as seeds for
128 // sinking.
129 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
130 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Range: Iter)) {
131 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
132 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
133 continue;
134 VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Val: EntryVPBB->getSuccessors()[0]);
135 if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
136 continue;
137 for (auto &Recipe : *VPBB) {
138 for (VPValue *Op : Recipe.operands())
139 if (auto *Def =
140 dyn_cast_or_null<VPSingleDefRecipe>(Val: Op->getDefiningRecipe()))
141 WorkList.insert(X: std::make_pair(x&: VPBB, y&: Def));
142 }
143 }
144
145 bool ScalarVFOnly = Plan.hasScalarVFOnly();
146 // Try to sink each replicate or scalar IV steps recipe in the worklist.
147 for (unsigned I = 0; I != WorkList.size(); ++I) {
148 VPBasicBlock *SinkTo;
149 VPSingleDefRecipe *SinkCandidate;
150 std::tie(args&: SinkTo, args&: SinkCandidate) = WorkList[I];
151 if (SinkCandidate->getParent() == SinkTo ||
152 SinkCandidate->mayHaveSideEffects() ||
153 SinkCandidate->mayReadOrWriteMemory())
154 continue;
155 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: SinkCandidate)) {
156 if (!ScalarVFOnly && RepR->isSingleScalar())
157 continue;
158 } else if (!isa<VPScalarIVStepsRecipe>(Val: SinkCandidate))
159 continue;
160
161 bool NeedsDuplicating = false;
162 // All recipe users of the sink candidate must be in the same block SinkTo,
163 // or all users outside of SinkTo must be uniform-after-vectorization
164 // (i.e., only their first lane is used). In the latter case, SinkCandidate
165 // needs to be duplicated.
166 auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
167 SinkCandidate](VPUser *U) {
168 auto *UI = cast<VPRecipeBase>(Val: U);
169 if (UI->getParent() == SinkTo)
170 return true;
171 NeedsDuplicating = UI->onlyFirstLaneUsed(Op: SinkCandidate);
172 // We only know how to duplicate VPReplicateRecipes and
173 // VPScalarIVStepsRecipes for now.
174 return NeedsDuplicating &&
175 isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Val: SinkCandidate);
176 };
177 if (!all_of(Range: SinkCandidate->users(), P: CanSinkWithUser))
178 continue;
179
180 if (NeedsDuplicating) {
181 if (ScalarVFOnly)
182 continue;
183 VPSingleDefRecipe *Clone;
184 if (auto *SinkCandidateRepR =
185 dyn_cast<VPReplicateRecipe>(Val: SinkCandidate)) {
186 // TODO: Handle converting to uniform recipes as separate transform,
187 // then cloning should be sufficient here.
188 Instruction *I = SinkCandidate->getUnderlyingInstr();
189 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
190 nullptr /*Mask*/, *SinkCandidateRepR);
191 // TODO: add ".cloned" suffix to name of Clone's VPValue.
192 } else {
193 Clone = SinkCandidate->clone();
194 }
195
196 Clone->insertBefore(InsertPos: SinkCandidate);
197 SinkCandidate->replaceUsesWithIf(New: Clone, ShouldReplace: [SinkTo](VPUser &U, unsigned) {
198 return cast<VPRecipeBase>(Val: &U)->getParent() != SinkTo;
199 });
200 }
201 SinkCandidate->moveBefore(BB&: *SinkTo, I: SinkTo->getFirstNonPhi());
202 for (VPValue *Op : SinkCandidate->operands())
203 if (auto *Def =
204 dyn_cast_or_null<VPSingleDefRecipe>(Val: Op->getDefiningRecipe()))
205 WorkList.insert(X: std::make_pair(x&: SinkTo, y&: Def));
206 Changed = true;
207 }
208 return Changed;
209}
210
211/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
212/// the mask.
213VPValue *getPredicatedMask(VPRegionBlock *R) {
214 auto *EntryBB = dyn_cast<VPBasicBlock>(Val: R->getEntry());
215 if (!EntryBB || EntryBB->size() != 1 ||
216 !isa<VPBranchOnMaskRecipe>(Val: EntryBB->begin()))
217 return nullptr;
218
219 return cast<VPBranchOnMaskRecipe>(Val: &*EntryBB->begin())->getOperand(N: 0);
220}
221
222/// If \p R is a triangle region, return the 'then' block of the triangle.
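/// For illustration, a triangle region roughly has the following shape
/// (block names are made up for this sketch):
///
///   entry         (BranchOnMask %mask)
///    |   \
///    |    then    (predicated recipes)
///    |   /
///   exiting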
223static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
224 auto *EntryBB = cast<VPBasicBlock>(Val: R->getEntry());
225 if (EntryBB->getNumSuccessors() != 2)
226 return nullptr;
227
228 auto *Succ0 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[0]);
229 auto *Succ1 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[1]);
230 if (!Succ0 || !Succ1)
231 return nullptr;
232
233 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
234 return nullptr;
235 if (Succ0->getSingleSuccessor() == Succ1)
236 return Succ0;
237 if (Succ1->getSingleSuccessor() == Succ0)
238 return Succ1;
239 return nullptr;
240}
241
242 // Merge replicate regions into their successor region, if a replicate region
243 // is connected to a successor replicate region with the same predicate by a
244 // single, empty VPBasicBlock.
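//
// For illustration (sketch only): the transform roughly turns
//   region1 (mask %m) -> empty.block -> region2 (mask %m)
// into
//   empty.block -> region2 (mask %m)
// by moving region1's recipes into region2's 'then' block and reconnecting
// region1's predecessors to the empty block.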
245static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
246 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
247
248 // Collect replicate regions followed by an empty block, followed by another
249 // replicate region with a matching mask, to process up front. This is to avoid
250 // iterator invalidation issues while merging regions.
251 SmallVector<VPRegionBlock *, 8> WorkList;
252 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
253 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
254 if (!Region1->isReplicator())
255 continue;
256 auto *MiddleBasicBlock =
257 dyn_cast_or_null<VPBasicBlock>(Val: Region1->getSingleSuccessor());
258 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
259 continue;
260
261 auto *Region2 =
262 dyn_cast_or_null<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor());
263 if (!Region2 || !Region2->isReplicator())
264 continue;
265
266 VPValue *Mask1 = getPredicatedMask(R: Region1);
267 VPValue *Mask2 = getPredicatedMask(R: Region2);
268 if (!Mask1 || Mask1 != Mask2)
269 continue;
270
271 assert(Mask1 && Mask2 && "both regions must have conditions");
272 WorkList.push_back(Elt: Region1);
273 }
274
275 // Move recipes from Region1 to its successor region, if both are triangles.
276 for (VPRegionBlock *Region1 : WorkList) {
277 if (TransformedRegions.contains(Ptr: Region1))
278 continue;
279 auto *MiddleBasicBlock = cast<VPBasicBlock>(Val: Region1->getSingleSuccessor());
280 auto *Region2 = cast<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor());
281
282 VPBasicBlock *Then1 = getPredicatedThenBlock(R: Region1);
283 VPBasicBlock *Then2 = getPredicatedThenBlock(R: Region2);
284 if (!Then1 || !Then2)
285 continue;
286
287 // Note: No fusion-preventing memory dependencies are expected in either
288 // region. Such dependencies should be rejected during earlier dependence
289 // checks, which guarantee accesses can be re-ordered for vectorization.
290 //
291 // Move recipes to the successor region.
292 for (VPRecipeBase &ToMove : make_early_inc_range(Range: reverse(C&: *Then1)))
293 ToMove.moveBefore(BB&: *Then2, I: Then2->getFirstNonPhi());
294
295 auto *Merge1 = cast<VPBasicBlock>(Val: Then1->getSingleSuccessor());
296 auto *Merge2 = cast<VPBasicBlock>(Val: Then2->getSingleSuccessor());
297
298 // Move VPPredInstPHIRecipes from the merge block to the successor region's
299 // merge block. Update all users inside the successor region to use the
300 // original values.
301 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(Range: reverse(C&: *Merge1))) {
302 VPValue *PredInst1 =
303 cast<VPPredInstPHIRecipe>(Val: &Phi1ToMove)->getOperand(N: 0);
304 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
305 Phi1ToMoveV->replaceUsesWithIf(New: PredInst1, ShouldReplace: [Then2](VPUser &U, unsigned) {
306 return cast<VPRecipeBase>(Val: &U)->getParent() == Then2;
307 });
308
309 // Remove phi recipes that are unused after merging the regions.
310 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
311 Phi1ToMove.eraseFromParent();
312 continue;
313 }
314 Phi1ToMove.moveBefore(BB&: *Merge2, I: Merge2->begin());
315 }
316
317 // Remove the dead recipes in Region1's entry block.
318 for (VPRecipeBase &R :
319 make_early_inc_range(Range: reverse(C&: *Region1->getEntryBasicBlock())))
320 R.eraseFromParent();
321
322 // Finally, remove the first region.
323 for (VPBlockBase *Pred : make_early_inc_range(Range&: Region1->getPredecessors())) {
324 VPBlockUtils::disconnectBlocks(From: Pred, To: Region1);
325 VPBlockUtils::connectBlocks(From: Pred, To: MiddleBasicBlock);
326 }
327 VPBlockUtils::disconnectBlocks(From: Region1, To: MiddleBasicBlock);
328 TransformedRegions.insert(Ptr: Region1);
329 }
330
331 return !TransformedRegions.empty();
332}
333
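/// Build an if-then replicate region for \p PredRecipe. For illustration, for
/// a predicated load the resulting region roughly looks like the sketch below
/// (recipe spellings are approximate):
///
///   pred.load.entry:
///     BRANCH-ON-MASK %mask
///   pred.load.if:
///     REPLICATE %l = load ...        (mask dropped)
///   pred.load.continue:
///     PHI-PREDICATED-INSTRUCTION %p = %l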
334static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
335 VPlan &Plan) {
336 Instruction *Instr = PredRecipe->getUnderlyingInstr();
337 // Build the triangular if-then region.
338 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
339 assert(Instr->getParent() && "Predicated instruction not in any basic block");
340 auto *BlockInMask = PredRecipe->getMask();
341 auto *MaskDef = BlockInMask->getDefiningRecipe();
342 auto *BOMRecipe = new VPBranchOnMaskRecipe(
343 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc());
344 auto *Entry =
345 Plan.createVPBasicBlock(Name: Twine(RegionName) + ".entry", Recipe: BOMRecipe);
346
347 // Replace predicated replicate recipe with a replicate recipe without a
348 // mask but in the replicate region.
349 auto *RecipeWithoutMask = new VPReplicateRecipe(
350 PredRecipe->getUnderlyingInstr(),
351 make_range(x: PredRecipe->op_begin(), y: std::prev(x: PredRecipe->op_end())),
352 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
353 auto *Pred =
354 Plan.createVPBasicBlock(Name: Twine(RegionName) + ".if", Recipe: RecipeWithoutMask);
355
356 VPPredInstPHIRecipe *PHIRecipe = nullptr;
357 if (PredRecipe->getNumUsers() != 0) {
358 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
359 RecipeWithoutMask->getDebugLoc());
360 PredRecipe->replaceAllUsesWith(New: PHIRecipe);
361 PHIRecipe->setOperand(I: 0, New: RecipeWithoutMask);
362 }
363 PredRecipe->eraseFromParent();
364 auto *Exiting =
365 Plan.createVPBasicBlock(Name: Twine(RegionName) + ".continue", Recipe: PHIRecipe);
366 VPRegionBlock *Region =
367 Plan.createVPRegionBlock(Entry, Exiting, Name: RegionName, IsReplicator: true);
368
369 // Note: first set Entry as region entry and then connect successors starting
370 // from it in order, to propagate the "parent" of each VPBasicBlock.
371 VPBlockUtils::insertTwoBlocksAfter(IfTrue: Pred, IfFalse: Exiting, BlockPtr: Entry);
372 VPBlockUtils::connectBlocks(From: Pred, To: Exiting);
373
374 return Region;
375}
376
377static void addReplicateRegions(VPlan &Plan) {
378 SmallVector<VPReplicateRecipe *> WorkList;
379 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
380 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
381 for (VPRecipeBase &R : *VPBB)
382 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
383 if (RepR->isPredicated())
384 WorkList.push_back(Elt: RepR);
385 }
386 }
387
388 unsigned BBNum = 0;
389 for (VPReplicateRecipe *RepR : WorkList) {
390 VPBasicBlock *CurrentBlock = RepR->getParent();
391 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(SplitAt: RepR->getIterator());
392
393 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
394 SplitBlock->setName(
395 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
396 // Wrap the predicated recipe in an if-then replicate region.
397 VPRegionBlock *Region = createReplicateRegion(PredRecipe: RepR, Plan);
398 Region->setParent(CurrentBlock->getParent());
399 VPBlockUtils::insertOnEdge(From: CurrentBlock, To: SplitBlock, BlockPtr: Region);
400
401 VPRegionBlock *ParentRegion = Region->getParent();
402 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
403 ParentRegion->setExiting(SplitBlock);
404 }
405}
406
407/// Remove redundant VPBasicBlocks by merging them into their predecessor if
408/// the predecessor has a single successor.
409static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
410 SmallVector<VPBasicBlock *> WorkList;
411 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
412 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
413 // Don't fold the blocks in the skeleton of the Plan into their single
414 // predecessors for now.
415 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
416 if (!VPBB->getParent())
417 continue;
418 auto *PredVPBB =
419 dyn_cast_or_null<VPBasicBlock>(Val: VPBB->getSinglePredecessor());
420 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
421 isa<VPIRBasicBlock>(Val: PredVPBB))
422 continue;
423 WorkList.push_back(Elt: VPBB);
424 }
425
426 for (VPBasicBlock *VPBB : WorkList) {
427 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(Val: VPBB->getSinglePredecessor());
428 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB))
429 R.moveBefore(BB&: *PredVPBB, I: PredVPBB->end());
430 VPBlockUtils::disconnectBlocks(From: PredVPBB, To: VPBB);
431 auto *ParentRegion = VPBB->getParent();
432 if (ParentRegion && ParentRegion->getExiting() == VPBB)
433 ParentRegion->setExiting(PredVPBB);
434 for (auto *Succ : to_vector(Range: VPBB->successors())) {
435 VPBlockUtils::disconnectBlocks(From: VPBB, To: Succ);
436 VPBlockUtils::connectBlocks(From: PredVPBB, To: Succ);
437 }
438 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
439 }
440 return !WorkList.empty();
441}
442
443void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
444 // Convert masked VPReplicateRecipes to if-then region blocks.
445 addReplicateRegions(Plan);
446
447 bool ShouldSimplify = true;
448 while (ShouldSimplify) {
449 ShouldSimplify = sinkScalarOperands(Plan);
450 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
451 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
452 }
453}
454
455/// Remove redundant casts of inductions.
456///
457/// Such redundant casts are casts of induction variables that can be ignored,
458/// because we already proved that the casted phi is equal to the uncasted phi
459/// in the vectorized loop. There is no need to vectorize the cast - the same
460/// value can be used for both the phi and casts in the vector loop.
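/// For example (sketch): if the induction descriptor recorded the cast chain
///   %c = sext i32 %iv to i64
/// the recipe producing %c is located among the users of the widened IV and
/// all of %c's uses are rewired to the IV itself; the now-dead cast recipe is
/// removed by later dead-recipe cleanup.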
461static void removeRedundantInductionCasts(VPlan &Plan) {
462 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
463 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
464 if (!IV || IV->getTruncInst())
465 continue;
466
467 // A sequence of IR Casts has potentially been recorded for IV, which
468 // *must be bypassed* when the IV is vectorized, because the vectorized IV
469 // will produce the desired casted value. This sequence forms a def-use
470 // chain and is provided in reverse order, ending with the cast that uses
471 // the IV phi. Search for the recipe of the last cast in the chain and
472 // replace it with the original IV. Note that only the final cast is
473 // expected to have users outside the cast-chain and the dead casts left
474 // over will be cleaned up later.
475 auto &Casts = IV->getInductionDescriptor().getCastInsts();
476 VPValue *FindMyCast = IV;
477 for (Instruction *IRCast : reverse(C: Casts)) {
478 VPSingleDefRecipe *FoundUserCast = nullptr;
479 for (auto *U : FindMyCast->users()) {
480 auto *UserCast = dyn_cast<VPSingleDefRecipe>(Val: U);
481 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
482 FoundUserCast = UserCast;
483 break;
484 }
485 }
486 FindMyCast = FoundUserCast;
487 }
488 FindMyCast->replaceAllUsesWith(New: IV);
489 }
490}
491
492/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
493/// recipe, if it exists.
494static void removeRedundantCanonicalIVs(VPlan &Plan) {
495 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
496 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
497 for (VPUser *U : CanonicalIV->users()) {
498 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(Val: U);
499 if (WidenNewIV)
500 break;
501 }
502
503 if (!WidenNewIV)
504 return;
505
506 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
507 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
508 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
509
510 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
511 continue;
512
513 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
514 // everything WidenNewIV's users need. That is, WidenOriginalIV will
515 // generate a vector phi or all users of WidenNewIV demand the first lane
516 // only.
517 if (any_of(Range: WidenOriginalIV->users(),
518 P: [WidenOriginalIV](VPUser *U) {
519 return !U->usesScalars(Op: WidenOriginalIV);
520 }) ||
521 vputils::onlyFirstLaneUsed(Def: WidenNewIV)) {
522 WidenNewIV->replaceAllUsesWith(New: WidenOriginalIV);
523 WidenNewIV->eraseFromParent();
524 return;
525 }
526 }
527}
528
529/// Returns true if \p R is dead and can be removed.
530static bool isDeadRecipe(VPRecipeBase &R) {
531 using namespace llvm::PatternMatch;
532 // Do remove conditional assume instructions as their conditions may be
533 // flattened.
534 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
535 bool IsConditionalAssume =
536 RepR && RepR->isPredicated() &&
537 match(V: RepR->getUnderlyingInstr(), P: m_Intrinsic<Intrinsic::assume>());
538 if (IsConditionalAssume)
539 return true;
540
541 if (R.mayHaveSideEffects())
542 return false;
543
544 // Recipe is dead if no user keeps the recipe alive.
545 return all_of(Range: R.definedValues(),
546 P: [](VPValue *V) { return V->getNumUsers() == 0; });
547}
548
549void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
550 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
551 Plan.getEntry());
552
553 for (VPBasicBlock *VPBB : reverse(C: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT))) {
554 // The recipes in the block are processed in reverse order, to catch chains
555 // of dead recipes.
556 for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
557 if (isDeadRecipe(R))
558 R.eraseFromParent();
559 }
560 }
561}
562
563static VPScalarIVStepsRecipe *
564createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
565 Instruction::BinaryOps InductionOpcode,
566 FPMathOperator *FPBinOp, Instruction *TruncI,
567 VPValue *StartV, VPValue *Step, DebugLoc DL,
568 VPBuilder &Builder) {
569 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
570 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
571 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
572 Kind, FPBinOp, Start: StartV, Current: CanonicalIV, Step, Name: "offset.idx");
573
574 // Truncate base induction if needed.
575 Type *CanonicalIVType = CanonicalIV->getScalarType();
576 VPTypeAnalysis TypeInfo(CanonicalIVType);
577 Type *ResultTy = TypeInfo.inferScalarType(V: BaseIV);
578 if (TruncI) {
579 Type *TruncTy = TruncI->getType();
580 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
581 "Not truncating.");
582 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
583 BaseIV = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: BaseIV, ResultTy: TruncTy, DL);
584 ResultTy = TruncTy;
585 }
586
587 // Truncate step if needed.
588 Type *StepTy = TypeInfo.inferScalarType(V: Step);
589 if (ResultTy != StepTy) {
590 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
591 "Not truncating.");
592 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
593 auto *VecPreheader =
594 cast<VPBasicBlock>(Val: HeaderVPBB->getSingleHierarchicalPredecessor());
595 VPBuilder::InsertPointGuard Guard(Builder);
596 Builder.setInsertPoint(VecPreheader);
597 Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy, DL);
598 }
599 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, IV: BaseIV, Step,
600 VF: &Plan.getVF(), DL);
601}
602
603static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
604 SetVector<VPUser *> Users(llvm::from_range, V->users());
605 for (unsigned I = 0; I != Users.size(); ++I) {
606 VPRecipeBase *Cur = cast<VPRecipeBase>(Val: Users[I]);
607 if (isa<VPHeaderPHIRecipe>(Val: Cur))
608 continue;
609 for (VPValue *V : Cur->definedValues())
610 Users.insert_range(R: V->users());
611 }
612 return Users.takeVector();
613}
614
615 /// Legalize VPWidenPointerInductionRecipe by replacing it with a PtrAdd
616 /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
617 /// VPWidenPointerInductionRecipe will generate vectors only. If some users
618 /// require vectors while others require scalars, the scalar uses need to
619 /// extract the scalars from the generated vectors (note that this differs from
620 /// how int/fp inductions are handled). Legalize extract-from-ends that use
621 /// uniform VPReplicateRecipes of wide inductions to use regular
622 /// VPReplicateRecipes, so the correct end value is available. Also optimize
623 /// VPWidenIntOrFpInductionRecipe: if any of its users needs scalar values,
624 /// provide them with scalar steps built on the canonical scalar IV and update
625 /// the original IV's users. This is an optional optimization to reduce the
626 /// need for vector extracts.
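/// For illustration (sketch, recipe spellings abbreviated): a pointer IV whose
/// vector values are never used, e.g.
///   WIDEN-POINTER-INDUCTION %ptr.iv = %start, %step
/// is rewritten as
///   %steps    = SCALAR-STEPS 0, %step
///   %next.gep = ptradd %start, %steps
/// and all users of %ptr.iv are updated to use %next.gep instead.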
627static void legalizeAndOptimizeInductions(VPlan &Plan) {
628 using namespace llvm::VPlanPatternMatch;
629 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
630 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
631 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
632 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
633 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(Val: &Phi);
634 if (!PhiR)
635 continue;
636
637 // Try to narrow wide and replicating recipes to uniform recipes, based on
638 // VPlan analysis.
639 // TODO: Apply to all recipes in the future, to replace legacy uniformity
640 // analysis.
641 auto Users = collectUsersRecursively(V: PhiR);
642 for (VPUser *U : reverse(C&: Users)) {
643 auto *Def = dyn_cast<VPSingleDefRecipe>(Val: U);
644 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U);
645 // Skip recipes that shouldn't be narrowed.
646 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Val: Def) ||
647 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
648 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
649 continue;
650
651 // Skip recipes that may have lanes other than their first one used.
652 if (!vputils::isSingleScalar(VPV: Def) && !vputils::onlyFirstLaneUsed(Def))
653 continue;
654
655 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
656 Def->operands(), /*IsUniform*/ true);
657 Clone->insertAfter(InsertPos: Def);
658 Def->replaceAllUsesWith(New: Clone);
659 }
660
661 // Replace wide pointer inductions which have only their scalars used by
662 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
663 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(Val: &Phi)) {
664 if (!PtrIV->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF()))
665 continue;
666
667 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
668 VPValue *StartV =
669 Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: ID.getStep()->getType(), V: 0));
670 VPValue *StepV = PtrIV->getOperand(N: 1);
671 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
672 Plan, Kind: InductionDescriptor::IK_IntInduction, InductionOpcode: Instruction::Add, FPBinOp: nullptr,
673 TruncI: nullptr, StartV, Step: StepV, DL: PtrIV->getDebugLoc(), Builder);
674
675 VPValue *PtrAdd = Builder.createPtrAdd(Ptr: PtrIV->getStartValue(), Offset: Steps,
676 DL: PtrIV->getDebugLoc(), Name: "next.gep");
677
678 PtrIV->replaceAllUsesWith(New: PtrAdd);
679 continue;
680 }
681
682 // Replace widened induction with scalar steps for users that only use
683 // scalars.
684 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
685 if (HasOnlyVectorVFs && none_of(Range: WideIV->users(), P: [WideIV](VPUser *U) {
686 return U->usesScalars(Op: WideIV);
687 }))
688 continue;
689
690 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
691 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
692 Plan, Kind: ID.getKind(), InductionOpcode: ID.getInductionOpcode(),
693 FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
694 TruncI: WideIV->getTruncInst(), StartV: WideIV->getStartValue(), Step: WideIV->getStepValue(),
695 DL: WideIV->getDebugLoc(), Builder);
696
697 // Update scalar users of IV to use Steps instead.
698 if (!HasOnlyVectorVFs)
699 WideIV->replaceAllUsesWith(New: Steps);
700 else
701 WideIV->replaceUsesWithIf(New: Steps, ShouldReplace: [WideIV](VPUser &U, unsigned) {
702 return U.usesScalars(Op: WideIV);
703 });
704 }
705}
706
707/// Check if \p VPV is an untruncated wide induction, either before or after the
708 /// increment. If so, return the header IV (before the increment), otherwise
709/// return null.
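/// For example (sketch): for a wide induction %iv with step %step, both %iv
/// itself and its increment %iv.next = add %iv, %step are optimizable and
/// return %iv, while a truncated int/fp induction returns nullptr.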
710static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
711 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: VPV);
712 if (WideIV) {
713 // VPV itself is a wide induction, separately compute the end value for exit
714 // users if it is not a truncated IV.
715 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
716 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
717 }
718
719 // Check if VPV is an optimizable induction increment.
720 VPRecipeBase *Def = VPV->getDefiningRecipe();
721 if (!Def || Def->getNumOperands() != 2)
722 return nullptr;
723 WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 0));
724 if (!WideIV)
725 WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 1));
726 if (!WideIV)
727 return nullptr;
728
729 auto IsWideIVInc = [&]() {
730 using namespace VPlanPatternMatch;
731 auto &ID = WideIV->getInductionDescriptor();
732
733 // Check if VPV increments the induction by the induction step.
734 VPValue *IVStep = WideIV->getStepValue();
735 switch (ID.getInductionOpcode()) {
736 case Instruction::Add:
737 return match(V: VPV, P: m_c_Binary<Instruction::Add>(Op0: m_Specific(VPV: WideIV),
738 Op1: m_Specific(VPV: IVStep)));
739 case Instruction::FAdd:
740 return match(V: VPV, P: m_c_Binary<Instruction::FAdd>(Op0: m_Specific(VPV: WideIV),
741 Op1: m_Specific(VPV: IVStep)));
742 case Instruction::FSub:
743 return match(V: VPV, P: m_Binary<Instruction::FSub>(Op0: m_Specific(VPV: WideIV),
744 Op1: m_Specific(VPV: IVStep)));
745 case Instruction::Sub: {
746 // IVStep will be the negated step of the subtraction. Check if Step == -1
747 // * IVStep.
748 VPValue *Step;
749 if (!match(V: VPV,
750 P: m_Binary<Instruction::Sub>(Op0: m_VPValue(), Op1: m_VPValue(V&: Step))) ||
751 !Step->isLiveIn() || !IVStep->isLiveIn())
752 return false;
753 auto *StepCI = dyn_cast<ConstantInt>(Val: Step->getLiveInIRValue());
754 auto *IVStepCI = dyn_cast<ConstantInt>(Val: IVStep->getLiveInIRValue());
755 return StepCI && IVStepCI &&
756 StepCI->getValue() == (-1 * IVStepCI->getValue());
757 }
758 default:
759 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
760 match(V: VPV, P: m_GetElementPtr(Op0: m_Specific(VPV: WideIV),
761 Op1: m_Specific(VPV: WideIV->getStepValue())));
762 }
763 llvm_unreachable("should have been covered by switch above");
764 };
765 return IsWideIVInc() ? WideIV : nullptr;
766}
767
768/// Attempts to optimize the induction variable exit values for users in the
769/// early exit block.
770static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
771 VPTypeAnalysis &TypeInfo,
772 VPBlockBase *PredVPBB,
773 VPValue *Op) {
774 using namespace VPlanPatternMatch;
775
776 VPValue *Incoming, *Mask;
777 if (!match(V: Op, P: m_VPInstruction<Instruction::ExtractElement>(
778 Op0: m_VPValue(V&: Incoming),
779 Op1: m_VPInstruction<VPInstruction::FirstActiveLane>(
780 Op0: m_VPValue(V&: Mask)))))
781 return nullptr;
782
783 auto *WideIV = getOptimizableIVOf(VPV: Incoming);
784 if (!WideIV)
785 return nullptr;
786
787 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
788 if (WideIntOrFp && WideIntOrFp->getTruncInst())
789 return nullptr;
790
791 // Calculate the final index.
792 VPValue *EndValue = Plan.getCanonicalIV();
793 auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
794 VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB));
795
796 DebugLoc DL = cast<VPInstruction>(Val: Op)->getDebugLoc();
797 VPValue *FirstActiveLane =
798 B.createNaryOp(Opcode: VPInstruction::FirstActiveLane, Operands: Mask, DL);
799 Type *FirstActiveLaneType = TypeInfo.inferScalarType(V: FirstActiveLane);
800 FirstActiveLane = B.createScalarZExtOrTrunc(Op: FirstActiveLane, ResultTy: CanonicalIVType,
801 SrcTy: FirstActiveLaneType, DL);
802 EndValue = B.createNaryOp(Opcode: Instruction::Add, Operands: {EndValue, FirstActiveLane}, DL);
803
804 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
805 // Incoming differs from it, the exit is using the incremented value and we
806 // need to add the step.
807 if (Incoming != WideIV) {
808 VPValue *One = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: CanonicalIVType, V: 1));
809 EndValue = B.createNaryOp(Opcode: Instruction::Add, Operands: {EndValue, One}, DL);
810 }
811
812 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
813 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
814 VPValue *Start = WideIV->getStartValue();
815 VPValue *Step = WideIV->getStepValue();
816 EndValue = B.createDerivedIV(
817 Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
818 Start, Current: EndValue, Step);
819 }
820
821 return EndValue;
822}
823
824/// Attempts to optimize the induction variable exit values for users in the
825/// exit block coming from the latch in the original scalar loop.
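/// For example (sketch): the pre-computed end value corresponds to the IV
/// after its final increment; if the exit phi instead uses the pre-incremented
/// value %iv, one step is subtracted from that end value below.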
826static VPValue *
827optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo,
828 VPBlockBase *PredVPBB, VPValue *Op,
829 DenseMap<VPValue *, VPValue *> &EndValues) {
830 using namespace VPlanPatternMatch;
831
832 VPValue *Incoming;
833 if (!match(V: Op, P: m_VPInstruction<VPInstruction::ExtractLastElement>(
834 Op0: m_VPValue(V&: Incoming))))
835 return nullptr;
836
837 auto *WideIV = getOptimizableIVOf(VPV: Incoming);
838 if (!WideIV)
839 return nullptr;
840
841 VPValue *EndValue = EndValues.lookup(Val: WideIV);
842 assert(EndValue && "end value must have been pre-computed");
843
844 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
845 // Incoming differs from it, the exit is using the incremented value and we
846 // don't need to subtract the step.
847 if (Incoming != WideIV)
848 return EndValue;
849
850 // Otherwise, subtract the step from the EndValue.
851 VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB)->getTerminator());
852 VPValue *Step = WideIV->getStepValue();
853 Type *ScalarTy = TypeInfo.inferScalarType(V: WideIV);
854 if (ScalarTy->isIntegerTy())
855 return B.createNaryOp(Opcode: Instruction::Sub, Operands: {EndValue, Step}, Inst: {}, Name: "ind.escape");
856 if (ScalarTy->isPointerTy()) {
857 auto *Zero = Plan.getOrAddLiveIn(
858 V: ConstantInt::get(Ty: Step->getLiveInIRValue()->getType(), V: 0));
859 return B.createPtrAdd(Ptr: EndValue,
860 Offset: B.createNaryOp(Opcode: Instruction::Sub, Operands: {Zero, Step}), DL: {},
861 Name: "ind.escape");
862 }
863 if (ScalarTy->isFloatingPointTy()) {
864 const auto &ID = WideIV->getInductionDescriptor();
865 return B.createNaryOp(
866 Opcode: ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
867 ? Instruction::FSub
868 : Instruction::FAdd,
869 Operands: {EndValue, Step}, Flags: {ID.getInductionBinOp()->getFastMathFlags()});
870 }
871 llvm_unreachable("all possible induction types must be handled");
872 return nullptr;
873}
874
875void VPlanTransforms::optimizeInductionExitUsers(
876 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) {
877 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
878 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
879 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
880 for (VPRecipeBase &R : ExitVPBB->phis()) {
881 auto *ExitIRI = cast<VPIRPhi>(Val: &R);
882
883 for (auto [Idx, PredVPBB] : enumerate(First&: ExitVPBB->getPredecessors())) {
884 VPValue *Escape = nullptr;
885 if (PredVPBB == MiddleVPBB)
886 Escape = optimizeLatchExitInductionUser(
887 Plan, TypeInfo, PredVPBB, Op: ExitIRI->getOperand(N: Idx), EndValues);
888 else
889 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
890 Op: ExitIRI->getOperand(N: Idx));
891 if (Escape)
892 ExitIRI->setOperand(I: Idx, New: Escape);
893 }
894 }
895 }
896}
897
898 /// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
899/// them with already existing recipes expanding the same SCEV expression.
900static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
901 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
902
903 for (VPRecipeBase &R :
904 make_early_inc_range(Range&: *Plan.getEntry()->getEntryBasicBlock())) {
905 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
906 if (!ExpR)
907 continue;
908
909 auto I = SCEV2VPV.insert(KV: {ExpR->getSCEV(), ExpR});
910 if (I.second)
911 continue;
912 ExpR->replaceAllUsesWith(New: I.first->second);
913 ExpR->eraseFromParent();
914 }
915}
916
917static void recursivelyDeleteDeadRecipes(VPValue *V) {
918 SmallVector<VPValue *> WorkList;
919 SmallPtrSet<VPValue *, 8> Seen;
920 WorkList.push_back(Elt: V);
921
922 while (!WorkList.empty()) {
923 VPValue *Cur = WorkList.pop_back_val();
924 if (!Seen.insert(Ptr: Cur).second)
925 continue;
926 VPRecipeBase *R = Cur->getDefiningRecipe();
927 if (!R)
928 continue;
929 if (!isDeadRecipe(R&: *R))
930 continue;
931 WorkList.append(in_start: R->op_begin(), in_end: R->op_end());
932 R->eraseFromParent();
933 }
934}
935
936/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
937 /// non-nullptr Value for a handled \p Opcode if the corresponding \p Operands
938 /// are foldable live-ins.
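/// For example (sketch): an add whose operands are the constant live-ins 2 and
/// 3 folds to the IR constant 5, which the caller can then use to replace the
/// recipe's result.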
939static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
940 ArrayRef<VPValue *> Operands,
941 const DataLayout &DL, VPTypeAnalysis &TypeInfo) {
942 SmallVector<Value *, 4> Ops;
943 for (VPValue *Op : Operands) {
944 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
945 return nullptr;
946 Ops.push_back(Elt: Op->getLiveInIRValue());
947 }
948
949 InstSimplifyFolder Folder(DL);
950 if (Instruction::isBinaryOp(Opcode))
951 return Folder.FoldBinOp(Opc: static_cast<Instruction::BinaryOps>(Opcode), LHS: Ops[0],
952 RHS: Ops[1]);
953 if (Instruction::isCast(Opcode))
954 return Folder.FoldCast(Op: static_cast<Instruction::CastOps>(Opcode), V: Ops[0],
955 DestTy: TypeInfo.inferScalarType(V: R.getVPSingleValue()));
956 switch (Opcode) {
957 case VPInstruction::LogicalAnd:
958 return Folder.FoldSelect(C: Ops[0], True: Ops[1],
959 False: ConstantInt::getNullValue(Ty: Ops[1]->getType()));
960 case VPInstruction::Not:
961 return Folder.FoldBinOp(Opc: Instruction::BinaryOps::Xor, LHS: Ops[0],
962 RHS: Constant::getAllOnesValue(Ty: Ops[0]->getType()));
963 case Instruction::Select:
964 return Folder.FoldSelect(C: Ops[0], True: Ops[1], False: Ops[2]);
965 case Instruction::ICmp:
966 case Instruction::FCmp:
967 return Folder.FoldCmp(P: cast<VPRecipeWithIRFlags>(Val: R).getPredicate(), LHS: Ops[0],
968 RHS: Ops[1]);
969 case Instruction::GetElementPtr: {
970 auto &RFlags = cast<VPRecipeWithIRFlags>(Val: R);
971 auto *GEP = cast<GetElementPtrInst>(Val: RFlags.getUnderlyingInstr());
972 return Folder.FoldGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0], IdxList: drop_begin(RangeOrContainer&: Ops),
973 NW: RFlags.getGEPNoWrapFlags());
974 }
975 case VPInstruction::PtrAdd:
976 return Folder.FoldGEP(Ty: IntegerType::getInt8Ty(C&: TypeInfo.getContext()), Ptr: Ops[0],
977 IdxList: Ops[1],
978 NW: cast<VPRecipeWithIRFlags>(Val: R).getGEPNoWrapFlags());
979 case Instruction::InsertElement:
980 return Folder.FoldInsertElement(Vec: Ops[0], NewElt: Ops[1], Idx: Ops[2]);
981 case Instruction::ExtractElement:
982 return Folder.FoldExtractElement(Vec: Ops[0], Idx: Ops[1]);
983 }
984 return nullptr;
985}
986
987/// Try to simplify recipe \p R.
988static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
989 using namespace llvm::VPlanPatternMatch;
990 VPlan *Plan = R.getParent()->getPlan();
991
992 auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R);
993 if (!Def)
994 return;
995
996 // Simplification of live-in IR values for SingleDef recipes using
997 // InstSimplifyFolder.
998 if (TypeSwitch<VPRecipeBase *, bool>(&R)
999 .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
1000 VPReplicateRecipe>(caseFn: [&](auto *I) {
1001 const DataLayout &DL =
1002 Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1003 Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
1004 TypeInfo);
1005 if (V)
1006 I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));
1007 return V;
1008 })
1009 .Default(defaultFn: [](auto *) { return false; }))
1010 return;
1011
1012 // Fold PredPHI LiveIn -> LiveIn.
1013 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Val: &R)) {
1014 VPValue *Op = PredPHI->getOperand(N: 0);
1015 if (Op->isLiveIn())
1016 PredPHI->replaceAllUsesWith(New: Op);
1017 }
1018
1019 VPValue *A;
1020 if (match(V: Def, P: m_Trunc(Op0: m_ZExtOrSExt(Op0: m_VPValue(V&: A))))) {
1021 Type *TruncTy = TypeInfo.inferScalarType(V: Def);
1022 Type *ATy = TypeInfo.inferScalarType(V: A);
1023 if (TruncTy == ATy) {
1024 Def->replaceAllUsesWith(New: A);
1025 } else {
1026 // Don't replace a scalarizing recipe with a widened cast.
1027 if (isa<VPReplicateRecipe>(Val: Def))
1028 return;
1029 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1030
1031 unsigned ExtOpcode = match(V: R.getOperand(N: 0), P: m_SExt(Op0: m_VPValue()))
1032 ? Instruction::SExt
1033 : Instruction::ZExt;
1034 auto *VPC =
1035 new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
1036 if (auto *UnderlyingExt = R.getOperand(N: 0)->getUnderlyingValue()) {
1037 // UnderlyingExt has distinct return type, used to retain legacy cost.
1038 VPC->setUnderlyingValue(UnderlyingExt);
1039 }
1040 VPC->insertBefore(InsertPos: &R);
1041 Def->replaceAllUsesWith(New: VPC);
1042 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1043 auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
1044 VPC->insertBefore(InsertPos: &R);
1045 Def->replaceAllUsesWith(New: VPC);
1046 }
1047 }
1048#ifndef NDEBUG
1049 // Verify that the cached type info for both A and its users is still
1050 // accurate by comparing it to freshly computed types.
1051 VPTypeAnalysis TypeInfo2(Plan->getCanonicalIV()->getScalarType());
1052 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1053 for (VPUser *U : A->users()) {
1054 auto *R = cast<VPRecipeBase>(U);
1055 for (VPValue *VPV : R->definedValues())
1056 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1057 }
1058#endif
1059 }
1060
1061 // Simplify (X && Y) || (X && !Y) -> X.
1062 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1063 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1064 // recipes to be visited during simplification.
1065 VPValue *X, *Y;
1066 if (match(V: Def,
1067 P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
1068 Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_Not(Op0: m_Deferred(V: Y)))))) {
1069 Def->replaceAllUsesWith(New: X);
1070 Def->eraseFromParent();
1071 return;
1072 }
1073
1074 // OR x, 1 -> 1.
1075 if (match(V: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_AllOnes()))) {
1076 Def->replaceAllUsesWith(New: Def->getOperand(N: 0) == X ? Def->getOperand(N: 1)
1077 : Def->getOperand(N: 0));
1078 Def->eraseFromParent();
1079 return;
1080 }
1081
1082 if (match(V: Def, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: X), Op2: m_Deferred(V: X))))
1083 return Def->replaceAllUsesWith(New: X);
1084
1085 if (match(V: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_SpecificInt(V: 1))))
1086 return Def->replaceAllUsesWith(New: A);
1087
1088 if (match(V: Def, P: m_Not(Op0: m_VPValue(V&: A)))) {
1089 if (match(V: A, P: m_Not(Op0: m_VPValue(V&: A))))
1090 return Def->replaceAllUsesWith(New: A);
1091
1092 // Try to fold Not into compares by adjusting the predicate in-place.
1093 if (isa<VPWidenRecipe>(Val: A) && A->getNumUsers() == 1) {
1094 auto *WideCmp = cast<VPWidenRecipe>(Val: A);
1095 if (WideCmp->getOpcode() == Instruction::ICmp ||
1096 WideCmp->getOpcode() == Instruction::FCmp) {
1097 WideCmp->setPredicate(
1098 CmpInst::getInversePredicate(pred: WideCmp->getPredicate()));
1099 Def->replaceAllUsesWith(New: WideCmp);
1100 // If WideCmp doesn't have a debug location, use the one from the
1101 // negation, to preserve the location.
1102 if (!WideCmp->getDebugLoc() && R.getDebugLoc())
1103 WideCmp->setDebugLoc(R.getDebugLoc());
1104 }
1105 }
1106 }
1107
1108 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1109 if ((match(V: Def,
1110 P: m_DerivedIV(Op0: m_SpecificInt(V: 0), Op1: m_VPValue(V&: A), Op2: m_SpecificInt(V: 1))) ||
1111 match(V: Def,
1112 P: m_DerivedIV(Op0: m_SpecificInt(V: 0), Op1: m_SpecificInt(V: 0), Op2: m_VPValue()))) &&
1113 TypeInfo.inferScalarType(V: Def->getOperand(N: 1)) ==
1114 TypeInfo.inferScalarType(V: Def))
1115 return Def->replaceAllUsesWith(New: Def->getOperand(N: 1));
1116
1117 if (match(V: Def, P: m_VPInstruction<VPInstruction::WideIVStep>(
1118 Op0: m_VPValue(V&: X), Op1: m_SpecificInt(V: 1)))) {
1119 Type *WideStepTy = TypeInfo.inferScalarType(V: Def);
1120 if (TypeInfo.inferScalarType(V: X) != WideStepTy)
1121 X = VPBuilder(Def).createWidenCast(Opcode: Instruction::Trunc, Op: X, ResultTy: WideStepTy);
1122 Def->replaceAllUsesWith(New: X);
1123 return;
1124 }
1125
1126 // For i1 vp.merges produced by AnyOf reductions:
1127 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1128 if (match(V: Def, P: m_Intrinsic<Intrinsic::vp_merge>(Op0: m_True(), Op1: m_VPValue(V&: A),
1129 Op2: m_VPValue(V&: X), Op3: m_VPValue())) &&
1130 match(V: A, P: m_c_BinaryOr(Op0: m_Specific(VPV: X), Op1: m_VPValue(V&: Y))) &&
1131 TypeInfo.inferScalarType(V: R.getVPSingleValue())->isIntegerTy(Bitwidth: 1)) {
1132 Def->setOperand(I: 1, New: Def->getOperand(N: 0));
1133 Def->setOperand(I: 0, New: Y);
1134 return;
1135 }
1136
1137 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: Def)) {
1138 if (Phi->getOperand(N: 0) == Phi->getOperand(N: 1))
1139 Def->replaceAllUsesWith(New: Phi->getOperand(N: 0));
1140 return;
1141 }
1142
1143 // Look through ExtractLastElement (BuildVector ....).
1144 if (match(V: &R, P: m_VPInstruction<VPInstruction::ExtractLastElement>(
1145 Op0: m_BuildVector()))) {
1146 auto *BuildVector = cast<VPInstruction>(Val: R.getOperand(N: 0));
1147 Def->replaceAllUsesWith(
1148 New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 1));
1149 return;
1150 }
1151
1152 // Look through ExtractPenultimateElement (BuildVector ....).
1153 if (match(V: &R, P: m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1154 Op0: m_BuildVector()))) {
1155 auto *BuildVector = cast<VPInstruction>(Val: R.getOperand(N: 0));
1156 Def->replaceAllUsesWith(
1157 New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 2));
1158 return;
1159 }
1160
1161 // Some simplifications can only be applied after unrolling. Perform them
1162 // below.
1163 if (!Plan->isUnrolled())
1164 return;
1165
1166 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1167 // the first lane is demanded.
1168 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: Def)) {
1169 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Def: Steps)) {
1170 Steps->replaceAllUsesWith(New: Steps->getOperand(N: 0));
1171 return;
1172 }
1173 }
1174 // Simplify redundant ReductionStartVector recipes after unrolling.
1175 VPValue *StartV;
1176 if (match(V: Def, P: m_VPInstruction<VPInstruction::ReductionStartVector>(
1177 Op0: m_VPValue(V&: StartV), Op1: m_VPValue(), Op2: m_VPValue()))) {
1178 Def->replaceUsesWithIf(New: StartV, ShouldReplace: [](const VPUser &U, unsigned Idx) {
1179 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &U);
1180 return PhiR && PhiR->isInLoop();
1181 });
1182 return;
1183 }
1184
1185 if (match(V: Def,
1186 P: m_VPInstruction<VPInstruction::ExtractLastElement>(
1187 Op0: m_VPInstruction<VPInstruction::Broadcast>(Op0: m_VPValue(V&: A))))) {
1188 Def->replaceAllUsesWith(New: A);
1189 return;
1190 }
1191
1192 VPInstruction *OpVPI;
1193 if (match(V: Def, P: m_VPInstruction<VPInstruction::ExtractLastElement>(
1194 Op0: m_VPInstruction(V&: OpVPI))) &&
1195 OpVPI->isVectorToScalar()) {
1196 Def->replaceAllUsesWith(New: OpVPI);
1197 return;
1198 }
1199}
1200
1201void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
1202 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1203 Plan.getEntry());
1204 VPTypeAnalysis TypeInfo(&CanonicalIVTy);
1205 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
1206 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
1207 simplifyRecipe(R, TypeInfo);
1208 }
1209 }
1210}
1211
1212static void narrowToSingleScalarRecipes(VPlan &Plan) {
1213 if (Plan.hasScalarVFOnly())
1214 return;
1215
1216 // Try to narrow wide and replicating recipes to single scalar recipes,
1217 // based on VPlan analysis. Only process blocks in the loop region for now,
1218 // without traversing into nested regions, as recipes in replicate regions
1219 // cannot be converted yet.
1220 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1221 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
1222 for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
1223 if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(Val: &R))
1224 continue;
1225 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
1226 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1227 continue;
1228
1229 auto *RepOrWidenR = cast<VPSingleDefRecipe>(Val: &R);
1230 // Skip recipes that aren't single scalars or don't have only their
1231 // scalar results used. In the latter case, we would introduce extra
1232 // broadcasts.
1233 if (!vputils::isSingleScalar(VPV: RepOrWidenR) ||
1234 any_of(Range: RepOrWidenR->users(), P: [RepOrWidenR](VPUser *U) {
1235 return !U->usesScalars(Op: RepOrWidenR);
1236 }))
1237 continue;
1238
1239 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1240 RepOrWidenR->operands(),
1241 true /*IsSingleScalar*/);
1242 Clone->insertBefore(InsertPos: RepOrWidenR);
1243 RepOrWidenR->replaceAllUsesWith(New: Clone);
1244 }
1245 }
1246}
1247
1248/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1249/// to make sure the masks are simplified.
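/// For illustration (sketch): a non-normalized blend such as
///   BLEND %a/%m1, %b/%m2
/// where %m1 is only used by the blend is normalized to
///   BLEND %a, %b/%m2
/// i.e. %a becomes the unmasked fall-through value and %m1 can be dead-coded.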
1250static void simplifyBlends(VPlan &Plan) {
1251 using namespace llvm::VPlanPatternMatch;
1252 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1253 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
1254 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
1255 auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R);
1256 if (!Blend)
1257 continue;
1258
1259 // Try to remove redundant blend recipes.
1260 SmallPtrSet<VPValue *, 4> UniqueValues;
1261 if (Blend->isNormalized() || !match(V: Blend->getMask(Idx: 0), P: m_False()))
1262 UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: 0));
1263 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1264 if (!match(V: Blend->getMask(Idx: I), P: m_False()))
1265 UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: I));
1266
1267 if (UniqueValues.size() == 1) {
1268 Blend->replaceAllUsesWith(New: *UniqueValues.begin());
1269 Blend->eraseFromParent();
1270 continue;
1271 }
1272
1273 if (Blend->isNormalized())
1274 continue;
1275
1276 // Normalize the blend so its first incoming value is used as the initial
1277 // value with the others blended into it.
1278
1279 unsigned StartIndex = 0;
1280 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1281 // If a value's mask is used only by the blend then it can be dead-coded.
1282 // TODO: Find the most expensive mask that can be dead-coded, or a mask
1283 // that's used by multiple blends where it can be removed from them all.
1284 VPValue *Mask = Blend->getMask(Idx: I);
1285 if (Mask->getNumUsers() == 1 && !match(V: Mask, P: m_False())) {
1286 StartIndex = I;
1287 break;
1288 }
1289 }
1290
1291 SmallVector<VPValue *, 4> OperandsWithMask;
1292 OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: StartIndex));
1293
1294 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1295 if (I == StartIndex)
1296 continue;
1297 OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: I));
1298 OperandsWithMask.push_back(Elt: Blend->getMask(Idx: I));
1299 }
1300
1301 auto *NewBlend = new VPBlendRecipe(
1302 cast<PHINode>(Val: Blend->getUnderlyingValue()), OperandsWithMask);
1303 NewBlend->insertBefore(InsertPos: &R);
1304
1305 VPValue *DeadMask = Blend->getMask(Idx: StartIndex);
1306 Blend->replaceAllUsesWith(New: NewBlend);
1307 Blend->eraseFromParent();
1308 recursivelyDeleteDeadRecipes(V: DeadMask);
1309
1310 // Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1311 VPValue *NewMask;
1312 if (NewBlend->getNumOperands() == 3 &&
1313 match(V: NewBlend->getMask(Idx: 1), P: m_Not(Op0: m_VPValue(V&: NewMask)))) {
1314 VPValue *Inc0 = NewBlend->getOperand(N: 0);
1315 VPValue *Inc1 = NewBlend->getOperand(N: 1);
1316 VPValue *OldMask = NewBlend->getOperand(N: 2);
1317 NewBlend->setOperand(I: 0, New: Inc1);
1318 NewBlend->setOperand(I: 1, New: Inc0);
1319 NewBlend->setOperand(I: 2, New: NewMask);
1320 if (OldMask->getNumUsers() == 0)
1321 cast<VPInstruction>(Val: OldMask)->eraseFromParent();
1322 }
1323 }
1324 }
1325}
1326
1327 /// Optimize the width of vector induction variables in \p Plan based on a
1328 /// known constant trip count, \p BestVF and \p BestUF.
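/// For example (sketch): with a trip count of 1000, VF = 4 and UF = 2, the
/// trip count rounded up to a multiple of VF * UF = 8 is 1000, so the largest
/// IV value is 999 (10 active bits) and the IV can be narrowed to i16, the
/// smallest power-of-2 width >= 10 bits (clamped to a minimum of 8).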
1329static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1330 ElementCount BestVF,
1331 unsigned BestUF) {
1332 // Only proceed if we have not completely removed the vector region.
1333 if (!Plan.getVectorLoopRegion())
1334 return false;
1335
1336 if (!Plan.getTripCount()->isLiveIn())
1337 return false;
1338 auto *TC = dyn_cast_if_present<ConstantInt>(
1339 Val: Plan.getTripCount()->getUnderlyingValue());
1340 if (!TC || !BestVF.isFixed())
1341 return false;
1342
1343 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1344 // and UF. Returns at least 8.
1345 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1346 APInt AlignedTC =
1347 Align * APIntOps::RoundingUDiv(A: TC, B: APInt(TC.getBitWidth(), Align),
1348 RM: APInt::Rounding::UP);
1349 APInt MaxVal = AlignedTC - 1;
1350 return std::max<unsigned>(a: PowerOf2Ceil(A: MaxVal.getActiveBits()), b: 8);
1351 };
1352 unsigned NewBitWidth =
1353 ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
1354
1355 LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
1356 auto *NewIVTy = IntegerType::get(C&: Ctx, NumBits: NewBitWidth);
1357
1358 bool MadeChange = false;
1359
1360 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1361 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1362 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
1363
1364 // Currently only handle canonical IVs, as it is trivial to replace the start
1365 // and stop values, and only perform the optimization when the IV has a
1366 // single use.
1367 if (!WideIV || !WideIV->isCanonical() ||
1368 WideIV->hasMoreThanOneUniqueUser() ||
1369 NewIVTy == WideIV->getScalarType())
1370 continue;
1371
1372 // Currently only handle cases where the single user is a header-mask
1373 // comparison with the backedge-taken-count.
1374 using namespace VPlanPatternMatch;
1375 if (!match(
1376 U: *WideIV->user_begin(),
1377 P: m_Binary<Instruction::ICmp>(
1378 Op0: m_Specific(VPV: WideIV),
1379 Op1: m_Broadcast(Op0: m_Specific(VPV: Plan.getOrCreateBackedgeTakenCount())))))
1380 continue;
1381
1382 // Update IV operands and comparison bound to use new narrower type.
1383 auto *NewStart = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: NewIVTy, V: 0));
1384 WideIV->setStartValue(NewStart);
1385 auto *NewStep = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: NewIVTy, V: 1));
1386 WideIV->setStepValue(NewStep);
1387
1388 auto *NewBTC = new VPWidenCastRecipe(
1389 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1390 Plan.getVectorPreheader()->appendRecipe(Recipe: NewBTC);
1391 auto *Cmp = cast<VPInstruction>(Val: *WideIV->user_begin());
1392 Cmp->setOperand(I: 1, New: NewBTC);
1393
1394 MadeChange = true;
1395 }
1396
1397 return MadeChange;
1398}
1399
1400/// Return true if \p Cond is known to be true for given \p BestVF and \p
1401/// BestUF.
1402static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1403 ElementCount BestVF, unsigned BestUF,
1404 ScalarEvolution &SE) {
1405 using namespace llvm::VPlanPatternMatch;
1406 if (match(V: Cond, P: m_Binary<Instruction::Or>(Op0: m_VPValue(), Op1: m_VPValue())))
1407 return any_of(Range: Cond->getDefiningRecipe()->operands(), P: [&Plan, BestVF, BestUF,
1408 &SE](VPValue *C) {
1409 return isConditionTrueViaVFAndUF(Cond: C, Plan, BestVF, BestUF, SE);
1410 });
1411
1412 auto *CanIV = Plan.getCanonicalIV();
1413 if (!match(V: Cond, P: m_Binary<Instruction::ICmp>(
1414 Op0: m_Specific(VPV: CanIV->getBackedgeValue()),
1415 Op1: m_Specific(VPV: &Plan.getVectorTripCount()))) ||
1416 cast<VPRecipeWithIRFlags>(Val: Cond->getDefiningRecipe())->getPredicate() !=
1417 CmpInst::ICMP_EQ)
1418 return false;
1419
1420 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1421 // count is not conveniently available as SCEV so far, so we compare directly
1422 // against the original trip count. This is stricter than necessary, as we
1423 // will only return true if the trip count == vector trip count.
1424 // TODO: Use SCEV for vector trip count once available, to cover cases where
1425 // vector trip count == UF * VF, but original trip count != UF * VF.
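  // For instance (an illustrative example with assumed values): with an
  // original trip count of 16, BestVF = 4 and BestUF = 4, VF * UF equals the
  // trip count, so SCEV can prove the compare always folds to true.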
1426 const SCEV *TripCount =
1427 vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), SE);
1428 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1429 "Trip count SCEV must be computable");
1430 ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
1431 const SCEV *C = SE.getElementCount(Ty: TripCount->getType(), EC: NumElements);
1432 return SE.isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: TripCount, RHS: C);
1433}
1434
1435/// Try to simplify the branch condition of \p Plan. This may restrict the
1436/// resulting plan to \p BestVF and \p BestUF.
1437static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1438 unsigned BestUF,
1439 PredicatedScalarEvolution &PSE) {
1440 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1441 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1442 auto *Term = &ExitingVPBB->back();
1443 VPValue *Cond;
1444 ScalarEvolution &SE = *PSE.getSE();
1445 using namespace llvm::VPlanPatternMatch;
1446 if (match(V: Term, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue())) ||
1447 match(V: Term, P: m_BranchOnCond(
1448 Op0: m_Not(Op0: m_ActiveLaneMask(Op0: m_VPValue(), Op1: m_VPValue()))))) {
1449 // Try to simplify the branch condition if TC <= VF * UF when the latch
1450 // terminator is BranchOnCount or BranchOnCond where the input is
1451 // Not(ActiveLaneMask).
1452 const SCEV *TripCount =
1453 vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), SE);
1454 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1455 "Trip count SCEV must be computable");
1456 ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
1457 const SCEV *C = SE.getElementCount(Ty: TripCount->getType(), EC: NumElements);
1458 if (TripCount->isZero() ||
1459 !SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: TripCount, RHS: C))
1460 return false;
1461 } else if (match(V: Term, P: m_BranchOnCond(Op0: m_VPValue(V&: Cond)))) {
1462 // For BranchOnCond, check if we can prove the condition to be true using VF
1463 // and UF.
1464 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1465 return false;
1466 } else {
1467 return false;
1468 }
1469
1470 // The vector loop region only executes once. If possible, completely remove
1471 // the region, otherwise replace the terminator controlling the latch with
1472 // (BranchOnCond true).
1473 auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry());
1474 auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
1475 if (all_of(
1476 Range: Header->phis(),
1477 P: IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) {
1478 for (VPRecipeBase &HeaderR : make_early_inc_range(Range: Header->phis())) {
1479 auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(Val: &HeaderR);
1480 HeaderPhiR->replaceAllUsesWith(New: HeaderPhiR->getStartValue());
1481 HeaderPhiR->eraseFromParent();
1482 }
1483
1484 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1485 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1486 VPBlockUtils::disconnectBlocks(From: Preheader, To: VectorRegion);
1487 VPBlockUtils::disconnectBlocks(From: VectorRegion, To: Exit);
1488
1489 for (VPBlockBase *B : vp_depth_first_shallow(G: VectorRegion->getEntry()))
1490 B->setParent(nullptr);
1491
1492 VPBlockUtils::connectBlocks(From: Preheader, To: Header);
1493 VPBlockUtils::connectBlocks(From: ExitingVPBB, To: Exit);
1494 VPlanTransforms::simplifyRecipes(Plan, CanonicalIVTy&: *CanIVTy);
1495 } else {
1496 // The vector region contains header phis for which we cannot remove the
1497 // loop region yet.
1498 LLVMContext &Ctx = SE.getContext();
1499 auto *BOC = new VPInstruction(
1500 VPInstruction::BranchOnCond,
1501 {Plan.getOrAddLiveIn(V: ConstantInt::getTrue(Context&: Ctx))}, Term->getDebugLoc());
1502 ExitingVPBB->appendRecipe(Recipe: BOC);
1503 }
1504
1505 Term->eraseFromParent();
1506
1507 return true;
1508}
1509
1510void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1511 unsigned BestUF,
1512 PredicatedScalarEvolution &PSE) {
1513 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1514 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1515
1516 bool MadeChange =
1517 simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1518 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1519
1520 if (MadeChange) {
1521 Plan.setVF(BestVF);
1522 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1523 }
1524 // TODO: Further simplifications are possible
1525 // 1. Replace inductions with constants.
1526 // 2. Replace vector loop region with VPBasicBlock.
1527}
1528
1529/// Sink users of \p FOR after the recipe defining the previous value \p
1530/// Previous of the recurrence. \returns true if all users of \p FOR could be
1531/// re-arranged as needed or false if it is not possible.
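/// For example (an illustrative sketch; recipe names are assumed):
///   %for  = FIRST-ORDER-RECURRENCE-PHI %init, %prev
///   %use  = add %for, 1    ; user of the recurrence phi
///   %prev = load %ptr      ; Previous, defined after the user
/// is rearranged, when legal, into
///   %for  = FIRST-ORDER-RECURRENCE-PHI %init, %prev
///   %prev = load %ptr
///   %use  = add %for, 1    ; sunk after Previous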
1532static bool
1533sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
1534 VPRecipeBase *Previous,
1535 VPDominatorTree &VPDT) {
1536 // Collect recipes that need sinking.
1537 SmallVector<VPRecipeBase *> WorkList;
1538 SmallPtrSet<VPRecipeBase *, 8> Seen;
1539 Seen.insert(Ptr: Previous);
1540 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
    // The previous value must not depend on the users of the recurrence phi.
    // If it did, FOR would not be a fixed-order recurrence.
1543 if (SinkCandidate == Previous)
1544 return false;
1545
1546 if (isa<VPHeaderPHIRecipe>(Val: SinkCandidate) ||
1547 !Seen.insert(Ptr: SinkCandidate).second ||
1548 VPDT.properlyDominates(A: Previous, B: SinkCandidate))
1549 return true;
1550
1551 if (SinkCandidate->mayHaveSideEffects())
1552 return false;
1553
1554 WorkList.push_back(Elt: SinkCandidate);
1555 return true;
1556 };
1557
1558 // Recursively sink users of FOR after Previous.
1559 WorkList.push_back(Elt: FOR);
1560 for (unsigned I = 0; I != WorkList.size(); ++I) {
1561 VPRecipeBase *Current = WorkList[I];
1562 assert(Current->getNumDefinedValues() == 1 &&
1563 "only recipes with a single defined value expected");
1564
1565 for (VPUser *User : Current->getVPSingleValue()->users()) {
1566 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(Val: User)))
1567 return false;
1568 }
1569 }
1570
1571 // Keep recipes to sink ordered by dominance so earlier instructions are
1572 // processed first.
1573 sort(C&: WorkList, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1574 return VPDT.properlyDominates(A, B);
1575 });
1576
1577 for (VPRecipeBase *SinkCandidate : WorkList) {
1578 if (SinkCandidate == FOR)
1579 continue;
1580
1581 SinkCandidate->moveAfter(MovePos: Previous);
1582 Previous = SinkCandidate;
1583 }
1584 return true;
1585}
1586
1587/// Try to hoist \p Previous and its operands before all users of \p FOR.
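/// For example (an illustrative sketch; recipe names are assumed), given
///   %use  = add %for, 1    ; user of the recurrence phi
///   %prev = add %x, %y     ; Previous, side-effect free
/// the recipe %prev (and any of its operands defined in the loop) may be
/// hoisted before %use when %use cannot be sunk after %prev.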
1588static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
1589 VPRecipeBase *Previous,
1590 VPDominatorTree &VPDT) {
1591 if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
1592 return false;
1593
1594 // Collect recipes that need hoisting.
1595 SmallVector<VPRecipeBase *> HoistCandidates;
1596 SmallPtrSet<VPRecipeBase *, 8> Visited;
1597 VPRecipeBase *HoistPoint = nullptr;
1598 // Find the closest hoist point by looking at all users of FOR and selecting
1599 // the recipe dominating all other users.
1600 for (VPUser *U : FOR->users()) {
1601 auto *R = cast<VPRecipeBase>(Val: U);
1602 if (!HoistPoint || VPDT.properlyDominates(A: R, B: HoistPoint))
1603 HoistPoint = R;
1604 }
1605 assert(all_of(FOR->users(),
1606 [&VPDT, HoistPoint](VPUser *U) {
1607 auto *R = cast<VPRecipeBase>(U);
1608 return HoistPoint == R ||
1609 VPDT.properlyDominates(HoistPoint, R);
1610 }) &&
1611 "HoistPoint must dominate all users of FOR");
1612
1613 auto NeedsHoisting = [HoistPoint, &VPDT,
1614 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1615 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1616 if (!HoistCandidate)
1617 return nullptr;
1618 VPRegionBlock *EnclosingLoopRegion =
1619 HoistCandidate->getParent()->getEnclosingLoopRegion();
1620 assert((!HoistCandidate->getParent()->getParent() ||
1621 HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
1622 "CFG in VPlan should still be flat, without replicate regions");
1623 // Hoist candidate was already visited, no need to hoist.
1624 if (!Visited.insert(Ptr: HoistCandidate).second)
1625 return nullptr;
1626
    // The candidate is outside the loop region or is a header phi; it already
    // dominates the users of FOR without hoisting.
1629 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(Val: HoistCandidate))
1630 return nullptr;
1631
1632 // If we reached a recipe that dominates HoistPoint, we don't need to
1633 // hoist the recipe.
1634 if (VPDT.properlyDominates(A: HoistCandidate, B: HoistPoint))
1635 return nullptr;
1636 return HoistCandidate;
1637 };
1638 auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
1639 // Avoid hoisting candidates with side-effects, as we do not yet analyze
1640 // associated dependencies.
1641 return !HoistCandidate->mayHaveSideEffects();
1642 };
1643
1644 if (!NeedsHoisting(Previous->getVPSingleValue()))
1645 return true;
1646
1647 // Recursively try to hoist Previous and its operands before all users of FOR.
1648 HoistCandidates.push_back(Elt: Previous);
1649
1650 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1651 VPRecipeBase *Current = HoistCandidates[I];
1652 assert(Current->getNumDefinedValues() == 1 &&
1653 "only recipes with a single defined value expected");
1654 if (!CanHoist(Current))
1655 return false;
1656
1657 for (VPValue *Op : Current->operands()) {
1658 // If we reach FOR, it means the original Previous depends on some other
1659 // recurrence that in turn depends on FOR. If that is the case, we would
1660 // also need to hoist recipes involving the other FOR, which may break
1661 // dependencies.
1662 if (Op == FOR)
1663 return false;
1664
1665 if (auto *R = NeedsHoisting(Op))
1666 HoistCandidates.push_back(Elt: R);
1667 }
1668 }
1669
1670 // Order recipes to hoist by dominance so earlier instructions are processed
1671 // first.
1672 sort(C&: HoistCandidates, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1673 return VPDT.properlyDominates(A, B);
1674 });
1675
1676 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1677 HoistCandidate->moveBefore(BB&: *HoistPoint->getParent(),
1678 I: HoistPoint->getIterator());
1679 }
1680
1681 return true;
1682}
1683
1684bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
1685 VPBuilder &LoopBuilder) {
1686 VPDominatorTree VPDT;
1687 VPDT.recalculate(Func&: Plan);
1688
1689 SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
1690 for (VPRecipeBase &R :
1691 Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
1692 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R))
1693 RecurrencePhis.push_back(Elt: FOR);
1694
1695 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1696 SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
1697 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1698 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1699 // to terminate.
1700 while (auto *PrevPhi =
1701 dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Val: Previous)) {
1702 assert(PrevPhi->getParent() == FOR->getParent());
1703 assert(SeenPhis.insert(PrevPhi).second);
1704 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
1705 }
1706
1707 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
1708 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
1709 return false;
1710
1711 // Introduce a recipe to combine the incoming and previous values of a
1712 // fixed-order recurrence.
1713 VPBasicBlock *InsertBlock = Previous->getParent();
1714 if (isa<VPHeaderPHIRecipe>(Val: Previous))
1715 LoopBuilder.setInsertPoint(TheBB: InsertBlock, IP: InsertBlock->getFirstNonPhi());
1716 else
1717 LoopBuilder.setInsertPoint(TheBB: InsertBlock,
1718 IP: std::next(x: Previous->getIterator()));
1719
1720 auto *RecurSplice =
1721 LoopBuilder.createNaryOp(Opcode: VPInstruction::FirstOrderRecurrenceSplice,
1722 Operands: {FOR, FOR->getBackedgeValue()});
1723
1724 FOR->replaceAllUsesWith(New: RecurSplice);
1725 // Set the first operand of RecurSplice to FOR again, after replacing
1726 // all users.
1727 RecurSplice->setOperand(I: 0, New: FOR);
1728 }
1729 return true;
1730}
1731
1732void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
1733 for (VPRecipeBase &R :
1734 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
1735 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
1736 if (!PhiR)
1737 continue;
1738 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
1739 RecurKind RK = RdxDesc.getRecurrenceKind();
1740 if (RK != RecurKind::Add && RK != RecurKind::Mul)
1741 continue;
1742
1743 for (VPUser *U : collectUsersRecursively(V: PhiR))
1744 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: U)) {
1745 RecWithFlags->dropPoisonGeneratingFlags();
1746 }
1747 }
1748}
1749
1750/// Move loop-invariant recipes out of the vector loop region in \p Plan.
1751static void licm(VPlan &Plan) {
1752 VPBasicBlock *Preheader = Plan.getVectorPreheader();
1753
1754 // Return true if we do not know how to (mechanically) hoist a given recipe
1755 // out of a loop region. Does not address legality concerns such as aliasing
1756 // or speculation safety.
1757 auto CannotHoistRecipe = [](VPRecipeBase &R) {
1758 // Allocas cannot be hoisted.
1759 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
1760 return RepR && RepR->getOpcode() == Instruction::Alloca;
1761 };
1762
  // Hoist any loop-invariant recipes from the vector loop region to the
  // preheader. Perform a shallow traversal of the vector loop region, to
  // exclude recipes in replicate regions.
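  // For example (an illustrative sketch with assumed names), a recipe such as
  //   %splat = broadcast ir<%invariant>
  // whose operands are all defined outside the loop region is moved from the
  // loop body to the vector preheader.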
1766 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1767 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1768 Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) {
1769 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
1770 if (CannotHoistRecipe(R))
1771 continue;
1772 // TODO: Relax checks in the future, e.g. we could also hoist reads, if
1773 // their memory location is not modified in the vector loop.
1774 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
1775 any_of(Range: R.operands(), P: [](VPValue *Op) {
1776 return !Op->isDefinedOutsideLoopRegions();
1777 }))
1778 continue;
1779 R.moveBefore(BB&: *Preheader, I: Preheader->end());
1780 }
1781 }
1782}
1783
1784void VPlanTransforms::truncateToMinimalBitwidths(
1785 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
  // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make other
  // uses have different types for their operands, making them invalidly
  // typed.
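  // As an illustrative sketch (types assumed): if MinBWs records that an i32
  // add only needs 8 bits, its operands are truncated to i8, the add executes
  // in i8, and a zext back to i32 is inserted so existing users keep their
  // original type.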
1790 DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
1791 Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
1792 VPTypeAnalysis TypeInfo(CanonicalIVType);
1793 VPBasicBlock *PH = Plan.getVectorPreheader();
1794 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1795 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
1796 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
1797 if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
1798 VPWidenSelectRecipe, VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(
1799 Val: &R))
1800 continue;
1801
1802 VPValue *ResultVPV = R.getVPSingleValue();
1803 auto *UI = cast_or_null<Instruction>(Val: ResultVPV->getUnderlyingValue());
1804 unsigned NewResSizeInBits = MinBWs.lookup(Key: UI);
1805 if (!NewResSizeInBits)
1806 continue;
1807
      // If the value wasn't vectorized, we must maintain the original scalar
      // type. Skip those here. Also skip casts, which do not need to be
      // handled explicitly here, as redundant casts will be removed during
      // recipe simplification.
1812 if (isa<VPReplicateRecipe, VPWidenCastRecipe>(Val: &R))
1813 continue;
1814
1815 Type *OldResTy = TypeInfo.inferScalarType(V: ResultVPV);
1816 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
1817 assert(OldResTy->isIntegerTy() && "only integer types supported");
1818 (void)OldResSizeInBits;
1819
1820 LLVMContext &Ctx = CanonicalIVType->getContext();
1821 auto *NewResTy = IntegerType::get(C&: Ctx, NumBits: NewResSizeInBits);
1822
1823 // Any wrapping introduced by shrinking this operation shouldn't be
1824 // considered undefined behavior. So, we can't unconditionally copy
1825 // arithmetic wrapping flags to VPW.
1826 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(Val: &R))
1827 VPW->dropPoisonGeneratingFlags();
1828
1829 using namespace llvm::VPlanPatternMatch;
1830 if (OldResSizeInBits != NewResSizeInBits &&
1831 !match(V: &R, P: m_Binary<Instruction::ICmp>(Op0: m_VPValue(), Op1: m_VPValue()))) {
1832 // Extend result to original width.
1833 auto *Ext =
1834 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
1835 Ext->insertAfter(InsertPos: &R);
1836 ResultVPV->replaceAllUsesWith(New: Ext);
1837 Ext->setOperand(I: 0, New: ResultVPV);
1838 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
1839 } else {
1840 assert(
1841 match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
1842 "Only ICmps should not need extending the result.");
1843 }
1844
1845 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
1846 if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R))
1847 continue;
1848
1849 // Shrink operands by introducing truncates as needed.
1850 unsigned StartIdx = isa<VPWidenSelectRecipe>(Val: &R) ? 1 : 0;
1851 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
1852 auto *Op = R.getOperand(N: Idx);
1853 unsigned OpSizeInBits =
1854 TypeInfo.inferScalarType(V: Op)->getScalarSizeInBits();
1855 if (OpSizeInBits == NewResSizeInBits)
1856 continue;
1857 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
1858 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Key: Op);
1859 VPWidenCastRecipe *NewOp =
1860 IterIsEmpty
1861 ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
1862 : ProcessedIter->second;
1863 R.setOperand(I: Idx, New: NewOp);
1864 if (!IterIsEmpty)
1865 continue;
1866 ProcessedIter->second = NewOp;
1867 if (!Op->isLiveIn()) {
1868 NewOp->insertBefore(InsertPos: &R);
1869 } else {
1870 PH->appendRecipe(Recipe: NewOp);
1871 }
1872 }
1873
1874 }
1875 }
1876}
1877
/// Remove BranchOnCond recipes with true or false conditions and remove the
/// resulting dead edges to their successors.
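/// For example (illustrative): a block ending in BranchOnCond true keeps only
/// its first successor; the edge to the second successor is removed, together
/// with the corresponding incoming values of that successor's phis.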
1880static void removeBranchOnConst(VPlan &Plan) {
1881 using namespace llvm::VPlanPatternMatch;
1882 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1883 Range: vp_depth_first_shallow(G: Plan.getEntry()))) {
1884 VPValue *Cond;
1885 if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
1886 !match(V: &VPBB->back(), P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))))
1887 continue;
1888
1889 unsigned RemovedIdx;
1890 if (match(V: Cond, P: m_True()))
1891 RemovedIdx = 1;
1892 else if (match(V: Cond, P: m_False()))
1893 RemovedIdx = 0;
1894 else
1895 continue;
1896
1897 VPBasicBlock *RemovedSucc =
1898 cast<VPBasicBlock>(Val: VPBB->getSuccessors()[RemovedIdx]);
1899 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
1900 "There must be a single edge between VPBB and its successor");
    // Values coming from VPBB into phi recipes of RemovedSucc are removed
    // from these recipes.
1903 for (VPRecipeBase &R : RemovedSucc->phis()) {
1904 auto *Phi = cast<VPPhiAccessors>(Val: &R);
1905 assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) &&
1906 "VPIRPhis must have a single predecessor");
1907 Phi->removeIncomingValueFor(IncomingBlock: VPBB);
1908 }
1909 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
1910 // automatically on VPlan destruction if it becomes unreachable.
1911 VPBlockUtils::disconnectBlocks(From: VPBB, To: RemovedSucc);
1912 VPBB->back().eraseFromParent();
1913 }
1914}
1915
1916void VPlanTransforms::optimize(VPlan &Plan) {
1917 runPass(Fn: removeRedundantCanonicalIVs, Plan);
1918 runPass(Fn: removeRedundantInductionCasts, Plan);
1919
1920 runPass(Fn: simplifyRecipes, Plan, Args&: *Plan.getCanonicalIV()->getScalarType());
1921 runPass(Fn: simplifyBlends, Plan);
1922 runPass(Fn: removeDeadRecipes, Plan);
1923 runPass(Fn: narrowToSingleScalarRecipes, Plan);
1924 runPass(Fn: legalizeAndOptimizeInductions, Plan);
1925 runPass(Fn: removeRedundantExpandSCEVRecipes, Plan);
1926 runPass(Fn: simplifyRecipes, Plan, Args&: *Plan.getCanonicalIV()->getScalarType());
1927 runPass(Fn: removeBranchOnConst, Plan);
1928 runPass(Fn: removeDeadRecipes, Plan);
1929
1930 runPass(Fn: createAndOptimizeReplicateRegions, Plan);
1931 runPass(Transform: mergeBlocksIntoPredecessors, Plan);
1932 runPass(Fn: licm, Plan);
1933}
1934
1935// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
1936// the loop terminator with a branch-on-cond recipe with the negated
1937// active-lane-mask as operand. Note that this turns the loop into an
// uncountable one. Only the existing terminator is replaced; all other
// existing recipes/users remain unchanged, except for poison-generating flags
// being dropped from the canonical IV increment. Return the created
// VPActiveLaneMaskPHIRecipe.
1942//
1943// The function uses the following definitions:
1944//
1945// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
1946// calculate-trip-count-minus-VF (original TC) : original TC
1947// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
1948// CanonicalIVPhi : CanonicalIVIncrement
1949// %StartV is the canonical induction start value.
1950//
1951// The function adds the following recipes:
1952//
1953// vector.ph:
1954// %TripCount = calculate-trip-count-minus-VF (original TC)
1955// [if DataWithControlFlowWithoutRuntimeCheck]
1956// %EntryInc = canonical-iv-increment-for-part %StartV
1957// %EntryALM = active-lane-mask %EntryInc, %TripCount
1958//
1959// vector.body:
1960// ...
1961// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
1962// ...
1963// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
1964// %ALM = active-lane-mask %InLoopInc, TripCount
1965// %Negated = Not %ALM
1966// branch-on-cond %Negated
1967//
1968static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
1969 VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
1970 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
1971 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
1972 auto *CanonicalIVPHI = Plan.getCanonicalIV();
1973 VPValue *StartV = CanonicalIVPHI->getStartValue();
1974
1975 auto *CanonicalIVIncrement =
1976 cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
1977 // TODO: Check if dropping the flags is needed if
1978 // !DataAndControlFlowWithoutRuntimeCheck.
1979 CanonicalIVIncrement->dropPoisonGeneratingFlags();
1980 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
1981 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
1982 // we have to take unrolling into account. Each part needs to start at
1983 // Part * VF
1984 auto *VecPreheader = Plan.getVectorPreheader();
1985 VPBuilder Builder(VecPreheader);
1986
1987 // Create the ActiveLaneMask instruction using the correct start values.
1988 VPValue *TC = Plan.getTripCount();
1989
1990 VPValue *TripCount, *IncrementValue;
1991 if (!DataAndControlFlowWithoutRuntimeCheck) {
    // When the loop is guarded by a runtime overflow check for the loop
    // induction variable increment by VF, we can increment the value before
    // the get.active.lane.mask and use the unmodified trip count.
1995 IncrementValue = CanonicalIVIncrement;
1996 TripCount = TC;
1997 } else {
1998 // When avoiding a runtime check, the active.lane.mask inside the loop
1999 // uses a modified trip count and the induction variable increment is
2000 // done after the active.lane.mask intrinsic is called.
2001 IncrementValue = CanonicalIVPHI;
2002 TripCount = Builder.createNaryOp(Opcode: VPInstruction::CalculateTripCountMinusVF,
2003 Operands: {TC}, DL);
2004 }
2005 auto *EntryIncrement = Builder.createOverflowingOp(
2006 Opcode: VPInstruction::CanonicalIVIncrementForPart, Operands: {StartV}, WrapFlags: {false, false}, DL,
2007 Name: "index.part.next");
2008
2009 // Create the active lane mask instruction in the VPlan preheader.
2010 auto *EntryALM =
2011 Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, Operands: {EntryIncrement, TC},
2012 DL, Name: "active.lane.mask.entry");
2013
2014 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2015 // preheader ActiveLaneMask instruction.
2016 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
2017 LaneMaskPhi->insertAfter(InsertPos: CanonicalIVPHI);
2018
2019 // Create the active lane mask for the next iteration of the loop before the
2020 // original terminator.
2021 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2022 Builder.setInsertPoint(OriginalTerminator);
2023 auto *InLoopIncrement =
2024 Builder.createOverflowingOp(Opcode: VPInstruction::CanonicalIVIncrementForPart,
2025 Operands: {IncrementValue}, WrapFlags: {false, false}, DL);
2026 auto *ALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2027 Operands: {InLoopIncrement, TripCount}, DL,
2028 Name: "active.lane.mask.next");
2029 LaneMaskPhi->addOperand(Operand: ALM);
2030
2031 // Replace the original terminator with BranchOnCond. We have to invert the
2032 // mask here because a true condition means jumping to the exit block.
2033 auto *NotMask = Builder.createNot(Operand: ALM, DL);
2034 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {NotMask}, DL);
2035 OriginalTerminator->eraseFromParent();
2036 return LaneMaskPhi;
2037}
2038
2039/// Collect all VPValues representing a header mask through the (ICMP_ULE,
2040/// WideCanonicalIV, backedge-taken-count) pattern.
2041/// TODO: Introduce explicit recipe for header-mask instead of searching
2042/// for the header-mask pattern manually.
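/// An illustrative example of the pattern being matched (names assumed):
///   %wide.iv = WIDEN-CANONICAL-INDUCTION %canonical.iv
///   %mask    = icmp ule %wide.iv, %backedge.taken.count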
2043static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
2044 SmallVector<VPValue *> WideCanonicalIVs;
2045 auto *FoundWidenCanonicalIVUser =
2046 find_if(Range: Plan.getCanonicalIV()->users(),
2047 P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); });
2048 assert(count_if(Plan.getCanonicalIV()->users(),
2049 [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
2050 1 &&
         "Must have at most one VPWidenCanonicalIVRecipe");
2052 if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
2053 auto *WideCanonicalIV =
2054 cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser);
2055 WideCanonicalIVs.push_back(Elt: WideCanonicalIV);
2056 }
2057
2058 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2059 // version of the canonical induction.
2060 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2061 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2062 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
2063 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2064 WideCanonicalIVs.push_back(Elt: WidenOriginalIV);
2065 }
2066
  // Walk users of wide canonical IVs and collect all compares of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2069 SmallVector<VPValue *> HeaderMasks;
2070 for (auto *Wide : WideCanonicalIVs) {
2071 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2072 auto *HeaderMask = dyn_cast<VPInstruction>(Val: U);
2073 if (!HeaderMask || !vputils::isHeaderMask(V: HeaderMask, Plan))
2074 continue;
2075
2076 assert(HeaderMask->getOperand(0) == Wide &&
2077 "WidenCanonicalIV must be the first operand of the compare");
2078 HeaderMasks.push_back(Elt: HeaderMask);
2079 }
2080 }
2081 return HeaderMasks;
2082}
2083
2084void VPlanTransforms::addActiveLaneMask(
2085 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2086 bool DataAndControlFlowWithoutRuntimeCheck) {
2087 assert((!DataAndControlFlowWithoutRuntimeCheck ||
2088 UseActiveLaneMaskForControlFlow) &&
2089 "DataAndControlFlowWithoutRuntimeCheck implies "
2090 "UseActiveLaneMaskForControlFlow");
2091
2092 auto *FoundWidenCanonicalIVUser =
2093 find_if(Range: Plan.getCanonicalIV()->users(),
2094 P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); });
2095 assert(FoundWidenCanonicalIVUser &&
2096 "Must have widened canonical IV when tail folding!");
2097 auto *WideCanonicalIV =
2098 cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser);
2099 VPSingleDefRecipe *LaneMask;
2100 if (UseActiveLaneMaskForControlFlow) {
2101 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
2102 Plan, DataAndControlFlowWithoutRuntimeCheck);
2103 } else {
2104 VPBuilder B = VPBuilder::getToInsertAfter(R: WideCanonicalIV);
2105 LaneMask = B.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2106 Operands: {WideCanonicalIV, Plan.getTripCount()}, Inst: nullptr,
2107 Name: "active.lane.mask");
2108 }
2109
2110 // Walk users of WideCanonicalIV and replace all compares of the form
2111 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
2112 // active-lane-mask.
2113 for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
2114 HeaderMask->replaceAllUsesWith(New: LaneMask);
2115}
2116
2117/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2118/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2119/// recipe could be created.
2120/// \p HeaderMask Header Mask.
/// \p CurRecipe Recipe to be transformed.
2122/// \p TypeInfo VPlan-based type analysis.
2123/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
2124/// \p EVL The explicit vector length parameter of vector-predication
2125/// intrinsics.
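///
/// For example (an illustrative sketch; names assumed): a widened load masked
/// by logical-and(header-mask, %m) becomes an EVL-based load masked only by
/// %m, since the lanes covered by the header mask are handled via \p EVL.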
2126static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2127 VPRecipeBase &CurRecipe,
2128 VPTypeAnalysis &TypeInfo,
2129 VPValue &AllOneMask, VPValue &EVL) {
2130 using namespace llvm::VPlanPatternMatch;
2131 auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2132 assert(OrigMask && "Unmasked recipe when folding tail");
2133 // HeaderMask will be handled using EVL.
2134 VPValue *Mask;
2135 if (match(V: OrigMask, P: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: Mask))))
2136 return Mask;
2137 return HeaderMask == OrigMask ? nullptr : OrigMask;
2138 };
2139
2140 return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
2141 .Case<VPWidenLoadRecipe>(caseFn: [&](VPWidenLoadRecipe *L) {
2142 VPValue *NewMask = GetNewMask(L->getMask());
2143 return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
2144 })
2145 .Case<VPWidenStoreRecipe>(caseFn: [&](VPWidenStoreRecipe *S) {
2146 VPValue *NewMask = GetNewMask(S->getMask());
2147 return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
2148 })
2149 .Case<VPReductionRecipe>(caseFn: [&](VPReductionRecipe *Red) {
2150 VPValue *NewMask = GetNewMask(Red->getCondOp());
2151 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2152 })
2153 .Case<VPInstruction>(caseFn: [&](VPInstruction *VPI) -> VPRecipeBase * {
2154 VPValue *LHS, *RHS;
2155 // Transform select with a header mask condition
2156 // select(header_mask, LHS, RHS)
2157 // into vector predication merge.
2158 // vp.merge(all-true, LHS, RHS, EVL)
2159 if (!match(V: VPI, P: m_Select(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: LHS),
2160 Op2: m_VPValue(V&: RHS))))
2161 return nullptr;
2162 // Use all true as the condition because this transformation is
2163 // limited to selects whose condition is a header mask.
2164 return new VPWidenIntrinsicRecipe(
2165 Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2166 TypeInfo.inferScalarType(V: LHS), VPI->getDebugLoc());
2167 })
2168 .Default(defaultFn: [&](VPRecipeBase *R) { return nullptr; });
2169}
2170
2171/// Replace recipes with their EVL variants.
2172static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
2173 Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
2174 VPTypeAnalysis TypeInfo(CanonicalIVType);
2175 LLVMContext &Ctx = CanonicalIVType->getContext();
2176 VPValue *AllOneMask = Plan.getOrAddLiveIn(V: ConstantInt::getTrue(Context&: Ctx));
2177 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2178 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2179
2180 assert(all_of(Plan.getVF().users(),
2181 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
2182 VPWidenIntOrFpInductionRecipe>) &&
2183 "User of VF that we can't transform to EVL.");
2184 Plan.getVF().replaceAllUsesWith(New: &EVL);
2185
2186 // Defer erasing recipes till the end so that we don't invalidate the
2187 // VPTypeAnalysis cache.
2188 SmallVector<VPRecipeBase *> ToErase;
2189
  // Create a scalar phi to track the previous EVL if the plan contains a
  // fixed-order recurrence.
2192 bool ContainsFORs =
2193 any_of(Range: Header->phis(), P: IsaPred<VPFirstOrderRecurrencePHIRecipe>);
2194 if (ContainsFORs) {
2195 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2196 VPValue *MaxEVL = &Plan.getVF();
    // Emit a scalar cast in the preheader if VF is not a 32-bit integer.
2198 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2199 MaxEVL = Builder.createScalarZExtOrTrunc(Op: MaxEVL, ResultTy: Type::getInt32Ty(C&: Ctx),
2200 SrcTy: TypeInfo.inferScalarType(V: MaxEVL),
2201 DL: DebugLoc());
2202
2203 Builder.setInsertPoint(TheBB: Header, IP: Header->getFirstNonPhi());
2204 VPValue *PrevEVL =
2205 Builder.createScalarPhi(IncomingValues: {MaxEVL, &EVL}, DL: DebugLoc(), Name: "prev.evl");
2206
2207 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2208 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
2209 for (VPRecipeBase &R : *VPBB) {
2210 using namespace VPlanPatternMatch;
2211 VPValue *V1, *V2;
2212 if (!match(V: &R,
2213 P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
2214 Op0: m_VPValue(V&: V1), Op1: m_VPValue(V&: V2))))
2215 continue;
2216 VPValue *Imm = Plan.getOrAddLiveIn(
2217 V: ConstantInt::getSigned(Ty: Type::getInt32Ty(C&: Ctx), V: -1));
2218 VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
2219 Intrinsic::experimental_vp_splice,
2220 {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
2221 TypeInfo.inferScalarType(V: R.getVPSingleValue()), R.getDebugLoc());
2222 VPSplice->insertBefore(InsertPos: &R);
2223 R.getVPSingleValue()->replaceAllUsesWith(New: VPSplice);
2224 ToErase.push_back(Elt: &R);
2225 }
2226 }
2227 }
2228
2229 // Try to optimize header mask recipes away to their EVL variants.
2230 for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
2231 for (VPUser *U : collectUsersRecursively(V: HeaderMask)) {
2232 auto *CurRecipe = cast<VPRecipeBase>(Val: U);
2233 VPRecipeBase *EVLRecipe =
2234 optimizeMaskToEVL(HeaderMask, CurRecipe&: *CurRecipe, TypeInfo, AllOneMask&: *AllOneMask, EVL);
2235 if (!EVLRecipe)
2236 continue;
2237
2238 [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2239 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2240 "New recipe must define the same number of values as the "
2241 "original.");
2242 assert(
2243 NumDefVal <= 1 &&
2244 "Only supports recipes with a single definition or without users.");
2245 EVLRecipe->insertBefore(InsertPos: CurRecipe);
2246 if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(Val: EVLRecipe)) {
2247 VPValue *CurVPV = CurRecipe->getVPSingleValue();
2248 CurVPV->replaceAllUsesWith(New: EVLRecipe->getVPSingleValue());
2249 }
2250 ToErase.push_back(Elt: CurRecipe);
2251 }
2252 }
2253
2254 for (VPRecipeBase *R : reverse(C&: ToErase)) {
2255 SmallVector<VPValue *> PossiblyDead(R->operands());
2256 R->eraseFromParent();
2257 for (VPValue *Op : PossiblyDead)
2258 recursivelyDeleteDeadRecipes(V: Op);
2259 }
2260}
2261
/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
/// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
/// increment, with a VPEVLBasedIVPHIRecipe. After this transformation the
/// VPCanonicalIVPHIRecipe is only used for counting loop iterations.
2266///
2267/// The function uses the following definitions:
2268/// %StartV is the canonical induction start value.
2269///
2270/// The function adds the following recipes:
2271///
2272/// vector.ph:
2273/// ...
2274///
2275/// vector.body:
2276/// ...
2277/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2278/// [ %NextEVLIV, %vector.body ]
2279/// %AVL = sub original TC, %EVLPhi
2280/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2281/// ...
/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
2283/// ...
2284///
2285/// If MaxSafeElements is provided, the function adds the following recipes:
2286/// vector.ph:
2287/// ...
2288///
2289/// vector.body:
2290/// ...
2291/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2292/// [ %NextEVLIV, %vector.body ]
2293/// %AVL = sub original TC, %EVLPhi
2294/// %cmp = cmp ult %AVL, MaxSafeElements
2295/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2296/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2297/// ...
2298/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
2299/// ...
2300///
2301bool VPlanTransforms::tryAddExplicitVectorLength(
2302 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2303 VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  // The transform updates all users of inductions to work based on EVL,
  // instead of the VF directly. At the moment, widened pointer inductions
  // cannot be updated, so bail out if the plan contains any.
2307 bool ContainsWidenPointerInductions =
2308 any_of(Range: Header->phis(), P: IsaPred<VPWidenPointerInductionRecipe>);
2309 if (ContainsWidenPointerInductions)
2310 return false;
2311
2312 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2313 auto *CanIVTy = CanonicalIVPHI->getScalarType();
2314 VPValue *StartV = CanonicalIVPHI->getStartValue();
2315
2316 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2317 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
2318 EVLPhi->insertAfter(InsertPos: CanonicalIVPHI);
2319 VPBuilder Builder(Header, Header->getFirstNonPhi());
2320 // Compute original TC - IV as the AVL (application vector length).
2321 VPValue *AVL = Builder.createNaryOp(
2322 Opcode: Instruction::Sub, Operands: {Plan.getTripCount(), EVLPhi}, DL: DebugLoc(), Name: "avl");
2323 if (MaxSafeElements) {
    // Clamp the AVL to MaxSafeElements, the maximum number of elements that
    // can safely be processed per iteration.
2325 VPValue *AVLSafe =
2326 Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: CanIVTy, V: *MaxSafeElements));
2327 VPValue *Cmp = Builder.createICmp(Pred: ICmpInst::ICMP_ULT, A: AVL, B: AVLSafe);
2328 AVL = Builder.createSelect(Cond: Cmp, TrueVal: AVL, FalseVal: AVLSafe, DL: DebugLoc(), Name: "safe_avl");
2329 }
2330 auto *VPEVL = Builder.createNaryOp(Opcode: VPInstruction::ExplicitVectorLength, Operands: AVL,
2331 DL: DebugLoc());
2332
2333 auto *CanonicalIVIncrement =
2334 cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
2335 Builder.setInsertPoint(CanonicalIVIncrement);
2336 VPValue *OpVPEVL = VPEVL;
2337
2338 auto *I32Ty = Type::getInt32Ty(C&: CanIVTy->getContext());
2339 OpVPEVL = Builder.createScalarZExtOrTrunc(
2340 Op: OpVPEVL, ResultTy: CanIVTy, SrcTy: I32Ty, DL: CanonicalIVIncrement->getDebugLoc());
2341
2342 auto *NextEVLIV = Builder.createOverflowingOp(
2343 Opcode: Instruction::Add, Operands: {OpVPEVL, EVLPhi},
2344 WrapFlags: {CanonicalIVIncrement->hasNoUnsignedWrap(),
2345 CanonicalIVIncrement->hasNoSignedWrap()},
2346 DL: CanonicalIVIncrement->getDebugLoc(), Name: "index.evl.next");
2347 EVLPhi->addOperand(Operand: NextEVLIV);
2348
2349 transformRecipestoEVLRecipes(Plan, EVL&: *VPEVL);
2350
2351 // Replace all uses of VPCanonicalIVPHIRecipe by
2352 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2353 CanonicalIVPHI->replaceAllUsesWith(New: EVLPhi);
2354 CanonicalIVIncrement->setOperand(I: 0, New: CanonicalIVPHI);
2355 // TODO: support unroll factor > 1.
2356 Plan.setUF(1);
2357 return true;
2358}
2359
2360void VPlanTransforms::dropPoisonGeneratingRecipes(
2361 VPlan &Plan,
2362 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2363 // Collect recipes in the backward slice of `Root` that may generate a poison
2364 // value that is used after vectorization.
2365 SmallPtrSet<VPRecipeBase *, 16> Visited;
2366 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2367 SmallVector<VPRecipeBase *, 16> Worklist;
2368 Worklist.push_back(Elt: Root);
2369
2370 // Traverse the backward slice of Root through its use-def chain.
2371 while (!Worklist.empty()) {
2372 VPRecipeBase *CurRec = Worklist.pop_back_val();
2373
2374 if (!Visited.insert(Ptr: CurRec).second)
2375 continue;
2376
2377 // Prune search if we find another recipe generating a widen memory
2378 // instruction. Widen memory instructions involved in address computation
2379 // will lead to gather/scatter instructions, which don't need to be
2380 // handled.
2381 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
2382 VPHeaderPHIRecipe>(Val: CurRec))
2383 continue;
2384
2385 // This recipe contributes to the address computation of a widen
2386 // load/store. If the underlying instruction has poison-generating flags,
2387 // drop them directly.
2388 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: CurRec)) {
2389 VPValue *A, *B;
2390 using namespace llvm::VPlanPatternMatch;
2391 // Dropping disjoint from an OR may yield incorrect results, as some
2392 // analysis may have converted it to an Add implicitly (e.g. SCEV used
2393 // for dependence analysis). Instead, replace it with an equivalent Add.
2394 // This is possible as all users of the disjoint OR only access lanes
2395 // where the operands are disjoint or poison otherwise.
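        // For example (illustrative): "or disjoint %a, %b" is rewritten to
        // "add %a, %b", which produces the same result on the lanes that are
        // actually used.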
2396 if (match(V: RecWithFlags, P: m_BinaryOr(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))) &&
2397 RecWithFlags->isDisjoint()) {
2398 VPBuilder Builder(RecWithFlags);
2399 VPInstruction *New = Builder.createOverflowingOp(
2400 Opcode: Instruction::Add, Operands: {A, B}, WrapFlags: {false, false},
2401 DL: RecWithFlags->getDebugLoc());
2402 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
2403 RecWithFlags->replaceAllUsesWith(New);
2404 RecWithFlags->eraseFromParent();
2405 CurRec = New;
2406 } else
2407 RecWithFlags->dropPoisonGeneratingFlags();
2408 } else {
2409 Instruction *Instr = dyn_cast_or_null<Instruction>(
2410 Val: CurRec->getVPSingleValue()->getUnderlyingValue());
2411 (void)Instr;
2412 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
2413 "found instruction with poison generating flags not covered by "
2414 "VPRecipeWithIRFlags");
2415 }
2416
2417 // Add new definitions to the worklist.
2418 for (VPValue *Operand : CurRec->operands())
2419 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
2420 Worklist.push_back(Elt: OpDef);
2421 }
2422 });
2423
  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryRecipe or VPInterleaveRecipe.
2427 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
2428 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
2429 for (VPRecipeBase &Recipe : *VPBB) {
2430 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(Val: &Recipe)) {
2431 Instruction &UnderlyingInstr = WidenRec->getIngredient();
2432 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
2433 if (AddrDef && WidenRec->isConsecutive() &&
2434 BlockNeedsPredication(UnderlyingInstr.getParent()))
2435 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2436 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(Val: &Recipe)) {
2437 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
2438 if (AddrDef) {
2439 // Check if any member of the interleave group needs predication.
2440 const InterleaveGroup<Instruction> *InterGroup =
2441 InterleaveRec->getInterleaveGroup();
2442 bool NeedPredication = false;
2443 for (int I = 0, NumMembers = InterGroup->getNumMembers();
2444 I < NumMembers; ++I) {
2445 Instruction *Member = InterGroup->getMember(Index: I);
2446 if (Member)
2447 NeedPredication |= BlockNeedsPredication(Member->getParent());
2448 }
2449
2450 if (NeedPredication)
2451 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2452 }
2453 }
2454 }
2455 }
2456}
2457
2458void VPlanTransforms::createInterleaveGroups(
2459 VPlan &Plan,
2460 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
2461 &InterleaveGroups,
2462 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
2463 if (InterleaveGroups.empty())
2464 return;
2465
2466 // Interleave memory: for each Interleave Group we marked earlier as relevant
2467 // for this VPlan, replace the Recipes widening its memory instructions with a
2468 // single VPInterleaveRecipe at its insertion point.
2469 VPDominatorTree VPDT;
2470 VPDT.recalculate(Func&: Plan);
2471 for (const auto *IG : InterleaveGroups) {
2472 SmallVector<VPValue *, 4> StoredValues;
2473 for (unsigned i = 0; i < IG->getFactor(); ++i)
2474 if (auto *SI = dyn_cast_or_null<StoreInst>(Val: IG->getMember(Index: i))) {
2475 auto *StoreR = cast<VPWidenStoreRecipe>(Val: RecipeBuilder.getRecipe(I: SI));
2476 StoredValues.push_back(Elt: StoreR->getStoredValue());
2477 }
2478
2479 bool NeedsMaskForGaps =
2480 IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
2481
2482 Instruction *IRInsertPos = IG->getInsertPos();
2483 auto *InsertPos =
2484 cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IRInsertPos));
2485
2486 bool InBounds = false;
2487 if (auto *Gep = dyn_cast<GetElementPtrInst>(
2488 Val: getLoadStorePointerOperand(V: IRInsertPos)->stripPointerCasts()))
2489 InBounds = Gep->isInBounds();
2490
2491 // Get or create the start address for the interleave group.
2492 auto *Start =
2493 cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getMember(Index: 0)));
2494 VPValue *Addr = Start->getAddr();
2495 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
2496 if (AddrDef && !VPDT.properlyDominates(A: AddrDef, B: InsertPos)) {
2497 // We cannot re-use the address of member zero because it does not
2498 // dominate the insert position. Instead, use the address of the insert
2499 // position and create a PtrAdd adjusting it to the address of member
2500 // zero.
2501 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
2502 // InsertPos or sink loads above zero members to join it.
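      // For example (an illustrative sketch with assumed values): if the
      // insert position is member 2 of a group of i32 accesses, Offset is
      // 2 * 4 = 8 bytes, and the group address is the insert position's
      // address minus 8 bytes.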
2503 assert(IG->getIndex(IRInsertPos) != 0 &&
2504 "index of insert position shouldn't be zero");
2505 auto &DL = IRInsertPos->getDataLayout();
2506 APInt Offset(32,
2507 DL.getTypeAllocSize(Ty: getLoadStoreType(I: IRInsertPos)) *
2508 IG->getIndex(Instr: IRInsertPos),
2509 /*IsSigned=*/true);
2510 VPValue *OffsetVPV = Plan.getOrAddLiveIn(
2511 V: ConstantInt::get(Context&: IRInsertPos->getParent()->getContext(), V: -Offset));
2512 VPBuilder B(InsertPos);
2513 Addr = InBounds ? B.createInBoundsPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV)
2514 : B.createPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV);
2515 }
2516 // If the group is reverse, adjust the index to refer to the last vector
2517 // lane instead of the first. We adjust the index from the first vector
2518 // lane, rather than directly getting the pointer for lane VF - 1, because
2519 // the pointer operand of the interleaved access is supposed to be uniform.
2520 if (IG->isReverse()) {
2521 auto *ReversePtr = new VPVectorEndPointerRecipe(
2522 Addr, &Plan.getVF(), getLoadStoreType(I: IRInsertPos),
2523 -(int64_t)IG->getFactor(),
2524 InBounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(),
2525 InsertPos->getDebugLoc());
2526 ReversePtr->insertBefore(InsertPos);
2527 Addr = ReversePtr;
2528 }
2529 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
2530 InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc());
2531 VPIG->insertBefore(InsertPos);
2532
2533 unsigned J = 0;
2534 for (unsigned i = 0; i < IG->getFactor(); ++i)
2535 if (Instruction *Member = IG->getMember(Index: i)) {
2536 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member);
2537 if (!Member->getType()->isVoidTy()) {
2538 VPValue *OriginalV = MemberR->getVPSingleValue();
2539 OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J));
2540 J++;
2541 }
2542 MemberR->eraseFromParent();
2543 }
2544 }
2545}
2546
2547/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
2548/// value, phi and backedge value. In the following example:
2549///
2550/// vector.ph:
2551/// Successor(s): vector loop
2552///
2553/// <x1> vector loop: {
2554/// vector.body:
2555/// WIDEN-INDUCTION %i = phi %start, %step, %vf
2556/// ...
2557/// EMIT branch-on-count ...
2558/// No successors
2559/// }
2560///
2561/// WIDEN-INDUCTION will get expanded to:
2562///
2563/// vector.ph:
2564/// ...
2565/// vp<%induction.start> = ...
2566/// vp<%induction.increment> = ...
2567///
2568/// Successor(s): vector loop
2569///
2570/// <x1> vector loop: {
2571/// vector.body:
2572/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
2573/// ...
2574/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
2575/// EMIT branch-on-count ...
2576/// No successors
2577/// }
2578static void
2579expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
2580 VPTypeAnalysis &TypeInfo) {
2581 VPlan *Plan = WidenIVR->getParent()->getPlan();
2582 VPValue *Start = WidenIVR->getStartValue();
2583 VPValue *Step = WidenIVR->getStepValue();
2584 VPValue *VF = WidenIVR->getVFValue();
2585 DebugLoc DL = WidenIVR->getDebugLoc();
2586
  // The scalar type of the value from the original loop to which we are
  // mapping the new induction variable.
2589 Type *Ty = TypeInfo.inferScalarType(V: WidenIVR);
2590
2591 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
2592 Instruction::BinaryOps AddOp;
2593 Instruction::BinaryOps MulOp;
2594 // FIXME: The newly created binary instructions should contain nsw/nuw
2595 // flags, which can be found from the original scalar operations.
2596 VPIRFlags Flags;
2597 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
2598 AddOp = Instruction::Add;
2599 MulOp = Instruction::Mul;
2600 } else {
2601 AddOp = ID.getInductionOpcode();
2602 MulOp = Instruction::FMul;
2603 Flags = ID.getInductionBinOp()->getFastMathFlags();
2604 }
2605
2606 // If the phi is truncated, truncate the start and step values.
2607 VPBuilder Builder(Plan->getVectorPreheader());
2608 Type *StepTy = TypeInfo.inferScalarType(V: Step);
2609 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
2610 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
2611 Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy: Ty, DL);
2612 Start = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Start, ResultTy: Ty, DL);
2613 StepTy = Ty;
2614 }
2615
2616 // Construct the initial value of the vector IV in the vector loop preheader.
2617 Type *IVIntTy =
2618 IntegerType::get(C&: StepTy->getContext(), NumBits: StepTy->getScalarSizeInBits());
2619 VPValue *Init = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: IVIntTy);
2620 if (StepTy->isFloatingPointTy())
2621 Init = Builder.createWidenCast(Opcode: Instruction::UIToFP, Op: Init, ResultTy: StepTy);
2622
2623 VPValue *SplatStart = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Start);
2624 VPValue *SplatStep = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Step);
2625
2626 Init = Builder.createNaryOp(Opcode: MulOp, Operands: {Init, SplatStep}, Flags);
2627 Init =
2628 Builder.createNaryOp(Opcode: AddOp, Operands: {SplatStart, Init}, Flags, DL: {}, Name: "induction");
2629
2630 // Create the widened phi of the vector IV.
2631 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
2632 WidenIVR->getDebugLoc(), "vec.ind");
2633 WidePHI->addOperand(Operand: Init);
2634 WidePHI->insertBefore(InsertPos: WidenIVR);
2635
2636 // Create the backedge value for the vector IV.
2637 VPValue *Inc;
2638 VPValue *Prev;
2639 // If unrolled, use the increment and prev value from the operands.
2640 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
2641 Inc = SplatVF;
2642 Prev = WidenIVR->getLastUnrolledPartOperand();
2643 } else {
2644 if (VPRecipeBase *R = VF->getDefiningRecipe())
2645 Builder.setInsertPoint(TheBB: R->getParent(), IP: std::next(x: R->getIterator()));
2646 // Multiply the vectorization factor by the step using integer or
2647 // floating-point arithmetic as appropriate.
2648 if (StepTy->isFloatingPointTy())
2649 VF = Builder.createScalarCast(Opcode: Instruction::CastOps::UIToFP, Op: VF, ResultTy: StepTy,
2650 DL);
2651 else
2652 VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy,
2653 SrcTy: TypeInfo.inferScalarType(V: VF), DL);
2654
2655 Inc = Builder.createNaryOp(Opcode: MulOp, Operands: {Step, VF}, Flags);
2656 Inc = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Inc);
2657 Prev = WidePHI;
2658 }
2659
2660 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
2661 Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator());
2662 auto *Next = Builder.createNaryOp(Opcode: AddOp, Operands: {Prev, Inc}, Flags,
2663 DL: WidenIVR->getDebugLoc(), Name: "vec.ind.next");
2664
2665 WidePHI->addOperand(Operand: Next);
2666
2667 WidenIVR->replaceAllUsesWith(New: WidePHI);
2668}
2669
2670void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
  // Replace loop regions with explicit CFG.
2672 SmallVector<VPRegionBlock *> LoopRegions;
2673 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
2674 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
2675 if (!R->isReplicator())
2676 LoopRegions.push_back(Elt: R);
2677 }
2678 for (VPRegionBlock *R : LoopRegions)
2679 R->dissolveToCFGLoop();
2680}
2681
2682void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
2683 Type &CanonicalIVTy) {
2684 using namespace llvm::VPlanPatternMatch;
2685
2686 VPTypeAnalysis TypeInfo(&CanonicalIVTy);
2687 SmallVector<VPRecipeBase *> ToRemove;
2688 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2689 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
2690 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
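// Lower EVL-based IV phis to plain scalar phis.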
2691 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(Val: &R)) {
2692 auto *ScalarR = VPBuilder(PhiR).createScalarPhi(
2693 IncomingValues: {PhiR->getStartValue(), PhiR->getBackedgeValue()},
2694 DL: PhiR->getDebugLoc(), Name: "evl.based.iv");
2695 PhiR->replaceAllUsesWith(New: ScalarR);
2696 ToRemove.push_back(Elt: PhiR);
2697 continue;
2698 }
2699
2700 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R)) {
2701 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
2702 ToRemove.push_back(Elt: WidenIVR);
2703 continue;
2704 }
2705
2706 if (auto *Expr = dyn_cast<VPExpressionRecipe>(Val: &R)) {
2707 Expr->decompose();
2708 ToRemove.push_back(Elt: Expr);
2709 }
2710
2711 VPValue *VectorStep;
2712 VPValue *ScalarStep;
2713 if (!match(V: &R, P: m_VPInstruction<VPInstruction::WideIVStep>(
2714 Op0: m_VPValue(V&: VectorStep), Op1: m_VPValue(V&: ScalarStep))))
2715 continue;
2716
2717 // Expand WideIVStep.
2718 auto *VPI = cast<VPInstruction>(Val: &R);
2719 VPBuilder Builder(VPI);
2720 Type *IVTy = TypeInfo.inferScalarType(V: VPI);
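// Convert the vector step to the IV type if needed: UIToFP for FP IVs, Trunc
// otherwise.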
2721 if (TypeInfo.inferScalarType(V: VectorStep) != IVTy) {
2722 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
2723 ? Instruction::UIToFP
2724 : Instruction::Trunc;
2725 VectorStep = Builder.createWidenCast(Opcode: CastOp, Op: VectorStep, ResultTy: IVTy);
2726 }
2727
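// A constant scalar step of 1 is expected to have been simplified away
// earlier.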
2728 [[maybe_unused]] auto *ConstStep =
2729 ScalarStep->isLiveIn()
2730 ? dyn_cast<ConstantInt>(Val: ScalarStep->getLiveInIRValue())
2731 : nullptr;
2732 assert(!ConstStep || ConstStep->getValue() != 1);
2733 (void)ConstStep;
2734 if (TypeInfo.inferScalarType(V: ScalarStep) != IVTy) {
2735 ScalarStep =
2736 Builder.createWidenCast(Opcode: Instruction::Trunc, Op: ScalarStep, ResultTy: IVTy);
2737 }
2738
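// Only floating-point inductions carry fast-math flags.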
2739 VPIRFlags Flags;
2740 if (IVTy->isFloatingPointTy())
2741 Flags = {VPI->getFastMathFlags()};
2742
2743 unsigned MulOpc =
2744 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
2745 VPInstruction *Mul = Builder.createNaryOp(
2746 Opcode: MulOpc, Operands: {VectorStep, ScalarStep}, Flags, DL: R.getDebugLoc());
2747 VectorStep = Mul;
2748 VPI->replaceAllUsesWith(New: VectorStep);
2749 ToRemove.push_back(Elt: VPI);
2750 }
2751 }
2752
2753 for (VPRecipeBase *R : ToRemove)
2754 R->eraseFromParent();
2755}
2756
2757void VPlanTransforms::handleUncountableEarlyExit(
2758 VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan,
2759 VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) {
2760 using namespace llvm::VPlanPatternMatch;
2761
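// The first successor of the latch is the middle block.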
2762 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
2763 if (!EarlyExitVPBB->getSinglePredecessor() &&
2764 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
2765 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
2766 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
2767 "unsupported early exit VPBB");
2768 // The early exit operand should always be the last phi operand. If
2769 // EarlyExitVPBB has two predecessors and EarlyExitingVPBB is the first, swap
2770 // the operands of the phis.
2771 for (VPRecipeBase &R : EarlyExitVPBB->phis())
2772 cast<VPIRPhi>(Val: &R)->swapOperands();
2773 }
2774
2775 VPBuilder Builder(LatchVPBB->getTerminator());
2776 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
2777 assert(
2778 match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
2779 "Terminator must be be BranchOnCond");
2780 VPValue *CondOfEarlyExitingVPBB =
2781 EarlyExitingVPBB->getTerminator()->getOperand(N: 0);
2782 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
2783 ? CondOfEarlyExitingVPBB
2784 : Builder.createNot(Operand: CondOfEarlyExitingVPBB);
2785
2786 // Split the middle block and have it conditionally branch to the early exit
2787 // block if CondToEarlyExit.
2788 VPValue *IsEarlyExitTaken =
2789 Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {CondToEarlyExit});
2790 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock(Name: "middle.split");
2791 VPBasicBlock *VectorEarlyExitVPBB =
2792 Plan.createVPBasicBlock(Name: "vector.early.exit");
2793 VPBlockUtils::insertOnEdge(From: LatchVPBB, To: MiddleVPBB, BlockPtr: NewMiddle);
2794 VPBlockUtils::connectBlocks(From: NewMiddle, To: VectorEarlyExitVPBB);
2795 NewMiddle->swapSuccessors();
2796
2797 VPBlockUtils::connectBlocks(From: VectorEarlyExitVPBB, To: EarlyExitVPBB);
2798
2799 // Update the exit phis in the early exit block.
2800 VPBuilder MiddleBuilder(NewMiddle);
2801 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
2802 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
2803 auto *ExitIRI = cast<VPIRPhi>(Val: &R);
2804 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
2805 // a single predecessor and 1 if it has two.
2806 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
2807 if (ExitIRI->getNumOperands() != 1) {
2808 // The first of two operands corresponds to the latch exit, via MiddleVPBB
2809 // predecessor. Extract its last lane.
2810 ExitIRI->extractLastLaneOfFirstOperand(Builder&: MiddleBuilder);
2811 }
2812
2813 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(N: EarlyExitIdx);
2814 auto IsVector = [](ElementCount VF) { return VF.isVector(); };
2815 // When the VFs in the range are vectors, an `extract` is needed to get the
2816 // incoming value from the early exit. If the range also contains the scalar
2817 // VF, clamp the range to the scalar VF to prevent mis-compiling a range that
2818 // mixes scalar and vector VFs.
2819 if (!IncomingFromEarlyExit->isLiveIn() &&
2820 LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsVector, Range)) {
2821 // Update the incoming value from the early exit.
2822 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
2823 Opcode: VPInstruction::FirstActiveLane, Operands: {CondToEarlyExit}, Inst: nullptr,
2824 Name: "first.active.lane");
2825 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
2826 Opcode: Instruction::ExtractElement, Operands: {IncomingFromEarlyExit, FirstActiveLane},
2827 Inst: nullptr, Name: "early.exit.value");
2828 ExitIRI->setOperand(I: EarlyExitIdx, New: IncomingFromEarlyExit);
2829 }
2830 }
2831 MiddleBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {IsEarlyExitTaken});
2832
2833 // Replace the condition controlling the non-early exit from the vector loop
2834 // with one that exits if either the original latch condition is true or the
2835 // early exit has been taken.
2836 auto *LatchExitingBranch = cast<VPInstruction>(Val: LatchVPBB->getTerminator());
2837 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
2838 "Unexpected terminator");
2839 auto *IsLatchExitTaken =
2840 Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: LatchExitingBranch->getOperand(N: 0),
2841 B: LatchExitingBranch->getOperand(N: 1));
2842 auto *AnyExitTaken = Builder.createNaryOp(
2843 Opcode: Instruction::Or, Operands: {IsEarlyExitTaken, IsLatchExitTaken});
2844 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: AnyExitTaken);
2845 LatchExitingBranch->eraseFromParent();
2846}
2847
2848 /// This function tries to convert extended in-loop reductions to
2849 /// VPExpressionRecipe and clamps \p Range if doing so is beneficial and
2850/// valid. The created recipe must be decomposed to its constituent
2851/// recipes before execution.
2852static VPExpressionRecipe *
2853tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
2854 VFRange &Range) {
2855 using namespace VPlanPatternMatch;
2856
2857 Type *RedTy = Ctx.Types.inferScalarType(V: Red);
2858 VPValue *VecOp = Red->getVecOp();
2859
2860 // Clamp the range if using an extended reduction is profitable.
2861 auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
2862 Type *SrcTy) -> bool {
2863 return LoopVectorizationPlanner::getDecisionAndClampRange(
2864 Predicate: [&](ElementCount VF) {
2865 auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
2866 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2867 InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
2868 Opcode, IsUnsigned: isZExt, ResTy: RedTy, Ty: SrcVecTy, FMF: Red->getFastMathFlags(),
2869 CostKind);
2870 InstructionCost ExtCost =
2871 cast<VPWidenCastRecipe>(Val: VecOp)->computeCost(VF, Ctx);
2872 InstructionCost RedCost = Red->computeCost(VF, Ctx);
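// Only use the extended reduction if it is cheaper than the separate extend
// plus reduction.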
2873 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
2874 },
2875 Range);
2876 };
2877
2878 VPValue *A;
2879 // Match reduce(ext(A)).
2880 if (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_VPValue(V&: A))) &&
2881 IsExtendedRedValidAndClampRange(
2882 RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind()),
2883 cast<VPWidenCastRecipe>(Val: VecOp)->getOpcode() ==
2884 Instruction::CastOps::ZExt,
2885 Ctx.Types.inferScalarType(V: A)))
2886 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(Val: VecOp), Red);
2887
2888 return nullptr;
2889}
2890
2891 /// This function tries to convert in-loop multiply-accumulate reductions to
2892 /// VPExpressionRecipe and clamps \p Range if doing so is beneficial
2893/// and valid. The created VPExpressionRecipe must be decomposed to its
2894/// constituent recipes before execution. Patterns of the
2895/// VPExpressionRecipe:
2896/// reduce.add(mul(...)),
2897/// reduce.add(mul(ext(A), ext(B))),
2898/// reduce.add(ext(mul(ext(A), ext(B)))).
2899static VPExpressionRecipe *
2900tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
2901 VPCostContext &Ctx, VFRange &Range) {
2902 using namespace VPlanPatternMatch;
2903
2904 unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind());
2905 if (Opcode != Instruction::Add)
2906 return nullptr;
2907
2908 Type *RedTy = Ctx.Types.inferScalarType(V: Red);
2909
2910 // Clamp the range if using a multiply-accumulate reduction is profitable.
2911 auto IsMulAccValidAndClampRange =
2912 [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
2913 VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
2914 return LoopVectorizationPlanner::getDecisionAndClampRange(
2915 Predicate: [&](ElementCount VF) {
2916 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2917 Type *SrcTy =
2918 Ext0 ? Ctx.Types.inferScalarType(V: Ext0->getOperand(N: 0)) : RedTy;
2919 auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
2920 InstructionCost MulAccCost =
2921 Ctx.TTI.getMulAccReductionCost(IsUnsigned: isZExt, ResTy: RedTy, Ty: SrcVecTy, CostKind);
2922 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
2923 InstructionCost RedCost = Red->computeCost(VF, Ctx);
2924 InstructionCost ExtCost = 0;
2925 if (Ext0)
2926 ExtCost += Ext0->computeCost(VF, Ctx);
2927 if (Ext1)
2928 ExtCost += Ext1->computeCost(VF, Ctx);
2929 if (OuterExt)
2930 ExtCost += OuterExt->computeCost(VF, Ctx);
2931
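// Only use the multiply-accumulate reduction if it is cheaper than the
// separate extends, multiply and reduction.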
2932 return MulAccCost.isValid() &&
2933 MulAccCost < ExtCost + MulCost + RedCost;
2934 },
2935 Range);
2936 };
2937
2938 VPValue *VecOp = Red->getVecOp();
2939 VPValue *A, *B;
2940 // Try to match reduce.add(mul(...)).
2941 if (match(V: VecOp, P: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B)))) {
2942 auto *RecipeA =
2943 dyn_cast_if_present<VPWidenCastRecipe>(Val: A->getDefiningRecipe());
2944 auto *RecipeB =
2945 dyn_cast_if_present<VPWidenCastRecipe>(Val: B->getDefiningRecipe());
2946 auto *Mul = cast<VPWidenRecipe>(Val: VecOp->getDefiningRecipe());
2947
2948 // Match reduce.add(mul(ext, ext)).
2949 if (RecipeA && RecipeB &&
2950 (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
2951 match(V: RecipeA, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
2952 match(V: RecipeB, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
2953 IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
2954 Instruction::CastOps::ZExt,
2955 Mul, RecipeA, RecipeB, nullptr)) {
2956 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
2957 }
2958 // Match reduce.add(mul).
2959 if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
2960 return new VPExpressionRecipe(Mul, Red);
2961 }
2962 // Match reduce.add(ext(mul(ext(A), ext(B)))).
2963 // All extend recipes must have the same opcode, or A == B, which can be
2964 // transformed to reduce.add(zext(mul(sext(A), sext(B)))).
2965 if (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()),
2966 Op1: m_ZExtOrSExt(Op0: m_VPValue()))))) {
2967 auto *Ext = cast<VPWidenCastRecipe>(Val: VecOp->getDefiningRecipe());
2968 auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: 0)->getDefiningRecipe());
2969 auto *Ext0 =
2970 cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 0)->getDefiningRecipe());
2971 auto *Ext1 =
2972 cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 1)->getDefiningRecipe());
2973 if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
2974 Ext0->getOpcode() == Ext1->getOpcode() &&
2975 IsMulAccValidAndClampRange(Ext0->getOpcode() ==
2976 Instruction::CastOps::ZExt,
2977 Mul, Ext0, Ext1, Ext)) {
2978 auto *NewExt0 = new VPWidenCastRecipe(
2979 Ext0->getOpcode(), Ext0->getOperand(N: 0), Ext->getResultType(), *Ext0,
2980 Ext0->getDebugLoc());
2981 NewExt0->insertBefore(InsertPos: Ext0);
2982
2983 VPWidenCastRecipe *NewExt1 = NewExt0;
2984 if (Ext0 != Ext1) {
2985 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(N: 0),
2986 Ext->getResultType(), *Ext1,
2987 Ext1->getDebugLoc());
2988 NewExt1->insertBefore(InsertPos: Ext1);
2989 }
2990 Mul->setOperand(I: 0, New: NewExt0);
2991 Mul->setOperand(I: 1, New: NewExt1);
2992 Red->setOperand(I: 1, New: Mul);
2993 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
2994 }
2995 }
2996 return nullptr;
2997}
2998
2999 /// This function tries to create an abstract recipe from the reduction recipe
3000 /// for subsequent optimizations and cost estimation.
3001static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3002 VPCostContext &Ctx,
3003 VFRange &Range) {
3004 VPExpressionRecipe *AbstractR = nullptr;
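// The abstract recipe, if created, is inserted right after the original
// reduction recipe.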
3005 auto IP = std::next(x: Red->getIterator());
3006 auto *VPBB = Red->getParent();
3007 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3008 AbstractR = MulAcc;
3009 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3010 AbstractR = ExtRed;
3011 // Could not create an abstract in-loop reduction recipe.
3012 if (!AbstractR)
3013 return;
3014
3015 AbstractR->insertBefore(BB&: *VPBB, IP);
3016 Red->replaceAllUsesWith(New: AbstractR);
3017}
3018
3019void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
3020 VFRange &Range) {
3021 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3022 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
3023 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
3024 if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R))
3025 tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
3026 }
3027 }
3028}
3029
3030void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
3031 if (Plan.hasScalarVFOnly())
3032 return;
3033
3034#ifndef NDEBUG
3035 VPDominatorTree VPDT;
3036 VPDT.recalculate(Plan);
3037#endif
3038
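// Collect candidates that may need broadcasting: the backedge-taken count (if
// used), live-ins, and values defined in the plan's entry block.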
3039 SmallVector<VPValue *> VPValues;
3040 if (Plan.getOrCreateBackedgeTakenCount()->getNumUsers() > 0)
3041 VPValues.push_back(Elt: Plan.getOrCreateBackedgeTakenCount());
3042 append_range(C&: VPValues, R: Plan.getLiveIns());
3043 for (VPRecipeBase &R : *Plan.getEntry())
3044 append_range(C&: VPValues, R: R.definedValues());
3045
3046 auto *VectorPreheader = Plan.getVectorPreheader();
3047 for (VPValue *VPV : VPValues) {
3048 if (all_of(Range: VPV->users(),
3049 P: [VPV](VPUser *U) { return U->usesScalars(Op: VPV); }) ||
3050 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3051 isa<Constant>(Val: VPV->getLiveInIRValue())))
3052 continue;
3053
3054 // Add an explicit broadcast at an insert point that dominates all users.
3055 VPBasicBlock *HoistBlock = VectorPreheader;
3056 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3057 for (VPUser *User : VPV->users()) {
3058 if (User->usesScalars(Op: VPV))
3059 continue;
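// If a vector user is in the preheader itself, hoist the broadcast to the
// start of the preheader; otherwise it is created at the preheader's end.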
3060 if (cast<VPRecipeBase>(Val: User)->getParent() == VectorPreheader)
3061 HoistPoint = HoistBlock->begin();
3062 else
3063 assert(VPDT.dominates(VectorPreheader,
3064 cast<VPRecipeBase>(User)->getParent()) &&
3065 "All users must be in the vector preheader or dominated by it");
3066 }
3067
3068 VPBuilder Builder(cast<VPBasicBlock>(Val: HoistBlock), HoistPoint);
3069 auto *Broadcast = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: {VPV});
3070 VPV->replaceUsesWithIf(New: Broadcast,
3071 ShouldReplace: [VPV, Broadcast](VPUser &U, unsigned Idx) {
3072 return Broadcast != &U && !U.usesScalars(Op: VPV);
3073 });
3074 }
3075}
3076
3077 /// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that can
3078 /// be converted to a narrower recipe. \p OpV is used by a wide recipe
3079 /// \p WideMember that feeds a store interleave group at index \p Idx, and
3080 /// \p WideMember0 is the recipe feeding the same interleave group at index 0.
3081 /// A VPWidenLoadRecipe can be narrowed to an index-independent load if it feeds
3082 /// all wide ops at all indices (\p OpV must be the operand at index \p OpIdx
3083 /// for both the recipe at lane 0, \p WideMember0, and \p WideMember). A
3084 /// VPInterleaveRecipe can be narrowed to a wide load if \p OpV is defined at
3085 /// \p Idx of a load interleave group.
3086static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember,
3087 unsigned OpIdx, VPValue *OpV, unsigned Idx) {
3088 auto *DefR = OpV->getDefiningRecipe();
3089 if (!DefR)
3090 return WideMember0->getOperand(N: OpIdx) == OpV;
3091 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Val: DefR))
3092 return !W->getMask() && WideMember0->getOperand(N: OpIdx) == OpV;
3093
3094 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR))
3095 return IR->getInterleaveGroup()->getFactor() ==
3096 IR->getInterleaveGroup()->getNumMembers() &&
3097 IR->getVPValue(I: Idx) == OpV;
3098 return false;
3099}
3100
3101 /// Returns true if \p InterleaveR is a full interleave group with factor and
3102 /// number of members both equal to \p VF. The interleave group must also
3103 /// access the full vector width \p VectorRegWidth.
3104static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
3105 unsigned VF, VPTypeAnalysis &TypeInfo,
3106 unsigned VectorRegWidth) {
3107 if (!InterleaveR)
3108 return false;
3109
3110 Type *GroupElementTy = nullptr;
3111 if (InterleaveR->getStoredValues().empty()) {
3112 GroupElementTy = TypeInfo.inferScalarType(V: InterleaveR->getVPValue(I: 0));
3113 if (!all_of(Range: InterleaveR->definedValues(),
3114 P: [&TypeInfo, GroupElementTy](VPValue *Op) {
3115 return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
3116 }))
3117 return false;
3118 } else {
3119 GroupElementTy =
3120 TypeInfo.inferScalarType(V: InterleaveR->getStoredValues()[0]);
3121 if (!all_of(Range: InterleaveR->getStoredValues(),
3122 P: [&TypeInfo, GroupElementTy](VPValue *Op) {
3123 return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
3124 }))
3125 return false;
3126 }
3127
3128 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
3129 auto IG = InterleaveR->getInterleaveGroup();
3130 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
3131 GroupSize == VectorRegWidth;
3132}
3133
3134 /// Returns true if \p VPV is a narrow VPValue.
3135static bool isAlreadyNarrow(VPValue *VPV) { return VPV->isLiveIn(); }
3136
3137void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
3138 unsigned VectorRegWidth) {
3139 using namespace llvm::VPlanPatternMatch;
3140 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
3141 if (VF.isScalable() || !VectorLoop)
3142 return;
3143
3144 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
3145 Type *CanonicalIVType = CanonicalIV->getScalarType();
3146 VPTypeAnalysis TypeInfo(CanonicalIVType);
3147
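// Collect store interleave groups that can be narrowed, bailing out if any
// unsupported recipe is encountered.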
3148 unsigned FixedVF = VF.getFixedValue();
3149 SmallVector<VPInterleaveRecipe *> StoreGroups;
3150 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
3151 if (isa<VPCanonicalIVPHIRecipe>(Val: &R) ||
3152 match(V: &R, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue())))
3153 continue;
3154
3155 // Bail out on recipes not supported at the moment:
3156 // * phi recipes other than the canonical induction
3157 // * recipes writing to memory except interleave groups
3158 // Only support plans with a canonical induction phi.
3159 if (R.isPhi())
3160 return;
3161
3162 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R);
3163 if (R.mayWriteToMemory() && !InterleaveR)
3164 return;
3165
3166 // Do not narrow interleave groups if there are VectorPointer recipes and
3167 // the plan was unrolled. The recipe implicitly uses VF from
3168 // VPTransformState.
3169 // TODO: Remove restriction once the VF for the VectorPointer offset is
3170 // modeled explicitly as operand.
3171 if (isa<VPVectorPointerRecipe>(Val: &R) && Plan.getUF() > 1)
3172 return;
3173
3174 // All other ops are allowed, but we reject uses that cannot be converted
3175 // when checking all allowed consumers (store interleave groups) below.
3176 if (!InterleaveR)
3177 continue;
3178
3179 // Bail out on non-consecutive interleave groups.
3180 if (!isConsecutiveInterleaveGroup(InterleaveR, VF: FixedVF, TypeInfo,
3181 VectorRegWidth))
3182 return;
3183
3184 // Skip read interleave groups.
3185 if (InterleaveR->getStoredValues().empty())
3186 continue;
3187
3188 // Narrow interleave groups, if all operands are already matching narrow
3189 // ops.
3190 auto *Member0 = InterleaveR->getStoredValues()[0];
3191 if (isAlreadyNarrow(VPV: Member0) &&
3192 all_of(Range: InterleaveR->getStoredValues(),
3193 P: [Member0](VPValue *VPV) { return Member0 == VPV; })) {
3194 StoreGroups.push_back(Elt: InterleaveR);
3195 continue;
3196 }
3197
3198 // For now, we only support full interleave groups storing load interleave
3199 // groups.
3200 if (all_of(Range: enumerate(First: InterleaveR->getStoredValues()), P: [](auto Op) {
3201 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
3202 if (!DefR)
3203 return false;
3204 auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR);
3205 return IR &&
3206 IR->getInterleaveGroup()->getFactor() ==
3207 IR->getInterleaveGroup()->getNumMembers() &&
3208 IR->getVPValue(Op.index()) == Op.value();
3209 })) {
3210 StoreGroups.push_back(Elt: InterleaveR);
3211 continue;
3212 }
3213
3214 // Check if all values feeding InterleaveR are matching wide recipes whose
3215 // operands can be narrowed.
3216 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
3217 Val: InterleaveR->getStoredValues()[0]->getDefiningRecipe());
3218 if (!WideMember0)
3219 return;
3220 for (const auto &[I, V] : enumerate(First: InterleaveR->getStoredValues())) {
3221 auto *R = dyn_cast_or_null<VPWidenRecipe>(Val: V->getDefiningRecipe());
3222 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
3223 R->getNumOperands() > 2)
3224 return;
3225 if (any_of(Range: enumerate(First: R->operands()),
3226 P: [WideMember0, Idx = I, R](const auto &P) {
3227 const auto &[OpIdx, OpV] = P;
3228 return !canNarrowLoad(WideMember0, R, OpIdx, OpV, Idx);
3229 }))
3230 return;
3231 }
3232 StoreGroups.push_back(Elt: InterleaveR);
3233 }
3234
3235 if (StoreGroups.empty())
3236 return;
3237
3238 // Narrow the operand V: a load interleave group becomes a single wide load,
// and a wide load becomes a uniform scalar load.
3239 auto NarrowOp = [](VPValue *V) -> VPValue * {
3240 auto *R = V->getDefiningRecipe();
3241 if (!R)
3242 return V;
3243 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(Val: R)) {
3244 // Narrow the interleave group to a wide load, as the transformed VPlan will
3245 // only process one original iteration.
3246 auto *L = new VPWidenLoadRecipe(
3247 *cast<LoadInst>(Val: LoadGroup->getInterleaveGroup()->getInsertPos()),
3248 LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
3249 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
3250 L->insertBefore(InsertPos: LoadGroup);
3251 return L;
3252 }
3253
3254 auto *WideLoad = cast<VPWidenLoadRecipe>(Val: R);
3255
3256 // Narrow the wide load to a uniform scalar load, as the transformed VPlan
3257 // will only process one original iteration.
3258 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
3259 WideLoad->operands(), /*IsUniform*/ true,
3260 /*Mask*/ nullptr, *WideLoad);
3261 N->insertBefore(InsertPos: WideLoad);
3262 return N;
3263 };
3264
3265 // Narrow operation tree rooted at store groups.
3266 for (auto *StoreGroup : StoreGroups) {
3267 VPValue *Res = nullptr;
3268 VPValue *Member0 = StoreGroup->getStoredValues()[0];
3269 if (isAlreadyNarrow(VPV: Member0)) {
3270 Res = Member0;
3271 } else if (auto *WideMember0 =
3272 dyn_cast<VPWidenRecipe>(Val: Member0->getDefiningRecipe())) {
3273 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
3274 WideMember0->setOperand(I: Idx, New: NarrowOp(WideMember0->getOperand(N: Idx)));
3275 Res = WideMember0;
3276 } else {
3277 Res = NarrowOp(Member0);
3278 }
3279
3280 auto *S = new VPWidenStoreRecipe(
3281 *cast<StoreInst>(Val: StoreGroup->getInterleaveGroup()->getInsertPos()),
3282 StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
3283 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
3284 S->insertBefore(InsertPos: StoreGroup);
3285 StoreGroup->eraseFromParent();
3286 }
3287
3288 // Adjust induction to reflect that the transformed plan only processes one
3289 // original iteration.
3290 auto *CanIV = Plan.getCanonicalIV();
3291 auto *Inc = cast<VPInstruction>(Val: CanIV->getBackedgeValue());
3292 Inc->setOperand(I: 1, New: Plan.getOrAddLiveIn(V: ConstantInt::get(
3293 Ty: CanIV->getScalarType(), V: 1 * Plan.getUF())));
3294 Plan.getVF().replaceAllUsesWith(
3295 New: Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: CanIV->getScalarType(), V: 1)));
3296 removeDeadRecipes(Plan);
3297}
3298
3299/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
3300/// BranchOnCond recipe.
3301void VPlanTransforms::addBranchWeightToMiddleTerminator(
3302 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
3303 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
3304 auto *MiddleTerm =
3305 dyn_cast_or_null<VPInstruction>(Val: MiddleVPBB->getTerminator());
3306 // Only add branch metadata if there is a (conditional) terminator.
3307 if (!MiddleTerm)
3308 return;
3309
3310 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
3311 "must have a BranchOnCond");
3312 // Assume that `TripCount % VectorStep` is uniformly distributed.
3313 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
3314 if (VF.isScalable() && VScaleForTuning.has_value())
3315 VectorStep *= *VScaleForTuning;
3316 assert(VectorStep > 0 && "trip count should not be zero");
3317 MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
3318 MDNode *BranchWeights =
3319 MDB.createBranchWeights(Weights: {1, VectorStep - 1}, /*IsExpected=*/false);
3320 MiddleTerm->addMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
3321}
3322