1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/PostOrderIterator.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
29#include "llvm/Analysis/IVDescriptors.h"
30#include "llvm/Analysis/InstSimplifyFolder.h"
31#include "llvm/Analysis/LoopInfo.h"
32#include "llvm/Analysis/VectorUtils.h"
33#include "llvm/IR/Intrinsics.h"
34#include "llvm/IR/MDBuilder.h"
35#include "llvm/IR/PatternMatch.h"
36#include "llvm/Support/Casting.h"
37#include "llvm/Support/TypeSize.h"
38
39using namespace llvm;
40
41bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
42 VPlanPtr &Plan,
43 function_ref<const InductionDescriptor *(PHINode *)>
44 GetIntOrFpInductionDescriptor,
45 ScalarEvolution &SE, const TargetLibraryInfo &TLI) {
46
47 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
48 Plan->getVectorLoopRegion());
49 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
50 // Skip blocks outside region
51 if (!VPBB->getParent())
52 break;
53 VPRecipeBase *Term = VPBB->getTerminator();
54 auto EndIter = Term ? Term->getIterator() : VPBB->end();
55 // Introduce each ingredient into VPlan.
56 for (VPRecipeBase &Ingredient :
57 make_early_inc_range(Range: make_range(x: VPBB->begin(), y: EndIter))) {
58
59 VPValue *VPV = Ingredient.getVPSingleValue();
60 if (!VPV->getUnderlyingValue())
61 continue;
62
63 Instruction *Inst = cast<Instruction>(Val: VPV->getUnderlyingValue());
64
65 VPRecipeBase *NewRecipe = nullptr;
66 if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &Ingredient)) {
67 auto *Phi = cast<PHINode>(Val: VPPhi->getUnderlyingValue());
68 const auto *II = GetIntOrFpInductionDescriptor(Phi);
69 if (!II)
70 continue;
71
72 VPValue *Start = Plan->getOrAddLiveIn(V: II->getStartValue());
73 VPValue *Step =
74 vputils::getOrCreateVPValueForSCEVExpr(Plan&: *Plan, Expr: II->getStep(), SE);
75 NewRecipe = new VPWidenIntOrFpInductionRecipe(
76 Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
77 } else {
78 assert(isa<VPInstruction>(&Ingredient) &&
79 "only VPInstructions expected here");
80 assert(!isa<PHINode>(Inst) && "phis should be handled above");
81 // Create VPWidenMemoryRecipe for loads and stores.
82 if (LoadInst *Load = dyn_cast<LoadInst>(Val: Inst)) {
83 NewRecipe = new VPWidenLoadRecipe(
84 *Load, Ingredient.getOperand(N: 0), nullptr /*Mask*/,
85 false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
86 Ingredient.getDebugLoc());
87 } else if (StoreInst *Store = dyn_cast<StoreInst>(Val: Inst)) {
88 NewRecipe = new VPWidenStoreRecipe(
89 *Store, Ingredient.getOperand(N: 1), Ingredient.getOperand(N: 0),
90 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
91 VPIRMetadata(*Store), Ingredient.getDebugLoc());
92 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
93 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
94 } else if (CallInst *CI = dyn_cast<CallInst>(Val: Inst)) {
95 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, TLI: &TLI);
96 if (VectorID == Intrinsic::not_intrinsic)
97 return false;
98 NewRecipe = new VPWidenIntrinsicRecipe(
99 *CI, VectorID,
100 {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
101 CI->getDebugLoc());
102 } else if (SelectInst *SI = dyn_cast<SelectInst>(Val: Inst)) {
103 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
104 } else if (auto *CI = dyn_cast<CastInst>(Val: Inst)) {
105 NewRecipe = new VPWidenCastRecipe(
106 CI->getOpcode(), Ingredient.getOperand(N: 0), CI->getType(), *CI);
107 } else {
108 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
109 }
110 }
111
112 NewRecipe->insertBefore(InsertPos: &Ingredient);
113 if (NewRecipe->getNumDefinedValues() == 1)
114 VPV->replaceAllUsesWith(New: NewRecipe->getVPSingleValue());
115 else
116 assert(NewRecipe->getNumDefinedValues() == 0 &&
117 "Only recpies with zero or one defined values expected");
118 Ingredient.eraseFromParent();
119 }
120 }
121 return true;
122}
123
124static bool sinkScalarOperands(VPlan &Plan) {
125 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
126 bool Changed = false;
127 // First, collect the operands of all recipes in replicate blocks as seeds for
128 // sinking.
129 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
130 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Range: Iter)) {
131 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
132 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
133 continue;
134 VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Val: EntryVPBB->getSuccessors()[0]);
135 if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
136 continue;
137 for (auto &Recipe : *VPBB) {
138 for (VPValue *Op : Recipe.operands())
139 if (auto *Def =
140 dyn_cast_or_null<VPSingleDefRecipe>(Val: Op->getDefiningRecipe()))
141 WorkList.insert(X: std::make_pair(x&: VPBB, y&: Def));
142 }
143 }
144
145 bool ScalarVFOnly = Plan.hasScalarVFOnly();
146 // Try to sink each replicate or scalar IV steps recipe in the worklist.
147 for (unsigned I = 0; I != WorkList.size(); ++I) {
148 VPBasicBlock *SinkTo;
149 VPSingleDefRecipe *SinkCandidate;
150 std::tie(args&: SinkTo, args&: SinkCandidate) = WorkList[I];
151 if (SinkCandidate->getParent() == SinkTo ||
152 SinkCandidate->mayHaveSideEffects() ||
153 SinkCandidate->mayReadOrWriteMemory())
154 continue;
155 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: SinkCandidate)) {
156 if (!ScalarVFOnly && RepR->isSingleScalar())
157 continue;
158 } else if (!isa<VPScalarIVStepsRecipe>(Val: SinkCandidate))
159 continue;
160
161 bool NeedsDuplicating = false;
162 // All recipe users of the sink candidate must be in the same block SinkTo,
163 // or all users outside of SinkTo must be uniform-after-vectorization
164 // (i.e., only their first lane is used). In the latter case, SinkCandidate
165 // needs to be duplicated.
166 auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
167 SinkCandidate](VPUser *U) {
168 auto *UI = cast<VPRecipeBase>(Val: U);
169 if (UI->getParent() == SinkTo)
170 return true;
171 NeedsDuplicating = UI->onlyFirstLaneUsed(Op: SinkCandidate);
172 // We only know how to duplicate VPReplicateRecipes and
173 // VPScalarIVStepsRecipes for now.
174 return NeedsDuplicating &&
175 isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Val: SinkCandidate);
176 };
177 if (!all_of(Range: SinkCandidate->users(), P: CanSinkWithUser))
178 continue;
179
180 if (NeedsDuplicating) {
181 if (ScalarVFOnly)
182 continue;
183 VPSingleDefRecipe *Clone;
184 if (auto *SinkCandidateRepR =
185 dyn_cast<VPReplicateRecipe>(Val: SinkCandidate)) {
186 // TODO: Handle converting to uniform recipes as separate transform,
187 // then cloning should be sufficient here.
188 Instruction *I = SinkCandidate->getUnderlyingInstr();
189 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
190 nullptr /*Mask*/, *SinkCandidateRepR);
191 // TODO: add ".cloned" suffix to name of Clone's VPValue.
192 } else {
193 Clone = SinkCandidate->clone();
194 }
195
196 Clone->insertBefore(InsertPos: SinkCandidate);
197 SinkCandidate->replaceUsesWithIf(New: Clone, ShouldReplace: [SinkTo](VPUser &U, unsigned) {
198 return cast<VPRecipeBase>(Val: &U)->getParent() != SinkTo;
199 });
200 }
201 SinkCandidate->moveBefore(BB&: *SinkTo, I: SinkTo->getFirstNonPhi());
202 for (VPValue *Op : SinkCandidate->operands())
203 if (auto *Def =
204 dyn_cast_or_null<VPSingleDefRecipe>(Val: Op->getDefiningRecipe()))
205 WorkList.insert(X: std::make_pair(x&: SinkTo, y&: Def));
206 Changed = true;
207 }
208 return Changed;
209}
210
211/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
212/// the mask.
213VPValue *getPredicatedMask(VPRegionBlock *R) {
214 auto *EntryBB = dyn_cast<VPBasicBlock>(Val: R->getEntry());
215 if (!EntryBB || EntryBB->size() != 1 ||
216 !isa<VPBranchOnMaskRecipe>(Val: EntryBB->begin()))
217 return nullptr;
218
219 return cast<VPBranchOnMaskRecipe>(Val: &*EntryBB->begin())->getOperand(N: 0);
220}
221
222/// If \p R is a triangle region, return the 'then' block of the triangle.
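/// For illustration, a triangle region roughly has the following shape
/// (block names are made up for this sketch):
///
///   entry         (BranchOnMask %mask)
///    |   \
///    |    then    (predicated recipes)
///    |   /
///   exiting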
223static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
224 auto *EntryBB = cast<VPBasicBlock>(Val: R->getEntry());
225 if (EntryBB->getNumSuccessors() != 2)
226 return nullptr;
227
228 auto *Succ0 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[0]);
229 auto *Succ1 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[1]);
230 if (!Succ0 || !Succ1)
231 return nullptr;
232
233 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
234 return nullptr;
235 if (Succ0->getSingleSuccessor() == Succ1)
236 return Succ0;
237 if (Succ1->getSingleSuccessor() == Succ0)
238 return Succ1;
239 return nullptr;
240}
241
242 // Merge replicate regions into their successor region, if a replicate region
243 // is connected to a successor replicate region with the same predicate by a
244 // single, empty VPBasicBlock.
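//
// For illustration (sketch only): the transform roughly turns
//   region1 (mask %m) -> empty.block -> region2 (mask %m)
// into
//   empty.block -> region2 (mask %m)
// by moving region1's recipes into region2's 'then' block and reconnecting
// region1's predecessors to the empty block.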
245static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
246 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
247
248 // Collect replicate regions followed by an empty block, followed by another
249 // replicate region with a matching mask, to process up front. This is to avoid
250 // iterator invalidation issues while merging regions.
251 SmallVector<VPRegionBlock *, 8> WorkList;
252 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
253 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
254 if (!Region1->isReplicator())
255 continue;
256 auto *MiddleBasicBlock =
257 dyn_cast_or_null<VPBasicBlock>(Val: Region1->getSingleSuccessor());
258 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
259 continue;
260
261 auto *Region2 =
262 dyn_cast_or_null<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor());
263 if (!Region2 || !Region2->isReplicator())
264 continue;
265
266 VPValue *Mask1 = getPredicatedMask(R: Region1);
267 VPValue *Mask2 = getPredicatedMask(R: Region2);
268 if (!Mask1 || Mask1 != Mask2)
269 continue;
270
271 assert(Mask1 && Mask2 && "both regions must have conditions");
272 WorkList.push_back(Elt: Region1);
273 }
274
275 // Move recipes from Region1 to its successor region, if both are triangles.
276 for (VPRegionBlock *Region1 : WorkList) {
277 if (TransformedRegions.contains(Ptr: Region1))
278 continue;
279 auto *MiddleBasicBlock = cast<VPBasicBlock>(Val: Region1->getSingleSuccessor());
280 auto *Region2 = cast<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor());
281
282 VPBasicBlock *Then1 = getPredicatedThenBlock(R: Region1);
283 VPBasicBlock *Then2 = getPredicatedThenBlock(R: Region2);
284 if (!Then1 || !Then2)
285 continue;
286
287 // Note: No fusion-preventing memory dependencies are expected in either
288 // region. Such dependencies should be rejected during earlier dependence
289 // checks, which guarantee accesses can be re-ordered for vectorization.
290 //
291 // Move recipes to the successor region.
292 for (VPRecipeBase &ToMove : make_early_inc_range(Range: reverse(C&: *Then1)))
293 ToMove.moveBefore(BB&: *Then2, I: Then2->getFirstNonPhi());
294
295 auto *Merge1 = cast<VPBasicBlock>(Val: Then1->getSingleSuccessor());
296 auto *Merge2 = cast<VPBasicBlock>(Val: Then2->getSingleSuccessor());
297
298 // Move VPPredInstPHIRecipes from the merge block to the successor region's
299 // merge block. Update all users inside the successor region to use the
300 // original values.
301 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(Range: reverse(C&: *Merge1))) {
302 VPValue *PredInst1 =
303 cast<VPPredInstPHIRecipe>(Val: &Phi1ToMove)->getOperand(N: 0);
304 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
305 Phi1ToMoveV->replaceUsesWithIf(New: PredInst1, ShouldReplace: [Then2](VPUser &U, unsigned) {
306 return cast<VPRecipeBase>(Val: &U)->getParent() == Then2;
307 });
308
309 // Remove phi recipes that are unused after merging the regions.
310 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
311 Phi1ToMove.eraseFromParent();
312 continue;
313 }
314 Phi1ToMove.moveBefore(BB&: *Merge2, I: Merge2->begin());
315 }
316
317 // Remove the dead recipes in Region1's entry block.
318 for (VPRecipeBase &R :
319 make_early_inc_range(Range: reverse(C&: *Region1->getEntryBasicBlock())))
320 R.eraseFromParent();
321
322 // Finally, remove the first region.
323 for (VPBlockBase *Pred : make_early_inc_range(Range&: Region1->getPredecessors())) {
324 VPBlockUtils::disconnectBlocks(From: Pred, To: Region1);
325 VPBlockUtils::connectBlocks(From: Pred, To: MiddleBasicBlock);
326 }
327 VPBlockUtils::disconnectBlocks(From: Region1, To: MiddleBasicBlock);
328 TransformedRegions.insert(Ptr: Region1);
329 }
330
331 return !TransformedRegions.empty();
332}
333
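/// Build an if-then replicate region for \p PredRecipe. For illustration, for
/// a predicated load the resulting region roughly looks like the sketch below
/// (recipe spellings are approximate):
///
///   pred.load.entry:
///     BRANCH-ON-MASK %mask
///   pred.load.if:
///     REPLICATE %l = load ...        (mask dropped)
///   pred.load.continue:
///     PHI-PREDICATED-INSTRUCTION %p = %l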
334static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
335 VPlan &Plan) {
336 Instruction *Instr = PredRecipe->getUnderlyingInstr();
337 // Build the triangular if-then region.
338 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
339 assert(Instr->getParent() && "Predicated instruction not in any basic block");
340 auto *BlockInMask = PredRecipe->getMask();
341 auto *MaskDef = BlockInMask->getDefiningRecipe();
342 auto *BOMRecipe = new VPBranchOnMaskRecipe(
343 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc());
344 auto *Entry =
345 Plan.createVPBasicBlock(Name: Twine(RegionName) + ".entry", Recipe: BOMRecipe);
346
347 // Replace predicated replicate recipe with a replicate recipe without a
348 // mask but in the replicate region.
349 auto *RecipeWithoutMask = new VPReplicateRecipe(
350 PredRecipe->getUnderlyingInstr(),
351 make_range(x: PredRecipe->op_begin(), y: std::prev(x: PredRecipe->op_end())),
352 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
353 auto *Pred =
354 Plan.createVPBasicBlock(Name: Twine(RegionName) + ".if", Recipe: RecipeWithoutMask);
355
356 VPPredInstPHIRecipe *PHIRecipe = nullptr;
357 if (PredRecipe->getNumUsers() != 0) {
358 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
359 RecipeWithoutMask->getDebugLoc());
360 PredRecipe->replaceAllUsesWith(New: PHIRecipe);
361 PHIRecipe->setOperand(I: 0, New: RecipeWithoutMask);
362 }
363 PredRecipe->eraseFromParent();
364 auto *Exiting =
365 Plan.createVPBasicBlock(Name: Twine(RegionName) + ".continue", Recipe: PHIRecipe);
366 VPRegionBlock *Region =
367 Plan.createVPRegionBlock(Entry, Exiting, Name: RegionName, IsReplicator: true);
368
369 // Note: first set Entry as region entry and then connect successors starting
370 // from it in order, to propagate the "parent" of each VPBasicBlock.
371 VPBlockUtils::insertTwoBlocksAfter(IfTrue: Pred, IfFalse: Exiting, BlockPtr: Entry);
372 VPBlockUtils::connectBlocks(From: Pred, To: Exiting);
373
374 return Region;
375}
376
377static void addReplicateRegions(VPlan &Plan) {
378 SmallVector<VPReplicateRecipe *> WorkList;
379 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
380 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
381 for (VPRecipeBase &R : *VPBB)
382 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
383 if (RepR->isPredicated())
384 WorkList.push_back(Elt: RepR);
385 }
386 }
387
388 unsigned BBNum = 0;
389 for (VPReplicateRecipe *RepR : WorkList) {
390 VPBasicBlock *CurrentBlock = RepR->getParent();
391 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(SplitAt: RepR->getIterator());
392
393 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
394 SplitBlock->setName(
395 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
396 // Wrap the predicated recipe in an if-then replicate region.
397 VPRegionBlock *Region = createReplicateRegion(PredRecipe: RepR, Plan);
398 Region->setParent(CurrentBlock->getParent());
399 VPBlockUtils::insertOnEdge(From: CurrentBlock, To: SplitBlock, BlockPtr: Region);
400
401 VPRegionBlock *ParentRegion = Region->getParent();
402 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
403 ParentRegion->setExiting(SplitBlock);
404 }
405}
406
407/// Remove redundant VPBasicBlocks by merging them into their predecessor if
408/// the predecessor has a single successor.
409static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
410 SmallVector<VPBasicBlock *> WorkList;
411 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
412 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
413 // Don't fold the blocks in the skeleton of the Plan into their single
414 // predecessors for now.
415 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
416 if (!VPBB->getParent())
417 continue;
418 auto *PredVPBB =
419 dyn_cast_or_null<VPBasicBlock>(Val: VPBB->getSinglePredecessor());
420 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
421 isa<VPIRBasicBlock>(Val: PredVPBB))
422 continue;
423 WorkList.push_back(Elt: VPBB);
424 }
425
426 for (VPBasicBlock *VPBB : WorkList) {
427 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(Val: VPBB->getSinglePredecessor());
428 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB))
429 R.moveBefore(BB&: *PredVPBB, I: PredVPBB->end());
430 VPBlockUtils::disconnectBlocks(From: PredVPBB, To: VPBB);
431 auto *ParentRegion = VPBB->getParent();
432 if (ParentRegion && ParentRegion->getExiting() == VPBB)
433 ParentRegion->setExiting(PredVPBB);
434 for (auto *Succ : to_vector(Range: VPBB->successors())) {
435 VPBlockUtils::disconnectBlocks(From: VPBB, To: Succ);
436 VPBlockUtils::connectBlocks(From: PredVPBB, To: Succ);
437 }
438 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
439 }
440 return !WorkList.empty();
441}
442
443void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
444 // Convert masked VPReplicateRecipes to if-then region blocks.
445 addReplicateRegions(Plan);
446
447 bool ShouldSimplify = true;
448 while (ShouldSimplify) {
449 ShouldSimplify = sinkScalarOperands(Plan);
450 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
451 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
452 }
453}
454
455/// Remove redundant casts of inductions.
456///
457/// Such redundant casts are casts of induction variables that can be ignored,
458/// because we already proved that the casted phi is equal to the uncasted phi
459/// in the vectorized loop. There is no need to vectorize the cast - the same
460/// value can be used for both the phi and casts in the vector loop.
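/// For example (sketch): if the induction descriptor recorded the cast chain
///   %c = sext i32 %iv to i64
/// the recipe producing %c is located among the users of the widened IV and
/// all of %c's uses are rewired to the IV itself; the now-dead cast recipe is
/// removed by later dead-recipe cleanup.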
461static void removeRedundantInductionCasts(VPlan &Plan) {
462 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
463 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
464 if (!IV || IV->getTruncInst())
465 continue;
466
467 // A sequence of IR Casts has potentially been recorded for IV, which
468 // *must be bypassed* when the IV is vectorized, because the vectorized IV
469 // will produce the desired casted value. This sequence forms a def-use
470 // chain and is provided in reverse order, ending with the cast that uses
471 // the IV phi. Search for the recipe of the last cast in the chain and
472 // replace it with the original IV. Note that only the final cast is
473 // expected to have users outside the cast-chain and the dead casts left
474 // over will be cleaned up later.
475 auto &Casts = IV->getInductionDescriptor().getCastInsts();
476 VPValue *FindMyCast = IV;
477 for (Instruction *IRCast : reverse(C: Casts)) {
478 VPSingleDefRecipe *FoundUserCast = nullptr;
479 for (auto *U : FindMyCast->users()) {
480 auto *UserCast = dyn_cast<VPSingleDefRecipe>(Val: U);
481 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
482 FoundUserCast = UserCast;
483 break;
484 }
485 }
486 FindMyCast = FoundUserCast;
487 }
488 FindMyCast->replaceAllUsesWith(New: IV);
489 }
490}
491
492/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
493/// recipe, if it exists.
494static void removeRedundantCanonicalIVs(VPlan &Plan) {
495 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
496 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
497 for (VPUser *U : CanonicalIV->users()) {
498 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(Val: U);
499 if (WidenNewIV)
500 break;
501 }
502
503 if (!WidenNewIV)
504 return;
505
506 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
507 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
508 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
509
510 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
511 continue;
512
513 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
514 // everything WidenNewIV's users need. That is, WidenOriginalIV will
515 // generate a vector phi or all users of WidenNewIV demand the first lane
516 // only.
517 if (any_of(Range: WidenOriginalIV->users(),
518 P: [WidenOriginalIV](VPUser *U) {
519 return !U->usesScalars(Op: WidenOriginalIV);
520 }) ||
521 vputils::onlyFirstLaneUsed(Def: WidenNewIV)) {
522 WidenNewIV->replaceAllUsesWith(New: WidenOriginalIV);
523 WidenNewIV->eraseFromParent();
524 return;
525 }
526 }
527}
528
529/// Returns true if \p R is dead and can be removed.
530static bool isDeadRecipe(VPRecipeBase &R) {
531 using namespace llvm::PatternMatch;
532 // Do remove conditional assume instructions as their conditions may be
533 // flattened.
534 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
535 bool IsConditionalAssume =
536 RepR && RepR->isPredicated() &&
537 match(V: RepR->getUnderlyingInstr(), P: m_Intrinsic<Intrinsic::assume>());
538 if (IsConditionalAssume)
539 return true;
540
541 if (R.mayHaveSideEffects())
542 return false;
543
544 // Recipe is dead if no user keeps the recipe alive.
545 return all_of(Range: R.definedValues(),
546 P: [](VPValue *V) { return V->getNumUsers() == 0; });
547}
548
549void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
550 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
551 Plan.getEntry());
552
553 for (VPBasicBlock *VPBB : reverse(C: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT))) {
554 // The recipes in the block are processed in reverse order, to catch chains
555 // of dead recipes.
556 for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
557 if (isDeadRecipe(R))
558 R.eraseFromParent();
559 }
560 }
561}
562
563static VPScalarIVStepsRecipe *
564createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
565 Instruction::BinaryOps InductionOpcode,
566 FPMathOperator *FPBinOp, Instruction *TruncI,
567 VPValue *StartV, VPValue *Step, DebugLoc DL,
568 VPBuilder &Builder) {
569 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
570 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
571 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
572 Kind, FPBinOp, Start: StartV, Current: CanonicalIV, Step, Name: "offset.idx");
573
574 // Truncate base induction if needed.
575 Type *CanonicalIVType = CanonicalIV->getScalarType();
576 VPTypeAnalysis TypeInfo(CanonicalIVType);
577 Type *ResultTy = TypeInfo.inferScalarType(V: BaseIV);
578 if (TruncI) {
579 Type *TruncTy = TruncI->getType();
580 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
581 "Not truncating.");
582 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
583 BaseIV = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: BaseIV, ResultTy: TruncTy, DL);
584 ResultTy = TruncTy;
585 }
586
587 // Truncate step if needed.
588 Type *StepTy = TypeInfo.inferScalarType(V: Step);
589 if (ResultTy != StepTy) {
590 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
591 "Not truncating.");
592 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
593 auto *VecPreheader =
594 cast<VPBasicBlock>(Val: HeaderVPBB->getSingleHierarchicalPredecessor());
595 VPBuilder::InsertPointGuard Guard(Builder);
596 Builder.setInsertPoint(VecPreheader);
597 Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy, DL);
598 }
599 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, IV: BaseIV, Step,
600 VF: &Plan.getVF(), DL);
601}
602
603static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
604 SetVector<VPUser *> Users(llvm::from_range, V->users());
605 for (unsigned I = 0; I != Users.size(); ++I) {
606 VPRecipeBase *Cur = cast<VPRecipeBase>(Val: Users[I]);
607 if (isa<VPHeaderPHIRecipe>(Val: Cur))
608 continue;
609 for (VPValue *V : Cur->definedValues())
610 Users.insert_range(R: V->users());
611 }
612 return Users.takeVector();
613}
614
615 /// Legalize VPWidenPointerInductionRecipe by replacing it with a PtrAdd
616 /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
617 /// VPWidenPointerInductionRecipe will generate vectors only. If some users
618 /// require vectors while others require scalars, the scalar uses need to
619 /// extract the scalars from the generated vectors (note that this differs from
620 /// how int/fp inductions are handled). Legalize extract-from-ends that use
621 /// uniform VPReplicateRecipes of wide inductions to use regular
622 /// VPReplicateRecipes, so the correct end value is available. Also optimize
623 /// VPWidenIntOrFpInductionRecipe: if any of its users needs scalar values,
624 /// provide them with scalar steps built on the canonical scalar IV and update
625 /// the original IV's users. This is an optional optimization to reduce the
626 /// need for vector extracts.
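/// For illustration (sketch, recipe spellings abbreviated): a pointer IV whose
/// vector values are never used, e.g.
///   WIDEN-POINTER-INDUCTION %ptr.iv = %start, %step
/// is rewritten as
///   %steps    = SCALAR-STEPS 0, %step
///   %next.gep = ptradd %start, %steps
/// and all users of %ptr.iv are updated to use %next.gep instead.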
627static void legalizeAndOptimizeInductions(VPlan &Plan) {
628 using namespace llvm::VPlanPatternMatch;
629 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
630 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
631 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
632 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
633 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(Val: &Phi);
634 if (!PhiR)
635 continue;
636
637 // Try to narrow wide and replicating recipes to uniform recipes, based on
638 // VPlan analysis.
639 // TODO: Apply to all recipes in the future, to replace legacy uniformity
640 // analysis.
641 auto Users = collectUsersRecursively(V: PhiR);
642 for (VPUser *U : reverse(C&: Users)) {
643 auto *Def = dyn_cast<VPSingleDefRecipe>(Val: U);
644 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U);
645 // Skip recipes that shouldn't be narrowed.
646 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Val: Def) ||
647 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
648 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
649 continue;
650
651 // Skip recipes that may have lanes other than their first one used.
652 if (!vputils::isSingleScalar(VPV: Def) && !vputils::onlyFirstLaneUsed(Def))
653 continue;
654
655 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
656 Def->operands(), /*IsUniform*/ true);
657 Clone->insertAfter(InsertPos: Def);
658 Def->replaceAllUsesWith(New: Clone);
659 }
660
661 // Replace wide pointer inductions which have only their scalars used by
662 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
663 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(Val: &Phi)) {
664 if (!PtrIV->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF()))
665 continue;
666
667 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
668 VPValue *StartV =
669 Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: ID.getStep()->getType(), V: 0));
670 VPValue *StepV = PtrIV->getOperand(N: 1);
671 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
672 Plan, Kind: InductionDescriptor::IK_IntInduction, InductionOpcode: Instruction::Add, FPBinOp: nullptr,
673 TruncI: nullptr, StartV, Step: StepV, DL: PtrIV->getDebugLoc(), Builder);
674
675 VPValue *PtrAdd = Builder.createPtrAdd(Ptr: PtrIV->getStartValue(), Offset: Steps,
676 DL: PtrIV->getDebugLoc(), Name: "next.gep");
677
678 PtrIV->replaceAllUsesWith(New: PtrAdd);
679 continue;
680 }
681
682 // Replace widened induction with scalar steps for users that only use
683 // scalars.
684 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
685 if (HasOnlyVectorVFs && none_of(Range: WideIV->users(), P: [WideIV](VPUser *U) {
686 return U->usesScalars(Op: WideIV);
687 }))
688 continue;
689
690 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
691 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
692 Plan, Kind: ID.getKind(), InductionOpcode: ID.getInductionOpcode(),
693 FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
694 TruncI: WideIV->getTruncInst(), StartV: WideIV->getStartValue(), Step: WideIV->getStepValue(),
695 DL: WideIV->getDebugLoc(), Builder);
696
697 // Update scalar users of IV to use Steps instead.
698 if (!HasOnlyVectorVFs)
699 WideIV->replaceAllUsesWith(New: Steps);
700 else
701 WideIV->replaceUsesWithIf(New: Steps, ShouldReplace: [WideIV](VPUser &U, unsigned) {
702 return U.usesScalars(Op: WideIV);
703 });
704 }
705}
706
707/// Check if \p VPV is an untruncated wide induction, either before or after the
708 /// increment. If so, return the header IV (before the increment), otherwise
709/// return null.
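/// For example (sketch): for a wide induction %iv with step %step, both %iv
/// itself and its increment %iv.next = add %iv, %step are optimizable and
/// return %iv, while a truncated int/fp induction returns nullptr.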
710static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
711 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: VPV);
712 if (WideIV) {
713 // VPV itself is a wide induction, separately compute the end value for exit
714 // users if it is not a truncated IV.
715 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
716 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
717 }
718
719 // Check if VPV is an optimizable induction increment.
720 VPRecipeBase *Def = VPV->getDefiningRecipe();
721 if (!Def || Def->getNumOperands() != 2)
722 return nullptr;
723 WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 0));
724 if (!WideIV)
725 WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 1));
726 if (!WideIV)
727 return nullptr;
728
729 auto IsWideIVInc = [&]() {
730 using namespace VPlanPatternMatch;
731 auto &ID = WideIV->getInductionDescriptor();
732
733 // Check if VPV increments the induction by the induction step.
734 VPValue *IVStep = WideIV->getStepValue();
735 switch (ID.getInductionOpcode()) {
736 case Instruction::Add:
737 return match(V: VPV, P: m_c_Binary<Instruction::Add>(Op0: m_Specific(VPV: WideIV),
738 Op1: m_Specific(VPV: IVStep)));
739 case Instruction::FAdd:
740 return match(V: VPV, P: m_c_Binary<Instruction::FAdd>(Op0: m_Specific(VPV: WideIV),
741 Op1: m_Specific(VPV: IVStep)));
742 case Instruction::FSub:
743 return match(V: VPV, P: m_Binary<Instruction::FSub>(Op0: m_Specific(VPV: WideIV),
744 Op1: m_Specific(VPV: IVStep)));
745 case Instruction::Sub: {
746 // IVStep will be the negated step of the subtraction. Check if Step == -1
747 // * IVStep.
748 VPValue *Step;
749 if (!match(V: VPV,
750 P: m_Binary<Instruction::Sub>(Op0: m_VPValue(), Op1: m_VPValue(V&: Step))) ||
751 !Step->isLiveIn() || !IVStep->isLiveIn())
752 return false;
753 auto *StepCI = dyn_cast<ConstantInt>(Val: Step->getLiveInIRValue());
754 auto *IVStepCI = dyn_cast<ConstantInt>(Val: IVStep->getLiveInIRValue());
755 return StepCI && IVStepCI &&
756 StepCI->getValue() == (-1 * IVStepCI->getValue());
757 }
758 default:
759 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
760 match(V: VPV, P: m_GetElementPtr(Op0: m_Specific(VPV: WideIV),
761 Op1: m_Specific(VPV: WideIV->getStepValue())));
762 }
763 llvm_unreachable("should have been covered by switch above");
764 };
765 return IsWideIVInc() ? WideIV : nullptr;
766}
767
768/// Attempts to optimize the induction variable exit values for users in the
769/// early exit block.
770static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
771 VPTypeAnalysis &TypeInfo,
772 VPBlockBase *PredVPBB,
773 VPValue *Op) {
774 using namespace VPlanPatternMatch;
775
776 VPValue *Incoming, *Mask;
777 if (!match(V: Op, P: m_VPInstruction<Instruction::ExtractElement>(
778 Op0: m_VPValue(V&: Incoming),
779 Op1: m_VPInstruction<VPInstruction::FirstActiveLane>(
780 Op0: m_VPValue(V&: Mask)))))
781 return nullptr;
782
783 auto *WideIV = getOptimizableIVOf(VPV: Incoming);
784 if (!WideIV)
785 return nullptr;
786
787 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
788 if (WideIntOrFp && WideIntOrFp->getTruncInst())
789 return nullptr;
790
791 // Calculate the final index.
792 VPValue *EndValue = Plan.getCanonicalIV();
793 auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
794 VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB));
795
796 DebugLoc DL = cast<VPInstruction>(Val: Op)->getDebugLoc();
797 VPValue *FirstActiveLane =
798 B.createNaryOp(Opcode: VPInstruction::FirstActiveLane, Operands: Mask, DL);
799 Type *FirstActiveLaneType = TypeInfo.inferScalarType(V: FirstActiveLane);
800 FirstActiveLane = B.createScalarZExtOrTrunc(Op: FirstActiveLane, ResultTy: CanonicalIVType,
801 SrcTy: FirstActiveLaneType, DL);
802 EndValue = B.createNaryOp(Opcode: Instruction::Add, Operands: {EndValue, FirstActiveLane}, DL);
803
804 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
805 // Incoming differs from it, the exit is using the incremented value and we
806 // need to add the step.
807 if (Incoming != WideIV) {
808 VPValue *One = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: CanonicalIVType, V: 1));
809 EndValue = B.createNaryOp(Opcode: Instruction::Add, Operands: {EndValue, One}, DL);
810 }
811
812 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
813 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
814 VPValue *Start = WideIV->getStartValue();
815 VPValue *Step = WideIV->getStepValue();
816 EndValue = B.createDerivedIV(
817 Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
818 Start, Current: EndValue, Step);
819 }
820
821 return EndValue;
822}
823
824/// Attempts to optimize the induction variable exit values for users in the
825/// exit block coming from the latch in the original scalar loop.
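/// For example (sketch): the pre-computed end value corresponds to the IV
/// after its final increment; if the exit phi instead uses the pre-incremented
/// value %iv, one step is subtracted from that end value below.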
826static VPValue *
827optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo,
828 VPBlockBase *PredVPBB, VPValue *Op,
829 DenseMap<VPValue *, VPValue *> &EndValues) {
830 using namespace VPlanPatternMatch;
831
832 VPValue *Incoming;
833 if (!match(V: Op, P: m_VPInstruction<VPInstruction::ExtractLastElement>(
834 Op0: m_VPValue(V&: Incoming))))
835 return nullptr;
836
837 auto *WideIV = getOptimizableIVOf(VPV: Incoming);
838 if (!WideIV)
839 return nullptr;
840
841 VPValue *EndValue = EndValues.lookup(Val: WideIV);
842 assert(EndValue && "end value must have been pre-computed");
843
844 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
845 // Incoming differs from it, the exit is using the incremented value and we
846 // don't need to subtract the step.
847 if (Incoming != WideIV)
848 return EndValue;
849
850 // Otherwise, subtract the step from the EndValue.
851 VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB)->getTerminator());
852 VPValue *Step = WideIV->getStepValue();
853 Type *ScalarTy = TypeInfo.inferScalarType(V: WideIV);
854 if (ScalarTy->isIntegerTy())
855 return B.createNaryOp(Opcode: Instruction::Sub, Operands: {EndValue, Step}, Inst: {}, Name: "ind.escape");
856 if (ScalarTy->isPointerTy()) {
857 auto *Zero = Plan.getOrAddLiveIn(
858 V: ConstantInt::get(Ty: Step->getLiveInIRValue()->getType(), V: 0));
859 return B.createPtrAdd(Ptr: EndValue,
860 Offset: B.createNaryOp(Opcode: Instruction::Sub, Operands: {Zero, Step}), DL: {},
861 Name: "ind.escape");
862 }
863 if (ScalarTy->isFloatingPointTy()) {
864 const auto &ID = WideIV->getInductionDescriptor();
865 return B.createNaryOp(
866 Opcode: ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
867 ? Instruction::FSub
868 : Instruction::FAdd,
869 Operands: {EndValue, Step}, Flags: {ID.getInductionBinOp()->getFastMathFlags()});
870 }
871 llvm_unreachable("all possible induction types must be handled");
872 return nullptr;
873}
874
875void VPlanTransforms::optimizeInductionExitUsers(
876 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) {
877 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
878 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
879 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
880 for (VPRecipeBase &R : ExitVPBB->phis()) {
881 auto *ExitIRI = cast<VPIRPhi>(Val: &R);
882
883 for (auto [Idx, PredVPBB] : enumerate(First&: ExitVPBB->getPredecessors())) {
884 VPValue *Escape = nullptr;
885 if (PredVPBB == MiddleVPBB)
886 Escape = optimizeLatchExitInductionUser(
887 Plan, TypeInfo, PredVPBB, Op: ExitIRI->getOperand(N: Idx), EndValues);
888 else
889 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
890 Op: ExitIRI->getOperand(N: Idx));
891 if (Escape)
892 ExitIRI->setOperand(I: Idx, New: Escape);
893 }
894 }
895 }
896}
897
898 /// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
899/// them with already existing recipes expanding the same SCEV expression.
900static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
901 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
902
903 for (VPRecipeBase &R :
904 make_early_inc_range(Range&: *Plan.getEntry()->getEntryBasicBlock())) {
905 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
906 if (!ExpR)
907 continue;
908
909 auto I = SCEV2VPV.insert(KV: {ExpR->getSCEV(), ExpR});
910 if (I.second)
911 continue;
912 ExpR->replaceAllUsesWith(New: I.first->second);
913 ExpR->eraseFromParent();
914 }
915}
916
917static void recursivelyDeleteDeadRecipes(VPValue *V) {
918 SmallVector<VPValue *> WorkList;
919 SmallPtrSet<VPValue *, 8> Seen;
920 WorkList.push_back(Elt: V);
921
922 while (!WorkList.empty()) {
923 VPValue *Cur = WorkList.pop_back_val();
924 if (!Seen.insert(Ptr: Cur).second)
925 continue;
926 VPRecipeBase *R = Cur->getDefiningRecipe();
927 if (!R)
928 continue;
929 if (!isDeadRecipe(R&: *R))
930 continue;
931 WorkList.append(in_start: R->op_begin(), in_end: R->op_end());
932 R->eraseFromParent();
933 }
934}
935
936/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
937 /// non-nullptr Value for a handled \p Opcode if the corresponding \p Operands
938 /// are foldable live-ins.
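/// For example (sketch): an add whose operands are the constant live-ins 2 and
/// 3 folds to the IR constant 5, which the caller can then use to replace the
/// recipe's result.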
939static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
940 ArrayRef<VPValue *> Operands,
941 const DataLayout &DL, VPTypeAnalysis &TypeInfo) {
942 SmallVector<Value *, 4> Ops;
943 for (VPValue *Op : Operands) {
944 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
945 return nullptr;
946 Ops.push_back(Elt: Op->getLiveInIRValue());
947 }
948
949 InstSimplifyFolder Folder(DL);
950 if (Instruction::isBinaryOp(Opcode))
951 return Folder.FoldBinOp(Opc: static_cast<Instruction::BinaryOps>(Opcode), LHS: Ops[0],
952 RHS: Ops[1]);
953 if (Instruction::isCast(Opcode))
954 return Folder.FoldCast(Op: static_cast<Instruction::CastOps>(Opcode), V: Ops[0],
955 DestTy: TypeInfo.inferScalarType(V: R.getVPSingleValue()));
956 switch (Opcode) {
957 case VPInstruction::LogicalAnd:
958 return Folder.FoldSelect(C: Ops[0], True: Ops[1],
959 False: ConstantInt::getNullValue(Ty: Ops[1]->getType()));
960 case VPInstruction::Not:
961 return Folder.FoldBinOp(Opc: Instruction::BinaryOps::Xor, LHS: Ops[0],
962 RHS: Constant::getAllOnesValue(Ty: Ops[0]->getType()));
963 case Instruction::Select:
964 return Folder.FoldSelect(C: Ops[0], True: Ops[1], False: Ops[2]);
965 case Instruction::ICmp:
966 case Instruction::FCmp:
967 return Folder.FoldCmp(P: cast<VPRecipeWithIRFlags>(Val: R).getPredicate(), LHS: Ops[0],
968 RHS: Ops[1]);
969 case Instruction::GetElementPtr: {
970 auto &RFlags = cast<VPRecipeWithIRFlags>(Val: R);
971 auto *GEP = cast<GetElementPtrInst>(Val: RFlags.getUnderlyingInstr());
972 return Folder.FoldGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0], IdxList: drop_begin(RangeOrContainer&: Ops),
973 NW: RFlags.getGEPNoWrapFlags());
974 }
975 case VPInstruction::PtrAdd:
976 return Folder.FoldGEP(Ty: IntegerType::getInt8Ty(C&: TypeInfo.getContext()), Ptr: Ops[0],
977 IdxList: Ops[1],
978 NW: cast<VPRecipeWithIRFlags>(Val: R).getGEPNoWrapFlags());
979 case Instruction::InsertElement:
980 return Folder.FoldInsertElement(Vec: Ops[0], NewElt: Ops[1], Idx: Ops[2]);
981 case Instruction::ExtractElement:
982 return Folder.FoldExtractElement(Vec: Ops[0], Idx: Ops[1]);
983 }
984 return nullptr;
985}
986
987/// Try to simplify recipe \p R.
988static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
989 using namespace llvm::VPlanPatternMatch;
990 VPlan *Plan = R.getParent()->getPlan();
991
992 auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R);
993 if (!Def)
994 return;
995
996 // Simplification of live-in IR values for SingleDef recipes using
997 // InstSimplifyFolder.
998 if (TypeSwitch<VPRecipeBase *, bool>(&R)
999 .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
1000 VPReplicateRecipe>(caseFn: [&](auto *I) {
1001 const DataLayout &DL =
1002 Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1003 Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
1004 TypeInfo);
1005 if (V)
1006 I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));
1007 return V;
1008 })
1009 .Default(defaultFn: [](auto *) { return false; }))
1010 return;
1011
1012 // Fold PredPHI LiveIn -> LiveIn.
1013 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Val: &R)) {
1014 VPValue *Op = PredPHI->getOperand(N: 0);
1015 if (Op->isLiveIn())
1016 PredPHI->replaceAllUsesWith(New: Op);
1017 }
1018
1019 VPValue *A;
1020 if (match(V: Def, P: m_Trunc(Op0: m_ZExtOrSExt(Op0: m_VPValue(V&: A))))) {
1021 Type *TruncTy = TypeInfo.inferScalarType(V: Def);
1022 Type *ATy = TypeInfo.inferScalarType(V: A);
1023 if (TruncTy == ATy) {
1024 Def->replaceAllUsesWith(New: A);
1025 } else {
1026 // Don't replace a scalarizing recipe with a widened cast.
1027 if (isa<VPReplicateRecipe>(Val: Def))
1028 return;
1029 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1030
1031 unsigned ExtOpcode = match(V: R.getOperand(N: 0), P: m_SExt(Op0: m_VPValue()))
1032 ? Instruction::SExt
1033 : Instruction::ZExt;
1034 auto *VPC =
1035 new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
1036 if (auto *UnderlyingExt = R.getOperand(N: 0)->getUnderlyingValue()) {
1037 // UnderlyingExt has distinct return type, used to retain legacy cost.
1038 VPC->setUnderlyingValue(UnderlyingExt);
1039 }
1040 VPC->insertBefore(InsertPos: &R);
1041 Def->replaceAllUsesWith(New: VPC);
1042 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1043 auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
1044 VPC->insertBefore(InsertPos: &R);
1045 Def->replaceAllUsesWith(New: VPC);
1046 }
1047 }
1048#ifndef NDEBUG
1049 // Verify that the cached type info for both A and its users is still
1050 // accurate by comparing it to freshly computed types.
1051 VPTypeAnalysis TypeInfo2(Plan->getCanonicalIV()->getScalarType());
1052 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1053 for (VPUser *U : A->users()) {
1054 auto *R = cast<VPRecipeBase>(U);
1055 for (VPValue *VPV : R->definedValues())
1056 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1057 }
1058#endif
1059 }
1060
1061 // Simplify (X && Y) || (X && !Y) -> X.
1062 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1063 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1064 // recipes to be visited during simplification.
1065 VPValue *X, *Y;
1066 if (match(V: Def,
1067 P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
1068 Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_Not(Op0: m_Deferred(V: Y)))))) {
1069 Def->replaceAllUsesWith(New: X);
1070 Def->eraseFromParent();
1071 return;
1072 }
1073
1074 // OR x, 1 -> 1.
1075 if (match(V: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_AllOnes()))) {
1076 Def->replaceAllUsesWith(New: Def->getOperand(N: 0) == X ? Def->getOperand(N: 1)
1077 : Def->getOperand(N: 0));
1078 Def->eraseFromParent();
1079 return;
1080 }
1081
1082 if (match(V: Def, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: X), Op2: m_Deferred(V: X))))
1083 return Def->replaceAllUsesWith(New: X);
1084
1085 if (match(V: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_SpecificInt(V: 1))))
1086 return Def->replaceAllUsesWith(New: A);
1087
1088 if (match(V: Def, P: m_Not(Op0: m_VPValue(V&: A)))) {
1089 if (match(V: A, P: m_Not(Op0: m_VPValue(V&: A))))
1090 return Def->replaceAllUsesWith(New: A);
1091
1092 // Try to fold Not into compares by adjusting the predicate in-place.
1093 if (isa<VPWidenRecipe>(Val: A) && A->getNumUsers() == 1) {
1094 auto *WideCmp = cast<VPWidenRecipe>(Val: A);
1095 if (WideCmp->getOpcode() == Instruction::ICmp ||
1096 WideCmp->getOpcode() == Instruction::FCmp) {
1097 WideCmp->setPredicate(
1098 CmpInst::getInversePredicate(pred: WideCmp->getPredicate()));
1099 Def->replaceAllUsesWith(New: WideCmp);
1100 // If WideCmp doesn't have a debug location, use the one from the
1101 // negation, to preserve the location.
1102 if (!WideCmp->getDebugLoc() && R.getDebugLoc())
1103 WideCmp->setDebugLoc(R.getDebugLoc());
1104 }
1105 }
1106 }
1107
1108 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1109 if ((match(V: Def,
1110 P: m_DerivedIV(Op0: m_SpecificInt(V: 0), Op1: m_VPValue(V&: A), Op2: m_SpecificInt(V: 1))) ||
1111 match(V: Def,
1112 P: m_DerivedIV(Op0: m_SpecificInt(V: 0), Op1: m_SpecificInt(V: 0), Op2: m_VPValue()))) &&
1113 TypeInfo.inferScalarType(V: Def->getOperand(N: 1)) ==
1114 TypeInfo.inferScalarType(V: Def))
1115 return Def->replaceAllUsesWith(New: Def->getOperand(N: 1));
1116
1117 if (match(V: Def, P: m_VPInstruction<VPInstruction::WideIVStep>(
1118 Op0: m_VPValue(V&: X), Op1: m_SpecificInt(V: 1)))) {
1119 Type *WideStepTy = TypeInfo.inferScalarType(V: Def);
1120 if (TypeInfo.inferScalarType(V: X) != WideStepTy)
1121 X = VPBuilder(Def).createWidenCast(Opcode: Instruction::Trunc, Op: X, ResultTy: WideStepTy);
1122 Def->replaceAllUsesWith(New: X);
1123 return;
1124 }
1125
1126 // For i1 vp.merges produced by AnyOf reductions:
1127 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1128 if (match(V: Def, P: m_Intrinsic<Intrinsic::vp_merge>(Op0: m_True(), Op1: m_VPValue(V&: A),
1129 Op2: m_VPValue(V&: X), Op3: m_VPValue())) &&
1130 match(V: A, P: m_c_BinaryOr(Op0: m_Specific(VPV: X), Op1: m_VPValue(V&: Y))) &&
1131 TypeInfo.inferScalarType(V: R.getVPSingleValue())->isIntegerTy(Bitwidth: 1)) {
1132 Def->setOperand(I: 1, New: Def->getOperand(N: 0));
1133 Def->setOperand(I: 0, New: Y);
1134 return;
1135 }
1136
1137 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: Def)) {
1138 if (Phi->getOperand(N: 0) == Phi->getOperand(N: 1))
1139 Def->replaceAllUsesWith(New: Phi->getOperand(N: 0));
1140 return;
1141 }
1142
1143 // Look through ExtractLastElement (BuildVector ....).
1144 if (match(V: &R, P: m_VPInstruction<VPInstruction::ExtractLastElement>(
1145 Op0: m_BuildVector()))) {
1146 auto *BuildVector = cast<VPInstruction>(Val: R.getOperand(N: 0));
1147 Def->replaceAllUsesWith(
1148 New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 1));
1149 return;
1150 }
1151
1152 // Look through ExtractPenultimateElement (BuildVector ....).
1153 if (match(V: &R, P: m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1154 Op0: m_BuildVector()))) {
1155 auto *BuildVector = cast<VPInstruction>(Val: R.getOperand(N: 0));
1156 Def->replaceAllUsesWith(
1157 New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 2));
1158 return;
1159 }
1160
1161 // Some simplifications can only be applied after unrolling. Perform them
1162 // below.
1163 if (!Plan->isUnrolled())
1164 return;
1165
1166 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1167 // the first lane is demanded.
1168 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: Def)) {
1169 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Def: Steps)) {
1170 Steps->replaceAllUsesWith(New: Steps->getOperand(N: 0));
1171 return;
1172 }
1173 }
1174 // Simplify redundant ReductionStartVector recipes after unrolling.
1175 VPValue *StartV;
1176 if (match(V: Def, P: m_VPInstruction<VPInstruction::ReductionStartVector>(
1177 Op0: m_VPValue(V&: StartV), Op1: m_VPValue(), Op2: m_VPValue()))) {
1178 Def->replaceUsesWithIf(New: StartV, ShouldReplace: [](const VPUser &U, unsigned Idx) {
1179 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &U);
1180 return PhiR && PhiR->isInLoop();
1181 });
1182 return;
1183 }
1184
1185 if (match(V: Def,
1186 P: m_VPInstruction<VPInstruction::ExtractLastElement>(
1187 Op0: m_VPInstruction<VPInstruction::Broadcast>(Op0: m_VPValue(V&: A))))) {
1188 Def->replaceAllUsesWith(New: A);
1189 return;
1190 }
1191
1192 VPInstruction *OpVPI;
1193 if (match(V: Def, P: m_VPInstruction<VPInstruction::ExtractLastElement>(
1194 Op0: m_VPInstruction(V&: OpVPI))) &&
1195 OpVPI->isVectorToScalar()) {
1196 Def->replaceAllUsesWith(New: OpVPI);
1197 return;
1198 }
1199}
1200
1201void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
1202 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1203 Plan.getEntry());
1204 VPTypeAnalysis TypeInfo(&CanonicalIVTy);
1205 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
1206 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
1207 simplifyRecipe(R, TypeInfo);
1208 }
1209 }
1210}
1211
1212static void narrowToSingleScalarRecipes(VPlan &Plan) {
1213 if (Plan.hasScalarVFOnly())
1214 return;
1215
1216 // Try to narrow wide and replicating recipes to single scalar recipes,
1217 // based on VPlan analysis. Only process blocks in the loop region for now,
1218 // without traversing into nested regions, as recipes in replicate regions
1219 // cannot be converted yet.
1220 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1221 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
1222 for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
1223 if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(Val: &R))
1224 continue;
1225 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
1226 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1227 continue;
1228
1229 auto *RepOrWidenR = cast<VPSingleDefRecipe>(Val: &R);
1230 // Skip recipes that aren't single scalars or don't have only their
1231 // scalar results used. In the latter case, we would introduce extra
1232 // broadcasts.
1233 if (!vputils::isSingleScalar(VPV: RepOrWidenR) ||
1234 any_of(Range: RepOrWidenR->users(), P: [RepOrWidenR](VPUser *U) {
1235 return !U->usesScalars(Op: RepOrWidenR);
1236 }))
1237 continue;
1238
1239 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1240 RepOrWidenR->operands(),
1241 true /*IsSingleScalar*/);
1242 Clone->insertBefore(InsertPos: RepOrWidenR);
1243 RepOrWidenR->replaceAllUsesWith(New: Clone);
1244 }
1245 }
1246}
1247
1248/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1249/// to make sure the masks are simplified.
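/// For illustration (sketch): a non-normalized blend such as
///   BLEND %a/%m1, %b/%m2
/// where %m1 is only used by the blend is normalized to
///   BLEND %a, %b/%m2
/// i.e. %a becomes the unmasked fall-through value and %m1 can be dead-coded.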
1250static void simplifyBlends(VPlan &Plan) {
1251 using namespace llvm::VPlanPatternMatch;
1252 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1253 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
1254 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
1255 auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R);
1256 if (!Blend)
1257 continue;
1258
1259 // Try to remove redundant blend recipes.
1260 SmallPtrSet<VPValue *, 4> UniqueValues;
1261 if (Blend->isNormalized() || !match(V: Blend->getMask(Idx: 0), P: m_False()))
1262 UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: 0));
1263 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1264 if (!match(V: Blend->getMask(Idx: I), P: m_False()))
1265 UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: I));
1266
1267 if (UniqueValues.size() == 1) {
1268 Blend->replaceAllUsesWith(New: *UniqueValues.begin());
1269 Blend->eraseFromParent();
1270 continue;
1271 }
1272
1273 if (Blend->isNormalized())
1274 continue;
1275
1276 // Normalize the blend so its first incoming value is used as the initial
1277 // value with the others blended into it.
1278
1279 unsigned StartIndex = 0;
1280 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1281 // If a value's mask is used only by the blend then it can be dead-coded.
1282 // TODO: Find the most expensive mask that can be dead-coded, or a mask
1283 // that's used by multiple blends where it can be removed from them all.
1284 VPValue *Mask = Blend->getMask(Idx: I);
1285 if (Mask->getNumUsers() == 1 && !match(V: Mask, P: m_False())) {
1286 StartIndex = I;
1287 break;
1288 }
1289 }
1290
1291 SmallVector<VPValue *, 4> OperandsWithMask;
1292 OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: StartIndex));
1293
1294 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1295 if (I == StartIndex)
1296 continue;
1297 OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: I));
1298 OperandsWithMask.push_back(Elt: Blend->getMask(Idx: I));
1299 }
1300
1301 auto *NewBlend = new VPBlendRecipe(
1302 cast<PHINode>(Val: Blend->getUnderlyingValue()), OperandsWithMask);
1303 NewBlend->insertBefore(InsertPos: &R);
1304
1305 VPValue *DeadMask = Blend->getMask(Idx: StartIndex);
1306 Blend->replaceAllUsesWith(New: NewBlend);
1307 Blend->eraseFromParent();
1308 recursivelyDeleteDeadRecipes(V: DeadMask);
1309
1310 // Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1311 VPValue *NewMask;
1312 if (NewBlend->getNumOperands() == 3 &&
1313 match(V: NewBlend->getMask(Idx: 1), P: m_Not(Op0: m_VPValue(V&: NewMask)))) {
1314 VPValue *Inc0 = NewBlend->getOperand(N: 0);
1315 VPValue *Inc1 = NewBlend->getOperand(N: 1);
1316 VPValue *OldMask = NewBlend->getOperand(N: 2);
1317 NewBlend->setOperand(I: 0, New: Inc1);
1318 NewBlend->setOperand(I: 1, New: Inc0);
1319 NewBlend->setOperand(I: 2, New: NewMask);
1320 if (OldMask->getNumUsers() == 0)
1321 cast<VPInstruction>(Val: OldMask)->eraseFromParent();
1322 }
1323 }
1324 }
1325}
1326
1327 /// Optimize the width of vector induction variables in \p Plan based on a
1328 /// known constant trip count, \p BestVF and \p BestUF.
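/// For example (sketch): with a trip count of 1000, VF = 4 and UF = 2, the
/// trip count rounded up to a multiple of VF * UF = 8 is 1000, so the largest
/// IV value is 999 (10 active bits) and the IV can be narrowed to i16, the
/// smallest power-of-2 width >= 10 bits (clamped to a minimum of 8).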
1329static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1330 ElementCount BestVF,
1331 unsigned BestUF) {
1332 // Only proceed if we have not completely removed the vector region.
1333 if (!Plan.getVectorLoopRegion())
1334 return false;
1335
1336 if (!Plan.getTripCount()->isLiveIn())
1337 return false;
1338 auto *TC = dyn_cast_if_present<ConstantInt>(
1339 Val: Plan.getTripCount()->getUnderlyingValue());
1340 if (!TC || !BestVF.isFixed())
1341 return false;
1342
1343 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1344 // and UF. Returns at least 8.
1345 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1346 APInt AlignedTC =
1347 Align * APIntOps::RoundingUDiv(A: TC, B: APInt(TC.getBitWidth(), Align),
1348 RM: APInt::Rounding::UP);
1349 APInt MaxVal = AlignedTC - 1;
1350 return std::max<unsigned>(a: PowerOf2Ceil(A: MaxVal.getActiveBits()), b: 8);
1351 };
1352 unsigned NewBitWidth =
1353 ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
1354
1355 LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
1356 auto *NewIVTy = IntegerType::get(C&: Ctx, NumBits: NewBitWidth);
1357
1358 bool MadeChange = false;
1359
1360 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1361 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1362 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
1363
1364 // Currently only handle canonical IVs, as it is trivial to replace the start
1365 // and stop values, and only perform the optimization when the IV has a
1366 // single use.
1367 if (!WideIV || !WideIV->isCanonical() ||
1368 WideIV->hasMoreThanOneUniqueUser() ||
1369 NewIVTy == WideIV->getScalarType())
1370 continue;
1371
1372 // Currently only handle cases where the single user is a header-mask
1373 // comparison with the backedge-taken-count.
1374 using namespace VPlanPatternMatch;
1375 if (!match(
1376 U: *WideIV->user_begin(),
1377 P: m_Binary<Instruction::ICmp>(
1378 Op0: m_Specific(VPV: WideIV),
1379 Op1: m_Broadcast(Op0: m_Specific(VPV: Plan.getOrCreateBackedgeTakenCount())))))
1380 continue;
1381
1382 // Update IV operands and comparison bound to use new narrower type.
1383 auto *NewStart = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: NewIVTy, V: 0));
1384 WideIV->setStartValue(NewStart);
1385 auto *NewStep = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: NewIVTy, V: 1));
1386 WideIV->setStepValue(NewStep);
1387
1388 auto *NewBTC = new VPWidenCastRecipe(
1389 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1390 Plan.getVectorPreheader()->appendRecipe(Recipe: NewBTC);
1391 auto *Cmp = cast<VPInstruction>(Val: *WideIV->user_begin());
1392 Cmp->setOperand(I: 1, New: NewBTC);
1393
1394 MadeChange = true;
1395 }
1396
1397 return MadeChange;
1398}
1399
1400/// Return true if \p Cond is known to be true for given \p BestVF and \p
1401/// BestUF.
1402static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1403 ElementCount BestVF, unsigned BestUF,
1404 ScalarEvolution &SE) {
1405 using namespace llvm::VPlanPatternMatch;
1406 if (match(V: Cond, P: m_Binary<Instruction::Or>(Op0: m_VPValue(), Op1: m_VPValue())))
1407 return any_of(Range: Cond->getDefiningRecipe()->operands(), P: [&Plan, BestVF, BestUF,
1408 &SE](VPValue *C) {
1409 return isConditionTrueViaVFAndUF(Cond: C, Plan, BestVF, BestUF, SE);
1410 });
1411
1412 auto *CanIV = Plan.getCanonicalIV();
1413 if (!match(V: Cond, P: m_Binary<Instruction::ICmp>(
1414 Op0: m_Specific(VPV: CanIV->getBackedgeValue()),
1415 Op1: m_Specific(VPV: &Plan.getVectorTripCount()))) ||
1416 cast<VPRecipeWithIRFlags>(Val: Cond->getDefiningRecipe())->getPredicate() !=
1417 CmpInst::ICMP_EQ)
1418 return false;
1419
1420 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1421 // count is not conveniently available as SCEV so far, so we compare directly
1422 // against the original trip count. This is stricter than necessary, as we
1423 // will only return true if the trip count == vector trip count.
1424 // TODO: Use SCEV for vector trip count once available, to cover cases where
1425 // vector trip count == UF * VF, but original trip count != UF * VF.
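  // For instance (an illustrative example with assumed values): with an
  // original trip count of 16, BestVF = 4 and BestUF = 4, VF * UF equals the
  // trip count, so SCEV can prove the compare always folds to true.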
1426 const SCEV *TripCount =
1427 vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), SE);
1428 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1429 "Trip count SCEV must be computable");
1430 ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
1431 const SCEV *C = SE.getElementCount(Ty: TripCount->getType(), EC: NumElements);
1432 return SE.isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: TripCount, RHS: C);
1433}
1434
1435/// Try to simplify the branch condition of \p Plan. This may restrict the
1436/// resulting plan to \p BestVF and \p BestUF.
1437static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1438 unsigned BestUF,
1439 PredicatedScalarEvolution &PSE) {
1440 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1441 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1442 auto *Term = &ExitingVPBB->back();
1443 VPValue *Cond;
1444 ScalarEvolution &SE = *PSE.getSE();
1445 using namespace llvm::VPlanPatternMatch;
1446 if (match(V: Term, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue())) ||
1447 match(V: Term, P: m_BranchOnCond(
1448 Op0: m_Not(Op0: m_ActiveLaneMask(Op0: m_VPValue(), Op1: m_VPValue()))))) {
1449 // Try to simplify the branch condition if TC <= VF * UF when the latch
1450 // terminator is BranchOnCount or BranchOnCond where the input is
1451 // Not(ActiveLaneMask).
1452 const SCEV *TripCount =
1453 vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), SE);
1454 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1455 "Trip count SCEV must be computable");
1456 ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
1457 const SCEV *C = SE.getElementCount(Ty: TripCount->getType(), EC: NumElements);
1458 if (TripCount->isZero() ||
1459 !SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: TripCount, RHS: C))
1460 return false;
1461 } else if (match(V: Term, P: m_BranchOnCond(Op0: m_VPValue(V&: Cond)))) {
1462 // For BranchOnCond, check if we can prove the condition to be true using VF
1463 // and UF.
1464 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1465 return false;
1466 } else {
1467 return false;
1468 }
1469
1470 // The vector loop region only executes once. If possible, completely remove
1471 // the region, otherwise replace the terminator controlling the latch with
1472 // (BranchOnCond true).
1473 auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry());
1474 auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
1475 if (all_of(
1476 Range: Header->phis(),
1477 P: IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) {
1478 for (VPRecipeBase &HeaderR : make_early_inc_range(Range: Header->phis())) {
1479 auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(Val: &HeaderR);
1480 HeaderPhiR->replaceAllUsesWith(New: HeaderPhiR->getStartValue());
1481 HeaderPhiR->eraseFromParent();
1482 }
1483
1484 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1485 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1486 VPBlockUtils::disconnectBlocks(From: Preheader, To: VectorRegion);
1487 VPBlockUtils::disconnectBlocks(From: VectorRegion, To: Exit);
1488
1489 for (VPBlockBase *B : vp_depth_first_shallow(G: VectorRegion->getEntry()))
1490 B->setParent(nullptr);
1491
1492 VPBlockUtils::connectBlocks(From: Preheader, To: Header);
1493 VPBlockUtils::connectBlocks(From: ExitingVPBB, To: Exit);
1494 VPlanTransforms::simplifyRecipes(Plan, CanonicalIVTy&: *CanIVTy);
1495 } else {
1496 // The vector region contains header phis for which we cannot remove the
1497 // loop region yet.
1498 LLVMContext &Ctx = SE.getContext();
1499 auto *BOC = new VPInstruction(
1500 VPInstruction::BranchOnCond,
1501 {Plan.getOrAddLiveIn(V: ConstantInt::getTrue(Context&: Ctx))}, Term->getDebugLoc());
1502 ExitingVPBB->appendRecipe(Recipe: BOC);
1503 }
1504
1505 Term->eraseFromParent();
1506
1507 return true;
1508}
1509
1510void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1511 unsigned BestUF,
1512 PredicatedScalarEvolution &PSE) {
1513 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1514 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1515
1516 bool MadeChange =
1517 simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1518 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1519
1520 if (MadeChange) {
1521 Plan.setVF(BestVF);
1522 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1523 }
1524 // TODO: Further simplifications are possible
1525 // 1. Replace inductions with constants.
1526 // 2. Replace vector loop region with VPBasicBlock.
1527}
1528
1529/// Sink users of \p FOR after the recipe defining the previous value \p
1530/// Previous of the recurrence. \returns true if all users of \p FOR could be
1531/// re-arranged as needed or false if it is not possible.
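/// For example (an illustrative sketch; recipe names are assumed):
///   %for  = FIRST-ORDER-RECURRENCE-PHI %init, %prev
///   %use  = add %for, 1    ; user of the recurrence phi
///   %prev = load %ptr      ; Previous, defined after the user
/// is rearranged, when legal, into
///   %for  = FIRST-ORDER-RECURRENCE-PHI %init, %prev
///   %prev = load %ptr
///   %use  = add %for, 1    ; sunk after Previous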
1532static bool
1533sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
1534 VPRecipeBase *Previous,
1535 VPDominatorTree &VPDT) {
1536 // Collect recipes that need sinking.
1537 SmallVector<VPRecipeBase *> WorkList;
1538 SmallPtrSet<VPRecipeBase *, 8> Seen;
1539 Seen.insert(Ptr: Previous);
1540 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
    // The previous value must not depend on the users of the recurrence phi.
    // If it did, FOR would not be a fixed-order recurrence.
1543 if (SinkCandidate == Previous)
1544 return false;
1545
1546 if (isa<VPHeaderPHIRecipe>(Val: SinkCandidate) ||
1547 !Seen.insert(Ptr: SinkCandidate).second ||
1548 VPDT.properlyDominates(A: Previous, B: SinkCandidate))
1549 return true;
1550
1551 if (SinkCandidate->mayHaveSideEffects())
1552 return false;
1553
1554 WorkList.push_back(Elt: SinkCandidate);
1555 return true;
1556 };
1557
1558 // Recursively sink users of FOR after Previous.
1559 WorkList.push_back(Elt: FOR);
1560 for (unsigned I = 0; I != WorkList.size(); ++I) {
1561 VPRecipeBase *Current = WorkList[I];
1562 assert(Current->getNumDefinedValues() == 1 &&
1563 "only recipes with a single defined value expected");
1564
1565 for (VPUser *User : Current->getVPSingleValue()->users()) {
1566 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(Val: User)))
1567 return false;
1568 }
1569 }
1570
1571 // Keep recipes to sink ordered by dominance so earlier instructions are
1572 // processed first.
1573 sort(C&: WorkList, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1574 return VPDT.properlyDominates(A, B);
1575 });
1576
1577 for (VPRecipeBase *SinkCandidate : WorkList) {
1578 if (SinkCandidate == FOR)
1579 continue;
1580
1581 SinkCandidate->moveAfter(MovePos: Previous);
1582 Previous = SinkCandidate;
1583 }
1584 return true;
1585}
1586
1587/// Try to hoist \p Previous and its operands before all users of \p FOR.
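/// For example (an illustrative sketch; recipe names are assumed), given
///   %use  = add %for, 1    ; user of the recurrence phi
///   %prev = add %x, %y     ; Previous, side-effect free
/// the recipe %prev (and any of its operands defined in the loop) may be
/// hoisted before %use when %use cannot be sunk after %prev.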
1588static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
1589 VPRecipeBase *Previous,
1590 VPDominatorTree &VPDT) {
1591 if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
1592 return false;
1593
1594 // Collect recipes that need hoisting.
1595 SmallVector<VPRecipeBase *> HoistCandidates;
1596 SmallPtrSet<VPRecipeBase *, 8> Visited;
1597 VPRecipeBase *HoistPoint = nullptr;
1598 // Find the closest hoist point by looking at all users of FOR and selecting
1599 // the recipe dominating all other users.
1600 for (VPUser *U : FOR->users()) {
1601 auto *R = cast<VPRecipeBase>(Val: U);
1602 if (!HoistPoint || VPDT.properlyDominates(A: R, B: HoistPoint))
1603 HoistPoint = R;
1604 }
1605 assert(all_of(FOR->users(),
1606 [&VPDT, HoistPoint](VPUser *U) {
1607 auto *R = cast<VPRecipeBase>(U);
1608 return HoistPoint == R ||
1609 VPDT.properlyDominates(HoistPoint, R);
1610 }) &&
1611 "HoistPoint must dominate all users of FOR");
1612
1613 auto NeedsHoisting = [HoistPoint, &VPDT,
1614 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1615 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1616 if (!HoistCandidate)
1617 return nullptr;
1618 VPRegionBlock *EnclosingLoopRegion =
1619 HoistCandidate->getParent()->getEnclosingLoopRegion();
1620 assert((!HoistCandidate->getParent()->getParent() ||
1621 HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
1622 "CFG in VPlan should still be flat, without replicate regions");
1623 // Hoist candidate was already visited, no need to hoist.
1624 if (!Visited.insert(Ptr: HoistCandidate).second)
1625 return nullptr;
1626
    // The candidate is outside the loop region or is a header phi; it already
    // dominates the users of FOR without hoisting.
1629 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(Val: HoistCandidate))
1630 return nullptr;
1631
1632 // If we reached a recipe that dominates HoistPoint, we don't need to
1633 // hoist the recipe.
1634 if (VPDT.properlyDominates(A: HoistCandidate, B: HoistPoint))
1635 return nullptr;
1636 return HoistCandidate;
1637 };
1638 auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
1639 // Avoid hoisting candidates with side-effects, as we do not yet analyze
1640 // associated dependencies.
1641 return !HoistCandidate->mayHaveSideEffects();
1642 };
1643
1644 if (!NeedsHoisting(Previous->getVPSingleValue()))
1645 return true;
1646
1647 // Recursively try to hoist Previous and its operands before all users of FOR.
1648 HoistCandidates.push_back(Elt: Previous);
1649
1650 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1651 VPRecipeBase *Current = HoistCandidates[I];
1652 assert(Current->getNumDefinedValues() == 1 &&
1653 "only recipes with a single defined value expected");
1654 if (!CanHoist(Current))
1655 return false;
1656
1657 for (VPValue *Op : Current->operands()) {
1658 // If we reach FOR, it means the original Previous depends on some other
1659 // recurrence that in turn depends on FOR. If that is the case, we would
1660 // also need to hoist recipes involving the other FOR, which may break
1661 // dependencies.
1662 if (Op == FOR)
1663 return false;
1664
1665 if (auto *R = NeedsHoisting(Op))
1666 HoistCandidates.push_back(Elt: R);
1667 }
1668 }
1669
1670 // Order recipes to hoist by dominance so earlier instructions are processed
1671 // first.
1672 sort(C&: HoistCandidates, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1673 return VPDT.properlyDominates(A, B);
1674 });
1675
1676 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1677 HoistCandidate->moveBefore(BB&: *HoistPoint->getParent(),
1678 I: HoistPoint->getIterator());
1679 }
1680
1681 return true;
1682}
1683
1684bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
1685 VPBuilder &LoopBuilder) {
1686 VPDominatorTree VPDT;
1687 VPDT.recalculate(Func&: Plan);
1688
1689 SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
1690 for (VPRecipeBase &R :
1691 Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
1692 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R))
1693 RecurrencePhis.push_back(Elt: FOR);
1694
1695 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1696 SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
1697 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1698 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1699 // to terminate.
1700 while (auto *PrevPhi =
1701 dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Val: Previous)) {
1702 assert(PrevPhi->getParent() == FOR->getParent());
1703 assert(SeenPhis.insert(PrevPhi).second);
1704 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
1705 }
1706
1707 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
1708 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
1709 return false;
1710
1711 // Introduce a recipe to combine the incoming and previous values of a
1712 // fixed-order recurrence.
1713 VPBasicBlock *InsertBlock = Previous->getParent();
1714 if (isa<VPHeaderPHIRecipe>(Val: Previous))
1715 LoopBuilder.setInsertPoint(TheBB: InsertBlock, IP: InsertBlock->getFirstNonPhi());
1716 else
1717 LoopBuilder.setInsertPoint(TheBB: InsertBlock,
1718 IP: std::next(x: Previous->getIterator()));
1719
1720 auto *RecurSplice =
1721 LoopBuilder.createNaryOp(Opcode: VPInstruction::FirstOrderRecurrenceSplice,
1722 Operands: {FOR, FOR->getBackedgeValue()});
1723
1724 FOR->replaceAllUsesWith(New: RecurSplice);
1725 // Set the first operand of RecurSplice to FOR again, after replacing
1726 // all users.
1727 RecurSplice->setOperand(I: 0, New: FOR);
1728 }
1729 return true;
1730}
1731
1732void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
1733 for (VPRecipeBase &R :
1734 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
1735 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
1736 if (!PhiR)
1737 continue;
1738 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
1739 RecurKind RK = RdxDesc.getRecurrenceKind();
1740 if (RK != RecurKind::Add && RK != RecurKind::Mul)
1741 continue;
1742
1743 for (VPUser *U : collectUsersRecursively(V: PhiR))
1744 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: U)) {
1745 RecWithFlags->dropPoisonGeneratingFlags();
1746 }
1747 }
1748}
1749
1750/// Move loop-invariant recipes out of the vector loop region in \p Plan.
1751static void licm(VPlan &Plan) {
1752 VPBasicBlock *Preheader = Plan.getVectorPreheader();
1753
1754 // Return true if we do not know how to (mechanically) hoist a given recipe
1755 // out of a loop region. Does not address legality concerns such as aliasing
1756 // or speculation safety.
1757 auto CannotHoistRecipe = [](VPRecipeBase &R) {
1758 // Allocas cannot be hoisted.
1759 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
1760 return RepR && RepR->getOpcode() == Instruction::Alloca;
1761 };
1762
  // Hoist any loop-invariant recipes from the vector loop region to the
  // preheader. Perform a shallow traversal of the vector loop region, to
  // exclude recipes in replicate regions.
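  // For example (an illustrative sketch with assumed names), a recipe such as
  //   %splat = broadcast ir<%invariant>
  // whose operands are all defined outside the loop region is moved from the
  // loop body to the vector preheader.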
1766 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1767 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1768 Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) {
1769 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
1770 if (CannotHoistRecipe(R))
1771 continue;
1772 // TODO: Relax checks in the future, e.g. we could also hoist reads, if
1773 // their memory location is not modified in the vector loop.
1774 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
1775 any_of(Range: R.operands(), P: [](VPValue *Op) {
1776 return !Op->isDefinedOutsideLoopRegions();
1777 }))
1778 continue;
1779 R.moveBefore(BB&: *Preheader, I: Preheader->end());
1780 }
1781 }
1782}
1783
1784void VPlanTransforms::truncateToMinimalBitwidths(
1785 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
  // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make other
  // uses have different types for their operands, making them invalidly
  // typed.
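  // As an illustrative sketch (types assumed): if MinBWs records that an i32
  // add only needs 8 bits, its operands are truncated to i8, the add executes
  // in i8, and a zext back to i32 is inserted so existing users keep their
  // original type.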
1790 DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
1791 Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
1792 VPTypeAnalysis TypeInfo(CanonicalIVType);
1793 VPBasicBlock *PH = Plan.getVectorPreheader();
1794 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1795 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
1796 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
1797 if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
1798 VPWidenSelectRecipe, VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(
1799 Val: &R))
1800 continue;
1801
1802 VPValue *ResultVPV = R.getVPSingleValue();
1803 auto *UI = cast_or_null<Instruction>(Val: ResultVPV->getUnderlyingValue());
1804 unsigned NewResSizeInBits = MinBWs.lookup(Key: UI);
1805 if (!NewResSizeInBits)
1806 continue;
1807
      // If the value wasn't vectorized, we must maintain the original scalar
      // type. Skip those here. Also skip casts, which do not need to be
      // handled explicitly here, as redundant casts will be removed during
      // recipe simplification.
1812 if (isa<VPReplicateRecipe, VPWidenCastRecipe>(Val: &R))
1813 continue;
1814
1815 Type *OldResTy = TypeInfo.inferScalarType(V: ResultVPV);
1816 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
1817 assert(OldResTy->isIntegerTy() && "only integer types supported");
1818 (void)OldResSizeInBits;
1819
1820 LLVMContext &Ctx = CanonicalIVType->getContext();
1821 auto *NewResTy = IntegerType::get(C&: Ctx, NumBits: NewResSizeInBits);
1822
1823 // Any wrapping introduced by shrinking this operation shouldn't be
1824 // considered undefined behavior. So, we can't unconditionally copy
1825 // arithmetic wrapping flags to VPW.
1826 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(Val: &R))
1827 VPW->dropPoisonGeneratingFlags();
1828
1829 using namespace llvm::VPlanPatternMatch;
1830 if (OldResSizeInBits != NewResSizeInBits &&
1831 !match(V: &R, P: m_Binary<Instruction::ICmp>(Op0: m_VPValue(), Op1: m_VPValue()))) {
1832 // Extend result to original width.
1833 auto *Ext =
1834 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
1835 Ext->insertAfter(InsertPos: &R);
1836 ResultVPV->replaceAllUsesWith(New: Ext);
1837 Ext->setOperand(I: 0, New: ResultVPV);
1838 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
1839 } else {
1840 assert(
1841 match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
1842 "Only ICmps should not need extending the result.");
1843 }
1844
1845 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
1846 if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R))
1847 continue;
1848
1849 // Shrink operands by introducing truncates as needed.
1850 unsigned StartIdx = isa<VPWidenSelectRecipe>(Val: &R) ? 1 : 0;
1851 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
1852 auto *Op = R.getOperand(N: Idx);
1853 unsigned OpSizeInBits =
1854 TypeInfo.inferScalarType(V: Op)->getScalarSizeInBits();
1855 if (OpSizeInBits == NewResSizeInBits)
1856 continue;
1857 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
1858 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Key: Op);
1859 VPWidenCastRecipe *NewOp =
1860 IterIsEmpty
1861 ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
1862 : ProcessedIter->second;
1863 R.setOperand(I: Idx, New: NewOp);
1864 if (!IterIsEmpty)
1865 continue;
1866 ProcessedIter->second = NewOp;
1867 if (!Op->isLiveIn()) {
1868 NewOp->insertBefore(InsertPos: &R);
1869 } else {
1870 PH->appendRecipe(Recipe: NewOp);
1871 }
1872 }
1873
1874 }
1875 }
1876}
1877
/// Remove BranchOnCond recipes with true or false conditions and remove the
/// resulting dead edges to their successors.
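/// For example (illustrative): a block ending in BranchOnCond true keeps only
/// its first successor; the edge to the second successor is removed, together
/// with the corresponding incoming values of that successor's phis.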
1880static void removeBranchOnConst(VPlan &Plan) {
1881 using namespace llvm::VPlanPatternMatch;
1882 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1883 Range: vp_depth_first_shallow(G: Plan.getEntry()))) {
1884 VPValue *Cond;
1885 if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
1886 !match(V: &VPBB->back(), P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))))
1887 continue;
1888
1889 unsigned RemovedIdx;
1890 if (match(V: Cond, P: m_True()))
1891 RemovedIdx = 1;
1892 else if (match(V: Cond, P: m_False()))
1893 RemovedIdx = 0;
1894 else
1895 continue;
1896
1897 VPBasicBlock *RemovedSucc =
1898 cast<VPBasicBlock>(Val: VPBB->getSuccessors()[RemovedIdx]);
1899 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
1900 "There must be a single edge between VPBB and its successor");
    // Values coming from VPBB into phi recipes of RemovedSucc are removed
    // from these recipes.
1903 for (VPRecipeBase &R : RemovedSucc->phis()) {
1904 auto *Phi = cast<VPPhiAccessors>(Val: &R);
1905 assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) &&
1906 "VPIRPhis must have a single predecessor");
1907 Phi->removeIncomingValueFor(IncomingBlock: VPBB);
1908 }
1909 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
1910 // automatically on VPlan destruction if it becomes unreachable.
1911 VPBlockUtils::disconnectBlocks(From: VPBB, To: RemovedSucc);
1912 VPBB->back().eraseFromParent();
1913 }
1914}
1915
1916void VPlanTransforms::optimize(VPlan &Plan) {
1917 runPass(Fn: removeRedundantCanonicalIVs, Plan);
1918 runPass(Fn: removeRedundantInductionCasts, Plan);
1919
1920 runPass(Fn: simplifyRecipes, Plan, Args&: *Plan.getCanonicalIV()->getScalarType());
1921 runPass(Fn: simplifyBlends, Plan);
1922 runPass(Fn: removeDeadRecipes, Plan);
1923 runPass(Fn: narrowToSingleScalarRecipes, Plan);
1924 runPass(Fn: legalizeAndOptimizeInductions, Plan);
1925 runPass(Fn: removeRedundantExpandSCEVRecipes, Plan);
1926 runPass(Fn: simplifyRecipes, Plan, Args&: *Plan.getCanonicalIV()->getScalarType());
1927 runPass(Fn: removeBranchOnConst, Plan);
1928 runPass(Fn: removeDeadRecipes, Plan);
1929
1930 runPass(Fn: createAndOptimizeReplicateRegions, Plan);
1931 runPass(Transform: mergeBlocksIntoPredecessors, Plan);
1932 runPass(Fn: licm, Plan);
1933}
1934
1935// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
1936// the loop terminator with a branch-on-cond recipe with the negated
1937// active-lane-mask as operand. Note that this turns the loop into an
// uncountable one. Only the existing terminator is replaced; all other
// existing recipes/users remain unchanged, except for poison-generating flags
// being dropped from the canonical IV increment. Return the created
// VPActiveLaneMaskPHIRecipe.
1942//
1943// The function uses the following definitions:
1944//
1945// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
1946// calculate-trip-count-minus-VF (original TC) : original TC
1947// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
1948// CanonicalIVPhi : CanonicalIVIncrement
1949// %StartV is the canonical induction start value.
1950//
1951// The function adds the following recipes:
1952//
1953// vector.ph:
1954// %TripCount = calculate-trip-count-minus-VF (original TC)
1955// [if DataWithControlFlowWithoutRuntimeCheck]
1956// %EntryInc = canonical-iv-increment-for-part %StartV
1957// %EntryALM = active-lane-mask %EntryInc, %TripCount
1958//
1959// vector.body:
1960// ...
1961// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
1962// ...
1963// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
1964// %ALM = active-lane-mask %InLoopInc, TripCount
1965// %Negated = Not %ALM
1966// branch-on-cond %Negated
1967//
1968static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
1969 VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
1970 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
1971 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
1972 auto *CanonicalIVPHI = Plan.getCanonicalIV();
1973 VPValue *StartV = CanonicalIVPHI->getStartValue();
1974
1975 auto *CanonicalIVIncrement =
1976 cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
1977 // TODO: Check if dropping the flags is needed if
1978 // !DataAndControlFlowWithoutRuntimeCheck.
1979 CanonicalIVIncrement->dropPoisonGeneratingFlags();
1980 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
1981 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
1982 // we have to take unrolling into account. Each part needs to start at
1983 // Part * VF
1984 auto *VecPreheader = Plan.getVectorPreheader();
1985 VPBuilder Builder(VecPreheader);
1986
1987 // Create the ActiveLaneMask instruction using the correct start values.
1988 VPValue *TC = Plan.getTripCount();
1989
1990 VPValue *TripCount, *IncrementValue;
1991 if (!DataAndControlFlowWithoutRuntimeCheck) {
    // When the loop is guarded by a runtime overflow check for the loop
    // induction variable increment by VF, we can increment the value before
    // the get.active.lane.mask and use the unmodified trip count.
1995 IncrementValue = CanonicalIVIncrement;
1996 TripCount = TC;
1997 } else {
1998 // When avoiding a runtime check, the active.lane.mask inside the loop
1999 // uses a modified trip count and the induction variable increment is
2000 // done after the active.lane.mask intrinsic is called.
2001 IncrementValue = CanonicalIVPHI;
2002 TripCount = Builder.createNaryOp(Opcode: VPInstruction::CalculateTripCountMinusVF,
2003 Operands: {TC}, DL);
2004 }
2005 auto *EntryIncrement = Builder.createOverflowingOp(
2006 Opcode: VPInstruction::CanonicalIVIncrementForPart, Operands: {StartV}, WrapFlags: {false, false}, DL,
2007 Name: "index.part.next");
2008
2009 // Create the active lane mask instruction in the VPlan preheader.
2010 auto *EntryALM =
2011 Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, Operands: {EntryIncrement, TC},
2012 DL, Name: "active.lane.mask.entry");
2013
2014 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2015 // preheader ActiveLaneMask instruction.
2016 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
2017 LaneMaskPhi->insertAfter(InsertPos: CanonicalIVPHI);
2018
2019 // Create the active lane mask for the next iteration of the loop before the
2020 // original terminator.
2021 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2022 Builder.setInsertPoint(OriginalTerminator);
2023 auto *InLoopIncrement =
2024 Builder.createOverflowingOp(Opcode: VPInstruction::CanonicalIVIncrementForPart,
2025 Operands: {IncrementValue}, WrapFlags: {false, false}, DL);
2026 auto *ALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2027 Operands: {InLoopIncrement, TripCount}, DL,
2028 Name: "active.lane.mask.next");
2029 LaneMaskPhi->addOperand(Operand: ALM);
2030
2031 // Replace the original terminator with BranchOnCond. We have to invert the
2032 // mask here because a true condition means jumping to the exit block.
2033 auto *NotMask = Builder.createNot(Operand: ALM, DL);
2034 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {NotMask}, DL);
2035 OriginalTerminator->eraseFromParent();
2036 return LaneMaskPhi;
2037}
2038
2039/// Collect all VPValues representing a header mask through the (ICMP_ULE,
2040/// WideCanonicalIV, backedge-taken-count) pattern.
2041/// TODO: Introduce explicit recipe for header-mask instead of searching
2042/// for the header-mask pattern manually.
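/// An illustrative example of the pattern being matched (names assumed):
///   %wide.iv = WIDEN-CANONICAL-INDUCTION %canonical.iv
///   %mask    = icmp ule %wide.iv, %backedge.taken.count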
2043static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
2044 SmallVector<VPValue *> WideCanonicalIVs;
2045 auto *FoundWidenCanonicalIVUser =
2046 find_if(Range: Plan.getCanonicalIV()->users(),
2047 P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); });
2048 assert(count_if(Plan.getCanonicalIV()->users(),
2049 [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
2050 1 &&
         "Must have at most one VPWidenCanonicalIVRecipe");
2052 if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
2053 auto *WideCanonicalIV =
2054 cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser);
2055 WideCanonicalIVs.push_back(Elt: WideCanonicalIV);
2056 }
2057
2058 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2059 // version of the canonical induction.
2060 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2061 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2062 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
2063 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2064 WideCanonicalIVs.push_back(Elt: WidenOriginalIV);
2065 }
2066
  // Walk users of wide canonical IVs and collect all compares of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2069 SmallVector<VPValue *> HeaderMasks;
2070 for (auto *Wide : WideCanonicalIVs) {
2071 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2072 auto *HeaderMask = dyn_cast<VPInstruction>(Val: U);
2073 if (!HeaderMask || !vputils::isHeaderMask(V: HeaderMask, Plan))
2074 continue;
2075
2076 assert(HeaderMask->getOperand(0) == Wide &&
2077 "WidenCanonicalIV must be the first operand of the compare");
2078 HeaderMasks.push_back(Elt: HeaderMask);
2079 }
2080 }
2081 return HeaderMasks;
2082}
2083
2084void VPlanTransforms::addActiveLaneMask(
2085 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2086 bool DataAndControlFlowWithoutRuntimeCheck) {
2087 assert((!DataAndControlFlowWithoutRuntimeCheck ||
2088 UseActiveLaneMaskForControlFlow) &&
2089 "DataAndControlFlowWithoutRuntimeCheck implies "
2090 "UseActiveLaneMaskForControlFlow");
2091
2092 auto *FoundWidenCanonicalIVUser =
2093 find_if(Range: Plan.getCanonicalIV()->users(),
2094 P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); });
2095 assert(FoundWidenCanonicalIVUser &&
2096 "Must have widened canonical IV when tail folding!");
2097 auto *WideCanonicalIV =
2098 cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser);
2099 VPSingleDefRecipe *LaneMask;
2100 if (UseActiveLaneMaskForControlFlow) {
2101 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
2102 Plan, DataAndControlFlowWithoutRuntimeCheck);
2103 } else {
2104 VPBuilder B = VPBuilder::getToInsertAfter(R: WideCanonicalIV);
2105 LaneMask = B.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2106 Operands: {WideCanonicalIV, Plan.getTripCount()}, Inst: nullptr,
2107 Name: "active.lane.mask");
2108 }
2109
2110 // Walk users of WideCanonicalIV and replace all compares of the form
2111 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
2112 // active-lane-mask.
2113 for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
2114 HeaderMask->replaceAllUsesWith(New: LaneMask);
2115}
2116
2117/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2118/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2119/// recipe could be created.
2120/// \p HeaderMask Header Mask.
/// \p CurRecipe Recipe to be transformed.
2122/// \p TypeInfo VPlan-based type analysis.
2123/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
2124/// \p EVL The explicit vector length parameter of vector-predication
2125/// intrinsics.
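///
/// For example (an illustrative sketch; names assumed): a widened load masked
/// by logical-and(header-mask, %m) becomes an EVL-based load masked only by
/// %m, since the lanes covered by the header mask are handled via \p EVL.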
2126static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2127 VPRecipeBase &CurRecipe,
2128 VPTypeAnalysis &TypeInfo,
2129 VPValue &AllOneMask, VPValue &EVL) {
2130 using namespace llvm::VPlanPatternMatch;
2131 auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2132 assert(OrigMask && "Unmasked recipe when folding tail");
2133 // HeaderMask will be handled using EVL.
2134 VPValue *Mask;
2135 if (match(V: OrigMask, P: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: Mask))))
2136 return Mask;
2137 return HeaderMask == OrigMask ? nullptr : OrigMask;
2138 };
2139
2140 return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
2141 .Case<VPWidenLoadRecipe>(caseFn: [&](VPWidenLoadRecipe *L) {
2142 VPValue *NewMask = GetNewMask(L->getMask());
2143 return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
2144 })
2145 .Case<VPWidenStoreRecipe>(caseFn: [&](VPWidenStoreRecipe *S) {
2146 VPValue *NewMask = GetNewMask(S->getMask());
2147 return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
2148 })
2149 .Case<VPReductionRecipe>(caseFn: [&](VPReductionRecipe *Red) {
2150 VPValue *NewMask = GetNewMask(Red->getCondOp());
2151 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2152 })
2153 .Case<VPInstruction>(caseFn: [&](VPInstruction *VPI) -> VPRecipeBase * {
2154 VPValue *LHS, *RHS;
2155 // Transform select with a header mask condition
2156 // select(header_mask, LHS, RHS)
2157 // into vector predication merge.
2158 // vp.merge(all-true, LHS, RHS, EVL)
2159 if (!match(V: VPI, P: m_Select(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: LHS),
2160 Op2: m_VPValue(V&: RHS))))
2161 return nullptr;
2162 // Use all true as the condition because this transformation is
2163 // limited to selects whose condition is a header mask.
2164 return new VPWidenIntrinsicRecipe(
2165 Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2166 TypeInfo.inferScalarType(V: LHS), VPI->getDebugLoc());
2167 })
2168 .Default(defaultFn: [&](VPRecipeBase *R) { return nullptr; });
2169}
2170
2171/// Replace recipes with their EVL variants.
2172static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
2173 Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
2174 VPTypeAnalysis TypeInfo(CanonicalIVType);
2175 LLVMContext &Ctx = CanonicalIVType->getContext();
2176 VPValue *AllOneMask = Plan.getOrAddLiveIn(V: ConstantInt::getTrue(Context&: Ctx));
2177 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2178 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2179
2180 assert(all_of(Plan.getVF().users(),
2181 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
2182 VPWidenIntOrFpInductionRecipe>) &&
2183 "User of VF that we can't transform to EVL.");
2184 Plan.getVF().replaceAllUsesWith(New: &EVL);
2185
2186 // Defer erasing recipes till the end so that we don't invalidate the
2187 // VPTypeAnalysis cache.
2188 SmallVector<VPRecipeBase *> ToErase;
2189
  // Create a scalar phi to track the previous EVL if the plan contains a
  // fixed-order recurrence.
2192 bool ContainsFORs =
2193 any_of(Range: Header->phis(), P: IsaPred<VPFirstOrderRecurrencePHIRecipe>);
2194 if (ContainsFORs) {
2195 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2196 VPValue *MaxEVL = &Plan.getVF();
    // Emit a scalar cast in the preheader if VF is not a 32-bit integer.
2198 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2199 MaxEVL = Builder.createScalarZExtOrTrunc(Op: MaxEVL, ResultTy: Type::getInt32Ty(C&: Ctx),
2200 SrcTy: TypeInfo.inferScalarType(V: MaxEVL),
2201 DL: DebugLoc());
2202
2203 Builder.setInsertPoint(TheBB: Header, IP: Header->getFirstNonPhi());
2204 VPValue *PrevEVL =
2205 Builder.createScalarPhi(IncomingValues: {MaxEVL, &EVL}, DL: DebugLoc(), Name: "prev.evl");
2206
2207 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2208 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
2209 for (VPRecipeBase &R : *VPBB) {
2210 using namespace VPlanPatternMatch;
2211 VPValue *V1, *V2;
2212 if (!match(V: &R,
2213 P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
2214 Op0: m_VPValue(V&: V1), Op1: m_VPValue(V&: V2))))
2215 continue;
2216 VPValue *Imm = Plan.getOrAddLiveIn(
2217 V: ConstantInt::getSigned(Ty: Type::getInt32Ty(C&: Ctx), V: -1));
2218 VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
2219 Intrinsic::experimental_vp_splice,
2220 {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
2221 TypeInfo.inferScalarType(V: R.getVPSingleValue()), R.getDebugLoc());
2222 VPSplice->insertBefore(InsertPos: &R);
2223 R.getVPSingleValue()->replaceAllUsesWith(New: VPSplice);
2224 ToErase.push_back(Elt: &R);
2225 }
2226 }
2227 }
2228
2229 // Try to optimize header mask recipes away to their EVL variants.
2230 for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
2231 for (VPUser *U : collectUsersRecursively(V: HeaderMask)) {
2232 auto *CurRecipe = cast<VPRecipeBase>(Val: U);
2233 VPRecipeBase *EVLRecipe =
2234 optimizeMaskToEVL(HeaderMask, CurRecipe&: *CurRecipe, TypeInfo, AllOneMask&: *AllOneMask, EVL);
2235 if (!EVLRecipe)
2236 continue;
2237
2238 [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2239 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2240 "New recipe must define the same number of values as the "
2241 "original.");
2242 assert(
2243 NumDefVal <= 1 &&
2244 "Only supports recipes with a single definition or without users.");
2245 EVLRecipe->insertBefore(InsertPos: CurRecipe);
2246 if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(Val: EVLRecipe)) {
2247 VPValue *CurVPV = CurRecipe->getVPSingleValue();
2248 CurVPV->replaceAllUsesWith(New: EVLRecipe->getVPSingleValue());
2249 }
2250 ToErase.push_back(Elt: CurRecipe);
2251 }
2252 }
2253
2254 for (VPRecipeBase *R : reverse(C&: ToErase)) {
2255 SmallVector<VPValue *> PossiblyDead(R->operands());
2256 R->eraseFromParent();
2257 for (VPValue *Op : PossiblyDead)
2258 recursivelyDeleteDeadRecipes(V: Op);
2259 }
2260}
2261
/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
/// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
/// increment, with a VPEVLBasedIVPHIRecipe. After this transformation the
/// VPCanonicalIVPHIRecipe is only used for counting loop iterations.
2266///
2267/// The function uses the following definitions:
2268/// %StartV is the canonical induction start value.
2269///
2270/// The function adds the following recipes:
2271///
2272/// vector.ph:
2273/// ...
2274///
2275/// vector.body:
2276/// ...
2277/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2278/// [ %NextEVLIV, %vector.body ]
2279/// %AVL = sub original TC, %EVLPhi
2280/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2281/// ...
/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
2283/// ...
2284///
2285/// If MaxSafeElements is provided, the function adds the following recipes:
2286/// vector.ph:
2287/// ...
2288///
2289/// vector.body:
2290/// ...
2291/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2292/// [ %NextEVLIV, %vector.body ]
2293/// %AVL = sub original TC, %EVLPhi
2294/// %cmp = cmp ult %AVL, MaxSafeElements
2295/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2296/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2297/// ...
2298/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
2299/// ...
2300///
2301bool VPlanTransforms::tryAddExplicitVectorLength(
2302 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2303 VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  // The transform updates all users of inductions to work based on EVL,
  // instead of the VF directly. At the moment, widened pointer inductions
  // cannot be updated, so bail out if the plan contains any.
2307 bool ContainsWidenPointerInductions =
2308 any_of(Range: Header->phis(), P: IsaPred<VPWidenPointerInductionRecipe>);
2309 if (ContainsWidenPointerInductions)
2310 return false;
2311
2312 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2313 auto *CanIVTy = CanonicalIVPHI->getScalarType();
2314 VPValue *StartV = CanonicalIVPHI->getStartValue();
2315
2316 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2317 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
2318 EVLPhi->insertAfter(InsertPos: CanonicalIVPHI);
2319 VPBuilder Builder(Header, Header->getFirstNonPhi());
2320 // Compute original TC - IV as the AVL (application vector length).
2321 VPValue *AVL = Builder.createNaryOp(
2322 Opcode: Instruction::Sub, Operands: {Plan.getTripCount(), EVLPhi}, DL: DebugLoc(), Name: "avl");
2323 if (MaxSafeElements) {
    // Clamp the AVL to MaxSafeElements, the maximum number of elements that
    // can safely be processed per iteration.
2325 VPValue *AVLSafe =
2326 Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: CanIVTy, V: *MaxSafeElements));
2327 VPValue *Cmp = Builder.createICmp(Pred: ICmpInst::ICMP_ULT, A: AVL, B: AVLSafe);
2328 AVL = Builder.createSelect(Cond: Cmp, TrueVal: AVL, FalseVal: AVLSafe, DL: DebugLoc(), Name: "safe_avl");
2329 }
2330 auto *VPEVL = Builder.createNaryOp(Opcode: VPInstruction::ExplicitVectorLength, Operands: AVL,
2331 DL: DebugLoc());
2332
2333 auto *CanonicalIVIncrement =
2334 cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
2335 Builder.setInsertPoint(CanonicalIVIncrement);
2336 VPValue *OpVPEVL = VPEVL;
2337
2338 auto *I32Ty = Type::getInt32Ty(C&: CanIVTy->getContext());
2339 OpVPEVL = Builder.createScalarZExtOrTrunc(
2340 Op: OpVPEVL, ResultTy: CanIVTy, SrcTy: I32Ty, DL: CanonicalIVIncrement->getDebugLoc());
2341
2342 auto *NextEVLIV = Builder.createOverflowingOp(
2343 Opcode: Instruction::Add, Operands: {OpVPEVL, EVLPhi},
2344 WrapFlags: {CanonicalIVIncrement->hasNoUnsignedWrap(),
2345 CanonicalIVIncrement->hasNoSignedWrap()},
2346 DL: CanonicalIVIncrement->getDebugLoc(), Name: "index.evl.next");
2347 EVLPhi->addOperand(Operand: NextEVLIV);
2348
2349 transformRecipestoEVLRecipes(Plan, EVL&: *VPEVL);
2350
2351 // Replace all uses of VPCanonicalIVPHIRecipe by
2352 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2353 CanonicalIVPHI->replaceAllUsesWith(New: EVLPhi);
2354 CanonicalIVIncrement->setOperand(I: 0, New: CanonicalIVPHI);
2355 // TODO: support unroll factor > 1.
2356 Plan.setUF(1);
2357 return true;
2358}
2359
2360void VPlanTransforms::dropPoisonGeneratingRecipes(
2361 VPlan &Plan,
2362 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2363 // Collect recipes in the backward slice of `Root` that may generate a poison
2364 // value that is used after vectorization.
2365 SmallPtrSet<VPRecipeBase *, 16> Visited;
2366 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2367 SmallVector<VPRecipeBase *, 16> Worklist;
2368 Worklist.push_back(Elt: Root);
2369
2370 // Traverse the backward slice of Root through its use-def chain.
2371 while (!Worklist.empty()) {
2372 VPRecipeBase *CurRec = Worklist.pop_back_val();
2373
2374 if (!Visited.insert(Ptr: CurRec).second)
2375 continue;
2376
2377 // Prune search if we find another recipe generating a widen memory
2378 // instruction. Widen memory instructions involved in address computation
2379 // will lead to gather/scatter instructions, which don't need to be
2380 // handled.
2381 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
2382 VPHeaderPHIRecipe>(Val: CurRec))
2383 continue;
2384
2385 // This recipe contributes to the address computation of a widen
2386 // load/store. If the underlying instruction has poison-generating flags,
2387 // drop them directly.
2388 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: CurRec)) {
2389 VPValue *A, *B;
2390 using namespace llvm::VPlanPatternMatch;
2391 // Dropping disjoint from an OR may yield incorrect results, as some
2392 // analysis may have converted it to an Add implicitly (e.g. SCEV used
2393 // for dependence analysis). Instead, replace it with an equivalent Add.
2394 // This is possible as all users of the disjoint OR only access lanes
2395 // where the operands are disjoint or poison otherwise.
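        // For example (illustrative): "or disjoint %a, %b" is rewritten to
        // "add %a, %b", which produces the same result on the lanes that are
        // actually used.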
2396 if (match(V: RecWithFlags, P: m_BinaryOr(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))) &&
2397 RecWithFlags->isDisjoint()) {
2398 VPBuilder Builder(RecWithFlags);
2399 VPInstruction *New = Builder.createOverflowingOp(
2400 Opcode: Instruction::Add, Operands: {A, B}, WrapFlags: {false, false},
2401 DL: RecWithFlags->getDebugLoc());
2402 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
2403 RecWithFlags->replaceAllUsesWith(New);
2404 RecWithFlags->eraseFromParent();
2405 CurRec = New;
2406 } else
2407 RecWithFlags->dropPoisonGeneratingFlags();
2408 } else {
2409 Instruction *Instr = dyn_cast_or_null<Instruction>(
2410 Val: CurRec->getVPSingleValue()->getUnderlyingValue());
2411 (void)Instr;
2412 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
2413 "found instruction with poison generating flags not covered by "
2414 "VPRecipeWithIRFlags");
2415 }
2416
2417 // Add new definitions to the worklist.
2418 for (VPValue *Operand : CurRec->operands())
2419 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
2420 Worklist.push_back(Elt: OpDef);
2421 }
2422 });
2423
  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryRecipe or VPInterleaveRecipe.
2427 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
2428 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
2429 for (VPRecipeBase &Recipe : *VPBB) {
2430 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(Val: &Recipe)) {
2431 Instruction &UnderlyingInstr = WidenRec->getIngredient();
2432 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
2433 if (AddrDef && WidenRec->isConsecutive() &&
2434 BlockNeedsPredication(UnderlyingInstr.getParent()))
2435 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2436 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(Val: &Recipe)) {
2437 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
2438 if (AddrDef) {
2439 // Check if any member of the interleave group needs predication.
2440 const InterleaveGroup<Instruction> *InterGroup =
2441 InterleaveRec->getInterleaveGroup();
2442 bool NeedPredication = false;
2443 for (int I = 0, NumMembers = InterGroup->getNumMembers();
2444 I < NumMembers; ++I) {
2445 Instruction *Member = InterGroup->getMember(Index: I);
2446 if (Member)
2447 NeedPredication |= BlockNeedsPredication(Member->getParent());
2448 }
2449
2450 if (NeedPredication)
2451 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2452 }
2453 }
2454 }
2455 }
2456}
2457
2458void VPlanTransforms::createInterleaveGroups(
2459 VPlan &Plan,
2460 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
2461 &InterleaveGroups,
2462 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
2463 if (InterleaveGroups.empty())
2464 return;
2465
2466 // Interleave memory: for each Interleave Group we marked earlier as relevant
2467 // for this VPlan, replace the Recipes widening its memory instructions with a
2468 // single VPInterleaveRecipe at its insertion point.
2469 VPDominatorTree VPDT;
2470 VPDT.recalculate(Func&: Plan);
2471 for (const auto *IG : InterleaveGroups) {
2472 SmallVector<VPValue *, 4> StoredValues;
2473 for (unsigned i = 0; i < IG->getFactor(); ++i)
2474 if (auto *SI = dyn_cast_or_null<StoreInst>(Val: IG->getMember(Index: i))) {
2475 auto *StoreR = cast<VPWidenStoreRecipe>(Val: RecipeBuilder.getRecipe(I: SI));
2476 StoredValues.push_back(Elt: StoreR->getStoredValue());
2477 }
2478
2479 bool NeedsMaskForGaps =
2480 IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
2481
2482 Instruction *IRInsertPos = IG->getInsertPos();
2483 auto *InsertPos =
2484 cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IRInsertPos));
2485
2486 bool InBounds = false;
2487 if (auto *Gep = dyn_cast<GetElementPtrInst>(
2488 Val: getLoadStorePointerOperand(V: IRInsertPos)->stripPointerCasts()))
2489 InBounds = Gep->isInBounds();
2490
2491 // Get or create the start address for the interleave group.
2492 auto *Start =
2493 cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getMember(Index: 0)));
2494 VPValue *Addr = Start->getAddr();
2495 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
2496 if (AddrDef && !VPDT.properlyDominates(A: AddrDef, B: InsertPos)) {
2497 // We cannot re-use the address of member zero because it does not
2498 // dominate the insert position. Instead, use the address of the insert
2499 // position and create a PtrAdd adjusting it to the address of member
2500 // zero.
2501 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
2502 // InsertPos or sink loads above zero members to join it.
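      // For example (an illustrative sketch with assumed values): if the
      // insert position is member 2 of a group of i32 accesses, Offset is
      // 2 * 4 = 8 bytes, and the group address is the insert position's
      // address minus 8 bytes.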
2503 assert(IG->getIndex(IRInsertPos) != 0 &&
2504 "index of insert position shouldn't be zero");
2505 auto &DL = IRInsertPos->getDataLayout();
2506 APInt Offset(32,
2507 DL.getTypeAllocSize(Ty: getLoadStoreType(I: IRInsertPos)) *
2508 IG->getIndex(Instr: IRInsertPos),
2509 /*IsSigned=*/true);
2510 VPValue *OffsetVPV = Plan.getOrAddLiveIn(
2511 V: ConstantInt::get(Context&: IRInsertPos->getParent()->getContext(), V: -Offset));
2512 VPBuilder B(InsertPos);
2513 Addr = InBounds ? B.createInBoundsPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV)
2514 : B.createPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV);
2515 }
2516 // If the group is reverse, adjust the index to refer to the last vector
2517 // lane instead of the first. We adjust the index from the first vector
2518 // lane, rather than directly getting the pointer for lane VF - 1, because
2519 // the pointer operand of the interleaved access is supposed to be uniform.
2520 if (IG->isReverse()) {
2521 auto *ReversePtr = new VPVectorEndPointerRecipe(
2522 Addr, &Plan.getVF(), getLoadStoreType(I: IRInsertPos),
2523 -(int64_t)IG->getFactor(),
2524 InBounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(),
2525 InsertPos->getDebugLoc());
2526 ReversePtr->insertBefore(InsertPos);
2527 Addr = ReversePtr;
2528 }
2529 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
2530 InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc());
2531 VPIG->insertBefore(InsertPos);
2532
2533 unsigned J = 0;
2534 for (unsigned i = 0; i < IG->getFactor(); ++i)
2535 if (Instruction *Member = IG->getMember(Index: i)) {
2536 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member);
2537 if (!Member->getType()->isVoidTy()) {
2538 VPValue *OriginalV = MemberR->getVPSingleValue();
2539 OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J));
2540 J++;
2541 }
2542 MemberR->eraseFromParent();
2543 }
2544 }
2545}
2546
2547/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
2548/// value, phi and backedge value. In the following example:
2549///
2550/// vector.ph:
2551/// Successor(s): vector loop
2552///
2553/// <x1> vector loop: {
2554/// vector.body:
2555/// WIDEN-INDUCTION %i = phi %start, %step, %vf
2556/// ...
2557/// EMIT branch-on-count ...
2558/// No successors
2559/// }
2560///
2561/// WIDEN-INDUCTION will get expanded to:
2562///
2563/// vector.ph:
2564/// ...
2565/// vp<%induction.start> = ...
2566/// vp<%induction.increment> = ...
2567///
2568/// Successor(s): vector loop
2569///
2570/// <x1> vector loop: {
2571/// vector.body:
2572/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
2573/// ...
2574/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
2575/// EMIT branch-on-count ...
2576/// No successors
2577/// }
2578static void
2579expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
2580 VPTypeAnalysis &TypeInfo) {
2581 VPlan *Plan = WidenIVR->getParent()->getPlan();
2582 VPValue *Start = WidenIVR->getStartValue();
2583 VPValue *Step = WidenIVR->getStepValue();
2584 VPValue *VF = WidenIVR->getVFValue();
2585 DebugLoc DL = WidenIVR->getDebugLoc();
2586
  // The scalar type of the value from the original loop to which we are
  // mapping the new induction variable.
2589 Type *Ty = TypeInfo.inferScalarType(V: WidenIVR);
2590
2591 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
2592 Instruction::BinaryOps AddOp;
2593 Instruction::BinaryOps MulOp;
2594 // FIXME: The newly created binary instructions should contain nsw/nuw
2595 // flags, which can be found from the original scalar operations.
2596 VPIRFlags Flags;
2597 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
2598 AddOp = Instruction::Add;
2599 MulOp = Instruction::Mul;
2600 } else {
2601 AddOp = ID.getInductionOpcode();
2602 MulOp = Instruction::FMul;
2603 Flags = ID.getInductionBinOp()->getFastMathFlags();
2604 }
2605
2606 // If the phi is truncated, truncate the start and step values.
2607 VPBuilder Builder(Plan->getVectorPreheader());
2608 Type *StepTy = TypeInfo.inferScalarType(V: Step);
2609 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
2610 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
2611 Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy: Ty, DL);
2612 Start = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Start, ResultTy: Ty, DL);
2613 StepTy = Ty;
2614 }
2615
2616 // Construct the initial value of the vector IV in the vector loop preheader.
2617 Type *IVIntTy =
2618 IntegerType::get(C&: StepTy->getContext(), NumBits: StepTy->getScalarSizeInBits());
2619 VPValue *Init = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: IVIntTy);
2620 if (StepTy->isFloatingPointTy())
2621 Init = Builder.createWidenCast(Opcode: Instruction::UIToFP, Op: Init, ResultTy: StepTy);
2622
2623 VPValue *SplatStart = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Start);
2624 VPValue *SplatStep = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Step);
2625
2626 Init = Builder.createNaryOp(Opcode: MulOp, Operands: {Init, SplatStep}, Flags);
2627 Init =
2628 Builder.createNaryOp(Opcode: AddOp, Operands: {SplatStart, Init}, Flags, DL: {}, Name: "induction");
2629
2630 // Create the widened phi of the vector IV.
2631 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
2632 WidenIVR->getDebugLoc(), "vec.ind");
2633 WidePHI->addOperand(Operand: Init);
2634 WidePHI->insertBefore(InsertPos: WidenIVR);
2635
2636 // Create the backedge value for the vector IV.
2637 VPValue *Inc;
2638 VPValue *Prev;
2639 // If unrolled, use the increment and prev value from the operands.
2640 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
2641 Inc = SplatVF;
2642 Prev = WidenIVR->getLastUnrolledPartOperand();
2643 } else {
2644 if (VPRecipeBase *R = VF->getDefiningRecipe())
2645 Builder.setInsertPoint(TheBB: R->getParent(), IP: std::next(x: R->getIterator()));
2646 // Multiply the vectorization factor by the step using integer or
2647 // floating-point arithmetic as appropriate.
2648 if (StepTy->isFloatingPointTy())
2649 VF = Builder.createScalarCast(Opcode: Instruction::CastOps::UIToFP, Op: VF, ResultTy: StepTy,
2650 DL);
2651 else
2652 VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy,
2653 SrcTy: TypeInfo.inferScalarType(V: VF), DL);
2654
2655 Inc = Builder.createNaryOp(Opcode: MulOp, Operands: {Step, VF}, Flags);
2656 Inc = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Inc);
2657 Prev = WidePHI;
2658 }
2659
2660 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
2661 Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator());
2662 auto *Next = Builder.createNaryOp(Opcode: AddOp, Operands: {Prev, Inc}, Flags,
2663 DL: WidenIVR->getDebugLoc(), Name: "vec.ind.next");
2664
2665 WidePHI->addOperand(Operand: Next);
2666
2667 WidenIVR->replaceAllUsesWith(New: WidePHI);
2668}
2669
2670void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
  // Replace loop regions with explicit CFG.
2672 SmallVector<VPRegionBlock *> LoopRegions;
2673 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
2674 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
2675 if (!R->isReplicator())
2676 LoopRegions.push_back(Elt: R);
2677 }
2678 for (VPRegionBlock *R : LoopRegions)
2679 R->dissolveToCFGLoop();
2680}
2681
2682void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
2683 Type &CanonicalIVTy) {
2684 using namespace llvm::VPlanPatternMatch;
2685
2686 VPTypeAnalysis TypeInfo(&CanonicalIVTy);
2687 SmallVector<VPRecipeBase *> ToRemove;
2688 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2689 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
2690 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
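// Lower EVL-based IV phis to plain scalar phis.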
2691 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(Val: &R)) {
2692 auto *ScalarR = VPBuilder(PhiR).createScalarPhi(
2693 IncomingValues: {PhiR->getStartValue(), PhiR->getBackedgeValue()},
2694 DL: PhiR->getDebugLoc(), Name: "evl.based.iv");
2695 PhiR->replaceAllUsesWith(New: ScalarR);
2696 ToRemove.push_back(Elt: PhiR);
2697 continue;
2698 }
2699
2700 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R)) {
2701 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
2702 ToRemove.push_back(Elt: WidenIVR);
2703 continue;
2704 }
2705
2706 if (auto *Expr = dyn_cast<VPExpressionRecipe>(Val: &R)) {
2707 Expr->decompose();
2708 ToRemove.push_back(Elt: Expr);
2709 }
2710
2711 VPValue *VectorStep;
2712 VPValue *ScalarStep;
2713 if (!match(V: &R, P: m_VPInstruction<VPInstruction::WideIVStep>(
2714 Op0: m_VPValue(V&: VectorStep), Op1: m_VPValue(V&: ScalarStep))))
2715 continue;
2716
2717 // Expand WideIVStep.
2718 auto *VPI = cast<VPInstruction>(Val: &R);
2719 VPBuilder Builder(VPI);
2720 Type *IVTy = TypeInfo.inferScalarType(V: VPI);
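// Convert the vector step to the IV type if needed: UIToFP for FP IVs, Trunc
// otherwise.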
2721 if (TypeInfo.inferScalarType(V: VectorStep) != IVTy) {
2722 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
2723 ? Instruction::UIToFP
2724 : Instruction::Trunc;
2725 VectorStep = Builder.createWidenCast(Opcode: CastOp, Op: VectorStep, ResultTy: IVTy);
2726 }
2727
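// A constant scalar step of 1 is expected to have been simplified away
// earlier.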
2728 [[maybe_unused]] auto *ConstStep =
2729 ScalarStep->isLiveIn()
2730 ? dyn_cast<ConstantInt>(Val: ScalarStep->getLiveInIRValue())
2731 : nullptr;
2732 assert(!ConstStep || ConstStep->getValue() != 1);
2733 (void)ConstStep;
2734 if (TypeInfo.inferScalarType(V: ScalarStep) != IVTy) {
2735 ScalarStep =
2736 Builder.createWidenCast(Opcode: Instruction::Trunc, Op: ScalarStep, ResultTy: IVTy);
2737 }
2738
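// Only floating-point inductions carry fast-math flags.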
2739 VPIRFlags Flags;
2740 if (IVTy->isFloatingPointTy())
2741 Flags = {VPI->getFastMathFlags()};
2742
2743 unsigned MulOpc =
2744 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
2745 VPInstruction *Mul = Builder.createNaryOp(
2746 Opcode: MulOpc, Operands: {VectorStep, ScalarStep}, Flags, DL: R.getDebugLoc());
2747 VectorStep = Mul;
2748 VPI->replaceAllUsesWith(New: VectorStep);
2749 ToRemove.push_back(Elt: VPI);
2750 }
2751 }
2752
2753 for (VPRecipeBase *R : ToRemove)
2754 R->eraseFromParent();
2755}
2756
2757void VPlanTransforms::handleUncountableEarlyExit(
2758 VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan,
2759 VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) {
2760 using namespace llvm::VPlanPatternMatch;
2761
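// The first successor of the latch is the middle block.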
2762 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
2763 if (!EarlyExitVPBB->getSinglePredecessor() &&
2764 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
2765 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
2766 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
2767 "unsupported early exit VPBB");
2768 // The early exit operand should always be the last phi operand. If
2769 // EarlyExitVPBB has two predecessors and EarlyExitingVPBB is the first, swap
2770 // the operands of the phis.
2771 for (VPRecipeBase &R : EarlyExitVPBB->phis())
2772 cast<VPIRPhi>(Val: &R)->swapOperands();
2773 }
2774
2775 VPBuilder Builder(LatchVPBB->getTerminator());
2776 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
2777 assert(
2778 match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
2779 "Terminator must be be BranchOnCond");
2780 VPValue *CondOfEarlyExitingVPBB =
2781 EarlyExitingVPBB->getTerminator()->getOperand(N: 0);
2782 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
2783 ? CondOfEarlyExitingVPBB
2784 : Builder.createNot(Operand: CondOfEarlyExitingVPBB);
2785
2786 // Split the middle block and have it conditionally branch to the early exit
2787 // block if CondToEarlyExit.
2788 VPValue *IsEarlyExitTaken =
2789 Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {CondToEarlyExit});
2790 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock(Name: "middle.split");
2791 VPBasicBlock *VectorEarlyExitVPBB =
2792 Plan.createVPBasicBlock(Name: "vector.early.exit");
2793 VPBlockUtils::insertOnEdge(From: LatchVPBB, To: MiddleVPBB, BlockPtr: NewMiddle);
2794 VPBlockUtils::connectBlocks(From: NewMiddle, To: VectorEarlyExitVPBB);
2795 NewMiddle->swapSuccessors();
2796
2797 VPBlockUtils::connectBlocks(From: VectorEarlyExitVPBB, To: EarlyExitVPBB);
2798
2799 // Update the exit phis in the early exit block.
2800 VPBuilder MiddleBuilder(NewMiddle);
2801 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
2802 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
2803 auto *ExitIRI = cast<VPIRPhi>(Val: &R);
2804 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
2805 // a single predecessor and 1 if it has two.
2806 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
2807 if (ExitIRI->getNumOperands() != 1) {
2808 // The first of two operands corresponds to the latch exit, via MiddleVPBB
2809 // predecessor. Extract its last lane.
2810 ExitIRI->extractLastLaneOfFirstOperand(Builder&: MiddleBuilder);
2811 }
2812
2813 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(N: EarlyExitIdx);
2814 auto IsVector = [](ElementCount VF) { return VF.isVector(); };
2815 // When the VFs in the range are vectors, an `extract` is needed to get the
2816 // incoming value from the early exit. If the range also contains the scalar
2817 // VF, clamp the range to the scalar VF to prevent mis-compiling a range that
2818 // mixes scalar and vector VFs.
2819 if (!IncomingFromEarlyExit->isLiveIn() &&
2820 LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsVector, Range)) {
2821 // Update the incoming value from the early exit.
2822 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
2823 Opcode: VPInstruction::FirstActiveLane, Operands: {CondToEarlyExit}, Inst: nullptr,
2824 Name: "first.active.lane");
2825 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
2826 Opcode: Instruction::ExtractElement, Operands: {IncomingFromEarlyExit, FirstActiveLane},
2827 Inst: nullptr, Name: "early.exit.value");
2828 ExitIRI->setOperand(I: EarlyExitIdx, New: IncomingFromEarlyExit);
2829 }
2830 }
2831 MiddleBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {IsEarlyExitTaken});
2832
2833 // Replace the condition controlling the non-early exit from the vector loop
2834 // with one that exits if either the original latch condition is true or the
2835 // early exit has been taken.
2836 auto *LatchExitingBranch = cast<VPInstruction>(Val: LatchVPBB->getTerminator());
2837 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
2838 "Unexpected terminator");
2839 auto *IsLatchExitTaken =
2840 Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: LatchExitingBranch->getOperand(N: 0),
2841 B: LatchExitingBranch->getOperand(N: 1));
2842 auto *AnyExitTaken = Builder.createNaryOp(
2843 Opcode: Instruction::Or, Operands: {IsEarlyExitTaken, IsLatchExitTaken});
2844 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: AnyExitTaken);
2845 LatchExitingBranch->eraseFromParent();
2846}
2847
2848 /// This function tries to convert extended in-loop reductions to
2849 /// VPExpressionRecipe and clamps \p Range if doing so is beneficial and
2850/// valid. The created recipe must be decomposed to its constituent
2851/// recipes before execution.
2852static VPExpressionRecipe *
2853tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
2854 VFRange &Range) {
2855 using namespace VPlanPatternMatch;
2856
2857 Type *RedTy = Ctx.Types.inferScalarType(V: Red);
2858 VPValue *VecOp = Red->getVecOp();
2859
2860 // Clamp the range if using an extended reduction is profitable.
2861 auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
2862 Type *SrcTy) -> bool {
2863 return LoopVectorizationPlanner::getDecisionAndClampRange(
2864 Predicate: [&](ElementCount VF) {
2865 auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
2866 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2867 InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
2868 Opcode, IsUnsigned: isZExt, ResTy: RedTy, Ty: SrcVecTy, FMF: Red->getFastMathFlags(),
2869 CostKind);
2870 InstructionCost ExtCost =
2871 cast<VPWidenCastRecipe>(Val: VecOp)->computeCost(VF, Ctx);
2872 InstructionCost RedCost = Red->computeCost(VF, Ctx);
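// Only use the extended reduction if it is cheaper than the separate extend
// plus reduction.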
2873 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
2874 },
2875 Range);
2876 };
2877
2878 VPValue *A;
2879 // Match reduce(ext(A)).
2880 if (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_VPValue(V&: A))) &&
2881 IsExtendedRedValidAndClampRange(
2882 RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind()),
2883 cast<VPWidenCastRecipe>(Val: VecOp)->getOpcode() ==
2884 Instruction::CastOps::ZExt,
2885 Ctx.Types.inferScalarType(V: A)))
2886 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(Val: VecOp), Red);
2887
2888 return nullptr;
2889}
2890
2891 /// This function tries to convert in-loop multiply-accumulate reductions to
2892 /// VPExpressionRecipe and clamps \p Range if doing so is beneficial
2893/// and valid. The created VPExpressionRecipe must be decomposed to its
2894/// constituent recipes before execution. Patterns of the
2895/// VPExpressionRecipe:
2896/// reduce.add(mul(...)),
2897/// reduce.add(mul(ext(A), ext(B))),
2898/// reduce.add(ext(mul(ext(A), ext(B)))).
2899static VPExpressionRecipe *
2900tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
2901 VPCostContext &Ctx, VFRange &Range) {
2902 using namespace VPlanPatternMatch;
2903
2904 unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind());
2905 if (Opcode != Instruction::Add)
2906 return nullptr;
2907
2908 Type *RedTy = Ctx.Types.inferScalarType(V: Red);
2909
2910 // Clamp the range if using a multiply-accumulate reduction is profitable.
2911 auto IsMulAccValidAndClampRange =
2912 [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
2913 VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
2914 return LoopVectorizationPlanner::getDecisionAndClampRange(
2915 Predicate: [&](ElementCount VF) {
2916 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2917 Type *SrcTy =
2918 Ext0 ? Ctx.Types.inferScalarType(V: Ext0->getOperand(N: 0)) : RedTy;
2919 auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
2920 InstructionCost MulAccCost =
2921 Ctx.TTI.getMulAccReductionCost(IsUnsigned: isZExt, ResTy: RedTy, Ty: SrcVecTy, CostKind);
2922 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
2923 InstructionCost RedCost = Red->computeCost(VF, Ctx);
2924 InstructionCost ExtCost = 0;
2925 if (Ext0)
2926 ExtCost += Ext0->computeCost(VF, Ctx);
2927 if (Ext1)
2928 ExtCost += Ext1->computeCost(VF, Ctx);
2929 if (OuterExt)
2930 ExtCost += OuterExt->computeCost(VF, Ctx);
2931
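// Only use the multiply-accumulate reduction if it is cheaper than the
// separate extends, multiply and reduction.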
2932 return MulAccCost.isValid() &&
2933 MulAccCost < ExtCost + MulCost + RedCost;
2934 },
2935 Range);
2936 };
2937
2938 VPValue *VecOp = Red->getVecOp();
2939 VPValue *A, *B;
2940 // Try to match reduce.add(mul(...)).
2941 if (match(V: VecOp, P: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B)))) {
2942 auto *RecipeA =
2943 dyn_cast_if_present<VPWidenCastRecipe>(Val: A->getDefiningRecipe());
2944 auto *RecipeB =
2945 dyn_cast_if_present<VPWidenCastRecipe>(Val: B->getDefiningRecipe());
2946 auto *Mul = cast<VPWidenRecipe>(Val: VecOp->getDefiningRecipe());
2947
2948 // Match reduce.add(mul(ext, ext)).
2949 if (RecipeA && RecipeB &&
2950 (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
2951 match(V: RecipeA, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
2952 match(V: RecipeB, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
2953 IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
2954 Instruction::CastOps::ZExt,
2955 Mul, RecipeA, RecipeB, nullptr)) {
2956 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
2957 }
2958 // Match reduce.add(mul).
2959 if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
2960 return new VPExpressionRecipe(Mul, Red);
2961 }
2962 // Match reduce.add(ext(mul(ext(A), ext(B)))).
2963 // All extend recipes must have the same opcode, or A == B, which can be
2964 // transformed to reduce.add(zext(mul(sext(A), sext(B)))).
2965 if (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()),
2966 Op1: m_ZExtOrSExt(Op0: m_VPValue()))))) {
2967 auto *Ext = cast<VPWidenCastRecipe>(Val: VecOp->getDefiningRecipe());
2968 auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: 0)->getDefiningRecipe());
2969 auto *Ext0 =
2970 cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 0)->getDefiningRecipe());
2971 auto *Ext1 =
2972 cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 1)->getDefiningRecipe());
2973 if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
2974 Ext0->getOpcode() == Ext1->getOpcode() &&
2975 IsMulAccValidAndClampRange(Ext0->getOpcode() ==
2976 Instruction::CastOps::ZExt,
2977 Mul, Ext0, Ext1, Ext)) {
2978 auto *NewExt0 = new VPWidenCastRecipe(
2979 Ext0->getOpcode(), Ext0->getOperand(N: 0), Ext->getResultType(), *Ext0,
2980 Ext0->getDebugLoc());
2981 NewExt0->insertBefore(InsertPos: Ext0);
2982
2983 VPWidenCastRecipe *NewExt1 = NewExt0;
2984 if (Ext0 != Ext1) {
2985 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(N: 0),
2986 Ext->getResultType(), *Ext1,
2987 Ext1->getDebugLoc());
2988 NewExt1->insertBefore(InsertPos: Ext1);
2989 }
2990 Mul->setOperand(I: 0, New: NewExt0);
2991 Mul->setOperand(I: 1, New: NewExt1);
2992 Red->setOperand(I: 1, New: Mul);
2993 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
2994 }
2995 }
2996 return nullptr;
2997}
2998
2999 /// This function tries to create an abstract recipe from the reduction recipe
3000 /// for subsequent optimizations and cost estimation.
3001static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3002 VPCostContext &Ctx,
3003 VFRange &Range) {
3004 VPExpressionRecipe *AbstractR = nullptr;
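// The abstract recipe, if created, is inserted right after the original
// reduction recipe.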
3005 auto IP = std::next(x: Red->getIterator());
3006 auto *VPBB = Red->getParent();
3007 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3008 AbstractR = MulAcc;
3009 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3010 AbstractR = ExtRed;
3011 // Could not create an abstract in-loop reduction recipe.
3012 if (!AbstractR)
3013 return;
3014
3015 AbstractR->insertBefore(BB&: *VPBB, IP);
3016 Red->replaceAllUsesWith(New: AbstractR);
3017}
3018
3019void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
3020 VFRange &Range) {
3021 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3022 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
3023 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
3024 if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R))
3025 tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
3026 }
3027 }
3028}
3029
3030void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
3031 if (Plan.hasScalarVFOnly())
3032 return;
3033
3034#ifndef NDEBUG
3035 VPDominatorTree VPDT;
3036 VPDT.recalculate(Plan);
3037#endif
3038
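// Collect candidates that may need broadcasting: the backedge-taken count (if
// used), live-ins, and values defined in the plan's entry block.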
3039 SmallVector<VPValue *> VPValues;
3040 if (Plan.getOrCreateBackedgeTakenCount()->getNumUsers() > 0)
3041 VPValues.push_back(Elt: Plan.getOrCreateBackedgeTakenCount());
3042 append_range(C&: VPValues, R: Plan.getLiveIns());
3043 for (VPRecipeBase &R : *Plan.getEntry())
3044 append_range(C&: VPValues, R: R.definedValues());
3045
3046 auto *VectorPreheader = Plan.getVectorPreheader();
3047 for (VPValue *VPV : VPValues) {
3048 if (all_of(Range: VPV->users(),
3049 P: [VPV](VPUser *U) { return U->usesScalars(Op: VPV); }) ||
3050 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3051 isa<Constant>(Val: VPV->getLiveInIRValue())))
3052 continue;
3053
3054 // Add an explicit broadcast at an insert point that dominates all users.
3055 VPBasicBlock *HoistBlock = VectorPreheader;
3056 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3057 for (VPUser *User : VPV->users()) {
3058 if (User->usesScalars(Op: VPV))
3059 continue;
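// If a vector user is in the preheader itself, hoist the broadcast to the
// start of the preheader; otherwise it is created at the preheader's end.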
3060 if (cast<VPRecipeBase>(Val: User)->getParent() == VectorPreheader)
3061 HoistPoint = HoistBlock->begin();
3062 else
3063 assert(VPDT.dominates(VectorPreheader,
3064 cast<VPRecipeBase>(User)->getParent()) &&
3065 "All users must be in the vector preheader or dominated by it");
3066 }
3067
3068 VPBuilder Builder(cast<VPBasicBlock>(Val: HoistBlock), HoistPoint);
3069 auto *Broadcast = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: {VPV});
3070 VPV->replaceUsesWithIf(New: Broadcast,
3071 ShouldReplace: [VPV, Broadcast](VPUser &U, unsigned Idx) {
3072 return Broadcast != &U && !U.usesScalars(Op: VPV);
3073 });
3074 }
3075}
3076
3077 /// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that can
3078 /// be converted to a narrower recipe. \p OpV is used by a wide recipe
3079 /// \p WideMember that feeds a store interleave group at index \p Idx, and
3080 /// \p WideMember0 is the recipe feeding the same interleave group at index 0.
3081 /// A VPWidenLoadRecipe can be narrowed to an index-independent load if it feeds
3082 /// all wide ops at all indices (\p OpV must be the operand at index \p OpIdx
3083 /// for both the recipe at lane 0, \p WideMember0, and \p WideMember). A
3084 /// VPInterleaveRecipe can be narrowed to a wide load if \p OpV is defined at
3085 /// \p Idx of a load interleave group.
3086static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember,
3087 unsigned OpIdx, VPValue *OpV, unsigned Idx) {
3088 auto *DefR = OpV->getDefiningRecipe();
3089 if (!DefR)
3090 return WideMember0->getOperand(N: OpIdx) == OpV;
3091 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Val: DefR))
3092 return !W->getMask() && WideMember0->getOperand(N: OpIdx) == OpV;
3093
3094 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR))
3095 return IR->getInterleaveGroup()->getFactor() ==
3096 IR->getInterleaveGroup()->getNumMembers() &&
3097 IR->getVPValue(I: Idx) == OpV;
3098 return false;
3099}
3100
3101 /// Returns true if \p InterleaveR is a full interleave group with factor and
3102 /// number of members both equal to \p VF. The interleave group must also
3103 /// access the full vector width \p VectorRegWidth.
3104static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
3105 unsigned VF, VPTypeAnalysis &TypeInfo,
3106 unsigned VectorRegWidth) {
3107 if (!InterleaveR)
3108 return false;
3109
3110 Type *GroupElementTy = nullptr;
3111 if (InterleaveR->getStoredValues().empty()) {
3112 GroupElementTy = TypeInfo.inferScalarType(V: InterleaveR->getVPValue(I: 0));
3113 if (!all_of(Range: InterleaveR->definedValues(),
3114 P: [&TypeInfo, GroupElementTy](VPValue *Op) {
3115 return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
3116 }))
3117 return false;
3118 } else {
3119 GroupElementTy =
3120 TypeInfo.inferScalarType(V: InterleaveR->getStoredValues()[0]);
3121 if (!all_of(Range: InterleaveR->getStoredValues(),
3122 P: [&TypeInfo, GroupElementTy](VPValue *Op) {
3123 return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
3124 }))
3125 return false;
3126 }
3127
3128 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
3129 auto IG = InterleaveR->getInterleaveGroup();
3130 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
3131 GroupSize == VectorRegWidth;
3132}
3133
3134 /// Returns true if \p VPV is a narrow VPValue.
3135static bool isAlreadyNarrow(VPValue *VPV) { return VPV->isLiveIn(); }
3136
3137void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
3138 unsigned VectorRegWidth) {
3139 using namespace llvm::VPlanPatternMatch;
3140 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
3141 if (VF.isScalable() || !VectorLoop)
3142 return;
3143
3144 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
3145 Type *CanonicalIVType = CanonicalIV->getScalarType();
3146 VPTypeAnalysis TypeInfo(CanonicalIVType);
3147
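// Collect store interleave groups that can be narrowed, bailing out if any
// unsupported recipe is encountered.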
3148 unsigned FixedVF = VF.getFixedValue();
3149 SmallVector<VPInterleaveRecipe *> StoreGroups;
3150 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
3151 if (isa<VPCanonicalIVPHIRecipe>(Val: &R) ||
3152 match(V: &R, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue())))
3153 continue;
3154
3155 // Bail out on recipes not supported at the moment:
3156 // * phi recipes other than the canonical induction
3157 // * recipes writing to memory except interleave groups
3158 // Only support plans with a canonical induction phi.
3159 if (R.isPhi())
3160 return;
3161
3162 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R);
3163 if (R.mayWriteToMemory() && !InterleaveR)
3164 return;
3165
3166 // Do not narrow interleave groups if there are VectorPointer recipes and
3167 // the plan was unrolled. The recipe implicitly uses VF from
3168 // VPTransformState.
3169 // TODO: Remove restriction once the VF for the VectorPointer offset is
3170 // modeled explicitly as operand.
3171 if (isa<VPVectorPointerRecipe>(Val: &R) && Plan.getUF() > 1)
3172 return;
3173
3174 // All other ops are allowed, but we reject uses that cannot be converted
3175 // when checking all allowed consumers (store interleave groups) below.
3176 if (!InterleaveR)
3177 continue;
3178
3179 // Bail out on non-consecutive interleave groups.
3180 if (!isConsecutiveInterleaveGroup(InterleaveR, VF: FixedVF, TypeInfo,
3181 VectorRegWidth))
3182 return;
3183
3184 // Skip read interleave groups.
3185 if (InterleaveR->getStoredValues().empty())
3186 continue;
3187
3188 // Narrow interleave groups, if all operands are already matching narrow
3189 // ops.
3190 auto *Member0 = InterleaveR->getStoredValues()[0];
3191 if (isAlreadyNarrow(VPV: Member0) &&
3192 all_of(Range: InterleaveR->getStoredValues(),
3193 P: [Member0](VPValue *VPV) { return Member0 == VPV; })) {
3194 StoreGroups.push_back(Elt: InterleaveR);
3195 continue;
3196 }
3197
3198 // For now, we only support full interleave groups storing load interleave
3199 // groups.
3200 if (all_of(Range: enumerate(First: InterleaveR->getStoredValues()), P: [](auto Op) {
3201 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
3202 if (!DefR)
3203 return false;
3204 auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR);
3205 return IR &&
3206 IR->getInterleaveGroup()->getFactor() ==
3207 IR->getInterleaveGroup()->getNumMembers() &&
3208 IR->getVPValue(Op.index()) == Op.value();
3209 })) {
3210 StoreGroups.push_back(Elt: InterleaveR);
3211 continue;
3212 }
3213
3214 // Check if all values feeding InterleaveR are matching wide recipes whose
3215 // operands can be narrowed.
3216 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
3217 Val: InterleaveR->getStoredValues()[0]->getDefiningRecipe());
3218 if (!WideMember0)
3219 return;
3220 for (const auto &[I, V] : enumerate(First: InterleaveR->getStoredValues())) {
3221 auto *R = dyn_cast_or_null<VPWidenRecipe>(Val: V->getDefiningRecipe());
3222 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
3223 R->getNumOperands() > 2)
3224 return;
3225 if (any_of(Range: enumerate(First: R->operands()),
3226 P: [WideMember0, Idx = I, R](const auto &P) {
3227 const auto &[OpIdx, OpV] = P;
3228 return !canNarrowLoad(WideMember0, R, OpIdx, OpV, Idx);
3229 }))
3230 return;
3231 }
3232 StoreGroups.push_back(Elt: InterleaveR);
3233 }
3234
3235 if (StoreGroups.empty())
3236 return;
3237
3238 // Narrow the operand V: a load interleave group becomes a single wide load,
// and a wide load becomes a uniform scalar load.
3239 auto NarrowOp = [](VPValue *V) -> VPValue * {
3240 auto *R = V->getDefiningRecipe();
3241 if (!R)
3242 return V;
3243 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(Val: R)) {
3244 // Narrow the interleave group to a wide load, as the transformed VPlan will
3245 // only process one original iteration.
3246 auto *L = new VPWidenLoadRecipe(
3247 *cast<LoadInst>(Val: LoadGroup->getInterleaveGroup()->getInsertPos()),
3248 LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
3249 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
3250 L->insertBefore(InsertPos: LoadGroup);
3251 return L;
3252 }
3253
3254 auto *WideLoad = cast<VPWidenLoadRecipe>(Val: R);
3255
3256 // Narrow the wide load to a uniform scalar load, as the transformed VPlan
3257 // will only process one original iteration.
3258 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
3259 WideLoad->operands(), /*IsUniform*/ true,
3260 /*Mask*/ nullptr, *WideLoad);
3261 N->insertBefore(InsertPos: WideLoad);
3262 return N;
3263 };
3264
3265 // Narrow operation tree rooted at store groups.
3266 for (auto *StoreGroup : StoreGroups) {
3267 VPValue *Res = nullptr;
3268 VPValue *Member0 = StoreGroup->getStoredValues()[0];
3269 if (isAlreadyNarrow(VPV: Member0)) {
3270 Res = Member0;
3271 } else if (auto *WideMember0 =
3272 dyn_cast<VPWidenRecipe>(Val: Member0->getDefiningRecipe())) {
3273 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
3274 WideMember0->setOperand(I: Idx, New: NarrowOp(WideMember0->getOperand(N: Idx)));
3275 Res = WideMember0;
3276 } else {
3277 Res = NarrowOp(Member0);
3278 }
3279
3280 auto *S = new VPWidenStoreRecipe(
3281 *cast<StoreInst>(Val: StoreGroup->getInterleaveGroup()->getInsertPos()),
3282 StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
3283 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
3284 S->insertBefore(InsertPos: StoreGroup);
3285 StoreGroup->eraseFromParent();
3286 }
3287
3288 // Adjust induction to reflect that the transformed plan only processes one
3289 // original iteration.
3290 auto *CanIV = Plan.getCanonicalIV();
3291 auto *Inc = cast<VPInstruction>(Val: CanIV->getBackedgeValue());
3292 Inc->setOperand(I: 1, New: Plan.getOrAddLiveIn(V: ConstantInt::get(
3293 Ty: CanIV->getScalarType(), V: 1 * Plan.getUF())));
3294 Plan.getVF().replaceAllUsesWith(
3295 New: Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: CanIV->getScalarType(), V: 1)));
3296 removeDeadRecipes(Plan);
3297}
3298
3299/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
3300/// BranchOnCond recipe.
3301void VPlanTransforms::addBranchWeightToMiddleTerminator(
3302 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
3303 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
3304 auto *MiddleTerm =
3305 dyn_cast_or_null<VPInstruction>(Val: MiddleVPBB->getTerminator());
3306 // Only add branch metadata if there is a (conditional) terminator.
3307 if (!MiddleTerm)
3308 return;
3309
3310 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
3311 "must have a BranchOnCond");
3312 // Assume that `TripCount % VectorStep` is uniformly distributed.
3313 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
3314 if (VF.isScalable() && VScaleForTuning.has_value())
3315 VectorStep *= *VScaleForTuning;
3316 assert(VectorStep > 0 && "trip count should not be zero");
3317 MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
3318 MDNode *BranchWeights =
3319 MDB.createBranchWeights(Weights: {1, VectorStep - 1}, /*IsExpected=*/false);
3320 MiddleTerm->addMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
3321}
3322