VPlanTransforms.cpp source code [llvm_projects/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp]

1	//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	///
9	/// \file
10	/// This file implements a set of utility VPlan to VPlan transformations.
11	///
12	//===----------------------------------------------------------------------===//
13
14	#include "VPlanTransforms.h"
15	#include "VPRecipeBuilder.h"
16	#include "VPlan.h"
17	#include "VPlanAnalysis.h"
18	#include "VPlanCFG.h"
19	#include "VPlanDominatorTree.h"
20	#include "VPlanHelpers.h"
21	#include "VPlanPatternMatch.h"
22	#include "VPlanUtils.h"
23	#include "VPlanVerifier.h"
24	#include "llvm/ADT/APInt.h"
25	#include "llvm/ADT/PostOrderIterator.h"
26	#include "llvm/ADT/STLExtras.h"
27	#include "llvm/ADT/SetOperations.h"
28	#include "llvm/ADT/SetVector.h"
29	#include "llvm/ADT/SmallPtrSet.h"
30	#include "llvm/ADT/TypeSwitch.h"
31	#include "llvm/Analysis/IVDescriptors.h"
32	#include "llvm/Analysis/InstSimplifyFolder.h"
33	#include "llvm/Analysis/LoopInfo.h"
34	#include "llvm/Analysis/MemoryLocation.h"
35	#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
36	#include "llvm/Analysis/ScopedNoAliasAA.h"
37	#include "llvm/Analysis/VectorUtils.h"
38	#include "llvm/IR/Intrinsics.h"
39	#include "llvm/IR/MDBuilder.h"
40	#include "llvm/IR/Metadata.h"
41	#include "llvm/Support/Casting.h"
42	#include "llvm/Support/TypeSize.h"
43	#include "llvm/Transforms/Utils/LoopUtils.h"
44	#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
45
46	using namespace llvm;
47	using namespace VPlanPatternMatch;
48	using namespace SCEVPatternMatch;
49
50	bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
51	VPlan &Plan, const TargetLibraryInfo &TLI) {
52
53	ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
54	Plan.getVectorLoopRegion());
55	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
56	// Skip blocks outside region
57	if (!VPBB->getParent())
58	break;
59	VPRecipeBase *Term = VPBB->getTerminator();
60	auto EndIter = Term ? Term->getIterator() : VPBB->end();
61	// Introduce each ingredient into VPlan.
62	for (VPRecipeBase &Ingredient :
63	make_early_inc_range(Range: make_range(x: VPBB->begin(), y: EndIter))) {
64
65	VPValue *VPV = Ingredient.getVPSingleValue();
66	if (!VPV->getUnderlyingValue())
67	continue;
68
69	Instruction *Inst = cast<Instruction>(Val: VPV->getUnderlyingValue());
70
71	VPRecipeBase NewRecipe = nullptr*;
72	if (auto *PhiR = dyn_cast<VPPhi>(Val: &Ingredient)) {
73	auto *Phi = cast<PHINode>(Val: PhiR->getUnderlyingValue());
74	NewRecipe = new VPWidenPHIRecipe (Phi, nullptr, PhiR->getDebugLoc());
75	for (VPValue *Op : PhiR->operands())
76	NewRecipe->addOperand(Operand: Op);
77	} else if (auto *VPI = dyn_cast<VPInstruction>(Val: &Ingredient)) {
78	assert(!isa<PHINode>(Inst) && "phis should be handled above");
79	// Create VPWidenMemoryRecipe for loads and stores.
80	if (LoadInst *Load = dyn_cast<LoadInst>(Val: Inst)) {
81	NewRecipe = new VPWidenLoadRecipe (
82	Load, Ingredient.getOperand(N: `0`), nullptr* /Mask/,
83	false /Consecutive/, false /Reverse/, *VPI,
84	Ingredient.getDebugLoc());
85	} else if (StoreInst *Store = dyn_cast<StoreInst>(Val: Inst)) {
86	NewRecipe = new VPWidenStoreRecipe (
87	*Store, Ingredient.getOperand(N: `1`), Ingredient.getOperand(N: `0`),
88	nullptr /Mask/, false /Consecutive/, false /Reverse/, *VPI,
89	Ingredient.getDebugLoc());
90	} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
91	NewRecipe = new VPWidenGEPRecipe (GEP, Ingredient.operands(), *VPI,
92	Ingredient.getDebugLoc());
93	} else if (CallInst *CI = dyn_cast<CallInst>(Val: Inst)) {
94	Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, TLI: &TLI);
95	if (VectorID == Intrinsic::not_intrinsic)
96	return false;
97	NewRecipe = new VPWidenIntrinsicRecipe (
98	*CI, getVectorIntrinsicIDForCall(CI, TLI: &TLI),
99	drop_end(RangeOrContainer: Ingredient.operands()), CI->getType(), VPIRFlags (*CI),
100	*VPI, CI->getDebugLoc());
101	} else if (auto *CI = dyn_cast<CastInst>(Val: Inst)) {
102	NewRecipe = new VPWidenCastRecipe (
103	CI->getOpcode(), Ingredient.getOperand(N: `0`), CI->getType(), CI,
104	VPIRFlags (CI), VPIRMetadata (CI));
105	} else {
106	NewRecipe = new VPWidenRecipe (Inst, Ingredient.operands(), VPI,
107	*VPI, Ingredient.getDebugLoc());
108	}
109	} else {
110	assert(isa<VPWidenIntOrFpInductionRecipe>(&Ingredient) &&
111	"inductions must be created earlier");
112	continue;
113	}
114
115	NewRecipe->insertBefore(InsertPos: &Ingredient);
116	if (NewRecipe->getNumDefinedValues() == `1`)
117	VPV->replaceAllUsesWith(New: NewRecipe->getVPSingleValue());
118	else
119	assert(NewRecipe->getNumDefinedValues() == `0` &&
120	"Only recpies with zero or one defined values expected");
121	Ingredient.eraseFromParent();
122	}
123	}
124	return true;
125	}
126
127	/// Helper for extra no-alias checks via known-safe recipe and SCEV.
128	class SinkStoreInfo {
129	const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
130	VPReplicateRecipe &GroupLeader;
131	PredicatedScalarEvolution &PSE;
132	const Loop &L;
133	VPTypeAnalysis &TypeInfo;
134
135	// Return true if \p A and \p B are known to not alias for all VFs in the
136	// plan, checked via the distance between the accesses
137	bool isNoAliasViaDistance(VPReplicateRecipe A, VPReplicateRecipe B) const {
138	if (A->getOpcode() != Instruction::Store \|\|
139	B->getOpcode() != Instruction::Store)
140	return false;
141
142	VPValue *AddrA = A->getOperand(N: `1`);
143	const SCEV *SCEVA = vputils::getSCEVExprForVPValue(V: AddrA, PSE, L: &L);
144	VPValue *AddrB = B->getOperand(N: `1`);
145	const SCEV *SCEVB = vputils::getSCEVExprForVPValue(V: AddrB, PSE, L: &L);
146	if (isa<SCEVCouldNotCompute>(Val: SCEVA) \|\| isa<SCEVCouldNotCompute>(Val: SCEVB))
147	return false;
148
149	const APInt *Distance;
150	ScalarEvolution &SE = *PSE.getSE();
151	if (!match(S: SE.getMinusSCEV(LHS: SCEVA, RHS: SCEVB), P: m_scev_APInt(C&: Distance)))
152	return false;
153
154	const DataLayout &DL = SE.getDataLayout();
155	Type *TyA = TypeInfo.inferScalarType(V: A->getOperand(N: `0`));
156	uint64_t SizeA = DL.getTypeStoreSize(Ty: TyA);
157	Type *TyB = TypeInfo.inferScalarType(V: B->getOperand(N: `0`));
158	uint64_t SizeB = DL.getTypeStoreSize(Ty: TyB);
159
160	// Use the maximum store size to ensure no overlap from either direction.
161	// Currently only handles fixed sizes, as it is only used for
162	// replicating VPReplicateRecipes.
163	uint64_t MaxStoreSize = std::max(a: SizeA, b: SizeB);
164
165	auto VFs = B->getParent()->getPlan()->vectorFactors();
166	ElementCount MaxVF = *max_element(Range&: VFs, C: ElementCount::isKnownLT);
167	if (MaxVF.isScalable())
168	return false;
169	return Distance->abs().uge(
170	RHS: MaxVF.multiplyCoefficientBy(RHS: MaxStoreSize).getFixedValue());
171	}
172
173	public:
174	SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
175	VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE,
176	const Loop &L, VPTypeAnalysis &TypeInfo)
177	: ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
178	L(L), TypeInfo(TypeInfo) {}
179
180	/// Return true if \p R should be skipped during alias checking, either
181	/// because it's in the exclude set or because no-alias can be proven via
182	/// SCEV.
183	bool shouldSkip(VPRecipeBase &R) const {
184	auto *Store = dyn_cast<VPReplicateRecipe>(Val: &R);
185	return ExcludeRecipes.contains(Ptr: &R) \|\|
186	(Store && isNoAliasViaDistance(A: Store, B: &GroupLeader));
187	}
188	};
189
190	/// Check if a memory operation doesn't alias with memory operations in blocks
191	/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
192	/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
193	/// checked (for load hoisting). Otherwise recipes that both read and write
194	/// memory are checked, and SCEV is used to prove no-alias between the group
195	/// leader and other replicate recipes (for store sinking).
196	static bool
197	canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc,
198	VPBasicBlock FirstBB, VPBasicBlock LastBB,
199	std::optional<SinkStoreInfo> SinkInfo = {}) {
200	bool CheckReads = SinkInfo.has_value();
201	if (!MemLoc.AATags.Scope)
202	return false;
203
204	for (VPBlockBase *Block = FirstBB; Block;
205	Block = Block->getSingleSuccessor()) {
206	assert(Block->getNumSuccessors() <= `1` &&
207	"Expected at most one successor in block chain");
208	auto *VPBB = cast<VPBasicBlock>(Val: Block);
209	for (VPRecipeBase &R : *VPBB) {
210	if (SinkInfo && SinkInfo ->shouldSkip(R))
211	continue;
212
213	// Skip recipes that don't need checking.
214	if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
215	continue;
216
217	auto Loc = vputils::getMemoryLocation(R);
218	if (!Loc)
219	// Conservatively assume aliasing for memory operations without
220	// location.
221	return false;
222
223	if (ScopedNoAliasAAResult::alias(LocA: *Loc, LocB: MemLoc) != AliasResult::NoAlias)
224	return false;
225	}
226
227	if (Block == LastBB)
228	break;
229	}
230	return true;
231	}
232
233	/// Collect either replicated Loads or Stores grouped by their address SCEV.
234	template <unsigned Opcode>
235	static SmallVector<SmallVector<VPReplicateRecipe *, `4`>>
236	collectGroupedReplicateMemOps(
237	VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
238	function_ref<bool(VPReplicateRecipe *)> FilterFn) {
239	static_assert(Opcode == Instruction::Load \|\| Opcode == Instruction::Store,
240	"Only Load and Store opcodes supported");
241	constexpr bool IsLoad = (Opcode == Instruction::Load);
242	SmallDenseMap<const SCEV , SmallVector<VPReplicateRecipe , `4`>>
243	RecipesByAddress;
244	for (VPBlockBase *Block :
245	vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry())) {
246	auto *VPBB = cast<VPBasicBlock>(Val: Block);
247	for (VPRecipeBase &R : *VPBB) {
248	auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
249	if (!RepR \|\| RepR->getOpcode() != Opcode \|\| !FilterFn (RepR))
250	continue;
251
252	// For loads, operand 0 is address; for stores, operand 1 is address.
253	VPValue *Addr = RepR->getOperand(N: IsLoad ? `0` : `1`);
254	const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(V: Addr, PSE, L);
255	if (!isa<SCEVCouldNotCompute>(Val: AddrSCEV))
256	RecipesByAddress [AddrSCEV].push_back(Elt: RepR);
257	}
258	}
259	auto Groups = to_vector(Range: RecipesByAddress.values());
260	VPDominatorTree VPDT(Plan);
261	for (auto &Group : Groups) {
262	// Sort mem ops by dominance order, with earliest (most dominating) first.
263	stable_sort(Group, [&VPDT](VPReplicateRecipe A, VPReplicateRecipe B) {
264	return VPDT.properlyDominates(A, B);
265	});
266	}
267	return Groups;
268	}
269
270	/// Return true if we do not know how to (mechanically) hoist or sink \p R out
271	/// of a loop region.
272	static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
273	// Assumes don't alias anything or throw; as long as they're guaranteed to
274	// execute, they're safe to hoist.
275	if (match(V: &R, P: m_Intrinsic<Intrinsic::assume>()))
276	return false;
277
278	// TODO: Relax checks in the future, e.g. we could also hoist reads, if their
279	// memory location is not modified in the vector loop.
280	if (R.mayHaveSideEffects() \|\| R.mayReadFromMemory() \|\| R.isPhi())
281	return true;
282
283	// Allocas cannot be hoisted.
284	auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
285	return RepR && RepR->getOpcode() == Instruction::Alloca;
286	}
287
288	static bool sinkScalarOperands(VPlan &Plan) {
289	auto Iter = vp_depth_first_deep(G: Plan.getEntry());
290	bool ScalarVFOnly = Plan.hasScalarVFOnly();
291	bool Changed = false;
292
293	SetVector<std::pair<VPBasicBlock , VPSingleDefRecipe >> WorkList;
294	auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
295	VPBasicBlock SinkTo, VPValue Op) {
296	auto *Candidate =
297	dyn_cast_or_null<VPSingleDefRecipe>(Val: Op->getDefiningRecipe());
298	if (!Candidate)
299	return;
300
301	// We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
302	// for now.
303	if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Val: Candidate))
304	return;
305
306	if (Candidate->getParent() == SinkTo \|\| cannotHoistOrSinkRecipe(R: *Candidate))
307	return;
308
309	if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: Candidate))
310	if (!ScalarVFOnly && RepR->isSingleScalar())
311	return;
312
313	WorkList.insert(X: {SinkTo, Candidate});
314	};
315
316	// First, collect the operands of all recipes in replicate blocks as seeds for
317	// sinking.
318	for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Range: Iter)) {
319	VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
320	if (!VPR->isReplicator() \|\| EntryVPBB->getSuccessors().size() != `2`)
321	continue;
322	VPBasicBlock *VPBB = cast<VPBasicBlock>(Val: EntryVPBB->getSuccessors().front());
323	if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
324	continue;
325	for (auto &Recipe : *VPBB)
326	for (VPValue *Op : Recipe.operands())
327	InsertIfValidSinkCandidate (VPBB, Op);
328	}
329
330	// Try to sink each replicate or scalar IV steps recipe in the worklist.
331	for (unsigned I = `0`; I != WorkList.size(); ++I) {
332	VPBasicBlock *SinkTo;
333	VPSingleDefRecipe *SinkCandidate;
334	std::tie(args&: SinkTo, args&: SinkCandidate) = WorkList [I];
335
336	// All recipe users of SinkCandidate must be in the same block SinkTo or all
337	// users outside of SinkTo must only use the first lane of SinkCandidate. In
338	// the latter case, we need to duplicate SinkCandidate.
339	auto UsersOutsideSinkTo =
340	make_filter_range(Range: SinkCandidate->users(), Pred: [SinkTo](VPUser *U) {
341	return cast<VPRecipeBase>(Val: U)->getParent() != SinkTo;
342	});
343	if (any_of(Range&: UsersOutsideSinkTo, P: [SinkCandidate](VPUser *U) {
344	return !U->usesFirstLaneOnly(Op: SinkCandidate);
345	}))
346	continue;
347	bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
348
349	if (NeedsDuplicating) {
350	if (ScalarVFOnly)
351	continue;
352	VPSingleDefRecipe *Clone;
353	if (auto *SinkCandidateRepR =
354	dyn_cast<VPReplicateRecipe>(Val: SinkCandidate)) {
355	// TODO: Handle converting to uniform recipes as separate transform,
356	// then cloning should be sufficient here.
357	Instruction *I = SinkCandidate->getUnderlyingInstr();
358	Clone = new VPReplicateRecipe (I, SinkCandidate->operands(), true,
359	nullptr /Mask/, *SinkCandidateRepR,
360	*SinkCandidateRepR);
361	// TODO: add ".cloned" suffix to name of Clone's VPValue.
362	} else {
363	Clone = SinkCandidate->clone();
364	}
365
366	Clone->insertBefore(InsertPos: SinkCandidate);
367	SinkCandidate->replaceUsesWithIf(New: Clone, ShouldReplace: [SinkTo](VPUser &U, unsigned) {
368	return cast<VPRecipeBase>(Val: &U)->getParent() != SinkTo;
369	});
370	}
371	SinkCandidate->moveBefore(BB&: *SinkTo, I: SinkTo->getFirstNonPhi());
372	for (VPValue *Op : SinkCandidate->operands())
373	InsertIfValidSinkCandidate (SinkTo, Op);
374	Changed = true;
375	}
376	return Changed;
377	}
378
379	/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
380	/// the mask.
381	static VPValue getPredicatedMask(VPRegionBlock R) {
382	auto *EntryBB = dyn_cast<VPBasicBlock>(Val: R->getEntry());
383	if (!EntryBB \|\| EntryBB->size() != `1` \|\|
384	!isa<VPBranchOnMaskRecipe>(Val: EntryBB->begin()))
385	return nullptr;
386
387	return cast<VPBranchOnMaskRecipe>(Val: &*EntryBB->begin())->getOperand(N: `0`);
388	}
389
390	/// If \p R is a triangle region, return the 'then' block of the triangle.
391	static VPBasicBlock getPredicatedThenBlock(VPRegionBlock R) {
392	auto *EntryBB = cast<VPBasicBlock>(Val: R->getEntry());
393	if (EntryBB->getNumSuccessors() != `2`)
394	return nullptr;
395
396	auto *Succ0 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[`0`]);
397	auto *Succ1 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[`1`]);
398	if (!Succ0 \|\| !Succ1)
399	return nullptr;
400
401	if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != `1`)
402	return nullptr;
403	if (Succ0->getSingleSuccessor() == Succ1)
404	return Succ0;
405	if (Succ1->getSingleSuccessor() == Succ0)
406	return Succ1;
407	return nullptr;
408	}
409
410	// Merge replicate regions in their successor region, if a replicate region
411	// is connected to a successor replicate region with the same predicate by a
412	// single, empty VPBasicBlock.
413	static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
414	SmallPtrSet<VPRegionBlock *, `4`> TransformedRegions;
415
416	// Collect replicate regions followed by an empty block, followed by another
417	// replicate region with matching masks to process front. This is to avoid
418	// iterator invalidation issues while merging regions.
419	SmallVector<VPRegionBlock *, `8`> WorkList;
420	for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
421	Range: vp_depth_first_deep(G: Plan.getEntry()))) {
422	if (!Region1->isReplicator())
423	continue;
424	auto *MiddleBasicBlock =
425	dyn_cast_or_null<VPBasicBlock>(Val: Region1->getSingleSuccessor());
426	if (!MiddleBasicBlock \|\| !MiddleBasicBlock->empty())
427	continue;
428
429	auto *Region2 =
430	dyn_cast_or_null<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor());
431	if (!Region2 \|\| !Region2->isReplicator())
432	continue;
433
434	VPValue *Mask1 = getPredicatedMask(R: Region1);
435	VPValue *Mask2 = getPredicatedMask(R: Region2);
436	if (!Mask1 \|\| Mask1 != Mask2)
437	continue;
438
439	assert(Mask1 && Mask2 && "both region must have conditions");
440	WorkList.push_back(Elt: Region1);
441	}
442
443	// Move recipes from Region1 to its successor region, if both are triangles.
444	for (VPRegionBlock *Region1 : WorkList) {
445	if (TransformedRegions.contains(Ptr: Region1))
446	continue;
447	auto *MiddleBasicBlock = cast<VPBasicBlock>(Val: Region1->getSingleSuccessor());
448	auto *Region2 = cast<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor());
449
450	VPBasicBlock *Then1 = getPredicatedThenBlock(R: Region1);
451	VPBasicBlock *Then2 = getPredicatedThenBlock(R: Region2);
452	if (!Then1 \|\| !Then2)
453	continue;
454
455	// Note: No fusion-preventing memory dependencies are expected in either
456	// region. Such dependencies should be rejected during earlier dependence
457	// checks, which guarantee accesses can be re-ordered for vectorization.
458	//
459	// Move recipes to the successor region.
460	for (VPRecipeBase &ToMove : make_early_inc_range(Range: reverse(C&: *Then1)))
461	ToMove.moveBefore(BB&: *Then2, I: Then2->getFirstNonPhi());
462
463	auto *Merge1 = cast<VPBasicBlock>(Val: Then1->getSingleSuccessor());
464	auto *Merge2 = cast<VPBasicBlock>(Val: Then2->getSingleSuccessor());
465
466	// Move VPPredInstPHIRecipes from the merge block to the successor region's
467	// merge block. Update all users inside the successor region to use the
468	// original values.
469	for (VPRecipeBase &Phi1ToMove : make_early_inc_range(Range: reverse(C&: *Merge1))) {
470	VPValue *PredInst1 =
471	cast<VPPredInstPHIRecipe>(Val: &Phi1ToMove)->getOperand(N: `0`);
472	VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
473	Phi1ToMoveV->replaceUsesWithIf(New: PredInst1, ShouldReplace: [Then2](VPUser &U, unsigned) {
474	return cast<VPRecipeBase>(Val: &U)->getParent() == Then2;
475	});
476
477	// Remove phi recipes that are unused after merging the regions.
478	if (Phi1ToMove.getVPSingleValue()->getNumUsers() == `0`) {
479	Phi1ToMove.eraseFromParent();
480	continue;
481	}
482	Phi1ToMove.moveBefore(BB&: *Merge2, I: Merge2->begin());
483	}
484
485	// Remove the dead recipes in Region1's entry block.
486	for (VPRecipeBase &R :
487	make_early_inc_range(Range: reverse(C&: *Region1->getEntryBasicBlock())))
488	R.eraseFromParent();
489
490	// Finally, remove the first region.
491	for (VPBlockBase *Pred : make_early_inc_range(Range&: Region1->getPredecessors())) {
492	VPBlockUtils::disconnectBlocks(From: Pred, To: Region1);
493	VPBlockUtils::connectBlocks(From: Pred, To: MiddleBasicBlock);
494	}
495	VPBlockUtils::disconnectBlocks(From: Region1, To: MiddleBasicBlock);
496	TransformedRegions.insert(Ptr: Region1);
497	}
498
499	return !TransformedRegions.empty();
500	}
501
502	static VPRegionBlock createReplicateRegion(VPReplicateRecipe PredRecipe,
503	VPlan &Plan) {
504	Instruction *Instr = PredRecipe->getUnderlyingInstr();
505	// Build the triangular if-then region.
506	std::string RegionName = (Twine ("pred.") + Instr->getOpcodeName()).str();
507	assert(Instr->getParent() && "Predicated instruction not in any basic block");
508	auto *BlockInMask = PredRecipe->getMask();
509	auto *MaskDef = BlockInMask->getDefiningRecipe();
510	auto BOMRecipe = new* VPBranchOnMaskRecipe (
511	BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
512	auto *Entry =
513	Plan.createVPBasicBlock(Name: Twine (RegionName) + ".entry", Recipe: BOMRecipe);
514
515	// Replace predicated replicate recipe with a replicate recipe without a
516	// mask but in the replicate region.
517	auto RecipeWithoutMask = new* VPReplicateRecipe (
518	PredRecipe->getUnderlyingInstr(), drop_end(RangeOrContainer: PredRecipe->operands()),
519	PredRecipe->isSingleScalar(), nullptr /Mask/, PredRecipe, PredRecipe,
520	PredRecipe->getDebugLoc());
521	auto *Pred =
522	Plan.createVPBasicBlock(Name: Twine (RegionName) + ".if", Recipe: RecipeWithoutMask);
523
524	VPPredInstPHIRecipe PHIRecipe = nullptr*;
525	if (PredRecipe->getNumUsers() != `0`) {
526	PHIRecipe = new VPPredInstPHIRecipe (RecipeWithoutMask,
527	RecipeWithoutMask->getDebugLoc());
528	PredRecipe->replaceAllUsesWith(New: PHIRecipe);
529	PHIRecipe->setOperand(I: `0`, New: RecipeWithoutMask);
530	}
531	PredRecipe->eraseFromParent();
532	auto *Exiting =
533	Plan.createVPBasicBlock(Name: Twine (RegionName) + ".continue", Recipe: PHIRecipe);
534	VPRegionBlock *Region =
535	Plan.createReplicateRegion(Entry, Exiting, Name: RegionName);
536
537	// Note: first set Entry as region entry and then connect successors starting
538	// from it in order, to propagate the "parent" of each VPBasicBlock.
539	VPBlockUtils::insertTwoBlocksAfter(IfTrue: Pred, IfFalse: Exiting, BlockPtr: Entry);
540	VPBlockUtils::connectBlocks(From: Pred, To: Exiting);
541
542	return Region;
543	}
544
545	static void addReplicateRegions(VPlan &Plan) {
546	SmallVector<VPReplicateRecipe *> WorkList;
547	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
548	Range: vp_depth_first_deep(G: Plan.getEntry()))) {
549	for (VPRecipeBase &R : *VPBB)
550	if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
551	if (RepR->isPredicated())
552	WorkList.push_back(Elt: RepR);
553	}
554	}
555
556	unsigned BBNum = `0`;
557	for (VPReplicateRecipe *RepR : WorkList) {
558	VPBasicBlock *CurrentBlock = RepR->getParent();
559	VPBasicBlock *SplitBlock = CurrentBlock->splitAt(SplitAt: RepR->getIterator());
560
561	BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
562	SplitBlock->setName(
563	OrigBB->hasName() ? OrigBB->getName() + "." + Twine (BBNum++) : "");
564	// Record predicated instructions for above packing optimizations.
565	VPRegionBlock *Region = createReplicateRegion(PredRecipe: RepR, Plan);
566	Region->setParent(CurrentBlock->getParent());
567	VPBlockUtils::insertOnEdge(From: CurrentBlock, To: SplitBlock, BlockPtr: Region);
568
569	VPRegionBlock *ParentRegion = Region->getParent();
570	if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
571	ParentRegion->setExiting(SplitBlock);
572	}
573	}
574
575	/// Remove redundant VPBasicBlocks by merging them into their predecessor if
576	/// the predecessor has a single successor.
577	static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
578	SmallVector<VPBasicBlock *> WorkList;
579	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
580	Range: vp_depth_first_deep(G: Plan.getEntry()))) {
581	// Don't fold the blocks in the skeleton of the Plan into their single
582	// predecessors for now.
583	// TODO: Remove restriction once more of the skeleton is modeled in VPlan.
584	if (!VPBB->getParent())
585	continue;
586	auto *PredVPBB =
587	dyn_cast_or_null<VPBasicBlock>(Val: VPBB->getSinglePredecessor());
588	if (!PredVPBB \|\| PredVPBB->getNumSuccessors() != `1` \|\|
589	isa<VPIRBasicBlock>(Val: PredVPBB))
590	continue;
591	WorkList.push_back(Elt: VPBB);
592	}
593
594	for (VPBasicBlock *VPBB : WorkList) {
595	VPBasicBlock *PredVPBB = cast<VPBasicBlock>(Val: VPBB->getSinglePredecessor());
596	for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB))
597	R.moveBefore(BB&: *PredVPBB, I: PredVPBB->end());
598	VPBlockUtils::disconnectBlocks(From: PredVPBB, To: VPBB);
599	auto *ParentRegion = VPBB->getParent();
600	if (ParentRegion && ParentRegion->getExiting() == VPBB)
601	ParentRegion->setExiting(PredVPBB);
602	for (auto *Succ : to_vector(Range: VPBB->successors())) {
603	VPBlockUtils::disconnectBlocks(From: VPBB, To: Succ);
604	VPBlockUtils::connectBlocks(From: PredVPBB, To: Succ);
605	}
606	// VPBB is now dead and will be cleaned up when the plan gets destroyed.
607	}
608	return !WorkList.empty();
609	}
610
611	void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
612	// Convert masked VPReplicateRecipes to if-then region blocks.
613	addReplicateRegions(Plan);
614
615	bool ShouldSimplify = true;
616	while (ShouldSimplify) {
617	ShouldSimplify = sinkScalarOperands(Plan);
618	ShouldSimplify \|= mergeReplicateRegionsIntoSuccessors(Plan);
619	ShouldSimplify \|= mergeBlocksIntoPredecessors(Plan);
620	}
621	}
622
623	/// Remove redundant casts of inductions.
624	///
625	/// Such redundant casts are casts of induction variables that can be ignored,
626	/// because we already proved that the casted phi is equal to the uncasted phi
627	/// in the vectorized loop. There is no need to vectorize the cast - the same
628	/// value can be used for both the phi and casts in the vector loop.
629	static void removeRedundantInductionCasts(VPlan &Plan) {
630	for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
631	auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
632	if (!IV \|\| IV->getTruncInst())
633	continue;
634
635	// A sequence of IR Casts has potentially been recorded for IV, which
636	// must be bypassed* when the IV is vectorized, because the vectorized IV*
637	// will produce the desired casted value. This sequence forms a def-use
638	// chain and is provided in reverse order, ending with the cast that uses
639	// the IV phi. Search for the recipe of the last cast in the chain and
640	// replace it with the original IV. Note that only the final cast is
641	// expected to have users outside the cast-chain and the dead casts left
642	// over will be cleaned up later.
643	ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
644	VPValue *FindMyCast = IV;
645	for (Instruction *IRCast : reverse(C&: Casts)) {
646	VPSingleDefRecipe FoundUserCast = nullptr*;
647	for (auto *U : FindMyCast->users()) {
648	auto *UserCast = dyn_cast<VPSingleDefRecipe>(Val: U);
649	if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
650	FoundUserCast = UserCast;
651	break;
652	}
653	}
654	FindMyCast = FoundUserCast;
655	}
656	FindMyCast->replaceAllUsesWith(New: IV);
657	}
658	}
659
660	/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
661	/// recipe, if it exists.
662	static void removeRedundantCanonicalIVs(VPlan &Plan) {
663	VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
664	VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
665	VPWidenCanonicalIVRecipe WidenNewIV = nullptr*;
666	for (VPUser *U : CanonicalIV->users()) {
667	WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(Val: U);
668	if (WidenNewIV)
669	break;
670	}
671
672	if (!WidenNewIV)
673	return;
674
675	VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
676	for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
677	auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
678
679	if (!WidenOriginalIV \|\| !WidenOriginalIV->isCanonical())
680	continue;
681
682	// Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
683	// everything WidenNewIV's users need. That is, WidenOriginalIV will
684	// generate a vector phi or all users of WidenNewIV demand the first lane
685	// only.
686	if (Plan.hasScalarVFOnly() \|\|
687	!vputils::onlyScalarValuesUsed(Def: WidenOriginalIV) \|\|
688	vputils::onlyFirstLaneUsed(Def: WidenNewIV)) {
689	// We are replacing a wide canonical iv with a suitable wide induction.
690	// This is used to compute header mask, hence all lanes will be used and
691	// we need to drop wrap flags only applying to lanes guranteed to execute
692	// in the original scalar loop.
693	WidenOriginalIV->dropPoisonGeneratingFlags();
694	WidenNewIV->replaceAllUsesWith(New: WidenOriginalIV);
695	WidenNewIV->eraseFromParent();
696	return;
697	}
698	}
699	}
700
701	/// Returns true if \p R is dead and can be removed.
702	static bool isDeadRecipe(VPRecipeBase &R) {
703	// Do remove conditional assume instructions as their conditions may be
704	// flattened.
705	auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
706	bool IsConditionalAssume = RepR && RepR->isPredicated() &&
707	match(V: RepR, P: m_Intrinsic<Intrinsic::assume>());
708	if (IsConditionalAssume)
709	return true;
710
711	if (R.mayHaveSideEffects())
712	return false;
713
714	// Recipe is dead if no user keeps the recipe alive.
715	return all_of(Range: R.definedValues(),
716	P: [](VPValue V) { return* V->getNumUsers() == `0`; });
717	}
718
719	void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
720	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
721	Range: vp_post_order_deep(G: Plan.getEntry()))) {
722	// The recipes in the block are processed in reverse order, to catch chains
723	// of dead recipes.
724	for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
725	if (isDeadRecipe(R)) {
726	R.eraseFromParent();
727	continue;
728	}
729
730	// Check if R is a dead VPPhi <-> update cycle and remove it.
731	auto *PhiR = dyn_cast<VPPhi>(Val: &R);
732	if (!PhiR \|\| PhiR->getNumOperands() != `2`)
733	continue;
734	VPUser *PhiUser = PhiR->getSingleUser();
735	if (!PhiUser)
736	continue;
737	VPValue *Incoming = PhiR->getOperand(N: `1`);
738	if (PhiUser != Incoming->getDefiningRecipe() \|\|
739	Incoming->getNumUsers() != `1`)
740	continue;
741	PhiR->replaceAllUsesWith(New: PhiR->getOperand(N: `0`));
742	PhiR->eraseFromParent();
743	Incoming->getDefiningRecipe()->eraseFromParent();
744	}
745	}
746	}
747
748	static VPScalarIVStepsRecipe *
749	createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
750	Instruction::BinaryOps InductionOpcode,
751	FPMathOperator FPBinOp, Instruction TruncI,
752	VPIRValue StartV, VPValue Step, DebugLoc DL,
753	VPBuilder &Builder) {
754	VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
755	VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
756	VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
757	VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
758	Kind, FPBinOp, Start: StartV, Current: CanonicalIV, Step, Name: "offset.idx");
759
760	// Truncate base induction if needed.
761	VPTypeAnalysis TypeInfo(Plan);
762	Type *ResultTy = TypeInfo.inferScalarType(V: BaseIV);
763	if (TruncI) {
764	Type *TruncTy = TruncI->getType();
765	assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
766	"Not truncating.");
767	assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
768	BaseIV = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: BaseIV, ResultTy: TruncTy, DL);
769	ResultTy = TruncTy;
770	}
771
772	// Truncate step if needed.
773	Type *StepTy = TypeInfo.inferScalarType(V: Step);
774	if (ResultTy != StepTy) {
775	assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
776	"Not truncating.");
777	assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
778	auto *VecPreheader =
779	cast<VPBasicBlock>(Val: HeaderVPBB->getSingleHierarchicalPredecessor());
780	VPBuilder::InsertPointGuard Guard(Builder);
781	Builder.setInsertPoint(VecPreheader);
782	Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy, DL);
783	}
784	return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, IV: BaseIV, Step,
785	VF: &Plan.getVF(), DL);
786	}
787
788	static SmallVector<VPUser > collectUsersRecursively(VPValue V) {
789	SetVector<VPUser *> Users(llvm::from_range, V->users());
790	for (unsigned I = `0`; I != Users.size(); ++I) {
791	VPRecipeBase *Cur = cast<VPRecipeBase>(Val: Users [I]);
792	if (isa<VPHeaderPHIRecipe>(Val: Cur))
793	continue;
794	for (VPValue *V : Cur->definedValues())
795	Users.insert_range(R: V->users());
796	}
797	return Users.takeVector();
798	}
799
800	/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
801	/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
802	/// generates scalar values.
803	static VPValue *
804	scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
805	VPlan &Plan, VPBuilder &Builder) {
806	const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
807	VPIRValue *StartV = Plan.getZero(Ty: ID.getStep()->getType());
808	VPValue *StepV = PtrIV->getOperand(N: `1`);
809	VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
810	Plan, Kind: InductionDescriptor::IK_IntInduction, InductionOpcode: Instruction::Add, FPBinOp: nullptr,
811	TruncI: nullptr, StartV, Step: StepV, DL: PtrIV->getDebugLoc(), Builder);
812
813	return Builder.createPtrAdd(Ptr: PtrIV->getStartValue(), Offset: Steps,
814	DL: PtrIV->getDebugLoc(), Name: "next.gep");
815	}
816
817	/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
818	/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
819	/// VPWidenPointerInductionRecipe will generate vectors only. If some users
820	/// require vectors while other require scalars, the scalar uses need to extract
821	/// the scalars from the generated vectors (Note that this is different to how
822	/// int/fp inductions are handled). Legalize extract-from-ends using uniform
823	/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
824	/// the correct end value is available. Also optimize
825	/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
826	/// providing them scalar steps built on the canonical scalar IV and update the
827	/// original IV's users. This is an optional optimization to reduce the needs of
828	/// vector extracts.
829	static void legalizeAndOptimizeInductions(VPlan &Plan) {
830	VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
831	bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
832	VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
833	for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
834	auto *PhiR = dyn_cast<VPWidenInductionRecipe>(Val: &Phi);
835	if (!PhiR)
836	continue;
837
838	// Try to narrow wide and replicating recipes to uniform recipes, based on
839	// VPlan analysis.
840	// TODO: Apply to all recipes in the future, to replace legacy uniformity
841	// analysis.
842	auto Users = collectUsersRecursively(V: PhiR);
843	for (VPUser *U : reverse(C&: Users)) {
844	auto *Def = dyn_cast<VPRecipeWithIRFlags>(Val: U);
845	auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U);
846	// Skip recipes that shouldn't be narrowed.
847	if (!Def \|\| !isa<VPReplicateRecipe, VPWidenRecipe>(Val: Def) \|\|
848	Def->getNumUsers() == `0` \|\| !Def->getUnderlyingValue() \|\|
849	(RepR && (RepR->isSingleScalar() \|\| RepR->isPredicated())))
850	continue;
851
852	// Skip recipes that may have other lanes than their first used.
853	if (!vputils::isSingleScalar(VPV: Def) && !vputils::onlyFirstLaneUsed(Def))
854	continue;
855
856	auto Clone = new* VPReplicateRecipe (Def->getUnderlyingInstr(),
857	Def->operands(), /IsUniform/ true,
858	/Mask/ nullptr, /Flags/ *Def);
859	Clone->insertAfter(InsertPos: Def);
860	Def->replaceAllUsesWith(New: Clone);
861	}
862
863	// Replace wide pointer inductions which have only their scalars used by
864	// PtrAdd(IndStart, ScalarIVSteps (0, Step)).
865	if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(Val: &Phi)) {
866	if (!Plan.hasScalarVFOnly() &&
867	!PtrIV->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF()))
868	continue;
869
870	VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
871	PtrIV->replaceAllUsesWith(New: PtrAdd);
872	continue;
873	}
874
875	// Replace widened induction with scalar steps for users that only use
876	// scalars.
877	auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
878	if (HasOnlyVectorVFs && none_of(Range: WideIV->users(), P: [WideIV](VPUser *U) {
879	return U->usesScalars(Op: WideIV);
880	}))
881	continue;
882
883	const InductionDescriptor &ID = WideIV->getInductionDescriptor();
884	VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
885	Plan, Kind: ID.getKind(), InductionOpcode: ID.getInductionOpcode(),
886	FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
887	TruncI: WideIV->getTruncInst(), StartV: WideIV->getStartValue(), Step: WideIV->getStepValue(),
888	DL: WideIV->getDebugLoc(), Builder);
889
890	// Update scalar users of IV to use Step instead.
891	if (!HasOnlyVectorVFs) {
892	assert(!Plan.hasScalableVF() &&
893	"plans containing a scalar VF cannot also include scalable VFs");
894	WideIV->replaceAllUsesWith(New: Steps);
895	} else {
896	bool HasScalableVF = Plan.hasScalableVF();
897	WideIV->replaceUsesWithIf(New: Steps,
898	ShouldReplace: [WideIV, HasScalableVF](VPUser &U, unsigned) {
899	if (HasScalableVF)
900	return U.usesFirstLaneOnly(Op: WideIV);
901	return U.usesScalars(Op: WideIV);
902	});
903	}
904	}
905	}
906
907	/// Check if \p VPV is an untruncated wide induction, either before or after the
908	/// increment. If so return the header IV (before the increment), otherwise
909	/// return null.
910	static VPWidenInductionRecipe *
911	getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE) {
912	auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: VPV);
913	if (WideIV) {
914	// VPV itself is a wide induction, separately compute the end value for exit
915	// users if it is not a truncated IV.
916	auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
917	return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
918	}
919
920	// Check if VPV is an optimizable induction increment.
921	VPRecipeBase *Def = VPV->getDefiningRecipe();
922	if (!Def \|\| Def->getNumOperands() != `2`)
923	return nullptr;
924	WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: `0`));
925	if (!WideIV)
926	WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: `1`));
927	if (!WideIV)
928	return nullptr;
929
930	auto IsWideIVInc = [&]() {
931	auto &ID = WideIV->getInductionDescriptor();
932
933	// Check if VPV increments the induction by the induction step.
934	VPValue *IVStep = WideIV->getStepValue();
935	switch (ID.getInductionOpcode()) {
936	case Instruction::Add:
937	return match(V: VPV, P: m_c_Add(Op0: m_Specific(VPV: WideIV), Op1: m_Specific(VPV: IVStep)));
938	case Instruction::FAdd:
939	return match(V: VPV, P: m_c_FAdd(Op0: m_Specific(VPV: WideIV), Op1: m_Specific(VPV: IVStep)));
940	case Instruction::FSub:
941	return match(V: VPV, P: m_Binary<Instruction::FSub>(Op0: m_Specific(VPV: WideIV),
942	Op1: m_Specific(VPV: IVStep)));
943	case Instruction::Sub: {
944	// IVStep will be the negated step of the subtraction. Check if Step == -1
945	// IVStep.*
946	VPValue *Step;
947	if (!match(V: VPV, P: m_Sub(Op0: m_VPValue(), Op1: m_VPValue(V&: Step))))
948	return false;
949	const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(V: IVStep, PSE);
950	const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(V: Step, PSE);
951	ScalarEvolution &SE = *PSE.getSE();
952	return !isa<SCEVCouldNotCompute>(Val: IVStepSCEV) &&
953	!isa<SCEVCouldNotCompute>(Val: StepSCEV) &&
954	IVStepSCEV == SE.getNegativeSCEV(V: StepSCEV);
955	}
956	default:
957	return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
958	match(V: VPV, P: m_GetElementPtr(Op0: m_Specific(VPV: WideIV),
959	Op1: m_Specific(VPV: WideIV->getStepValue())));
960	}
961	llvm_unreachable("should have been covered by switch above");
962	};
963	return IsWideIVInc () ? WideIV : nullptr;
964	}
965
966	/// Attempts to optimize the induction variable exit values for users in the
967	/// early exit block.
968	static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
969	VPTypeAnalysis &TypeInfo,
970	VPBlockBase *PredVPBB,
971	VPValue *Op,
972	PredicatedScalarEvolution &PSE) {
973	VPValue Incoming, Mask;
974	if (!match(V: Op, P: m_ExtractLane(Op0: m_FirstActiveLane(Op0: m_VPValue(V&: Mask)),
975	Op1: m_VPValue(V&: Incoming))))
976	return nullptr;
977
978	auto *WideIV = getOptimizableIVOf(VPV: Incoming, PSE);
979	if (!WideIV)
980	return nullptr;
981
982	auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
983	if (WideIntOrFp && WideIntOrFp->getTruncInst())
984	return nullptr;
985
986	// Calculate the final index.
987	VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
988	auto *CanonicalIV = LoopRegion->getCanonicalIV();
989	Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
990	VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB));
991
992	DebugLoc DL = cast<VPInstruction>(Val: Op)->getDebugLoc();
993	VPValue *FirstActiveLane =
994	B.createNaryOp(Opcode: VPInstruction::FirstActiveLane, Operands: Mask, DL);
995	Type *FirstActiveLaneType = TypeInfo.inferScalarType(V: FirstActiveLane);
996	FirstActiveLane = B.createScalarZExtOrTrunc(Op: FirstActiveLane, ResultTy: CanonicalIVType,
997	SrcTy: FirstActiveLaneType, DL);
998	VPValue *EndValue = B.createAdd(LHS: CanonicalIV, RHS: FirstActiveLane, DL);
999
1000	// `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1001	// changed it means the exit is using the incremented value, so we need to
1002	// add the step.
1003	if (Incoming != WideIV) {
1004	VPValue *One = Plan.getConstantInt(Ty: CanonicalIVType, Val: `1`);
1005	EndValue = B.createAdd(LHS: EndValue, RHS: One, DL);
1006	}
1007
1008	if (!WideIntOrFp \|\| !WideIntOrFp->isCanonical()) {
1009	const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1010	VPIRValue *Start = WideIV->getStartValue();
1011	VPValue *Step = WideIV->getStepValue();
1012	EndValue = B.createDerivedIV(
1013	Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
1014	Start, Current: EndValue, Step);
1015	}
1016
1017	return EndValue;
1018	}
1019
1020	/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1021	/// VPDerivedIVRecipe for non-canonical inductions.
1022	static VPValue tryToComputeEndValueForInduction(VPWidenInductionRecipe WideIV,
1023	VPBuilder &VectorPHBuilder,
1024	VPTypeAnalysis &TypeInfo,
1025	VPValue *VectorTC) {
1026	auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
1027	// Truncated wide inductions resume from the last lane of their vector value
1028	// in the last vector iteration which is handled elsewhere.
1029	if (WideIntOrFp && WideIntOrFp->getTruncInst())
1030	return nullptr;
1031
1032	VPIRValue *Start = WideIV->getStartValue();
1033	VPValue *Step = WideIV->getStepValue();
1034	const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1035	VPValue *EndValue = VectorTC;
1036	if (!WideIntOrFp \|\| !WideIntOrFp->isCanonical()) {
1037	EndValue = VectorPHBuilder.createDerivedIV(
1038	Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
1039	Start, Current: VectorTC, Step);
1040	}
1041
1042	// EndValue is derived from the vector trip count (which has the same type as
1043	// the widest induction) and thus may be wider than the induction here.
1044	Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(V: WideIV);
1045	if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(V: EndValue)) {
1046	EndValue = VectorPHBuilder.createScalarCast(Opcode: Instruction::Trunc, Op: EndValue,
1047	ResultTy: ScalarTypeOfWideIV,
1048	DL: WideIV->getDebugLoc());
1049	}
1050
1051	return EndValue;
1052	}
1053
1054	/// Attempts to optimize the induction variable exit values for users in the
1055	/// exit block coming from the latch in the original scalar loop.
1056	static VPValue *optimizeLatchExitInductionUser(
1057	VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase PredVPBB, VPValue Op,
1058	DenseMap<VPValue , VPValue > &EndValues, PredicatedScalarEvolution &PSE) {
1059	VPValue *Incoming;
1060	VPWidenInductionRecipe WideIV = nullptr*;
1061	if (match(V: Op, P: m_ExtractLastLaneOfLastPart(Op0: m_VPValue(V&: Incoming))))
1062	WideIV = getOptimizableIVOf(VPV: Incoming, PSE);
1063
1064	if (!WideIV)
1065	return nullptr;
1066
1067	VPValue *EndValue = EndValues.lookup(Val: WideIV);
1068	assert(EndValue && "Must have computed the end value up front");
1069
1070	// `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1071	// changed it means the exit is using the incremented value, so we don't
1072	// need to subtract the step.
1073	if (Incoming != WideIV)
1074	return EndValue;
1075
1076	// Otherwise, subtract the step from the EndValue.
1077	VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB)->getTerminator());
1078	VPValue *Step = WideIV->getStepValue();
1079	Type *ScalarTy = TypeInfo.inferScalarType(V: WideIV);
1080	if (ScalarTy->isIntegerTy())
1081	return B.createSub(LHS: EndValue, RHS: Step, DL: DebugLoc::getUnknown(), Name: "ind.escape");
1082	if (ScalarTy->isPointerTy()) {
1083	Type *StepTy = TypeInfo.inferScalarType(V: Step);
1084	auto *Zero = Plan.getZero(Ty: StepTy);
1085	return B.createPtrAdd(Ptr: EndValue, Offset: B.createSub(LHS: Zero, RHS: Step),
1086	DL: DebugLoc::getUnknown(), Name: "ind.escape");
1087	}
1088	if (ScalarTy->isFloatingPointTy()) {
1089	const auto &ID = WideIV->getInductionDescriptor();
1090	return B.createNaryOp(
1091	Opcode: ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1092	? Instruction::FSub
1093	: Instruction::FAdd,
1094	Operands: {EndValue, Step}, Flags: {ID.getInductionBinOp()->getFastMathFlags()});
1095	}
1096	llvm_unreachable("all possible induction types must be handled");
1097	return nullptr;
1098	}
1099
1100	void VPlanTransforms::optimizeInductionLiveOutUsers(
1101	VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1102	// Compute end values for all inductions.
1103	VPTypeAnalysis TypeInfo(Plan);
1104	VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1105	auto *VectorPH = cast<VPBasicBlock>(Val: VectorRegion->getSinglePredecessor());
1106	VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
1107	DenseMap<VPValue , VPValue > EndValues;
1108	VPValue *ResumeTC =
1109	FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1110	for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1111	auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: &Phi);
1112	if (!WideIV)
1113	continue;
1114	if (VPValue *EndValue = tryToComputeEndValueForInduction(
1115	WideIV, VectorPHBuilder, TypeInfo, VectorTC: ResumeTC))
1116	EndValues [WideIV] = EndValue;
1117	}
1118
1119	VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1120	for (VPRecipeBase &R : make_early_inc_range(Range&: *MiddleVPBB)) {
1121	VPValue *Op;
1122	if (!match(V: &R, P: m_ExitingIVValue(Op0: m_VPValue(V&: Op))))
1123	continue;
1124	auto *WideIV = cast<VPWidenInductionRecipe>(Val: Op);
1125	if (VPValue *EndValue = EndValues.lookup(Val: WideIV)) {
1126	R.getVPSingleValue()->replaceAllUsesWith(New: EndValue);
1127	R.eraseFromParent();
1128	}
1129	}
1130
1131	// Then, optimize exit block users.
1132	for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1133	for (VPRecipeBase &R : ExitVPBB->phis()) {
1134	auto *ExitIRI = cast<VPIRPhi>(Val: &R);
1135
1136	for (auto [Idx, PredVPBB] : enumerate(First&: ExitVPBB->getPredecessors())) {
1137	VPValue Escape = nullptr*;
1138	if (PredVPBB == MiddleVPBB)
1139	Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1140	Op: ExitIRI->getOperand(N: Idx),
1141	EndValues, PSE);
1142	else
1143	Escape = optimizeEarlyExitInductionUser(
1144	Plan, TypeInfo, PredVPBB, Op: ExitIRI->getOperand(N: Idx), PSE);
1145	if (Escape)
1146	ExitIRI->setOperand(I: Idx, New: Escape);
1147	}
1148	}
1149	}
1150	}
1151
1152	/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing
1153	/// them with already existing recipes expanding the same SCEV expression.
1154	static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
1155	DenseMap<const SCEV , VPValue > SCEV2VPV;
1156
1157	for (VPRecipeBase &R :
1158	make_early_inc_range(Range&: *Plan.getEntry()->getEntryBasicBlock())) {
1159	auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
1160	if (!ExpR)
1161	continue;
1162
1163	const auto &[V, Inserted] = SCEV2VPV.try_emplace(Key: ExpR->getSCEV(), Args&: ExpR);
1164	if (Inserted)
1165	continue;
1166	ExpR->replaceAllUsesWith(New: V ->second);
1167	ExpR->eraseFromParent();
1168	}
1169	}
1170
1171	static void recursivelyDeleteDeadRecipes(VPValue *V) {
1172	SmallVector<VPValue *> WorkList;
1173	SmallPtrSet<VPValue *, `8`> Seen;
1174	WorkList.push_back(Elt: V);
1175
1176	while (!WorkList.empty()) {
1177	VPValue *Cur = WorkList.pop_back_val();
1178	if (!Seen.insert(Ptr: Cur).second)
1179	continue;
1180	VPRecipeBase *R = Cur->getDefiningRecipe();
1181	if (!R)
1182	continue;
1183	if (!isDeadRecipe(R&: *R))
1184	continue;
1185	append_range(C&: WorkList, R: R->operands());
1186	R->eraseFromParent();
1187	}
1188	}
1189
1190	/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1191	/// Returns an optional pair, where the first element indicates whether it is
1192	/// an intrinsic ID.
1193	static std::optional<std::pair<bool, unsigned>>
1194	getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1195	return TypeSwitch<const VPSingleDefRecipe *,
1196	std::optional<std::pair<bool, unsigned>>>(R)
1197	.Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, VPWidenGEPRecipe,
1198	VPReplicateRecipe>(
1199	caseFn: [](auto I) { return* std::make_pair(false, I->getOpcode()); })
1200	.Case(caseFn: [](const VPWidenIntrinsicRecipe *I) {
1201	return std::make_pair(x: true, y: I->getVectorIntrinsicID());
1202	})
1203	.Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>(caseFn: [](auto *I) {
1204	// For recipes that do not directly map to LLVM IR instructions,
1205	// assign opcodes after the last VPInstruction opcode (which is also
1206	// after the last IR Instruction opcode), based on the VPRecipeID.
1207	return std::make_pair(false,
1208	VPInstruction::OpsEnd + `1` + I->getVPRecipeID());
1209	})
1210	.Default(defaultFn: [](auto ) { return* std::nullopt; });
1211	}
1212
1213	/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1214	/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1215	/// Operands are foldable live-ins.
1216	static VPIRValue *tryToFoldLiveIns(VPSingleDefRecipe &R,
1217	ArrayRef<VPValue *> Operands,
1218	const DataLayout &DL,
1219	VPTypeAnalysis &TypeInfo) {
1220	auto OpcodeOrIID = getOpcodeOrIntrinsicID(R: &R);
1221	if (!OpcodeOrIID)
1222	return nullptr;
1223
1224	SmallVector<Value *, `4`> Ops;
1225	for (VPValue *Op : Operands) {
1226	if (!match(V: Op, P: m_LiveIn()))
1227	return nullptr;
1228	Value *V = Op->getUnderlyingValue();
1229	if (!V)
1230	return nullptr;
1231	Ops.push_back(Elt: V);
1232	}
1233
1234	auto FoldToIRValue = [&]() -> Value * {
1235	InstSimplifyFolder Folder(DL);
1236	if (OpcodeOrIID ->first) {
1237	if (R.getNumOperands() != `2`)
1238	return nullptr;
1239	unsigned ID = OpcodeOrIID ->second;
1240	return Folder.FoldBinaryIntrinsic(ID, LHS: Ops [`0`], RHS: Ops [`1`],
1241	Ty: TypeInfo.inferScalarType(V: &R));
1242	}
1243	unsigned Opcode = OpcodeOrIID ->second;
1244	if (Instruction::isBinaryOp(Opcode))
1245	return Folder.FoldBinOp(Opc: static_cast<Instruction::BinaryOps>(Opcode),
1246	LHS: Ops [`0`], RHS: Ops [`1`]);
1247	if (Instruction::isCast(Opcode))
1248	return Folder.FoldCast(Op: static_cast<Instruction::CastOps>(Opcode), V: Ops [`0`],
1249	DestTy: TypeInfo.inferScalarType(V: R.getVPSingleValue()));
1250	switch (Opcode) {
1251	case VPInstruction::LogicalAnd:
1252	return Folder.FoldSelect(C: Ops [`0`], True: Ops [`1`],
1253	False: ConstantInt::getNullValue(Ty: Ops [`1`]->getType()));
1254	case VPInstruction::Not:
1255	return Folder.FoldBinOp(Opc: Instruction::BinaryOps::Xor, LHS: Ops [`0`],
1256	RHS: Constant::getAllOnesValue(Ty: Ops [`0`]->getType()));
1257	case Instruction::Select:
1258	return Folder.FoldSelect(C: Ops [`0`], True: Ops [`1`], False: Ops [`2`]);
1259	case Instruction::ICmp:
1260	case Instruction::FCmp:
1261	return Folder.FoldCmp(P: cast<VPRecipeWithIRFlags>(Val&: R).getPredicate(), LHS: Ops [`0`],
1262	RHS: Ops [`1`]);
1263	case Instruction::GetElementPtr: {
1264	auto &RFlags = cast<VPRecipeWithIRFlags>(Val&: R);
1265	auto *GEP = cast<GetElementPtrInst>(Val: RFlags.getUnderlyingInstr());
1266	return Folder.FoldGEP(Ty: GEP->getSourceElementType(), Ptr: Ops [`0`],
1267	IdxList: drop_begin(RangeOrContainer&: Ops), NW: RFlags.getGEPNoWrapFlags());
1268	}
1269	case VPInstruction::PtrAdd:
1270	case VPInstruction::WidePtrAdd:
1271	return Folder.FoldGEP(Ty: IntegerType::getInt8Ty(C&: TypeInfo.getContext()),
1272	Ptr: Ops [`0`], IdxList: Ops [`1`],
1273	NW: cast<VPRecipeWithIRFlags>(Val&: R).getGEPNoWrapFlags());
1274	// An extract of a live-in is an extract of a broadcast, so return the
1275	// broadcasted element.
1276	case Instruction::ExtractElement:
1277	assert(!Ops[`0`]->getType()->isVectorTy() && "Live-ins should be scalar");
1278	return Ops [`0`];
1279	}
1280	return nullptr;
1281	};
1282
1283	if (Value *V = FoldToIRValue ())
1284	return R.getParent()->getPlan()->getOrAddLiveIn(V);
1285	return nullptr;
1286	}
1287
1288	/// Try to simplify VPSingleDefRecipe \p Def.
1289	static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
1290	VPlan *Plan = Def->getParent()->getPlan();
1291
1292	// Simplification of live-in IR values for SingleDef recipes using
1293	// InstSimplifyFolder.
1294	const DataLayout &DL = Plan->getDataLayout();
1295	if (VPValue V = tryToFoldLiveIns(R&: Def, Operands: Def->operands(), DL, TypeInfo))
1296	return Def->replaceAllUsesWith(New: V);
1297
1298	// Fold PredPHI LiveIn -> LiveIn.
1299	if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Val: Def)) {
1300	VPValue *Op = PredPHI->getOperand(N: `0`);
1301	if (isa<VPIRValue>(Val: Op))
1302	PredPHI->replaceAllUsesWith(New: Op);
1303	}
1304
1305	VPBuilder Builder(Def);
1306
1307	// Avoid replacing VPInstructions with underlying values with new
1308	// VPInstructions, as we would fail to create widen/replicate recpes from the
1309	// new VPInstructions without an underlying value, and miss out on some
1310	// transformations that only apply to widened/replicated recipes later, by
1311	// doing so.
1312	// TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1313	// VPInstructions without underlying values, as those will get skipped during
1314	// cost computation.
1315	bool CanCreateNewRecipe =
1316	!isa<VPInstruction>(Val: Def) \|\| !Def->getUnderlyingValue();
1317
1318	VPValue *A;
1319	if (match(R: Def, P: m_Trunc(Op0: m_ZExtOrSExt(Op0: m_VPValue(V&: A))))) {
1320	Type *TruncTy = TypeInfo.inferScalarType(V: Def);
1321	Type *ATy = TypeInfo.inferScalarType(V: A);
1322	if (TruncTy == ATy) {
1323	Def->replaceAllUsesWith(New: A);
1324	} else {
1325	// Don't replace a non-widened cast recipe with a widened cast.
1326	if (!isa<VPWidenCastRecipe>(Val: Def))
1327	return;
1328	if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1329
1330	unsigned ExtOpcode = match(V: Def->getOperand(N: `0`), P: m_SExt(Op0: m_VPValue()))
1331	? Instruction::SExt
1332	: Instruction::ZExt;
1333	auto *Ext = Builder.createWidenCast(Opcode: Instruction::CastOps(ExtOpcode), Op: A,
1334	ResultTy: TruncTy);
1335	if (auto *UnderlyingExt = Def->getOperand(N: `0`)->getUnderlyingValue()) {
1336	// UnderlyingExt has distinct return type, used to retain legacy cost.
1337	Ext->setUnderlyingValue(UnderlyingExt);
1338	}
1339	Def->replaceAllUsesWith(New: Ext);
1340	} else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1341	auto *Trunc = Builder.createWidenCast(Opcode: Instruction::Trunc, Op: A, ResultTy: TruncTy);
1342	Def->replaceAllUsesWith(New: Trunc);
1343	}
1344	}
1345	#ifndef NDEBUG
1346	// Verify that the cached type info is for both A and its users is still
1347	// accurate by comparing it to freshly computed types.
1348	VPTypeAnalysis TypeInfo2(*Plan);
1349	assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1350	for (VPUser *U : A->users()) {
1351	auto *R = cast<VPRecipeBase>(U);
1352	for (VPValue *VPV : R->definedValues())
1353	assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1354	}
1355	#endif
1356	}
1357
1358	// Simplify (X && Y) \| (X && !Y) -> X.
1359	// TODO: Split up into simpler, modular combines: (X && Y) \| (X && Z) into X
1360	// && (Y \| Z) and (X \| !X) into true. This requires queuing newly created
1361	// recipes to be visited during simplification.
1362	VPValue X, Y, *Z;
1363	if (match(R: Def,
1364	P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
1365	Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_Not(Op0: m_Deferred(V: Y)))))) {
1366	Def->replaceAllUsesWith(New: X);
1367	Def->eraseFromParent();
1368	return;
1369	}
1370
1371	// x \| AllOnes -> AllOnes
1372	if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_AllOnes())))
1373	return Def->replaceAllUsesWith(
1374	New: Plan->getAllOnesValue(Ty: TypeInfo.inferScalarType(V: Def)));
1375
1376	// x \| 0 -> x
1377	if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_ZeroInt())))
1378	return Def->replaceAllUsesWith(New: X);
1379
1380	// x \| !x -> AllOnes
1381	if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_Not(Op0: m_Deferred(V: X)))))
1382	return Def->replaceAllUsesWith(
1383	New: Plan->getAllOnesValue(Ty: TypeInfo.inferScalarType(V: Def)));
1384
1385	// x & 0 -> 0
1386	if (match(R: Def, P: m_c_BinaryAnd(Op0: m_VPValue(V&: X), Op1: m_ZeroInt())))
1387	return Def->replaceAllUsesWith(
1388	New: Plan->getZero(Ty: TypeInfo.inferScalarType(V: Def)));
1389
1390	// x & AllOnes -> x
1391	if (match(R: Def, P: m_c_BinaryAnd(Op0: m_VPValue(V&: X), Op1: m_AllOnes())))
1392	return Def->replaceAllUsesWith(New: X);
1393
1394	// x && false -> false
1395	if (match(R: Def, P: m_c_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_False())))
1396	return Def->replaceAllUsesWith(New: Plan->getFalse());
1397
1398	// x && true -> x
1399	if (match(R: Def, P: m_c_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_True())))
1400	return Def->replaceAllUsesWith(New: X);
1401
1402	// (x && y) \| (x && z) -> x && (y \| z)
1403	if (CanCreateNewRecipe &&
1404	match(R: Def, P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
1405	Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_VPValue(V&: Z)))) &&
1406	// Simplify only if one of the operands has one use to avoid creating an
1407	// extra recipe.
1408	(!Def->getOperand(N: `0`)->hasMoreThanOneUniqueUser() \|\|
1409	!Def->getOperand(N: `1`)->hasMoreThanOneUniqueUser()))
1410	return Def->replaceAllUsesWith(
1411	New: Builder.createLogicalAnd(LHS: X, RHS: Builder.createOr(LHS: Y, RHS: Z)));
1412
1413	// x && (x && y) -> x && y
1414	if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X),
1415	Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_VPValue()))))
1416	return Def->replaceAllUsesWith(New: Def->getOperand(N: `1`));
1417
1418	// x && (y && x) -> x && y
1419	if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X),
1420	Op1: m_LogicalAnd(Op0: m_VPValue(V&: Y), Op1: m_Deferred(V: X)))))
1421	return Def->replaceAllUsesWith(New: Builder.createLogicalAnd(LHS: X, RHS: Y));
1422
1423	// x && !x -> 0
1424	if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_Not(Op0: m_Deferred(V: X)))))
1425	return Def->replaceAllUsesWith(New: Plan->getFalse());
1426
1427	if (match(R: Def, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: X), Op2: m_Deferred(V: X))))
1428	return Def->replaceAllUsesWith(New: X);
1429
1430	// select c, false, true -> not c
1431	VPValue *C;
1432	if (CanCreateNewRecipe &&
1433	match(R: Def, P: m_Select(Op0: m_VPValue(V&: C), Op1: m_False(), Op2: m_True())))
1434	return Def->replaceAllUsesWith(New: Builder.createNot(Operand: C));
1435
1436	// select !c, x, y -> select c, y, x
1437	if (match(R: Def, P: m_Select(Op0: m_Not(Op0: m_VPValue(V&: C)), Op1: m_VPValue(V&: X), Op2: m_VPValue(V&: Y)))) {
1438	Def->setOperand(I: `0`, New: C);
1439	Def->setOperand(I: `1`, New: Y);
1440	Def->setOperand(I: `2`, New: X);
1441	return;
1442	}
1443
1444	if (match(R: Def, P: m_c_Add(Op0: m_VPValue(V&: A), Op1: m_ZeroInt())))
1445	return Def->replaceAllUsesWith(New: A);
1446
1447	if (match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_One())))
1448	return Def->replaceAllUsesWith(New: A);
1449
1450	if (match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_ZeroInt())))
1451	return Def->replaceAllUsesWith(
1452	New: Plan->getZero(Ty: TypeInfo.inferScalarType(V: Def)));
1453
1454	const APInt *APC;
1455	if (CanCreateNewRecipe && match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_APInt(C&: APC))) &&
1456	APC->isPowerOf2())
1457	return Def->replaceAllUsesWith(New: Builder.createNaryOp(
1458	Opcode: Instruction::Shl,
1459	Operands: {A, Plan->getConstantInt(BitWidth: APC->getBitWidth(), Val: APC->exactLogBase2())},
1460	Flags: *cast<VPRecipeWithIRFlags>(Val: Def), DL: Def->getDebugLoc()));
1461
1462	// Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1463	// not allowed in them.
1464	const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1465	bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1466	if (CanCreateNewRecipe && !IsInReplicateRegion &&
1467	match(R: Def, P: m_UDiv(Op0: m_VPValue(V&: A), Op1: m_APInt(C&: APC))) && APC->isPowerOf2())
1468	return Def->replaceAllUsesWith(New: Builder.createNaryOp(
1469	Opcode: Instruction::LShr,
1470	Operands: {A, Plan->getConstantInt(BitWidth: APC->getBitWidth(), Val: APC->exactLogBase2())},
1471	Flags: *cast<VPRecipeWithIRFlags>(Val: Def), DL: Def->getDebugLoc()));
1472
1473	if (match(R: Def, P: m_Not(Op0: m_VPValue(V&: A)))) {
1474	if (match(V: A, P: m_Not(Op0: m_VPValue(V&: A))))
1475	return Def->replaceAllUsesWith(New: A);
1476
1477	// Try to fold Not into compares by adjusting the predicate in-place.
1478	CmpPredicate Pred;
1479	if (match(V: A, P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()))) {
1480	auto *Cmp = cast<VPRecipeWithIRFlags>(Val: A);
1481	if (all_of(Range: Cmp->users(),
1482	P: match_fn(P: m_CombineOr(
1483	L: m_Not(Op0: m_Specific(VPV: Cmp)),
1484	R: m_Select(Op0: m_Specific(VPV: Cmp), Op1: m_VPValue(), Op2: m_VPValue()))))) {
1485	Cmp->setPredicate(CmpInst::getInversePredicate(pred: Pred));
1486	for (VPUser *U : to_vector(Range: Cmp->users())) {
1487	auto *R = cast<VPSingleDefRecipe>(Val: U);
1488	if (match(R, P: m_Select(Op0: m_Specific(VPV: Cmp), Op1: m_VPValue(V&: X), Op2: m_VPValue(V&: Y)))) {
1489	// select (cmp pred), x, y -> select (cmp inv_pred), y, x
1490	R->setOperand(I: `1`, New: Y);
1491	R->setOperand(I: `2`, New: X);
1492	} else {
1493	// not (cmp pred) -> cmp inv_pred
1494	assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1495	R->replaceAllUsesWith(New: Cmp);
1496	}
1497	}
1498	// If Cmp doesn't have a debug location, use the one from the negation,
1499	// to preserve the location.
1500	if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1501	Cmp->setDebugLoc(Def->getDebugLoc());
1502	}
1503	}
1504	}
1505
1506	// Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1507	// any-of (fcmp uno %A, %B), ...
1508	if (match(R: Def, P: m_AnyOf())) {
1509	SmallVector<VPValue *, `4`> NewOps;
1510	VPRecipeBase UnpairedCmp = nullptr*;
1511	for (VPValue *Op : Def->operands()) {
1512	VPValue *X;
1513	if (Op->getNumUsers() > `1` \|\|
1514	!match(V: Op, P: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: X),
1515	Op1: m_Deferred(V: X)))) {
1516	NewOps.push_back(Elt: Op);
1517	} else if (!UnpairedCmp) {
1518	UnpairedCmp = Op->getDefiningRecipe();
1519	} else {
1520	NewOps.push_back(Elt: Builder.createFCmp(Pred: CmpInst::FCMP_UNO,
1521	A: UnpairedCmp->getOperand(N: `0`), B: X));
1522	UnpairedCmp = nullptr;
1523	}
1524	}
1525
1526	if (UnpairedCmp)
1527	NewOps.push_back(Elt: UnpairedCmp->getVPSingleValue());
1528
1529	if (NewOps.size() < Def->getNumOperands()) {
1530	VPValue *NewAnyOf = Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: NewOps);
1531	return Def->replaceAllUsesWith(New: NewAnyOf);
1532	}
1533	}
1534
1535	// Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1536	// This is useful for fmax/fmin without fast-math flags, where we need to
1537	// check if any operand is NaN.
1538	if (CanCreateNewRecipe &&
1539	match(R: Def, P: m_BinaryOr(Op0: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: X),
1540	Op1: m_Deferred(V: X)),
1541	Op1: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: Y),
1542	Op1: m_Deferred(V: Y))))) {
1543	VPValue *NewCmp = Builder.createFCmp(Pred: CmpInst::FCMP_UNO, A: X, B: Y);
1544	return Def->replaceAllUsesWith(New: NewCmp);
1545	}
1546
1547	// Remove redundant DerviedIVs, that is 0 + A 1 -> A and 0 + 0 * x -> 0.*
1548	if ((match(R: Def, P: m_DerivedIV(Op0: m_ZeroInt(), Op1: m_VPValue(V&: A), Op2: m_One())) \|\|
1549	match(R: Def, P: m_DerivedIV(Op0: m_ZeroInt(), Op1: m_ZeroInt(), Op2: m_VPValue()))) &&
1550	TypeInfo.inferScalarType(V: Def->getOperand(N: `1`)) ==
1551	TypeInfo.inferScalarType(V: Def))
1552	return Def->replaceAllUsesWith(New: Def->getOperand(N: `1`));
1553
1554	if (match(R: Def, P: m_VPInstruction<VPInstruction::WideIVStep>(Ops: m_VPValue(V&: X),
1555	Ops: m_One()))) {
1556	Type *WideStepTy = TypeInfo.inferScalarType(V: Def);
1557	if (TypeInfo.inferScalarType(V: X) != WideStepTy)
1558	X = Builder.createWidenCast(Opcode: Instruction::Trunc, Op: X, ResultTy: WideStepTy);
1559	Def->replaceAllUsesWith(New: X);
1560	return;
1561	}
1562
1563	// For i1 vp.merges produced by AnyOf reductions:
1564	// vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1565	if (match(R: Def, P: m_Intrinsic<Intrinsic::vp_merge>(Op0: m_True(), Op1: m_VPValue(V&: A),
1566	Op2: m_VPValue(V&: X), Op3: m_VPValue())) &&
1567	match(V: A, P: m_c_BinaryOr(Op0: m_Specific(VPV: X), Op1: m_VPValue(V&: Y))) &&
1568	TypeInfo.inferScalarType(V: Def)->isIntegerTy(Bitwidth: `1`)) {
1569	Def->setOperand(I: `1`, New: Def->getOperand(N: `0`));
1570	Def->setOperand(I: `0`, New: Y);
1571	return;
1572	}
1573
1574	if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: Def)) {
1575	if (Phi->getOperand(N: `0`) == Phi->getOperand(N: `1`))
1576	Phi->replaceAllUsesWith(New: Phi->getOperand(N: `0`));
1577	return;
1578	}
1579
1580	// Simplify MaskedCond with no block mask to its single operand.
1581	if (match(R: Def, P: m_VPInstruction<VPInstruction::MaskedCond>()) &&
1582	!cast<VPInstruction>(Val: Def)->isMasked())
1583	return Def->replaceAllUsesWith(New: Def->getOperand(N: `0`));
1584
1585	// Look through ExtractLastLane.
1586	if (match(R: Def, P: m_ExtractLastLane(Op0: m_VPValue(V&: A)))) {
1587	if (match(V: A, P: m_BuildVector())) {
1588	auto *BuildVector = cast<VPInstruction>(Val: A);
1589	Def->replaceAllUsesWith(
1590	New: BuildVector->getOperand(N: BuildVector->getNumOperands() - `1`));
1591	return;
1592	}
1593	if (Plan->hasScalarVFOnly())
1594	return Def->replaceAllUsesWith(New: A);
1595	}
1596
1597	// Look through ExtractPenultimateElement (BuildVector ....).
1598	if (match(R: Def, P: m_ExtractPenultimateElement(Op0: m_BuildVector()))) {
1599	auto *BuildVector = cast<VPInstruction>(Val: Def->getOperand(N: `0`));
1600	Def->replaceAllUsesWith(
1601	New: BuildVector->getOperand(N: BuildVector->getNumOperands() - `2`));
1602	return;
1603	}
1604
1605	uint64_t Idx;
1606	if (match(R: Def, P: m_ExtractElement(Op0: m_BuildVector(), Op1: m_ConstantInt(C&: Idx)))) {
1607	auto *BuildVector = cast<VPInstruction>(Val: Def->getOperand(N: `0`));
1608	Def->replaceAllUsesWith(New: BuildVector->getOperand(N: Idx));
1609	return;
1610	}
1611
1612	if (match(R: Def, P: m_BuildVector()) && all_equal(Range: Def->operands())) {
1613	Def->replaceAllUsesWith(
1614	New: Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Def->getOperand(N: `0`)));
1615	return;
1616	}
1617
1618	// Look through broadcast of single-scalar when used as select conditions; in
1619	// that case the scalar condition can be used directly.
1620	if (match(R: Def,
1621	P: m_Select(Op0: m_Broadcast(Op0: m_VPValue(V&: C)), Op1: m_VPValue(), Op2: m_VPValue()))) {
1622	assert(vputils::isSingleScalar(C) &&
1623	"broadcast operand must be single-scalar");
1624	Def->setOperand(I: `0`, New: C);
1625	return;
1626	}
1627
1628	if (isa<VPPhi, VPWidenPHIRecipe>(Val: Def)) {
1629	if (Def->getNumOperands() == `1`)
1630	Def->replaceAllUsesWith(New: Def->getOperand(N: `0`));
1631	return;
1632	}
1633
1634	VPIRValue *IRV;
1635	if (Def->getNumOperands() == `1` &&
1636	match(R: Def, P: m_ComputeReductionResult(Op0: m_VPIRValue(V&: IRV))))
1637	return Def->replaceAllUsesWith(New: IRV);
1638
1639	// Some simplifications can only be applied after unrolling. Perform them
1640	// below.
1641	if (!Plan->isUnrolled())
1642	return;
1643
1644	// After unrolling, extract-lane may be used to extract values from multiple
1645	// scalar sources. Only simplify when extracting from a single scalar source.
1646	VPValue *LaneToExtract;
1647	if (match(R: Def, P: m_ExtractLane(Op0: m_VPValue(V&: LaneToExtract), Op1: m_VPValue(V&: A)))) {
1648	// Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1649	if (vputils::isSingleScalar(VPV: A))
1650	return Def->replaceAllUsesWith(New: A);
1651
1652	// Simplify extract-lane with single source to extract-element.
1653	Def->replaceAllUsesWith(New: Builder.createNaryOp(
1654	Opcode: Instruction::ExtractElement, Operands: {A, LaneToExtract}, DL: Def->getDebugLoc()));
1655	return;
1656	}
1657
1658	// Hoist an invariant increment Y of a phi X, by having X start at Y.
1659	if (match(R: Def, P: m_c_Add(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y))) && isa<VPIRValue>(Val: Y) &&
1660	isa<VPPhi>(Val: X)) {
1661	auto *Phi = cast<VPPhi>(Val: X);
1662	if (Phi->getOperand(N: `1`) != Def && match(V: Phi->getOperand(N: `0`), P: m_ZeroInt()) &&
1663	Phi->getSingleUser() == Def) {
1664	Phi->setOperand(I: `0`, New: Y);
1665	Def->replaceAllUsesWith(New: Phi);
1666	return;
1667	}
1668	}
1669
1670	// Simplify unrolled VectorPointer without offset, or with zero offset, to
1671	// just the pointer operand.
1672	if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Val: Def))
1673	if (!VPR->getOffset() \|\| match(V: VPR->getOffset(), P: m_ZeroInt()))
1674	return VPR->replaceAllUsesWith(New: VPR->getOperand(N: `0`));
1675
1676	// VPScalarIVSteps after unrolling can be replaced by their start value, if
1677	// the start index is zero and only the first lane 0 is demanded.
1678	if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: Def)) {
1679	if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Def: Steps)) {
1680	Steps->replaceAllUsesWith(New: Steps->getOperand(N: `0`));
1681	return;
1682	}
1683	}
1684	// Simplify redundant ReductionStartVector recipes after unrolling.
1685	VPValue *StartV;
1686	if (match(R: Def, P: m_VPInstruction<VPInstruction::ReductionStartVector>(
1687	Ops: m_VPValue(V&: StartV), Ops: m_VPValue(), Ops: m_VPValue()))) {
1688	Def->replaceUsesWithIf(New: StartV, ShouldReplace: [](const VPUser &U, unsigned Idx) {
1689	auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &U);
1690	return PhiR && PhiR->isInLoop();
1691	});
1692	return;
1693	}
1694
1695	if (match(R: Def, P: m_ExtractLastLane(Op0: m_Broadcast(Op0: m_VPValue(V&: A))))) {
1696	Def->replaceAllUsesWith(New: A);
1697	return;
1698	}
1699
1700	if (match(R: Def, P: m_ExtractLastLane(Op0: m_VPValue(V&: A))) &&
1701	((isa<VPInstruction>(Val: A) && vputils::isSingleScalar(VPV: A)) \|\|
1702	(isa<VPReplicateRecipe>(Val: A) &&
1703	cast<VPReplicateRecipe>(Val: A)->isSingleScalar())) &&
1704	all_of(Range: A->users(),
1705	P: [Def, A](VPUser U) { return* U->usesScalars(Op: A) \|\| Def == U; })) {
1706	return Def->replaceAllUsesWith(New: A);
1707	}
1708
1709	if (Plan->getConcreteUF() == `1` && match(R: Def, P: m_ExtractLastPart(Op0: m_VPValue(V&: A))))
1710	return Def->replaceAllUsesWith(New: A);
1711	}
1712
1713	void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1714	ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1715	Plan.getEntry());
1716	VPTypeAnalysis TypeInfo(Plan);
1717	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
1718	for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB))
1719	if (auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R))
1720	simplifyRecipe(Def, TypeInfo);
1721	}
1722	}
1723
1724	/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1725	/// header mask to be simplified further when tail folding, e.g. in
1726	/// optimizeEVLMasks.
1727	static void reassociateHeaderMask(VPlan &Plan) {
1728	VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1729	if (!HeaderMask)
1730	return;
1731
1732	SmallVector<VPUser *> Worklist;
1733	for (VPUser *U : HeaderMask->users())
1734	if (match(U, P: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue())))
1735	append_range(C&: Worklist, R: cast<VPSingleDefRecipe>(Val: U)->users());
1736
1737	while (!Worklist.empty()) {
1738	auto *R = dyn_cast<VPSingleDefRecipe>(Val: Worklist.pop_back_val());
1739	VPValue X, Y;
1740	if (!R \|\| !match(R, P: m_LogicalAnd(
1741	Op0: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: X)),
1742	Op1: m_VPValue(V&: Y))))
1743	continue;
1744	append_range(C&: Worklist, R: R->users());
1745	VPBuilder Builder(R);
1746	R->replaceAllUsesWith(
1747	New: Builder.createLogicalAnd(LHS: HeaderMask, RHS: Builder.createLogicalAnd(LHS: X, RHS: Y)));
1748	}
1749	}
1750
1751	static void narrowToSingleScalarRecipes(VPlan &Plan) {
1752	if (Plan.hasScalarVFOnly())
1753	return;
1754
1755	// Try to narrow wide and replicating recipes to single scalar recipes,
1756	// based on VPlan analysis. Only process blocks in the loop region for now,
1757	// without traversing into nested regions, as recipes in replicate regions
1758	// cannot be converted yet.
1759	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1760	Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
1761	for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
1762	if (!isa<VPWidenRecipe, VPWidenGEPRecipe, VPReplicateRecipe,
1763	VPWidenStoreRecipe>(Val: &R))
1764	continue;
1765	auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
1766	if (RepR && (RepR->isSingleScalar() \|\| RepR->isPredicated()))
1767	continue;
1768
1769	// Convert an unmasked scatter with an uniform address into
1770	// extract-last-lane + scalar store.
1771	// TODO: Add a profitability check comparing the cost of a scatter vs.
1772	// extract + scalar store.
1773	auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(Val: &R);
1774	if (WidenStoreR && vputils::isSingleScalar(VPV: WidenStoreR->getAddr()) &&
1775	!WidenStoreR->isConsecutive()) {
1776	assert(!WidenStoreR->isReverse() &&
1777	"Not consecutive memory recipes shouldn't be reversed");
1778	VPValue *Mask = WidenStoreR->getMask();
1779
1780	// Only convert the scatter to a scalar store if it is unmasked.
1781	// TODO: Support converting scatter masked by the header mask to scalar
1782	// store.
1783	if (Mask)
1784	continue;
1785
1786	auto Extract = new* VPInstruction (VPInstruction::ExtractLastLane,
1787	{WidenStoreR->getOperand(N: `1`)});
1788	Extract->insertBefore(InsertPos: WidenStoreR);
1789
1790	// TODO: Sink the scalar store recipe to middle block if possible.
1791	auto ScalarStore = new* VPReplicateRecipe (
1792	&WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1793	true /IsSingleScalar/, nullptr /Mask/, {},
1794	WidenStoreR /Metadata/*);
1795	ScalarStore->insertBefore(InsertPos: WidenStoreR);
1796	WidenStoreR->eraseFromParent();
1797	continue;
1798	}
1799
1800	auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(Val: &R);
1801	if (RepR && isa<StoreInst>(Val: RepR->getUnderlyingInstr()) &&
1802	vputils::isSingleScalar(VPV: RepR->getOperand(N: `1`))) {
1803	auto Clone = new* VPReplicateRecipe (
1804	RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1805	true /IsSingleScalar/, nullptr /Mask/, RepR /Flags/*,
1806	RepR /Metadata/*, RepR->getDebugLoc());
1807	Clone->insertBefore(InsertPos: RepOrWidenR);
1808	VPBuilder Builder(Clone);
1809	VPValue *ExtractOp = Clone->getOperand(N: `0`);
1810	if (vputils::isUniformAcrossVFsAndUFs(V: RepR->getOperand(N: `1`)))
1811	ExtractOp =
1812	Builder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: ExtractOp);
1813	ExtractOp =
1814	Builder.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: ExtractOp);
1815	Clone->setOperand(I: `0`, New: ExtractOp);
1816	RepR->eraseFromParent();
1817	continue;
1818	}
1819
1820	// Skip recipes that aren't single scalars.
1821	if (!RepOrWidenR \|\| !vputils::isSingleScalar(VPV: RepOrWidenR))
1822	continue;
1823
1824	// Predicate to check if a user of Op introduces extra broadcasts.
1825	auto IntroducesBCastOf = [](const VPValue *Op) {
1826	return [Op](const VPUser *U) {
1827	if (auto *VPI = dyn_cast<VPInstruction>(Val: U)) {
1828	if (is_contained(Set: {VPInstruction::ExtractLastLane,
1829	VPInstruction::ExtractLastPart,
1830	VPInstruction::ExtractPenultimateElement},
1831	Element: VPI->getOpcode()))
1832	return false;
1833	}
1834	return !U->usesScalars(Op);
1835	};
1836	};
1837
1838	if (any_of(Range: RepOrWidenR->users(), P: IntroducesBCastOf (RepOrWidenR)) &&
1839	none_of(Range: RepOrWidenR->operands(), P: [&](VPValue *Op) {
1840	if (any_of(
1841	Range: make_filter_range(Range: Op->users(), Pred: not_equal_to(Arg&: RepOrWidenR)),
1842	P: IntroducesBCastOf (Op)))
1843	return false;
1844	// Non-constant live-ins require broadcasts, while constants do not
1845	// need explicit broadcasts.
1846	auto *IRV = dyn_cast<VPIRValue>(Val: Op);
1847	bool LiveInNeedsBroadcast = IRV && !isa<Constant>(Val: IRV->getValue());
1848	auto *OpR = dyn_cast<VPReplicateRecipe>(Val: Op);
1849	return LiveInNeedsBroadcast \|\| (OpR && OpR->isSingleScalar());
1850	}))
1851	continue;
1852
1853	auto Clone = new* VPReplicateRecipe (
1854	RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1855	true /IsSingleScalar/, nullptr, *RepOrWidenR);
1856	Clone->insertBefore(InsertPos: RepOrWidenR);
1857	RepOrWidenR->replaceAllUsesWith(New: Clone);
1858	if (isDeadRecipe(R&: *RepOrWidenR))
1859	RepOrWidenR->eraseFromParent();
1860	}
1861	}
1862	}
1863
1864	/// Try to see if all of \p Blend's masks share a common value logically and'ed
1865	/// and remove it from the masks.
1866	static void removeCommonBlendMask(VPBlendRecipe *Blend) {
1867	if (Blend->isNormalized())
1868	return;
1869	VPValue *CommonEdgeMask;
1870	if (!match(V: Blend->getMask(Idx: `0`),
1871	P: m_LogicalAnd(Op0: m_VPValue(V&: CommonEdgeMask), Op1: m_VPValue())))
1872	return;
1873	for (unsigned I = `0`; I < Blend->getNumIncomingValues(); I++)
1874	if (!match(V: Blend->getMask(Idx: I),
1875	P: m_LogicalAnd(Op0: m_Specific(VPV: CommonEdgeMask), Op1: m_VPValue())))
1876	return;
1877	for (unsigned I = `0`; I < Blend->getNumIncomingValues(); I++)
1878	Blend->setMask(Idx: I, V: Blend->getMask(Idx: I)->getDefiningRecipe()->getOperand(N: `1`));
1879	}
1880
1881	/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1882	/// to make sure the masks are simplified.
1883	static void simplifyBlends(VPlan &Plan) {
1884	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1885	Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
1886	for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
1887	auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R);
1888	if (!Blend)
1889	continue;
1890
1891	removeCommonBlendMask(Blend);
1892
1893	// Try to remove redundant blend recipes.
1894	SmallPtrSet<VPValue *, `4`> UniqueValues;
1895	if (Blend->isNormalized() \|\| !match(V: Blend->getMask(Idx: `0`), P: m_False()))
1896	UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: `0`));
1897	for (unsigned I = `1`; I != Blend->getNumIncomingValues(); ++I)
1898	if (!match(V: Blend->getMask(Idx: I), P: m_False()))
1899	UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: I));
1900
1901	if (UniqueValues.size() == `1`) {
1902	Blend->replaceAllUsesWith(New: *UniqueValues.begin());
1903	Blend->eraseFromParent();
1904	continue;
1905	}
1906
1907	if (Blend->isNormalized())
1908	continue;
1909
1910	// Normalize the blend so its first incoming value is used as the initial
1911	// value with the others blended into it.
1912
1913	unsigned StartIndex = `0`;
1914	for (unsigned I = `0`; I != Blend->getNumIncomingValues(); ++I) {
1915	// If a value's mask is used only by the blend then is can be deadcoded.
1916	// TODO: Find the most expensive mask that can be deadcoded, or a mask
1917	// that's used by multiple blends where it can be removed from them all.
1918	VPValue *Mask = Blend->getMask(Idx: I);
1919	if (Mask->getNumUsers() == `1` && !match(V: Mask, P: m_False())) {
1920	StartIndex = I;
1921	break;
1922	}
1923	}
1924
1925	SmallVector<VPValue *, `4`> OperandsWithMask;
1926	OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: StartIndex));
1927
1928	for (unsigned I = `0`; I != Blend->getNumIncomingValues(); ++I) {
1929	if (I == StartIndex)
1930	continue;
1931	OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: I));
1932	OperandsWithMask.push_back(Elt: Blend->getMask(Idx: I));
1933	}
1934
1935	auto *NewBlend =
1936	new VPBlendRecipe (cast_or_null<PHINode>(Val: Blend->getUnderlyingValue()),
1937	OperandsWithMask, *Blend, Blend->getDebugLoc());
1938	NewBlend->insertBefore(InsertPos: &R);
1939
1940	VPValue *DeadMask = Blend->getMask(Idx: StartIndex);
1941	Blend->replaceAllUsesWith(New: NewBlend);
1942	Blend->eraseFromParent();
1943	recursivelyDeleteDeadRecipes(V: DeadMask);
1944
1945	/// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1946	VPValue *NewMask;
1947	if (NewBlend->getNumOperands() == `3` &&
1948	match(V: NewBlend->getMask(Idx: `1`), P: m_Not(Op0: m_VPValue(V&: NewMask)))) {
1949	VPValue *Inc0 = NewBlend->getOperand(N: `0`);
1950	VPValue *Inc1 = NewBlend->getOperand(N: `1`);
1951	VPValue *OldMask = NewBlend->getOperand(N: `2`);
1952	NewBlend->setOperand(I: `0`, New: Inc1);
1953	NewBlend->setOperand(I: `1`, New: Inc0);
1954	NewBlend->setOperand(I: `2`, New: NewMask);
1955	if (OldMask->getNumUsers() == `0`)
1956	cast<VPInstruction>(Val: OldMask)->eraseFromParent();
1957	}
1958	}
1959	}
1960	}
1961
1962	/// Optimize the width of vector induction variables in \p Plan based on a known
1963	/// constant Trip Count, \p BestVF and \p BestUF.
1964	static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1965	ElementCount BestVF,
1966	unsigned BestUF) {
1967	// Only proceed if we have not completely removed the vector region.
1968	if (!Plan.getVectorLoopRegion())
1969	return false;
1970
1971	const APInt *TC;
1972	if (!BestVF.isFixed() \|\| !match(V: Plan.getTripCount(), P: m_APInt(C&: TC)))
1973	return false;
1974
1975	// Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1976	// and UF. Returns at least 8.
1977	auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1978	APInt AlignedTC =
1979	Align * APIntOps::RoundingUDiv(A: TC, B: APInt (TC.getBitWidth(), Align),
1980	RM: APInt::Rounding::UP);
1981	APInt MaxVal = AlignedTC - `1`;
1982	return std::max<unsigned>(a: PowerOf2Ceil(A: MaxVal.getActiveBits()), b: `8`);
1983	};
1984	unsigned NewBitWidth =
1985	ComputeBitWidth (TC, BestVF.getKnownMinValue() BestUF);
1986
1987	LLVMContext &Ctx = Plan.getContext();
1988	auto *NewIVTy = IntegerType::get(C&: Ctx, NumBits: NewBitWidth);
1989
1990	bool MadeChange = false;
1991
1992	VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1993	for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1994	auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
1995
1996	// Currently only handle canonical IVs as it is trivial to replace the start
1997	// and stop values, and we currently only perform the optimization when the
1998	// IV has a single use.
1999	if (!WideIV \|\| !WideIV->isCanonical() \|\|
2000	WideIV->hasMoreThanOneUniqueUser() \|\|
2001	NewIVTy == WideIV->getScalarType())
2002	continue;
2003
2004	// Currently only handle cases where the single user is a header-mask
2005	// comparison with the backedge-taken-count.
2006	VPUser *SingleUser = WideIV->getSingleUser();
2007	if (!SingleUser \|\|
2008	!match(U: SingleUser, P: m_ICmp(Op0: m_Specific(VPV: WideIV),
2009	Op1: m_Broadcast(Op0: m_Specific(
2010	VPV: Plan.getOrCreateBackedgeTakenCount())))))
2011	continue;
2012
2013	// Update IV operands and comparison bound to use new narrower type.
2014	auto *NewStart = Plan.getZero(Ty: NewIVTy);
2015	WideIV->setStartValue(NewStart);
2016	auto *NewStep = Plan.getConstantInt(Ty: NewIVTy, Val: `1`);
2017	WideIV->setStepValue(NewStep);
2018
2019	auto NewBTC = new* VPWidenCastRecipe (
2020	Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2021	nullptr, VPIRFlags::getDefaultFlags(Opcode: Instruction::Trunc));
2022	Plan.getVectorPreheader()->appendRecipe(Recipe: NewBTC);
2023	auto *Cmp = cast<VPInstruction>(Val: WideIV->getSingleUser());
2024	Cmp->setOperand(I: `1`, New: NewBTC);
2025
2026	MadeChange = true;
2027	}
2028
2029	return MadeChange;
2030	}
2031
2032	/// Return true if \p Cond is known to be true for given \p BestVF and \p
2033	/// BestUF.
2034	static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
2035	ElementCount BestVF, unsigned BestUF,
2036	PredicatedScalarEvolution &PSE) {
2037	if (match(V: Cond, P: m_BinaryOr(Op0: m_VPValue(), Op1: m_VPValue())))
2038	return any_of(Range: Cond->getDefiningRecipe()->operands(), P: [&Plan, BestVF, BestUF,
2039	&PSE](VPValue *C) {
2040	return isConditionTrueViaVFAndUF(Cond: C, Plan, BestVF, BestUF, PSE);
2041	});
2042
2043	auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2044	if (!match(V: Cond, P: m_SpecificICmp(MatchPred: CmpInst::ICMP_EQ,
2045	Op0: m_Specific(VPV: CanIV->getBackedgeValue()),
2046	Op1: m_Specific(VPV: &Plan.getVectorTripCount()))))
2047	return false;
2048
2049	// The compare checks CanIV + VFxUF == vector trip count. The vector trip
2050	// count is not conveniently available as SCEV so far, so we compare directly
2051	// against the original trip count. This is stricter than necessary, as we
2052	// will only return true if the trip count == vector trip count.
2053	const SCEV *VectorTripCount =
2054	vputils::getSCEVExprForVPValue(V: &Plan.getVectorTripCount(), PSE);
2055	if (isa<SCEVCouldNotCompute>(Val: VectorTripCount))
2056	VectorTripCount = vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), PSE);
2057	assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2058	"Trip count SCEV must be computable");
2059	ScalarEvolution &SE = *PSE.getSE();
2060	ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
2061	const SCEV *C = SE.getElementCount(Ty: VectorTripCount->getType(), EC: NumElements);
2062	return SE.isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: VectorTripCount, RHS: C);
2063	}
2064
2065	/// Try to replace multiple active lane masks used for control flow with
2066	/// a single, wide active lane mask instruction followed by multiple
2067	/// extract subvector intrinsics. This applies to the active lane mask
2068	/// instructions both in the loop and in the preheader.
2069	/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2070	/// new extracts from the first active lane mask, which has it's last
2071	/// operand (multiplier) set to UF.
2072	static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
2073	unsigned UF) {
2074	if (!EnableWideActiveLaneMask \|\| !VF.isVector() \|\| UF == `1`)
2075	return false;
2076
2077	VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2078	VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2079	auto *Term = &ExitingVPBB->back();
2080
2081	using namespace llvm::VPlanPatternMatch;
2082	if (!match(V: Term, P: m_BranchOnCond(Op0: m_Not(Op0: m_ActiveLaneMask(
2083	Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())))))
2084	return false;
2085
2086	auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry());
2087	LLVMContext &Ctx = Plan.getContext();
2088
2089	auto ExtractFromALM = [&](VPInstruction *ALM,
2090	SmallVectorImpl<VPValue *> &Extracts) {
2091	DebugLoc DL = ALM->getDebugLoc();
2092	for (unsigned Part = `0`; Part < UF; ++Part) {
2093	SmallVector<VPValue *> Ops;
2094	Ops.append(IL: {ALM, Plan.getConstantInt(BitWidth: `64`, Val: VF.getKnownMinValue() * Part)});
2095	auto *Ext =
2096	new VPWidenIntrinsicRecipe (Intrinsic::vector_extract, Ops,
2097	IntegerType::getInt1Ty(C&: Ctx), {}, {}, DL);
2098	Extracts [Part] = Ext;
2099	Ext->insertAfter(InsertPos: ALM);
2100	}
2101	};
2102
2103	// Create a list of each active lane mask phi, ordered by unroll part.
2104	SmallVector<VPActiveLaneMaskPHIRecipe > Phis(UF, nullptr*);
2105	for (VPRecipeBase &R : Header->phis()) {
2106	auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(Val: &R);
2107	if (!Phi)
2108	continue;
2109	VPValue Index = nullptr*;
2110	match(V: Phi->getBackedgeValue(),
2111	P: m_ActiveLaneMask(Op0: m_VPValue(V&: Index), Op1: m_VPValue(), Op2: m_VPValue()));
2112	assert(Index && "Expected index from ActiveLaneMask instruction");
2113
2114	uint64_t Part;
2115	if (match(V: Index,
2116	P: m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
2117	Ops: m_VPValue(), Ops: m_Mul(Op0: m_VPValue(), Op1: m_ConstantInt(C&: Part)))))
2118	Phis [Part] = Phi;
2119	else {
2120	// Anything other than a CanonicalIVIncrementForPart is part 0
2121	assert(!match(
2122	Index,
2123	m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()));
2124	Phis [`0`] = Phi;
2125	}
2126	}
2127
2128	assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe Phi) { return* Phi; }) &&
2129	"Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2130
2131	auto *EntryALM = cast<VPInstruction>(Val: Phis [`0`]->getStartValue());
2132	auto *LoopALM = cast<VPInstruction>(Val: Phis [`0`]->getBackedgeValue());
2133
2134	assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2135	LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2136	"Expected incoming values of Phi to be ActiveLaneMasks");
2137
2138	// When using wide lane masks, the return type of the get.active.lane.mask
2139	// intrinsic is VF x UF (last operand).
2140	VPValue *ALMMultiplier = Plan.getConstantInt(BitWidth: `64`, Val: UF);
2141	EntryALM->setOperand(I: `2`, New: ALMMultiplier);
2142	LoopALM->setOperand(I: `2`, New: ALMMultiplier);
2143
2144	// Create UF x extract vectors and insert into preheader.
2145	SmallVector<VPValue *> EntryExtracts(UF);
2146	ExtractFromALM (EntryALM, EntryExtracts);
2147
2148	// Create UF x extract vectors and insert before the loop compare & branch,
2149	// updating the compare to use the first extract.
2150	SmallVector<VPValue *> LoopExtracts(UF);
2151	ExtractFromALM (LoopALM, LoopExtracts);
2152	VPInstruction *Not = cast<VPInstruction>(Val: Term->getOperand(N: `0`));
2153	Not->setOperand(I: `0`, New: LoopExtracts [`0`]);
2154
2155	// Update the incoming values of active lane mask phis.
2156	for (unsigned Part = `0`; Part < UF; ++Part) {
2157	Phis [Part]->setStartValue(EntryExtracts [Part]);
2158	Phis [Part]->setBackedgeValue(LoopExtracts [Part]);
2159	}
2160
2161	return true;
2162	}
2163
2164	/// Try to simplify the branch condition of \p Plan. This may restrict the
2165	/// resulting plan to \p BestVF and \p BestUF.
2166	static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
2167	unsigned BestUF,
2168	PredicatedScalarEvolution &PSE) {
2169	VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2170	VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2171	auto *Term = &ExitingVPBB->back();
2172	VPValue *Cond;
2173	if (match(V: Term,
2174	P: m_BranchOnCount(Op0: m_Add(Op0: m_VPValue(), Op1: m_Specific(VPV: &Plan.getVFxUF())),
2175	Op1: m_VPValue())) \|\|
2176	match(V: Term, P: m_BranchOnCond(Op0: m_Not(Op0: m_ActiveLaneMask(
2177	Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue()))))) {
2178	// Try to simplify the branch condition if VectorTC <= VF UF when the*
2179	// latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2180	const SCEV *VectorTripCount =
2181	vputils::getSCEVExprForVPValue(V: &Plan.getVectorTripCount(), PSE);
2182	if (isa<SCEVCouldNotCompute>(Val: VectorTripCount))
2183	VectorTripCount =
2184	vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), PSE);
2185	assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2186	"Trip count SCEV must be computable");
2187	ScalarEvolution &SE = *PSE.getSE();
2188	ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
2189	const SCEV *C = SE.getElementCount(Ty: VectorTripCount->getType(), EC: NumElements);
2190	if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: VectorTripCount, RHS: C))
2191	return false;
2192	} else if (match(V: Term, P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))) \|\|
2193	match(V: Term, P: m_BranchOnTwoConds(Op0: m_VPValue(), Op1: m_VPValue(V&: Cond)))) {
2194	// For BranchOnCond, check if we can prove the condition to be true using VF
2195	// and UF.
2196	if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2197	return false;
2198	} else {
2199	return false;
2200	}
2201
2202	// The vector loop region only executes once. If possible, completely remove
2203	// the region, otherwise replace the terminator controlling the latch with
2204	// (BranchOnCond true).
2205	// TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2206	// support for other non-canonical widen induction recipes (e.g.,
2207	// VPWidenPointerInductionRecipe).
2208	// TODO: fold branch-on-constant after dissolving region.
2209	auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry());
2210	if (all_of(Range: Header->phis(), P: [](VPRecipeBase &Phi) {
2211	if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi))
2212	return R->isCanonical();
2213	return isa<VPCanonicalIVPHIRecipe, VPCurrentIterationPHIRecipe,
2214	VPFirstOrderRecurrencePHIRecipe, VPPhi>(Val: &Phi);
2215	})) {
2216	for (VPRecipeBase &HeaderR : make_early_inc_range(Range: Header->phis())) {
2217	if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &HeaderR)) {
2218	VPBuilder Builder(Plan.getVectorPreheader());
2219	VPValue *StepV = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {},
2220	ResultTy: R->getScalarType());
2221	HeaderR.getVPSingleValue()->replaceAllUsesWith(New: StepV);
2222	HeaderR.eraseFromParent();
2223	continue;
2224	}
2225	auto *Phi = cast<VPPhiAccessors>(Val: &HeaderR);
2226	HeaderR.getVPSingleValue()->replaceAllUsesWith(New: Phi->getIncomingValue(Idx: `0`));
2227	HeaderR.eraseFromParent();
2228	}
2229
2230	VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2231	SmallVector<VPBlockBase *> Exits = to_vector(Range&: VectorRegion->getSuccessors());
2232	VPBlockUtils::disconnectBlocks(From: Preheader, To: VectorRegion);
2233	for (VPBlockBase *Exit : Exits)
2234	VPBlockUtils::disconnectBlocks(From: VectorRegion, To: Exit);
2235
2236	for (VPBlockBase *B : vp_depth_first_shallow(G: VectorRegion->getEntry()))
2237	B->setParent(nullptr);
2238
2239	VPBlockUtils::connectBlocks(From: Preheader, To: Header);
2240
2241	for (VPBlockBase *Exit : Exits)
2242	VPBlockUtils::connectBlocks(From: ExitingVPBB, To: Exit);
2243
2244	// Replace terminating branch-on-two-conds with branch-on-cond to early
2245	// exit.
2246	if (Exits.size() != `1`) {
2247	assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == `2` &&
2248	"BranchOnTwoConds needs 2 remaining exits");
2249	VPBuilder (Term).createNaryOp(Opcode: VPInstruction::BranchOnCond,
2250	Operands: Term->getOperand(N: `0`));
2251	}
2252	VPlanTransforms::simplifyRecipes(Plan);
2253	} else {
2254	// The vector region contains header phis for which we cannot remove the
2255	// loop region yet.
2256
2257	// For BranchOnTwoConds, set the latch exit condition to true directly.
2258	if (match(V: Term, P: m_BranchOnTwoConds())) {
2259	Term->setOperand(I: `1`, New: Plan.getTrue());
2260	return true;
2261	}
2262
2263	auto BOC = new* VPInstruction (VPInstruction::BranchOnCond, {Plan.getTrue()},
2264	{}, {}, Term->getDebugLoc());
2265	ExitingVPBB->appendRecipe(Recipe: BOC);
2266	}
2267
2268	Term->eraseFromParent();
2269
2270	return true;
2271	}
2272
2273	/// From the definition of llvm.experimental.get.vector.length,
2274	/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
2275	static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF,
2276	PredicatedScalarEvolution &PSE) {
2277	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2278	Range: vp_depth_first_deep(G: Plan.getEntry()))) {
2279	for (VPRecipeBase &R : *VPBB) {
2280	VPValue *AVL;
2281	if (!match(V: &R, P: m_EVL(Op0: m_VPValue(V&: AVL))))
2282	continue;
2283
2284	const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(V: AVL, PSE);
2285	if (isa<SCEVCouldNotCompute>(Val: AVLSCEV))
2286	continue;
2287	ScalarEvolution &SE = *PSE.getSE();
2288	const SCEV *VFSCEV = SE.getElementCount(Ty: AVLSCEV->getType(), EC: VF);
2289	if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: AVLSCEV, RHS: VFSCEV))
2290	continue;
2291
2292	VPValue *Trunc = VPBuilder (&R).createScalarZExtOrTrunc(
2293	Op: AVL, ResultTy: Type::getInt32Ty(C&: Plan.getContext()), SrcTy: AVLSCEV->getType(),
2294	DL: R.getDebugLoc());
2295	if (Trunc != AVL) {
2296	auto *TruncR = cast<VPSingleDefRecipe>(Val: Trunc);
2297	const DataLayout &DL = Plan.getDataLayout();
2298	VPTypeAnalysis TypeInfo(Plan);
2299	if (VPValue *Folded =
2300	tryToFoldLiveIns(R&: *TruncR, Operands: TruncR->operands(), DL, TypeInfo))
2301	Trunc = Folded;
2302	}
2303	R.getVPSingleValue()->replaceAllUsesWith(New: Trunc);
2304	return true;
2305	}
2306	}
2307	return false;
2308	}
2309
2310	void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
2311	unsigned BestUF,
2312	PredicatedScalarEvolution &PSE) {
2313	assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2314	assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2315
2316	bool MadeChange = tryToReplaceALMWithWideALM(Plan, VF: BestVF, UF: BestUF);
2317	MadeChange \|= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2318	MadeChange \|= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2319	MadeChange \|= simplifyKnownEVL(Plan, VF: BestVF, PSE);
2320
2321	if (MadeChange) {
2322	Plan.setVF(BestVF);
2323	assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2324	}
2325	}
2326
2327	/// Sink users of \p FOR after the recipe defining the previous value \p
2328	/// Previous of the recurrence. \returns true if all users of \p FOR could be
2329	/// re-arranged as needed or false if it is not possible.
2330	static bool
2331	sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
2332	VPRecipeBase *Previous,
2333	VPDominatorTree &VPDT) {
2334	// If Previous is a live-in (no defining recipe), it naturally dominates all
2335	// recipes in the loop, so no sinking is needed.
2336	if (!Previous)
2337	return true;
2338
2339	// Collect recipes that need sinking.
2340	SmallVector<VPRecipeBase *> WorkList;
2341	SmallPtrSet<VPRecipeBase *, `8`> Seen;
2342	Seen.insert(Ptr: Previous);
2343	auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2344	// The previous value must not depend on the users of the recurrence phi. In
2345	// that case, FOR is not a fixed order recurrence.
2346	if (SinkCandidate == Previous)
2347	return false;
2348
2349	if (isa<VPHeaderPHIRecipe>(Val: SinkCandidate) \|\|
2350	!Seen.insert(Ptr: SinkCandidate).second \|\|
2351	VPDT.properlyDominates(A: Previous, B: SinkCandidate))
2352	return true;
2353
2354	if (cannotHoistOrSinkRecipe(R: *SinkCandidate))
2355	return false;
2356
2357	WorkList.push_back(Elt: SinkCandidate);
2358	return true;
2359	};
2360
2361	// Recursively sink users of FOR after Previous.
2362	WorkList.push_back(Elt: FOR);
2363	for (unsigned I = `0`; I != WorkList.size(); ++I) {
2364	VPRecipeBase *Current = WorkList [I];
2365	assert(Current->getNumDefinedValues() == `1` &&
2366	"only recipes with a single defined value expected");
2367
2368	for (VPUser *User : Current->getVPSingleValue()->users()) {
2369	if (!TryToPushSinkCandidate (cast<VPRecipeBase>(Val: User)))
2370	return false;
2371	}
2372	}
2373
2374	// Keep recipes to sink ordered by dominance so earlier instructions are
2375	// processed first.
2376	sort(C&: WorkList, Comp: [&VPDT](const VPRecipeBase A, const* VPRecipeBase *B) {
2377	return VPDT.properlyDominates(A, B);
2378	});
2379
2380	for (VPRecipeBase *SinkCandidate : WorkList) {
2381	if (SinkCandidate == FOR)
2382	continue;
2383
2384	SinkCandidate->moveAfter(MovePos: Previous);
2385	Previous = SinkCandidate;
2386	}
2387	return true;
2388	}
2389
2390	/// Try to hoist \p Previous and its operands before all users of \p FOR.
2391	static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
2392	VPRecipeBase *Previous,
2393	VPDominatorTree &VPDT) {
2394	if (cannotHoistOrSinkRecipe(R: *Previous))
2395	return false;
2396
2397	// Collect recipes that need hoisting.
2398	SmallVector<VPRecipeBase *> HoistCandidates;
2399	SmallPtrSet<VPRecipeBase *, `8`> Visited;
2400	VPRecipeBase HoistPoint = nullptr*;
2401	// Find the closest hoist point by looking at all users of FOR and selecting
2402	// the recipe dominating all other users.
2403	for (VPUser *U : FOR->users()) {
2404	auto *R = cast<VPRecipeBase>(Val: U);
2405	if (!HoistPoint \|\| VPDT.properlyDominates(A: R, B: HoistPoint))
2406	HoistPoint = R;
2407	}
2408	assert(all_of(FOR->users(),
2409	[&VPDT, HoistPoint](VPUser *U) {
2410	auto *R = cast<VPRecipeBase>(U);
2411	return HoistPoint == R \|\|
2412	VPDT.properlyDominates(HoistPoint, R);
2413	}) &&
2414	"HoistPoint must dominate all users of FOR");
2415
2416	auto NeedsHoisting = [HoistPoint, &VPDT,
2417	&Visited](VPValue HoistCandidateV) -> VPRecipeBase {
2418	VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2419	if (!HoistCandidate)
2420	return nullptr;
2421	VPRegionBlock *EnclosingLoopRegion =
2422	HoistCandidate->getParent()->getEnclosingLoopRegion();
2423	assert((!HoistCandidate->getRegion() \|\|
2424	HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2425	"CFG in VPlan should still be flat, without replicate regions");
2426	// Hoist candidate was already visited, no need to hoist.
2427	if (!Visited.insert(Ptr: HoistCandidate).second)
2428	return nullptr;
2429
2430	// Candidate is outside loop region or a header phi, dominates FOR users w/o
2431	// hoisting.
2432	if (!EnclosingLoopRegion \|\| isa<VPHeaderPHIRecipe>(Val: HoistCandidate))
2433	return nullptr;
2434
2435	// If we reached a recipe that dominates HoistPoint, we don't need to
2436	// hoist the recipe.
2437	if (VPDT.properlyDominates(A: HoistCandidate, B: HoistPoint))
2438	return nullptr;
2439	return HoistCandidate;
2440	};
2441
2442	if (!NeedsHoisting (Previous->getVPSingleValue()))
2443	return true;
2444
2445	// Recursively try to hoist Previous and its operands before all users of FOR.
2446	HoistCandidates.push_back(Elt: Previous);
2447
2448	for (unsigned I = `0`; I != HoistCandidates.size(); ++I) {
2449	VPRecipeBase *Current = HoistCandidates [I];
2450	assert(Current->getNumDefinedValues() == `1` &&
2451	"only recipes with a single defined value expected");
2452	if (cannotHoistOrSinkRecipe(R: *Current))
2453	return false;
2454
2455	for (VPValue *Op : Current->operands()) {
2456	// If we reach FOR, it means the original Previous depends on some other
2457	// recurrence that in turn depends on FOR. If that is the case, we would
2458	// also need to hoist recipes involving the other FOR, which may break
2459	// dependencies.
2460	if (Op == FOR)
2461	return false;
2462
2463	if (auto *R = NeedsHoisting (Op)) {
2464	// Bail out if the recipe defines multiple values.
2465	// TODO: Hoisting such recipes requires additional handling.
2466	if (R->getNumDefinedValues() != `1`)
2467	return false;
2468	HoistCandidates.push_back(Elt: R);
2469	}
2470	}
2471	}
2472
2473	// Order recipes to hoist by dominance so earlier instructions are processed
2474	// first.
2475	sort(C&: HoistCandidates, Comp: [&VPDT](const VPRecipeBase A, const* VPRecipeBase *B) {
2476	return VPDT.properlyDominates(A, B);
2477	});
2478
2479	for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2480	HoistCandidate->moveBefore(BB&: *HoistPoint->getParent(),
2481	I: HoistPoint->getIterator());
2482	}
2483
2484	return true;
2485	}
2486
2487	bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
2488	VPBuilder &LoopBuilder) {
2489	VPDominatorTree VPDT(Plan);
2490
2491	SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
2492	for (VPRecipeBase &R :
2493	Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
2494	if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R))
2495	RecurrencePhis.push_back(Elt: FOR);
2496
2497	for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2498	SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, `4`> SeenPhis;
2499	VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2500	// Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2501	// to terminate.
2502	while (auto *PrevPhi =
2503	dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Val: Previous)) {
2504	assert(PrevPhi->getParent() == FOR->getParent());
2505	assert(SeenPhis.insert(PrevPhi).second);
2506	Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2507	}
2508
2509	if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2510	!hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2511	return false;
2512
2513	// Introduce a recipe to combine the incoming and previous values of a
2514	// fixed-order recurrence.
2515	VPBasicBlock *InsertBlock =
2516	Previous ? Previous->getParent() : FOR->getParent();
2517	if (!Previous \|\| isa<VPHeaderPHIRecipe>(Val: Previous))
2518	LoopBuilder.setInsertPoint(TheBB: InsertBlock, IP: InsertBlock->getFirstNonPhi());
2519	else
2520	LoopBuilder.setInsertPoint(TheBB: InsertBlock,
2521	IP: std::next(x: Previous->getIterator()));
2522
2523	auto *RecurSplice =
2524	LoopBuilder.createNaryOp(Opcode: VPInstruction::FirstOrderRecurrenceSplice,
2525	Operands: {FOR, FOR->getBackedgeValue()});
2526
2527	FOR->replaceAllUsesWith(New: RecurSplice);
2528	// Set the first operand of RecurSplice to FOR again, after replacing
2529	// all users.
2530	RecurSplice->setOperand(I: `0`, New: FOR);
2531
2532	// Check for users extracting at the penultimate active lane of the FOR.
2533	// If only a single lane is active in the current iteration, we need to
2534	// select the last element from the previous iteration (from the FOR phi
2535	// directly).
2536	for (VPUser *U : RecurSplice->users()) {
2537	if (!match(U, P: m_ExtractLane(Op0: m_LastActiveLane(Op0: m_VPValue()),
2538	Op1: m_Specific(VPV: RecurSplice))))
2539	continue;
2540
2541	VPBuilder B(cast<VPInstruction>(Val: U));
2542	VPValue *LastActiveLane = cast<VPInstruction>(Val: U)->getOperand(N: `0`);
2543	VPValue *Zero = Plan.getConstantInt(BitWidth: `64`, Val: `0`);
2544	VPValue *One = Plan.getConstantInt(BitWidth: `64`, Val: `1`);
2545	VPValue *PenultimateIndex = B.createSub(LHS: LastActiveLane, RHS: One);
2546	VPValue *PenultimateLastIter =
2547	B.createNaryOp(Opcode: VPInstruction::ExtractLane,
2548	Operands: {PenultimateIndex, FOR->getBackedgeValue()});
2549	VPValue *LastPrevIter =
2550	B.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: FOR);
2551
2552	VPValue *Cmp = B.createICmp(Pred: CmpInst::ICMP_EQ, A: LastActiveLane, B: Zero);
2553	VPValue *Sel = B.createSelect(Cond: Cmp, TrueVal: LastPrevIter, FalseVal: PenultimateLastIter);
2554	cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: Sel);
2555	}
2556	}
2557	return true;
2558	}
2559
2560	void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
2561	for (VPRecipeBase &R :
2562	Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
2563	auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
2564	if (!PhiR)
2565	continue;
2566	RecurKind RK = PhiR->getRecurrenceKind();
2567	if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2568	RK != RecurKind::AddChainWithSubs)
2569	continue;
2570
2571	for (VPUser *U : collectUsersRecursively(V: PhiR))
2572	if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: U)) {
2573	RecWithFlags->dropPoisonGeneratingFlags();
2574	}
2575	}
2576	}
2577
2578	namespace {
2579	struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2580	static bool isSentinel(const VPSingleDefRecipe *Def) {
2581	return Def == getEmptyKey() \|\| Def == getTombstoneKey();
2582	}
2583
2584	/// If recipe \p R will lower to a GEP with a non-i8 source element type,
2585	/// return that source element type.
2586	static Type getGEPSourceElementType(const* VPSingleDefRecipe *R) {
2587	// All VPInstructions that lower to GEPs must have the i8 source element
2588	// type (as they are PtrAdds), so we omit it.
2589	return TypeSwitch<const VPSingleDefRecipe , Type >(R)
2590	.Case(caseFn: [](const VPReplicateRecipe I) -> Type {
2591	if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: I->getUnderlyingValue()))
2592	return GEP->getSourceElementType();
2593	return nullptr;
2594	})
2595	.Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2596	caseFn: [](auto I) { return* I->getSourceElementType(); })
2597	.Default(defaultFn: [](auto ) { return* nullptr; });
2598	}
2599
2600	/// Returns true if recipe \p Def can be safely handed for CSE.
2601	static bool canHandle(const VPSingleDefRecipe *Def) {
2602	// We can extend the list of handled recipes in the future,
2603	// provided we account for the data embedded in them while checking for
2604	// equality or hashing.
2605	auto C = getOpcodeOrIntrinsicID(R: Def);
2606
2607	// The issue with (Insert\|Extract)Value is that the index of the
2608	// insert/extract is not a proper operand in LLVM IR, and hence also not in
2609	// VPlan.
2610	if (!C \|\| (!C ->first && (C ->second == Instruction::InsertValue \|\|
2611	C ->second == Instruction::ExtractValue)))
2612	return false;
2613
2614	// During CSE, we can only handle recipes that don't read from memory: if
2615	// they read from memory, there could be an intervening write to memory
2616	// before the next instance is CSE'd, leading to an incorrect result.
2617	return !Def->mayReadFromMemory();
2618	}
2619
2620	/// Hash the underlying data of \p Def.
2621	static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2622	const VPlan *Plan = Def->getParent()->getPlan();
2623	VPTypeAnalysis TypeInfo(*Plan);
2624	hash_code Result = hash_combine(
2625	args: Def->getVPRecipeID(), args: getOpcodeOrIntrinsicID(R: Def),
2626	args: getGEPSourceElementType(R: Def), args: TypeInfo.inferScalarType(V: Def),
2627	args: vputils::isSingleScalar(VPV: Def), args: hash_combine_range(R: Def->operands()));
2628	if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Val: Def))
2629	if (RFlags->hasPredicate())
2630	return hash_combine(args: Result, args: RFlags->getPredicate());
2631	return Result;
2632	}
2633
2634	/// Check equality of underlying data of \p L and \p R.
2635	static bool isEqual(const VPSingleDefRecipe L, const* VPSingleDefRecipe *R) {
2636	if (isSentinel(Def: L) \|\| isSentinel(Def: R))
2637	return L == R;
2638	if (L->getVPRecipeID() != R->getVPRecipeID() \|\|
2639	getOpcodeOrIntrinsicID(R: L) != getOpcodeOrIntrinsicID(R) \|\|
2640	getGEPSourceElementType(R: L) != getGEPSourceElementType(R) \|\|
2641	vputils::isSingleScalar(VPV: L) != vputils::isSingleScalar(VPV: R) \|\|
2642	!equal(LRange: L->operands(), RRange: R->operands()))
2643	return false;
2644	assert(getOpcodeOrIntrinsicID(L) && getOpcodeOrIntrinsicID(R) &&
2645	"must have valid opcode info for both recipes");
2646	if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(Val: L))
2647	if (LFlags->hasPredicate() &&
2648	LFlags->getPredicate() !=
2649	cast<VPRecipeWithIRFlags>(Val: R)->getPredicate())
2650	return false;
2651	// Recipes in replicate regions implicitly depend on predicate. If either
2652	// recipe is in a replicate region, only consider them equal if both have
2653	// the same parent.
2654	const VPRegionBlock *RegionL = L->getRegion();
2655	const VPRegionBlock *RegionR = R->getRegion();
2656	if (((RegionL && RegionL->isReplicator()) \|\|
2657	(RegionR && RegionR->isReplicator())) &&
2658	L->getParent() != R->getParent())
2659	return false;
2660	const VPlan *Plan = L->getParent()->getPlan();
2661	VPTypeAnalysis TypeInfo(*Plan);
2662	return TypeInfo.inferScalarType(V: L) == TypeInfo.inferScalarType(V: R);
2663	}
2664	};
2665	} // end anonymous namespace
2666
2667	/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2668	/// Plan.
2669	void VPlanTransforms::cse(VPlan &Plan) {
2670	VPDominatorTree VPDT(Plan);
2671	DenseMap<VPSingleDefRecipe , VPSingleDefRecipe , VPCSEDenseMapInfo> CSEMap;
2672
2673	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2674	Range: vp_depth_first_deep(G: Plan.getEntry()))) {
2675	for (VPRecipeBase &R : *VPBB) {
2676	auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R);
2677	if (!Def \|\| !VPCSEDenseMapInfo::canHandle(Def))
2678	continue;
2679	if (VPSingleDefRecipe *V = CSEMap.lookup(Val: Def)) {
2680	// V must dominate Def for a valid replacement.
2681	if (!VPDT.dominates(A: V->getParent(), B: VPBB))
2682	continue;
2683	// Only keep flags present on both V and Def.
2684	if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Val: V))
2685	RFlags->intersectFlags(Other: *cast<VPRecipeWithIRFlags>(Val: Def));
2686	Def->replaceAllUsesWith(New: V);
2687	continue;
2688	}
2689	CSEMap [Def] = Def;
2690	}
2691	}
2692	}
2693
2694	/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2695	static void licm(VPlan &Plan) {
2696	VPBasicBlock *Preheader = Plan.getVectorPreheader();
2697
2698	// Hoist any loop invariant recipes from the vector loop region to the
2699	// preheader. Preform a shallow traversal of the vector loop region, to
2700	// exclude recipes in replicate regions. Since the top-level blocks in the
2701	// vector loop region are guaranteed to execute if the vector pre-header is,
2702	// we don't need to check speculation safety.
2703	VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2704	assert(Preheader->getSingleSuccessor() == LoopRegion &&
2705	"Expected vector prehader's successor to be the vector loop region");
2706	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2707	Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) {
2708	for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
2709	if (cannotHoistOrSinkRecipe(R))
2710	continue;
2711	if (any_of(Range: R.operands(), P: [](VPValue *Op) {
2712	return !Op->isDefinedOutsideLoopRegions();
2713	}))
2714	continue;
2715	R.moveBefore(BB&: *Preheader, I: Preheader->end());
2716	}
2717	}
2718
2719	#ifndef NDEBUG
2720	VPDominatorTree VPDT(Plan);
2721	#endif
2722	// Sink recipes with no users inside the vector loop region if all users are
2723	// in the same exit block of the region.
2724	// TODO: Extend to sink recipes from inner loops.
2725	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2726	Range: vp_post_order_shallow(G: LoopRegion->getEntry()))) {
2727	for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
2728	if (cannotHoistOrSinkRecipe(R))
2729	continue;
2730
2731	// TODO: Support sinking VPReplicateRecipe after ensuring replicateByVF
2732	// handles sunk recipes correctly.
2733	if (isa<VPReplicateRecipe>(Val: &R))
2734	continue;
2735
2736	// TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2737	// support recipes with multiple defined values (e.g., interleaved loads).
2738	auto *Def = cast<VPSingleDefRecipe>(Val: &R);
2739	// Skip recipes without users as we cannot determine a sink block.
2740	// TODO: Clone sinkable recipes without users to all exit blocks to reduce
2741	// their execution frequency.
2742	if (Def->getNumUsers() == `0`)
2743	continue;
2744
2745	VPBasicBlock SinkBB = nullptr*;
2746	// Cannot sink the recipe if any user
2747	// is defined in any loop region, or*
2748	// is a phi, or*
2749	// multiple users in different blocks.*
2750	if (any_of(Range: Def->users(), P: [&SinkBB](VPUser *U) {
2751	auto *UserR = cast<VPRecipeBase>(Val: U);
2752	VPBasicBlock *Parent = UserR->getParent();
2753	// TODO: If the user is a PHI node, we should check the block of
2754	// incoming value. Support PHI node users if needed.
2755	if (UserR->isPhi() \|\| Parent->getEnclosingLoopRegion())
2756	return true;
2757	// TODO: Support sinking when users are in multiple blocks.
2758	if (SinkBB && SinkBB != Parent)
2759	return true;
2760	SinkBB = Parent;
2761	return false;
2762	}))
2763	continue;
2764
2765	// Only sink to dedicated exit blocks of the loop region.
2766	if (SinkBB->getSinglePredecessor() != LoopRegion)
2767	continue;
2768
2769	// TODO: This will need to be a check instead of a assert after
2770	// conditional branches in vectorized loops are supported.
2771	assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2772	"Defining block must dominate sink block");
2773	// TODO: Clone the recipe if users are on multiple exit paths, instead of
2774	// just moving.
2775	Def->moveBefore(BB&: *SinkBB, I: SinkBB->getFirstNonPhi());
2776	}
2777	}
2778	}
2779
2780	void VPlanTransforms::truncateToMinimalBitwidths(
2781	VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2782	if (Plan.hasScalarVFOnly())
2783	return;
2784	// Keep track of created truncates, so they can be re-used. Note that we
2785	// cannot use RAUW after creating a new truncate, as this would could make
2786	// other uses have different types for their operands, making them invalidly
2787	// typed.
2788	DenseMap<VPValue , VPWidenCastRecipe > ProcessedTruncs;
2789	VPTypeAnalysis TypeInfo(Plan);
2790	VPBasicBlock *PH = Plan.getVectorPreheader();
2791	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2792	Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
2793	for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
2794	if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
2795	VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R))
2796	continue;
2797
2798	VPValue *ResultVPV = R.getVPSingleValue();
2799	auto *UI = cast_or_null<Instruction>(Val: ResultVPV->getUnderlyingValue());
2800	unsigned NewResSizeInBits = MinBWs.lookup(Key: UI);
2801	if (!NewResSizeInBits)
2802	continue;
2803
2804	// If the value wasn't vectorized, we must maintain the original scalar
2805	// type. Skip those here, after incrementing NumProcessedRecipes. Also
2806	// skip casts which do not need to be handled explicitly here, as
2807	// redundant casts will be removed during recipe simplification.
2808	if (isa<VPReplicateRecipe, VPWidenCastRecipe>(Val: &R))
2809	continue;
2810
2811	Type *OldResTy = TypeInfo.inferScalarType(V: ResultVPV);
2812	unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2813	assert(OldResTy->isIntegerTy() && "only integer types supported");
2814	(void)OldResSizeInBits;
2815
2816	auto *NewResTy = IntegerType::get(C&: Plan.getContext(), NumBits: NewResSizeInBits);
2817
2818	// Any wrapping introduced by shrinking this operation shouldn't be
2819	// considered undefined behavior. So, we can't unconditionally copy
2820	// arithmetic wrapping flags to VPW.
2821	if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(Val: &R))
2822	VPW->dropPoisonGeneratingFlags();
2823
2824	if (OldResSizeInBits != NewResSizeInBits &&
2825	!match(V: &R, P: m_ICmp(Op0: m_VPValue(), Op1: m_VPValue()))) {
2826	// Extend result to original width.
2827	auto Ext = new* VPWidenCastRecipe (
2828	Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2829	VPIRFlags::getDefaultFlags(Opcode: Instruction::ZExt));
2830	Ext->insertAfter(InsertPos: &R);
2831	ResultVPV->replaceAllUsesWith(New: Ext);
2832	Ext->setOperand(I: `0`, New: ResultVPV);
2833	assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2834	} else {
2835	assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2836	"Only ICmps should not need extending the result.");
2837	}
2838
2839	assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2840	if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R))
2841	continue;
2842
2843	// Shrink operands by introducing truncates as needed.
2844	unsigned StartIdx =
2845	match(V: &R, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())) ? `1` : `0`;
2846	for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2847	auto *Op = R.getOperand(N: Idx);
2848	unsigned OpSizeInBits =
2849	TypeInfo.inferScalarType(V: Op)->getScalarSizeInBits();
2850	if (OpSizeInBits == NewResSizeInBits)
2851	continue;
2852	assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2853	auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Key: Op);
2854	if (!IterIsEmpty) {
2855	R.setOperand(I: Idx, New: ProcessedIter ->second);
2856	continue;
2857	}
2858
2859	VPBuilder Builder;
2860	if (isa<VPIRValue>(Val: Op))
2861	Builder.setInsertPoint(PH);
2862	else
2863	Builder.setInsertPoint(&R);
2864	VPWidenCastRecipe *NewOp =
2865	Builder.createWidenCast(Opcode: Instruction::Trunc, Op, ResultTy: NewResTy);
2866	ProcessedIter ->second = NewOp;
2867	R.setOperand(I: Idx, New: NewOp);
2868	}
2869
2870	}
2871	}
2872	}
2873
2874	void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
2875	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2876	Range: vp_depth_first_shallow(G: Plan.getEntry()))) {
2877	VPValue *Cond;
2878	// Skip blocks that are not terminated by BranchOnCond.
2879	if (VPBB->empty() \|\| !match(V: &VPBB->back(), P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))))
2880	continue;
2881
2882	assert(VPBB->getNumSuccessors() == `2` &&
2883	"Two successors expected for BranchOnCond");
2884	unsigned RemovedIdx;
2885	if (match(V: Cond, P: m_True()))
2886	RemovedIdx = `1`;
2887	else if (match(V: Cond, P: m_False()))
2888	RemovedIdx = `0`;
2889	else
2890	continue;
2891
2892	VPBasicBlock *RemovedSucc =
2893	cast<VPBasicBlock>(Val: VPBB->getSuccessors()[RemovedIdx]);
2894	assert(count(RemovedSucc->getPredecessors(), VPBB) == `1` &&
2895	"There must be a single edge between VPBB and its successor");
2896	// Values coming from VPBB into phi recipes of RemoveSucc are removed from
2897	// these recipes.
2898	for (VPRecipeBase &R : RemovedSucc->phis())
2899	cast<VPPhiAccessors>(Val: &R)->removeIncomingValueFor(IncomingBlock: VPBB);
2900
2901	// Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2902	// automatically on VPlan destruction if it becomes unreachable.
2903	VPBlockUtils::disconnectBlocks(From: VPBB, To: RemovedSucc);
2904	VPBB->back().eraseFromParent();
2905	}
2906	}
2907
/// Run a fixed sequence of VPlan-to-VPlan passes on \p Plan. Every pass is
/// invoked through the RUN_VPLAN_PASS macro, in the order listed below.
void VPlanTransforms::optimize(VPlan &Plan) {
  RUN_VPLAN_PASS(removeRedundantCanonicalIVs, Plan);
  RUN_VPLAN_PASS(removeRedundantInductionCasts, Plan);

  // First round of reassociateHeaderMask / simplifyRecipes /
  // removeDeadRecipes; the same three passes are run again further down.
  RUN_VPLAN_PASS(reassociateHeaderMask, Plan);
  RUN_VPLAN_PASS(simplifyRecipes, Plan);
  RUN_VPLAN_PASS(removeDeadRecipes, Plan);
  RUN_VPLAN_PASS(simplifyBlends, Plan);
  RUN_VPLAN_PASS(legalizeAndOptimizeInductions, Plan);
  RUN_VPLAN_PASS(narrowToSingleScalarRecipes, Plan);
  RUN_VPLAN_PASS(removeRedundantExpandSCEVRecipes, Plan);
  // Second round, now including removeBranchOnConst.
  RUN_VPLAN_PASS(reassociateHeaderMask, Plan);
  RUN_VPLAN_PASS(simplifyRecipes, Plan);
  RUN_VPLAN_PASS(removeBranchOnConst, Plan);
  RUN_VPLAN_PASS(removeDeadRecipes, Plan);

  // Replicate regions are created here; licm (the file-local helper) runs
  // last.
  RUN_VPLAN_PASS(createAndOptimizeReplicateRegions, Plan);
  RUN_VPLAN_PASS(hoistInvariantLoads, Plan);
  RUN_VPLAN_PASS(mergeBlocksIntoPredecessors, Plan);
  RUN_VPLAN_PASS(licm, Plan);
}
2929
2930	// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2931	// the loop terminator with a branch-on-cond recipe with the negated
2932	// active-lane-mask as operand. Note that this turns the loop into an
2933	// uncountable one. Only the existing terminator is replaced, all other existing
2934	// recipes/users remain unchanged, except for poison-generating flags being
2935	// dropped from the canonical IV increment. Return the created
2936	// VPActiveLaneMaskPHIRecipe.
2937	//
2938	// The function adds the following recipes:
2939	//
2940	// vector.ph:
2941	// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2942	// %EntryALM = active-lane-mask %EntryInc, TC
2943	//
2944	// vector.body:
2945	// ...
2946	// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2947	// ...
2948	// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2949	// %ALM = active-lane-mask %InLoopInc, TC
2950	// %Negated = Not %ALM
2951	// branch-on-cond %Negated
2952	//
2953	static VPActiveLaneMaskPHIRecipe *
2954	addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan) {
2955	VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2956	VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2957	auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2958	VPValue *StartV = CanonicalIVPHI->getStartValue();
2959
2960	auto *CanonicalIVIncrement =
2961	cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
2962	// TODO: Check if dropping the flags is needed.
2963	CanonicalIVIncrement->dropPoisonGeneratingFlags();
2964	DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2965	// We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2966	// we have to take unrolling into account. Each part needs to start at
2967	// Part VF*
2968	auto *VecPreheader = Plan.getVectorPreheader();
2969	VPBuilder Builder(VecPreheader);
2970
2971	// Create the ActiveLaneMask instruction using the correct start values.
2972	VPValue *TC = Plan.getTripCount();
2973	VPValue *VF = &Plan.getVF();
2974
2975	auto *EntryIncrement = Builder.createOverflowingOp(
2976	Opcode: VPInstruction::CanonicalIVIncrementForPart, Operands: {StartV, VF}, WrapFlags: {false, false},
2977	DL, Name: "index.part.next");
2978
2979	// Create the active lane mask instruction in the VPlan preheader.
2980	VPValue *ALMMultiplier =
2981	Plan.getConstantInt(Ty: TopRegion->getCanonicalIVType(), Val: `1`);
2982	auto *EntryALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2983	Operands: {EntryIncrement, TC, ALMMultiplier}, DL,
2984	Name: "active.lane.mask.entry");
2985
2986	// Now create the ActiveLaneMaskPhi recipe in the main loop using the
2987	// preheader ActiveLaneMask instruction.
2988	auto *LaneMaskPhi =
2989	new VPActiveLaneMaskPHIRecipe (EntryALM, DebugLoc::getUnknown());
2990	LaneMaskPhi->insertAfter(InsertPos: CanonicalIVPHI);
2991
2992	// Create the active lane mask for the next iteration of the loop before the
2993	// original terminator.
2994	VPRecipeBase *OriginalTerminator = EB->getTerminator();
2995	Builder.setInsertPoint(OriginalTerminator);
2996	auto *InLoopIncrement = Builder.createOverflowingOp(
2997	Opcode: VPInstruction::CanonicalIVIncrementForPart,
2998	Operands: {CanonicalIVIncrement, &Plan.getVF()}, WrapFlags: {false, false}, DL);
2999	auto *ALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
3000	Operands: {InLoopIncrement, TC, ALMMultiplier}, DL,
3001	Name: "active.lane.mask.next");
3002	LaneMaskPhi->addOperand(Operand: ALM);
3003
3004	// Replace the original terminator with BranchOnCond. We have to invert the
3005	// mask here because a true condition means jumping to the exit block.
3006	auto *NotMask = Builder.createNot(Operand: ALM, DL);
3007	Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {NotMask}, DL);
3008	OriginalTerminator->eraseFromParent();
3009	return LaneMaskPhi;
3010	}
3011
3012	void VPlanTransforms::addActiveLaneMask(VPlan &Plan,
3013	bool UseActiveLaneMaskForControlFlow) {
3014	VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3015	auto *FoundWidenCanonicalIVUser = find_if(
3016	Range: LoopRegion->getCanonicalIV()->users(), P: IsaPred<VPWidenCanonicalIVRecipe>);
3017	assert(FoundWidenCanonicalIVUser &&
3018	"Must have widened canonical IV when tail folding!");
3019	VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
3020	auto *WideCanonicalIV =
3021	cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser);
3022	VPSingleDefRecipe *LaneMask;
3023	if (UseActiveLaneMaskForControlFlow) {
3024	LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
3025	} else {
3026	VPBuilder B = VPBuilder::getToInsertAfter(R: WideCanonicalIV);
3027	VPValue *ALMMultiplier =
3028	Plan.getConstantInt(Ty: LoopRegion->getCanonicalIVType(), Val: `1`);
3029	LaneMask =
3030	B.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
3031	Operands: {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
3032	DL: nullptr, Name: "active.lane.mask");
3033	}
3034
3035	// Walk users of WideCanonicalIV and replace the header mask of the form
3036	// (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
3037	// removing the old one to ensure there is always only a single header mask.
3038	HeaderMask->replaceAllUsesWith(New: LaneMask);
3039	HeaderMask->eraseFromParent();
3040	}
3041
/// Pattern matcher that recognises a mask built from a specific mask \p In.
///
/// A value matches if it is exactly \p In, in which case \p Out is set to
/// nullptr, or if it matches (logical-and In, X), in which case \p Out is
/// bound to X.
template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
  /// The mask that must be present.
  Op0_t In;
  /// Receives the remaining mask operand, or nullptr if there is none.
  Op1_t &Out;

  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}

  template <typename OpTy> bool match(OpTy *V) const {
    // The value is the mask itself: nothing remains once it is removed.
    if (m_Specific(In).match(V)) {
      Out = nullptr;
      return true;
    }
    return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
  }
};
3056
3057	/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
3058	/// Returns the remaining part \p Out if so, or nullptr otherwise.
3059	template <typename Op0_t, typename Op1_t>
3060	static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3061	Op1_t &Out) {
3062	return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3063	}
3064
3065	/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3066	/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3067	/// recipe could be created.
3068	/// \p HeaderMask Header Mask.
3069	/// \p CurRecipe Recipe to be transform.
3070	/// \p TypeInfo VPlan-based type analysis.
3071	/// \p EVL The explicit vector length parameter of vector-predication
3072	/// intrinsics.
3073	static VPRecipeBase optimizeMaskToEVL(VPValue HeaderMask,
3074	VPRecipeBase &CurRecipe,
3075	VPTypeAnalysis &TypeInfo, VPValue &EVL) {
3076	VPlan *Plan = CurRecipe.getParent()->getPlan();
3077	DebugLoc DL = CurRecipe.getDebugLoc();
3078	VPValue Addr, Mask, *EndPtr;
3079
3080	/// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3081	auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3082	auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(Val: EndPtr)->clone();
3083	EVLEndPtr->insertBefore(InsertPos: &CurRecipe);
3084	EVLEndPtr->setOperand(I: `1`, New: &EVL);
3085	return EVLEndPtr;
3086	};
3087
3088	if (match(V: &CurRecipe,
3089	P: m_MaskedLoad(Addr: m_VPValue(V&: Addr), Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
3090	!cast<VPWidenLoadRecipe>(Val&: CurRecipe).isReverse())
3091	return new VPWidenLoadEVLRecipe (cast<VPWidenLoadRecipe>(Val&: CurRecipe), Addr,
3092	EVL, Mask);
3093
3094	VPValue *ReversedVal;
3095	if (match(V: &CurRecipe, P: m_Reverse(Op0: m_VPValue(V&: ReversedVal))) &&
3096	match(V: ReversedVal,
3097	P: m_MaskedLoad(Addr: m_VPValue(V&: EndPtr), Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
3098	match(V: EndPtr, P: m_VecEndPtr(Op0: m_VPValue(V&: Addr), Op1: m_Specific(VPV: &Plan->getVF()))) &&
3099	cast<VPWidenLoadRecipe>(Val: ReversedVal)->isReverse()) {
3100	auto LoadR = new* VPWidenLoadEVLRecipe (
3101	*cast<VPWidenLoadRecipe>(Val: ReversedVal), AdjustEndPtr (EndPtr), EVL, Mask);
3102	LoadR->insertBefore(InsertPos: &CurRecipe);
3103	return new VPWidenIntrinsicRecipe (
3104	Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
3105	TypeInfo.inferScalarType(V: LoadR), {}, {}, DL);
3106	}
3107
3108	VPValue *StoredVal;
3109	if (match(V: &CurRecipe, P: m_MaskedStore(Addr: m_VPValue(V&: Addr), Val: m_VPValue(V&: StoredVal),
3110	Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
3111	!cast<VPWidenStoreRecipe>(Val&: CurRecipe).isReverse())
3112	return new VPWidenStoreEVLRecipe (cast<VPWidenStoreRecipe>(Val&: CurRecipe), Addr,
3113	StoredVal, EVL, Mask);
3114
3115	if (match(V: &CurRecipe,
3116	P: m_MaskedStore(Addr: m_VPValue(V&: EndPtr), Val: m_Reverse(Op0: m_VPValue(V&: ReversedVal)),
3117	Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
3118	match(V: EndPtr, P: m_VecEndPtr(Op0: m_VPValue(V&: Addr), Op1: m_Specific(VPV: &Plan->getVF()))) &&
3119	cast<VPWidenStoreRecipe>(Val&: CurRecipe).isReverse()) {
3120	auto NewReverse = new* VPWidenIntrinsicRecipe (
3121	Intrinsic::experimental_vp_reverse,
3122	{ReversedVal, Plan->getTrue(), &EVL},
3123	TypeInfo.inferScalarType(V: ReversedVal), {}, {}, DL);
3124	NewReverse->insertBefore(InsertPos: &CurRecipe);
3125	return new VPWidenStoreEVLRecipe (cast<VPWidenStoreRecipe>(Val&: CurRecipe),
3126	AdjustEndPtr (EndPtr), NewReverse, EVL,
3127	Mask);
3128	}
3129
3130	if (auto *Rdx = dyn_cast<VPReductionRecipe>(Val: &CurRecipe))
3131	if (Rdx->isConditional() &&
3132	match(V: Rdx->getCondOp(), P: m_RemoveMask(In: HeaderMask, Out&: Mask)))
3133	return new VPReductionEVLRecipe (*Rdx, EVL, Mask);
3134
3135	if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(Val: &CurRecipe))
3136	if (Interleave->getMask() &&
3137	match(V: Interleave->getMask(), P: m_RemoveMask(In: HeaderMask, Out&: Mask)))
3138	return new VPInterleaveEVLRecipe (*Interleave, EVL, Mask);
3139
3140	VPValue LHS, RHS;
3141	if (match(V: &CurRecipe,
3142	P: m_Select(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: LHS), Op2: m_VPValue(V&: RHS))))
3143	return new VPWidenIntrinsicRecipe (
3144	Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3145	TypeInfo.inferScalarType(V: LHS), {}, {}, DL);
3146
3147	if (match(V: &CurRecipe, P: m_Select(Op0: m_RemoveMask(In: HeaderMask, Out&: Mask), Op1: m_VPValue(V&: LHS),
3148	Op2: m_VPValue(V&: RHS))))
3149	return new VPWidenIntrinsicRecipe (
3150	Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3151	TypeInfo.inferScalarType(V: LHS), {}, {}, DL);
3152
3153	if (match(V: &CurRecipe, P: m_LastActiveLane(Op0: m_Specific(VPV: HeaderMask)))) {
3154	Type *Ty = TypeInfo.inferScalarType(V: CurRecipe.getVPSingleValue());
3155	VPValue *ZExt =
3156	VPBuilder (&CurRecipe).createScalarCast(Opcode: Instruction::ZExt, Op: &EVL, ResultTy: Ty, DL);
3157	return new VPInstruction (
3158	Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, Val: `1`)},
3159	VPIRFlags::getDefaultFlags(Opcode: Instruction::Sub), {}, DL);
3160	}
3161
3162	return nullptr;
3163	}
3164
3165	/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3166	/// The transforms here need to preserve the original semantics.
3167	void VPlanTransforms::optimizeEVLMasks(VPlan &Plan) {
3168	// Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3169	VPValue HeaderMask = nullptr, EVL = nullptr;
3170	for (VPRecipeBase &R : *Plan.getVectorLoopRegion()->getEntryBasicBlock()) {
3171	if (match(V: &R, P: m_SpecificICmp(MatchPred: CmpInst::ICMP_ULT, Op0: m_StepVector(),
3172	Op1: m_VPValue(V&: EVL))) &&
3173	match(V: EVL, P: m_EVL(Op0: m_VPValue()))) {
3174	HeaderMask = R.getVPSingleValue();
3175	break;
3176	}
3177	}
3178	if (!HeaderMask)
3179	return;
3180
3181	VPTypeAnalysis TypeInfo(Plan);
3182	SmallVector<VPRecipeBase *> OldRecipes;
3183	for (VPUser *U : collectUsersRecursively(V: HeaderMask)) {
3184	VPRecipeBase *R = cast<VPRecipeBase>(Val: U);
3185	if (auto NewR = optimizeMaskToEVL(HeaderMask, CurRecipe&: R, TypeInfo, EVL&: *EVL)) {
3186	NewR->insertBefore(InsertPos: R);
3187	for (auto [Old, New] :
3188	zip_equal(t: R->definedValues(), u: NewR->definedValues()))
3189	Old->replaceAllUsesWith(New);
3190	OldRecipes.push_back(Elt: R);
3191	}
3192	}
3193	// Erase old recipes at the end so we don't invalidate TypeInfo.
3194	for (VPRecipeBase *R : reverse(C&: OldRecipes)) {
3195	SmallVector<VPValue *> PossiblyDead(R->operands());
3196	R->eraseFromParent();
3197	for (VPValue *Op : PossiblyDead)
3198	recursivelyDeleteDeadRecipes(V: Op);
3199	}
3200	}
3201
3202	/// After replacing the canonical IV with a EVL-based IV, fixup recipes that use
3203	/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3204	/// iteration.
3205	static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3206	VPTypeAnalysis TypeInfo(Plan);
3207	VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3208	VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3209
3210	assert(all_of(Plan.getVF().users(),
3211	IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
3212	VPWidenIntOrFpInductionRecipe>) &&
3213	"User of VF that we can't transform to EVL.");
3214	Plan.getVF().replaceUsesWithIf(New: &EVL, ShouldReplace: [](VPUser &U, unsigned Idx) {
3215	return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(Val: U);
3216	});
3217
3218	assert(all_of(Plan.getVFxUF().users(),
3219	[&LoopRegion, &Plan](VPUser *U) {
3220	return match(U,
3221	m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3222	m_Specific(&Plan.getVFxUF()))) \|\|
3223	isa<VPWidenPointerInductionRecipe>(U);
3224	}) &&
3225	"Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3226	"increment of the canonical induction.");
3227	Plan.getVFxUF().replaceUsesWithIf(New: &EVL, ShouldReplace: [](VPUser &U, unsigned Idx) {
3228	// Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3229	// canonical induction must not be updated.
3230	return isa<VPWidenPointerInductionRecipe>(Val: U);
3231	});
3232
3233	// Create a scalar phi to track the previous EVL if fixed-order recurrence is
3234	// contained.
3235	bool ContainsFORs =
3236	any_of(Range: Header->phis(), P: IsaPred<VPFirstOrderRecurrencePHIRecipe>);
3237	if (ContainsFORs) {
3238	// TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3239	VPValue *MaxEVL = &Plan.getVF();
3240	// Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3241	VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3242	MaxEVL = Builder.createScalarZExtOrTrunc(
3243	Op: MaxEVL, ResultTy: Type::getInt32Ty(C&: Plan.getContext()),
3244	SrcTy: TypeInfo.inferScalarType(V: MaxEVL), DL: DebugLoc::getUnknown());
3245
3246	Builder.setInsertPoint(TheBB: Header, IP: Header->getFirstNonPhi());
3247	VPValue *PrevEVL = Builder.createScalarPhi(
3248	IncomingValues: {MaxEVL, &EVL}, DL: DebugLoc::getUnknown(), Name: "prev.evl");
3249
3250	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3251	Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
3252	for (VPRecipeBase &R : *VPBB) {
3253	VPValue V1, V2;
3254	if (!match(V: &R,
3255	P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
3256	Ops: m_VPValue(V&: V1), Ops: m_VPValue(V&: V2))))
3257	continue;
3258	VPValue *Imm = Plan.getOrAddLiveIn(
3259	V: ConstantInt::getSigned(Ty: Type::getInt32Ty(C&: Plan.getContext()), V: -`1`));
3260	VPWidenIntrinsicRecipe VPSplice = new* VPWidenIntrinsicRecipe (
3261	Intrinsic::experimental_vp_splice,
3262	{V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3263	TypeInfo.inferScalarType(V: R.getVPSingleValue()), {}, {},
3264	R.getDebugLoc());
3265	VPSplice->insertBefore(InsertPos: &R);
3266	R.getVPSingleValue()->replaceAllUsesWith(New: VPSplice);
3267	}
3268	}
3269	}
3270
3271	VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3272	if (!HeaderMask)
3273	return;
3274
3275	// Replace header masks with a mask equivalent to predicating by EVL:
3276	//
3277	// icmp ule widen-canonical-iv backedge-taken-count
3278	// ->
3279	// icmp ult step-vector, EVL
3280	VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3281	VPBuilder Builder(EVLR->getParent(), std::next(x: EVLR->getIterator()));
3282	Type *EVLType = TypeInfo.inferScalarType(V: &EVL);
3283	VPValue *EVLMask = Builder.createICmp(
3284	Pred: CmpInst::ICMP_ULT,
3285	A: Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: EVLType), B: &EVL);
3286	HeaderMask->replaceAllUsesWith(New: EVLMask);
3287	}
3288
3289	/// Converts a tail folded vector loop region to step by
3290	/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3291	/// iteration.
3292	///
3293	/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3294	/// replaces all uses except the canonical IV increment of
3295	/// VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
3296	/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
3297	/// this transformation.
3298	///
3299	/// - The header mask is replaced with a header mask based on the EVL.
3300	///
3301	/// - Plans with FORs have a new phi added to keep track of the EVL of the
3302	/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3303	/// @llvm.vp.splice.
3304	///
3305	/// The function uses the following definitions:
3306	/// %StartV is the canonical induction start value.
3307	///
3308	/// The function adds the following recipes:
3309	///
3310	/// vector.ph:
3311	/// ...
3312	///
3313	/// vector.body:
3314	/// ...
3315	/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3316	/// [ %NextIter, %vector.body ]
3317	/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3318	/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3319	/// ...
3320	/// %OpEVL = cast i32 %VPEVL to IVSize
3321	/// %NextIter = add IVSize %OpEVL, %CurrentIter
3322	/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3323	/// ...
3324	///
3325	/// If MaxSafeElements is provided, the function adds the following recipes:
3326	/// vector.ph:
3327	/// ...
3328	///
3329	/// vector.body:
3330	/// ...
3331	/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3332	/// [ %NextIter, %vector.body ]
3333	/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3334	/// %cmp = cmp ult %AVL, MaxSafeElements
3335	/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3336	/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3337	/// ...
3338	/// %OpEVL = cast i32 %VPEVL to IVSize
3339	/// %NextIter = add IVSize %OpEVL, %CurrentIter
3340	/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3341	/// ...
3342	///
3343	void VPlanTransforms::addExplicitVectorLength(
3344	VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3345	if (Plan.hasScalarVFOnly())
3346	return;
3347	VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3348	VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3349
3350	auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3351	auto *CanIVTy = LoopRegion->getCanonicalIVType();
3352	VPValue *StartV = CanonicalIVPHI->getStartValue();
3353
3354	// Create the CurrentIteration recipe in the vector loop.
3355	auto *CurrentIteration =
3356	new VPCurrentIterationPHIRecipe (StartV, DebugLoc::getUnknown());
3357	CurrentIteration->insertAfter(InsertPos: CanonicalIVPHI);
3358	VPBuilder Builder(Header, Header->getFirstNonPhi());
3359	// Create the AVL (application vector length), starting from TC -> 0 in steps
3360	// of EVL.
3361	VPPhi *AVLPhi = Builder.createScalarPhi(
3362	IncomingValues: {Plan.getTripCount()}, DL: DebugLoc::getCompilerGenerated(), Name: "avl");
3363	VPValue *AVL = AVLPhi;
3364
3365	if (MaxSafeElements) {
3366	// Support for MaxSafeDist for correct loop emission.
3367	VPValue AVLSafe = Plan.getConstantInt(Ty: CanIVTy, Val: MaxSafeElements);
3368	VPValue *Cmp = Builder.createICmp(Pred: ICmpInst::ICMP_ULT, A: AVL, B: AVLSafe);
3369	AVL = Builder.createSelect(Cond: Cmp, TrueVal: AVL, FalseVal: AVLSafe, DL: DebugLoc::getUnknown(),
3370	Name: "safe_avl");
3371	}
3372	auto *VPEVL = Builder.createNaryOp(Opcode: VPInstruction::ExplicitVectorLength, Operands: AVL,
3373	DL: DebugLoc::getUnknown(), Name: "evl");
3374
3375	auto *CanonicalIVIncrement =
3376	cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
3377	Builder.setInsertPoint(CanonicalIVIncrement);
3378	VPValue *OpVPEVL = VPEVL;
3379
3380	auto *I32Ty = Type::getInt32Ty(C&: Plan.getContext());
3381	OpVPEVL = Builder.createScalarZExtOrTrunc(
3382	Op: OpVPEVL, ResultTy: CanIVTy, SrcTy: I32Ty, DL: CanonicalIVIncrement->getDebugLoc());
3383
3384	auto *NextIter = Builder.createAdd(LHS: OpVPEVL, RHS: CurrentIteration,
3385	DL: CanonicalIVIncrement->getDebugLoc(),
3386	Name: "current.iteration.next",
3387	WrapFlags: {CanonicalIVIncrement->hasNoUnsignedWrap(),
3388	CanonicalIVIncrement->hasNoSignedWrap()});
3389	CurrentIteration->addOperand(Operand: NextIter);
3390
3391	VPValue *NextAVL =
3392	Builder.createSub(LHS: AVLPhi, RHS: OpVPEVL, DL: DebugLoc::getCompilerGenerated(),
3393	Name: "avl.next", WrapFlags: {/NUW=/true, /NSW=/false});
3394	AVLPhi->addOperand(Operand: NextAVL);
3395
3396	fixupVFUsersForEVL(Plan, EVL&: *VPEVL);
3397	removeDeadRecipes(Plan);
3398
3399	// Replace all uses of VPCanonicalIVPHIRecipe by
3400	// VPCurrentIterationPHIRecipe except for the canonical IV increment.
3401	CanonicalIVPHI->replaceAllUsesWith(New: CurrentIteration);
3402	CanonicalIVIncrement->setOperand(I: `0`, New: CanonicalIVPHI);
3403	// TODO: support unroll factor > 1.
3404	Plan.setUF(`1`);
3405	}
3406
3407	void VPlanTransforms::convertToVariableLengthStep(VPlan &Plan) {
3408	// Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3409	// There should be only one VPCurrentIteration in the entire plan.
3410	VPCurrentIterationPHIRecipe CurrentIteration = nullptr*;
3411
3412	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3413	Range: vp_depth_first_shallow(G: Plan.getEntry())))
3414	for (VPRecipeBase &R : VPBB->phis())
3415	if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(Val: &R)) {
3416	assert(!CurrentIteration &&
3417	"Found multiple CurrentIteration. Only one expected");
3418	CurrentIteration = PhiR;
3419	}
3420
3421	// Early return if it is not variable-length stepping.
3422	if (!CurrentIteration)
3423	return;
3424
3425	VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3426	VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3427
3428	// Convert CurrentIteration to concrete recipe.
3429	auto *ScalarR =
3430	VPBuilder (CurrentIteration)
3431	.createScalarPhi(
3432	IncomingValues: {CurrentIteration->getStartValue(), CurrentIterationIncr},
3433	DL: CurrentIteration->getDebugLoc(), Name: "current.iteration.iv");
3434	CurrentIteration->replaceAllUsesWith(New: ScalarR);
3435	CurrentIteration->eraseFromParent();
3436
3437	// Replace CanonicalIVInc with CurrentIteration increment.
3438	auto CanonicalIV = cast<VPPhi>(Val: &HeaderVPBB->begin());
3439	VPValue *Backedge = CanonicalIV->getIncomingValue(Idx: `1`);
3440	assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3441	m_Specific(&Plan.getVFxUF()))) &&
3442	"Unexpected canonical iv");
3443	Backedge->replaceAllUsesWith(New: CurrentIterationIncr);
3444
3445	// Remove unused phi and increment.
3446	VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3447	CanonicalIVIncrement->eraseFromParent();
3448	CanonicalIV->eraseFromParent();
3449	}
3450
3451	void VPlanTransforms::convertEVLExitCond(VPlan &Plan) {
3452	VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3453	// The canonical IV may not exist at this stage.
3454	if (!LoopRegion \|\|
3455	!isa<VPCanonicalIVPHIRecipe>(Val: LoopRegion->getEntryBasicBlock()->front()))
3456	return;
3457	VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
3458	if (std::next(x: CanIV->getIterator()) == CanIV->getParent()->end())
3459	return;
3460	// The EVL IV is always immediately after the canonical IV.
3461	auto *EVLPhi = dyn_cast_or_null<VPCurrentIterationPHIRecipe>(
3462	Val: std::next(x: CanIV->getIterator()));
3463	if (!EVLPhi)
3464	return;
3465
3466	// Bail if not an EVL tail folded loop.
3467	VPValue *AVL;
3468	if (!match(V: EVLPhi->getBackedgeValue(),
3469	P: m_c_Add(Op0: m_ZExtOrSelf(Op0: m_EVL(Op0: m_VPValue(V&: AVL))), Op1: m_Specific(VPV: EVLPhi))))
3470	return;
3471
3472	// The AVL may be capped to a safe distance.
3473	VPValue *SafeAVL;
3474	if (match(V: AVL, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: SafeAVL), Op2: m_VPValue())))
3475	AVL = SafeAVL;
3476
3477	VPValue *AVLNext;
3478	[[maybe_unused]] bool FoundAVLNext =
3479	match(V: AVL, P: m_VPInstruction<Instruction::PHI>(
3480	Ops: m_Specific(VPV: Plan.getTripCount()), Ops: m_VPValue(V&: AVLNext)));
3481	assert(FoundAVLNext && "Didn't find AVL backedge?");
3482
3483	VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3484	auto *LatchBr = cast<VPInstruction>(Val: Latch->getTerminator());
3485	if (match(V: LatchBr, P: m_BranchOnCond(Op0: m_True())))
3486	return;
3487
3488	assert(
3489	match(LatchBr,
3490	m_BranchOnCond(m_SpecificCmp(
3491	CmpInst::ICMP_EQ, m_Specific(CanIV->getIncomingValue(`1`)),
3492	m_Specific(&Plan.getVectorTripCount())))) &&
3493	"Expected BranchOnCond with ICmp comparing CanIV increment with vector "
3494	"trip count");
3495
3496	Type *AVLTy = VPTypeAnalysis (Plan).inferScalarType(V: AVLNext);
3497	VPBuilder Builder(LatchBr);
3498	LatchBr->setOperand(
3499	I: `0`, New: Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: AVLNext, B: Plan.getZero(Ty: AVLTy)));
3500	}
3501
3502	void VPlanTransforms::replaceSymbolicStrides(
3503	VPlan &Plan, PredicatedScalarEvolution &PSE,
3504	const DenseMap<Value , const* SCEV *> &StridesMap) {
3505	// Replace VPValues for known constant strides guaranteed by predicate scalar
3506	// evolution.
3507	auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3508	auto *R = cast<VPRecipeBase>(Val: &U);
3509	return R->getRegion() \|\|
3510	R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3511	};
3512	ValueToSCEVMapTy RewriteMap;
3513	for (const SCEV *Stride : StridesMap.values()) {
3514	using namespace SCEVPatternMatch;
3515	auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue();
3516	const APInt *StrideConst;
3517	if (!match(S: PSE.getSCEV(V: StrideV), P: m_scev_APInt(C&: StrideConst)))
3518	// Only handle constant strides for now.
3519	continue;
3520
3521	auto CI = Plan.getConstantInt(Val: StrideConst);
3522	if (VPValue *StrideVPV = Plan.getLiveIn(V: StrideV))
3523	StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
3524
3525	// The versioned value may not be used in the loop directly but through a
3526	// sext/zext. Add new live-ins in those cases.
3527	for (Value *U : StrideV->users()) {
3528	if (!isa<SExtInst, ZExtInst>(Val: U))
3529	continue;
3530	VPValue *StrideVPV = Plan.getLiveIn(V: U);
3531	if (!StrideVPV)
3532	continue;
3533	unsigned BW = U->getType()->getScalarSizeInBits();
3534	APInt C =
3535	isa<SExtInst>(Val: U) ? StrideConst->sext(width: BW) : StrideConst->zext(width: BW);
3536	VPValue *CI = Plan.getConstantInt(Val: C);
3537	StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
3538	}
3539	RewriteMap [StrideV] = PSE.getSCEV(V: StrideV);
3540	}
3541
3542	for (VPRecipeBase &R : *Plan.getEntry()) {
3543	auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
3544	if (!ExpSCEV)
3545	continue;
3546	const SCEV *ScevExpr = ExpSCEV->getSCEV();
3547	auto *NewSCEV =
3548	SCEVParameterRewriter::rewrite(Scev: ScevExpr, SE&: *PSE.getSE(), Map&: RewriteMap);
3549	if (NewSCEV != ScevExpr) {
3550	VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: NewSCEV);
3551	ExpSCEV->replaceAllUsesWith(New: NewExp);
3552	if (Plan.getTripCount() == ExpSCEV)
3553	Plan.resetTripCount(NewTripCount: NewExp);
3554	}
3555	}
3556	}
3557
3558	void VPlanTransforms::dropPoisonGeneratingRecipes(
3559	VPlan &Plan,
3560	const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3561	// Collect recipes in the backward slice of `Root` that may generate a poison
3562	// value that is used after vectorization.
3563	SmallPtrSet<VPRecipeBase *, `16`> Visited;
3564	auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3565	SmallVector<VPRecipeBase *, `16`> Worklist;
3566	Worklist.push_back(Elt: Root);
3567
3568	// Traverse the backward slice of Root through its use-def chain.
3569	while (!Worklist.empty()) {
3570	VPRecipeBase *CurRec = Worklist.pop_back_val();
3571
3572	if (!Visited.insert(Ptr: CurRec).second)
3573	continue;
3574
3575	// Prune search if we find another recipe generating a widen memory
3576	// instruction. Widen memory instructions involved in address computation
3577	// will lead to gather/scatter instructions, which don't need to be
3578	// handled.
3579	if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
3580	VPHeaderPHIRecipe>(Val: CurRec))
3581	continue;
3582
3583	// This recipe contributes to the address computation of a widen
3584	// load/store. If the underlying instruction has poison-generating flags,
3585	// drop them directly.
3586	if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: CurRec)) {
3587	VPValue A, B;
3588	// Dropping disjoint from an OR may yield incorrect results, as some
3589	// analysis may have converted it to an Add implicitly (e.g. SCEV used
3590	// for dependence analysis). Instead, replace it with an equivalent Add.
3591	// This is possible as all users of the disjoint OR only access lanes
3592	// where the operands are disjoint or poison otherwise.
3593	if (match(V: RecWithFlags, P: m_BinaryOr(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))) &&
3594	RecWithFlags->isDisjoint()) {
3595	VPBuilder Builder(RecWithFlags);
3596	VPInstruction *New =
3597	Builder.createAdd(LHS: A, RHS: B, DL: RecWithFlags->getDebugLoc());
3598	New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3599	RecWithFlags->replaceAllUsesWith(New);
3600	RecWithFlags->eraseFromParent();
3601	CurRec = New;
3602	} else
3603	RecWithFlags->dropPoisonGeneratingFlags();
3604	} else {
3605	Instruction *Instr = dyn_cast_or_null<Instruction>(
3606	Val: CurRec->getVPSingleValue()->getUnderlyingValue());
3607	(void)Instr;
3608	assert((!Instr \|\| !Instr->hasPoisonGeneratingFlags()) &&
3609	"found instruction with poison generating flags not covered by "
3610	"VPRecipeWithIRFlags");
3611	}
3612
3613	// Add new definitions to the worklist.
3614	for (VPValue *Operand : CurRec->operands())
3615	if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3616	Worklist.push_back(Elt: OpDef);
3617	}
3618	});
3619
3620	// Traverse all the recipes in the VPlan and collect the poison-generating
3621	// recipes in the backward slice starting at the address of a VPWidenRecipe or
3622	// VPInterleaveRecipe.
3623	auto Iter = vp_depth_first_deep(G: Plan.getEntry());
3624	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
3625	for (VPRecipeBase &Recipe : *VPBB) {
3626	if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(Val: &Recipe)) {
3627	Instruction &UnderlyingInstr = WidenRec->getIngredient();
3628	VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3629	if (AddrDef && WidenRec->isConsecutive() &&
3630	BlockNeedsPredication (UnderlyingInstr.getParent()))
3631	CollectPoisonGeneratingInstrsInBackwardSlice (AddrDef);
3632	} else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(Val: &Recipe)) {
3633	VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3634	if (AddrDef) {
3635	// Check if any member of the interleave group needs predication.
3636	const InterleaveGroup<Instruction> *InterGroup =
3637	InterleaveRec->getInterleaveGroup();
3638	bool NeedPredication = false;
3639	for (int I = `0`, NumMembers = InterGroup->getNumMembers();
3640	I < NumMembers; ++I) {
3641	Instruction *Member = InterGroup->getMember(Index: I);
3642	if (Member)
3643	NeedPredication \|= BlockNeedsPredication (Member->getParent());
3644	}
3645
3646	if (NeedPredication)
3647	CollectPoisonGeneratingInstrsInBackwardSlice (AddrDef);
3648	}
3649	}
3650	}
3651	}
3652	}
3653
3654	void VPlanTransforms::createInterleaveGroups(
3655	VPlan &Plan,
3656	const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3657	&InterleaveGroups,
3658	VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3659	if (InterleaveGroups.empty())
3660	return;
3661
3662	// Interleave memory: for each Interleave Group we marked earlier as relevant
3663	// for this VPlan, replace the Recipes widening its memory instructions with a
3664	// single VPInterleaveRecipe at its insertion point.
3665	VPDominatorTree VPDT(Plan);
3666	for (const auto *IG : InterleaveGroups) {
3667	auto *Start =
3668	cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getMember(Index: `0`)));
3669	VPIRMetadata InterleaveMD(*Start);
3670	SmallVector<VPValue *, `4`> StoredValues;
3671	if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Val: Start))
3672	StoredValues.push_back(Elt: StoreR->getStoredValue());
3673	for (unsigned I = `1`; I < IG->getFactor(); ++I) {
3674	Instruction *MemberI = IG->getMember(Index: I);
3675	if (!MemberI)
3676	continue;
3677	VPWidenMemoryRecipe *MemoryR =
3678	cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: MemberI));
3679	if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Val: MemoryR))
3680	StoredValues.push_back(Elt: StoreR->getStoredValue());
3681	InterleaveMD.intersect(MD: *MemoryR);
3682	}
3683
3684	bool NeedsMaskForGaps =
3685	(IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) \|\|
3686	(!StoredValues.empty() && !IG->isFull());
3687
3688	Instruction *IRInsertPos = IG->getInsertPos();
3689	auto *InsertPos =
3690	cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IRInsertPos));
3691
3692	GEPNoWrapFlags NW = GEPNoWrapFlags::none();
3693	if (auto *Gep = dyn_cast<GetElementPtrInst>(
3694	Val: getLoadStorePointerOperand(V: IRInsertPos)->stripPointerCasts()))
3695	NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3696
3697	// Get or create the start address for the interleave group.
3698	VPValue *Addr = Start->getAddr();
3699	VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3700	if (AddrDef && !VPDT.properlyDominates(A: AddrDef, B: InsertPos)) {
3701	// We cannot re-use the address of member zero because it does not
3702	// dominate the insert position. Instead, use the address of the insert
3703	// position and create a PtrAdd adjusting it to the address of member
3704	// zero.
3705	// TODO: Hoist Addr's defining recipe (and any operands as needed) to
3706	// InsertPos or sink loads above zero members to join it.
3707	assert(IG->getIndex(IRInsertPos) != `0` &&
3708	"index of insert position shouldn't be zero");
3709	auto &DL = IRInsertPos->getDataLayout();
3710	APInt Offset(`32`,
3711	DL.getTypeAllocSize(Ty: getLoadStoreType(I: IRInsertPos)) *
3712	IG->getIndex(Instr: IRInsertPos),
3713	/IsSigned=/true);
3714	VPValue *OffsetVPV = Plan.getConstantInt(Val: -Offset);
3715	VPBuilder B(InsertPos);
3716	Addr = B.createNoWrapPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV, GEPFlags: NW);
3717	}
3718	// If the group is reverse, adjust the index to refer to the last vector
3719	// lane instead of the first. We adjust the index from the first vector
3720	// lane, rather than directly getting the pointer for lane VF - 1, because
3721	// the pointer operand of the interleaved access is supposed to be uniform.
3722	if (IG->isReverse()) {
3723	auto ReversePtr = new* VPVectorEndPointerRecipe (
3724	Addr, &Plan.getVF(), getLoadStoreType(I: IRInsertPos),
3725	-(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3726	ReversePtr->insertBefore(InsertPos);
3727	Addr = ReversePtr;
3728	}
3729	auto VPIG = new* VPInterleaveRecipe (IG, Addr, StoredValues,
3730	InsertPos->getMask(), NeedsMaskForGaps,
3731	InterleaveMD, InsertPos->getDebugLoc());
3732	VPIG->insertBefore(InsertPos);
3733
3734	unsigned J = `0`;
3735	for (unsigned i = `0`; i < IG->getFactor(); ++i)
3736	if (Instruction *Member = IG->getMember(Index: i)) {
3737	VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member);
3738	if (!Member->getType()->isVoidTy()) {
3739	VPValue *OriginalV = MemberR->getVPSingleValue();
3740	OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J));
3741	J++;
3742	}
3743	MemberR->eraseFromParent();
3744	}
3745	}
3746	}
3747
3748	/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3749	/// value, phi and backedge value. In the following example:
3750	///
3751	/// vector.ph:
3752	/// Successor(s): vector loop
3753	///
3754	/// <x1> vector loop: {
3755	/// vector.body:
3756	/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3757	/// ...
3758	/// EMIT branch-on-count ...
3759	/// No successors
3760	/// }
3761	///
3762	/// WIDEN-INDUCTION will get expanded to:
3763	///
3764	/// vector.ph:
3765	/// ...
3766	/// vp<%induction.start> = ...
3767	/// vp<%induction.increment> = ...
3768	///
3769	/// Successor(s): vector loop
3770	///
3771	/// <x1> vector loop: {
3772	/// vector.body:
3773	/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3774	/// ...
3775	/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3776	/// EMIT branch-on-count ...
3777	/// No successors
3778	/// }
3779	static void
3780	expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3781	VPTypeAnalysis &TypeInfo) {
3782	VPlan *Plan = WidenIVR->getParent()->getPlan();
3783	VPValue *Start = WidenIVR->getStartValue();
3784	VPValue *Step = WidenIVR->getStepValue();
3785	VPValue *VF = WidenIVR->getVFValue();
3786	DebugLoc DL = WidenIVR->getDebugLoc();
3787
3788	// The value from the original loop to which we are mapping the new induction
3789	// variable.
3790	Type *Ty = TypeInfo.inferScalarType(V: WidenIVR);
3791
3792	const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3793	Instruction::BinaryOps AddOp;
3794	Instruction::BinaryOps MulOp;
3795	VPIRFlags Flags = *WidenIVR;
3796	if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3797	AddOp = Instruction::Add;
3798	MulOp = Instruction::Mul;
3799	} else {
3800	AddOp = ID.getInductionOpcode();
3801	MulOp = Instruction::FMul;
3802	}
3803
3804	// If the phi is truncated, truncate the start and step values.
3805	VPBuilder Builder(Plan->getVectorPreheader());
3806	Type *StepTy = TypeInfo.inferScalarType(V: Step);
3807	if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3808	assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3809	Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy: Ty, DL);
3810	Start = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Start, ResultTy: Ty, DL);
3811	// Truncation doesn't preserve WrapFlags.
3812	Flags.dropPoisonGeneratingFlags();
3813	StepTy = Ty;
3814	}
3815
3816	// Construct the initial value of the vector IV in the vector loop preheader.
3817	Type *IVIntTy =
3818	IntegerType::get(C&: Plan->getContext(), NumBits: StepTy->getScalarSizeInBits());
3819	VPValue *Init = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: IVIntTy);
3820	if (StepTy->isFloatingPointTy())
3821	Init = Builder.createWidenCast(Opcode: Instruction::UIToFP, Op: Init, ResultTy: StepTy);
3822
3823	VPValue *SplatStart = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Start);
3824	VPValue *SplatStep = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Step);
3825
3826	Init = Builder.createNaryOp(Opcode: MulOp, Operands: {Init, SplatStep}, Flags);
3827	Init = Builder.createNaryOp(Opcode: AddOp, Operands: {SplatStart, Init}, Flags,
3828	DL: DebugLoc::getUnknown(), Name: "induction");
3829
3830	// Create the widened phi of the vector IV.
3831	auto WidePHI = new* VPWidenPHIRecipe (WidenIVR->getPHINode(), Init,
3832	WidenIVR->getDebugLoc(), "vec.ind");
3833	WidePHI->insertBefore(InsertPos: WidenIVR);
3834
3835	// Create the backedge value for the vector IV.
3836	VPValue *Inc;
3837	VPValue *Prev;
3838	// If unrolled, use the increment and prev value from the operands.
3839	if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3840	Inc = SplatVF;
3841	Prev = WidenIVR->getLastUnrolledPartOperand();
3842	} else {
3843	if (VPRecipeBase *R = VF->getDefiningRecipe())
3844	Builder.setInsertPoint(TheBB: R->getParent(), IP: std::next(x: R->getIterator()));
3845	// Multiply the vectorization factor by the step using integer or
3846	// floating-point arithmetic as appropriate.
3847	if (StepTy->isFloatingPointTy())
3848	VF = Builder.createScalarCast(Opcode: Instruction::CastOps::UIToFP, Op: VF, ResultTy: StepTy,
3849	DL);
3850	else
3851	VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy,
3852	SrcTy: TypeInfo.inferScalarType(V: VF), DL);
3853
3854	Inc = Builder.createNaryOp(Opcode: MulOp, Operands: {Step, VF}, Flags);
3855	Inc = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Inc);
3856	Prev = WidePHI;
3857	}
3858
3859	VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3860	Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator());
3861	auto *Next = Builder.createNaryOp(Opcode: AddOp, Operands: {Prev, Inc}, Flags,
3862	DL: WidenIVR->getDebugLoc(), Name: "vec.ind.next");
3863
3864	WidePHI->addOperand(Operand: Next);
3865
3866	WidenIVR->replaceAllUsesWith(New: WidePHI);
3867	}
3868
3869	/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3870	/// initial value, phi and backedge value. In the following example:
3871	///
3872	/// <x1> vector loop: {
3873	/// vector.body:
3874	/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3875	/// ...
3876	/// EMIT branch-on-count ...
3877	/// }
3878	///
3879	/// WIDEN-POINTER-INDUCTION will get expanded to:
3880	///
3881	/// <x1> vector loop: {
3882	/// vector.body:
3883	/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3884	/// EMIT %mul = mul %stepvector, %step
3885	/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3886	/// ...
3887	/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3888	/// EMIT branch-on-count ...
3889	/// }
3890	static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3891	VPTypeAnalysis &TypeInfo) {
3892	VPlan *Plan = R->getParent()->getPlan();
3893	VPValue *Start = R->getStartValue();
3894	VPValue *Step = R->getStepValue();
3895	VPValue *VF = R->getVFValue();
3896
3897	assert(R->getInductionDescriptor().getKind() ==
3898	InductionDescriptor::IK_PtrInduction &&
3899	"Not a pointer induction according to InductionDescriptor!");
3900	assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3901	assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3902	"Recipe should have been replaced");
3903
3904	VPBuilder Builder(R);
3905	DebugLoc DL = R->getDebugLoc();
3906
3907	// Build a scalar pointer phi.
3908	VPPhi *ScalarPtrPhi = Builder.createScalarPhi(IncomingValues: Start, DL, Name: "pointer.phi");
3909
3910	// Create actual address geps that use the pointer phi as base and a
3911	// vectorized version of the step value (<step0, ..., stepN>) as offset.
3912	Builder.setInsertPoint(TheBB: R->getParent(), IP: R->getParent()->getFirstNonPhi());
3913	Type *StepTy = TypeInfo.inferScalarType(V: Step);
3914	VPValue *Offset = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: StepTy);
3915	Offset = Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {Offset, Step});
3916	VPValue *PtrAdd =
3917	Builder.createWidePtrAdd(Ptr: ScalarPtrPhi, Offset, DL, Name: "vector.gep");
3918	R->replaceAllUsesWith(New: PtrAdd);
3919
3920	// Create the backedge value for the scalar pointer phi.
3921	VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3922	Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator());
3923	VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy, SrcTy: TypeInfo.inferScalarType(V: VF),
3924	DL);
3925	VPValue *Inc = Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {Step, VF});
3926
3927	VPValue *InductionGEP =
3928	Builder.createPtrAdd(Ptr: ScalarPtrPhi, Offset: Inc, DL, Name: "ptr.ind");
3929	ScalarPtrPhi->addOperand(Operand: InductionGEP);
3930	}
3931
3932	void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3933	// Replace loop regions with explicity CFG.
3934	SmallVector<VPRegionBlock *> LoopRegions;
3935	for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3936	Range: vp_depth_first_deep(G: Plan.getEntry()))) {
3937	if (!R->isReplicator())
3938	LoopRegions.push_back(Elt: R);
3939	}
3940	for (VPRegionBlock *R : LoopRegions)
3941	R->dissolveToCFGLoop();
3942	}
3943
3944	void VPlanTransforms::expandBranchOnTwoConds(VPlan &Plan) {
3945	SmallVector<VPInstruction *> WorkList;
3946	// The transform runs after dissolving loop regions, so all VPBasicBlocks
3947	// terminated with BranchOnTwoConds are reached via a shallow traversal.
3948	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3949	Range: vp_depth_first_shallow(G: Plan.getEntry()))) {
3950	if (!VPBB->empty() && match(V: &VPBB->back(), P: m_BranchOnTwoConds()))
3951	WorkList.push_back(Elt: cast<VPInstruction>(Val: &VPBB->back()));
3952	}
3953
3954	// Expand BranchOnTwoConds instructions into explicit CFG with two new
3955	// single-condition branches:
3956	// 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3957	// the first condition is true, and otherwise jumps to a new interim block.
3958	// 2. A branch that ends the interim block, jumps to the second successor if
3959	// the second condition is true, and otherwise jumps to the third
3960	// successor.
3961	for (VPInstruction *Br : WorkList) {
3962	assert(Br->getNumOperands() == `2` &&
3963	"BranchOnTwoConds must have exactly 2 conditions");
3964	DebugLoc DL = Br->getDebugLoc();
3965	VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3966	const auto Successors = to_vector(Range&: BrOnTwoCondsBB->getSuccessors());
3967	assert(Successors.size() == `3` &&
3968	"BranchOnTwoConds must have exactly 3 successors");
3969
3970	for (VPBlockBase *Succ : Successors)
3971	VPBlockUtils::disconnectBlocks(From: BrOnTwoCondsBB, To: Succ);
3972
3973	VPValue *Cond0 = Br->getOperand(N: `0`);
3974	VPValue *Cond1 = Br->getOperand(N: `1`);
3975	VPBlockBase *Succ0 = Successors [`0`];
3976	VPBlockBase *Succ1 = Successors [`1`];
3977	VPBlockBase *Succ2 = Successors [`2`];
3978	assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3979	!BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3980
3981	VPBasicBlock *InterimBB =
3982	Plan.createVPBasicBlock(Name: BrOnTwoCondsBB->getName() + ".interim");
3983
3984	VPBuilder (BrOnTwoCondsBB)
3985	.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond0}, DL);
3986	VPBlockUtils::connectBlocks(From: BrOnTwoCondsBB, To: Succ0);
3987	VPBlockUtils::connectBlocks(From: BrOnTwoCondsBB, To: InterimBB);
3988
3989	VPBuilder (InterimBB).createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond1}, DL);
3990	VPBlockUtils::connectBlocks(From: InterimBB, To: Succ1);
3991	VPBlockUtils::connectBlocks(From: InterimBB, To: Succ2);
3992	Br->eraseFromParent();
3993	}
3994	}
3995
3996	void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3997	VPTypeAnalysis TypeInfo(Plan);
3998	SmallVector<VPRecipeBase *> ToRemove;
3999	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4000	Range: vp_depth_first_deep(G: Plan.getEntry()))) {
4001	for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
4002	if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R)) {
4003	expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
4004	ToRemove.push_back(Elt: WidenIVR);
4005	continue;
4006	}
4007
4008	if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(Val: &R)) {
4009	// If the recipe only generates scalars, scalarize it instead of
4010	// expanding it.
4011	if (WidenIVR->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF())) {
4012	VPBuilder Builder(WidenIVR);
4013	VPValue *PtrAdd =
4014	scalarizeVPWidenPointerInduction(PtrIV: WidenIVR, Plan, Builder);
4015	WidenIVR->replaceAllUsesWith(New: PtrAdd);
4016	ToRemove.push_back(Elt: WidenIVR);
4017	continue;
4018	}
4019	expandVPWidenPointerInduction(R: WidenIVR, TypeInfo);
4020	ToRemove.push_back(Elt: WidenIVR);
4021	continue;
4022	}
4023
4024	// Expand VPBlendRecipe into VPInstruction::Select.
4025	VPBuilder Builder(&R);
4026	if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R)) {
4027	VPValue *Select = Blend->getIncomingValue(Idx: `0`);
4028	for (unsigned I = `1`; I != Blend->getNumIncomingValues(); ++I)
4029	Select = Builder.createSelect(Cond: Blend->getMask(Idx: I),
4030	TrueVal: Blend->getIncomingValue(Idx: I), FalseVal: Select,
4031	DL: R.getDebugLoc(), Name: "predphi", Flags: *Blend);
4032	Blend->replaceAllUsesWith(New: Select);
4033	ToRemove.push_back(Elt: Blend);
4034	}
4035
4036	if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: &R)) {
4037	if (!VEPR->getOffset()) {
4038	assert(Plan.getConcreteUF() == `1` &&
4039	"Expected unroller to have materialized offset for UF != 1");
4040	VEPR->materializeOffset();
4041	}
4042	}
4043
4044	if (auto *Expr = dyn_cast<VPExpressionRecipe>(Val: &R)) {
4045	Expr->decompose();
4046	ToRemove.push_back(Elt: Expr);
4047	}
4048
4049	// Expand LastActiveLane into Not + FirstActiveLane + Sub.
4050	auto *LastActiveL = dyn_cast<VPInstruction>(Val: &R);
4051	if (LastActiveL &&
4052	LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4053	// Create Not(Mask) for all operands.
4054	SmallVector<VPValue *, `2`> NotMasks;
4055	for (VPValue *Op : LastActiveL->operands()) {
4056	VPValue *NotMask = Builder.createNot(Operand: Op, DL: LastActiveL->getDebugLoc());
4057	NotMasks.push_back(Elt: NotMask);
4058	}
4059
4060	// Create FirstActiveLane on the inverted masks.
4061	VPValue *FirstInactiveLane = Builder.createNaryOp(
4062	Opcode: VPInstruction::FirstActiveLane, Operands: NotMasks,
4063	DL: LastActiveL->getDebugLoc(), Name: "first.inactive.lane");
4064
4065	// Subtract 1 to get the last active lane.
4066	VPValue *One = Plan.getConstantInt(BitWidth: `64`, Val: `1`);
4067	VPValue *LastLane =
4068	Builder.createSub(LHS: FirstInactiveLane, RHS: One,
4069	DL: LastActiveL->getDebugLoc(), Name: "last.active.lane");
4070
4071	LastActiveL->replaceAllUsesWith(New: LastLane);
4072	ToRemove.push_back(Elt: LastActiveL);
4073	continue;
4074	}
4075
4076	// Lower MaskedCond with block mask to LogicalAnd.
4077	if (match(V: &R, P: m_VPInstruction<VPInstruction::MaskedCond>())) {
4078	auto *VPI = cast<VPInstruction>(Val: &R);
4079	assert(VPI->isMasked() &&
4080	"Unmasked MaskedCond should be simplified earlier");
4081	VPI->replaceAllUsesWith(New: Builder.createNaryOp(
4082	Opcode: VPInstruction::LogicalAnd, Operands: {VPI->getOperand(N: `0`), VPI->getMask()}));
4083	ToRemove.push_back(Elt: VPI);
4084	continue;
4085	}
4086
4087	// Lower BranchOnCount to ICmp + BranchOnCond.
4088	VPValue IV, TC;
4089	if (match(V: &R, P: m_BranchOnCount(Op0: m_VPValue(V&: IV), Op1: m_VPValue(V&: TC)))) {
4090	auto *BranchOnCountInst = cast<VPInstruction>(Val: &R);
4091	DebugLoc DL = BranchOnCountInst->getDebugLoc();
4092	VPValue *Cond = Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: IV, B: TC, DL);
4093	Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: Cond, DL);
4094	ToRemove.push_back(Elt: BranchOnCountInst);
4095	continue;
4096	}
4097
4098	VPValue *VectorStep;
4099	VPValue *ScalarStep;
4100	if (!match(V: &R, P: m_VPInstruction<VPInstruction::WideIVStep>(
4101	Ops: m_VPValue(V&: VectorStep), Ops: m_VPValue(V&: ScalarStep))))
4102	continue;
4103
4104	// Expand WideIVStep.
4105	auto *VPI = cast<VPInstruction>(Val: &R);
4106	Type *IVTy = TypeInfo.inferScalarType(V: VPI);
4107	if (TypeInfo.inferScalarType(V: VectorStep) != IVTy) {
4108	Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
4109	? Instruction::UIToFP
4110	: Instruction::Trunc;
4111	VectorStep = Builder.createWidenCast(Opcode: CastOp, Op: VectorStep, ResultTy: IVTy);
4112	}
4113
4114	assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4115	if (TypeInfo.inferScalarType(V: ScalarStep) != IVTy) {
4116	ScalarStep =
4117	Builder.createWidenCast(Opcode: Instruction::Trunc, Op: ScalarStep, ResultTy: IVTy);
4118	}
4119
4120	VPIRFlags Flags;
4121	unsigned MulOpc;
4122	if (IVTy->isFloatingPointTy()) {
4123	MulOpc = Instruction::FMul;
4124	Flags = VPI->getFastMathFlags();
4125	} else {
4126	MulOpc = Instruction::Mul;
4127	Flags = VPIRFlags::getDefaultFlags(Opcode: MulOpc);
4128	}
4129
4130	VPInstruction *Mul = Builder.createNaryOp(
4131	Opcode: MulOpc, Operands: {VectorStep, ScalarStep}, Flags, DL: R.getDebugLoc());
4132	VectorStep = Mul;
4133	VPI->replaceAllUsesWith(New: VectorStep);
4134	ToRemove.push_back(Elt: VPI);
4135	}
4136	}
4137
4138	for (VPRecipeBase *R : ToRemove)
4139	R->eraseFromParent();
4140	}
4141
4142	void VPlanTransforms::handleUncountableEarlyExits(VPlan &Plan,
4143	VPBasicBlock *HeaderVPBB,
4144	VPBasicBlock *LatchVPBB,
4145	VPBasicBlock *MiddleVPBB) {
4146	struct EarlyExitInfo {
4147	VPBasicBlock *EarlyExitingVPBB;
4148	VPIRBasicBlock *EarlyExitVPBB;
4149	VPValue *CondToExit;
4150	};
4151
4152	VPDominatorTree VPDT(Plan);
4153	VPBuilder Builder(LatchVPBB->getTerminator());
4154	SmallVector<EarlyExitInfo> Exits;
4155	for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4156	for (VPBlockBase *Pred : to_vector(Range&: ExitBlock->getPredecessors())) {
4157	if (Pred == MiddleVPBB)
4158	continue;
4159	// Collect condition for this early exit.
4160	auto *EarlyExitingVPBB = cast<VPBasicBlock>(Val: Pred);
4161	VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[`0`];
4162	VPValue *CondOfEarlyExitingVPBB;
4163	[[maybe_unused]] bool Matched =
4164	match(V: EarlyExitingVPBB->getTerminator(),
4165	P: m_BranchOnCond(Op0: m_VPValue(V&: CondOfEarlyExitingVPBB)));
4166	assert(Matched && "Terminator must be BranchOnCond");
4167
4168	// Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4169	// the correct block mask.
4170	VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4171	auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4172	Opcode: VPInstruction::MaskedCond,
4173	Operands: TrueSucc == ExitBlock
4174	? CondOfEarlyExitingVPBB
4175	: EarlyExitingBuilder.createNot(Operand: CondOfEarlyExitingVPBB));
4176	assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) \|\|
4177	!VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) \|\|
4178	VPDT.properlyDominates(
4179	CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4180	LatchVPBB)) &&
4181	"exit condition must dominate the latch");
4182	Exits.push_back(Elt: {
4183	.EarlyExitingVPBB: EarlyExitingVPBB,
4184	.EarlyExitVPBB: ExitBlock,
4185	.CondToExit: CondToEarlyExit,
4186	});
4187	}
4188	}
4189
4190	assert(!Exits.empty() && "must have at least one early exit");
4191	// Sort exits by RPO order to get correct program order. RPO gives a
4192	// topological ordering of the CFG, ensuring upstream exits are checked
4193	// before downstream exits in the dispatch chain.
4194	ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
4195	HeaderVPBB);
4196	DenseMap<VPBlockBase , unsigned*> RPOIdx;
4197	for (const auto &[Num, VPB] : enumerate(First&: RPOT))
4198	RPOIdx [VPB] = Num;
4199	llvm::sort(C&: Exits, Comp: [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4200	return RPOIdx [A.EarlyExitingVPBB] < RPOIdx [B.EarlyExitingVPBB];
4201	});
4202	#ifndef NDEBUG
4203	// After RPO sorting, verify that for any pair where one exit dominates
4204	// another, the dominating exit comes first. This is guaranteed by RPO
4205	// (topological order) and is required for the dispatch chain correctness.
4206	for (unsigned I = `0`; I + `1` < Exits.size(); ++I)
4207	for (unsigned J = I + `1`; J < Exits.size(); ++J)
4208	assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4209	Exits[I].EarlyExitingVPBB) &&
4210	"RPO sort must place dominating exits before dominated ones");
4211	#endif
4212
4213	// Build the AnyOf condition for the latch terminator using logical OR
4214	// to avoid poison propagation from later exit conditions when an earlier
4215	// exit is taken.
4216	VPValue *Combined = Exits [`0`].CondToExit;
4217	for (const EarlyExitInfo &Info : drop_begin(RangeOrContainer&: Exits))
4218	Combined = Builder.createLogicalOr(LHS: Combined, RHS: Info.CondToExit);
4219
4220	VPValue *IsAnyExitTaken =
4221	Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {Combined});
4222
4223	// Create the vector.early.exit blocks.
4224	SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4225	for (unsigned Idx = `0`; Idx != Exits.size(); ++Idx) {
4226	Twine BlockSuffix = Exits.size() == `1` ? "" : Twine (".") + Twine (Idx);
4227	VPBasicBlock *VectorEarlyExitVPBB =
4228	Plan.createVPBasicBlock(Name: "vector.early.exit" + BlockSuffix);
4229	VectorEarlyExitVPBBs [Idx] = VectorEarlyExitVPBB;
4230	}
4231
4232	// Create the dispatch block (or reuse the single exit block if only one
4233	// exit). The dispatch block computes the first active lane of the combined
4234	// condition and, for multiple exits, chains through conditions to determine
4235	// which exit to take.
4236	VPBasicBlock *DispatchVPBB =
4237	Exits.size() == `1` ? VectorEarlyExitVPBBs [`0`]
4238	: Plan.createVPBasicBlock(Name: "vector.early.exit.check");
4239	VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4240	VPValue *FirstActiveLane =
4241	DispatchBuilder.createNaryOp(Opcode: VPInstruction::FirstActiveLane, Operands: {Combined},
4242	DL: DebugLoc::getUnknown(), Name: "first.active.lane");
4243
4244	// For each early exit, disconnect the original exiting block
4245	// (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4246	// new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4247	// values at the first active lane:
4248	//
4249	// Input:
4250	// early.exiting.I:
4251	// ...
4252	// EMIT branch-on-cond vp<%cond.I>
4253	// Successor(s): in.loop.succ, ir-bb<exit.I>
4254	//
4255	// ir-bb<exit.I>:
4256	// IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4257	//
4258	// Output:
4259	// early.exiting.I:
4260	// ...
4261	// Successor(s): in.loop.succ
4262	//
4263	// vector.early.exit.I:
4264	// EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4265	// Successor(s): ir-bb<exit.I>
4266	//
4267	// ir-bb<exit.I>:
4268	// IR %phi = phi ... (extra operand: vp<%exit.val> from
4269	// vector.early.exit.I)
4270	//
4271	for (auto [Exit, VectorEarlyExitVPBB] :
4272	zip_equal(t&: Exits, u&: VectorEarlyExitVPBBs)) {
4273	auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4274	// Adjust the phi nodes in EarlyExitVPBB.
4275	// 1. remove incoming values from EarlyExitingVPBB,
4276	// 2. extract the incoming value at FirstActiveLane
4277	// 3. add back the extracts as last operands for the phis
4278	// Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4279	// EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4280	// EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4281	// values from VectorEarlyExitVPBB.
4282	for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4283	auto *ExitIRI = cast<VPIRPhi>(Val: &R);
4284	VPValue *IncomingVal =
4285	ExitIRI->getIncomingValueForBlock(VPBB: EarlyExitingVPBB);
4286	VPValue *NewIncoming = IncomingVal;
4287	if (!isa<VPIRValue>(Val: IncomingVal)) {
4288	VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4289	NewIncoming = EarlyExitBuilder.createNaryOp(
4290	Opcode: VPInstruction::ExtractLane, Operands: {FirstActiveLane, IncomingVal},
4291	DL: DebugLoc::getUnknown(), Name: "early.exit.value");
4292	}
4293	ExitIRI->removeIncomingValueFor(IncomingBlock: EarlyExitingVPBB);
4294	ExitIRI->addOperand(Operand: NewIncoming);
4295	}
4296
4297	EarlyExitingVPBB->getTerminator()->eraseFromParent();
4298	VPBlockUtils::disconnectBlocks(From: EarlyExitingVPBB, To: EarlyExitVPBB);
4299	VPBlockUtils::connectBlocks(From: VectorEarlyExitVPBB, To: EarlyExitVPBB);
4300	}
4301
4302	// Chain through exits: for each exit, check if its condition is true at
4303	// the first active lane. If so, take that exit; otherwise, try the next.
4304	// The last exit needs no check since it must be taken if all others fail.
4305	//
4306	// For 3 exits (cond.0, cond.1, cond.2), this creates:
4307	//
4308	// latch:
4309	// ...
4310	// EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4311	// ...
4312	//
4313	// vector.early.exit.check:
4314	// EMIT vp<%first.lane> = first-active-lane vp<%combined>
4315	// EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4316	// EMIT branch-on-cond vp<%at.cond.0>
4317	// Successor(s): vector.early.exit.0, vector.early.exit.check.0
4318	//
4319	// vector.early.exit.check.0:
4320	// EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4321	// EMIT branch-on-cond vp<%at.cond.1>
4322	// Successor(s): vector.early.exit.1, vector.early.exit.2
4323	VPBasicBlock *CurrentBB = DispatchVPBB;
4324	for (auto [I, Exit] : enumerate(First: ArrayRef(Exits).drop_back())) {
4325	VPValue *LaneVal = DispatchBuilder.createNaryOp(
4326	Opcode: VPInstruction::ExtractLane, Operands: {FirstActiveLane, Exit.CondToExit},
4327	DL: DebugLoc::getUnknown(), Name: "exit.cond.at.lane");
4328
4329	// For the last dispatch, branch directly to the last exit on false;
4330	// otherwise, create a new check block.
4331	bool IsLastDispatch = (I + `2` == Exits.size());
4332	VPBasicBlock *FalseBB =
4333	IsLastDispatch ? VectorEarlyExitVPBBs.back()
4334	: Plan.createVPBasicBlock(
4335	Name: Twine ("vector.early.exit.check.") + Twine (I));
4336
4337	DispatchBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {LaneVal});
4338	CurrentBB->setSuccessors({VectorEarlyExitVPBBs [I], FalseBB});
4339	VectorEarlyExitVPBBs [I]->setPredecessors({CurrentBB});
4340	FalseBB->setPredecessors({CurrentBB});
4341
4342	CurrentBB = FalseBB;
4343	DispatchBuilder.setInsertPoint(CurrentBB);
4344	}
4345
4346	// Replace the latch terminator with the new branching logic.
4347	auto *LatchExitingBranch = cast<VPInstruction>(Val: LatchVPBB->getTerminator());
4348	assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4349	"Unexpected terminator");
4350	auto *IsLatchExitTaken =
4351	Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: LatchExitingBranch->getOperand(N: `0`),
4352	B: LatchExitingBranch->getOperand(N: `1`));
4353
4354	DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4355	LatchExitingBranch->eraseFromParent();
4356	Builder.setInsertPoint(LatchVPBB);
4357	Builder.createNaryOp(Opcode: VPInstruction::BranchOnTwoConds,
4358	Operands: {IsAnyExitTaken, IsLatchExitTaken}, DL: LatchDL);
4359	LatchVPBB->clearSuccessors();
4360	LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4361	DispatchVPBB->setPredecessors({LatchVPBB});
4362	}
4363
4364	/// This function tries convert extended in-loop reductions to
4365	/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4366	/// valid. The created recipe must be decomposed to its constituent
4367	/// recipes before execution.
4368	static VPExpressionRecipe *
4369	tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
4370	VFRange &Range) {
4371	Type *RedTy = Ctx.Types.inferScalarType(V: Red);
4372	VPValue *VecOp = Red->getVecOp();
4373
4374	// Clamp the range if using extended-reduction is profitable.
4375	auto IsExtendedRedValidAndClampRange =
4376	[&](unsigned Opcode, Instruction::CastOps ExtOpc, Type SrcTy) -> bool* {
4377	return LoopVectorizationPlanner::getDecisionAndClampRange(
4378	Predicate: [&](ElementCount VF) {
4379	auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
4380	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4381
4382	InstructionCost ExtRedCost = InstructionCost::getInvalid();
4383	InstructionCost ExtCost =
4384	cast<VPWidenCastRecipe>(Val: VecOp)->computeCost(VF, Ctx);
4385	InstructionCost RedCost = Red->computeCost(VF, Ctx);
4386
4387	if (Red->isPartialReduction()) {
4388	TargetTransformInfo::PartialReductionExtendKind ExtKind =
4389	TargetTransformInfo::getPartialReductionExtendKind(CastOpc: ExtOpc);
4390	// FIXME: Move partial reduction creation, costing and clamping
4391	// here from LoopVectorize.cpp.
4392	ExtRedCost = Ctx.TTI.getPartialReductionCost(
4393	Opcode, InputTypeA: SrcTy, InputTypeB: nullptr, AccumType: RedTy, VF, OpAExtend: ExtKind,
4394	OpBExtend: llvm::TargetTransformInfo::PR_None, BinOp: std::nullopt, CostKind: Ctx.CostKind,
4395	FMF: RedTy->isFloatingPointTy()
4396	? std::optional{Red->getFastMathFlags()}
4397	: std::nullopt);
4398	} else if (!RedTy->isFloatingPointTy()) {
4399	// TTI::getExtendedReductionCost only supports integer types.
4400	ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4401	Opcode, IsUnsigned: ExtOpc == Instruction::CastOps::ZExt, ResTy: RedTy, Ty: SrcVecTy,
4402	FMF: Red->getFastMathFlags(), CostKind);
4403	}
4404	return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4405	},
4406	Range);
4407	};
4408
4409	VPValue *A;
4410	// Match reduce(ext)).
4411	if (isa<VPWidenCastRecipe>(Val: VecOp) &&
4412	(match(V: VecOp, P: m_ZExtOrSExt(Op0: m_VPValue(V&: A))) \|\|
4413	match(V: VecOp, P: m_FPExt(Op0: m_VPValue(V&: A)))) &&
4414	IsExtendedRedValidAndClampRange (
4415	RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind()),
4416	cast<VPWidenCastRecipe>(Val: VecOp)->getOpcode(),
4417	Ctx.Types.inferScalarType(V: A)))
4418	return new VPExpressionRecipe (cast<VPWidenCastRecipe>(Val: VecOp), Red);
4419
4420	return nullptr;
4421	}
4422
4423	/// This function tries convert extended in-loop reductions to
4424	/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4425	/// and valid. The created VPExpressionRecipe must be decomposed to its
4426	/// constituent recipes before execution. Patterns of the
4427	/// VPExpressionRecipe:
4428	/// reduce.add(mul(...)),
4429	/// reduce.add(mul(ext(A), ext(B))),
4430	/// reduce.add(ext(mul(ext(A), ext(B)))).
4431	/// reduce.fadd(fmul(ext(A), ext(B)))
4432	static VPExpressionRecipe *
4433	tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
4434	VPCostContext &Ctx, VFRange &Range) {
4435	unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind());
4436	if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4437	Opcode != Instruction::FAdd)
4438	return nullptr;
4439
4440	Type *RedTy = Ctx.Types.inferScalarType(V: Red);
4441
4442	// Clamp the range if using multiply-accumulate-reduction is profitable.
4443	auto IsMulAccValidAndClampRange =
4444	[&](VPWidenRecipe Mul, VPWidenCastRecipe Ext0, VPWidenCastRecipe *Ext1,
4445	VPWidenCastRecipe OuterExt) -> bool* {
4446	return LoopVectorizationPlanner::getDecisionAndClampRange(
4447	Predicate: [&](ElementCount VF) {
4448	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4449	Type *SrcTy =
4450	Ext0 ? Ctx.Types.inferScalarType(V: Ext0->getOperand(N: `0`)) : RedTy;
4451	InstructionCost MulAccCost;
4452
4453	if (Red->isPartialReduction()) {
4454	Type *SrcTy2 =
4455	Ext1 ? Ctx.Types.inferScalarType(V: Ext1->getOperand(N: `0`)) : nullptr;
4456	// FIXME: Move partial reduction creation, costing and clamping
4457	// here from LoopVectorize.cpp.
4458	MulAccCost = Ctx.TTI.getPartialReductionCost(
4459	Opcode, InputTypeA: SrcTy, InputTypeB: SrcTy2, AccumType: RedTy, VF,
4460	OpAExtend: Ext0 ? TargetTransformInfo::getPartialReductionExtendKind(
4461	CastOpc: Ext0->getOpcode())
4462	: TargetTransformInfo::PR_None,
4463	OpBExtend: Ext1 ? TargetTransformInfo::getPartialReductionExtendKind(
4464	CastOpc: Ext1->getOpcode())
4465	: TargetTransformInfo::PR_None,
4466	BinOp: Mul->getOpcode(), CostKind,
4467	FMF: RedTy->isFloatingPointTy()
4468	? std::optional{Red->getFastMathFlags()}
4469	: std::nullopt);
4470	} else {
4471	// Only partial reductions support mixed or floating-point extends
4472	// at the moment.
4473	if (Ext0 && Ext1 &&
4474	(Ext0->getOpcode() != Ext1->getOpcode() \|\|
4475	Ext0->getOpcode() == Instruction::CastOps::FPExt))
4476	return false;
4477
4478	bool IsZExt =
4479	!Ext0 \|\| Ext0->getOpcode() == Instruction::CastOps::ZExt;
4480	auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
4481	MulAccCost = Ctx.TTI.getMulAccReductionCost(IsUnsigned: IsZExt, RedOpcode: Opcode, ResTy: RedTy,
4482	Ty: SrcVecTy, CostKind);
4483	}
4484
4485	InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4486	InstructionCost RedCost = Red->computeCost(VF, Ctx);
4487	InstructionCost ExtCost = `0`;
4488	if (Ext0)
4489	ExtCost += Ext0->computeCost(VF, Ctx);
4490	if (Ext1)
4491	ExtCost += Ext1->computeCost(VF, Ctx);
4492	if (OuterExt)
4493	ExtCost += OuterExt->computeCost(VF, Ctx);
4494
4495	return MulAccCost.isValid() &&
4496	MulAccCost < ExtCost + MulCost + RedCost;
4497	},
4498	Range);
4499	};
4500
4501	VPValue *VecOp = Red->getVecOp();
4502	VPRecipeBase Sub = nullptr*;
4503	VPValue A, B;
4504	VPValue Tmp = nullptr*;
4505
4506	// Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
4507	if (match(V: VecOp, P: m_FMul(Op0: m_FPExt(Op0: m_VPValue()), Op1: m_FPExt(Op0: m_VPValue())))) {
4508	assert(Opcode == Instruction::FAdd &&
4509	"MulAccumulateReduction from an FMul must accumulate into an FAdd "
4510	"instruction");
4511	auto *FMul = dyn_cast<VPWidenRecipe>(Val: VecOp);
4512	if (!FMul)
4513	return nullptr;
4514
4515	auto *RecipeA = dyn_cast<VPWidenCastRecipe>(Val: FMul->getOperand(N: `0`));
4516	auto *RecipeB = dyn_cast<VPWidenCastRecipe>(Val: FMul->getOperand(N: `1`));
4517
4518	if (RecipeA && RecipeB &&
4519	IsMulAccValidAndClampRange (FMul, RecipeA, RecipeB, nullptr)) {
4520	return new VPExpressionRecipe (RecipeA, RecipeB, FMul, Red);
4521	}
4522	}
4523	if (RedTy->isFloatingPointTy())
4524	return nullptr;
4525
4526	// Sub reductions could have a sub between the add reduction and vec op.
4527	if (match(V: VecOp, P: m_Sub(Op0: m_ZeroInt(), Op1: m_VPValue(V&: Tmp)))) {
4528	Sub = VecOp->getDefiningRecipe();
4529	VecOp = Tmp;
4530	}
4531
4532	// If ValB is a constant and can be safely extended, truncate it to the same
4533	// type as ExtA's operand, then extend it to the same type as ExtA. This
4534	// creates two uniform extends that can more easily be matched by the rest of
4535	// the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4536	// replaced with the new extend of the constant.
4537	auto ExtendAndReplaceConstantOp = [&Ctx, &Red](VPWidenCastRecipe *ExtA,
4538	VPWidenCastRecipe *&ExtB,
4539	VPValue *&ValB,
4540	VPWidenRecipe *Mul) {
4541	if (!ExtA \|\| ExtB \|\| !isa<VPIRValue>(Val: ValB) \|\| Red->isPartialReduction())
4542	return;
4543	Type *NarrowTy = Ctx.Types.inferScalarType(V: ExtA->getOperand(N: `0`));
4544	Instruction::CastOps ExtOpc = ExtA->getOpcode();
4545	const APInt *Const;
4546	if (!match(V: ValB, P: m_APInt(C&: Const)) \|\|
4547	!llvm::canConstantBeExtended(
4548	C: Const, NarrowType: NarrowTy, ExtKind: TTI::getPartialReductionExtendKind(CastOpc: ExtOpc)))
4549	return;
4550	// The truncate ensures that the type of each extended operand is the
4551	// same, and it's been proven that the constant can be extended from
4552	// NarrowTy safely. Necessary since ExtA's extended operand would be
4553	// e.g. an i8, while the const will likely be an i32. This will be
4554	// elided by later optimisations.
4555	VPBuilder Builder(Mul);
4556	auto *Trunc =
4557	Builder.createWidenCast(Opcode: Instruction::CastOps::Trunc, Op: ValB, ResultTy: NarrowTy);
4558	Type *WideTy = Ctx.Types.inferScalarType(V: ExtA);
4559	ValB = ExtB = Builder.createWidenCast(Opcode: ExtOpc, Op: Trunc, ResultTy: WideTy);
4560	Mul->setOperand(I: `1`, New: ExtB);
4561	};
4562
4563	// Try to match reduce.add(mul(...)).
4564	if (match(V: VecOp, P: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B)))) {
4565	auto *RecipeA = dyn_cast_if_present<VPWidenCastRecipe>(Val: A);
4566	auto *RecipeB = dyn_cast_if_present<VPWidenCastRecipe>(Val: B);
4567	auto *Mul = cast<VPWidenRecipe>(Val: VecOp);
4568
4569	// Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4570	ExtendAndReplaceConstantOp (RecipeA, RecipeB, B, Mul);
4571
4572	// Match reduce.add/sub(mul(ext, ext)).
4573	if (RecipeA && RecipeB && match(V: RecipeA, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
4574	match(V: RecipeB, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
4575	IsMulAccValidAndClampRange (Mul, RecipeA, RecipeB, nullptr)) {
4576	if (Sub)
4577	return new VPExpressionRecipe (RecipeA, RecipeB, Mul,
4578	cast<VPWidenRecipe>(Val: Sub), Red);
4579	return new VPExpressionRecipe (RecipeA, RecipeB, Mul, Red);
4580	}
4581	// TODO: Add an expression type for this variant with a negated mul
4582	if (!Sub && IsMulAccValidAndClampRange (Mul, nullptr, nullptr, nullptr))
4583	return new VPExpressionRecipe (Mul, Red);
4584	}
4585	// TODO: Add an expression type for negated versions of other expression
4586	// variants.
4587	if (Sub)
4588	return nullptr;
4589
4590	// Match reduce.add(ext(mul(A, B))).
4591	if (!Red->isPartialReduction() &&
4592	match(V: VecOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))))) {
4593	auto *Ext = cast<VPWidenCastRecipe>(Val: VecOp);
4594	auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: `0`));
4595	auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(Val: A);
4596	auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(Val: B);
4597
4598	// reduce.add(ext(mul(ext, const)))
4599	// -> reduce.add(ext(mul(ext, ext(const))))
4600	ExtendAndReplaceConstantOp (Ext0, Ext1, B, Mul);
4601
4602	// reduce.add(ext(mul(ext(A), ext(B))))
4603	// -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4604	// The inner extends must either have the same opcode as the outer extend or
4605	// be the same, in which case the multiply can never result in a negative
4606	// value and the outer extend can be folded away by doing wider
4607	// extends for the operands of the mul.
4608	if (Ext0 && Ext1 &&
4609	(Ext->getOpcode() == Ext0->getOpcode() \|\| Ext0 == Ext1) &&
4610	Ext0->getOpcode() == Ext1->getOpcode() &&
4611	IsMulAccValidAndClampRange (Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4612	auto NewExt0 = new* VPWidenCastRecipe (
4613	Ext0->getOpcode(), Ext0->getOperand(N: `0`), Ext->getResultType(), nullptr,
4614	Ext0, Ext0, Ext0->getDebugLoc());
4615	NewExt0->insertBefore(InsertPos: Ext0);
4616
4617	VPWidenCastRecipe *NewExt1 = NewExt0;
4618	if (Ext0 != Ext1) {
4619	NewExt1 = new VPWidenCastRecipe (Ext1->getOpcode(), Ext1->getOperand(N: `0`),
4620	Ext->getResultType(), nullptr, *Ext1,
4621	*Ext1, Ext1->getDebugLoc());
4622	NewExt1->insertBefore(InsertPos: Ext1);
4623	}
4624	Mul->setOperand(I: `0`, New: NewExt0);
4625	Mul->setOperand(I: `1`, New: NewExt1);
4626	Red->setOperand(I: `1`, New: Mul);
4627	return new VPExpressionRecipe (NewExt0, NewExt1, Mul, Red);
4628	}
4629	}
4630	return nullptr;
4631	}
4632
4633	/// This function tries to create abstract recipes from the reduction recipe for
4634	/// following optimizations and cost estimation.
4635	static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
4636	VPCostContext &Ctx,
4637	VFRange &Range) {
4638	VPExpressionRecipe AbstractR = nullptr*;
4639	auto IP = std::next(x: Red->getIterator());
4640	auto *VPBB = Red->getParent();
4641	if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4642	AbstractR = MulAcc;
4643	else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4644	AbstractR = ExtRed;
4645	// Cannot create abstract inloop reduction recipes.
4646	if (!AbstractR)
4647	return;
4648
4649	AbstractR->insertBefore(BB&: *VPBB, IP);
4650	Red->replaceAllUsesWith(New: AbstractR);
4651	}
4652
4653	void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
4654	VFRange &Range) {
4655	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4656	Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
4657	for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
4658	if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R))
4659	tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
4660	}
4661	}
4662	}
4663
4664	void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
4665	if (Plan.hasScalarVFOnly())
4666	return;
4667
4668	#ifndef NDEBUG
4669	VPDominatorTree VPDT(Plan);
4670	#endif
4671
4672	SmallVector<VPValue *> VPValues;
4673	if (Plan.getOrCreateBackedgeTakenCount()->getNumUsers() > `0`)
4674	VPValues.push_back(Elt: Plan.getOrCreateBackedgeTakenCount());
4675	append_range(C&: VPValues, R: Plan.getLiveIns());
4676	for (VPRecipeBase &R : *Plan.getEntry())
4677	append_range(C&: VPValues, R: R.definedValues());
4678
4679	auto *VectorPreheader = Plan.getVectorPreheader();
4680	for (VPValue *VPV : VPValues) {
4681	if (vputils::onlyScalarValuesUsed(Def: VPV) \|\|
4682	(isa<VPIRValue>(Val: VPV) && isa<Constant>(Val: VPV->getLiveInIRValue())))
4683	continue;
4684
4685	// Add explicit broadcast at the insert point that dominates all users.
4686	VPBasicBlock *HoistBlock = VectorPreheader;
4687	VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4688	for (VPUser *User : VPV->users()) {
4689	if (User->usesScalars(Op: VPV))
4690	continue;
4691	if (cast<VPRecipeBase>(Val: User)->getParent() == VectorPreheader)
4692	HoistPoint = HoistBlock->begin();
4693	else
4694	assert(VPDT.dominates(VectorPreheader,
4695	cast<VPRecipeBase>(User)->getParent()) &&
4696	"All users must be in the vector preheader or dominated by it");
4697	}
4698
4699	VPBuilder Builder(cast<VPBasicBlock>(Val: HoistBlock), HoistPoint);
4700	auto *Broadcast = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: {VPV});
4701	VPV->replaceUsesWithIf(New: Broadcast,
4702	ShouldReplace: [VPV, Broadcast](VPUser &U, unsigned Idx) {
4703	return Broadcast != &U && !U.usesScalars(Op: VPV);
4704	});
4705	}
4706	}
4707
4708	void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
4709	VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4710
4711	// Collect candidate loads with invariant addresses and noalias scopes
4712	// metadata and memory-writing recipes with noalias metadata.
4713	SmallVector<std::pair<VPRecipeBase *, MemoryLocation>> CandidateLoads;
4714	SmallVector<MemoryLocation> Stores;
4715	for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4716	Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) {
4717	for (VPRecipeBase &R : *VPBB) {
4718	// Only handle single-scalar replicated loads with invariant addresses.
4719	if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
4720	if (RepR->isPredicated() \|\| !RepR->isSingleScalar() \|\|
4721	RepR->getOpcode() != Instruction::Load)
4722	continue;
4723
4724	VPValue *Addr = RepR->getOperand(N: `0`);
4725	if (Addr->isDefinedOutsideLoopRegions()) {
4726	MemoryLocation Loc = vputils::getMemoryLocation(R: RepR);
4727	if (!Loc.AATags.Scope)
4728	continue;
4729	CandidateLoads.push_back(Elt: {RepR, Loc});
4730	}
4731	}
4732	if (R.mayWriteToMemory()) {
4733	auto Loc = vputils::getMemoryLocation(R);
4734	if (!Loc \|\| !Loc ->AATags.Scope \|\| !Loc ->AATags.NoAlias)
4735	return;
4736	Stores.push_back(Elt: *Loc);
4737	}
4738	}
4739	}
4740
4741	VPBasicBlock *Preheader = Plan.getVectorPreheader();
4742	for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4743	// Hoist the load to the preheader if it doesn't alias with any stores
4744	// according to the noalias metadata. Other loads should have been hoisted
4745	// by other passes
4746	const AAMDNodes &LoadAA = LoadLoc.AATags;
4747	if (all_of(Range&: Stores, P: [&](const MemoryLocation &StoreLoc) {
4748	return !ScopedNoAliasAAResult::mayAliasInScopes(
4749	Scopes: LoadAA.Scope, NoAlias: StoreLoc.AATags.NoAlias);
4750	})) {
4751	LoadRecipe->moveBefore(BB&: *Preheader, I: Preheader->getFirstNonPhi());
4752	}
4753	}
4754	}
4755
4756	// Collect common metadata from a group of replicate recipes by intersecting
4757	// metadata from all recipes in the group.
4758	static VPIRMetadata getCommonMetadata(ArrayRef<VPReplicateRecipe *> Recipes) {
4759	VPIRMetadata CommonMetadata = *Recipes.front();
4760	for (VPReplicateRecipe *Recipe : drop_begin(RangeOrContainer&: Recipes))
4761	CommonMetadata.intersect(MD: *Recipe);
4762	return CommonMetadata;
4763	}
4764
4765	template <unsigned Opcode>
4766	static SmallVector<SmallVector<VPReplicateRecipe *, `4`>>
4767	collectComplementaryPredicatedMemOps(VPlan &Plan,
4768	PredicatedScalarEvolution &PSE,
4769	const Loop *L) {
4770	static_assert(Opcode == Instruction::Load \|\| Opcode == Instruction::Store,
4771	"Only Load and Store opcodes supported");
4772	constexpr bool IsLoad = (Opcode == Instruction::Load);
4773	VPTypeAnalysis TypeInfo(Plan);
4774
4775	// For each address, collect operations with the same or complementary masks.
4776	SmallVector<SmallVector<VPReplicateRecipe *, `4`>> AllGroups;
4777	auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4778	return TypeInfo.inferScalarType(V: IsLoad ? Recipe : Recipe->getOperand(N: `0`));
4779	};
4780	auto Groups = collectGroupedReplicateMemOps<Opcode>(
4781	Plan, PSE, L,
4782	[](VPReplicateRecipe RepR) { return* RepR->isPredicated(); });
4783	for (auto Recipes : Groups) {
4784	if (Recipes.size() < `2`)
4785	continue;
4786
4787	// Collect groups with the same or complementary masks.
4788	for (VPReplicateRecipe *&RecipeI : Recipes) {
4789	if (!RecipeI)
4790	continue;
4791
4792	VPValue *MaskI = RecipeI->getMask();
4793	Type *TypeI = GetLoadStoreValueType(RecipeI);
4794	SmallVector<VPReplicateRecipe *, `4`> Group;
4795	Group.push_back(Elt: RecipeI);
4796	RecipeI = nullptr;
4797
4798	// Find all operations with the same or complementary masks.
4799	bool HasComplementaryMask = false;
4800	for (VPReplicateRecipe *&RecipeJ : Recipes) {
4801	if (!RecipeJ)
4802	continue;
4803
4804	VPValue *MaskJ = RecipeJ->getMask();
4805	Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4806	if (TypeI == TypeJ) {
4807	// Check if any operation in the group has a complementary mask with
4808	// another, that is M1 == NOT(M2) or M2 == NOT(M1).
4809	HasComplementaryMask \|= match(V: MaskI, P: m_Not(Op0: m_Specific(VPV: MaskJ))) \|\|
4810	match(V: MaskJ, P: m_Not(Op0: m_Specific(VPV: MaskI)));
4811	Group.push_back(Elt: RecipeJ);
4812	RecipeJ = nullptr;
4813	}
4814	}
4815
4816	if (HasComplementaryMask) {
4817	assert(Group.size() >= `2` && "must have at least 2 entries");
4818	AllGroups.push_back(Elt: std::move(Group));
4819	}
4820	}
4821	}
4822
4823	return AllGroups;
4824	}
4825
4826	// Find the recipe with minimum alignment in the group.
4827	template <typename InstType>
4828	static VPReplicateRecipe *
4829	findRecipeWithMinAlign(ArrayRef<VPReplicateRecipe *> Group) {
4830	return min_element(Group, [](VPReplicateRecipe A, VPReplicateRecipe *B) {
4831	return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4832	cast<InstType>(B->getUnderlyingInstr())->getAlign();
4833	});
4834	}
4835
4836	void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan,
4837	PredicatedScalarEvolution &PSE,
4838	const Loop *L) {
4839	auto Groups =
4840	collectComplementaryPredicatedMemOps<Instruction::Load>(Plan, PSE, L);
4841	if (Groups.empty())
4842	return;
4843
4844	// Process each group of loads.
4845	for (auto &Group : Groups) {
4846	// Try to use the earliest (most dominating) load to replace all others.
4847	VPReplicateRecipe *EarliestLoad = Group [`0`];
4848	VPBasicBlock *FirstBB = EarliestLoad->getParent();
4849	VPBasicBlock *LastBB = Group.back()->getParent();
4850
4851	// Check that the load doesn't alias with stores between first and last.
4852	auto LoadLoc = vputils::getMemoryLocation(R: *EarliestLoad);
4853	if (!LoadLoc \|\| !canHoistOrSinkWithNoAliasCheck(MemLoc: *LoadLoc, FirstBB, LastBB))
4854	continue;
4855
4856	// Collect common metadata from all loads in the group.
4857	VPIRMetadata CommonMetadata = getCommonMetadata(Recipes: Group);
4858
4859	// Find the load with minimum alignment to use.
4860	auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4861
4862	bool IsSingleScalar = EarliestLoad->isSingleScalar();
4863	assert(all_of(Group,
4864	[IsSingleScalar](VPReplicateRecipe *R) {
4865	return R->isSingleScalar() == IsSingleScalar;
4866	}) &&
4867	"all members in group must agree on IsSingleScalar");
4868
4869	// Create an unpredicated version of the earliest load with common
4870	// metadata.
4871	auto UnpredicatedLoad = new* VPReplicateRecipe (
4872	LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(N: `0`)},
4873	IsSingleScalar, /Mask=/nullptr, *EarliestLoad, CommonMetadata);
4874
4875	UnpredicatedLoad->insertBefore(InsertPos: EarliestLoad);
4876
4877	// Replace all loads in the group with the unpredicated load.
4878	for (VPReplicateRecipe *Load : Group) {
4879	Load->replaceAllUsesWith(New: UnpredicatedLoad);
4880	Load->eraseFromParent();
4881	}
4882	}
4883	}
4884
4885	static bool
4886	canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,
4887	PredicatedScalarEvolution &PSE, const Loop &L,
4888	VPTypeAnalysis &TypeInfo) {
4889	auto StoreLoc = vputils::getMemoryLocation(R: *StoresToSink.front());
4890	if (!StoreLoc \|\| !StoreLoc ->AATags.Scope)
4891	return false;
4892
4893	// When sinking a group of stores, all members of the group alias each other.
4894	// Skip them during the alias checks.
4895	SmallPtrSet<VPRecipeBase *, `4`> StoresToSinkSet(StoresToSink.begin(),
4896	StoresToSink.end());
4897
4898	VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4899	VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4900	SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink [`0`], PSE, L, TypeInfo);
4901	return canHoistOrSinkWithNoAliasCheck(MemLoc: *StoreLoc, FirstBB, LastBB, SinkInfo);
4902	}
4903
4904	void VPlanTransforms::sinkPredicatedStores(VPlan &Plan,
4905	PredicatedScalarEvolution &PSE,
4906	const Loop *L) {
4907	auto Groups =
4908	collectComplementaryPredicatedMemOps<Instruction::Store>(Plan, PSE, L);
4909	if (Groups.empty())
4910	return;
4911
4912	VPTypeAnalysis TypeInfo(Plan);
4913
4914	for (auto &Group : Groups) {
4915	if (!canSinkStoreWithNoAliasCheck(StoresToSink: Group, PSE, L: *L, TypeInfo))
4916	continue;
4917
4918	// Use the last (most dominated) store's location for the unconditional
4919	// store.
4920	VPReplicateRecipe *LastStore = Group.back();
4921	VPBasicBlock *InsertBB = LastStore->getParent();
4922
4923	// Collect common alias metadata from all stores in the group.
4924	VPIRMetadata CommonMetadata = getCommonMetadata(Recipes: Group);
4925
4926	// Build select chain for stored values.
4927	VPValue *SelectedValue = Group [`0`]->getOperand(N: `0`);
4928	VPBuilder Builder(InsertBB, LastStore->getIterator());
4929
4930	bool IsSingleScalar = Group [`0`]->isSingleScalar();
4931	for (unsigned I = `1`; I < Group.size(); ++I) {
4932	assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4933	"all members in group must agree on IsSingleScalar");
4934	VPValue *Mask = Group [I]->getMask();
4935	VPValue *Value = Group [I]->getOperand(N: `0`);
4936	SelectedValue = Builder.createSelect(Cond: Mask, TrueVal: Value, FalseVal: SelectedValue,
4937	DL: Group [I]->getDebugLoc());
4938	}
4939
4940	// Find the store with minimum alignment to use.
4941	auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4942
4943	// Create unconditional store with selected value and common metadata.
4944	auto UnpredicatedStore = new* VPReplicateRecipe (
4945	StoreWithMinAlign->getUnderlyingInstr(),
4946	{SelectedValue, LastStore->getOperand(N: `1`)}, IsSingleScalar,
4947	/Mask=/nullptr, *LastStore, CommonMetadata);
4948	UnpredicatedStore->insertBefore(BB&: *InsertBB, IP: LastStore->getIterator());
4949
4950	// Remove all predicated stores from the group.
4951	for (VPReplicateRecipe *Store : Group)
4952	Store->eraseFromParent();
4953	}
4954	}
4955
4956	void VPlanTransforms::materializeConstantVectorTripCount(
4957	VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4958	PredicatedScalarEvolution &PSE) {
4959	assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4960	assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4961
4962	VPValue *TC = Plan.getTripCount();
4963	if (TC->getNumUsers() == `0`)
4964	return;
4965
4966	// Skip cases for which the trip count may be non-trivial to materialize.
4967	// I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4968	// tail is required.
4969	if (!Plan.hasScalarTail() \|\|
4970	Plan.getMiddleBlock()->getSingleSuccessor() ==
4971	Plan.getScalarPreheader() \|\|
4972	!isa<VPIRValue>(Val: TC))
4973	return;
4974
4975	// Materialize vector trip counts for constants early if it can simply
4976	// be computed as (Original TC / VF UF) * VF * UF.*
4977	// TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4978	// tail-folded loops.
4979	ScalarEvolution &SE = *PSE.getSE();
4980	auto *TCScev = SE.getSCEV(V: TC->getLiveInIRValue());
4981	if (!isa<SCEVConstant>(Val: TCScev))
4982	return;
4983	const SCEV VFxUF = SE.getElementCount(Ty: TCScev->getType(), EC: BestVF BestUF);
4984	auto VecTCScev = SE.getMulExpr(LHS: SE.getUDivExpr(LHS: TCScev, RHS: VFxUF), RHS: VFxUF);
4985	if (auto *ConstVecTC = dyn_cast<SCEVConstant>(Val: VecTCScev))
4986	Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4987	}
4988
4989	void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
4990	VPBasicBlock *VectorPH) {
4991	VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
4992	if (BTC->getNumUsers() == `0`)
4993	return;
4994
4995	VPBuilder Builder(VectorPH, VectorPH->begin());
4996	auto *TCTy = VPTypeAnalysis (Plan).inferScalarType(V: Plan.getTripCount());
4997	auto *TCMO =
4998	Builder.createSub(LHS: Plan.getTripCount(), RHS: Plan.getConstantInt(Ty: TCTy, Val: `1`),
4999	DL: DebugLoc::getCompilerGenerated(), Name: "trip.count.minus.1");
5000	BTC->replaceAllUsesWith(New: TCMO);
5001	}
5002
5003	void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
5004	if (Plan.hasScalarVFOnly())
5005	return;
5006
5007	VPTypeAnalysis TypeInfo(Plan);
5008	VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5009	auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5010	Range: vp_depth_first_shallow(G: Plan.getEntry()));
5011	auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5012	Range: vp_depth_first_shallow(G: LoopRegion->getEntry()));
5013	// Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5014	// VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5015	// regions. Those are not materialized explicitly yet. Those vector users are
5016	// still handled in VPReplicateRegion::execute(), via shouldPack().
5017	// TODO: materialize build vectors for replicating recipes in replicating
5018	// regions.
5019	for (VPBasicBlock *VPBB :
5020	concat<VPBasicBlock *>(Ranges&: VPBBsOutsideLoopRegion, Ranges&: VPBBsInsideLoopRegion)) {
5021	for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
5022	if (!isa<VPScalarIVStepsRecipe, VPReplicateRecipe, VPInstruction>(Val: &R))
5023	continue;
5024	auto *DefR = cast<VPSingleDefRecipe>(Val: &R);
5025	auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5026	VPRegionBlock *ParentRegion = cast<VPRecipeBase>(Val: U)->getRegion();
5027	return !U->usesScalars(Op: DefR) \|\| ParentRegion != LoopRegion;
5028	};
5029	if ((isa<VPReplicateRecipe>(Val: DefR) &&
5030	cast<VPReplicateRecipe>(Val: DefR)->isSingleScalar()) \|\|
5031	(isa<VPInstruction>(Val: DefR) &&
5032	(vputils::onlyFirstLaneUsed(Def: DefR) \|\|
5033	!cast<VPInstruction>(Val: DefR)->doesGeneratePerAllLanes())) \|\|
5034	none_of(Range: DefR->users(), P: UsesVectorOrInsideReplicateRegion))
5035	continue;
5036
5037	Type *ScalarTy = TypeInfo.inferScalarType(V: DefR);
5038	unsigned Opcode = ScalarTy->isStructTy()
5039	? VPInstruction::BuildStructVector
5040	: VPInstruction::BuildVector;
5041	auto BuildVector = new* VPInstruction (Opcode, {DefR});
5042	BuildVector->insertAfter(InsertPos: DefR);
5043
5044	DefR->replaceUsesWithIf(
5045	New: BuildVector, ShouldReplace: [BuildVector, &UsesVectorOrInsideReplicateRegion](
5046	VPUser &U, unsigned) {
5047	return &U != BuildVector && UsesVectorOrInsideReplicateRegion (&U);
5048	});
5049	}
5050	}
5051
5052	// Create explicit VPInstructions to convert vectors to scalars. The current
5053	// implementation is conservative - it may miss some cases that may or may not
5054	// be vector values. TODO: introduce Unpacks speculatively - remove them later
5055	// if they are known to operate on scalar values.
5056	for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5057	for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
5058	if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe,
5059	VPDerivedIVRecipe, VPCanonicalIVPHIRecipe>(Val: &R))
5060	continue;
5061	for (VPValue *Def : R.definedValues()) {
5062	// Skip recipes that are single-scalar or only have their first lane
5063	// used.
5064	// TODO: The Defs skipped here may or may not be vector values.
5065	// Introduce Unpacks, and remove them later, if they are guaranteed to
5066	// produce scalar values.
5067	if (vputils::isSingleScalar(VPV: Def) \|\| vputils::onlyFirstLaneUsed(Def))
5068	continue;
5069
5070	// At the moment, we create unpacks only for scalar users outside
5071	// replicate regions. Recipes inside replicate regions still extract the
5072	// required lanes implicitly.
5073	// TODO: Remove once replicate regions are unrolled completely.
5074	auto IsCandidateUnpackUser = [Def](VPUser *U) {
5075	VPRegionBlock *ParentRegion = cast<VPRecipeBase>(Val: U)->getRegion();
5076	return U->usesScalars(Op: Def) &&
5077	(!ParentRegion \|\| !ParentRegion->isReplicator());
5078	};
5079	if (none_of(Range: Def->users(), P: IsCandidateUnpackUser))
5080	continue;
5081
5082	auto Unpack = new* VPInstruction (VPInstruction::Unpack, {Def});
5083	if (R.isPhi())
5084	Unpack->insertBefore(BB&: *VPBB, IP: VPBB->getFirstNonPhi());
5085	else
5086	Unpack->insertAfter(InsertPos: &R);
5087	Def->replaceUsesWithIf(New: Unpack,
5088	ShouldReplace: [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5089	return IsCandidateUnpackUser (&U);
5090	});
5091	}
5092	}
5093	}
5094	}
5095
5096	void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
5097	VPBasicBlock *VectorPHVPBB,
5098	bool TailByMasking,
5099	bool RequiresScalarEpilogue,
5100	VPValue *Step) {
5101	VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5102	// There's nothing to do if there are no users of the vector trip count or its
5103	// IR value has already been set.
5104	if (VectorTC.getNumUsers() == `0` \|\| VectorTC.getUnderlyingValue())
5105	return;
5106
5107	VPValue *TC = Plan.getTripCount();
5108	Type *TCTy = VPTypeAnalysis (Plan).inferScalarType(V: TC);
5109	VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5110	if (auto *StepR = Step->getDefiningRecipe()) {
5111	assert(StepR->getParent() == VectorPHVPBB &&
5112	"Step must be defined in VectorPHVPBB");
5113	// Insert after Step's definition to maintain valid def-use ordering.
5114	InsertPt = std::next(x: StepR->getIterator());
5115	}
5116	VPBuilder Builder(VectorPHVPBB, InsertPt);
5117
5118	// If the tail is to be folded by masking, round the number of iterations N
5119	// up to a multiple of Step instead of rounding down. This is done by first
5120	// adding Step-1 and then rounding down. Note that it's ok if this addition
5121	// overflows: the vector induction variable will eventually wrap to zero given
5122	// that it starts at zero and its Step is a power of two; the loop will then
5123	// exit, with the last early-exit vector comparison also producing all-true.
5124	if (TailByMasking) {
5125	TC = Builder.createAdd(
5126	LHS: TC, RHS: Builder.createSub(LHS: Step, RHS: Plan.getConstantInt(Ty: TCTy, Val: `1`)),
5127	DL: DebugLoc::getCompilerGenerated(), Name: "n.rnd.up");
5128	}
5129
5130	// Now we need to generate the expression for the part of the loop that the
5131	// vectorized body will execute. This is equal to N - (N % Step) if scalar
5132	// iterations are not required for correctness, or N - Step, otherwise. Step
5133	// is equal to the vectorization factor (number of SIMD elements) times the
5134	// unroll factor (number of SIMD instructions).
5135	VPValue *R =
5136	Builder.createNaryOp(Opcode: Instruction::URem, Operands: {TC, Step},
5137	DL: DebugLoc::getCompilerGenerated(), Name: "n.mod.vf");
5138
5139	// There are cases where we must* run at least one iteration in the remainder*
5140	// loop. See the cost model for when this can happen. If the step evenly
5141	// divides the trip count, we set the remainder to be equal to the step. If
5142	// the step does not evenly divide the trip count, no adjustment is necessary
5143	// since there will already be scalar iterations. Note that the minimum
5144	// iterations check ensures that N >= Step.
5145	if (RequiresScalarEpilogue) {
5146	assert(!TailByMasking &&
5147	"requiring scalar epilogue is not supported with fail folding");
5148	VPValue *IsZero =
5149	Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: R, B: Plan.getZero(Ty: TCTy));
5150	R = Builder.createSelect(Cond: IsZero, TrueVal: Step, FalseVal: R);
5151	}
5152
5153	VPValue *Res =
5154	Builder.createSub(LHS: TC, RHS: R, DL: DebugLoc::getCompilerGenerated(), Name: "n.vec");
5155	VectorTC.replaceAllUsesWith(New: Res);
5156	}
5157
5158	void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
5159	ElementCount VFEC) {
5160	// If VF and VFxUF have already been materialized (no remaining users),
5161	// there's nothing more to do.
5162	if (Plan.getVF().isMaterialized()) {
5163	assert(Plan.getVFxUF().isMaterialized() &&
5164	"VF and VFxUF must be materialized together");
5165	return;
5166	}
5167
5168	VPBuilder Builder(VectorPH, VectorPH->begin());
5169	Type *TCTy = VPTypeAnalysis (Plan).inferScalarType(V: Plan.getTripCount());
5170	VPValue &VF = Plan.getVF();
5171	VPValue &VFxUF = Plan.getVFxUF();
5172	// If there are no users of the runtime VF, compute VFxUF by constant folding
5173	// the multiplication of VF and UF.
5174	if (VF.getNumUsers() == `0`) {
5175	VPValue *RuntimeVFxUF =
5176	Builder.createElementCount(Ty: TCTy, EC: VFEC * Plan.getConcreteUF());
5177	VFxUF.replaceAllUsesWith(New: RuntimeVFxUF);
5178	return;
5179	}
5180
5181	// For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5182	// vscale) UF.*
5183	VPValue *RuntimeVF = Builder.createElementCount(Ty: TCTy, EC: VFEC);
5184	if (!vputils::onlyScalarValuesUsed(Def: &VF)) {
5185	VPValue *BC = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: RuntimeVF);
5186	VF.replaceUsesWithIf(
5187	New: BC, ShouldReplace: [&VF](VPUser &U, unsigned) { return !U.usesScalars(Op: &VF); });
5188	}
5189	VF.replaceAllUsesWith(New: RuntimeVF);
5190
5191	VPValue *MulByUF = Builder.createOverflowingOp(
5192	Opcode: Instruction::Mul,
5193	Operands: {RuntimeVF, Plan.getConstantInt(Ty: TCTy, Val: Plan.getConcreteUF())},
5194	WrapFlags: {true, false});
5195	VFxUF.replaceAllUsesWith(New: MulByUF);
5196	}
5197
5198	DenseMap<const SCEV , Value >
5199	VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
5200	SCEVExpander Expander(SE, "induction", /PreserveLCSSA=/false);
5201
5202	auto *Entry = cast<VPIRBasicBlock>(Val: Plan.getEntry());
5203	BasicBlock *EntryBB = Entry->getIRBasicBlock();
5204	DenseMap<const SCEV , Value > ExpandedSCEVs;
5205	for (VPRecipeBase &R : make_early_inc_range(Range&: *Entry)) {
5206	if (isa<VPIRInstruction, VPIRPhi>(Val: &R))
5207	continue;
5208	auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
5209	if (!ExpSCEV)
5210	break;
5211	const SCEV *Expr = ExpSCEV->getSCEV();
5212	Value *Res =
5213	Expander.expandCodeFor(SH: Expr, Ty: Expr->getType(), I: EntryBB->getTerminator());
5214	ExpandedSCEVs [ExpSCEV->getSCEV()] = Res;
5215	VPValue *Exp = Plan.getOrAddLiveIn(V: Res);
5216	ExpSCEV->replaceAllUsesWith(New: Exp);
5217	if (Plan.getTripCount() == ExpSCEV)
5218	Plan.resetTripCount(NewTripCount: Exp);
5219	ExpSCEV->eraseFromParent();
5220	}
5221	assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&
5222	"VPExpandSCEVRecipes must be at the beginning of the entry block, "
5223	"before any VPIRInstructions");
5224	// Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5225	// to the VPIRBasicBlock.
5226	auto EI = Entry->begin();
5227	for (Instruction &I : drop_end(RangeOrContainer&: *EntryBB)) {
5228	if (EI != Entry->end() && isa<VPIRInstruction>(Val: *EI) &&
5229	&cast<VPIRInstruction>(Val: &*EI)->getInstruction() == &I) {
5230	EI ++;
5231	continue;
5232	}
5233	VPIRInstruction::create(I)->insertBefore(BB&: *Entry, IP: EI);
5234	}
5235
5236	return ExpandedSCEVs;
5237	}
5238
5239	/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5240	/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5241	/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5242	/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5243	/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5244	/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5245	/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5246	/// is defined at \p Idx of a load interleave group.
5247	static bool canNarrowLoad(VPSingleDefRecipe WideMember0, unsigned* OpIdx,
5248	VPValue OpV, unsigned* Idx, bool IsScalable) {
5249	VPValue *Member0Op = WideMember0->getOperand(N: OpIdx);
5250	VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5251	if (!Member0OpR)
5252	return Member0Op == OpV;
5253	if (auto *W = dyn_cast<VPWidenLoadRecipe>(Val: Member0OpR))
5254	// For scalable VFs, the narrowed plan processes vscale iterations at once,
5255	// so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5256	return !IsScalable && !W->getMask() && W->isConsecutive() &&
5257	Member0Op == OpV;
5258	if (auto *IR = dyn_cast<VPInterleaveRecipe>(Val: Member0OpR))
5259	return IR->getInterleaveGroup()->isFull() && IR->getVPValue(I: Idx) == OpV;
5260	return false;
5261	}
5262
5263	static bool canNarrowOps(ArrayRef<VPValue > Ops, bool* IsScalable) {
5264	SmallVector<VPValue *> Ops0;
5265	auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Val: Ops [`0`]);
5266	if (!WideMember0)
5267	return false;
5268	for (VPValue *V : Ops) {
5269	if (!isa<VPWidenRecipe, VPWidenCastRecipe>(Val: V))
5270	return false;
5271	auto *R = cast<VPSingleDefRecipe>(Val: V);
5272	if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(R: WideMember0))
5273	return false;
5274	}
5275
5276	for (unsigned Idx = `0`; Idx != WideMember0->getNumOperands(); ++Idx) {
5277	SmallVector<VPValue *> OpsI;
5278	for (VPValue *Op : Ops)
5279	OpsI.push_back(Elt: Op->getDefiningRecipe()->getOperand(N: Idx));
5280
5281	if (canNarrowOps(Ops: OpsI, IsScalable))
5282	continue;
5283
5284	if (any_of(Range: enumerate(First&: OpsI), P: [WideMember0, Idx, IsScalable](const auto &P) {
5285	const auto &[OpIdx, OpV] = P;
5286	return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5287	}))
5288	return false;
5289	}
5290
5291	return true;
5292	}
5293
5294	/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5295	/// number of members both equal to VF. The interleave group must also access
5296	/// the full vector width.
5297	static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5298	VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
5299	VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5300	if (!InterleaveR \|\| InterleaveR->getMask())
5301	return std::nullopt;
5302
5303	Type GroupElementTy = nullptr*;
5304	if (InterleaveR->getStoredValues().empty()) {
5305	GroupElementTy = TypeInfo.inferScalarType(V: InterleaveR->getVPValue(I: `0`));
5306	if (!all_of(Range: InterleaveR->definedValues(),
5307	P: [&TypeInfo, GroupElementTy](VPValue *Op) {
5308	return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
5309	}))
5310	return std::nullopt;
5311	} else {
5312	GroupElementTy =
5313	TypeInfo.inferScalarType(V: InterleaveR->getStoredValues()[`0`]);
5314	if (!all_of(Range: InterleaveR->getStoredValues(),
5315	P: [&TypeInfo, GroupElementTy](VPValue *Op) {
5316	return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
5317	}))
5318	return std::nullopt;
5319	}
5320
5321	auto IG = InterleaveR->getInterleaveGroup();
5322	if (IG->getFactor() != IG->getNumMembers())
5323	return std::nullopt;
5324
5325	auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5326	TypeSize Size = TTI.getRegisterBitWidth(
5327	K: VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
5328	: TargetTransformInfo::RGK_ScalableVector);
5329	assert(Size.isScalable() == VF.isScalable() &&
5330	"if Size is scalable, VF must be scalable and vice versa");
5331	return Size.getKnownMinValue();
5332	};
5333
5334	for (ElementCount VF : VFs) {
5335	unsigned MinVal = VF.getKnownMinValue();
5336	unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5337	if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF (VF))
5338	return {VF};
5339	}
5340	return std::nullopt;
5341	}
5342
5343	/// Returns true if \p VPValue is a narrow VPValue.
5344	static bool isAlreadyNarrow(VPValue *VPV) {
5345	if (isa<VPIRValue>(Val: VPV))
5346	return true;
5347	auto *RepR = dyn_cast<VPReplicateRecipe>(Val: VPV);
5348	return RepR && RepR->isSingleScalar();
5349	}
5350
5351	// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5352	// a narrow variant.
5353	static VPValue *
5354	narrowInterleaveGroupOp(VPValue V, SmallPtrSetImpl<VPValue > &NarrowedOps) {
5355	auto *R = V->getDefiningRecipe();
5356	if (!R \|\| NarrowedOps.contains(Ptr: V))
5357	return V;
5358
5359	if (isAlreadyNarrow(VPV: V))
5360	return V;
5361
5362	if (isa<VPWidenRecipe, VPWidenCastRecipe>(Val: R)) {
5363	auto *WideMember0 = cast<VPSingleDefRecipe>(Val: R);
5364	for (unsigned Idx = `0`, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5365	WideMember0->setOperand(
5366	I: Idx,
5367	New: narrowInterleaveGroupOp(V: WideMember0->getOperand(N: Idx), NarrowedOps));
5368	return V;
5369	}
5370
5371	if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(Val: R)) {
5372	// Narrow interleave group to wide load, as transformed VPlan will only
5373	// process one original iteration.
5374	auto *LI = cast<LoadInst>(Val: LoadGroup->getInterleaveGroup()->getInsertPos());
5375	auto L = new* VPWidenLoadRecipe (
5376	LI, LoadGroup->getAddr(), LoadGroup->getMask(), /Consecutive=/*true,
5377	/Reverse=/false, {}, LoadGroup->getDebugLoc());
5378	L->insertBefore(InsertPos: LoadGroup);
5379	NarrowedOps.insert(Ptr: L);
5380	return L;
5381	}
5382
5383	if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: R)) {
5384	assert(RepR->isSingleScalar() &&
5385	isa<LoadInst>(RepR->getUnderlyingInstr()) &&
5386	"must be a single scalar load");
5387	NarrowedOps.insert(Ptr: RepR);
5388	return RepR;
5389	}
5390
5391	auto *WideLoad = cast<VPWidenLoadRecipe>(Val: R);
5392	VPValue *PtrOp = WideLoad->getAddr();
5393	if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Val: PtrOp))
5394	PtrOp = VecPtr->getOperand(N: `0`);
5395	// Narrow wide load to uniform scalar load, as transformed VPlan will only
5396	// process one original iteration.
5397	auto N = new* VPReplicateRecipe (&WideLoad->getIngredient(), {PtrOp},
5398	/IsUniform/ true,
5399	/Mask/ nullptr, {}, *WideLoad);
5400	N->insertBefore(InsertPos: WideLoad);
5401	NarrowedOps.insert(Ptr: N);
5402	return N;
5403	}
5404
5405	std::unique_ptr<VPlan>
5406	VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
5407	const TargetTransformInfo &TTI) {
5408	VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5409
5410	if (!VectorLoop)
5411	return nullptr;
5412
5413	// Only handle single-block loops for now.
5414	if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5415	return nullptr;
5416
5417	// Skip plans when we may not be able to properly narrow.
5418	VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5419	if (!match(V: &Exiting->back(), P: m_BranchOnCount()))
5420	return nullptr;
5421
5422	assert(match(&Exiting->back(),
5423	m_BranchOnCount(m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())),
5424	m_Specific(&Plan.getVectorTripCount()))) &&
5425	"unexpected branch-on-count");
5426
5427	VPTypeAnalysis TypeInfo(Plan);
5428	SmallVector<VPInterleaveRecipe *> StoreGroups;
5429	std::optional<ElementCount> VFToOptimize;
5430	for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5431	if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
5432	continue;
5433
5434	if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(Val: &R) &&
5435	vputils::onlyFirstLaneUsed(Def: cast<VPSingleDefRecipe>(Val: &R)))
5436	continue;
5437
5438	// Bail out on recipes not supported at the moment:
5439	// phi recipes other than the canonical induction*
5440	// recipes writing to memory except interleave groups*
5441	// Only support plans with a canonical induction phi.
5442	if (R.isPhi())
5443	return nullptr;
5444
5445	auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R);
5446	if (R.mayWriteToMemory() && !InterleaveR)
5447	return nullptr;
5448
5449	// All other ops are allowed, but we reject uses that cannot be converted
5450	// when checking all allowed consumers (store interleave groups) below.
5451	if (!InterleaveR)
5452	continue;
5453
5454	// Try to find a single VF, where all interleave groups are consecutive and
5455	// saturate the full vector width. If we already have a candidate VF, check
5456	// if it is applicable for the current InterleaveR, otherwise look for a
5457	// suitable VF across the Plan's VFs.
5458	SmallVector<ElementCount> VFs =
5459	VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5460	: to_vector(Range: Plan.vectorFactors());
5461	std::optional<ElementCount> NarrowedVF =
5462	isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
5463	if (!NarrowedVF \|\| (VFToOptimize && NarrowedVF != VFToOptimize))
5464	return nullptr;
5465	VFToOptimize = NarrowedVF;
5466
5467	// Skip read interleave groups.
5468	if (InterleaveR->getStoredValues().empty())
5469	continue;
5470
5471	// Narrow interleave groups, if all operands are already matching narrow
5472	// ops.
5473	auto *Member0 = InterleaveR->getStoredValues()[`0`];
5474	if (isAlreadyNarrow(VPV: Member0) &&
5475	all_of(Range: InterleaveR->getStoredValues(), P: equal_to(Arg&: Member0))) {
5476	StoreGroups.push_back(Elt: InterleaveR);
5477	continue;
5478	}
5479
5480	// For now, we only support full interleave groups storing load interleave
5481	// groups.
5482	if (all_of(Range: enumerate(First: InterleaveR->getStoredValues()), P: [](auto Op) {
5483	VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5484	if (!DefR)
5485	return false;
5486	auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR);
5487	return IR && IR->getInterleaveGroup()->isFull() &&
5488	IR->getVPValue(Op.index()) == Op.value();
5489	})) {
5490	StoreGroups.push_back(Elt: InterleaveR);
5491	continue;
5492	}
5493
5494	// Check if all values feeding InterleaveR are matching wide recipes, which
5495	// operands that can be narrowed.
5496	if (!canNarrowOps(Ops: InterleaveR->getStoredValues(),
5497	IsScalable: VFToOptimize ->isScalable()))
5498	return nullptr;
5499	StoreGroups.push_back(Elt: InterleaveR);
5500	}
5501
5502	if (StoreGroups.empty())
5503	return nullptr;
5504
5505	VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5506	bool RequiresScalarEpilogue =
5507	MiddleVPBB->getNumSuccessors() == `1` &&
5508	MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5509	// Bail out for tail-folding (middle block with a single successor to exit).
5510	if (MiddleVPBB->getNumSuccessors() != `2` && !RequiresScalarEpilogue)
5511	return nullptr;
5512
5513	// All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5514	// original Plan into 2: a) a new clone which contains all VFs of Plan, except
5515	// VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5516	// TODO: Handle cases where only some interleave groups can be narrowed.
5517	std::unique_ptr<VPlan> NewPlan;
5518	if (size(Range: Plan.vectorFactors()) != `1`) {
5519	NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5520	Plan.setVF(*VFToOptimize);
5521	NewPlan ->removeVF(VF: *VFToOptimize);
5522	}
5523
5524	// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5525	SmallPtrSet<VPValue *, `4`> NarrowedOps;
5526	// Narrow operation tree rooted at store groups.
5527	for (auto *StoreGroup : StoreGroups) {
5528	VPValue *Res =
5529	narrowInterleaveGroupOp(V: StoreGroup->getStoredValues()[`0`], NarrowedOps);
5530	auto *SI =
5531	cast<StoreInst>(Val: StoreGroup->getInterleaveGroup()->getInsertPos());
5532	auto S = new* VPWidenStoreRecipe (
5533	SI, StoreGroup->getAddr(), Res, nullptr, /Consecutive=/*true,
5534	/Reverse=/false, {}, StoreGroup->getDebugLoc());
5535	S->insertBefore(InsertPos: StoreGroup);
5536	StoreGroup->eraseFromParent();
5537	}
5538
5539	// Adjust induction to reflect that the transformed plan only processes one
5540	// original iteration.
5541	auto *CanIV = VectorLoop->getCanonicalIV();
5542	auto *Inc = cast<VPInstruction>(Val: CanIV->getBackedgeValue());
5543	VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5544	VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5545
5546	VPValue *UF = &Plan.getUF();
5547	VPValue *Step;
5548	if (VFToOptimize ->isScalable()) {
5549	VPValue *VScale = PHBuilder.createElementCount(
5550	Ty: VectorLoop->getCanonicalIVType(), EC: ElementCount::getScalable(MinVal: `1`));
5551	Step = PHBuilder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {VScale, UF},
5552	WrapFlags: {true, false});
5553	Plan.getVF().replaceAllUsesWith(New: VScale);
5554	} else {
5555	Step = UF;
5556	Plan.getVF().replaceAllUsesWith(
5557	New: Plan.getConstantInt(Ty: CanIV->getScalarType(), Val: `1`));
5558	}
5559	// Materialize vector trip count with the narrowed step.
5560	materializeVectorTripCount(Plan, VectorPHVPBB: VectorPH, /TailByMasking=/false,
5561	RequiresScalarEpilogue, Step);
5562
5563	Inc->setOperand(I: `1`, New: Step);
5564	Plan.getVFxUF().replaceAllUsesWith(New: Step);
5565
5566	removeDeadRecipes(Plan);
5567	assert(none_of(*VectorLoop->getEntryBasicBlock(),
5568	IsaPred<VPVectorPointerRecipe>) &&
5569	"All VPVectorPointerRecipes should have been removed");
5570	return NewPlan;
5571	}
5572
5573	/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5574	/// BranchOnCond recipe.
5575	void VPlanTransforms::addBranchWeightToMiddleTerminator(
5576	VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5577	VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5578	auto *MiddleTerm =
5579	dyn_cast_or_null<VPInstruction>(Val: MiddleVPBB->getTerminator());
5580	// Only add branch metadata if there is a (conditional) terminator.
5581	if (!MiddleTerm)
5582	return;
5583
5584	assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5585	"must have a BranchOnCond");
5586	// Assume that `TripCount % VectorStep ` is equally distributed.
5587	unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5588	if (VF.isScalable() && VScaleForTuning.has_value())
5589	VectorStep = VScaleForTuning;
5590	assert(VectorStep > `0` && "trip count should not be zero");
5591	MDBuilder MDB(Plan.getContext());
5592	MDNode *BranchWeights =
5593	MDB.createBranchWeights(Weights: {`1`, VectorStep - `1`}, /IsExpected=/false);
5594	MiddleTerm->setMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
5595	}
5596
5597	void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
5598	VFRange &Range) {
5599	VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5600	auto *MiddleVPBB = Plan.getMiddleBlock();
5601	VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5602
5603	auto IsScalableOne = [](ElementCount VF) -> bool {
5604	return VF == ElementCount::getScalable(MinVal: `1`);
5605	};
5606
5607	for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5608	auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhi);
5609	if (!FOR)
5610	continue;
5611
5612	assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5613	"Cannot handle loops with uncountable early exits");
5614
5615	// This is the second phase of vectorizing first-order recurrences, creating
5616	// extract for users outside the loop. An overview of the transformation is
5617	// described below. Suppose we have the following loop with some use after
5618	// the loop of the last a[i-1],
5619	//
5620	// for (int i = 0; i < n; ++i) {
5621	// t = a[i - 1];
5622	// b[i] = a[i] - t;
5623	// }
5624	// use t;
5625	//
5626	// There is a first-order recurrence on "a". For this loop, the shorthand
5627	// scalar IR looks like:
5628	//
5629	// scalar.ph:
5630	// s.init = a[-1]
5631	// br scalar.body
5632	//
5633	// scalar.body:
5634	// i = phi [0, scalar.ph], [i+1, scalar.body]
5635	// s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5636	// s2 = a[i]
5637	// b[i] = s2 - s1
5638	// br cond, scalar.body, exit.block
5639	//
5640	// exit.block:
5641	// use = lcssa.phi [s1, scalar.body]
5642	//
5643	// In this example, s1 is a recurrence because it's value depends on the
5644	// previous iteration. In the first phase of vectorization, we created a
5645	// VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5646	// for users in the scalar preheader and exit block.
5647	//
5648	// vector.ph:
5649	// v_init = vector(..., ..., ..., a[-1])
5650	// br vector.body
5651	//
5652	// vector.body
5653	// i = phi [0, vector.ph], [i+4, vector.body]
5654	// v1 = phi [v_init, vector.ph], [v2, vector.body]
5655	// v2 = a[i, i+1, i+2, i+3]
5656	// b[i] = v2 - v1
5657	// // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5658	// b[i, i+1, i+2, i+3] = v2 - v1
5659	// br cond, vector.body, middle.block
5660	//
5661	// middle.block:
5662	// vector.recur.extract.for.phi = v2(2)
5663	// vector.recur.extract = v2(3)
5664	// br cond, scalar.ph, exit.block
5665	//
5666	// scalar.ph:
5667	// scalar.recur.init = phi [vector.recur.extract, middle.block],
5668	// [s.init, otherwise]
5669	// br scalar.body
5670	//
5671	// scalar.body:
5672	// i = phi [0, scalar.ph], [i+1, scalar.body]
5673	// s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5674	// s2 = a[i]
5675	// b[i] = s2 - s1
5676	// br cond, scalar.body, exit.block
5677	//
5678	// exit.block:
5679	// lo = lcssa.phi [s1, scalar.body],
5680	// [vector.recur.extract.for.phi, middle.block]
5681	//
5682	// Now update VPIRInstructions modeling LCSSA phis in the exit block.
5683	// Extract the penultimate value of the recurrence and use it as operand for
5684	// the VPIRInstruction modeling the phi.
5685	for (VPRecipeBase &R : make_early_inc_range(
5686	Range: make_range(x: MiddleVPBB->getFirstNonPhi(), y: MiddleVPBB->end()))) {
5687	if (!match(V: &R, P: m_ExtractLastLaneOfLastPart(Op0: m_Specific(VPV: FOR))))
5688	continue;
5689
5690	// For VF vscale x 1, if vscale = 1, we are unable to extract the
5691	// penultimate value of the recurrence. Instead we rely on the existing
5692	// extract of the last element from the result of
5693	// VPInstruction::FirstOrderRecurrenceSplice.
5694	// TODO: Consider vscale_range info and UF.
5695	if (LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsScalableOne,
5696	Range))
5697	return;
5698	VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5699	Opcode: VPInstruction::ExtractPenultimateElement, Operands: FOR->getBackedgeValue(), DL: {},
5700	Name: "vector.recur.extract.for.phi");
5701	for (VPUser *U : to_vector(Range: cast<VPInstruction>(Val: &R)->users())) {
5702	auto *ExitPhi = dyn_cast<VPIRPhi>(Val: U);
5703	if (!ExitPhi)
5704	continue;
5705	ExitPhi->replaceUsesOfWith(From: cast<VPInstruction>(Val: &R), To: PenultimateElement);
5706	}
5707	}
5708	}
5709	}
5710
5711	void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
5712	PredicatedScalarEvolution &PSE,
5713	Loop &L) {
5714	ScalarEvolution &SE = *PSE.getSE();
5715	VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
5716
5717	// Helper lambda to check if the IV range excludes the sentinel value.
5718	auto CheckSentinel = [&SE](const SCEV IVSCEV, bool* UseMax,
5719	bool Signed) -> std::optional<APInt> {
5720	unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
5721	APInt Sentinel =
5722	UseMax
5723	? (Signed ? APInt::getSignedMinValue(numBits: BW) : APInt::getMinValue(numBits: BW))
5724	: (Signed ? APInt::getSignedMaxValue(numBits: BW) : APInt::getMaxValue(numBits: BW));
5725
5726	ConstantRange IVRange =
5727	Signed ? SE.getSignedRange(S: IVSCEV) : SE.getUnsignedRange(S: IVSCEV);
5728	if (!IVRange.contains(Val: Sentinel))
5729	return Sentinel;
5730	return std::nullopt;
5731	};
5732
5733	VPValue *HeaderMask = vputils::findHeaderMask(Plan);
5734	for (VPRecipeBase &Phi :
5735	make_early_inc_range(Range: VectorLoopRegion->getEntryBasicBlock()->phis())) {
5736	auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &Phi);
5737	if (!PhiR \|\| !RecurrenceDescriptor::isFindLastRecurrenceKind(
5738	Kind: PhiR->getRecurrenceKind()))
5739	continue;
5740
5741	Type *PhiTy = VPTypeAnalysis (Plan).inferScalarType(V: PhiR);
5742	if (PhiTy->isPointerTy() \|\| PhiTy->isFloatingPointTy())
5743	continue;
5744
5745	// If there's a header mask, the backedge select will not be the find-last
5746	// select.
5747	VPValue *BackedgeVal = PhiR->getBackedgeValue();
5748	VPValue *CondSelect = BackedgeVal;
5749	if (HeaderMask &&
5750	!match(V: BackedgeVal, P: m_Select(Op0: m_Specific(VPV: HeaderMask),
5751	Op1: m_VPValue(V&: CondSelect), Op2: m_Specific(VPV: PhiR))))
5752	llvm_unreachable("expected header mask select");
5753
5754	// Get the IV from the conditional select of the reduction phi.
5755	// The conditional select should be a select between the phi and the IV.
5756	VPValue Cond, TrueVal, *FalseVal;
5757	if (!match(V: CondSelect, P: m_Select(Op0: m_VPValue(V&: Cond), Op1: m_VPValue(V&: TrueVal),
5758	Op2: m_VPValue(V&: FalseVal))))
5759	continue;
5760
5761	// The non-phi operand of the select is the IV.
5762	assert(is_contained(CondSelect->getDefiningRecipe()->operands(), PhiR));
5763	VPValue *IV = TrueVal == PhiR ? FalseVal : TrueVal;
5764
5765	const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(V: IV, PSE, L: &L);
5766	const SCEV *Step;
5767	if (!match(S: IVSCEV, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V&: Step))))
5768	continue;
5769
5770	// Determine direction from SCEV step.
5771	if (!SE.isKnownNonZero(S: Step))
5772	continue;
5773
5774	// Positive step means we need UMax/SMax to find the last IV value, and
5775	// UMin/SMin otherwise.
5776	bool UseMax = SE.isKnownPositive(S: Step);
5777	bool UseSigned = true;
5778	std::optional<APInt> SentinelVal =
5779	CheckSentinel (IVSCEV, UseMax, /IsSigned=/true);
5780	if (!SentinelVal) {
5781	SentinelVal = CheckSentinel (IVSCEV, UseMax, /IsSigned=/false);
5782	UseSigned = false;
5783	}
5784
5785	// If no sentinel was found, fall back to a boolean AnyOf reduction to track
5786	// if the condition was ever true. Requires the IV to not wrap, otherwise we
5787	// cannot use min/max.
5788	if (!SentinelVal) {
5789	auto *AR = cast<SCEVAddRecExpr>(Val: IVSCEV);
5790	if (AR->hasNoSignedWrap())
5791	UseSigned = true;
5792	else if (AR->hasNoUnsignedWrap())
5793	UseSigned = false;
5794	else
5795	continue;
5796	}
5797
5798	VPInstruction *RdxResult = cast<VPInstruction>(Val: vputils::findRecipe(
5799	Start: BackedgeVal,
5800	Pred: match_fn(P: m_VPInstruction<VPInstruction::ComputeReductionResult>())));
5801
5802	RecurKind MinMaxKind =
5803	UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
5804	: (UseSigned ? RecurKind::SMin : RecurKind::UMin);
5805	VPIRFlags Flags(MinMaxKind, /IsOrdered=/false, /IsInLoop=/false,
5806	FastMathFlags ());
5807	DebugLoc ExitDL = RdxResult->getDebugLoc();
5808	VPBuilder MiddleBuilder(RdxResult);
5809	VPValue *ReducedIV =
5810	MiddleBuilder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
5811	Operands: RdxResult->getOperand(N: `0`), Flags, DL: ExitDL);
5812
5813	VPValue *NewRdxResult;
5814	VPValue *StartVPV = PhiR->getStartValue();
5815	if (SentinelVal) {
5816	// Sentinel-based approach: reduce IVs with min/max, compare against
5817	// sentinel to detect if condition was ever true, select accordingly.
5818	VPValue Sentinel = Plan.getConstantInt(Val: SentinelVal);
5819	auto *Cmp = MiddleBuilder.createICmp(Pred: CmpInst::ICMP_NE, A: ReducedIV,
5820	B: Sentinel, DL: ExitDL);
5821	NewRdxResult =
5822	MiddleBuilder.createSelect(Cond: Cmp, TrueVal: ReducedIV, FalseVal: StartVPV, DL: ExitDL);
5823	StartVPV = Sentinel;
5824	} else {
5825	// Introduce a boolean AnyOf reduction to track if the condition was ever
5826	// true in the loop. Use it to select the initial start value, if it was
5827	// never true.
5828	auto AnyOfPhi = new* VPReductionPHIRecipe (
5829	/Phi=/nullptr, RecurKind::Or, Plan.getFalse(), Plan.getFalse(),
5830	RdxUnordered{.VFScaleFactor: `1`}, {}, /HasUsesOutsideReductionChain=/false);
5831	AnyOfPhi->insertAfter(InsertPos: PhiR);
5832
5833	VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
5834	VPValue *AnyOfCond = Cond;
5835	if (TrueVal == PhiR)
5836	AnyOfCond = LoopBuilder.createNot(Operand: Cond);
5837	VPValue *OrVal = LoopBuilder.createOr(LHS: AnyOfPhi, RHS: AnyOfCond);
5838	AnyOfPhi->setOperand(I: `1`, New: OrVal);
5839
5840	NewRdxResult =
5841	MiddleBuilder.createNaryOp(Opcode: VPInstruction::ComputeAnyOfResult,
5842	Operands: {StartVPV, ReducedIV, OrVal}, Flags: {}, DL: ExitDL);
5843
5844	// Initialize the IV reduction phi with the neutral element, not the
5845	// original start value, to ensure correct min/max reduction results.
5846	StartVPV = Plan.getOrAddLiveIn(
5847	V: getRecurrenceIdentity(K: MinMaxKind, Tp: IVSCEV->getType(), FMF: {}));
5848	}
5849	RdxResult->replaceAllUsesWith(New: NewRdxResult);
5850	RdxResult->eraseFromParent();
5851
5852	auto NewPhiR = new* VPReductionPHIRecipe (
5853	cast<PHINode>(Val: PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
5854	*CondSelect, RdxUnordered{.VFScaleFactor: `1`}, {}, PhiR->hasUsesOutsideReductionChain());
5855	NewPhiR->insertBefore(InsertPos: PhiR);
5856	PhiR->replaceAllUsesWith(New: NewPhiR);
5857	PhiR->eraseFromParent();
5858	}
5859	}
5860
5861	namespace {
5862
5863	/// Holds the binary operation used to compute the extended operand and the
5864	/// casts that feed into it.
5865	struct ExtendedReductionOperand {
5866	VPWidenRecipe BinOp = nullptr*;
5867	// Note: The second cast recipe may be null.
5868	std::array<VPWidenCastRecipe *, `2`> CastRecipes = {};
5869	};
5870
/// A chain of recipes that form a partial reduction. Matches either
/// reduction_bin_op (extend (A), accumulator), or
/// reduction_bin_op (bin_op (extend (A), (extend (B))), accumulator).
struct VPPartialReductionChain {
  /// The top-level binary operation that forms the reduction to a scalar
  /// after the loop body.
  VPWidenRecipe *ReductionBinOp;
  /// The user of the extends that is then reduced.
  ExtendedReductionOperand ExtendedOp;
  /// Scale factor of the chain; passed as the VFScaleFactor of the
  /// VPReductionRecipe created when the chain is transformed.
  unsigned ScaleFactor;
  /// The recurrence kind for the entire partial reduction chain.
  /// This allows distinguishing between Sub and AddWithSub recurrences,
  /// when the ReductionBinOp is an Instruction::Sub.
  RecurKind RK;
};
5886
5887	static VPSingleDefRecipe *
5888	optimizeExtendsForPartialReduction(VPSingleDefRecipe *BinOp,
5889	VPTypeAnalysis &TypeInfo) {
5890	// reduce.add(mul(ext(A), C))
5891	// -> reduce.add(mul(ext(A), ext(trunc(C))))
5892	const APInt *Const;
5893	if (match(R: BinOp, P: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()), Op1: m_APInt(C&: Const)))) {
5894	auto *ExtA = cast<VPWidenCastRecipe>(Val: BinOp->getOperand(N: `0`));
5895	Instruction::CastOps ExtOpc = ExtA->getOpcode();
5896	Type *NarrowTy = TypeInfo.inferScalarType(V: ExtA->getOperand(N: `0`));
5897	if (!BinOp->hasOneUse() \|\|
5898	!llvm::canConstantBeExtended(
5899	C: Const, NarrowType: NarrowTy, ExtKind: TTI::getPartialReductionExtendKind(CastOpc: ExtOpc)))
5900	return BinOp;
5901
5902	VPBuilder Builder(BinOp);
5903	auto *Trunc = Builder.createWidenCast(Opcode: Instruction::CastOps::Trunc,
5904	Op: BinOp->getOperand(N: `1`), ResultTy: NarrowTy);
5905	Type *WideTy = TypeInfo.inferScalarType(V: ExtA);
5906	BinOp->setOperand(I: `1`, New: Builder.createWidenCast(Opcode: ExtOpc, Op: Trunc, ResultTy: WideTy));
5907	return BinOp;
5908	}
5909
5910	// reduce.add(ext(mul(ext(A), ext(B))))
5911	// -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5912	if (match(R: BinOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()),
5913	Op1: m_ZExtOrSExt(Op0: m_VPValue()))))) {
5914	auto *Ext = cast<VPWidenCastRecipe>(Val: BinOp);
5915	auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: `0`));
5916	auto *MulLHS = cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: `0`));
5917	auto *MulRHS = cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: `1`));
5918	if (!Mul->hasOneUse() \|\|
5919	(Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) \|\|
5920	MulLHS->getOpcode() != MulRHS->getOpcode())
5921	return BinOp;
5922	VPBuilder Builder(Mul);
5923	Mul->setOperand(I: `0`, New: Builder.createWidenCast(Opcode: MulLHS->getOpcode(),
5924	Op: MulLHS->getOperand(N: `0`),
5925	ResultTy: Ext->getResultType()));
5926	Mul->setOperand(I: `1`, New: MulLHS == MulRHS
5927	? Mul->getOperand(N: `0`)
5928	: Builder.createWidenCast(Opcode: MulRHS->getOpcode(),
5929	Op: MulRHS->getOperand(N: `0`),
5930	ResultTy: Ext->getResultType()));
5931	return Mul;
5932	}
5933
5934	return BinOp;
5935	}
5936
5937	// Helper to transform a partial reduction chain into a partial reduction
5938	// recipe. Assumes profitability has been checked.
5939	static void transformToPartialReduction(const VPPartialReductionChain &Chain,
5940	VPTypeAnalysis &TypeInfo, VPlan &Plan,
5941	VPReductionPHIRecipe *RdxPhi) {
5942	VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
5943	assert(WidenRecipe->getNumOperands() == `2` && "Expected binary operation");
5944
5945	VPValue *BinOpVal = WidenRecipe->getOperand(N: `0`);
5946	VPValue *Accumulator = WidenRecipe->getOperand(N: `1`);
5947
5948	// Swap if needed to ensure Accumulator is the PHI or partial reduction.
5949	if (isa<VPReductionPHIRecipe, VPReductionRecipe>(Val: BinOpVal) \|\|
5950	isa<VPExpressionRecipe>(Val: BinOpVal))
5951	std::swap(a&: BinOpVal, b&: Accumulator);
5952	auto *BinOp = cast<VPSingleDefRecipe>(Val: BinOpVal->getDefiningRecipe());
5953
5954	// Sub-reductions can be implemented in two ways:
5955	// (1) negate the operand in the vector loop (the default way).
5956	// (2) subtract the reduced value from the init value in the middle block.
5957	// Both ways keep the reduction itself as an 'add' reduction.
5958	//
5959	// The ISD nodes for partial reductions don't support folding the
5960	// sub/negation into its operands because the following is not a valid
5961	// transformation:
5962	// sub(0, mul(ext(a), ext(b)))
5963	// -> mul(ext(a), ext(sub(0, b)))
5964	//
5965	// It's therefore better to choose option (2) such that the partial
5966	// reduction is always positive (starting at '0') and to do a final
5967	// subtract in the middle block.
5968	if (WidenRecipe->getOpcode() == Instruction::Sub &&
5969	Chain.RK != RecurKind::Sub) {
5970	VPBuilder Builder(WidenRecipe);
5971	Type *ElemTy = TypeInfo.inferScalarType(V: BinOp);
5972	auto *Zero = Plan.getZero(Ty: ElemTy);
5973	VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
5974	? VPIRFlags (*WidenRecipe->getUnderlyingInstr())
5975	: VPIRFlags ();
5976	auto NegRecipe = new* VPWidenRecipe (Instruction::Sub, {Zero, BinOp}, Flags,
5977	VPIRMetadata (), DebugLoc::getUnknown());
5978	Builder.insert(R: NegRecipe);
5979	BinOp = NegRecipe;
5980	}
5981
5982	// FIXME: Do these transforms before invoking the cost-model.
5983	BinOp = optimizeExtendsForPartialReduction(BinOp, TypeInfo);
5984
5985	// Check if WidenRecipe is the final result of the reduction. If so look
5986	// through selects for predicated reductions.
5987	VPValue Cond = nullptr*;
5988	VPValue *ExitValue = cast_or_null<VPInstruction>(Val: vputils::findUserOf(
5989	V: WidenRecipe,
5990	P: m_Select(Op0: m_VPValue(V&: Cond), Op1: m_Specific(VPV: WidenRecipe), Op2: m_Specific(VPV: RdxPhi))));
5991	bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe \|\|
5992	RdxPhi->getBackedgeValue() == ExitValue;
5993	assert((!ExitValue \|\| IsLastInChain) &&
5994	"if we found ExitValue, it must match RdxPhi's backedge value");
5995
5996	Type *PhiType = TypeInfo.inferScalarType(V: RdxPhi);
5997	RecurKind RdxKind =
5998	PhiType->isFloatingPointTy() ? RecurKind::FAdd : RecurKind::Add;
5999	auto PartialRed = new* VPReductionRecipe (
6000	RdxKind,
6001	RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6002	: FastMathFlags (),
6003	WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
6004	RdxUnordered{/VFScaleFactor=/Chain.ScaleFactor});
6005	PartialRed->insertBefore(InsertPos: WidenRecipe);
6006
6007	if (Cond)
6008	ExitValue->replaceAllUsesWith(New: PartialRed);
6009	WidenRecipe->replaceAllUsesWith(New: PartialRed);
6010
6011	// We only need to update the PHI node once, which is when we find the
6012	// last reduction in the chain.
6013	if (!IsLastInChain)
6014	return;
6015
6016	// Scale the PHI and ReductionStartVector by the VFScaleFactor
6017	assert(RdxPhi->getVFScaleFactor() == `1` && "scale factor must not be set");
6018	RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6019
6020	auto *StartInst = cast<VPInstruction>(Val: RdxPhi->getStartValue());
6021	assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6022	auto *NewScaleFactor = Plan.getConstantInt(BitWidth: `32`, Val: Chain.ScaleFactor);
6023	StartInst->setOperand(I: `2`, New: NewScaleFactor);
6024
6025	// If this is the last value in a sub-reduction chain, then update the PHI
6026	// node to start at `0` and update the reduction-result to subtract from
6027	// the PHI's start value.
6028	if (Chain.RK != RecurKind::Sub)
6029	return;
6030
6031	VPValue *OldStartValue = StartInst->getOperand(N: `0`);
6032	StartInst->setOperand(I: `0`, New: StartInst->getOperand(N: `1`));
6033
6034	// Replace reduction_result by 'sub (startval, reductionresult)'.
6035	VPInstruction *RdxResult = vputils::findComputeReductionResult(PhiR: RdxPhi);
6036	assert(RdxResult && "Could not find reduction result");
6037
6038	VPBuilder Builder = VPBuilder::getToInsertAfter(R: RdxResult);
6039	constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
6040	VPInstruction *NewResult = Builder.createNaryOp(
6041	Opcode: SubOpc, Operands: {OldStartValue, RdxResult}, Flags: VPIRFlags::getDefaultFlags(Opcode: SubOpc),
6042	DL: RdxPhi->getDebugLoc());
6043	RdxResult->replaceUsesWithIf(
6044	New: NewResult,
6045	ShouldReplace: [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6046	}
6047
6048	/// Check if a partial reduction chain is is supported by the target (i.e. does
6049	/// not have an invalid cost) for the given VF range. Clamps the range and
6050	/// returns true if profitable for any VF.
6051	static bool isValidPartialReduction(const VPPartialReductionChain &Chain,
6052	Type *PhiType, VPCostContext &CostCtx,
6053	VFRange &Range) {
6054	auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext)
6055	-> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> {
6056	if (!Ext)
6057	return {nullptr, TargetTransformInfo::PR_None};
6058	Type *ExtOpType = CostCtx.Types.inferScalarType(V: Ext->getOperand(N: `0`));
6059	auto ExtKind = TargetTransformInfo::getPartialReductionExtendKind(
6060	CastOpc: static_cast<Instruction::CastOps>(Ext->getOpcode()));
6061	return {ExtOpType, ExtKind};
6062	};
6063	ExtendedReductionOperand ExtendedOp = Chain.ExtendedOp;
6064	VPWidenCastRecipe *ExtendA = ExtendedOp.CastRecipes [`0`];
6065	VPWidenCastRecipe *ExtendB = ExtendedOp.CastRecipes [`1`];
6066
6067	Type ExtOpTypeA, ExtOpTypeB;
6068	TargetTransformInfo::PartialReductionExtendKind ExtKindA, ExtKindB;
6069	std::tie(args&: ExtOpTypeA, args&: ExtKindA) = GetExtInfo (ExtendA);
6070	std::tie(args&: ExtOpTypeB, args&: ExtKindB) = GetExtInfo (ExtendB);
6071
6072	// If ExtendB is nullptr but there's a separate BinOp, the second operand
6073	// was a constant that can use the same extend kind as the first.
6074	if (!ExtendB && ExtendedOp.BinOp &&
6075	ExtendedOp.BinOp != Chain.ReductionBinOp) {
6076	const APInt Const = nullptr*;
6077	for (VPValue *Op : ExtendedOp.BinOp->operands()) {
6078	if (match(V: Op, P: m_APInt(C&: Const)))
6079	break;
6080	}
6081	if (!Const \|\| !canConstantBeExtended(C: Const, NarrowType: ExtOpTypeA, ExtKind: ExtKindA))
6082	return false;
6083	ExtOpTypeB = ExtOpTypeA;
6084	ExtKindB = ExtKindA;
6085	}
6086
6087	std::optional<unsigned> BinOpc;
6088	if (ExtendedOp.BinOp && ExtendedOp.BinOp != Chain.ReductionBinOp)
6089	BinOpc = ExtendedOp.BinOp->getOpcode();
6090
6091	VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6092	return LoopVectorizationPlanner::getDecisionAndClampRange(
6093	Predicate: [&](ElementCount VF) {
6094	return CostCtx.TTI
6095	.getPartialReductionCost(
6096	Opcode: WidenRecipe->getOpcode(), InputTypeA: ExtOpTypeA, InputTypeB: ExtOpTypeB, AccumType: PhiType, VF,
6097	OpAExtend: ExtKindA, OpBExtend: ExtKindB, BinOp: BinOpc, CostKind: CostCtx.CostKind,
6098	FMF: PhiType->isFloatingPointTy()
6099	? std::optional{WidenRecipe->getFastMathFlags()}
6100	: std::nullopt)
6101	.isValid();
6102	},
6103	Range);
6104	}
6105
6106	/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6107	/// operand. This is an operand where the source of the value (e.g. a load) has
6108	/// been extended (sext, zext, or fpext) before it is used in the reduction.
6109	///
6110	/// Possible forms matched by this function:
6111	/// - UpdateR(PrevValue, ext(...))
6112	/// - UpdateR(PrevValue, BinOp(ext(...), ext(...)))
6113	/// - UpdateR(PrevValue, BinOp(ext(...), Constant))
6114	/// - UpdateR(PrevValue, neg(BinOp(ext(...), ext(...))))
6115	/// - UpdateR(PrevValue, neg(BinOp(ext(...), Constant)))
6116	/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6117	/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6118	///
6119	/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
6120	static std::optional<ExtendedReductionOperand>
6121	matchExtendedReductionOperand(VPWidenRecipe UpdateR, VPValue Op) {
6122	assert(is_contained(UpdateR->operands(), Op) &&
6123	"Op should be operand of UpdateR");
6124
6125	// If Op is an extend, then it's still a valid partial reduction if the
6126	// extended mul fulfills the other requirements.
6127	// For example, reduce.add(ext(mul(ext(A), ext(B)))) is still a valid partial
6128	// reduction since the inner extends will be widened. We already have oneUse
6129	// checks on the inner extends so widening them is safe.
6130	std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6131	if (match(V: Op, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_VPValue(), Op1: m_VPValue()))) \|\|
6132	match(V: Op, P: m_FPExt(Op0: m_FMul(Op0: m_VPValue(), Op1: m_VPValue())))) {
6133	auto *CastRecipe = dyn_cast<VPWidenCastRecipe>(Val: Op);
6134	if (!CastRecipe)
6135	return std::nullopt;
6136	auto CastOp = static_cast<Instruction::CastOps>(CastRecipe->getOpcode());
6137	OuterExtKind = TTI::getPartialReductionExtendKind(CastOpc: CastOp);
6138	Op = CastRecipe->getOperand(N: `0`);
6139	}
6140
6141	// If the update is a binary op, check both of its operands to see if
6142	// they are extends. Otherwise, see if the update comes directly from an
6143	// extend.
6144	std::array<VPWidenCastRecipe *, `2`> CastRecipes = {};
6145
6146	// Match extends and populate CastRecipes. Returns false if matching fails.
6147	auto MatchExtends = [OuterExtKind,
6148	&CastRecipes](ArrayRef<VPValue *> Operands) {
6149	assert(Operands.size() <= `2` && "expected at most 2 operands");
6150
6151	for (const auto &[I, OpVal] : enumerate(First&: Operands)) {
6152	// Allow constant as second operand - validation happens in
6153	// isValidPartialReduction.
6154	const APInt *Unused;
6155	if (I > `0` && CastRecipes [`0`] && match(V: OpVal, P: m_APInt(C&: Unused)))
6156	continue;
6157
6158	VPValue *ExtInput;
6159	if (!match(V: OpVal, P: m_ZExtOrSExt(Op0: m_VPValue(V&: ExtInput))) &&
6160	!match(V: OpVal, P: m_FPExt(Op0: m_VPValue(V&: ExtInput))))
6161	return false;
6162
6163	CastRecipes [I] = dyn_cast<VPWidenCastRecipe>(Val: OpVal);
6164	if (!CastRecipes [I])
6165	return false;
6166
6167	// The outer extend kind must match the inner extends for folding.
6168	if (OuterExtKind) {
6169	auto CastOp =
6170	static_cast<Instruction::CastOps>(CastRecipes [I]->getOpcode());
6171	if (*OuterExtKind != TTI::getPartialReductionExtendKind(CastOpc: CastOp))
6172	return false;
6173	}
6174	}
6175	return CastRecipes [`0`] != nullptr;
6176	};
6177
6178	// If Op is a binary operator, check both of its operands to see if they are
6179	// extends. Otherwise, see if the update comes directly from an extend.
6180	auto *BinOp = dyn_cast<VPWidenRecipe>(Val: Op);
6181	if (BinOp && Instruction::isBinaryOp(Opcode: BinOp->getOpcode())) {
6182	if (!BinOp->hasOneUse())
6183	return std::nullopt;
6184
6185	// Handle neg(binop(ext, ext)) pattern.
6186	VPValue OtherOp = nullptr*;
6187	if (match(V: BinOp, P: m_Sub(Op0: m_ZeroInt(), Op1: m_VPValue(V&: OtherOp))))
6188	BinOp = dyn_cast<VPWidenRecipe>(Val: OtherOp);
6189
6190	if (!BinOp \|\| !Instruction::isBinaryOp(Opcode: BinOp->getOpcode()) \|\|
6191	!MatchExtends (BinOp->operands()))
6192	return std::nullopt;
6193	} else if (match(V: UpdateR, P: m_Add(Op0: m_VPValue(), Op1: m_VPValue())) \|\|
6194	match(V: UpdateR, P: m_FAdd(Op0: m_VPValue(), Op1: m_VPValue()))) {
6195	// We already know Op is an operand of UpdateR.
6196	if (!MatchExtends ({Op}))
6197	return std::nullopt;
6198	BinOp = UpdateR;
6199	} else {
6200	return std::nullopt;
6201	}
6202
6203	return ExtendedReductionOperand{.BinOp: BinOp, .CastRecipes: CastRecipes};
6204	}
6205
6206	/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6207	/// and determines if the target can use a cheaper operation with a wider
6208	/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6209	/// of operations in the reduction.
6210	static std::optional<SmallVector<VPPartialReductionChain>>
6211	getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6212	VFRange &Range) {
6213	// Get the backedge value from the reduction PHI and find the
6214	// ComputeReductionResult that uses it (directly or through a select for
6215	// predicated reductions).
6216	auto *RdxResult = vputils::findComputeReductionResult(PhiR: RedPhiR);
6217	if (!RdxResult)
6218	return std::nullopt;
6219	VPValue *ExitValue = RdxResult->getOperand(N: `0`);
6220	match(V: ExitValue, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: ExitValue), Op2: m_VPValue()));
6221
6222	SmallVector<VPPartialReductionChain> Chains;
6223	RecurKind RK = RedPhiR->getRecurrenceKind();
6224	Type *PhiType = CostCtx.Types.inferScalarType(V: RedPhiR);
6225	TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6226
6227	// Work backwards from the ExitValue examining each reduction operation.
6228	VPValue *CurrentValue = ExitValue;
6229	while (CurrentValue != RedPhiR) {
6230	auto *UpdateR = dyn_cast<VPWidenRecipe>(Val: CurrentValue);
6231	if (!UpdateR \|\| !Instruction::isBinaryOp(Opcode: UpdateR->getOpcode()))
6232	return std::nullopt;
6233
6234	VPValue *Op = UpdateR->getOperand(N: `1`);
6235	VPValue *PrevValue = UpdateR->getOperand(N: `0`);
6236
6237	// Find the extended operand. The other operand (PrevValue) is the next link
6238	// in the reduction chain.
6239	std::optional<ExtendedReductionOperand> ExtendedOp =
6240	matchExtendedReductionOperand(UpdateR, Op);
6241	if (!ExtendedOp) {
6242	ExtendedOp = matchExtendedReductionOperand(UpdateR, Op: PrevValue);
6243	if (!ExtendedOp)
6244	return std::nullopt;
6245	std::swap(a&: Op, b&: PrevValue);
6246	}
6247
6248	Type *ExtSrcType = CostCtx.Types.inferScalarType(
6249	V: ExtendedOp ->CastRecipes [`0`]->getOperand(N: `0`));
6250	TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6251	if (!PHISize.hasKnownScalarFactor(RHS: ExtSrcSize))
6252	return std::nullopt;
6253
6254	VPPartialReductionChain Chain(
6255	{.ReductionBinOp: UpdateR, .ExtendedOp: *ExtendedOp,
6256	.ScaleFactor: static_cast<unsigned>(PHISize.getKnownScalarFactor(RHS: ExtSrcSize)), .RK: RK});
6257	if (!isValidPartialReduction(Chain, PhiType, CostCtx, Range))
6258	return std::nullopt;
6259
6260	Chains.push_back(Elt: Chain);
6261	CurrentValue = PrevValue;
6262	}
6263
6264	// The chains were collected by traversing backwards from the exit value.
6265	// Reverse the chains so they are in program order.
6266	std::reverse(first: Chains.begin(), last: Chains.end());
6267	return Chains;
6268	}
6269	} // namespace
6270
6271	void VPlanTransforms::createPartialReductions(VPlan &Plan,
6272	VPCostContext &CostCtx,
6273	VFRange &Range) {
6274	// Find all possible valid partial reductions, grouping chains by their PHI.
6275	// This grouping allows invalidating the whole chain, if any link is not a
6276	// valid partial reduction.
6277	MapVector<VPReductionPHIRecipe *, SmallVector<VPPartialReductionChain>>
6278	ChainsByPhi;
6279	VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6280	for (VPRecipeBase &R : HeaderVPBB->phis()) {
6281	auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
6282	if (!RedPhiR)
6283	continue;
6284
6285	if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6286	ChainsByPhi.try_emplace(Key: RedPhiR, Args: std::move(*Chains));
6287	}
6288
6289	if (ChainsByPhi.empty())
6290	return;
6291
6292	// Build set of partial reduction operations for extend user validation and
6293	// a map of reduction bin ops to their scale factors for scale validation.
6294	SmallPtrSet<VPRecipeBase *, `4`> PartialReductionOps;
6295	DenseMap<VPSingleDefRecipe , unsigned*> ScaledReductionMap;
6296	for (const auto &[_, Chains] : ChainsByPhi)
6297	for (const VPPartialReductionChain &Chain : Chains) {
6298	PartialReductionOps.insert(Ptr: Chain.ExtendedOp.BinOp);
6299	ScaledReductionMap [Chain.ReductionBinOp] = Chain.ScaleFactor;
6300	}
6301
6302	// A partial reduction is invalid if any of its extends are used by
6303	// something that isn't another partial reduction. This is because the
6304	// extends are intended to be lowered along with the reduction itself.
6305	auto ExtendUsersValid = [&](VPWidenCastRecipe *Ext) {
6306	return !Ext \|\| all_of(Range: Ext->users(), P: [&](VPUser *U) {
6307	return PartialReductionOps.contains(Ptr: cast<VPRecipeBase>(Val: U));
6308	});
6309	};
6310
6311	// Validate chains: check that extends are only used by partial reductions,
6312	// and that reduction bin ops are only used by other partial reductions with
6313	// matching scale factors, are outside the loop region or the select
6314	// introduced by tail-folding. Otherwise we would create users of scaled
6315	// reductions where the types of the other operands don't match.
6316	for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6317	for (const VPPartialReductionChain &Chain : Chains) {
6318	if (!all_of(Range: Chain.ExtendedOp.CastRecipes, P: ExtendUsersValid)) {
6319	Chains.clear();
6320	break;
6321	}
6322	auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6323	if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: U))
6324	return PhiR == RedPhiR;
6325	auto *R = cast<VPSingleDefRecipe>(Val: U);
6326	return Chain.ScaleFactor == ScaledReductionMap.lookup_or(Val: R, Default: `0`) \|\|
6327	match(R, P: m_ComputeReductionResult(
6328	Op0: m_Specific(VPV: Chain.ReductionBinOp))) \|\|
6329	match(R, P: m_Select(Op0: m_VPValue(), Op1: m_Specific(VPV: Chain.ReductionBinOp),
6330	Op2: m_Specific(VPV: RedPhiR)));
6331	};
6332	if (!all_of(Range: Chain.ReductionBinOp->users(), P: UseIsValid)) {
6333	Chains.clear();
6334	break;
6335	}
6336
6337	// Check if the compute-reduction-result is used by a sunk store.
6338	// TODO: Also form partial reductions in those cases.
6339	if (auto *RdxResult = vputils::findComputeReductionResult(PhiR: RedPhiR)) {
6340	if (any_of(Range: RdxResult->users(), P: [](VPUser *U) {
6341	auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U);
6342	return RepR && isa<StoreInst>(Val: RepR->getUnderlyingInstr());
6343	})) {
6344	Chains.clear();
6345	break;
6346	}
6347	}
6348	}
6349	}
6350
6351	for (auto &[Phi, Chains] : ChainsByPhi)
6352	for (const VPPartialReductionChain &Chain : Chains)
6353	transformToPartialReduction(Chain, TypeInfo&: CostCtx.Types, Plan, RdxPhi: Phi);
6354	}
6355

Browse the source code of llvm_projects/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp