1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
14#include "LoopVectorizationPlanner.h"
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/SmallVector.h"
22#include "llvm/ADT/Twine.h"
23#include "llvm/Analysis/AssumptionCache.h"
24#include "llvm/Analysis/IVDescriptors.h"
25#include "llvm/Analysis/LoopInfo.h"
26#include "llvm/Analysis/ScalarEvolutionExpressions.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
30#include "llvm/IR/Instructions.h"
31#include "llvm/IR/Intrinsics.h"
32#include "llvm/IR/Type.h"
33#include "llvm/IR/Value.h"
34#include "llvm/Support/Casting.h"
35#include "llvm/Support/CommandLine.h"
36#include "llvm/Support/Debug.h"
37#include "llvm/Support/raw_ostream.h"
38#include "llvm/Transforms/Utils/BasicBlockUtils.h"
39#include "llvm/Transforms/Utils/LoopUtils.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
45using VectorParts = SmallVector<Value *, 2>;
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
bool VPRecipeBase::mayWriteToMemory() const {
  // Conservatively determine whether executing this recipe may write to
  // memory, dispatching on the recipe kind. Unknown recipe kinds fall through
  // to the default case and are treated as possibly writing.
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    // Expression recipes only expose a combined read-or-write query.
    return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
  case VPInstructionSC: {
    auto *VPI = cast<VPInstruction>(Val: this);
    // Loads read from memory but don't write to memory.
    if (VPI->getOpcode() == Instruction::Load)
      return false;
    return VPI->opcodeMayReadOrWriteFromMemory();
  }
  case VPInterleaveEVLSC:
  case VPInterleaveSC:
    // An interleave group writes iff it contains at least one store member.
    return cast<VPInterleaveBase>(Val: this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    // Replicated recipes behave like their underlying IR instruction.
    return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    // A widened call may write unless the callee is known read-only.
    return !cast<VPWidenCallRecipe>(Val: this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory();
  // The recipes below never write to memory.
  case VPActiveLaneMaskPHISC:
  case VPCanonicalIVPHISC:
  case VPCurrentIterationPHISC:
  case VPBranchOnMaskSC:
  case VPDerivedIVSC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    // These recipes are known not to write; assert that any underlying IR
    // instruction agrees with that classification.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}
110
bool VPRecipeBase::mayReadFromMemory() const {
  // Conservatively determine whether executing this recipe may read from
  // memory, dispatching on the recipe kind. Unknown recipe kinds fall through
  // to the default case and are treated as possibly reading.
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    // Expression recipes only expose a combined read-or-write query.
    return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
  case VPInstructionSC:
    return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    // Replicated recipes behave like their underlying IR instruction.
    return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    // A widened call may read unless the callee is known write-only.
    return !cast<VPWidenCallRecipe>(Val: this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory();
  // The recipes below never read from memory (stores only write).
  case VPCanonicalIVPHISC:
  case VPBranchOnMaskSC:
  case VPDerivedIVSC:
  case VPCurrentIterationPHISC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    // These recipes are known not to read; assert that any underlying IR
    // instruction agrees with that classification.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    // FIXME: Return false if the recipe represents an interleaved store.
    return true;
  }
}
163
164bool VPRecipeBase::mayHaveSideEffects() const {
165 switch (getVPRecipeID()) {
166 case VPExpressionSC:
167 return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects();
168 case VPActiveLaneMaskPHISC:
169 case VPDerivedIVSC:
170 case VPCurrentIterationPHISC:
171 case VPFirstOrderRecurrencePHISC:
172 case VPReductionPHISC:
173 case VPPredInstPHISC:
174 case VPVectorEndPointerSC:
175 return false;
176 case VPInstructionSC: {
177 auto *VPI = cast<VPInstruction>(Val: this);
178 return mayWriteToMemory() ||
179 VPI->getOpcode() == VPInstruction::BranchOnCount ||
180 VPI->getOpcode() == VPInstruction::BranchOnCond ||
181 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
182 }
183 case VPWidenCallSC: {
184 Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction();
185 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
186 }
187 case VPWidenIntrinsicSC:
188 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects();
189 case VPBlendSC:
190 case VPReductionEVLSC:
191 case VPReductionSC:
192 case VPScalarIVStepsSC:
193 case VPVectorPointerSC:
194 case VPWidenCanonicalIVSC:
195 case VPWidenCastSC:
196 case VPWidenGEPSC:
197 case VPWidenIntOrFpInductionSC:
198 case VPWidenPHISC:
199 case VPWidenPointerInductionSC:
200 case VPWidenSC: {
201 const Instruction *I =
202 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
203 (void)I;
204 assert((!I || !I->mayHaveSideEffects()) &&
205 "underlying instruction has side-effects");
206 return false;
207 }
208 case VPInterleaveEVLSC:
209 case VPInterleaveSC:
210 return mayWriteToMemory();
211 case VPWidenLoadEVLSC:
212 case VPWidenLoadSC:
213 case VPWidenStoreEVLSC:
214 case VPWidenStoreSC:
215 assert(
216 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
217 mayWriteToMemory() &&
218 "mayHaveSideffects result for ingredient differs from this "
219 "implementation");
220 return mayWriteToMemory();
221 case VPReplicateSC: {
222 auto *R = cast<VPReplicateRecipe>(Val: this);
223 return R->getUnderlyingInstr()->mayHaveSideEffects();
224 }
225 default:
226 return true;
227 }
228}
229
230void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
231 assert(!Parent && "Recipe already in some VPBasicBlock");
232 assert(InsertPos->getParent() &&
233 "Insertion position not in any VPBasicBlock");
234 InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator());
235}
236
237void VPRecipeBase::insertBefore(VPBasicBlock &BB,
238 iplist<VPRecipeBase>::iterator I) {
239 assert(!Parent && "Recipe already in some VPBasicBlock");
240 assert(I == BB.end() || I->getParent() == &BB);
241 BB.insert(Recipe: this, InsertPt: I);
242}
243
244void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
245 assert(!Parent && "Recipe already in some VPBasicBlock");
246 assert(InsertPos->getParent() &&
247 "Insertion position not in any VPBasicBlock");
248 InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator()));
249}
250
251void VPRecipeBase::removeFromParent() {
252 assert(getParent() && "Recipe not in any VPBasicBlock");
253 getParent()->getRecipeList().remove(IT: getIterator());
254 Parent = nullptr;
255}
256
257iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
258 assert(getParent() && "Recipe not in any VPBasicBlock");
259 return getParent()->getRecipeList().erase(where: getIterator());
260}
261
262void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
263 removeFromParent();
264 insertAfter(InsertPos);
265}
266
267void VPRecipeBase::moveBefore(VPBasicBlock &BB,
268 iplist<VPRecipeBase>::iterator I) {
269 removeFromParent();
270 insertBefore(BB, I);
271}
272
273InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
274 // Get the underlying instruction for the recipe, if there is one. It is used
275 // to
276 // * decide if cost computation should be skipped for this recipe,
277 // * apply forced target instruction cost.
278 Instruction *UI = nullptr;
279 if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this))
280 UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue());
281 else if (auto *IG = dyn_cast<VPInterleaveBase>(Val: this))
282 UI = IG->getInsertPos();
283 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this))
284 UI = &WidenMem->getIngredient();
285
286 InstructionCost RecipeCost;
287 if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) {
288 RecipeCost = 0;
289 } else {
290 RecipeCost = computeCost(VF, Ctx);
291 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
292 RecipeCost.isValid()) {
293 if (UI)
294 RecipeCost = InstructionCost(ForceTargetInstructionCost);
295 else
296 RecipeCost = InstructionCost(0);
297 }
298 }
299
300 LLVM_DEBUG({
301 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
302 dump();
303 });
304 return RecipeCost;
305}
306
// Base implementation is never called directly; every concrete recipe that
// participates in cost modeling must override computeCost.
InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  llvm_unreachable("subclasses should implement computeCost");
}
311
312bool VPRecipeBase::isPhi() const {
313 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
314 isa<VPPhi, VPIRPhi>(Val: this);
315}
316
317bool VPRecipeBase::isScalarCast() const {
318 auto *VPI = dyn_cast<VPInstruction>(Val: this);
319 return VPI && Instruction::isCast(Opcode: VPI->getOpcode());
320}
321
void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
  // Intersect (logical AND) the poison-generating/FP flags of this recipe
  // with those of \p Other, keeping only flags present on both. Properties
  // that cannot be dropped (compare predicates, reduction kind/ordering) are
  // asserted to match instead of being intersected.
  assert(OpType == Other.OpType && "OpType must match");
  switch (OpType) {
  case OperationType::OverflowingBinOp:
    WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
    WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
    break;
  case OperationType::Trunc:
    TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
    TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
    break;
  case OperationType::DisjointOp:
    DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
    break;
  case OperationType::PossiblyExactOp:
    ExactFlags.IsExact &= Other.ExactFlags.IsExact;
    break;
  case OperationType::GEPOp:
    // GEP no-wrap flags are stored as a bit mask; intersect bitwise.
    GEPFlagsStorage &= Other.GEPFlagsStorage;
    break;
  case OperationType::FPMathOp:
  case OperationType::FCmp:
    // For FCmp the predicate itself must agree; only the NaN/Inf fast-math
    // flags participate in the intersection here.
    assert((OpType != OperationType::FCmp ||
            FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
           "Cannot drop CmpPredicate");
    getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
    getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
    break;
  case OperationType::NonNegOp:
    NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
    break;
  case OperationType::Cmp:
    // Integer compares carry no droppable flags; the predicate must match.
    assert(CmpPredStorage == Other.CmpPredStorage &&
           "Cannot drop CmpPredicate");
    break;
  case OperationType::ReductionOp:
    // Reduction kind and ordering are structural and must be identical; only
    // the NaN/Inf fast-math flags are intersected.
    assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
           "Cannot change RecurKind");
    assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
           "Cannot change IsOrdered");
    assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
           "Cannot change IsInLoop");
    getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
    getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
    break;
  case OperationType::Other:
    // No flags to intersect.
    break;
  }
}
371
372FastMathFlags VPIRFlags::getFastMathFlags() const {
373 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
374 OpType == OperationType::ReductionOp ||
375 OpType == OperationType::Other) &&
376 "recipe doesn't have fast math flags");
377 if (OpType == OperationType::Other)
378 return FastMathFlags();
379 const FastMathFlagsTy &F = getFMFsRef();
380 FastMathFlags Res;
381 Res.setAllowReassoc(F.AllowReassoc);
382 Res.setNoNaNs(F.NoNaNs);
383 Res.setNoInfs(F.NoInfs);
384 Res.setNoSignedZeros(F.NoSignedZeros);
385 Res.setAllowReciprocal(F.AllowReciprocal);
386 Res.setAllowContract(F.AllowContract);
387 Res.setApproxFunc(F.ApproxFunc);
388 return Res;
389}
390
391#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug dump; forwards to the VPRecipeBase implementation.
void VPSingleDefRecipe::dump() const { VPRecipeBase::dump(); }
393
394void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
395 VPSlotTracker &SlotTracker) const {
396 printRecipe(O, Indent, SlotTracker);
397 if (auto DL = getDebugLoc()) {
398 O << ", !dbg ";
399 DL.print(O);
400 }
401
402 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
403 Metadata->print(O, SlotTracker);
404}
405#endif
406
407template <unsigned PartOpIdx>
408VPValue *
409VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const {
410 if (U.getNumOperands() == PartOpIdx + 1)
411 return U.getOperand(N: PartOpIdx);
412 return nullptr;
413}
414
415template <unsigned PartOpIdx>
416unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const {
417 if (auto *UnrollPartOp = getUnrollPartOperand(U))
418 return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
419 return 0;
420}
421
namespace llvm {
// Explicitly instantiate the accessor for the unroll-part operand positions
// used by recipes in this file.
template class VPUnrollPartAccessor<1>;
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
} // namespace llvm
427
// Construct a VPInstruction with the given opcode, operands, IR flags,
// metadata, debug location and name, validating in assertion builds that the
// flags and operand count are consistent with the opcode.
VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                             const VPIRFlags &Flags, const VPIRMetadata &MD,
                             DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
      VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert(hasRequiredFlagsForOpcode(getOpcode()) &&
         "Opcode requires specific flags to be set");
  // -1u means the operand count cannot be derived from the opcode alone; a
  // masked instruction carries one extra (mask) operand.
  assert((getNumOperandsForOpcode() == -1u ||
          getNumOperandsForOpcode() == getNumOperands() ||
          (isMasked() && getNumOperandsForOpcode() + 1 == getNumOperands())) &&
         "number of operands does not match opcode");
}
442
// Return the number of operands determined by this VPInstruction's opcode,
// or -1u if the opcode admits a variable number of operands. Used by the
// constructor assertion to sanity-check operand counts.
unsigned VPInstruction::getNumOperandsForOpcode() const {
  if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
    return 1;

  if (Instruction::isBinaryOp(Opcode))
    return 2;

  switch (Opcode) {
  // Nullary opcodes.
  case VPInstruction::StepVector:
  case VPInstruction::VScale:
    return 0;
  // Unary opcodes.
  case Instruction::Alloca:
  case Instruction::ExtractValue:
  case Instruction::Freeze:
  case Instruction::Load:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::MaskedCond:
  case VPInstruction::Not:
  case VPInstruction::ResumeForEpilogue:
  case VPInstruction::Reverse:
  case VPInstruction::Unpack:
    return 1;
  // Binary opcodes.
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::ExtractElement:
  case Instruction::Store:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::PtrAdd:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::CalculateTripCountMinusVF:
    return 2;
  // Ternary opcodes.
  case Instruction::Select:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ReductionStartVector:
    return 3;
  case Instruction::Call: {
    // For unmasked calls, the last argument will the called function. Use that
    // to compute the number of operands without the mask.
    VPValue *LastOp = getOperand(N: getNumOperands() - 1);
    if (isa<VPIRValue>(Val: LastOp) && isa<Function>(Val: LastOp->getLiveInIRValue()))
      return getNumOperands();
    return getNumOperands() - 1;
  }
  // Variadic opcodes.
  case Instruction::GetElementPtr:
  case Instruction::PHI:
  case Instruction::Switch:
  case VPInstruction::AnyOf:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ComputeAnyOfResult:
  case VPInstruction::ComputeReductionResult:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::SLPLoad:
  case VPInstruction::SLPStore:
  case VPInstruction::ExtractLane:
  case VPInstruction::ExtractLastActive:
    // Cannot determine the number of operands from the opcode.
    return -1u;
  }
  llvm_unreachable("all cases should be handled above");
}
517
518bool VPInstruction::doesGeneratePerAllLanes() const {
519 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this);
520}
521
// Returns true if this VPInstruction is able to produce a scalar value for
// the first lane only, instead of generating a full vector.
bool VPInstruction::canGenerateScalarForFirstLane() const {
  // All binary ops and casts can be evaluated lane-wise.
  if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
    return true;
  // Recipes that are inherently scalar-producing trivially qualify.
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::Freeze:
  case Instruction::ICmp:
  case Instruction::PHI:
  case Instruction::Select:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::AnyOf:
  case VPInstruction::Not:
    return true;
  default:
    return false;
  }
}
546
547Value *VPInstruction::generate(VPTransformState &State) {
548 IRBuilderBase &Builder = State.Builder;
549
550 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
551 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
552 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
553 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
554 auto *Res =
555 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name);
556 if (auto *I = dyn_cast<Instruction>(Val: Res))
557 applyFlags(I&: *I);
558 return Res;
559 }
560
561 switch (getOpcode()) {
562 case VPInstruction::Not: {
563 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
564 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
565 return Builder.CreateNot(V: A, Name);
566 }
567 case Instruction::ExtractElement: {
568 assert(State.VF.isVector() && "Only extract elements from vectors");
569 if (auto *Idx = dyn_cast<VPConstantInt>(Val: getOperand(N: 1)))
570 return State.get(Def: getOperand(N: 0), Lane: VPLane(Idx->getZExtValue()));
571 Value *Vec = State.get(Def: getOperand(N: 0));
572 Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
573 return Builder.CreateExtractElement(Vec, Idx, Name);
574 }
575 case Instruction::Freeze: {
576 Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this));
577 return Builder.CreateFreeze(V: Op, Name);
578 }
579 case Instruction::FCmp:
580 case Instruction::ICmp: {
581 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
582 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
583 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
584 return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name);
585 }
586 case Instruction::PHI: {
587 llvm_unreachable("should be handled by VPPhi::execute");
588 }
589 case Instruction::Select: {
590 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
591 Value *Cond =
592 State.get(Def: getOperand(N: 0),
593 IsScalar: OnlyFirstLaneUsed || vputils::isSingleScalar(VPV: getOperand(N: 0)));
594 Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
595 Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed);
596 return Builder.CreateSelectFMF(C: Cond, True: Op1, False: Op2, FMFSource: getFastMathFlags(), Name);
597 }
598 case VPInstruction::ActiveLaneMask: {
599 // Get first lane of vector induction variable.
600 Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
601 // Get the original loop tripcount.
602 Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
603
604 // If this part of the active lane mask is scalar, generate the CMP directly
605 // to avoid unnecessary extracts.
606 if (State.VF.isScalar())
607 return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC,
608 Name);
609
610 ElementCount EC = State.VF.multiplyCoefficientBy(
611 RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue());
612 auto *PredTy = VectorType::get(ElementType: Builder.getInt1Ty(), EC);
613 return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask,
614 Types: {PredTy, ScalarTC->getType()},
615 Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name);
616 }
617 case VPInstruction::FirstOrderRecurrenceSplice: {
618 // Generate code to combine the previous and current values in vector v3.
619 //
620 // vector.ph:
621 // v_init = vector(..., ..., ..., a[-1])
622 // br vector.body
623 //
624 // vector.body
625 // i = phi [0, vector.ph], [i+4, vector.body]
626 // v1 = phi [v_init, vector.ph], [v2, vector.body]
627 // v2 = a[i, i+1, i+2, i+3];
628 // v3 = vector(v1(3), v2(0, 1, 2))
629
630 auto *V1 = State.get(Def: getOperand(N: 0));
631 if (!V1->getType()->isVectorTy())
632 return V1;
633 Value *V2 = State.get(Def: getOperand(N: 1));
634 return Builder.CreateVectorSpliceRight(V1, V2, Offset: 1, Name);
635 }
636 case VPInstruction::CalculateTripCountMinusVF: {
637 Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
638 Value *VFxUF = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
639 Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: VFxUF);
640 Value *Cmp =
641 Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: VFxUF);
642 Value *Zero = ConstantInt::getNullValue(Ty: ScalarTC->getType());
643 return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero);
644 }
645 case VPInstruction::ExplicitVectorLength: {
646 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
647 // be outside of the main loop.
648 Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
649 // Compute EVL
650 assert(AVL->getType()->isIntegerTy() &&
651 "Requested vector length should be an integer.");
652
653 assert(State.VF.isScalable() && "Expected scalable vector factor.");
654 Value *VFArg = Builder.getInt32(C: State.VF.getKnownMinValue());
655
656 Value *EVL = Builder.CreateIntrinsic(
657 RetTy: Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length,
658 Args: {AVL, VFArg, Builder.getTrue()});
659 return EVL;
660 }
661 case VPInstruction::CanonicalIVIncrementForPart: {
662 auto *IV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
663 auto *VFxPart = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
664 // The canonical IV is incremented by the vectorization factor (num of
665 // SIMD elements) times the unroll part.
666 return Builder.CreateAdd(LHS: IV, RHS: VFxPart, Name, HasNUW: hasNoUnsignedWrap(),
667 HasNSW: hasNoSignedWrap());
668 }
669 case VPInstruction::BranchOnCond: {
670 Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
671 // Replace the temporary unreachable terminator with a new conditional
672 // branch, hooking it up to backward destination for latch blocks now, and
673 // to forward destination(s) later when they are created.
674 // Second successor may be backwards - iff it is already in VPBB2IRBB.
675 VPBasicBlock *SecondVPSucc =
676 cast<VPBasicBlock>(Val: getParent()->getSuccessors()[1]);
677 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc);
678 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
679 auto *Br = Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc);
680 // First successor is always forward, reset it to nullptr.
681 Br->setSuccessor(idx: 0, NewSucc: nullptr);
682 IRBB->getTerminator()->eraseFromParent();
683 applyMetadata(I&: *Br);
684 return Br;
685 }
686 case VPInstruction::Broadcast: {
687 return Builder.CreateVectorSplat(
688 EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast");
689 }
690 case VPInstruction::BuildStructVector: {
691 // For struct types, we need to build a new 'wide' struct type, where each
692 // element is widened, i.e., we create a struct of vectors.
693 auto *StructTy =
694 cast<StructType>(Val: State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)));
695 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF));
696 for (const auto &[LaneIndex, Op] : enumerate(First: operands())) {
697 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
698 FieldIndex++) {
699 Value *ScalarValue =
700 Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex);
701 Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex);
702 VectorValue =
703 Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex);
704 Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex);
705 }
706 }
707 return Res;
708 }
709 case VPInstruction::BuildVector: {
710 auto *ScalarTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
711 auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands());
712 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements));
713 for (const auto &[Idx, Op] : enumerate(First: operands()))
714 Res = Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true),
715 Idx: Builder.getInt32(C: Idx));
716 return Res;
717 }
718 case VPInstruction::ReductionStartVector: {
719 if (State.VF.isScalar())
720 return State.get(Def: getOperand(N: 0), IsScalar: true);
721 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
722 Builder.setFastMathFlags(getFastMathFlags());
723 // If this start vector is scaled then it should produce a vector with fewer
724 // elements than the VF.
725 ElementCount VF = State.VF.divideCoefficientBy(
726 RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue());
727 auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true));
728 return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true),
729 Idx: Builder.getInt32(C: 0));
730 }
731 case VPInstruction::ComputeAnyOfResult: {
732 Value *Start = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
733 Value *NewVal = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
734 Value *ReducedResult = State.get(Def: getOperand(N: 2));
735 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
736 ReducedResult =
737 Builder.CreateBinOp(Opc: Instruction::Or, LHS: State.get(Def: getOperand(N: Idx)),
738 RHS: ReducedResult, Name: "bin.rdx");
739 // If any predicate is true it means that we want to select the new value.
740 if (ReducedResult->getType()->isVectorTy())
741 ReducedResult = Builder.CreateOrReduce(Src: ReducedResult);
742 // The compares in the loop may yield poison, which propagates through the
743 // bitwise ORs. Freeze it here before the condition is used.
744 ReducedResult = Builder.CreateFreeze(V: ReducedResult);
745 return Builder.CreateSelect(C: ReducedResult, True: NewVal, False: Start, Name: "rdx.select");
746 }
747 case VPInstruction::ComputeReductionResult: {
748 RecurKind RK = getRecurKind();
749 bool IsOrdered = isReductionOrdered();
750 bool IsInLoop = isReductionInLoop();
751 assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
752 "FindIV should use min/max reduction kinds");
753
754 // The recipe may have multiple operands to be reduced together.
755 unsigned NumOperandsToReduce = getNumOperands();
756 VectorParts RdxParts(NumOperandsToReduce);
757 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
758 RdxParts[Part] = State.get(Def: getOperand(N: Part), IsScalar: IsInLoop);
759
760 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
761 Builder.setFastMathFlags(getFastMathFlags());
762
763 // Reduce multiple operands into one.
764 Value *ReducedPartRdx = RdxParts[0];
765 if (IsOrdered) {
766 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
767 } else {
768 // Floating-point operations should have some FMF to enable the reduction.
769 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
770 Value *RdxPart = RdxParts[Part];
771 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK))
772 ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart);
773 else {
774 // For sub-recurrences, each part's reduction variable is already
775 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
776 Instruction::BinaryOps Opcode =
777 RK == RecurKind::Sub
778 ? Instruction::Add
779 : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind: RK);
780 ReducedPartRdx =
781 Builder.CreateBinOp(Opc: Opcode, LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx");
782 }
783 }
784 }
785
786 // Create the reduction after the loop. Note that inloop reductions create
787 // the target reduction in the loop using a Reduction recipe.
788 if (State.VF.isVector() && !IsInLoop) {
789 // TODO: Support in-order reductions based on the recurrence descriptor.
790 // All ops in the reduction inherit fast-math-flags from the recurrence
791 // descriptor.
792 ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK);
793 }
794
795 return ReducedPartRdx;
796 }
797 case VPInstruction::ExtractLastLane:
798 case VPInstruction::ExtractPenultimateElement: {
799 unsigned Offset =
800 getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
801 Value *Res;
802 if (State.VF.isVector()) {
803 assert(Offset <= State.VF.getKnownMinValue() &&
804 "invalid offset to extract from");
805 // Extract lane VF - Offset from the operand.
806 Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset));
807 } else {
808 // TODO: Remove ExtractLastLane for scalar VFs.
809 assert(Offset <= 1 && "invalid offset to extract from");
810 Res = State.get(Def: getOperand(N: 0));
811 }
812 if (isa<ExtractElementInst>(Val: Res))
813 Res->setName(Name);
814 return Res;
815 }
816 case VPInstruction::LogicalAnd: {
817 Value *A = State.get(Def: getOperand(N: 0));
818 Value *B = State.get(Def: getOperand(N: 1));
819 return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name);
820 }
821 case VPInstruction::LogicalOr: {
822 Value *A = State.get(Def: getOperand(N: 0));
823 Value *B = State.get(Def: getOperand(N: 1));
824 return Builder.CreateLogicalOr(Cond1: A, Cond2: B, Name);
825 }
826 case VPInstruction::PtrAdd: {
827 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
828 "can only generate first lane for PtrAdd");
829 Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
830 Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
831 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
832 }
833 case VPInstruction::WidePtrAdd: {
834 Value *Ptr =
835 State.get(Def: getOperand(N: 0), IsScalar: vputils::isSingleScalar(VPV: getOperand(N: 0)));
836 Value *Addend = State.get(Def: getOperand(N: 1));
837 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
838 }
839 case VPInstruction::AnyOf: {
840 Value *Res = Builder.CreateFreeze(V: State.get(Def: getOperand(N: 0)));
841 for (VPValue *Op : drop_begin(RangeOrContainer: operands()))
842 Res = Builder.CreateOr(LHS: Res, RHS: Builder.CreateFreeze(V: State.get(Def: Op)));
843 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Src: Res);
844 }
845 case VPInstruction::ExtractLane: {
846 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
847 "simplified to ExtractElement.");
848 Value *LaneToExtract = State.get(Def: getOperand(N: 0), IsScalar: true);
849 Type *IdxTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
850 Value *Res = nullptr;
851 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
852
853 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
854 Value *VectorStart =
855 Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: IdxTy, V: Idx - 1));
856 Value *VectorIdx = Idx == 1
857 ? LaneToExtract
858 : Builder.CreateSub(LHS: LaneToExtract, RHS: VectorStart);
859 Value *Ext = State.VF.isScalar()
860 ? State.get(Def: getOperand(N: Idx))
861 : Builder.CreateExtractElement(
862 Vec: State.get(Def: getOperand(N: Idx)), Idx: VectorIdx);
863 if (Res) {
864 Value *Cmp = Builder.CreateICmpUGE(LHS: LaneToExtract, RHS: VectorStart);
865 Res = Builder.CreateSelect(C: Cmp, True: Ext, False: Res);
866 } else {
867 Res = Ext;
868 }
869 }
870 return Res;
871 }
872 case VPInstruction::FirstActiveLane: {
873 Type *Ty = State.TypeAnalysis.inferScalarType(V: this);
874 if (getNumOperands() == 1) {
875 Value *Mask = State.get(Def: getOperand(N: 0));
876 return Builder.CreateCountTrailingZeroElems(ResTy: Ty, Mask,
877 /*ZeroIsPoison=*/false, Name);
878 }
879 // If there are multiple operands, create a chain of selects to pick the
880 // first operand with an active lane and add the number of lanes of the
881 // preceding operands.
882 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty, VF: State.VF);
883 unsigned LastOpIdx = getNumOperands() - 1;
884 Value *Res = nullptr;
885 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
886 Value *TrailingZeros =
887 State.VF.isScalar()
888 ? Builder.CreateZExt(
889 V: Builder.CreateICmpEQ(LHS: State.get(Def: getOperand(N: Idx)),
890 RHS: Builder.getFalse()),
891 DestTy: Ty)
892 : Builder.CreateCountTrailingZeroElems(
893 ResTy: Ty, Mask: State.get(Def: getOperand(N: Idx)),
894 /*ZeroIsPoison=*/false, Name);
895 Value *Current = Builder.CreateAdd(
896 LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty, V: Idx)),
897 RHS: TrailingZeros);
898 if (Res) {
899 Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF);
900 Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res);
901 } else {
902 Res = Current;
903 }
904 }
905
906 return Res;
907 }
908 case VPInstruction::ResumeForEpilogue:
909 return State.get(Def: getOperand(N: 0), IsScalar: true);
910 case VPInstruction::Reverse:
911 return Builder.CreateVectorReverse(V: State.get(Def: getOperand(N: 0)), Name: "reverse");
912 case VPInstruction::ExtractLastActive: {
913 Value *Result = State.get(Def: getOperand(N: 0), /*IsScalar=*/true);
914 for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
915 Value *Data = State.get(Def: getOperand(N: Idx));
916 Value *Mask = State.get(Def: getOperand(N: Idx + 1));
917 Type *VTy = Data->getType();
918
919 if (State.VF.isScalar())
920 Result = Builder.CreateSelect(C: Mask, True: Data, False: Result);
921 else
922 Result = Builder.CreateIntrinsic(
923 ID: Intrinsic::experimental_vector_extract_last_active, Types: {VTy},
924 Args: {Data, Mask, Result});
925 }
926
927 return Result;
928 }
929 default:
930 llvm_unreachable("Unsupported opcode for instruction");
931 }
932}
933
/// Compute the TTI cost of applying \p Opcode to this recipe's inferred result
/// type, widened to \p VF when vectorizing. Handles FNeg, binary arithmetic,
/// freeze, extractvalue, compares, casts and selects; any other opcode is
/// unreachable.
InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
    unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
  Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
  // Cost against the widened type unless we are costing a scalar (VF = 1) op.
  Type *ResultTy = VF.isVector() ? toVectorTy(Scalar: ScalarTy, EC: VF) : ScalarTy;
  switch (Opcode) {
  case Instruction::FNeg:
    return Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: ResultTy, CostKind: Ctx.CostKind);
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Certain instructions can be cheaper if they have a constant second
    // operand. One example of this are shifts on x86.
    VPValue *RHS = getOperand(N: 1);
    TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(V: RHS);

    // An RHS defined outside all loop regions is loop-invariant, i.e. uniform
    // across iterations, which some targets cost more cheaply.
    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(N: 1)->isDefinedOutsideLoopRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;

    // Pass the underlying IR instruction and its operands (when available) so
    // TTI can refine the cost.
    Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, Ty: ResultTy, CostKind: Ctx.CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI);
  }
  case Instruction::Freeze:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: ResultTy,
                                          CostKind: Ctx.CostKind);
  case Instruction::ExtractValue:
    return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue,
                                             CostKind: Ctx.CostKind);
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Compares are costed on the (possibly widened) operand type, not on the
    // i1 result type.
    Type *ScalarOpTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
    Type *OpTy = VF.isVector() ? toVectorTy(Scalar: ScalarOpTy, EC: VF) : ScalarOpTy;
    Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode, ValTy: OpTy, CondTy: CmpInst::makeCmpResultType(opnd_type: OpTy), VecPred: getPredicate(),
        CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
        Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI);
  }
  case Instruction::BitCast: {
    // Pointer bitcasts are free; other bitcasts fall through to the generic
    // cast costing below.
    Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
    if (ScalarTy->isPointerTy())
      return 0;
    [[fallthrough]];
  }
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::PtrToAddr:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::AddrSpaceCast: {
    // Computes the CastContextHint from a recipe that may access memory.
    auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
      if (isa<VPInterleaveBase>(Val: R))
        return TTI::CastContextHint::Interleave;
      if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R)) {
        // Only compute CCH for memory operations, matching the legacy model
        // which only considers loads/stores for cast context hints.
        auto *UI = cast<Instruction>(Val: ReplicateRecipe->getUnderlyingValue());
        if (!isa<LoadInst, StoreInst>(Val: UI))
          return TTI::CastContextHint::None;
        return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
                                               : TTI::CastContextHint::Normal;
      }
      const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R);
      if (WidenMemoryRecipe == nullptr)
        return TTI::CastContextHint::None;
      if (VF.isScalar())
        return TTI::CastContextHint::Normal;
      if (!WidenMemoryRecipe->isConsecutive())
        return TTI::CastContextHint::GatherScatter;
      if (WidenMemoryRecipe->isReverse())
        return TTI::CastContextHint::Reversed;
      if (WidenMemoryRecipe->isMasked())
        return TTI::CastContextHint::Masked;
      return TTI::CastContextHint::Normal;
    };

    VPValue *Operand = getOperand(N: 0);
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, get the context from the only user.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
        if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
          return nullptr;
        return dyn_cast<VPRecipeBase>(Val: *R->user_begin());
      };
      if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
        // Look through an intervening reverse to the recipe it feeds.
        if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue())))
          Recipe = GetOnlyUser(cast<VPInstruction>(Val: Recipe));
        if (Recipe)
          CCH = ComputeCCH(Recipe);
      }
    }
    // For Z/Sext, get the context from the operand.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (auto *Recipe = Operand->getDefiningRecipe()) {
        VPValue *ReverseOp;
        if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue(V&: ReverseOp))))
          Recipe = ReverseOp->getDefiningRecipe();
        if (Recipe)
          CCH = ComputeCCH(Recipe);
      }
    }

    auto *ScalarSrcTy = Ctx.Types.inferScalarType(V: Operand);
    Type *SrcTy = VF.isVector() ? toVectorTy(Scalar: ScalarSrcTy, EC: VF) : ScalarSrcTy;
    // Arm TTI will use the underlying instruction to determine the cost.
    return Ctx.TTI.getCastInstrCost(
        Opcode, Dst: ResultTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind,
        I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue()));
  }
  case Instruction::Select: {
    SelectInst *SI = cast_or_null<SelectInst>(Val: getUnderlyingValue());
    bool IsScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions();
    Type *ScalarTy = Ctx.Types.inferScalarType(V: this);

    VPValue *Op0, *Op1;
    bool IsLogicalAnd =
        match(V: this, P: m_c_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
    bool IsLogicalOr =
        match(V: this, P: m_c_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
    // Also match the inverted forms:
    //   select x, false, y --> !x & y (still AND)
    //   select x, y, true  --> !x | y (still OR)
    IsLogicalAnd |=
        match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_False(), Op2: m_VPValue(V&: Op1)));
    IsLogicalOr |=
        match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1), Op2: m_True()));

    // Selects on i1 vectors that act as logical and/or are costed as the
    // corresponding bitwise op.
    if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
        (IsLogicalAnd || IsLogicalOr)) {
      // select x, y, false --> x & y
      // select x, true, y  --> x | y
      const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0);
      const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1);

      SmallVector<const Value *, 2> Operands;
      if (SI && all_of(Range: operands(),
                       P: [](VPValue *Op) { return Op->getUnderlyingValue(); }))
        append_range(C&: Operands, R: SI->operands());
      return Ctx.TTI.getArithmeticInstrCost(
          Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: ResultTy,
          CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI);
    }

    Type *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
    if (!IsScalarCond && VF.isVector())
      CondTy = VectorType::get(ElementType: CondTy, EC: VF);

    // Recover the compare predicate feeding the condition, either from the
    // VPlan pattern or from the underlying IR condition value.
    llvm::CmpPredicate Pred;
    if (!match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue())))
      if (auto *CondIRV = dyn_cast<VPIRValue>(Val: getOperand(N: 0)))
        if (auto *Cmp = dyn_cast<CmpInst>(Val: CondIRV->getValue()))
          Pred = Cmp->getPredicate();
    Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind,
        Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI);
  }
  }
  llvm_unreachable("called for unsupported opcode");
}
1126
1127InstructionCost VPInstruction::computeCost(ElementCount VF,
1128 VPCostContext &Ctx) const {
1129 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
1130 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1131 // TODO: Compute cost for VPInstructions without underlying values once
1132 // the legacy cost model has been retired.
1133 return 0;
1134 }
1135
1136 assert(!doesGeneratePerAllLanes() &&
1137 "Should only generate a vector value or single scalar, not scalars "
1138 "for all lanes.");
1139 return getCostForRecipeWithOpcode(
1140 Opcode: getOpcode(),
1141 VF: vputils::onlyFirstLaneUsed(Def: this) ? ElementCount::getFixed(MinVal: 1) : VF, Ctx);
1142 }
1143
1144 switch (getOpcode()) {
1145 case Instruction::Select: {
1146 llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1147 match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()));
1148 auto *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1149 auto *VecTy = Ctx.Types.inferScalarType(V: getOperand(N: 1));
1150 if (!vputils::onlyFirstLaneUsed(Def: this)) {
1151 CondTy = toVectorTy(Scalar: CondTy, EC: VF);
1152 VecTy = toVectorTy(Scalar: VecTy, EC: VF);
1153 }
1154 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, CondTy, VecPred: Pred,
1155 CostKind: Ctx.CostKind);
1156 }
1157 case Instruction::ExtractElement:
1158 case VPInstruction::ExtractLane: {
1159 if (VF.isScalar()) {
1160 // ExtractLane with VF=1 takes care of handling extracting across multiple
1161 // parts.
1162 return 0;
1163 }
1164
1165 // Add on the cost of extracting the element.
1166 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
1167 return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
1168 CostKind: Ctx.CostKind);
1169 }
1170 case VPInstruction::AnyOf: {
1171 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1172 return Ctx.TTI.getArithmeticReductionCost(
1173 Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind);
1174 }
1175 case VPInstruction::FirstActiveLane: {
1176 Type *Ty = Ctx.Types.inferScalarType(V: this);
1177 Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1178 if (VF.isScalar())
1179 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy,
1180 CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy),
1181 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1182 // Calculate the cost of determining the lane index.
1183 auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1184 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1185 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
1186 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1187 }
1188 case VPInstruction::LastActiveLane: {
1189 Type *Ty = Ctx.Types.inferScalarType(V: this);
1190 Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1191 if (VF.isScalar())
1192 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy,
1193 CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy),
1194 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1195 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1196 auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1197 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1198 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
1199 InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1200 // Add cost of NOT operation on the predicate.
1201 Cost += Ctx.TTI.getArithmeticInstrCost(
1202 Opcode: Instruction::Xor, Ty: PredTy, CostKind: Ctx.CostKind,
1203 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
1204 Opd2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue,
1205 .Properties: TargetTransformInfo::OP_None});
1206 // Add cost of SUB operation on the index.
1207 Cost += Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind: Ctx.CostKind);
1208 return Cost;
1209 }
1210 case VPInstruction::ExtractLastActive: {
1211 Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
1212 Type *VecTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1213 Type *MaskTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
1214 IntrinsicCostAttributes ICA(
1215 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1216 {VecTy, MaskTy, ScalarTy});
1217 return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind);
1218 }
1219 case VPInstruction::FirstOrderRecurrenceSplice: {
1220 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1221 SmallVector<int> Mask(VF.getKnownMinValue());
1222 std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
1223 Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1224
1225 return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
1226 DstTy: cast<VectorType>(Val: VectorTy),
1227 SrcTy: cast<VectorType>(Val: VectorTy), Mask,
1228 CostKind: Ctx.CostKind, Index: VF.getKnownMinValue() - 1);
1229 }
1230 case VPInstruction::ActiveLaneMask: {
1231 Type *ArgTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1232 unsigned Multiplier = cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue();
1233 Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF * Multiplier);
1234 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1235 {ArgTy, ArgTy});
1236 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1237 }
1238 case VPInstruction::ExplicitVectorLength: {
1239 Type *Arg0Ty = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1240 Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx);
1241 Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx);
1242 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1243 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1244 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1245 }
1246 case VPInstruction::Reverse: {
1247 assert(VF.isVector() && "Reverse operation must be vector type");
1248 auto *VectorTy = cast<VectorType>(
1249 Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF));
1250 return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
1251 SrcTy: VectorTy, /*Mask=*/{}, CostKind: Ctx.CostKind,
1252 /*Index=*/0);
1253 }
1254 case VPInstruction::ExtractLastLane: {
1255 // Add on the cost of extracting the element.
1256 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
1257 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement,
1258 Val: VecTy, CostKind: Ctx.CostKind, Index: 0);
1259 }
1260 case VPInstruction::ExtractPenultimateElement:
1261 if (VF == ElementCount::getScalable(MinVal: 1))
1262 return InstructionCost::getInvalid();
1263 [[fallthrough]];
1264 default:
1265 // TODO: Compute cost other VPInstructions once the legacy cost model has
1266 // been retired.
1267 assert(!getUnderlyingValue() &&
1268 "unexpected VPInstruction witht underlying value");
1269 return 0;
1270 }
1271}
1272
1273bool VPInstruction::isVectorToScalar() const {
1274 return getOpcode() == VPInstruction::ExtractLastLane ||
1275 getOpcode() == VPInstruction::ExtractPenultimateElement ||
1276 getOpcode() == Instruction::ExtractElement ||
1277 getOpcode() == VPInstruction::ExtractLane ||
1278 getOpcode() == VPInstruction::FirstActiveLane ||
1279 getOpcode() == VPInstruction::LastActiveLane ||
1280 getOpcode() == VPInstruction::ComputeAnyOfResult ||
1281 getOpcode() == VPInstruction::ExtractLastActive ||
1282 getOpcode() == VPInstruction::ComputeReductionResult ||
1283 getOpcode() == VPInstruction::AnyOf;
1284}
1285
1286bool VPInstruction::isSingleScalar() const {
1287 switch (getOpcode()) {
1288 case Instruction::Load:
1289 case Instruction::PHI:
1290 case VPInstruction::ExplicitVectorLength:
1291 case VPInstruction::ResumeForEpilogue:
1292 case VPInstruction::VScale:
1293 return true;
1294 default:
1295 return isScalarCast();
1296 }
1297}
1298
/// Generate the IR for this VPInstruction via generate() and record the
/// produced value in \p State, marking whether only a first-lane scalar was
/// generated.
void VPInstruction::execute(VPTransformState &State) {
  assert(!isMasked() && "cannot execute masked VPInstruction");
  assert(!State.Lane && "VPInstruction executing an Lane");
  // Restores the builder's fast-math flags when leaving this scope.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert(hasRequiredFlagsForOpcode(getOpcode()) &&
         "Opcode requires specific flags to be set");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  // A scalar result is recorded when the opcode can produce one and either
  // only the first lane is demanded or the result is inherently scalar.
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(Def: this) ||
                                    isVectorToScalar() || isSingleScalar());
  assert((((GeneratedValue->getType()->isVectorTy() ||
            GeneratedValue->getType()->isStructTy()) ==
           !GeneratesPerFirstLaneOnly) ||
          State.VF.isScalar()) &&
         "scalar value but not only first lane defined");
  State.set(Def: this, V: GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
  if (getOpcode() == VPInstruction::ResumeForEpilogue) {
    // FIXME: This is a workaround to enable reliable updates of the scalar loop
    // resume phis, when vectorizing the epilogue. Must be removed once epilogue
    // vectorization explicitly connects VPlans.
    setUnderlyingValue(GeneratedValue);
  }
}
1330
/// Conservative memory-effect query: returns false only for opcodes known to
/// neither read nor write memory; any opcode not listed is assumed to access
/// memory.
bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
    return false;
  switch (getOpcode()) {
  case Instruction::GetElementPtr:
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::PHI:
  case VPInstruction::AnyOf:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::Broadcast:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractLane:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::ExtractLastActive:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::MaskedCond:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::StepVector:
  case VPInstruction::ReductionStartVector:
  case VPInstruction::Reverse:
  case VPInstruction::VScale:
  case VPInstruction::Unpack:
    return false;
  default:
    // Unknown opcodes conservatively count as memory-accessing.
    return true;
  }
}
1379
1380bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
1381 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1382 if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
1383 return vputils::onlyFirstLaneUsed(Def: this);
1384
1385 switch (getOpcode()) {
1386 default:
1387 return false;
1388 case Instruction::ExtractElement:
1389 return Op == getOperand(N: 1);
1390 case Instruction::PHI:
1391 return true;
1392 case Instruction::FCmp:
1393 case Instruction::ICmp:
1394 case Instruction::Select:
1395 case Instruction::Or:
1396 case Instruction::Freeze:
1397 case VPInstruction::Not:
1398 // TODO: Cover additional opcodes.
1399 return vputils::onlyFirstLaneUsed(Def: this);
1400 case Instruction::Load:
1401 case VPInstruction::ActiveLaneMask:
1402 case VPInstruction::ExplicitVectorLength:
1403 case VPInstruction::CalculateTripCountMinusVF:
1404 case VPInstruction::CanonicalIVIncrementForPart:
1405 case VPInstruction::BranchOnCount:
1406 case VPInstruction::BranchOnCond:
1407 case VPInstruction::Broadcast:
1408 case VPInstruction::ReductionStartVector:
1409 return true;
1410 case VPInstruction::BuildStructVector:
1411 case VPInstruction::BuildVector:
1412 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1413 // operand, after replicating its operands only the first lane is used.
1414 // Before replicating, it will have only a single operand.
1415 return getNumOperands() > 1;
1416 case VPInstruction::PtrAdd:
1417 return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this);
1418 case VPInstruction::WidePtrAdd:
1419 // WidePtrAdd supports scalar and vector base addresses.
1420 return false;
1421 case VPInstruction::ComputeAnyOfResult:
1422 return Op == getOperand(N: 0) || Op == getOperand(N: 1);
1423 case VPInstruction::ExitingIVValue:
1424 case VPInstruction::ExtractLane:
1425 return Op == getOperand(N: 0);
1426 };
1427 llvm_unreachable("switch should return");
1428}
1429
1430bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
1431 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1432 if (Instruction::isBinaryOp(Opcode: getOpcode()))
1433 return vputils::onlyFirstPartUsed(Def: this);
1434
1435 switch (getOpcode()) {
1436 default:
1437 return false;
1438 case Instruction::FCmp:
1439 case Instruction::ICmp:
1440 case Instruction::Select:
1441 return vputils::onlyFirstPartUsed(Def: this);
1442 case VPInstruction::BranchOnCount:
1443 case VPInstruction::BranchOnCond:
1444 case VPInstruction::BranchOnTwoConds:
1445 case VPInstruction::CanonicalIVIncrementForPart:
1446 return true;
1447 };
1448 llvm_unreachable("switch should return");
1449}
1450
1451#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this VPInstruction to dbgs(), using a fresh slot tracker for the
/// enclosing plan to resolve operand names.
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  printRecipe(dbgs(), "", SlotTracker);
}
1456
/// Print this VPInstruction: "EMIT[-SCALAR] <result> = <opcode name>" followed
/// by flags and operands. VPlan-specific opcodes use their own mnemonics;
/// plain LLVM opcodes fall back to Instruction::getOpcodeName.
void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::BranchOnTwoConds:
    O << "branch-on-two-conds";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::Broadcast:
    O << "broadcast";
    break;
  case VPInstruction::BuildStructVector:
    O << "buildstructvector";
    break;
  case VPInstruction::BuildVector:
    O << "buildvector";
    break;
  case VPInstruction::ExitingIVValue:
    O << "exiting-iv-value";
    break;
  case VPInstruction::MaskedCond:
    O << "masked-cond";
    break;
  case VPInstruction::ExtractLane:
    O << "extract-lane";
    break;
  case VPInstruction::ExtractLastLane:
    O << "extract-last-lane";
    break;
  case VPInstruction::ExtractLastPart:
    O << "extract-last-part";
    break;
  case VPInstruction::ExtractPenultimateElement:
    O << "extract-penultimate-element";
    break;
  case VPInstruction::ComputeAnyOfResult:
    O << "compute-anyof-result";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::LogicalOr:
    O << "logical-or";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  case VPInstruction::WidePtrAdd:
    O << "wide-ptradd";
    break;
  case VPInstruction::AnyOf:
    O << "any-of";
    break;
  case VPInstruction::FirstActiveLane:
    O << "first-active-lane";
    break;
  case VPInstruction::LastActiveLane:
    O << "last-active-lane";
    break;
  case VPInstruction::ReductionStartVector:
    O << "reduction-start-vector";
    break;
  case VPInstruction::ResumeForEpilogue:
    O << "resume-for-epilogue";
    break;
  case VPInstruction::Reverse:
    O << "reverse";
    break;
  case VPInstruction::Unpack:
    O << "unpack";
    break;
  case VPInstruction::ExtractLastActive:
    O << "extract-last-active";
    break;
  default:
    // Plain LLVM opcodes print their IR mnemonic.
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);
}
1576#endif
1577
/// Generate IR for typed VPInstructions: scalar casts to ResultTy, step-vector
/// and vscale materialization.
void VPInstructionWithType::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  if (isScalarCast()) {
    // Cast only the first lane; the result is recorded as a single scalar.
    Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
    Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()),
                                           V: Op, DestTy: ResultTy);
    State.set(Def: this, V: Cast, Lane: VPLane(0));
    return;
  }
  switch (getOpcode()) {
  case VPInstruction::StepVector: {
    Value *StepVector =
        State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF));
    State.set(Def: this, V: StepVector);
    break;
  }
  case VPInstruction::VScale: {
    Value *VScale = State.Builder.CreateVScale(Ty: ResultTy);
    State.set(Def: this, V: VScale, IsScalar: true);
    break;
  }

  default:
    llvm_unreachable("opcode not implemented yet");
  }
}
1604
1605#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this typed VPInstruction, including its result type; casts print as
/// "<opcode> <operands> to <type>".
void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = ";

  switch (getOpcode()) {
  case VPInstruction::WideIVStep:
    O << "wide-iv-step ";
    printOperands(O, SlotTracker);
    break;
  case VPInstruction::StepVector:
    O << "step-vector " << *ResultTy;
    break;
  case VPInstruction::VScale:
    O << "vscale " << *ResultTy;
    break;
  case Instruction::Load:
    O << "load ";
    printOperands(O, SlotTracker);
    break;
  default:
    assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
    O << Instruction::getOpcodeName(getOpcode()) << " ";
    printOperands(O, SlotTracker);
    O << " to " << *ResultTy;
  }
}
1634#endif
1635
// Generate a scalar IR phi for this VPPhi. Incoming values whose defining
// blocks have not been generated yet (header-phi backedges) are skipped here
// and filled in later.
void VPPhi::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  PHINode *NewPhi = State.Builder.CreatePHI(
      Ty: State.TypeAnalysis.inferScalarType(V: this), NumReservedValues: 2, Name: getName());
  unsigned NumIncoming = getNumIncoming();
  // Detect header phis: the parent block dominates its second incoming block
  // (the latch). Those IR incoming values have not been generated yet and need
  // to be added after they have been executed.
  if (NumIncoming == 2 &&
      State.VPDT.dominates(A: getParent(), B: getIncomingBlock(Idx: 1))) {
    NumIncoming = 1;
  }
  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
    // Only lane 0 is needed; VPPhi models a scalar phi.
    Value *IncV = State.get(Def: getIncomingValue(Idx), Lane: VPLane(0));
    BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(Val: getIncomingBlock(Idx));
    NewPhi->addIncoming(V: IncV, BB: PredBB);
  }
  State.set(Def: this, V: NewPhi, Lane: VPLane(0));
}
1655
1656#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1657void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1658 VPSlotTracker &SlotTracker) const {
1659 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1660 printAsOperand(O, SlotTracker);
1661 O << " = phi";
1662 printFlags(O);
1663 printPhiOperands(O, SlotTracker);
1664}
1665#endif
1666
1667VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1668 if (auto *Phi = dyn_cast<PHINode>(Val: &I))
1669 return new VPIRPhi(*Phi);
1670 return new VPIRInstruction(I);
1671}
1672
// The wrapped IR instruction already exists; executing the recipe only moves
// the builder's insert point past it.
void VPIRInstruction::execute(VPTransformState &State) {
  assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
         "PHINodes must be handled by VPIRPhi");
  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator()));
}
1680
/// Wrapped IR instructions are free as far as the VPlan cost model goes.
InstructionCost VPIRInstruction::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  // The recipe wraps an existing IR instruction on the border of VPlan's scope,
  // hence it does not contribute to the cost-modeling for the VPlan.
  return 0;
}
1687
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the wrapped IR instruction verbatim, prefixed with "IR".
void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;
}
#endif
1694
// Fix up the wrapped IR phi with the values flowing in from the VPlan
// predecessors, extracting the appropriate lane for each incoming value.
void VPIRPhi::execute(VPTransformState &State) {
  PHINode *Phi = &getIRPhi();
  for (const auto &[Idx, Op] : enumerate(First: operands())) {
    VPValue *ExitValue = Op;
    // Single-scalar values can be read from the first lane; otherwise the
    // value leaving the loop lives in the last lane.
    auto Lane = vputils::isSingleScalar(VPV: ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(VF: State.VF);
    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
    auto *PredVPBB = Pred->getExitingBasicBlock();
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB->getTerminator());
    Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane));
    // If there is no existing block for PredBB in the phi, add a new incoming
    // value. Otherwise update the existing incoming value for PredBB.
    if (Phi->getBasicBlockIndex(BB: PredBB) == -1)
      Phi->addIncoming(V, BB: PredBB);
    else
      Phi->setIncomingValueForBlock(BB: PredBB, V);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator()));
}
1721
1722void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
1723 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1724 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1725 "Number of phi operands must match number of predecessors");
1726 unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock);
1727 R->removeOperand(Idx: Position);
1728}
1729
1730VPValue *
1731VPPhiAccessors::getIncomingValueForBlock(const VPBasicBlock *VPBB) const {
1732 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1733 return getIncomingValue(Idx: R->getParent()->getIndexForPredecessor(Pred: VPBB));
1734}
1735
1736void VPPhiAccessors::setIncomingValueForBlock(const VPBasicBlock *VPBB,
1737 VPValue *V) const {
1738 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1739 R->setOperand(I: R->getParent()->getIndexForPredecessor(Pred: VPBB), New: V);
1740}
1741
1742#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1743void VPPhiAccessors::printPhiOperands(raw_ostream &O,
1744 VPSlotTracker &SlotTracker) const {
1745 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1746 [this, &O, &SlotTracker](auto Op) {
1747 O << "[ ";
1748 Op.value()->printAsOperand(O, SlotTracker);
1749 O << ", ";
1750 getIncomingBlock(Op.index())->printAsOperand(O);
1751 O << " ]";
1752 });
1753}
1754#endif
1755
1756#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1757void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1758 VPSlotTracker &SlotTracker) const {
1759 VPIRInstruction::printRecipe(O, Indent, SlotTracker);
1760
1761 if (getNumOperands() != 0) {
1762 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1763 interleaveComma(incoming_values_and_blocks(), O,
1764 [&O, &SlotTracker](auto Op) {
1765 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1766 O << " from ";
1767 std::get<1>(Op)->printAsOperand(O);
1768 });
1769 O << ")";
1770 }
1771}
1772#endif
1773
1774void VPIRMetadata::applyMetadata(Instruction &I) const {
1775 for (const auto &[Kind, Node] : Metadata)
1776 I.setMetadata(KindID: Kind, Node);
1777}
1778
1779void VPIRMetadata::intersect(const VPIRMetadata &Other) {
1780 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1781 for (const auto &[KindA, MDA] : Metadata) {
1782 for (const auto &[KindB, MDB] : Other.Metadata) {
1783 if (KindA == KindB && MDA == MDB) {
1784 MetadataIntersection.emplace_back(Args: KindA, Args: MDA);
1785 break;
1786 }
1787 }
1788 }
1789 Metadata = std::move(MetadataIntersection);
1790}
1791
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the tracked metadata as " (!name node, ...)". Requires a module in
/// the slot tracker to resolve metadata kind names; otherwise prints nothing.
void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  const Module *M = SlotTracker.getModule();
  if (Metadata.empty() || !M)
    return;

  ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
  O << " (";
  interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
    auto [Kind, Node] = KindNodePair;
    assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
           "Unexpected unnamed metadata kind");
    O << "!" << MDNames[Kind] << " ";
    Node->printAsOperand(O, M);
  });
  O << ")";
}
#endif
1810
// Emit a call to the pre-selected vector function variant, widening vector
// arguments and keeping scalar parameters (e.g. linear pointer params) as
// lane-0 values.
void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  assert(Variant != nullptr && "Can't create vector function.");

  FunctionType *VFTy = Variant->getFunctionType();
  // Add return type if intrinsic is overloaded on it.
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(First: args())) {
    Value *Arg;
    // Some vectorized function variants may also take a scalar argument,
    // e.g. linear parameters for pointers. This needs to be the scalar value
    // from the start of the respective part when interleaving.
    if (!VFTy->getParamType(i: I.index())->isVectorTy())
      Arg = State.get(Def: I.value(), Lane: VPLane(0));
    else
      Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
    Args.push_back(Elt: Arg);
  }

  // Preserve operand bundles from the original scalar call, if any.
  auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(Defs&: OpBundles);

  CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles);
  applyFlags(I&: *V);
  applyMetadata(I&: *V);
  V->setCallingConv(Variant->getCallingConv());

  if (!V->getType()->isVoidTy())
    State.set(Def: this, V);
}
1843
1844InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
1845 VPCostContext &Ctx) const {
1846 return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(),
1847 Tys: Variant->getFunctionType()->params(),
1848 CostKind: Ctx.CostKind);
1849}
1850
1851#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1852void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
1853 VPSlotTracker &SlotTracker) const {
1854 O << Indent << "WIDEN-CALL ";
1855
1856 Function *CalledFn = getCalledScalarFunction();
1857 if (CalledFn->getReturnType()->isVoidTy())
1858 O << "void ";
1859 else {
1860 printAsOperand(O, SlotTracker);
1861 O << " = ";
1862 }
1863
1864 O << "call";
1865 printFlags(O);
1866 O << " @" << CalledFn->getName() << "(";
1867 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1868 Op->printAsOperand(O, SlotTracker);
1869 });
1870 O << ")";
1871
1872 O << " (using library function";
1873 if (Variant->hasName())
1874 O << ": " << Variant->getName();
1875 O << ")";
1876}
1877#endif
1878
// Emit a call to the vector version of the intrinsic, collecting the overload
// types needed to look up the correct declaration and keeping scalar
// arguments unwidened.
void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");

  SmallVector<Type *, 2> TysForDecl;
  // Add return type if intrinsic is overloaded on it.
  if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1,
                                             TTI: State.TTI)) {
    // Struct-returning intrinsics may be overloaded on individual fields.
    Type *RetTy = toVectorizedTy(Ty: getResultType(), EC: State.VF);
    ArrayRef<Type *> ContainedTys = getContainedTypes(Ty: RetTy);
    for (auto [Idx, Ty] : enumerate(First&: ContainedTys)) {
      if (isVectorIntrinsicWithStructReturnOverloadAtField(ID: VectorIntrinsicID,
                                                           RetIdx: Idx, TTI: State.TTI))
        TysForDecl.push_back(Elt: Ty);
    }
  }
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(First: operands())) {
    // Some intrinsics have a scalar argument - don't replace it with a
    // vector.
    Value *Arg;
    if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(),
                                           TTI: State.TTI))
      Arg = State.get(Def: I.value(), Lane: VPLane(0));
    else
      Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
    if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(),
                                               TTI: State.TTI))
      TysForDecl.push_back(Elt: Arg->getType());
    Args.push_back(Elt: Arg);
  }

  // Use vector version of the intrinsic.
  Module *M = State.Builder.GetInsertBlock()->getModule();
  Function *VectorF =
      Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, OverloadTys: TysForDecl);
  assert(VectorF &&
         "Can't retrieve vector intrinsic or vector-predication intrinsics.");

  // Preserve operand bundles from the original scalar call, if any.
  auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(Defs&: OpBundles);

  CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles);

  applyFlags(I&: *V);
  applyMetadata(I&: *V);

  if (!V->getType()->isVoidTy())
    State.set(Def: this, V);
}
1930
/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
/// Falls back to concrete IR values where available because some TTI
/// implementations inspect the arguments to refine the cost.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
                                            ArrayRef<const VPValue *> Operands,
                                            const VPRecipeWithIRFlags &R,
                                            ElementCount VF,
                                            VPCostContext &Ctx) {
  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(First&: Operands)) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      if (auto *UI = dyn_cast_or_null<CallBase>(Val: R.getUnderlyingValue())) {
        Arguments.push_back(Elt: UI->getArgOperand(i: Idx));
        continue;
      }
      // No IR value available for this operand; an all-or-nothing argument
      // list is required, so drop everything collected so far.
      Arguments.clear();
      break;
    }
    Arguments.push_back(Elt: V);
  }

  // Vectorize the return and parameter types according to VF.
  Type *ScalarRetTy = Ctx.Types.inferScalarType(V: &R);
  Type *RetTy = VF.isVector() ? toVectorizedTy(Ty: ScalarRetTy, EC: VF) : ScalarRetTy;
  SmallVector<Type *> ParamTys;
  for (const VPValue *Op : Operands) {
    ParamTys.push_back(Elt: VF.isVector()
                               ? toVectorTy(Scalar: Ctx.Types.inferScalarType(V: Op), EC: VF)
                               : Ctx.Types.inferScalarType(V: Op));
  }

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  IntrinsicCostAttributes CostAttrs(
      ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(),
      dyn_cast_or_null<IntrinsicInst>(Val: R.getUnderlyingValue()),
      InstructionCost::getInvalid());
  return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind);
}
1972
/// Delegate to getCostForIntrinsics with this recipe's intrinsic ID/operands.
InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
                                                    VPCostContext &Ctx) const {
  SmallVector<const VPValue *> ArgOps(operands());
  return getCostForIntrinsics(ID: VectorIntrinsicID, Operands: ArgOps, R: *this, VF, Ctx);
}
1978
/// Return the base (unsuffixed) name of the vectorized intrinsic.
StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
  return Intrinsic::getBaseName(id: VectorIntrinsicID);
}
1982
1983bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
1984 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1985 return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) {
1986 auto [Idx, V] = X;
1987 return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
1988 Idx, nullptr);
1989 });
1990}
1991
1992#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1993void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
1994 VPSlotTracker &SlotTracker) const {
1995 O << Indent << "WIDEN-INTRINSIC ";
1996 if (ResultTy->isVoidTy()) {
1997 O << "void ";
1998 } else {
1999 printAsOperand(O, SlotTracker);
2000 O << " = ";
2001 }
2002
2003 O << "call";
2004 printFlags(O);
2005 O << getIntrinsicName() << "(";
2006
2007 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
2008 Op->printAsOperand(O, SlotTracker);
2009 });
2010 O << ")";
2011}
2012#endif
2013
// Lower the histogram update to the experimental_vector_histogram_add
// intrinsic, synthesizing an all-true mask when none is given and negating
// the increment for subtractions.
void VPHistogramRecipe::execute(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  Value *Address = State.get(Def: getOperand(N: 0));
  Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
  VectorType *VTy = cast<VectorType>(Val: Address->getType());

  // The histogram intrinsic requires a mask even if the recipe doesn't;
  // if the mask operand was omitted then all lanes should be executed and
  // we just need to synthesize an all-true mask.
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask())
    Mask = State.get(Def: VPMask);
  else
    Mask =
        Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1));

  // If this is a subtract, we want to invert the increment amount. We may
  // add a separate intrinsic in future, but for now we'll try this.
  if (Opcode == Instruction::Sub)
    IncAmt = Builder.CreateNeg(V: IncAmt);
  else
    assert(Opcode == Instruction::Add && "only add or sub supported for now");

  State.Builder.CreateIntrinsic(ID: Intrinsic::experimental_vector_histogram_add,
                                Types: {VTy, IncAmt->getType()},
                                Args: {Address, IncAmt, Mask});
}
2042
// Cost model: histogram intrinsic + (possible) multiply of the increment +
// the add/sub itself.
InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // FIXME: Take the gather and scatter into account as well. For now we're
  //        generating the same cost as the fallback path, but we'll likely
  //        need to create a new TTI method for determining the cost, including
  //        whether we can use base + vec-of-smaller-indices or just
  //        vec-of-pointers.
  assert(VF.isVector() && "Invalid VF for histogram cost");
  Type *AddressTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
  VPValue *IncAmt = getOperand(N: 1);
  Type *IncTy = Ctx.Types.inferScalarType(V: IncAmt);
  VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF);

  // Assume that a non-constant update value (or a constant != 1) requires
  // a multiply, and add that into the cost.
  InstructionCost MulCost =
      Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind);
  if (match(V: IncAmt, P: m_One()))
    MulCost = TTI::TCC_Free;

  // Find the cost of the histogram operation itself.
  Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF);
  Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
  IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                              Type::getVoidTy(C&: Ctx.LLVMCtx),
                              {PtrTy, IncTy, MaskTy});

  // Add the costs together with the add/sub operation.
  return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost +
         Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind);
}
2074
2075#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2076void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2077 VPSlotTracker &SlotTracker) const {
2078 O << Indent << "WIDEN-HISTOGRAM buckets: ";
2079 getOperand(0)->printAsOperand(O, SlotTracker);
2080
2081 if (Opcode == Instruction::Sub)
2082 O << ", dec: ";
2083 else {
2084 assert(Opcode == Instruction::Add);
2085 O << ", inc: ";
2086 }
2087 getOperand(1)->printAsOperand(O, SlotTracker);
2088
2089 if (VPValue *Mask = getMask()) {
2090 O << ", mask: ";
2091 Mask->printAsOperand(O, SlotTracker);
2092 }
2093}
2094#endif
2095
/// Copy each individual fast-math flag from \p FMF into the packed
/// bitfield representation used by VPIRFlags.
VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}
2105
/// Return conservative (all-unset) flags of the category appropriate for
/// \p Opcode. Opcodes that require explicit flags (compares, reduction
/// results) have no default and trap via llvm_unreachable; all remaining
/// opcodes take no flags at all.
VPIRFlags VPIRFlags::getDefaultFlags(unsigned Opcode) {
  switch (Opcode) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case VPInstruction::CanonicalIVIncrementForPart:
    return WrapFlagsTy(false, false);
  case Instruction::Trunc:
    return TruncFlagsTy(false, false);
  case Instruction::Or:
    return DisjointFlagsTy(false);
  case Instruction::AShr:
  case Instruction::LShr:
  case Instruction::UDiv:
  case Instruction::SDiv:
    return ExactFlagsTy(false);
  case Instruction::GetElementPtr:
  case VPInstruction::PtrAdd:
  case VPInstruction::WidePtrAdd:
    return GEPNoWrapFlags::none();
  case Instruction::ZExt:
  case Instruction::UIToFP:
    return NonNegFlagsTy(false);
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::FNeg:
  case Instruction::FPExt:
  case Instruction::FPTrunc:
    return FastMathFlags();
  case Instruction::ICmp:
  case Instruction::FCmp:
  case VPInstruction::ComputeReductionResult:
    llvm_unreachable("opcode requires explicit flags");
  default:
    return VPIRFlags();
  }
}
2147
2148#if !defined(NDEBUG)
2149bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2150 switch (OpType) {
2151 case OperationType::OverflowingBinOp:
2152 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2153 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2154 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2155 case OperationType::Trunc:
2156 return Opcode == Instruction::Trunc;
2157 case OperationType::DisjointOp:
2158 return Opcode == Instruction::Or;
2159 case OperationType::PossiblyExactOp:
2160 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2161 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2162 case OperationType::GEPOp:
2163 return Opcode == Instruction::GetElementPtr ||
2164 Opcode == VPInstruction::PtrAdd ||
2165 Opcode == VPInstruction::WidePtrAdd;
2166 case OperationType::FPMathOp:
2167 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2168 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2169 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2170 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2171 Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2172 Opcode == Instruction::Select ||
2173 Opcode == VPInstruction::WideIVStep ||
2174 Opcode == VPInstruction::ReductionStartVector;
2175 case OperationType::FCmp:
2176 return Opcode == Instruction::FCmp;
2177 case OperationType::NonNegOp:
2178 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2179 case OperationType::Cmp:
2180 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2181 case OperationType::ReductionOp:
2182 return Opcode == VPInstruction::ComputeReductionResult;
2183 case OperationType::Other:
2184 return true;
2185 }
2186 llvm_unreachable("Unknown OperationType enum");
2187}
2188
2189bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
2190 // Handle opcodes without default flags.
2191 if (Opcode == Instruction::ICmp)
2192 return OpType == OperationType::Cmp;
2193 if (Opcode == Instruction::FCmp)
2194 return OpType == OperationType::FCmp;
2195 if (Opcode == VPInstruction::ComputeReductionResult)
2196 return OpType == OperationType::ReductionOp;
2197
2198 OperationType Required = getDefaultFlags(Opcode).OpType;
2199 return Required == OperationType::Other || Required == OpType;
2200}
2201#endif
2202
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print a textual rendering of the flags held by this object, dispatching
/// on the stored flag category. Each active flag is emitted with a leading
/// space; a trailing space is always appended.
void VPIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::FCmp:
    // Floating-point compares print the predicate plus fast-math flags.
    O << " " << CmpInst::getPredicateName(getPredicate());
    getFastMathFlags().print(O);
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::Trunc:
    if (TruncFlags.HasNUW)
      O << " nuw";
    if (TruncFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp: {
    // inbounds implies nusw, so only one of the two is printed.
    GEPNoWrapFlags Flags = getGEPNoWrapFlags();
    if (Flags.isInBounds())
      O << " inbounds";
    else if (Flags.hasNoUnsignedSignedWrap())
      O << " nusw";
    if (Flags.hasNoUnsignedWrap())
      O << " nuw";
    break;
  }
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::ReductionOp: {
    // Reductions print "(kind[, in-loop][, ordered])" plus fast-math flags.
    RecurKind RK = getRecurKind();
    O << " (";
    switch (RK) {
    case RecurKind::AnyOf:
      O << "any-of";
      break;
    case RecurKind::FindLast:
      O << "find-last";
      break;
    case RecurKind::SMax:
      O << "smax";
      break;
    case RecurKind::SMin:
      O << "smin";
      break;
    case RecurKind::UMax:
      O << "umax";
      break;
    case RecurKind::UMin:
      O << "umin";
      break;
    case RecurKind::FMinNum:
      O << "fminnum";
      break;
    case RecurKind::FMaxNum:
      O << "fmaxnum";
      break;
    case RecurKind::FMinimum:
      O << "fminimum";
      break;
    case RecurKind::FMaximum:
      O << "fmaximum";
      break;
    case RecurKind::FMinimumNum:
      O << "fminimumnum";
      break;
    case RecurKind::FMaximumNum:
      O << "fmaximumnum";
      break;
    default:
      // Arithmetic reductions print their IR opcode name.
      O << Instruction::getOpcodeName(RecurrenceDescriptor::getOpcode(RK));
      break;
    }
    if (isReductionInLoop())
      O << ", in-loop";
    if (isReductionOrdered())
      O << ", ordered";
    O << ")";
    getFastMathFlags().print(O);
    break;
  }
  case OperationType::Other:
    break;
  }
  O << " ";
}
#endif
2308
// Widen a scalar instruction into a single vector instruction covering all
// lanes, dispatching on the opcode.
void VPWidenRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::UncondBr:
  case Instruction::CondBr:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    SmallVector<Value *, 2> Ops;
    for (VPValue *VPOp : operands())
      Ops.push_back(Elt: State.get(Def: VPOp));

    Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops);

    // The builder may constant-fold, so flags/metadata only apply when an
    // actual instruction was created.
    if (auto *VecOp = dyn_cast<Instruction>(Val: V)) {
      applyFlags(I&: *VecOp);
      applyMetadata(I&: *VecOp);
    }

    // Use this vector value for all users of the original instruction.
    State.set(Def: this, V);
    break;
  }
  case Instruction::ExtractValue: {
    assert(getNumOperands() == 2 && "expected single level extractvalue");
    Value *Op = State.get(Def: getOperand(N: 0));
    Value *Extract = Builder.CreateExtractValue(
        Agg: Op, Idxs: cast<VPConstantInt>(Val: getOperand(N: 1))->getZExtValue());
    State.set(Def: this, V: Extract);
    break;
  }
  case Instruction::Freeze: {
    Value *Op = State.get(Def: getOperand(N: 0));
    Value *Freeze = Builder.CreateFreeze(V: Op);
    State.set(Def: this, V: Freeze);
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    Value *A = State.get(Def: getOperand(N: 0));
    Value *B = State.get(Def: getOperand(N: 1));
    Value *C = nullptr;
    if (FCmp) {
      C = Builder.CreateFCmp(P: getPredicate(), LHS: A, RHS: B);
    } else {
      C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B);
    }
    if (auto *I = dyn_cast<Instruction>(Val: C)) {
      applyFlags(I&: *I);
      applyMetadata(I&: *I);
    }
    State.set(Def: this, V: C);
    break;
  }
  case Instruction::Select: {
    // A single-scalar condition selects whole vectors; otherwise per-lane.
    VPValue *CondOp = getOperand(N: 0);
    Value *Cond = State.get(Def: CondOp, IsScalar: vputils::isSingleScalar(VPV: CondOp));
    Value *Op0 = State.get(Def: getOperand(N: 1));
    Value *Op1 = State.get(Def: getOperand(N: 2));
    Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1);
    State.set(Def: this, V: Sel);
    if (auto *I = dyn_cast<Instruction>(Val: Sel)) {
      // Only FP selects carry fast-math flags.
      if (isa<FPMathOperator>(Val: I))
        applyFlags(I&: *I);
      applyMetadata(I&: *I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
             State.get(this)->getType() &&
         "inferred type and type from generated instructions do not match");
#endif
}
2415
/// Cost of the widened instruction; all supported opcodes share the generic
/// opcode-based costing helper.
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  switch (Opcode) {
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // If the div/rem operation isn't safe to speculate and requires
    // predication, then the only way we can even create a vplan is to insert
    // a select on the second input operand to ensure we use the value of 1
    // for the inactive lanes. The select will be costed separately.
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
  case Instruction::ExtractValue:
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::Select:
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
2452
2453#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2454void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2455 VPSlotTracker &SlotTracker) const {
2456 O << Indent << "WIDEN ";
2457 printAsOperand(O, SlotTracker);
2458 O << " = " << Instruction::getOpcodeName(Opcode);
2459 printFlags(O);
2460 printOperands(O, SlotTracker);
2461}
2462#endif
2463
// Widen a cast: cast the full vector operand to the vectorized destination
// type with a single IR cast instruction.
void VPWidenCastRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF);
  VPValue *Op = getOperand(N: 0);
  Value *A = State.get(Def: Op);
  Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy);
  State.set(Def: this, V: Cast);
  // The builder may fold the cast; flags/metadata only apply to real
  // instructions.
  if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) {
    applyFlags(I&: *CastOp);
    applyMetadata(I&: *CastOp);
  }
}
2478
2479InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
2480 VPCostContext &Ctx) const {
2481 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2482 // the legacy cost model, including truncates/extends when evaluating a
2483 // reduction in a smaller type.
2484 if (!getUnderlyingValue())
2485 return 0;
2486 return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
2487}
2488
2489#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2490void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2491 VPSlotTracker &SlotTracker) const {
2492 O << Indent << "WIDEN-CAST ";
2493 printAsOperand(O, SlotTracker);
2494 O << " = " << Instruction::getOpcodeName(Opcode);
2495 printFlags(O);
2496 printOperands(O, SlotTracker);
2497 O << " to " << *getResultType();
2498}
2499#endif
2500
/// All header phis share the cost of a single IR phi node.
InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
}
2505
2506#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2507void VPWidenIntOrFpInductionRecipe::printRecipe(
2508 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2509 O << Indent;
2510 printAsOperand(O, SlotTracker);
2511 O << " = WIDEN-INDUCTION";
2512 printFlags(O);
2513 printOperands(O, SlotTracker);
2514
2515 if (auto *TI = getTruncInst())
2516 O << " (truncated to " << *TI->getType() << ")";
2517}
2518#endif
2519
// An induction is canonical when it starts at 0, steps by 1, and its scalar
// type matches the region's canonical IV type.
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  return match(V: getStartValue(), P: m_ZeroInt()) &&
         match(V: getStepValue(), P: m_One()) &&
         getScalarType() == getRegion()->getCanonicalIVType();
}
2528
// Materialize the derived IV as a single scalar (lane 0):
// DerivedIV = transform(Start, Index, Step) via emitTransformedIndex, using
// the induction kind's arithmetic.
void VPDerivedIVRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "VPDerivedIVRecipe being replicated.");

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (FPBinOp)
    State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());

  // Only the first lane of the step and index are needed.
  Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
  Value *Index = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
  Value *DerivedIV = emitTransformedIndex(
      B&: State.Builder, Index, StartValue: getStartValue()->getLiveInIRValue(), Step, InductionKind: Kind,
      InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp));
  DerivedIV->setName(Name);
  State.set(Def: this, V: DerivedIV, Lane: VPLane(0));
}
2545
2546#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2547void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2548 VPSlotTracker &SlotTracker) const {
2549 O << Indent;
2550 printAsOperand(O, SlotTracker);
2551 O << " = DERIVED-IV ";
2552 getStartValue()->printAsOperand(O, SlotTracker);
2553 O << " + ";
2554 getOperand(1)->printAsOperand(O, SlotTracker);
2555 O << " * ";
2556 getStepValue()->printAsOperand(O, SlotTracker);
2557}
2558#endif
2559
// Emit per-lane scalar induction values: for each required lane,
//   result[Lane] = BaseIV + (StartIdx0 + Lane) * Step
// using integer or FP arithmetic depending on the IV type.
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
  Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this);
  // Compute the scalar steps and save the results in State.

  // When replicating for a specific lane, emit only that lane.
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Lane) {
    StartLane = State.Lane->getKnownLane();
    EndLane = StartLane + 1;
  }
  // With no explicit start-index operand, the steps start from zero.
  Value *StartIdx0 = getStartIndex() ? State.get(Def: getStartIndex(), IsScalar: true)
                                     : Constant::getNullValue(Ty: BaseIVTy);

  for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
    // It is okay if the induction variable type cannot hold the lane number,
    // we expect truncation in this case.
    Constant *LaneValue =
        BaseIVTy->isIntegerTy()
            ? ConstantInt::get(Ty: BaseIVTy, V: Lane, /*IsSigned=*/false,
                               /*ImplicitTrunc=*/true)
            : ConstantFP::get(Ty: BaseIVTy, V: Lane);
    Value *StartIdx = Builder.CreateBinOp(Opc: AddOp, LHS: StartIdx0, RHS: LaneValue);
    assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
           "Expected StartIdx to be folded to a constant when VF is not "
           "scalable");
    auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step);
    auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul);
    State.set(Def: this, V: Add, Lane: VPLane(Lane));
  }
}
2618
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer. Emits: "<res> = SCALAR-STEPS <operands>".
void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif
2628
// Only the first lane of \p Op is needed when it is a single (uniform)
// scalar value.
bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  return vputils::isSingleScalar(VPV: Op);
}
2633
// Generate a (vector-of-pointers) GEP, keeping loop-invariant operands
// scalar and broadcasting only when all operands are invariant.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  bool AllOperandsAreInvariant = all_of(Range: operands(), P: [](VPValue *Op) {
    return Op->isDefinedOutsideLoopRegions();
  });
  if (AllOperandsAreInvariant) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.

    // Gather the lane-0 value of every operand.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(Elt: State.get(Def: getOperand(N: I), Lane: VPLane(0)));

    auto *NewGEP =
        State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr: Ops[0], IdxList: drop_begin(RangeOrContainer&: Ops),
                                Name: "", NW: getGEPNoWrapFlags());
    Value *Splat = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP);
    State.set(Def: this, V: Splat);
    return;
  }

  // If the GEP has at least one loop-varying operand, we are sure to
  // produce a vector of pointers unless VF is scalar.
  // The pointer operand of the new GEP. If it's loop-invariant, we
  // won't broadcast it.
  auto *Ptr = State.get(Def: getOperand(N: 0), IsScalar: isPointerLoopInvariant());

  // Collect all the indices for the new GEP. If any index is
  // loop-invariant, we won't broadcast it.
  SmallVector<Value *, 4> Indices;
  for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
    VPValue *Operand = getOperand(N: I);
    Indices.push_back(Elt: State.get(Def: Operand, IsScalar: isIndexLoopInvariant(I: I - 1)));
  }

  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
  // but it should be a vector, otherwise.
  auto *NewGEP = State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Indices,
                                         Name: "", NW: getGEPNoWrapFlags());
  assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
         "NewGEP is not a pointer vector");
  State.set(Def: this, V: NewGEP);
}
2687
2688#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2689void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2690 VPSlotTracker &SlotTracker) const {
2691 O << Indent << "WIDEN-GEP ";
2692 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2693 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2694 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2695
2696 O << " ";
2697 printAsOperand(O, SlotTracker);
2698 O << " = getelementptr";
2699 printFlags(O);
2700 printOperands(O, SlotTracker);
2701}
2702#endif
2703
// Materialize the offset operand for unroll part \p Part as VPlan IR inserted
// before this recipe: Offset = Stride * (VF - 1) + Part * Stride * VF, in the
// pointer's index type. The computed offset is appended as a new operand.
void VPVectorEndPointerRecipe::materializeOffset(unsigned Part) {
  assert(!getOffset() && "Unexpected offset operand");
  VPBuilder Builder(this);
  VPlan &Plan = *getParent()->getPlan();
  VPValue *VFVal = getVFValue();
  VPTypeAnalysis TypeInfo(Plan);
  const DataLayout &DL = Plan.getDataLayout();
  Type *IndexTy = DL.getIndexType(PtrTy: TypeInfo.inferScalarType(V: this));
  VPValue *Stride =
      Plan.getConstantInt(Ty: IndexTy, Val: getStride(), /*IsSigned=*/true);
  // VF may not already be in the index type; zext/trunc it as needed.
  Type *VFTy = TypeInfo.inferScalarType(V: VFVal);
  VPValue *VF = Builder.createScalarZExtOrTrunc(Op: VFVal, ResultTy: IndexTy, SrcTy: VFTy,
                                                DL: DebugLoc::getUnknown());

  // Offset for Part0 = Offset0 = Stride * (VF - 1).
  VPInstruction *VFMinusOne =
      Builder.createSub(LHS: VF, RHS: Plan.getConstantInt(Ty: IndexTy, Val: 1u),
                        DL: DebugLoc::getUnknown(), Name: "", WrapFlags: {true, true});
  VPInstruction *Offset0 =
      Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {VFMinusOne, Stride});

  // Offset for PartN = Offset0 + Part * Stride * VF.
  VPValue *PartxStride =
      Plan.getConstantInt(Ty: IndexTy, Val: Part * getStride(), /*IsSigned=*/true);
  VPValue *Offset = Builder.createAdd(
      LHS: Offset0,
      RHS: Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {PartxStride, VF}));
  addOperand(Operand: Offset);
}
2733
// Generate the end pointer as a single scalar GEP of the previously
// materialized offset (see materializeOffset) from the base pointer.
void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  assert(getOffset() && "Expected prior materialization of offset");
  Value *Ptr = State.get(Def: getPointer(), IsScalar: true);
  Value *Offset = State.get(Def: getOffset(), IsScalar: true);
  Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "",
                                       NW: getGEPNoWrapFlags());
  State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
}
2743
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer. Emits: "<res> = vector-end-pointer<flags> <operands>".
void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-end-pointer";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
2754
// Generate the part's pointer as a single scalar GEP of the offset operand
// from the lane-0 base pointer.
void VPVectorPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  assert(getOffset() &&
         "Expected prior simplification of recipe without offset");
  Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
  Value *Offset = State.get(Def: getOffset(), IsScalar: true);
  Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "",
                                       NW: getGEPNoWrapFlags());
  State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
}
2765
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer. Emits: "<res> = vector-pointer<flags> <operands>".
void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
2776
// Cost a blend as the chain of selects it will be expanded to: N incoming
// values require N-1 selects.
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  // A blend will be expanded to a select VPInstruction, which will generate a
  // scalar select if only the first lane is used.
  if (vputils::onlyFirstLaneUsed(Def: this))
    VF = ElementCount::getFixed(MinVal: 1);

  Type *ResultTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
  Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.Types.getContext()), EC: VF);
  return (getNumIncomingValues() - 1) *
         Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy,
                                    VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind);
}
2790
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer. Emits "BLEND <res> =<flags>" followed by each incoming
// value, with "/<mask>" appended for masked incoming values.
void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  printFlags(O);
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      if (I != 0)
        O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      // In normalized form the first incoming value carries no mask.
      if (I == 0 && isNormalized())
        continue;
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
#endif
2815
2816void VPReductionRecipe::execute(VPTransformState &State) {
2817 assert(!State.Lane && "Reduction being replicated.");
2818 RecurKind Kind = getRecurrenceKind();
2819 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
2820 "In-loop AnyOf reductions aren't currently supported");
2821 // Propagate the fast-math flags carried by the underlying instruction.
2822 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2823 State.Builder.setFastMathFlags(getFastMathFlags());
2824 Value *NewVecOp = State.get(Def: getVecOp());
2825 if (VPValue *Cond = getCondOp()) {
2826 Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar());
2827 VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType());
2828 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2829
2830 Value *Start = getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlags());
2831 if (State.VF.isVector())
2832 Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start);
2833
2834 Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start);
2835 NewVecOp = Select;
2836 }
2837 Value *NewRed;
2838 Value *NextInChain;
2839 if (isOrdered()) {
2840 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
2841 if (State.VF.isVector())
2842 NewRed =
2843 createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain);
2844 else
2845 NewRed = State.Builder.CreateBinOp(
2846 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2847 LHS: PrevInChain, RHS: NewVecOp);
2848 PrevInChain = NewRed;
2849 NextInChain = NewRed;
2850 } else if (isPartialReduction()) {
2851 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
2852 "Unexpected partial reduction kind");
2853 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ false);
2854 NewRed = State.Builder.CreateIntrinsic(
2855 RetTy: PrevInChain->getType(),
2856 ID: Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
2857 : Intrinsic::vector_partial_reduce_fadd,
2858 Args: {PrevInChain, NewVecOp}, FMFSource: State.Builder.getFastMathFlags(),
2859 Name: "partial.reduce");
2860 PrevInChain = NewRed;
2861 NextInChain = NewRed;
2862 } else {
2863 assert(isInLoop() &&
2864 "The reduction must either be ordered, partial or in-loop");
2865 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
2866 NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind);
2867 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2868 NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain);
2869 else
2870 NextInChain = State.Builder.CreateBinOp(
2871 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2872 LHS: PrevInChain, RHS: NewRed);
2873 }
2874 State.set(Def: this, V: NextInChain, /*IsScalar*/ !isPartialReduction());
2875}
2876
// Emit an EVL (explicit-vector-length) predicated reduction: reduce the
// vector operand under Mask up to EVL elements and combine the result with
// the scalar chain value.
void VPReductionEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated.");

  auto &Builder = State.Builder;
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
  Builder.setFastMathFlags(getFastMathFlags());

  RecurKind Kind = getRecurrenceKind();
  Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true);
  Value *VecOp = State.get(Def: getVecOp());
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));

  // Without an explicit condition operand, use an all-true mask.
  Value *Mask;
  if (VPValue *CondOp = getCondOp())
    Mask = State.get(Def: CondOp);
  else
    Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());

  Value *NewRed;
  if (isOrdered()) {
    NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL);
  } else {
    NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL);
    // Combine the reduced value with the previous chain value.
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev);
    else
      NewRed = Builder.CreateBinOp(
          Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed,
          RHS: Prev);
  }
  State.set(Def: this, V: NewRed, /*IsScalar*/ true);
}
2910
// Cost this reduction via the matching TTI hook: partial-reduction cost
// (plus a select when conditional), min/max reduction cost, or arithmetic
// reduction cost, depending on the recurrence kind.
InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  RecurKind RdxKind = getRecurrenceKind();
  Type *ElementTy = Ctx.Types.inferScalarType(V: this);
  auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF));
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
  FastMathFlags FMFs = getFastMathFlags();
  // Fast-math flags are only passed along for floating-point reductions.
  std::optional<FastMathFlags> OptionalFMF =
      ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt;

  if (isPartialReduction()) {
    // A conditional partial reduction additionally emits a select on the
    // input vector.
    InstructionCost CondCost = 0;
    if (isConditional()) {
      CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
      auto *CondTy = cast<VectorType>(
          Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getCondOp()), EC: VF));
      CondCost = Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VectorTy,
                                            CondTy, VecPred: Pred, CostKind: Ctx.CostKind);
    }
    return CondCost + Ctx.TTI.getPartialReductionCost(
                          Opcode, InputTypeA: ElementTy, InputTypeB: ElementTy, AccumType: ElementTy, VF,
                          OpAExtend: TTI::PR_None, OpBExtend: TTI::PR_None, BinOp: {}, CostKind: Ctx.CostKind,
                          FMF: OptionalFMF);
  }

  // TODO: Support any-of reductions.
  assert(
      (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
       ForceTargetInstructionCost.getNumOccurrences() > 0) &&
      "Any-of reduction not implemented in VPlan-based cost model currently.");

  // Note that TTI should model the cost of moving result to the scalar register
  // and the BinOp cost in the getMinMaxReductionCost().
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) {
    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
    return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind);
  }

  // Note that TTI should model the cost of moving result to the scalar register
  // and the BinOp cost in the getArithmeticReductionCost().
  return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF,
                                            CostKind: Ctx.CostKind);
}
2954
// Bundle a list of recipes into one expression recipe. Contained recipes are
// detached from their parent block; recipes with users outside the
// expression are cloned for those external users; operands defined outside
// the expression become operands of the VPExpressionRecipe itself and are
// replaced inside the bundle by placeholder live-ins (LiveInPlaceholders).
VPExpressionRecipe::VPExpressionRecipe(
    ExpressionTypes ExpressionType,
    ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
    : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
      ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
  assert(!ExpressionRecipes.empty() && "Nothing to combine?");
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects");

  // Maintain a copy of the expression recipes as a set of users.
  SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
  for (auto *R : ExpressionRecipes)
    ExpressionRecipesAsSetOfUsers.insert(Ptr: R);

  // Recipes in the expression, except the last one, must only be used by
  // (other) recipes inside the expression. If there are other users, external
  // to the expression, use a clone of the recipe for external users.
  for (VPSingleDefRecipe *R : reverse(C&: ExpressionRecipes)) {
    if (R != ExpressionRecipes.back() &&
        any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
          return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U);
        })) {
      // There are users outside of the expression. Clone the recipe and use the
      // clone those external users.
      VPSingleDefRecipe *CopyForExtUsers = R->clone();
      R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers](
                                                  VPUser &U, unsigned) {
        return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U);
      });
      CopyForExtUsers->insertBefore(InsertPos: R);
    }
    // Detach the recipe from its block; it now lives inside the bundle.
    if (R->getParent())
      R->removeFromParent();
  }

  // Internalize all external operands to the expression recipes. To do so,
  // create new temporary VPValues for all operands defined by a recipe outside
  // the expression. The original operands are added as operands of the
  // VPExpressionRecipe itself.
  for (auto *R : ExpressionRecipes) {
    for (const auto &[Idx, Op] : enumerate(First: R->operands())) {
      auto *Def = Op->getDefiningRecipe();
      if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def))
        continue;
      addOperand(Operand: Op);
      LiveInPlaceholders.push_back(Elt: new VPSymbolicValue());
    }
  }

  // Replace each external operand with the first one created for it in
  // LiveInPlaceholders.
  for (auto *R : ExpressionRecipes)
    for (auto const &[LiveIn, Tmp] : zip(t: operands(), u&: LiveInPlaceholders))
      R->replaceUsesOfWith(From: LiveIn, To: Tmp);
}
3012
// Undo bundling: re-insert the contained recipes before this recipe, rewire
// the placeholder live-ins back to the real external operands, and forward
// all users of the expression to its final recipe.
void VPExpressionRecipe::decompose() {
  for (auto *R : ExpressionRecipes)
    // Since the list could contain duplicates, make sure the recipe hasn't
    // already been inserted.
    if (!R->getParent())
      R->insertBefore(InsertPos: this);

  // Reconnect placeholder live-ins to the corresponding external operands.
  for (const auto &[Idx, Op] : enumerate(First: operands()))
    LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op);

  // The last recipe defines the result of the whole expression.
  replaceAllUsesWith(New: ExpressionRecipes.back());
  ExpressionRecipes.clear();
}
3026
3027InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
3028 VPCostContext &Ctx) const {
3029 Type *RedTy = Ctx.Types.inferScalarType(V: this);
3030 auto *SrcVecTy = cast<VectorType>(
3031 Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF));
3032 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3033 Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes.back())->getRecurrenceKind());
3034 switch (ExpressionType) {
3035 case ExpressionTypes::ExtendedReduction: {
3036 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3037 Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes[1])->getRecurrenceKind());
3038 auto *ExtR = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]);
3039 auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3040
3041 if (RedR->isPartialReduction())
3042 return Ctx.TTI.getPartialReductionCost(
3043 Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)), InputTypeB: nullptr, AccumType: RedTy, VF,
3044 OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(CastOpc: ExtR->getOpcode()),
3045 OpBExtend: TargetTransformInfo::PR_None, BinOp: std::nullopt, CostKind: Ctx.CostKind,
3046 FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3047 : std::nullopt);
3048 else if (!RedTy->isFloatingPointTy())
3049 // TTI::getExtendedReductionCost only supports integer types.
3050 return Ctx.TTI.getExtendedReductionCost(
3051 Opcode, IsUnsigned: ExtR->getOpcode() == Instruction::ZExt, ResTy: RedTy, Ty: SrcVecTy,
3052 FMF: std::nullopt, CostKind: Ctx.CostKind);
3053 else
3054 return InstructionCost::getInvalid();
3055 }
3056 case ExpressionTypes::MulAccReduction:
3057 return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy,
3058 CostKind: Ctx.CostKind);
3059
3060 case ExpressionTypes::ExtNegatedMulAccReduction:
3061 assert(Opcode == Instruction::Add && "Unexpected opcode");
3062 Opcode = Instruction::Sub;
3063 [[fallthrough]];
3064 case ExpressionTypes::ExtMulAccReduction: {
3065 auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3066 if (RedR->isPartialReduction()) {
3067 auto *Ext0R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]);
3068 auto *Ext1R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[1]);
3069 auto *Mul = cast<VPWidenRecipe>(Val: ExpressionRecipes[2]);
3070 return Ctx.TTI.getPartialReductionCost(
3071 Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)),
3072 InputTypeB: Ctx.Types.inferScalarType(V: getOperand(N: 1)), AccumType: RedTy, VF,
3073 OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(
3074 CastOpc: Ext0R->getOpcode()),
3075 OpBExtend: TargetTransformInfo::getPartialReductionExtendKind(
3076 CastOpc: Ext1R->getOpcode()),
3077 BinOp: Mul->getOpcode(), CostKind: Ctx.CostKind,
3078 FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3079 : std::nullopt);
3080 }
3081 return Ctx.TTI.getMulAccReductionCost(
3082 IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() ==
3083 Instruction::ZExt,
3084 RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind);
3085 }
3086 }
3087 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
3088}
3089
3090bool VPExpressionRecipe::mayReadOrWriteMemory() const {
3091 return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) {
3092 return R->mayReadFromMemory() || R->mayWriteToMemory();
3093 });
3094}
3095
// Expression recipes are constructed only from side-effect-free recipes
// (asserted here and in the constructor), so the bundle itself has none.
bool VPExpressionRecipe::mayHaveSideEffects() const {
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects");
  return false;
}
3103
3104bool VPExpressionRecipe::isSingleScalar() const {
3105 // Cannot use vputils::isSingleScalar(), because all external operands
3106 // of the expression will be live-ins while bundled.
3107 auto *RR = dyn_cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3108 return RR && !RR->isPartialReduction();
3109}
3110
3111#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3112
// Debug printer. Emits "EXPRESSION <res> = ..." where the tail spells out the
// bundled reduction in a human-readable form, depending on the expression
// type (extended reduction, (ext-)mul-acc reduction, or its negated variant).
void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPRESSION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  // The last recipe is always the reduction that defines the result.
  auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());

  switch (ExpressionType) {
  case ExpressionTypes::ExtendedReduction: {
    // "<chain> + [partial.]reduce.<op> (<src> <ext> to <ty>[, <cond>])".
    getOperand(1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
    O << Instruction::getOpcodeName(Opcode) << " (";
    getOperand(0)->printAsOperand(O, SlotTracker);
    Red->printFlags(O);

    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
      << *Ext0->getResultType();
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << ")";
    break;
  }
  case ExpressionTypes::ExtNegatedMulAccReduction: {
    // "<chain> + [partial.]reduce.<op> (sub (0, mul(<a> ext, <b> ext)))".
    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
    O << Instruction::getOpcodeName(
             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
      << " (sub (0, mul";
    auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
    Mul->printFlags(O);
    O << "(";
    getOperand(0)->printAsOperand(O, SlotTracker);
    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
      << *Ext0->getResultType() << "), (";
    getOperand(1)->printAsOperand(O, SlotTracker);
    auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
    O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
      << *Ext1->getResultType() << ")";
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << "))";
    break;
  }
  case ExpressionTypes::MulAccReduction:
  case ExpressionTypes::ExtMulAccReduction: {
    // "<chain> + [partial.]reduce.<op> (mul <a>[ ext], <b>[ ext][, <cond>])".
    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
    O << Instruction::getOpcodeName(
             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
      << " (";
    O << "mul";
    bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
    // In the extended form, the multiply is the third recipe (after the two
    // extends); otherwise it is the first.
    auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
                                               : ExpressionRecipes[0]);
    Mul->printFlags(O);
    if (IsExtended)
      O << "(";
    getOperand(0)->printAsOperand(O, SlotTracker);
    if (IsExtended) {
      auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
      O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
        << *Ext0->getResultType() << "), (";
    } else {
      O << ", ";
    }
    getOperand(1)->printAsOperand(O, SlotTracker);
    if (IsExtended) {
      auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
      O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
        << *Ext1->getResultType() << ")";
    }
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << ")";
    break;
  }
  }
}
3200
// Debug printer. Emits: "REDUCE"/"PARTIAL-REDUCE <res> = <chain> +<flags>
// reduce.<op> (<vec-op>[, <cond>])".
void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  if (isPartialReduction())
    O << Indent << "PARTIAL-REDUCE ";
  else
    O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  printFlags(O);
  O << " reduce."
    << Instruction::getOpcodeName(
           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
    << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
}
3223
// Debug printer. Emits: "REDUCE <res> = <chain> +<flags> vp.reduce.<op>
// (<vec-op>, <evl>[, <cond>])".
void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  printFlags(O);
  O << " vp.reduce."
    << Instruction::getOpcodeName(
           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
    << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  O << ", ";
  getEVL()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
}
3245
3246#endif
3247
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
/// operands from \p RepRecipe instead of \p Instr's operands, so narrowed or
/// otherwise transformed operands are picked up from the VPlan.
static void scalarizeInstruction(const Instruction *Instr,
                                 VPReplicateRecipe *RepRecipe,
                                 const VPLane &Lane, VPTransformState &State) {
  assert((!Instr->getType()->isAggregateType() ||
          canVectorizeTy(Instr->getType())) &&
         "Expected vectorizable or non-aggregate type.");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  // Clone the underlying IR instruction; its operands are rewritten below.
  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy) {
    Cloned->setName(Instr->getName() + ".cloned");
    Type *ResultTy = State.TypeAnalysis.inferScalarType(V: RepRecipe);
    // The operands of the replicate recipe may have been narrowed, resulting in
    // a narrower result type. Update the type of the cloned instruction to the
    // correct type.
    if (ResultTy != Cloned->getType())
      Cloned->mutateType(Ty: ResultTy);
  }

  // Transfer IR flags and metadata tracked on the recipe to the clone.
  RepRecipe->applyFlags(I&: *Cloned);
  RepRecipe->applyMetadata(I&: *Cloned);

  // Compare predicates are tracked on the recipe; re-apply to the clone.
  if (RepRecipe->hasPredicate())
    cast<CmpInst>(Val: Cloned)->setPredicate(RepRecipe->getPredicate());

  if (auto DL = RepRecipe->getDebugLoc())
    State.setDebugLocFrom(DL);

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (const auto &I : enumerate(First: RepRecipe->operands())) {
    auto InputLane = Lane;
    VPValue *Operand = I.value();
    // Single-scalar operands only have a lane-0 value; read that for any lane.
    if (vputils::isSingleScalar(VPV: Operand))
      InputLane = VPLane::getFirstLane();
    Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Lane: InputLane));
  }

  // Place the cloned scalar in the new loop.
  State.Builder.Insert(I: Cloned);

  // Record the generated value for this (recipe, lane) pair.
  State.set(Def: RepRecipe, V: Cloned, Lane);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Val: Cloned))
    State.AC->registerAssumption(CI: II);

  assert(
      (RepRecipe->getRegion() ||
       !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
       all_of(RepRecipe->operands(),
              [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
      "Expected a recipe is either within a region or all of its operands "
      "are defined outside the vectorized region.");
}
3308
// Generate one scalar instance of the underlying instruction, either a single
// unrolled instance (no lane in State) or the instance for the current lane,
// optionally packing the per-lane results into a vector value.
void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();

  // No lane set: we are outside a replicate region, so only a single scalar
  // instance (lane 0) is generated.
  if (!State.Lane) {
    assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
                             "must have already been unrolled");
    scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: VPLane(0), State);
    return;
  }

  assert((State.VF.isScalar() || !isSingleScalar()) &&
         "uniform recipe shouldn't be predicated");
  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
  scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: *State.Lane, State);
  // Insert scalar instance packing it into a vector.
  if (State.VF.isVector() && shouldPack()) {
    // Lane 0 starts from poison; later lanes extend the vector built so far.
    Value *WideValue =
        State.Lane->isFirstLane()
            ? PoisonValue::get(T: toVectorizedTy(Ty: UI->getType(), EC: State.VF))
            : State.get(Def: this);
    State.set(Def: this, V: State.packScalarIntoVectorizedValue(Def: this, WideValue,
                                                      Lane: *State.Lane));
  }
}
3333
3334bool VPReplicateRecipe::shouldPack() const {
3335 // Find if the recipe is used by a widened recipe via an intervening
3336 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3337 return any_of(Range: users(), P: [](const VPUser *U) {
3338 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U))
3339 return !vputils::onlyScalarValuesUsed(Def: PredR);
3340 return false;
3341 });
3342}
3343
3344/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3345/// which the legacy cost model computes a SCEV expression when computing the
3346/// address cost. Computing SCEVs for VPValues is incomplete and returns
3347/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3348/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
3349static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3350 PredicatedScalarEvolution &PSE,
3351 const Loop *L) {
3352 const SCEV *Addr = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L);
3353 if (isa<SCEVCouldNotCompute>(Val: Addr))
3354 return Addr;
3355
3356 return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L) ? Addr : nullptr;
3357}
3358
/// Returns true if \p V is used as part of the address of another load or
/// store. Performs a forward worklist traversal over \p V's users, looking for
/// a memory recipe that consumes a visited value as its address operand.
static bool isUsedByLoadStoreAddress(const VPUser *V) {
  SmallPtrSet<const VPUser *, 4> Seen;
  SmallVector<const VPUser *> WorkList = {V};

  while (!WorkList.empty()) {
    // Only single-def recipes can feed an address; skip anything else and
    // anything already visited.
    auto *Cur = dyn_cast<VPSingleDefRecipe>(Val: WorkList.pop_back_val());
    if (!Cur || !Seen.insert(Ptr: Cur).second)
      continue;

    auto *Blend = dyn_cast<VPBlendRecipe>(Val: Cur);
    // Skip blends that use V only through a compare by checking if any incoming
    // value was already visited.
    if (Blend && none_of(Range: seq<unsigned>(Begin: 0, End: Blend->getNumIncomingValues()),
                         P: [&](unsigned I) {
                           return Seen.contains(
                               Ptr: Blend->getIncomingValue(Idx: I)->getDefiningRecipe());
                         }))
      continue;

    // Check whether any direct user consumes Cur as an address operand.
    for (VPUser *U : Cur->users()) {
      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(Val: U))
        if (InterleaveR->getAddr() == Cur)
          return true;
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U)) {
        // Replicated loads take the address as operand 0, stores as operand 1.
        if (RepR->getOpcode() == Instruction::Load &&
            RepR->getOperand(N: 0) == Cur)
          return true;
        if (RepR->getOpcode() == Instruction::Store &&
            RepR->getOperand(N: 1) == Cur)
          return true;
      }
      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(Val: U)) {
        if (MemR->getAddr() == Cur && MemR->isConsecutive())
          return true;
      }
    }

    // The legacy cost model only supports scalarization loads/stores with phi
    // addresses, if the phi is directly used as load/store address. Don't
    // traverse further for Blends.
    if (Blend)
      continue;

    append_range(C&: WorkList, R: Cur->users());
  }
  return false;
}
3408
3409/// Return true if \p R is a predicated load/store with a loop-invariant address
3410/// only masked by the header mask.
3411static bool isPredicatedUniformMemOpAfterTailFolding(const VPReplicateRecipe &R,
3412 const SCEV *PtrSCEV,
3413 VPCostContext &Ctx) {
3414 const VPRegionBlock *ParentRegion = R.getRegion();
3415 if (!ParentRegion || !ParentRegion->isReplicator() || !PtrSCEV ||
3416 !Ctx.PSE.getSE()->isLoopInvariant(S: PtrSCEV, L: Ctx.L))
3417 return false;
3418 auto *BOM =
3419 cast<VPBranchOnMaskRecipe>(Val: &ParentRegion->getEntryBasicBlock()->front());
3420 return vputils::isHeaderMask(V: BOM->getOperand(N: 0), Plan: *ParentRegion->getPlan());
3421}
3422
// Compute the cost of generating this recipe as VF scalar instances (or one
// instance when single-scalar). Falls through the switch to defer to the
// legacy cost model for cases it handles differently.
InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  Instruction *UI = cast<Instruction>(Val: getUnderlyingValue());
  // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform, avoid computing their cost multiple times for now.
  Ctx.SkipCostComputation.insert(Ptr: UI);

  // Replicating per lane needs a fixed lane count; scalable VFs are only
  // supported for single-scalar recipes.
  if (VF.isScalable() && !isSingleScalar())
    return InstructionCost::getInvalid();

  switch (UI->getOpcode()) {
  case Instruction::Alloca:
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode: Instruction::Mul, Ty: Ctx.Types.inferScalarType(V: this), CostKind: Ctx.CostKind);
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Call: {
    // The callee is the last operand; the preceding operands are arguments.
    auto *CalledFn =
        cast<Function>(Val: getOperand(N: getNumOperands() - 1)->getLiveInIRValue());

    SmallVector<const VPValue *> ArgOps(drop_end(RangeOrContainer: operands()));
    SmallVector<Type *, 4> Tys;
    for (const VPValue *ArgOp : ArgOps)
      Tys.push_back(Elt: Ctx.Types.inferScalarType(V: ArgOp));

    if (CalledFn->isIntrinsic())
      // Various pseudo-intrinsics with costs of 0 are scalarized instead of
      // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
      switch (CalledFn->getIntrinsicID()) {
      case Intrinsic::assume:
      case Intrinsic::lifetime_end:
      case Intrinsic::lifetime_start:
      case Intrinsic::sideeffect:
      case Intrinsic::pseudoprobe:
      case Intrinsic::experimental_noalias_scope_decl: {
        assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
                                    ElementCount::getFixed(1), Ctx) == 0 &&
               "scalarizing intrinsic should be free");
        return InstructionCost(0);
      }
      default:
        break;
      }

    Type *ResultTy = Ctx.Types.inferScalarType(V: this);
    InstructionCost ScalarCallCost =
        Ctx.TTI.getCallInstrCost(F: CalledFn, RetTy: ResultTy, Tys, CostKind: Ctx.CostKind);
    if (isSingleScalar()) {
      // For intrinsics, the dedicated intrinsic cost may be lower than the
      // generic call cost; take the cheaper of the two.
      if (CalledFn->isIntrinsic())
        ScalarCallCost = std::min(
            a: ScalarCallCost,
            b: getCostForIntrinsics(ID: CalledFn->getIntrinsicID(), Operands: ArgOps, R: *this,
                                 VF: ElementCount::getFixed(MinVal: 1), Ctx));
      return ScalarCallCost;
    }

    // Replicated call: one scalar call per lane plus the overhead of moving
    // arguments/results between vectors and scalars.
    return ScalarCallCost * VF.getFixedValue() +
           Ctx.getScalarizationOverhead(ResultTy, Operands: ArgOps, VF);
  }
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
  case Instruction::FCmp:
    // Simple arithmetic/compares: scalar cost times number of instances.
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
                                      Ctx) *
           (isSingleScalar() ? 1 : VF.getFixedValue());
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    InstructionCost ScalarCost =
        getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), Ctx);
    if (isSingleScalar())
      return ScalarCost;

    // If any of the operands is from a different replicate region and has its
    // cost skipped, it may have been forced to scalar. Fall back to legacy cost
    // model to avoid cost mis-match.
    if (any_of(Range: operands(), P: [&Ctx, VF](VPValue *Op) {
          auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: Op);
          if (!PredR)
            return false;
          return Ctx.skipCostComputation(
              UI: dyn_cast_or_null<Instruction>(
                  Val: PredR->getOperand(N: 0)->getUnderlyingValue()),
              IsVector: VF.isVector());
        }))
      break;

    ScalarCost = ScalarCost * VF.getFixedValue() +
                 Ctx.getScalarizationOverhead(ResultTy: Ctx.Types.inferScalarType(V: this),
                                              Operands: to_vector(Range: operands()), VF);
    // If the recipe is not predicated (i.e. not in a replicate region), return
    // the scalar cost. Otherwise handle predicated cost.
    if (!getRegion()->isReplicator())
      return ScalarCost;

    // Account for the phi nodes that we will create.
    ScalarCost += VF.getFixedValue() *
                  Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarCost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent());
    return ScalarCost;
  }
  case Instruction::Load:
  case Instruction::Store: {
    // For stores, operand 0 is the stored value and operand 1 the address;
    // loads take the address as operand 0.
    bool IsLoad = UI->getOpcode() == Instruction::Load;
    const VPValue *PtrOp = getOperand(N: !IsLoad);
    const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr: PtrOp, PSE&: Ctx.PSE, L: Ctx.L);
    // SCEVCouldNotCompute: defer to the legacy cost model (fall through).
    if (isa_and_nonnull<SCEVCouldNotCompute>(Val: PtrSCEV))
      break;

    Type *ValTy = Ctx.Types.inferScalarType(V: IsLoad ? this : getOperand(N: 0));
    Type *ScalarPtrTy = Ctx.Types.inferScalarType(V: PtrOp);
    const Align Alignment = getLoadStoreAlignment(I: UI);
    unsigned AS = cast<PointerType>(Val: ScalarPtrTy)->getAddressSpace();
    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: UI->getOperand(i: 0));
    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
    bool UsedByLoadStoreAddress =
        !PreferVectorizedAddressing && isUsedByLoadStoreAddress(V: this);
    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
        Opcode: UI->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, OpdInfo: OpInfo,
        I: UsedByLoadStoreAddress ? UI : nullptr);

    // Check if this is a predicated load/store with a loop-invariant address
    // only masked by the header mask. If so, return the uniform mem op cost.
    if (isPredicatedUniformMemOpAfterTailFolding(R: *this, PtrSCEV, Ctx)) {
      InstructionCost UniformCost =
          ScalarMemOpCost +
          Ctx.TTI.getAddressComputationCost(PtrTy: ScalarPtrTy, /*SE=*/nullptr,
                                            /*Ptr=*/nullptr, CostKind: Ctx.CostKind);
      auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
      if (IsLoad) {
        // A uniform load is a scalar load plus a broadcast of the result.
        return UniformCost +
               Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast,
                                      DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind);
      }

      // A uniform store of a loop-varying value also pays for extracting the
      // last lane from the stored vector.
      VPValue *StoredVal = getOperand(N: 0);
      if (!StoredVal->isDefinedOutsideLoopRegions())
        UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd(
            Opcode: Instruction::ExtractElement, Val: VectorTy, CostKind: Ctx.CostKind, Index: 0);
      return UniformCost;
    }

    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(Scalar: ScalarPtrTy, EC: VF);
    InstructionCost ScalarCost =
        ScalarMemOpCost +
        Ctx.TTI.getAddressComputationCost(
            PtrTy, SE: UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), Ptr: PtrSCEV,
            CostKind: Ctx.CostKind);
    if (isSingleScalar())
      return ScalarCost;

    SmallVector<const VPValue *> OpsToScalarize;
    Type *ResultTy = Type::getVoidTy(C&: PtrTy->getContext());
    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
    // don't assign scalarization overhead in general, if the target prefers
    // vectorized addressing or the loaded value is used as part of an address
    // of another load or store.
    if (!UsedByLoadStoreAddress) {
      bool EfficientVectorLoadStore =
          Ctx.TTI.supportsEfficientVectorElementLoadStore();
      if (!(IsLoad && !PreferVectorizedAddressing) &&
          !(!IsLoad && EfficientVectorLoadStore))
        append_range(C&: OpsToScalarize, R: operands());

      if (!EfficientVectorLoadStore)
        ResultTy = Ctx.Types.inferScalarType(V: this);
    }

    TTI::VectorInstrContext VIC =
        IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store;
    InstructionCost Cost =
        (ScalarCost * VF.getFixedValue()) +
        Ctx.getScalarizationOverhead(ResultTy, Operands: OpsToScalarize, VF, VIC, AlwaysIncludeReplicatingR: true);

    const VPRegionBlock *ParentRegion = getRegion();
    if (ParentRegion && ParentRegion->isReplicator()) {
      if (!PtrSCEV)
        break;
      // Scale by the probability of executing the predicated block and add
      // the branch plus the per-lane mask extraction.
      Cost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent());
      Cost += Ctx.TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind: Ctx.CostKind);

      auto *VecI1Ty = VectorType::get(
          ElementType: IntegerType::getInt1Ty(C&: Ctx.L->getHeader()->getContext()), EC: VF);
      Cost += Ctx.TTI.getScalarizationOverhead(
          Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
          /*Insert=*/false, /*Extract=*/true, CostKind: Ctx.CostKind);

      if (Ctx.useEmulatedMaskMemRefHack(R: this, VF)) {
        // Artificially setting to a high enough value to practically disable
        // vectorization with such operations.
        return 3000000;
      }
    }
    return Cost;
  }
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::PtrToAddr:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::Select:
  case Instruction::AddrSpaceCast: {
    // Casts and selects: scalar cost times number of instances.
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
                                      Ctx) *
           (isSingleScalar() ? 1 : VF.getFixedValue());
  }
  case Instruction::ExtractValue:
  case Instruction::InsertValue:
    return Ctx.TTI.getInsertExtractValueCost(Opcode: getOpcode(), CostKind: Ctx.CostKind);
  }

  // Anything not handled above falls back to the legacy cost model.
  return Ctx.getLegacyCost(UI, VF);
}
3666
3667#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3668void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3669 VPSlotTracker &SlotTracker) const {
3670 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3671
3672 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
3673 printAsOperand(O, SlotTracker);
3674 O << " = ";
3675 }
3676 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3677 O << "call";
3678 printFlags(O);
3679 O << "@" << CB->getCalledFunction()->getName() << "(";
3680 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
3681 O, [&O, &SlotTracker](VPValue *Op) {
3682 Op->printAsOperand(O, SlotTracker);
3683 });
3684 O << ")";
3685 } else {
3686 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
3687 printFlags(O);
3688 printOperands(O, SlotTracker);
3689 }
3690
3691 if (shouldPack())
3692 O << " (S->V)";
3693}
3694#endif
3695
// Emit a conditional branch on the current lane's mask value, replacing the
// placeholder unreachable terminator of the current block. The branch's
// successors are filled in later, once the target blocks exist.
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Branch on Mask works only on single instance.");

  VPValue *BlockInMask = getOperand(N: 0);
  Value *ConditionBit = State.get(Def: BlockInMask, Lane: *State.Lane);

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  // Create the branch with a placeholder successor first (CreateCondBr
  // requires a non-null true-successor), then null it out.
  auto CondBr =
      State.Builder.CreateCondBr(Cond: ConditionBit, True: State.CFG.PrevBB, False: nullptr);
  CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
  CurrentTerminator->eraseFromParent();
}
3712
3713InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
3714 VPCostContext &Ctx) const {
3715 // The legacy cost model doesn't assign costs to branches for individual
3716 // replicate regions. Match the current behavior in the VPlan cost model for
3717 // now.
3718 return 0;
3719}
3720
// Create a phi merging the predicated instruction's result with the value on
// the not-taken path, either for the packed vector value or for the current
// lane's scalar value.
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Lane: *State.Lane));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  if (State.hasVectorValue(Def: getOperand(N: 0))) {
    auto *VecI = cast<Instruction>(Val: State.get(Def: getOperand(N: 0)));
    assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
           "Packed operands must generate an insertelement or insertvalue");

    // If VectorI is a struct, it will be a sequence like:
    //   %1 = insertvalue %unmodified, %x, 0
    //   %2 = insertvalue %1, %y, 1
    //   %VectorI = insertvalue %2, %z, 2
    // To get the unmodified vector we need to look through the chain.
    if (auto *StructTy = dyn_cast<StructType>(Val: VecI->getType()))
      for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
        VecI = cast<InsertValueInst>(Val: VecI->getOperand(i: 0));

    PHINode *VPhi = State.Builder.CreatePHI(Ty: VecI->getType(), NumReservedValues: 2);
    VPhi->addIncoming(V: VecI->getOperand(i: 0), BB: PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(V: VecI, BB: PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(Def: this))
      State.reset(Def: this, V: VPhi);
    else
      State.set(Def: this, V: VPhi);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(Def: getOperand(N: 0), V: VPhi);
  } else {
    // Scalar path: only generate phis for lanes that are actually used.
    if (vputils::onlyFirstLaneUsed(Def: this) && !State.Lane->isFirstLane())
      return;

    // The not-taken edge contributes poison; the value is only meaningful when
    // the predicated block executed.
    Type *PredInstType = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
    PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2);
    Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()),
                     BB: PredicatingBB);
    Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB);
    if (State.hasScalarValue(Def: this, Lane: *State.Lane))
      State.reset(Def: this, V: Phi, Lane: *State.Lane);
    else
      State.set(Def: this, V: Phi, Lane: *State.Lane);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(Def: getOperand(N: 0), V: Phi, Lane: *State.Lane);
  }
}
3779
3780#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  // Format: PHI-PREDICATED-INSTRUCTION <def> = <operand>.
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
3788#endif
3789
// Compute the cost of a widened memory access: gather/scatter cost for
// non-consecutive accesses, masked intrinsic cost when masked, plain
// load/store cost otherwise.
InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
                                                 VPCostContext &Ctx) const {
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this)
                        ? Instruction::Load
                        : Instruction::Store;

  if (!Consecutive) {
    // TODO: Using the original IR may not be accurate.
    // Currently, ARM will use the underlying IR to calculate gather/scatter
    // instruction cost.
    assert(!Reverse &&
           "Inconsecutive memory access should not have the order.");

    const Value *Ptr = getLoadStorePointerOperand(V: &Ingredient);
    Type *PtrTy = Ptr->getType();

    // If the address value is uniform across all lanes, then the address can be
    // calculated with scalar type and broadcast.
    if (!vputils::isSingleScalar(VPV: getAddr()))
      PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);

    // Pick the matching gather/scatter intrinsic (VP variants for EVL recipes).
    unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_gather
                 : isa<VPWidenStoreRecipe>(Val: this) ? Intrinsic::masked_scatter
                 : isa<VPWidenLoadEVLRecipe>(Val: this) ? Intrinsic::vp_gather
                 : Intrinsic::vp_scatter;
    return Ctx.TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr,
                                             CostKind: Ctx.CostKind) +
           Ctx.TTI.getMemIntrinsicInstrCost(
               MICA: MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment,
                                           &Ingredient),
               CostKind: Ctx.CostKind);
  }

  InstructionCost Cost = 0;
  if (IsMasked) {
    unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_load
                                                : Intrinsic::masked_store;
    Cost += Ctx.TTI.getMemIntrinsicInstrCost(
        MICA: MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), CostKind: Ctx.CostKind);
  } else {
    // Unmasked consecutive access: a regular wide load/store. Operand info of
    // the loaded/stored value can refine the cost.
    TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
        V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) ? getOperand(N: 0)
                                                            : getOperand(N: 1));
    Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind,
                                    OpdInfo: OpInfo, I: &Ingredient);
  }
  return Cost;
}
3841
// Generate a wide load: a masked gather for non-consecutive accesses, a
// masked load when a mask is present, or a plain aligned load otherwise.
void VPWidenLoadRecipe::execute(VPTransformState &State) {
  Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
  auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  Value *Mask = nullptr;
  if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one (null) masks, as reverse
    // of a null all-one mask is a null mask.
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
  }

  // Gathers take a vector of pointers; consecutive loads a single scalar one.
  Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather);
  Value *NewLI;
  if (CreateGather) {
    NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr,
                                       Name: "wide.masked.gather");
  } else if (Mask) {
    NewLI =
        Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask,
                                 PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load");
  } else {
    NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load");
  }
  applyMetadata(I&: *cast<Instruction>(Val: NewLI));
  State.set(Def: this, V: NewLI);
}
3872
3873#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  // Format: WIDEN <def> = load <operands>.
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}
3881#endif
3882
3883/// Use all-true mask for reverse rather than actual mask, as it avoids a
3884/// dependence w/o affecting the result.
3885static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
3886 Value *EVL, const Twine &Name) {
3887 VectorType *ValTy = cast<VectorType>(Val: Operand->getType());
3888 Value *AllTrueMask =
3889 Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue());
3890 return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse,
3891 Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name);
3892}
3893
3894void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
3895 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
3896 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
3897 bool CreateGather = !isConsecutive();
3898
3899 auto &Builder = State.Builder;
3900 CallInst *NewLI;
3901 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
3902 Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather);
3903 Value *Mask = nullptr;
3904 if (VPValue *VPMask = getMask()) {
3905 Mask = State.get(Def: VPMask);
3906 if (isReverse())
3907 Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
3908 } else {
3909 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
3910 }
3911
3912 if (CreateGather) {
3913 NewLI =
3914 Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL},
3915 FMFSource: nullptr, Name: "wide.masked.gather");
3916 } else {
3917 NewLI = Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_load,
3918 Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load");
3919 }
3920 NewLI->addParamAttr(
3921 ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment));
3922 applyMetadata(I&: *NewLI);
3923 Instruction *Res = NewLI;
3924 State.set(Def: this, V: Res);
3925}
3926
// Cost of an EVL-based wide load; only the unmasked consecutive case is
// special-cased here, everything else defers to the base class.
InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes using EVL to replace the tail mask. But in the
  // legacy model, it will always calculate the cost of mask.
  // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
  // don't need to compare to the legacy cost model.
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MICA: MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
      CostKind: Ctx.CostKind);
}
3944
3945#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  // Format: WIDEN <def> = vp.load <operands>.
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}
3953#endif
3954
// Generate a wide store: a masked scatter for non-consecutive accesses, a
// masked store when a mask is present, or a plain aligned store otherwise.
void VPWidenStoreRecipe::execute(VPTransformState &State) {
  VPValue *StoredVPValue = getStoredValue();
  bool CreateScatter = !isConsecutive();

  auto &Builder = State.Builder;

  Value *Mask = nullptr;
  if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one (null) masks, as reverse
    // of a null all-one mask is a null mask.
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
  }

  Value *StoredVal = State.get(Def: StoredVPValue);
  // Scatters take a vector of pointers; consecutive stores a scalar one.
  Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter);
  Instruction *NewSI = nullptr;
  if (CreateScatter)
    NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask);
  else if (Mask)
    NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask);
  else
    NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment);
  applyMetadata(I&: *NewSI);
}
3981
3982#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  // Format: WIDEN store <operands> (no result to print).
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}
3988#endif
3989
// Generate a vp.store (consecutive) or vp.scatter (non-consecutive) whose
// active length is bounded by the explicit vector length (EVL) operand.
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = getStoredValue();
  bool CreateScatter = !isConsecutive();

  auto &Builder = State.Builder;

  CallInst *NewSI = nullptr;
  Value *StoredVal = State.get(Def: StoredValue);
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
  } else {
    // VP intrinsics require an explicit mask operand; use an all-true mask.
    Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
  }
  Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter);
  if (CreateScatter) {
    NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
                                    ID: Intrinsic::vp_scatter,
                                    Args: {StoredVal, Addr, Mask, EVL});
  } else {
    NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
                                    ID: Intrinsic::vp_store,
                                    Args: {StoredVal, Addr, Mask, EVL});
  }
  // VP intrinsics carry alignment as a parameter attribute on the pointer
  // argument (argument 1 for stores).
  NewSI->addParamAttr(
      ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment));
  applyMetadata(I&: *NewSI);
}
4021
// Cost of an EVL-based widened store. Consecutive, unmasked stores are costed
// as a vp.store intrinsic; everything else defers to the generic widened
// memory recipe cost.
InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes using EVL to replace the tail mask. But in the
  // legacy model, it will always calculate the cost of mask.
  // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
  // don't need to compare to the legacy cost model.
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MICA: MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
      CostKind: Ctx.CostKind);
}
4039
4040#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  // Print the recipe mnemonic followed by all operands (including the EVL).
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
4046#endif
4047
/// Cast vector \p V to \p DstVTy. Both types must have the same element count
/// and bit-identical element sizes. Emits a single bit-or-pointer cast when
/// the element types allow it; otherwise bridges pointer <-> floating point
/// through an intermediate integer vector of the same element width.
static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                     VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto VF = DstVTy->getElementCount();
  auto *SrcVecTy = cast<VectorType>(Val: V->getType());
  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy));
  auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy);
  return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy);
}
4077
/// Return a vector containing interleaved elements from multiple
/// smaller input vectors. All vectors in \p Vals must have the same type;
/// \p Vals must contain at least two vectors.
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
    return Builder.CreateVectorInterleave(Ops: Vals, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vecs: Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name);
}
4106
4107// Try to vectorize the interleave group that \p Instr belongs to.
4108//
4109// E.g. Translate following interleaved load group (factor = 3):
4110// for (i = 0; i < N; i+=3) {
4111// R = Pic[i]; // Member of index 0
4112// G = Pic[i+1]; // Member of index 1
4113// B = Pic[i+2]; // Member of index 2
4114// ... // do something to R, G, B
4115// }
4116// To:
4117// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4118// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4119// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4120// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4121//
4122// Or translate following interleaved store group (factor = 3):
4123// for (i = 0; i < N; i+=3) {
4124// ... do something to R, G, B
4125// Pic[i] = R; // Member of index 0
4126// Pic[i+1] = G; // Member of index 1
4127// Pic[i+2] = B; // Member of index 2
4128// }
4129// To:
4130// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4131// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4132// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4133// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4134// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Interleave group being replicated.");
  assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
         "Masking gaps for scalable vectors is not yet supported.");
  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(I: Instr);
  unsigned InterleaveFactor = Group->getFactor();
  auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor);

  VPValue *BlockInMask = getMask();
  VPValue *Addr = getAddr();
  Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));

  // Build the mask for the whole interleaved access from the block-in mask
  // and/or the gap mask. For scalable VFs the block mask is interleaved with
  // itself once per member; for fixed VFs it is replicated via shuffle and
  // AND-combined with the gap mask if one is needed.
  auto CreateGroupMask = [&BlockInMask, &State,
                          &InterleaveFactor](Value *MaskForGaps) -> Value * {
    if (State.VF.isScalable()) {
      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
      assert(InterleaveFactor <= 8 &&
             "Unsupported deinterleave factor for scalable vectors");
      auto *ResBlockInMask = State.get(Def: BlockInMask);
      SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
      return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask");
    }

    if (!BlockInMask)
      return MaskForGaps;

    Value *ResBlockInMask = State.get(Def: BlockInMask);
    Value *ShuffledMask = State.Builder.CreateShuffleVector(
        V: ResBlockInMask,
        Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()),
        Name: "interleaved.mask");
    return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And,
                                                   LHS: ShuffledMask, RHS: MaskForGaps)
                       : ShuffledMask;
  };

  const DataLayout &DL = Instr->getDataLayout();
  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Val: Instr)) {
    Value *MaskForGaps = nullptr;
    if (needsMaskForGaps()) {
      MaskForGaps =
          createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group);
      assert(MaskForGaps && "Mask for Gaps is required but it is null");
    }

    // Emit one wide (possibly masked) load covering all members.
    Instruction *NewLoad;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = CreateGroupMask(MaskForGaps);
      Value *PoisonVec = PoisonValue::get(T: VecTy);
      NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr,
                                               Alignment: Group->getAlign(), Mask: GroupMask,
                                               PassThru: PoisonVec, Name: "wide.masked.vec");
    } else
      NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr,
                                                Align: Group->getAlign(), Name: "wide.vec");
    applyMetadata(I&: *NewLoad);
    // TODO: Also manage existing metadata using VPIRMetadata.
    Group->addMetadata(NewInst: NewLoad);

    ArrayRef<VPRecipeValue *> VPDefs = definedValues();
    if (VecTy->isScalableTy()) {
      // Scalable vectors cannot use arbitrary shufflevectors (only splats),
      // so must use intrinsics to deinterleave.
      assert(InterleaveFactor <= 8 &&
             "Unsupported deinterleave factor for scalable vectors");
      NewLoad = State.Builder.CreateIntrinsic(
          ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor),
          Types: NewLoad->getType(), Args: NewLoad,
          /*FMFSource=*/nullptr, Name: "strided.vec");
    }

    // Extract the per-member sub-vector: extractvalue from the deinterleave
    // result (scalable) or a strided shuffle of the wide load (fixed VF).
    auto CreateStridedVector = [&InterleaveFactor, &State,
                                &NewLoad](unsigned Index) -> Value * {
      assert(Index < InterleaveFactor && "Illegal group index");
      if (State.VF.isScalable())
        return State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: Index);

      // For fixed length VF, use shuffle to extract the sub-vectors from the
      // wide load.
      auto StrideMask =
          createStrideMask(Start: Index, Stride: InterleaveFactor, VF: State.VF.getFixedValue());
      return State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask,
                                               Name: "strided.vec");
    };

    // J indexes defined VPValues and only advances for present members.
    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(Index: I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      Value *StridedVec = CreateStridedVector(I);

      // If this member has different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
        StridedVec =
            createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
      }

      if (Group->isReverse())
        StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse");

      State.set(Def: VPDefs[J], V: StridedVec);
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group);
  assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
         "Mismatch between NeedsMaskForGaps and MaskForGaps");
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  unsigned StoredIdx = 0;
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    assert((Group->getMember(i) || MaskForGaps) &&
           "Fail to get a member from an interleaved store group");
    Instruction *Member = Group->getMember(Index: i);

    // Skip the gaps in the group. Gap lanes are filled with poison and are
    // masked off by MaskForGaps.
    if (!Member) {
      Value *Undef = PoisonValue::get(T: SubVT);
      StoredVecs.push_back(Elt: Undef);
      continue;
    }

    Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
    ++StoredIdx;

    if (Group->isReverse())
      StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse");

    // If this member has different type, cast it to a unified type.

    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);

    StoredVecs.push_back(Elt: StoredVec);
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
  Instruction *NewStoreInstr;
  if (BlockInMask || MaskForGaps) {
    Value *GroupMask = CreateGroupMask(MaskForGaps);
    NewStoreInstr = State.Builder.CreateMaskedStore(
        Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask);
  } else
    NewStoreInstr =
        State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign());

  applyMetadata(I&: *NewStoreInstr);
  // TODO: Also manage existing metadata using VPIRMetadata.
  Group->addMetadata(NewInst: NewStoreInstr);
}
4303
4304#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  // Print one line per present member; OpIdx tracks only present members,
  // while i is the member's index within the group (gaps included).
  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      // Stored values follow the address operand (hence 1 + OpIdx).
      O << "\n" << Indent << " store ";
      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << " ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = load from index " << i;
    }
    ++OpIdx;
  }
}
4334#endif
4335
// Lower an interleave group under EVL tail-folding: a single vp.load /
// vp.store covers the whole group, with the EVL scaled by the interleave
// factor and the block mask interleaved to group width.
void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Interleave group being replicated.");
  assert(State.VF.isScalable() &&
         "Only support scalable VF for EVL tail-folding.");
  assert(!needsMaskForGaps() &&
         "Masking gaps for scalable vectors is not yet supported.");
  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(I: Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(InterleaveFactor <= 8 &&
         "Unsupported deinterleave/interleave factor for scalable vectors");
  ElementCount WideVF = State.VF * InterleaveFactor;
  auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: WideVF);

  VPValue *Addr = getAddr();
  Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
  // The wide access covers InterleaveFactor elements per original lane.
  Value *InterleaveEVL = State.Builder.CreateMul(
      LHS: EVL, RHS: ConstantInt::get(Ty: EVL->getType(), V: InterleaveFactor), Name: "interleave.evl",
      /* NUW= */ HasNUW: true, /* NSW= */ HasNSW: true);
  LLVMContext &Ctx = State.Builder.getContext();

  // Interleave the block mask with itself to group width, or use an
  // all-true splat (VP intrinsics require an explicit mask).
  Value *GroupMask = nullptr;
  if (VPValue *BlockInMask = getMask()) {
    SmallVector<Value *> Ops(InterleaveFactor, State.get(Def: BlockInMask));
    GroupMask = interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask");
  } else {
    GroupMask =
        State.Builder.CreateVectorSplat(EC: WideVF, V: State.Builder.getTrue());
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Val: Instr)) {
    CallInst *NewLoad = State.Builder.CreateIntrinsic(
        RetTy: VecTy, ID: Intrinsic::vp_load, Args: {ResAddr, GroupMask, InterleaveEVL}, FMFSource: nullptr,
        Name: "wide.vp.load");
    NewLoad->addParamAttr(ArgNo: 0,
                          Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign()));

    applyMetadata(I&: *NewLoad);
    // TODO: Also manage existing metadata using VPIRMetadata.
    Group->addMetadata(NewInst: NewLoad);

    // Scalable vectors cannot use arbitrary shufflevectors (only splats),
    // so must use intrinsics to deinterleave.
    NewLoad = State.Builder.CreateIntrinsic(
        ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor),
        Types: NewLoad->getType(), Args: NewLoad,
        /*FMFSource=*/nullptr, Name: "strided.vec");

    const DataLayout &DL = Instr->getDataLayout();
    // J indexes defined VPValues and only advances for present members.
    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(Index: I);
      // Skip the gaps in the group.
      if (!Member)
        continue;

      Value *StridedVec = State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: I);
      // If this member has different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
        StridedVec =
            createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
      }

      State.set(Def: getVPValue(I: J), V: StridedVec);
      ++J;
    }
    return;
  } // End for interleaved load.

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);
  // Vectorize the interleaved store group.
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  const DataLayout &DL = Instr->getDataLayout();
  for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
    Instruction *Member = Group->getMember(Index: I);
    // Skip the gaps in the group; gap lanes are poison.
    if (!Member) {
      StoredVecs.push_back(Elt: PoisonValue::get(T: SubVT));
      continue;
    }

    Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
    // If this member has different type, cast it to a unified type.
    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);

    StoredVecs.push_back(Elt: StoredVec);
    ++StoredIdx;
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
  CallInst *NewStore =
      State.Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::vp_store,
                                    Args: {IVec, ResAddr, GroupMask, InterleaveEVL});
  // The pointer is argument index 1 of vp.store.
  NewStore->addParamAttr(ArgNo: 1,
                         Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign()));

  applyMetadata(I&: *NewStore);
  // TODO: Also manage existing metadata using VPIRMetadata.
  Group->addMetadata(NewInst: NewStore);
}
4446
4447#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  O << ", ";
  getEVL()->printAsOperand(O, SlotTracker);
  if (VPValue *Mask = getMask()) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  // Print one line per present member; OpIdx tracks only present members,
  // while i is the member's index within the group (gaps included).
  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      // Stored values follow the address and EVL operands (hence 2 + OpIdx).
      O << "\n" << Indent << " vp.store ";
      getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << " ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = vp.load from index " << i;
    }
    ++OpIdx;
  }
}
4478#endif
4479
4480InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
4481 VPCostContext &Ctx) const {
4482 Instruction *InsertPos = getInsertPos();
4483 // Find the VPValue index of the interleave group. We need to skip gaps.
4484 unsigned InsertPosIdx = 0;
4485 for (unsigned Idx = 0; IG->getFactor(); ++Idx)
4486 if (auto *Member = IG->getMember(Index: Idx)) {
4487 if (Member == InsertPos)
4488 break;
4489 InsertPosIdx++;
4490 }
4491 Type *ValTy = Ctx.Types.inferScalarType(
4492 V: getNumDefinedValues() > 0 ? getVPValue(I: InsertPosIdx)
4493 : getStoredValues()[InsertPosIdx]);
4494 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
4495 unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
4496 ->getAddressSpace();
4497
4498 unsigned InterleaveFactor = IG->getFactor();
4499 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
4500
4501 // Holds the indices of existing members in the interleaved group.
4502 SmallVector<unsigned, 4> Indices;
4503 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4504 if (IG->getMember(Index: IF))
4505 Indices.push_back(Elt: IF);
4506
4507 // Calculate the cost of the whole interleaved group.
4508 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4509 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: IG->getFactor(), Indices,
4510 Alignment: IG->getAlign(), AddressSpace: AS, CostKind: Ctx.CostKind, UseMaskForCond: getMask(), UseMaskForGaps: NeedsMaskForGaps);
4511
4512 if (!IG->isReverse())
4513 return Cost;
4514
4515 return Cost + IG->getNumMembers() *
4516 Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse,
4517 DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind,
4518 Index: 0);
4519}
4520
4521#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                         VPSlotTracker &SlotTracker) const {
  // Print "EMIT <result> = CANONICAL-INDUCTION <operands>".
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
4529#endif
4530
4531bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
4532 return vputils::onlyScalarValuesUsed(Def: this) &&
4533 (!IsScalable || vputils::onlyFirstLaneUsed(Def: this));
4534}
4535
4536#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  assert((getNumOperands() == 3 || getNumOperands() == 5) &&
         "unexpected number of operands");
  // Print start, step and the third operand; the 5-operand form prints the
  // two extra operands as well.
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  if (getNumOperands() == 5) {
    O << ", ";
    getOperand(3)->printAsOperand(O, SlotTracker);
    O << ", ";
    getOperand(4)->printAsOperand(O, SlotTracker);
  }
}
4556
void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  // Print the SCEV expression this recipe expands.
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
4563#endif
4564
// Widen the canonical induction variable: broadcast the scalar IV and add a
// step vector so each lane holds its own iteration index.
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  // Insert before the terminator so the IV is available to all users in the
  // block.
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast");
  // The step accounts for the unroll part this recipe belongs to.
  Value *VStep = Builder.CreateElementCount(
      Ty: STy, EC: VF.multiplyCoefficientBy(RHS: getUnrollPart(U: *this)));
  if (VF.isVector()) {
    VStep = Builder.CreateVectorSplat(EC: VF, V: VStep);
    VStep =
        Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType()));
  }
  Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv");
  State.set(Def: this, V: CanonicalVectorIV);
}
4583
4584#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  // Print "EMIT <result> = WIDEN-CANONICAL-INDUCTION <operands>".
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
4592#endif
4593
// Create the header phi for a first-order recurrence. For vector VFs the
// initial value is inserted into the last lane of a poison vector, matching
// the lane the recurrence shuffles from on the next iteration.
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF);

  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(Ty: IdxTy, V: 1);
    // The insert-element must live in the preheader, not the loop body.
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    // Last lane index is runtime-VF - 1 (runtime value for scalable VFs).
    auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
    auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One);
    VectorInit = Builder.CreateInsertElement(
        Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur");
  Phi->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt());
  Phi->addIncoming(V: VectorInit, BB: VectorPH);
  State.set(Def: this, V: Phi);
}
4622
4623InstructionCost
4624VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
4625 VPCostContext &Ctx) const {
4626 if (VF.isScalar())
4627 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
4628
4629 return 0;
4630}
4631
4632#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  // Print "FIRST-ORDER-RECURRENCE-PHI <result> = phi <operands>".
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
4640#endif
4641
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
  // In-loop reductions keep a scalar accumulator; otherwise the phi is wide.
  bool ScalarPHI = State.VF.isScalar() || isInLoop();
  Value *StartV = State.get(Def: StartVPV, IsScalar: ScalarPHI);
  Type *VecTy = StartV->getType();

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  auto *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi");
  Phi->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt());
  State.set(Def: this, V: Phi, IsScalar: isInLoop());

  // The backedge incoming value is filled in later (stage #2); only the
  // preheader edge is added here.
  Phi->addIncoming(V: StartV, BB: VectorPH);
}
4666
4667#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi";
  printFlags(O);
  printOperands(O, SlotTracker);
  // Note when the reduction phi operates at a scaled-down VF.
  if (getVFScaleFactor() > 1)
    O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
}
4679#endif
4680
4681void VPWidenPHIRecipe::execute(VPTransformState &State) {
4682 Value *Op0 = State.get(Def: getOperand(N: 0));
4683 Type *VecTy = Op0->getType();
4684 Instruction *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name);
4685 State.set(Def: this, V: VecPhi);
4686}
4687
4688InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
4689 VPCostContext &Ctx) const {
4690 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
4691}
4692
4693#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  // Print "WIDEN-PHI <result> = phi <incoming values>".
  O << Indent << "WIDEN-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printPhiOperands(O, SlotTracker);
}
4702#endif
4703
// Create the header phi carrying the active-lane mask; the preheader edge
// gets the start mask, the backedge is wired up later.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
  Value *StartMask = State.get(Def: getOperand(N: 0));
  PHINode *Phi =
      State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask");
  Phi->addIncoming(V: StartMask, BB: VectorPH);
  State.set(Def: this, V: Phi);
}
4713
4714#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  // Print "ACTIVE-LANE-MASK-PHI <result> = phi <operands>".
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
4723#endif
4724
4725#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCurrentIterationPHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  // Print "CURRENT-ITERATION-PHI <result> = phi <operands>".
  O << Indent << "CURRENT-ITERATION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
4734#endif
4735