| 1 | //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file contains implementations for different VPlan recipes. |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "LoopVectorizationPlanner.h" |
| 15 | #include "VPlan.h" |
| 16 | #include "VPlanAnalysis.h" |
| 17 | #include "VPlanHelpers.h" |
| 18 | #include "VPlanPatternMatch.h" |
| 19 | #include "VPlanUtils.h" |
| 20 | #include "llvm/ADT/STLExtras.h" |
| 21 | #include "llvm/ADT/SmallVector.h" |
| 22 | #include "llvm/ADT/Twine.h" |
| 23 | #include "llvm/Analysis/AssumptionCache.h" |
| 24 | #include "llvm/Analysis/IVDescriptors.h" |
| 25 | #include "llvm/Analysis/LoopInfo.h" |
| 26 | #include "llvm/IR/BasicBlock.h" |
| 27 | #include "llvm/IR/IRBuilder.h" |
| 28 | #include "llvm/IR/Instruction.h" |
| 29 | #include "llvm/IR/Instructions.h" |
| 30 | #include "llvm/IR/Intrinsics.h" |
| 31 | #include "llvm/IR/Type.h" |
| 32 | #include "llvm/IR/Value.h" |
| 33 | #include "llvm/Support/Casting.h" |
| 34 | #include "llvm/Support/CommandLine.h" |
| 35 | #include "llvm/Support/Debug.h" |
| 36 | #include "llvm/Support/raw_ostream.h" |
| 37 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| 38 | #include "llvm/Transforms/Utils/LoopUtils.h" |
| 39 | #include "llvm/Transforms/Utils/LoopVersioning.h" |
| 40 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
| 41 | #include <cassert> |
| 42 | |
| 43 | using namespace llvm; |
| 44 | |
| 45 | using VectorParts = SmallVector<Value *, 2>; |
| 46 | |
| 47 | #define LV_NAME "loop-vectorize" |
| 48 | #define DEBUG_TYPE LV_NAME |
| 49 | |
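|  | // Conservatively determine whether executing a recipe may write to memory. |
|  | // Recipes without an explicit case fall through to the default and are |
|  | // treated as possibly writing, trading precision for correctness. |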
| 50 | bool VPRecipeBase::mayWriteToMemory() const { |
| 51 | switch (getVPDefID()) { |
| 52 | case VPExpressionSC: |
| 53 | return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory(); |
| 54 | case VPInstructionSC: |
| 55 | return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory(); |
| 56 | case VPInterleaveSC: |
| 57 | return cast<VPInterleaveRecipe>(Val: this)->getNumStoreOperands() > 0; |
| 58 | case VPWidenStoreEVLSC: |
| 59 | case VPWidenStoreSC: |
| 60 | return true; |
| 61 | case VPReplicateSC: |
| 62 | return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()) |
| 63 | ->mayWriteToMemory(); |
| 64 | case VPWidenCallSC: |
| 65 | return !cast<VPWidenCallRecipe>(Val: this) |
| 66 | ->getCalledScalarFunction() |
| 67 | ->onlyReadsMemory(); |
| 68 | case VPWidenIntrinsicSC: |
| 69 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory(); |
| 70 | case VPBranchOnMaskSC: |
| 71 | case VPFirstOrderRecurrencePHISC: |
| 72 | case VPScalarIVStepsSC: |
| 73 | case VPPredInstPHISC: |
| 74 | return false; |
| 75 | case VPBlendSC: |
| 76 | case VPReductionEVLSC: |
| 77 | case VPReductionSC: |
| 78 | case VPVectorPointerSC: |
| 79 | case VPWidenCanonicalIVSC: |
| 80 | case VPWidenCastSC: |
| 81 | case VPWidenGEPSC: |
| 82 | case VPWidenIntOrFpInductionSC: |
| 83 | case VPWidenLoadEVLSC: |
| 84 | case VPWidenLoadSC: |
| 85 | case VPWidenPHISC: |
| 86 | case VPWidenSC: |
| 87 | case VPWidenSelectSC: { |
| 88 | const Instruction *I = |
| 89 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 90 | (void)I; |
| 91 | assert((!I || !I->mayWriteToMemory()) && |
| 92 | "underlying instruction may write to memory" ); |
| 93 | return false; |
| 94 | } |
| 95 | default: |
| 96 | return true; |
| 97 | } |
| 98 | } |
| 99 | |
| 100 | bool VPRecipeBase::mayReadFromMemory() const { |
| 101 | switch (getVPDefID()) { |
| 102 | case VPExpressionSC: |
| 103 | return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory(); |
| 104 | case VPInstructionSC: |
| 105 | return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory(); |
| 106 | case VPWidenLoadEVLSC: |
| 107 | case VPWidenLoadSC: |
| 108 | return true; |
| 109 | case VPReplicateSC: |
| 110 | return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()) |
| 111 | ->mayReadFromMemory(); |
| 112 | case VPWidenCallSC: |
| 113 | return !cast<VPWidenCallRecipe>(Val: this) |
| 114 | ->getCalledScalarFunction() |
| 115 | ->onlyWritesMemory(); |
| 116 | case VPWidenIntrinsicSC: |
| 117 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory(); |
| 118 | case VPBranchOnMaskSC: |
| 119 | case VPFirstOrderRecurrencePHISC: |
| 120 | case VPPredInstPHISC: |
| 121 | case VPScalarIVStepsSC: |
| 122 | case VPWidenStoreEVLSC: |
| 123 | case VPWidenStoreSC: |
| 124 | return false; |
| 125 | case VPBlendSC: |
| 126 | case VPReductionEVLSC: |
| 127 | case VPReductionSC: |
| 128 | case VPVectorPointerSC: |
| 129 | case VPWidenCanonicalIVSC: |
| 130 | case VPWidenCastSC: |
| 131 | case VPWidenGEPSC: |
| 132 | case VPWidenIntOrFpInductionSC: |
| 133 | case VPWidenPHISC: |
| 134 | case VPWidenSC: |
| 135 | case VPWidenSelectSC: { |
| 136 | const Instruction *I = |
| 137 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 138 | (void)I; |
| 139 | assert((!I || !I->mayReadFromMemory()) && |
| 140 | "underlying instruction may read from memory" ); |
| 141 | return false; |
| 142 | } |
| 143 | default: |
| 144 | return true; |
| 145 | } |
| 146 | } |
| 147 | |
| 148 | bool VPRecipeBase::mayHaveSideEffects() const { |
| 149 | switch (getVPDefID()) { |
| 150 | case VPExpressionSC: |
| 151 | return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects(); |
| 152 | case VPDerivedIVSC: |
| 153 | case VPFirstOrderRecurrencePHISC: |
| 154 | case VPPredInstPHISC: |
| 155 | case VPVectorEndPointerSC: |
| 156 | return false; |
| 157 | case VPInstructionSC: |
| 158 | return mayWriteToMemory(); |
| 159 | case VPWidenCallSC: { |
| 160 | Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction(); |
| 161 | return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); |
| 162 | } |
| 163 | case VPWidenIntrinsicSC: |
| 164 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects(); |
| 165 | case VPBlendSC: |
| 166 | case VPReductionEVLSC: |
| 167 | case VPReductionSC: |
| 168 | case VPScalarIVStepsSC: |
| 169 | case VPVectorPointerSC: |
| 170 | case VPWidenCanonicalIVSC: |
| 171 | case VPWidenCastSC: |
| 172 | case VPWidenGEPSC: |
| 173 | case VPWidenIntOrFpInductionSC: |
| 174 | case VPWidenPHISC: |
| 175 | case VPWidenPointerInductionSC: |
| 176 | case VPWidenSC: |
| 177 | case VPWidenSelectSC: { |
| 178 | const Instruction *I = |
| 179 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 180 | (void)I; |
| 181 | assert((!I || !I->mayHaveSideEffects()) && |
| 182 | "underlying instruction has side-effects" ); |
| 183 | return false; |
| 184 | } |
| 185 | case VPInterleaveSC: |
| 186 | return mayWriteToMemory(); |
| 187 | case VPWidenLoadEVLSC: |
| 188 | case VPWidenLoadSC: |
| 189 | case VPWidenStoreEVLSC: |
| 190 | case VPWidenStoreSC: |
| 191 | assert( |
| 192 | cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == |
| 193 | mayWriteToMemory() && |
| 194 | "mayHaveSideffects result for ingredient differs from this " |
| 195 | "implementation" ); |
| 196 | return mayWriteToMemory(); |
| 197 | case VPReplicateSC: { |
| 198 | auto *R = cast<VPReplicateRecipe>(Val: this); |
| 199 | return R->getUnderlyingInstr()->mayHaveSideEffects(); |
| 200 | } |
| 201 | default: |
| 202 | return true; |
| 203 | } |
| 204 | } |
| 205 | |
| 206 | void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { |
| 207 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 208 | assert(InsertPos->getParent() && |
| 209 | "Insertion position not in any VPBasicBlock" ); |
| 210 | InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator()); |
| 211 | } |
| 212 | |
| 213 | void VPRecipeBase::insertBefore(VPBasicBlock &BB, |
| 214 | iplist<VPRecipeBase>::iterator I) { |
| 215 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 216 | assert(I == BB.end() || I->getParent() == &BB); |
| 217 | BB.insert(Recipe: this, InsertPt: I); |
| 218 | } |
| 219 | |
| 220 | void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { |
| 221 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 222 | assert(InsertPos->getParent() && |
| 223 | "Insertion position not in any VPBasicBlock" ); |
| 224 | InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator())); |
| 225 | } |
| 226 | |
| 227 | void VPRecipeBase::removeFromParent() { |
| 228 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
| 229 | getParent()->getRecipeList().remove(IT: getIterator()); |
| 230 | Parent = nullptr; |
| 231 | } |
| 232 | |
| 233 | iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { |
| 234 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
| 235 | return getParent()->getRecipeList().erase(where: getIterator()); |
| 236 | } |
| 237 | |
| 238 | void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { |
| 239 | removeFromParent(); |
| 240 | insertAfter(InsertPos); |
| 241 | } |
| 242 | |
| 243 | void VPRecipeBase::moveBefore(VPBasicBlock &BB, |
| 244 | iplist<VPRecipeBase>::iterator I) { |
| 245 | removeFromParent(); |
| 246 | insertBefore(BB, I); |
| 247 | } |
| 248 | |
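|  | // Compute the cost of a recipe for the given VF, skipping recipes whose |
|  | // underlying instruction the cost context says to skip and honoring |
|  | // -force-target-instruction-cost when it is set. |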
| 249 | InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { |
| 250 | // Get the underlying instruction for the recipe, if there is one. It is used |
| 251 | // to |
| 252 | // * decide if cost computation should be skipped for this recipe, |
| 253 | // * apply forced target instruction cost. |
| 254 | Instruction *UI = nullptr; |
| 255 | if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this)) |
| 256 | UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue()); |
| 257 | else if (auto *IG = dyn_cast<VPInterleaveRecipe>(Val: this)) |
| 258 | UI = IG->getInsertPos(); |
| 259 | else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this)) |
| 260 | UI = &WidenMem->getIngredient(); |
| 261 | |
| 262 | InstructionCost RecipeCost; |
| 263 | if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) { |
| 264 | RecipeCost = 0; |
| 265 | } else { |
| 266 | RecipeCost = computeCost(VF, Ctx); |
| 267 | if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 && |
| 268 | RecipeCost.isValid()) |
| 269 | RecipeCost = InstructionCost(ForceTargetInstructionCost); |
| 270 | } |
| 271 | |
| 272 | LLVM_DEBUG({ |
| 273 | dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": " ; |
| 274 | dump(); |
| 275 | }); |
| 276 | return RecipeCost; |
| 277 | } |
| 278 | |
| 279 | InstructionCost VPRecipeBase::computeCost(ElementCount VF, |
| 280 | VPCostContext &Ctx) const { |
| 281 | llvm_unreachable("subclasses should implement computeCost" ); |
| 282 | } |
| 283 | |
| 284 | bool VPRecipeBase::isPhi() const { |
| 285 | return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) || |
| 286 | (isa<VPInstruction>(Val: this) && |
| 287 | cast<VPInstruction>(Val: this)->getOpcode() == Instruction::PHI) || |
| 288 | isa<VPIRPhi>(Val: this); |
| 289 | } |
| 290 | |
| 291 | bool VPRecipeBase::isScalarCast() const { |
| 292 | auto *VPI = dyn_cast<VPInstruction>(Val: this); |
| 293 | return VPI && Instruction::isCast(Opcode: VPI->getOpcode()); |
| 294 | } |
| 295 | |
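|  | // Gather the accumulator type, input types, extend kinds and inner binary |
|  | // opcode feeding the partial reduction, so the target hook |
|  | // getPartialReductionCost can be queried with the full shape of the pattern. |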
| 296 | InstructionCost |
| 297 | VPPartialReductionRecipe::computeCost(ElementCount VF, |
| 298 | VPCostContext &Ctx) const { |
| 299 | std::optional<unsigned> Opcode; |
| 300 | VPValue *Op = getOperand(N: 0); |
| 301 | VPRecipeBase *OpR = Op->getDefiningRecipe(); |
| 302 | |
| 303 | // If the partial reduction is predicated, a select will be operand 0 |
| 304 | using namespace llvm::VPlanPatternMatch; |
| 305 | if (match(V: getOperand(N: 1), P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: Op), Op2: m_VPValue()))) { |
| 306 | OpR = Op->getDefiningRecipe(); |
| 307 | } |
| 308 | |
| 309 | Type *InputTypeA = nullptr, *InputTypeB = nullptr; |
| 310 | TTI::PartialReductionExtendKind ExtAType = TTI::PR_None, |
| 311 | ExtBType = TTI::PR_None; |
| 312 | |
| 313 | auto GetExtendKind = [](VPRecipeBase *R) { |
| 314 | if (!R) |
| 315 | return TTI::PR_None; |
| 316 | auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(Val: R); |
| 317 | if (!WidenCastR) |
| 318 | return TTI::PR_None; |
| 319 | if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) |
| 320 | return TTI::PR_ZeroExtend; |
| 321 | if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) |
| 322 | return TTI::PR_SignExtend; |
| 323 | return TTI::PR_None; |
| 324 | }; |
| 325 | |
| 326 | // Pick out the opcode and type/extension information from a widen recipe, |
| 327 | // looking through a negation (sub 0, x) feeding the reduction if present. |
| 328 | auto HandleWiden = [&](VPWidenRecipe *Widen) { |
| 329 | if (match(V: Widen, |
| 330 | P: m_Binary<Instruction::Sub>(Op0: m_SpecificInt(V: 0), Op1: m_VPValue(V&: Op)))) { |
| 331 | Widen = dyn_cast<VPWidenRecipe>(Val: Op->getDefiningRecipe()); |
| 332 | } |
| 333 | Opcode = Widen->getOpcode(); |
| 334 | VPRecipeBase *ExtAR = Widen->getOperand(N: 0)->getDefiningRecipe(); |
| 335 | VPRecipeBase *ExtBR = Widen->getOperand(N: 1)->getDefiningRecipe(); |
| 336 | InputTypeA = Ctx.Types.inferScalarType(V: ExtAR ? ExtAR->getOperand(N: 0) |
| 337 | : Widen->getOperand(N: 0)); |
| 338 | InputTypeB = Ctx.Types.inferScalarType(V: ExtBR ? ExtBR->getOperand(N: 0) |
| 339 | : Widen->getOperand(N: 1)); |
| 340 | ExtAType = GetExtendKind(ExtAR); |
| 341 | ExtBType = GetExtendKind(ExtBR); |
| 342 | }; |
| 343 | |
| 344 | if (isa<VPWidenCastRecipe>(Val: OpR)) { |
| 345 | InputTypeA = Ctx.Types.inferScalarType(V: OpR->getOperand(N: 0)); |
| 346 | ExtAType = GetExtendKind(OpR); |
| 347 | } else if (isa<VPReductionPHIRecipe>(Val: OpR)) { |
| 348 | auto RedPhiOp1R = getOperand(N: 1)->getDefiningRecipe(); |
| 349 | if (isa<VPWidenCastRecipe>(Val: RedPhiOp1R)) { |
| 350 | InputTypeA = Ctx.Types.inferScalarType(V: RedPhiOp1R->getOperand(N: 0)); |
| 351 | ExtAType = GetExtendKind(RedPhiOp1R); |
| 352 | } else if (auto Widen = dyn_cast<VPWidenRecipe>(Val: RedPhiOp1R)) |
| 353 | HandleWiden(Widen); |
| 354 | } else if (auto Widen = dyn_cast<VPWidenRecipe>(Val: OpR)) { |
| 355 | HandleWiden(Widen); |
| 356 | } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(Val: OpR)) { |
| 357 | return Reduction->computeCost(VF, Ctx); |
| 358 | } |
| 359 | auto *PhiType = Ctx.Types.inferScalarType(V: getOperand(N: 1)); |
| 360 | return Ctx.TTI.getPartialReductionCost(Opcode: getOpcode(), InputTypeA, InputTypeB, |
| 361 | AccumType: PhiType, VF, OpAExtend: ExtAType, OpBExtend: ExtBType, |
| 362 | BinOp: Opcode, CostKind: Ctx.CostKind); |
| 363 | } |
| 364 | |
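|  | // A partial reduction adds the wider bin-op vector into the narrower |
|  | // accumulator, e.g. for a <4 x i32> accumulator and a <16 x i32> input this |
|  | // emits roughly: |
|  | //   %partial.reduce = call <4 x i32> |
|  | //       @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32( |
|  | //       <4 x i32> %phi, <16 x i32> %bin.op) |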
| 365 | void VPPartialReductionRecipe::execute(VPTransformState &State) { |
| 366 | auto &Builder = State.Builder; |
| 367 | |
| 368 | assert(getOpcode() == Instruction::Add && |
| 369 | "Unhandled partial reduction opcode" ); |
| 370 | |
| 371 | Value *BinOpVal = State.get(Def: getOperand(N: 1)); |
| 372 | Value *PhiVal = State.get(Def: getOperand(N: 0)); |
| 373 | assert(PhiVal && BinOpVal && "Phi and Mul must be set" ); |
| 374 | |
| 375 | Type *RetTy = PhiVal->getType(); |
| 376 | |
| 377 | CallInst *V = Builder.CreateIntrinsic( |
| 378 | RetTy, ID: Intrinsic::experimental_vector_partial_reduce_add, |
| 379 | Args: {PhiVal, BinOpVal}, FMFSource: nullptr, Name: "partial.reduce" ); |
| 380 | |
| 381 | State.set(Def: this, V); |
| 382 | } |
| 383 | |
| 384 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 385 | void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, |
| 386 | VPSlotTracker &SlotTracker) const { |
| 387 | O << Indent << "PARTIAL-REDUCE " ; |
| 388 | printAsOperand(O, SlotTracker); |
| 389 | O << " = " << Instruction::getOpcodeName(getOpcode()) << " " ; |
| 390 | printOperands(O, SlotTracker); |
| 391 | } |
| 392 | #endif |
| 393 | |
| 394 | FastMathFlags VPIRFlags::getFastMathFlags() const { |
| 395 | assert(OpType == OperationType::FPMathOp && |
| 396 | "recipe doesn't have fast math flags" ); |
| 397 | FastMathFlags Res; |
| 398 | Res.setAllowReassoc(FMFs.AllowReassoc); |
| 399 | Res.setNoNaNs(FMFs.NoNaNs); |
| 400 | Res.setNoInfs(FMFs.NoInfs); |
| 401 | Res.setNoSignedZeros(FMFs.NoSignedZeros); |
| 402 | Res.setAllowReciprocal(FMFs.AllowReciprocal); |
| 403 | Res.setAllowContract(FMFs.AllowContract); |
| 404 | Res.setApproxFunc(FMFs.ApproxFunc); |
| 405 | return Res; |
| 406 | } |
| 407 | |
| 408 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 409 | void VPSingleDefRecipe::dump() const { VPDef::dump(); } |
| 410 | #endif |
| 411 | |
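|  | // Accessors for the optional unroll-part operand that unrolling appends to a |
|  | // recipe; recipes without the extra operand belong to part 0. |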
| 412 | template <unsigned PartOpIdx> |
| 413 | VPValue * |
| 414 | VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const { |
| 415 | if (U.getNumOperands() == PartOpIdx + 1) |
| 416 | return U.getOperand(N: PartOpIdx); |
| 417 | return nullptr; |
| 418 | } |
| 419 | |
| 420 | template <unsigned PartOpIdx> |
| 421 | unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const { |
| 422 | if (auto *UnrollPartOp = getUnrollPartOperand(U)) |
| 423 | return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue(); |
| 424 | return 0; |
| 425 | } |
| 426 | |
| 427 | namespace llvm { |
| 428 | template class VPUnrollPartAccessor<2>; |
| 429 | template class VPUnrollPartAccessor<3>; |
| 430 | } // namespace llvm |
| 431 | |
| 432 | VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, |
| 433 | const VPIRFlags &Flags, DebugLoc DL, |
| 434 | const Twine &Name) |
| 435 | : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL), |
| 436 | VPIRMetadata(), Opcode(Opcode), Name(Name.str()) { |
| 437 | assert(flagsValidForOpcode(getOpcode()) && |
| 438 | "Set flags not supported for the provided opcode" ); |
| 439 | assert((getNumOperandsForOpcode(Opcode) == -1u || |
| 440 | getNumOperandsForOpcode(Opcode) == getNumOperands()) && |
| 441 | "number of operands does not match opcode" ); |
| 442 | } |
| 443 | |
| 444 | #ifndef NDEBUG |
| 445 | unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { |
| 446 | if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode)) |
| 447 | return 1; |
| 448 | |
| 449 | if (Instruction::isBinaryOp(Opcode)) |
| 450 | return 2; |
| 451 | |
| 452 | switch (Opcode) { |
| 453 | case VPInstruction::StepVector: |
| 454 | return 0; |
| 455 | case Instruction::Alloca: |
| 456 | case Instruction::ExtractValue: |
| 457 | case Instruction::Freeze: |
| 458 | case Instruction::Load: |
| 459 | case VPInstruction::AnyOf: |
| 460 | case VPInstruction::BranchOnCond: |
| 461 | case VPInstruction::CalculateTripCountMinusVF: |
| 462 | case VPInstruction::CanonicalIVIncrementForPart: |
| 463 | case VPInstruction::ExplicitVectorLength: |
| 464 | case VPInstruction::ExtractLastElement: |
| 465 | case VPInstruction::ExtractPenultimateElement: |
| 466 | case VPInstruction::FirstActiveLane: |
| 467 | case VPInstruction::Not: |
| 468 | return 1; |
| 469 | case Instruction::ICmp: |
| 470 | case Instruction::FCmp: |
| 471 | case Instruction::Store: |
| 472 | case VPInstruction::ActiveLaneMask: |
| 473 | case VPInstruction::BranchOnCount: |
| 474 | case VPInstruction::ComputeReductionResult: |
| 475 | case VPInstruction::FirstOrderRecurrenceSplice: |
| 476 | case VPInstruction::LogicalAnd: |
| 477 | case VPInstruction::PtrAdd: |
| 478 | case VPInstruction::WideIVStep: |
| 479 | return 2; |
| 480 | case Instruction::Select: |
| 481 | case VPInstruction::ComputeAnyOfResult: |
| 482 | case VPInstruction::ReductionStartVector: |
| 483 | return 3; |
| 484 | case VPInstruction::ComputeFindIVResult: |
| 485 | return 4; |
| 486 | case Instruction::Call: |
| 487 | case Instruction::GetElementPtr: |
| 488 | case Instruction::PHI: |
| 489 | case Instruction::Switch: |
| 490 | // Cannot determine the number of operands from the opcode. |
| 491 | return -1u; |
| 492 | } |
| 493 | llvm_unreachable("all cases should be handled above" ); |
| 494 | } |
| 495 | #endif |
| 496 | |
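|  | // A VPInstruction generates per-lane scalars only for PtrAdds used beyond |
|  | // the first lane; most other opcodes can produce a single scalar when just |
|  | // the first lane is demanded. |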
| 497 | bool VPInstruction::doesGeneratePerAllLanes() const { |
| 498 | return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this); |
| 499 | } |
| 500 | |
| 501 | bool VPInstruction::canGenerateScalarForFirstLane() const { |
| 502 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 503 | return true; |
| 504 | if (isSingleScalar() || isVectorToScalar()) |
| 505 | return true; |
| 506 | switch (Opcode) { |
| 507 | case Instruction::Freeze: |
| 508 | case Instruction::ICmp: |
| 509 | case Instruction::PHI: |
| 510 | case Instruction::Select: |
| 511 | case VPInstruction::BranchOnCond: |
| 512 | case VPInstruction::BranchOnCount: |
| 513 | case VPInstruction::CalculateTripCountMinusVF: |
| 514 | case VPInstruction::CanonicalIVIncrementForPart: |
| 515 | case VPInstruction::PtrAdd: |
| 516 | case VPInstruction::ExplicitVectorLength: |
| 517 | case VPInstruction::AnyOf: |
| 518 | return true; |
| 519 | default: |
| 520 | return false; |
| 521 | } |
| 522 | } |
| 523 | |
| 524 | Value *VPInstruction::generatePerLane(VPTransformState &State, |
| 525 | const VPLane &Lane) { |
| 526 | IRBuilderBase &Builder = State.Builder; |
| 527 | |
| 528 | assert(getOpcode() == VPInstruction::PtrAdd && |
| 529 | "only PtrAdd opcodes are supported for now" ); |
| 530 | return Builder.CreatePtrAdd(Ptr: State.get(Def: getOperand(N: 0), Lane), |
| 531 | Offset: State.get(Def: getOperand(N: 1), Lane), Name); |
| 532 | } |
| 533 | |
| 534 | /// Create a conditional branch using \p Cond branching to the successors of \p |
| 535 | /// VPBB. Note that the first successor is always forward (i.e. not created yet) |
| 536 | /// while the second successor may already have been created (if it is a header |
| 537 | /// block and VPBB is a latch). |
| 538 | static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB, |
| 539 | VPTransformState &State) { |
| 540 | // Replace the temporary unreachable terminator with a new conditional |
| 541 | // branch, hooking it up to backward destination (header) for latch blocks |
| 542 | // now, and to forward destination(s) later when they are created. |
| 543 | // Second successor may be backwards - iff it is already in VPBB2IRBB. |
| 544 | VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(Val: VPBB->getSuccessors()[1]); |
| 545 | BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc); |
| 546 | BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB]; |
| 547 | BranchInst *CondBr = State.Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc); |
| 548 | // The first successor is always forward; reset it to nullptr for now. |
| 549 | CondBr->setSuccessor(idx: 0, NewSucc: nullptr); |
| 550 | IRBB->getTerminator()->eraseFromParent(); |
| 551 | return CondBr; |
| 552 | } |
| 553 | |
| 554 | Value *VPInstruction::generate(VPTransformState &State) { |
| 555 | IRBuilderBase &Builder = State.Builder; |
| 556 | |
| 557 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
| 558 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 559 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 560 | Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 561 | auto *Res = |
| 562 | Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name); |
| 563 | if (auto *I = dyn_cast<Instruction>(Val: Res)) |
| 564 | applyFlags(I&: *I); |
| 565 | return Res; |
| 566 | } |
| 567 | |
| 568 | switch (getOpcode()) { |
| 569 | case VPInstruction::Not: { |
| 570 | Value *A = State.get(Def: getOperand(N: 0)); |
| 571 | return Builder.CreateNot(V: A, Name); |
| 572 | } |
| 573 | case Instruction::ExtractElement: { |
| 574 | assert(State.VF.isVector() && "Only extract elements from vectors" ); |
| 575 | if (getOperand(N: 1)->isLiveIn()) { |
| 576 | unsigned IdxToExtract = |
| 577 | cast<ConstantInt>(Val: getOperand(N: 1)->getLiveInIRValue())->getZExtValue(); |
| 578 | return State.get(Def: getOperand(N: 0), Lane: VPLane(IdxToExtract)); |
| 579 | } |
| 580 | Value *Vec = State.get(Def: getOperand(N: 0)); |
| 581 | Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true); |
| 582 | return Builder.CreateExtractElement(Vec, Idx, Name); |
| 583 | } |
| 584 | case Instruction::Freeze: { |
| 585 | Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this)); |
| 586 | return Builder.CreateFreeze(V: Op, Name); |
| 587 | } |
| 588 | case Instruction::ICmp: { |
| 589 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 590 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 591 | Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 592 | return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name); |
| 593 | } |
| 594 | case Instruction::PHI: { |
| 595 | llvm_unreachable("should be handled by VPPhi::execute" ); |
| 596 | } |
| 597 | case Instruction::Select: { |
| 598 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 599 | Value *Cond = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 600 | Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 601 | Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed); |
| 602 | return Builder.CreateSelect(C: Cond, True: Op1, False: Op2, Name); |
| 603 | } |
| 604 | case VPInstruction::ActiveLaneMask: { |
| 605 | // Get first lane of vector induction variable. |
| 606 | Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 607 | // Get the original loop tripcount. |
| 608 | Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 609 | |
| 610 | // If this part of the active lane mask is scalar, generate the CMP directly |
| 611 | // to avoid unnecessary extracts. |
| 612 | if (State.VF.isScalar()) |
| 613 | return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC, |
| 614 | Name); |
| 615 | |
| 616 | auto *Int1Ty = Type::getInt1Ty(C&: Builder.getContext()); |
| 617 | auto *PredTy = VectorType::get(ElementType: Int1Ty, EC: State.VF); |
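|  | // e.g. for VF=4 and an i64 trip count this emits roughly: |
|  | //   %alm = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv, i64 %tc) |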
| 618 | return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask, |
| 619 | Types: {PredTy, ScalarTC->getType()}, |
| 620 | Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name); |
| 621 | } |
| 622 | case VPInstruction::FirstOrderRecurrenceSplice: { |
| 623 | // Generate code to combine the previous and current values in vector v3. |
| 624 | // |
| 625 | // vector.ph: |
| 626 | // v_init = vector(..., ..., ..., a[-1]) |
| 627 | // br vector.body |
| 628 | // |
| 629 | // vector.body |
| 630 | // i = phi [0, vector.ph], [i+4, vector.body] |
| 631 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
| 632 | // v2 = a[i, i+1, i+2, i+3]; |
| 633 | // v3 = vector(v1(3), v2(0, 1, 2)) |
| 634 | |
| 635 | auto *V1 = State.get(Def: getOperand(N: 0)); |
| 636 | if (!V1->getType()->isVectorTy()) |
| 637 | return V1; |
| 638 | Value *V2 = State.get(Def: getOperand(N: 1)); |
| 639 | return Builder.CreateVectorSplice(V1, V2, Imm: -1, Name); |
| 640 | } |
| 641 | case VPInstruction::CalculateTripCountMinusVF: { |
| 642 | unsigned UF = getParent()->getPlan()->getUF(); |
| 643 | Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 644 | Value *Step = createStepForVF(B&: Builder, Ty: ScalarTC->getType(), VF: State.VF, Step: UF); |
| 645 | Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: Step); |
| 646 | Value *Cmp = Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: Step); |
| 647 | Value *Zero = ConstantInt::get(Ty: ScalarTC->getType(), V: 0); |
| 648 | return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero); |
| 649 | } |
| 650 | case VPInstruction::ExplicitVectorLength: { |
| 651 | // TODO: Restructure this code with an explicit remainder loop, so that the |
| 652 | // vsetvli can be moved outside of the main loop. |
| 653 | Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
| 654 | // Compute EVL |
| 655 | assert(AVL->getType()->isIntegerTy() && |
| 656 | "Requested vector length should be an integer." ); |
| 657 | |
| 658 | assert(State.VF.isScalable() && "Expected scalable vector factor." ); |
| 659 | Value *VFArg = State.Builder.getInt32(C: State.VF.getKnownMinValue()); |
| 660 | |
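|  | // e.g. for a scalable VF of 4 this emits roughly: |
|  | //   %evl = call i32 @llvm.experimental.get.vector.length.i64( |
|  | //       i64 %avl, i32 4, i1 true) |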
| 661 | Value *EVL = State.Builder.CreateIntrinsic( |
| 662 | RetTy: State.Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length, |
| 663 | Args: {AVL, VFArg, State.Builder.getTrue()}); |
| 664 | return EVL; |
| 665 | } |
| 666 | case VPInstruction::CanonicalIVIncrementForPart: { |
| 667 | unsigned Part = getUnrollPart(U&: *this); |
| 668 | auto *IV = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 669 | assert(Part != 0 && "Must have a positive part" ); |
| 670 | // The canonical IV is incremented by the vectorization factor (num of |
| 671 | // SIMD elements) times the unroll part. |
| 672 | Value *Step = createStepForVF(B&: Builder, Ty: IV->getType(), VF: State.VF, Step: Part); |
| 673 | return Builder.CreateAdd(LHS: IV, RHS: Step, Name, HasNUW: hasNoUnsignedWrap(), |
| 674 | HasNSW: hasNoSignedWrap()); |
| 675 | } |
| 676 | case VPInstruction::BranchOnCond: { |
| 677 | Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 678 | auto *Br = createCondBranch(Cond, VPBB: getParent(), State); |
| 679 | applyMetadata(I&: *Br); |
| 680 | return Br; |
| 681 | } |
| 682 | case VPInstruction::BranchOnCount: { |
| 683 | // First create the compare. |
| 684 | Value *IV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
| 685 | Value *TC = State.get(Def: getOperand(N: 1), /*IsScalar*/ true); |
| 686 | Value *Cond = Builder.CreateICmpEQ(LHS: IV, RHS: TC); |
| 687 | return createCondBranch(Cond, VPBB: getParent(), State); |
| 688 | } |
| 689 | case VPInstruction::Broadcast: { |
| 690 | return Builder.CreateVectorSplat( |
| 691 | EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast" ); |
| 692 | } |
| 693 | case VPInstruction::BuildStructVector: { |
| 694 | // For struct types, we need to build a new 'wide' struct type, where each |
| 695 | // element is widened, i.e., we create a struct of vectors. |
| 696 | auto *StructTy = |
| 697 | cast<StructType>(Val: State.TypeAnalysis.inferScalarType(V: getOperand(N: 0))); |
| 698 | Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF)); |
| 699 | for (const auto &[LaneIndex, Op] : enumerate(First: operands())) { |
| 700 | for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); |
| 701 | FieldIndex++) { |
| 702 | Value *ScalarValue = |
| 703 | Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex); |
| 704 | Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex); |
| 705 | VectorValue = |
| 706 | Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex); |
| 707 | Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex); |
| 708 | } |
| 709 | } |
| 710 | return Res; |
| 711 | } |
| 712 | case VPInstruction::BuildVector: { |
| 713 | auto *ScalarTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)); |
| 714 | auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands()); |
| 715 | Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements)); |
| 716 | for (const auto &[Idx, Op] : enumerate(First: operands())) |
| 717 | Res = State.Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true), |
| 718 | Idx: State.Builder.getInt32(C: Idx)); |
| 719 | return Res; |
| 720 | } |
| 721 | case VPInstruction::ReductionStartVector: { |
| 722 | if (State.VF.isScalar()) |
| 723 | return State.get(Def: getOperand(N: 0), IsScalar: true); |
| 724 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
| 725 | Builder.setFastMathFlags(getFastMathFlags()); |
| 726 | // If this start vector is scaled then it should produce a vector with fewer |
| 727 | // elements than the VF. |
| 728 | ElementCount VF = State.VF.divideCoefficientBy( |
| 729 | RHS: cast<ConstantInt>(Val: getOperand(N: 2)->getLiveInIRValue())->getZExtValue()); |
| 730 | auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true)); |
| 731 | Constant *Zero = Builder.getInt32(C: 0); |
| 732 | return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true), |
| 733 | Idx: Zero); |
| 734 | } |
| 735 | case VPInstruction::ComputeAnyOfResult: { |
| 736 | // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
| 737 | // and will be removed by breaking up the recipe further. |
| 738 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0)); |
| 739 | auto *OrigPhi = cast<PHINode>(Val: PhiR->getUnderlyingValue()); |
| 740 | Value *ReducedPartRdx = State.get(Def: getOperand(N: 2)); |
| 741 | for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx) |
| 742 | ReducedPartRdx = Builder.CreateBinOp( |
| 743 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode( |
| 744 | Kind: RecurKind::AnyOf), |
| 745 | LHS: State.get(Def: getOperand(N: Idx)), RHS: ReducedPartRdx, Name: "bin.rdx" ); |
| 746 | return createAnyOfReduction(B&: Builder, Src: ReducedPartRdx, |
| 747 | InitVal: State.get(Def: getOperand(N: 1), Lane: VPLane(0)), OrigPhi); |
| 748 | } |
| 749 | case VPInstruction::ComputeFindIVResult: { |
| 750 | // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
| 751 | // and will be removed by breaking up the recipe further. |
| 752 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0)); |
| 753 | // Get its reduction variable descriptor. |
| 754 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
| 755 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
| 756 | assert(RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && |
| 757 | "Unexpected reduction kind" ); |
| 758 | assert(!PhiR->isInLoop() && |
| 759 | "In-loop FindLastIV reduction is not supported yet" ); |
| 760 | |
| 761 | // The recipe's operands are the reduction phi, the start value, the |
| 762 | // sentinel value, followed by one operand for each part of the reduction. |
| 763 | unsigned UF = getNumOperands() - 3; |
| 764 | Value *ReducedPartRdx = State.get(Def: getOperand(N: 3)); |
| 765 | RecurKind MinMaxKind; |
| 766 | bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(Kind: RK); |
| 767 | if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind: RK)) { |
| 768 | MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax; |
| 769 | } else { |
| 770 | assert(RecurrenceDescriptor::isFindFirstIVRecurrenceKind(RK) && |
| 771 | "Kind must either be FindLastIV or FindFirstIV" ); |
| 772 | assert(IsSigned && "Only FindFirstIV with SMax is currently supported" ); |
| 773 | MinMaxKind = RecurKind::SMin; |
| 774 | } |
| 775 | for (unsigned Part = 1; Part < UF; ++Part) |
| 776 | ReducedPartRdx = createMinMaxOp(Builder, RK: MinMaxKind, Left: ReducedPartRdx, |
| 777 | Right: State.get(Def: getOperand(N: 3 + Part))); |
| 778 | |
| 779 | Value *Start = State.get(Def: getOperand(N: 1), IsScalar: true); |
| 780 | Value *Sentinel = getOperand(N: 2)->getLiveInIRValue(); |
| 781 | return createFindLastIVReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK, Start, |
| 782 | Sentinel); |
| 783 | } |
| 784 | case VPInstruction::ComputeReductionResult: { |
| 785 | // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
| 786 | // and will be removed by breaking up the recipe further. |
| 787 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0)); |
| 788 | // Get its reduction variable descriptor. |
| 789 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
| 790 | |
| 791 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
| 792 | assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && |
| 793 | "should be handled by ComputeFindIVResult" ); |
| 794 | |
| 795 | // The recipe's operands are the reduction phi, followed by one operand for |
| 796 | // each part of the reduction. |
| 797 | unsigned UF = getNumOperands() - 1; |
| 798 | VectorParts RdxParts(UF); |
| 799 | for (unsigned Part = 0; Part < UF; ++Part) |
| 800 | RdxParts[Part] = State.get(Def: getOperand(N: 1 + Part), IsScalar: PhiR->isInLoop()); |
| 801 | |
| 802 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
| 803 | if (hasFastMathFlags()) |
| 804 | Builder.setFastMathFlags(getFastMathFlags()); |
| 805 | |
| 806 | // Reduce all of the unrolled parts into a single vector. |
| 807 | Value *ReducedPartRdx = RdxParts[0]; |
| 808 | if (PhiR->isOrdered()) { |
| 809 | ReducedPartRdx = RdxParts[UF - 1]; |
| 810 | } else { |
| 811 | // Floating-point operations should have some FMF to enable the reduction. |
| 812 | for (unsigned Part = 1; Part < UF; ++Part) { |
| 813 | Value *RdxPart = RdxParts[Part]; |
| 814 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) |
| 815 | ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart); |
| 816 | else |
| 817 | ReducedPartRdx = |
| 818 | Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxDesc.getOpcode(), |
| 819 | LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx" ); |
| 820 | } |
| 821 | } |
| 822 | |
| 823 | // Create the reduction after the loop. Note that in-loop reductions create |
| 824 | // the target reduction in the loop using a Reduction recipe. |
| 825 | if (State.VF.isVector() && !PhiR->isInLoop()) { |
| 826 | // TODO: Support in-order reductions based on the recurrence descriptor. |
| 827 | // All ops in the reduction inherit fast-math-flags from the recurrence |
| 828 | // descriptor. |
| 829 | ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK); |
| 830 | } |
| 831 | |
| 832 | return ReducedPartRdx; |
| 833 | } |
| 834 | case VPInstruction::ExtractLastElement: |
| 835 | case VPInstruction::ExtractPenultimateElement: { |
| 836 | unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2; |
| 837 | Value *Res; |
| 838 | if (State.VF.isVector()) { |
| 839 | assert(Offset <= State.VF.getKnownMinValue() && |
| 840 | "invalid offset to extract from" ); |
| 841 | // Extract lane VF - Offset from the operand. |
| 842 | Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset)); |
| 843 | } else { |
| 844 | assert(Offset <= 1 && "invalid offset to extract from" ); |
| 845 | Res = State.get(Def: getOperand(N: 0)); |
| 846 | } |
| 847 | if (isa<ExtractElementInst>(Val: Res)) |
| 848 | Res->setName(Name); |
| 849 | return Res; |
| 850 | } |
| 851 | case VPInstruction::LogicalAnd: { |
| 852 | Value *A = State.get(Def: getOperand(N: 0)); |
| 853 | Value *B = State.get(Def: getOperand(N: 1)); |
| 854 | return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name); |
| 855 | } |
| 856 | case VPInstruction::PtrAdd: { |
| 857 | assert(vputils::onlyFirstLaneUsed(this) && |
| 858 | "can only generate first lane for PtrAdd" ); |
| 859 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 860 | Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 861 | return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags()); |
| 862 | } |
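|  | // OR all mask operands together and reduce the result to a single i1 that |
|  | // is true if any lane of any operand is set. |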
| 863 | case VPInstruction::AnyOf: { |
| 864 | Value *Res = State.get(Def: getOperand(N: 0)); |
| 865 | for (VPValue *Op : drop_begin(RangeOrContainer: operands())) |
| 866 | Res = Builder.CreateOr(LHS: Res, RHS: State.get(Def: Op)); |
| 867 | return Builder.CreateOrReduce(Src: Res); |
| 868 | } |
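|  | // Compute the index of the first active lane; for a single mask operand |
|  | // this lowers to a count-trailing-zero-elements intrinsic, roughly: |
|  | //   %first = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %m, i1 true) |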
| 869 | case VPInstruction::FirstActiveLane: { |
| 870 | if (getNumOperands() == 1) { |
| 871 | Value *Mask = State.get(Def: getOperand(N: 0)); |
| 872 | return Builder.CreateCountTrailingZeroElems(ResTy: Builder.getInt64Ty(), Mask, |
| 873 | ZeroIsPoison: true, Name); |
| 874 | } |
| 875 | // If there are multiple operands, create a chain of selects to pick the |
| 876 | // first operand with an active lane and add the number of lanes of the |
| 877 | // preceding operands. |
| 878 | Value *RuntimeVF = |
| 879 | getRuntimeVF(B&: State.Builder, Ty: State.Builder.getInt64Ty(), VF: State.VF); |
| 880 | unsigned LastOpIdx = getNumOperands() - 1; |
| 881 | Value *Res = nullptr; |
| 882 | for (int Idx = LastOpIdx; Idx >= 0; --Idx) { |
| 883 | Value *TrailingZeros = Builder.CreateCountTrailingZeroElems( |
| 884 | ResTy: Builder.getInt64Ty(), Mask: State.get(Def: getOperand(N: Idx)), ZeroIsPoison: true, Name); |
| 885 | Value *Current = Builder.CreateAdd( |
| 886 | LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: Builder.getInt64(C: Idx)), RHS: TrailingZeros); |
| 887 | if (Res) { |
| 888 | Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF); |
| 889 | Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res); |
| 890 | } else { |
| 891 | Res = Current; |
| 892 | } |
| 893 | } |
| 894 | |
| 895 | return Res; |
| 896 | } |
| 897 | default: |
| 898 | llvm_unreachable("Unsupported opcode for instruction" ); |
| 899 | } |
| 900 | } |
| 901 | |
| 902 | InstructionCost VPInstruction::computeCost(ElementCount VF, |
| 903 | VPCostContext &Ctx) const { |
| 904 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
| 905 | Type *ResTy = Ctx.Types.inferScalarType(V: this); |
| 906 | if (!vputils::onlyFirstLaneUsed(Def: this)) |
| 907 | ResTy = toVectorTy(Scalar: ResTy, EC: VF); |
| 908 | |
| 909 | if (!getUnderlyingValue()) { |
| 910 | switch (getOpcode()) { |
| 911 | case Instruction::FMul: |
| 912 | return Ctx.TTI.getArithmeticInstrCost(Opcode: getOpcode(), Ty: ResTy, CostKind: Ctx.CostKind); |
| 913 | default: |
| 914 | // TODO: Compute cost for VPInstructions without underlying values once |
| 915 | // the legacy cost model has been retired. |
| 916 | return 0; |
| 917 | } |
| 918 | } |
| 919 | |
| 920 | assert(!doesGeneratePerAllLanes() && |
| 921 | "Should only generate a vector value or single scalar, not scalars " |
| 922 | "for all lanes." ); |
| 923 | return Ctx.TTI.getArithmeticInstrCost(Opcode: getOpcode(), Ty: ResTy, CostKind: Ctx.CostKind); |
| 924 | } |
| 925 | |
| 926 | switch (getOpcode()) { |
| 927 | case Instruction::ExtractElement: { |
| 928 | // Add on the cost of extracting the element. |
| 929 | auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
| 930 | return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, |
| 931 | CostKind: Ctx.CostKind); |
| 932 | } |
| 933 | case VPInstruction::AnyOf: { |
| 934 | auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 935 | return Ctx.TTI.getArithmeticReductionCost( |
| 936 | Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind); |
| 937 | } |
| 938 | case VPInstruction::FirstActiveLane: { |
| 939 | // Calculate the cost of determining the lane index. |
| 940 | auto *PredTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
| 941 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, |
| 942 | Type::getInt64Ty(C&: Ctx.LLVMCtx), |
| 943 | {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)}); |
| 944 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 945 | } |
| 946 | case VPInstruction::FirstOrderRecurrenceSplice: { |
| 947 | assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?" ); |
| 948 | SmallVector<int> Mask(VF.getKnownMinValue()); |
| 949 | std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1); |
| 950 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 951 | |
| 952 | return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice, |
| 953 | DstTy: cast<VectorType>(Val: VectorTy), |
| 954 | SrcTy: cast<VectorType>(Val: VectorTy), Mask, |
| 955 | CostKind: Ctx.CostKind, Index: VF.getKnownMinValue() - 1); |
| 956 | } |
| 957 | case VPInstruction::ActiveLaneMask: { |
| 958 | Type *ArgTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 959 | Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF); |
| 960 | IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, |
| 961 | {ArgTy, ArgTy}); |
| 962 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 963 | } |
| 964 | case VPInstruction::ExplicitVectorLength: { |
| 965 | Type *Arg0Ty = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 966 | Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx); |
| 967 | Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx); |
| 968 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length, |
| 969 | I32Ty, {Arg0Ty, I32Ty, I1Ty}); |
| 970 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 971 | } |
| 972 | case VPInstruction::ExtractPenultimateElement: |
| 973 | if (VF == ElementCount::getScalable(MinVal: 1)) |
| 974 | return InstructionCost::getInvalid(); |
| 975 | LLVM_FALLTHROUGH; |
| 976 | default: |
| 977 | // TODO: Compute cost other VPInstructions once the legacy cost model has |
| 978 | // been retired. |
| 979 | assert(!getUnderlyingValue() && |
| 980 | "unexpected VPInstruction witht underlying value" ); |
| 981 | return 0; |
| 982 | } |
| 983 | } |
| 984 | |
| 985 | bool VPInstruction::isVectorToScalar() const { |
| 986 | return getOpcode() == VPInstruction::ExtractLastElement || |
| 987 | getOpcode() == VPInstruction::ExtractPenultimateElement || |
| 988 | getOpcode() == Instruction::ExtractElement || |
| 989 | getOpcode() == VPInstruction::FirstActiveLane || |
| 990 | getOpcode() == VPInstruction::ComputeAnyOfResult || |
| 991 | getOpcode() == VPInstruction::ComputeFindIVResult || |
| 992 | getOpcode() == VPInstruction::ComputeReductionResult || |
| 993 | getOpcode() == VPInstruction::AnyOf; |
| 994 | } |
| 995 | |
| 996 | bool VPInstruction::isSingleScalar() const { |
| 997 | return getOpcode() == Instruction::PHI || isScalarCast(); |
| 998 | } |
| 999 | |
| 1000 | void VPInstruction::execute(VPTransformState &State) { |
| 1001 | assert(!State.Lane && "VPInstruction executing a Lane" ); |
| 1002 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
| 1003 | assert(flagsValidForOpcode(getOpcode()) && |
| 1004 | "Set flags not supported for the provided opcode" ); |
| 1005 | if (hasFastMathFlags()) |
| 1006 | State.Builder.setFastMathFlags(getFastMathFlags()); |
| 1007 | bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && |
| 1008 | (vputils::onlyFirstLaneUsed(Def: this) || |
| 1009 | isVectorToScalar() || isSingleScalar()); |
| 1010 | bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); |
| 1011 | if (GeneratesPerAllLanes) { |
| 1012 | for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue(); |
| 1013 | Lane != NumLanes; ++Lane) { |
| 1014 | Value *GeneratedValue = generatePerLane(State, Lane: VPLane(Lane)); |
| 1015 | assert(GeneratedValue && "generatePerLane must produce a value" ); |
| 1016 | State.set(Def: this, V: GeneratedValue, Lane: VPLane(Lane)); |
| 1017 | } |
| 1018 | return; |
| 1019 | } |
| 1020 | |
| 1021 | Value *GeneratedValue = generate(State); |
| 1022 | if (!hasResult()) |
| 1023 | return; |
| 1024 | assert(GeneratedValue && "generate must produce a value" ); |
| 1025 | assert((((GeneratedValue->getType()->isVectorTy() || |
| 1026 | GeneratedValue->getType()->isStructTy()) == |
| 1027 | !GeneratesPerFirstLaneOnly) || |
| 1028 | State.VF.isScalar()) && |
| 1029 | "scalar value but not only first lane defined" ); |
| 1030 | State.set(Def: this, V: GeneratedValue, |
| 1031 | /*IsScalar*/ GeneratesPerFirstLaneOnly); |
| 1032 | } |
| 1033 | |
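|  | // Opcodes not explicitly listed below are conservatively assumed to read or |
|  | // write memory. |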
| 1034 | bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { |
| 1035 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 1036 | return false; |
| 1037 | switch (getOpcode()) { |
| 1038 | case Instruction::ExtractElement: |
| 1039 | case Instruction::Freeze: |
| 1040 | case Instruction::ICmp: |
| 1041 | case Instruction::Select: |
| 1042 | case VPInstruction::AnyOf: |
| 1043 | case VPInstruction::BuildStructVector: |
| 1044 | case VPInstruction::BuildVector: |
| 1045 | case VPInstruction::CalculateTripCountMinusVF: |
| 1046 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1047 | case VPInstruction::ExtractLastElement: |
| 1048 | case VPInstruction::ExtractPenultimateElement: |
| 1049 | case VPInstruction::FirstActiveLane: |
| 1050 | case VPInstruction::FirstOrderRecurrenceSplice: |
| 1051 | case VPInstruction::LogicalAnd: |
| 1052 | case VPInstruction::Not: |
| 1053 | case VPInstruction::PtrAdd: |
| 1054 | case VPInstruction::WideIVStep: |
| 1055 | case VPInstruction::StepVector: |
| 1056 | case VPInstruction::ReductionStartVector: |
| 1057 | return false; |
| 1058 | default: |
| 1059 | return true; |
| 1060 | } |
| 1061 | } |
| 1062 | |
| 1063 | bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { |
| 1064 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1065 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 1066 | return vputils::onlyFirstLaneUsed(Def: this); |
| 1067 | |
| 1068 | switch (getOpcode()) { |
| 1069 | default: |
| 1070 | return false; |
| 1071 | case Instruction::ExtractElement: |
| 1072 | return Op == getOperand(N: 1); |
| 1073 | case Instruction::PHI: |
| 1074 | return true; |
| 1075 | case Instruction::ICmp: |
| 1076 | case Instruction::Select: |
| 1077 | case Instruction::Or: |
| 1078 | case Instruction::Freeze: |
| 1079 | // TODO: Cover additional opcodes. |
| 1080 | return vputils::onlyFirstLaneUsed(Def: this); |
| 1081 | case VPInstruction::ActiveLaneMask: |
| 1082 | case VPInstruction::ExplicitVectorLength: |
| 1083 | case VPInstruction::CalculateTripCountMinusVF: |
| 1084 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1085 | case VPInstruction::BranchOnCount: |
| 1086 | case VPInstruction::BranchOnCond: |
| 1087 | case VPInstruction::Broadcast: |
| 1088 | case VPInstruction::ReductionStartVector: |
| 1089 | return true; |
| 1090 | case VPInstruction::PtrAdd: |
| 1091 | return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this); |
| 1092 | case VPInstruction::ComputeAnyOfResult: |
| 1093 | case VPInstruction::ComputeFindIVResult: |
| 1094 | return Op == getOperand(N: 1); |
| 1095 | }; |
| 1096 | llvm_unreachable("switch should return" ); |
| 1097 | } |
| 1098 | |
| 1099 | bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { |
| 1100 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1101 | if (Instruction::isBinaryOp(Opcode: getOpcode())) |
| 1102 | return vputils::onlyFirstPartUsed(Def: this); |
| 1103 | |
| 1104 | switch (getOpcode()) { |
| 1105 | default: |
| 1106 | return false; |
| 1107 | case Instruction::ICmp: |
| 1108 | case Instruction::Select: |
| 1109 | return vputils::onlyFirstPartUsed(Def: this); |
| 1110 | case VPInstruction::BranchOnCount: |
| 1111 | case VPInstruction::BranchOnCond: |
| 1112 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1113 | return true; |
| 1114 | }; |
| 1115 | llvm_unreachable("switch should return" ); |
| 1116 | } |
| 1117 | |
| 1118 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1119 | void VPInstruction::dump() const { |
| 1120 | VPSlotTracker SlotTracker(getParent()->getPlan()); |
| 1121 | print(dbgs(), "" , SlotTracker); |
| 1122 | } |
| 1123 | |
| 1124 | void VPInstruction::print(raw_ostream &O, const Twine &Indent, |
| 1125 | VPSlotTracker &SlotTracker) const { |
| 1126 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
| 1127 | |
| 1128 | if (hasResult()) { |
| 1129 | printAsOperand(O, SlotTracker); |
| 1130 | O << " = " ; |
| 1131 | } |
| 1132 | |
| 1133 | switch (getOpcode()) { |
| 1134 | case VPInstruction::Not: |
| 1135 | O << "not" ; |
| 1136 | break; |
| 1137 | case VPInstruction::SLPLoad: |
| 1138 | O << "combined load" ; |
| 1139 | break; |
| 1140 | case VPInstruction::SLPStore: |
| 1141 | O << "combined store" ; |
| 1142 | break; |
| 1143 | case VPInstruction::ActiveLaneMask: |
| 1144 | O << "active lane mask" ; |
| 1145 | break; |
| 1146 | case VPInstruction::ExplicitVectorLength: |
| 1147 | O << "EXPLICIT-VECTOR-LENGTH" ; |
| 1148 | break; |
| 1149 | case VPInstruction::FirstOrderRecurrenceSplice: |
| 1150 | O << "first-order splice" ; |
| 1151 | break; |
| 1152 | case VPInstruction::BranchOnCond: |
| 1153 | O << "branch-on-cond" ; |
| 1154 | break; |
| 1155 | case VPInstruction::CalculateTripCountMinusVF: |
| 1156 | O << "TC > VF ? TC - VF : 0" ; |
| 1157 | break; |
| 1158 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1159 | O << "VF * Part +" ; |
| 1160 | break; |
| 1161 | case VPInstruction::BranchOnCount: |
| 1162 | O << "branch-on-count" ; |
| 1163 | break; |
| 1164 | case VPInstruction::Broadcast: |
| 1165 | O << "broadcast" ; |
| 1166 | break; |
| 1167 | case VPInstruction::BuildStructVector: |
| 1168 | O << "buildstructvector" ; |
| 1169 | break; |
| 1170 | case VPInstruction::BuildVector: |
| 1171 | O << "buildvector" ; |
| 1172 | break; |
| 1173 | case VPInstruction::ExtractLastElement: |
| 1174 | O << "extract-last-element" ; |
| 1175 | break; |
| 1176 | case VPInstruction::ExtractPenultimateElement: |
| 1177 | O << "extract-penultimate-element" ; |
| 1178 | break; |
| 1179 | case VPInstruction::ComputeAnyOfResult: |
| 1180 | O << "compute-anyof-result" ; |
| 1181 | break; |
| 1182 | case VPInstruction::ComputeFindIVResult: |
| 1183 | O << "compute-find-iv-result" ; |
| 1184 | break; |
| 1185 | case VPInstruction::ComputeReductionResult: |
| 1186 | O << "compute-reduction-result" ; |
| 1187 | break; |
| 1188 | case VPInstruction::LogicalAnd: |
| 1189 | O << "logical-and" ; |
| 1190 | break; |
| 1191 | case VPInstruction::PtrAdd: |
| 1192 | O << "ptradd" ; |
| 1193 | break; |
| 1194 | case VPInstruction::AnyOf: |
| 1195 | O << "any-of" ; |
| 1196 | break; |
| 1197 | case VPInstruction::FirstActiveLane: |
| 1198 | O << "first-active-lane" ; |
| 1199 | break; |
| 1200 | case VPInstruction::ReductionStartVector: |
| 1201 | O << "reduction-start-vector" ; |
| 1202 | break; |
| 1203 | default: |
| 1204 | O << Instruction::getOpcodeName(getOpcode()); |
| 1205 | } |
| 1206 | |
| 1207 | printFlags(O); |
| 1208 | printOperands(O, SlotTracker); |
| 1209 | |
| 1210 | if (auto DL = getDebugLoc()) { |
| 1211 | O << ", !dbg " ; |
| 1212 | DL.print(O); |
| 1213 | } |
| 1214 | } |
| 1215 | #endif |
| 1216 | |
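|  | // Scalar casts produce a single scalar per part; the only vector-producing |
|  | // typed opcode handled here is StepVector. |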
| 1217 | void VPInstructionWithType::execute(VPTransformState &State) { |
| 1218 | State.setDebugLocFrom(getDebugLoc()); |
| 1219 | if (isScalarCast()) { |
| 1220 | Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 1221 | Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()), |
| 1222 | V: Op, DestTy: ResultTy); |
| 1223 | State.set(Def: this, V: Cast, Lane: VPLane(0)); |
| 1224 | return; |
| 1225 | } |
| 1226 | switch (getOpcode()) { |
| 1227 | case VPInstruction::StepVector: { |
| 1228 | Value *StepVector = |
| 1229 | State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF)); |
| 1230 | State.set(Def: this, V: StepVector); |
| 1231 | break; |
| 1232 | } |
| 1233 | default: |
| 1234 | llvm_unreachable("opcode not implemented yet" ); |
| 1235 | } |
| 1236 | } |
| 1237 | |
| 1238 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1239 | void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent, |
| 1240 | VPSlotTracker &SlotTracker) const { |
| 1241 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
| 1242 | printAsOperand(O, SlotTracker); |
| 1243 | O << " = " ; |
| 1244 | |
| 1245 | switch (getOpcode()) { |
| 1246 | case VPInstruction::WideIVStep: |
| 1247 | O << "wide-iv-step " ; |
| 1248 | printOperands(O, SlotTracker); |
| 1249 | break; |
| 1250 | case VPInstruction::StepVector: |
| 1251 | O << "step-vector " << *ResultTy; |
| 1252 | break; |
| 1253 | default: |
| 1254 | assert(Instruction::isCast(getOpcode()) && "unhandled opcode" ); |
| 1255 | O << Instruction::getOpcodeName(getOpcode()) << " " ; |
| 1256 | printOperands(O, SlotTracker); |
| 1257 | O << " to " << *ResultTy; |
| 1258 | } |
| 1259 | } |
| 1260 | #endif |
| 1261 | |
| 1262 | void VPPhi::execute(VPTransformState &State) { |
| 1263 | State.setDebugLocFrom(getDebugLoc()); |
| 1264 | PHINode *NewPhi = State.Builder.CreatePHI( |
| 1265 | Ty: State.TypeAnalysis.inferScalarType(V: this), NumReservedValues: 2, Name: getName()); |
| 1266 | unsigned NumIncoming = getNumIncoming(); |
| 1267 | if (getParent() != getParent()->getPlan()->getScalarPreheader()) { |
| 1268 | // TODO: Fixup all incoming values of header phis once recipes defining them |
| 1269 | // are introduced. |
| 1270 | NumIncoming = 1; |
| 1271 | } |
| 1272 | for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) { |
| 1273 | Value *IncV = State.get(Def: getIncomingValue(Idx), Lane: VPLane(0)); |
| 1274 | BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(Val: getIncomingBlock(Idx)); |
| 1275 | NewPhi->addIncoming(V: IncV, BB: PredBB); |
| 1276 | } |
| 1277 | State.set(Def: this, V: NewPhi, Lane: VPLane(0)); |
| 1278 | } |
| 1279 | |
| 1280 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1281 | void VPPhi::print(raw_ostream &O, const Twine &Indent, |
| 1282 | VPSlotTracker &SlotTracker) const { |
| 1283 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
| 1284 | printAsOperand(O, SlotTracker); |
| 1285 | O << " = phi " ; |
| 1286 | printPhiOperands(O, SlotTracker); |
| 1287 | } |
| 1288 | #endif |
| 1289 | |
| 1290 | VPIRInstruction *VPIRInstruction::create(Instruction &I) {
| 1291 | if (auto *Phi = dyn_cast<PHINode>(Val: &I)) |
| 1292 | return new VPIRPhi(*Phi); |
| 1293 | return new VPIRInstruction(I); |
| 1294 | } |
| 1295 | |
| 1296 | void VPIRInstruction::execute(VPTransformState &State) { |
| 1297 | assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 && |
| 1298 | "PHINodes must be handled by VPIRPhi" ); |
| 1299 | // Advance the insert point after the wrapped IR instruction. This allows |
| 1300 | // interleaving VPIRInstructions and other recipes. |
| 1301 | State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator())); |
| 1302 | } |
| 1303 | |
| 1304 | InstructionCost VPIRInstruction::computeCost(ElementCount VF, |
| 1305 | VPCostContext &Ctx) const { |
| 1306 | // The recipe wraps an existing IR instruction on the border of VPlan's scope, |
| 1307 | // hence it does not contribute to the cost-modeling for the VPlan. |
| 1308 | return 0; |
| 1309 | } |
| 1310 | |
| 1311 | void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) { |
| 1312 | assert(isa<PHINode>(getInstruction()) && |
| 1313 | "can only update exiting operands to phi nodes" ); |
| 1314 | assert(getNumOperands() > 0 && "must have at least one operand" ); |
| 1315 | VPValue *Exiting = getOperand(N: 0); |
| 1316 | if (Exiting->isLiveIn()) |
| 1317 | return; |
| 1318 | |
| 1319 | Exiting = Builder.createNaryOp(Opcode: VPInstruction::ExtractLastElement, Operands: {Exiting}); |
| 1320 | setOperand(I: 0, New: Exiting); |
| 1321 | } |
| 1322 | |
| 1323 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1324 | void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, |
| 1325 | VPSlotTracker &SlotTracker) const { |
| 1326 | O << Indent << "IR " << I; |
| 1327 | } |
| 1328 | #endif |
| 1329 | |
| 1330 | void VPIRPhi::execute(VPTransformState &State) { |
| 1331 | PHINode *Phi = &getIRPhi(); |
| 1332 | for (const auto &[Idx, Op] : enumerate(First: operands())) { |
| 1333 | VPValue *ExitValue = Op; |
| 1334 | auto Lane = vputils::isSingleScalar(VPV: ExitValue) |
| 1335 | ? VPLane::getFirstLane() |
| 1336 | : VPLane::getLastLaneForVF(VF: State.VF); |
| 1337 | VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; |
| 1338 | auto *PredVPBB = Pred->getExitingBasicBlock(); |
| 1339 | BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; |
| 1340 | // Set insertion point in PredBB in case an extract needs to be generated. |
| 1341 | // TODO: Model extracts explicitly. |
| 1342 | State.Builder.SetInsertPoint(TheBB: PredBB, IP: PredBB->getFirstNonPHIIt()); |
| 1343 | Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane)); |
| 1344 | // If there is no existing block for PredBB in the phi, add a new incoming |
| 1345 | // value. Otherwise update the existing incoming value for PredBB. |
| 1346 | if (Phi->getBasicBlockIndex(BB: PredBB) == -1) |
| 1347 | Phi->addIncoming(V, BB: PredBB); |
| 1348 | else |
| 1349 | Phi->setIncomingValueForBlock(BB: PredBB, V); |
| 1350 | } |
| 1351 | |
| 1352 | // Advance the insert point after the wrapped IR instruction. This allows |
| 1353 | // interleaving VPIRInstructions and other recipes. |
| 1354 | State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator())); |
| 1355 | } |
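|      | // Sketch of the effect of VPIRPhi::execute above (names hypothetical): a
|      | // loop-varying exit value feeding the IR phi is read from the last lane of its
|      | // widened value (an extract emitted in the predecessor block), while a
|      | // single-scalar value is simply read from lane 0.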
| 1356 | |
| 1357 | void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const { |
| 1358 | VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); |
| 1359 | assert(R->getNumOperands() == R->getParent()->getNumPredecessors() && |
| 1360 | "Number of phi operands must match number of predecessors" ); |
| 1361 | unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock); |
| 1362 | R->removeOperand(Idx: Position); |
| 1363 | } |
| 1364 | |
| 1365 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1366 | void VPPhiAccessors::printPhiOperands(raw_ostream &O, |
| 1367 | VPSlotTracker &SlotTracker) const { |
| 1368 | interleaveComma(enumerate(getAsRecipe()->operands()), O, |
| 1369 | [this, &O, &SlotTracker](auto Op) { |
| 1370 | O << "[ " ; |
| 1371 | Op.value()->printAsOperand(O, SlotTracker); |
| 1372 | O << ", " ; |
| 1373 | getIncomingBlock(Op.index())->printAsOperand(O); |
| 1374 | O << " ]" ; |
| 1375 | }); |
| 1376 | } |
| 1377 | #endif |
| 1378 | |
| 1379 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1380 | void VPIRPhi::print(raw_ostream &O, const Twine &Indent, |
| 1381 | VPSlotTracker &SlotTracker) const { |
| 1382 | VPIRInstruction::print(O, Indent, SlotTracker); |
| 1383 | |
| 1384 | if (getNumOperands() != 0) { |
| 1385 | O << " (extra operand" << (getNumOperands() > 1 ? "s" : "" ) << ": " ; |
| 1386 | interleaveComma( |
| 1387 | enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) { |
| 1388 | Op.value()->printAsOperand(O, SlotTracker); |
| 1389 | O << " from " ; |
| 1390 | getParent()->getPredecessors()[Op.index()]->printAsOperand(O); |
| 1391 | }); |
| 1392 | O << ")" ; |
| 1393 | } |
| 1394 | } |
| 1395 | #endif |
| 1396 | |
| 1397 | VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer) |
| 1398 | : VPIRMetadata(I) { |
| 1399 | if (!LVer || !isa<LoadInst, StoreInst>(Val: &I)) |
| 1400 | return; |
| 1401 | const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(OrigInst: &I); |
| 1402 | if (AliasScopeMD) |
| 1403 | Metadata.emplace_back(Args: LLVMContext::MD_alias_scope, Args: AliasScopeMD); |
| 1404 | if (NoAliasMD) |
| 1405 | Metadata.emplace_back(Args: LLVMContext::MD_noalias, Args: NoAliasMD); |
| 1406 | } |
| 1407 | |
| 1408 | void VPIRMetadata::applyMetadata(Instruction &I) const { |
| 1409 | for (const auto &[Kind, Node] : Metadata) |
| 1410 | I.setMetadata(KindID: Kind, Node); |
| 1411 | } |
| 1412 | |
| 1413 | void VPWidenCallRecipe::execute(VPTransformState &State) { |
| 1414 | assert(State.VF.isVector() && "not widening" ); |
| 1415 | assert(Variant != nullptr && "Can't create vector function." ); |
| 1416 | |
| 1417 | FunctionType *VFTy = Variant->getFunctionType(); |
| 1418 | // Collect the call arguments, using a scalar value where the variant expects one.
| 1419 | SmallVector<Value *, 4> Args; |
| 1420 | for (const auto &I : enumerate(First: args())) { |
| 1421 | Value *Arg; |
| 1422 | // Some vectorized function variants may also take a scalar argument, |
| 1423 | // e.g. linear parameters for pointers. This needs to be the scalar value |
| 1424 | // from the start of the respective part when interleaving. |
| 1425 | if (!VFTy->getParamType(i: I.index())->isVectorTy()) |
| 1426 | Arg = State.get(Def: I.value(), Lane: VPLane(0)); |
| 1427 | else |
| 1428 | Arg = State.get(Def: I.value(), IsScalar: onlyFirstLaneUsed(Op: I.value())); |
| 1429 | Args.push_back(Elt: Arg); |
| 1430 | } |
| 1431 | |
| 1432 | auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue()); |
| 1433 | SmallVector<OperandBundleDef, 1> OpBundles; |
| 1434 | if (CI) |
| 1435 | CI->getOperandBundlesAsDefs(Defs&: OpBundles); |
| 1436 | |
| 1437 | CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles); |
| 1438 | applyFlags(I&: *V); |
| 1439 | applyMetadata(I&: *V); |
| 1440 | V->setCallingConv(Variant->getCallingConv()); |
| 1441 | |
| 1442 | if (!V->getType()->isVoidTy()) |
| 1443 | State.set(Def: this, V); |
| 1444 | } |
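|      | // Illustrative example for the widened call above (hypothetical names, VF=4):
|      | //   %r = call double @foo(double %x, i64 %n)
|      | // becomes, for a vector variant whose second parameter is a linear scalar,
|      | //   %wide.r = call <4 x double> @foo_v4(<4 x double> %wide.x, i64 %n)
|      | // where %n stays scalar because the variant's parameter type is not a vector.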
| 1445 | |
| 1446 | InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, |
| 1447 | VPCostContext &Ctx) const { |
| 1448 | return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(), |
| 1449 | Tys: Variant->getFunctionType()->params(), |
| 1450 | CostKind: Ctx.CostKind); |
| 1451 | } |
| 1452 | |
| 1453 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1454 | void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, |
| 1455 | VPSlotTracker &SlotTracker) const { |
| 1456 | O << Indent << "WIDEN-CALL " ; |
| 1457 | |
| 1458 | Function *CalledFn = getCalledScalarFunction(); |
| 1459 | if (CalledFn->getReturnType()->isVoidTy()) |
| 1460 | O << "void " ; |
| 1461 | else { |
| 1462 | printAsOperand(O, SlotTracker); |
| 1463 | O << " = " ; |
| 1464 | } |
| 1465 | |
| 1466 | O << "call" ; |
| 1467 | printFlags(O); |
| 1468 | O << " @" << CalledFn->getName() << "(" ; |
| 1469 | interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) { |
| 1470 | Op->printAsOperand(O, SlotTracker); |
| 1471 | }); |
| 1472 | O << ")" ; |
| 1473 | |
| 1474 | O << " (using library function" ; |
| 1475 | if (Variant->hasName()) |
| 1476 | O << ": " << Variant->getName(); |
| 1477 | O << ")" ; |
| 1478 | } |
| 1479 | #endif |
| 1480 | |
| 1481 | void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { |
| 1482 | assert(State.VF.isVector() && "not widening" ); |
| 1483 | |
| 1484 | SmallVector<Type *, 2> TysForDecl; |
| 1485 | // Add return type if intrinsic is overloaded on it. |
| 1486 | if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1, TTI: State.TTI)) |
| 1487 | TysForDecl.push_back(Elt: VectorType::get(ElementType: getResultType(), EC: State.VF)); |
| 1488 | SmallVector<Value *, 4> Args; |
| 1489 | for (const auto &I : enumerate(First: operands())) { |
| 1490 | // Some intrinsics have a scalar argument - don't replace it with a |
| 1491 | // vector. |
| 1492 | Value *Arg; |
| 1493 | if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(), |
| 1494 | TTI: State.TTI)) |
| 1495 | Arg = State.get(Def: I.value(), Lane: VPLane(0)); |
| 1496 | else |
| 1497 | Arg = State.get(Def: I.value(), IsScalar: onlyFirstLaneUsed(Op: I.value())); |
| 1498 | if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(), |
| 1499 | TTI: State.TTI)) |
| 1500 | TysForDecl.push_back(Elt: Arg->getType()); |
| 1501 | Args.push_back(Elt: Arg); |
| 1502 | } |
| 1503 | |
| 1504 | // Use vector version of the intrinsic. |
| 1505 | Module *M = State.Builder.GetInsertBlock()->getModule(); |
| 1506 | Function *VectorF = |
| 1507 | Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, Tys: TysForDecl); |
| 1508 | assert(VectorF && |
| 1509 | "Can't retrieve vector intrinsic or vector-predication intrinsics." ); |
| 1510 | |
| 1511 | auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue()); |
| 1512 | SmallVector<OperandBundleDef, 1> OpBundles; |
| 1513 | if (CI) |
| 1514 | CI->getOperandBundlesAsDefs(Defs&: OpBundles); |
| 1515 | |
| 1516 | CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles); |
| 1517 | |
| 1518 | applyFlags(I&: *V); |
| 1519 | applyMetadata(I&: *V); |
| 1520 | |
| 1521 | if (!V->getType()->isVoidTy()) |
| 1522 | State.set(Def: this, V); |
| 1523 | } |
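|      | // Illustrative example for the intrinsic widening above (assuming VF=4): a
|      | // scalar llvm.fabs.f32 call becomes
|      | //   %wide = call <4 x float> @llvm.fabs.v4f32(<4 x float> %wide.op)
|      | // with <4 x float> the only entry in TysForDecl, since the intrinsic is
|      | // overloaded on its result type.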
| 1524 | |
| 1525 | InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, |
| 1526 | VPCostContext &Ctx) const { |
| 1527 | // Some backends analyze intrinsic arguments to determine cost. Use the |
| 1528 | // underlying value for the operand if it has one. Otherwise try to use the |
| 1529 | // operand of the underlying call instruction, if there is one. Otherwise |
| 1530 | // clear Arguments. |
| 1531 | // TODO: Rework TTI interface to be independent of concrete IR values. |
| 1532 | SmallVector<const Value *> Arguments; |
| 1533 | for (const auto &[Idx, Op] : enumerate(First: operands())) { |
| 1534 | auto *V = Op->getUnderlyingValue(); |
| 1535 | if (!V) { |
| 1536 | if (auto *UI = dyn_cast_or_null<CallBase>(Val: getUnderlyingValue())) { |
| 1537 | Arguments.push_back(Elt: UI->getArgOperand(i: Idx)); |
| 1538 | continue; |
| 1539 | } |
| 1540 | Arguments.clear(); |
| 1541 | break; |
| 1542 | } |
| 1543 | Arguments.push_back(Elt: V); |
| 1544 | } |
| 1545 | |
| 1546 | Type *RetTy = toVectorizedTy(Ty: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1547 | SmallVector<Type *> ParamTys; |
| 1548 | for (unsigned I = 0; I != getNumOperands(); ++I) |
| 1549 | ParamTys.push_back( |
| 1550 | Elt: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: I)), EC: VF)); |
| 1551 | |
| 1552 | // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. |
| 1553 | FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(); |
| 1554 | IntrinsicCostAttributes CostAttrs( |
| 1555 | VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, |
| 1556 | dyn_cast_or_null<IntrinsicInst>(Val: getUnderlyingValue()), |
| 1557 | InstructionCost::getInvalid(), &Ctx.TLI); |
| 1558 | return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind); |
| 1559 | } |
| 1560 | |
| 1561 | StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { |
| 1562 | return Intrinsic::getBaseName(id: VectorIntrinsicID); |
| 1563 | } |
| 1564 | |
| 1565 | bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const { |
| 1566 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1567 | return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) { |
| 1568 | auto [Idx, V] = X; |
| 1569 | return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(), |
| 1570 | Idx, nullptr); |
| 1571 | }); |
| 1572 | } |
| 1573 | |
| 1574 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1575 | void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent, |
| 1576 | VPSlotTracker &SlotTracker) const { |
| 1577 | O << Indent << "WIDEN-INTRINSIC " ; |
| 1578 | if (ResultTy->isVoidTy()) { |
| 1579 | O << "void " ; |
| 1580 | } else { |
| 1581 | printAsOperand(O, SlotTracker); |
| 1582 | O << " = " ; |
| 1583 | } |
| 1584 | |
| 1585 | O << "call" ; |
| 1586 | printFlags(O); |
| 1587 | O << getIntrinsicName() << "(" ; |
| 1588 | |
| 1589 | interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { |
| 1590 | Op->printAsOperand(O, SlotTracker); |
| 1591 | }); |
| 1592 | O << ")" ; |
| 1593 | } |
| 1594 | #endif |
| 1595 | |
| 1596 | void VPHistogramRecipe::execute(VPTransformState &State) { |
| 1597 | IRBuilderBase &Builder = State.Builder; |
| 1598 | |
| 1599 | Value *Address = State.get(Def: getOperand(N: 0)); |
| 1600 | Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true); |
| 1601 | VectorType *VTy = cast<VectorType>(Val: Address->getType()); |
| 1602 | |
| 1603 | // The histogram intrinsic requires a mask even if the recipe doesn't; |
| 1604 | // if the mask operand was omitted then all lanes should be executed and |
| 1605 | // we just need to synthesize an all-true mask. |
| 1606 | Value *Mask = nullptr; |
| 1607 | if (VPValue *VPMask = getMask()) |
| 1608 | Mask = State.get(Def: VPMask); |
| 1609 | else |
| 1610 | Mask = |
| 1611 | Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1)); |
| 1612 | |
| 1613 | // If this is a subtract, we want to invert the increment amount. We may |
| 1614 | // add a separate intrinsic in future, but for now we'll try this. |
| 1615 | if (Opcode == Instruction::Sub) |
| 1616 | IncAmt = Builder.CreateNeg(V: IncAmt); |
| 1617 | else |
| 1618 | assert(Opcode == Instruction::Add && "only add or sub supported for now" ); |
| 1619 | |
| 1620 | State.Builder.CreateIntrinsic(ID: Intrinsic::experimental_vector_histogram_add, |
| 1621 | Types: {VTy, IncAmt->getType()}, |
| 1622 | Args: {Address, IncAmt, Mask}); |
| 1623 | } |
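|      | // Illustrative example for the lowering above (assuming VF=4, unmasked recipe,
|      | // increment 1, hypothetical names): roughly
|      | //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
|      | //       <4 x ptr> %buckets, i32 1, <4 x i1> splat (i1 true))
|      | // A subtract is expressed by negating the increment before the call.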
| 1624 | |
| 1625 | InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, |
| 1626 | VPCostContext &Ctx) const { |
| 1627 | // FIXME: Take the gather and scatter into account as well. For now we're |
| 1628 | // generating the same cost as the fallback path, but we'll likely |
| 1629 | // need to create a new TTI method for determining the cost, including |
| 1630 | // whether we can use base + vec-of-smaller-indices or just |
| 1631 | // vec-of-pointers. |
| 1632 | assert(VF.isVector() && "Invalid VF for histogram cost" ); |
| 1633 | Type *AddressTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1634 | VPValue *IncAmt = getOperand(N: 1); |
| 1635 | Type *IncTy = Ctx.Types.inferScalarType(V: IncAmt); |
| 1636 | VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF); |
| 1637 | |
| 1638 | // Assume that a non-constant update value (or a constant != 1) requires |
| 1639 | // a multiply, and add that into the cost. |
| 1640 | InstructionCost MulCost = |
| 1641 | Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind); |
| 1642 | if (IncAmt->isLiveIn()) { |
| 1643 | ConstantInt *CI = dyn_cast<ConstantInt>(Val: IncAmt->getLiveInIRValue()); |
| 1644 | |
| 1645 | if (CI && CI->getZExtValue() == 1) |
| 1646 | MulCost = TTI::TCC_Free; |
| 1647 | } |
| 1648 | |
| 1649 | // Find the cost of the histogram operation itself. |
| 1650 | Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF); |
| 1651 | Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF); |
| 1652 | IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, |
| 1653 | Type::getVoidTy(C&: Ctx.LLVMCtx), |
| 1654 | {PtrTy, IncTy, MaskTy}); |
| 1655 | |
| 1656 | // Add the costs together with the add/sub operation. |
| 1657 | return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost + |
| 1658 | Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind); |
| 1659 | } |
| 1660 | |
| 1661 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1662 | void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent, |
| 1663 | VPSlotTracker &SlotTracker) const { |
| 1664 | O << Indent << "WIDEN-HISTOGRAM buckets: " ; |
| 1665 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 1666 | |
| 1667 | if (Opcode == Instruction::Sub) |
| 1668 | O << ", dec: " ; |
| 1669 | else { |
| 1670 | assert(Opcode == Instruction::Add); |
| 1671 | O << ", inc: " ; |
| 1672 | } |
| 1673 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 1674 | |
| 1675 | if (VPValue *Mask = getMask()) { |
| 1676 | O << ", mask: " ; |
| 1677 | Mask->printAsOperand(O, SlotTracker); |
| 1678 | } |
| 1679 | } |
| 1680 | |
| 1681 | void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, |
| 1682 | VPSlotTracker &SlotTracker) const { |
| 1683 | O << Indent << "WIDEN-SELECT " ; |
| 1684 | printAsOperand(O, SlotTracker); |
| 1685 | O << " = select " ; |
| 1686 | printFlags(O); |
| 1687 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 1688 | O << ", " ; |
| 1689 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 1690 | O << ", " ; |
| 1691 | getOperand(2)->printAsOperand(O, SlotTracker); |
| 1692 | O << (isInvariantCond() ? " (condition is loop invariant)" : "" ); |
| 1693 | } |
| 1694 | #endif |
| 1695 | |
| 1696 | void VPWidenSelectRecipe::execute(VPTransformState &State) { |
| 1697 | // The condition can be loop invariant but still defined inside the |
| 1698 | // loop. This means that we can't just use the original 'cond' value. |
| 1699 | // We have to take the 'vectorized' value and pick the first lane. |
| 1700 | // Instcombine will make this a no-op. |
| 1701 | auto *InvarCond = |
| 1702 | isInvariantCond() ? State.get(Def: getCond(), Lane: VPLane(0)) : nullptr; |
| 1703 | |
| 1704 | Value *Cond = InvarCond ? InvarCond : State.get(Def: getCond()); |
| 1705 | Value *Op0 = State.get(Def: getOperand(N: 1)); |
| 1706 | Value *Op1 = State.get(Def: getOperand(N: 2)); |
| 1707 | Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1); |
| 1708 | State.set(Def: this, V: Sel); |
| 1709 | if (auto *I = dyn_cast<Instruction>(Val: Sel)) { |
| 1710 | if (isa<FPMathOperator>(Val: I)) |
| 1711 | applyFlags(I&: *I); |
| 1712 | applyMetadata(I&: *I); |
| 1713 | } |
| 1714 | } |
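|      | // Illustrative example for the select widening above (assuming VF=4): a
|      | // loop-varying condition yields
|      | //   %wide = select <4 x i1> %wide.cond, <4 x i32> %a, <4 x i32> %b
|      | // while an invariant condition stays scalar (lane 0):
|      | //   %wide = select i1 %cond, <4 x i32> %a, <4 x i32> %b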
| 1715 | |
| 1716 | InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, |
| 1717 | VPCostContext &Ctx) const { |
| 1718 | SelectInst *SI = cast<SelectInst>(Val: getUnderlyingValue()); |
| 1719 | bool ScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions(); |
| 1720 | Type *ScalarTy = Ctx.Types.inferScalarType(V: this); |
| 1721 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1722 | |
| 1723 | VPValue *Op0, *Op1; |
| 1724 | using namespace llvm::VPlanPatternMatch; |
| 1725 | if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && |
| 1726 | (match(V: this, P: m_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))) || |
| 1727 | match(V: this, P: m_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))))) { |
| 1728 | // select x, y, false --> x & y |
| 1729 | // select x, true, y --> x | y |
| 1730 | const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0); |
| 1731 | const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1); |
| 1732 | |
| 1733 | SmallVector<const Value *, 2> Operands; |
| 1734 | if (all_of(Range: operands(), |
| 1735 | P: [](VPValue *Op) { return Op->getUnderlyingValue(); })) |
| 1736 | Operands.append(in_start: SI->op_begin(), in_end: SI->op_end()); |
| 1737 | bool IsLogicalOr = match(V: this, P: m_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))); |
| 1738 | return Ctx.TTI.getArithmeticInstrCost( |
| 1739 | Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: VectorTy, |
| 1740 | CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI); |
| 1741 | } |
| 1742 | |
| 1743 | Type *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1744 | if (!ScalarCond) |
| 1745 | CondTy = VectorType::get(ElementType: CondTy, EC: VF); |
| 1746 | |
| 1747 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
| 1748 | if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition())) |
| 1749 | Pred = Cmp->getPredicate(); |
| 1750 | return Ctx.TTI.getCmpSelInstrCost( |
| 1751 | Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind, |
| 1752 | Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI); |
| 1753 | } |
| 1754 | |
| 1755 | VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { |
| 1756 | AllowReassoc = FMF.allowReassoc(); |
| 1757 | NoNaNs = FMF.noNaNs(); |
| 1758 | NoInfs = FMF.noInfs(); |
| 1759 | NoSignedZeros = FMF.noSignedZeros(); |
| 1760 | AllowReciprocal = FMF.allowReciprocal(); |
| 1761 | AllowContract = FMF.allowContract(); |
| 1762 | ApproxFunc = FMF.approxFunc(); |
| 1763 | } |
| 1764 | |
| 1765 | #if !defined(NDEBUG) |
| 1766 | bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { |
| 1767 | switch (OpType) { |
| 1768 | case OperationType::OverflowingBinOp: |
| 1769 | return Opcode == Instruction::Add || Opcode == Instruction::Sub || |
| 1770 | Opcode == Instruction::Mul || |
| 1771 | Opcode == VPInstruction::CanonicalIVIncrementForPart;
| 1772 | case OperationType::DisjointOp: |
| 1773 | return Opcode == Instruction::Or; |
| 1774 | case OperationType::PossiblyExactOp: |
| 1775 | return Opcode == Instruction::AShr; |
| 1776 | case OperationType::GEPOp: |
| 1777 | return Opcode == Instruction::GetElementPtr || |
| 1778 | Opcode == VPInstruction::PtrAdd; |
| 1779 | case OperationType::FPMathOp: |
| 1780 | return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || |
| 1781 | Opcode == Instruction::FSub || Opcode == Instruction::FNeg || |
| 1782 | Opcode == Instruction::FDiv || Opcode == Instruction::FRem || |
| 1783 | Opcode == Instruction::FCmp || Opcode == Instruction::Select || |
| 1784 | Opcode == VPInstruction::WideIVStep || |
| 1785 | Opcode == VPInstruction::ReductionStartVector || |
| 1786 | Opcode == VPInstruction::ComputeReductionResult; |
| 1787 | case OperationType::NonNegOp: |
| 1788 | return Opcode == Instruction::ZExt;
| 1790 | case OperationType::Cmp: |
| 1791 | return Opcode == Instruction::ICmp; |
| 1792 | case OperationType::Other: |
| 1793 | return true; |
| 1794 | } |
| 1795 | llvm_unreachable("Unknown OperationType enum" ); |
| 1796 | } |
| 1797 | #endif |
| 1798 | |
| 1799 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1800 | void VPIRFlags::printFlags(raw_ostream &O) const { |
| 1801 | switch (OpType) { |
| 1802 | case OperationType::Cmp: |
| 1803 | O << " " << CmpInst::getPredicateName(getPredicate()); |
| 1804 | break; |
| 1805 | case OperationType::DisjointOp: |
| 1806 | if (DisjointFlags.IsDisjoint) |
| 1807 | O << " disjoint" ; |
| 1808 | break; |
| 1809 | case OperationType::PossiblyExactOp: |
| 1810 | if (ExactFlags.IsExact) |
| 1811 | O << " exact" ; |
| 1812 | break; |
| 1813 | case OperationType::OverflowingBinOp: |
| 1814 | if (WrapFlags.HasNUW) |
| 1815 | O << " nuw" ; |
| 1816 | if (WrapFlags.HasNSW) |
| 1817 | O << " nsw" ; |
| 1818 | break; |
| 1819 | case OperationType::FPMathOp: |
| 1820 | getFastMathFlags().print(O); |
| 1821 | break; |
| 1822 | case OperationType::GEPOp: |
| 1823 | if (GEPFlags.isInBounds()) |
| 1824 | O << " inbounds" ; |
| 1825 | else if (GEPFlags.hasNoUnsignedSignedWrap()) |
| 1826 | O << " nusw" ; |
| 1827 | if (GEPFlags.hasNoUnsignedWrap()) |
| 1828 | O << " nuw" ; |
| 1829 | break; |
| 1830 | case OperationType::NonNegOp: |
| 1831 | if (NonNegFlags.NonNeg) |
| 1832 | O << " nneg" ; |
| 1833 | break; |
| 1834 | case OperationType::Other: |
| 1835 | break; |
| 1836 | } |
| 1837 | O << " " ; |
| 1838 | } |
| 1839 | #endif |
| 1840 | |
| 1841 | void VPWidenRecipe::execute(VPTransformState &State) { |
| 1842 | auto &Builder = State.Builder; |
| 1843 | switch (Opcode) { |
| 1844 | case Instruction::Call: |
| 1845 | case Instruction::Br: |
| 1846 | case Instruction::PHI: |
| 1847 | case Instruction::GetElementPtr: |
| 1848 | case Instruction::Select: |
| 1849 | llvm_unreachable("This instruction is handled by a different recipe." ); |
| 1850 | case Instruction::UDiv: |
| 1851 | case Instruction::SDiv: |
| 1852 | case Instruction::SRem: |
| 1853 | case Instruction::URem: |
| 1854 | case Instruction::Add: |
| 1855 | case Instruction::FAdd: |
| 1856 | case Instruction::Sub: |
| 1857 | case Instruction::FSub: |
| 1858 | case Instruction::FNeg: |
| 1859 | case Instruction::Mul: |
| 1860 | case Instruction::FMul: |
| 1861 | case Instruction::FDiv: |
| 1862 | case Instruction::FRem: |
| 1863 | case Instruction::Shl: |
| 1864 | case Instruction::LShr: |
| 1865 | case Instruction::AShr: |
| 1866 | case Instruction::And: |
| 1867 | case Instruction::Or: |
| 1868 | case Instruction::Xor: { |
| 1869 | // Just widen unops and binops. |
| 1870 | SmallVector<Value *, 2> Ops; |
| 1871 | for (VPValue *VPOp : operands()) |
| 1872 | Ops.push_back(Elt: State.get(Def: VPOp)); |
| 1873 | |
| 1874 | Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops); |
| 1875 | |
| 1876 | if (auto *VecOp = dyn_cast<Instruction>(Val: V)) { |
| 1877 | applyFlags(I&: *VecOp); |
| 1878 | applyMetadata(I&: *VecOp); |
| 1879 | } |
| 1880 | |
| 1881 | // Use this vector value for all users of the original instruction. |
| 1882 | State.set(Def: this, V); |
| 1883 | break; |
| 1884 | } |
| 1885 | case Instruction::ExtractValue: { |
| 1886 | assert(getNumOperands() == 2 && "expected single level extractvalue" ); |
| 1887 | Value *Op = State.get(Def: getOperand(N: 0)); |
| 1888 | auto *CI = cast<ConstantInt>(Val: getOperand(N: 1)->getLiveInIRValue()); |
| 1889 | Value *Extract = Builder.CreateExtractValue(Agg: Op, Idxs: CI->getZExtValue());
| 1890 | State.set(Def: this, V: Extract); |
| 1891 | break; |
| 1892 | } |
| 1893 | case Instruction::Freeze: { |
| 1894 | Value *Op = State.get(Def: getOperand(N: 0)); |
| 1895 | Value *Freeze = Builder.CreateFreeze(V: Op); |
| 1896 | State.set(Def: this, V: Freeze); |
| 1897 | break; |
| 1898 | } |
| 1899 | case Instruction::ICmp: |
| 1900 | case Instruction::FCmp: { |
| 1901 | // Widen compares. Generate vector compares. |
| 1902 | bool FCmp = Opcode == Instruction::FCmp; |
| 1903 | Value *A = State.get(Def: getOperand(N: 0)); |
| 1904 | Value *B = State.get(Def: getOperand(N: 1)); |
| 1905 | Value *C = nullptr; |
| 1906 | if (FCmp) { |
| 1907 | // Propagate fast math flags. |
| 1908 | C = Builder.CreateFCmpFMF( |
| 1909 | P: getPredicate(), LHS: A, RHS: B, |
| 1910 | FMFSource: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue())); |
| 1911 | } else { |
| 1912 | C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B); |
| 1913 | } |
| 1914 | if (auto *I = dyn_cast<Instruction>(Val: C)) |
| 1915 | applyMetadata(I&: *I); |
| 1916 | State.set(Def: this, V: C); |
| 1917 | break; |
| 1918 | } |
| 1919 | default: |
| 1920 | // This instruction is not vectorized by simple widening. |
| 1921 | LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " |
| 1922 | << Instruction::getOpcodeName(Opcode)); |
| 1923 | llvm_unreachable("Unhandled instruction!" ); |
| 1924 | } // end of switch. |
| 1925 | |
| 1926 | #if !defined(NDEBUG) |
| 1927 | // Verify that VPlan type inference results agree with the type of the |
| 1928 | // generated values. |
| 1929 | assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) == |
| 1930 | State.get(this)->getType() && |
| 1931 | "inferred type and type from generated instructions do not match" ); |
| 1932 | #endif |
| 1933 | } |
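|      | // Illustrative example for the widening above (assuming VF=4): a scalar binop
|      | //   %r = add nsw i32 %a, %b
|      | // becomes
|      | //   %wide.r = add nsw <4 x i32> %wide.a, %wide.b
|      | // with flags and metadata re-applied via applyFlags/applyMetadata.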
| 1934 | |
| 1935 | InstructionCost VPWidenRecipe::computeCost(ElementCount VF, |
| 1936 | VPCostContext &Ctx) const { |
| 1937 | switch (Opcode) { |
| 1938 | case Instruction::FNeg: { |
| 1939 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1940 | return Ctx.TTI.getArithmeticInstrCost( |
| 1941 | Opcode, Ty: VectorTy, CostKind: Ctx.CostKind, |
| 1942 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 1943 | Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}); |
| 1944 | } |
| 1945 | |
| 1946 | case Instruction::UDiv: |
| 1947 | case Instruction::SDiv: |
| 1948 | case Instruction::SRem: |
| 1949 | case Instruction::URem: |
| 1950 | // More complex computation, let the legacy cost-model handle this for now. |
| 1951 | return Ctx.getLegacyCost(UI: cast<Instruction>(Val: getUnderlyingValue()), VF); |
| 1952 | case Instruction::Add: |
| 1953 | case Instruction::FAdd: |
| 1954 | case Instruction::Sub: |
| 1955 | case Instruction::FSub: |
| 1956 | case Instruction::Mul: |
| 1957 | case Instruction::FMul: |
| 1958 | case Instruction::FDiv: |
| 1959 | case Instruction::FRem: |
| 1960 | case Instruction::Shl: |
| 1961 | case Instruction::LShr: |
| 1962 | case Instruction::AShr: |
| 1963 | case Instruction::And: |
| 1964 | case Instruction::Or: |
| 1965 | case Instruction::Xor: { |
| 1966 | VPValue *RHS = getOperand(N: 1); |
| 1967 | // Certain instructions can be cheaper to vectorize if they have a constant |
| 1968 | // second vector operand. One example of this are shifts on x86. |
| 1969 | TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(V: RHS); |
| 1970 | |
| 1971 | if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && |
| 1972 | getOperand(N: 1)->isDefinedOutsideLoopRegions()) |
| 1973 | RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; |
| 1974 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1975 | Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()); |
| 1976 | |
| 1977 | SmallVector<const Value *, 4> Operands; |
| 1978 | if (CtxI) |
| 1979 | Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end()); |
| 1980 | return Ctx.TTI.getArithmeticInstrCost( |
| 1981 | Opcode, Ty: VectorTy, CostKind: Ctx.CostKind, |
| 1982 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 1983 | Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI); |
| 1984 | } |
| 1985 | case Instruction::Freeze: { |
| 1986 | // Freeze has no dedicated cost entry; conservatively cost it like a 'mul'.
| 1987 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1988 | return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, |
| 1989 | CostKind: Ctx.CostKind); |
| 1990 | } |
| 1991 | case Instruction::ExtractValue: { |
| 1992 | return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue, |
| 1993 | CostKind: Ctx.CostKind); |
| 1994 | } |
| 1995 | case Instruction::ICmp: |
| 1996 | case Instruction::FCmp: { |
| 1997 | Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()); |
| 1998 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
| 1999 | return Ctx.TTI.getCmpSelInstrCost( |
| 2000 | Opcode, ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy), VecPred: getPredicate(), |
| 2001 | CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, |
| 2002 | Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI); |
| 2003 | } |
| 2004 | default: |
| 2005 | llvm_unreachable("Unsupported opcode for instruction" ); |
| 2006 | } |
| 2007 | } |
| 2008 | |
| 2009 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2010 | void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2011 | VPSlotTracker &SlotTracker) const { |
| 2012 | O << Indent << "WIDEN " ; |
| 2013 | printAsOperand(O, SlotTracker); |
| 2014 | O << " = " << Instruction::getOpcodeName(Opcode); |
| 2015 | printFlags(O); |
| 2016 | printOperands(O, SlotTracker); |
| 2017 | } |
| 2018 | #endif |
| 2019 | |
| 2020 | void VPWidenCastRecipe::execute(VPTransformState &State) { |
| 2021 | auto &Builder = State.Builder; |
| 2022 | // Vectorize the cast.
| 2023 | assert(State.VF.isVector() && "Not vectorizing?" ); |
| 2024 | Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF); |
| 2025 | VPValue *Op = getOperand(N: 0); |
| 2026 | Value *A = State.get(Def: Op); |
| 2027 | Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy); |
| 2028 | State.set(Def: this, V: Cast); |
| 2029 | if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) { |
| 2030 | applyFlags(I&: *CastOp); |
| 2031 | applyMetadata(I&: *CastOp); |
| 2032 | } |
| 2033 | } |
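|      | // Illustrative example for the cast widening above (assuming VF=4):
|      | //   %e = zext i8 %x to i32   becomes   %wide.e = zext <4 x i8> %wide.x to <4 x i32>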
| 2034 | |
| 2035 | InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, |
| 2036 | VPCostContext &Ctx) const { |
| 2037 | // TODO: In some cases, VPWidenCastRecipes are created but not considered in |
| 2038 | // the legacy cost model, including truncates/extends when evaluating a |
| 2039 | // reduction in a smaller type. |
| 2040 | if (!getUnderlyingValue()) |
| 2041 | return 0; |
| 2042 | // Computes the CastContextHint from a recipe that may access memory.
| 2043 | auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { |
| 2044 | if (VF.isScalar()) |
| 2045 | return TTI::CastContextHint::Normal; |
| 2046 | if (isa<VPInterleaveRecipe>(Val: R)) |
| 2047 | return TTI::CastContextHint::Interleave; |
| 2048 | if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R)) |
| 2049 | return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked |
| 2050 | : TTI::CastContextHint::Normal; |
| 2051 | const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R); |
| 2052 | if (WidenMemoryRecipe == nullptr) |
| 2053 | return TTI::CastContextHint::None; |
| 2054 | if (!WidenMemoryRecipe->isConsecutive()) |
| 2055 | return TTI::CastContextHint::GatherScatter; |
| 2056 | if (WidenMemoryRecipe->isReverse()) |
| 2057 | return TTI::CastContextHint::Reversed; |
| 2058 | if (WidenMemoryRecipe->isMasked()) |
| 2059 | return TTI::CastContextHint::Masked; |
| 2060 | return TTI::CastContextHint::Normal; |
| 2061 | }; |
| 2062 | |
| 2063 | VPValue *Operand = getOperand(N: 0); |
| 2064 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
| 2065 | // For Trunc/FPTrunc, get the context from the only user. |
| 2066 | if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) && |
| 2067 | !hasMoreThanOneUniqueUser() && getNumUsers() > 0) { |
| 2068 | if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(Val: *user_begin())) |
| 2069 | CCH = ComputeCCH(StoreRecipe); |
| 2070 | } |
| 2071 | // For Z/Sext, get the context from the operand. |
| 2072 | else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || |
| 2073 | Opcode == Instruction::FPExt) { |
| 2074 | if (Operand->isLiveIn()) |
| 2075 | CCH = TTI::CastContextHint::Normal; |
| 2076 | else if (Operand->getDefiningRecipe()) |
| 2077 | CCH = ComputeCCH(Operand->getDefiningRecipe()); |
| 2078 | } |
| 2079 | |
| 2080 | auto *SrcTy = |
| 2081 | cast<VectorType>(Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: Operand), EC: VF)); |
| 2082 | auto *DestTy = cast<VectorType>(Val: toVectorTy(Scalar: getResultType(), EC: VF)); |
| 2083 | // Arm TTI will use the underlying instruction to determine the cost. |
| 2084 | return Ctx.TTI.getCastInstrCost( |
| 2085 | Opcode, Dst: DestTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind, |
| 2086 | I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue())); |
| 2087 | } |
| 2088 | |
| 2089 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2090 | void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2091 | VPSlotTracker &SlotTracker) const { |
| 2092 | O << Indent << "WIDEN-CAST " ; |
| 2093 | printAsOperand(O, SlotTracker); |
| 2094 | O << " = " << Instruction::getOpcodeName(Opcode); |
| 2095 | printFlags(O); |
| 2096 | printOperands(O, SlotTracker); |
| 2097 | O << " to " << *getResultType(); |
| 2098 | } |
| 2099 | #endif |
| 2100 | |
| 2101 | InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
| 2102 | VPCostContext &Ctx) const { |
| 2103 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 2104 | } |
| 2105 | |
| 2106 | /// A helper function that returns a signed integer or floating-point constant
| 2107 | /// with value C.
| 2108 | static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { |
| 2109 | return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, V: C) |
| 2110 | : ConstantFP::get(Ty, V: C); |
| 2111 | } |
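|      | // For example, getSignedIntOrFpConstant(i32, -1) yields i32 -1, while for a
|      | // floating-point type the same call yields e.g. float -1.0.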
| 2112 | |
| 2113 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2114 | void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2115 | VPSlotTracker &SlotTracker) const { |
| 2116 | O << Indent; |
| 2117 | printAsOperand(O, SlotTracker); |
| 2118 | O << " = WIDEN-INDUCTION " ; |
| 2119 | printOperands(O, SlotTracker); |
| 2120 | |
| 2121 | if (auto *TI = getTruncInst()) |
| 2122 | O << " (truncated to " << *TI->getType() << ")" ; |
| 2123 | } |
| 2124 | #endif |
| 2125 | |
| 2126 | bool VPWidenIntOrFpInductionRecipe::isCanonical() const { |
| 2127 | // The step may be defined by a recipe in the preheader (e.g. if it requires |
| 2128 | // SCEV expansion), but for the canonical induction the step is required to be |
| 2129 | // 1, which is represented as live-in. |
| 2130 | if (getStepValue()->getDefiningRecipe()) |
| 2131 | return false; |
| 2132 | auto *StepC = dyn_cast<ConstantInt>(Val: getStepValue()->getLiveInIRValue()); |
| 2133 | auto *StartC = dyn_cast<ConstantInt>(Val: getStartValue()->getLiveInIRValue()); |
| 2134 | auto *CanIV = cast<VPCanonicalIVPHIRecipe>(Val: &*getParent()->begin()); |
| 2135 | return StartC && StartC->isZero() && StepC && StepC->isOne() && |
| 2136 | getScalarType() == CanIV->getScalarType(); |
| 2137 | } |
| 2138 | |
| 2139 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2140 | void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2141 | VPSlotTracker &SlotTracker) const { |
| 2142 | O << Indent; |
| 2143 | printAsOperand(O, SlotTracker); |
| 2144 | O << " = DERIVED-IV " ; |
| 2145 | getStartValue()->printAsOperand(O, SlotTracker); |
| 2146 | O << " + " ; |
| 2147 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 2148 | O << " * " ; |
| 2149 | getStepValue()->printAsOperand(O, SlotTracker); |
| 2150 | } |
| 2151 | #endif |
| 2152 | |
| 2153 | void VPScalarIVStepsRecipe::execute(VPTransformState &State) { |
| 2154 | // Fast-math-flags propagate from the original induction instruction. |
| 2155 | IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); |
| 2156 | if (hasFastMathFlags()) |
| 2157 | State.Builder.setFastMathFlags(getFastMathFlags()); |
| 2158 | |
| 2159 | // Compute scalar induction steps: operand 0 provides the scalar induction
| 2160 | // value to base the steps on, and getStepValue() provides the step size.
| 2161 | |
| 2162 | Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 2163 | Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0)); |
| 2164 | IRBuilderBase &Builder = State.Builder; |
| 2165 | |
| 2166 | // Ensure step has the same type as that of scalar IV. |
| 2167 | Type *BaseIVTy = BaseIV->getType()->getScalarType(); |
| 2168 | assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!" ); |
| 2169 | |
| 2170 | // We build scalar steps for both integer and floating-point induction |
| 2171 | // variables. Here, we determine the kind of arithmetic we will perform. |
| 2172 | Instruction::BinaryOps AddOp; |
| 2173 | Instruction::BinaryOps MulOp; |
| 2174 | if (BaseIVTy->isIntegerTy()) { |
| 2175 | AddOp = Instruction::Add; |
| 2176 | MulOp = Instruction::Mul; |
| 2177 | } else { |
| 2178 | AddOp = InductionOpcode; |
| 2179 | MulOp = Instruction::FMul; |
| 2180 | } |
| 2181 | |
| 2182 | // Determine the number of scalars we need to generate for each unroll |
| 2183 | // iteration. |
| 2184 | bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this); |
| 2185 | // Compute the scalar steps and save the results in State. |
| 2186 | Type *IntStepTy = |
| 2187 | IntegerType::get(C&: BaseIVTy->getContext(), NumBits: BaseIVTy->getScalarSizeInBits()); |
| 2188 | Type *VecIVTy = nullptr; |
| 2189 | Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; |
| 2190 | if (!FirstLaneOnly && State.VF.isScalable()) { |
| 2191 | VecIVTy = VectorType::get(ElementType: BaseIVTy, EC: State.VF); |
| 2192 | UnitStepVec = |
| 2193 | Builder.CreateStepVector(DstType: VectorType::get(ElementType: IntStepTy, EC: State.VF)); |
| 2194 | SplatStep = Builder.CreateVectorSplat(EC: State.VF, V: Step); |
| 2195 | SplatIV = Builder.CreateVectorSplat(EC: State.VF, V: BaseIV); |
| 2196 | } |
| 2197 | |
| 2198 | unsigned StartLane = 0; |
| 2199 | unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); |
| 2200 | if (State.Lane) { |
| 2201 | StartLane = State.Lane->getKnownLane(); |
| 2202 | EndLane = StartLane + 1; |
| 2203 | } |
| 2204 | Value *StartIdx0; |
| 2205 | if (getUnrollPart(U&: *this) == 0) |
| 2206 | StartIdx0 = ConstantInt::get(Ty: IntStepTy, V: 0); |
| 2207 | else { |
| 2208 | StartIdx0 = State.get(Def: getOperand(N: 2), IsScalar: true); |
| 2209 | if (getUnrollPart(U&: *this) != 1) { |
| 2210 | StartIdx0 = |
| 2211 | Builder.CreateMul(LHS: StartIdx0, RHS: ConstantInt::get(Ty: StartIdx0->getType(), |
| 2212 | V: getUnrollPart(U&: *this))); |
| 2213 | } |
| 2214 | StartIdx0 = Builder.CreateSExtOrTrunc(V: StartIdx0, DestTy: IntStepTy); |
| 2215 | } |
| 2216 | |
| 2217 | if (!FirstLaneOnly && State.VF.isScalable()) { |
| 2218 | auto *SplatStartIdx = Builder.CreateVectorSplat(EC: State.VF, V: StartIdx0); |
| 2219 | auto *InitVec = Builder.CreateAdd(LHS: SplatStartIdx, RHS: UnitStepVec); |
| 2220 | if (BaseIVTy->isFloatingPointTy()) |
| 2221 | InitVec = Builder.CreateSIToFP(V: InitVec, DestTy: VecIVTy); |
| 2222 | auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: InitVec, RHS: SplatStep); |
| 2223 | auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: SplatIV, RHS: Mul); |
| 2224 | State.set(Def: this, V: Add); |
| 2225 | // It's useful to record the lane values too for the known minimum number |
| 2226 | // of elements so we do those below. This improves the code quality when |
| 2227 | // trying to extract the first element, for example. |
| 2228 | } |
| 2229 | |
| 2230 | if (BaseIVTy->isFloatingPointTy()) |
| 2231 | StartIdx0 = Builder.CreateSIToFP(V: StartIdx0, DestTy: BaseIVTy); |
| 2232 | |
| 2233 | for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { |
| 2234 | Value *StartIdx = Builder.CreateBinOp( |
| 2235 | Opc: AddOp, LHS: StartIdx0, RHS: getSignedIntOrFpConstant(Ty: BaseIVTy, C: Lane)); |
| 2236 | // The step returned by `createStepForVF` is a runtime-evaluated value |
| 2237 | // when VF is scalable. Otherwise, it should be folded into a Constant. |
| 2238 | assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && |
| 2239 | "Expected StartIdx to be folded to a constant when VF is not " |
| 2240 | "scalable" ); |
| 2241 | auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step); |
| 2242 | auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul); |
| 2243 | State.set(Def: this, V: Add, Lane: VPLane(Lane)); |
| 2244 | } |
| 2245 | } |
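|      | // Illustrative example for the scalar steps above (assuming a fixed VF of 4,
|      | // unroll part 0, integer IV): with BaseIV = %iv and Step = %s the per-lane
|      | // values are %iv + 0*%s, %iv + 1*%s, %iv + 2*%s and %iv + 3*%s; with a
|      | // scalable VF the whole vector BaseIV + (StartIdx0 + stepvector) * Step is
|      | // recorded as well.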
| 2246 | |
| 2247 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2248 | void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2249 | VPSlotTracker &SlotTracker) const { |
| 2250 | O << Indent; |
| 2251 | printAsOperand(O, SlotTracker); |
| 2252 | O << " = SCALAR-STEPS " ; |
| 2253 | printOperands(O, SlotTracker); |
| 2254 | } |
| 2255 | #endif |
| 2256 | |
| 2257 | void VPWidenGEPRecipe::execute(VPTransformState &State) { |
| 2258 | assert(State.VF.isVector() && "not widening" ); |
| 2259 | auto *GEP = cast<GetElementPtrInst>(Val: getUnderlyingInstr()); |
| 2260 | // Construct a vector GEP by widening the operands of the scalar GEP as |
| 2261 | // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP |
| 2262 | // results in a vector of pointers when at least one operand of the GEP |
| 2263 | // is vector-typed. Thus, to keep the representation compact, we only use |
| 2264 | // vector-typed operands for loop-varying values. |
| 2265 | |
| 2266 | if (areAllOperandsInvariant()) { |
| 2267 | // If we are vectorizing, but the GEP has only loop-invariant operands, |
| 2268 | // the GEP we build (by only using vector-typed operands for |
| 2269 | // loop-varying values) would be a scalar pointer. Thus, to ensure we |
| 2270 | // produce a vector of pointers, we need to either arbitrarily pick an |
| 2271 | // operand to broadcast, or broadcast a clone of the original GEP. |
| 2272 | // Here, we broadcast a clone of the original. |
| 2273 | // |
| 2274 | // TODO: If at some point we decide to scalarize instructions having |
| 2275 | // loop-invariant operands, this special case will no longer be |
| 2276 | // required. We would add the scalarization decision to |
| 2277 | // collectLoopScalars() and teach getVectorValue() to broadcast |
| 2278 | // the lane-zero scalar value. |
| 2279 | SmallVector<Value *> Ops; |
| 2280 | for (unsigned I = 0, E = getNumOperands(); I != E; I++) |
| 2281 | Ops.push_back(Elt: State.get(Def: getOperand(N: I), Lane: VPLane(0))); |
| 2282 | |
| 2283 | auto *NewGEP = State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0], |
| 2284 | IdxList: ArrayRef(Ops).drop_front(), Name: "" , |
| 2285 | NW: getGEPNoWrapFlags()); |
| 2286 | Value *Splat = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP); |
| 2287 | State.set(Def: this, V: Splat); |
| 2288 | } else { |
| 2289 | // If the GEP has at least one loop-varying operand, we are sure to |
| 2290 | // produce a vector of pointers unless VF is scalar. |
| 2291 | // The pointer operand of the new GEP. If it's loop-invariant, we |
| 2292 | // won't broadcast it. |
| 2293 | auto *Ptr = isPointerLoopInvariant() ? State.get(Def: getOperand(N: 0), Lane: VPLane(0)) |
| 2294 | : State.get(Def: getOperand(N: 0)); |
| 2295 | |
| 2296 | // Collect all the indices for the new GEP. If any index is |
| 2297 | // loop-invariant, we won't broadcast it. |
| 2298 | SmallVector<Value *, 4> Indices; |
| 2299 | for (unsigned I = 1, E = getNumOperands(); I < E; I++) { |
| 2300 | VPValue *Operand = getOperand(N: I); |
| 2301 | if (isIndexLoopInvariant(I: I - 1)) |
| 2302 | Indices.push_back(Elt: State.get(Def: Operand, Lane: VPLane(0))); |
| 2303 | else |
| 2304 | Indices.push_back(Elt: State.get(Def: Operand)); |
| 2305 | } |
| 2306 | |
| 2307 | // Create the new GEP. Note that this GEP may be a scalar if VF == 1, |
| 2308 | // but it should be a vector, otherwise. |
| 2309 | auto *NewGEP = State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr, |
| 2310 | IdxList: Indices, Name: "" , NW: getGEPNoWrapFlags()); |
| 2311 | assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && |
| 2312 | "NewGEP is not a pointer vector" ); |
| 2313 | State.set(Def: this, V: NewGEP); |
| 2314 | } |
| 2315 | } |
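|      | // Illustrative example for the GEP widening above (assuming VF=4, hypothetical
|      | // names): an invariant base with a loop-varying index becomes
|      | //   %wide.gep = getelementptr i32, ptr %base, <4 x i64> %wide.idx
|      | // yielding a vector of pointers; if every operand is invariant, the scalar GEP
|      | // result is broadcast with a vector splat instead.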
| 2316 | |
| 2317 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2318 | void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2319 | VPSlotTracker &SlotTracker) const { |
| 2320 | O << Indent << "WIDEN-GEP " ; |
| 2321 | O << (isPointerLoopInvariant() ? "Inv" : "Var" ); |
| 2322 | for (size_t I = 0; I < getNumOperands() - 1; ++I) |
| 2323 | O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var" ) << "]" ; |
| 2324 | |
| 2325 | O << " " ; |
| 2326 | printAsOperand(O, SlotTracker); |
| 2327 | O << " = getelementptr" ; |
| 2328 | printFlags(O); |
| 2329 | printOperands(O, SlotTracker); |
| 2330 | } |
| 2331 | #endif |
| 2332 | |
| 2333 | static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride, |
| 2334 | unsigned CurrentPart, IRBuilderBase &Builder) { |
| 2335 | // Use i32 for the GEP index type when the offset is a compile-time constant
| 2336 | // (unit stride, no runtime scalable-VF term); otherwise query DataLayout for a more suitable index type.
| 2337 | const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); |
| 2338 | return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0)) |
| 2339 | ? DL.getIndexType(PtrTy: Builder.getPtrTy(AddrSpace: 0)) |
| 2340 | : Builder.getInt32Ty(); |
| 2341 | } |
| 2342 | |
| 2343 | void VPVectorEndPointerRecipe::execute(VPTransformState &State) { |
| 2344 | auto &Builder = State.Builder; |
| 2345 | unsigned CurrentPart = getUnrollPart(U&: *this); |
| 2346 | bool IsUnitStride = Stride == 1 || Stride == -1; |
| 2347 | Type *IndexTy = getGEPIndexTy(IsScalable: State.VF.isScalable(), /*IsReverse*/ true, |
| 2348 | IsUnitStride, CurrentPart, Builder); |
| 2349 | |
| 2350 | // The reversed wide load or store needs to start at the last vector element.
| 2351 | Value *RunTimeVF = State.get(Def: getVFValue(), Lane: VPLane(0)); |
| 2352 | if (IndexTy != RunTimeVF->getType()) |
| 2353 | RunTimeVF = Builder.CreateZExtOrTrunc(V: RunTimeVF, DestTy: IndexTy); |
| 2354 | // NumElt = Stride * CurrentPart * RunTimeVF |
| 2355 | Value *NumElt = Builder.CreateMul( |
| 2356 | LHS: ConstantInt::get(Ty: IndexTy, V: Stride * (int64_t)CurrentPart), RHS: RunTimeVF); |
| 2357 | // LastLane = Stride * (RunTimeVF - 1) |
| 2358 | Value *LastLane = Builder.CreateSub(LHS: RunTimeVF, RHS: ConstantInt::get(Ty: IndexTy, V: 1)); |
| 2359 | if (Stride != 1) |
| 2360 | LastLane = Builder.CreateMul(LHS: ConstantInt::get(Ty: IndexTy, V: Stride), RHS: LastLane); |
| 2361 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 2362 | Value *ResultPtr = |
| 2363 | Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: NumElt, Name: "" , NW: getGEPNoWrapFlags()); |
| 2364 | ResultPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr: ResultPtr, IdxList: LastLane, Name: "" , |
| 2365 | NW: getGEPNoWrapFlags()); |
| 2366 | |
| 2367 | State.set(Def: this, V: ResultPtr, /*IsScalar*/ true); |
| 2368 | } |
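|      | // Worked example for the end-pointer computation above (assuming a fixed VF of
|      | // 4, Stride = -1, unroll part 0): RunTimeVF = 4, so NumElt = -1*0*4 = 0 and
|      | // LastLane = -(4 - 1) = -3; the result is the input pointer advanced by -3
|      | // elements, i.e. where the reversed wide access for this part starts.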
| 2369 | |
| 2370 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2371 | void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2372 | VPSlotTracker &SlotTracker) const { |
| 2373 | O << Indent; |
| 2374 | printAsOperand(O, SlotTracker); |
| 2375 | O << " = vector-end-pointer" ; |
| 2376 | printFlags(O); |
| 2377 | printOperands(O, SlotTracker); |
| 2378 | } |
| 2379 | #endif |
| 2380 | |
| 2381 | void VPVectorPointerRecipe::execute(VPTransformState &State) { |
| 2382 | auto &Builder = State.Builder; |
| 2383 | unsigned CurrentPart = getUnrollPart(U&: *this); |
| 2384 | Type *IndexTy = getGEPIndexTy(IsScalable: State.VF.isScalable(), /*IsReverse*/ false, |
| 2385 | /*IsUnitStride*/ true, CurrentPart, Builder); |
| 2386 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 2387 | |
| 2388 | Value *Increment = createStepForVF(B&: Builder, Ty: IndexTy, VF: State.VF, Step: CurrentPart); |
| 2389 | Value *ResultPtr = |
| 2390 | Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: Increment, Name: "" , NW: getGEPNoWrapFlags()); |
| 2391 | |
| 2392 | State.set(Def: this, V: ResultPtr, /*IsScalar*/ true); |
| 2393 | } |
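|      | // Illustrative example for the pointer computation above (assuming a fixed VF
|      | // of 4 and unroll part 1, with IndexedTy == i32): the increment folds to the
|      | // constant 4, emitting
|      | //   %part1.ptr = getelementptr i32, ptr %ptr, i32 4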
| 2394 | |
| 2395 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2396 | void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2397 | VPSlotTracker &SlotTracker) const { |
| 2398 | O << Indent; |
| 2399 | printAsOperand(O, SlotTracker); |
| 2400 | O << " = vector-pointer " ; |
| 2401 | |
| 2402 | printOperands(O, SlotTracker); |
| 2403 | } |
| 2404 | #endif |
| 2405 | |
| 2406 | void VPBlendRecipe::execute(VPTransformState &State) { |
| 2407 | assert(isNormalized() && "Expected blend to be normalized!" ); |
| 2408 | // We know that all PHIs in non-header blocks are converted into |
| 2409 | // selects, so we don't have to worry about the insertion order and we |
| 2410 | // can just use the builder. |
| 2411 | // At this point we generate the predication tree. There may be |
| 2412 | // duplications since this is a simple recursive scan, but future |
| 2413 | // optimizations will clean it up. |
| 2414 | |
| 2415 | unsigned NumIncoming = getNumIncomingValues(); |
| 2416 | |
| 2417 | // Generate a sequence of selects of the form: |
| 2418 | // SELECT(Mask3, In3, |
| 2419 | // SELECT(Mask2, In2, |
| 2420 | // SELECT(Mask1, In1, |
| 2421 | // In0))) |
| 2422 | // Note that Mask0 is never used: lanes for which no path reaches this phi are
| 2423 | // essentially undef and take their value from In0.
| 2424 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 2425 | Value *Result = nullptr; |
| 2426 | for (unsigned In = 0; In < NumIncoming; ++In) { |
| 2427 | // We might have single edge PHIs (blocks) - use an identity |
| 2428 | // 'select' for the first PHI operand. |
| 2429 | Value *In0 = State.get(Def: getIncomingValue(Idx: In), IsScalar: OnlyFirstLaneUsed); |
| 2430 | if (In == 0) |
| 2431 | Result = In0; // Initialize with the first incoming value. |
| 2432 | else { |
| 2433 | // Select between the current value and the previous incoming edge |
| 2434 | // based on the incoming mask. |
| 2435 | Value *Cond = State.get(Def: getMask(Idx: In), IsScalar: OnlyFirstLaneUsed); |
| 2436 | Result = State.Builder.CreateSelect(C: Cond, True: In0, False: Result, Name: "predphi" ); |
| 2437 | } |
| 2438 | } |
| 2439 | State.set(Def: this, V: Result, IsScalar: OnlyFirstLaneUsed); |
| 2440 | } |
| 2441 | |
| 2442 | InstructionCost VPBlendRecipe::computeCost(ElementCount VF, |
| 2443 | VPCostContext &Ctx) const { |
| 2444 | // Handle cases where only the first lane is used the same way as the legacy |
| 2445 | // cost model. |
| 2446 | if (vputils::onlyFirstLaneUsed(Def: this)) |
| 2447 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 2448 | |
| 2449 | Type *ResultTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 2450 | Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.Types.getContext()), EC: VF); |
| 2451 | return (getNumIncomingValues() - 1) * |
| 2452 | Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy, |
| 2453 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind); |
| 2454 | } |
| 2455 | |
| 2456 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2457 | void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2458 | VPSlotTracker &SlotTracker) const { |
| 2459 | O << Indent << "BLEND " ; |
| 2460 | printAsOperand(O, SlotTracker); |
| 2461 | O << " =" ; |
| 2462 | if (getNumIncomingValues() == 1) { |
| 2463 | // Not a User of any mask: not really blending, this is a |
| 2464 | // single-predecessor phi. |
| 2465 | O << " " ; |
| 2466 | getIncomingValue(0)->printAsOperand(O, SlotTracker); |
| 2467 | } else { |
| 2468 | for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { |
| 2469 | O << " " ; |
| 2470 | getIncomingValue(I)->printAsOperand(O, SlotTracker); |
| 2471 | if (I == 0) |
| 2472 | continue; |
| 2473 | O << "/" ; |
| 2474 | getMask(I)->printAsOperand(O, SlotTracker); |
| 2475 | } |
| 2476 | } |
| 2477 | } |
| 2478 | #endif |
| 2479 | |
| 2480 | void VPReductionRecipe::execute(VPTransformState &State) { |
| 2481 | assert(!State.Lane && "Reduction being replicated." ); |
| 2482 | Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true); |
| 2483 | RecurKind Kind = getRecurrenceKind(); |
| 2484 | assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && |
| 2485 | "In-loop AnyOf reductions aren't currently supported" ); |
| 2486 | // Propagate the fast-math flags carried by the underlying instruction. |
| 2487 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
| 2488 | State.Builder.setFastMathFlags(getFastMathFlags()); |
| 2489 | Value *NewVecOp = State.get(Def: getVecOp()); |
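  // For conditional (masked) in-loop reductions, lanes where the condition is
  // false are replaced with the reduction identity so they do not affect the
  // result; e.g. for an integer add reduction the identity is 0, giving
  // roughly:
  //   %sel = select <4 x i1> %cond, <4 x i32> %vec, <4 x i32> zeroinitializer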
| 2490 | if (VPValue *Cond = getCondOp()) { |
| 2491 | Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar()); |
| 2492 | VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType()); |
| 2493 | Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); |
| 2494 | |
| 2495 | Value *Start = getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlags()); |
| 2496 | if (State.VF.isVector()) |
| 2497 | Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start); |
| 2498 | |
| 2499 | Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start); |
| 2500 | NewVecOp = Select; |
| 2501 | } |
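  // Ordered (in-order FP) reductions fold the vector operand directly into the
  // scalar chain value; unordered reductions first reduce the vector operand
  // and then combine the result with the previous chain value using the
  // reduction's binary op (or a min/max op).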
| 2502 | Value *NewRed; |
| 2503 | Value *NextInChain; |
| 2504 | if (IsOrdered) { |
| 2505 | if (State.VF.isVector()) |
| 2506 | NewRed = |
| 2507 | createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain); |
| 2508 | else |
| 2509 | NewRed = State.Builder.CreateBinOp( |
| 2510 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), |
| 2511 | LHS: PrevInChain, RHS: NewVecOp); |
| 2512 | PrevInChain = NewRed; |
| 2513 | NextInChain = NewRed; |
| 2514 | } else { |
| 2515 | PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true); |
| 2516 | NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind); |
| 2517 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
| 2518 | NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain); |
| 2519 | else |
| 2520 | NextInChain = State.Builder.CreateBinOp( |
| 2521 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed, |
| 2522 | RHS: PrevInChain); |
| 2523 | } |
| 2524 | State.set(Def: this, V: NextInChain, /*IsScalar*/ true); |
| 2525 | } |
| 2526 | |
| 2527 | void VPReductionEVLRecipe::execute(VPTransformState &State) { |
| 2528 | assert(!State.Lane && "Reduction being replicated." ); |
| 2529 | |
| 2530 | auto &Builder = State.Builder; |
| 2531 | // Propagate the fast-math flags carried by the underlying instruction. |
| 2532 | IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); |
| 2533 | Builder.setFastMathFlags(getFastMathFlags()); |
| 2534 | |
| 2535 | RecurKind Kind = getRecurrenceKind(); |
| 2536 | Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true); |
| 2537 | Value *VecOp = State.get(Def: getVecOp()); |
| 2538 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 2539 | |
| 2540 | Value *Mask; |
| 2541 | if (VPValue *CondOp = getCondOp()) |
| 2542 | Mask = State.get(Def: CondOp); |
| 2543 | else |
| 2544 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
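  // The VP reduction intrinsics require a mask operand; when the recipe has no
  // condition, use an all-true mask so every lane up to EVL participates.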
| 2545 | |
| 2546 | Value *NewRed; |
| 2547 | if (isOrdered()) { |
| 2548 | NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL); |
| 2549 | } else { |
| 2550 | NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL); |
| 2551 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
| 2552 | NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev); |
| 2553 | else |
| 2554 | NewRed = Builder.CreateBinOp( |
| 2555 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed, |
| 2556 | RHS: Prev); |
| 2557 | } |
| 2558 | State.set(Def: this, V: NewRed, /*IsScalar*/ true); |
| 2559 | } |
| 2560 | |
| 2561 | InstructionCost VPReductionRecipe::computeCost(ElementCount VF, |
| 2562 | VPCostContext &Ctx) const { |
| 2563 | RecurKind RdxKind = getRecurrenceKind(); |
| 2564 | Type *ElementTy = Ctx.Types.inferScalarType(V: this); |
| 2565 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF)); |
| 2566 | unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind); |
| 2567 | FastMathFlags FMFs = getFastMathFlags(); |
| 2568 | std::optional<FastMathFlags> OptionalFMF = |
| 2569 | ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt; |
| 2570 | |
| 2571 | // TODO: Support any-of reductions. |
| 2572 | assert( |
| 2573 | (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || |
| 2574 | ForceTargetInstructionCost.getNumOccurrences() > 0) && |
| 2575 | "Any-of reduction not implemented in VPlan-based cost model currently." ); |
| 2576 | |
  // Note that TTI should model the cost of moving the result to a scalar
  // register and the BinOp cost in getMinMaxReductionCost().
| 2579 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) { |
| 2580 | Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind); |
| 2581 | return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind); |
| 2582 | } |
| 2583 | |
  // Note that TTI should model the cost of moving the result to a scalar
  // register and the BinOp cost in getArithmeticReductionCost().
| 2586 | return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF, |
| 2587 | CostKind: Ctx.CostKind); |
| 2588 | } |
| 2589 | |
| 2590 | VPExpressionRecipe::VPExpressionRecipe( |
| 2591 | ExpressionTypes ExpressionType, |
| 2592 | ArrayRef<VPSingleDefRecipe *> ExpressionRecipes) |
| 2593 | : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}), |
| 2594 | ExpressionRecipes(SetVector<VPSingleDefRecipe *>( |
| 2595 | ExpressionRecipes.begin(), ExpressionRecipes.end()) |
| 2596 | .takeVector()), |
| 2597 | ExpressionType(ExpressionType) { |
| 2598 | assert(!ExpressionRecipes.empty() && "Nothing to combine?" ); |
| 2599 | assert( |
| 2600 | none_of(ExpressionRecipes, |
| 2601 | [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
| 2602 | "expression cannot contain recipes with side-effects" ); |
| 2603 | |
| 2604 | // Maintain a copy of the expression recipes as a set of users. |
| 2605 | SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers; |
| 2606 | for (auto *R : ExpressionRecipes) |
| 2607 | ExpressionRecipesAsSetOfUsers.insert(Ptr: R); |
| 2608 | |
| 2609 | // Recipes in the expression, except the last one, must only be used by |
| 2610 | // (other) recipes inside the expression. If there are other users, external |
| 2611 | // to the expression, use a clone of the recipe for external users. |
| 2612 | for (VPSingleDefRecipe *R : ExpressionRecipes) { |
| 2613 | if (R != ExpressionRecipes.back() && |
| 2614 | any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) { |
| 2615 | return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U); |
| 2616 | })) { |
      // There are users outside of the expression. Clone the recipe and use the
      // clone for those external users.
| 2619 | VPSingleDefRecipe *CopyForExtUsers = R->clone(); |
| 2620 | R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers]( |
| 2621 | VPUser &U, unsigned) { |
| 2622 | return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U); |
| 2623 | }); |
| 2624 | CopyForExtUsers->insertBefore(InsertPos: R); |
| 2625 | } |
| 2626 | if (R->getParent()) |
| 2627 | R->removeFromParent(); |
| 2628 | } |
| 2629 | |
| 2630 | // Internalize all external operands to the expression recipes. To do so, |
| 2631 | // create new temporary VPValues for all operands defined by a recipe outside |
| 2632 | // the expression. The original operands are added as operands of the |
| 2633 | // VPExpressionRecipe itself. |
| 2634 | for (auto *R : ExpressionRecipes) { |
| 2635 | for (const auto &[Idx, Op] : enumerate(First: R->operands())) { |
| 2636 | auto *Def = Op->getDefiningRecipe(); |
| 2637 | if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def)) |
| 2638 | continue; |
| 2639 | addOperand(Operand: Op); |
| 2640 | LiveInPlaceholders.push_back(Elt: new VPValue()); |
| 2641 | R->setOperand(I: Idx, New: LiveInPlaceholders.back()); |
| 2642 | } |
| 2643 | } |
| 2644 | } |
| 2645 | |
| 2646 | void VPExpressionRecipe::decompose() { |
| 2647 | for (auto *R : ExpressionRecipes) |
| 2648 | R->insertBefore(InsertPos: this); |
| 2649 | |
| 2650 | for (const auto &[Idx, Op] : enumerate(First: operands())) |
| 2651 | LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op); |
| 2652 | |
| 2653 | replaceAllUsesWith(New: ExpressionRecipes.back()); |
| 2654 | ExpressionRecipes.clear(); |
| 2655 | } |
| 2656 | |
| 2657 | InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, |
| 2658 | VPCostContext &Ctx) const { |
| 2659 | Type *RedTy = Ctx.Types.inferScalarType(V: this); |
| 2660 | auto *SrcVecTy = cast<VectorType>( |
| 2661 | Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF)); |
| 2662 | assert(RedTy->isIntegerTy() && |
| 2663 | "VPExpressionRecipe only supports integer types currently." ); |
| 2664 | switch (ExpressionType) { |
| 2665 | case ExpressionTypes::ExtendedReduction: { |
| 2666 | unsigned Opcode = RecurrenceDescriptor::getOpcode( |
| 2667 | Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes[1])->getRecurrenceKind()); |
| 2668 | return Ctx.TTI.getExtendedReductionCost( |
| 2669 | Opcode, |
| 2670 | IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() == |
| 2671 | Instruction::ZExt, |
| 2672 | ResTy: RedTy, Ty: SrcVecTy, FMF: std::nullopt, CostKind: Ctx.CostKind); |
| 2673 | } |
| 2674 | case ExpressionTypes::MulAccReduction: |
| 2675 | return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind); |
| 2676 | |
| 2677 | case ExpressionTypes::ExtMulAccReduction: |
| 2678 | return Ctx.TTI.getMulAccReductionCost( |
| 2679 | IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() == |
| 2680 | Instruction::ZExt, |
| 2681 | ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind); |
| 2682 | } |
| 2683 | llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum" ); |
| 2684 | } |
| 2685 | |
| 2686 | bool VPExpressionRecipe::mayReadOrWriteMemory() const { |
| 2687 | return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) { |
| 2688 | return R->mayReadFromMemory() || R->mayWriteToMemory(); |
| 2689 | }); |
| 2690 | } |
| 2691 | |
| 2692 | bool VPExpressionRecipe::mayHaveSideEffects() const { |
| 2693 | assert( |
| 2694 | none_of(ExpressionRecipes, |
| 2695 | [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
| 2696 | "expression cannot contain recipes with side-effects" ); |
| 2697 | return false; |
| 2698 | } |
| 2699 | |
| 2700 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2701 | |
| 2702 | void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2703 | VPSlotTracker &SlotTracker) const { |
| 2704 | O << Indent << "EXPRESSION " ; |
| 2705 | printAsOperand(O, SlotTracker); |
| 2706 | O << " = " ; |
| 2707 | auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back()); |
| 2708 | unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); |
| 2709 | |
| 2710 | switch (ExpressionType) { |
| 2711 | case ExpressionTypes::ExtendedReduction: { |
| 2712 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 2713 | O << " +" ; |
| 2714 | O << " reduce." << Instruction::getOpcodeName(Opcode) << " (" ; |
| 2715 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 2716 | Red->printFlags(O); |
| 2717 | |
| 2718 | auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
| 2719 | O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
| 2720 | << *Ext0->getResultType(); |
| 2721 | if (Red->isConditional()) { |
| 2722 | O << ", " ; |
| 2723 | Red->getCondOp()->printAsOperand(O, SlotTracker); |
| 2724 | } |
| 2725 | O << ")" ; |
| 2726 | break; |
| 2727 | } |
| 2728 | case ExpressionTypes::MulAccReduction: |
| 2729 | case ExpressionTypes::ExtMulAccReduction: { |
| 2730 | getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); |
| 2731 | O << " + " ; |
| 2732 | O << "reduce." |
| 2733 | << Instruction::getOpcodeName( |
| 2734 | RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) |
| 2735 | << " (" ; |
| 2736 | O << "mul" ; |
| 2737 | bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction; |
| 2738 | auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2] |
| 2739 | : ExpressionRecipes[0]); |
| 2740 | Mul->printFlags(O); |
| 2741 | if (IsExtended) |
| 2742 | O << "(" ; |
| 2743 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 2744 | if (IsExtended) { |
| 2745 | auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
| 2746 | O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
| 2747 | << *Ext0->getResultType() << "), (" ; |
| 2748 | } else { |
| 2749 | O << ", " ; |
| 2750 | } |
| 2751 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 2752 | if (IsExtended) { |
| 2753 | auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); |
| 2754 | O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " |
| 2755 | << *Ext1->getResultType() << ")" ; |
| 2756 | } |
| 2757 | if (Red->isConditional()) { |
| 2758 | O << ", " ; |
| 2759 | Red->getCondOp()->printAsOperand(O, SlotTracker); |
| 2760 | } |
| 2761 | O << ")" ; |
| 2762 | break; |
| 2763 | } |
| 2764 | } |
| 2765 | } |
| 2766 | |
| 2767 | void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2768 | VPSlotTracker &SlotTracker) const { |
| 2769 | O << Indent << "REDUCE " ; |
| 2770 | printAsOperand(O, SlotTracker); |
| 2771 | O << " = " ; |
| 2772 | getChainOp()->printAsOperand(O, SlotTracker); |
| 2773 | O << " +" ; |
| 2774 | printFlags(O); |
| 2775 | O << " reduce." |
| 2776 | << Instruction::getOpcodeName( |
| 2777 | RecurrenceDescriptor::getOpcode(getRecurrenceKind())) |
| 2778 | << " (" ; |
| 2779 | getVecOp()->printAsOperand(O, SlotTracker); |
| 2780 | if (isConditional()) { |
| 2781 | O << ", " ; |
| 2782 | getCondOp()->printAsOperand(O, SlotTracker); |
| 2783 | } |
| 2784 | O << ")" ; |
| 2785 | } |
| 2786 | |
| 2787 | void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2788 | VPSlotTracker &SlotTracker) const { |
| 2789 | O << Indent << "REDUCE " ; |
| 2790 | printAsOperand(O, SlotTracker); |
| 2791 | O << " = " ; |
| 2792 | getChainOp()->printAsOperand(O, SlotTracker); |
| 2793 | O << " +" ; |
| 2794 | printFlags(O); |
| 2795 | O << " vp.reduce." |
| 2796 | << Instruction::getOpcodeName( |
| 2797 | RecurrenceDescriptor::getOpcode(getRecurrenceKind())) |
| 2798 | << " (" ; |
| 2799 | getVecOp()->printAsOperand(O, SlotTracker); |
| 2800 | O << ", " ; |
| 2801 | getEVL()->printAsOperand(O, SlotTracker); |
| 2802 | if (isConditional()) { |
| 2803 | O << ", " ; |
| 2804 | getCondOp()->printAsOperand(O, SlotTracker); |
| 2805 | } |
| 2806 | O << ")" ; |
| 2807 | } |
| 2808 | |
| 2809 | #endif |
| 2810 | |
| 2811 | /// A helper function to scalarize a single Instruction in the innermost loop. |
| 2812 | /// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue |
| 2813 | /// operands from \p RepRecipe instead of \p Instr's operands. |
| 2814 | static void scalarizeInstruction(const Instruction *Instr, |
| 2815 | VPReplicateRecipe *RepRecipe, |
| 2816 | const VPLane &Lane, VPTransformState &State) { |
| 2817 | assert((!Instr->getType()->isAggregateType() || |
| 2818 | canVectorizeTy(Instr->getType())) && |
| 2819 | "Expected vectorizable or non-aggregate type." ); |
| 2820 | |
  // Does this instruction return a value?
| 2822 | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
| 2823 | |
| 2824 | Instruction *Cloned = Instr->clone(); |
| 2825 | if (!IsVoidRetTy) { |
| 2826 | Cloned->setName(Instr->getName() + ".cloned" ); |
| 2827 | #if !defined(NDEBUG) |
| 2828 | // Verify that VPlan type inference results agree with the type of the |
| 2829 | // generated values. |
| 2830 | assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && |
| 2831 | "inferred type and type from generated instructions do not match" ); |
| 2832 | #endif |
| 2833 | } |
| 2834 | |
| 2835 | RepRecipe->applyFlags(I&: *Cloned); |
| 2836 | RepRecipe->applyMetadata(I&: *Cloned); |
| 2837 | |
| 2838 | if (auto DL = RepRecipe->getDebugLoc()) |
| 2839 | State.setDebugLocFrom(DL); |
| 2840 | |
| 2841 | // Replace the operands of the cloned instructions with their scalar |
| 2842 | // equivalents in the new loop. |
| 2843 | for (const auto &I : enumerate(First: RepRecipe->operands())) { |
| 2844 | auto InputLane = Lane; |
| 2845 | VPValue *Operand = I.value(); |
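    // Uniform (single-scalar) operands only have a lane-0 value, so always
    // read lane 0 for them, regardless of the lane being generated.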
| 2846 | if (vputils::isSingleScalar(VPV: Operand)) |
| 2847 | InputLane = VPLane::getFirstLane(); |
| 2848 | Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Lane: InputLane)); |
| 2849 | } |
| 2850 | |
| 2851 | // Place the cloned scalar in the new loop. |
| 2852 | State.Builder.Insert(I: Cloned); |
| 2853 | |
| 2854 | State.set(Def: RepRecipe, V: Cloned, Lane); |
| 2855 | |
  // If we just cloned a new assumption, add it to the assumption cache.
| 2857 | if (auto *II = dyn_cast<AssumeInst>(Val: Cloned)) |
| 2858 | State.AC->registerAssumption(CI: II); |
| 2859 | |
| 2860 | assert( |
| 2861 | (RepRecipe->getParent()->getParent() || |
| 2862 | !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || |
| 2863 | all_of(RepRecipe->operands(), |
| 2864 | [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && |
| 2865 | "Expected a recipe is either within a region or all of its operands " |
| 2866 | "are defined outside the vectorized region." ); |
| 2867 | } |
| 2868 | |
| 2869 | void VPReplicateRecipe::execute(VPTransformState &State) { |
| 2870 | Instruction *UI = getUnderlyingInstr(); |
| 2871 | |
| 2872 | if (!State.Lane) { |
| 2873 | assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions " |
| 2874 | "must have already been unrolled" ); |
| 2875 | scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: VPLane(0), State); |
| 2876 | return; |
| 2877 | } |
| 2878 | |
| 2879 | assert((State.VF.isScalar() || !isSingleScalar()) && |
| 2880 | "uniform recipe shouldn't be predicated" ); |
| 2881 | assert(!State.VF.isScalable() && "Can't scalarize a scalable vector" ); |
| 2882 | scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: *State.Lane, State); |
  // Insert the scalar instance, packing it into a vector if needed.
| 2884 | if (State.VF.isVector() && shouldPack()) { |
| 2885 | Value *WideValue = |
| 2886 | State.Lane->isFirstLane() |
| 2887 | ? PoisonValue::get(T: VectorType::get(ElementType: UI->getType(), EC: State.VF)) |
| 2888 | : State.get(Def: this); |
| 2889 | State.set(Def: this, V: State.packScalarIntoVectorizedValue(Def: this, WideValue, |
| 2890 | Lane: *State.Lane)); |
| 2891 | } |
| 2892 | } |
| 2893 | |
| 2894 | bool VPReplicateRecipe::shouldPack() const { |
| 2895 | // Find if the recipe is used by a widened recipe via an intervening |
| 2896 | // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. |
| 2897 | return any_of(Range: users(), P: [](const VPUser *U) { |
| 2898 | if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U)) |
| 2899 | return any_of(Range: PredR->users(), P: [PredR](const VPUser *U) { |
| 2900 | return !U->usesScalars(Op: PredR); |
| 2901 | }); |
| 2902 | return false; |
| 2903 | }); |
| 2904 | } |
| 2905 | |
| 2906 | InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, |
| 2907 | VPCostContext &Ctx) const { |
| 2908 | Instruction *UI = cast<Instruction>(Val: getUnderlyingValue()); |
  // A VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform; avoid computing its cost multiple times for now.
| 2911 | Ctx.SkipCostComputation.insert(Ptr: UI); |
| 2912 | |
| 2913 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 2914 | Type *ResultTy = Ctx.Types.inferScalarType(V: this); |
| 2915 | switch (UI->getOpcode()) { |
| 2916 | case Instruction::GetElementPtr: |
| 2917 | // We mark this instruction as zero-cost because the cost of GEPs in |
| 2918 | // vectorized code depends on whether the corresponding memory instruction |
| 2919 | // is scalarized or not. Therefore, we handle GEPs with the memory |
| 2920 | // instruction cost. |
| 2921 | return 0; |
| 2922 | case Instruction::Add: |
| 2923 | case Instruction::Sub: |
| 2924 | case Instruction::FAdd: |
| 2925 | case Instruction::FSub: |
| 2926 | case Instruction::Mul: |
| 2927 | case Instruction::FMul: |
| 2928 | case Instruction::FDiv: |
| 2929 | case Instruction::FRem: |
| 2930 | case Instruction::Shl: |
| 2931 | case Instruction::LShr: |
| 2932 | case Instruction::AShr: |
| 2933 | case Instruction::And: |
| 2934 | case Instruction::Or: |
| 2935 | case Instruction::Xor: { |
| 2936 | auto Op2Info = Ctx.getOperandInfo(V: getOperand(N: 1)); |
| 2937 | SmallVector<const Value *, 4> Operands(UI->operand_values()); |
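    // Scalarized arithmetic is costed as one scalar instruction per generated
    // lane (VF instances), or a single instance for single-scalar recipes.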
| 2938 | return Ctx.TTI.getArithmeticInstrCost( |
| 2939 | Opcode: UI->getOpcode(), Ty: ResultTy, CostKind, |
| 2940 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 2941 | Opd2Info: Op2Info, Args: Operands, CxtI: UI, TLibInfo: &Ctx.TLI) * |
| 2942 | (isSingleScalar() ? 1 : VF.getFixedValue()); |
| 2943 | } |
| 2944 | } |
| 2945 | |
| 2946 | return Ctx.getLegacyCost(UI, VF); |
| 2947 | } |
| 2948 | |
| 2949 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2950 | void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2951 | VPSlotTracker &SlotTracker) const { |
| 2952 | O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE " ); |
| 2953 | |
| 2954 | if (!getUnderlyingInstr()->getType()->isVoidTy()) { |
| 2955 | printAsOperand(O, SlotTracker); |
| 2956 | O << " = " ; |
| 2957 | } |
| 2958 | if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { |
| 2959 | O << "call" ; |
| 2960 | printFlags(O); |
| 2961 | O << "@" << CB->getCalledFunction()->getName() << "(" ; |
| 2962 | interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), |
| 2963 | O, [&O, &SlotTracker](VPValue *Op) { |
| 2964 | Op->printAsOperand(O, SlotTracker); |
| 2965 | }); |
| 2966 | O << ")" ; |
| 2967 | } else { |
| 2968 | O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); |
| 2969 | printFlags(O); |
| 2970 | printOperands(O, SlotTracker); |
| 2971 | } |
| 2972 | |
| 2973 | if (shouldPack()) |
| 2974 | O << " (S->V)" ; |
| 2975 | } |
| 2976 | #endif |
| 2977 | |
| 2978 | void VPBranchOnMaskRecipe::execute(VPTransformState &State) { |
| 2979 | assert(State.Lane && "Branch on Mask works only on single instance." ); |
| 2980 | |
| 2981 | VPValue *BlockInMask = getOperand(N: 0); |
| 2982 | Value *ConditionBit = State.get(Def: BlockInMask, Lane: *State.Lane); |
| 2983 | |
| 2984 | // Replace the temporary unreachable terminator with a new conditional branch, |
| 2985 | // whose two destinations will be set later when they are created. |
| 2986 | auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); |
| 2987 | assert(isa<UnreachableInst>(CurrentTerminator) && |
| 2988 | "Expected to replace unreachable terminator with conditional branch." ); |
| 2989 | auto CondBr = |
| 2990 | State.Builder.CreateCondBr(Cond: ConditionBit, True: State.CFG.PrevBB, False: nullptr); |
| 2991 | CondBr->setSuccessor(idx: 0, NewSucc: nullptr); |
| 2992 | CurrentTerminator->eraseFromParent(); |
| 2993 | } |
| 2994 | |
| 2995 | InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, |
| 2996 | VPCostContext &Ctx) const { |
| 2997 | // The legacy cost model doesn't assign costs to branches for individual |
| 2998 | // replicate regions. Match the current behavior in the VPlan cost model for |
| 2999 | // now. |
| 3000 | return 0; |
| 3001 | } |
| 3002 | |
| 3003 | void VPPredInstPHIRecipe::execute(VPTransformState &State) { |
| 3004 | assert(State.Lane && "Predicated instruction PHI works per instance." ); |
| 3005 | Instruction *ScalarPredInst = |
| 3006 | cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Lane: *State.Lane)); |
| 3007 | BasicBlock *PredicatedBB = ScalarPredInst->getParent(); |
| 3008 | BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); |
| 3009 | assert(PredicatingBB && "Predicated block has no single predecessor." ); |
| 3010 | assert(isa<VPReplicateRecipe>(getOperand(0)) && |
| 3011 | "operand must be VPReplicateRecipe" ); |
| 3012 | |
| 3013 | // By current pack/unpack logic we need to generate only a single phi node: if |
| 3014 | // a vector value for the predicated instruction exists at this point it means |
| 3015 | // the instruction has vector users only, and a phi for the vector value is |
| 3016 | // needed. In this case the recipe of the predicated instruction is marked to |
| 3017 | // also do that packing, thereby "hoisting" the insert-element sequence. |
| 3018 | // Otherwise, a phi node for the scalar value is needed. |
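  // E.g. the vector case produces a phi roughly of the form (operand names are
  // illustrative):
  //   %vphi = phi <4 x i32> [ %unmodified.vec, %predicating.bb ],
  //                         [ %vec.with.inserted.elt, %predicated.bb ]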
| 3019 | if (State.hasVectorValue(Def: getOperand(N: 0))) { |
| 3020 | Value *VectorValue = State.get(Def: getOperand(N: 0)); |
| 3021 | InsertElementInst *IEI = cast<InsertElementInst>(Val: VectorValue); |
| 3022 | PHINode *VPhi = State.Builder.CreatePHI(Ty: IEI->getType(), NumReservedValues: 2); |
| 3023 | VPhi->addIncoming(V: IEI->getOperand(i_nocapture: 0), BB: PredicatingBB); // Unmodified vector. |
| 3024 | VPhi->addIncoming(V: IEI, BB: PredicatedBB); // New vector with inserted element. |
| 3025 | if (State.hasVectorValue(Def: this)) |
| 3026 | State.reset(Def: this, V: VPhi); |
| 3027 | else |
| 3028 | State.set(Def: this, V: VPhi); |
| 3029 | // NOTE: Currently we need to update the value of the operand, so the next |
| 3030 | // predicated iteration inserts its generated value in the correct vector. |
| 3031 | State.reset(Def: getOperand(N: 0), V: VPhi); |
| 3032 | } else { |
| 3033 | if (vputils::onlyFirstLaneUsed(Def: this) && !State.Lane->isFirstLane()) |
| 3034 | return; |
| 3035 | |
| 3036 | Type *PredInstType = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)); |
| 3037 | PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2); |
| 3038 | Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()), |
| 3039 | BB: PredicatingBB); |
| 3040 | Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB); |
| 3041 | if (State.hasScalarValue(Def: this, Lane: *State.Lane)) |
| 3042 | State.reset(Def: this, V: Phi, Lane: *State.Lane); |
| 3043 | else |
| 3044 | State.set(Def: this, V: Phi, Lane: *State.Lane); |
| 3045 | // NOTE: Currently we need to update the value of the operand, so the next |
| 3046 | // predicated iteration inserts its generated value in the correct vector. |
| 3047 | State.reset(Def: getOperand(N: 0), V: Phi, Lane: *State.Lane); |
| 3048 | } |
| 3049 | } |
| 3050 | |
| 3051 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3052 | void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3053 | VPSlotTracker &SlotTracker) const { |
| 3054 | O << Indent << "PHI-PREDICATED-INSTRUCTION " ; |
| 3055 | printAsOperand(O, SlotTracker); |
| 3056 | O << " = " ; |
| 3057 | printOperands(O, SlotTracker); |
| 3058 | } |
| 3059 | #endif |
| 3060 | |
| 3061 | InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, |
| 3062 | VPCostContext &Ctx) const { |
| 3063 | Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF); |
| 3064 | const Align Alignment = |
| 3065 | getLoadStoreAlignment(I: const_cast<Instruction *>(&Ingredient)); |
| 3066 | unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr())) |
| 3067 | ->getAddressSpace(); |
| 3068 | unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) |
| 3069 | ? Instruction::Load |
| 3070 | : Instruction::Store; |
| 3071 | |
| 3072 | if (!Consecutive) { |
    // TODO: Using the original IR may not be accurate.
    // Currently, the ARM backend uses the underlying IR to calculate the
    // gather/scatter instruction cost.
| 3076 | const Value *Ptr = getLoadStorePointerOperand(V: &Ingredient); |
| 3077 | assert(!Reverse && |
| 3078 | "Inconsecutive memory access should not have the order." ); |
| 3079 | return Ctx.TTI.getAddressComputationCost(Ty) + |
| 3080 | Ctx.TTI.getGatherScatterOpCost(Opcode, DataTy: Ty, Ptr, VariableMask: IsMasked, Alignment, |
| 3081 | CostKind: Ctx.CostKind, I: &Ingredient); |
| 3082 | } |
| 3083 | |
| 3084 | InstructionCost Cost = 0; |
| 3085 | if (IsMasked) { |
| 3086 | Cost += |
| 3087 | Ctx.TTI.getMaskedMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind); |
| 3088 | } else { |
| 3089 | TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( |
| 3090 | V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) ? getOperand(N: 0) |
| 3091 | : getOperand(N: 1)); |
| 3092 | Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, |
| 3093 | OpdInfo: OpInfo, I: &Ingredient); |
| 3094 | } |
| 3095 | if (!Reverse) |
| 3096 | return Cost; |
| 3097 | |
| 3098 | return Cost += Ctx.TTI.getShuffleCost( |
| 3099 | Kind: TargetTransformInfo::SK_Reverse, DstTy: cast<VectorType>(Val: Ty), |
| 3100 | SrcTy: cast<VectorType>(Val: Ty), Mask: {}, CostKind: Ctx.CostKind, Index: 0); |
| 3101 | } |
| 3102 | |
| 3103 | void VPWidenLoadRecipe::execute(VPTransformState &State) { |
| 3104 | Type *ScalarDataTy = getLoadStoreType(I: &Ingredient); |
| 3105 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
| 3106 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
| 3107 | bool CreateGather = !isConsecutive(); |
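  // Non-consecutive accesses are widened to a gather; consecutive accesses use
  // a wide (possibly masked) load, reversed afterwards if the access is
  // reverse-consecutive.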
| 3108 | |
| 3109 | auto &Builder = State.Builder; |
| 3110 | Value *Mask = nullptr; |
| 3111 | if (auto *VPMask = getMask()) { |
    // Mask reversal is only needed for non-all-one masks; an all-one mask is
    // represented by a null VPValue, and reversing it would yield the same
    // all-one (null) mask.
| 3114 | Mask = State.get(Def: VPMask); |
| 3115 | if (isReverse()) |
| 3116 | Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" ); |
| 3117 | } |
| 3118 | |
| 3119 | Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather); |
| 3120 | Value *NewLI; |
| 3121 | if (CreateGather) { |
| 3122 | NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr, |
| 3123 | Name: "wide.masked.gather" ); |
| 3124 | } else if (Mask) { |
| 3125 | NewLI = |
| 3126 | Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask, |
| 3127 | PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load" ); |
| 3128 | } else { |
| 3129 | NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load" ); |
| 3130 | } |
| 3131 | applyMetadata(I&: *cast<Instruction>(Val: NewLI)); |
| 3132 | if (Reverse) |
| 3133 | NewLI = Builder.CreateVectorReverse(V: NewLI, Name: "reverse" ); |
| 3134 | State.set(Def: this, V: NewLI); |
| 3135 | } |
| 3136 | |
| 3137 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3138 | void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3139 | VPSlotTracker &SlotTracker) const { |
| 3140 | O << Indent << "WIDEN " ; |
| 3141 | printAsOperand(O, SlotTracker); |
| 3142 | O << " = load " ; |
| 3143 | printOperands(O, SlotTracker); |
| 3144 | } |
| 3145 | #endif |
| 3146 | |
/// Use an all-true mask for the reverse rather than the actual mask, as it
/// avoids a dependence without affecting the result.
| 3149 | static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, |
| 3150 | Value *EVL, const Twine &Name) { |
| 3151 | VectorType *ValTy = cast<VectorType>(Val: Operand->getType()); |
| 3152 | Value *AllTrueMask = |
| 3153 | Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue()); |
| 3154 | return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse, |
| 3155 | Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name); |
| 3156 | } |
| 3157 | |
| 3158 | void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { |
| 3159 | Type *ScalarDataTy = getLoadStoreType(I: &Ingredient); |
| 3160 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
| 3161 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
| 3162 | bool CreateGather = !isConsecutive(); |
| 3163 | |
| 3164 | auto &Builder = State.Builder; |
| 3165 | CallInst *NewLI; |
| 3166 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 3167 | Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather); |
| 3168 | Value *Mask = nullptr; |
| 3169 | if (VPValue *VPMask = getMask()) { |
| 3170 | Mask = State.get(Def: VPMask); |
| 3171 | if (isReverse()) |
| 3172 | Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask" ); |
| 3173 | } else { |
| 3174 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
| 3175 | } |
| 3176 | |
| 3177 | if (CreateGather) { |
| 3178 | NewLI = |
| 3179 | Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL}, |
| 3180 | FMFSource: nullptr, Name: "wide.masked.gather" ); |
| 3181 | } else { |
| 3182 | NewLI = Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_load, |
| 3183 | Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load" ); |
| 3184 | } |
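  // At this point NewLI is a vp.load or vp.gather call, e.g. (assuming i32
  // elements and a scalable VF of 4):
  //   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
  //       ptr %addr, <vscale x 4 x i1> %mask, i32 %evl)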
| 3185 | NewLI->addParamAttr( |
| 3186 | ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment)); |
| 3187 | applyMetadata(I&: *NewLI); |
| 3188 | Instruction *Res = NewLI; |
| 3189 | if (isReverse()) |
| 3190 | Res = createReverseEVL(Builder, Operand: Res, EVL, Name: "vp.reverse" ); |
| 3191 | State.set(Def: this, V: Res); |
| 3192 | } |
| 3193 | |
| 3194 | InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, |
| 3195 | VPCostContext &Ctx) const { |
| 3196 | if (!Consecutive || IsMasked) |
| 3197 | return VPWidenMemoryRecipe::computeCost(VF, Ctx); |
| 3198 | |
  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, while the legacy
  // model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare against the legacy cost model.
| 3204 | Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF); |
| 3205 | const Align Alignment = |
| 3206 | getLoadStoreAlignment(I: const_cast<Instruction *>(&Ingredient)); |
| 3207 | unsigned AS = |
| 3208 | getLoadStoreAddressSpace(I: const_cast<Instruction *>(&Ingredient)); |
| 3209 | InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( |
| 3210 | Opcode: Instruction::Load, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind); |
| 3211 | if (!Reverse) |
| 3212 | return Cost; |
| 3213 | |
| 3214 | return Cost + Ctx.TTI.getShuffleCost( |
| 3215 | Kind: TargetTransformInfo::SK_Reverse, DstTy: cast<VectorType>(Val: Ty), |
| 3216 | SrcTy: cast<VectorType>(Val: Ty), Mask: {}, CostKind: Ctx.CostKind, Index: 0); |
| 3217 | } |
| 3218 | |
| 3219 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3220 | void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3221 | VPSlotTracker &SlotTracker) const { |
| 3222 | O << Indent << "WIDEN " ; |
| 3223 | printAsOperand(O, SlotTracker); |
| 3224 | O << " = vp.load " ; |
| 3225 | printOperands(O, SlotTracker); |
| 3226 | } |
| 3227 | #endif |
| 3228 | |
| 3229 | void VPWidenStoreRecipe::execute(VPTransformState &State) { |
| 3230 | VPValue *StoredVPValue = getStoredValue(); |
| 3231 | bool CreateScatter = !isConsecutive(); |
| 3232 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
| 3233 | |
| 3234 | auto &Builder = State.Builder; |
| 3235 | |
| 3236 | Value *Mask = nullptr; |
| 3237 | if (auto *VPMask = getMask()) { |
    // Mask reversal is only needed for non-all-one masks; an all-one mask is
    // represented by a null VPValue, and reversing it would yield the same
    // all-one (null) mask.
| 3240 | Mask = State.get(Def: VPMask); |
| 3241 | if (isReverse()) |
| 3242 | Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" ); |
| 3243 | } |
| 3244 | |
| 3245 | Value *StoredVal = State.get(Def: StoredVPValue); |
| 3246 | if (isReverse()) { |
| 3247 | // If we store to reverse consecutive memory locations, then we need |
| 3248 | // to reverse the order of elements in the stored value. |
| 3249 | StoredVal = Builder.CreateVectorReverse(V: StoredVal, Name: "reverse" ); |
| 3250 | // We don't want to update the value in the map as it might be used in |
| 3251 | // another expression. So don't call resetVectorValue(StoredVal). |
| 3252 | } |
| 3253 | Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter); |
| 3254 | Instruction *NewSI = nullptr; |
| 3255 | if (CreateScatter) |
| 3256 | NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask); |
| 3257 | else if (Mask) |
| 3258 | NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask); |
| 3259 | else |
| 3260 | NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment); |
| 3261 | applyMetadata(I&: *NewSI); |
| 3262 | } |
| 3263 | |
| 3264 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3265 | void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3266 | VPSlotTracker &SlotTracker) const { |
| 3267 | O << Indent << "WIDEN store " ; |
| 3268 | printOperands(O, SlotTracker); |
| 3269 | } |
| 3270 | #endif |
| 3271 | |
| 3272 | void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { |
| 3273 | VPValue *StoredValue = getStoredValue(); |
| 3274 | bool CreateScatter = !isConsecutive(); |
| 3275 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
| 3276 | |
| 3277 | auto &Builder = State.Builder; |
| 3278 | |
| 3279 | CallInst *NewSI = nullptr; |
| 3280 | Value *StoredVal = State.get(Def: StoredValue); |
| 3281 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 3282 | if (isReverse()) |
| 3283 | StoredVal = createReverseEVL(Builder, Operand: StoredVal, EVL, Name: "vp.reverse" ); |
| 3284 | Value *Mask = nullptr; |
| 3285 | if (VPValue *VPMask = getMask()) { |
| 3286 | Mask = State.get(Def: VPMask); |
| 3287 | if (isReverse()) |
| 3288 | Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask" ); |
| 3289 | } else { |
| 3290 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
| 3291 | } |
| 3292 | Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter); |
| 3293 | if (CreateScatter) { |
| 3294 | NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()), |
| 3295 | ID: Intrinsic::vp_scatter, |
| 3296 | Args: {StoredVal, Addr, Mask, EVL}); |
| 3297 | } else { |
| 3298 | NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()), |
| 3299 | ID: Intrinsic::vp_store, |
| 3300 | Args: {StoredVal, Addr, Mask, EVL}); |
| 3301 | } |
| 3302 | NewSI->addParamAttr( |
| 3303 | ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment)); |
| 3304 | applyMetadata(I&: *NewSI); |
| 3305 | } |
| 3306 | |
| 3307 | InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, |
| 3308 | VPCostContext &Ctx) const { |
| 3309 | if (!Consecutive || IsMasked) |
| 3310 | return VPWidenMemoryRecipe::computeCost(VF, Ctx); |
| 3311 | |
  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, while the legacy
  // model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare against the legacy cost model.
| 3317 | Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF); |
| 3318 | const Align Alignment = |
| 3319 | getLoadStoreAlignment(I: const_cast<Instruction *>(&Ingredient)); |
| 3320 | unsigned AS = |
| 3321 | getLoadStoreAddressSpace(I: const_cast<Instruction *>(&Ingredient)); |
| 3322 | InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( |
| 3323 | Opcode: Instruction::Store, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind); |
| 3324 | if (!Reverse) |
| 3325 | return Cost; |
| 3326 | |
| 3327 | return Cost + Ctx.TTI.getShuffleCost( |
| 3328 | Kind: TargetTransformInfo::SK_Reverse, DstTy: cast<VectorType>(Val: Ty), |
| 3329 | SrcTy: cast<VectorType>(Val: Ty), Mask: {}, CostKind: Ctx.CostKind, Index: 0); |
| 3330 | } |
| 3331 | |
| 3332 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3333 | void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3334 | VPSlotTracker &SlotTracker) const { |
| 3335 | O << Indent << "WIDEN vp.store " ; |
| 3336 | printOperands(O, SlotTracker); |
| 3337 | } |
| 3338 | #endif |
| 3339 | |
| 3340 | static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, |
| 3341 | VectorType *DstVTy, const DataLayout &DL) { |
  // Verify that V is a vector type with the same number of elements as DstVTy.
| 3343 | auto VF = DstVTy->getElementCount(); |
| 3344 | auto *SrcVecTy = cast<VectorType>(Val: V->getType()); |
| 3345 | assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match" ); |
| 3346 | Type *SrcElemTy = SrcVecTy->getElementType(); |
| 3347 | Type *DstElemTy = DstVTy->getElementType(); |
| 3348 | assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && |
| 3349 | "Vector elements must have same size" ); |
| 3350 | |
| 3351 | // Do a direct cast if element types are castable. |
| 3352 | if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) { |
| 3353 | return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy); |
| 3354 | } |
  // V cannot be directly cast to the desired vector type. This may happen when
  // V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
| 3359 | assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && |
| 3360 | "Only one type should be a pointer type" ); |
| 3361 | assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && |
| 3362 | "Only one type should be a floating point type" ); |
| 3363 | Type *IntTy = |
| 3364 | IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy)); |
| 3365 | auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF); |
| 3366 | Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy); |
| 3367 | return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy); |
| 3368 | } |
| 3369 | |
| 3370 | /// Return a vector containing interleaved elements from multiple |
| 3371 | /// smaller input vectors. |
| 3372 | static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, |
| 3373 | const Twine &Name) { |
| 3374 | unsigned Factor = Vals.size(); |
| 3375 | assert(Factor > 1 && "Tried to interleave invalid number of vectors" ); |
| 3376 | |
| 3377 | VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType()); |
| 3378 | #ifndef NDEBUG |
| 3379 | for (Value *Val : Vals) |
| 3380 | assert(Val->getType() == VecTy && "Tried to interleave mismatched types" ); |
| 3381 | #endif |
| 3382 | |
| 3383 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), so |
| 3384 | // must use intrinsics to interleave. |
| 3385 | if (VecTy->isScalableTy()) { |
| 3386 | assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors" ); |
| 3387 | VectorType *InterleaveTy = |
| 3388 | VectorType::get(ElementType: VecTy->getElementType(), |
| 3389 | EC: VecTy->getElementCount().multiplyCoefficientBy(RHS: Factor)); |
| 3390 | return Builder.CreateIntrinsic(RetTy: InterleaveTy, |
| 3391 | ID: getInterleaveIntrinsicID(Factor), Args: Vals, |
| 3392 | /*FMFSource=*/nullptr, Name); |
| 3393 | } |
| 3394 | |
| 3395 | // Fixed length. Start by concatenating all vectors into a wide vector. |
| 3396 | Value *WideVec = concatenateVectors(Builder, Vecs: Vals); |
| 3397 | |
| 3398 | // Interleave the elements into the wide vector. |
| 3399 | const unsigned NumElts = VecTy->getElementCount().getFixedValue(); |
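  // E.g. for Factor = 2 and NumElts = 4 the interleave mask is
  // <0, 4, 1, 5, 2, 6, 3, 7>, i.e. (A0, B0, A1, B1, A2, B2, A3, B3).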
| 3400 | return Builder.CreateShuffleVector( |
| 3401 | V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name); |
| 3402 | } |
| 3403 | |
| 3404 | // Try to vectorize the interleave group that \p Instr belongs to. |
| 3405 | // |
| 3406 | // E.g. Translate following interleaved load group (factor = 3): |
| 3407 | // for (i = 0; i < N; i+=3) { |
| 3408 | // R = Pic[i]; // Member of index 0 |
| 3409 | // G = Pic[i+1]; // Member of index 1 |
| 3410 | // B = Pic[i+2]; // Member of index 2 |
| 3411 | // ... // do something to R, G, B |
| 3412 | // } |
| 3413 | // To: |
| 3414 | // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
| 3415 | // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
| 3416 | // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
| 3417 | // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
| 3418 | // |
| 3419 | // Or translate following interleaved store group (factor = 3): |
| 3420 | // for (i = 0; i < N; i+=3) { |
| 3421 | // ... do something to R, G, B |
| 3422 | // Pic[i] = R; // Member of index 0 |
| 3423 | // Pic[i+1] = G; // Member of index 1 |
| 3424 | // Pic[i+2] = B; // Member of index 2 |
| 3425 | // } |
| 3426 | // To: |
| 3427 | // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
| 3428 | // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
| 3429 | // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
| 3430 | // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
| 3431 | // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
| 3432 | void VPInterleaveRecipe::execute(VPTransformState &State) { |
| 3433 | assert(!State.Lane && "Interleave group being replicated." ); |
| 3434 | const InterleaveGroup<Instruction> *Group = IG; |
| 3435 | Instruction *Instr = Group->getInsertPos(); |
| 3436 | |
| 3437 | // Prepare for the vector type of the interleaved load/store. |
| 3438 | Type *ScalarTy = getLoadStoreType(I: Instr); |
| 3439 | unsigned InterleaveFactor = Group->getFactor(); |
| 3440 | auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor); |
| 3441 | |
| 3442 | VPValue *BlockInMask = getMask(); |
| 3443 | VPValue *Addr = getAddr(); |
| 3444 | Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0)); |
| 3445 | Value *PoisonVec = PoisonValue::get(T: VecTy); |
| 3446 | |
| 3447 | auto CreateGroupMask = [&BlockInMask, &State, |
| 3448 | &InterleaveFactor](Value *MaskForGaps) -> Value * { |
| 3449 | if (State.VF.isScalable()) { |
| 3450 | assert(!MaskForGaps && "Interleaved groups with gaps are not supported." ); |
| 3451 | assert(InterleaveFactor <= 8 && |
| 3452 | "Unsupported deinterleave factor for scalable vectors" ); |
| 3453 | auto *ResBlockInMask = State.get(Def: BlockInMask); |
| 3454 | SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask); |
| 3455 | return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask" ); |
| 3456 | } |
| 3457 | |
| 3458 | if (!BlockInMask) |
| 3459 | return MaskForGaps; |
| 3460 | |
| 3461 | Value *ResBlockInMask = State.get(Def: BlockInMask); |
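    // E.g. for InterleaveFactor = 3 and VF = 4 the replicated mask is
    // <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>, so each block-mask lane guards all
    // members of its group.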
| 3462 | Value *ShuffledMask = State.Builder.CreateShuffleVector( |
| 3463 | V: ResBlockInMask, |
| 3464 | Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()), |
| 3465 | Name: "interleaved.mask" ); |
| 3466 | return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And, |
| 3467 | LHS: ShuffledMask, RHS: MaskForGaps) |
| 3468 | : ShuffledMask; |
| 3469 | }; |
| 3470 | |
| 3471 | const DataLayout &DL = Instr->getDataLayout(); |
| 3472 | // Vectorize the interleaved load group. |
| 3473 | if (isa<LoadInst>(Val: Instr)) { |
| 3474 | Value *MaskForGaps = nullptr; |
| 3475 | if (NeedsMaskForGaps) { |
| 3476 | MaskForGaps = |
| 3477 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group); |
| 3478 | assert(MaskForGaps && "Mask for Gaps is required but it is null" ); |
| 3479 | } |
| 3480 | |
| 3481 | Instruction *NewLoad; |
| 3482 | if (BlockInMask || MaskForGaps) { |
| 3483 | Value *GroupMask = CreateGroupMask(MaskForGaps); |
| 3484 | NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr, |
| 3485 | Alignment: Group->getAlign(), Mask: GroupMask, |
| 3486 | PassThru: PoisonVec, Name: "wide.masked.vec" ); |
| 3487 | } else |
| 3488 | NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr, |
| 3489 | Align: Group->getAlign(), Name: "wide.vec" ); |
| 3490 | Group->addMetadata(NewInst: NewLoad); |
| 3491 | |
| 3492 | ArrayRef<VPValue *> VPDefs = definedValues(); |
| 3493 | const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); |
| 3494 | if (VecTy->isScalableTy()) { |
| 3495 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), |
| 3496 | // so must use intrinsics to deinterleave. |
| 3497 | assert(InterleaveFactor <= 8 && |
| 3498 | "Unsupported deinterleave factor for scalable vectors" ); |
| 3499 | Value *Deinterleave = State.Builder.CreateIntrinsic( |
| 3500 | ID: getDeinterleaveIntrinsicID(Factor: InterleaveFactor), Types: NewLoad->getType(), |
| 3501 | Args: NewLoad, |
| 3502 | /*FMFSource=*/nullptr, Name: "strided.vec" ); |
| 3503 | |
| 3504 | for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { |
| 3505 | Instruction *Member = Group->getMember(Index: I); |
| 3506 | Value *StridedVec = State.Builder.CreateExtractValue(Agg: Deinterleave, Idxs: I); |
| 3507 | if (!Member) { |
| 3508 | // This value is not needed as it's not used |
| 3509 | cast<Instruction>(Val: StridedVec)->eraseFromParent(); |
| 3510 | continue; |
| 3511 | } |
        // If this member has a different type, cast the extracted value to
        // that type.
| 3513 | if (Member->getType() != ScalarTy) { |
| 3514 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
| 3515 | StridedVec = |
| 3516 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
| 3517 | } |
| 3518 | |
| 3519 | if (Group->isReverse()) |
| 3520 | StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" ); |
| 3521 | |
| 3522 | State.set(Def: VPDefs[J], V: StridedVec); |
| 3523 | ++J; |
| 3524 | } |
| 3525 | |
| 3526 | return; |
| 3527 | } |
| 3528 | assert(!State.VF.isScalable() && "VF is assumed to be non scalable." ); |
| 3529 | |
| 3530 | // For each member in the group, shuffle out the appropriate data from the |
| 3531 | // wide loads. |
| 3532 | unsigned J = 0; |
| 3533 | for (unsigned I = 0; I < InterleaveFactor; ++I) { |
| 3534 | Instruction *Member = Group->getMember(Index: I); |
| 3535 | |
| 3536 | // Skip the gaps in the group. |
| 3537 | if (!Member) |
| 3538 | continue; |
| 3539 | |
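      // E.g. for I = 1, InterleaveFactor = 3 and VF = 4 the stride mask is
      // <1, 4, 7, 10>, extracting this member's elements from the wide load.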
| 3540 | auto StrideMask = |
| 3541 | createStrideMask(Start: I, Stride: InterleaveFactor, VF: State.VF.getFixedValue()); |
| 3542 | Value *StridedVec = |
| 3543 | State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask, Name: "strided.vec" ); |
| 3544 | |
      // If this member has a different type, cast the extracted value to that
      // type.
| 3546 | if (Member->getType() != ScalarTy) { |
| 3547 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
| 3548 | StridedVec = |
| 3549 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
| 3550 | } |
| 3551 | |
| 3552 | if (Group->isReverse()) |
| 3553 | StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" ); |
| 3554 | |
| 3555 | State.set(Def: VPDefs[J], V: StridedVec); |
| 3556 | ++J; |
| 3557 | } |
| 3558 | return; |
| 3559 | } |
| 3560 | |
  // The sub-vector type for the current instruction.
| 3562 | auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF); |
| 3563 | |
| 3564 | // Vectorize the interleaved store group. |
| 3565 | Value *MaskForGaps = |
| 3566 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group); |
| 3567 | assert((!MaskForGaps || !State.VF.isScalable()) && |
| 3568 | "masking gaps for scalable vectors is not yet supported." ); |
| 3569 | ArrayRef<VPValue *> StoredValues = getStoredValues(); |
| 3570 | // Collect the stored vector from each member. |
| 3571 | SmallVector<Value *, 4> StoredVecs; |
| 3572 | unsigned StoredIdx = 0; |
| 3573 | for (unsigned i = 0; i < InterleaveFactor; i++) { |
| 3574 | assert((Group->getMember(i) || MaskForGaps) && |
| 3575 | "Fail to get a member from an interleaved store group" ); |
| 3576 | Instruction *Member = Group->getMember(Index: i); |
| 3577 | |
| 3578 | // Skip the gaps in the group. |
| 3579 | if (!Member) { |
| 3580 | Value *Undef = PoisonValue::get(T: SubVT); |
| 3581 | StoredVecs.push_back(Elt: Undef); |
| 3582 | continue; |
| 3583 | } |
| 3584 | |
| 3585 | Value *StoredVec = State.get(Def: StoredValues[StoredIdx]); |
| 3586 | ++StoredIdx; |
| 3587 | |
| 3588 | if (Group->isReverse()) |
| 3589 | StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse" ); |
| 3590 | |
    // If this member has a different type, cast it to the unified sub-vector
    // type.
| 3592 | |
| 3593 | if (StoredVec->getType() != SubVT) |
| 3594 | StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL); |
| 3595 | |
| 3596 | StoredVecs.push_back(Elt: StoredVec); |
| 3597 | } |
| 3598 | |
| 3599 | // Interleave all the smaller vectors into one wider vector. |
| 3600 | Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec" ); |
| 3601 | Instruction *NewStoreInstr; |
| 3602 | if (BlockInMask || MaskForGaps) { |
| 3603 | Value *GroupMask = CreateGroupMask(MaskForGaps); |
| 3604 | NewStoreInstr = State.Builder.CreateMaskedStore( |
| 3605 | Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask); |
| 3606 | } else |
| 3607 | NewStoreInstr = |
| 3608 | State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign()); |
| 3609 | |
| 3610 | Group->addMetadata(NewInst: NewStoreInstr); |
| 3611 | } |
| 3612 | |
| 3613 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3614 | void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3615 | VPSlotTracker &SlotTracker) const { |
| 3616 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at " ; |
| 3617 | IG->getInsertPos()->printAsOperand(O, false); |
| 3618 | O << ", " ; |
| 3619 | getAddr()->printAsOperand(O, SlotTracker); |
| 3620 | VPValue *Mask = getMask(); |
| 3621 | if (Mask) { |
| 3622 | O << ", " ; |
| 3623 | Mask->printAsOperand(O, SlotTracker); |
| 3624 | } |
| 3625 | |
| 3626 | unsigned OpIdx = 0; |
| 3627 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
| 3628 | if (!IG->getMember(i)) |
| 3629 | continue; |
| 3630 | if (getNumStoreOperands() > 0) { |
| 3631 | O << "\n" << Indent << " store " ; |
| 3632 | getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); |
| 3633 | O << " to index " << i; |
| 3634 | } else { |
| 3635 | O << "\n" << Indent << " " ; |
| 3636 | getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
| 3637 | O << " = load from index " << i; |
| 3638 | } |
| 3639 | ++OpIdx; |
| 3640 | } |
| 3641 | } |
| 3642 | #endif |
| 3643 | |
| 3644 | InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, |
| 3645 | VPCostContext &Ctx) const { |
| 3646 | Instruction *InsertPos = getInsertPos(); |
| 3647 | // Find the VPValue index of the interleave group. We need to skip gaps. |
| 3648 | unsigned InsertPosIdx = 0; |
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
| 3650 | if (auto *Member = IG->getMember(Index: Idx)) { |
| 3651 | if (Member == InsertPos) |
| 3652 | break; |
| 3653 | InsertPosIdx++; |
| 3654 | } |
| 3655 | Type *ValTy = Ctx.Types.inferScalarType( |
| 3656 | V: getNumDefinedValues() > 0 ? getVPValue(I: InsertPosIdx) |
| 3657 | : getStoredValues()[InsertPosIdx]); |
| 3658 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF)); |
| 3659 | unsigned AS = getLoadStoreAddressSpace(I: InsertPos); |
| 3660 | |
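|      | // The memory cost is computed for a single wide access covering the whole
|      | // group, i.e. a vector with VF * InterleaveFactor elements.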
| 3661 | unsigned InterleaveFactor = IG->getFactor(); |
| 3662 | auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor); |
| 3663 | |
| 3664 | // Holds the indices of existing members in the interleaved group. |
| 3665 | SmallVector<unsigned, 4> Indices; |
| 3666 | for (unsigned IF = 0; IF < InterleaveFactor; IF++) |
| 3667 | if (IG->getMember(Index: IF)) |
| 3668 | Indices.push_back(Elt: IF); |
| 3669 | |
| 3670 | // Calculate the cost of the whole interleaved group. |
| 3671 | InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( |
| 3672 | Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: IG->getFactor(), Indices, |
| 3673 | Alignment: IG->getAlign(), AddressSpace: AS, CostKind: Ctx.CostKind, UseMaskForCond: getMask(), UseMaskForGaps: NeedsMaskForGaps); |
| 3674 | |
| 3675 | if (!IG->isReverse()) |
| 3676 | return Cost; |
| 3677 | |
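|      | // Reversed groups additionally pay for one reverse shuffle per member to
|      | // restore the original lane order.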
| 3678 | return Cost + IG->getNumMembers() * |
| 3679 | Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, |
| 3680 | DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind, |
| 3681 | Index: 0); |
| 3682 | } |
| 3683 | |
| 3684 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3685 | void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3686 | VPSlotTracker &SlotTracker) const { |
| 3687 | O << Indent << "EMIT " ; |
| 3688 | printAsOperand(O, SlotTracker); |
| 3689 | O << " = CANONICAL-INDUCTION " ; |
| 3690 | printOperands(O, SlotTracker); |
| 3691 | } |
| 3692 | #endif |
| 3693 | |
| 3694 | bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { |
| 3695 | return IsScalarAfterVectorization && |
| 3696 | (!IsScalable || vputils::onlyFirstLaneUsed(Def: this)); |
| 3697 | } |
| 3698 | |
| 3699 | void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { |
| 3700 | assert(getInductionDescriptor().getKind() == |
| 3701 | InductionDescriptor::IK_PtrInduction && |
| 3702 | "Not a pointer induction according to InductionDescriptor!" ); |
| 3703 | assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() && |
| 3704 | "Unexpected type." ); |
| 3705 | assert(!onlyScalarsGenerated(State.VF.isScalable()) && |
| 3706 | "Recipe should have been replaced" ); |
| 3707 | |
| 3708 | unsigned CurrentPart = getUnrollPart(U&: *this); |
| 3709 | |
| 3710 | // Build a pointer phi |
| 3711 | Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); |
| 3712 | Type *ScStValueType = ScalarStartValue->getType(); |
| 3713 | |
| 3714 | BasicBlock *VectorPH = |
| 3715 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 3716 | PHINode *NewPointerPhi = nullptr; |
| 3717 | if (CurrentPart == 0) { |
| 3718 | IRBuilder<>::InsertPointGuard Guard(State.Builder); |
| 3719 | if (State.Builder.GetInsertPoint() != |
| 3720 | State.Builder.GetInsertBlock()->getFirstNonPHIIt()) |
| 3721 | State.Builder.SetInsertPoint( |
| 3722 | State.Builder.GetInsertBlock()->getFirstNonPHIIt()); |
| 3723 | NewPointerPhi = State.Builder.CreatePHI(Ty: ScStValueType, NumReservedValues: 2, Name: "pointer.phi" ); |
| 3724 | NewPointerPhi->addIncoming(V: ScalarStartValue, BB: VectorPH); |
| 3725 | NewPointerPhi->setDebugLoc(getDebugLoc()); |
| 3726 | } else { |
| 3727 | // The recipe has been unrolled. In that case, fetch the single pointer phi |
| 3728 | // shared among all unrolled parts of the recipe. |
| 3729 | auto *GEP = |
| 3730 | cast<GetElementPtrInst>(Val: State.get(Def: getFirstUnrolledPartOperand())); |
| 3731 | NewPointerPhi = cast<PHINode>(Val: GEP->getPointerOperand()); |
| 3732 | } |
| 3733 | |
| 3734 | // The pointer induction itself is performed by using a GEP.
| 3735 | BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); |
| 3736 | Value *ScalarStepValue = State.get(Def: getStepValue(), Lane: VPLane(0)); |
| 3737 | Type *PhiType = State.TypeAnalysis.inferScalarType(V: getStepValue()); |
| 3738 | Value *RuntimeVF = getRuntimeVF(B&: State.Builder, Ty: PhiType, VF: State.VF); |
| 3739 | // Add induction update using an incorrect block temporarily. The phi node |
| 3740 | // will be fixed after VPlan execution. Note that at this point the latch |
| 3741 | // block cannot be used, as it does not exist yet. |
| 3742 | // TODO: Model increment value in VPlan, by turning the recipe into a |
| 3743 | // multi-def and a subclass of VPHeaderPHIRecipe. |
| 3744 | if (CurrentPart == 0) { |
| 3745 | // The recipe represents the first part of the pointer induction. Create the |
| 3746 | // GEP to increment the phi across all unrolled parts. |
| 3747 | Value *NumUnrolledElems = State.get(Def: getOperand(N: 2), IsScalar: true); |
| 3748 | |
| 3749 | Value *InductionGEP = GetElementPtrInst::Create( |
| 3750 | PointeeType: State.Builder.getInt8Ty(), Ptr: NewPointerPhi, |
| 3751 | IdxList: State.Builder.CreateMul( |
| 3752 | LHS: ScalarStepValue, |
| 3753 | RHS: State.Builder.CreateTrunc(V: NumUnrolledElems, DestTy: PhiType)), |
| 3754 | NameStr: "ptr.ind" , InsertBefore: InductionLoc); |
| 3755 | |
| 3756 | NewPointerPhi->addIncoming(V: InductionGEP, BB: VectorPH); |
| 3757 | } |
| 3758 | |
| 3759 | // Create actual address geps that use the pointer phi as base and a |
| 3760 | // vectorized version of the step value (<step*0, ..., step*N>) as offset. |
| 3761 | Type *VecPhiType = VectorType::get(ElementType: PhiType, EC: State.VF); |
| 3762 | Value *StartOffsetScalar = State.Builder.CreateMul( |
| 3763 | LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: CurrentPart)); |
| 3764 | Value *StartOffset = |
| 3765 | State.Builder.CreateVectorSplat(EC: State.VF, V: StartOffsetScalar); |
| 3766 | // Create a vector of consecutive numbers from 0 to VF - 1.
| 3767 | StartOffset = State.Builder.CreateAdd( |
| 3768 | LHS: StartOffset, RHS: State.Builder.CreateStepVector(DstType: VecPhiType)); |
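|      | // For unroll part P the offsets are now <P*VF+0, P*VF+1, ..., P*VF+(VF-1)>;
|      | // they are scaled by the scalar step below to form per-lane pointer offsets.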
| 3769 | |
| 3770 | assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) && |
| 3771 | "scalar step must be the same across all parts" ); |
| 3772 | Value *GEP = State.Builder.CreateGEP( |
| 3773 | Ty: State.Builder.getInt8Ty(), Ptr: NewPointerPhi, |
| 3774 | IdxList: State.Builder.CreateMul(LHS: StartOffset, RHS: State.Builder.CreateVectorSplat( |
| 3775 | EC: State.VF, V: ScalarStepValue)), |
| 3776 | Name: "vector.gep" ); |
| 3777 | State.set(Def: this, V: GEP); |
| 3778 | } |
| 3779 | |
| 3780 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3781 | void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3782 | VPSlotTracker &SlotTracker) const { |
| 3783 | assert((getNumOperands() == 3 || getNumOperands() == 5) && |
| 3784 | "unexpected number of operands" ); |
| 3785 | O << Indent << "EMIT " ; |
| 3786 | printAsOperand(O, SlotTracker); |
| 3787 | O << " = WIDEN-POINTER-INDUCTION " ; |
| 3788 | getStartValue()->printAsOperand(O, SlotTracker); |
| 3789 | O << ", " ; |
| 3790 | getStepValue()->printAsOperand(O, SlotTracker); |
| 3791 | O << ", " ; |
| 3792 | getOperand(2)->printAsOperand(O, SlotTracker); |
| 3793 | if (getNumOperands() == 5) { |
| 3794 | O << ", " ; |
| 3795 | getOperand(3)->printAsOperand(O, SlotTracker); |
| 3796 | O << ", " ; |
| 3797 | getOperand(4)->printAsOperand(O, SlotTracker); |
| 3798 | } |
| 3799 | } |
| 3800 | #endif |
| 3801 | |
| 3802 | void VPExpandSCEVRecipe::execute(VPTransformState &State) { |
| 3803 | assert(!State.Lane && "cannot be used in per-lane execution" );
| 3804 | const DataLayout &DL = SE.getDataLayout(); |
| 3805 | SCEVExpander Exp(SE, DL, "induction" , /*PreserveLCSSA=*/true); |
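|      | // Expand the SCEV at the current insert point. The result is a single
|      | // scalar, so only lane 0 of this recipe is defined.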
| 3806 | Value *Res = Exp.expandCodeFor(SH: Expr, Ty: Expr->getType(), |
| 3807 | I: &*State.Builder.GetInsertPoint()); |
| 3808 | State.set(Def: this, V: Res, Lane: VPLane(0)); |
| 3809 | } |
| 3810 | |
| 3811 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3812 | void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3813 | VPSlotTracker &SlotTracker) const { |
| 3814 | O << Indent << "EMIT " ; |
| 3815 | printAsOperand(O, SlotTracker); |
| 3816 | O << " = EXPAND SCEV " << *Expr; |
| 3817 | } |
| 3818 | #endif |
| 3819 | |
| 3820 | void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { |
| 3821 | Value *CanonicalIV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
| 3822 | Type *STy = CanonicalIV->getType(); |
| 3823 | IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); |
| 3824 | ElementCount VF = State.VF; |
| 3825 | Value *VStart = VF.isScalar() |
| 3826 | ? CanonicalIV |
| 3827 | : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast" ); |
| 3828 | Value *VStep = createStepForVF(B&: Builder, Ty: STy, VF, Step: getUnrollPart(U&: *this)); |
| 3829 | if (VF.isVector()) { |
| 3830 | VStep = Builder.CreateVectorSplat(EC: VF, V: VStep); |
| 3831 | VStep = |
| 3832 | Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType())); |
| 3833 | } |
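|      | // E.g., for VF 4 and unroll part 1 this yields <IV+4, IV+5, IV+6, IV+7>.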
| 3834 | Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv" ); |
| 3835 | State.set(Def: this, V: CanonicalVectorIV); |
| 3836 | } |
| 3837 | |
| 3838 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3839 | void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3840 | VPSlotTracker &SlotTracker) const { |
| 3841 | O << Indent << "EMIT " ; |
| 3842 | printAsOperand(O, SlotTracker); |
| 3843 | O << " = WIDEN-CANONICAL-INDUCTION " ; |
| 3844 | printOperands(O, SlotTracker); |
| 3845 | } |
| 3846 | #endif |
| 3847 | |
| 3848 | void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { |
| 3849 | auto &Builder = State.Builder; |
| 3850 | // Create a vector from the initial value. |
| 3851 | auto *VectorInit = getStartValue()->getLiveInIRValue(); |
| 3852 | |
| 3853 | Type *VecTy = State.VF.isScalar() |
| 3854 | ? VectorInit->getType() |
| 3855 | : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF); |
| 3856 | |
| 3857 | BasicBlock *VectorPH = |
| 3858 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 3859 | if (State.VF.isVector()) { |
| 3860 | auto *IdxTy = Builder.getInt32Ty(); |
| 3861 | auto *One = ConstantInt::get(Ty: IdxTy, V: 1); |
| 3862 | IRBuilder<>::InsertPointGuard Guard(Builder); |
| 3863 | Builder.SetInsertPoint(VectorPH->getTerminator()); |
| 3864 | auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF); |
| 3865 | auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One); |
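|      | // Only the last lane of the initial vector is used by the recurrence in the
|      | // first vector iteration; the remaining lanes can stay poison.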
| 3866 | VectorInit = Builder.CreateInsertElement( |
| 3867 | Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init" ); |
| 3868 | } |
| 3869 | |
| 3870 | // Create a phi node for the new recurrence. |
| 3871 | PHINode *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur" ); |
| 3872 | Phi->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt()); |
| 3873 | Phi->addIncoming(V: VectorInit, BB: VectorPH); |
| 3874 | State.set(Def: this, V: Phi); |
| 3875 | } |
| 3876 | |
| 3877 | InstructionCost |
| 3878 | VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, |
| 3879 | VPCostContext &Ctx) const { |
| 3880 | if (VF.isScalar()) |
| 3881 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 3882 | |
| 3883 | return 0; |
| 3884 | } |
| 3885 | |
| 3886 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3887 | void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3888 | VPSlotTracker &SlotTracker) const { |
| 3889 | O << Indent << "FIRST-ORDER-RECURRENCE-PHI " ; |
| 3890 | printAsOperand(O, SlotTracker); |
| 3891 | O << " = phi " ; |
| 3892 | printOperands(O, SlotTracker); |
| 3893 | } |
| 3894 | #endif |
| 3895 | |
| 3896 | void VPReductionPHIRecipe::execute(VPTransformState &State) { |
| 3897 | // Reductions do not have to start at zero. They can start with |
| 3898 | // any loop-invariant value.
| 3899 | VPValue *StartVPV = getStartValue(); |
| 3900 | |
| 3901 | // In order to support recurrences we need to be able to vectorize Phi nodes. |
| 3902 | // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
| 3903 | // stage #1: We create a new vector PHI node with no incoming edges. We'll use |
| 3904 | // this value when we vectorize all of the instructions that use the PHI. |
| 3905 | BasicBlock *VectorPH = |
| 3906 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
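|      | // In-loop reductions and scalar VFs keep a scalar accumulator phi; otherwise
|      | // a vector phi holding one partial reduction value per lane is created.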
| 3907 | bool ScalarPHI = State.VF.isScalar() || IsInLoop; |
| 3908 | Value *StartV = State.get(Def: StartVPV, IsScalar: ScalarPHI); |
| 3909 | Type *VecTy = StartV->getType(); |
| 3910 | |
| 3911 | BasicBlock *HeaderBB = State.CFG.PrevBB;
| 3912 | assert(State.CurrentParentLoop->getHeader() == HeaderBB && |
| 3913 | "recipe must be in the vector loop header" ); |
| 3914 | auto *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi" ); |
| 3915 | Phi->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt()); |
| 3916 | State.set(Def: this, V: Phi, IsScalar: IsInLoop); |
| 3917 | |
| 3918 | Phi->addIncoming(V: StartV, BB: VectorPH); |
| 3919 | } |
| 3920 | |
| 3921 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3922 | void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3923 | VPSlotTracker &SlotTracker) const { |
| 3924 | O << Indent << "WIDEN-REDUCTION-PHI " ; |
| 3925 | |
| 3926 | printAsOperand(O, SlotTracker); |
| 3927 | O << " = phi " ; |
| 3928 | printOperands(O, SlotTracker); |
| 3929 | if (VFScaleFactor != 1) |
| 3930 | O << " (VF scaled by 1/" << VFScaleFactor << ")" ; |
| 3931 | } |
| 3932 | #endif |
| 3933 | |
| 3934 | void VPWidenPHIRecipe::execute(VPTransformState &State) { |
| 3935 | Value *Op0 = State.get(Def: getOperand(N: 0)); |
| 3936 | Type *VecTy = Op0->getType(); |
| 3937 | Instruction *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name); |
| 3938 | // Manually move it with the other PHIs in case PHI recipes above this one |
| 3939 | // also inserted non-phi instructions. |
| 3940 | // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in |
| 3941 | // convertToConcreteRecipes. |
| 3942 | VecPhi->moveBefore(InsertPos: State.Builder.GetInsertBlock()->getFirstNonPHIIt()); |
| 3943 | State.set(Def: this, V: VecPhi); |
| 3944 | } |
| 3945 | |
| 3946 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3947 | void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3948 | VPSlotTracker &SlotTracker) const { |
| 3949 | O << Indent << "WIDEN-PHI " ; |
| 3950 | |
| 3951 | printAsOperand(O, SlotTracker); |
| 3952 | O << " = phi " ; |
| 3953 | printPhiOperands(O, SlotTracker); |
| 3954 | } |
| 3955 | #endif |
| 3956 | |
| 3957 | // TODO: It would be good to use the existing VPWidenPHIRecipe instead and |
| 3958 | // remove VPActiveLaneMaskPHIRecipe. |
| 3959 | void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { |
| 3960 | BasicBlock *VectorPH = |
| 3961 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 3962 | Value *StartMask = State.get(Def: getOperand(N: 0)); |
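|      | // Only the incoming value from the preheader is added here; the value coming
|      | // from the latch is filled in once the mask for the next iteration exists.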
| 3963 | PHINode *Phi = |
| 3964 | State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask" ); |
| 3965 | Phi->addIncoming(V: StartMask, BB: VectorPH); |
| 3966 | State.set(Def: this, V: Phi); |
| 3967 | } |
| 3968 | |
| 3969 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3970 | void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3971 | VPSlotTracker &SlotTracker) const { |
| 3972 | O << Indent << "ACTIVE-LANE-MASK-PHI " ; |
| 3973 | |
| 3974 | printAsOperand(O, SlotTracker); |
| 3975 | O << " = phi " ; |
| 3976 | printOperands(O, SlotTracker); |
| 3977 | } |
| 3978 | #endif |
| 3979 | |
| 3980 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3981 | void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3982 | VPSlotTracker &SlotTracker) const { |
| 3983 | O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI " ; |
| 3984 | |
| 3985 | printAsOperand(O, SlotTracker); |
| 3986 | O << " = phi " ; |
| 3987 | printOperands(O, SlotTracker); |
| 3988 | } |
| 3989 | #endif |
| 3990 | |