VectorCombine.cpp source code [llvm_projects/llvm/lib/Transforms/Vectorize/VectorCombine.cpp]

1	//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This pass optimizes scalar/vector interactions using target cost models. The
10	// transforms implemented here may not fit in traditional loop-based or SLP
11	// vectorization passes.
12	//
13	//===----------------------------------------------------------------------===//
14
15	#include "llvm/Transforms/Vectorize/VectorCombine.h"
16	#include "llvm/ADT/DenseMap.h"
17	#include "llvm/ADT/STLExtras.h"
18	#include "llvm/ADT/ScopeExit.h"
19	#include "llvm/ADT/Statistic.h"
20	#include "llvm/Analysis/AssumptionCache.h"
21	#include "llvm/Analysis/BasicAliasAnalysis.h"
22	#include "llvm/Analysis/GlobalsModRef.h"
23	#include "llvm/Analysis/Loads.h"
24	#include "llvm/Analysis/TargetTransformInfo.h"
25	#include "llvm/Analysis/ValueTracking.h"
26	#include "llvm/Analysis/VectorUtils.h"
27	#include "llvm/IR/Dominators.h"
28	#include "llvm/IR/Function.h"
29	#include "llvm/IR/IRBuilder.h"
30	#include "llvm/IR/PatternMatch.h"
31	#include "llvm/Support/CommandLine.h"
32	#include "llvm/Transforms/Utils/Local.h"
33	#include "llvm/Transforms/Utils/LoopUtils.h"
34	#include <numeric>
35	#include <queue>
36
37	#define DEBUG_TYPE "vector-combine"
38	#include "llvm/Transforms/Utils/InstructionWorklist.h"
39
40	using namespace llvm;
41	using namespace llvm::PatternMatch;
42
43	STATISTIC(NumVecLoad, "Number of vector loads formed");
44	STATISTIC(NumVecCmp, "Number of vector compares formed");
45	STATISTIC(NumVecBO, "Number of vector binops formed");
46	STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
47	STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
48	STATISTIC(NumScalarBO, "Number of scalar binops formed");
49	STATISTIC(NumScalarCmp, "Number of scalar compares formed");
50
51	static cl::opt<bool> DisableVectorCombine(
52	"disable-vector-combine", cl::init(Val: false), cl::Hidden,
53	cl::desc ("Disable all vector combine transforms"));
54
55	static cl::opt<bool> DisableBinopExtractShuffle(
56	"disable-binop-extract-shuffle", cl::init(Val: false), cl::Hidden,
57	cl::desc ("Disable binop extract to shuffle transforms"));
58
59	static cl::opt<unsigned> MaxInstrsToScan(
60	"vector-combine-max-scan-instrs", cl::init(Val: `30`), cl::Hidden,
61	cl::desc ("Max number of instructions to scan for vector combining."));
62
63	static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
64
65	namespace {
66	class VectorCombine {
67	public:
68	VectorCombine(Function &F, const TargetTransformInfo &TTI,
69	const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
70	const DataLayout DL, bool* TryEarlyFoldsOnly)
71	: F(F), Builder (F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), DL(DL),
72	TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
73
74	bool run();
75
76	private:
77	Function &F;
78	IRBuilder<> Builder;
79	const TargetTransformInfo &TTI;
80	const DominatorTree &DT;
81	AAResults &AA;
82	AssumptionCache &AC;
83	const DataLayout *DL;
84
85	/// If true, only perform beneficial early IR transforms. Do not introduce new
86	/// vector operations.
87	bool TryEarlyFoldsOnly;
88
89	InstructionWorklist Worklist;
90
91	// TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
92	// parameter. That should be updated to specific sub-classes because the
93	// run loop was changed to dispatch on opcode.
94	bool vectorizeLoadInsert(Instruction &I);
95	bool widenSubvectorLoad(Instruction &I);
96	ExtractElementInst getShuffleExtract(ExtractElementInst Ext0,
97	ExtractElementInst *Ext1,
98	unsigned PreferredExtractIndex) const;
99	bool isExtractExtractCheap(ExtractElementInst Ext0, ExtractElementInst Ext1,
100	const Instruction &I,
101	ExtractElementInst *&ConvertToShuffle,
102	unsigned PreferredExtractIndex);
103	void foldExtExtCmp(ExtractElementInst Ext0, ExtractElementInst Ext1,
104	Instruction &I);
105	void foldExtExtBinop(ExtractElementInst Ext0, ExtractElementInst Ext1,
106	Instruction &I);
107	bool foldExtractExtract(Instruction &I);
108	bool foldInsExtFNeg(Instruction &I);
109	bool foldBitcastShuffle(Instruction &I);
110	bool scalarizeBinopOrCmp(Instruction &I);
111	bool scalarizeVPIntrinsic(Instruction &I);
112	bool foldExtractedCmps(Instruction &I);
113	bool foldSingleElementStore(Instruction &I);
114	bool scalarizeLoadExtract(Instruction &I);
115	bool foldShuffleOfBinops(Instruction &I);
116	bool foldShuffleOfCastops(Instruction &I);
117	bool foldShuffleOfShuffles(Instruction &I);
118	bool foldShuffleToIdentity(Instruction &I);
119	bool foldShuffleFromReductions(Instruction &I);
120	bool foldCastFromReductions(Instruction &I);
121	bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
122
123	void replaceValue(Value &Old, Value &New) {
124	Old.replaceAllUsesWith(V: &New);
125	if (auto *NewI = dyn_cast<Instruction>(Val: &New)) {
126	New.takeName(V: &Old);
127	Worklist.pushUsersToWorkList(I&: *NewI);
128	Worklist.pushValue(V: NewI);
129	}
130	Worklist.pushValue(V: &Old);
131	}
132
133	void eraseInstruction(Instruction &I) {
134	for (Value *Op : I.operands())
135	Worklist.pushValue(V: Op);
136	Worklist.remove(I: &I);
137	I.eraseFromParent();
138	}
139	};
140	} // namespace
141
142	/// Return the source operand of a potentially bitcasted value. If there is no
143	/// bitcast, return the input value itself.
144	static Value peekThroughBitcasts(Value V) {
145	while (auto *BitCast = dyn_cast<BitCastInst>(Val: V))
146	V = BitCast->getOperand(i_nocapture: `0`);
147	return V;
148	}
149
150	static bool canWidenLoad(LoadInst Load, const* TargetTransformInfo &TTI) {
151	// Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
152	// The widened load may load data from dirty regions or create data races
153	// non-existent in the source.
154	if (!Load \|\| !Load->isSimple() \|\| !Load->hasOneUse() \|\|
155	Load->getFunction()->hasFnAttribute(Kind: Attribute::SanitizeMemTag) \|\|
156	mustSuppressSpeculation(LI: *Load))
157	return false;
158
159	// We are potentially transforming byte-sized (8-bit) memory accesses, so make
160	// sure we have all of our type-based constraints in place for this target.
161	Type *ScalarTy = Load->getType()->getScalarType();
162	uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
163	unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
164	if (!ScalarSize \|\| !MinVectorSize \|\| MinVectorSize % ScalarSize != `0` \|\|
165	ScalarSize % `8` != `0`)
166	return false;
167
168	return true;
169	}
170
171	bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
172	// Match insert into fixed vector of scalar value.
173	// TODO: Handle non-zero insert index.
174	Value *Scalar;
175	if (!match(V: &I, P: m_InsertElt(Val: m_Undef(), Elt: m_Value(V&: Scalar), Idx: m_ZeroInt())) \|\|
176	!Scalar->hasOneUse())
177	return false;
178
179	// Optionally match an extract from another vector.
180	Value *X;
181	bool HasExtract = match(V: Scalar, P: m_ExtractElt(Val: m_Value(V&: X), Idx: m_ZeroInt()));
182	if (!HasExtract)
183	X = Scalar;
184
185	auto *Load = dyn_cast<LoadInst>(Val: X);
186	if (!canWidenLoad(Load, TTI))
187	return false;
188
189	Type *ScalarTy = Scalar->getType();
190	uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
191	unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
192
193	// Check safety of replacing the scalar load with a larger vector load.
194	// We use minimal alignment (maximum flexibility) because we only care about
195	// the dereferenceable region. When calculating cost and creating a new op,
196	// we may use a larger value based on alignment attributes.
197	Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
198	assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
199
200	unsigned MinVecNumElts = MinVectorSize / ScalarSize;
201	auto MinVecTy = VectorType::get(ElementType: ScalarTy, NumElements: MinVecNumElts, Scalable: false*);
202	unsigned OffsetEltIndex = `0`;
203	Align Alignment = Load->getAlign();
204	if (!isSafeToLoadUnconditionally(V: SrcPtr, Ty: MinVecTy, Alignment: Align (`1`), DL: *DL, ScanFrom: Load, AC: &AC,
205	DT: &DT)) {
206	// It is not safe to load directly from the pointer, but we can still peek
207	// through gep offsets and check if it safe to load from a base address with
208	// updated alignment. If it is, we can shuffle the element(s) into place
209	// after loading.
210	unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(Ty: SrcPtr->getType());
211	APInt Offset(OffsetBitWidth, `0`);
212	SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL: *DL, Offset);
213
214	// We want to shuffle the result down from a high element of a vector, so
215	// the offset must be positive.
216	if (Offset.isNegative())
217	return false;
218
219	// The offset must be a multiple of the scalar element to shuffle cleanly
220	// in the element's size.
221	uint64_t ScalarSizeInBytes = ScalarSize / `8`;
222	if (Offset.urem(RHS: ScalarSizeInBytes) != `0`)
223	return false;
224
225	// If we load MinVecNumElts, will our target element still be loaded?
226	OffsetEltIndex = Offset.udiv(RHS: ScalarSizeInBytes).getZExtValue();
227	if (OffsetEltIndex >= MinVecNumElts)
228	return false;
229
230	if (!isSafeToLoadUnconditionally(V: SrcPtr, Ty: MinVecTy, Alignment: Align (`1`), DL: *DL, ScanFrom: Load, AC: &AC,
231	DT: &DT))
232	return false;
233
234	// Update alignment with offset value. Note that the offset could be negated
235	// to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
236	// negation does not change the result of the alignment calculation.
237	Alignment = commonAlignment(A: Alignment, Offset: Offset.getZExtValue());
238	}
239
240	// Original pattern: insertelt undef, load [free casts of] PtrOp, 0
241	// Use the greater of the alignment on the load or its source pointer.
242	Alignment = std::max(a: SrcPtr->getPointerAlignment(DL: *DL), b: Alignment);
243	Type *LoadTy = Load->getType();
244	unsigned AS = Load->getPointerAddressSpace();
245	InstructionCost OldCost =
246	TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadTy, Alignment, AddressSpace: AS);
247	APInt DemandedElts = APInt::getOneBitSet(numBits: MinVecNumElts, BitNo: `0`);
248	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
249	OldCost +=
250	TTI.getScalarizationOverhead(Ty: MinVecTy, DemandedElts,
251	/ Insert / true, Extract: HasExtract, CostKind);
252
253	// New pattern: load VecPtr
254	InstructionCost NewCost =
255	TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: MinVecTy, Alignment, AddressSpace: AS);
256	// Optionally, we are shuffling the loaded vector element(s) into place.
257	// For the mask set everything but element 0 to undef to prevent poison from
258	// propagating from the extra loaded memory. This will also optionally
259	// shrink/grow the vector from the loaded size to the output size.
260	// We assume this operation has no cost in codegen if there was no offset.
261	// Note that we could use freeze to avoid poison problems, but then we might
262	// still need a shuffle to change the vector size.
263	auto *Ty = cast<FixedVectorType>(Val: I.getType());
264	unsigned OutputNumElts = Ty->getNumElements();
265	SmallVector<int, `16`> Mask(OutputNumElts, PoisonMaskElem);
266	assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
267	Mask [`0`] = OffsetEltIndex;
268	if (OffsetEltIndex)
269	NewCost += TTI.getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: MinVecTy, Mask);
270
271	// We can aggressively convert to the vector form because the backend can
272	// invert this transform if it does not result in a performance win.
273	if (OldCost < NewCost \|\| !NewCost.isValid())
274	return false;
275
276	// It is safe and potentially profitable to load a vector directly:
277	// inselt undef, load Scalar, 0 --> load VecPtr
278	IRBuilder<> Builder(Load);
279	Value *CastedPtr =
280	Builder.CreatePointerBitCastOrAddrSpaceCast(V: SrcPtr, DestTy: Builder.getPtrTy(AddrSpace: AS));
281	Value *VecLd = Builder.CreateAlignedLoad(Ty: MinVecTy, Ptr: CastedPtr, Align: Alignment);
282	VecLd = Builder.CreateShuffleVector(V: VecLd, Mask);
283
284	replaceValue(Old&: I, New&: *VecLd);
285	++NumVecLoad;
286	return true;
287	}
288
289	/// If we are loading a vector and then inserting it into a larger vector with
290	/// undefined elements, try to load the larger vector and eliminate the insert.
291	/// This removes a shuffle in IR and may allow combining of other loaded values.
292	bool VectorCombine::widenSubvectorLoad(Instruction &I) {
293	// Match subvector insert of fixed vector.
294	auto *Shuf = cast<ShuffleVectorInst>(Val: &I);
295	if (!Shuf->isIdentityWithPadding())
296	return false;
297
298	// Allow a non-canonical shuffle mask that is choosing elements from op1.
299	unsigned NumOpElts =
300	cast<FixedVectorType>(Val: Shuf->getOperand(i_nocapture: `0`)->getType())->getNumElements();
301	unsigned OpIndex = any_of(Range: Shuf->getShuffleMask(), P: [&NumOpElts](int M) {
302	return M >= (int)(NumOpElts);
303	});
304
305	auto *Load = dyn_cast<LoadInst>(Val: Shuf->getOperand(i_nocapture: OpIndex));
306	if (!canWidenLoad(Load, TTI))
307	return false;
308
309	// We use minimal alignment (maximum flexibility) because we only care about
310	// the dereferenceable region. When calculating cost and creating a new op,
311	// we may use a larger value based on alignment attributes.
312	auto *Ty = cast<FixedVectorType>(Val: I.getType());
313	Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
314	assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
315	Align Alignment = Load->getAlign();
316	if (!isSafeToLoadUnconditionally(V: SrcPtr, Ty, Alignment: Align (`1`), DL: *DL, ScanFrom: Load, AC: &AC, DT: &DT))
317	return false;
318
319	Alignment = std::max(a: SrcPtr->getPointerAlignment(DL: *DL), b: Alignment);
320	Type *LoadTy = Load->getType();
321	unsigned AS = Load->getPointerAddressSpace();
322
323	// Original pattern: insert_subvector (load PtrOp)
324	// This conservatively assumes that the cost of a subvector insert into an
325	// undef value is 0. We could add that cost if the cost model accurately
326	// reflects the real cost of that operation.
327	InstructionCost OldCost =
328	TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadTy, Alignment, AddressSpace: AS);
329
330	// New pattern: load PtrOp
331	InstructionCost NewCost =
332	TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: Ty, Alignment, AddressSpace: AS);
333
334	// We can aggressively convert to the vector form because the backend can
335	// invert this transform if it does not result in a performance win.
336	if (OldCost < NewCost \|\| !NewCost.isValid())
337	return false;
338
339	IRBuilder<> Builder(Load);
340	Value *CastedPtr =
341	Builder.CreatePointerBitCastOrAddrSpaceCast(V: SrcPtr, DestTy: Builder.getPtrTy(AddrSpace: AS));
342	Value *VecLd = Builder.CreateAlignedLoad(Ty, Ptr: CastedPtr, Align: Alignment);
343	replaceValue(Old&: I, New&: *VecLd);
344	++NumVecLoad;
345	return true;
346	}
347
348	/// Determine which, if any, of the inputs should be replaced by a shuffle
349	/// followed by extract from a different index.
350	ExtractElementInst *VectorCombine::getShuffleExtract(
351	ExtractElementInst Ext0, ExtractElementInst Ext1,
352	unsigned PreferredExtractIndex = InvalidIndex) const {
353	auto *Index0C = dyn_cast<ConstantInt>(Val: Ext0->getIndexOperand());
354	auto *Index1C = dyn_cast<ConstantInt>(Val: Ext1->getIndexOperand());
355	assert(Index0C && Index1C && "Expected constant extract indexes");
356
357	unsigned Index0 = Index0C->getZExtValue();
358	unsigned Index1 = Index1C->getZExtValue();
359
360	// If the extract indexes are identical, no shuffle is needed.
361	if (Index0 == Index1)
362	return nullptr;
363
364	Type *VecTy = Ext0->getVectorOperand()->getType();
365	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
366	assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
367	InstructionCost Cost0 =
368	TTI.getVectorInstrCost(I: *Ext0, Val: VecTy, CostKind, Index: Index0);
369	InstructionCost Cost1 =
370	TTI.getVectorInstrCost(I: *Ext1, Val: VecTy, CostKind, Index: Index1);
371
372	// If both costs are invalid no shuffle is needed
373	if (!Cost0.isValid() && !Cost1.isValid())
374	return nullptr;
375
376	// We are extracting from 2 different indexes, so one operand must be shuffled
377	// before performing a vector operation and/or extract. The more expensive
378	// extract will be replaced by a shuffle.
379	if (Cost0 > Cost1)
380	return Ext0;
381	if (Cost1 > Cost0)
382	return Ext1;
383
384	// If the costs are equal and there is a preferred extract index, shuffle the
385	// opposite operand.
386	if (PreferredExtractIndex == Index0)
387	return Ext1;
388	if (PreferredExtractIndex == Index1)
389	return Ext0;
390
391	// Otherwise, replace the extract with the higher index.
392	return Index0 > Index1 ? Ext0 : Ext1;
393	}
394
395	/// Compare the relative costs of 2 extracts followed by scalar operation vs.
396	/// vector operation(s) followed by extract. Return true if the existing
397	/// instructions are cheaper than a vector alternative. Otherwise, return false
398	/// and if one of the extracts should be transformed to a shufflevector, set
399	/// \p ConvertToShuffle to that extract instruction.
400	bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
401	ExtractElementInst *Ext1,
402	const Instruction &I,
403	ExtractElementInst *&ConvertToShuffle,
404	unsigned PreferredExtractIndex) {
405	auto *Ext0IndexC = dyn_cast<ConstantInt>(Val: Ext0->getOperand(i_nocapture: `1`));
406	auto *Ext1IndexC = dyn_cast<ConstantInt>(Val: Ext1->getOperand(i_nocapture: `1`));
407	assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
408
409	unsigned Opcode = I.getOpcode();
410	Type *ScalarTy = Ext0->getType();
411	auto *VecTy = cast<VectorType>(Val: Ext0->getOperand(i_nocapture: `0`)->getType());
412	InstructionCost ScalarOpCost, VectorOpCost;
413
414	// Get cost estimates for scalar and vector versions of the operation.
415	bool IsBinOp = Instruction::isBinaryOp(Opcode);
416	if (IsBinOp) {
417	ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: ScalarTy);
418	VectorOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: VecTy);
419	} else {
420	assert((Opcode == Instruction::ICmp \|\| Opcode == Instruction::FCmp) &&
421	"Expected a compare");
422	CmpInst::Predicate Pred = cast<CmpInst>(Val: I).getPredicate();
423	ScalarOpCost = TTI.getCmpSelInstrCost(
424	Opcode, ValTy: ScalarTy, CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), VecPred: Pred);
425	VectorOpCost = TTI.getCmpSelInstrCost(
426	Opcode, ValTy: VecTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VecTy), VecPred: Pred);
427	}
428
429	// Get cost estimates for the extract elements. These costs will factor into
430	// both sequences.
431	unsigned Ext0Index = Ext0IndexC->getZExtValue();
432	unsigned Ext1Index = Ext1IndexC->getZExtValue();
433	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
434
435	InstructionCost Extract0Cost =
436	TTI.getVectorInstrCost(I: *Ext0, Val: VecTy, CostKind, Index: Ext0Index);
437	InstructionCost Extract1Cost =
438	TTI.getVectorInstrCost(I: *Ext1, Val: VecTy, CostKind, Index: Ext1Index);
439
440	// A more expensive extract will always be replaced by a splat shuffle.
441	// For example, if Ext0 is more expensive:
442	// opcode (extelt V0, Ext0), (ext V1, Ext1) -->
443	// extelt (opcode (splat V0, Ext0), V1), Ext1
444	// TODO: Evaluate whether that always results in lowest cost. Alternatively,
445	// check the cost of creating a broadcast shuffle and shuffling both
446	// operands to element 0.
447	InstructionCost CheapExtractCost = std::min(a: Extract0Cost, b: Extract1Cost);
448
449	// Extra uses of the extracts mean that we include those costs in the
450	// vector total because those instructions will not be eliminated.
451	InstructionCost OldCost, NewCost;
452	if (Ext0->getOperand(i_nocapture: `0`) == Ext1->getOperand(i_nocapture: `0`) && Ext0Index == Ext1Index) {
453	// Handle a special case. If the 2 extracts are identical, adjust the
454	// formulas to account for that. The extra use charge allows for either the
455	// CSE'd pattern or an unoptimized form with identical values:
456	// opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
457	bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(N: `2`)
458	: !Ext0->hasOneUse() \|\| !Ext1->hasOneUse();
459	OldCost = CheapExtractCost + ScalarOpCost;
460	NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
461	} else {
462	// Handle the general case. Each extract is actually a different value:
463	// opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
464	OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
465	NewCost = VectorOpCost + CheapExtractCost +
466	!Ext0->hasOneUse() * Extract0Cost +
467	!Ext1->hasOneUse() * Extract1Cost;
468	}
469
470	ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
471	if (ConvertToShuffle) {
472	if (IsBinOp && DisableBinopExtractShuffle)
473	return true;
474
475	// If we are extracting from 2 different indexes, then one operand must be
476	// shuffled before performing the vector operation. The shuffle mask is
477	// poison except for 1 lane that is being translated to the remaining
478	// extraction lane. Therefore, it is a splat shuffle. Ex:
479	// ShufMask = { poison, poison, 0, poison }
480	// TODO: The cost model has an option for a "broadcast" shuffle
481	// (splat-from-element-0), but no option for a more general splat.
482	NewCost +=
483	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy);
484	}
485
486	// Aggressively form a vector op if the cost is equal because the transform
487	// may enable further optimization.
488	// Codegen can reverse this transform (scalarize) if it was not profitable.
489	return OldCost < NewCost;
490	}
491
492	/// Create a shuffle that translates (shifts) 1 element from the input vector
493	/// to a new element location.
494	static Value createShiftShuffle(Value Vec, unsigned OldIndex,
495	unsigned NewIndex, IRBuilder<> &Builder) {
496	// The shuffle mask is poison except for 1 lane that is being translated
497	// to the new element index. Example for OldIndex == 2 and NewIndex == 0:
498	// ShufMask = { 2, poison, poison, poison }
499	auto *VecTy = cast<FixedVectorType>(Val: Vec->getType());
500	SmallVector<int, `32`> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
501	ShufMask [NewIndex] = OldIndex;
502	return Builder.CreateShuffleVector(V: Vec, Mask: ShufMask, Name: "shift");
503	}
504
505	/// Given an extract element instruction with constant index operand, shuffle
506	/// the source vector (shift the scalar element) to a NewIndex for extraction.
507	/// Return null if the input can be constant folded, so that we are not creating
508	/// unnecessary instructions.
509	static ExtractElementInst translateExtract(ExtractElementInst ExtElt,
510	unsigned NewIndex,
511	IRBuilder<> &Builder) {
512	// Shufflevectors can only be created for fixed-width vectors.
513	if (!isa<FixedVectorType>(Val: ExtElt->getOperand(i_nocapture: `0`)->getType()))
514	return nullptr;
515
516	// If the extract can be constant-folded, this code is unsimplified. Defer
517	// to other passes to handle that.
518	Value *X = ExtElt->getVectorOperand();
519	Value *C = ExtElt->getIndexOperand();
520	assert(isa<ConstantInt>(C) && "Expected a constant index operand");
521	if (isa<Constant>(Val: X))
522	return nullptr;
523
524	Value *Shuf = createShiftShuffle(Vec: X, OldIndex: cast<ConstantInt>(Val: C)->getZExtValue(),
525	NewIndex, Builder);
526	return cast<ExtractElementInst>(Val: Builder.CreateExtractElement(Vec: Shuf, Idx: NewIndex));
527	}
528
529	/// Try to reduce extract element costs by converting scalar compares to vector
530	/// compares followed by extract.
531	/// cmp (ext0 V0, C), (ext1 V1, C)
532	void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
533	ExtractElementInst *Ext1, Instruction &I) {
534	assert(isa<CmpInst>(&I) && "Expected a compare");
535	assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
536	cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
537	"Expected matching constant extract indexes");
538
539	// cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
540	++NumVecCmp;
541	CmpInst::Predicate Pred = cast<CmpInst>(Val: &I)->getPredicate();
542	Value V0 = Ext0->getVectorOperand(), V1 = Ext1->getVectorOperand();
543	Value *VecCmp = Builder.CreateCmp(Pred, LHS: V0, RHS: V1);
544	Value *NewExt = Builder.CreateExtractElement(Vec: VecCmp, Idx: Ext0->getIndexOperand());
545	replaceValue(Old&: I, New&: *NewExt);
546	}
547
548	/// Try to reduce extract element costs by converting scalar binops to vector
549	/// binops followed by extract.
550	/// bo (ext0 V0, C), (ext1 V1, C)
551	void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
552	ExtractElementInst *Ext1, Instruction &I) {
553	assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
554	assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
555	cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
556	"Expected matching constant extract indexes");
557
558	// bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
559	++NumVecBO;
560	Value V0 = Ext0->getVectorOperand(), V1 = Ext1->getVectorOperand();
561	Value *VecBO =
562	Builder.CreateBinOp(Opc: cast<BinaryOperator>(Val: &I)->getOpcode(), LHS: V0, RHS: V1);
563
564	// All IR flags are safe to back-propagate because any potential poison
565	// created in unused vector elements is discarded by the extract.
566	if (auto *VecBOInst = dyn_cast<Instruction>(Val: VecBO))
567	VecBOInst->copyIRFlags(V: &I);
568
569	Value *NewExt = Builder.CreateExtractElement(Vec: VecBO, Idx: Ext0->getIndexOperand());
570	replaceValue(Old&: I, New&: *NewExt);
571	}
572
573	/// Match an instruction with extracted vector operands.
574	bool VectorCombine::foldExtractExtract(Instruction &I) {
575	// It is not safe to transform things like div, urem, etc. because we may
576	// create undefined behavior when executing those on unknown vector elements.
577	if (!isSafeToSpeculativelyExecute(I: &I))
578	return false;
579
580	Instruction I0, I1;
581	CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
582	if (!match(V: &I, P: m_Cmp(Pred, L: m_Instruction(I&: I0), R: m_Instruction(I&: I1))) &&
583	!match(V: &I, P: m_BinOp(L: m_Instruction(I&: I0), R: m_Instruction(I&: I1))))
584	return false;
585
586	Value V0, V1;
587	uint64_t C0, C1;
588	if (!match(V: I0, P: m_ExtractElt(Val: m_Value(V&: V0), Idx: m_ConstantInt(V&: C0))) \|\|
589	!match(V: I1, P: m_ExtractElt(Val: m_Value(V&: V1), Idx: m_ConstantInt(V&: C1))) \|\|
590	V0->getType() != V1->getType())
591	return false;
592
593	// If the scalar value 'I' is going to be re-inserted into a vector, then try
594	// to create an extract to that same element. The extract/insert can be
595	// reduced to a "select shuffle".
596	// TODO: If we add a larger pattern match that starts from an insert, this
597	// probably becomes unnecessary.
598	auto *Ext0 = cast<ExtractElementInst>(Val: I0);
599	auto *Ext1 = cast<ExtractElementInst>(Val: I1);
600	uint64_t InsertIndex = InvalidIndex;
601	if (I.hasOneUse())
602	match(V: I.user_back(),
603	P: m_InsertElt(Val: m_Value(), Elt: m_Value(), Idx: m_ConstantInt(V&: InsertIndex)));
604
605	ExtractElementInst *ExtractToChange;
606	if (isExtractExtractCheap(Ext0, Ext1, I, ConvertToShuffle&: ExtractToChange, PreferredExtractIndex: InsertIndex))
607	return false;
608
609	if (ExtractToChange) {
610	unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
611	ExtractElementInst *NewExtract =
612	translateExtract(ExtElt: ExtractToChange, NewIndex: CheapExtractIdx, Builder);
613	if (!NewExtract)
614	return false;
615	if (ExtractToChange == Ext0)
616	Ext0 = NewExtract;
617	else
618	Ext1 = NewExtract;
619	}
620
621	if (Pred != CmpInst::BAD_ICMP_PREDICATE)
622	foldExtExtCmp(Ext0, Ext1, I);
623	else
624	foldExtExtBinop(Ext0, Ext1, I);
625
626	Worklist.push(I: Ext0);
627	Worklist.push(I: Ext1);
628	return true;
629	}
630
631	/// Try to replace an extract + scalar fneg + insert with a vector fneg +
632	/// shuffle.
633	bool VectorCombine::foldInsExtFNeg(Instruction &I) {
634	// Match an insert (op (extract)) pattern.
635	Value *DestVec;
636	uint64_t Index;
637	Instruction *FNeg;
638	if (!match(V: &I, P: m_InsertElt(Val: m_Value(V&: DestVec), Elt: m_OneUse(SubPattern: m_Instruction(I&: FNeg)),
639	Idx: m_ConstantInt(V&: Index))))
640	return false;
641
642	// Note: This handles the canonical fneg instruction and "fsub -0.0, X".
643	Value *SrcVec;
644	Instruction *Extract;
645	if (!match(V: FNeg, P: m_FNeg(X: m_CombineAnd(
646	L: m_Instruction(I&: Extract),
647	R: m_ExtractElt(Val: m_Value(V&: SrcVec), Idx: m_SpecificInt(V: Index))))))
648	return false;
649
650	// TODO: We could handle this with a length-changing shuffle.
651	auto *VecTy = cast<FixedVectorType>(Val: I.getType());
652	if (SrcVec->getType() != VecTy)
653	return false;
654
655	// Ignore bogus insert/extract index.
656	unsigned NumElts = VecTy->getNumElements();
657	if (Index >= NumElts)
658	return false;
659
660	// We are inserting the negated element into the same lane that we extracted
661	// from. This is equivalent to a select-shuffle that chooses all but the
662	// negated element from the destination vector.
663	SmallVector<int> Mask(NumElts);
664	std::iota(first: Mask.begin(), last: Mask.end(), value: `0`);
665	Mask [Index] = Index + NumElts;
666
667	Type *ScalarTy = VecTy->getScalarType();
668	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
669	InstructionCost OldCost =
670	TTI.getArithmeticInstrCost(Opcode: Instruction::FNeg, Ty: ScalarTy) +
671	TTI.getVectorInstrCost(I, Val: VecTy, CostKind, Index);
672
673	// If the extract has one use, it will be eliminated, so count it in the
674	// original cost. If it has more than one use, ignore the cost because it will
675	// be the same before/after.
676	if (Extract->hasOneUse())
677	OldCost += TTI.getVectorInstrCost(I: *Extract, Val: VecTy, CostKind, Index);
678
679	InstructionCost NewCost =
680	TTI.getArithmeticInstrCost(Opcode: Instruction::FNeg, Ty: VecTy) +
681	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Select, Tp: VecTy, Mask);
682
683	if (NewCost > OldCost)
684	return false;
685
686	// insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
687	// shuffle DestVec, (fneg SrcVec), Mask
688	Value *VecFNeg = Builder.CreateFNegFMF(V: SrcVec, FMFSource: FNeg);
689	Value *Shuf = Builder.CreateShuffleVector(V1: DestVec, V2: VecFNeg, Mask);
690	replaceValue(Old&: I, New&: *Shuf);
691	return true;
692	}
693
694	/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
695	/// destination type followed by shuffle. This can enable further transforms by
696	/// moving bitcasts or shuffles together.
697	bool VectorCombine::foldBitcastShuffle(Instruction &I) {
698	Value V0, V1;
699	ArrayRef<int> Mask;
700	if (!match(V: &I, P: m_BitCast(Op: m_OneUse(
701	SubPattern: m_Shuffle(v1: m_Value(V&: V0), v2: m_Value(V&: V1), mask: m_Mask (Mask))))))
702	return false;
703
704	// 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
705	// scalable type is unknown; Second, we cannot reason if the narrowed shuffle
706	// mask for scalable type is a splat or not.
707	// 2) Disallow non-vector casts.
708	// TODO: We could allow any shuffle.
709	auto *DestTy = dyn_cast<FixedVectorType>(Val: I.getType());
710	auto *SrcTy = dyn_cast<FixedVectorType>(Val: V0->getType());
711	if (!DestTy \|\| !SrcTy)
712	return false;
713
714	unsigned DestEltSize = DestTy->getScalarSizeInBits();
715	unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
716	if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != `0`)
717	return false;
718
719	bool IsUnary = isa<UndefValue>(Val: V1);
720
721	// For binary shuffles, only fold bitcast(shuffle(X,Y))
722	// if it won't increase the number of bitcasts.
723	if (!IsUnary) {
724	auto *BCTy0 = dyn_cast<FixedVectorType>(Val: peekThroughBitcasts(V: V0)->getType());
725	auto *BCTy1 = dyn_cast<FixedVectorType>(Val: peekThroughBitcasts(V: V1)->getType());
726	if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
727	!(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
728	return false;
729	}
730
731	SmallVector<int, `16`> NewMask;
732	if (DestEltSize <= SrcEltSize) {
733	// The bitcast is from wide to narrow/equal elements. The shuffle mask can
734	// always be expanded to the equivalent form choosing narrower elements.
735	assert(SrcEltSize % DestEltSize == `0` && "Unexpected shuffle mask");
736	unsigned ScaleFactor = SrcEltSize / DestEltSize;
737	narrowShuffleMaskElts(Scale: ScaleFactor, Mask, ScaledMask&: NewMask);
738	} else {
739	// The bitcast is from narrow elements to wide elements. The shuffle mask
740	// must choose consecutive elements to allow casting first.
741	assert(DestEltSize % SrcEltSize == `0` && "Unexpected shuffle mask");
742	unsigned ScaleFactor = DestEltSize / SrcEltSize;
743	if (!widenShuffleMaskElts(Scale: ScaleFactor, Mask, ScaledMask&: NewMask))
744	return false;
745	}
746
747	// Bitcast the shuffle src - keep its original width but using the destination
748	// scalar type.
749	unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
750	auto *NewShuffleTy =
751	FixedVectorType::get(ElementType: DestTy->getScalarType(), NumElts: NumSrcElts);
752	auto *OldShuffleTy =
753	FixedVectorType::get(ElementType: SrcTy->getScalarType(), NumElts: Mask.size());
754	unsigned NumOps = IsUnary ? `1` : `2`;
755
756	// The new shuffle must not cost more than the old shuffle.
757	TargetTransformInfo::TargetCostKind CK =
758	TargetTransformInfo::TCK_RecipThroughput;
759	TargetTransformInfo::ShuffleKind SK =
760	IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
761	: TargetTransformInfo::SK_PermuteTwoSrc;
762
763	InstructionCost DestCost =
764	TTI.getShuffleCost(Kind: SK, Tp: NewShuffleTy, Mask: NewMask, CostKind: CK) +
765	(NumOps * TTI.getCastInstrCost(Opcode: Instruction::BitCast, Dst: NewShuffleTy, Src: SrcTy,
766	CCH: TargetTransformInfo::CastContextHint::None,
767	CostKind: CK));
768	InstructionCost SrcCost =
769	TTI.getShuffleCost(Kind: SK, Tp: SrcTy, Mask, CostKind: CK) +
770	TTI.getCastInstrCost(Opcode: Instruction::BitCast, Dst: DestTy, Src: OldShuffleTy,
771	CCH: TargetTransformInfo::CastContextHint::None, CostKind: CK);
772	if (DestCost > SrcCost \|\| !DestCost.isValid())
773	return false;
774
775	// bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
776	++NumShufOfBitcast;
777	Value *CastV0 = Builder.CreateBitCast(V: peekThroughBitcasts(V: V0), DestTy: NewShuffleTy);
778	Value *CastV1 = Builder.CreateBitCast(V: peekThroughBitcasts(V: V1), DestTy: NewShuffleTy);
779	Value *Shuf = Builder.CreateShuffleVector(V1: CastV0, V2: CastV1, Mask: NewMask);
780	replaceValue(Old&: I, New&: *Shuf);
781	return true;
782	}
783
784	/// VP Intrinsics whose vector operands are both splat values may be simplified
785	/// into the scalar version of the operation and the result splatted. This
786	/// can lead to scalarization down the line.
787	bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
788	if (!isa<VPIntrinsic>(Val: I))
789	return false;
790	VPIntrinsic &VPI = cast<VPIntrinsic>(Val&: I);
791	Value *Op0 = VPI.getArgOperand(i: `0`);
792	Value *Op1 = VPI.getArgOperand(i: `1`);
793
794	if (!isSplatValue(V: Op0) \|\| !isSplatValue(V: Op1))
795	return false;
796
797	// Check getSplatValue early in this function, to avoid doing unnecessary
798	// work.
799	Value *ScalarOp0 = getSplatValue(V: Op0);
800	Value *ScalarOp1 = getSplatValue(V: Op1);
801	if (!ScalarOp0 \|\| !ScalarOp1)
802	return false;
803
804	// For the binary VP intrinsics supported here, the result on disabled lanes
805	// is a poison value. For now, only do this simplification if all lanes
806	// are active.
807	// TODO: Relax the condition that all lanes are active by using insertelement
808	// on inactive lanes.
809	auto IsAllTrueMask = [](Value *MaskVal) {
810	if (Value *SplattedVal = getSplatValue(V: MaskVal))
811	if (auto *ConstValue = dyn_cast<Constant>(Val: SplattedVal))
812	return ConstValue->isAllOnesValue();
813	return false;
814	};
815	if (!IsAllTrueMask (VPI.getArgOperand(i: `2`)))
816	return false;
817
818	// Check to make sure we support scalarization of the intrinsic
819	Intrinsic::ID IntrID = VPI.getIntrinsicID();
820	if (!VPBinOpIntrinsic::isVPBinOp(ID: IntrID))
821	return false;
822
823	// Calculate cost of splatting both operands into vectors and the vector
824	// intrinsic
825	VectorType *VecTy = cast<VectorType>(Val: VPI.getType());
826	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
827	SmallVector<int> Mask;
828	if (auto *FVTy = dyn_cast<FixedVectorType>(Val: VecTy))
829	Mask.resize(N: FVTy->getNumElements(), NV: `0`);
830	InstructionCost SplatCost =
831	TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: `0`) +
832	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, Tp: VecTy, Mask);
833
834	// Calculate the cost of the VP Intrinsic
835	SmallVector<Type *, `4`> Args;
836	for (Value *V : VPI.args())
837	Args.push_back(Elt: V->getType());
838	IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
839	InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind);
840	InstructionCost OldCost = `2` * SplatCost + VectorOpCost;
841
842	// Determine scalar opcode
843	std::optional<unsigned> FunctionalOpcode =
844	VPI.getFunctionalOpcode();
845	std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
846	if (!FunctionalOpcode) {
847	ScalarIntrID = VPI.getFunctionalIntrinsicID();
848	if (!ScalarIntrID)
849	return false;
850	}
851
852	// Calculate cost of scalarizing
853	InstructionCost ScalarOpCost = `0`;
854	if (ScalarIntrID) {
855	IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
856	ScalarOpCost = TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind);
857	} else {
858	ScalarOpCost =
859	TTI.getArithmeticInstrCost(Opcode: *FunctionalOpcode, Ty: VecTy->getScalarType());
860	}
861
862	// The existing splats may be kept around if other instructions use them.
863	InstructionCost CostToKeepSplats =
864	(SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
865	InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
866
867	LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
868	<< "\n");
869	LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
870	<< ", Cost of scalarizing:" << NewCost << "\n");
871
872	// We want to scalarize unless the vector variant actually has lower cost.
873	if (OldCost < NewCost \|\| !NewCost.isValid())
874	return false;
875
876	// Scalarize the intrinsic
877	ElementCount EC = cast<VectorType>(Val: Op0->getType())->getElementCount();
878	Value *EVL = VPI.getArgOperand(i: `3`);
879
880	// If the VP op might introduce UB or poison, we can scalarize it provided
881	// that we know the EVL > 0: If the EVL is zero, then the original VP op
882	// becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
883	// scalarizing it.
884	bool SafeToSpeculate;
885	if (ScalarIntrID)
886	SafeToSpeculate = Intrinsic::getAttributes(C&: I.getContext(), id: *ScalarIntrID)
887	.hasFnAttr(Kind: Attribute::AttrKind::Speculatable);
888	else
889	SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
890	Opcode: FunctionalOpcode, Inst: &VPI, CtxI: nullptr*, AC: &AC, DT: &DT);
891	if (!SafeToSpeculate &&
892	!isKnownNonZero(V: EVL, Q: SimplifyQuery (*DL, &DT, &AC, &VPI)))
893	return false;
894
895	Value *ScalarVal =
896	ScalarIntrID
897	? Builder.CreateIntrinsic(RetTy: VecTy->getScalarType(), ID: *ScalarIntrID,
898	Args: {ScalarOp0, ScalarOp1})
899	: Builder.CreateBinOp(Opc: (Instruction::BinaryOps)(*FunctionalOpcode),
900	LHS: ScalarOp0, RHS: ScalarOp1);
901
902	replaceValue(Old&: VPI, New&: *Builder.CreateVectorSplat(EC, V: ScalarVal));
903	return true;
904	}
905
906	/// Match a vector binop or compare instruction with at least one inserted
907	/// scalar operand and convert to scalar binop/cmp followed by insertelement.
908	bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
909	CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
910	Value Ins0, Ins1;
911	if (!match(V: &I, P: m_BinOp(L: m_Value(V&: Ins0), R: m_Value(V&: Ins1))) &&
912	!match(V: &I, P: m_Cmp(Pred, L: m_Value(V&: Ins0), R: m_Value(V&: Ins1))))
913	return false;
914
915	// Do not convert the vector condition of a vector select into a scalar
916	// condition. That may cause problems for codegen because of differences in
917	// boolean formats and register-file transfers.
918	// TODO: Can we account for that in the cost model?
919	bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
920	if (IsCmp)
921	for (User *U : I.users())
922	if (match(V: U, P: m_Select(C: m_Specific(V: &I), L: m_Value(), R: m_Value())))
923	return false;
924
925	// Match against one or both scalar values being inserted into constant
926	// vectors:
927	// vec_op VecC0, (inselt VecC1, V1, Index)
928	// vec_op (inselt VecC0, V0, Index), VecC1
929	// vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
930	// TODO: Deal with mismatched index constants and variable indexes?
931	Constant VecC0 = nullptr, VecC1 = nullptr;
932	Value V0 = nullptr, V1 = nullptr;
933	uint64_t Index0 = `0`, Index1 = `0`;
934	if (!match(V: Ins0, P: m_InsertElt(Val: m_Constant(C&: VecC0), Elt: m_Value(V&: V0),
935	Idx: m_ConstantInt(V&: Index0))) &&
936	!match(V: Ins0, P: m_Constant(C&: VecC0)))
937	return false;
938	if (!match(V: Ins1, P: m_InsertElt(Val: m_Constant(C&: VecC1), Elt: m_Value(V&: V1),
939	Idx: m_ConstantInt(V&: Index1))) &&
940	!match(V: Ins1, P: m_Constant(C&: VecC1)))
941	return false;
942
943	bool IsConst0 = !V0;
944	bool IsConst1 = !V1;
945	if (IsConst0 && IsConst1)
946	return false;
947	if (!IsConst0 && !IsConst1 && Index0 != Index1)
948	return false;
949
950	// Bail for single insertion if it is a load.
951	// TODO: Handle this once getVectorInstrCost can cost for load/stores.
952	auto *I0 = dyn_cast_or_null<Instruction>(Val: V0);
953	auto *I1 = dyn_cast_or_null<Instruction>(Val: V1);
954	if ((IsConst0 && I1 && I1->mayReadFromMemory()) \|\|
955	(IsConst1 && I0 && I0->mayReadFromMemory()))
956	return false;
957
958	uint64_t Index = IsConst0 ? Index1 : Index0;
959	Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
960	Type *VecTy = I.getType();
961	assert(VecTy->isVectorTy() &&
962	(IsConst0 \|\| IsConst1 \|\| V0->getType() == V1->getType()) &&
963	(ScalarTy->isIntegerTy() \|\| ScalarTy->isFloatingPointTy() \|\|
964	ScalarTy->isPointerTy()) &&
965	"Unexpected types for insert element into binop or cmp");
966
967	unsigned Opcode = I.getOpcode();
968	InstructionCost ScalarOpCost, VectorOpCost;
969	if (IsCmp) {
970	CmpInst::Predicate Pred = cast<CmpInst>(Val&: I).getPredicate();
971	ScalarOpCost = TTI.getCmpSelInstrCost(
972	Opcode, ValTy: ScalarTy, CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), VecPred: Pred);
973	VectorOpCost = TTI.getCmpSelInstrCost(
974	Opcode, ValTy: VecTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VecTy), VecPred: Pred);
975	} else {
976	ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: ScalarTy);
977	VectorOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: VecTy);
978	}
979
980	// Get cost estimate for the insert element. This cost will factor into
981	// both sequences.
982	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
983	InstructionCost InsertCost = TTI.getVectorInstrCost(
984	Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index);
985	InstructionCost OldCost =
986	(IsConst0 ? `0` : InsertCost) + (IsConst1 ? `0` : InsertCost) + VectorOpCost;
987	InstructionCost NewCost = ScalarOpCost + InsertCost +
988	(IsConst0 ? `0` : !Ins0->hasOneUse() * InsertCost) +
989	(IsConst1 ? `0` : !Ins1->hasOneUse() * InsertCost);
990
991	// We want to scalarize unless the vector variant actually has lower cost.
992	if (OldCost < NewCost \|\| !NewCost.isValid())
993	return false;
994
995	// vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
996	// inselt NewVecC, (scalar_op V0, V1), Index
997	if (IsCmp)
998	++NumScalarCmp;
999	else
1000	++NumScalarBO;
1001
1002	// For constant cases, extract the scalar element, this should constant fold.
1003	if (IsConst0)
1004	V0 = ConstantExpr::getExtractElement(Vec: VecC0, Idx: Builder.getInt64(C: Index));
1005	if (IsConst1)
1006	V1 = ConstantExpr::getExtractElement(Vec: VecC1, Idx: Builder.getInt64(C: Index));
1007
1008	Value *Scalar =
1009	IsCmp ? Builder.CreateCmp(Pred, LHS: V0, RHS: V1)
1010	: Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opcode, LHS: V0, RHS: V1);
1011
1012	Scalar->setName(I.getName() + ".scalar");
1013
1014	// All IR flags are safe to back-propagate. There is no potential for extra
1015	// poison to be created by the scalar instruction.
1016	if (auto *ScalarInst = dyn_cast<Instruction>(Val: Scalar))
1017	ScalarInst->copyIRFlags(V: &I);
1018
1019	// Fold the vector constants in the original vectors into a new base vector.
1020	Value *NewVecC =
1021	IsCmp ? Builder.CreateCmp(Pred, LHS: VecC0, RHS: VecC1)
1022	: Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opcode, LHS: VecC0, RHS: VecC1);
1023	Value *Insert = Builder.CreateInsertElement(Vec: NewVecC, NewElt: Scalar, Idx: Index);
1024	replaceValue(Old&: I, New&: *Insert);
1025	return true;
1026	}
1027
1028	/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1029	/// a vector into vector operations followed by extract. Note: The SLP pass
1030	/// may miss this pattern because of implementation problems.
1031	bool VectorCombine::foldExtractedCmps(Instruction &I) {
1032	// We are looking for a scalar binop of booleans.
1033	// binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1034	if (!I.isBinaryOp() \|\| !I.getType()->isIntegerTy(Bitwidth: `1`))
1035	return false;
1036
1037	// The compare predicates should match, and each compare should have a
1038	// constant operand.
1039	// TODO: Relax the one-use constraints.
1040	Value B0 = I.getOperand(i: `0`), B1 = I.getOperand(i: `1`);
1041	Instruction I0, I1;
1042	Constant C0, C1;
1043	CmpInst::Predicate P0, P1;
1044	if (!match(V: B0, P: m_OneUse(SubPattern: m_Cmp(Pred&: P0, L: m_Instruction(I&: I0), R: m_Constant(C&: C0)))) \|\|
1045	!match(V: B1, P: m_OneUse(SubPattern: m_Cmp(Pred&: P1, L: m_Instruction(I&: I1), R: m_Constant(C&: C1)))) \|\|
1046	P0 != P1)
1047	return false;
1048
1049	// The compare operands must be extracts of the same vector with constant
1050	// extract indexes.
1051	// TODO: Relax the one-use constraints.
1052	Value *X;
1053	uint64_t Index0, Index1;
1054	if (!match(V: I0, P: m_OneUse(SubPattern: m_ExtractElt(Val: m_Value(V&: X), Idx: m_ConstantInt(V&: Index0)))) \|\|
1055	!match(V: I1, P: m_OneUse(SubPattern: m_ExtractElt(Val: m_Specific(V: X), Idx: m_ConstantInt(V&: Index1)))))
1056	return false;
1057
1058	auto *Ext0 = cast<ExtractElementInst>(Val: I0);
1059	auto *Ext1 = cast<ExtractElementInst>(Val: I1);
1060	ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
1061	if (!ConvertToShuf)
1062	return false;
1063
1064	// The original scalar pattern is:
1065	// binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1066	CmpInst::Predicate Pred = P0;
1067	unsigned CmpOpcode = CmpInst::isFPPredicate(P: Pred) ? Instruction::FCmp
1068	: Instruction::ICmp;
1069	auto *VecTy = dyn_cast<FixedVectorType>(Val: X->getType());
1070	if (!VecTy)
1071	return false;
1072
1073	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1074	InstructionCost OldCost =
1075	TTI.getVectorInstrCost(I: *Ext0, Val: VecTy, CostKind, Index: Index0);
1076	OldCost += TTI.getVectorInstrCost(I: *Ext1, Val: VecTy, CostKind, Index: Index1);
1077	OldCost +=
1078	TTI.getCmpSelInstrCost(Opcode: CmpOpcode, ValTy: I0->getType(),
1079	CondTy: CmpInst::makeCmpResultType(opnd_type: I0->getType()), VecPred: Pred) *
1080	`2`;
1081	OldCost += TTI.getArithmeticInstrCost(Opcode: I.getOpcode(), Ty: I.getType());
1082
1083	// The proposed vector pattern is:
1084	// vcmp = cmp Pred X, VecC
1085	// ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1086	int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1087	int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1088	auto *CmpTy = cast<FixedVectorType>(Val: CmpInst::makeCmpResultType(opnd_type: X->getType()));
1089	InstructionCost NewCost = TTI.getCmpSelInstrCost(
1090	Opcode: CmpOpcode, ValTy: X->getType(), CondTy: CmpInst::makeCmpResultType(opnd_type: X->getType()), VecPred: Pred);
1091	SmallVector<int, `32`> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1092	ShufMask [CheapIndex] = ExpensiveIndex;
1093	NewCost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: CmpTy,
1094	Mask: ShufMask);
1095	NewCost += TTI.getArithmeticInstrCost(Opcode: I.getOpcode(), Ty: CmpTy);
1096	NewCost += TTI.getVectorInstrCost(I: *Ext0, Val: CmpTy, CostKind, Index: CheapIndex);
1097
1098	// Aggressively form vector ops if the cost is equal because the transform
1099	// may enable further optimization.
1100	// Codegen can reverse this transform (scalarize) if it was not profitable.
1101	if (OldCost < NewCost \|\| !NewCost.isValid())
1102	return false;
1103
1104	// Create a vector constant from the 2 scalar constants.
1105	SmallVector<Constant *, `32`> CmpC(VecTy->getNumElements(),
1106	PoisonValue::get(T: VecTy->getElementType()));
1107	CmpC [Index0] = C0;
1108	CmpC [Index1] = C1;
1109	Value *VCmp = Builder.CreateCmp(Pred, LHS: X, RHS: ConstantVector::get(V: CmpC));
1110
1111	Value *Shuf = createShiftShuffle(Vec: VCmp, OldIndex: ExpensiveIndex, NewIndex: CheapIndex, Builder);
1112	Value *VecLogic = Builder.CreateBinOp(Opc: cast<BinaryOperator>(Val&: I).getOpcode(),
1113	LHS: VCmp, RHS: Shuf);
1114	Value *NewExt = Builder.CreateExtractElement(Vec: VecLogic, Idx: CheapIndex);
1115	replaceValue(Old&: I, New&: *NewExt);
1116	++NumVecCmpBO;
1117	return true;
1118	}
1119
1120	// Check if memory loc modified between two instrs in the same BB
1121	static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1122	BasicBlock::iterator End,
1123	const MemoryLocation &Loc, AAResults &AA) {
1124	unsigned NumScanned = `0`;
1125	return std::any_of(first: Begin, last: End, pred: [&](const Instruction &Instr) {
1126	return isModSet(MRI: AA.getModRefInfo(I: &Instr, OptLoc: Loc)) \|\|
1127	++NumScanned > MaxInstrsToScan;
1128	});
1129	}
1130
1131	namespace {
1132	/// Helper class to indicate whether a vector index can be safely scalarized and
1133	/// if a freeze needs to be inserted.
1134	class ScalarizationResult {
1135	enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1136
1137	StatusTy Status;
1138	Value *ToFreeze;
1139
1140	ScalarizationResult(StatusTy Status, Value ToFreeze = nullptr*)
1141	: Status(Status), ToFreeze(ToFreeze) {}
1142
1143	public:
1144	ScalarizationResult(const ScalarizationResult &Other) = default;
1145	~ScalarizationResult() {
1146	assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1147	}
1148
1149	static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1150	static ScalarizationResult safe() { return {StatusTy::Safe}; }
1151	static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1152	return {StatusTy::SafeWithFreeze, ToFreeze};
1153	}
1154
1155	/// Returns true if the index can be scalarize without requiring a freeze.
1156	bool isSafe() const { return Status == StatusTy::Safe; }
1157	/// Returns true if the index cannot be scalarized.
1158	bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1159	/// Returns true if the index can be scalarize, but requires inserting a
1160	/// freeze.
1161	bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1162
1163	/// Reset the state of Unsafe and clear ToFreze if set.
1164	void discard() {
1165	ToFreeze = nullptr;
1166	Status = StatusTy::Unsafe;
1167	}
1168
1169	/// Freeze the ToFreeze and update the use in \p User to use it.
1170	void freeze(IRBuilder<> &Builder, Instruction &UserI) {
1171	assert(isSafeWithFreeze() &&
1172	"should only be used when freezing is required");
1173	assert(is_contained(ToFreeze->users(), &UserI) &&
1174	"UserI must be a user of ToFreeze");
1175	IRBuilder<>::InsertPointGuard Guard(Builder);
1176	Builder.SetInsertPoint(cast<Instruction>(Val: &UserI));
1177	Value *Frozen =
1178	Builder.CreateFreeze(V: ToFreeze, Name: ToFreeze->getName() + ".frozen");
1179	for (Use &U : make_early_inc_range(Range: (UserI.operands())))
1180	if (U.get() == ToFreeze)
1181	U.set(Frozen);
1182
1183	ToFreeze = nullptr;
1184	}
1185	};
1186	} // namespace
1187
1188	/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1189	/// Idx. \p Idx must access a valid vector element.
1190	static ScalarizationResult canScalarizeAccess(VectorType VecTy, Value Idx,
1191	Instruction *CtxI,
1192	AssumptionCache &AC,
1193	const DominatorTree &DT) {
1194	// We do checks for both fixed vector types and scalable vector types.
1195	// This is the number of elements of fixed vector types,
1196	// or the minimum number of elements of scalable vector types.
1197	uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1198
1199	if (auto *C = dyn_cast<ConstantInt>(Val: Idx)) {
1200	if (C->getValue().ult(RHS: NumElements))
1201	return ScalarizationResult::safe();
1202	return ScalarizationResult::unsafe();
1203	}
1204
1205	unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1206	APInt Zero(IntWidth, `0`);
1207	APInt MaxElts(IntWidth, NumElements);
1208	ConstantRange ValidIndices(Zero, MaxElts);
1209	ConstantRange IdxRange(IntWidth, true);
1210
1211	if (isGuaranteedNotToBePoison(V: Idx, AC: &AC)) {
1212	if (ValidIndices.contains(CR: computeConstantRange(V: Idx, / ForSigned / false,
1213	UseInstrInfo: true, AC: &AC, CtxI, DT: &DT)))
1214	return ScalarizationResult::safe();
1215	return ScalarizationResult::unsafe();
1216	}
1217
1218	// If the index may be poison, check if we can insert a freeze before the
1219	// range of the index is restricted.
1220	Value *IdxBase;
1221	ConstantInt *CI;
1222	if (match(V: Idx, P: m_And(L: m_Value(V&: IdxBase), R: m_ConstantInt(CI)))) {
1223	IdxRange = IdxRange.binaryAnd(Other: CI->getValue());
1224	} else if (match(V: Idx, P: m_URem(L: m_Value(V&: IdxBase), R: m_ConstantInt(CI)))) {
1225	IdxRange = IdxRange.urem(Other: CI->getValue());
1226	}
1227
1228	if (ValidIndices.contains(CR: IdxRange))
1229	return ScalarizationResult::safeWithFreeze(ToFreeze: IdxBase);
1230	return ScalarizationResult::unsafe();
1231	}
1232
1233	/// The memory operation on a vector of \p ScalarType had alignment of
1234	/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1235	/// alignment that will be valid for the memory operation on a single scalar
1236	/// element of the same type with index \p Idx.
1237	static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1238	Type ScalarType, Value Idx,
1239	const DataLayout &DL) {
1240	if (auto *C = dyn_cast<ConstantInt>(Val: Idx))
1241	return commonAlignment(A: VectorAlignment,
1242	Offset: C->getZExtValue() * DL.getTypeStoreSize(Ty: ScalarType));
1243	return commonAlignment(A: VectorAlignment, Offset: DL.getTypeStoreSize(Ty: ScalarType));
1244	}
1245
1246	// Combine patterns like:
1247	// %0 = load <4 x i32>, <4 x i32> %a*
1248	// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1249	// store <4 x i32> %1, <4 x i32> %a*
1250	// to:
1251	// %0 = bitcast <4 x i32>* %a to i32*
1252	// %1 = getelementptr inbounds i32, i32 %0, i64 0, i64 1*
1253	// store i32 %b, i32 %1*
1254	bool VectorCombine::foldSingleElementStore(Instruction &I) {
1255	auto *SI = cast<StoreInst>(Val: &I);
1256	if (!SI->isSimple() \|\| !isa<VectorType>(Val: SI->getValueOperand()->getType()))
1257	return false;
1258
1259	// TODO: Combine more complicated patterns (multiple insert) by referencing
1260	// TargetTransformInfo.
1261	Instruction *Source;
1262	Value *NewElement;
1263	Value *Idx;
1264	if (!match(V: SI->getValueOperand(),
1265	P: m_InsertElt(Val: m_Instruction(I&: Source), Elt: m_Value(V&: NewElement),
1266	Idx: m_Value(V&: Idx))))
1267	return false;
1268
1269	if (auto *Load = dyn_cast<LoadInst>(Val: Source)) {
1270	auto VecTy = cast<VectorType>(Val: SI->getValueOperand()->getType());
1271	Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1272	// Don't optimize for atomic/volatile load or store. Ensure memory is not
1273	// modified between, vector type matches store size, and index is inbounds.
1274	if (!Load->isSimple() \|\| Load->getParent() != SI->getParent() \|\|
1275	!DL->typeSizeEqualsStoreSize(Ty: Load->getType()->getScalarType()) \|\|
1276	SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1277	return false;
1278
1279	auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, CtxI: Load, AC, DT);
1280	if (ScalarizableIdx.isUnsafe() \|\|
1281	isMemModifiedBetween(Begin: Load->getIterator(), End: SI->getIterator(),
1282	Loc: MemoryLocation::get(SI), AA))
1283	return false;
1284
1285	if (ScalarizableIdx.isSafeWithFreeze())
1286	ScalarizableIdx.freeze(Builder, UserI&: *cast<Instruction>(Val: Idx));
1287	Value *GEP = Builder.CreateInBoundsGEP(
1288	Ty: SI->getValueOperand()->getType(), Ptr: SI->getPointerOperand(),
1289	IdxList: {ConstantInt::get(Ty: Idx->getType(), V: `0`), Idx});
1290	StoreInst *NSI = Builder.CreateStore(Val: NewElement, Ptr: GEP);
1291	NSI->copyMetadata(SrcInst: *SI);
1292	Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1293	VectorAlignment: std::max(a: SI->getAlign(), b: Load->getAlign()), ScalarType: NewElement->getType(), Idx,
1294	DL: *DL);
1295	NSI->setAlignment(ScalarOpAlignment);
1296	replaceValue(Old&: I, New&: *NSI);
1297	eraseInstruction(I);
1298	return true;
1299	}
1300
1301	return false;
1302	}
1303
1304	/// Try to scalarize vector loads feeding extractelement instructions.
1305	bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
1306	Value *Ptr;
1307	if (!match(V: &I, P: m_Load(Op: m_Value(V&: Ptr))))
1308	return false;
1309
1310	auto *VecTy = cast<VectorType>(Val: I.getType());
1311	auto *LI = cast<LoadInst>(Val: &I);
1312	if (LI->isVolatile() \|\| !DL->typeSizeEqualsStoreSize(Ty: VecTy->getScalarType()))
1313	return false;
1314
1315	InstructionCost OriginalCost =
1316	TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: LI->getAlign(),
1317	AddressSpace: LI->getPointerAddressSpace());
1318	InstructionCost ScalarizedCost = `0`;
1319
1320	Instruction *LastCheckedInst = LI;
1321	unsigned NumInstChecked = `0`;
1322	DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1323	auto FailureGuard = make_scope_exit(F: [&]() {
1324	// If the transform is aborted, discard the ScalarizationResults.
1325	for (auto &Pair : NeedFreeze)
1326	Pair.second.discard();
1327	});
1328
1329	// Check if all users of the load are extracts with no memory modifications
1330	// between the load and the extract. Compute the cost of both the original
1331	// code and the scalarized version.
1332	for (User *U : LI->users()) {
1333	auto *UI = dyn_cast<ExtractElementInst>(Val: U);
1334	if (!UI \|\| UI->getParent() != LI->getParent())
1335	return false;
1336
1337	// Check if any instruction between the load and the extract may modify
1338	// memory.
1339	if (LastCheckedInst->comesBefore(Other: UI)) {
1340	for (Instruction &I :
1341	make_range(x: std::next(x: LI->getIterator()), y: UI->getIterator())) {
1342	// Bail out if we reached the check limit or the instruction may write
1343	// to memory.
1344	if (NumInstChecked == MaxInstrsToScan \|\| I.mayWriteToMemory())
1345	return false;
1346	NumInstChecked++;
1347	}
1348	LastCheckedInst = UI;
1349	}
1350
1351	auto ScalarIdx = canScalarizeAccess(VecTy, Idx: UI->getOperand(i_nocapture: `1`), CtxI: &I, AC, DT);
1352	if (ScalarIdx.isUnsafe())
1353	return false;
1354	if (ScalarIdx.isSafeWithFreeze()) {
1355	NeedFreeze.try_emplace(Key: UI, Args&: ScalarIdx);
1356	ScalarIdx.discard();
1357	}
1358
1359	auto *Index = dyn_cast<ConstantInt>(Val: UI->getOperand(i_nocapture: `1`));
1360	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1361	OriginalCost +=
1362	TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind,
1363	Index: Index ? Index->getZExtValue() : -`1`);
1364	ScalarizedCost +=
1365	TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy->getElementType(),
1366	Alignment: Align (`1`), AddressSpace: LI->getPointerAddressSpace());
1367	ScalarizedCost += TTI.getAddressComputationCost(Ty: VecTy->getElementType());
1368	}
1369
1370	if (ScalarizedCost >= OriginalCost)
1371	return false;
1372
1373	// Replace extracts with narrow scalar loads.
1374	for (User *U : LI->users()) {
1375	auto *EI = cast<ExtractElementInst>(Val: U);
1376	Value *Idx = EI->getOperand(i_nocapture: `1`);
1377
1378	// Insert 'freeze' for poison indexes.
1379	auto It = NeedFreeze.find(Val: EI);
1380	if (It != NeedFreeze.end())
1381	It ->second.freeze(Builder, UserI&: *cast<Instruction>(Val: Idx));
1382
1383	Builder.SetInsertPoint(EI);
1384	Value *GEP =
1385	Builder.CreateInBoundsGEP(Ty: VecTy, Ptr, IdxList: {Builder.getInt32(C: `0`), Idx});
1386	auto *NewLoad = cast<LoadInst>(Val: Builder.CreateLoad(
1387	Ty: VecTy->getElementType(), Ptr: GEP, Name: EI->getName() + ".scalar"));
1388
1389	Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1390	VectorAlignment: LI->getAlign(), ScalarType: VecTy->getElementType(), Idx, DL: *DL);
1391	NewLoad->setAlignment(ScalarOpAlignment);
1392
1393	replaceValue(Old&: EI, New&: NewLoad);
1394	}
1395
1396	FailureGuard.release();
1397	return true;
1398	}
1399
1400	/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
1401	bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
1402	BinaryOperator B0, B1;
1403	ArrayRef<int> OldMask;
1404	if (!match(V: &I, P: m_Shuffle(v1: m_OneUse(SubPattern: m_BinOp(I&: B0)), v2: m_OneUse(SubPattern: m_BinOp(I&: B1)),
1405	mask: m_Mask (OldMask))))
1406	return false;
1407
1408	// Don't introduce poison into div/rem.
1409	if (any_of(Range&: OldMask, P: [](int M) { return M == PoisonMaskElem; }) &&
1410	B0->isIntDivRem())
1411	return false;
1412
1413	// TODO: Add support for addlike etc.
1414	Instruction::BinaryOps Opcode = B0->getOpcode();
1415	if (Opcode != B1->getOpcode())
1416	return false;
1417
1418	auto *ShuffleDstTy = dyn_cast<FixedVectorType>(Val: I.getType());
1419	auto *BinOpTy = dyn_cast<FixedVectorType>(Val: B0->getType());
1420	if (!ShuffleDstTy \|\| !BinOpTy)
1421	return false;
1422
1423	unsigned NumSrcElts = BinOpTy->getNumElements();
1424
1425	// If we have something like "add X, Y" and "add Z, X", swap ops to match.
1426	Value X = B0->getOperand(i_nocapture: `0`), Y = B0->getOperand(i_nocapture: `1`);
1427	Value Z = B1->getOperand(i_nocapture: `0`), W = B1->getOperand(i_nocapture: `1`);
1428	if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W &&
1429	(X == W \|\| Y == Z))
1430	std::swap(a&: X, b&: Y);
1431
1432	auto ConvertToUnary = [NumSrcElts](int &M) {
1433	if (M >= (int)NumSrcElts)
1434	M -= NumSrcElts;
1435	};
1436
1437	SmallVector<int> NewMask0(OldMask.begin(), OldMask.end());
1438	TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
1439	if (X == Z) {
1440	llvm::for_each(Range&: NewMask0, F: ConvertToUnary);
1441	SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
1442	Z = PoisonValue::get(T: BinOpTy);
1443	}
1444
1445	SmallVector<int> NewMask1(OldMask.begin(), OldMask.end());
1446	TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
1447	if (Y == W) {
1448	llvm::for_each(Range&: NewMask1, F: ConvertToUnary);
1449	SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
1450	W = PoisonValue::get(T: BinOpTy);
1451	}
1452
1453	// Try to replace a binop with a shuffle if the shuffle is not costly.
1454	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1455
1456	InstructionCost OldCost =
1457	TTI.getArithmeticInstrCost(Opcode: B0->getOpcode(), Ty: BinOpTy, CostKind) +
1458	TTI.getArithmeticInstrCost(Opcode: B1->getOpcode(), Ty: BinOpTy, CostKind) +
1459	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: BinOpTy,
1460	Mask: OldMask, CostKind, Index: `0`, SubTp: nullptr, Args: {B0, B1}, CxtI: &I);
1461
1462	InstructionCost NewCost =
1463	TTI.getShuffleCost(Kind: SK0, Tp: BinOpTy, Mask: NewMask0, CostKind, Index: `0`, SubTp: nullptr, Args: {X, Z}) +
1464	TTI.getShuffleCost(Kind: SK1, Tp: BinOpTy, Mask: NewMask1, CostKind, Index: `0`, SubTp: nullptr, Args: {Y, W}) +
1465	TTI.getArithmeticInstrCost(Opcode, Ty: ShuffleDstTy, CostKind);
1466
1467	LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
1468	<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1469	<< "\n");
1470	if (NewCost >= OldCost)
1471	return false;
1472
1473	Value *Shuf0 = Builder.CreateShuffleVector(V1: X, V2: Z, Mask: NewMask0);
1474	Value *Shuf1 = Builder.CreateShuffleVector(V1: Y, V2: W, Mask: NewMask1);
1475	Value *NewBO = Builder.CreateBinOp(Opc: Opcode, LHS: Shuf0, RHS: Shuf1);
1476
1477	// Intersect flags from the old binops.
1478	if (auto *NewInst = dyn_cast<Instruction>(Val: NewBO)) {
1479	NewInst->copyIRFlags(V: B0);
1480	NewInst->andIRFlags(V: B1);
1481	}
1482
1483	Worklist.pushValue(V: Shuf0);
1484	Worklist.pushValue(V: Shuf1);
1485	replaceValue(Old&: I, New&: *NewBO);
1486	return true;
1487	}
1488
1489	/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
1490	/// into "castop (shuffle)".
1491	bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
1492	Value V0, V1;
1493	ArrayRef<int> OldMask;
1494	if (!match(V: &I, P: m_Shuffle(v1: m_Value(V&: V0), v2: m_Value(V&: V1), mask: m_Mask (OldMask))))
1495	return false;
1496
1497	auto *C0 = dyn_cast<CastInst>(Val: V0);
1498	auto *C1 = dyn_cast<CastInst>(Val: V1);
1499	if (!C0 \|\| !C1)
1500	return false;
1501
1502	Instruction::CastOps Opcode = C0->getOpcode();
1503	if (C0->getSrcTy() != C1->getSrcTy())
1504	return false;
1505
1506	// Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
1507	if (Opcode != C1->getOpcode()) {
1508	if (match(V: C0, P: m_SExtLike(Op: m_Value())) && match(V: C1, P: m_SExtLike(Op: m_Value())))
1509	Opcode = Instruction::SExt;
1510	else
1511	return false;
1512	}
1513
1514	auto *ShuffleDstTy = dyn_cast<FixedVectorType>(Val: I.getType());
1515	auto *CastDstTy = dyn_cast<FixedVectorType>(Val: C0->getDestTy());
1516	auto *CastSrcTy = dyn_cast<FixedVectorType>(Val: C0->getSrcTy());
1517	if (!ShuffleDstTy \|\| !CastDstTy \|\| !CastSrcTy)
1518	return false;
1519
1520	unsigned NumSrcElts = CastSrcTy->getNumElements();
1521	unsigned NumDstElts = CastDstTy->getNumElements();
1522	assert((NumDstElts == NumSrcElts \|\| Opcode == Instruction::BitCast) &&
1523	"Only bitcasts expected to alter src/dst element counts");
1524
1525	// Check for bitcasting of unscalable vector types.
1526	// e.g. <32 x i40> -> <40 x i32>
1527	if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != `0` &&
1528	(NumDstElts % NumSrcElts) != `0`)
1529	return false;
1530
1531	SmallVector<int, `16`> NewMask;
1532	if (NumSrcElts >= NumDstElts) {
1533	// The bitcast is from wide to narrow/equal elements. The shuffle mask can
1534	// always be expanded to the equivalent form choosing narrower elements.
1535	assert(NumSrcElts % NumDstElts == `0` && "Unexpected shuffle mask");
1536	unsigned ScaleFactor = NumSrcElts / NumDstElts;
1537	narrowShuffleMaskElts(Scale: ScaleFactor, Mask: OldMask, ScaledMask&: NewMask);
1538	} else {
1539	// The bitcast is from narrow elements to wide elements. The shuffle mask
1540	// must choose consecutive elements to allow casting first.
1541	assert(NumDstElts % NumSrcElts == `0` && "Unexpected shuffle mask");
1542	unsigned ScaleFactor = NumDstElts / NumSrcElts;
1543	if (!widenShuffleMaskElts(Scale: ScaleFactor, Mask: OldMask, ScaledMask&: NewMask))
1544	return false;
1545	}
1546
1547	auto *NewShuffleDstTy =
1548	FixedVectorType::get(ElementType: CastSrcTy->getScalarType(), NumElts: NewMask.size());
1549
1550	// Try to replace a castop with a shuffle if the shuffle is not costly.
1551	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1552
1553	InstructionCost CostC0 =
1554	TTI.getCastInstrCost(Opcode: C0->getOpcode(), Dst: CastDstTy, Src: CastSrcTy,
1555	CCH: TTI::CastContextHint::None, CostKind);
1556	InstructionCost CostC1 =
1557	TTI.getCastInstrCost(Opcode: C1->getOpcode(), Dst: CastDstTy, Src: CastSrcTy,
1558	CCH: TTI::CastContextHint::None, CostKind);
1559	InstructionCost OldCost = CostC0 + CostC1;
1560	OldCost +=
1561	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: CastDstTy,
1562	Mask: OldMask, CostKind, Index: `0`, SubTp: nullptr, Args: std::nullopt, CxtI: &I);
1563
1564	InstructionCost NewCost = TTI.getShuffleCost(
1565	Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: CastSrcTy, Mask: NewMask, CostKind);
1566	NewCost += TTI.getCastInstrCost(Opcode, Dst: ShuffleDstTy, Src: NewShuffleDstTy,
1567	CCH: TTI::CastContextHint::None, CostKind);
1568	if (!C0->hasOneUse())
1569	NewCost += CostC0;
1570	if (!C1->hasOneUse())
1571	NewCost += CostC1;
1572
1573	LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
1574	<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1575	<< "\n");
1576	if (NewCost > OldCost)
1577	return false;
1578
1579	Value *Shuf = Builder.CreateShuffleVector(V1: C0->getOperand(i_nocapture: `0`),
1580	V2: C1->getOperand(i_nocapture: `0`), Mask: NewMask);
1581	Value *Cast = Builder.CreateCast(Op: Opcode, V: Shuf, DestTy: ShuffleDstTy);
1582
1583	// Intersect flags from the old casts.
1584	if (auto *NewInst = dyn_cast<Instruction>(Val: Cast)) {
1585	NewInst->copyIRFlags(V: C0);
1586	NewInst->andIRFlags(V: C1);
1587	}
1588
1589	Worklist.pushValue(V: Shuf);
1590	replaceValue(Old&: I, New&: *Cast);
1591	return true;
1592	}
1593
1594	/// Try to convert "shuffle (shuffle x, undef), (shuffle y, undef)"
1595	/// into "shuffle x, y".
1596	bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
1597	Value V0, V1;
1598	UndefValue U0, U1;
1599	ArrayRef<int> OuterMask, InnerMask0, InnerMask1;
1600	if (!match(V: &I, P: m_Shuffle(v1: m_OneUse(SubPattern: m_Shuffle(v1: m_Value(V&: V0), v2: m_UndefValue(U&: U0),
1601	mask: m_Mask (InnerMask0))),
1602	v2: m_OneUse(SubPattern: m_Shuffle(v1: m_Value(V&: V1), v2: m_UndefValue(U&: U1),
1603	mask: m_Mask (InnerMask1))),
1604	mask: m_Mask (OuterMask))))
1605	return false;
1606
1607	auto *ShufI0 = dyn_cast<Instruction>(Val: I.getOperand(i: `0`));
1608	auto *ShufI1 = dyn_cast<Instruction>(Val: I.getOperand(i: `1`));
1609	auto *ShuffleDstTy = dyn_cast<FixedVectorType>(Val: I.getType());
1610	auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(Val: V0->getType());
1611	auto *ShuffleImmTy = dyn_cast<FixedVectorType>(Val: I.getOperand(i: `0`)->getType());
1612	if (!ShuffleDstTy \|\| !ShuffleSrcTy \|\| !ShuffleImmTy \|\|
1613	V0->getType() != V1->getType())
1614	return false;
1615
1616	unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
1617	unsigned NumImmElts = ShuffleImmTy->getNumElements();
1618
1619	// Bail if either inner masks reference a RHS undef arg.
1620	if ((!isa<PoisonValue>(Val: U0) &&
1621	any_of(Range&: InnerMask0, P: [&](int M) { return M >= (int)NumSrcElts; })) \|\|
1622	(!isa<PoisonValue>(Val: U1) &&
1623	any_of(Range&: InnerMask1, P: [&](int M) { return M >= (int)NumSrcElts; })))
1624	return false;
1625
1626	// Merge shuffles - replace index to the RHS poison arg with PoisonMaskElem,
1627	SmallVector<int, `16`> NewMask(OuterMask.begin(), OuterMask.end());
1628	for (int &M : NewMask) {
1629	if (`0` <= M && M < (int)NumImmElts) {
1630	M = (InnerMask0 [M] >= (int)NumSrcElts) ? PoisonMaskElem : InnerMask0 [M];
1631	} else if (M >= (int)NumImmElts) {
1632	if (InnerMask1 [M - NumImmElts] >= (int)NumSrcElts)
1633	M = PoisonMaskElem;
1634	else
1635	M = InnerMask1 [M - NumImmElts] + (V0 == V1 ? `0` : NumSrcElts);
1636	}
1637	}
1638
1639	// Have we folded to an Identity shuffle?
1640	if (ShuffleVectorInst::isIdentityMask(Mask: NewMask, NumSrcElts)) {
1641	replaceValue(Old&: I, New&: *V0);
1642	return true;
1643	}
1644
1645	// Try to merge the shuffles if the new shuffle is not costly.
1646	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1647
1648	InstructionCost OldCost =
1649	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: ShuffleSrcTy,
1650	Mask: InnerMask0, CostKind, Index: `0`, SubTp: nullptr, Args: {V0, U0}, CxtI: ShufI0) +
1651	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: ShuffleSrcTy,
1652	Mask: InnerMask1, CostKind, Index: `0`, SubTp: nullptr, Args: {V1, U1}, CxtI: ShufI1) +
1653	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: ShuffleImmTy,
1654	Mask: OuterMask, CostKind, Index: `0`, SubTp: nullptr, Args: {ShufI0, ShufI1}, CxtI: &I);
1655
1656	InstructionCost NewCost =
1657	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: ShuffleSrcTy,
1658	Mask: NewMask, CostKind, Index: `0`, SubTp: nullptr, Args: {V0, V1});
1659
1660	LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
1661	<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1662	<< "\n");
1663	if (NewCost > OldCost)
1664	return false;
1665
1666	// Clear unused sources to poison.
1667	if (none_of(Range&: NewMask, P: [&](int M) { return `0` <= M && M < (int)NumSrcElts; }))
1668	V0 = PoisonValue::get(T: ShuffleSrcTy);
1669	if (none_of(Range&: NewMask, P: [&](int M) { return (int)NumSrcElts <= M; }))
1670	V1 = PoisonValue::get(T: ShuffleSrcTy);
1671
1672	Value *Shuf = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask: NewMask);
1673	replaceValue(Old&: I, New&: *Shuf);
1674	return true;
1675	}
1676
1677	using InstLane = std::pair<Use , int*>;
1678
1679	static InstLane lookThroughShuffles(Use U, int* Lane) {
1680	while (auto *SV = dyn_cast<ShuffleVectorInst>(Val: U->get())) {
1681	unsigned NumElts =
1682	cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: `0`)->getType())->getNumElements();
1683	int M = SV->getMaskValue(Elt: Lane);
1684	if (M < `0`)
1685	return {nullptr, PoisonMaskElem};
1686	if (static_cast<unsigned>(M) < NumElts) {
1687	U = &SV->getOperandUse(i: `0`);
1688	Lane = M;
1689	} else {
1690	U = &SV->getOperandUse(i: `1`);
1691	Lane = M - NumElts;
1692	}
1693	}
1694	return InstLane {U, Lane};
1695	}
1696
1697	static SmallVector<InstLane>
1698	generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
1699	SmallVector<InstLane> NItem;
1700	for (InstLane IL : Item) {
1701	auto [U, Lane] = IL;
1702	InstLane OpLane =
1703	U ? lookThroughShuffles(U: &cast<Instruction>(Val: U->get())->getOperandUse(i: Op),
1704	Lane)
1705	: InstLane {nullptr, PoisonMaskElem};
1706	NItem.emplace_back(Args&: OpLane);
1707	}
1708	return NItem;
1709	}
1710
1711	/// Detect concat of multiple values into a vector
1712	static bool isFreeConcat(ArrayRef<InstLane> Item,
1713	const TargetTransformInfo &TTI) {
1714	auto *Ty = cast<FixedVectorType>(Val: Item.front().first->get()->getType());
1715	unsigned NumElts = Ty->getNumElements();
1716	if (Item.size() == NumElts \|\| NumElts == `1` \|\| Item.size() % NumElts != `0`)
1717	return false;
1718
1719	// Check that the concat is free, usually meaning that the type will be split
1720	// during legalization.
1721	SmallVector<int, `16`> ConcatMask(NumElts * `2`);
1722	std::iota(first: ConcatMask.begin(), last: ConcatMask.end(), value: `0`);
1723	if (TTI.getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, Tp: Ty, Mask: ConcatMask,
1724	CostKind: TTI::TCK_RecipThroughput) != `0`)
1725	return false;
1726
1727	unsigned NumSlices = Item.size() / NumElts;
1728	// Currently we generate a tree of shuffles for the concats, which limits us
1729	// to a power2.
1730	if (!isPowerOf2_32(Value: NumSlices))
1731	return false;
1732	for (unsigned Slice = `0`; Slice < NumSlices; ++Slice) {
1733	Use SliceV = Item [Slice NumElts].first;
1734	if (!SliceV \|\| SliceV->get()->getType() != Ty)
1735	return false;
1736	for (unsigned Elt = `0`; Elt < NumElts; ++Elt) {
1737	auto [V, Lane] = Item [Slice * NumElts + Elt];
1738	if (Lane != static_cast<int>(Elt) \|\| SliceV->get() != V->get())
1739	return false;
1740	}
1741	}
1742	return true;
1743	}
1744
1745	static Value generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType Ty,
1746	const SmallPtrSet<Use *, `4`> &IdentityLeafs,
1747	const SmallPtrSet<Use *, `4`> &SplatLeafs,
1748	const SmallPtrSet<Use *, `4`> &ConcatLeafs,
1749	IRBuilder<> &Builder) {
1750	auto [FrontU, FrontLane] = Item.front();
1751
1752	if (IdentityLeafs.contains(Ptr: FrontU)) {
1753	return FrontU->get();
1754	}
1755	if (SplatLeafs.contains(Ptr: FrontU)) {
1756	SmallVector<int, `16`> Mask(Ty->getNumElements(), FrontLane);
1757	return Builder.CreateShuffleVector(V: FrontU->get(), Mask);
1758	}
1759	if (ConcatLeafs.contains(Ptr: FrontU)) {
1760	unsigned NumElts =
1761	cast<FixedVectorType>(Val: FrontU->get()->getType())->getNumElements();
1762	SmallVector<Value > Values(Item.size() / NumElts, nullptr*);
1763	for (unsigned S = `0`; S < Values.size(); ++S)
1764	Values [S] = Item [S * NumElts].first->get();
1765
1766	while (Values.size() > `1`) {
1767	NumElts *= `2`;
1768	SmallVector<int, `16`> Mask(NumElts, `0`);
1769	std::iota(first: Mask.begin(), last: Mask.end(), value: `0`);
1770	SmallVector<Value > NewValues(Values.size() / `2`, nullptr*);
1771	for (unsigned S = `0`; S < NewValues.size(); ++S)
1772	NewValues [S] =
1773	Builder.CreateShuffleVector(V1: Values [S * `2`], V2: Values [S * `2` + `1`], Mask);
1774	Values = NewValues;
1775	}
1776	return Values [`0`];
1777	}
1778
1779	auto *I = cast<Instruction>(Val: FrontU->get());
1780	auto *II = dyn_cast<IntrinsicInst>(Val: I);
1781	unsigned NumOps = I->getNumOperands() - (II ? `1` : `0`);
1782	SmallVector<Value *> Ops(NumOps);
1783	for (unsigned Idx = `0`; Idx < NumOps; Idx++) {
1784	if (II && isVectorIntrinsicWithScalarOpAtArg(ID: II->getIntrinsicID(), ScalarOpdIdx: Idx)) {
1785	Ops [Idx] = II->getOperand(i_nocapture: Idx);
1786	continue;
1787	}
1788	Ops [Idx] =
1789	generateNewInstTree(Item: generateInstLaneVectorFromOperand(Item, Op: Idx), Ty,
1790	IdentityLeafs, SplatLeafs, ConcatLeafs, Builder);
1791	}
1792
1793	SmallVector<Value *, `8`> ValueList;
1794	for (const auto &Lane : Item)
1795	if (Lane.first)
1796	ValueList.push_back(Elt: Lane.first->get());
1797
1798	Type *DstTy =
1799	FixedVectorType::get(ElementType: I->getType()->getScalarType(), NumElts: Ty->getNumElements());
1800	if (auto *BI = dyn_cast<BinaryOperator>(Val: I)) {
1801	auto *Value = Builder.CreateBinOp(Opc: (Instruction::BinaryOps)BI->getOpcode(),
1802	LHS: Ops [`0`], RHS: Ops [`1`]);
1803	propagateIRFlags(I: Value, VL: ValueList);
1804	return Value;
1805	}
1806	if (auto *CI = dyn_cast<CmpInst>(Val: I)) {
1807	auto *Value = Builder.CreateCmp(Pred: CI->getPredicate(), LHS: Ops [`0`], RHS: Ops [`1`]);
1808	propagateIRFlags(I: Value, VL: ValueList);
1809	return Value;
1810	}
1811	if (auto *SI = dyn_cast<SelectInst>(Val: I)) {
1812	auto *Value = Builder.CreateSelect(C: Ops [`0`], True: Ops [`1`], False: Ops [`2`], Name: "", MDFrom: SI);
1813	propagateIRFlags(I: Value, VL: ValueList);
1814	return Value;
1815	}
1816	if (auto *CI = dyn_cast<CastInst>(Val: I)) {
1817	auto *Value = Builder.CreateCast(Op: (Instruction::CastOps)CI->getOpcode(),
1818	V: Ops [`0`], DestTy: DstTy);
1819	propagateIRFlags(I: Value, VL: ValueList);
1820	return Value;
1821	}
1822	if (II) {
1823	auto *Value = Builder.CreateIntrinsic(RetTy: DstTy, ID: II->getIntrinsicID(), Args: Ops);
1824	propagateIRFlags(I: Value, VL: ValueList);
1825	return Value;
1826	}
1827	assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
1828	auto *Value =
1829	Builder.CreateUnOp(Opc: (Instruction::UnaryOps)I->getOpcode(), V: Ops [`0`]);
1830	propagateIRFlags(I: Value, VL: ValueList);
1831	return Value;
1832	}
1833
1834	// Starting from a shuffle, look up through operands tracking the shuffled index
1835	// of each lane. If we can simplify away the shuffles to identities then
1836	// do so.
1837	bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
1838	auto *Ty = dyn_cast<FixedVectorType>(Val: I.getType());
1839	if (!Ty \|\| I.use_empty())
1840	return false;
1841
1842	SmallVector<InstLane> Start(Ty->getNumElements());
1843	for (unsigned M = `0`, E = Ty->getNumElements(); M < E; ++M)
1844	Start [M] = lookThroughShuffles(U: &*I.use_begin(), Lane: M);
1845
1846	SmallVector<SmallVector<InstLane>> Worklist;
1847	Worklist.push_back(Elt: Start);
1848	SmallPtrSet<Use *, `4`> IdentityLeafs, SplatLeafs, ConcatLeafs;
1849	unsigned NumVisited = `0`;
1850
1851	while (!Worklist.empty()) {
1852	if (++NumVisited > MaxInstrsToScan)
1853	return false;
1854
1855	SmallVector<InstLane> Item = Worklist.pop_back_val();
1856	auto [FrontU, FrontLane] = Item.front();
1857
1858	// If we found an undef first lane then bail out to keep things simple.
1859	if (!FrontU)
1860	return false;
1861
1862	// Helper to peek through bitcasts to the same value.
1863	auto IsEquiv = [&](Value X, Value Y) {
1864	return X->getType() == Y->getType() &&
1865	peekThroughBitcasts(V: X) == peekThroughBitcasts(V: Y);
1866	};
1867
1868	// Look for an identity value.
1869	if (FrontLane == `0` &&
1870	cast<FixedVectorType>(Val: FrontU->get()->getType())->getNumElements() ==
1871	Ty->getNumElements() &&
1872	all_of(Range: drop_begin(RangeOrContainer: enumerate(First&: Item)), P: [IsEquiv, Item](const auto &E) {
1873	Value *FrontV = Item.front().first->get();
1874	return !E.value().first \|\| (IsEquiv(E.value().first->get(), FrontV) &&
1875	E.value().second == (int)E.index());
1876	})) {
1877	IdentityLeafs.insert(Ptr: FrontU);
1878	continue;
1879	}
1880	// Look for constants, for the moment only supporting constant splats.
1881	if (auto *C = dyn_cast<Constant>(Val: FrontU);
1882	C && C->getSplatValue() &&
1883	all_of(Range: drop_begin(RangeOrContainer&: Item), P: [Item](InstLane &IL) {
1884	Value *FrontV = Item.front().first->get();
1885	Use *U = IL.first;
1886	return !U \|\| U->get() == FrontV;
1887	})) {
1888	SplatLeafs.insert(Ptr: FrontU);
1889	continue;
1890	}
1891	// Look for a splat value.
1892	if (all_of(Range: drop_begin(RangeOrContainer&: Item), P: [Item](InstLane &IL) {
1893	auto [FrontU, FrontLane] = Item.front();
1894	auto [U, Lane] = IL;
1895	return !U \|\| (U->get() == FrontU->get() && Lane == FrontLane);
1896	})) {
1897	SplatLeafs.insert(Ptr: FrontU);
1898	continue;
1899	}
1900
1901	// We need each element to be the same type of value, and check that each
1902	// element has a single use.
1903	if (all_of(Range: drop_begin(RangeOrContainer&: Item), P: [Item](InstLane IL) {
1904	Value *FrontV = Item.front().first->get();
1905	if (!IL.first)
1906	return true;
1907	Value *V = IL.first->get();
1908	if (auto *I = dyn_cast<Instruction>(Val: V); I && !I->hasOneUse())
1909	return false;
1910	if (V->getValueID() != FrontV->getValueID())
1911	return false;
1912	if (auto *CI = dyn_cast<CmpInst>(Val: V))
1913	if (CI->getPredicate() != cast<CmpInst>(Val: FrontV)->getPredicate())
1914	return false;
1915	if (auto *CI = dyn_cast<CastInst>(Val: V))
1916	if (CI->getSrcTy() != cast<CastInst>(Val: FrontV)->getSrcTy())
1917	return false;
1918	if (auto *SI = dyn_cast<SelectInst>(Val: V))
1919	if (!isa<VectorType>(Val: SI->getOperand(i_nocapture: `0`)->getType()) \|\|
1920	SI->getOperand(i_nocapture: `0`)->getType() !=
1921	cast<SelectInst>(Val: FrontV)->getOperand(i_nocapture: `0`)->getType())
1922	return false;
1923	if (isa<CallInst>(Val: V) && !isa<IntrinsicInst>(Val: V))
1924	return false;
1925	auto *II = dyn_cast<IntrinsicInst>(Val: V);
1926	return !II \|\| (isa<IntrinsicInst>(Val: FrontV) &&
1927	II->getIntrinsicID() ==
1928	cast<IntrinsicInst>(Val: FrontV)->getIntrinsicID());
1929	})) {
1930	// Check the operator is one that we support.
1931	if (isa<BinaryOperator, CmpInst>(Val: FrontU)) {
1932	// We exclude div/rem in case they hit UB from poison lanes.
1933	if (auto *BO = dyn_cast<BinaryOperator>(Val: FrontU);
1934	BO && BO->isIntDivRem())
1935	return false;
1936	Worklist.push_back(Elt: generateInstLaneVectorFromOperand(Item, Op: `0`));
1937	Worklist.push_back(Elt: generateInstLaneVectorFromOperand(Item, Op: `1`));
1938	continue;
1939	} else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(Val: FrontU)) {
1940	Worklist.push_back(Elt: generateInstLaneVectorFromOperand(Item, Op: `0`));
1941	continue;
1942	} else if (auto *BitCast = dyn_cast<BitCastInst>(Val: FrontU)) {
1943	// TODO: Handle vector widening/narrowing bitcasts.
1944	auto *DstTy = dyn_cast<FixedVectorType>(Val: BitCast->getDestTy());
1945	auto *SrcTy = dyn_cast<FixedVectorType>(Val: BitCast->getSrcTy());
1946	if (DstTy && SrcTy &&
1947	SrcTy->getNumElements() == DstTy->getNumElements()) {
1948	Worklist.push_back(Elt: generateInstLaneVectorFromOperand(Item, Op: `0`));
1949	continue;
1950	}
1951	} else if (isa<SelectInst>(Val: FrontU)) {
1952	Worklist.push_back(Elt: generateInstLaneVectorFromOperand(Item, Op: `0`));
1953	Worklist.push_back(Elt: generateInstLaneVectorFromOperand(Item, Op: `1`));
1954	Worklist.push_back(Elt: generateInstLaneVectorFromOperand(Item, Op: `2`));
1955	continue;
1956	} else if (auto *II = dyn_cast<IntrinsicInst>(Val: FrontU);
1957	II && isTriviallyVectorizable(ID: II->getIntrinsicID())) {
1958	for (unsigned Op = `0`, E = II->getNumOperands() - `1`; Op < E; Op++) {
1959	if (isVectorIntrinsicWithScalarOpAtArg(ID: II->getIntrinsicID(), ScalarOpdIdx: Op)) {
1960	if (!all_of(Range: drop_begin(RangeOrContainer&: Item), P: [Item, Op](InstLane &IL) {
1961	Value *FrontV = Item.front().first->get();
1962	Use *U = IL.first;
1963	return !U \|\| (cast<Instruction>(Val: U->get())->getOperand(i: Op) ==
1964	cast<Instruction>(Val: FrontV)->getOperand(i: Op));
1965	}))
1966	return false;
1967	continue;
1968	}
1969	Worklist.push_back(Elt: generateInstLaneVectorFromOperand(Item, Op));
1970	}
1971	continue;
1972	}
1973	}
1974
1975	if (isFreeConcat(Item, TTI)) {
1976	ConcatLeafs.insert(Ptr: FrontU);
1977	continue;
1978	}
1979
1980	return false;
1981	}
1982
1983	if (NumVisited <= `1`)
1984	return false;
1985
1986	// If we got this far, we know the shuffles are superfluous and can be
1987	// removed. Scan through again and generate the new tree of instructions.
1988	Builder.SetInsertPoint(&I);
1989	Value *V = generateNewInstTree(Item: Start, Ty, IdentityLeafs, SplatLeafs,
1990	ConcatLeafs, Builder);
1991	replaceValue(Old&: I, New&: *V);
1992	return true;
1993	}
1994
1995	/// Given a commutative reduction, the order of the input lanes does not alter
1996	/// the results. We can use this to remove certain shuffles feeding the
1997	/// reduction, removing the need to shuffle at all.
1998	bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
1999	auto *II = dyn_cast<IntrinsicInst>(Val: &I);
2000	if (!II)
2001	return false;
2002	switch (II->getIntrinsicID()) {
2003	case Intrinsic::vector_reduce_add:
2004	case Intrinsic::vector_reduce_mul:
2005	case Intrinsic::vector_reduce_and:
2006	case Intrinsic::vector_reduce_or:
2007	case Intrinsic::vector_reduce_xor:
2008	case Intrinsic::vector_reduce_smin:
2009	case Intrinsic::vector_reduce_smax:
2010	case Intrinsic::vector_reduce_umin:
2011	case Intrinsic::vector_reduce_umax:
2012	break;
2013	default:
2014	return false;
2015	}
2016
2017	// Find all the inputs when looking through operations that do not alter the
2018	// lane order (binops, for example). Currently we look for a single shuffle,
2019	// and can ignore splat values.
2020	std::queue<Value *> Worklist;
2021	SmallPtrSet<Value *, `4`> Visited;
2022	ShuffleVectorInst Shuffle = nullptr*;
2023	if (auto *Op = dyn_cast<Instruction>(Val: I.getOperand(i: `0`)))
2024	Worklist.push(x: Op);
2025
2026	while (!Worklist.empty()) {
2027	Value *CV = Worklist.front();
2028	Worklist.pop();
2029	if (Visited.contains(Ptr: CV))
2030	continue;
2031
2032	// Splats don't change the order, so can be safely ignored.
2033	if (isSplatValue(V: CV))
2034	continue;
2035
2036	Visited.insert(Ptr: CV);
2037
2038	if (auto *CI = dyn_cast<Instruction>(Val: CV)) {
2039	if (CI->isBinaryOp()) {
2040	for (auto *Op : CI->operand_values())
2041	Worklist.push(x: Op);
2042	continue;
2043	} else if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: CI)) {
2044	if (Shuffle && Shuffle != SV)
2045	return false;
2046	Shuffle = SV;
2047	continue;
2048	}
2049	}
2050
2051	// Anything else is currently an unknown node.
2052	return false;
2053	}
2054
2055	if (!Shuffle)
2056	return false;
2057
2058	// Check all uses of the binary ops and shuffles are also included in the
2059	// lane-invariant operations (Visited should be the list of lanewise
2060	// instructions, including the shuffle that we found).
2061	for (auto *V : Visited)
2062	for (auto *U : V->users())
2063	if (!Visited.contains(Ptr: U) && U != &I)
2064	return false;
2065
2066	FixedVectorType *VecType =
2067	dyn_cast<FixedVectorType>(Val: II->getOperand(i_nocapture: `0`)->getType());
2068	if (!VecType)
2069	return false;
2070	FixedVectorType *ShuffleInputType =
2071	dyn_cast<FixedVectorType>(Val: Shuffle->getOperand(i_nocapture: `0`)->getType());
2072	if (!ShuffleInputType)
2073	return false;
2074	unsigned NumInputElts = ShuffleInputType->getNumElements();
2075
2076	// Find the mask from sorting the lanes into order. This is most likely to
2077	// become a identity or concat mask. Undef elements are pushed to the end.
2078	SmallVector<int> ConcatMask;
2079	Shuffle->getShuffleMask(Result&: ConcatMask);
2080	sort(C&: ConcatMask, Comp: [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
2081	// In the case of a truncating shuffle it's possible for the mask
2082	// to have an index greater than the size of the resulting vector.
2083	// This requires special handling.
2084	bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts;
2085	bool UsesSecondVec =
2086	any_of(Range&: ConcatMask, P: [&](int M) { return M >= (int)NumInputElts; });
2087
2088	FixedVectorType *VecTyForCost =
2089	(UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
2090	InstructionCost OldCost = TTI.getShuffleCost(
2091	Kind: UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
2092	Tp: VecTyForCost, Mask: Shuffle->getShuffleMask());
2093	InstructionCost NewCost = TTI.getShuffleCost(
2094	Kind: UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
2095	Tp: VecTyForCost, Mask: ConcatMask);
2096
2097	LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
2098	<< "\n");
2099	LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
2100	<< "\n");
2101	if (NewCost < OldCost) {
2102	Builder.SetInsertPoint(Shuffle);
2103	Value *NewShuffle = Builder.CreateShuffleVector(
2104	V1: Shuffle->getOperand(i_nocapture: `0`), V2: Shuffle->getOperand(i_nocapture: `1`), Mask: ConcatMask);
2105	LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
2106	replaceValue(Old&: Shuffle, New&: NewShuffle);
2107	}
2108
2109	// See if we can re-use foldSelectShuffle, getting it to reduce the size of
2110	// the shuffle into a nicer order, as it can ignore the order of the shuffles.
2111	return foldSelectShuffle(I&: Shuffle, FromReduction: true*);
2112	}
2113
2114	/// Determine if its more efficient to fold:
2115	/// reduce(trunc(x)) -> trunc(reduce(x)).
2116	/// reduce(sext(x)) -> sext(reduce(x)).
2117	/// reduce(zext(x)) -> zext(reduce(x)).
2118	bool VectorCombine::foldCastFromReductions(Instruction &I) {
2119	auto *II = dyn_cast<IntrinsicInst>(Val: &I);
2120	if (!II)
2121	return false;
2122
2123	bool TruncOnly = false;
2124	Intrinsic::ID IID = II->getIntrinsicID();
2125	switch (IID) {
2126	case Intrinsic::vector_reduce_add:
2127	case Intrinsic::vector_reduce_mul:
2128	TruncOnly = true;
2129	break;
2130	case Intrinsic::vector_reduce_and:
2131	case Intrinsic::vector_reduce_or:
2132	case Intrinsic::vector_reduce_xor:
2133	break;
2134	default:
2135	return false;
2136	}
2137
2138	unsigned ReductionOpc = getArithmeticReductionInstruction(RdxID: IID);
2139	Value *ReductionSrc = I.getOperand(i: `0`);
2140
2141	Value *Src;
2142	if (!match(V: ReductionSrc, P: m_OneUse(SubPattern: m_Trunc(Op: m_Value(V&: Src)))) &&
2143	(TruncOnly \|\| !match(V: ReductionSrc, P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_Value(V&: Src))))))
2144	return false;
2145
2146	auto CastOpc =
2147	(Instruction::CastOps)cast<Instruction>(Val: ReductionSrc)->getOpcode();
2148
2149	auto *SrcTy = cast<VectorType>(Val: Src->getType());
2150	auto *ReductionSrcTy = cast<VectorType>(Val: ReductionSrc->getType());
2151	Type *ResultTy = I.getType();
2152
2153	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2154	InstructionCost OldCost = TTI.getArithmeticReductionCost(
2155	Opcode: ReductionOpc, Ty: ReductionSrcTy, FMF: std::nullopt, CostKind);
2156	OldCost += TTI.getCastInstrCost(Opcode: CastOpc, Dst: ReductionSrcTy, Src: SrcTy,
2157	CCH: TTI::CastContextHint::None, CostKind,
2158	I: cast<CastInst>(Val: ReductionSrc));
2159	InstructionCost NewCost =
2160	TTI.getArithmeticReductionCost(Opcode: ReductionOpc, Ty: SrcTy, FMF: std::nullopt,
2161	CostKind) +
2162	TTI.getCastInstrCost(Opcode: CastOpc, Dst: ResultTy, Src: ReductionSrcTy->getScalarType(),
2163	CCH: TTI::CastContextHint::None, CostKind);
2164
2165	if (OldCost <= NewCost \|\| !NewCost.isValid())
2166	return false;
2167
2168	Value *NewReduction = Builder.CreateIntrinsic(RetTy: SrcTy->getScalarType(),
2169	ID: II->getIntrinsicID(), Args: {Src});
2170	Value *NewCast = Builder.CreateCast(Op: CastOpc, V: NewReduction, DestTy: ResultTy);
2171	replaceValue(Old&: I, New&: *NewCast);
2172	return true;
2173	}
2174
2175	/// This method looks for groups of shuffles acting on binops, of the form:
2176	/// %x = shuffle ...
2177	/// %y = shuffle ...
2178	/// %a = binop %x, %y
2179	/// %b = binop %x, %y
2180	/// shuffle %a, %b, selectmask
2181	/// We may, especially if the shuffle is wider than legal, be able to convert
2182	/// the shuffle to a form where only parts of a and b need to be computed. On
2183	/// architectures with no obvious "select" shuffle, this can reduce the total
2184	/// number of operations if the target reports them as cheaper.
2185	bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
2186	auto *SVI = cast<ShuffleVectorInst>(Val: &I);
2187	auto *VT = cast<FixedVectorType>(Val: I.getType());
2188	auto *Op0 = dyn_cast<Instruction>(Val: SVI->getOperand(i_nocapture: `0`));
2189	auto *Op1 = dyn_cast<Instruction>(Val: SVI->getOperand(i_nocapture: `1`));
2190	if (!Op0 \|\| !Op1 \|\| Op0 == Op1 \|\| !Op0->isBinaryOp() \|\| !Op1->isBinaryOp() \|\|
2191	VT != Op0->getType())
2192	return false;
2193
2194	auto *SVI0A = dyn_cast<Instruction>(Val: Op0->getOperand(i: `0`));
2195	auto *SVI0B = dyn_cast<Instruction>(Val: Op0->getOperand(i: `1`));
2196	auto *SVI1A = dyn_cast<Instruction>(Val: Op1->getOperand(i: `0`));
2197	auto *SVI1B = dyn_cast<Instruction>(Val: Op1->getOperand(i: `1`));
2198	SmallPtrSet<Instruction *, `4`> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
2199	auto checkSVNonOpUses = [&](Instruction *I) {
2200	if (!I \|\| I->getOperand(i: `0`)->getType() != VT)
2201	return true;
2202	return any_of(Range: I->users(), P: [&](User *U) {
2203	return U != Op0 && U != Op1 &&
2204	!(isa<ShuffleVectorInst>(Val: U) &&
2205	(InputShuffles.contains(Ptr: cast<Instruction>(Val: U)) \|\|
2206	isInstructionTriviallyDead(I: cast<Instruction>(Val: U))));
2207	});
2208	};
2209	if (checkSVNonOpUses (SVI0A) \|\| checkSVNonOpUses (SVI0B) \|\|
2210	checkSVNonOpUses (SVI1A) \|\| checkSVNonOpUses (SVI1B))
2211	return false;
2212
2213	// Collect all the uses that are shuffles that we can transform together. We
2214	// may not have a single shuffle, but a group that can all be transformed
2215	// together profitably.
2216	SmallVector<ShuffleVectorInst *> Shuffles;
2217	auto collectShuffles = [&](Instruction *I) {
2218	for (auto *U : I->users()) {
2219	auto *SV = dyn_cast<ShuffleVectorInst>(Val: U);
2220	if (!SV \|\| SV->getType() != VT)
2221	return false;
2222	if ((SV->getOperand(i_nocapture: `0`) != Op0 && SV->getOperand(i_nocapture: `0`) != Op1) \|\|
2223	(SV->getOperand(i_nocapture: `1`) != Op0 && SV->getOperand(i_nocapture: `1`) != Op1))
2224	return false;
2225	if (!llvm::is_contained(Range&: Shuffles, Element: SV))
2226	Shuffles.push_back(Elt: SV);
2227	}
2228	return true;
2229	};
2230	if (!collectShuffles (Op0) \|\| !collectShuffles (Op1))
2231	return false;
2232	// From a reduction, we need to be processing a single shuffle, otherwise the
2233	// other uses will not be lane-invariant.
2234	if (FromReduction && Shuffles.size() > `1`)
2235	return false;
2236
2237	// Add any shuffle uses for the shuffles we have found, to include them in our
2238	// cost calculations.
2239	if (!FromReduction) {
2240	for (ShuffleVectorInst *SV : Shuffles) {
2241	for (auto *U : SV->users()) {
2242	ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(Val: U);
2243	if (SSV && isa<UndefValue>(Val: SSV->getOperand(i_nocapture: `1`)) && SSV->getType() == VT)
2244	Shuffles.push_back(Elt: SSV);
2245	}
2246	}
2247	}
2248
2249	// For each of the output shuffles, we try to sort all the first vector
2250	// elements to the beginning, followed by the second array elements at the
2251	// end. If the binops are legalized to smaller vectors, this may reduce total
2252	// number of binops. We compute the ReconstructMask mask needed to convert
2253	// back to the original lane order.
2254	SmallVector<std::pair<int, int>> V1, V2;
2255	SmallVector<SmallVector<int>> OrigReconstructMasks;
2256	int MaxV1Elt = `0`, MaxV2Elt = `0`;
2257	unsigned NumElts = VT->getNumElements();
2258	for (ShuffleVectorInst *SVN : Shuffles) {
2259	SmallVector<int> Mask;
2260	SVN->getShuffleMask(Result&: Mask);
2261
2262	// Check the operands are the same as the original, or reversed (in which
2263	// case we need to commute the mask).
2264	Value *SVOp0 = SVN->getOperand(i_nocapture: `0`);
2265	Value *SVOp1 = SVN->getOperand(i_nocapture: `1`);
2266	if (isa<UndefValue>(Val: SVOp1)) {
2267	auto *SSV = cast<ShuffleVectorInst>(Val: SVOp0);
2268	SVOp0 = SSV->getOperand(i_nocapture: `0`);
2269	SVOp1 = SSV->getOperand(i_nocapture: `1`);
2270	for (unsigned I = `0`, E = Mask.size(); I != E; I++) {
2271	if (Mask [I] >= static_cast<int>(SSV->getShuffleMask().size()))
2272	return false;
2273	Mask [I] = Mask [I] < `0` ? Mask [I] : SSV->getMaskValue(Elt: Mask [I]);
2274	}
2275	}
2276	if (SVOp0 == Op1 && SVOp1 == Op0) {
2277	std::swap(a&: SVOp0, b&: SVOp1);
2278	ShuffleVectorInst::commuteShuffleMask(Mask, InVecNumElts: NumElts);
2279	}
2280	if (SVOp0 != Op0 \|\| SVOp1 != Op1)
2281	return false;
2282
2283	// Calculate the reconstruction mask for this shuffle, as the mask needed to
2284	// take the packed values from Op0/Op1 and reconstructing to the original
2285	// order.
2286	SmallVector<int> ReconstructMask;
2287	for (unsigned I = `0`; I < Mask.size(); I++) {
2288	if (Mask [I] < `0`) {
2289	ReconstructMask.push_back(Elt: -`1`);
2290	} else if (Mask [I] < static_cast<int>(NumElts)) {
2291	MaxV1Elt = std::max(a: MaxV1Elt, b: Mask [I]);
2292	auto It = find_if(Range&: V1, P: [&](const std::pair<int, int> &A) {
2293	return Mask [I] == A.first;
2294	});
2295	if (It != V1.end())
2296	ReconstructMask.push_back(Elt: It - V1.begin());
2297	else {
2298	ReconstructMask.push_back(Elt: V1.size());
2299	V1.emplace_back(Args&: Mask [I], Args: V1.size());
2300	}
2301	} else {
2302	MaxV2Elt = std::max<int>(a: MaxV2Elt, b: Mask [I] - NumElts);
2303	auto It = find_if(Range&: V2, P: [&](const std::pair<int, int> &A) {
2304	return Mask [I] - static_cast<int>(NumElts) == A.first;
2305	});
2306	if (It != V2.end())
2307	ReconstructMask.push_back(Elt: NumElts + It - V2.begin());
2308	else {
2309	ReconstructMask.push_back(Elt: NumElts + V2.size());
2310	V2.emplace_back(Args: Mask [I] - NumElts, Args: NumElts + V2.size());
2311	}
2312	}
2313	}
2314
2315	// For reductions, we know that the lane ordering out doesn't alter the
2316	// result. In-order can help simplify the shuffle away.
2317	if (FromReduction)
2318	sort(C&: ReconstructMask);
2319	OrigReconstructMasks.push_back(Elt: std::move(ReconstructMask));
2320	}
2321
2322	// If the Maximum element used from V1 and V2 are not larger than the new
2323	// vectors, the vectors are already packes and performing the optimization
2324	// again will likely not help any further. This also prevents us from getting
2325	// stuck in a cycle in case the costs do not also rule it out.
2326	if (V1.empty() \|\| V2.empty() \|\|
2327	(MaxV1Elt == static_cast<int>(V1.size()) - `1` &&
2328	MaxV2Elt == static_cast<int>(V2.size()) - `1`))
2329	return false;
2330
2331	// GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
2332	// shuffle of another shuffle, or not a shuffle (that is treated like a
2333	// identity shuffle).
2334	auto GetBaseMaskValue = [&](Instruction I, int* M) {
2335	auto *SV = dyn_cast<ShuffleVectorInst>(Val: I);
2336	if (!SV)
2337	return M;
2338	if (isa<UndefValue>(Val: SV->getOperand(i_nocapture: `1`)))
2339	if (auto *SSV = dyn_cast<ShuffleVectorInst>(Val: SV->getOperand(i_nocapture: `0`)))
2340	if (InputShuffles.contains(Ptr: SSV))
2341	return SSV->getMaskValue(Elt: SV->getMaskValue(Elt: M));
2342	return SV->getMaskValue(Elt: M);
2343	};
2344
2345	// Attempt to sort the inputs my ascending mask values to make simpler input
2346	// shuffles and push complex shuffles down to the uses. We sort on the first
2347	// of the two input shuffle orders, to try and get at least one input into a
2348	// nice order.
2349	auto SortBase = [&](Instruction A, std::pair<int, int*> X,
2350	std::pair<int, int> Y) {
2351	int MXA = GetBaseMaskValue (A, X.first);
2352	int MYA = GetBaseMaskValue (A, Y.first);
2353	return MXA < MYA;
2354	};
2355	stable_sort(Range&: V1, C: [&](std::pair<int, int> A, std::pair<int, int> B) {
2356	return SortBase (SVI0A, A, B);
2357	});
2358	stable_sort(Range&: V2, C: [&](std::pair<int, int> A, std::pair<int, int> B) {
2359	return SortBase (SVI1A, A, B);
2360	});
2361	// Calculate our ReconstructMasks from the OrigReconstructMasks and the
2362	// modified order of the input shuffles.
2363	SmallVector<SmallVector<int>> ReconstructMasks;
2364	for (const auto &Mask : OrigReconstructMasks) {
2365	SmallVector<int> ReconstructMask;
2366	for (int M : Mask) {
2367	auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
2368	auto It = find_if(Range: V, P: [M](auto A) { return A.second == M; });
2369	assert(It != V.end() && "Expected all entries in Mask");
2370	return std::distance(first: V.begin(), last: It);
2371	};
2372	if (M < `0`)
2373	ReconstructMask.push_back(Elt: -`1`);
2374	else if (M < static_cast<int>(NumElts)) {
2375	ReconstructMask.push_back(Elt: FindIndex (V1, M));
2376	} else {
2377	ReconstructMask.push_back(Elt: NumElts + FindIndex (V2, M));
2378	}
2379	}
2380	ReconstructMasks.push_back(Elt: std::move(ReconstructMask));
2381	}
2382
2383	// Calculate the masks needed for the new input shuffles, which get padded
2384	// with undef
2385	SmallVector<int> V1A, V1B, V2A, V2B;
2386	for (unsigned I = `0`; I < V1.size(); I++) {
2387	V1A.push_back(Elt: GetBaseMaskValue (SVI0A, V1 [I].first));
2388	V1B.push_back(Elt: GetBaseMaskValue (SVI0B, V1 [I].first));
2389	}
2390	for (unsigned I = `0`; I < V2.size(); I++) {
2391	V2A.push_back(Elt: GetBaseMaskValue (SVI1A, V2 [I].first));
2392	V2B.push_back(Elt: GetBaseMaskValue (SVI1B, V2 [I].first));
2393	}
2394	while (V1A.size() < NumElts) {
2395	V1A.push_back(Elt: PoisonMaskElem);
2396	V1B.push_back(Elt: PoisonMaskElem);
2397	}
2398	while (V2A.size() < NumElts) {
2399	V2A.push_back(Elt: PoisonMaskElem);
2400	V2B.push_back(Elt: PoisonMaskElem);
2401	}
2402
2403	auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
2404	auto *SV = dyn_cast<ShuffleVectorInst>(Val: I);
2405	if (!SV)
2406	return C;
2407	return C + TTI.getShuffleCost(Kind: isa<UndefValue>(Val: SV->getOperand(i_nocapture: `1`))
2408	? TTI::SK_PermuteSingleSrc
2409	: TTI::SK_PermuteTwoSrc,
2410	Tp: VT, Mask: SV->getShuffleMask());
2411	};
2412	auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
2413	return C + TTI.getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, Tp: VT, Mask);
2414	};
2415
2416	// Get the costs of the shuffles + binops before and after with the new
2417	// shuffle masks.
2418	InstructionCost CostBefore =
2419	TTI.getArithmeticInstrCost(Opcode: Op0->getOpcode(), Ty: VT) +
2420	TTI.getArithmeticInstrCost(Opcode: Op1->getOpcode(), Ty: VT);
2421	CostBefore += std::accumulate(first: Shuffles.begin(), last: Shuffles.end(),
2422	init: InstructionCost (`0`), binary_op: AddShuffleCost);
2423	CostBefore += std::accumulate(first: InputShuffles.begin(), last: InputShuffles.end(),
2424	init: InstructionCost (`0`), binary_op: AddShuffleCost);
2425
2426	// The new binops will be unused for lanes past the used shuffle lengths.
2427	// These types attempt to get the correct cost for that from the target.
2428	FixedVectorType *Op0SmallVT =
2429	FixedVectorType::get(ElementType: VT->getScalarType(), NumElts: V1.size());
2430	FixedVectorType *Op1SmallVT =
2431	FixedVectorType::get(ElementType: VT->getScalarType(), NumElts: V2.size());
2432	InstructionCost CostAfter =
2433	TTI.getArithmeticInstrCost(Opcode: Op0->getOpcode(), Ty: Op0SmallVT) +
2434	TTI.getArithmeticInstrCost(Opcode: Op1->getOpcode(), Ty: Op1SmallVT);
2435	CostAfter += std::accumulate(first: ReconstructMasks.begin(), last: ReconstructMasks.end(),
2436	init: InstructionCost (`0`), binary_op: AddShuffleMaskCost);
2437	std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
2438	CostAfter +=
2439	std::accumulate(first: OutputShuffleMasks.begin(), last: OutputShuffleMasks.end(),
2440	init: InstructionCost (`0`), binary_op: AddShuffleMaskCost);
2441
2442	LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
2443	LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
2444	<< " vs CostAfter: " << CostAfter << "\n");
2445	if (CostBefore <= CostAfter)
2446	return false;
2447
2448	// The cost model has passed, create the new instructions.
2449	auto GetShuffleOperand = [&](Instruction I, unsigned* Op) -> Value * {
2450	auto *SV = dyn_cast<ShuffleVectorInst>(Val: I);
2451	if (!SV)
2452	return I;
2453	if (isa<UndefValue>(Val: SV->getOperand(i_nocapture: `1`)))
2454	if (auto *SSV = dyn_cast<ShuffleVectorInst>(Val: SV->getOperand(i_nocapture: `0`)))
2455	if (InputShuffles.contains(Ptr: SSV))
2456	return SSV->getOperand(i_nocapture: Op);
2457	return SV->getOperand(i_nocapture: Op);
2458	};
2459	Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
2460	Value *NSV0A = Builder.CreateShuffleVector(V1: GetShuffleOperand (SVI0A, `0`),
2461	V2: GetShuffleOperand (SVI0A, `1`), Mask: V1A);
2462	Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
2463	Value *NSV0B = Builder.CreateShuffleVector(V1: GetShuffleOperand (SVI0B, `0`),
2464	V2: GetShuffleOperand (SVI0B, `1`), Mask: V1B);
2465	Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
2466	Value *NSV1A = Builder.CreateShuffleVector(V1: GetShuffleOperand (SVI1A, `0`),
2467	V2: GetShuffleOperand (SVI1A, `1`), Mask: V2A);
2468	Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
2469	Value *NSV1B = Builder.CreateShuffleVector(V1: GetShuffleOperand (SVI1B, `0`),
2470	V2: GetShuffleOperand (SVI1B, `1`), Mask: V2B);
2471	Builder.SetInsertPoint(Op0);
2472	Value *NOp0 = Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Op0->getOpcode(),
2473	LHS: NSV0A, RHS: NSV0B);
2474	if (auto *I = dyn_cast<Instruction>(Val: NOp0))
2475	I->copyIRFlags(V: Op0, IncludeWrapFlags: true);
2476	Builder.SetInsertPoint(Op1);
2477	Value *NOp1 = Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Op1->getOpcode(),
2478	LHS: NSV1A, RHS: NSV1B);
2479	if (auto *I = dyn_cast<Instruction>(Val: NOp1))
2480	I->copyIRFlags(V: Op1, IncludeWrapFlags: true);
2481
2482	for (int S = `0`, E = ReconstructMasks.size(); S != E; S++) {
2483	Builder.SetInsertPoint(Shuffles [S]);
2484	Value *NSV = Builder.CreateShuffleVector(V1: NOp0, V2: NOp1, Mask: ReconstructMasks [S]);
2485	replaceValue(Old&: Shuffles [S], New&: NSV);
2486	}
2487
2488	Worklist.pushValue(V: NSV0A);
2489	Worklist.pushValue(V: NSV0B);
2490	Worklist.pushValue(V: NSV1A);
2491	Worklist.pushValue(V: NSV1B);
2492	for (auto *S : Shuffles)
2493	Worklist.add(I: S);
2494	return true;
2495	}
2496
2497	/// This is the entry point for all transforms. Pass manager differences are
2498	/// handled in the callers of this function.
2499	bool VectorCombine::run() {
2500	if (DisableVectorCombine)
2501	return false;
2502
2503	// Don't attempt vectorization if the target does not support vectors.
2504	if (!TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(/Vector/ true)))
2505	return false;
2506
2507	bool MadeChange = false;
2508	auto FoldInst = [this, &MadeChange](Instruction &I) {
2509	Builder.SetInsertPoint(&I);
2510	bool IsFixedVectorType = isa<FixedVectorType>(Val: I.getType());
2511	auto Opcode = I.getOpcode();
2512
2513	// These folds should be beneficial regardless of when this pass is run
2514	// in the optimization pipeline.
2515	// The type checking is for run-time efficiency. We can avoid wasting time
2516	// dispatching to folding functions if there's no chance of matching.
2517	if (IsFixedVectorType) {
2518	switch (Opcode) {
2519	case Instruction::InsertElement:
2520	MadeChange \|= vectorizeLoadInsert(I);
2521	break;
2522	case Instruction::ShuffleVector:
2523	MadeChange \|= widenSubvectorLoad(I);
2524	break;
2525	default:
2526	break;
2527	}
2528	}
2529
2530	// This transform works with scalable and fixed vectors
2531	// TODO: Identify and allow other scalable transforms
2532	if (isa<VectorType>(Val: I.getType())) {
2533	MadeChange \|= scalarizeBinopOrCmp(I);
2534	MadeChange \|= scalarizeLoadExtract(I);
2535	MadeChange \|= scalarizeVPIntrinsic(I);
2536	}
2537
2538	if (Opcode == Instruction::Store)
2539	MadeChange \|= foldSingleElementStore(I);
2540
2541	// If this is an early pipeline invocation of this pass, we are done.
2542	if (TryEarlyFoldsOnly)
2543	return;
2544
2545	// Otherwise, try folds that improve codegen but may interfere with
2546	// early IR canonicalizations.
2547	// The type checking is for run-time efficiency. We can avoid wasting time
2548	// dispatching to folding functions if there's no chance of matching.
2549	if (IsFixedVectorType) {
2550	switch (Opcode) {
2551	case Instruction::InsertElement:
2552	MadeChange \|= foldInsExtFNeg(I);
2553	break;
2554	case Instruction::ShuffleVector:
2555	MadeChange \|= foldShuffleOfBinops(I);
2556	MadeChange \|= foldShuffleOfCastops(I);
2557	MadeChange \|= foldShuffleOfShuffles(I);
2558	MadeChange \|= foldSelectShuffle(I);
2559	MadeChange \|= foldShuffleToIdentity(I);
2560	break;
2561	case Instruction::BitCast:
2562	MadeChange \|= foldBitcastShuffle(I);
2563	break;
2564	}
2565	} else {
2566	switch (Opcode) {
2567	case Instruction::Call:
2568	MadeChange \|= foldShuffleFromReductions(I);
2569	MadeChange \|= foldCastFromReductions(I);
2570	break;
2571	case Instruction::ICmp:
2572	case Instruction::FCmp:
2573	MadeChange \|= foldExtractExtract(I);
2574	break;
2575	default:
2576	if (Instruction::isBinaryOp(Opcode)) {
2577	MadeChange \|= foldExtractExtract(I);
2578	MadeChange \|= foldExtractedCmps(I);
2579	}
2580	break;
2581	}
2582	}
2583	};
2584
2585	for (BasicBlock &BB : F) {
2586	// Ignore unreachable basic blocks.
2587	if (!DT.isReachableFromEntry(A: &BB))
2588	continue;
2589	// Use early increment range so that we can erase instructions in loop.
2590	for (Instruction &I : make_early_inc_range(Range&: BB)) {
2591	if (I.isDebugOrPseudoInst())
2592	continue;
2593	FoldInst (I);
2594	}
2595	}
2596
2597	while (!Worklist.isEmpty()) {
2598	Instruction *I = Worklist.removeOne();
2599	if (!I)
2600	continue;
2601
2602	if (isInstructionTriviallyDead(I)) {
2603	eraseInstruction(I&: *I);
2604	continue;
2605	}
2606
2607	FoldInst (*I);
2608	}
2609
2610	return MadeChange;
2611	}
2612
2613	PreservedAnalyses VectorCombinePass::run(Function &F,
2614	FunctionAnalysisManager &FAM) {
2615	auto &AC = FAM.getResult<AssumptionAnalysis>(IR&: F);
2616	TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
2617	DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F);
2618	AAResults &AA = FAM.getResult<AAManager>(IR&: F);
2619	const DataLayout *DL = &F.getDataLayout();
2620	VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TryEarlyFoldsOnly);
2621	if (!Combiner.run())
2622	return PreservedAnalyses::all();
2623	PreservedAnalyses PA;
2624	PA.preserveSet<CFGAnalyses>();
2625	return PA;
2626	}
2627

Browse the source code of llvm_projects/llvm/lib/Transforms/Vectorize/VectorCombine.cpp