LoopVectorizationPlanner.cpp source code [llvm_projects/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp]

1	//===- LoopVectorizationPlanner.cpp - VF selection and planning -----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	///
9	/// \file
10	/// This file implements VFSelectionContext methods for loop vectorization
11	/// VF selection, independent of cost-modeling decisions.
12	///
13	//===----------------------------------------------------------------------===//
14
15	#include "LoopVectorizationPlanner.h"
16	#include "llvm/Analysis/LoopInfo.h"
17	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
18	#include "llvm/Analysis/ScalarEvolution.h"
19	#include "llvm/IR/DiagnosticInfo.h"
20	#include "llvm/Support/CommandLine.h"
21	#include "llvm/Support/Debug.h"
22	#include "llvm/Support/MathExtras.h"
23	#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
24	#include "llvm/Transforms/Vectorize/LoopVectorize.h"
25
26	using namespace llvm;
27	using namespace LoopVectorizationUtils;
28
29	#define DEBUG_TYPE "loop-vectorize"
30
31	extern cl::opt<bool> VPlanBuildOuterloopStressTest;
32
33	static cl::opt<bool> MaximizeBandwidth(
34	"vectorizer-maximize-bandwidth", cl::init(Val: false), cl::Hidden,
35	cl::desc("Maximize bandwidth when selecting vectorization factor which "
36	"will be determined by the smallest type in loop."));
37
38	static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
39	"vectorizer-maximize-bandwidth-for-vector-calls", cl::init(Val: true),
40	cl::Hidden,
41	cl::desc("Try wider VFs if they enable the use of vector variants"));
42
43	static cl::opt<bool> ConsiderRegPressure(
44	"vectorizer-consider-reg-pressure", cl::init(Val: false), cl::Hidden,
45	cl::desc("Discard VFs if their register pressure is too high."));
46
47	static cl::opt<bool> ForceTargetSupportsScalableVectors(
48	"force-target-supports-scalable-vectors", cl::init(Val: false), cl::Hidden,
49	cl::desc(
50	"Pretend that scalable vectors are supported, even if the target does "
51	"not support them. This flag should only be used for testing."));
52
53	cl::opt<bool> llvm::PreferInLoopReductions(
54	"prefer-inloop-reductions", cl::init(Val: false), cl::Hidden,
55	cl::desc("Prefer in-loop vector reductions, "
56	"overriding the targets preference."));
57
58	/// Note: This currently only applies to `llvm.masked.load` and
59	/// `llvm.masked.store`. TODO: Extend this to cover other operations as needed.
60	static cl::opt<bool> ForceTargetSupportsMaskedMemoryOps(
61	"force-target-supports-masked-memory-ops", cl::init(Val: false), cl::Hidden,
62	cl::desc("Assume the target supports masked memory operations (used for "
63	"testing)."));
64
65	static cl::opt<bool> ForceTargetSupportsGatherScatterOps(
66	"force-target-supports-gather-scatter-ops", cl::init(Val: false), cl::Hidden,
67	cl::desc("Assume the target supports gather/scatter operations (used for "
68	"testing)."));
69
70	/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
71	/// is passed, the message relates to that particular instruction.
72	#ifndef NDEBUG
73	static void debugVectorizationMessage(const StringRef Prefix,
74	const StringRef DebugMsg,
75	Instruction *I) {
76	dbgs() << "LV: " << Prefix << DebugMsg;
77	if (I != nullptr)
78	dbgs() << " " << *I;
79	else
80	dbgs() << `'.'`;
81	dbgs() << `'\n'`;
82	}
83	#endif
84
85	/// Create an analysis remark that explains why vectorization failed
86	/// \p RemarkName is the identifier for the remark. If \p I is passed it is an
87	/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
88	/// the location of the remark. If \p DL is passed, use it as debug location for
89	/// the remark. \return the remark object that can be streamed to.
90	static OptimizationRemarkAnalysis createLVAnalysis(StringRef RemarkName,
91	const Loop *TheLoop,
92	Instruction *I,
93	DebugLoc DL = {}) {
94	BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
95	// If debug location is attached to the instruction, use it. Otherwise if DL
96	// was not provided, use the loop's.
97	if (I && I->getDebugLoc())
98	DL = I->getDebugLoc();
99	else if (!DL)
100	DL = TheLoop->getStartLoc();
101
102	return OptimizationRemarkAnalysis (DEBUG_TYPE, RemarkName, DL, CodeRegion);
103	}
104
105	void LoopVectorizationUtils::reportVectorizationFailure(
106	const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag,
107	OptimizationRemarkEmitter ORE, const* Loop TheLoop, Instruction I) {
108	LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
109	ORE->emit(OptDiag: createLVAnalysis(RemarkName: ORETag, TheLoop, I)
110	<< "loop not vectorized: " << OREMsg);
111	}
112
113	void LoopVectorizationUtils::reportVectorizationInfo(
114	const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE,
115	const Loop TheLoop, Instruction I, DebugLoc DL) {
116	LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
117	ORE->emit(OptDiag: createLVAnalysis(RemarkName: ORETag, TheLoop, I, DL) << Msg);
118	}
119
120	void LoopVectorizationUtils::reportVectorization(OptimizationRemarkEmitter *ORE,
121	Loop *TheLoop,
122	ElementCount VFWidth,
123	unsigned IC) {
124	LLVM_DEBUG(debugVectorizationMessage(
125	"Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
126	nullptr));
127	StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
128	ORE->emit(RemarkBuilder: [&]() {
129	return OptimizationRemark (DEBUG_TYPE, "Vectorized", TheLoop->getStartLoc(),
130	TheLoop->getHeader())
131	<< "vectorized " << LoopType << "loop (vectorization width: "
132	<< ore::NV ("VectorizationFactor", VFWidth)
133	<< ", interleaved count: " << ore::NV ("InterleaveCount", IC) << ")";
134	});
135	}
136
137	bool VFSelectionContext::isLegalMaskedLoadOrStore(Instruction *I,
138	ElementCount VF) const {
139	assert(isa<LoadInst>(I) \|\| isa<StoreInst>(I));
140	auto *Ty = getLoadStoreType(I);
141	const unsigned AS = getLoadStoreAddressSpace(I);
142	const Align Alignment = getLoadStoreAlignment(I);
143
144	return ForceTargetSupportsMaskedMemoryOps \|\|
145	(isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment, AddressSpace: AS)
146	: TTI.isLegalMaskedStore(DataType: Ty, Alignment, AddressSpace: AS));
147	}
148
149	bool VFSelectionContext::isLegalGatherOrScatter(Value *V,
150	ElementCount VF) const {
151	bool LI = isa<LoadInst>(Val: V);
152	bool SI = isa<StoreInst>(Val: V);
153	if (!LI && !SI)
154	return false;
155	auto *Ty = getLoadStoreType(I: V);
156	Align Align = getLoadStoreAlignment(I: V);
157	if (VF.isVector())
158	Ty = VectorType::get(ElementType: Ty, EC: VF);
159	return ForceTargetSupportsGatherScatterOps \|\|
160	(LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) \|\|
161	(SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align));
162	}
163
164	bool VFSelectionContext::supportsScalableVectors() const {
165	return TTI.supportsScalableVectors() \|\| ForceTargetSupportsScalableVectors \|\|
166	VectorizerParams::VectorizationFactor.isScalable();
167	}
168
169	bool VFSelectionContext::useMaxBandwidth(bool IsScalable) const {
170	TargetTransformInfo::RegisterKind RegKind =
171	IsScalable ? TargetTransformInfo::RGK_ScalableVector
172	: TargetTransformInfo::RGK_FixedWidthVector;
173	return MaximizeBandwidth \|\| (MaximizeBandwidth.getNumOccurrences() == `0` &&
174	(TTI.shouldMaximizeVectorBandwidth(K: RegKind) \|\|
175	(UseWiderVFIfCallVariantsPresent &&
176	Legal->hasVectorCallVariants())));
177	}
178
179	bool VFSelectionContext::shouldConsiderRegPressureForVF(ElementCount VF) const {
180	if (ConsiderRegPressure.getNumOccurrences())
181	return ConsiderRegPressure;
182
183	// TODO: We should eventually consider register pressure for all targets. The
184	// TTI hook is temporary whilst target-specific issues are being fixed.
185	if (TTI.shouldConsiderVectorizationRegPressure())
186	return true;
187
188	if (!useMaxBandwidth(IsScalable: VF.isScalable()))
189	return false;
190	// Only calculate register pressure for VFs enabled by MaxBandwidth.
191	return ElementCount::isKnownGT(
192	LHS: VF, RHS: VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
193	: MaxPermissibleVFWithoutMaxBW.FixedVF);
194	}
195
196	ElementCount VFSelectionContext::clampVFByMaxTripCount(
197	ElementCount VF, unsigned MaxTripCount, unsigned UserIC,
198	bool FoldTailByMasking, bool RequiresScalarEpilogue) const {
199	unsigned EstimatedVF = VF.getKnownMinValue();
200	if (VF.isScalable() && F.hasFnAttribute(Kind: Attribute::VScaleRange)) {
201	auto Attr = F.getFnAttribute(Kind: Attribute::VScaleRange);
202	auto Min = Attr.getVScaleRangeMin();
203	EstimatedVF *= Min;
204	}
205
206	// When a scalar epilogue is required, at least one iteration of the scalar
207	// loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
208	// max VF that results in a dead vector loop.
209	if (MaxTripCount > `0` && RequiresScalarEpilogue)
210	MaxTripCount -= `1`;
211
212	// When the user specifies an interleave count, we need to ensure that
213	// VF UserIC <= MaxTripCount to avoid a dead vector loop.*
214	unsigned IC = UserIC > `0` ? UserIC : `1`;
215	unsigned EstimatedVFTimesIC = EstimatedVF * IC;
216
217	if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
218	(!FoldTailByMasking \|\| isPowerOf2_32(Value: MaxTripCount))) {
219	// If upper bound loop trip count (TC) is known at compile time there is no
220	// point in choosing VF greater than TC / IC (as done in the loop below).
221	// Select maximum power of two which doesn't exceed TC / IC. If VF is
222	// scalable, we only fall back on a fixed VF when the TC is less than or
223	// equal to the known number of lanes.
224	auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount / IC);
225	if (ClampedUpperTripCount == `0`)
226	ClampedUpperTripCount = `1`;
227	LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
228	"exceeding the constant trip count"
229	<< (UserIC > `0` ? " divided by UserIC" : "") << ": "
230	<< ClampedUpperTripCount << "\n");
231	return ElementCount::get(MinVal: ClampedUpperTripCount,
232	Scalable: FoldTailByMasking ? VF.isScalable() : false);
233	}
234	return VF;
235	}
236
237	ElementCount VFSelectionContext::getMaximizedVFForTarget(
238	unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
239	ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking,
240	bool RequiresScalarEpilogue) {
241	bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
242	const TypeSize WidestRegister = TTI.getRegisterBitWidth(
243	K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
244	: TargetTransformInfo::RGK_FixedWidthVector);
245
246	// Convenience function to return the minimum of two ElementCounts.
247	auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
248	assert((LHS.isScalable() == RHS.isScalable()) &&
249	"Scalable flags must match");
250	return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
251	};
252
253	// Ensure MaxVF is a power of 2; the dependence distance bound may not be.
254	// Note that both WidestRegister and WidestType may not be a powers of 2.
255	auto MaxVectorElementCount = ElementCount::get(
256	MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
257	Scalable: ComputeScalableMaxVF);
258	MaxVectorElementCount = MinVF (MaxVectorElementCount, MaxSafeVF);
259	LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
260	<< (MaxVectorElementCount * WidestType) << " bits.\n");
261
262	if (!MaxVectorElementCount) {
263	LLVM_DEBUG(dbgs() << "LV: The target has no "
264	<< (ComputeScalableMaxVF ? "scalable" : "fixed")
265	<< " vector registers.\n");
266	return ElementCount::getFixed(MinVal: `1`);
267	}
268
269	ElementCount MaxVF =
270	clampVFByMaxTripCount(VF: MaxVectorElementCount, MaxTripCount, UserIC,
271	FoldTailByMasking, RequiresScalarEpilogue);
272	// If the MaxVF was already clamped, there's no point in trying to pick a
273	// larger one.
274	if (MaxVF != MaxVectorElementCount)
275	return MaxVF;
276
277	if (MaxVF.isScalable())
278	MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
279	else
280	MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
281
282	if (useMaxBandwidth(IsScalable: ComputeScalableMaxVF)) {
283	auto MaxVectorElementCountMaxBW = ElementCount::get(
284	MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
285	Scalable: ComputeScalableMaxVF);
286	MaxVF = MinVF (MaxVectorElementCountMaxBW, MaxSafeVF);
287
288	if (ElementCount MinVF =
289	TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
290	if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
291	LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
292	<< ") with target's minimum: " << MinVF << `'\n'`);
293	MaxVF = MinVF;
294	}
295	}
296
297	MaxVF = clampVFByMaxTripCount(VF: MaxVF, MaxTripCount, UserIC,
298	FoldTailByMasking, RequiresScalarEpilogue);
299	}
300	return MaxVF;
301	}
302
303	std::optional<unsigned> llvm::getMaxVScale(const Function &F,
304	const TargetTransformInfo &TTI) {
305	if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
306	return MaxVScale;
307
308	if (F.hasFnAttribute(Kind: Attribute::VScaleRange))
309	return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax();
310
311	return std::nullopt;
312	}
313
314	bool VFSelectionContext::isScalableVectorizationAllowed() {
315	if (IsScalableVectorizationAllowed)
316	return *IsScalableVectorizationAllowed;
317
318	IsScalableVectorizationAllowed = false;
319	if (!supportsScalableVectors())
320	return false;
321
322	if (Hints->isScalableVectorizationDisabled()) {
323	reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled",
324	ORETag: "ScalableVectorizationDisabled", ORE, TheLoop);
325	return false;
326	}
327
328	LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
329
330	auto MaxScalableVF = ElementCount::getScalable(
331	MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
332
333	// Test that the loop-vectorizer can legalize all operations for this MaxVF.
334	// FIXME: While for scalable vectors this is currently sufficient, this should
335	// be replaced by a more detailed mechanism that filters out specific VFs,
336	// instead of invalidating vectorization for a whole set of VFs based on the
337	// MaxVF.
338
339	// Disable scalable vectorization if the loop contains unsupported reductions.
340	if (!all_of(Range: Legal->getReductionVars(), P: [&](const auto &Reduction) -> bool {
341	return TTI.isLegalToVectorizeReduction(RdxDesc: Reduction.second, VF: MaxScalableVF);
342	})) {
343	reportVectorizationInfo(
344	Msg: "Scalable vectorization not supported for the reduction "
345	"operations found in this loop.",
346	ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
347	return false;
348	}
349
350	// Disable scalable vectorization if the loop contains any instructions
351	// with element types not supported for scalable vectors.
352	if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) {
353	return !Ty->isVoidTy() && !TTI.isElementTypeLegalForScalableVector(Ty);
354	})) {
355	reportVectorizationInfo(Msg: "Scalable vectorization is not supported "
356	"for all element types found in this loop.",
357	ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
358	return false;
359	}
360
361	if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F, TTI)) {
362	reportVectorizationInfo(Msg: "The target does not provide maximum vscale value "
363	"for safe distance analysis.",
364	ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
365	return false;
366	}
367
368	IsScalableVectorizationAllowed = true;
369	return true;
370	}
371
372	ElementCount
373	VFSelectionContext::getMaxLegalScalableVF(unsigned MaxSafeElements) {
374	if (!isScalableVectorizationAllowed())
375	return ElementCount::getScalable(MinVal: `0`);
376
377	auto MaxScalableVF = ElementCount::getScalable(
378	MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
379	if (Legal->isSafeForAnyVectorWidth())
380	return MaxScalableVF;
381
382	std::optional<unsigned> MaxVScale = getMaxVScale(F, TTI);
383	// Limit MaxScalableVF by the maximum safe dependence distance.
384	MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale);
385
386	if (!MaxScalableVF)
387	reportVectorizationInfo(
388	Msg: "Max legal vector width too small, scalable vectorization "
389	"unfeasible.",
390	ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
391
392	return MaxScalableVF;
393	}
394
395	FixedScalableVFPair VFSelectionContext::computeFeasibleMaxVF(
396	unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
397	bool FoldTailByMasking, bool RequiresScalarEpilogue) {
398	auto [SmallestType, WidestType] = getSmallestAndWidestTypes();
399
400	// Get the maximum safe dependence distance in bits computed by LAA.
401	// It is computed by MaxVF sizeOf(type) * 8, where type is taken from*
402	// the memory accesses that is most restrictive (involved in the smallest
403	// dependence distance).
404	unsigned MaxSafeElementsPowerOf2 =
405	llvm::bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);
406	if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
407	unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
408	MaxSafeElementsPowerOf2 =
409	std::min(a: MaxSafeElementsPowerOf2, b: SLDist / WidestType);
410	}
411
412	auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElementsPowerOf2);
413	auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements: MaxSafeElementsPowerOf2);
414
415	if (!Legal->isSafeForAnyVectorWidth())
416	MaxSafeElements = MaxSafeElementsPowerOf2;
417
418	LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
419	<< ".\n");
420	LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
421	<< ".\n");
422
423	// First analyze the UserVF, fall back if the UserVF should be ignored.
424	if (UserVF) {
425	auto MaxSafeUserVF =
426	UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
427
428	if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
429	// If `VF=vscale x N` is safe, then so is `VF=N`
430	if (UserVF.isScalable())
431	return FixedScalableVFPair (
432	ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);
433
434	return UserVF;
435	}
436
437	assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
438
439	// Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
440	// is better to ignore the hint and let the compiler choose a suitable VF.
441	if (!UserVF.isScalable()) {
442	LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
443	<< " is unsafe, clamping to max safe VF="
444	<< MaxSafeFixedVF << ".\n");
445	ORE->emit(RemarkBuilder: [&]() {
446	return OptimizationRemarkAnalysis (DEBUG_TYPE, "VectorizationFactor",
447	TheLoop->getStartLoc(),
448	TheLoop->getHeader())
449	<< "User-specified vectorization factor "
450	<< ore::NV ("UserVectorizationFactor", UserVF)
451	<< " is unsafe, clamping to maximum safe vectorization factor "
452	<< ore::NV ("VectorizationFactor", MaxSafeFixedVF);
453	});
454	return MaxSafeFixedVF;
455	}
456
457	if (!supportsScalableVectors()) {
458	LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
459	<< " is ignored because scalable vectors are not "
460	"available.\n");
461	ORE->emit(RemarkBuilder: [&]() {
462	return OptimizationRemarkAnalysis (DEBUG_TYPE, "VectorizationFactor",
463	TheLoop->getStartLoc(),
464	TheLoop->getHeader())
465	<< "User-specified vectorization factor "
466	<< ore::NV ("UserVectorizationFactor", UserVF)
467	<< " is ignored because the target does not support scalable "
468	"vectors. The compiler will pick a more suitable value.";
469	});
470	} else {
471	LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
472	<< " is unsafe. Ignoring scalable UserVF.\n");
473	ORE->emit(RemarkBuilder: [&]() {
474	return OptimizationRemarkAnalysis (DEBUG_TYPE, "VectorizationFactor",
475	TheLoop->getStartLoc(),
476	TheLoop->getHeader())
477	<< "User-specified vectorization factor "
478	<< ore::NV ("UserVectorizationFactor", UserVF)
479	<< " is unsafe. Ignoring the hint to let the compiler pick a "
480	"more suitable value.";
481	});
482	}
483	}
484
485	LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
486	<< " / " << WidestType << " bits.\n");
487
488	FixedScalableVFPair Result(ElementCount::getFixed(MinVal: `1`),
489	ElementCount::getScalable(MinVal: `0`));
490	if (auto MaxVF = getMaximizedVFForTarget(
491	MaxTripCount, SmallestType, WidestType, MaxSafeVF: MaxSafeFixedVF, UserIC,
492	FoldTailByMasking, RequiresScalarEpilogue))
493	Result.FixedVF = MaxVF;
494
495	if (auto MaxVF = getMaximizedVFForTarget(
496	MaxTripCount, SmallestType, WidestType, MaxSafeVF: MaxSafeScalableVF, UserIC,
497	FoldTailByMasking, RequiresScalarEpilogue))
498	if (MaxVF.isScalable()) {
499	Result.ScalableVF = MaxVF;
500	LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
501	<< "\n");
502	}
503
504	return Result;
505	}
506
507	std::pair<unsigned, unsigned>
508	VFSelectionContext::getSmallestAndWidestTypes() const {
509	unsigned MinWidth = -`1U`;
510	unsigned MaxWidth = `8`;
511	const DataLayout &DL = F.getDataLayout();
512	// For in-loop reductions, no element types are added to ElementTypesInLoop
513	// if there are no loads/stores in the loop. In this case, check through the
514	// reduction variables to determine the maximum width.
515	if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
516	for (const auto &[_, RdxDesc] : Legal->getReductionVars()) {
517	// When finding the min width used by the recurrence we need to account
518	// for casts on the input operands of the recurrence.
519	MinWidth = std::min(
520	a: MinWidth,
521	b: std::min(a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
522	b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
523	MaxWidth = std::max(a: MaxWidth,
524	b: RdxDesc.getRecurrenceType()->getScalarSizeInBits());
525	}
526	} else {
527	for (Type *T : ElementTypesInLoop) {
528	MinWidth = std::min<unsigned>(
529	a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
530	MaxWidth = std::max<unsigned>(
531	a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
532	}
533	}
534	return {MinWidth, MaxWidth};
535	}
536
537	void VFSelectionContext::collectElementTypesForWidening(
538	const SmallPtrSetImpl<const Value > ValuesToIgnore) {
539	ElementTypesInLoop.clear();
540	// For each block.
541	for (BasicBlock *BB : TheLoop->blocks()) {
542	// For each instruction in the loop.
543	for (Instruction &I : *BB) {
544	Type *T = I.getType();
545
546	// Skip ignored values.
547	if (ValuesToIgnore && ValuesToIgnore->contains(Ptr: &I))
548	continue;
549
550	// Only examine Loads, Stores and PHINodes.
551	if (!isa<LoadInst, StoreInst, PHINode>(Val: I))
552	continue;
553
554	// Examine PHI nodes that are reduction variables. Update the type to
555	// account for the recurrence type.
556	if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
557	if (!Legal->isReductionVariable(PN))
558	continue;
559	const RecurrenceDescriptor &RdxDesc =
560	Legal->getRecurrenceDescriptor(PN);
561	if (PreferInLoopReductions \|\| useOrderedReductions(RdxDesc) \|\|
562	TTI.preferInLoopReduction(Kind: RdxDesc.getRecurrenceKind(),
563	Ty: RdxDesc.getRecurrenceType()))
564	continue;
565	T = RdxDesc.getRecurrenceType();
566	}
567
568	// Examine the stored values.
569	if (auto *ST = dyn_cast<StoreInst>(Val: &I))
570	T = ST->getValueOperand()->getType();
571
572	assert(T->isSized() &&
573	"Expected the load/store/recurrence type to be sized");
574
575	ElementTypesInLoop.insert(Ptr: T);
576	}
577	}
578	}
579
580	void VFSelectionContext::initializeVScaleForTuning() {
581	if (!supportsScalableVectors())
582	return;
583
584	if (F.hasFnAttribute(Kind: Attribute::VScaleRange)) {
585	auto Attr = F.getFnAttribute(Kind: Attribute::VScaleRange);
586	auto Min = Attr.getVScaleRangeMin();
587	auto Max = Attr.getVScaleRangeMax();
588	if (Max && Min == Max) {
589	VScaleForTuning = Max;
590	return;
591	}
592	}
593
594	VScaleForTuning = TTI.getVScaleForTuning();
595	}
596
597	bool VFSelectionContext::useOrderedReductions(
598	const RecurrenceDescriptor &RdxDesc) const {
599	return !Hints->allowReordering() && RdxDesc.isOrdered();
600	}
601
602	bool VFSelectionContext::runtimeChecksRequired() {
603	LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
604
605	Loop L = const_cast<Loop >(TheLoop);
606	if (Legal->getRuntimePointerChecking()->Need) {
607	reportVectorizationFailure(
608	DebugMsg: "Runtime ptr check is required with -Os/-Oz",
609	OREMsg: "runtime pointer checks needed. Enable vectorization of this "
610	"loop with '#pragma clang loop vectorize(enable)' when "
611	"compiling with -Os/-Oz",
612	ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop: L);
613	return true;
614	}
615
616	if (!PSE.getPredicate().isAlwaysTrue()) {
617	reportVectorizationFailure(
618	DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
619	OREMsg: "runtime SCEV checks needed. Enable vectorization of this "
620	"loop with '#pragma clang loop vectorize(enable)' when "
621	"compiling with -Os/-Oz",
622	ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop: L);
623	return true;
624	}
625
626	// FIXME: Avoid specializing for stride==1 instead of bailing out.
627	if (!Legal->getLAI()->getSymbolicStrides().empty()) {
628	reportVectorizationFailure(
629	DebugMsg: "Runtime stride check for small trip count",
630	OREMsg: "runtime stride == 1 checks needed. Enable vectorization of "
631	"this loop without such check by compiling with -Os/-Oz",
632	ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop: L);
633	return true;
634	}
635
636	return false;
637	}
638
639	void VFSelectionContext::computeMinimalBitwidths() {
640	MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
641	}
642
643	void VFSelectionContext::collectInLoopReductions() {
644	// Avoid duplicating work finding in-loop reductions.
645	if (!InLoopReductions.empty())
646	return;
647
648	for (const auto &Reduction : Legal->getReductionVars()) {
649	PHINode *Phi = Reduction.first;
650	const RecurrenceDescriptor &RdxDesc = Reduction.second;
651
652	// Multi-use reductions (e.g., used in FindLastIV patterns) are handled
653	// separately and should not be considered for in-loop reductions.
654	if (RdxDesc.hasUsesOutsideReductionChain())
655	continue;
656
657	// We don't collect reductions that are type promoted (yet).
658	if (RdxDesc.getRecurrenceType() != Phi->getType())
659	continue;
660
661	// In-loop AnyOf and FindIV reductions are not yet supported.
662	RecurKind Kind = RdxDesc.getRecurrenceKind();
663	if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) \|\|
664	RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) \|\|
665	RecurrenceDescriptor::isFindLastRecurrenceKind(Kind))
666	continue;
667
668	// If the target would prefer this reduction to happen "in-loop", then we
669	// want to record it as such.
670	if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
671	!TTI.preferInLoopReduction(Kind, Ty: Phi->getType()))
672	continue;
673
674	// Check that we can correctly put the reductions into the loop, by
675	// finding the chain of operations that leads from the phi to the loop
676	// exit value.
677	SmallVector<Instruction *, `4`> ReductionOperations =
678	RdxDesc.getReductionOpChain(Phi, L: const_cast<Loop *>(TheLoop));
679	bool InLoop = !ReductionOperations.empty();
680
681	if (InLoop) {
682	InLoopReductions.insert(Ptr: Phi);
683	// Add the elements to InLoopReductionImmediateChains for cost modelling.
684	Instruction *LastChain = Phi;
685	for (auto *I : ReductionOperations) {
686	InLoopReductionImmediateChains [I] = LastChain;
687	LastChain = I;
688	}
689	}
690	LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
691	<< " reduction for phi: " << *Phi << "\n");
692	}
693	}
694
695	bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
696	const VectorizationFactor &B,
697	const unsigned MaxTripCount,
698	bool HasTail,
699	bool IsEpilogue) const {
700	InstructionCost CostA = A.Cost;
701	InstructionCost CostB = B.Cost;
702
703	// When there is a hint to always prefer scalable vectors, honour that hint.
704	if (Hints.isScalableVectorizationAlwaysPreferred())
705	if (A.Width.isScalable() && CostA.isValid() && !B.Width.isScalable() &&
706	!B.Width.isScalar())
707	return true;
708
709	// Improve estimate for the vector width if it is scalable.
710	unsigned EstimatedWidthA = A.Width.getKnownMinValue();
711	unsigned EstimatedWidthB = B.Width.getKnownMinValue();
712	if (std::optional<unsigned> VScale = Config.getVScaleForTuning()) {
713	if (A.Width.isScalable())
714	EstimatedWidthA = VScale;
715	if (B.Width.isScalable())
716	EstimatedWidthB = VScale;
717	}
718
719	// When optimizing for size choose whichever is smallest, which will be the
720	// one with the smallest cost for the whole loop. On a tie pick the larger
721	// vector width, on the assumption that throughput will be greater.
722	if (Config.CostKind == TTI::TCK_CodeSize)
723	return CostA < CostB \|\|
724	(CostA == CostB && EstimatedWidthA > EstimatedWidthB);
725
726	// Assume vscale may be larger than 1 (or the value being tuned for),
727	// so that scalable vectorization is slightly favorable over fixed-width
728	// vectorization.
729	bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
730	A.Width.isScalable() && !B.Width.isScalable();
731
732	auto CmpFn = [PreferScalable](const InstructionCost &LHS,
733	const InstructionCost &RHS) {
734	return PreferScalable ? LHS <= RHS : LHS < RHS;
735	};
736
737	// To avoid the need for FP division:
738	// (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
739	// <=> (CostA EstimatedWidthB) < (CostB * EstimatedWidthA)*
740	bool LowerCostWithoutTC =
741	CmpFn (CostA * EstimatedWidthB, CostB * EstimatedWidthA);
742	if (!MaxTripCount)
743	return LowerCostWithoutTC;
744
745	auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
746	InstructionCost VectorCost,
747	InstructionCost ScalarCost) {
748	// If the trip count is a known (possibly small) constant, the trip count
749	// will be rounded up to an integer number of iterations under
750	// FoldTailByMasking. The total cost in that case will be
751	// VecCostceil(TripCount/VF). When not folding the tail, the total*
752	// cost will be VecCostfloor(TC/VF) + ScalarCost(TC%VF). There will be
753	// some extra overheads, but for the purpose of comparing the costs of
754	// different VFs we can use this to compare the total loop-body cost
755	// expected after vectorization.
756	if (HasTail)
757	return VectorCost * (MaxTripCount / VF) +
758	ScalarCost * (MaxTripCount % VF);
759	return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF);
760	};
761
762	auto RTCostA = GetCostForTC (EstimatedWidthA, CostA, A.ScalarCost);
763	auto RTCostB = GetCostForTC (EstimatedWidthB, CostB, B.ScalarCost);
764	bool LowerCostWithTC = CmpFn (RTCostA, RTCostB);
765	LLVM_DEBUG(if (LowerCostWithTC != LowerCostWithoutTC) {
766	dbgs() << "LV: VF " << (LowerCostWithTC ? A.Width : B.Width)
767	<< " has lower cost than VF "
768	<< (LowerCostWithTC ? B.Width : A.Width)
769	<< " when taking the cost of the remaining scalar loop iterations "
770	"into consideration for a maximum trip count of "
771	<< MaxTripCount << ".\n";
772	});
773	return LowerCostWithTC;
774	}
775
776	bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
777	const VectorizationFactor &B,
778	bool HasTail,
779	bool IsEpilogue) const {
780	const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
781	return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
782	IsEpilogue);
783	}
784
785	// TODO: we could return a pair of values that specify the max VF and
786	// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
787	// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
788	// doesn't have a cost model that can choose which plan to execute if
789	// more than one is generated.
790	FixedScalableVFPair
791	VFSelectionContext::computeVPlanOuterloopVF(ElementCount UserVF) {
792	if (UserVF.isScalable() && !supportsScalableVectors()) {
793	reportVectorizationFailure(
794	DebugMsg: "Scalable vectorization requested but not supported by the target",
795	OREMsg: "the scalable user-specified vectorization width for outer-loop "
796	"vectorization cannot be used because the target does not support "
797	"scalable vectors.",
798	ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
799	return FixedScalableVFPair::getNone();
800	}
801
802	ElementCount VF = UserVF;
803	if (VF.isZero()) {
804	auto [_, WidestType] = getSmallestAndWidestTypes();
805
806	auto RegKind = TTI.enableScalableVectorization()
807	? TargetTransformInfo::RGK_ScalableVector
808	: TargetTransformInfo::RGK_FixedWidthVector;
809
810	TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind);
811	// The widest type may be wider than the register width and WidestType may
812	// not be a power of two; round the element count down to a power of two.
813	unsigned N = std::max<uint64_t>(
814	a: `1`, b: llvm::bit_floor(Value: RegSize.getKnownMinValue() / WidestType));
815	VF = ElementCount::get(MinVal: N, Scalable: RegSize.isScalable());
816	LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
817
818	// Make sure we have a VF > 1 for stress testing.
819	if (VPlanBuildOuterloopStressTest && VF.isScalar()) {
820	LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
821	<< "overriding computed VF.\n");
822	VF = ElementCount::getFixed(MinVal: `4`);
823	}
824	}
825	assert(isPowerOf2_32(VF.getKnownMinValue()) &&
826	"VF needs to be a power of two");
827	if (VF.isScalar())
828	return FixedScalableVFPair::getNone();
829	LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
830	<< "VF " << VF << " to build VPlans.\n");
831	return FixedScalableVFPair (VF);
832	}
833

Browse the source code of llvm_projects/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp