1//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file provides a LoopVectorizationPlanner class.
11/// InnerLoopVectorizer vectorizes loops which contain only one basic
12/// LoopVectorizationPlanner - drives the vectorization process after having
13/// passed Legality checks.
14/// The planner builds and optimizes the Vectorization Plans which record the
15/// decisions how to vectorize the given loop. In particular, represent the
16/// control-flow of the vectorized version, the replication of instructions that
17/// are to be scalarized, and interleave access groups.
18///
19/// Also provides a VPlan-based builder utility analogous to IRBuilder.
20/// It provides an instruction-level API for generating VPInstructions while
21/// abstracting away the Recipe manipulation details.
22//===----------------------------------------------------------------------===//
23
24#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
25#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
26
27#include "VPlan.h"
28#include "llvm/ADT/SmallSet.h"
29#include "llvm/Analysis/TargetTransformInfo.h"
30#include "llvm/Support/InstructionCost.h"
31
32namespace {
33class GeneratedRTChecks;
34}
35
36namespace llvm {
37
38class LoopInfo;
39class DominatorTree;
40class LoopVectorizationLegality;
41class LoopVectorizationCostModel;
42class PredicatedScalarEvolution;
43class LoopVectorizeHints;
44class RecurrenceDescriptor;
45class LoopVersioning;
46class OptimizationRemarkEmitter;
47class TargetLibraryInfo;
48class VPRecipeBuilder;
49struct VPRegisterUsage;
50struct VFRange;
51
52extern cl::opt<bool> EnableVPlanNativePath;
53extern cl::opt<unsigned> ForceTargetInstructionCost;
54extern cl::opt<bool> PreferInLoopReductions;
55
56/// \return An upper bound for vscale based on TTI or the vscale_range
57/// attribute.
58std::optional<unsigned> getMaxVScale(const Function &F,
59 const TargetTransformInfo &TTI);
60
61// Utility functions that are used by different vectorization classes
62namespace LoopVectorizationUtils {
63
64/// Reports a vectorization failure: print \p DebugMsg for debugging
65/// purposes along with the corresponding optimization remark \p RemarkName.
66/// If \p I is passed, it is an instruction that prevents vectorization.
67/// Otherwise, the loop \p TheLoop is used for the location of the remark.
68void reportVectorizationFailure(const StringRef DebugMsg,
69 const StringRef OREMsg, const StringRef ORETag,
70 OptimizationRemarkEmitter *ORE,
71 const Loop *TheLoop, Instruction *I = nullptr);
72
73/// Same as above, but the debug message and optimization remark are identical
74inline void reportVectorizationFailure(const StringRef DebugMsg,
75 const StringRef ORETag,
76 OptimizationRemarkEmitter *ORE,
77 const Loop *TheLoop,
78 Instruction *I = nullptr) {
79 reportVectorizationFailure(DebugMsg, OREMsg: DebugMsg, ORETag, ORE, TheLoop, I);
80}
81
82/// Reports an informative message: print \p Msg for debugging purposes as well
83/// as an optimization remark. Uses either \p I as location of the remark, or
84/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
85/// remark.
86void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
87 OptimizationRemarkEmitter *ORE,
88 const Loop *TheLoop, Instruction *I = nullptr,
89 DebugLoc DL = {});
90
91/// Report successful vectorization of the loop. In case an outer loop is
92/// vectorized, prepend "outer" to the vectorization remark.
93void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
94 ElementCount VFWidth, unsigned IC);
95
96} // namespace LoopVectorizationUtils
97
98/// VPlan-based builder utility analogous to IRBuilder.
99class VPBuilder {
100 VPBasicBlock *BB = nullptr;
101 VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
102
103 /// Insert \p VPI in BB at InsertPt if BB is set.
104 template <typename T> T *tryInsertInstruction(T *R) {
105 if (BB)
106 BB->insert(Recipe: R, InsertPt);
107 return R;
108 }
109
110 VPInstruction *createInstruction(unsigned Opcode,
111 ArrayRef<VPValue *> Operands,
112 const VPIRMetadata &MD, DebugLoc DL,
113 const Twine &Name = "") {
114 return tryInsertInstruction(
115 R: new VPInstruction(Opcode, Operands, {}, MD, DL, Name));
116 }
117
118public:
119 VPlan &getPlan() const {
120 assert(getInsertBlock() && "Insert block must be set");
121 return *getInsertBlock()->getPlan();
122 }
123
124 VPBuilder() = default;
125 VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); }
126 VPBuilder(VPRecipeBase *InsertPt) { setInsertPoint(InsertPt); }
127 VPBuilder(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
128 setInsertPoint(TheBB, IP);
129 }
130
131 /// Clear the insertion point: created instructions will not be inserted into
132 /// a block.
133 void clearInsertionPoint() {
134 BB = nullptr;
135 InsertPt = VPBasicBlock::iterator();
136 }
137
138 VPBasicBlock *getInsertBlock() const { return BB; }
139 VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
140
141 /// Create a VPBuilder to insert after \p R.
142 static VPBuilder getToInsertAfter(VPRecipeBase *R) {
143 VPBuilder B;
144 B.setInsertPoint(TheBB: R->getParent(), IP: std::next(x: R->getIterator()));
145 return B;
146 }
147
148 /// InsertPoint - A saved insertion point.
149 class VPInsertPoint {
150 VPBasicBlock *Block = nullptr;
151 VPBasicBlock::iterator Point;
152
153 public:
154 /// Creates a new insertion point which doesn't point to anything.
155 VPInsertPoint() = default;
156
157 /// Creates a new insertion point at the given location.
158 VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
159 : Block(InsertBlock), Point(InsertPoint) {}
160
161 /// Returns true if this insert point is set.
162 bool isSet() const { return Block != nullptr; }
163
164 VPBasicBlock *getBlock() const { return Block; }
165 VPBasicBlock::iterator getPoint() const { return Point; }
166 };
167
168 /// Sets the current insert point to a previously-saved location.
169 void restoreIP(VPInsertPoint IP) {
170 if (IP.isSet())
171 setInsertPoint(TheBB: IP.getBlock(), IP: IP.getPoint());
172 else
173 clearInsertionPoint();
174 }
175
176 /// This specifies that created VPInstructions should be appended to the end
177 /// of the specified block.
178 void setInsertPoint(VPBasicBlock *TheBB) {
179 assert(TheBB && "Attempting to set a null insert point");
180 BB = TheBB;
181 InsertPt = BB->end();
182 }
183
184 /// This specifies that created instructions should be inserted at the
185 /// specified point.
186 void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
187 BB = TheBB;
188 InsertPt = IP;
189 }
190
191 /// This specifies that created instructions should be inserted at the
192 /// specified point.
193 void setInsertPoint(VPRecipeBase *IP) {
194 BB = IP->getParent();
195 InsertPt = IP->getIterator();
196 }
197
198 /// Insert \p R at the current insertion point. Returns \p R unchanged.
199 template <typename T> [[maybe_unused]] T *insert(T *R) {
200 BB->insert(Recipe: R, InsertPt);
201 return R;
202 }
203
204 /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
205 /// its underlying Instruction.
206 VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
207 Instruction *Inst = nullptr,
208 const VPIRFlags &Flags = {},
209 const VPIRMetadata &MD = {},
210 DebugLoc DL = DebugLoc::getUnknown(),
211 const Twine &Name = "",
212 Type *ResultTy = nullptr) {
213 VPInstruction *NewVPInst = tryInsertInstruction(
214 R: new VPInstruction(Opcode, Operands, Flags, MD, DL, Name, ResultTy));
215 NewVPInst->setUnderlyingValue(Inst);
216 return NewVPInst;
217 }
218 VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
219 DebugLoc DL, const Twine &Name = "") {
220 return createInstruction(Opcode, Operands, MD: {}, DL, Name);
221 }
222 VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
223 const VPIRFlags &Flags,
224 DebugLoc DL = DebugLoc::getUnknown(),
225 const Twine &Name = "") {
226 return tryInsertInstruction(
227 R: new VPInstruction(Opcode, Operands, Flags, {}, DL, Name));
228 }
229
230 VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
231 Type *ResultTy, const VPIRFlags &Flags = {},
232 DebugLoc DL = DebugLoc::getUnknown(),
233 const Twine &Name = "") {
234 return tryInsertInstruction(R: new VPInstructionWithType(
235 Opcode, Operands, ResultTy, Flags, {}, DL, Name));
236 }
237
238 VPInstruction *createFirstActiveLane(ArrayRef<VPValue *> Masks,
239 DebugLoc DL = DebugLoc::getUnknown(),
240 const Twine &Name = "") {
241 // Assume that the maximum possible number of elements in a vector fits
242 // within the index type for the default address space.
243 VPlan &Plan = getPlan();
244 Type *IndexTy = Plan.getDataLayout().getIndexType(C&: Plan.getContext(), AddressSpace: 0);
245 return tryInsertInstruction(R: new VPInstruction(
246 VPInstruction::FirstActiveLane, Masks, {}, {}, DL, Name, IndexTy));
247 }
248
249 VPInstruction *createLastActiveLane(ArrayRef<VPValue *> Masks,
250 DebugLoc DL = DebugLoc::getUnknown(),
251 const Twine &Name = "") {
252 // Assume that the maximum possible number of elements in a vector fits
253 // within the index type for the default address space.
254 VPlan &Plan = getPlan();
255 Type *IndexTy = Plan.getDataLayout().getIndexType(C&: Plan.getContext(), AddressSpace: 0);
256 return tryInsertInstruction(R: new VPInstruction(
257 VPInstruction::LastActiveLane, Masks, {}, {}, DL, Name, IndexTy));
258 }
259
260 VPInstruction *createOverflowingOp(
261 unsigned Opcode, ArrayRef<VPValue *> Operands,
262 VPRecipeWithIRFlags::WrapFlagsTy WrapFlags = {false, false},
263 DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") {
264 return tryInsertInstruction(
265 R: new VPInstruction(Opcode, Operands, WrapFlags, {}, DL, Name));
266 }
267
268 VPInstruction *createNot(VPValue *Operand,
269 DebugLoc DL = DebugLoc::getUnknown(),
270 const Twine &Name = "") {
271 return createInstruction(Opcode: VPInstruction::Not, Operands: {Operand}, MD: {}, DL, Name);
272 }
273
274 VPInstruction *createAnd(VPValue *LHS, VPValue *RHS,
275 DebugLoc DL = DebugLoc::getUnknown(),
276 const Twine &Name = "") {
277 return createInstruction(Opcode: Instruction::BinaryOps::And, Operands: {LHS, RHS}, MD: {}, DL,
278 Name);
279 }
280
281 VPInstruction *createOr(VPValue *LHS, VPValue *RHS,
282 DebugLoc DL = DebugLoc::getUnknown(),
283 const Twine &Name = "") {
284
285 return tryInsertInstruction(R: new VPInstruction(
286 Instruction::BinaryOps::Or, {LHS, RHS},
287 VPRecipeWithIRFlags::DisjointFlagsTy(false), {}, DL, Name));
288 }
289
290 VPInstruction *
291 createAdd(VPValue *LHS, VPValue *RHS, DebugLoc DL = DebugLoc::getUnknown(),
292 const Twine &Name = "",
293 VPRecipeWithIRFlags::WrapFlagsTy WrapFlags = {false, false}) {
294 return createOverflowingOp(Opcode: Instruction::Add, Operands: {LHS, RHS}, WrapFlags, DL,
295 Name);
296 }
297
298 VPInstruction *
299 createSub(VPValue *LHS, VPValue *RHS, DebugLoc DL = DebugLoc::getUnknown(),
300 const Twine &Name = "",
301 VPRecipeWithIRFlags::WrapFlagsTy WrapFlags = {false, false}) {
302 return createOverflowingOp(Opcode: Instruction::Sub, Operands: {LHS, RHS}, WrapFlags, DL,
303 Name);
304 }
305
306 VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS,
307 DebugLoc DL = DebugLoc::getUnknown(),
308 const Twine &Name = "") {
309 return createNaryOp(Opcode: VPInstruction::LogicalAnd, Operands: {LHS, RHS}, DL, Name);
310 }
311
312 VPInstruction *createLogicalOr(VPValue *LHS, VPValue *RHS,
313 DebugLoc DL = DebugLoc::getUnknown(),
314 const Twine &Name = "") {
315 return createNaryOp(Opcode: VPInstruction::LogicalOr, Operands: {LHS, RHS}, DL, Name);
316 }
317
318 VPInstruction *createSelect(VPValue *Cond, VPValue *TrueVal,
319 VPValue *FalseVal,
320 DebugLoc DL = DebugLoc::getUnknown(),
321 const Twine &Name = "",
322 const VPIRFlags &Flags = {}) {
323 return tryInsertInstruction(R: new VPInstruction(
324 Instruction::Select, {Cond, TrueVal, FalseVal}, Flags, {}, DL, Name));
325 }
326
327 /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A
328 /// and \p B.
329 VPInstruction *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
330 DebugLoc DL = DebugLoc::getUnknown(),
331 const Twine &Name = "") {
332 assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
333 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
334 return tryInsertInstruction(
335 R: new VPInstruction(Instruction::ICmp, {A, B}, Pred, {}, DL, Name));
336 }
337
338 /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A
339 /// and \p B.
340 VPInstruction *createFCmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
341 DebugLoc DL = DebugLoc::getUnknown(),
342 const Twine &Name = "") {
343 assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE &&
344 Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate");
345 return tryInsertInstruction(
346 R: new VPInstruction(Instruction::FCmp, {A, B},
347 VPIRFlags(Pred, FastMathFlags()), {}, DL, Name));
348 }
349
350 /// Create an AnyOf reduction pattern: or-reduce \p ChainOp, freeze the
351 /// result, then select between \p TrueVal and \p FalseVal.
352 VPInstruction *createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal,
353 VPValue *FalseVal,
354 DebugLoc DL = DebugLoc::getUnknown());
355
356 VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset,
357 DebugLoc DL = DebugLoc::getUnknown(),
358 const Twine &Name = "") {
359 return tryInsertInstruction(
360 R: new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
361 GEPNoWrapFlags::none(), {}, DL, Name));
362 }
363
364 VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset,
365 GEPNoWrapFlags GEPFlags,
366 DebugLoc DL = DebugLoc::getUnknown(),
367 const Twine &Name = "") {
368 return tryInsertInstruction(R: new VPInstruction(
369 VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, {}, DL, Name));
370 }
371
372 VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset,
373 DebugLoc DL = DebugLoc::getUnknown(),
374 const Twine &Name = "") {
375 return tryInsertInstruction(
376 R: new VPInstruction(VPInstruction::WidePtrAdd, {Ptr, Offset},
377 GEPNoWrapFlags::none(), {}, DL, Name));
378 }
379
380 VPPhi *createScalarPhi(ArrayRef<VPValue *> IncomingValues,
381 DebugLoc DL = DebugLoc::getUnknown(),
382 const Twine &Name = "", const VPIRFlags &Flags = {},
383 Type *ResultTy = nullptr) {
384 return tryInsertInstruction(
385 R: new VPPhi(IncomingValues, Flags, DL, Name, ResultTy));
386 }
387
388 VPWidenPHIRecipe *createWidenPhi(ArrayRef<VPValue *> IncomingValues,
389 DebugLoc DL = DebugLoc::getUnknown(),
390 const Twine &Name = "") {
391 return tryInsertInstruction(R: new VPWidenPHIRecipe(IncomingValues, DL, Name));
392 }
393
394 VPValue *createElementCount(Type *Ty, ElementCount EC) {
395 VPlan &Plan = *getInsertBlock()->getPlan();
396 VPValue *RuntimeEC = Plan.getConstantInt(Ty, Val: EC.getKnownMinValue());
397 if (EC.isScalable()) {
398 VPValue *VScale = createNaryOp(Opcode: VPInstruction::VScale, Operands: {}, ResultTy: Ty);
399 RuntimeEC = EC.getKnownMinValue() == 1
400 ? VScale
401 : createOverflowingOp(Opcode: Instruction::Mul,
402 Operands: {VScale, RuntimeEC}, WrapFlags: {true, false});
403 }
404 return RuntimeEC;
405 }
406
407 /// Convert the input value \p Current to the corresponding value of an
408 /// induction with \p Start and \p Step values, using \p Start + \p Current *
409 /// \p Step.
410 VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
411 FPMathOperator *FPBinOp, VPIRValue *Start,
412 VPValue *Current, VPValue *Step) {
413 return tryInsertInstruction(
414 R: new VPDerivedIVRecipe(Kind, FPBinOp, Start, Current, Step));
415 }
416
417 VPInstructionWithType *createScalarLoad(Type *ResultTy, VPValue *Addr,
418 DebugLoc DL,
419 const VPIRMetadata &Metadata = {}) {
420 return tryInsertInstruction(R: new VPInstructionWithType(
421 Instruction::Load, Addr, ResultTy, {}, Metadata, DL));
422 }
423
424 VPInstruction *createScalarCast(Instruction::CastOps Opcode, VPValue *Op,
425 Type *ResultTy, DebugLoc DL,
426 const VPIRMetadata &Metadata = {}) {
427 return tryInsertInstruction(R: new VPInstructionWithType(
428 Opcode, Op, ResultTy, VPIRFlags::getDefaultFlags(Opcode), Metadata,
429 DL));
430 }
431
432 VPInstruction *createScalarCast(Instruction::CastOps Opcode, VPValue *Op,
433 Type *ResultTy, DebugLoc DL,
434 const VPIRFlags &Flags,
435 const VPIRMetadata &Metadata = {}) {
436 return tryInsertInstruction(
437 R: new VPInstructionWithType(Opcode, Op, ResultTy, Flags, Metadata, DL));
438 }
439
440 VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy,
441 DebugLoc DL) {
442 if (ResultTy == SrcTy)
443 return Op;
444 Instruction::CastOps CastOp =
445 ResultTy->getScalarSizeInBits() < SrcTy->getScalarSizeInBits()
446 ? Instruction::Trunc
447 : Instruction::ZExt;
448 return createScalarCast(Opcode: CastOp, Op, ResultTy, DL);
449 }
450
451 VPValue *createScalarSExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy,
452 DebugLoc DL) {
453 if (ResultTy == SrcTy)
454 return Op;
455 Instruction::CastOps CastOp =
456 ResultTy->getScalarSizeInBits() < SrcTy->getScalarSizeInBits()
457 ? Instruction::Trunc
458 : Instruction::SExt;
459 return createScalarCast(Opcode: CastOp, Op, ResultTy, DL);
460 }
461
462 VPValue *createScalarFreeze(VPValue *Op, Type *ResultTy, DebugLoc DL) {
463 return tryInsertInstruction(
464 R: new VPInstruction(Instruction::Freeze, Op, {}, {}, DL));
465 }
466
467 VPWidenCastRecipe *createWidenCast(Instruction::CastOps Opcode, VPValue *Op,
468 Type *ResultTy) {
469 return tryInsertInstruction(R: new VPWidenCastRecipe(
470 Opcode, Op, ResultTy, nullptr, VPIRFlags::getDefaultFlags(Opcode)));
471 }
472
473 /// Create a single-scalar recipe with \p Opcode and \p Operands without
474 /// inserting it.
475 static VPSingleDefRecipe *createSingleScalarOp(unsigned Opcode,
476 ArrayRef<VPValue *> Operands,
477 VPValue *Mask,
478 const VPIRFlags &Flags,
479 const VPIRMetadata &Metadata,
480 DebugLoc DL, Instruction *UV) {
481 if (Instruction::isCast(Opcode)) {
482 assert(!Mask && "Cast cannot be predicated");
483 return new VPInstructionWithType(Opcode, Operands, UV->getType(), Flags,
484 Metadata, DL, UV->getName(), UV);
485 }
486 return new VPReplicateRecipe(UV, Operands, /*IsSingleScalar=*/true, Mask,
487 Flags, Metadata, DL);
488 }
489
490 VPScalarIVStepsRecipe *
491 createScalarIVSteps(Instruction::BinaryOps InductionOpcode,
492 FPMathOperator *FPBinOp, VPValue *IV, VPValue *Step,
493 VPValue *VF, DebugLoc DL) {
494 return tryInsertInstruction(R: new VPScalarIVStepsRecipe(
495 IV, Step, VF, InductionOpcode,
496 FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags(), DL));
497 }
498
499 VPExpandSCEVRecipe *createExpandSCEV(const SCEV *Expr) {
500 return tryInsertInstruction(R: new VPExpandSCEVRecipe(Expr));
501 }
502
503 VPVectorPointerRecipe *
504 createVectorPointer(VPValue *Ptr, Type *SourceElementTy, VPValue *Stride,
505 GEPNoWrapFlags GEPFlags, DebugLoc DL) {
506 return tryInsertInstruction(
507 R: new VPVectorPointerRecipe(Ptr, SourceElementTy, Stride, GEPFlags, DL));
508 }
509
510 VPWidenMemIntrinsicRecipe *createWidenMemIntrinsic(
511 Intrinsic::ID VectorIntrinsicID, ArrayRef<VPValue *> CallArguments,
512 Type *Ty, Align Alignment, const VPIRMetadata &MD, DebugLoc DL) {
513 return tryInsertInstruction(R: new VPWidenMemIntrinsicRecipe(
514 VectorIntrinsicID, CallArguments, Ty, Alignment, MD, DL));
515 }
516
517 //===--------------------------------------------------------------------===//
518 // RAII helpers.
519 //===--------------------------------------------------------------------===//
520
521 /// RAII object that stores the current insertion point and restores it when
522 /// the object is destroyed.
523 class InsertPointGuard {
524 VPBuilder &Builder;
525 VPBasicBlock *Block;
526 VPBasicBlock::iterator Point;
527
528 public:
529 InsertPointGuard(VPBuilder &B)
530 : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}
531
532 InsertPointGuard(const InsertPointGuard &) = delete;
533 InsertPointGuard &operator=(const InsertPointGuard &) = delete;
534
535 ~InsertPointGuard() { Builder.restoreIP(IP: VPInsertPoint(Block, Point)); }
536 };
537};
538
539/// TODO: The following VectorizationFactor was pulled out of
540/// LoopVectorizationCostModel class. LV also deals with
541/// VectorizerParams::VectorizationFactor.
542/// We need to streamline them.
543
544/// Information about vectorization costs.
545struct VectorizationFactor {
546 /// Vector width with best cost.
547 ElementCount Width;
548
549 /// Cost of the loop with that width.
550 InstructionCost Cost;
551
552 /// Cost of the scalar loop.
553 InstructionCost ScalarCost;
554
555 /// The minimum trip count required to make vectorization profitable, e.g. due
556 /// to runtime checks.
557 ElementCount MinProfitableTripCount;
558
559 VectorizationFactor(ElementCount Width, InstructionCost Cost,
560 InstructionCost ScalarCost)
561 : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {}
562
563 /// Width 1 means no vectorization, cost 0 means uncomputed cost.
564 static VectorizationFactor Disabled() {
565 return {ElementCount::getFixed(MinVal: 1), 0, 0};
566 }
567
568 bool operator==(const VectorizationFactor &rhs) const {
569 return Width == rhs.Width && Cost == rhs.Cost;
570 }
571
572 bool operator!=(const VectorizationFactor &rhs) const {
573 return !(*this == rhs);
574 }
575};
576
577/// A class that represents two vectorization factors (initialized with 0 by
578/// default). One for fixed-width vectorization and one for scalable
579/// vectorization. This can be used by the vectorizer to choose from a range of
580/// fixed and/or scalable VFs in order to find the most cost-effective VF to
581/// vectorize with.
582struct FixedScalableVFPair {
583 ElementCount FixedVF;
584 ElementCount ScalableVF;
585
586 FixedScalableVFPair()
587 : FixedVF(ElementCount::getFixed(MinVal: 0)),
588 ScalableVF(ElementCount::getScalable(MinVal: 0)) {}
589 FixedScalableVFPair(const ElementCount &Max) : FixedScalableVFPair() {
590 *(Max.isScalable() ? &ScalableVF : &FixedVF) = Max;
591 }
592 FixedScalableVFPair(const ElementCount &FixedVF,
593 const ElementCount &ScalableVF)
594 : FixedVF(FixedVF), ScalableVF(ScalableVF) {
595 assert(!FixedVF.isScalable() && ScalableVF.isScalable() &&
596 "Invalid scalable properties");
597 }
598
599 static FixedScalableVFPair getNone() { return FixedScalableVFPair(); }
600
601 /// \return true if either fixed- or scalable VF is non-zero.
602 explicit operator bool() const { return FixedVF || ScalableVF; }
603
604 /// \return true if either fixed- or scalable VF is a valid vector VF.
605 bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); }
606};
607
608/// Holds state needed to make cost decisions before computing costs per-VF,
609/// including the maximum VFs.
610class VFSelectionContext {
611 /// \return True if maximizing vector bandwidth is enabled by the target or
612 /// user options, for the given register kind (scalable or fixed-width).
613 bool useMaxBandwidth(bool IsScalable) const;
614
615 /// \return the maximized element count based on the targets vector
616 /// registers and the loop trip-count, but limited to a maximum safe VF.
617 /// This is a helper function of computeFeasibleMaxVF.
618 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
619 unsigned SmallestType,
620 unsigned WidestType,
621 ElementCount MaxSafeVF, unsigned UserIC,
622 bool FoldTailByMasking,
623 bool RequiresScalarEpilogue);
624
625 /// If \p VF * \p UserIC > MaxTripcount, clamps VF to the next lower VF
626 /// that results in VF * UserIC <= MaxTripCount.
627 ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
628 unsigned UserIC, bool FoldTailByMasking,
629 bool RequiresScalarEpilogue) const;
630
631 /// Checks if scalable vectorization is supported and enabled. Caches the
632 /// result to avoid repeated debug dumps for repeated queries.
633 bool isScalableVectorizationAllowed();
634
635 /// \return the maximum legal scalable VF, based on the safe max number
636 /// of elements.
637 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
638
639 /// Initializes the value of vscale used for tuning the cost model. If
640 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
641 /// return the value returned by the corresponding TTI method.
642 void initializeVScaleForTuning();
643
644 const TargetTransformInfo &TTI;
645 const LoopVectorizationLegality *Legal;
646 const Loop *TheLoop;
647 const Function &F;
648 PredicatedScalarEvolution &PSE;
649 DemandedBits *DB;
650 OptimizationRemarkEmitter *ORE;
651 const LoopVectorizeHints *Hints;
652
653 /// Cached result of isScalableVectorizationAllowed.
654 std::optional<bool> IsScalableVectorizationAllowed;
655
656 /// Used to store the value of vscale used for tuning the cost model. It is
657 /// initialized during object construction.
658 std::optional<unsigned> VScaleForTuning;
659
660 /// The highest VF possible for this loop, without using MaxBandwidth.
661 FixedScalableVFPair MaxPermissibleVFWithoutMaxBW;
662
663 /// All element types found in the loop.
664 SmallPtrSet<Type *, 16> ElementTypesInLoop;
665
666 /// PHINodes of the reductions that should be expanded in-loop. Set by
667 /// collectInLoopReductions.
668 SmallPtrSet<PHINode *, 4> InLoopReductions;
669
670 /// A Map of inloop reduction operations and their immediate chain operand.
671 /// FIXME: This can be removed once reductions can be costed correctly in
672 /// VPlan. This was added to allow quick lookup of the inloop operations.
673 /// Set by collectInLoopReductions.
674 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
675
676 /// Maximum safe number of elements to be processed per vector iteration,
677 /// which do not prevent store-load forwarding and are safe with regard to the
678 /// memory dependencies. Required for EVL-based vectorization, where this
679 /// value is used as the upper bound of the safe AVL. Set by
680 /// computeFeasibleMaxVF.
681 std::optional<unsigned> MaxSafeElements;
682
683 /// Map of scalar integer values to the smallest bitwidth they can be legally
684 /// represented as. The vector equivalents of these values should be truncated
685 /// to this type.
686 MapVector<Instruction *, uint64_t> MinBWs;
687
688public:
689 /// The kind of cost that we are calculating.
690 const TTI::TargetCostKind CostKind;
691
692 /// Whether this loop should be optimized for size based on function attribute
693 /// or profile information.
694 const bool OptForSize;
695
696 VFSelectionContext(const TargetTransformInfo &TTI,
697 const LoopVectorizationLegality *Legal,
698 const Loop *TheLoop, const Function &F,
699 PredicatedScalarEvolution &PSE, DemandedBits *DB,
700 OptimizationRemarkEmitter *ORE,
701 const LoopVectorizeHints *Hints, bool OptForSize)
702 : TTI(TTI), Legal(Legal), TheLoop(TheLoop), F(F), PSE(PSE), DB(DB),
703 ORE(ORE), Hints(Hints),
704 CostKind(F.hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput),
705 OptForSize(OptForSize) {
706 initializeVScaleForTuning();
707 }
708
709 /// \return The vscale value used for tuning the cost model.
710 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
711
712 /// \return True if register pressure should be considered for the given VF.
713 bool shouldConsiderRegPressureForVF(ElementCount VF) const;
714
715 /// \return True if scalable vectors are supported by the target or forced.
716 bool supportsScalableVectors() const;
717
718 /// Collect element types in the loop that need widening.
719 void collectElementTypesForWidening(
720 const SmallPtrSetImpl<const Value *> *ValuesToIgnore = nullptr);
721
722 /// \return The size (in bits) of the smallest and widest types in the code
723 /// that need to be vectorized. We ignore values that remain scalar such as
724 /// 64 bit loop indices.
725 std::pair<unsigned, unsigned> getSmallestAndWidestTypes() const;
726
727 /// \return An upper bound for the vectorization factors for both
728 /// fixed and scalable vectorization, where the minimum-known number of
729 /// elements is a power-of-2 larger than zero. If scalable vectorization is
730 /// disabled or unsupported, then the scalable part will be equal to
731 /// ElementCount::getScalable(0). Also sets MaxSafeElements.
732 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
733 ElementCount UserVF, unsigned UserIC,
734 bool FoldTailByMasking,
735 bool RequiresScalarEpilogue);
736
737 /// Return maximum safe number of elements to be processed per vector
738 /// iteration, which do not prevent store-load forwarding and are safe with
739 /// regard to the memory dependencies. Required for EVL-based VPlans to
740 /// correctly calculate AVL (application vector length) as min(remaining AVL,
741 /// MaxSafeElements). Set by computeFeasibleMaxVF.
742 /// TODO: need to consider adjusting cost model to use this value as a
743 /// vectorization factor for EVL-based vectorization.
744 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
745
746 /// Returns true if we should use strict in-order reductions for the given
747 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
748 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
749 /// of FP operations.
750 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const;
751
752 /// Returns true if the target machine supports masked loads or stores
753 /// for \p I's data type and alignment. The caller must ensure the access is
754 /// consecutive or part of an interleave group.
755 bool isLegalMaskedLoadOrStore(Instruction *I, ElementCount VF) const;
756
757 /// Returns true if the target machine can represent \p V as a masked gather
758 /// or scatter operation.
759 bool isLegalGatherOrScatter(Value *V, ElementCount VF) const;
760
761 /// Split reductions into those that happen in the loop, and those that
762 /// happen outside. In-loop reductions are collected into InLoopReductions.
763 /// InLoopReductionImmediateChains is filled with each in-loop reduction
764 /// operation and its immediate chain operand for use during cost modelling.
765 void collectInLoopReductions();
766
767 /// Returns true if the Phi is part of an inloop reduction.
768 bool isInLoopReduction(PHINode *Phi) const {
769 return InLoopReductions.contains(Ptr: Phi);
770 }
771
772 /// Returns the set of in-loop reduction PHIs.
773 const SmallPtrSetImpl<PHINode *> &getInLoopReductions() const {
774 return InLoopReductions;
775 }
776
777 /// Returns the immediate chain operand of in-loop reduction operation \p I,
778 /// or nullptr if \p I is not an in-loop reduction operation.
779 Instruction *getInLoopReductionImmediateChain(Instruction *I) const {
780 return InLoopReductionImmediateChains.lookup(Val: I);
781 }
782
783 /// Check whether vectorization would require runtime checks. When optimizing
784 /// for size, returning true here aborts vectorization.
785 bool runtimeChecksRequired();
786
787 /// Returns a scalable VF to use for outer-loop vectorization if the target
788 /// supports it and a fixed VF otherwise.
789 FixedScalableVFPair computeVPlanOuterloopVF(ElementCount UserVF);
790
791 /// Compute smallest bitwidth each instruction can be represented with.
792 /// The vector equivalents of these instructions should be truncated to this
793 /// type.
794 void computeMinimalBitwidths();
795
796 /// \returns The smallest bitwidth each instruction can be represented with.
797 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
798 return MinBWs;
799 }
800};
801
802/// Planner drives the vectorization process after having passed
803/// Legality checks.
804class LoopVectorizationPlanner {
805 /// The loop that we evaluate.
806 Loop *OrigLoop;
807
808 /// Loop Info analysis.
809 LoopInfo *LI;
810
811 /// The dominator tree.
812 DominatorTree *DT;
813
814 /// Target Library Info.
815 const TargetLibraryInfo *TLI;
816
817 /// Target Transform Info.
818 const TargetTransformInfo &TTI;
819
820 /// The legality analysis.
821 LoopVectorizationLegality *Legal;
822
823 /// The profitability analysis.
824 LoopVectorizationCostModel &CM;
825
826 /// VF selection state independent of cost-modeling decisions.
827 VFSelectionContext &Config;
828
829 /// The interleaved access analysis.
830 InterleavedAccessInfo &IAI;
831
832 PredicatedScalarEvolution &PSE;
833
834 const LoopVectorizeHints &Hints;
835
836 OptimizationRemarkEmitter *ORE;
837
838 SmallVector<VPlanPtr, 4> VPlans;
839
840 /// Profitable vector factors.
841 SmallVector<VectorizationFactor, 8> ProfitableVFs;
842
843 /// A builder used to construct the current plan.
844 VPBuilder Builder;
845
846 /// Computes the cost of \p Plan for vectorization factor \p VF.
847 ///
848 /// The current implementation requires access to the
849 /// LoopVectorizationLegality to handle inductions and reductions, which is
850 /// why it is kept separate from the VPlan-only cost infrastructure.
851 ///
852 /// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
853 /// been retired.
854 InstructionCost cost(VPlan &Plan, ElementCount VF, VPRegisterUsage *RU) const;
855
856 /// Precompute costs for certain instructions using the legacy cost model. The
857 /// function is used to bring up the VPlan-based cost model to initially avoid
858 /// taking different decisions due to inaccuracies in the legacy cost model.
859 InstructionCost precomputeCosts(VPlan &Plan, ElementCount VF,
860 VPCostContext &CostCtx) const;
861
862public:
863 LoopVectorizationPlanner(
864 Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
865 const TargetTransformInfo &TTI, LoopVectorizationLegality *Legal,
866 LoopVectorizationCostModel &CM, VFSelectionContext &Config,
867 InterleavedAccessInfo &IAI, PredicatedScalarEvolution &PSE,
868 const LoopVectorizeHints &Hints, OptimizationRemarkEmitter *ORE)
869 : OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
870 Config(Config), IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {}
871
872 /// Build VPlans for the specified \p UserVF and \p UserIC if they are
873 /// non-zero or all applicable candidate VFs otherwise. If vectorization and
874 /// interleaving should be avoided up-front, no plans are generated.
875 void plan(ElementCount UserVF, unsigned UserIC);
876
877 /// Return the VPlan for \p VF. At the moment, there is always a single VPlan
878 /// for each VF.
879 VPlan &getPlanFor(ElementCount VF) const;
880
881 /// Compute and return the most profitable vectorization factor and the
882 /// corresponding best VPlan. Also collect all profitable VFs in
883 /// ProfitableVFs.
884 std::pair<VectorizationFactor, VPlan *> computeBestVF();
885
886 /// \return The desired interleave count.
887 /// If interleave count has been specified by metadata it will be returned.
888 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
889 /// are the selected vectorization factor and the cost of the selected VF.
890 unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
891 InstructionCost LoopCost);
892
893 /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
894 /// according to the best selected \p VF and \p UF.
895 ///
896 /// TODO: \p EpilogueVecKind should be removed once the re-use issue has been
897 /// fixed.
898 ///
899 /// Returns a mapping of SCEVs to their expanded IR values.
900 /// Note that this is a temporary workaround needed due to the current
901 /// epilogue handling.
902 enum class EpilogueVectorizationKind {
903 None, ///< Not part of epilogue vectorization.
904 MainLoop, ///< Vectorizing the main loop of epilogue vectorization.
905 Epilogue ///< Vectorizing the epilogue loop.
906 };
907 DenseMap<const SCEV *, Value *>
908 executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
909 InnerLoopVectorizer &LB, DominatorTree *DT,
910 EpilogueVectorizationKind EpilogueVecKind =
911 EpilogueVectorizationKind::None);
912
913#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
914 void printPlans(raw_ostream &O);
915#endif
916
917 /// Look through the existing plans and return true if we have one with
918 /// vectorization factor \p VF.
919 bool hasPlanWithVF(ElementCount VF) const {
920 return any_of(Range: VPlans,
921 P: [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
922 }
923
924 /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
925 /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
926 /// returned value holds for the entire \p Range.
927 static bool
928 getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
929 VFRange &Range);
930
931 /// \return A VPlan for the most profitable epilogue vectorization, with its
932 /// VF narrowed to the chosen factor. The returned plan is a duplicate.
933 /// Returns nullptr if epilogue vectorization is not supported or not
934 /// profitable for the loop.
935 std::unique_ptr<VPlan>
936 selectBestEpiloguePlan(VPlan &MainPlan, ElementCount MainLoopVF, unsigned IC);
937
938 /// Emit remarks for recipes with invalid costs in the available VPlans.
939 void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);
940
941 /// Create a check to \p Plan to see if the vector loop should be executed
942 /// based on its trip count.
943 void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF,
944 ElementCount MinProfitableTripCount) const;
945
946 /// Attach the runtime checks of \p RTChecks to \p Plan.
947 void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
948 bool HasBranchWeights) const;
949
950 /// Update loop metadata and profile info for both the scalar remainder loop
951 /// and \p VectorLoop, if it exists. Keeps all loop hints from the original
952 /// loop on the vector loop and replaces vectorizer-specific metadata. The
953 /// loop ID of the original loop \p OrigLoopID must be passed, together with
954 /// the average trip count and invocation weight of the original loop (\p
955 /// OrigAverageTripCount and \p OrigLoopInvocationWeight respectively). They
956 /// cannot be retrieved after the plan has been executed, as the original loop
957 /// may have been removed.
958 void updateLoopMetadataAndProfileInfo(
959 Loop *VectorLoop, VPBasicBlock *HeaderVPBB, const VPlan &Plan,
960 bool VectorizingEpilogue, MDNode *OrigLoopID,
961 std::optional<unsigned> OrigAverageTripCount,
962 unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF,
963 bool DisableRuntimeUnroll);
964
965private:
966 /// Build an initial VPlan, with HCFG wrapping the original scalar loop and
967 /// scalar transformations applied. Returns null if an initial VPlan cannot
968 /// be built.
969 VPlanPtr tryToBuildVPlan1();
970
971 /// Build a VPlan using VPRecipes according to the information gathered by
972 /// Legal and VPlan-based analysis. For outer loops, performs basic recipe
973 /// conversion only. For inner loops, \p Range's largest included VF is
974 /// restricted to the maximum VF the returned VPlan is valid for. If no VPlan
975 /// can be built for the input range, set the largest included VF to the
976 /// maximum VF for which no plan could be built. Each VPlan is built starting
977 /// from a copy of \p InitialPlan, which is a plain CFG VPlan wrapping the
978 /// original scalar loop.
979 VPlanPtr tryToBuildVPlan(VPlanPtr InitialPlan, VFRange &Range);
980
981 /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
982 /// based on \p VPlan1 and according to the information gathered by Legal
983 /// when it checked if it is legal to vectorize the loop.
984 void buildVPlans(VPlan &VPlan1, ElementCount MinVF, ElementCount MaxVF);
985
986 /// Add ComputeReductionResult recipes to the middle block to compute the
987 /// final reduction results. Add Select recipes to the latch block when
988 /// folding tail, to feed ComputeReductionResult with the last or penultimate
989 /// iteration values according to the header mask.
990 void addReductionResultComputation(VPlanPtr &Plan,
991 VPRecipeBuilder &RecipeBuilder,
992 ElementCount MinVF);
993
994 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
995 /// that of B.
996 bool isMoreProfitable(const VectorizationFactor &A,
997 const VectorizationFactor &B, bool HasTail,
998 bool IsEpilogue = false) const;
999
1000 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1001 /// that of B in the context of vectorizing a loop with known \p MaxTripCount.
1002 bool isMoreProfitable(const VectorizationFactor &A,
1003 const VectorizationFactor &B,
1004 const unsigned MaxTripCount, bool HasTail,
1005 bool IsEpilogue = false) const;
1006
1007 /// Determines if we have the infrastructure to vectorize the loop and its
1008 /// epilogue, assuming the main loop is vectorized by \p MainPlan.
1009 bool isCandidateForEpilogueVectorization(VPlan &MainPlan) const;
1010};
1011
1012/// A helper function that returns true if the given type is irregular. The
1013/// type is irregular if its allocated size doesn't equal the store size of an
1014/// element of the corresponding vector type.
1015inline bool hasIrregularType(Type *Ty, const DataLayout &DL) {
1016 // Determine if an array of N elements of type Ty is "bitcast compatible"
1017 // with a <N x Ty> vector.
1018 // This is only true if there is no padding between the array elements.
1019 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
1020}
1021
1022} // namespace llvm
1023
1024#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
1025