1 | //===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This file provides a LoopVectorizationPlanner class. |
11 | /// InnerLoopVectorizer vectorizes loops which contain only one basic |
12 | /// LoopVectorizationPlanner - drives the vectorization process after having |
13 | /// passed Legality checks. |
14 | /// The planner builds and optimizes the Vectorization Plans which record the |
15 | /// decisions how to vectorize the given loop. In particular, represent the |
16 | /// control-flow of the vectorized version, the replication of instructions that |
17 | /// are to be scalarized, and interleave access groups. |
18 | /// |
19 | /// Also provides a VPlan-based builder utility analogous to IRBuilder. |
20 | /// It provides an instruction-level API for generating VPInstructions while |
21 | /// abstracting away the Recipe manipulation details. |
22 | //===----------------------------------------------------------------------===// |
23 | |
24 | #ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H |
25 | #define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H |
26 | |
27 | #include "VPlan.h" |
28 | #include "llvm/ADT/SmallSet.h" |
29 | #include "llvm/Support/InstructionCost.h" |
30 | |
31 | namespace llvm { |
32 | |
33 | class LoopInfo; |
34 | class DominatorTree; |
35 | class LoopVectorizationLegality; |
36 | class LoopVectorizationCostModel; |
37 | class PredicatedScalarEvolution; |
38 | class LoopVectorizeHints; |
39 | class ; |
40 | class TargetTransformInfo; |
41 | class TargetLibraryInfo; |
42 | class VPRecipeBuilder; |
43 | |
44 | /// VPlan-based builder utility analogous to IRBuilder. |
45 | class VPBuilder { |
46 | VPBasicBlock *BB = nullptr; |
47 | VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); |
48 | |
49 | /// Insert \p VPI in BB at InsertPt if BB is set. |
50 | VPInstruction *tryInsertInstruction(VPInstruction *VPI) { |
51 | if (BB) |
52 | BB->insert(Recipe: VPI, InsertPt); |
53 | return VPI; |
54 | } |
55 | |
56 | VPInstruction *createInstruction(unsigned Opcode, |
57 | ArrayRef<VPValue *> Operands, DebugLoc DL, |
58 | const Twine &Name = "" ) { |
59 | return tryInsertInstruction(VPI: new VPInstruction(Opcode, Operands, DL, Name)); |
60 | } |
61 | |
62 | VPInstruction *createInstruction(unsigned Opcode, |
63 | std::initializer_list<VPValue *> Operands, |
64 | DebugLoc DL, const Twine &Name = "" ) { |
65 | return createInstruction(Opcode, Operands: ArrayRef<VPValue *>(Operands), DL, Name); |
66 | } |
67 | |
68 | public: |
69 | VPBuilder() = default; |
70 | VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); } |
71 | VPBuilder(VPRecipeBase *InsertPt) { setInsertPoint(InsertPt); } |
72 | |
73 | /// Clear the insertion point: created instructions will not be inserted into |
74 | /// a block. |
75 | void clearInsertionPoint() { |
76 | BB = nullptr; |
77 | InsertPt = VPBasicBlock::iterator(); |
78 | } |
79 | |
80 | VPBasicBlock *getInsertBlock() const { return BB; } |
81 | VPBasicBlock::iterator getInsertPoint() const { return InsertPt; } |
82 | |
83 | /// Create a VPBuilder to insert after \p R. |
84 | static VPBuilder getToInsertAfter(VPRecipeBase *R) { |
85 | VPBuilder B; |
86 | B.setInsertPoint(TheBB: R->getParent(), IP: std::next(x: R->getIterator())); |
87 | return B; |
88 | } |
89 | |
90 | /// InsertPoint - A saved insertion point. |
91 | class VPInsertPoint { |
92 | VPBasicBlock *Block = nullptr; |
93 | VPBasicBlock::iterator Point; |
94 | |
95 | public: |
96 | /// Creates a new insertion point which doesn't point to anything. |
97 | VPInsertPoint() = default; |
98 | |
99 | /// Creates a new insertion point at the given location. |
100 | VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint) |
101 | : Block(InsertBlock), Point(InsertPoint) {} |
102 | |
103 | /// Returns true if this insert point is set. |
104 | bool isSet() const { return Block != nullptr; } |
105 | |
106 | VPBasicBlock *getBlock() const { return Block; } |
107 | VPBasicBlock::iterator getPoint() const { return Point; } |
108 | }; |
109 | |
110 | /// Sets the current insert point to a previously-saved location. |
111 | void restoreIP(VPInsertPoint IP) { |
112 | if (IP.isSet()) |
113 | setInsertPoint(TheBB: IP.getBlock(), IP: IP.getPoint()); |
114 | else |
115 | clearInsertionPoint(); |
116 | } |
117 | |
118 | /// This specifies that created VPInstructions should be appended to the end |
119 | /// of the specified block. |
120 | void setInsertPoint(VPBasicBlock *TheBB) { |
121 | assert(TheBB && "Attempting to set a null insert point" ); |
122 | BB = TheBB; |
123 | InsertPt = BB->end(); |
124 | } |
125 | |
126 | /// This specifies that created instructions should be inserted at the |
127 | /// specified point. |
128 | void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) { |
129 | BB = TheBB; |
130 | InsertPt = IP; |
131 | } |
132 | |
133 | /// This specifies that created instructions should be inserted at the |
134 | /// specified point. |
135 | void setInsertPoint(VPRecipeBase *IP) { |
136 | BB = IP->getParent(); |
137 | InsertPt = IP->getIterator(); |
138 | } |
139 | |
140 | /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as |
141 | /// its underlying Instruction. |
142 | VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, |
143 | Instruction *Inst = nullptr, |
144 | const Twine &Name = "" ) { |
145 | DebugLoc DL; |
146 | if (Inst) |
147 | DL = Inst->getDebugLoc(); |
148 | VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name); |
149 | NewVPInst->setUnderlyingValue(Inst); |
150 | return NewVPInst; |
151 | } |
152 | VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, |
153 | DebugLoc DL, const Twine &Name = "" ) { |
154 | return createInstruction(Opcode, Operands, DL, Name); |
155 | } |
156 | |
157 | VPInstruction *createOverflowingOp(unsigned Opcode, |
158 | std::initializer_list<VPValue *> Operands, |
159 | VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, |
160 | DebugLoc DL = {}, const Twine &Name = "" ) { |
161 | return tryInsertInstruction( |
162 | VPI: new VPInstruction(Opcode, Operands, WrapFlags, DL, Name)); |
163 | } |
164 | VPValue *createNot(VPValue *Operand, DebugLoc DL = {}, |
165 | const Twine &Name = "" ) { |
166 | return createInstruction(Opcode: VPInstruction::Not, Operands: {Operand}, DL, Name); |
167 | } |
168 | |
169 | VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, |
170 | const Twine &Name = "" ) { |
171 | return createInstruction(Opcode: Instruction::BinaryOps::And, Operands: {LHS, RHS}, DL, Name); |
172 | } |
173 | |
174 | VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, |
175 | const Twine &Name = "" ) { |
176 | |
177 | return tryInsertInstruction(VPI: new VPInstruction( |
178 | Instruction::BinaryOps::Or, {LHS, RHS}, |
179 | VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name)); |
180 | } |
181 | |
182 | VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, |
183 | const Twine &Name = "" ) { |
184 | return tryInsertInstruction( |
185 | VPI: new VPInstruction(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name)); |
186 | } |
187 | |
188 | VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, |
189 | DebugLoc DL = {}, const Twine &Name = "" , |
190 | std::optional<FastMathFlags> FMFs = std::nullopt) { |
191 | auto *Select = |
192 | FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, |
193 | *FMFs, DL, Name) |
194 | : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, |
195 | DL, Name); |
196 | return tryInsertInstruction(VPI: Select); |
197 | } |
198 | |
199 | /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A |
200 | /// and \p B. |
201 | /// TODO: add createFCmp when needed. |
202 | VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, |
203 | DebugLoc DL = {}, const Twine &Name = "" ); |
204 | |
205 | //===--------------------------------------------------------------------===// |
206 | // RAII helpers. |
207 | //===--------------------------------------------------------------------===// |
208 | |
209 | /// RAII object that stores the current insertion point and restores it when |
210 | /// the object is destroyed. |
211 | class InsertPointGuard { |
212 | VPBuilder &Builder; |
213 | VPBasicBlock *Block; |
214 | VPBasicBlock::iterator Point; |
215 | |
216 | public: |
217 | InsertPointGuard(VPBuilder &B) |
218 | : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {} |
219 | |
220 | InsertPointGuard(const InsertPointGuard &) = delete; |
221 | InsertPointGuard &operator=(const InsertPointGuard &) = delete; |
222 | |
223 | ~InsertPointGuard() { Builder.restoreIP(IP: VPInsertPoint(Block, Point)); } |
224 | }; |
225 | }; |
226 | |
227 | /// TODO: The following VectorizationFactor was pulled out of |
228 | /// LoopVectorizationCostModel class. LV also deals with |
229 | /// VectorizerParams::VectorizationFactor. |
230 | /// We need to streamline them. |
231 | |
232 | /// Information about vectorization costs. |
233 | struct VectorizationFactor { |
234 | /// Vector width with best cost. |
235 | ElementCount Width; |
236 | |
237 | /// Cost of the loop with that width. |
238 | InstructionCost Cost; |
239 | |
240 | /// Cost of the scalar loop. |
241 | InstructionCost ScalarCost; |
242 | |
243 | /// The minimum trip count required to make vectorization profitable, e.g. due |
244 | /// to runtime checks. |
245 | ElementCount MinProfitableTripCount; |
246 | |
247 | VectorizationFactor(ElementCount Width, InstructionCost Cost, |
248 | InstructionCost ScalarCost) |
249 | : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} |
250 | |
251 | /// Width 1 means no vectorization, cost 0 means uncomputed cost. |
252 | static VectorizationFactor Disabled() { |
253 | return {ElementCount::getFixed(MinVal: 1), 0, 0}; |
254 | } |
255 | |
256 | bool operator==(const VectorizationFactor &rhs) const { |
257 | return Width == rhs.Width && Cost == rhs.Cost; |
258 | } |
259 | |
260 | bool operator!=(const VectorizationFactor &rhs) const { |
261 | return !(*this == rhs); |
262 | } |
263 | }; |
264 | |
265 | /// A class that represents two vectorization factors (initialized with 0 by |
266 | /// default). One for fixed-width vectorization and one for scalable |
267 | /// vectorization. This can be used by the vectorizer to choose from a range of |
268 | /// fixed and/or scalable VFs in order to find the most cost-effective VF to |
269 | /// vectorize with. |
270 | struct FixedScalableVFPair { |
271 | ElementCount FixedVF; |
272 | ElementCount ScalableVF; |
273 | |
274 | FixedScalableVFPair() |
275 | : FixedVF(ElementCount::getFixed(MinVal: 0)), |
276 | ScalableVF(ElementCount::getScalable(MinVal: 0)) {} |
277 | FixedScalableVFPair(const ElementCount &Max) : FixedScalableVFPair() { |
278 | *(Max.isScalable() ? &ScalableVF : &FixedVF) = Max; |
279 | } |
280 | FixedScalableVFPair(const ElementCount &FixedVF, |
281 | const ElementCount &ScalableVF) |
282 | : FixedVF(FixedVF), ScalableVF(ScalableVF) { |
283 | assert(!FixedVF.isScalable() && ScalableVF.isScalable() && |
284 | "Invalid scalable properties" ); |
285 | } |
286 | |
287 | static FixedScalableVFPair getNone() { return FixedScalableVFPair(); } |
288 | |
289 | /// \return true if either fixed- or scalable VF is non-zero. |
290 | explicit operator bool() const { return FixedVF || ScalableVF; } |
291 | |
292 | /// \return true if either fixed- or scalable VF is a valid vector VF. |
293 | bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); } |
294 | }; |
295 | |
296 | /// Planner drives the vectorization process after having passed |
297 | /// Legality checks. |
298 | class LoopVectorizationPlanner { |
299 | /// The loop that we evaluate. |
300 | Loop *OrigLoop; |
301 | |
302 | /// Loop Info analysis. |
303 | LoopInfo *LI; |
304 | |
305 | /// The dominator tree. |
306 | DominatorTree *DT; |
307 | |
308 | /// Target Library Info. |
309 | const TargetLibraryInfo *TLI; |
310 | |
311 | /// Target Transform Info. |
312 | const TargetTransformInfo &TTI; |
313 | |
314 | /// The legality analysis. |
315 | LoopVectorizationLegality *Legal; |
316 | |
317 | /// The profitability analysis. |
318 | LoopVectorizationCostModel &CM; |
319 | |
320 | /// The interleaved access analysis. |
321 | InterleavedAccessInfo &IAI; |
322 | |
323 | PredicatedScalarEvolution &PSE; |
324 | |
325 | const LoopVectorizeHints &Hints; |
326 | |
327 | OptimizationRemarkEmitter *ORE; |
328 | |
329 | SmallVector<VPlanPtr, 4> VPlans; |
330 | |
331 | /// Profitable vector factors. |
332 | SmallVector<VectorizationFactor, 8> ProfitableVFs; |
333 | |
334 | /// A builder used to construct the current plan. |
335 | VPBuilder Builder; |
336 | |
337 | /// Computes the cost of \p Plan for vectorization factor \p VF. |
338 | /// |
339 | /// The current implementation requires access to the |
340 | /// LoopVectorizationLegality to handle inductions and reductions, which is |
341 | /// why it is kept separate from the VPlan-only cost infrastructure. |
342 | /// |
343 | /// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has |
344 | /// been retired. |
345 | InstructionCost cost(VPlan &Plan, ElementCount VF) const; |
346 | |
347 | public: |
348 | LoopVectorizationPlanner( |
349 | Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, |
350 | const TargetTransformInfo &TTI, LoopVectorizationLegality *Legal, |
351 | LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI, |
352 | PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, |
353 | OptimizationRemarkEmitter *ORE) |
354 | : OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), |
355 | IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {} |
356 | |
357 | /// Plan how to best vectorize, return the best VF and its cost, or |
358 | /// std::nullopt if vectorization and interleaving should be avoided up front. |
359 | std::optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC); |
360 | |
361 | /// Use the VPlan-native path to plan how to best vectorize, return the best |
362 | /// VF and its cost. |
363 | VectorizationFactor planInVPlanNativePath(ElementCount UserVF); |
364 | |
365 | /// Return the best VPlan for \p VF. |
366 | VPlan &getBestPlanFor(ElementCount VF) const; |
367 | |
368 | /// Return the most profitable plan and fix its VF to the most profitable one. |
369 | VPlan &getBestPlan() const; |
370 | |
371 | /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan |
372 | /// according to the best selected \p VF and \p UF. |
373 | /// |
374 | /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue |
375 | /// vectorization re-using plans for both the main and epilogue vector loops. |
376 | /// It should be removed once the re-use issue has been fixed. |
377 | /// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop |
378 | /// to re-use expansion results generated during main plan execution. |
379 | /// |
380 | /// Returns a mapping of SCEVs to their expanded IR values and a mapping for |
381 | /// the reduction resume values. Note that this is a temporary workaround |
382 | /// needed due to the current epilogue handling. |
383 | std::pair<DenseMap<const SCEV *, Value *>, |
384 | DenseMap<const RecurrenceDescriptor *, Value *>> |
385 | executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, |
386 | InnerLoopVectorizer &LB, DominatorTree *DT, |
387 | bool IsEpilogueVectorization, |
388 | const DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr); |
389 | |
390 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
391 | void printPlans(raw_ostream &O); |
392 | #endif |
393 | |
394 | /// Look through the existing plans and return true if we have one with |
395 | /// vectorization factor \p VF. |
396 | bool hasPlanWithVF(ElementCount VF) const { |
397 | return any_of(Range: VPlans, |
398 | P: [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); }); |
399 | } |
400 | |
401 | /// Test a \p Predicate on a \p Range of VF's. Return the value of applying |
402 | /// \p Predicate on Range.Start, possibly decreasing Range.End such that the |
403 | /// returned value holds for the entire \p Range. |
404 | static bool |
405 | getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate, |
406 | VFRange &Range); |
407 | |
408 | /// \return The most profitable vectorization factor and the cost of that VF |
409 | /// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if |
410 | /// epilogue vectorization is not supported for the loop. |
411 | VectorizationFactor |
412 | selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC); |
413 | |
414 | protected: |
415 | /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, |
416 | /// according to the information gathered by Legal when it checked if it is |
417 | /// legal to vectorize the loop. |
418 | void buildVPlans(ElementCount MinVF, ElementCount MaxVF); |
419 | |
420 | private: |
421 | /// Build a VPlan according to the information gathered by Legal. \return a |
422 | /// VPlan for vectorization factors \p Range.Start and up to \p Range.End |
423 | /// exclusive, possibly decreasing \p Range.End. |
424 | VPlanPtr buildVPlan(VFRange &Range); |
425 | |
426 | /// Build a VPlan using VPRecipes according to the information gather by |
427 | /// Legal. This method is only used for the legacy inner loop vectorizer. |
428 | /// \p Range's largest included VF is restricted to the maximum VF the |
429 | /// returned VPlan is valid for. If no VPlan can be built for the input range, |
430 | /// set the largest included VF to the maximum VF for which no plan could be |
431 | /// built. |
432 | VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range); |
433 | |
434 | /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, |
435 | /// according to the information gathered by Legal when it checked if it is |
436 | /// legal to vectorize the loop. This method creates VPlans using VPRecipes. |
437 | void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); |
438 | |
439 | // Adjust the recipes for reductions. For in-loop reductions the chain of |
440 | // instructions leading from the loop exit instr to the phi need to be |
441 | // converted to reductions, with one operand being vector and the other being |
442 | // the scalar reduction chain. For other reductions, a select is introduced |
443 | // between the phi and live-out recipes when folding the tail. |
444 | void adjustRecipesForReductions(VPlanPtr &Plan, |
445 | VPRecipeBuilder &RecipeBuilder, |
446 | ElementCount MinVF); |
447 | |
448 | /// \return The most profitable vectorization factor for the available VPlans |
449 | /// and the cost of that VF. |
450 | /// This is now only used to verify the decisions by the new VPlan-based |
451 | /// cost-model and will be retired once the VPlan-based cost-model is |
452 | /// stabilized. |
453 | VectorizationFactor selectVectorizationFactor(); |
454 | |
455 | /// Returns true if the per-lane cost of VectorizationFactor A is lower than |
456 | /// that of B. |
457 | bool isMoreProfitable(const VectorizationFactor &A, |
458 | const VectorizationFactor &B) const; |
459 | |
460 | /// Determines if we have the infrastructure to vectorize the loop and its |
461 | /// epilogue, assuming the main loop is vectorized by \p VF. |
462 | bool isCandidateForEpilogueVectorization(const ElementCount VF) const; |
463 | }; |
464 | |
465 | } // namespace llvm |
466 | |
467 | #endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H |
468 | |