1//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file provides utility VPlan to VPlan transformations.
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
14#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
15
16#include "VPlan.h"
17#include "VPlanVerifier.h"
18#include "llvm/ADT/STLFunctionalExtras.h"
19#include "llvm/ADT/ScopeExit.h"
20#include "llvm/Support/CommandLine.h"
21#include "llvm/Support/Compiler.h"
22#include "llvm/Support/Regex.h"
23
24namespace llvm {
25
26class InductionDescriptor;
27class Instruction;
28class Loop;
29class LoopVersioning;
30class OptimizationRemarkEmitter;
31class PHINode;
32class ScalarEvolution;
33class PredicatedScalarEvolution;
34class TargetLibraryInfo;
35class TargetTransformInfo;
36class VPBuilder;
37class VPRecipeBuilder;
38struct VFRange;
39
40LLVM_ABI_FOR_TEST extern cl::opt<bool> VerifyEachVPlan;
41LLVM_ABI_FOR_TEST extern cl::opt<bool> EnableWideActiveLaneMask;
42
43#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
44LLVM_ABI_FOR_TEST extern cl::opt<bool> VPlanPrintAfterAll;
45LLVM_ABI_FOR_TEST extern cl::list<std::string> VPlanPrintAfterPasses;
46LLVM_ABI_FOR_TEST extern cl::opt<bool> VPlanPrintVectorRegionScope;
47#endif
48
struct VPlanTransforms {
  /// Helper to run a VPlan pass \p Pass on \p VPlan, forwarding extra arguments
  /// to the pass. Performs verification/printing after each VPlan pass if
  /// requested via command line options.
  template <bool EnableVerify = true, typename PassTy, typename... ArgsTy>
  static decltype(auto) runPass(StringRef PassName, PassTy &&Pass, VPlan &Plan,
                                ArgsTy &&...Args) {
    // Use a scope_exit so that printing/verification runs after the pass body
    // returns, regardless of the pass's return path.
    scope_exit PostTransformActions{[&]() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      // Make sure to print before verification, so that output is more useful
      // in case of failures:
      if (VPlanPrintAfterAll ||
          (VPlanPrintAfterPasses.getNumOccurrences() > 0 &&
           any_of(VPlanPrintAfterPasses, [PassName](StringRef Entry) {
             return Regex(Entry).match(PassName);
           }))) {
        dbgs()
            << "VPlan for loop in '"
            << Plan.getScalarHeader()->getIRBasicBlock()->getParent()->getName()
            << "' after " << PassName << '\n';
        if (VPlanPrintVectorRegionScope && Plan.getVectorLoopRegion())
          Plan.getVectorLoopRegion()->print(dbgs());
        else
          dbgs() << Plan << '\n';
      }
#endif
      if (VerifyEachVPlan && EnableVerify)
        verifyVPlanIsValid(Plan);
    }};

    return std::forward<PassTy>(Pass)(Plan, std::forward<ArgsTy>(Args)...);
  }
#define RUN_VPLAN_PASS(PASS, ...)                                              \
  llvm::VPlanTransforms::runPass(#PASS, PASS, __VA_ARGS__)
#define RUN_VPLAN_PASS_NO_VERIFY(PASS, ...)                                    \
  llvm::VPlanTransforms::runPass<false>(#PASS, PASS, __VA_ARGS__)

  /// Create a base VPlan0, serving as the common starting point for all later
  /// candidates. It consists of an initial plain CFG loop with loop blocks from
  /// \p TheLoop being directly translated to VPBasicBlocks with VPInstruction
  /// corresponding to the input IR.
  ///
  /// The created loop is wrapped in an initial skeleton to facilitate
  /// vectorization, consisting of a vector pre-header, an exit block for the
  /// main vector loop (middle.block) and a new block as preheader of the scalar
  /// loop (scalar.ph). See below for an illustration. It also adds a canonical
  /// IV and its increment, using \p InductionTy and \p IVDL, and creates a
  /// VPValue expression for the original trip count.
  ///
  ///  [ ] <-- Plan's entry VPIRBasicBlock, wrapping the original loop's
  ///  / \     old preheader. Will contain iteration number check and SCEV
  /// |   |    expansions.
  /// |   |
  /// /   v
  /// |  [ ] <-- vector loop bypass (may consist of multiple blocks) will be
  /// | /  |     added later.
  /// |/   v
  /// ||  [ ]     <-- vector pre header.
  /// |/   |
  /// |    v
  /// |   [  ] \ <-- plain CFG loop wrapping original loop to be vectorized.
  /// |   [  ]_|
  /// |    |
  /// |    v
  /// |   [ ]   <--- middle-block with the branch to successors
  /// |  /  |
  /// | /   |
  /// ||    v
  ///  \--->[ ]     <--- scalar preheader (initially a VPBasicBlock, which will
  ///  |    |            be replaced later by a VPIRBasicBlock wrapping the
  ///  |    |            scalar preheader basic block).
  ///  |    |
  ///  v        <-- edge from middle to exit iff epilogue is not required.
  /// |   [ ] \
  /// |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue,
  ///  \    |         header wrapped in VPIRBasicBlock).
  ///   \   v
  ///    >[ ]     <-- original loop exit block(s), wrapped in VPIRBasicBlocks.
  LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
  buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
              PredicatedScalarEvolution &PSE, LoopVersioning *LVer = nullptr);

  /// Replace VPPhi recipes in \p Plan's header with corresponding
  /// VPHeaderPHIRecipe subclasses for inductions, reductions, and
  /// fixed-order recurrences. This processes all header phis and creates
  /// the appropriate widened recipe for each one.
  static void createHeaderPhiRecipes(
      VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &OrigLoop,
      const MapVector<PHINode *, InductionDescriptor> &Inductions,
      const MapVector<PHINode *, RecurrenceDescriptor> &Reductions,
      const SmallPtrSetImpl<const PHINode *> &FixedOrderRecurrences,
      const SmallPtrSetImpl<PHINode *> &InLoopReductions, bool AllowReordering);

  /// Create VPReductionRecipes for in-loop reductions. This processes chains
  /// of operations contributing to in-loop reductions and creates appropriate
  /// VPReductionRecipe instances.
  static void createInLoopReductionRecipes(
      VPlan &Plan, const DenseSet<BasicBlock *> &BlocksNeedingPredication,
      ElementCount MinVF);

  /// Update \p Plan to account for all early exits.
  LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan,
                                                 bool HasUncountableExit);

  /// If a check is needed to guard executing the scalar epilogue loop, it will
  /// be added to the middle block.
  LLVM_ABI_FOR_TEST static void addMiddleCheck(VPlan &Plan,
                                               bool RequiresScalarEpilogueCheck,
                                               bool TailFolded);

  /// Add a check to \p Plan to see if the vector loop should be executed.
  static void
  addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF,
                           ElementCount MinProfitableTripCount,
                           bool RequiresScalarEpilogue, bool TailFolded,
                           bool CheckNeededWithTailFolding, Loop *OrigLoop,
                           const uint32_t *MinItersBypassWeights, DebugLoc DL,
                           PredicatedScalarEvolution &PSE);

  /// Add a check to \p Plan to see if the epilogue vector loop should be
  /// executed.
  static void addMinimumVectorEpilogueIterationCheck(
      VPlan &Plan, Value *TripCount, Value *VectorTripCount,
      bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF,
      unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE);

  /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
  /// flat CFG into a hierarchical CFG.
  LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan);

  /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
  /// VPValue and connect the block to \p Plan, using the VPValue as branch
  /// condition.
  static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
                               bool AddBranchWeights);

  /// Replaces the VPInstructions in \p Plan with corresponding widen recipes.
  /// Returns false if any VPInstruction could not be converted to a wide
  /// recipe.
  LLVM_ABI_FOR_TEST static bool
  tryToConvertVPInstructionsToVPRecipes(VPlan &Plan,
                                        const TargetLibraryInfo &TLI);

  /// Try to legalize reductions with multiple in-loop uses. Currently only
  /// strict and non-strict min/max reductions used by FindLastIV reductions are
  /// supported, corresponding to computing the first and last argmin/argmax,
  /// respectively. Otherwise return false.
  static bool handleMultiUseReductions(VPlan &Plan,
                                       OptimizationRemarkEmitter *ORE,
                                       Loop *TheLoop);

  /// Try to have all users of fixed-order recurrences appear after the recipe
  /// defining their previous value, by either sinking users or hoisting recipes
  /// defining their previous value (and its operands). Then introduce
  /// FirstOrderRecurrenceSplice VPInstructions to combine the value from the
  /// recurrence phis and previous values.
  /// \returns true if all users of fixed-order recurrences could be re-arranged
  /// as needed or false if it is not possible. In the latter case, \p Plan is
  /// not valid.
  static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder);

  /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If they do,
  /// try to update the vector loop to exit early if any input is NaN and resume
  /// executing in the scalar loop to handle the NaNs there. Return false if
  /// this attempt was unsuccessful.
  static bool handleMaxMinNumReductions(VPlan &Plan);

  /// Check if \p Plan contains any FindLast reductions. If it does, try to
  /// update the vector loop to save the appropriate state using selects
  /// for entire vectors for both the latest mask containing at least one active
  /// element and the corresponding data vector. Return false if this attempt
  /// was unsuccessful.
  static bool handleFindLastReductions(VPlan &Plan);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  static void clearReductionWrapFlags(VPlan &Plan);

  /// Explicitly unroll \p Plan by \p UF.
  static void unrollByUF(VPlan &Plan, unsigned UF);

  /// Replace each replicating VPReplicateRecipe and VPInstruction outside of
  /// any replicate region in \p Plan with \p VF single-scalar recipes.
  /// TODO: Also replicate VPScalarIVSteps and VPReplicateRecipes inside
  ///       replicate regions, thereby dissolving the latter.
  static void replicateByVF(VPlan &Plan, ElementCount VF);

  /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
  /// resulting plan to \p BestVF and \p BestUF.
  static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                 unsigned BestUF,
                                 PredicatedScalarEvolution &PSE);

  /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
  /// optimizations, dead recipe removal, replicate region optimizations and
  /// block merging.
  LLVM_ABI_FOR_TEST static void optimize(VPlan &Plan);

  /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
  /// region block and remove the mask operand. Optimize the created regions by
  /// iteratively sinking scalar operands into the region, followed by merging
  /// regions until no improvements are remaining.
  static void createAndOptimizeReplicateRegions(VPlan &Plan);

  /// Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an
  /// (active-lane-mask recipe, wide canonical IV, trip-count). If \p
  /// UseActiveLaneMaskForControlFlow is true, introduce an
  /// VPActiveLaneMaskPHIRecipe. If \p DataAndControlFlowWithoutRuntimeCheck is
  /// true, no minimum-iteration runtime check will be created (during skeleton
  /// creation) and instead it is handled using active-lane-mask. \p
  /// DataAndControlFlowWithoutRuntimeCheck implies \p
  /// UseActiveLaneMaskForControlFlow.
  static void addActiveLaneMask(VPlan &Plan,
                                bool UseActiveLaneMaskForControlFlow,
                                bool DataAndControlFlowWithoutRuntimeCheck);

  /// Insert truncates and extends for any truncated recipe. Redundant casts
  /// will be folded later.
  static void
  truncateToMinimalBitwidths(VPlan &Plan,
                             const MapVector<Instruction *, uint64_t> &MinBWs);

  /// Replace symbolic strides from \p StridesMap in \p Plan with constants when
  /// possible.
  static void
  replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE,
                         const DenseMap<Value *, const SCEV *> &StridesMap);

  /// Drop poison flags from recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  /// Uses \p BlockNeedsPredication to check if a block needs predicating.
  /// TODO: Replace BlockNeedsPredication callback with retrieving info from
  ///       VPlan directly.
  static void dropPoisonGeneratingRecipes(
      VPlan &Plan,
      const std::function<bool(BasicBlock *)> &BlockNeedsPredication);

  /// Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
  /// replaces all uses except the canonical IV increment of
  /// VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
  /// VPCanonicalIVPHIRecipe is only used to control the loop after
  /// this transformation.
  static void
  addExplicitVectorLength(VPlan &Plan,
                          const std::optional<unsigned> &MaxEVLSafeElements);

  /// Optimize recipes which use an EVL-based header mask to VP intrinsics, for
  /// example:
  ///
  ///  %mask = icmp ult step-vector, EVL
  ///  %load = load %ptr, %mask
  ///  -->
  ///  %load = vp.load %ptr, EVL
  static void optimizeEVLMasks(VPlan &Plan);

  /// For each Interleave Group in \p InterleaveGroups replace the Recipes
  /// widening its memory instructions with a single VPInterleaveRecipe at its
  /// insertion point.
  static void createInterleaveGroups(
      VPlan &Plan,
      const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
          &InterleaveGroups,
      VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed);

  /// Remove dead recipes from \p Plan.
  static void removeDeadRecipes(VPlan &Plan);

  /// Update \p Plan to account for uncountable early exits by introducing
  /// appropriate branching logic in the latch that handles early exits and the
  /// latch exit condition. Multiple exits are handled with a dispatch block
  /// that determines which exit to take based on lane-by-lane semantics.
  static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB,
                                          VPBasicBlock *LatchVPBB,
                                          VPBasicBlock *MiddleVPBB);

  /// Replaces the exit condition from
  /// (branch-on-cond eq CanonicalIVInc, VectorTripCount)
  /// to
  /// (branch-on-cond eq AVLNext, 0)
  static void convertEVLExitCond(VPlan &Plan);

  /// Replace loop regions with explicit CFG.
  static void dissolveLoopRegions(VPlan &Plan);

  /// Expand BranchOnTwoConds instructions into explicit CFG with
  /// BranchOnCond instructions. Should be called after dissolveLoopRegions.
  static void expandBranchOnTwoConds(VPlan &Plan);

  /// Transform loops with variable-length stepping after region
  /// dissolution.
  ///
  /// Once loop regions are replaced with explicit CFG, loops can step with
  /// variable vector lengths instead of fixed lengths. This transformation:
  ///  * Makes CurrentIteration-Phi concrete.
  ///  * Removes CanonicalIV and increment.
  static void convertToVariableLengthStep(VPlan &Plan);

  /// Lower abstract recipes to concrete ones, that can be codegen'd.
  static void convertToConcreteRecipes(VPlan &Plan);

  /// This function converts initial recipes to the abstract recipes and clamps
  /// \p Range based on cost model for following optimizations and cost
  /// estimations. The converted abstract recipes will lower to concrete
  /// recipes before codegen.
  static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
                                       VFRange &Range);

  /// Perform instcombine-like simplifications on recipes in \p Plan.
  static void simplifyRecipes(VPlan &Plan);

  /// Remove BranchOnCond recipes with true or false conditions together with
  /// removing dead edges to their successors.
  static void removeBranchOnConst(VPlan &Plan);

  /// Perform common-subexpression-elimination on \p Plan.
  static void cse(VPlan &Plan);

  /// If there's a single exit block, optimize its phi recipes that use exiting
  /// IV values by feeding them precomputed end values instead, possibly taken
  /// one step backwards.
  static void
  optimizeInductionExitUsers(VPlan &Plan,
                             DenseMap<VPValue *, VPValue *> &EndValues,
                             PredicatedScalarEvolution &PSE);

  /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's
  /// entry block if they are used as vectors.
  static void materializeBroadcasts(VPlan &Plan);

  /// Hoist single-scalar loads with invariant addresses out of the vector loop
  /// to the preheader, if they are proven not to alias with any stores in the
  /// plan using noalias metadata.
  static void hoistInvariantLoads(VPlan &Plan);

  /// Hoist predicated loads from the same address to the loop entry block, if
  /// they are guaranteed to execute on both paths (i.e., in replicate regions
  /// with complementary masks P and NOT P).
  static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE,
                                   const Loop *L);

  /// Sink predicated stores to the same address with complementary predicates
  /// (P and NOT P) to an unconditional store with select recipes for the
  /// stored values. This eliminates branching overhead when all paths
  /// unconditionally store to the same location.
  static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE,
                                   const Loop *L);

  /// Materialize vector trip counts for constants early if it can simply be
  /// computed as (Original TC / VF * UF) * VF * UF.
  static void
  materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF,
                                     unsigned BestUF,
                                     PredicatedScalarEvolution &PSE);

  /// Materialize vector trip count computations to a set of VPInstructions.
  static void materializeVectorTripCount(VPlan &Plan,
                                         VPBasicBlock *VectorPHVPBB,
                                         bool TailByMasking,
                                         bool RequiresScalarEpilogue);

  /// Materialize the backedge-taken count to be computed explicitly using
  /// VPInstructions.
  static void materializeBackedgeTakenCount(VPlan &Plan,
                                            VPBasicBlock *VectorPH);

  /// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values
  /// into vectors and Unpack recipes to extract scalars from vectors as
  /// needed.
  static void materializePacksAndUnpacks(VPlan &Plan);

  /// Materialize UF, VF and VFxUF to be computed explicitly using
  /// VPInstructions.
  static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
                                 ElementCount VF);

  /// Expand VPExpandSCEVRecipes in \p Plan's entry block. Each
  /// VPExpandSCEVRecipe is replaced with a live-in wrapping the expanded IR
  /// value. A mapping from SCEV expressions to their expanded IR value is
  /// returned.
  static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
                                                     ScalarEvolution &SE);

  /// Try to find a single VF among \p Plan's VFs for which all interleave
  /// groups (with known minimum VF elements) can be replaced by wide loads and
  /// stores processing VF elements, if all transformed interleave groups access
  /// the full vector width (checked via the maximum vector register width). If
  /// the transformation can be applied, the original \p Plan will be split in
  /// 2:
  ///  1. The original Plan with the single VF containing the optimized recipes
  ///     using wide loads instead of interleave groups.
  ///  2. A new clone which contains all VFs of Plan except the optimized VF.
  ///
  /// This effectively is a very simple form of loop-aware SLP, where we use
  /// interleave groups to identify candidates.
  static std::unique_ptr<VPlan>
  narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);

  /// Predicate and linearize the control-flow in the only loop region of
  /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
  /// header, otherwise use all-true for the header mask.
  static void introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);

  /// Add branch weight metadata, if the \p Plan's middle block is terminated by
  /// a BranchOnCond recipe.
  static void
  addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
                                    std::optional<unsigned> VScaleForTuning);

  /// Update the resume phis in the scalar preheader after creating wide recipes
  /// for first-order recurrences, reductions and inductions. End values for
  /// inductions are added to \p IVEndValues.
  static void
  updateScalarResumePhis(VPlan &Plan,
                         DenseMap<VPValue *, VPValue *> &IVEndValues);

  /// Handle users in the exit block for first order reductions in the original
  /// exit block. The penultimate value of recurrences is fed to their LCSSA phi
  /// users in the original exit block using the VPIRInstruction wrapping to the
  /// LCSSA phi.
  static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);

  /// Optimize FindLast reductions selecting IVs (or expressions of IVs) by
  /// converting them to FindIV reductions, if their IV range excludes a
  /// suitable sentinel value. For expressions of IVs, the expression is sunk
  /// to the middle block.
  static void optimizeFindIVReductions(VPlan &Plan,
                                       PredicatedScalarEvolution &PSE, Loop &L);

  /// Detect and create partial reduction recipes for scaled reductions in
  /// \p Plan. Must be called after recipe construction. If partial reductions
  /// are only valid for a subset of VFs in Range, Range.End is updated.
  static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx,
                                      VFRange &Range);
};
489
490} // namespace llvm
491
492#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
493