//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file provides utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H

#include "VPlan.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"

namespace llvm {

class InductionDescriptor;
class Instruction;
class LoopVersioning;
class PHINode;
class PredicatedScalarEvolution;
class ScalarEvolution;
class TargetLibraryInfo;
class TargetTransformInfo;
class VPBuilder;
class VPRecipeBuilder;
struct VFRange;

LLVM_ABI_FOR_TEST extern cl::opt<bool> VerifyEachVPlan;
LLVM_ABI_FOR_TEST extern cl::opt<bool> EnableWideActiveLaneMask;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_ABI_FOR_TEST extern cl::opt<bool> PrintAfterEachVPlanPass;
#endif

struct VPlanTransforms {
  /// Helper to run a VPlan pass \p Pass on \p Plan, forwarding extra arguments
  /// to the pass. Performs verification/printing after each VPlan pass if
  /// requested via command line options.
  template <bool EnableVerify = true, typename PassTy, typename... ArgsTy>
  static decltype(auto) runPass(StringRef PassName, PassTy &&Pass, VPlan &Plan,
                                ArgsTy &&...Args) {
    auto PostTransformActions = make_scope_exit([&]() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      // Make sure to print before verification, so that the output is more
      // useful in case of failures:
      if (PrintAfterEachVPlanPass) {
        dbgs() << "VPlan after " << PassName << '\n';
        dbgs() << Plan << '\n';
      }
#endif
      if (EnableVerify && VerifyEachVPlan)
        verifyVPlanIsValid(Plan);
    });

    return std::forward<PassTy>(Pass)(Plan, std::forward<ArgsTy>(Args)...);
  }
#define RUN_VPLAN_PASS(PASS, ...)                                              \
  llvm::VPlanTransforms::runPass(#PASS, PASS, __VA_ARGS__)
#define RUN_VPLAN_PASS_NO_VERIFY(PASS, ...)                                    \
  llvm::VPlanTransforms::runPass<false>(#PASS, PASS, __VA_ARGS__)

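  // Illustrative usage sketch: any static transform below that takes a VPlan&
  // as its first argument can be run through these macros, e.g.
  //   RUN_VPLAN_PASS(VPlanTransforms::removeDeadRecipes, Plan);
  // which stringifies the pass name for runPass's printing/verification hooks.
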
  /// Create a base VPlan0, serving as the common starting point for all later
  /// candidates. It consists of an initial plain CFG loop with loop blocks
  /// from \p TheLoop directly translated to VPBasicBlocks containing
  /// VPInstructions corresponding to the input IR.
  ///
  /// The created loop is wrapped in an initial skeleton to facilitate
  /// vectorization, consisting of a vector pre-header, an exit block for the
  /// main vector loop (middle.block) and a new block as preheader of the scalar
  /// loop (scalar.ph). See below for an illustration. It also adds a canonical
  /// IV and its increment, using \p InductionTy and \p IVDL, and creates a
  /// VPValue expression for the original trip count.
  ///
  ///  [ ] <-- Plan's entry VPIRBasicBlock, wrapping the original loop's
  ///  / \     old preheader. Will contain iteration number check and SCEV
  /// |   |    expansions.
  /// |   |
  /// /   v
  /// |  [ ] <-- vector loop bypass (may consist of multiple blocks), will be
  /// | /  |     added later.
  /// | /  v
  /// ||  [ ] <-- vector pre header.
  /// |/   |
  /// |    v
  /// |   [ ] \ <-- plain CFG loop wrapping original loop to be vectorized.
  /// |   [ ]_|
  /// |    |
  /// |    v
  /// |   [ ] <--- middle-block with the branch to successors.
  /// |   / |
  /// |  /  |
  /// | |   v
  /// \--->[ ] <--- scalar preheader (initially a VPBasicBlock, which will be
  ///  |    |       replaced later by a VPIRBasicBlock wrapping the scalar
  ///  |    |       preheader basic block).
  ///  |    |
  ///  v    |  <-- edge from middle to exit iff epilogue is not required.
  ///  |   [ ] \
  ///  |   [ ]_| <-- old scalar loop to handle remainder (scalar epilogue,
  ///   \   |        header wrapped in VPIRBasicBlock).
  ///    \  v
  ///     >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks.
  LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
  buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
              PredicatedScalarEvolution &PSE, LoopVersioning *LVer = nullptr);

  /// Replace VPPhi recipes in \p Plan's header with corresponding
  /// VPHeaderPHIRecipe subclasses for inductions, reductions, and
  /// fixed-order recurrences. This processes all header phis and creates
  /// the appropriate widened recipe for each one.
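  ///
  /// For example (an illustrative, non-exhaustive mapping), a VPPhi for an
  /// integer induction becomes a VPWidenIntOrFpInductionRecipe and a reduction
  /// phi becomes a VPReductionPHIRecipe.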
  static void createHeaderPhiRecipes(
      VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &OrigLoop,
      const MapVector<PHINode *, InductionDescriptor> &Inductions,
      const MapVector<PHINode *, RecurrenceDescriptor> &Reductions,
      const SmallPtrSetImpl<const PHINode *> &FixedOrderRecurrences,
      const SmallPtrSetImpl<PHINode *> &InLoopReductions, bool AllowReordering);

  /// Create VPReductionRecipes for in-loop reductions. This processes chains
  /// of operations contributing to in-loop reductions and creates appropriate
  /// VPReductionRecipe instances. Block masks from \p BlockMaskCache are used
  /// to add predication for blocks in \p BlocksNeedingPredication.
  static void createInLoopReductionRecipes(
      VPlan &Plan, const DenseMap<VPBasicBlock *, VPValue *> &BlockMaskCache,
      const DenseSet<BasicBlock *> &BlocksNeedingPredication,
      ElementCount MinVF);

  /// Update \p Plan to account for all early exits.
  LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan,
                                                 bool HasUncountableExit);

  /// If a check is needed to guard executing the scalar epilogue loop, it will
  /// be added to the middle block.
  LLVM_ABI_FOR_TEST static void addMiddleCheck(VPlan &Plan,
                                               bool RequiresScalarEpilogueCheck,
                                               bool TailFolded);

  /// Add a check to \p Plan to see if the vector loop should be executed.
  static void
  addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF,
                           ElementCount MinProfitableTripCount,
                           bool RequiresScalarEpilogue, bool TailFolded,
                           bool CheckNeededWithTailFolding, Loop *OrigLoop,
                           const uint32_t *MinItersBypassWeights, DebugLoc DL,
                           PredicatedScalarEvolution &PSE);

  /// Add a check to \p Plan to see if the epilogue vector loop should be
  /// executed.
  static void addMinimumVectorEpilogueIterationCheck(
      VPlan &Plan, Value *TripCount, Value *VectorTripCount,
      bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF,
      unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE);

  /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
  /// flat CFG into a hierarchical CFG.
  LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan);

  /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
  /// VPValue and connect the block to \p Plan, using the VPValue as branch
  /// condition.
  static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
                               bool AddBranchWeights);

  /// Replace the VPInstructions in \p Plan with corresponding widen recipes.
  /// Returns false if any VPInstruction that needed conversion could not be
  /// converted to a wide recipe.
  LLVM_ABI_FOR_TEST static bool
  tryToConvertVPInstructionsToVPRecipes(VPlan &Plan,
                                        const TargetLibraryInfo &TLI);

  /// Try to legalize reductions with multiple in-loop uses. Currently only
  /// min/max reductions used by FindLastIV reductions are supported. Otherwise
  /// return false.
  static bool handleMultiUseReductions(VPlan &Plan);

  /// Try to have all users of fixed-order recurrences appear after the recipe
  /// defining their previous value, by either sinking users or hoisting recipes
  /// defining their previous value (and its operands). Then introduce
  /// FirstOrderRecurrenceSplice VPInstructions to combine the value from the
  /// recurrence phis and previous values.
  /// \returns true if all users of fixed-order recurrences could be re-arranged
  /// as needed or false if it is not possible. In the latter case, \p Plan is
  /// not valid.
  static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder);

  /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If it does,
  /// try to update the vector loop to exit early if any input is NaN and resume
  /// executing in the scalar loop to handle the NaNs there. Return false if
  /// this attempt was unsuccessful.
  static bool handleMaxMinNumReductions(VPlan &Plan);

  /// Check if \p Plan contains any FindLast reductions. If it does, try to
  /// update the vector loop to save the appropriate state using selects
  /// for entire vectors for both the latest mask containing at least one active
  /// element and the corresponding data vector. Return false if this attempt
  /// was unsuccessful.
  static bool handleFindLastReductions(VPlan &Plan);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  static void clearReductionWrapFlags(VPlan &Plan);

  /// Explicitly unroll \p Plan by \p UF.
  static void unrollByUF(VPlan &Plan, unsigned UF);

  /// Replace each replicating VPReplicateRecipe and VPInstruction outside of
  /// any replicate region in \p Plan with \p VF single-scalar recipes.
  /// TODO: Also replicate VPScalarIVSteps and VPReplicateRecipes inside
  /// replicate regions, thereby dissolving the latter.
  static void replicateByVF(VPlan &Plan, ElementCount VF);

  /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
  /// resulting plan to \p BestVF and \p BestUF.
  static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                 unsigned BestUF,
                                 PredicatedScalarEvolution &PSE);

  /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
  /// optimizations, dead recipe removal, replicate region optimizations and
  /// block merging.
  LLVM_ABI_FOR_TEST static void optimize(VPlan &Plan);

  /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
  /// region block and remove the mask operand. Optimize the created regions by
  /// iteratively sinking scalar operands into the region, followed by merging
  /// regions until no improvements remain.
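  ///
  /// For example (an illustrative sketch), a VPReplicateRecipe
  ///   store %v to %addr, mask %m
  /// becomes an if-then replicate region that branches on %m and contains the
  /// replicated store without the mask operand.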
  static void createAndOptimizeReplicateRegions(VPlan &Plan);

  /// Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an
  /// (active-lane-mask recipe, wide canonical IV, trip-count). If \p
  /// UseActiveLaneMaskForControlFlow is true, introduce a
  /// VPActiveLaneMaskPHIRecipe. If \p DataAndControlFlowWithoutRuntimeCheck is
  /// true, no minimum-iteration runtime check will be created (during skeleton
  /// creation) and it is instead handled using the active-lane-mask. \p
  /// DataAndControlFlowWithoutRuntimeCheck implies \p
  /// UseActiveLaneMaskForControlFlow.
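  ///
  /// For example (an illustrative sketch of the rewritten check):
  ///
  ///   %cmp = icmp ule %wide-canonical-iv, %backedge-taken-count
  /// -->
  ///   %mask = active-lane-mask %wide-canonical-iv, %trip-count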
  static void addActiveLaneMask(VPlan &Plan,
                                bool UseActiveLaneMaskForControlFlow,
                                bool DataAndControlFlowWithoutRuntimeCheck);

  /// Insert truncates and extends for any truncated recipe. Redundant casts
  /// will be folded later.
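  ///
  /// For example (an illustrative sketch), if \p MinBWs records that an i32
  /// add only needs 8 bits:
  ///
  ///   %res = add i32 %x, %y
  /// -->
  ///   %x.t = trunc i32 %x to i8
  ///   %y.t = trunc i32 %y to i8
  ///   %res.t = add i8 %x.t, %y.t
  ///   %res = zext i8 %res.t to i32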
  static void
  truncateToMinimalBitwidths(VPlan &Plan,
                             const MapVector<Instruction *, uint64_t> &MinBWs);

  /// Replace symbolic strides from \p StridesMap in \p Plan with constants when
  /// possible.
  static void
  replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE,
                         const DenseMap<Value *, const SCEV *> &StridesMap);

  /// Drop poison flags from recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  /// Uses \p BlockNeedsPredication to check if a block needs predicating.
  /// TODO: Replace BlockNeedsPredication callback with retrieving info from
  ///       VPlan directly.
  static void dropPoisonGeneratingRecipes(
      VPlan &Plan,
      const std::function<bool(BasicBlock *)> &BlockNeedsPredication);

  /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
  /// replace all uses except the canonical IV increment of
  /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
  /// VPCanonicalIVPHIRecipe is only used to control the loop after
  /// this transformation.
  static void
  addExplicitVectorLength(VPlan &Plan,
                          const std::optional<unsigned> &MaxEVLSafeElements);

  /// Optimize recipes which use an EVL-based header mask to VP intrinsics, for
  /// example:
  ///
  ///   %mask = icmp ult step-vector, EVL
  ///   %load = load %ptr, %mask
  /// -->
  ///   %load = vp.load %ptr, EVL
  static void optimizeEVLMasks(VPlan &Plan);

  /// For each interleave group in \p InterleaveGroups, replace the recipes
  /// widening its memory instructions with a single VPInterleaveRecipe at its
  /// insertion point.
  static void createInterleaveGroups(
      VPlan &Plan,
      const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
          &InterleaveGroups,
      VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed);

  /// Remove dead recipes from \p Plan.
  static void removeDeadRecipes(VPlan &Plan);

  /// Update \p Plan to account for the uncountable early exit from \p
  /// EarlyExitingVPBB to \p EarlyExitVPBB by introducing a BranchOnTwoConds
  /// terminator in the latch that handles the early exit and the latch exit
  /// condition.
  static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
                                         VPBasicBlock *EarlyExitVPBB,
                                         VPlan &Plan, VPBasicBlock *HeaderVPBB,
                                         VPBasicBlock *LatchVPBB);

  /// Replace the exit condition
  ///   (branch-on-cond eq CanonicalIVInc, VectorTripCount)
  /// with
  ///   (branch-on-cond eq AVLNext, 0)
  static void convertEVLExitCond(VPlan &Plan);

  /// Replace loop regions with explicit CFG.
  static void dissolveLoopRegions(VPlan &Plan);

  /// Expand BranchOnTwoConds instructions into explicit CFG with
  /// BranchOnCond instructions. Should be called after dissolveLoopRegions.
  static void expandBranchOnTwoConds(VPlan &Plan);

  /// Transform EVL loops to use variable-length stepping after region
  /// dissolution.
  ///
  /// Once loop regions are replaced with explicit CFG, EVL loops can step with
  /// variable vector lengths instead of fixed lengths. This transformation:
  /// * Makes the EVL-Phi concrete.
  /// * Removes the CanonicalIV and its increment.
  static void canonicalizeEVLLoops(VPlan &Plan);

  /// Lower abstract recipes to concrete ones that can be code generated.
  static void convertToConcreteRecipes(VPlan &Plan);

  /// Convert initial recipes to abstract recipes, clamping \p Range based on
  /// the cost model for subsequent optimizations and cost estimates. The
  /// abstract recipes are lowered back to concrete recipes before codegen.
  static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
                                       VFRange &Range);

  /// Perform instcombine-like simplifications on recipes in \p Plan.
  static void simplifyRecipes(VPlan &Plan);

  /// Remove BranchOnCond recipes with true or false conditions, together with
  /// the dead edges to their successors.
  static void removeBranchOnConst(VPlan &Plan);

  /// Perform common-subexpression-elimination on \p Plan.
  static void cse(VPlan &Plan);

  /// If there is a single exit block, optimize its phi recipes that use exiting
  /// IV values by feeding them precomputed end values instead, possibly taken
  /// one step backwards.
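  ///
  /// For example (an illustrative sketch), an exit phi using the final value
  /// of an induction with start %start, step %step and trip count %tc can be
  /// fed the precomputed end value %start + %tc * %step, or
  /// %start + (%tc - 1) * %step when the value one step backwards is needed.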
  static void
  optimizeInductionExitUsers(VPlan &Plan,
                             DenseMap<VPValue *, VPValue *> &EndValues,
                             PredicatedScalarEvolution &PSE);

  /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's
  /// entry block if they are used as vectors.
  static void materializeBroadcasts(VPlan &Plan);

  /// Hoist single-scalar loads with invariant addresses out of the vector loop
  /// to the preheader, if they are proven not to alias with any stores in the
  /// plan using noalias metadata.
  static void hoistInvariantLoads(VPlan &Plan);

  /// Hoist predicated loads from the same address to the loop entry block, if
  /// they are guaranteed to execute on both paths (i.e., in replicate regions
  /// with complementary masks P and NOT P).
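  ///
  /// For example (an illustrative sketch), loads of %addr guarded by mask %m
  /// in one replicate region and by NOT %m in another are replaced by a single
  /// unconditional load of %addr in the loop entry block.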
  static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE,
                                   const Loop *L);

  /// Sink predicated stores to the same address with complementary predicates
  /// (P and NOT P) to an unconditional store with select recipes for the
  /// stored values. This eliminates branching overhead when all paths
  /// unconditionally store to the same location.
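  ///
  /// For example (an illustrative sketch):
  ///
  ///   if (%m):     store %v1 to %addr
  ///   if (NOT %m): store %v2 to %addr
  /// -->
  ///   %v = select %m, %v1, %v2
  ///   store %v to %addr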
  static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE,
                                   const Loop *L);

  /// Materialize vector trip counts for constants early if the trip count can
  /// simply be computed as (Original TC / (VF * UF)) * (VF * UF).
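  /// For example, with an original trip count of 100 and VF * UF = 8, the
  /// materialized vector trip count is (100 / 8) * 8 = 96, using integer
  /// division.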
  static void
  materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF,
                                     unsigned BestUF,
                                     PredicatedScalarEvolution &PSE);

  /// Materialize vector trip count computations to a set of VPInstructions.
  static void materializeVectorTripCount(VPlan &Plan,
                                         VPBasicBlock *VectorPHVPBB,
                                         bool TailByMasking,
                                         bool RequiresScalarEpilogue);

  /// Materialize the backedge-taken count to be computed explicitly using
  /// VPInstructions.
  static void materializeBackedgeTakenCount(VPlan &Plan,
                                            VPBasicBlock *VectorPH);

  /// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values
  /// into vectors and Unpack recipes to extract scalars from vectors as
  /// needed.
  static void materializePacksAndUnpacks(VPlan &Plan);

  /// Materialize VF and VFxUF to be computed explicitly using VPInstructions.
  static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
                                    ElementCount VF);

  /// Expand VPExpandSCEVRecipes in \p Plan's entry block. Each
  /// VPExpandSCEVRecipe is replaced with a live-in wrapping the expanded IR
  /// value. A mapping from SCEV expressions to their expanded IR values is
  /// returned.
  static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
                                                     ScalarEvolution &SE);

  /// Try to convert a plan with interleave groups with VF elements to a plan
  /// with the interleave groups replaced by wide loads and stores processing VF
  /// elements, if all transformed interleave groups access the full vector
  /// width (checked via \p VectorRegWidth). This is effectively a very simple
  /// form of loop-aware SLP, where we use interleave groups to identify
  /// candidates.
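  ///
  /// For example (an illustrative sketch), an interleave group with factor 2
  /// loading %a[2*i] and %a[2*i + 1], whose members together fill the vector
  /// register, can be rewritten as a single wide load of VF consecutive
  /// elements.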
  static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                     TypeSize VectorRegWidth);

  /// Predicate and linearize the control-flow in the only loop region of
  /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
  /// header, otherwise use all-true for the header mask. Masks for blocks are
  /// added to a block-to-mask map, which is returned in order to be used later
  /// for wide recipe construction. The map is temporary and will be removed in
  /// the future.
  static DenseMap<VPBasicBlock *, VPValue *>
  introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);

  /// Add branch weight metadata, if the \p Plan's middle block is terminated by
  /// a BranchOnCond recipe.
  static void
  addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
                                    std::optional<unsigned> VScaleForTuning);

  /// Update the resume phis in the scalar preheader after creating wide recipes
  /// for first-order recurrences, reductions and inductions. End values for
  /// inductions are added to \p IVEndValues.
  static void
  updateScalarResumePhis(VPlan &Plan,
                         DenseMap<VPValue *, VPValue *> &IVEndValues);

  /// Handle users of first-order recurrences in the original exit block. The
  /// penultimate value of a recurrence is fed to its LCSSA phi users in the
  /// original exit block via the VPIRInstruction wrapping the LCSSA phi.
  static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);

  /// Detect and create partial reduction recipes for scaled reductions in
  /// \p Plan. Must be called after recipe construction. If partial reductions
  /// are only valid for a subset of VFs in \p Range, Range.End is updated.
  static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx,
                                      VFRange &Range);
};

} // namespace llvm

#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H