1//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file provides utility VPlan to VPlan transformations.
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
14#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
15
16#include "VPlan.h"
17#include "VPlanVerifier.h"
18#include "llvm/ADT/STLFunctionalExtras.h"
19#include "llvm/ADT/ScopeExit.h"
20#include "llvm/Analysis/TargetTransformInfo.h"
21#include "llvm/Support/CommandLine.h"
22#include "llvm/Support/Compiler.h"
23#include "llvm/Support/Regex.h"
24
25namespace llvm {
26
27class InductionDescriptor;
28class Instruction;
29class Loop;
30class LoopVersioning;
31class OptimizationRemarkEmitter;
32class PHINode;
33class ScalarEvolution;
34class PredicatedScalarEvolution;
35class TargetLibraryInfo;
36class TargetTransformInfo;
37class VPBuilder;
38class VPRecipeBuilder;
39struct VFRange;
40
41LLVM_ABI_FOR_TEST extern cl::opt<bool> VerifyEachVPlan;
42LLVM_ABI_FOR_TEST extern cl::opt<bool> EnableWideActiveLaneMask;
43
44#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
45LLVM_ABI_FOR_TEST extern cl::opt<bool> VPlanPrintBeforeAll;
46LLVM_ABI_FOR_TEST extern cl::opt<bool> VPlanPrintAfterAll;
47LLVM_ABI_FOR_TEST extern cl::list<std::string> VPlanPrintBeforePasses;
48LLVM_ABI_FOR_TEST extern cl::list<std::string> VPlanPrintAfterPasses;
49LLVM_ABI_FOR_TEST extern cl::opt<bool> VPlanPrintVectorRegionScope;
50#endif
51
52struct VPlanTransforms {
53 /// Helper to run a VPlan pass \p Pass on \p VPlan, forwarding extra arguments
54 /// to the pass. Performs verification/printing after each VPlan pass if
55 /// requested via command line options.
56 template <bool EnableVerify = true, typename PassTy, typename... ArgsTy>
57 static decltype(auto) runPass(StringRef PassName, PassTy &&Pass, VPlan &Plan,
58 ArgsTy &&...Args) {
59#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
60 auto PrintPlan = [&](StringRef BeforeOrAfterStr) {
61 dbgs()
62 << "VPlan for loop in '"
63 << Plan.getScalarHeader()->getIRBasicBlock()->getParent()->getName()
64 << "' " << BeforeOrAfterStr << " " << PassName << '\n';
65 if (VPlanPrintVectorRegionScope && Plan.getVectorLoopRegion())
66 Plan.getVectorLoopRegion()->print(dbgs());
67 else
68 dbgs() << Plan << '\n';
69 };
70
71 auto MatchesPassListOption = [&](const cl::list<std::string> &ListOpt) {
72 return (ListOpt.getNumOccurrences() > 0 &&
73 any_of(ListOpt, [PassName](StringRef Entry) {
74 return Regex(Entry).match(PassName);
75 }));
76 };
77
78 if (VPlanPrintBeforeAll || MatchesPassListOption(VPlanPrintBeforePasses))
79 PrintPlan("before");
80#endif
81
82 scope_exit PostTransformActions{[&]() {
83#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
84 // Make sure to print before verification, so that output is more useful
85 // in case of failures:
86 if (VPlanPrintAfterAll || MatchesPassListOption(VPlanPrintAfterPasses))
87 PrintPlan("after");
88#endif
89 if (VerifyEachVPlan && EnableVerify) {
90 if (!verifyVPlanIsValid(Plan))
91 report_fatal_error(reason: "Broken VPlan found, compilation aborted!");
92 }
93 }};
94
95 return std::forward<PassTy>(Pass)(Plan, std::forward<ArgsTy>(Args)...);
96 }
97#define RUN_VPLAN_PASS(PASS, ...) \
98 llvm::VPlanTransforms::runPass(#PASS, PASS, __VA_ARGS__)
99#define RUN_VPLAN_PASS_NO_VERIFY(PASS, ...) \
100 llvm::VPlanTransforms::runPass<false>(#PASS, PASS, __VA_ARGS__)
101
102 /// Create a base VPlan0, serving as the common starting point for all later
103 /// candidates. It consists of an initial plain CFG loop with loop blocks from
104 /// \p TheLoop being directly translated to VPBasicBlocks with VPInstruction
105 /// corresponding to the input IR.
106 ///
107 /// The created loop is wrapped in an initial skeleton to facilitate
108 /// vectorization, consisting of a vector pre-header, an exit block for the
109 /// main vector loop (middle.block) and a new block as preheader of the scalar
110 /// loop (scalar.ph). See below for an illustration. It also creates a
111 /// VPValue expression for the original trip count.
112 ///
113 /// [ ] <-- Plan's entry VPIRBasicBlock, wrapping the original loop's
114 /// / \ old preheader. Will contain iteration number check and SCEV
115 /// | | expansions.
116 /// | |
117 /// / v
118 /// | [ ] <-- vector loop bypass (may consist of multiple blocks) will be
119 /// | / | added later.
120 /// | / v
121 /// || [ ] <-- vector pre header.
122 /// |/ |
123 /// | v
124 /// | [ ] \ <-- plain CFG loop wrapping original loop to be vectorized.
125 /// | [ ]_|
126 /// | |
127 /// | v
128 /// | [ ] <--- middle-block with the branch to successors
129 /// | / |
130 /// | / |
131 /// | | v
132 /// \--->[ ] <--- scalar preheader (initial a VPBasicBlock, which will be
133 /// | | replaced later by a VPIRBasicBlock wrapping the scalar
134 /// | | preheader basic block.
135 /// | |
136 /// v <-- edge from middle to exit iff epilogue is not required.
137 /// | [ ] \
138 /// | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue,
139 /// | | header wrapped in VPIRBasicBlock).
140 /// \ |
141 /// \ v
142 /// >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks.
143 LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
144 buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy,
145 PredicatedScalarEvolution &PSE, LoopVersioning *LVer = nullptr);
146
147 /// Replace VPPhi recipes in \p Plan's header with corresponding
148 /// VPHeaderPHIRecipe subclasses for inductions, reductions, and
149 /// fixed-order recurrences. This processes all header phis and creates
150 /// the appropriate widened recipe for each one. For fixed-order
151 /// recurrences, also creates FirstOrderRecurrenceSplice instructions and
152 /// sinks/hoists users as needed. Returns false if any fixed-order
153 /// recurrence cannot be handled.
154 static bool createHeaderPhiRecipes(
155 VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &OrigLoop,
156 const VPDominatorTree &VPDT,
157 const MapVector<PHINode *, InductionDescriptor> &Inductions,
158 const MapVector<PHINode *, RecurrenceDescriptor> &Reductions,
159 const SmallPtrSetImpl<const PHINode *> &FixedOrderRecurrences,
160 const SmallPtrSetImpl<PHINode *> &InLoopReductions, bool AllowReordering);
161
162 /// Finalize SCEV predicates by adding induction predicates from \p Plan to
163 /// \p PSE and checking constraints. Returns false if predicated IVs have
164 /// outside-loop uses via ExitingIVValue, if SCEV predicate complexity exceeds
165 /// \p SCEVCheckThreshold, or if predicates are needed but \p OptForSize is
166 /// true.
167 static bool
168 finalizeSCEVPredicates(VPlan &Plan, PredicatedScalarEvolution &PSE,
169 bool OptForSize, unsigned SCEVCheckThreshold,
170 OptimizationRemarkEmitter *ORE, Loop *TheLoop);
171
172 /// Create VPReductionRecipes for in-loop reductions. This processes chains
173 /// of operations contributing to in-loop reductions and creates appropriate
174 /// VPReductionRecipe instances.
175 static void createInLoopReductionRecipes(VPlan &Plan, ElementCount MinVF);
176
177 /// Update \p Plan to account for all early exits. If \p Style is not
178 /// NoUncountableExit, handles uncountable early exits and checks that all
179 /// loads are dereferenceable. Returns false if a non-dereferenceable load is
180 /// found.
181 LLVM_ABI_FOR_TEST static bool
182 handleEarlyExits(VPlan &Plan, UncountableExitStyle Style, Loop *TheLoop,
183 PredicatedScalarEvolution &PSE, DominatorTree &DT,
184 AssumptionCache *AC);
185
186 /// If a check is needed to guard executing the scalar epilogue loop, it will
187 /// be added to the middle block.
188 LLVM_ABI_FOR_TEST static void addMiddleCheck(VPlan &Plan);
189
190 // Create a check in \p CheckBlock to see if the vector loop should be
191 // executed. May create VPExpandSCEV recipes in the plan's entry block.
192 static void addMinimumIterationCheck(
193 VPlan &Plan, ElementCount VF, unsigned UF,
194 ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
195 bool TailFolded, Loop *OrigLoop, const uint32_t *MinItersBypassWeights,
196 DebugLoc DL, PredicatedScalarEvolution &PSE, VPBasicBlock *CheckBlock);
197
198 /// Add a new check block before the vector preheader to \p Plan to check if
199 /// the main vector loop should be executed (TC >= VF * UF).
200 static void
201 addIterationCountCheckBlock(VPlan &Plan, ElementCount VF, unsigned UF,
202 bool RequiresScalarEpilogue, Loop *OrigLoop,
203 const uint32_t *MinItersBypassWeights,
204 DebugLoc DL, PredicatedScalarEvolution &PSE);
205
206 /// Add a check to \p Plan to see if the epilogue vector loop should be
207 /// executed.
208 static void addMinimumVectorEpilogueIterationCheck(
209 VPlan &Plan, Value *VectorTripCount, bool RequiresScalarEpilogue,
210 ElementCount EpilogueVF, unsigned EpilogueUF, unsigned MainLoopStep,
211 unsigned EpilogueLoopStep, ScalarEvolution &SE);
212
213 /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
214 /// flat CFG into a hierarchical CFG. For the outermost loop, also create the
215 /// canonical IV's increment and adjust the latch terminator: replace
216 /// BranchOnCond with BranchOnCount, using \p DL for the canonical IV.
217 LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan, DebugLoc DL);
218
219 /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
220 /// VPValue and connect the block to \p Plan, using the VPValue as branch
221 /// condition.
222 static void attachVPCheckBlock(VPlan &Plan, VPValue *Cond,
223 VPBasicBlock *CheckBlock,
224 bool AddBranchWeights);
225 static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
226 bool AddBranchWeights);
227
228 /// Replaces the VPInstructions in \p Plan with corresponding
229 /// widen recipes. Returns false if any VPInstructions could not be converted
230 /// to a wide recipe if needed.
231 LLVM_ABI_FOR_TEST static bool
232 tryToConvertVPInstructionsToVPRecipes(VPlan &Plan,
233 const TargetLibraryInfo &TLI);
234
235 /// Try to legalize reductions with multiple in-loop uses. Currently only
236 /// strict and non-strict min/max reductions used by FindLastIV reductions are
237 /// supported, corresponding to computing the first and last argmin/argmax,
238 /// respectively. Otherwise return false.
239 static bool handleMultiUseReductions(VPlan &Plan,
240 OptimizationRemarkEmitter *ORE,
241 Loop *TheLoop);
242
243 /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If they do,
244 /// try to update the vector loop to exit early if any input is NaN and resume
245 /// executing in the scalar loop to handle the NaNs there. Return false if
246 /// this attempt was unsuccessful.
247 static bool handleMaxMinNumReductions(VPlan &Plan);
248
249 /// Check if \p Plan contains any FindLast reductions. If it does, try to
250 /// update the vector loop to save the appropriate state using selects
251 /// for entire vectors for both the latest mask containing at least one active
252 /// element and the corresponding data vector. Return false if this attempt
253 /// was unsuccessful.
254 static bool handleFindLastReductions(VPlan &Plan);
255
256 /// Clear NSW/NUW flags from reduction instructions if necessary.
257 static void clearReductionWrapFlags(VPlan &Plan);
258
259 /// Explicitly unroll \p Plan by \p UF.
260 static void unrollByUF(VPlan &Plan, unsigned UF);
261
262 /// Replace replicating VPReplicateRecipe, VPScalarIVStepsRecipe and
263 /// VPInstruction in \p Plan with \p VF single-scalar recipes. Replicate
264 /// regions are dissolved by replicating their blocks and their recipes \p VF
265 /// times.
266 /// TODO: Also dissolve replicate regions with live outs.
267 static void replicateByVF(VPlan &Plan, ElementCount VF);
268
269 /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
270 /// resulting plan to \p BestVF and \p BestUF.
271 static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
272 unsigned BestUF,
273 PredicatedScalarEvolution &PSE);
274
275 /// Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL
276 /// is known to be <= VF, replacing them with the AVL directly.
277 static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF,
278 PredicatedScalarEvolution &PSE);
279
280 /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
281 /// optimizations, dead recipe removal, replicate region optimizations and
282 /// block merging.
283 LLVM_ABI_FOR_TEST static void optimize(VPlan &Plan);
284
285 /// Remove redundant VPBasicBlocks by merging them into their single
286 /// predecessor if the latter has a single successor.
287 static bool mergeBlocksIntoPredecessors(VPlan &Plan);
288
289 /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
290 /// region block and remove the mask operand. Optimize the created regions by
291 /// iteratively sinking scalar operands into the region, followed by merging
292 /// regions until no improvements are remaining.
293 static void createAndOptimizeReplicateRegions(VPlan &Plan);
294
295 /// Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an
296 /// (active-lane-mask recipe, wide canonical IV, trip-count). If \p
297 /// UseActiveLaneMaskForControlFlow is true, introduce an
298 /// VPActiveLaneMaskPHIRecipe.
299 static void addActiveLaneMask(VPlan &Plan,
300 bool UseActiveLaneMaskForControlFlow);
301
302 /// Insert truncates and extends for any truncated recipe. Redundant casts
303 /// will be folded later.
304 static void
305 truncateToMinimalBitwidths(VPlan &Plan,
306 const MapVector<Instruction *, uint64_t> &MinBWs);
307
308 /// Replace symbolic strides from \p StridesMap in \p Plan with constants when
309 /// possible.
310 static void
311 replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE,
312 const DenseMap<Value *, const SCEV *> &StridesMap,
313 const VPDominatorTree &VPDT);
314
315 /// Drop poison flags from recipes that may generate a poison value that is
316 /// used after vectorization, even when their operands are not poison. Those
317 /// recipes meet the following conditions:
318 /// * Contribute to the address computation of a recipe generating a widen
319 /// memory load/store (VPWidenMemoryInstructionRecipe or
320 /// VPInterleaveRecipe).
321 /// * Such a widen memory load/store is masked, but not with the header mask.
322 static void dropPoisonGeneratingRecipes(VPlan &Plan);
323
324 /// Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
325 /// replaces all uses of the canonical IV except for the canonical IV
326 /// increment with a VPCurrentIterationPHIRecipe. The canonical IV is only
327 /// used to control the loop after this transformation.
328 static void
329 addExplicitVectorLength(VPlan &Plan,
330 const std::optional<unsigned> &MaxEVLSafeElements);
331
332 /// Optimize recipes which use an EVL-based header mask to VP intrinsics, for
333 /// example:
334 ///
335 /// %mask = icmp ult step-vector, EVL
336 /// %load = load %ptr, %mask
337 /// -->
338 /// %load = vp.load %ptr, EVL
339 static void optimizeEVLMasks(VPlan &Plan);
340
341 // For each Interleave Group in \p InterleaveGroups replace the Recipes
342 // widening its memory instructions with a single VPInterleaveRecipe at its
343 // insertion point.
344 static void createInterleaveGroups(
345 VPlan &Plan,
346 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
347 &InterleaveGroups,
348 const bool &EpilogueAllowed);
349
350 /// Transform widen memory recipes into strided access recipes when legal
351 /// and profitable. Clamps \p Range to maintain consistency with widen
352 /// decisions of \p Plan, and uses \p Ctx to evaluate the cost.
353 static void convertToStridedAccesses(VPlan &Plan,
354 PredicatedScalarEvolution &PSE, Loop &L,
355 VPCostContext &Ctx, VFRange &Range);
356
357 /// Remove dead recipes from \p Plan.
358 static void removeDeadRecipes(VPlan &Plan);
359
360 /// Update \p Plan to account for uncountable early exits by introducing
361 /// appropriate branching logic in the latch that handles early exits and the
362 /// latch exit condition. Multiple exits are handled with a dispatch block
363 /// that determines which exit to take based on lane-by-lane semantics.
364 static bool handleUncountableEarlyExits(
365 VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB,
366 VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE,
367 DominatorTree &DT, AssumptionCache *AC, UncountableExitStyle Style);
368
369 /// Replaces the exit condition from
370 /// (branch-on-cond eq CanonicalIVInc, VectorTripCount)
371 /// to
372 /// (branch-on-cond eq AVLNext, 0)
373 static void convertEVLExitCond(VPlan &Plan);
374
375 /// Replace loop regions with explicit CFG.
376 static void dissolveLoopRegions(VPlan &Plan);
377
378 /// Expand BranchOnTwoConds instructions into explicit CFG with
379 /// BranchOnCond instructions. Should be called after dissolveLoopRegions.
380 static void expandBranchOnTwoConds(VPlan &Plan);
381
382 /// Transform loops with variable-length stepping after region
383 /// dissolution.
384 ///
385 /// Once loop regions are replaced with explicit CFG, loops can step with
386 /// variable vector lengths instead of fixed lengths. This transformation:
387 /// * Makes CurrentIteration-Phi concrete.
388 // * Removes CanonicalIV and increment.
389 static void convertToVariableLengthStep(VPlan &Plan);
390
391 /// Lower abstract recipes to concrete ones, that can be codegen'd.
392 static void convertToConcreteRecipes(VPlan &Plan);
393
394 /// This function converts initial recipes to the abstract recipes and clamps
395 /// \p Range based on cost model for following optimizations and cost
396 /// estimations. The converted abstract recipes will lower to concrete
397 /// recipes before codegen.
398 static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
399 VFRange &Range);
400
401 /// Perform instcombine-like simplifications on recipes in \p Plan.
402 static void simplifyRecipes(VPlan &Plan);
403
404 /// Cancel out redundant reverses in \p Plan, e.g. reverse(reverse(x)) -> x.
405 static void simplifyReverses(VPlan &Plan);
406
407 /// Remove BranchOnCond recipes with true or false conditions together with
408 /// removing dead edges to their successors. If \p OnlyLatches is true, only
409 /// process loop latches. Returns true if incoming values from any phi-like
410 /// recipe have been removed.
411 static bool removeBranchOnConst(VPlan &Plan, bool OnlyLatches = false);
412
413 /// Perform common-subexpression-elimination on \p Plan.
414 static void cse(VPlan &Plan);
415
416 /// If there's a single exit block, optimize its phi recipes that use exiting
417 /// IV values by feeding them precomputed end values instead, possibly taken
418 /// one step backwards.
419 static void optimizeInductionLiveOutUsers(VPlan &Plan,
420 PredicatedScalarEvolution &PSE,
421 bool FoldTail);
422
423 /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
424 static void materializeBroadcasts(VPlan &Plan);
425
426 /// Hoist predicated loads from the same address to the loop entry block, if
427 /// they are guaranteed to execute on both paths (i.e., in replicate regions
428 /// with complementary masks P and NOT P).
429 static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE,
430 const Loop *L);
431
432 /// Sink predicated stores to the same address with complementary predicates
433 /// (P and NOT P) to an unconditional store with select recipes for the
434 /// stored values. This eliminates branching overhead when all paths
435 /// unconditionally store to the same location.
436 static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE,
437 const Loop *L);
438
439 // Materialize vector trip counts for constants early if it can simply be
440 // computed as (Original TC / VF * UF) * VF * UF.
441 static void
442 materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF,
443 unsigned BestUF,
444 PredicatedScalarEvolution &PSE);
445
446 /// Materialize vector trip count computations to a set of VPInstructions.
447 /// \p Step is used as the step value for the trip count computation.
448 /// \p MaxRuntimeStep is the maximum possible runtime value of Step, used to
449 /// prove the trip count is divisible by the step for scalable VFs.
450 static void materializeVectorTripCount(
451 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
452 bool RequiresScalarEpilogue, VPValue *Step,
453 std::optional<uint64_t> MaxRuntimeStep = std::nullopt);
454
455 /// Materialize the backedge-taken count to be computed explicitly using
456 /// VPInstructions.
457 static void materializeBackedgeTakenCount(VPlan &Plan,
458 VPBasicBlock *VectorPH);
459
460 /// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values
461 /// into vectors and Unpack recipes to extract scalars from vectors as
462 /// needed.
463 static void materializePacksAndUnpacks(VPlan &Plan);
464
465 /// Materialize UF, VF and VFxUF to be computed explicitly using
466 /// VPInstructions.
467 static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
468 ElementCount VF);
469
470 /// Attaches the alias-mask to the existing header-mask.
471 static void attachAliasMaskToHeaderMask(VPlan &Plan);
472
473 /// Materializes within the \p AliasCheckVPBB block. Updates the header mask
474 /// of the loop to use the alias mask. Returns the clamped VF.
475 static VPValue *materializeAliasMask(VPlan &Plan,
476 VPBasicBlock *AliasCheckVPBB,
477 ArrayRef<PointerDiffInfo> DiffChecks);
478
479 /// Materializes the alias mask within a check block before the loop. The
480 /// vector loop will only be entered if the clamped VF from the alias mask
481 /// is not scalar.
482 static void materializeAliasMaskCheckBlock(
483 VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights);
484
485 /// Try to expand VPExpandSCEVRecipes in \p Plan's entry block to
486 /// VPInstructions. Recipes that cannot be expanded (like casts, min/max) are
487 /// kept for later IR-level expansion.
488 static void expandSCEVsToVPInstructions(VPlan &Plan, ScalarEvolution &SE);
489
490 /// Expand remaining VPExpandSCEVRecipes in \p Plan's entry block using
491 /// SCEVExpander. Each VPExpandSCEVRecipe is replaced with a live-in wrapping
492 /// the expanded IR value. A mapping from SCEV expressions to their expanded
493 /// IR value is returned.
494 static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
495 ScalarEvolution &SE);
496
497 /// Try to find a single VF among \p Plan's VFs for which all interleave
498 /// groups (with known minimum VF elements) can be replaced by wide loads and
499 /// stores processing VF elements, if all transformed interleave groups access
500 /// the full vector width (checked via the maximum vector register width). If
501 /// the transformation can be applied, the original \p Plan will be split in
502 /// 2:
503 /// 1. The original Plan with the single VF containing the optimized recipes
504 /// using wide loads instead of interleave groups.
505 /// 2. A new clone which contains all VFs of Plan except the optimized VF.
506 ///
507 /// This effectively is a very simple form of loop-aware SLP, where we use
508 /// interleave groups to identify candidates.
509 static std::unique_ptr<VPlan>
510 narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);
511
512 /// Adapts the vector loop region for tail folding by introducing a header
513 /// mask and conditionally executing the content of the region:
514 ///
515 /// Vector loop region before:
516 /// +-------------------------------------------+
517 /// |%iv = ... |
518 /// |... |
519 /// |%iv.next = add %iv, vfxuf |
520 /// |branch-on-count %iv.next, vector-trip-count|
521 /// +-------------------------------------------+
522 ///
523 /// Vector loop region after:
524 /// +-------------------------------------------+
525 /// |%iv = ... |
526 /// |%wide.iv = widen-canonical-iv ... |
527 /// |%header-mask = icmp ule %wide.iv, BTC |
528 /// |branch-on-cond %header-mask |---+
529 /// +-------------------------------------------+ |
530 /// | |
531 /// v |
532 /// +-------------------------------------------+ |
533 /// | ... | |
534 /// +-------------------------------------------+ |
535 /// | |
536 /// v |
537 /// +-------------------------------------------+ |
538 /// |<phis> = phi [..., ...], [poison, header] |
539 /// |%iv.next = add %iv, vfxuf |<--+
540 /// |branch-on-count %iv.next, vector-trip-count|
541 /// +-------------------------------------------+
542 ///
543 /// Any VPInstruction::ExtractLastLanes are also updated to extract from the
544 /// last active lane of the header mask.
545 static void foldTailByMasking(VPlan &Plan);
546
547 /// Predicate and linearize the control-flow in the only loop region of
548 /// \p Plan.
549 static void introduceMasksAndLinearize(VPlan &Plan);
550
551 /// Replace a VPWidenCanonicalIVRecipe if it is present in \p Plan, with a
552 /// VPWidenIntOrFpInductionRecipe, provided it would not cause additional
553 /// spills for \p VF at unroll factor \p UF.
554 static void replaceWideCanonicalIVWithWideIV(
555 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
556 TargetTransformInfo::TargetCostKind CostKind, ElementCount VF,
557 unsigned UF, const SmallPtrSetImpl<const Value *> &ValuesToIgnore);
558
559 /// Add branch weight metadata, if the \p Plan's middle block is terminated by
560 /// a BranchOnCond recipe.
561 static void
562 addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
563 std::optional<unsigned> VScaleForTuning);
564
565 /// Adjust first-order recurrence users in the middle block: create
566 /// penultimate element extracts for LCSSA phi users, and handle penultimate
567 /// extracts of the last active lane edge.
568 static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan,
569 VFRange &Range);
570
571 /// Optimize FindLast reductions selecting IVs (or expressions of IVs) by
572 /// converting them to FindIV reductions, if their IV range excludes a
573 /// suitable sentinel value. For expressions of IVs, the expression is sunk
574 /// to the middle block.
575 static void optimizeFindIVReductions(VPlan &Plan,
576 PredicatedScalarEvolution &PSE, Loop &L);
577
578 /// Detect and create partial reduction recipes for scaled reductions in
579 /// \p Plan. Must be called after recipe construction. If partial reductions
580 /// are only valid for a subset of VFs in Range, Range.End is updated.
581 static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx,
582 VFRange &Range);
583
584 /// Convert load/store VPInstructions in \p Plan into widened or replicate
585 /// recipes. Non load/store input instructions are left unchanged.
586 static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
587 VPRecipeBuilder &RecipeBuilder,
588 PredicatedScalarEvolution &PSE,
589 const Loop *L);
590
591 /// Make VPlan-based scalarization decision prior to delegating to the ones
592 /// made by the legacy CM. Only transforms "usesFirstLaneOnly` def-use chains
593 /// enabled by prior widening of consecutive memory operations for now.
594 static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range);
595
596 /// Convert call VPInstructions in \p Plan into widened call, vector
597 /// intrinsic or replicate recipes based on a cost comparison via \p CostCtx.
598 static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range,
599 VPRecipeBuilder &RecipeBuilder,
600 VPCostContext &CostCtx);
601};
602
603} // namespace llvm
604
605#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
606