1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/PostOrderIterator.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/TypeSwitch.h"
30#include "llvm/Analysis/IVDescriptors.h"
31#include "llvm/Analysis/InstSimplifyFolder.h"
32#include "llvm/Analysis/Loads.h"
33#include "llvm/Analysis/LoopInfo.h"
34#include "llvm/Analysis/MemoryLocation.h"
35#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
36#include "llvm/Analysis/ScopedNoAliasAA.h"
37#include "llvm/Analysis/VectorUtils.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
41#include "llvm/Support/Casting.h"
42#include "llvm/Support/TypeSize.h"
43#include "llvm/Transforms/Utils/LoopUtils.h"
44#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
50bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
53 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
54 Plan.getVectorLoopRegion());
55 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: RPOT)) {
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(Range: make_range(x: VPBB->begin(), y: EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(Val: VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(Val: &Ingredient)) {
73 auto *Phi = cast<PHINode>(Val: PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
75 Phi->getName());
76 } else if (auto *VPI = dyn_cast<VPInstruction>(Val: &Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Val: Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(N: 0), nullptr /*Mask*/,
82 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
83 } else if (StoreInst *Store = dyn_cast<StoreInst>(Val: Inst)) {
84 NewRecipe = new VPWidenStoreRecipe(
85 *Store, Ingredient.getOperand(N: 1), Ingredient.getOperand(N: 0),
86 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
87 Ingredient.getDebugLoc());
88 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
89 NewRecipe = new VPWidenGEPRecipe(GEP->getSourceElementType(),
90 Ingredient.operands(), *VPI,
91 Ingredient.getDebugLoc(), GEP);
92 } else if (CallInst *CI = dyn_cast<CallInst>(Val: Inst)) {
93 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, TLI: &TLI);
94 if (VectorID == Intrinsic::not_intrinsic)
95 return false;
96
97 // The noalias.scope.decl intrinsic declares a noalias scope that
98 // is valid for a single iteration. Emitting it as a single-scalar
99 // replicate would incorrectly extend the scope across multiple
100 // original iterations packed into one vector iteration.
101 // FIXME: If we want to vectorize this loop, then we have to drop
102 // all the associated !alias.scope and !noalias.
103 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
104 return false;
105
106 // These intrinsics are recognized by getVectorIntrinsicIDForCall
107 // but are not widenable. Emit them as replicate instead of widening.
108 if (VectorID == Intrinsic::assume ||
109 VectorID == Intrinsic::lifetime_end ||
110 VectorID == Intrinsic::lifetime_start ||
111 VectorID == Intrinsic::sideeffect ||
112 VectorID == Intrinsic::pseudoprobe) {
113 // If the operand of llvm.assume holds before vectorization, it will
114 // also hold per lane.
115 // llvm.pseudoprobe requires to be duplicated per lane for accurate
116 // sample count.
117 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
118 VectorID != Intrinsic::pseudoprobe;
119 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
120 /*IsSingleScalar=*/IsSingleScalar,
121 /*Mask=*/nullptr, *VPI, *VPI,
122 Ingredient.getDebugLoc());
123 } else {
124 NewRecipe = new VPWidenIntrinsicRecipe(
125 *CI, VectorID, drop_end(RangeOrContainer: Ingredient.operands()), CI->getType(),
126 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
127 }
128 } else if (auto *CI = dyn_cast<CastInst>(Val: Inst)) {
129 NewRecipe = new VPWidenCastRecipe(
130 CI->getOpcode(), Ingredient.getOperand(N: 0), CI->getType(), CI,
131 VPIRFlags(*CI), VPIRMetadata(*CI));
132 } else {
133 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
134 *VPI, Ingredient.getDebugLoc());
135 }
136 } else {
137 assert(isa<VPWidenIntOrFpInductionRecipe>(&Ingredient) &&
138 "inductions must be created earlier");
139 continue;
140 }
141
142 NewRecipe->insertBefore(InsertPos: &Ingredient);
143 if (NewRecipe->getNumDefinedValues() == 1)
144 VPV->replaceAllUsesWith(New: NewRecipe->getVPSingleValue());
145 else
146 assert(NewRecipe->getNumDefinedValues() == 0 &&
147 "Only recpies with zero or one defined values expected");
148 Ingredient.eraseFromParent();
149 }
150 }
151 return true;
152}
153
154/// Helper for extra no-alias checks via known-safe recipe and SCEV.
155class SinkStoreInfo {
156 SmallPtrSet<VPReplicateRecipe *, 4> ExcludeRecipes;
157 VPReplicateRecipe &GroupLeader;
158 PredicatedScalarEvolution *PSE = nullptr;
159 const Loop *L = nullptr;
160
161 // Return true if \p A and \p B are known to not alias for all VFs in the
162 // plan, checked via the distance between the accesses
163 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
164 if (A->getOpcode() != Instruction::Store ||
165 B->getOpcode() != Instruction::Store)
166 return false;
167
168 if (!PSE || !L)
169 return A == B;
170
171 VPValue *AddrA = A->getOperand(N: 1);
172 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(V: AddrA, PSE&: *PSE, L);
173 VPValue *AddrB = B->getOperand(N: 1);
174 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(V: AddrB, PSE&: *PSE, L);
175 if (isa<SCEVCouldNotCompute>(Val: SCEVA) || isa<SCEVCouldNotCompute>(Val: SCEVB))
176 return false;
177
178 const APInt *Distance;
179 ScalarEvolution &SE = *PSE->getSE();
180 if (!match(S: SE.getMinusSCEV(LHS: SCEVA, RHS: SCEVB), P: m_scev_APInt(C&: Distance)))
181 return false;
182
183 const DataLayout &DL = SE.getDataLayout();
184 Type *TyA = A->getOperand(N: 0)->getScalarType();
185 uint64_t SizeA = DL.getTypeStoreSize(Ty: TyA);
186 Type *TyB = B->getOperand(N: 0)->getScalarType();
187 uint64_t SizeB = DL.getTypeStoreSize(Ty: TyB);
188
189 // Use the maximum store size to ensure no overlap from either direction.
190 // Currently only handles fixed sizes, as it is only used for
191 // replicating VPReplicateRecipes.
192 uint64_t MaxStoreSize = std::max(a: SizeA, b: SizeB);
193
194 auto VFs = B->getParent()->getPlan()->vectorFactors();
195 ElementCount MaxVF = *max_element(Range&: VFs, C: ElementCount::isKnownLT);
196 if (MaxVF.isScalable())
197 return false;
198 return Distance->abs().uge(
199 RHS: MaxVF.multiplyCoefficientBy(RHS: MaxStoreSize).getFixedValue());
200 }
201
202public:
203 SinkStoreInfo(ArrayRef<VPReplicateRecipe *> ExcludeRecipes,
204 VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE,
205 const Loop &L)
206 : ExcludeRecipes(ExcludeRecipes.begin(), ExcludeRecipes.end()),
207 GroupLeader(GroupLeader), PSE(&PSE), L(&L) {}
208
209 SinkStoreInfo(VPReplicateRecipe &GroupLeader) : GroupLeader(GroupLeader) {}
210
211 /// Return true if \p R should be skipped during alias checking, either
212 /// because it's in the exclude set or because no-alias can be proven via
213 /// SCEV.
214 bool shouldSkip(VPRecipeBase &R) const {
215 auto *Store = dyn_cast<VPReplicateRecipe>(Val: &R);
216 return ExcludeRecipes.contains(Ptr: Store) ||
217 (Store && isNoAliasViaDistance(A: Store, B: &GroupLeader));
218 }
219};
220
221/// Check if a memory operation doesn't alias with memory operations using
222/// scoped noalias metadata, in blocks in the single-successor chain between \p
223/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
224/// write to memory are checked (for load hoisting). Otherwise recipes that both
225/// read and write memory are checked, and SCEV is used to prove no-alias
226/// between the group leader and other replicate recipes (for store sinking).
227static bool
228canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc,
229 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
230 std::optional<SinkStoreInfo> SinkInfo = {}) {
231 bool CheckReads = SinkInfo.has_value();
232 if (!MemLoc.AATags.Scope)
233 return false;
234
235 for (VPBasicBlock *VPBB :
236 VPBlockUtils::blocksInSingleSuccessorChainBetween(FirstBB, LastBB)) {
237 for (VPRecipeBase &R : *VPBB) {
238 if (SinkInfo && SinkInfo->shouldSkip(R))
239 continue;
240
241 // Skip recipes that don't need checking.
242 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
243 continue;
244
245 auto Loc = vputils::getMemoryLocation(R);
246 if (!Loc)
247 // Conservatively assume aliasing for memory operations without
248 // location.
249 return false;
250
251 if (ScopedNoAliasAAResult::alias(LocA: *Loc, LocB: MemLoc) != AliasResult::NoAlias)
252 return false;
253 }
254 }
255 return true;
256}
257
258/// Get the value type of the replicate load or store. \p IsLoad indicates
259/// whether it is a load.
260static Type *getLoadStoreValueType(VPReplicateRecipe *R, bool IsLoad) {
261 return (IsLoad ? R : R->getOperand(N: 0))->getScalarType();
262}
263
264/// Collect either replicated Loads or Stores grouped by their address SCEV and
265/// their load-store type, in a deep-traversal of the vector loop region in \p
266/// Plan.
267template <unsigned Opcode>
268static SmallVector<SmallVector<VPReplicateRecipe *, 4>>
269collectGroupedReplicateMemOps(
270 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
271 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
272 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
273 "Only Load and Store opcodes supported");
274 constexpr bool IsLoad = (Opcode == Instruction::Load);
275 SmallDenseMap<std::pair<const SCEV *, const Type *>,
276 SmallVector<VPReplicateRecipe *, 4>>
277 RecipesByAddressAndType;
278 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
279 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
280 for (VPRecipeBase &R : *VPBB) {
281 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
282 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
283 continue;
284
285 // For loads, operand 0 is address; for stores, operand 1 is address.
286 VPValue *Addr = RepR->getOperand(N: IsLoad ? 0 : 1);
287 const Type *LoadStoreTy = getLoadStoreValueType(R: RepR, IsLoad);
288 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(V: Addr, PSE, L);
289 if (!isa<SCEVCouldNotCompute>(Val: AddrSCEV))
290 RecipesByAddressAndType[{AddrSCEV, LoadStoreTy}].push_back(Elt: RepR);
291 }
292 }
293 auto Groups = to_vector(Range: RecipesByAddressAndType.values());
294 VPDominatorTree VPDT(Plan);
295 for (auto &Group : Groups) {
296 // Sort mem ops by dominance order, with earliest (most dominating) first.
297 stable_sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
298 return VPDT.properlyDominates(A, B);
299 });
300 }
301 return Groups;
302}
303
304static bool sinkScalarOperands(VPlan &Plan) {
305 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
306 bool ScalarVFOnly = Plan.hasScalarVFOnly();
307 bool Changed = false;
308
309 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
310 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
311 VPBasicBlock *SinkTo, VPValue *Op) {
312 auto *Candidate =
313 dyn_cast_or_null<VPSingleDefRecipe>(Val: Op->getDefiningRecipe());
314 if (!Candidate)
315 return;
316
317 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
318 // for now.
319 if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Val: Candidate))
320 return;
321
322 if (Candidate->getParent() == SinkTo ||
323 vputils::cannotHoistOrSinkRecipe(R: *Candidate, /*Sinking=*/true))
324 return;
325
326 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: Candidate))
327 if (!ScalarVFOnly && RepR->isSingleScalar())
328 return;
329
330 WorkList.insert(X: {SinkTo, Candidate});
331 };
332
333 // First, collect the operands of all recipes in replicate blocks as seeds for
334 // sinking.
335 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Range&: Iter)) {
336 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
337 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
338 continue;
339 VPBasicBlock *VPBB = cast<VPBasicBlock>(Val: EntryVPBB->getSuccessors().front());
340 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
341 continue;
342 for (auto &Recipe : *VPBB)
343 for (VPValue *Op : Recipe.operands())
344 InsertIfValidSinkCandidate(VPBB, Op);
345 }
346
347 // Try to sink each replicate or scalar IV steps recipe in the worklist.
348 for (unsigned I = 0; I != WorkList.size(); ++I) {
349 VPBasicBlock *SinkTo;
350 VPSingleDefRecipe *SinkCandidate;
351 std::tie(args&: SinkTo, args&: SinkCandidate) = WorkList[I];
352
353 // All recipe users of SinkCandidate must be in the same block SinkTo or all
354 // users outside of SinkTo must only use the first lane of SinkCandidate. In
355 // the latter case, we need to duplicate SinkCandidate.
356 auto UsersOutsideSinkTo =
357 make_filter_range(Range: SinkCandidate->users(), Pred: [SinkTo](VPUser *U) {
358 return cast<VPRecipeBase>(Val: U)->getParent() != SinkTo;
359 });
360 if (any_of(Range&: UsersOutsideSinkTo, P: [SinkCandidate](VPUser *U) {
361 return !U->usesFirstLaneOnly(Op: SinkCandidate);
362 }))
363 continue;
364 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
365
366 if (NeedsDuplicating) {
367 if (ScalarVFOnly)
368 continue;
369 VPSingleDefRecipe *Clone;
370 if (auto *SinkCandidateRepR =
371 dyn_cast<VPReplicateRecipe>(Val: SinkCandidate)) {
372 // TODO: Handle converting to uniform recipes as separate transform,
373 // then cloning should be sufficient here.
374 Clone = VPBuilder::createSingleScalarOp(
375 Opcode: SinkCandidateRepR->getOpcode(), Operands: SinkCandidate->operands(),
376 /*Mask=*/nullptr, Flags: *SinkCandidateRepR, Metadata: *SinkCandidateRepR,
377 DL: SinkCandidate->getDebugLoc(), UV: SinkCandidate->getUnderlyingInstr());
378 // TODO: add ".cloned" suffix to name of Clone's VPValue.
379 } else {
380 Clone = SinkCandidate->clone();
381 }
382
383 Clone->insertBefore(InsertPos: SinkCandidate);
384 SinkCandidate->replaceUsesWithIf(New: Clone, ShouldReplace: [SinkTo](VPUser &U, unsigned) {
385 return cast<VPRecipeBase>(Val: &U)->getParent() != SinkTo;
386 });
387 }
388 SinkCandidate->moveBefore(BB&: *SinkTo, I: SinkTo->getFirstNonPhi());
389 for (VPValue *Op : SinkCandidate->operands())
390 InsertIfValidSinkCandidate(SinkTo, Op);
391 Changed = true;
392 }
393 return Changed;
394}
395
396/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
397/// the mask.
398static VPValue *getPredicatedMask(VPRegionBlock *R) {
399 auto *EntryBB = dyn_cast<VPBasicBlock>(Val: R->getEntry());
400 if (!EntryBB || EntryBB->size() != 1 ||
401 !isa<VPBranchOnMaskRecipe>(Val: EntryBB->begin()))
402 return nullptr;
403
404 return cast<VPBranchOnMaskRecipe>(Val: &*EntryBB->begin())->getOperand(N: 0);
405}
406
407/// If \p R is a triangle region, return the 'then' block of the triangle.
408static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
409 auto *EntryBB = cast<VPBasicBlock>(Val: R->getEntry());
410 if (EntryBB->getNumSuccessors() != 2)
411 return nullptr;
412
413 auto *Succ0 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[0]);
414 auto *Succ1 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[1]);
415 if (!Succ0 || !Succ1)
416 return nullptr;
417
418 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
419 return nullptr;
420 if (Succ0->getSingleSuccessor() == Succ1)
421 return Succ0;
422 if (Succ1->getSingleSuccessor() == Succ0)
423 return Succ1;
424 return nullptr;
425}
426
427// Merge replicate regions in their successor region, if a replicate region
428// is connected to a successor replicate region with the same predicate by a
429// single, empty VPBasicBlock.
430static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
431 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
432
433 // Collect replicate regions followed by an empty block, followed by another
434 // replicate region with matching masks to process front. This is to avoid
435 // iterator invalidation issues while merging regions.
436 SmallVector<VPRegionBlock *, 8> WorkList;
437 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
438 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
439 if (!Region1->isReplicator())
440 continue;
441 auto *MiddleBasicBlock =
442 dyn_cast_or_null<VPBasicBlock>(Val: Region1->getSingleSuccessor());
443 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
444 continue;
445
446 auto *Region2 =
447 dyn_cast_or_null<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor());
448 if (!Region2 || !Region2->isReplicator())
449 continue;
450
451 VPValue *Mask1 = getPredicatedMask(R: Region1);
452 VPValue *Mask2 = getPredicatedMask(R: Region2);
453 if (!Mask1 || Mask1 != Mask2)
454 continue;
455
456 assert(Mask1 && Mask2 && "both region must have conditions");
457 WorkList.push_back(Elt: Region1);
458 }
459
460 // Move recipes from Region1 to its successor region, if both are triangles.
461 for (VPRegionBlock *Region1 : WorkList) {
462 if (TransformedRegions.contains(Ptr: Region1))
463 continue;
464 auto *MiddleBasicBlock = cast<VPBasicBlock>(Val: Region1->getSingleSuccessor());
465 auto *Region2 = cast<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor());
466
467 VPBasicBlock *Then1 = getPredicatedThenBlock(R: Region1);
468 VPBasicBlock *Then2 = getPredicatedThenBlock(R: Region2);
469 if (!Then1 || !Then2)
470 continue;
471
472 // Note: No fusion-preventing memory dependencies are expected in either
473 // region. Such dependencies should be rejected during earlier dependence
474 // checks, which guarantee accesses can be re-ordered for vectorization.
475 //
476 // Move recipes to the successor region.
477 for (VPRecipeBase &ToMove : make_early_inc_range(Range: reverse(C&: *Then1)))
478 ToMove.moveBefore(BB&: *Then2, I: Then2->getFirstNonPhi());
479
480 auto *Merge1 = cast<VPBasicBlock>(Val: Then1->getSingleSuccessor());
481 auto *Merge2 = cast<VPBasicBlock>(Val: Then2->getSingleSuccessor());
482
483 // Move VPPredInstPHIRecipes from the merge block to the successor region's
484 // merge block. Update all users inside the successor region to use the
485 // original values.
486 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(Range: reverse(C&: *Merge1))) {
487 VPValue *PredInst1 =
488 cast<VPPredInstPHIRecipe>(Val: &Phi1ToMove)->getOperand(N: 0);
489 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
490 Phi1ToMoveV->replaceUsesWithIf(New: PredInst1, ShouldReplace: [Then2](VPUser &U, unsigned) {
491 return cast<VPRecipeBase>(Val: &U)->getParent() == Then2;
492 });
493
494 // Remove phi recipes that are unused after merging the regions.
495 if (Phi1ToMove.getVPSingleValue()->user_empty()) {
496 Phi1ToMove.eraseFromParent();
497 continue;
498 }
499 Phi1ToMove.moveBefore(BB&: *Merge2, I: Merge2->begin());
500 }
501
502 // Remove the dead recipes in Region1's entry block.
503 for (VPRecipeBase &R :
504 make_early_inc_range(Range: reverse(C&: *Region1->getEntryBasicBlock())))
505 R.eraseFromParent();
506
507 // Finally, remove the first region.
508 for (VPBlockBase *Pred : make_early_inc_range(Range&: Region1->getPredecessors())) {
509 VPBlockUtils::disconnectBlocks(From: Pred, To: Region1);
510 VPBlockUtils::connectBlocks(From: Pred, To: MiddleBasicBlock);
511 }
512 VPBlockUtils::disconnectBlocks(From: Region1, To: MiddleBasicBlock);
513 TransformedRegions.insert(Ptr: Region1);
514 }
515
516 return !TransformedRegions.empty();
517}
518
519static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
520 VPRegionBlock *ParentRegion,
521 VPlan &Plan) {
522 Instruction *Instr = PredRecipe->getUnderlyingInstr();
523 // Build the triangular if-then region.
524 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
525 assert(Instr->getParent() && "Predicated instruction not in any basic block");
526 auto *BlockInMask = PredRecipe->getMask();
527 auto *MaskDef = BlockInMask->getDefiningRecipe();
528 auto *BOMRecipe = new VPBranchOnMaskRecipe(
529 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
530 auto *Entry =
531 Plan.createVPBasicBlock(Name: Twine(RegionName) + ".entry", Recipe: BOMRecipe);
532
533 // Replace predicated replicate recipe with a replicate recipe without a
534 // mask but in the replicate region.
535 auto *RecipeWithoutMask = new VPReplicateRecipe(
536 PredRecipe->getUnderlyingInstr(), PredRecipe->operandsWithoutMask(),
537 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
538 PredRecipe->getDebugLoc());
539 auto *Pred =
540 Plan.createVPBasicBlock(Name: Twine(RegionName) + ".if", Recipe: RecipeWithoutMask);
541 auto *Exiting = Plan.createVPBasicBlock(Name: Twine(RegionName) + ".continue");
542 VPRegionBlock *Region =
543 Plan.createReplicateRegion(Entry, Exiting, Name: RegionName);
544
545 // Note: first set Entry as region entry and then connect successors starting
546 // from it in order, to propagate the "parent" of each VPBasicBlock.
547 Region->setParent(ParentRegion);
548 VPBlockUtils::insertTwoBlocksAfter(IfTrue: Pred, IfFalse: Exiting, BlockPtr: Entry);
549 VPBlockUtils::connectBlocks(From: Pred, To: Exiting);
550
551 if (!PredRecipe->user_empty()) {
552 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
553 RecipeWithoutMask->getDebugLoc());
554 Exiting->appendRecipe(Recipe: PHIRecipe);
555 PredRecipe->replaceAllUsesWith(New: PHIRecipe);
556 }
557 PredRecipe->eraseFromParent();
558 return Region;
559}
560
561static void addReplicateRegions(VPlan &Plan) {
562 SmallVector<VPReplicateRecipe *> WorkList;
563 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
564 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
565 for (VPRecipeBase &R : *VPBB)
566 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
567 if (RepR->isPredicated())
568 WorkList.push_back(Elt: RepR);
569 }
570 }
571
572 unsigned BBNum = 0;
573 for (VPReplicateRecipe *RepR : WorkList) {
574 VPBasicBlock *CurrentBlock = RepR->getParent();
575 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(SplitAt: RepR->getIterator());
576
577 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
578 SplitBlock->setName(
579 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
580 // Record predicated instructions for above packing optimizations.
581 VPRegionBlock *Region =
582 createReplicateRegion(PredRecipe: RepR, ParentRegion: CurrentBlock->getParent(), Plan);
583 VPBlockUtils::insertOnEdge(From: CurrentBlock, To: SplitBlock, BlockPtr: Region);
584
585 VPRegionBlock *ParentRegion = Region->getParent();
586 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
587 ParentRegion->setExiting(SplitBlock);
588 }
589}
590
591bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
592 SmallVector<VPBasicBlock *> WorkList;
593 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
594 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
595 // Don't fold the blocks in the skeleton of the Plan into their single
596 // predecessors for now.
597 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
598 if (!VPBB->getParent())
599 continue;
600 auto *PredVPBB =
601 dyn_cast_or_null<VPBasicBlock>(Val: VPBB->getSinglePredecessor());
602 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
603 isa<VPIRBasicBlock>(Val: PredVPBB))
604 continue;
605 WorkList.push_back(Elt: VPBB);
606 }
607
608 for (VPBasicBlock *VPBB : WorkList) {
609 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(Val: VPBB->getSinglePredecessor());
610 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB))
611 R.moveBefore(BB&: *PredVPBB, I: PredVPBB->end());
612 VPBlockUtils::disconnectBlocks(From: PredVPBB, To: VPBB);
613 auto *ParentRegion = VPBB->getParent();
614 if (ParentRegion && ParentRegion->getExiting() == VPBB)
615 ParentRegion->setExiting(PredVPBB);
616 VPBlockUtils::transferSuccessors(Old: VPBB, New: PredVPBB);
617 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
618 }
619 return !WorkList.empty();
620}
621
622void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
623 // Convert masked VPReplicateRecipes to if-then region blocks.
624 addReplicateRegions(Plan);
625
626 bool ShouldSimplify = true;
627 while (ShouldSimplify) {
628 ShouldSimplify = sinkScalarOperands(Plan);
629 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
630 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
631 }
632}
633
634/// Remove redundant casts of inductions.
635///
636/// Such redundant casts are casts of induction variables that can be ignored,
637/// because we already proved that the casted phi is equal to the uncasted phi
638/// in the vectorized loop. There is no need to vectorize the cast - the same
639/// value can be used for both the phi and casts in the vector loop.
640static void removeRedundantInductionCasts(VPlan &Plan) {
641 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
642 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
643 if (!IV || IV->getTruncInst())
644 continue;
645
646 // A sequence of IR Casts has potentially been recorded for IV, which
647 // *must be bypassed* when the IV is vectorized, because the vectorized IV
648 // will produce the desired casted value. This sequence forms a def-use
649 // chain and is provided in reverse order, ending with the cast that uses
650 // the IV phi. Search for the recipe of the last cast in the chain and
651 // replace it with the original IV. Note that only the final cast is
652 // expected to have users outside the cast-chain and the dead casts left
653 // over will be cleaned up later.
654 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
655 VPValue *FindMyCast = IV;
656 for (Instruction *IRCast : reverse(C&: Casts)) {
657 VPSingleDefRecipe *FoundUserCast = nullptr;
658 for (auto *U : FindMyCast->users()) {
659 auto *UserCast = dyn_cast<VPSingleDefRecipe>(Val: U);
660 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
661 FoundUserCast = UserCast;
662 break;
663 }
664 }
665 // A cast recipe in the chain may have been removed by earlier DCE.
666 if (!FoundUserCast)
667 break;
668 FindMyCast = FoundUserCast;
669 }
670 if (FindMyCast != IV)
671 FindMyCast->replaceAllUsesWith(New: IV);
672 }
673}
674
675static VPScalarIVStepsRecipe *
676createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
677 Instruction::BinaryOps InductionOpcode,
678 FPMathOperator *FPBinOp, Instruction *TruncI,
679 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
680 VPBuilder &Builder) {
681 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
682 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
683 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
684 VPSingleDefRecipe *BaseIV =
685 Builder.createDerivedIV(Kind, FPBinOp, Start: StartV, Current: CanonicalIV, Step);
686
687 // Truncate base induction if needed.
688 Type *ResultTy = BaseIV->getScalarType();
689 if (TruncI) {
690 Type *TruncTy = TruncI->getType();
691 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
692 "Not truncating.");
693 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
694 BaseIV = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: BaseIV, ResultTy: TruncTy, DL);
695 ResultTy = TruncTy;
696 }
697
698 // Truncate step if needed.
699 Type *StepTy = Step->getScalarType();
700 if (ResultTy != StepTy) {
701 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
702 "Not truncating.");
703 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
704 auto *VecPreheader =
705 cast<VPBasicBlock>(Val: HeaderVPBB->getSingleHierarchicalPredecessor());
706 VPBuilder::InsertPointGuard Guard(Builder);
707 Builder.setInsertPoint(VecPreheader);
708 Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy, DL);
709 }
710 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, IV: BaseIV, Step,
711 VF: &Plan.getVF(), DL);
712}
713
714void VPlanTransforms::replaceWideCanonicalIVWithWideIV(
715 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
716 TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF,
717 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
718 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
719 if (!LoopRegion)
720 return;
721
722 auto *WideCanIV =
723 findUserOf<VPWidenCanonicalIVRecipe>(V: LoopRegion->getCanonicalIV());
724 if (!WideCanIV)
725 return;
726
727 Type *CanIVTy = LoopRegion->getCanonicalIVType();
728
729 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
730 // IV.
731 if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(Def: WideCanIV)) {
732 VPBuilder Builder(WideCanIV);
733 WideCanIV->replaceAllUsesWith(New: createScalarIVSteps(
734 Plan, Kind: InductionDescriptor::IK_IntInduction, InductionOpcode: Instruction::Add, FPBinOp: nullptr,
735 TruncI: nullptr, StartV: Plan.getZero(Ty: CanIVTy), Step: Plan.getConstantInt(Ty: CanIVTy, Val: 1),
736 DL: WideCanIV->getDebugLoc(), Builder));
737 WideCanIV->eraseFromParent();
738 return;
739 }
740
741 if (vputils::onlyScalarValuesUsed(Def: WideCanIV))
742 return;
743
744 // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
745 // in the header, reuse it instead of introducing another wide induction phi.
746 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
747 for (VPRecipeBase &Phi : Header->phis()) {
748 VPWidenIntOrFpInductionRecipe *WidenIV;
749 if (!match(V: &Phi, P: m_CanonicalWidenIV(V&: WidenIV)))
750 continue;
751 // The reused wide IV feeds the header mask, whose lanes may extend past
752 // the trip count; drop flags that only hold inside the scalar loop.
753 WidenIV->dropPoisonGeneratingFlags();
754 WideCanIV->replaceAllUsesWith(New: WidenIV);
755 WideCanIV->eraseFromParent();
756 return;
757 }
758
759 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable.
760 auto *VecTy = VectorType::get(ElementType: CanIVTy, EC: VF);
761 InstructionCost BroadcastCost = TTI.getShuffleCost(
762 Kind: TargetTransformInfo::SK_Broadcast, DstTy: VecTy, SrcTy: VecTy, Mask: {}, CostKind);
763 InstructionCost PHICost = TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
764 if (PHICost > BroadcastCost)
765 return;
766
  // Bail out if the additional wide induction phi increases the expected spill
  // cost.
769 VPRegisterUsage UnrolledBase =
770 calculateRegisterUsageForPlan(Plan, VFs: VF, TTI, ValuesToIgnore)[0];
771 for (unsigned &NumUsers : make_second_range(c&: UnrolledBase.MaxLocalUsers))
772 NumUsers *= UF;
773 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, Ty: VecTy);
774 VPRegisterUsage Projected = UnrolledBase;
775 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(Ty: VecTy);
776 if (Projected.spillCost(TTI, CostKind) >
777 UnrolledBase.spillCost(TTI, CostKind))
778 return;
779
780 InductionDescriptor ID =
781 InductionDescriptor::getCanonicalIntInduction(Ty: CanIVTy, SE);
782 VPValue *StepV = Plan.getConstantInt(Ty: CanIVTy, Val: 1);
783 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
784 /*IV=*/nullptr, Plan.getZero(Ty: CanIVTy), StepV, &Plan.getVF(), ID,
785 WideCanIV->getNoWrapFlags(), WideCanIV->getDebugLoc());
786 NewWideIV->insertBefore(InsertPos: &*Header->getFirstNonPhi());
787 WideCanIV->replaceAllUsesWith(New: NewWideIV);
788 WideCanIV->eraseFromParent();
789}
790
791/// Returns true if \p R is dead and can be removed.
792static bool isDeadRecipe(VPRecipeBase &R) {
793 // Do remove conditional assume instructions as their conditions may be
794 // flattened.
795 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
796 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
797 match(V: RepR, P: m_Intrinsic<Intrinsic::assume>());
798 if (IsConditionalAssume)
799 return true;
800
801 if (R.mayHaveSideEffects())
802 return false;
803
804 // Recipe is dead if no user keeps the recipe alive.
805 return all_of(Range: R.definedValues(), P: [](VPValue *V) { return V->user_empty(); });
806}
807
808void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
809 PostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> POT(
810 Plan.getEntry());
811 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: POT)) {
812 // The recipes in the block are processed in reverse order, to catch chains
813 // of dead recipes.
814 for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
815 if (isDeadRecipe(R)) {
816 R.eraseFromParent();
817 continue;
818 }
819
820 // Check if R is a dead VPPhi <-> update cycle and remove it.
821 VPValue *Start, *Incoming;
822 if (!match(V: &R, P: m_VPPhi(Op0: m_VPValue(V&: Start), Op1: m_VPValue(V&: Incoming))))
823 continue;
824 auto *PhiR = cast<VPPhi>(Val: &R);
825 VPUser *PhiUser = PhiR->getSingleUser();
826 if (!PhiUser)
827 continue;
828 if (PhiUser != Incoming->getDefiningRecipe() ||
829 Incoming->getNumUsers() != 1)
830 continue;
831 PhiR->replaceAllUsesWith(New: Start);
832 PhiR->eraseFromParent();
833 Incoming->getDefiningRecipe()->eraseFromParent();
834 }
835 }
836}
837
838static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
839 SetVector<VPUser *> Users(llvm::from_range, V->users());
840 for (unsigned I = 0; I != Users.size(); ++I) {
841 VPRecipeBase *Cur = cast<VPRecipeBase>(Val: Users[I]);
842 for (VPValue *V : Cur->definedValues())
843 Users.insert_range(R: V->users());
844 }
845 return Users.takeVector();
846}
847
848/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
849/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
850/// generates scalar values.
851static VPValue *
852scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
853 VPlan &Plan, VPBuilder &Builder) {
854 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
855 VPIRValue *StartV = Plan.getZero(Ty: ID.getStep()->getType());
856 VPValue *StepV = PtrIV->getOperand(N: 1);
857 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
858 Plan, Kind: InductionDescriptor::IK_IntInduction, InductionOpcode: Instruction::Add, FPBinOp: nullptr,
859 TruncI: nullptr, StartV, Step: StepV, DL: PtrIV->getDebugLoc(), Builder);
860
861 return Builder.createPtrAdd(Ptr: PtrIV->getStartValue(), Offset: Steps,
862 DL: PtrIV->getDebugLoc(), Name: "next.gep");
863}
864
/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
/// require vectors while other require scalars, the scalar uses need to extract
/// the scalars from the generated vectors (Note that this is different to how
/// int/fp inductions are handled). Legalize extract-from-ends using uniform
/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
/// the correct end value is available. Also optimize
/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
/// providing them scalar steps built on the canonical scalar IV and update the
/// original IV's users. This is an optional optimization to reduce the needs of
/// vector extracts.
///
/// Only the phis in the header of \p Plan's vector loop region are visited.
/// All new recipes created through the builder are placed at the first
/// non-phi position of that header.
static void legalizeAndOptimizeInductions(VPlan &Plan) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    // Only widened inductions (pointer or int/fp) are of interest.
    auto *PhiR = dyn_cast<VPWidenInductionRecipe>(Val: &Phi);
    if (!PhiR)
      continue;

    // Try to narrow wide and replicating recipes to uniform recipes, based on
    // VPlan analysis.
    // TODO: Apply to all recipes in the future, to replace legacy uniformity
    // analysis.
    auto Users = collectUsersRecursively(V: PhiR);
    // The user list is a snapshot taken before any narrowing; it is walked in
    // reverse discovery order.
    // NOTE(review): presumably so that users are narrowed before the recipes
    // feeding them are analyzed - confirm.
    for (VPUser *U : reverse(C&: Users)) {
      auto *Def = dyn_cast<VPRecipeWithIRFlags>(Val: U);
      auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U);
      // Skip recipes that shouldn't be narrowed.
      if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Val: Def) ||
          Def->user_empty() || !Def->getUnderlyingValue() ||
          (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
        continue;

      // Skip recipes that may have other lanes than their first used.
      if (!vputils::isSingleScalar(VPV: Def) && !vputils::onlyFirstLaneUsed(Def))
        continue;

      // TODO: Support scalarizing ExtractValue.
      if (match(V: Def,
                P: m_Binary<Instruction::ExtractValue>(Op0: m_VPValue(), Op1: m_VPValue())))
        continue;

      // Create an unmasked single-scalar op with Def's opcode, operands, flags
      // and underlying instruction right after Def, and redirect all uses of
      // Def to it. Def itself is not erased here.
      auto *Clone = VPBuilder::createSingleScalarOp(
          Opcode: Def->getUnderlyingInstr()->getOpcode(), Operands: Def->operands(),
          /*Mask=*/nullptr, Flags: *Def, Metadata: {}, DL: DebugLoc::getUnknown(),
          UV: Def->getUnderlyingInstr());
      Clone->insertAfter(InsertPos: Def);
      Def->replaceAllUsesWith(New: Clone);
    }

    // Replace wide pointer inductions which have only their scalars used by
    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(Val: &Phi)) {
      // Keep the wide pointer induction when the plan has vector VFs and the
      // recipe does not generate scalars only.
      if (!Plan.hasScalarVFOnly() &&
          !PtrIV->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF()))
        continue;

      VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
      PtrIV->replaceAllUsesWith(New: PtrAdd);
      continue;
    }

    // Replace widened induction with scalar steps for users that only use
    // scalars.
    auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
    // With vector VFs, nothing to do unless at least one user uses scalars of
    // the wide IV.
    if (HasOnlyVectorVFs && none_of(Range: WideIV->users(), P: [WideIV](VPUser *U) {
          return U->usesScalars(Op: WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
        Plan, Kind: ID.getKind(), InductionOpcode: ID.getInductionOpcode(),
        FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
        TruncI: WideIV->getTruncInst(), StartV: WideIV->getStartValue(), Step: WideIV->getStepValue(),
        DL: WideIV->getDebugLoc(), Builder);

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs) {
      // Scalar-VF-only plan: every use of the wide IV is replaced.
      assert(!Plan.hasScalableVF() &&
             "plans containing a scalar VF cannot also include scalable VFs");
      WideIV->replaceAllUsesWith(New: Steps);
    } else {
      // Vector VFs: replace only in users that use scalars of the wide IV;
      // with a scalable VF the stricter first-lane-only query is used.
      bool HasScalableVF = Plan.hasScalableVF();
      WideIV->replaceUsesWithIf(New: Steps,
                                ShouldReplace: [WideIV, HasScalableVF](VPUser &U, unsigned) {
                                  if (HasScalableVF)
                                    return U.usesFirstLaneOnly(Op: WideIV);
                                  return U.usesScalars(Op: WideIV);
                                });
    }
  }
}
960
961/// Check if \p VPV is an untruncated wide induction, either before or after the
962/// increment. If so return the header IV (before the increment), otherwise
963/// return null.
964static VPWidenInductionRecipe *
965getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE) {
966 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: VPV);
967 if (WideIV) {
968 // VPV itself is a wide induction, separately compute the end value for exit
969 // users if it is not a truncated IV.
970 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
971 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
972 }
973
974 // Check if VPV is an optimizable induction increment.
975 VPRecipeBase *Def = VPV->getDefiningRecipe();
976 if (!Def || Def->getNumOperands() != 2)
977 return nullptr;
978 WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 0));
979 if (!WideIV)
980 WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 1));
981 if (!WideIV)
982 return nullptr;
983
984 auto IsWideIVInc = [&]() {
985 auto &ID = WideIV->getInductionDescriptor();
986
987 // Check if VPV increments the induction by the induction step.
988 VPValue *IVStep = WideIV->getStepValue();
989 switch (ID.getInductionOpcode()) {
990 case Instruction::Add:
991 return match(V: VPV, P: m_c_Add(Op0: m_Specific(VPV: WideIV), Op1: m_Specific(VPV: IVStep)));
992 case Instruction::FAdd:
993 return match(V: VPV, P: m_c_FAdd(Op0: m_Specific(VPV: WideIV), Op1: m_Specific(VPV: IVStep)));
994 case Instruction::FSub:
995 return match(V: VPV, P: m_Binary<Instruction::FSub>(Op0: m_Specific(VPV: WideIV),
996 Op1: m_Specific(VPV: IVStep)));
997 case Instruction::Sub: {
998 // IVStep will be the negated step of the subtraction. Check if Step == -1
999 // * IVStep.
1000 VPValue *Step;
1001 if (!match(V: VPV, P: m_Sub(Op0: m_VPValue(), Op1: m_VPValue(V&: Step))))
1002 return false;
1003 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(V: IVStep, PSE);
1004 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(V: Step, PSE);
1005 ScalarEvolution &SE = *PSE.getSE();
1006 return !isa<SCEVCouldNotCompute>(Val: IVStepSCEV) &&
1007 !isa<SCEVCouldNotCompute>(Val: StepSCEV) &&
1008 IVStepSCEV == SE.getNegativeSCEV(V: StepSCEV);
1009 }
1010 default:
1011 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
1012 match(V: VPV, P: m_GetElementPtr(Op0: m_Specific(VPV: WideIV),
1013 Op1: m_Specific(VPV: WideIV->getStepValue())));
1014 }
1015 llvm_unreachable("should have been covered by switch above");
1016 };
1017 return IsWideIVInc() ? WideIV : nullptr;
1018}
1019
1020/// Attempts to optimize the induction variable exit values for users in the
1021/// early exit block.
1022static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op,
1023 PredicatedScalarEvolution &PSE) {
1024 VPValue *Incoming, *Mask;
1025 if (!match(V: Op, P: m_ExtractLane(Op0: m_FirstActiveLane(Op0: m_VPValue(V&: Mask)),
1026 Op1: m_VPValue(V&: Incoming))))
1027 return nullptr;
1028
1029 auto *WideIV = getOptimizableIVOf(VPV: Incoming, PSE);
1030 if (!WideIV)
1031 return nullptr;
1032
1033 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
1034 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1035 return nullptr;
1036
1037 // Calculate the final index.
1038 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1039 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1040 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1041 auto *ExtractR = cast<VPInstruction>(Val: Op);
1042 VPBuilder B(ExtractR);
1043
1044 DebugLoc DL = ExtractR->getDebugLoc();
1045 VPValue *FirstActiveLane = B.createFirstActiveLane(Masks: Mask, DL);
1046 FirstActiveLane = B.createScalarZExtOrTrunc(
1047 Op: FirstActiveLane, ResultTy: CanonicalIVType, SrcTy: FirstActiveLane->getScalarType(), DL);
1048 VPValue *EndValue = B.createAdd(LHS: CanonicalIV, RHS: FirstActiveLane, DL);
1049
1050 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1051 // changed it means the exit is using the incremented value, so we need to
1052 // add the step.
1053 if (Incoming != WideIV) {
1054 VPValue *One = Plan.getConstantInt(Ty: CanonicalIVType, Val: 1);
1055 EndValue = B.createAdd(LHS: EndValue, RHS: One, DL);
1056 }
1057
1058 if (!match(V: WideIV, P: m_CanonicalWidenIV())) {
1059 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1060 VPIRValue *Start = WideIV->getStartValue();
1061 VPValue *Step = WideIV->getStepValue();
1062 EndValue = B.createDerivedIV(
1063 Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
1064 Start, Current: EndValue, Step);
1065 }
1066
1067 return EndValue;
1068}
1069
1070/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1071/// VPDerivedIVRecipe for non-canonical inductions.
1072static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV,
1073 VPBuilder &VectorPHBuilder,
1074 VPValue *VectorTC) {
1075 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
1076 // Truncated wide inductions resume from the last lane of their vector value
1077 // in the last vector iteration which is handled elsewhere.
1078 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1079 return nullptr;
1080
1081 VPIRValue *Start = WideIV->getStartValue();
1082 VPValue *Step = WideIV->getStepValue();
1083 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1084 VPValue *EndValue = VectorTC;
1085 if (!match(V: WideIV, P: m_CanonicalWidenIV())) {
1086 EndValue = VectorPHBuilder.createDerivedIV(
1087 Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
1088 Start, Current: VectorTC, Step);
1089 }
1090
1091 // EndValue is derived from the vector trip count (which has the same type as
1092 // the widest induction) and thus may be wider than the induction here.
1093 Type *ScalarTypeOfWideIV = WideIV->getScalarType();
1094 if (ScalarTypeOfWideIV != EndValue->getScalarType()) {
1095 EndValue = VectorPHBuilder.createScalarCast(Opcode: Instruction::Trunc, Op: EndValue,
1096 ResultTy: ScalarTypeOfWideIV,
1097 DL: WideIV->getDebugLoc());
1098 }
1099
1100 return EndValue;
1101}
1102
1103/// Attempts to optimize the induction variable exit values for users in the
1104/// exit block coming from the latch in the original scalar loop.
1105static VPValue *
1106optimizeLatchExitInductionUser(VPlan &Plan, VPValue *Op,
1107 DenseMap<VPValue *, VPValue *> &EndValues,
1108 PredicatedScalarEvolution &PSE) {
1109 VPValue *Incoming;
1110 if (!match(V: Op, P: m_ExtractLastLaneOfLastPart(Op0: m_VPValue(V&: Incoming))))
1111 return nullptr;
1112
1113 VPWidenInductionRecipe *WideIV = getOptimizableIVOf(VPV: Incoming, PSE);
1114 if (!WideIV)
1115 return nullptr;
1116
1117 VPValue *EndValue = EndValues.lookup(Val: WideIV);
1118 assert(EndValue && "Must have computed the end value up front");
1119
1120 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1121 // changed it means the exit is using the incremented value, so we don't
1122 // need to subtract the step.
1123 if (Incoming != WideIV)
1124 return EndValue;
1125
1126 // Otherwise, subtract the step from the EndValue.
1127 auto *ExtractR = cast<VPInstruction>(Val: Op);
1128 VPBuilder B(ExtractR);
1129 VPValue *Step = WideIV->getStepValue();
1130 Type *ScalarTy = WideIV->getScalarType();
1131 if (ScalarTy->isIntegerTy())
1132 return B.createSub(LHS: EndValue, RHS: Step, DL: DebugLoc::getUnknown(), Name: "ind.escape");
1133 if (ScalarTy->isPointerTy()) {
1134 Type *StepTy = Step->getScalarType();
1135 auto *Zero = Plan.getZero(Ty: StepTy);
1136 return B.createPtrAdd(Ptr: EndValue, Offset: B.createSub(LHS: Zero, RHS: Step),
1137 DL: DebugLoc::getUnknown(), Name: "ind.escape");
1138 }
1139 if (ScalarTy->isFloatingPointTy()) {
1140 const auto &ID = WideIV->getInductionDescriptor();
1141 return B.createNaryOp(
1142 Opcode: ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1143 ? Instruction::FSub
1144 : Instruction::FAdd,
1145 Operands: {EndValue, Step}, Flags: {ID.getInductionBinOp()->getFastMathFlags()});
1146 }
1147 llvm_unreachable("all possible induction types must be handled");
1148 return nullptr;
1149}
1150
1151void VPlanTransforms::optimizeInductionLiveOutUsers(
1152 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1153 // Compute end values for all inductions.
1154 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1155 auto *VectorPH = cast<VPBasicBlock>(Val: VectorRegion->getSinglePredecessor());
1156 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
1157 DenseMap<VPValue *, VPValue *> EndValues;
1158 VPValue *ResumeTC =
1159 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1160 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1161 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: &Phi);
1162 if (!WideIV)
1163 continue;
1164 if (VPValue *EndValue =
1165 tryToComputeEndValueForInduction(WideIV, VectorPHBuilder, VectorTC: ResumeTC))
1166 EndValues[WideIV] = EndValue;
1167 }
1168
1169 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1170 for (VPRecipeBase &R : make_early_inc_range(Range&: *MiddleVPBB)) {
1171 VPValue *Op;
1172 if (!match(V: &R, P: m_ExitingIVValue(Op0: m_VPValue(V&: Op))))
1173 continue;
1174 auto *WideIV = cast<VPWidenInductionRecipe>(Val: Op);
1175 if (VPValue *EndValue = EndValues.lookup(Val: WideIV)) {
1176 R.getVPSingleValue()->replaceAllUsesWith(New: EndValue);
1177 R.eraseFromParent();
1178 }
1179 }
1180
1181 // Then, optimize exit block users.
1182 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1183 for (VPRecipeBase &R : ExitVPBB->phis()) {
1184 auto *ExitIRI = cast<VPIRPhi>(Val: &R);
1185
1186 for (auto [Idx, PredVPBB] : enumerate(First&: ExitVPBB->getPredecessors())) {
1187 VPValue *Escape = nullptr;
1188 if (PredVPBB == MiddleVPBB)
1189 Escape = optimizeLatchExitInductionUser(
1190 Plan, Op: ExitIRI->getOperand(N: Idx), EndValues, PSE);
1191 else
1192 Escape = optimizeEarlyExitInductionUser(
1193 Plan, Op: ExitIRI->getOperand(N: Idx), PSE);
1194 if (Escape)
1195 ExitIRI->setOperand(I: Idx, New: Escape);
1196 }
1197 }
1198 }
1199}
1200
1201/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1202/// them with already existing recipes expanding the same SCEV expression.
1203static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
1204 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
1205
1206 for (VPRecipeBase &R :
1207 make_early_inc_range(Range&: *Plan.getEntry()->getEntryBasicBlock())) {
1208 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
1209 if (!ExpR)
1210 continue;
1211
1212 const auto &[V, Inserted] = SCEV2VPV.try_emplace(Key: ExpR->getSCEV(), Args&: ExpR);
1213 if (Inserted)
1214 continue;
1215
1216 ExpR->replaceAllUsesWith(New: V->second);
1217 if (ExpR == Plan.getTripCount())
1218 Plan.resetTripCount(NewTripCount: V->second);
1219
1220 ExpR->eraseFromParent();
1221 }
1222}
1223
1224static void recursivelyDeleteDeadRecipes(VPValue *V) {
1225 SmallVector<VPValue *> WorkList;
1226 SmallPtrSet<VPValue *, 8> Seen;
1227 WorkList.push_back(Elt: V);
1228
1229 while (!WorkList.empty()) {
1230 VPValue *Cur = WorkList.pop_back_val();
1231 if (!Seen.insert(Ptr: Cur).second)
1232 continue;
1233 VPRecipeBase *R = Cur->getDefiningRecipe();
1234 if (!R)
1235 continue;
1236 if (!isDeadRecipe(R&: *R))
1237 continue;
1238 append_range(C&: WorkList, R: R->operands());
1239 R->eraseFromParent();
1240 }
1241}
1242
1243/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1244/// Returns an optional pair, where the first element indicates whether it is
1245/// an intrinsic ID.
1246static std::optional<std::pair<bool, unsigned>>
1247getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1248 return TypeSwitch<const VPSingleDefRecipe *,
1249 std::optional<std::pair<bool, unsigned>>>(R)
1250 .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, VPWidenGEPRecipe,
1251 VPReplicateRecipe>(
1252 caseFn: [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1253 .Case(caseFn: [](const VPWidenIntrinsicRecipe *I) {
1254 return std::make_pair(x: true, y: I->getVectorIntrinsicID());
1255 })
1256 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1257 caseFn: [](auto *I) {
1258 // For recipes that do not directly map to LLVM IR instructions,
1259 // assign opcodes after the last VPInstruction opcode (which is also
1260 // after the last IR Instruction opcode), based on the VPRecipeID.
1261 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1262 I->getVPRecipeID());
1263 })
1264 .Default(defaultFn: [](auto *) { return std::nullopt; });
1265}
1266
1267/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1268/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1269/// Operands are foldable live-ins.
1270static VPIRValue *tryToFoldLiveIns(VPSingleDefRecipe &R,
1271 ArrayRef<VPValue *> Operands,
1272 const DataLayout &DL) {
1273 auto OpcodeOrIID = getOpcodeOrIntrinsicID(R: &R);
1274 if (!OpcodeOrIID)
1275 return nullptr;
1276
1277 SmallVector<Value *, 4> Ops;
1278 for (VPValue *Op : Operands) {
1279 VPValue *Candidate = Op;
1280 match(V: Op, P: m_Broadcast(Op0: m_VPValue(V&: Candidate)));
1281 if (!match(V: Candidate, P: m_LiveIn()))
1282 return nullptr;
1283 Value *V = Candidate->getUnderlyingValue();
1284 if (!V)
1285 return nullptr;
1286 Ops.push_back(Elt: V);
1287 }
1288
1289 VPlan &Plan = *R.getParent()->getPlan();
1290 auto FoldToIRValue = [&]() -> Value * {
1291 InstSimplifyFolder Folder(DL);
1292 if (OpcodeOrIID->first) {
1293 auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Val: &R);
1294 return Folder.FoldIntrinsic(ID: OpcodeOrIID->second, Ops, Ty: R.getScalarType(),
1295 FMF: RFlags ? RFlags->getFastMathFlagsOrNone()
1296 : FastMathFlags());
1297 }
1298 unsigned Opcode = OpcodeOrIID->second;
1299 if (Instruction::isBinaryOp(Opcode))
1300 return Folder.FoldBinOp(Opc: static_cast<Instruction::BinaryOps>(Opcode),
1301 LHS: Ops[0], RHS: Ops[1]);
1302 if (Instruction::isCast(Opcode))
1303 return Folder.FoldCast(Op: static_cast<Instruction::CastOps>(Opcode), V: Ops[0],
1304 DestTy: R.getVPSingleValue()->getScalarType());
1305 switch (Opcode) {
1306 case VPInstruction::Not:
1307 return Folder.FoldBinOp(Opc: Instruction::BinaryOps::Xor, LHS: Ops[0],
1308 RHS: Constant::getAllOnesValue(Ty: Ops[0]->getType()));
1309 case Instruction::Select:
1310 return Folder.FoldSelect(C: Ops[0], True: Ops[1], False: Ops[2]);
1311 case Instruction::ICmp:
1312 case Instruction::FCmp:
1313 return Folder.FoldCmp(P: cast<VPRecipeWithIRFlags>(Val&: R).getPredicate(), LHS: Ops[0],
1314 RHS: Ops[1]);
1315 case Instruction::GetElementPtr: {
1316 auto &RFlags = cast<VPRecipeWithIRFlags>(Val&: R);
1317 auto *GEP = cast<GetElementPtrInst>(Val: RFlags.getUnderlyingInstr());
1318 return Folder.FoldGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0],
1319 IdxList: drop_begin(RangeOrContainer&: Ops), NW: RFlags.getGEPNoWrapFlags());
1320 }
1321 case VPInstruction::PtrAdd:
1322 case VPInstruction::WidePtrAdd:
1323 return Folder.FoldGEP(Ty: IntegerType::getInt8Ty(C&: Plan.getContext()), Ptr: Ops[0],
1324 IdxList: Ops[1],
1325 NW: cast<VPRecipeWithIRFlags>(Val&: R).getGEPNoWrapFlags());
1326 // An extract of a live-in is an extract of a broadcast, so return the
1327 // broadcasted element.
1328 case Instruction::ExtractElement:
1329 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1330 return Ops[0];
1331 }
1332 return nullptr;
1333 };
1334
1335 if (Value *V = FoldToIRValue())
1336 return Plan.getOrAddLiveIn(V);
1337 return nullptr;
1338}
1339
/// Try to simplify logical and bitwise recipes in \p Def.
///
/// The combines below are tried in order and the first one that matches is
/// applied. Most of them only redirect the uses of \p Def to the simplified
/// value and leave \p Def itself in place; the first combine also erases
/// \p Def, and the "select !c" combine rewrites the operands of \p Def in
/// place.
///
/// \param Def the recipe to simplify.
/// \param Builder used to create replacement recipes where needed.
/// \param CanCreateNewRecipe gates some (not all) of the combines that create
/// new recipes through \p Builder.
/// \returns true if a combine was applied, false otherwise.
static bool simplifyLogicalRecipe(VPSingleDefRecipe *Def, VPBuilder &Builder,
                                  bool CanCreateNewRecipe) {
  VPlan *Plan = Def->getParent()->getPlan();

  // Simplify (X && Y) | (X && !Y) -> X.
  // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
  // && (Y | Z) and (X | !X) into true. This requires queuing newly created
  // recipes to be visited during simplification.
  VPValue *X, *Y, *Z;
  if (match(R: Def,
            P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
                         Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_Not(Op0: m_Deferred(V: Y)))))) {
    Def->replaceAllUsesWith(New: X);
    // This is the only combine in this function that erases Def itself.
    Def->eraseFromParent();
    return true;
  }

  // x | AllOnes -> AllOnes
  if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_AllOnes()))) {
    Def->replaceAllUsesWith(New: Plan->getAllOnesValue(Ty: Def->getScalarType()));
    return true;
  }

  // x | 0 -> x
  if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_ZeroInt()))) {
    Def->replaceAllUsesWith(New: X);
    return true;
  }

  // x | !x -> AllOnes
  if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_Not(Op0: m_Deferred(V: X))))) {
    Def->replaceAllUsesWith(New: Plan->getAllOnesValue(Ty: Def->getScalarType()));
    return true;
  }

  // x & 0 -> 0
  if (match(R: Def, P: m_c_BinaryAnd(Op0: m_VPValue(V&: X), Op1: m_ZeroInt()))) {
    Def->replaceAllUsesWith(New: Plan->getZero(Ty: Def->getScalarType()));
    return true;
  }

  // x & AllOnes -> x
  if (match(R: Def, P: m_c_BinaryAnd(Op0: m_VPValue(V&: X), Op1: m_AllOnes()))) {
    Def->replaceAllUsesWith(New: X);
    return true;
  }

  // x && false -> false
  if (match(R: Def, P: m_c_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_False()))) {
    Def->replaceAllUsesWith(New: Plan->getFalse());
    return true;
  }

  // x && true -> x
  if (match(R: Def, P: m_c_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_True()))) {
    Def->replaceAllUsesWith(New: X);
    return true;
  }

  // (x && y) | (x && z) -> x && (y | z)
  if (CanCreateNewRecipe &&
      match(R: Def, P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
                              Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_VPValue(V&: Z)))) &&
      // Simplify only if one of the operands has one use to avoid creating an
      // extra recipe.
      (!Def->getOperand(N: 0)->hasMoreThanOneUniqueUser() ||
       !Def->getOperand(N: 1)->hasMoreThanOneUniqueUser())) {
    Def->replaceAllUsesWith(
        New: Builder.createLogicalAnd(LHS: X, RHS: Builder.createOr(LHS: Y, RHS: Z)));
    return true;
  }

  // x && (x && y) -> x && y
  if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X),
                              Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_VPValue())))) {
    // Operand 1 of Def already is the inner (x && y); reuse it.
    Def->replaceAllUsesWith(New: Def->getOperand(N: 1));
    return true;
  }

  // x && (y && x) -> x && y
  // Note: this combine creates a new recipe without checking
  // CanCreateNewRecipe.
  if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X),
                              Op1: m_LogicalAnd(Op0: m_VPValue(V&: Y), Op1: m_Deferred(V: X))))) {
    Def->replaceAllUsesWith(New: Builder.createLogicalAnd(LHS: X, RHS: Y));
    return true;
  }

  // x && !x -> 0
  if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_Not(Op0: m_Deferred(V: X))))) {
    Def->replaceAllUsesWith(New: Plan->getFalse());
    return true;
  }

  // select c, x, x -> x
  if (match(R: Def, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: X), Op2: m_Deferred(V: X)))) {
    Def->replaceAllUsesWith(New: X);
    return true;
  }

  // select c, false, true -> not c
  VPValue *C;
  if (CanCreateNewRecipe &&
      match(R: Def, P: m_Select(Op0: m_VPValue(V&: C), Op1: m_False(), Op2: m_True()))) {
    Def->replaceAllUsesWith(New: Builder.createNot(Operand: C));
    return true;
  }

  // select !c, x, y -> select c, y, x
  if (match(R: Def, P: m_Select(Op0: m_Not(Op0: m_VPValue(V&: C)), Op1: m_VPValue(V&: X), Op2: m_VPValue(V&: Y)))) {
    // Rewrite Def in place: use the un-negated condition and swap the arms.
    Def->setOperand(I: 0, New: C);
    Def->setOperand(I: 1, New: Y);
    Def->setOperand(I: 2, New: X);
    return true;
  }

  // select x, (i1 y | z), y -> y | (x && z)
  if (CanCreateNewRecipe &&
      match(R: Def, P: m_Select(Op0: m_VPValue(V&: X),
                          Op1: m_OneUse(SubPattern: m_c_BinaryOr(Op0: m_VPValue(V&: Y), Op1: m_VPValue(V&: Z))),
                          Op2: m_Deferred(V: Y))) &&
      Y->getScalarType()->isIntegerTy(BitWidth: 1)) {
    Def->replaceAllUsesWith(
        New: Builder.createOr(LHS: Y, RHS: Builder.createLogicalAnd(LHS: X, RHS: Z)));
    return true;
  }

  // No combine matched.
  return false;
}
1467
1468/// Try to simplify VPSingleDefRecipe \p Def.
1469static void simplifyRecipe(VPSingleDefRecipe *Def) {
1470 VPlan *Plan = Def->getParent()->getPlan();
1471
1472 // Simplification of live-in IR values for SingleDef recipes using
1473 // InstSimplifyFolder.
1474 const DataLayout &DL = Plan->getDataLayout();
1475 if (VPValue *V = tryToFoldLiveIns(R&: *Def, Operands: Def->operands(), DL))
1476 return Def->replaceAllUsesWith(New: V);
1477
1478 // Fold PredPHI LiveIn -> LiveIn.
1479 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Val: Def)) {
1480 VPValue *Op = PredPHI->getOperand(N: 0);
1481 if (isa<VPIRValue>(Val: Op))
1482 PredPHI->replaceAllUsesWith(New: Op);
1483 }
1484
1485 // Drop the mask of a predicated store masked by the header mask (which is
1486 // guaranteed to be true at least for the first lane) and both the stored
1487 // value and the address are uniform across VF and UF.
1488 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: Def);
1489 RepR && RepR->isPredicated() && RepR->getOpcode() == Instruction::Store &&
1490 all_of(Range: RepR->operandsWithoutMask(), P: vputils::isUniformAcrossVFsAndUFs) &&
1491 vputils::isHeaderMask(V: RepR->getMask(), Plan: *Plan)) {
1492 auto *Unmasked = new VPReplicateRecipe(
1493 RepR->getUnderlyingInstr(), RepR->operandsWithoutMask(),
1494 RepR->isSingleScalar(), /*Mask=*/nullptr, *RepR, *RepR,
1495 RepR->getDebugLoc());
1496 Unmasked->insertBefore(InsertPos: RepR);
1497 RepR->replaceAllUsesWith(New: Unmasked);
1498 RepR->eraseFromParent();
1499 return;
1500 }
1501
1502 VPBuilder Builder(Def);
1503
1504 // Avoid replacing VPInstructions with underlying values with new
  // VPInstructions, as we would fail to create widen/replicate recipes from the
1506 // new VPInstructions without an underlying value, and miss out on some
1507 // transformations that only apply to widened/replicated recipes later, by
1508 // doing so.
1509 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1510 // VPInstructions without underlying values, as those will get skipped during
1511 // cost computation.
1512 bool CanCreateNewRecipe =
1513 !isa<VPInstruction>(Val: Def) || !Def->getUnderlyingValue();
1514
1515 VPValue *A;
1516 if (match(R: Def, P: m_Trunc(Op0: m_ZExtOrSExt(Op0: m_VPValue(V&: A))))) {
1517 Type *TruncTy = Def->getScalarType();
1518 Type *ATy = A->getScalarType();
1519 if (TruncTy == ATy) {
1520 Def->replaceAllUsesWith(New: A);
1521 } else {
1522 // Don't replace a non-widened cast recipe with a widened cast.
1523 if (!isa<VPWidenCastRecipe>(Val: Def))
1524 return;
1525 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1526
1527 unsigned ExtOpcode = match(V: Def->getOperand(N: 0), P: m_SExt(Op0: m_VPValue()))
1528 ? Instruction::SExt
1529 : Instruction::ZExt;
1530 auto *Ext = Builder.createWidenCast(Opcode: Instruction::CastOps(ExtOpcode), Op: A,
1531 ResultTy: TruncTy);
1532 if (auto *UnderlyingExt = Def->getOperand(N: 0)->getUnderlyingValue()) {
1533 // UnderlyingExt has distinct return type, used to retain legacy cost.
1534 Ext->setUnderlyingValue(UnderlyingExt);
1535 }
1536 Def->replaceAllUsesWith(New: Ext);
1537 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1538 auto *Trunc = Builder.createWidenCast(Opcode: Instruction::Trunc, Op: A, ResultTy: TruncTy);
1539 Def->replaceAllUsesWith(New: Trunc);
1540 }
1541 }
1542 }
1543
1544 if (simplifyLogicalRecipe(Def, Builder, CanCreateNewRecipe))
1545 return;
1546
1547 VPValue *X, *Y, *C;
1548 if (match(R: Def, P: m_c_Add(Op0: m_VPValue(V&: A), Op1: m_ZeroInt())))
1549 return Def->replaceAllUsesWith(New: A);
1550
1551 if (match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_One())))
1552 return Def->replaceAllUsesWith(New: A);
1553
1554 if (match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_ZeroInt())))
1555 return Def->replaceAllUsesWith(New: Plan->getZero(Ty: Def->getScalarType()));
1556
1557 if (CanCreateNewRecipe && match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_AllOnes()))) {
1558 // Preserve nsw from the Mul on the new Sub.
1559 VPIRFlags::WrapFlagsTy NW = {
1560 false, cast<VPRecipeWithIRFlags>(Val: Def)->hasNoSignedWrap()};
1561 return Def->replaceAllUsesWith(New: Builder.createSub(
1562 LHS: Plan->getZero(Ty: A->getScalarType()), RHS: A, DL: Def->getDebugLoc(), Name: "", WrapFlags: NW));
1563 }
1564
1565 if (CanCreateNewRecipe &&
1566 match(R: Def, P: m_c_Add(Op0: m_VPValue(V&: X), Op1: m_Sub(Op0: m_ZeroInt(), Op1: m_VPValue(V&: Y))))) {
1567 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1568 // new Sub.
1569 VPIRFlags::WrapFlagsTy NW = {
1570 false,
1571 cast<VPRecipeWithIRFlags>(Val: Def)->hasNoSignedWrap() &&
1572 cast<VPRecipeWithIRFlags>(Val: Def->getOperand(N: Def->getOperand(N: 0) == X))
1573 ->hasNoSignedWrap()};
1574 return Def->replaceAllUsesWith(
1575 New: Builder.createSub(LHS: X, RHS: Y, DL: Def->getDebugLoc(), Name: "", WrapFlags: NW));
1576 }
1577
1578 const APInt *APC;
1579 if (CanCreateNewRecipe && match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_APInt(C&: APC))) &&
1580 APC->isPowerOf2())
1581 return Def->replaceAllUsesWith(New: Builder.createNaryOp(
1582 Opcode: Instruction::Shl,
1583 Operands: {A, Plan->getConstantInt(BitWidth: APC->getBitWidth(), Val: APC->exactLogBase2())},
1584 Flags: *cast<VPRecipeWithIRFlags>(Val: Def), DL: Def->getDebugLoc()));
1585
1586 if (CanCreateNewRecipe && match(R: Def, P: m_UDiv(Op0: m_VPValue(V&: A), Op1: m_APInt(C&: APC))) &&
1587 APC->isPowerOf2())
1588 return Def->replaceAllUsesWith(New: Builder.createNaryOp(
1589 Opcode: Instruction::LShr,
1590 Operands: {A, Plan->getConstantInt(BitWidth: APC->getBitWidth(), Val: APC->exactLogBase2())},
1591 Flags: *cast<VPRecipeWithIRFlags>(Val: Def), DL: Def->getDebugLoc()));
1592
1593 if (match(R: Def, P: m_Not(Op0: m_VPValue(V&: A)))) {
1594 if (match(V: A, P: m_Not(Op0: m_VPValue(V&: A))))
1595 return Def->replaceAllUsesWith(New: A);
1596
1597 // Try to fold Not into compares by adjusting the predicate in-place.
1598 CmpPredicate Pred;
1599 if (match(V: A, P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()))) {
1600 auto *Cmp = cast<VPRecipeWithIRFlags>(Val: A);
1601 if (all_of(Range: Cmp->users(),
1602 P: match_fn(P: m_CombineOr(
1603 Ps: m_Not(Op0: m_Specific(VPV: Cmp)),
1604 Ps: m_Select(Op0: m_Specific(VPV: Cmp), Op1: m_VPValue(), Op2: m_VPValue()))))) {
1605 Cmp->setPredicate(CmpInst::getInversePredicate(pred: Pred));
1606 for (VPUser *U : to_vector(Range: Cmp->users())) {
1607 auto *R = cast<VPSingleDefRecipe>(Val: U);
1608 if (match(R, P: m_Select(Op0: m_Specific(VPV: Cmp), Op1: m_VPValue(V&: X), Op2: m_VPValue(V&: Y)))) {
1609 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1610 R->setOperand(I: 1, New: Y);
1611 R->setOperand(I: 2, New: X);
1612 } else {
1613 // not (cmp pred) -> cmp inv_pred
1614 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1615 R->replaceAllUsesWith(New: Cmp);
1616 }
1617 }
1618 // If Cmp doesn't have a debug location, use the one from the negation,
1619 // to preserve the location.
1620 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1621 Cmp->setDebugLoc(Def->getDebugLoc());
1622 }
1623 }
1624 }
1625
1626 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1627 // any-of (fcmp uno %A, %B), ...
1628 if (match(R: Def, P: m_AnyOf())) {
1629 SmallVector<VPValue *, 4> NewOps;
1630 VPRecipeBase *UnpairedCmp = nullptr;
1631 for (VPValue *Op : Def->operands()) {
1632 VPValue *X;
1633 if (Op->getNumUsers() > 1 ||
1634 !match(V: Op, P: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: X),
1635 Op1: m_Deferred(V: X)))) {
1636 NewOps.push_back(Elt: Op);
1637 } else if (!UnpairedCmp) {
1638 UnpairedCmp = Op->getDefiningRecipe();
1639 } else {
1640 NewOps.push_back(Elt: Builder.createFCmp(Pred: CmpInst::FCMP_UNO,
1641 A: UnpairedCmp->getOperand(N: 0), B: X));
1642 UnpairedCmp = nullptr;
1643 }
1644 }
1645
1646 if (UnpairedCmp)
1647 NewOps.push_back(Elt: UnpairedCmp->getVPSingleValue());
1648
1649 if (NewOps.size() < Def->getNumOperands()) {
1650 VPValue *NewAnyOf = Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: NewOps);
1651 return Def->replaceAllUsesWith(New: NewAnyOf);
1652 }
1653 }
1654
1655 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1656 // This is useful for fmax/fmin without fast-math flags, where we need to
1657 // check if any operand is NaN.
1658 if (CanCreateNewRecipe &&
1659 match(R: Def, P: m_BinaryOr(Op0: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: X),
1660 Op1: m_Deferred(V: X)),
1661 Op1: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: Y),
1662 Op1: m_Deferred(V: Y))))) {
1663 VPValue *NewCmp = Builder.createFCmp(Pred: CmpInst::FCMP_UNO, A: X, B: Y);
1664 return Def->replaceAllUsesWith(New: NewCmp);
1665 }
1666
1667 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1668 if ((match(R: Def, P: m_DerivedIV(Op0: m_ZeroInt(), Op1: m_VPValue(V&: A), Op2: m_One())) ||
1669 match(R: Def, P: m_DerivedIV(Op0: m_ZeroInt(), Op1: m_ZeroInt(), Op2: m_VPValue()))) &&
1670 Def->getOperand(N: 1)->getScalarType() == Def->getScalarType())
1671 return Def->replaceAllUsesWith(New: Def->getOperand(N: 1));
1672
1673 if (match(R: Def, P: m_VPInstruction<VPInstruction::WideIVStep>(Ops: m_VPValue(V&: X),
1674 Ops: m_One()))) {
1675 Type *WideStepTy = Def->getScalarType();
1676 if (X->getScalarType() != WideStepTy)
1677 X = Builder.createWidenCast(Opcode: Instruction::Trunc, Op: X, ResultTy: WideStepTy);
1678 Def->replaceAllUsesWith(New: X);
1679 return;
1680 }
1681
1682 // For i1 vp.merges produced by AnyOf reductions:
1683 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1684 if (match(R: Def, P: m_Intrinsic<Intrinsic::vp_merge>(Op0: m_True(), Op1: m_VPValue(V&: A),
1685 Op2: m_VPValue(V&: X), Op3: m_VPValue())) &&
1686 match(V: A, P: m_c_BinaryOr(Op0: m_Specific(VPV: X), Op1: m_VPValue(V&: Y))) &&
1687 Def->getScalarType()->isIntegerTy(BitWidth: 1)) {
1688 Def->setOperand(I: 1, New: Def->getOperand(N: 0));
1689 Def->setOperand(I: 0, New: Y);
1690 return;
1691 }
1692
1693 // Simplify MaskedCond with no block mask to its single operand.
1694 if (match(R: Def, P: m_VPInstruction<VPInstruction::MaskedCond>()) &&
1695 !cast<VPInstruction>(Val: Def)->isMasked())
1696 return Def->replaceAllUsesWith(New: Def->getOperand(N: 0));
1697
1698 // Look through ExtractLastLane.
1699 if (match(R: Def, P: m_ExtractLastLane(Op0: m_VPValue(V&: A)))) {
1700 if (match(V: A, P: m_BuildVector())) {
1701 auto *BuildVector = cast<VPInstruction>(Val: A);
1702 Def->replaceAllUsesWith(
1703 New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 1));
1704 return;
1705 }
1706
1707 if (match(V: A, P: m_Broadcast(Op0: m_VPValue(V&: X))))
1708 return Def->replaceAllUsesWith(New: X);
1709
1710 if (isa<VPInstruction, VPReplicateRecipe>(Val: A) && vputils::isSingleScalar(VPV: A))
1711 return Def->replaceAllUsesWith(New: A);
1712
1713 if (Plan->hasScalarVFOnly())
1714 return Def->replaceAllUsesWith(New: A);
1715 }
1716
1717 // Look through ExtractPenultimateElement (BuildVector ....).
1718 if (match(R: Def, P: m_ExtractPenultimateElement(Op0: m_BuildVector()))) {
1719 auto *BuildVector = cast<VPInstruction>(Val: Def->getOperand(N: 0));
1720 Def->replaceAllUsesWith(
1721 New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 2));
1722 return;
1723 }
1724
1725 uint64_t Idx;
1726 if (match(R: Def, P: m_ExtractElement(Op0: m_BuildVector(), Op1: m_ConstantInt(C&: Idx)))) {
1727 auto *BuildVector = cast<VPInstruction>(Val: Def->getOperand(N: 0));
1728 Def->replaceAllUsesWith(New: BuildVector->getOperand(N: Idx));
1729 return;
1730 }
1731
1732 if (match(R: Def, P: m_BuildVector()) && all_equal(Range: Def->operands())) {
1733 Def->replaceAllUsesWith(
1734 New: Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Def->getOperand(N: 0)));
1735 return;
1736 }
1737
1738 // Look through broadcast of single-scalar when used as select conditions; in
1739 // that case the scalar condition can be used directly.
1740 if (match(R: Def,
1741 P: m_Select(Op0: m_Broadcast(Op0: m_VPValue(V&: C)), Op1: m_VPValue(), Op2: m_VPValue()))) {
1742 assert(vputils::isSingleScalar(C) &&
1743 "broadcast operand must be single-scalar");
1744 Def->setOperand(I: 0, New: C);
1745 return;
1746 }
1747
1748 if (match(R: Def, P: m_Broadcast(Op0: m_VPValue(V&: X))))
1749 return Def->replaceUsesWithIf(
1750 New: X, ShouldReplace: [Def](const VPUser &U, unsigned) { return U.usesScalars(Op: Def); });
1751
1752 if (isa<VPPhi, VPWidenPHIRecipe, VPHeaderPHIRecipe>(Val: Def)) {
1753 if (Def->getNumOperands() == 1) {
1754 Def->replaceAllUsesWith(New: Def->getOperand(N: 0));
1755 return;
1756 }
1757 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: Def)) {
1758 if (all_equal(Range: Phi->incoming_values()))
1759 Phi->replaceAllUsesWith(New: Phi->getOperand(N: 0));
1760 }
1761 return;
1762 }
1763
1764 VPIRValue *IRV;
1765 if (Def->getNumOperands() == 1 &&
1766 match(R: Def, P: m_ComputeReductionResult(Op0: m_VPIRValue(V&: IRV))))
1767 return Def->replaceAllUsesWith(New: IRV);
1768
1769 // Some simplifications can only be applied after unrolling. Perform them
1770 // below.
1771 if (!Plan->isUnrolled())
1772 return;
1773
1774 // After unrolling, extract-lane may be used to extract values from multiple
1775 // scalar sources. Only simplify when extracting from a single scalar source.
1776 VPValue *LaneToExtract;
1777 if (match(R: Def, P: m_ExtractLane(Op0: m_VPValue(V&: LaneToExtract), Op1: m_VPValue(V&: A)))) {
1778 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1779 if (vputils::isSingleScalar(VPV: A))
1780 return Def->replaceAllUsesWith(New: A);
1781
1782 // Replace extract-lane(0, canonical-WIDEN-INDUCTION) with the region's
1783 // scalar canonical IV.
1784 VPWidenIntOrFpInductionRecipe *WidenIV;
1785 if (match(V: LaneToExtract, P: m_ZeroInt()) &&
1786 match(V: A, P: m_CanonicalWidenIV(V&: WidenIV)))
1787 return Def->replaceAllUsesWith(New: WidenIV->getRegion()->getCanonicalIV());
1788
1789 // Simplify extract-lane with single source to extract-element.
1790 Def->replaceAllUsesWith(New: Builder.createNaryOp(
1791 Opcode: Instruction::ExtractElement, Operands: {A, LaneToExtract}, DL: Def->getDebugLoc()));
1792 return;
1793 }
1794
1795 // Look for cycles where Def is of the form:
1796 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1797 // IVInc = X + Step ; used by X and Def
1798 // Def = IVInc + Y
1799 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1800 // and if Inc exists, replace it with X.
1801 if (match(R: Def, P: m_Add(Op0: m_Add(Op0: m_VPValue(V&: X), Op1: m_VPValue()), Op1: m_VPValue(V&: Y))) &&
1802 isa<VPIRValue>(Val: Y) &&
1803 match(V: X, P: m_VPPhi(Op0: m_ZeroInt(), Op1: m_Specific(VPV: Def->getOperand(N: 0))))) {
1804 auto *Phi = cast<VPPhi>(Val: X);
1805 auto *IVInc = Def->getOperand(N: 0);
1806 if (IVInc->getNumUsers() == 2) {
1807 // If Phi has a second user (besides IVInc's defining recipe), it must
1808 // be Inc = Phi + Y for the fold to apply.
1809 auto *Inc = dyn_cast_or_null<VPSingleDefRecipe>(
1810 Val: findUserOf(V: Phi, P: m_Add(Op0: m_Specific(VPV: Phi), Op1: m_Specific(VPV: Y))));
1811 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1812 Def->replaceAllUsesWith(New: IVInc);
1813 if (Inc)
1814 Inc->replaceAllUsesWith(New: Phi);
1815 Phi->setOperand(I: 0, New: Y);
1816 return;
1817 }
1818 }
1819 }
1820
1821 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1822 // just the pointer operand.
1823 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Val: Def))
1824 if (!VPR->getVFxPart() || match(V: VPR->getVFxPart(), P: m_ZeroInt()))
1825 return VPR->replaceAllUsesWith(New: VPR->getOperand(N: 0));
1826
1827 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1828 // the start index is zero and only the first lane 0 is demanded.
1829 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: Def)) {
1830 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Def: Steps)) {
1831 Steps->replaceAllUsesWith(New: Steps->getOperand(N: 0));
1832 return;
1833 }
1834 }
1835 // Simplify redundant ReductionStartVector recipes after unrolling.
1836 VPValue *StartV;
1837 if (match(R: Def, P: m_VPInstruction<VPInstruction::ReductionStartVector>(
1838 Ops: m_VPValue(V&: StartV), Ops: m_VPValue(), Ops: m_VPValue()))) {
1839 Def->replaceUsesWithIf(New: StartV, ShouldReplace: [](const VPUser &U, unsigned Idx) {
1840 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &U);
1841 return PhiR && PhiR->isInLoop();
1842 });
1843 return;
1844 }
1845
1846 if (Plan->getConcreteUF() == 1 && match(R: Def, P: m_ExtractLastPart(Op0: m_VPValue(V&: A))))
1847 return Def->replaceAllUsesWith(New: A);
1848}
1849
1850void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1851 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1852 Plan.getEntry());
1853 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: RPOT)) {
1854 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB))
1855 if (auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R))
1856 simplifyRecipe(Def);
1857 }
1858}
1859
1860void VPlanTransforms::simplifyReverses(VPlan &Plan) {
1861 VPValue *X;
1862 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1863 Range: vp_depth_first_deep(G: Plan.getEntry())))
1864 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB))
1865 if (match(V: &R, P: m_Reverse(Op0: m_Reverse(Op0: m_VPValue(V&: X)))))
1866 R.getVPSingleValue()->replaceAllUsesWith(New: X);
1867}
1868
1869/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1870/// header mask to be simplified further when tail folding, e.g. in
1871/// optimizeEVLMasks.
1872static void reassociateHeaderMask(VPlan &Plan) {
1873 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1874 if (!HeaderMask)
1875 return;
1876
1877 SmallVector<VPUser *> Worklist;
1878 for (VPUser *U : HeaderMask->users())
1879 if (match(U, P: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue())))
1880 append_range(C&: Worklist, R: cast<VPSingleDefRecipe>(Val: U)->users());
1881
1882 while (!Worklist.empty()) {
1883 auto *R = dyn_cast<VPSingleDefRecipe>(Val: Worklist.pop_back_val());
1884 VPValue *X, *Y;
1885 if (!R || !match(R, P: m_LogicalAnd(
1886 Op0: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: X)),
1887 Op1: m_VPValue(V&: Y))))
1888 continue;
1889 append_range(C&: Worklist, R: R->users());
1890 VPBuilder Builder(R);
1891 R->replaceAllUsesWith(
1892 New: Builder.createLogicalAnd(LHS: HeaderMask, RHS: Builder.createLogicalAnd(LHS: X, RHS: Y)));
1893 }
1894}
1895
1896static std::optional<Instruction::BinaryOps>
1897getUnmaskedDivRemOpcode(Intrinsic::ID ID) {
1898 switch (ID) {
1899 case Intrinsic::masked_udiv:
1900 return Instruction::UDiv;
1901 case Intrinsic::masked_sdiv:
1902 return Instruction::SDiv;
1903 case Intrinsic::masked_urem:
1904 return Instruction::URem;
1905 case Intrinsic::masked_srem:
1906 return Instruction::SRem;
1907 default:
1908 return {};
1909 }
1910}
1911
/// Narrow recipes of \p Plan to single-scalar form. Three cases are handled,
/// in this order, for each VPWidenRecipe, VPWidenGEPRecipe, VPReplicateRecipe
/// and VPWidenIntrinsicRecipe:
///  * an unpredicated, not yet single-scalar replicated store whose operand 1
///    is single-scalar is re-created as a single-scalar store of an extracted
///    lane of the stored value;
///  * a masked div/rem intrinsic of which only the first lane is used is
///    replaced by the plain div/rem with a select-guarded divisor;
///  * any other single-scalar recipe is replaced by a single-scalar clone,
///    unless that is expected to only move broadcasts around (see below).
/// Plans with only a scalar VF are left untouched.
1912 static void narrowToSingleScalarRecipes(VPlan &Plan) {
1913 if (Plan.hasScalarVFOnly())
1914 return;
1915
// Recipes of each block are visited in reverse order; the early-inc range
// permits erasing the recipe currently being visited.
1916 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1917 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
1918 for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
1919 if (!isa<VPWidenRecipe, VPWidenGEPRecipe, VPReplicateRecipe,
1920 VPWidenIntrinsicRecipe>(Val: &R))
1921 continue;
// Replicate recipes that are already single-scalar or that are predicated are
// not candidates.
1922 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
1923 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1924 continue;
1925
1926 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(Val: &R);
// Case 1: replicated store whose operand 1 is single-scalar. Clone it as a
// single-scalar, unmasked replicate recipe and store only one extracted lane
// of operand 0.
1927 if (RepR && RepR->getOpcode() == Instruction::Store &&
1928 vputils::isSingleScalar(VPV: RepR->getOperand(N: 1))) {
1929 auto *Clone = new VPReplicateRecipe(
1930 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1931 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1932 *RepR /*Metadata*/, RepR->getDebugLoc());
1933 Clone->insertBefore(InsertPos: RepOrWidenR);
// The extracts are created right before the clone.
1934 VPBuilder Builder(Clone);
1935 VPValue *ExtractOp = Clone->getOperand(N: 0);
// If operand 1 is additionally uniform across VFs and UFs, take the last part
// first and then its last lane; otherwise only the last lane is extracted.
1936 if (vputils::isUniformAcrossVFsAndUFs(V: RepR->getOperand(N: 1)))
1937 ExtractOp =
1938 Builder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: ExtractOp);
1939 ExtractOp =
1940 Builder.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: ExtractOp);
1941 Clone->setOperand(I: 0, New: ExtractOp);
1942 RepR->eraseFromParent();
1943 continue;
1944 }
1945
1946 // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor.
// Case 2. Note that every VPWidenIntrinsicRecipe ends its processing in this
// branch: intrinsics that are not narrowed here are skipped entirely.
1947 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(Val: RepOrWidenR)) {
1948 if (!vputils::onlyFirstLaneUsed(Def: IntrR))
1949 continue;
1950 auto Opc = getUnmaskedDivRemOpcode(ID: IntrR->getVectorIntrinsicID());
1951 if (!Opc)
1952 continue;
1953 VPBuilder Builder(IntrR);
// The divisor becomes select(operand 2, operand 1, 1): operand 1 where
// operand 2 (used as the select condition) is true, and the constant 1
// otherwise.
1954 VPValue *SafeDivisor = Builder.createSelect(
1955 Cond: IntrR->getOperand(N: 2), TrueVal: IntrR->getOperand(N: 1),
1956 FalseVal: Plan.getConstantInt(Ty: IntrR->getScalarType(), Val: 1));
// The replacement uses the default flags of the new opcode rather than flags
// taken from the intrinsic recipe.
1957 VPValue *Clone = Builder.createNaryOp(
1958 Opcode: *Opc, Operands: {IntrR->getOperand(N: 0), SafeDivisor},
1959 Flags: VPIRFlags::getDefaultFlags(Opcode: *Opc), DL: IntrR->getDebugLoc());
1960 IntrR->replaceAllUsesWith(New: Clone);
1961 IntrR->eraseFromParent();
1962 continue;
1963 }
1964
1965 // Skip recipes that aren't single scalars.
1966 if (!vputils::isSingleScalar(VPV: RepOrWidenR))
1967 continue;
1968
1969 // Predicate to check if a user of Op introduces extra broadcasts.
// For a given Op this yields a predicate over users U: false for
// ExtractLastLane / ExtractLastPart / ExtractPenultimateElement
// VPInstructions, and otherwise true exactly when U does not use scalars of
// Op.
1970 auto IntroducesBCastOf = [](const VPValue *Op) {
1971 return [Op](const VPUser *U) {
1972 if (auto *VPI = dyn_cast<VPInstruction>(Val: U)) {
1973 if (is_contained(Set: {VPInstruction::ExtractLastLane,
1974 VPInstruction::ExtractLastPart,
1975 VPInstruction::ExtractPenultimateElement},
1976 Element: VPI->getOpcode()))
1977 return false;
1978 }
1979 return !U->usesScalars(Op);
1980 };
1981 };
1982
// Do not narrow when (a) some user of the recipe would require a broadcast of
// it, and (b) no operand qualifies. An operand qualifies when none of its
// other users requires a broadcast of it, and it is either a non-constant
// live-in IR value or a single-scalar replicate recipe.
1983 if (any_of(Range: RepOrWidenR->users(), P: IntroducesBCastOf(RepOrWidenR)) &&
1984 none_of(Range: RepOrWidenR->operands(), P: [&](VPValue *Op) {
1985 if (any_of(
1986 Range: make_filter_range(Range: Op->users(), Pred: not_equal_to(Arg&: RepOrWidenR)),
1987 P: IntroducesBCastOf(Op)))
1988 return false;
1989 // Non-constant live-ins require broadcasts, while constants do not
1990 // need explicit broadcasts.
1991 auto *IRV = dyn_cast<VPIRValue>(Val: Op);
1992 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(Val: IRV->getValue());
1993 auto *OpR = dyn_cast<VPReplicateRecipe>(Val: Op);
1994 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1995 }))
1996 continue;
1997
// Case 3: create the single-scalar clone with the same operands, flags and
// underlying instruction, no mask, empty metadata and an unknown debug
// location.
// NOTE(review): metadata and debug location of the original recipe are not
// carried over to the clone here - confirm this is intended.
1998 auto *Clone = VPBuilder::createSingleScalarOp(
1999 Opcode: getOpcodeOrIntrinsicID(R: RepOrWidenR)->second, Operands: RepOrWidenR->operands(),
2000 /*Mask=*/nullptr, Flags: *RepOrWidenR, Metadata: {}, DL: DebugLoc::getUnknown(),
2001 UV: RepOrWidenR->getUnderlyingInstr());
2002 Clone->insertBefore(InsertPos: RepOrWidenR);
2003 RepOrWidenR->replaceAllUsesWith(New: Clone);
// The original is only erased if isDeadRecipe considers it dead.
2004 if (isDeadRecipe(R&: *RepOrWidenR))
2005 RepOrWidenR->eraseFromParent();
2006 }
2007 }
2008 }
2009
2010/// Try to see if all of \p Blend's masks share a common value logically and'ed
2011/// and remove it from the masks.
2012static void removeCommonBlendMask(VPBlendRecipe *Blend) {
2013 if (Blend->isNormalized())
2014 return;
2015 VPValue *CommonEdgeMask;
2016 if (!match(V: Blend->getMask(Idx: 0),
2017 P: m_LogicalAnd(Op0: m_VPValue(V&: CommonEdgeMask), Op1: m_VPValue())))
2018 return;
2019 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
2020 if (!match(V: Blend->getMask(Idx: I),
2021 P: m_LogicalAnd(Op0: m_Specific(VPV: CommonEdgeMask), Op1: m_VPValue())))
2022 return;
2023 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
2024 Blend->setMask(Idx: I, V: Blend->getMask(Idx: I)->getDefiningRecipe()->getOperand(N: 1));
2025}
2026
2027/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
2028/// to make sure the masks are simplified.
2029static void simplifyBlends(VPlan &Plan) {
2030 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2031 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
2032 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
2033 auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R);
2034 if (!Blend)
2035 continue;
2036
2037 removeCommonBlendMask(Blend);
2038
2039 // Try to remove redundant blend recipes.
2040 SmallPtrSet<VPValue *, 4> UniqueValues;
2041 if (Blend->isNormalized() || !match(V: Blend->getMask(Idx: 0), P: m_False()))
2042 UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: 0));
2043 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
2044 if (!match(V: Blend->getMask(Idx: I), P: m_False()))
2045 UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: I));
2046
2047 if (UniqueValues.size() == 1) {
2048 Blend->replaceAllUsesWith(New: *UniqueValues.begin());
2049 Blend->eraseFromParent();
2050 continue;
2051 }
2052
2053 if (Blend->isNormalized())
2054 continue;
2055
2056 // Normalize the blend so its first incoming value is used as the initial
2057 // value with the others blended into it.
2058
2059 unsigned StartIndex = 0;
2060 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2061 // If a value's mask is used only by the blend then is can be deadcoded.
2062 // TODO: Find the most expensive mask that can be deadcoded, or a mask
2063 // that's used by multiple blends where it can be removed from them all.
2064 VPValue *Mask = Blend->getMask(Idx: I);
2065 if (Mask->hasOneUse() && !match(V: Mask, P: m_False())) {
2066 StartIndex = I;
2067 break;
2068 }
2069 }
2070
2071 SmallVector<VPValue *, 4> OperandsWithMask;
2072 OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: StartIndex));
2073
2074 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2075 if (I == StartIndex)
2076 continue;
2077 OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: I));
2078 OperandsWithMask.push_back(Elt: Blend->getMask(Idx: I));
2079 }
2080
2081 auto *NewBlend =
2082 new VPBlendRecipe(cast_or_null<PHINode>(Val: Blend->getUnderlyingValue()),
2083 OperandsWithMask, *Blend, Blend->getDebugLoc());
2084 NewBlend->insertBefore(InsertPos: &R);
2085
2086 VPValue *DeadMask = Blend->getMask(Idx: StartIndex);
2087 Blend->replaceAllUsesWith(New: NewBlend);
2088 Blend->eraseFromParent();
2089 recursivelyDeleteDeadRecipes(V: DeadMask);
2090
2091 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
2092 VPValue *NewMask;
2093 if (NewBlend->getNumOperands() == 3 &&
2094 match(V: NewBlend->getMask(Idx: 1), P: m_Not(Op0: m_VPValue(V&: NewMask)))) {
2095 VPValue *Inc0 = NewBlend->getOperand(N: 0);
2096 VPValue *Inc1 = NewBlend->getOperand(N: 1);
2097 VPValue *OldMask = NewBlend->getOperand(N: 2);
2098 NewBlend->setOperand(I: 0, New: Inc1);
2099 NewBlend->setOperand(I: 1, New: Inc0);
2100 NewBlend->setOperand(I: 2, New: NewMask);
2101 if (OldMask->user_empty())
2102 cast<VPInstruction>(Val: OldMask)->eraseFromParent();
2103 }
2104 }
2105 }
2106}
2107
2108/// Optimize the width of vector induction variables in \p Plan based on a known
2109/// constant Trip Count, \p BestVF and \p BestUF.
2110static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
2111 ElementCount BestVF,
2112 unsigned BestUF) {
2113 // Only proceed if we have not completely removed the vector region.
2114 if (!Plan.getVectorLoopRegion())
2115 return false;
2116
2117 const APInt *TC;
2118 if (!BestVF.isFixed() || !match(V: Plan.getTripCount(), P: m_APInt(C&: TC)))
2119 return false;
2120
2121 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2122 // and UF. Returns at least 8.
2123 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2124 APInt AlignedTC =
2125 Align * APIntOps::RoundingUDiv(A: TC, B: APInt(TC.getBitWidth(), Align),
2126 RM: APInt::Rounding::UP);
2127 APInt MaxVal = AlignedTC - 1;
2128 return std::max<unsigned>(a: PowerOf2Ceil(A: MaxVal.getActiveBits()), b: 8);
2129 };
2130 unsigned NewBitWidth =
2131 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2132
2133 LLVMContext &Ctx = Plan.getContext();
2134 auto *NewIVTy = IntegerType::get(C&: Ctx, NumBits: NewBitWidth);
2135
2136 bool MadeChange = false;
2137
2138 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2139 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2140 // Currently only handle canonical IVs as it is trivial to replace the start
2141 // and stop values, and we currently only perform the optimization when the
2142 // IV has a single use.
2143 VPWidenIntOrFpInductionRecipe *WideIV;
2144 if (!match(V: &Phi, P: m_CanonicalWidenIV(V&: WideIV)))
2145 continue;
2146 if (WideIV->hasMoreThanOneUniqueUser() ||
2147 NewIVTy == WideIV->getScalarType())
2148 continue;
2149
2150 // Currently only handle cases where the single user is a header-mask
2151 // comparison with the backedge-taken-count.
2152 VPUser *SingleUser = WideIV->getSingleUser();
2153 if (!SingleUser ||
2154 !match(U: SingleUser,
2155 P: m_ICmp(Op0: m_Specific(VPV: WideIV),
2156 Op1: m_Broadcast(Op0: m_Specific(VPV: Plan.getBackedgeTakenCount())))))
2157 continue;
2158
2159 // Update IV operands and comparison bound to use new narrower type.
2160 assert(!WideIV->getTruncInst() &&
2161 "canonical IV is not expected to have a truncation");
2162 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
2163 WideIV->getPHINode(), Plan.getZero(Ty: NewIVTy),
2164 Plan.getConstantInt(Ty: NewIVTy, Val: 1), WideIV->getVFValue(),
2165 WideIV->getInductionDescriptor(), *WideIV, WideIV->getDebugLoc());
2166 NewWideIV->insertBefore(InsertPos: WideIV);
2167
2168 auto *NewBTC = new VPWidenCastRecipe(
2169 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2170 nullptr, VPIRFlags::getDefaultFlags(Opcode: Instruction::Trunc));
2171 Plan.getVectorPreheader()->appendRecipe(Recipe: NewBTC);
2172 auto *Cmp = cast<VPInstruction>(Val: WideIV->getSingleUser());
2173 Cmp->replaceAllUsesWith(
2174 New: VPBuilder(Cmp).createICmp(Pred: Cmp->getPredicate(), A: NewWideIV, B: NewBTC));
2175
2176 MadeChange = true;
2177 }
2178
2179 return MadeChange;
2180}
2181
2182/// Return true if \p Cond is known to be true for given \p BestVF and \p
2183/// BestUF.
2184static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
2185 ElementCount BestVF, unsigned BestUF,
2186 PredicatedScalarEvolution &PSE) {
2187 if (match(V: Cond, P: m_BinaryOr(Op0: m_VPValue(), Op1: m_VPValue())))
2188 return any_of(Range: Cond->getDefiningRecipe()->operands(), P: [&Plan, BestVF, BestUF,
2189 &PSE](VPValue *C) {
2190 return isConditionTrueViaVFAndUF(Cond: C, Plan, BestVF, BestUF, PSE);
2191 });
2192
2193 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2194 if (!match(V: Cond, P: m_SpecificICmp(
2195 MatchPred: CmpInst::ICMP_EQ,
2196 Op0: m_c_Add(Op0: m_Specific(VPV: CanIV), Op1: m_Specific(VPV: &Plan.getVFxUF())),
2197 Op1: m_Specific(VPV: &Plan.getVectorTripCount()))))
2198 return false;
2199
2200 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2201 // count is not conveniently available as SCEV so far, so we compare directly
2202 // against the original trip count. This is stricter than necessary, as we
2203 // will only return true if the trip count == vector trip count.
2204 const SCEV *VectorTripCount =
2205 vputils::getSCEVExprForVPValue(V: &Plan.getVectorTripCount(), PSE);
2206 if (isa<SCEVCouldNotCompute>(Val: VectorTripCount))
2207 VectorTripCount = vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), PSE);
2208 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2209 "Trip count SCEV must be computable");
2210 ScalarEvolution &SE = *PSE.getSE();
2211 ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
2212 const SCEV *C = SE.getElementCount(Ty: VectorTripCount->getType(), EC: NumElements);
2213 return SE.isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: VectorTripCount, RHS: C);
2214}
2215
2216 /// Try to replace multiple active lane masks used for control flow with
2217 /// a single, wide active lane mask instruction followed by multiple
2218 /// extract subvector intrinsics. This applies to the active lane mask
2219 /// instructions both in the loop and in the preheader.
2220 /// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2221 /// new extracts from the first active lane mask, which has its last
2222 /// operand (multiplier) set to UF.
/// Returns true if \p Plan was changed. Nothing is changed when
/// EnableWideActiveLaneMask is not set, \p VF is not a vector VF, \p UF is 1,
/// or the vector region's exiting block does not end in a branch-on-cond of
/// a negated active lane mask.
2223 static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
2224 unsigned UF) {
2225 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2226 return false;
2227
2228 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2229 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
// The last recipe of the exiting block is treated as the terminator.
2230 auto *Term = &ExitingVPBB->back();
2231
2232 using namespace llvm::VPlanPatternMatch;
2233 if (!match(V: Term, P: m_BranchOnCond(Op0: m_Not(Op0: m_ActiveLaneMask(
2234 Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())))))
2235 return false;
2236
2237 auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry());
2238 LLVMContext &Ctx = Plan.getContext();
2239
// Helper: for each unroll part create a vector_extract intrinsic recipe taking
// ALM and the 64-bit constant index VF.getKnownMinValue() * Part, with the
// debug location of ALM. The recipe for part P is stored in Extracts[P], so
// Extracts must already hold at least UF elements. Every extract is inserted
// after ALM.
2240 auto ExtractFromALM = [&](VPInstruction *ALM,
2241 SmallVectorImpl<VPValue *> &Extracts) {
2242 DebugLoc DL = ALM->getDebugLoc();
2243 for (unsigned Part = 0; Part < UF; ++Part) {
2244 SmallVector<VPValue *> Ops;
2245 Ops.append(IL: {ALM, Plan.getConstantInt(BitWidth: 64, Val: VF.getKnownMinValue() * Part)});
2246 auto *Ext =
2247 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2248 IntegerType::getInt1Ty(C&: Ctx), {}, {}, DL);
2249 Extracts[Part] = Ext;
2250 Ext->insertAfter(InsertPos: ALM);
2251 }
2252 };
2253
2254 // Create a list of each active lane mask phi, ordered by unroll part.
2255 SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
2256 for (VPRecipeBase &R : Header->phis()) {
2257 auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(Val: &R);
2258 if (!Phi)
2259 continue;
// The result of match() is deliberately not checked here: Index stays null if
// the backedge value is not an ActiveLaneMask, which the assert below reports
// in builds with assertions enabled.
2260 VPValue *Index = nullptr;
2261 match(V: Phi->getBackedgeValue(),
2262 P: m_ActiveLaneMask(Op0: m_VPValue(V&: Index), Op1: m_VPValue(), Op2: m_VPValue()));
2263 assert(Index && "Expected index from ActiveLaneMask instruction");
2264
// The unroll part is read from the constant multiplier of the second operand
// of CanonicalIVIncrementForPart.
// NOTE(review): Part is used to index Phis (size UF) without a bounds check -
// presumably the unroller guarantees Part < UF; confirm.
2265 uint64_t Part;
2266 if (match(V: Index,
2267 P: m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
2268 Ops: m_VPValue(), Ops: m_Mul(Op0: m_VPValue(), Op1: m_ConstantInt(C&: Part)))))
2269 Phis[Part] = Phi;
2270 else {
2271 // Anything other than a CanonicalIVIncrementForPart is part 0
2272 assert(!match(
2273 Index,
2274 m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()));
2275 Phis[0] = Phi;
2276 }
2277 }
2278
2279 assert(all_of(Phis, not_equal_to(nullptr)) &&
2280 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2281
// The start and backedge values of the part-0 phi are the two lane masks that
// get widened.
2282 auto *EntryALM = cast<VPInstruction>(Val: Phis[0]->getStartValue());
2283 auto *LoopALM = cast<VPInstruction>(Val: Phis[0]->getBackedgeValue());
2284
2285 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2286 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2287 "Expected incoming values of Phi to be ActiveLaneMasks");
2288
2289 // When using wide lane masks, the return type of the get.active.lane.mask
2290 // intrinsic is VF x UF (last operand).
2291 VPValue *ALMMultiplier = Plan.getConstantInt(BitWidth: 64, Val: UF);
2292 EntryALM->setOperand(I: 2, New: ALMMultiplier);
2293 LoopALM->setOperand(I: 2, New: ALMMultiplier);
2294
2295 // Create UF x extract vectors and insert into preheader.
2296 SmallVector<VPValue *> EntryExtracts(UF);
2297 ExtractFromALM(EntryALM, EntryExtracts);
2298
2299 // Create UF x extract vectors and insert before the loop compare & branch,
2300 // updating the compare to use the first extract.
2301 SmallVector<VPValue *> LoopExtracts(UF);
2302 ExtractFromALM(LoopALM, LoopExtracts);
// Operand 0 of the terminator is the Not matched above; it is redirected from
// the lane mask to the part-0 extract.
2303 VPInstruction *Not = cast<VPInstruction>(Val: Term->getOperand(N: 0));
2304 Not->setOperand(I: 0, New: LoopExtracts[0]);
2305
2306 // Update the incoming values of active lane mask phis.
2307 for (unsigned Part = 0; Part < UF; ++Part) {
2308 Phis[Part]->setStartValue(EntryExtracts[Part]);
2309 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2310 }
2311
2312 return true;
2313 }
2314
2315/// Try to simplify the branch condition of \p Plan. This may restrict the
2316/// resulting plan to \p BestVF and \p BestUF.
2317static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
2318 unsigned BestUF,
2319 PredicatedScalarEvolution &PSE) {
2320 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2321 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2322 auto *Term = &ExitingVPBB->back();
2323 VPValue *Cond;
2324 auto m_CanIVInc = m_Add(Op0: m_VPValue(), Op1: m_Specific(VPV: &Plan.getVFxUF()));
2325 // Check if the branch condition compares the canonical IV increment (for main
2326 // loop), or the canonical IV increment plus an offset (for epilog loop).
2327 if (match(V: Term, P: m_BranchOnCount(
2328 Op0: m_CombineOr(Ps: m_CanIVInc, Ps: m_c_Add(Op0: m_CanIVInc, Op1: m_LiveIn())),
2329 Op1: m_VPValue())) ||
2330 match(V: Term, P: m_BranchOnCond(Op0: m_Not(Op0: m_ActiveLaneMask(
2331 Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue()))))) {
2332 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2333 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2334 const SCEV *VectorTripCount =
2335 vputils::getSCEVExprForVPValue(V: &Plan.getVectorTripCount(), PSE);
2336 if (isa<SCEVCouldNotCompute>(Val: VectorTripCount))
2337 VectorTripCount =
2338 vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), PSE);
2339 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2340 "Trip count SCEV must be computable");
2341 ScalarEvolution &SE = *PSE.getSE();
2342 ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
2343 const SCEV *C = SE.getElementCount(Ty: VectorTripCount->getType(), EC: NumElements);
2344 if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: VectorTripCount, RHS: C))
2345 return false;
2346 } else if (match(V: Term, P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))) ||
2347 match(V: Term, P: m_BranchOnTwoConds(Op0: m_VPValue(), Op1: m_VPValue(V&: Cond)))) {
2348 // For BranchOnCond, check if we can prove the condition to be true using VF
2349 // and UF.
2350 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2351 return false;
2352 } else {
2353 return false;
2354 }
2355
2356 // The vector loop region only executes once. Convert terminator of the
2357 // exiting block to exit in the first iteration.
2358 if (match(V: Term, P: m_BranchOnTwoConds())) {
2359 Term->setOperand(I: 1, New: Plan.getTrue());
2360 return true;
2361 }
2362
2363 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2364 {}, Term->getDebugLoc());
2365 ExitingVPBB->appendRecipe(Recipe: BOC);
2366 Term->eraseFromParent();
2367
2368 return true;
2369}
2370
2371/// From the definition of llvm.experimental.get.vector.length,
2372/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
2373bool VPlanTransforms::simplifyKnownEVL(VPlan &Plan, ElementCount VF,
2374 PredicatedScalarEvolution &PSE) {
2375 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2376 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
2377 for (VPRecipeBase &R : *VPBB) {
2378 VPValue *AVL;
2379 if (!match(V: &R, P: m_EVL(Op0: m_VPValue(V&: AVL))))
2380 continue;
2381
2382 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(V: AVL, PSE);
2383 if (isa<SCEVCouldNotCompute>(Val: AVLSCEV))
2384 continue;
2385 ScalarEvolution &SE = *PSE.getSE();
2386 const SCEV *VFSCEV = SE.getElementCount(Ty: AVLSCEV->getType(), EC: VF);
2387 if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: AVLSCEV, RHS: VFSCEV))
2388 continue;
2389
2390 VPValue *Trunc = VPBuilder(&R).createScalarZExtOrTrunc(
2391 Op: AVL, ResultTy: Type::getInt32Ty(C&: Plan.getContext()), SrcTy: AVLSCEV->getType(),
2392 DL: R.getDebugLoc());
2393 if (Trunc != AVL) {
2394 auto *TruncR = cast<VPSingleDefRecipe>(Val: Trunc);
2395 const DataLayout &DL = Plan.getDataLayout();
2396 if (VPValue *Folded = tryToFoldLiveIns(R&: *TruncR, Operands: TruncR->operands(), DL))
2397 Trunc = Folded;
2398 }
2399 R.getVPSingleValue()->replaceAllUsesWith(New: Trunc);
2400 return true;
2401 }
2402 }
2403 return false;
2404}
2405
2406void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
2407 unsigned BestUF,
2408 PredicatedScalarEvolution &PSE) {
2409 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2410 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2411
2412 bool MadeChange = tryToReplaceALMWithWideALM(Plan, VF: BestVF, UF: BestUF);
2413 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2414 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2415
2416 if (MadeChange) {
2417 Plan.setVF(BestVF);
2418 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2419 }
2420}
2421
2422void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
2423 for (VPRecipeBase &R :
2424 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
2425 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
2426 if (!PhiR)
2427 continue;
2428 RecurKind RK = PhiR->getRecurrenceKind();
2429 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2430 RK != RecurKind::AddChainWithSubs)
2431 continue;
2432
2433 for (VPUser *U : collectUsersRecursively(V: PhiR))
2434 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: U)) {
2435 RecWithFlags->dropPoisonGeneratingFlags();
2436 }
2437 }
2438}
2439
2440namespace {
2441struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2442 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2443 /// return that source element type.
2444 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2445 // All VPInstructions that lower to GEPs must have the i8 source element
2446 // type (as they are PtrAdds), so we omit it.
2447 return TypeSwitch<const VPSingleDefRecipe *, Type *>(R)
2448 .Case(caseFn: [](const VPReplicateRecipe *I) -> Type * {
2449 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: I->getUnderlyingValue()))
2450 return GEP->getSourceElementType();
2451 return nullptr;
2452 })
2453 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2454 caseFn: [](auto *I) { return I->getSourceElementType(); })
2455 .Default(defaultFn: [](auto *) { return nullptr; });
2456 }
2457
2458 /// Returns true if recipe \p Def can be safely handed for CSE.
2459 static bool canHandle(const VPSingleDefRecipe *Def) {
2460 // We can extend the list of handled recipes in the future,
2461 // provided we account for the data embedded in them while checking for
2462 // equality or hashing.
2463 auto C = getOpcodeOrIntrinsicID(R: Def);
2464
2465 // The issue with (Insert|Extract)Value is that the index of the
2466 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2467 // VPlan.
2468 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2469 C->second == Instruction::ExtractValue)))
2470 return false;
2471
2472 // During CSE, we can only handle recipes that don't read from memory: if
2473 // they read from memory, there could be an intervening write to memory
2474 // before the next instance is CSE'd, leading to an incorrect result.
2475 return !Def->mayReadFromMemory();
2476 }
2477
2478 /// Hash the underlying data of \p Def.
2479 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2480 hash_code Result = hash_combine(
2481 args: Def->getVPRecipeID(), args: getOpcodeOrIntrinsicID(R: Def),
2482 args: getGEPSourceElementType(R: Def), args: Def->getScalarType(),
2483 args: vputils::isSingleScalar(VPV: Def), args: hash_combine_range(R: Def->operands()));
2484 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Val: Def))
2485 if (RFlags->hasPredicate())
2486 return hash_combine(args: Result, args: RFlags->getPredicate());
2487 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Val: Def))
2488 return hash_combine(args: Result, args: SIVSteps->getInductionOpcode());
2489 return Result;
2490 }
2491
2492 /// Check equality of underlying data of \p L and \p R.
2493 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2494 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2495 getOpcodeOrIntrinsicID(R: L) != getOpcodeOrIntrinsicID(R) ||
2496 getGEPSourceElementType(R: L) != getGEPSourceElementType(R) ||
2497 vputils::isSingleScalar(VPV: L) != vputils::isSingleScalar(VPV: R) ||
2498 !equal(LRange: L->operands(), RRange: R->operands()))
2499 return false;
2500 assert(getOpcodeOrIntrinsicID(L) && getOpcodeOrIntrinsicID(R) &&
2501 "must have valid opcode info for both recipes");
2502 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(Val: L))
2503 if (LFlags->hasPredicate() &&
2504 LFlags->getPredicate() !=
2505 cast<VPRecipeWithIRFlags>(Val: R)->getPredicate())
2506 return false;
2507 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(Val: L))
2508 if (LSIV->getInductionOpcode() !=
2509 cast<VPScalarIVStepsRecipe>(Val: R)->getInductionOpcode())
2510 return false;
2511 // Recipes in replicate regions implicitly depend on predicate. If either
2512 // recipe is in a replicate region, only consider them equal if both have
2513 // the same parent.
2514 const VPRegionBlock *RegionL = L->getRegion();
2515 const VPRegionBlock *RegionR = R->getRegion();
2516 if (((RegionL && RegionL->isReplicator()) ||
2517 (RegionR && RegionR->isReplicator())) &&
2518 L->getParent() != R->getParent())
2519 return false;
2520 return L->getScalarType() == R->getScalarType();
2521 }
2522};
2523} // end anonymous namespace
2524
2525/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2526/// Plan.
2527void VPlanTransforms::cse(VPlan &Plan) {
2528 VPDominatorTree VPDT(Plan);
2529 DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
2530
2531 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
2532 Plan.getEntry());
2533 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: RPOT)) {
2534 for (VPRecipeBase &R : *VPBB) {
2535 auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R);
2536 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2537 continue;
2538 if (VPSingleDefRecipe *V = CSEMap.lookup(Val: Def)) {
2539 // V must dominate Def for a valid replacement.
2540 if (!VPDT.dominates(A: V->getParent(), B: VPBB))
2541 continue;
2542 // Only keep flags present on both V and Def.
2543 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Val: V))
2544 RFlags->intersectFlags(Other: *cast<VPRecipeWithIRFlags>(Val: Def));
2545 Def->replaceAllUsesWith(New: V);
2546 continue;
2547 }
2548 CSEMap[Def] = Def;
2549 }
2550 }
2551}
2552
2553/// Return true if we do not know how to (mechanically) hoist or sink a
2554/// non-memory or memory recipe \p R out of a loop region. When sinking, passing
2555/// \p Sinking = true ensures that assumes aren't sunk.
2556static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB,
2557 VPBasicBlock *LastBB,
2558 bool Sinking = false) {
2559 if (!isa<VPReplicateRecipe>(Val: R) || !R.mayReadOrWriteMemory() ||
2560 match(V: &R, P: m_Intrinsic<Intrinsic::assume>()))
2561 return vputils::cannotHoistOrSinkRecipe(R, Sinking);
2562
2563 // Check that the memory operation doesn't alias between FirstBB and LastBB.
2564 auto MemLoc = vputils::getMemoryLocation(R);
2565
2566 // TODO: Could make use of SinkStoreInfo::isNoAliasViaDistance by collecting
2567 // stores upfront, and constructing a full SinkStoreInfo.
2568 auto SinkInfo =
2569 Sinking ? std::make_optional(t: SinkStoreInfo(cast<VPReplicateRecipe>(Val&: R)))
2570 : std::nullopt;
2571
2572 return !MemLoc ||
2573 !canHoistOrSinkWithNoAliasCheck(MemLoc: *MemLoc, FirstBB, LastBB, SinkInfo);
2574}
2575
2576/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2577static void licm(VPlan &Plan) {
2578 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2579
2580 // Hoist any loop invariant recipes from the vector loop region to the
2581 // preheader. Preform a shallow traversal of the vector loop region, to
2582 // exclude recipes in replicate regions. Since the top-level blocks in the
2583 // vector loop region are guaranteed to execute if the vector pre-header is,
2584 // we don't need to check speculation safety.
2585 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2586 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2587 "Expected vector prehader's successor to be the vector loop region");
2588 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2589 Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) {
2590 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
2591 if (cannotHoistOrSinkRecipe(R, FirstBB: LoopRegion->getEntryBasicBlock(),
2592 LastBB: LoopRegion->getExitingBasicBlock()))
2593 continue;
2594 if (any_of(Range: R.operands(), P: [](VPValue *Op) {
2595 return !Op->isDefinedOutsideLoopRegions();
2596 }))
2597 continue;
2598 R.moveBefore(BB&: *Preheader, I: Preheader->end());
2599 }
2600 }
2601
2602#ifndef NDEBUG
2603 VPDominatorTree VPDT(Plan);
2604#endif
2605 // Sink recipes with no users inside the vector loop region if all users are
2606 // in the same exit block of the region.
2607 // TODO: Extend to sink recipes from inner loops.
2608 PostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> POT(
2609 LoopRegion->getEntry());
2610 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: POT)) {
2611 for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
2612 if (cannotHoistOrSinkRecipe(R, FirstBB: LoopRegion->getEntryBasicBlock(),
2613 LastBB: LoopRegion->getExitingBasicBlock(),
2614 /*Sinking=*/true))
2615 continue;
2616
2617 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
2618 assert(!RepR->isPredicated() &&
2619 "Expected prior transformation of predicated replicates to "
2620 "replicate regions");
2621 // narrowToSingleScalarRecipes should have already maximally narrowed
2622 // replicates to single-scalar replicates.
2623 // TODO: When unrolling, replicateByVF doesn't handle sunk
2624 // non-single-scalar replicates correctly.
2625 if (!RepR->isSingleScalar())
2626 continue;
2627
2628 // The pointer operand of stores must be loop-invariant.
2629 if (RepR->getOpcode() == Instruction::Store &&
2630 !RepR->getOperand(N: 1)->isDefinedOutsideLoopRegions())
2631 continue;
2632 }
2633
2634 [[maybe_unused]] auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
2635 assert((!R.mayWriteToMemory() ||
2636 (RepR && RepR->getOpcode() == Instruction::Store &&
2637 RepR->getOperand(1)->isDefinedOutsideLoopRegions())) &&
2638 "The only recipes that may write to memory are expected to be "
2639 "stores with invariant pointer-operand");
2640
2641 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2642 // support recipes with multiple defined values (e.g., interleaved loads).
2643 auto *Def = cast<VPSingleDefRecipe>(Val: &R);
2644
2645 // Cannot sink the recipe if the user is defined in a loop region or a
2646 // non-successor of the vector loop region. Cannot sink if user is a phi
2647 // either.
2648 VPBasicBlock *SinkBB = nullptr;
2649 if (any_of(Range: Def->users(), P: [&SinkBB, &LoopRegion](VPUser *U) {
2650 auto *UserR = cast<VPRecipeBase>(Val: U);
2651 VPBasicBlock *Parent = UserR->getParent();
2652 // TODO: Support sinking when users are in multiple blocks.
2653 if (SinkBB && SinkBB != Parent)
2654 return true;
2655 SinkBB = Parent;
2656 // TODO: If the user is a PHI node, we should check the block of
2657 // incoming value. Support PHI node users if needed.
2658 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2659 Parent->getSinglePredecessor() != LoopRegion;
2660 }))
2661 continue;
2662
2663 if (!SinkBB)
2664 SinkBB = cast<VPBasicBlock>(Val: LoopRegion->getSingleSuccessor());
2665
2666 // TODO: This will need to be a check instead of a assert after
2667 // conditional branches in vectorized loops are supported.
2668 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2669 "Defining block must dominate sink block");
2670 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2671 // just moving.
2672 Def->moveBefore(BB&: *SinkBB, I: SinkBB->getFirstNonPhi());
2673 }
2674 }
2675}
2676
2677void VPlanTransforms::truncateToMinimalBitwidths(
2678 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2679 if (Plan.hasScalarVFOnly())
2680 return;
2681 // Keep track of created truncates, so they can be re-used. Note that we
2682 // cannot use RAUW after creating a new truncate, as this would could make
2683 // other uses have different types for their operands, making them invalidly
2684 // typed.
2685 DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
2686 VPBasicBlock *PH = Plan.getVectorPreheader();
2687 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2688 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
2689 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
2690 if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
2691 VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R))
2692 continue;
2693
2694 VPValue *ResultVPV = R.getVPSingleValue();
2695 auto *UI = cast_or_null<Instruction>(Val: ResultVPV->getUnderlyingValue());
2696 unsigned NewResSizeInBits = MinBWs.lookup(Key: UI);
2697 if (!NewResSizeInBits)
2698 continue;
2699
2700 // If the value wasn't vectorized, we must maintain the original scalar
2701 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2702 // skip casts which do not need to be handled explicitly here, as
2703 // redundant casts will be removed during recipe simplification.
2704 if (isa<VPReplicateRecipe, VPWidenCastRecipe>(Val: &R))
2705 continue;
2706
2707 Type *OldResTy = ResultVPV->getScalarType();
2708 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2709 assert(OldResTy->isIntegerTy() && "only integer types supported");
2710 (void)OldResSizeInBits;
2711
2712 auto *NewResTy = IntegerType::get(C&: Plan.getContext(), NumBits: NewResSizeInBits);
2713
2714 // Any wrapping introduced by shrinking this operation shouldn't be
2715 // considered undefined behavior. So, we can't unconditionally copy
2716 // arithmetic wrapping flags to VPW.
2717 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(Val: &R))
2718 VPW->dropPoisonGeneratingFlags();
2719
2720 assert((OldResSizeInBits != NewResSizeInBits ||
2721 match(&R, m_ICmp(m_VPValue(), m_VPValue()))) &&
2722 "Only ICmps should not need extending the result.");
2723 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2724
2725 // For loads/intrinsics we don't recreate the recipe; just wrap the
2726 // original wide result in a ZExt to OldResTy.
2727 if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R)) {
2728 if (OldResSizeInBits != NewResSizeInBits) {
2729 auto *Ext = VPBuilder::getToInsertAfter(R: &R).createWidenCast(
2730 Opcode: Instruction::ZExt, Op: ResultVPV, ResultTy: OldResTy);
2731 ResultVPV->replaceAllUsesWith(New: Ext);
2732 Ext->setOperand(I: 0, New: ResultVPV);
2733 }
2734 continue;
2735 }
2736
2737 // Shrink operands by introducing truncates as needed.
2738 unsigned StartIdx =
2739 match(V: &R, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())) ? 1 : 0;
2740 SmallVector<VPValue *> NewOperands(R.operands());
2741 for (VPValue *&Op : drop_begin(RangeOrContainer&: NewOperands, N: StartIdx)) {
2742 unsigned OpSizeInBits = Op->getScalarType()->getScalarSizeInBits();
2743 if (OpSizeInBits == NewResSizeInBits)
2744 continue;
2745 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2746 auto [ProcessedIter, Inserted] = ProcessedTruncs.try_emplace(Key: Op);
2747 if (Inserted) {
2748 VPBuilder Builder;
2749 if (isa<VPIRValue>(Val: Op))
2750 Builder.setInsertPoint(PH);
2751 else
2752 Builder.setInsertPoint(&R);
2753 ProcessedIter->second =
2754 Builder.createWidenCast(Opcode: Instruction::Trunc, Op, ResultTy: NewResTy);
2755 }
2756 Op = ProcessedIter->second;
2757 }
2758
2759 auto *NWR = cast<VPWidenRecipe>(Val: &R)->cloneWithOperands(NewOperands);
2760 NWR->insertBefore(InsertPos: &R);
2761
2762 // Wrap NWR in a ZExt to preserve the original wide type for downstream
2763 // users (unless this is an ICmp, which produces i1 regardless).
2764 VPValue *Replacement = NWR->getVPSingleValue();
2765 if (OldResSizeInBits != NewResSizeInBits)
2766 Replacement =
2767 VPBuilder::getToInsertAfter(R: NWR)
2768 .createWidenCast(Opcode: Instruction::ZExt, Op: Replacement, ResultTy: OldResTy)
2769 ->getVPSingleValue();
2770 ResultVPV->replaceAllUsesWith(New: Replacement);
2771 R.eraseFromParent();
2772 }
2773 }
2774}
2775
2776bool VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
2777 std::optional<VPDominatorTree> VPDT;
2778 if (OnlyLatches)
2779 VPDT.emplace(args&: Plan);
2780
2781 // Collect all blocks before modifying the CFG so we can identify unreachable
2782 // ones after constant branch removal.
2783 SmallVector<VPBlockBase *> AllBlocks(vp_depth_first_shallow(G: Plan.getEntry()));
2784
2785 bool SimplifiedPhi = false;
2786 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: AllBlocks)) {
2787 VPValue *Cond;
2788 // Skip blocks that are not terminated by BranchOnCond.
2789 if (VPBB->empty() || !match(V: &VPBB->back(), P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))))
2790 continue;
2791
2792 if (OnlyLatches && !VPBlockUtils::isLatch(VPB: VPBB, VPDT: *VPDT))
2793 continue;
2794
2795 assert(VPBB->getNumSuccessors() == 2 &&
2796 "Two successors expected for BranchOnCond");
2797 unsigned RemovedIdx;
2798 if (match(V: Cond, P: m_True()))
2799 RemovedIdx = 1;
2800 else if (match(V: Cond, P: m_False()))
2801 RemovedIdx = 0;
2802 else
2803 continue;
2804
2805 VPBasicBlock *RemovedSucc =
2806 cast<VPBasicBlock>(Val: VPBB->getSuccessors()[RemovedIdx]);
2807 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2808 "There must be a single edge between VPBB and its successor");
2809 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2810 // these recipes.
2811 auto Phis = RemovedSucc->phis();
2812 for (VPRecipeBase &R : Phis)
2813 cast<VPPhiAccessors>(Val: &R)->removeIncomingValueFor(IncomingBlock: VPBB);
2814 SimplifiedPhi |= !std::empty(cont: Phis);
2815
2816 // Disconnect blocks and remove the terminator.
2817 VPBlockUtils::disconnectBlocks(From: VPBB, To: RemovedSucc);
2818 VPBB->back().eraseFromParent();
2819 }
2820
2821 // Compute which blocks are still reachable from the entry after constant
2822 // branch removal.
2823 SmallPtrSet<VPBlockBase *, 16> Reachable(
2824 llvm::from_range, vp_depth_first_shallow(G: Plan.getEntry()));
2825
2826 // Detach all unreachable blocks from their successors, removing their recipes
2827 // and incoming values from phi recipes.
2828 VPSymbolicValue Tmp(nullptr);
2829 for (VPBlockBase *B : AllBlocks) {
2830 if (Reachable.contains(Ptr: B))
2831 continue;
2832 for (VPBlockBase *Succ : to_vector(Range: B->successors())) {
2833 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Val: Succ))
2834 for (VPRecipeBase &R : SuccBB->phis())
2835 cast<VPPhiAccessors>(Val: &R)->removeIncomingValueFor(IncomingBlock: B);
2836 VPBlockUtils::disconnectBlocks(From: B, To: Succ);
2837 }
2838 for (VPBasicBlock *DeadBB :
2839 VPBlockUtils::blocksOnly<VPBasicBlock>(Range: vp_depth_first_deep(G: B))) {
2840 for (VPRecipeBase &R : make_early_inc_range(Range&: *DeadBB)) {
2841 for (VPValue *Def : R.definedValues())
2842 Def->replaceAllUsesWith(New: &Tmp);
2843 R.eraseFromParent();
2844 }
2845 }
2846 }
2847 return SimplifiedPhi;
2848}
2849
2850void VPlanTransforms::optimize(VPlan &Plan) {
2851 RUN_VPLAN_PASS(removeRedundantInductionCasts, Plan);
2852
2853 RUN_VPLAN_PASS(reassociateHeaderMask, Plan);
2854 RUN_VPLAN_PASS(simplifyRecipes, Plan);
2855 RUN_VPLAN_PASS(removeDeadRecipes, Plan);
2856 RUN_VPLAN_PASS(simplifyBlends, Plan);
2857 RUN_VPLAN_PASS(legalizeAndOptimizeInductions, Plan);
2858 RUN_VPLAN_PASS(narrowToSingleScalarRecipes, Plan);
2859 RUN_VPLAN_PASS(removeRedundantExpandSCEVRecipes, Plan);
2860 RUN_VPLAN_PASS(reassociateHeaderMask, Plan);
2861 RUN_VPLAN_PASS(simplifyRecipes, Plan);
2862 RUN_VPLAN_PASS(removeBranchOnConst, Plan, /*OnlyLatches=*/false);
2863 RUN_VPLAN_PASS(simplifyReverses, Plan);
2864 RUN_VPLAN_PASS(removeDeadRecipes, Plan);
2865
2866 RUN_VPLAN_PASS(createAndOptimizeReplicateRegions, Plan);
2867 RUN_VPLAN_PASS(mergeBlocksIntoPredecessors, Plan);
2868 RUN_VPLAN_PASS(licm, Plan);
2869}
2870
2871// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2872// the loop terminator with a branch-on-cond recipe with the negated
2873// active-lane-mask as operand. Note that this turns the loop into an
2874// uncountable one. Only the existing terminator is replaced, all other existing
2875// recipes/users remain unchanged, except for poison-generating flags being
2876// dropped from the canonical IV increment. Return the created
2877// VPActiveLaneMaskPHIRecipe.
2878//
2879// The function adds the following recipes:
2880//
2881// vector.ph:
2882// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2883// %EntryALM = active-lane-mask %EntryInc, TC
2884//
2885// vector.body:
2886// ...
2887// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2888// ...
2889// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2890// %ALM = active-lane-mask %InLoopInc, TC
2891// %Negated = Not %ALM
2892// branch-on-cond %Negated
2893//
2894static VPActiveLaneMaskPHIRecipe *
2895addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan) {
2896 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2897 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2898 VPValue *StartV = Plan.getZero(Ty: TopRegion->getCanonicalIVType());
2899 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2900 // TODO: Check if dropping the flags is needed.
2901 TopRegion->clearCanonicalIVNUW(Increment: CanonicalIVIncrement);
2902 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2903 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2904 // we have to take unrolling into account. Each part needs to start at
2905 // Part * VF
2906 auto *VecPreheader = Plan.getVectorPreheader();
2907 VPBuilder Builder(VecPreheader);
2908
2909 // Create the ActiveLaneMask instruction using the correct start values.
2910 VPValue *TC = Plan.getTripCount();
2911 VPValue *VF = &Plan.getVF();
2912
2913 auto *EntryIncrement = Builder.createOverflowingOp(
2914 Opcode: VPInstruction::CanonicalIVIncrementForPart, Operands: {StartV, VF}, WrapFlags: {false, false},
2915 DL, Name: "index.part.next");
2916
2917 // Create the active lane mask instruction in the VPlan preheader.
2918 VPValue *ALMMultiplier =
2919 Plan.getConstantInt(Ty: TopRegion->getCanonicalIVType(), Val: 1);
2920 auto *EntryALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2921 Operands: {EntryIncrement, TC, ALMMultiplier}, DL,
2922 Name: "active.lane.mask.entry");
2923
2924 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2925 // preheader ActiveLaneMask instruction.
2926 auto *LaneMaskPhi =
2927 new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc::getUnknown());
2928 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
2929 LaneMaskPhi->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->begin());
2930
2931 // Create the active lane mask for the next iteration of the loop before the
2932 // original terminator.
2933 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2934 Builder.setInsertPoint(OriginalTerminator);
2935 auto *InLoopIncrement = Builder.createOverflowingOp(
2936 Opcode: VPInstruction::CanonicalIVIncrementForPart,
2937 Operands: {CanonicalIVIncrement, &Plan.getVF()}, WrapFlags: {false, false}, DL);
2938 auto *ALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2939 Operands: {InLoopIncrement, TC, ALMMultiplier}, DL,
2940 Name: "active.lane.mask.next");
2941 LaneMaskPhi->addBackedgeValue(V: ALM);
2942
2943 // Replace the original terminator with BranchOnCond. We have to invert the
2944 // mask here because a true condition means jumping to the exit block.
2945 auto *NotMask = Builder.createNot(Operand: ALM, DL);
2946 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {NotMask}, DL);
2947 OriginalTerminator->eraseFromParent();
2948 return LaneMaskPhi;
2949}
2950
2951void VPlanTransforms::addActiveLaneMask(VPlan &Plan,
2952 bool UseActiveLaneMaskForControlFlow) {
2953 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2954 auto *WideCanonicalIV =
2955 findUserOf<VPWidenCanonicalIVRecipe>(V: LoopRegion->getCanonicalIV());
2956 assert(WideCanonicalIV &&
2957 "Must have widened canonical IV when tail folding!");
2958 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2959 VPSingleDefRecipe *LaneMask;
2960 if (UseActiveLaneMaskForControlFlow) {
2961 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2962 } else {
2963 VPBuilder B = VPBuilder::getToInsertAfter(R: WideCanonicalIV);
2964 VPValue *ALMMultiplier =
2965 Plan.getConstantInt(Ty: LoopRegion->getCanonicalIVType(), Val: 1);
2966 LaneMask =
2967 B.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2968 Operands: {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2969 DL: nullptr, Name: "active.lane.mask");
2970 }
2971
2972 // Walk users of WideCanonicalIV and replace the header mask of the form
2973 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2974 // removing the old one to ensure there is always only a single header mask.
2975 HeaderMask->replaceAllUsesWith(New: LaneMask);
2976 HeaderMask->eraseFromParent();
2977}
2978
/// Matcher accepting either the specific mask \p In on its own, or
/// (logical-and In, X). On a match, \p Out is set to nullptr in the first case
/// and bound to X in the second.
template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
  Op0_t In;
  Op1_t &Out;

  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}

  template <typename OpTy> bool match(OpTy *V) const {
    // V is not exactly In: it has to be a logical-and with In as its first
    // operand; the other operand is the remaining mask.
    const bool IsExactlyIn = m_Specific(In).match(V);
    if (!IsExactlyIn)
      return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);

    // V is exactly In: nothing remains.
    Out = nullptr;
    return true;
  }
};
2993
2994/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2995/// Returns the remaining part \p Out if so, or nullptr otherwise.
2996template <typename Op0_t, typename Op1_t>
2997static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2998 Op1_t &Out) {
2999 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3000}
3001
3002static std::optional<Intrinsic::ID> getVPDivRemIntrinsic(Intrinsic::ID IntrID) {
3003 switch (IntrID) {
3004 case Intrinsic::masked_udiv:
3005 return Intrinsic::vp_udiv;
3006 case Intrinsic::masked_sdiv:
3007 return Intrinsic::vp_sdiv;
3008 case Intrinsic::masked_urem:
3009 return Intrinsic::vp_urem;
3010 case Intrinsic::masked_srem:
3011 return Intrinsic::vp_srem;
3012 default:
3013 return std::nullopt;
3014 }
3015}
3016
3017/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3018/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3019/// recipe could be created.
3020/// \p HeaderMask Header Mask.
3021/// \p CurRecipe Recipe to be transform.
3022/// \p EVL The explicit vector length parameter of vector-predication
3023/// intrinsics.
3024static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
3025 VPRecipeBase &CurRecipe, VPValue &EVL) {
3026 VPlan *Plan = CurRecipe.getParent()->getPlan();
3027 DebugLoc DL = CurRecipe.getDebugLoc();
3028 VPValue *Addr, *Mask, *EndPtr;
3029
3030 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3031 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3032 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(Val: EndPtr)->clone();
3033 EVLEndPtr->insertBefore(InsertPos: &CurRecipe);
3034 // Cast EVL (i32) to match the VF operand's type.
3035 VPValue *EVLAsVF = VPBuilder(EVLEndPtr).createScalarZExtOrTrunc(
3036 Op: &EVL, ResultTy: EVLEndPtr->getOperand(N: 1)->getScalarType(), SrcTy: EVL.getScalarType(),
3037 DL: DebugLoc::getUnknown());
3038 EVLEndPtr->setOperand(I: 1, New: EVLAsVF);
3039 return EVLEndPtr;
3040 };
3041
3042 auto GetVPReverse = [&CurRecipe, &EVL, Plan,
3043 DL](VPValue *V) -> VPWidenIntrinsicRecipe * {
3044 if (!V)
3045 return nullptr;
3046 auto *Reverse = new VPWidenIntrinsicRecipe(
3047 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
3048 V->getScalarType(), {}, {}, DL);
3049 Reverse->insertBefore(InsertPos: &CurRecipe);
3050 return Reverse;
3051 };
3052
3053 if (match(V: &CurRecipe,
3054 P: m_MaskedLoad(Addr: m_VPValue(V&: Addr), Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))))
3055 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(Val&: CurRecipe), Addr,
3056 EVL, Mask);
3057
3058 if (match(V: &CurRecipe,
3059 P: m_MaskedLoad(Addr: m_VPValue(V&: EndPtr),
3060 Mask: m_Reverse(Op0: m_RemoveMask(In: HeaderMask, Out&: Mask)))) &&
3061 match(V: EndPtr, P: m_VecEndPtr(Op0: m_VPValue(), Op1: m_Specific(VPV: &Plan->getVF())))) {
3062 Mask = GetVPReverse(Mask);
3063 Addr = AdjustEndPtr(EndPtr);
3064 auto *LoadR = new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(Val&: CurRecipe),
3065 Addr, EVL, Mask);
3066 LoadR->insertBefore(InsertPos: &CurRecipe);
3067 VPValue *Poison = Plan->getPoison(Ty: LoadR->getScalarType());
3068 return new VPWidenIntrinsicRecipe(Intrinsic::vector_splice_left,
3069 {Poison, LoadR, &EVL},
3070 LoadR->getScalarType(), {}, {}, DL);
3071 }
3072
3073 VPValue *Stride;
3074 if (match(V: &CurRecipe, P: m_Intrinsic<Intrinsic::experimental_vp_strided_load>(
3075 Op0: m_VPValue(V&: Addr), Op1: m_VPValue(V&: Stride),
3076 Op2: m_RemoveMask(In: HeaderMask, Out&: Mask),
3077 Op3: m_TruncOrSelf(Op0: m_Specific(VPV: &Plan->getVF()))))) {
3078 if (!Mask)
3079 Mask = Plan->getTrue();
3080 auto *NewLoad = cast<VPWidenMemIntrinsicRecipe>(Val: &CurRecipe)->clone();
3081 NewLoad->setOperand(I: 2, New: Mask);
3082 NewLoad->setOperand(I: 3, New: &EVL);
3083 return NewLoad;
3084 }
3085
3086 VPValue *StoredVal;
3087 if (match(V: &CurRecipe, P: m_MaskedStore(Addr: m_VPValue(V&: Addr), Val: m_VPValue(V&: StoredVal),
3088 Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))))
3089 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(Val&: CurRecipe), Addr,
3090 StoredVal, EVL, Mask);
3091
3092 if (match(V: &CurRecipe,
3093 P: m_MaskedStore(Addr: m_VPValue(V&: EndPtr), Val: m_VPValue(V&: StoredVal),
3094 Mask: m_Reverse(Op0: m_RemoveMask(In: HeaderMask, Out&: Mask)))) &&
3095 match(V: EndPtr, P: m_VecEndPtr(Op0: m_VPValue(), Op1: m_Specific(VPV: &Plan->getVF())))) {
3096 Mask = GetVPReverse(Mask);
3097 Addr = AdjustEndPtr(EndPtr);
3098 VPValue *Poison = Plan->getPoison(Ty: StoredVal->getScalarType());
3099 auto *SpliceR = new VPWidenIntrinsicRecipe(
3100 Intrinsic::vector_splice_right, {StoredVal, Poison, &EVL},
3101 StoredVal->getScalarType(), {}, {}, DL);
3102 SpliceR->insertBefore(InsertPos: &CurRecipe);
3103 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(Val&: CurRecipe), Addr,
3104 SpliceR, EVL, Mask);
3105 }
3106
3107 if (auto *Rdx = dyn_cast<VPReductionRecipe>(Val: &CurRecipe))
3108 if (Rdx->isConditional() &&
3109 match(V: Rdx->getCondOp(), P: m_RemoveMask(In: HeaderMask, Out&: Mask)))
3110 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3111
3112 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(Val: &CurRecipe))
3113 if (Interleave->getMask() &&
3114 match(V: Interleave->getMask(), P: m_RemoveMask(In: HeaderMask, Out&: Mask)))
3115 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3116
3117 VPValue *LHS, *RHS;
3118 if (match(V: &CurRecipe, P: m_SelectLike(Op0: m_RemoveMask(In: HeaderMask, Out&: Mask),
3119 Op1: m_VPValue(V&: LHS), Op2: m_VPValue(V&: RHS))))
3120 return new VPWidenIntrinsicRecipe(
3121 Intrinsic::vp_merge, {Mask ? Mask : Plan->getTrue(), LHS, RHS, &EVL},
3122 LHS->getScalarType(), {}, {}, DL);
3123
3124 if (match(V: &CurRecipe, P: m_LastActiveLane(Op0: m_Specific(VPV: HeaderMask)))) {
3125 Type *Ty = CurRecipe.getVPSingleValue()->getScalarType();
3126 VPValue *ZExt =
3127 VPBuilder(&CurRecipe)
3128 .createScalarZExtOrTrunc(Op: &EVL, ResultTy: Ty, SrcTy: EVL.getScalarType(), DL);
3129 return new VPInstruction(
3130 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, Val: 1)},
3131 VPIRFlags::getDefaultFlags(Opcode: Instruction::Sub), {}, DL);
3132 }
3133
3134 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
3135 if (match(V: &CurRecipe,
3136 P: m_c_BinaryOr(Op0: m_VPValue(V&: LHS),
3137 Op1: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: RHS)))))
3138 return new VPWidenIntrinsicRecipe(Intrinsic::vp_merge,
3139 {RHS, Plan->getTrue(), LHS, &EVL},
3140 LHS->getScalarType(), {}, {}, DL);
3141
3142 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(Val: &CurRecipe))
3143 if (auto VPID = getVPDivRemIntrinsic(IntrID: IntrR->getVectorIntrinsicID()))
3144 if (match(V: IntrR->getOperand(N: 2), P: m_RemoveMask(In: HeaderMask, Out&: Mask)))
3145 return new VPWidenIntrinsicRecipe(*VPID,
3146 {IntrR->getOperand(N: 0),
3147 IntrR->getOperand(N: 1),
3148 Mask ? Mask : Plan->getTrue(), &EVL},
3149 IntrR->getScalarType(), {}, {}, DL);
3150
3151 return nullptr;
3152}
3153
3154/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3155/// The transforms here need to preserve the original semantics.
3156void VPlanTransforms::optimizeEVLMasks(VPlan &Plan) {
3157 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3158 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3159 for (VPRecipeBase &R : *Plan.getVectorLoopRegion()->getEntryBasicBlock()) {
3160 if (match(V: &R, P: m_SpecificICmp(MatchPred: CmpInst::ICMP_ULT, Op0: m_StepVector(),
3161 Op1: m_VPValue(V&: EVL))) &&
3162 match(V: EVL, P: m_EVL(Op0: m_VPValue()))) {
3163 HeaderMask = R.getVPSingleValue();
3164 break;
3165 }
3166 }
3167 if (!HeaderMask)
3168 return;
3169
3170 SmallVector<VPRecipeBase *> OldRecipes;
3171 for (VPUser *U : collectUsersRecursively(V: HeaderMask)) {
3172 VPRecipeBase *R = cast<VPRecipeBase>(Val: U);
3173 if (auto *NewR = optimizeMaskToEVL(HeaderMask, CurRecipe&: *R, EVL&: *EVL)) {
3174 NewR->insertBefore(InsertPos: R);
3175 for (auto [Old, New] :
3176 zip_equal(t: R->definedValues(), u: NewR->definedValues()))
3177 Old->replaceAllUsesWith(New);
3178 OldRecipes.push_back(Elt: R);
3179 }
3180 }
3181
3182 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3183 // False, EVL)
3184 for (VPUser *U : collectUsersRecursively(V: HeaderMask)) {
3185 VPValue *Mask;
3186 if (match(U, P: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: Mask)))) {
3187 auto *LogicalAnd = cast<VPInstruction>(Val: U);
3188 auto *Merge = new VPWidenIntrinsicRecipe(
3189 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3190 Mask->getScalarType(), {}, {}, LogicalAnd->getDebugLoc());
3191 Merge->insertBefore(InsertPos: LogicalAnd);
3192 LogicalAnd->replaceAllUsesWith(New: Merge);
3193 OldRecipes.push_back(Elt: LogicalAnd);
3194 }
3195 }
3196
3197 // Fold the following splice patterns:
3198 // splice.right(splice.left(poison, x, evl), poison, evl) -> x
3199 // vector.reverse(splice.left(poison, x, evl)) -> vp.reverse(x, true, evl)
3200 // splice.right(vector.reverse(x), poison, evl) -> vp.reverse(x, true, evl)
3201 for (VPUser *U : collectUsersRecursively(V: EVL)) {
3202 auto *R = cast<VPRecipeBase>(Val: U);
3203 VPValue *X;
3204 if (match(U, P: m_Intrinsic<Intrinsic::vector_splice_right>(
3205 Op0: m_Intrinsic<Intrinsic::vector_splice_left>(
3206 Op0: m_Poison(), Op1: m_VPValue(V&: X), Op2: m_Specific(VPV: EVL)),
3207 Op1: m_Poison(), Op2: m_Specific(VPV: EVL)))) {
3208 R->getVPSingleValue()->replaceAllUsesWith(New: X);
3209 OldRecipes.push_back(Elt: R);
3210 continue;
3211 }
3212
3213 if (!match(U,
3214 P: m_CombineOr(
3215 Ps: m_Reverse(Op0: m_Intrinsic<Intrinsic::vector_splice_left>(
3216 Op0: m_Poison(), Op1: m_VPValue(V&: X), Op2: m_Specific(VPV: EVL))),
3217 Ps: m_Intrinsic<Intrinsic::vector_splice_right>(
3218 Op0: m_Reverse(Op0: m_VPValue(V&: X)), Op1: m_Poison(), Op2: m_Specific(VPV: EVL)))))
3219 continue;
3220
3221 auto *VPReverse = new VPWidenIntrinsicRecipe(
3222 Intrinsic::experimental_vp_reverse, {X, Plan.getTrue(), EVL},
3223 X->getScalarType(), {}, {}, R->getDebugLoc());
3224 VPReverse->insertBefore(InsertPos: R);
3225 R->getVPSingleValue()->replaceAllUsesWith(New: VPReverse);
3226 OldRecipes.push_back(Elt: R);
3227 }
3228
3229 for (VPRecipeBase *R : reverse(C&: OldRecipes)) {
3230 SmallVector<VPValue *> PossiblyDead(R->operands());
3231 R->eraseFromParent();
3232 for (VPValue *Op : PossiblyDead)
3233 recursivelyDeleteDeadRecipes(V: Op);
3234 }
3235}
3236
3237/// After replacing the canonical IV with a EVL-based IV, fixup recipes that use
3238/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3239/// iteration.
3240static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3241 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3242 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3243
3244 // EVL is i32 but VF/VFxUF are IdxTy. Convert as needed.
3245 VPValue *EVLAsIdx =
3246 VPBuilder::getToInsertAfter(R: EVL.getDefiningRecipe())
3247 .createScalarZExtOrTrunc(Op: &EVL, ResultTy: Plan.getVF().getScalarType(),
3248 SrcTy: EVL.getScalarType(), DL: DebugLoc::getUnknown());
3249
3250 assert(all_of(Plan.getVF().users(),
3251 [&Plan](VPUser *U) {
3252 auto IsAllowedUser =
3253 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
3254 VPWidenIntOrFpInductionRecipe,
3255 VPWidenMemIntrinsicRecipe>;
3256 if (match(U, m_Trunc(m_Specific(&Plan.getVF()))))
3257 return all_of(cast<VPSingleDefRecipe>(U)->users(),
3258 IsAllowedUser);
3259 return IsAllowedUser(U);
3260 }) &&
3261 "User of VF that we can't transform to EVL.");
3262 Plan.getVF().replaceUsesWithIf(New: EVLAsIdx, ShouldReplace: [](VPUser &U, unsigned Idx) {
3263 return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(Val: U);
3264 });
3265
3266 assert(all_of(Plan.getVFxUF().users(),
3267 match_fn(m_CombineOr(
3268 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3269 m_Specific(&Plan.getVFxUF())),
3270 m_Isa<VPWidenPointerInductionRecipe>()))) &&
3271 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3272 "increment of the canonical induction.");
3273 Plan.getVFxUF().replaceUsesWithIf(New: EVLAsIdx, ShouldReplace: [](VPUser &U, unsigned Idx) {
3274 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3275 // canonical induction must not be updated.
3276 return isa<VPWidenPointerInductionRecipe>(Val: U);
3277 });
3278
3279 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3280 // contained.
3281 bool ContainsFORs =
3282 any_of(Range: Header->phis(), P: IsaPred<VPFirstOrderRecurrencePHIRecipe>);
3283 if (ContainsFORs) {
3284 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3285 VPValue *MaxEVL = &Plan.getVF();
3286 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3287 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3288 MaxEVL = Builder.createScalarZExtOrTrunc(
3289 Op: MaxEVL, ResultTy: Type::getInt32Ty(C&: Plan.getContext()), SrcTy: MaxEVL->getScalarType(),
3290 DL: DebugLoc::getUnknown());
3291
3292 Builder.setInsertPoint(TheBB: Header, IP: Header->getFirstNonPhi());
3293 VPValue *PrevEVL = Builder.createScalarPhi(
3294 IncomingValues: {MaxEVL, &EVL}, DL: DebugLoc::getUnknown(), Name: "prev.evl");
3295
3296 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3297 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
3298 for (VPRecipeBase &R : *VPBB) {
3299 VPValue *V1, *V2;
3300 if (!match(V: &R,
3301 P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
3302 Ops: m_VPValue(V&: V1), Ops: m_VPValue(V&: V2))))
3303 continue;
3304 VPValue *Imm = Plan.getOrAddLiveIn(
3305 V: ConstantInt::getSigned(Ty: Type::getInt32Ty(C&: Plan.getContext()), V: -1));
3306 VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
3307 Intrinsic::experimental_vp_splice,
3308 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3309 R.getVPSingleValue()->getScalarType(), {}, {}, R.getDebugLoc());
3310 VPSplice->insertBefore(InsertPos: &R);
3311 R.getVPSingleValue()->replaceAllUsesWith(New: VPSplice);
3312 }
3313 }
3314 }
3315
3316 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3317 if (!HeaderMask)
3318 return;
3319
3320 // Ensure that any reduction that uses a select to mask off tail lanes does so
3321 // in the vector loop, not the middle block, since EVL tail folding can have
3322 // tail elements in the penultimate iteration.
3323 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3324 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3325 m_VPValue(), m_VPValue()))))
3326 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3327 Plan.getVectorLoopRegion();
3328 return true;
3329 }));
3330
3331 // Replace header masks with a mask equivalent to predicating by EVL:
3332 //
3333 // icmp ule widen-canonical-iv backedge-taken-count
3334 // ->
3335 // icmp ult step-vector, EVL
3336 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3337 VPBuilder Builder(EVLR->getParent(), std::next(x: EVLR->getIterator()));
3338 Type *EVLType = EVL.getScalarType();
3339 VPValue *EVLMask = Builder.createICmp(
3340 Pred: CmpInst::ICMP_ULT,
3341 A: Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: EVLType), B: &EVL);
3342 HeaderMask->replaceAllUsesWith(New: EVLMask);
3343}
3344
3345/// Converts a tail folded vector loop region to step by
3346/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3347/// iteration.
3348///
3349/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3350/// replaces all uses of the canonical IV except for the canonical IV
3351/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3352/// only for loop iterations counting after this transformation.
3353///
3354/// - The header mask is replaced with a header mask based on the EVL.
3355///
3356/// - Plans with FORs have a new phi added to keep track of the EVL of the
3357/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3358/// @llvm.vp.splice.
3359///
3360/// The function uses the following definitions:
3361/// %StartV is the canonical induction start value.
3362///
3363/// The function adds the following recipes:
3364///
3365/// vector.ph:
3366/// ...
3367///
3368/// vector.body:
3369/// ...
3370/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3371/// [ %NextIter, %vector.body ]
3372/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3373/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3374/// ...
3375/// %OpEVL = cast i32 %VPEVL to IVSize
3376/// %NextIter = add IVSize %OpEVL, %CurrentIter
3377/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3378/// ...
3379///
3380/// If MaxSafeElements is provided, the function adds the following recipes:
3381/// vector.ph:
3382/// ...
3383///
3384/// vector.body:
3385/// ...
3386/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3387/// [ %NextIter, %vector.body ]
3388/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3389/// %cmp = cmp ult %AVL, MaxSafeElements
3390/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3391/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3392/// ...
3393/// %OpEVL = cast i32 %VPEVL to IVSize
3394/// %NextIter = add IVSize %OpEVL, %CurrentIter
3395/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3396/// ...
3397///
3398void VPlanTransforms::addExplicitVectorLength(
3399 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3400 if (Plan.hasScalarVFOnly())
3401 return;
3402 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3403 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3404
3405 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3406 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3407 VPValue *StartV = Plan.getZero(Ty: CanIVTy);
3408 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3409
3410 // Create the CurrentIteration recipe in the vector loop.
3411 auto *CurrentIteration =
3412 new VPCurrentIterationPHIRecipe(StartV, DebugLoc::getUnknown());
3413 CurrentIteration->insertBefore(BB&: *Header, IP: Header->begin());
3414 VPBuilder Builder(Header, Header->getFirstNonPhi());
3415 // Create the AVL (application vector length), starting from TC -> 0 in steps
3416 // of EVL.
3417 VPPhi *AVLPhi = Builder.createScalarPhi(
3418 IncomingValues: {Plan.getTripCount()}, DL: DebugLoc::getCompilerGenerated(), Name: "avl");
3419 VPValue *AVL = AVLPhi;
3420
3421 if (MaxSafeElements) {
3422 // Support for MaxSafeDist for correct loop emission.
3423 VPValue *AVLSafe = Plan.getConstantInt(Ty: CanIVTy, Val: *MaxSafeElements);
3424 VPValue *Cmp = Builder.createICmp(Pred: ICmpInst::ICMP_ULT, A: AVL, B: AVLSafe);
3425 AVL = Builder.createSelect(Cond: Cmp, TrueVal: AVL, FalseVal: AVLSafe, DL: DebugLoc::getUnknown(),
3426 Name: "safe_avl");
3427 }
3428 auto *VPEVL = Builder.createNaryOp(Opcode: VPInstruction::ExplicitVectorLength, Operands: AVL,
3429 DL: DebugLoc::getUnknown(), Name: "evl");
3430
3431 Builder.setInsertPoint(CanonicalIVIncrement);
3432 VPValue *OpVPEVL = VPEVL;
3433
3434 auto *I32Ty = Type::getInt32Ty(C&: Plan.getContext());
3435 OpVPEVL = Builder.createScalarZExtOrTrunc(
3436 Op: OpVPEVL, ResultTy: CanIVTy, SrcTy: I32Ty, DL: CanonicalIVIncrement->getDebugLoc());
3437
3438 auto *NextIter = Builder.createAdd(
3439 LHS: OpVPEVL, RHS: CurrentIteration, DL: CanonicalIVIncrement->getDebugLoc(),
3440 Name: "current.iteration.next", WrapFlags: CanonicalIVIncrement->getNoWrapFlags());
3441 CurrentIteration->addBackedgeValue(V: NextIter);
3442
3443 VPValue *NextAVL =
3444 Builder.createSub(LHS: AVLPhi, RHS: OpVPEVL, DL: DebugLoc::getCompilerGenerated(),
3445 Name: "avl.next", WrapFlags: {/*NUW=*/true, /*NSW=*/false});
3446 AVLPhi->addIncoming(IncomingV: NextAVL);
3447
3448 fixupVFUsersForEVL(Plan, EVL&: *VPEVL);
3449 removeDeadRecipes(Plan);
3450
3451 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3452 // except for the canonical IV increment.
3453 CanonicalIV->replaceAllUsesWith(New: CurrentIteration);
3454 CanonicalIVIncrement->setOperand(I: 0, New: CanonicalIV);
3455 // TODO: support unroll factor > 1.
3456 Plan.setUF(1);
3457}
3458
3459void VPlanTransforms::convertToVariableLengthStep(VPlan &Plan) {
3460 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3461 // There should be only one VPCurrentIteration in the entire plan.
3462 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3463
3464 for (VPBasicBlock *VPBB : VPBlockUtils::blocksAs<VPBasicBlock>(
3465 Range: vp_depth_first_shallow(G: Plan.getEntry())))
3466 for (VPRecipeBase &R : VPBB->phis())
3467 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(Val: &R)) {
3468 assert(!CurrentIteration &&
3469 "Found multiple CurrentIteration. Only one expected");
3470 CurrentIteration = PhiR;
3471 }
3472
3473 // Early return if it is not variable-length stepping.
3474 if (!CurrentIteration)
3475 return;
3476
3477 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3478 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3479
3480 // Convert CurrentIteration to concrete recipe.
3481 auto *ScalarR =
3482 VPBuilder(CurrentIteration)
3483 .createScalarPhi(
3484 IncomingValues: {CurrentIteration->getStartValue(), CurrentIterationIncr},
3485 DL: CurrentIteration->getDebugLoc(), Name: "current.iteration.iv");
3486 CurrentIteration->replaceAllUsesWith(New: ScalarR);
3487 CurrentIteration->eraseFromParent();
3488
3489 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
3490 auto *CanonicalIV = cast<VPPhi>(Val: &*HeaderVPBB->begin());
3491 if (auto *CanIVInc = findUserOf(
3492 V: CanonicalIV, P: m_c_Add(Op0: m_VPValue(), Op1: m_Specific(VPV: &Plan.getVFxUF())))) {
3493 cast<VPInstruction>(Val: CanIVInc)->replaceAllUsesWith(New: CurrentIterationIncr);
3494 CanIVInc->eraseFromParent();
3495 }
3496}
3497
3498void VPlanTransforms::convertEVLExitCond(VPlan &Plan) {
3499 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3500 if (!LoopRegion)
3501 return;
3502 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3503 if (Header->empty())
3504 return;
3505 // The EVL IV is always at the beginning.
3506 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(Val: &Header->front());
3507 if (!EVLPhi)
3508 return;
3509
3510 // Bail if not an EVL tail folded loop.
3511 VPValue *AVL;
3512 if (!match(V: EVLPhi->getBackedgeValue(),
3513 P: m_c_Add(Op0: m_ZExtOrSelf(Op0: m_EVL(Op0: m_VPValue(V&: AVL))), Op1: m_Specific(VPV: EVLPhi))))
3514 return;
3515
3516 // The AVL may be capped to a safe distance.
3517 VPValue *SafeAVL, *UnsafeAVL;
3518 if (match(V: AVL,
3519 P: m_Select(Op0: m_SpecificICmp(MatchPred: CmpInst::ICMP_ULT, Op0: m_VPValue(V&: UnsafeAVL),
3520 Op1: m_VPValue(V&: SafeAVL)),
3521 Op1: m_Deferred(V: UnsafeAVL), Op2: m_Deferred(V: SafeAVL))))
3522 AVL = UnsafeAVL;
3523
3524 VPValue *AVLNext;
3525 [[maybe_unused]] bool FoundAVLNext =
3526 match(V: AVL, P: m_VPInstruction<Instruction::PHI>(
3527 Ops: m_Specific(VPV: Plan.getTripCount()), Ops: m_VPValue(V&: AVLNext)));
3528 assert(FoundAVLNext && "Didn't find AVL backedge?");
3529
3530 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3531 auto *LatchBr = cast<VPInstruction>(Val: Latch->getTerminator());
3532 if (match(V: LatchBr, P: m_BranchOnCond(Op0: m_True())))
3533 return;
3534
3535 VPValue *CanIVInc;
3536 [[maybe_unused]] bool FoundIncrement = match(
3537 V: LatchBr,
3538 P: m_BranchOnCond(Op0: m_SpecificCmp(MatchPred: CmpInst::ICMP_EQ, Op0: m_VPValue(V&: CanIVInc),
3539 Op1: m_Specific(VPV: &Plan.getVectorTripCount()))));
3540 assert(FoundIncrement &&
3541 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3542 m_Specific(&Plan.getVFxUF()))) &&
3543 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3544 "trip count");
3545
3546 Type *AVLTy = AVLNext->getScalarType();
3547 VPBuilder Builder(LatchBr);
3548 LatchBr->setOperand(
3549 I: 0, New: Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: AVLNext, B: Plan.getZero(Ty: AVLTy)));
3550}
3551
3552void VPlanTransforms::replaceSymbolicStrides(
3553 VPlan &Plan, PredicatedScalarEvolution &PSE,
3554 const DenseMap<Value *, const SCEV *> &StridesMap,
3555 const VPDominatorTree &VPDT) {
3556 // Replace VPValues for known constant strides guaranteed by predicated scalar
3557 // evolution that are guaranteed to be guarded by the runtime checks; that is,
3558 // blocks dominated by the vector preheader.
3559 assert(!Plan.getVectorLoopRegion() &&
3560 "expected to run before loop regions are created");
3561 VPBlockBase *Preheader = Plan.getEntry()->getSuccessors()[1];
3562 auto CanUseVersionedStride = [&VPDT, Preheader](VPUser &U, unsigned) {
3563 auto *R = cast<VPRecipeBase>(Val: &U);
3564 VPBlockBase *Parent = R->getParent();
3565 return VPDT.dominates(A: Preheader, B: Parent);
3566 };
3567 ValueToSCEVMapTy RewriteMap;
3568 for (const SCEV *Stride : StridesMap.values()) {
3569 using namespace SCEVPatternMatch;
3570 auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue();
3571 const APInt *StrideConst;
3572 if (!match(S: PSE.getSCEV(V: StrideV), P: m_scev_APInt(C&: StrideConst)))
3573 // Only handle constant strides for now.
3574 continue;
3575
3576 auto *CI = Plan.getConstantInt(Val: *StrideConst);
3577 if (VPValue *StrideVPV = Plan.getLiveIn(V: StrideV))
3578 StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
3579
3580 // The versioned value may not be used in the loop directly but through a
3581 // sext/zext. Add new live-ins in those cases.
3582 for (Value *U : StrideV->users()) {
3583 if (!isa<SExtInst, ZExtInst>(Val: U))
3584 continue;
3585 VPValue *StrideVPV = Plan.getLiveIn(V: U);
3586 if (!StrideVPV)
3587 continue;
3588 unsigned BW = U->getType()->getScalarSizeInBits();
3589 APInt C =
3590 isa<SExtInst>(Val: U) ? StrideConst->sext(width: BW) : StrideConst->zext(width: BW);
3591 VPValue *CI = Plan.getConstantInt(Val: C);
3592 StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
3593 }
3594 RewriteMap[StrideV] = PSE.getSCEV(V: StrideV);
3595 }
3596
3597 for (VPRecipeBase &R : *Plan.getEntry()) {
3598 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
3599 if (!ExpSCEV)
3600 continue;
3601 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3602 auto *NewSCEV =
3603 SCEVParameterRewriter::rewrite(Scev: ScevExpr, SE&: *PSE.getSE(), Map&: RewriteMap);
3604 if (NewSCEV != ScevExpr) {
3605 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: NewSCEV);
3606 ExpSCEV->replaceAllUsesWith(New: NewExp);
3607 if (Plan.getTripCount() == ExpSCEV)
3608 Plan.resetTripCount(NewTripCount: NewExp);
3609 }
3610 }
3611}
3612
3613void VPlanTransforms::dropPoisonGeneratingRecipes(VPlan &Plan) {
3614 // Collect recipes in the backward slice of `Root` that may generate a poison
3615 // value that is used after vectorization.
3616 SmallPtrSet<VPRecipeBase *, 16> Visited;
3617 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3618 SmallVector<VPRecipeBase *, 16> Worklist;
3619 Worklist.push_back(Elt: Root);
3620
3621 // Traverse the backward slice of Root through its use-def chain.
3622 while (!Worklist.empty()) {
3623 VPRecipeBase *CurRec = Worklist.pop_back_val();
3624
3625 if (!Visited.insert(Ptr: CurRec).second)
3626 continue;
3627
3628 // Prune search if we find another recipe generating a widen memory
3629 // instruction. Widen memory instructions involved in address computation
3630 // will lead to gather/scatter instructions, which don't need to be
3631 // handled.
3632 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
3633 VPHeaderPHIRecipe>(Val: CurRec))
3634 continue;
3635
3636 // This recipe contributes to the address computation of a widen
3637 // load/store. If the underlying instruction has poison-generating flags,
3638 // drop them directly.
3639 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: CurRec)) {
3640 VPValue *A, *B;
3641 // Dropping disjoint from an OR may yield incorrect results, as some
3642 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3643 // for dependence analysis). Instead, replace it with an equivalent Add.
3644 // This is possible as all users of the disjoint OR only access lanes
3645 // where the operands are disjoint or poison otherwise.
3646 if (match(V: RecWithFlags, P: m_BinaryOr(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))) &&
3647 RecWithFlags->isDisjoint()) {
3648 VPBuilder Builder(RecWithFlags);
3649 VPInstruction *New =
3650 Builder.createAdd(LHS: A, RHS: B, DL: RecWithFlags->getDebugLoc());
3651 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3652 RecWithFlags->replaceAllUsesWith(New);
3653 RecWithFlags->eraseFromParent();
3654 CurRec = New;
3655 } else
3656 RecWithFlags->dropPoisonGeneratingFlags();
3657 } else {
3658 Instruction *Instr = dyn_cast_or_null<Instruction>(
3659 Val: CurRec->getVPSingleValue()->getUnderlyingValue());
3660 (void)Instr;
3661 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3662 "found instruction with poison generating flags not covered by "
3663 "VPRecipeWithIRFlags");
3664 }
3665
3666 // Add new definitions to the worklist.
3667 for (VPValue *Operand : CurRec->operands())
3668 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3669 Worklist.push_back(Elt: OpDef);
3670 }
3671 });
3672
3673 // We want to exclude the tail folding case, as we don't need to drop flags
3674 // for operations computing the first lane in this case: the first lane of the
3675 // header mask must always be true.
3676 auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
3677 return Mask && !vputils::isHeaderMask(V: Mask, Plan);
3678 };
3679
3680 // Traverse all the recipes in the VPlan and collect the poison-generating
3681 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3682 // VPInterleaveRecipe.
3683 auto Iter =
3684 vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntryBasicBlock());
3685 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: Iter)) {
3686 for (VPRecipeBase &Recipe : *VPBB) {
3687 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(Val: &Recipe)) {
3688 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3689 if (AddrDef && WidenRec->isConsecutive() &&
3690 IsNotHeaderMask(WidenRec->getMask()))
3691 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3692 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(Val: &Recipe)) {
3693 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3694 if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
3695 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3696 }
3697 }
3698 }
3699}
3700
3701void VPlanTransforms::createInterleaveGroups(
3702 VPlan &Plan,
3703 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3704 &InterleaveGroups,
3705 const bool &EpilogueAllowed) {
3706 if (InterleaveGroups.empty())
3707 return;
3708
3709 DenseMap<Instruction *, VPWidenMemoryRecipe *> IRMemberToRecipe;
3710 for (VPBasicBlock *VPBB :
3711 VPBlockUtils::blocksOnly<VPBasicBlock>(Range: vp_depth_first_shallow(
3712 G: Plan.getVectorLoopRegion()->getEntryBasicBlock())))
3713 for (VPRecipeBase &R : make_filter_range(Range&: *VPBB, Pred: [](VPRecipeBase &R) {
3714 return isa<VPWidenMemoryRecipe>(Val: &R);
3715 })) {
3716 auto *MemR = cast<VPWidenMemoryRecipe>(Val: &R);
3717 IRMemberToRecipe[&MemR->getIngredient()] = MemR;
3718 }
3719
3720 // Interleave memory: for each Interleave Group we marked earlier as relevant
3721 // for this VPlan, replace the Recipes widening its memory instructions with a
3722 // single VPInterleaveRecipe at its insertion point.
3723 VPDominatorTree VPDT(Plan);
3724 for (const auto *IG : InterleaveGroups) {
3725 // Skip interleave groups where members don't have recipes. This can happen
3726 // when removeDeadRecipes removes recipes that are part of interleave groups
3727 // but have no users.
3728 if (llvm::any_of(Range: IG->members(), P: [&IRMemberToRecipe](Instruction *Member) {
3729 return !IRMemberToRecipe.contains(Val: Member);
3730 }))
3731 continue;
3732
3733 auto *Start = IRMemberToRecipe.lookup(Val: IG->getMember(Index: 0));
3734 VPIRMetadata InterleaveMD(*Start);
3735 SmallVector<VPValue *, 4> StoredValues;
3736 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Val: Start->getAsRecipe()))
3737 StoredValues.push_back(Elt: StoreR->getStoredValue());
3738 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3739 Instruction *MemberI = IG->getMember(Index: I);
3740 if (!MemberI)
3741 continue;
3742 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(Val: MemberI);
3743 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Val: MemoryR->getAsRecipe()))
3744 StoredValues.push_back(Elt: StoreR->getStoredValue());
3745 InterleaveMD.intersect(MD: *MemoryR);
3746 }
3747
3748 bool NeedsMaskForGaps =
3749 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3750 (!StoredValues.empty() && !IG->isFull());
3751
3752 Instruction *IRInsertPos = IG->getInsertPos();
3753 auto *InsertPos = IRMemberToRecipe.lookup(Val: IRInsertPos);
3754 VPRecipeBase *InsertPosR = InsertPos->getAsRecipe();
3755
3756 GEPNoWrapFlags NW = GEPNoWrapFlags::none();
3757 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3758 Val: getLoadStorePointerOperand(V: IRInsertPos)->stripPointerCasts()))
3759 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3760
3761 // Get or create the start address for the interleave group.
3762 VPValue *Addr = Start->getAddr();
3763 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3764 if (AddrDef && !VPDT.properlyDominates(A: AddrDef, B: InsertPosR)) {
3765 // We cannot re-use the address of member zero because it does not
3766 // dominate the insert position. Instead, use the address of the insert
3767 // position and create a PtrAdd adjusting it to the address of member
3768 // zero.
3769 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3770 // InsertPos or sink loads above zero members to join it.
3771 assert(IG->getIndex(IRInsertPos) != 0 &&
3772 "index of insert position shouldn't be zero");
3773 auto &DL = IRInsertPos->getDataLayout();
3774 APInt Offset(32,
3775 DL.getTypeAllocSize(Ty: getLoadStoreType(I: IRInsertPos)) *
3776 IG->getIndex(Instr: IRInsertPos),
3777 /*IsSigned=*/true);
3778 VPValue *OffsetVPV = Plan.getConstantInt(Val: -Offset);
3779 VPBuilder B(InsertPosR);
3780 Addr = B.createNoWrapPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV, GEPFlags: NW);
3781 }
3782 // If the group is reverse, adjust the index to refer to the last vector
3783 // lane instead of the first. We adjust the index from the first vector
3784 // lane, rather than directly getting the pointer for lane VF - 1, because
3785 // the pointer operand of the interleaved access is supposed to be uniform.
3786 if (IG->isReverse()) {
3787 auto *ReversePtr = new VPVectorEndPointerRecipe(
3788 Addr, &Plan.getVF(), getLoadStoreType(I: IRInsertPos),
3789 -(int64_t)IG->getFactor(), NW, InsertPosR->getDebugLoc());
3790 ReversePtr->insertBefore(InsertPos: InsertPosR);
3791 Addr = ReversePtr;
3792 }
3793 auto *VPIG = new VPInterleaveRecipe(
3794 IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps,
3795 InterleaveMD, InsertPosR->getDebugLoc());
3796 VPIG->insertBefore(InsertPos: InsertPosR);
3797
3798 unsigned J = 0;
3799 for (unsigned i = 0; i < IG->getFactor(); ++i)
3800 if (Instruction *Member = IG->getMember(Index: i)) {
3801 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Val: Member)->getAsRecipe();
3802 if (!Member->getType()->isVoidTy()) {
3803 VPValue *OriginalV = MemberR->getVPSingleValue();
3804 OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J));
3805 J++;
3806 }
3807 MemberR->eraseFromParent();
3808 }
3809 }
3810}
3811
3812/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3813/// value, phi and backedge value. In the following example:
3814///
3815/// vector.ph:
3816/// Successor(s): vector loop
3817///
3818/// <x1> vector loop: {
3819/// vector.body:
3820/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3821/// ...
3822/// EMIT branch-on-count ...
3823/// No successors
3824/// }
3825///
3826/// WIDEN-INDUCTION will get expanded to:
3827///
3828/// vector.ph:
3829/// ...
3830/// vp<%induction.start> = ...
3831/// vp<%induction.increment> = ...
3832///
3833/// Successor(s): vector loop
3834///
3835/// <x1> vector loop: {
3836/// vector.body:
3837/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3838/// ...
3839/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3840/// EMIT branch-on-count ...
3841/// No successors
3842/// }
3843static void
3844expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR) {
3845 VPlan *Plan = WidenIVR->getParent()->getPlan();
3846 VPValue *Start = WidenIVR->getStartValue();
3847 VPValue *Step = WidenIVR->getStepValue();
3848 VPValue *VF = WidenIVR->getVFValue();
3849 DebugLoc DL = WidenIVR->getDebugLoc();
3850
3851 // The value from the original loop to which we are mapping the new induction
3852 // variable.
3853 Type *Ty = WidenIVR->getScalarType();
3854
3855 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3856 Instruction::BinaryOps AddOp;
3857 Instruction::BinaryOps MulOp;
3858 VPIRFlags Flags = *WidenIVR;
3859 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3860 AddOp = Instruction::Add;
3861 MulOp = Instruction::Mul;
3862 } else {
3863 AddOp = ID.getInductionOpcode();
3864 MulOp = Instruction::FMul;
3865 }
3866
3867 // If the phi is truncated, truncate the start and step values.
3868 VPBuilder Builder(Plan->getVectorPreheader());
3869 Type *StepTy = Step->getScalarType();
3870 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3871 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3872 Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy: Ty, DL);
3873 Start = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Start, ResultTy: Ty, DL);
3874 StepTy = Ty;
3875 }
3876
3877 // Construct the initial value of the vector IV in the vector loop preheader.
3878 Type *IVIntTy =
3879 IntegerType::get(C&: Plan->getContext(), NumBits: StepTy->getScalarSizeInBits());
3880 VPValue *Init = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: IVIntTy);
3881 if (StepTy->isFloatingPointTy())
3882 Init = Builder.createWidenCast(Opcode: Instruction::UIToFP, Op: Init, ResultTy: StepTy);
3883
3884 VPValue *SplatStart = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Start);
3885 VPValue *SplatStep = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Step);
3886
3887 Init = Builder.createNaryOp(Opcode: MulOp, Operands: {Init, SplatStep}, Flags);
3888 Init = Builder.createNaryOp(Opcode: AddOp, Operands: {SplatStart, Init}, Flags,
3889 DL: DebugLoc::getUnknown(), Name: "induction");
3890
3891 // Create the widened phi of the vector IV.
3892 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3893 IncomingValues: Init, DL: WidenIVR->getDebugLoc(), Name: "vec.ind");
3894
3895 // Create the backedge value for the vector IV.
3896 VPValue *Inc;
3897 VPValue *Prev;
3898 // If unrolled, use the increment and prev value from the operands.
3899 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3900 Inc = SplatVF;
3901 Prev = WidenIVR->getLastUnrolledPartOperand();
3902 } else {
3903 // Move the insertion point after the VF definition when the VF is defined
3904 // inside a loop, such as for EVL tail-folding.
3905 if (VPRecipeBase *R = VF->getDefiningRecipe())
3906 if (R->getParent()->getEnclosingLoopRegion())
3907 Builder.setInsertPoint(TheBB: R->getParent(), IP: std::next(x: R->getIterator()));
3908
3909 // Multiply the vectorization factor by the step using integer or
3910 // floating-point arithmetic as appropriate.
3911 if (StepTy->isFloatingPointTy())
3912 VF = Builder.createScalarCast(Opcode: Instruction::CastOps::UIToFP, Op: VF, ResultTy: StepTy,
3913 DL);
3914 else
3915 VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy, SrcTy: VF->getScalarType(), DL);
3916
3917 Inc = Builder.createNaryOp(Opcode: MulOp, Operands: {Step, VF}, Flags);
3918 Inc = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Inc);
3919 Prev = WidePHI;
3920 }
3921
3922 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3923 Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator());
3924 auto *Next = Builder.createNaryOp(Opcode: AddOp, Operands: {Prev, Inc}, Flags,
3925 DL: WidenIVR->getDebugLoc(), Name: "vec.ind.next");
3926
3927 WidePHI->addIncoming(IncomingV: Next);
3928
3929 WidenIVR->replaceAllUsesWith(New: WidePHI);
3930}
3931
3932/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3933/// initial value, phi and backedge value. In the following example:
3934///
3935/// <x1> vector loop: {
3936/// vector.body:
3937/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3938/// ...
3939/// EMIT branch-on-count ...
3940/// }
3941///
3942/// WIDEN-POINTER-INDUCTION will get expanded to:
3943///
3944/// <x1> vector loop: {
3945/// vector.body:
3946/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3947/// EMIT %mul = mul %stepvector, %step
3948/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3949/// ...
3950/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3951/// EMIT branch-on-count ...
3952/// }
3953static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R) {
3954 VPlan *Plan = R->getParent()->getPlan();
3955 VPValue *Start = R->getStartValue();
3956 VPValue *Step = R->getStepValue();
3957 VPValue *VF = R->getVFValue();
3958
3959 assert(R->getInductionDescriptor().getKind() ==
3960 InductionDescriptor::IK_PtrInduction &&
3961 "Not a pointer induction according to InductionDescriptor!");
3962 assert(R->getScalarType()->isPointerTy() && "Unexpected type.");
3963 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3964 "Recipe should have been replaced");
3965
3966 VPBuilder Builder(R);
3967 DebugLoc DL = R->getDebugLoc();
3968
3969 // Build a scalar pointer phi.
3970 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(IncomingValues: Start, DL, Name: "pointer.phi");
3971
3972 // Create actual address geps that use the pointer phi as base and a
3973 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3974 Builder.setInsertPoint(TheBB: R->getParent(), IP: R->getParent()->getFirstNonPhi());
3975 Type *StepTy = Step->getScalarType();
3976 VPValue *Offset = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: StepTy);
3977 Offset = Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {Offset, Step});
3978 VPValue *PtrAdd =
3979 Builder.createWidePtrAdd(Ptr: ScalarPtrPhi, Offset, DL, Name: "vector.gep");
3980 R->replaceAllUsesWith(New: PtrAdd);
3981
3982 // Create the backedge value for the scalar pointer phi.
3983 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3984 Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator());
3985 VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy, SrcTy: VF->getScalarType(), DL);
3986 VPValue *Inc = Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {Step, VF});
3987
3988 VPValue *InductionGEP =
3989 Builder.createPtrAdd(Ptr: ScalarPtrPhi, Offset: Inc, DL, Name: "ptr.ind");
3990 ScalarPtrPhi->addIncoming(IncomingV: InductionGEP);
3991}
3992
3993/// Expand a VPDerivedIVRecipe into executable recipes.
3994static void expandVPDerivedIV(VPDerivedIVRecipe *R) {
3995 VPBuilder Builder(R);
3996 VPIRValue *Start = R->getStartValue();
3997 VPValue *Step = R->getStepValue();
3998 VPValue *Index = R->getIndex();
3999 Type *StepTy = Step->getScalarType();
4000 Type *IndexTy = Index->getScalarType();
4001 Index = StepTy->isIntegerTy()
4002 ? Builder.createScalarSExtOrTrunc(
4003 Op: Index, ResultTy: StepTy, SrcTy: IndexTy, DL: DebugLoc::getCompilerGenerated())
4004 : Builder.createScalarCast(Opcode: Instruction::SIToFP, Op: Index, ResultTy: StepTy,
4005 DL: DebugLoc::getCompilerGenerated());
4006 switch (R->getInductionKind()) {
4007 case InductionDescriptor::IK_IntInduction: {
4008 assert(Index->getScalarType() == Start->getScalarType() &&
4009 "Index type does not match StartValue type");
4010 return R->replaceAllUsesWith(New: Builder.createAdd(
4011 LHS: Start, RHS: Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {Index, Step})));
4012 }
4013 case InductionDescriptor::IK_PtrInduction:
4014 return R->replaceAllUsesWith(New: Builder.createPtrAdd(
4015 Ptr: Start, Offset: Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {Index, Step})));
4016 case InductionDescriptor::IK_FpInduction: {
4017 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
4018 const FPMathOperator *FPBinOp = R->getFPBinOp();
4019 assert(FPBinOp &&
4020 (FPBinOp->getOpcode() == Instruction::FAdd ||
4021 FPBinOp->getOpcode() == Instruction::FSub) &&
4022 "Original BinOp should be defined for FP induction");
4023 FastMathFlags FMF = FPBinOp->getFastMathFlags();
4024 VPValue *FMul = Builder.createNaryOp(Opcode: Instruction::FMul, Operands: {Step, Index}, Flags: FMF);
4025 return R->replaceAllUsesWith(
4026 New: Builder.createNaryOp(Opcode: FPBinOp->getOpcode(), Operands: {Start, FMul}, Flags: FMF));
4027 }
4028 case InductionDescriptor::IK_NoInduction:
4029 return;
4030 }
4031 llvm_unreachable("Unhandled induction kind");
4032}
4033
4034void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
4035 // Replace loop regions with explicity CFG.
4036 SmallVector<VPRegionBlock *> LoopRegions;
4037 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
4038 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
4039 if (!R->isReplicator())
4040 LoopRegions.push_back(Elt: R);
4041 }
4042 for (VPRegionBlock *R : LoopRegions)
4043 R->dissolveToCFGLoop();
4044}
4045
4046void VPlanTransforms::expandBranchOnTwoConds(VPlan &Plan) {
4047 SmallVector<VPInstruction *> WorkList;
4048 // The transform runs after dissolving loop regions, so all VPBasicBlocks
4049 // terminated with BranchOnTwoConds are reached via a shallow traversal.
4050 for (VPBasicBlock *VPBB : VPBlockUtils::blocksAs<VPBasicBlock>(
4051 Range: vp_depth_first_shallow(G: Plan.getEntry()))) {
4052 if (!VPBB->empty() && match(V: &VPBB->back(), P: m_BranchOnTwoConds()))
4053 WorkList.push_back(Elt: cast<VPInstruction>(Val: &VPBB->back()));
4054 }
4055
4056 // Expand BranchOnTwoConds instructions into explicit CFG with two new
4057 // single-condition branches:
4058 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
4059 // the first condition is true, and otherwise jumps to a new interim block.
4060 // 2. A branch that ends the interim block, jumps to the second successor if
4061 // the second condition is true, and otherwise jumps to the third
4062 // successor.
4063 for (VPInstruction *Br : WorkList) {
4064 assert(Br->getNumOperands() == 2 &&
4065 "BranchOnTwoConds must have exactly 2 conditions");
4066 DebugLoc DL = Br->getDebugLoc();
4067 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
4068 const auto Successors = to_vector(Range&: BrOnTwoCondsBB->getSuccessors());
4069 assert(Successors.size() == 3 &&
4070 "BranchOnTwoConds must have exactly 3 successors");
4071
4072 for (VPBlockBase *Succ : Successors)
4073 VPBlockUtils::disconnectBlocks(From: BrOnTwoCondsBB, To: Succ);
4074
4075 VPValue *Cond0 = Br->getOperand(N: 0);
4076 VPValue *Cond1 = Br->getOperand(N: 1);
4077 VPBlockBase *Succ0 = Successors[0];
4078 VPBlockBase *Succ1 = Successors[1];
4079 VPBlockBase *Succ2 = Successors[2];
4080
4081 // If the successor block for both conditions is the same, then combine the
4082 // two conditions and plant a single conditional branch.
4083 if (Succ0 == Succ1) {
4084 VPBuilder Builder(Br);
4085 VPValue *Combined = Builder.createOr(LHS: Cond0, RHS: Cond1, DL);
4086 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Combined}, DL);
4087 VPBlockUtils::connectBlocks(From: BrOnTwoCondsBB, To: Succ0);
4088 VPBlockUtils::connectBlocks(From: BrOnTwoCondsBB, To: Succ2);
4089 Br->eraseFromParent();
4090 continue;
4091 }
4092
4093 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
4094 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
4095
4096 VPBasicBlock *InterimBB =
4097 Plan.createVPBasicBlock(Name: BrOnTwoCondsBB->getName() + ".interim");
4098
4099 VPBuilder(BrOnTwoCondsBB)
4100 .createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond0}, DL);
4101 VPBlockUtils::connectBlocks(From: BrOnTwoCondsBB, To: Succ0);
4102 VPBlockUtils::connectBlocks(From: BrOnTwoCondsBB, To: InterimBB);
4103
4104 VPBuilder(InterimBB).createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond1}, DL);
4105 VPBlockUtils::connectBlocks(From: InterimBB, To: Succ1);
4106 VPBlockUtils::connectBlocks(From: InterimBB, To: Succ2);
4107 Br->eraseFromParent();
4108 }
4109}
4110
4111void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
4112 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4113 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
4114 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
4115 VPBuilder Builder(&R);
4116 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R)) {
4117 expandVPWidenIntOrFpInduction(WidenIVR);
4118 WidenIVR->eraseFromParent();
4119 continue;
4120 }
4121
4122 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(Val: &R)) {
4123 // If the recipe only generates scalars, scalarize it instead of
4124 // expanding it.
4125 if (WidenIVR->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF())) {
4126 VPValue *PtrAdd =
4127 scalarizeVPWidenPointerInduction(PtrIV: WidenIVR, Plan, Builder);
4128 WidenIVR->replaceAllUsesWith(New: PtrAdd);
4129 WidenIVR->eraseFromParent();
4130 continue;
4131 }
4132 expandVPWidenPointerInduction(R: WidenIVR);
4133 WidenIVR->eraseFromParent();
4134 continue;
4135 }
4136
4137 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(Val: &R)) {
4138 expandVPDerivedIV(R: DerivedIVR);
4139 DerivedIVR->eraseFromParent();
4140 continue;
4141 }
4142
4143 if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(Val: &R)) {
4144 VPValue *CanIV = WideCanIV->getCanonicalIV();
4145 Type *CanIVTy = CanIV->getScalarType();
4146 VPValue *Step = WideCanIV->getStepValue();
4147 if (!Step) {
4148 assert(Plan.getConcreteUF() == 1 &&
4149 "Expected unroller to have materialized step for UF != 1");
4150 Step = Plan.getZero(Ty: CanIVTy);
4151 }
4152 CanIV = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: CanIV);
4153 Step = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Step);
4154 Step = Builder.createAdd(
4155 LHS: Step, RHS: Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: CanIVTy));
4156 VPValue *CanVecIV =
4157 Builder.createAdd(LHS: CanIV, RHS: Step, DL: WideCanIV->getDebugLoc(), Name: "vec.iv",
4158 WrapFlags: WideCanIV->getNoWrapFlags());
4159 WideCanIV->replaceAllUsesWith(New: CanVecIV);
4160 WideCanIV->eraseFromParent();
4161 continue;
4162 }
4163
4164 // Expand VPBlendRecipe into VPInstruction::Select.
4165 if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R)) {
4166 VPValue *Select = Blend->getIncomingValue(Idx: 0);
4167 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4168 Select = Builder.createSelect(Cond: Blend->getMask(Idx: I),
4169 TrueVal: Blend->getIncomingValue(Idx: I), FalseVal: Select,
4170 DL: R.getDebugLoc(), Name: "predphi", Flags: *Blend);
4171 Blend->replaceAllUsesWith(New: Select);
4172 Blend->eraseFromParent();
4173 continue;
4174 }
4175
4176 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: &R)) {
4177 if (!VEPR->getOffset()) {
4178 assert(Plan.getConcreteUF() == 1 &&
4179 "Expected unroller to have materialized offset for UF != 1");
4180 VEPR->materializeOffset();
4181 }
4182 continue;
4183 }
4184
4185 if (auto *Expr = dyn_cast<VPExpressionRecipe>(Val: &R)) {
4186 Expr->decompose();
4187 Expr->eraseFromParent();
4188 continue;
4189 }
4190
4191 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4192 auto *LastActiveL = dyn_cast<VPInstruction>(Val: &R);
4193 if (LastActiveL &&
4194 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4195 // Create Not(Mask) for all operands.
4196 SmallVector<VPValue *, 2> NotMasks;
4197 for (VPValue *Op : LastActiveL->operands()) {
4198 VPValue *NotMask = Builder.createNot(Operand: Op, DL: LastActiveL->getDebugLoc());
4199 NotMasks.push_back(Elt: NotMask);
4200 }
4201
4202 // Create FirstActiveLane on the inverted masks.
4203 VPValue *FirstInactiveLane = Builder.createFirstActiveLane(
4204 Masks: NotMasks, DL: LastActiveL->getDebugLoc(), Name: "first.inactive.lane");
4205
4206 // Subtract 1 to get the last active lane.
4207 VPValue *One =
4208 Plan.getConstantInt(Ty: FirstInactiveLane->getScalarType(), Val: 1);
4209 VPValue *LastLane =
4210 Builder.createSub(LHS: FirstInactiveLane, RHS: One,
4211 DL: LastActiveL->getDebugLoc(), Name: "last.active.lane");
4212
4213 LastActiveL->replaceAllUsesWith(New: LastLane);
4214 LastActiveL->eraseFromParent();
4215 continue;
4216 }
4217
4218 // Lower MaskedCond with block mask to LogicalAnd.
4219 if (match(V: &R, P: m_VPInstruction<VPInstruction::MaskedCond>())) {
4220 auto *VPI = cast<VPInstruction>(Val: &R);
4221 assert(VPI->isMasked() &&
4222 "Unmasked MaskedCond should be simplified earlier");
4223 VPI->replaceAllUsesWith(New: Builder.createNaryOp(
4224 Opcode: VPInstruction::LogicalAnd, Operands: {VPI->getMask(), VPI->getOperand(N: 0)}));
4225 VPI->eraseFromParent();
4226 continue;
4227 }
4228
4229 // Lower CanonicalIVIncrementForPart to plain Add.
4230 if (match(
4231 V: &R,
4232 P: m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) {
4233 auto *VPI = cast<VPInstruction>(Val: &R);
4234 VPValue *Add = Builder.createOverflowingOp(
4235 Opcode: Instruction::Add, Operands: VPI->operands(), WrapFlags: VPI->getNoWrapFlags(),
4236 DL: VPI->getDebugLoc());
4237 VPI->replaceAllUsesWith(New: Add);
4238 VPI->eraseFromParent();
4239 continue;
4240 }
4241
4242 // Lower BranchOnCount to ICmp + BranchOnCond.
4243 VPValue *IV, *TC;
4244 if (match(V: &R, P: m_BranchOnCount(Op0: m_VPValue(V&: IV), Op1: m_VPValue(V&: TC)))) {
4245 auto *BranchOnCountInst = cast<VPInstruction>(Val: &R);
4246 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4247 VPValue *Cond = Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: IV, B: TC, DL);
4248 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: Cond, DL);
4249 BranchOnCountInst->eraseFromParent();
4250 continue;
4251 }
4252
4253 VPValue *VectorStep;
4254 VPValue *ScalarStep;
4255 if (!match(V: &R, P: m_VPInstruction<VPInstruction::WideIVStep>(
4256 Ops: m_VPValue(V&: VectorStep), Ops: m_VPValue(V&: ScalarStep))))
4257 continue;
4258
4259 // Expand WideIVStep.
4260 auto *VPI = cast<VPInstruction>(Val: &R);
4261 Type *IVTy = VPI->getScalarType();
4262 if (VectorStep->getScalarType() != IVTy) {
4263 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
4264 ? Instruction::UIToFP
4265 : Instruction::Trunc;
4266 VectorStep = Builder.createWidenCast(Opcode: CastOp, Op: VectorStep, ResultTy: IVTy);
4267 }
4268
4269 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4270 if (ScalarStep->getScalarType() != IVTy) {
4271 ScalarStep =
4272 Builder.createWidenCast(Opcode: Instruction::Trunc, Op: ScalarStep, ResultTy: IVTy);
4273 }
4274
4275 VPIRFlags Flags;
4276 unsigned MulOpc;
4277 if (IVTy->isFloatingPointTy()) {
4278 MulOpc = Instruction::FMul;
4279 Flags = VPI->getFastMathFlagsOrNone();
4280 } else {
4281 MulOpc = Instruction::Mul;
4282 Flags = VPIRFlags::getDefaultFlags(Opcode: MulOpc);
4283 }
4284
4285 VPInstruction *Mul = Builder.createNaryOp(
4286 Opcode: MulOpc, Operands: {VectorStep, ScalarStep}, Flags, DL: R.getDebugLoc());
4287 VectorStep = Mul;
4288 VPI->replaceAllUsesWith(New: VectorStep);
4289 VPI->eraseFromParent();
4290 }
4291 }
4292}
4293
4294/// Returns the VPValue representing the uncountable exit comparison used by
4295/// AnyOf if the recipes it depends on can be traced back to live-ins and
4296/// the addresses (in GEP/PtrAdd form) of any (non-masked) load used in
4297/// generating the values for the comparison. The recipes are stored in
4298/// \p Recipes.
4299static std::optional<VPValue *>
4300getRecipesForUncountableExit(SmallVectorImpl<VPInstruction *> &Recipes,
4301 VPBasicBlock *LatchVPBB) {
4302 // Given a plain CFG VPlan loop with countable latch exiting block
4303 // \p LatchVPBB, we're looking to match the recipes contributing to the
4304 // uncountable exit condition comparison (here, vp<%4>) back to either
4305 // live-ins or the address nodes for the load used as part of the uncountable
4306 // exit comparison so that we can either move them within the loop, or copy
4307 // them to the preheader depending on the chosen method for dealing with
4308 // stores in uncountable exit loops.
4309 //
4310 // Currently, the address of the load is restricted to a GEP with 2 operands
4311 // and a live-in base address. This constraint may be relaxed later.
4312 //
4313 // VPlan ' for UF>=1' {
4314 // Live-in vp<%0> = VF * UF
4315 // Live-in vp<%1> = vector-trip-count
4316 // Live-in ir<20> = original trip-count
4317 //
4318 // ir-bb<entry>:
4319 // Successor(s): scalar.ph, vector.ph
4320 //
4321 // vector.ph:
4322 // Successor(s): for.body
4323 //
4324 // for.body:
4325 // EMIT vp<%2> = phi ir<0>, vp<%index.next>
4326 // EMIT-SCALAR ir<%iv> = phi [ ir<0>, vector.ph ], [ ir<%iv.next>, for.inc ]
4327 // EMIT ir<%uncountable.addr> = getelementptr inbounds nuw ir<%pred>,ir<%iv>
4328 // EMIT ir<%uncountable.val> = load ir<%uncountable.addr>
4329 // EMIT ir<%uncountable.cond> = icmp sgt ir<%uncountable.val>, ir<500>
4330 // EMIT vp<%3> = masked-cond ir<%uncountable.cond>
4331 // Successor(s): for.inc
4332 //
4333 // for.inc:
4334 // EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>
4335 // EMIT ir<%countable.cond> = icmp eq ir<%iv.next>, ir<20>
4336 // EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
4337 // EMIT vp<%4> = any-of ir<%3>
4338 // EMIT vp<%5> = icmp eq vp<%index.next>, vp<%1>
4339 // EMIT branch-on-two-conds vp<%4>, vp<%5>
4340 // Successor(s): middle.block, middle.block, for.body
4341 //
4342 // middle.block:
4343 // Successor(s): ir-bb<exit>, scalar.ph
4344 //
4345 // ir-bb<exit>:
4346 // No successors
4347 //
4348 // scalar.ph:
4349 // }
4350
4351 // Find the uncountable loop exit condition.
4352 VPValue *UncountableCondition = nullptr;
4353 if (!match(V: LatchVPBB->getTerminator(),
4354 P: m_BranchOnTwoConds(Op0: m_AnyOf(Op0: m_VPValue(V&: UncountableCondition)),
4355 Op1: m_VPValue())))
4356 return std::nullopt;
4357
4358 SmallVector<VPValue *, 4> Worklist;
4359 Worklist.push_back(Elt: UncountableCondition);
4360 while (!Worklist.empty()) {
4361 VPValue *V = Worklist.pop_back_val();
4362
4363 // Any value defined outside the loop does not need to be copied.
4364 if (V->isDefinedOutsideLoopRegions())
4365 continue;
4366
4367 // FIXME: Remove the single user restriction; it's here because we're
4368 // starting with the simplest set of loops we can, and multiple
4369 // users means needing to add PHI nodes in the transform.
4370 if (V->getNumUsers() > 1)
4371 return std::nullopt;
4372
4373 VPValue *Op1, *Op2;
4374 // Walk back through recipes until we find at least one load from memory.
4375 if (match(V, P: m_ICmp(Op0: m_VPValue(V&: Op1), Op1: m_VPValue(V&: Op2)))) {
4376 Worklist.push_back(Elt: Op1);
4377 Worklist.push_back(Elt: Op2);
4378 Recipes.push_back(Elt: cast<VPInstruction>(Val: V->getDefiningRecipe()));
4379 } else if (match(V, P: m_VPInstruction<Instruction::Load>(Ops: m_VPValue(V&: Op1)))) {
4380 VPRecipeBase *GepR = Op1->getDefiningRecipe();
4381 // Only matching base + single offset term for now.
4382 if (GepR->getNumOperands() != 2)
4383 return std::nullopt;
4384 // Matching a GEP with a loop-invariant base ptr.
4385 if (!match(V: GepR, P: m_VPInstruction<Instruction::GetElementPtr>(
4386 Ops: m_LiveIn(), Ops: m_VPValue())))
4387 return std::nullopt;
4388 Recipes.push_back(Elt: cast<VPInstruction>(Val: V->getDefiningRecipe()));
4389 Recipes.push_back(Elt: cast<VPInstruction>(Val: GepR));
4390 } else if (match(V, P: m_VPInstruction<VPInstruction::MaskedCond>(
4391 Ops: m_VPValue(V&: Op1)))) {
4392 Worklist.push_back(Elt: Op1);
4393 Recipes.push_back(Elt: cast<VPInstruction>(Val: V->getDefiningRecipe()));
4394 } else
4395 return std::nullopt;
4396 }
4397
4398 // If we couldn't match anything, don't return the condition. It may be
4399 // defined outside the loop.
4400 if (Recipes.empty() || none_of(Range&: Recipes, P: [](VPInstruction *I) {
4401 return match(V: I, P: m_VPInstruction<Instruction::GetElementPtr>());
4402 }))
4403 return std::nullopt;
4404
4405 return UncountableCondition;
4406}
4407
4408struct EarlyExitInfo {
4409 VPBasicBlock *EarlyExitingVPBB;
4410 VPIRBasicBlock *EarlyExitVPBB;
4411 VPValue *CondToExit;
4412};
4413
4414/// Update \p Plan to mask memory operations in the loop based on whether the
4415/// early exit is taken or not.
4416///
4417/// We're currently expecting to find a loop with properties similar to the
4418/// following:
4419///
4420/// for.body:
4421/// ir<%indvars.iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<%0>
4422/// EMIT ir<%arrayidx> = getelementptr inbounds nuw ir<@c>, ir<%indvars.iv>
4423/// EMIT-SCALAR ir<%0> = load ir<%arrayidx>
4424/// EMIT ir<%cmp1> = icmp sgt ir<%0>, ir<5>
4425/// EMIT vp<%1> = masked-cond ir<%cmp1>
4426/// Successor(s): if.end
4427///
4428/// if.end:
4429/// EMIT ir<%arrayidx3> = getelementptr inbounds nuw ir<@src>, ir<%indvars.iv>
4430/// EMIT-SCALAR ir<%2> = load ir<%arrayidx3>
4431/// EMIT ir<%add> = add nsw ir<%2>, ir<42>
4432/// EMIT ir<%arrayidx5> = getelementptr inbounds nuw ir<@dst>, ir<%indvars.iv>
4433/// EMIT store ir<%add>, ir<%arrayidx5>
4434/// EMIT ir<%indvars.iv.next> = add nuw nsw ir<%indvars.iv>, ir<1>
4435/// EMIT vp<%3> = any-of ir<%1>
4436/// EMIT ir<%exitcond.not> = icmp eq ir<%indvars.iv.next>, ir<10000>
4437/// EMIT branch-on-two-conds vp<%3>, ir<%exitcond.not>
4438/// Successor(s): middle.block, middle.block, for.body
4439///
4440/// We currently expect LoopVectorizationLegality to ensure that:
4441/// * There must also be a counted exit. We will need to support speculative
4442/// or first-faulting loads before we can remove this restriction.
4443/// * Any stores within the loop must not alias with the load used for the
4444/// uncountable exit. We can relax this a bit with runtime aliasing checks.
4445/// * Other memory operations in the loop can take place before or after the
4446/// uncountable exit, but must also be unconditional. We need to support
4447/// combining the conditions in VPlanPredicator.
4448/// * The loop must have a single unconditional load contributing to the
4449/// uncountable exit comparison, and the other term must be loop-invariant.
4450/// Improving upon this requires work in getRecipesForUncountableExit to
4451/// handle more complex recipe graphs.
4452static bool handleUncountableExitsWithSideEffects(
4453 VPlan &Plan, SmallVectorImpl<EarlyExitInfo> &Exits,
4454 VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB,
4455 Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT,
4456 AssumptionCache *AC) {
4457
4458 // Disconnect early exiting blocks from successors, remove branches. We
4459 // currently don't support multiple uses for recipes involved in creating
4460 // the uncountable exit condition.
4461 for (auto &Exit : Exits) {
4462 if (Exit.EarlyExitingVPBB == LatchVPBB)
4463 continue;
4464
4465 for (VPRecipeBase &R : Exit.EarlyExitVPBB->phis())
4466 cast<VPIRPhi>(Val: &R)->removeIncomingValueFor(IncomingBlock: Exit.EarlyExitingVPBB);
4467 Exit.EarlyExitingVPBB->getTerminator()->eraseFromParent();
4468 VPBlockUtils::disconnectBlocks(From: Exit.EarlyExitingVPBB, To: Exit.EarlyExitVPBB);
4469 }
4470
4471 VPDominatorTree VPDT(Plan);
4472
4473 // We can abandon a VPlan entirely if we return false here, so we shouldn't
4474 // crash if some earlier assumptions on scalar IR don't hold for the vplan
4475 // version of the loop.
4476 SmallVector<VPInstruction *, 8> ConditionRecipes;
4477
4478 std::optional<VPValue *> Cond =
4479 getRecipesForUncountableExit(Recipes&: ConditionRecipes, LatchVPBB);
4480 if (!Cond)
4481 return false;
4482
4483 // Find load contributing to condition.
4484 // At the moment LoopVectorizationLegality only supports a single
4485 // early-exit expression with a compare and a single load that must
4486 // be unconditional.
4487 // TODO: Support more than one load.
4488 auto *Load =
4489 find_singleton<VPInstruction>(Range&: ConditionRecipes, P: [](auto *I, bool _) {
4490 return match(I, m_VPInstruction<Instruction::Load>(Ops: m_VPValue()))
4491 ? I
4492 : nullptr;
4493 });
4494 assert(Load && "Couldn't find exactly one load");
4495 // TODO: Support conditional loads for uncountable exits.
4496 assert(VPDT.dominates(Load->getParent(), LatchVPBB) &&
4497 "Uncountable exit condition load is conditional.");
4498 VPInstruction *Ptr = cast<VPInstruction>(Val: Load->getOperand(N: 0));
4499
4500 // Ensure that we are guaranteed to be able to dereference the memory used
4501 // for determining the uncountable exit for the maximum possible number of
4502 // scalar iterations of the loop.
4503 //
4504 // TODO: Support first-faulting loads in cases where we don't know whether
4505 // all possible addresses are dereferenceable.
4506 {
4507 SmallVector<const SCEVPredicate *, 4> Predicates;
4508 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L: TheLoop);
4509 const DataLayout &DL = Plan.getDataLayout();
4510 APInt EltSize(DL.getIndexTypeSizeInBits(Ty: Ptr->getScalarType()),
4511 DL.getTypeStoreSize(Ty: Load->getScalarType()).getFixedValue());
4512 if (!isDereferenceableAndAlignedInLoop(
4513 PtrSCEV, Alignment: cast<LoadInst>(Val: Load->getUnderlyingInstr())->getAlign(),
4514 EltSizeSCEV: PSE.getSE()->getConstant(Val: EltSize), L: TheLoop, SE&: *PSE.getSE(), DT, AC,
4515 Predicates: &Predicates))
4516 return false;
4517 }
4518
4519 // Check for a single GEP for the condition load to see if we can link it to
4520 // a widen IV recipe with a step of 1; we're only interested in contiguous
4521 // accesses for the condition load right now.
4522 auto *IV = cast<VPWidenInductionRecipe>(Val: &HeaderVPBB->front());
4523 if (!match(V: IV->getStartValue(), P: m_SpecificInt(V: 0)) ||
4524 !match(V: IV->getStepValue(), P: m_SpecificInt(V: 1)))
4525 return false;
4526 if (!match(V: Ptr, P: m_VPInstruction<Instruction::GetElementPtr>(Ops: m_LiveIn(),
4527 Ops: m_Specific(VPV: IV))))
4528 return false;
4529
4530 // We want to guarantee that the uncountable exit condition (and the mask
4531 // we will generate from it) are available for all operations in the loop
4532 // that need to be masked. If the condition recipes are not already the first
4533 // recipes in the header after the last phi, move them there.
4534 auto InsertIt = HeaderVPBB->getFirstNonPhi();
4535 while (InsertIt != HeaderVPBB->end() &&
4536 is_contained(Range&: ConditionRecipes, Element: &*InsertIt)) {
4537 erase(C&: ConditionRecipes, V: &*InsertIt);
4538 InsertIt++;
4539 }
4540 for (auto *Recipe : reverse(C&: ConditionRecipes))
4541 Recipe->moveBefore(BB&: *HeaderVPBB, I: InsertIt);
4542
4543 // Create a mask to represent all lanes that fully execute in the vector loop,
4544 // stopping short of any early exit.
4545 VPBuilder MaskBuilder(HeaderVPBB, InsertIt);
4546 VPValue *FirstActive = MaskBuilder.createFirstActiveLane(Masks: *Cond);
4547 Type *IVScalarTy = IV->getScalarType();
4548 Type *FirstActiveTy = FirstActive->getScalarType();
4549 VPValue *ALMMultiplier = Plan.getConstantInt(Ty: IVScalarTy, Val: 1);
4550 VPValue *Zero = Plan.getZero(Ty: IVScalarTy);
4551 FirstActive = MaskBuilder.createScalarZExtOrTrunc(Op: FirstActive, ResultTy: IVScalarTy,
4552 SrcTy: FirstActiveTy, DL: DebugLoc());
4553 VPValue *Mask = MaskBuilder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
4554 Operands: {Zero, FirstActive, ALMMultiplier},
4555 DL: DebugLoc(), Name: "uncountable.exit.mask");
4556
4557 // Convert all other memory operations to use the mask.
4558 for (VPBasicBlock *VPBB : vp_rpo_plain_cfg_loop_body(Header: HeaderVPBB))
4559 for (VPRecipeBase &R : *VPBB)
4560 if (R.mayReadOrWriteMemory() && &R != Load) {
4561 // TODO: Handle conditional memory operations in the loop.
4562 if (!VPDT.dominates(A: R.getParent(), B: LatchVPBB))
4563 return false;
4564 cast<VPInstruction>(Val: &R)->addMask(Mask);
4565 }
4566
4567 // Update middle block branch to compare (IV + however many lanes were active)
4568 // against the full trip count, since we may be exiting the vector loop early.
4569 // If we didn't take an early exit, we should get the equivalent of VF from
4570 // the FirstActiveLane.
4571 assert(match(MiddleVPBB->getTerminator(), m_BranchOnCond()) &&
4572 "Expected BranchOnCond terminator for MiddleVPBB");
4573 VPBuilder MiddleBuilder(MiddleVPBB->getTerminator());
4574 VPValue *ScalarIV = MiddleBuilder.createNaryOp(Opcode: VPInstruction::ExtractLane,
4575 Operands: {Zero, IV}, DL: DebugLoc());
4576 VPValue *ExitIV = MiddleBuilder.createAdd(LHS: ScalarIV, RHS: FirstActive);
4577 VPValue *FullTC =
4578 MiddleBuilder.createICmp(Pred: CmpInst::ICMP_EQ, A: ExitIV, B: Plan.getTripCount());
4579 MiddleVPBB->getTerminator()->setOperand(I: 0, New: FullTC);
4580
4581 // Update resume phi in scalar.ph.
4582 VPBasicBlock *ScalarPH = Plan.getScalarPreheader();
4583 auto Phis = ScalarPH->phis();
4584 // TODO: Handle more than one Phi; re-derive from IV.
4585 // TODO: Handle reductions.
4586 if (range_size(Range&: Phis) != 1)
4587 return false;
4588 VPPhi *ContinueIV = cast<VPPhi>(Val: Phis.begin());
4589 // Make sure we're referring to the same IV.
4590 assert(
4591 match(ContinueIV->getOperand(0),
4592 m_VPInstruction<VPInstruction::ExitingIVValue>(m_Specific(IV))) &&
4593 "Continuing from different IV");
4594 ContinueIV->setOperand(I: 0, New: ExitIV);
4595 return true;
4596}
4597
4598bool VPlanTransforms::handleUncountableEarlyExits(
4599 VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB,
4600 VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE,
4601 DominatorTree &DT, AssumptionCache *AC, UncountableExitStyle Style) {
4602#ifndef NDEBUG
4603 VPDominatorTree VPDT(Plan);
4604#endif
4605 VPBuilder LatchBuilder(LatchVPBB->getTerminator());
4606 SmallVector<EarlyExitInfo> Exits;
4607 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4608 for (VPBlockBase *Pred : to_vector(Range&: ExitBlock->getPredecessors())) {
4609 if (Pred == MiddleVPBB)
4610 continue;
4611 // Collect condition for this early exit.
4612 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Val: Pred);
4613 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4614 VPValue *CondOfEarlyExitingVPBB;
4615 [[maybe_unused]] bool Matched =
4616 match(V: EarlyExitingVPBB->getTerminator(),
4617 P: m_BranchOnCond(Op0: m_VPValue(V&: CondOfEarlyExitingVPBB)));
4618 assert(Matched && "Terminator must be BranchOnCond");
4619
4620 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4621 // the correct block mask.
4622 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4623 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4624 Opcode: VPInstruction::MaskedCond,
4625 Operands: TrueSucc == ExitBlock
4626 ? CondOfEarlyExitingVPBB
4627 : EarlyExitingBuilder.createNot(Operand: CondOfEarlyExitingVPBB));
4628 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4629 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4630 VPDT.properlyDominates(
4631 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4632 LatchVPBB)) &&
4633 "exit condition must dominate the latch");
4634 Exits.push_back(Elt: {
4635 .EarlyExitingVPBB: EarlyExitingVPBB,
4636 .EarlyExitVPBB: ExitBlock,
4637 .CondToExit: CondToEarlyExit,
4638 });
4639 }
4640 }
4641
4642 assert(!Exits.empty() && "must have at least one early exit");
4643 // Sort exits by RPO order to get correct program order. RPO gives a
4644 // topological ordering of the CFG, ensuring upstream exits are checked
4645 // before downstream exits in the dispatch chain.
4646 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
4647 HeaderVPBB);
4648 DenseMap<VPBlockBase *, unsigned> RPOIdx;
4649 for (const auto &[Num, VPB] : enumerate(First&: RPOT))
4650 RPOIdx[VPB] = Num;
4651 llvm::sort(C&: Exits, Comp: [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4652 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4653 });
4654#ifndef NDEBUG
4655 // After RPO sorting, verify that for any pair where one exit dominates
4656 // another, the dominating exit comes first. This is guaranteed by RPO
4657 // (topological order) and is required for the dispatch chain correctness.
4658 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4659 for (unsigned J = I + 1; J < Exits.size(); ++J)
4660 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4661 Exits[I].EarlyExitingVPBB) &&
4662 "RPO sort must place dominating exits before dominated ones");
4663#endif
4664
4665 // Build the AnyOf condition for the latch terminator using logical OR
4666 // to avoid poison propagation from later exit conditions when an earlier
4667 // exit is taken.
4668 VPValue *Combined = Exits[0].CondToExit;
4669 for (const EarlyExitInfo &Info : drop_begin(RangeOrContainer&: Exits))
4670 Combined = LatchBuilder.createLogicalOr(LHS: Combined, RHS: Info.CondToExit);
4671
4672 VPValue *IsAnyExitTaken =
4673 LatchBuilder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {Combined});
4674
4675 // Create a comparison for the latch exit condition and replace the
4676 // BranchOnCond with a BranchOnTwoConds. The original BranchOnCond's condition
4677 // is used as the latch-exit condition; canonical IV recipes have not been
4678 // introduced yet, so there is no BranchOnCount to derive the condition from.
4679 auto *LatchExitingBranch = cast<VPInstruction>(Val: LatchVPBB->getTerminator());
4680 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCond &&
4681 "Unexpected terminator");
4682 VPValue *IsLatchExitTaken = LatchExitingBranch->getOperand(N: 0);
4683 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4684 LatchExitingBranch->eraseFromParent();
4685 LatchBuilder.setInsertPoint(LatchVPBB);
4686 LatchBuilder.createNaryOp(Opcode: VPInstruction::BranchOnTwoConds,
4687 Operands: {IsAnyExitTaken, IsLatchExitTaken}, DL: LatchDL);
4688 LatchVPBB->clearSuccessors();
4689
4690 if (Style == UncountableExitStyle::MaskedHandleExitInScalarLoop) {
4691 // If handling the exiting lane in the scalar loop, combine the exit
4692 // conditions into a single BranchOnCond.
4693 LatchVPBB->setSuccessors({MiddleVPBB, MiddleVPBB, HeaderVPBB});
4694 MiddleVPBB->clearPredecessors();
4695 MiddleVPBB->setPredecessors({LatchVPBB, LatchVPBB});
4696 return handleUncountableExitsWithSideEffects(
4697 Plan, Exits, HeaderVPBB, LatchVPBB, MiddleVPBB, TheLoop, PSE, DT, AC);
4698 }
4699
4700 // Create the vector.early.exit blocks.
4701 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4702 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4703 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4704 VPBasicBlock *VectorEarlyExitVPBB =
4705 Plan.createVPBasicBlock(Name: "vector.early.exit" + BlockSuffix);
4706 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4707 }
4708
4709 // Create the dispatch block (or reuse the single exit block if only one
4710 // exit). The dispatch block computes the first active lane of the combined
4711 // condition and, for multiple exits, chains through conditions to determine
4712 // which exit to take.
4713 VPBasicBlock *DispatchVPBB =
4714 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4715 : Plan.createVPBasicBlock(Name: "vector.early.exit.check");
4716 DispatchVPBB->setPredecessors({LatchVPBB});
4717 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4718 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4719 VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(
4720 Masks: {Combined}, DL: DebugLoc::getUnknown(), Name: "first.active.lane");
4721
4722 // For each early exit, disconnect the original exiting block
4723 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4724 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4725 // values at the first active lane:
4726 //
4727 // Input:
4728 // early.exiting.I:
4729 // ...
4730 // EMIT branch-on-cond vp<%cond.I>
4731 // Successor(s): in.loop.succ, ir-bb<exit.I>
4732 //
4733 // ir-bb<exit.I>:
4734 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4735 //
4736 // Output:
4737 // early.exiting.I:
4738 // ...
4739 // Successor(s): in.loop.succ
4740 //
4741 // vector.early.exit.I:
4742 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4743 // Successor(s): ir-bb<exit.I>
4744 //
4745 // ir-bb<exit.I>:
4746 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4747 // vector.early.exit.I)
4748 //
4749 for (auto [Exit, VectorEarlyExitVPBB] :
4750 zip_equal(t&: Exits, u&: VectorEarlyExitVPBBs)) {
4751 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4752 // Adjust the phi nodes in EarlyExitVPBB.
4753 // 1. remove incoming values from EarlyExitingVPBB,
4754 // 2. extract the incoming value at FirstActiveLane
4755 // 3. add back the extracts as last operands for the phis
4756 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4757 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4758 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4759 // values from VectorEarlyExitVPBB.
4760 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4761 auto *ExitIRI = cast<VPIRPhi>(Val: &R);
4762 VPValue *IncomingVal =
4763 ExitIRI->getIncomingValueForBlock(VPBB: EarlyExitingVPBB);
4764 VPValue *NewIncoming = IncomingVal;
4765 if (!isa<VPIRValue>(Val: IncomingVal)) {
4766 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4767 NewIncoming = EarlyExitBuilder.createNaryOp(
4768 Opcode: VPInstruction::ExtractLane, Operands: {FirstActiveLane, IncomingVal},
4769 DL: DebugLoc::getUnknown(), Name: "early.exit.value");
4770 }
4771 ExitIRI->removeIncomingValueFor(IncomingBlock: EarlyExitingVPBB);
4772 ExitIRI->addIncoming(IncomingV: NewIncoming);
4773 }
4774
4775 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4776 VPBlockUtils::disconnectBlocks(From: EarlyExitingVPBB, To: EarlyExitVPBB);
4777 VPBlockUtils::connectBlocks(From: VectorEarlyExitVPBB, To: EarlyExitVPBB);
4778 }
4779
4780 // Chain through exits: for each exit, check if its condition is true at
4781 // the first active lane. If so, take that exit; otherwise, try the next.
4782 // The last exit needs no check since it must be taken if all others fail.
4783 //
4784 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4785 //
4786 // latch:
4787 // ...
4788 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4789 // ...
4790 //
4791 // vector.early.exit.check:
4792 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4793 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4794 // EMIT branch-on-cond vp<%at.cond.0>
4795 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4796 //
4797 // vector.early.exit.check.0:
4798 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4799 // EMIT branch-on-cond vp<%at.cond.1>
4800 // Successor(s): vector.early.exit.1, vector.early.exit.2
4801 VPBasicBlock *CurrentBB = DispatchVPBB;
4802 for (auto [I, Exit] : enumerate(First: ArrayRef(Exits).drop_back())) {
4803 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4804 Opcode: VPInstruction::ExtractLane, Operands: {FirstActiveLane, Exit.CondToExit},
4805 DL: DebugLoc::getUnknown(), Name: "exit.cond.at.lane");
4806
4807 // For the last dispatch, branch directly to the last exit on false;
4808 // otherwise, create a new check block.
4809 bool IsLastDispatch = (I + 2 == Exits.size());
4810 VPBasicBlock *FalseBB =
4811 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4812 : Plan.createVPBasicBlock(
4813 Name: Twine("vector.early.exit.check.") + Twine(I));
4814
4815 DispatchBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {LaneVal});
4816 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4817 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4818 FalseBB->setPredecessors({CurrentBB});
4819
4820 CurrentBB = FalseBB;
4821 DispatchBuilder.setInsertPoint(CurrentBB);
4822 }
4823
4824 return true;
4825}
4826
4827/// This function tries convert extended in-loop reductions to
4828/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4829/// valid. The created recipe must be decomposed to its constituent
4830/// recipes before execution.
4831static VPExpressionRecipe *
4832tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
4833 VFRange &Range) {
4834 Type *RedTy = Red->getScalarType();
4835 VPValue *VecOp = Red->getVecOp();
4836
4837 assert(!Red->isPartialReduction() &&
4838 "This path does not support partial reductions");
4839
4840 // Clamp the range if using extended-reduction is profitable.
4841 auto IsExtendedRedValidAndClampRange =
4842 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4843 return LoopVectorizationPlanner::getDecisionAndClampRange(
4844 Predicate: [&](ElementCount VF) {
4845 auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
4846 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4847
4848 InstructionCost ExtRedCost = InstructionCost::getInvalid();
4849 InstructionCost ExtCost =
4850 cast<VPWidenCastRecipe>(Val: VecOp)->computeCost(VF, Ctx);
4851 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4852
4853 assert(!RedTy->isFloatingPointTy() &&
4854 "getExtendedReductionCost only supports integer types");
4855 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4856 Opcode, IsUnsigned: ExtOpc == Instruction::CastOps::ZExt, ResTy: RedTy, Ty: SrcVecTy,
4857 FMF: Red->getFastMathFlagsOrNone(), CostKind);
4858 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4859 },
4860 Range);
4861 };
4862
4863 VPValue *A;
4864 // Match reduce(ext)).
4865 if (match(V: VecOp, P: m_Isa<VPWidenCastRecipe>(P: m_ZExtOrSExt(Op0: m_VPValue(V&: A)))) &&
4866 IsExtendedRedValidAndClampRange(
4867 RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind()),
4868 cast<VPWidenCastRecipe>(Val: VecOp)->getOpcode(), A->getScalarType()))
4869 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(Val: VecOp), Red);
4870
4871 return nullptr;
4872}
4873
4874/// This function tries convert extended in-loop reductions to
4875/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4876/// and valid. The created VPExpressionRecipe must be decomposed to its
4877/// constituent recipes before execution. Patterns of the
4878/// VPExpressionRecipe:
4879/// reduce.add(mul(...)),
4880/// reduce.add(mul(ext(A), ext(B))),
4881/// reduce.add(ext(mul(ext(A), ext(B)))).
4882/// reduce.fadd(fmul(ext(A), ext(B)))
4883static VPExpressionRecipe *
4884tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
4885 VPCostContext &Ctx, VFRange &Range) {
4886 unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind());
4887 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4888 Opcode != Instruction::FAdd)
4889 return nullptr;
4890
4891 assert(!Red->isPartialReduction() &&
4892 "This path does not support partial reductions");
4893 Type *RedTy = Red->getScalarType();
4894
4895 // Clamp the range if using multiply-accumulate-reduction is profitable.
4896 auto IsMulAccValidAndClampRange =
4897 [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
4898 VPWidenCastRecipe *OuterExt) -> bool {
4899 return LoopVectorizationPlanner::getDecisionAndClampRange(
4900 Predicate: [&](ElementCount VF) {
4901 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4902 Type *SrcTy = Ext0 ? Ext0->getOperand(N: 0)->getScalarType() : RedTy;
4903 InstructionCost MulAccCost;
4904
4905 // getMulAccReductionCost for in-loop reductions does not support
4906 // mixed or floating-point extends.
4907 if (Ext0 && Ext1 &&
4908 (Ext0->getOpcode() != Ext1->getOpcode() ||
4909 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4910 return false;
4911
4912 bool IsZExt =
4913 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4914 auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
4915 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsUnsigned: IsZExt, RedOpcode: Opcode, ResTy: RedTy,
4916 Ty: SrcVecTy, CostKind);
4917
4918 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4919 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4920 InstructionCost ExtCost = 0;
4921 if (Ext0)
4922 ExtCost += Ext0->computeCost(VF, Ctx);
4923 if (Ext1)
4924 ExtCost += Ext1->computeCost(VF, Ctx);
4925 if (OuterExt)
4926 ExtCost += OuterExt->computeCost(VF, Ctx);
4927
4928 return MulAccCost.isValid() &&
4929 MulAccCost < ExtCost + MulCost + RedCost;
4930 },
4931 Range);
4932 };
4933
4934 VPValue *VecOp = Red->getVecOp();
4935 VPRecipeBase *Sub = nullptr;
4936 VPValue *A, *B;
4937 VPValue *Tmp = nullptr;
4938
4939 if (RedTy->isFloatingPointTy())
4940 return nullptr;
4941
4942 // Sub reductions could have a sub between the add reduction and vec op.
4943 if (match(V: VecOp, P: m_Sub(Op0: m_ZeroInt(), Op1: m_VPValue(V&: Tmp)))) {
4944 Sub = VecOp->getDefiningRecipe();
4945 VecOp = Tmp;
4946 }
4947
4948 // If ValB is a constant and can be safely extended, truncate it to the same
4949 // type as ExtA's operand, then extend it to the same type as ExtA. This
4950 // creates two uniform extends that can more easily be matched by the rest of
4951 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4952 // replaced with the new extend of the constant.
4953 auto ExtendAndReplaceConstantOp = [](VPWidenCastRecipe *ExtA,
4954 VPWidenCastRecipe *&ExtB, VPValue *&ValB,
4955 VPWidenRecipe *Mul) {
4956 if (!ExtA || ExtB || !isa<VPIRValue>(Val: ValB))
4957 return;
4958 Type *NarrowTy = ExtA->getOperand(N: 0)->getScalarType();
4959 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4960 const APInt *Const;
4961 if (!match(V: ValB, P: m_APInt(C&: Const)) ||
4962 !llvm::canConstantBeExtended(
4963 C: Const, NarrowType: NarrowTy, ExtKind: TTI::getPartialReductionExtendKind(CastOpc: ExtOpc)))
4964 return;
4965 // The truncate ensures that the type of each extended operand is the
4966 // same, and it's been proven that the constant can be extended from
4967 // NarrowTy safely. Necessary since ExtA's extended operand would be
4968 // e.g. an i8, while the const will likely be an i32. This will be
4969 // elided by later optimisations.
4970 VPBuilder Builder(Mul);
4971 auto *Trunc =
4972 Builder.createWidenCast(Opcode: Instruction::CastOps::Trunc, Op: ValB, ResultTy: NarrowTy);
4973 Type *WideTy = ExtA->getScalarType();
4974 ValB = ExtB = Builder.createWidenCast(Opcode: ExtOpc, Op: Trunc, ResultTy: WideTy);
4975 Mul->setOperand(I: 1, New: ExtB);
4976 };
4977
4978 // Try to match reduce.add(mul(...)).
4979 if (match(V: VecOp, P: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B)))) {
4980 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(Val: A);
4981 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(Val: B);
4982 auto *Mul = cast<VPWidenRecipe>(Val: VecOp);
4983
4984 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4985 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4986
4987 // Match reduce.add/sub(mul(ext, ext)).
4988 if (RecipeA && RecipeB && match(V: RecipeA, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
4989 match(V: RecipeB, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
4990 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4991 if (Sub)
4992 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4993 cast<VPWidenRecipe>(Val: Sub), Red);
4994 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4995 }
4996 // TODO: Add an expression type for this variant with a negated mul
4997 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4998 return new VPExpressionRecipe(Mul, Red);
4999 }
5000 // TODO: Add an expression type for negated versions of other expression
5001 // variants.
5002 if (Sub)
5003 return nullptr;
5004
5005 // Match reduce.add(ext(mul(A, B))).
5006 if (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))))) {
5007 auto *Ext = cast<VPWidenCastRecipe>(Val: VecOp);
5008 auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: 0));
5009 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(Val: A);
5010 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(Val: B);
5011
5012 // reduce.add(ext(mul(ext, const)))
5013 // -> reduce.add(ext(mul(ext, ext(const))))
5014 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
5015
5016 // reduce.add(ext(mul(ext(A), ext(B))))
5017 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5018 // The inner extends must either have the same opcode as the outer extend or
5019 // be the same, in which case the multiply can never result in a negative
5020 // value and the outer extend can be folded away by doing wider
5021 // extends for the operands of the mul.
5022 if (Ext0 && Ext1 &&
5023 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
5024 Ext0->getOpcode() == Ext1->getOpcode() &&
5025 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
5026 auto *NewExt0 = new VPWidenCastRecipe(
5027 Ext0->getOpcode(), Ext0->getOperand(N: 0), Ext->getScalarType(), nullptr,
5028 *Ext0, *Ext0, Ext0->getDebugLoc());
5029 NewExt0->insertBefore(InsertPos: Ext0);
5030
5031 VPWidenCastRecipe *NewExt1 = NewExt0;
5032 if (Ext0 != Ext1) {
5033 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(N: 0),
5034 Ext->getScalarType(), nullptr, *Ext1,
5035 *Ext1, Ext1->getDebugLoc());
5036 NewExt1->insertBefore(InsertPos: Ext1);
5037 }
5038 auto *NewMul = Mul->cloneWithOperands(NewOperands: {NewExt0, NewExt1});
5039 NewMul->insertBefore(InsertPos: Mul);
5040 Ext->replaceAllUsesWith(New: NewMul);
5041 Ext->eraseFromParent();
5042 Mul->eraseFromParent();
5043 return new VPExpressionRecipe(NewExt0, NewExt1, NewMul, Red);
5044 }
5045 }
5046 return nullptr;
5047}
5048
5049/// This function tries to create abstract recipes from the reduction recipe for
5050/// following optimizations and cost estimation.
5051static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
5052 VPCostContext &Ctx,
5053 VFRange &Range) {
5054 // Creation of VPExpressions for partial reductions is entirely handled in
5055 // transformToPartialReduction.
5056 assert(!Red->isPartialReduction() &&
5057 "This path does not support partial reductions");
5058
5059 VPExpressionRecipe *AbstractR = nullptr;
5060 auto IP = std::next(x: Red->getIterator());
5061 auto *VPBB = Red->getParent();
5062 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
5063 AbstractR = MulAcc;
5064 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
5065 AbstractR = ExtRed;
5066 // Cannot create abstract inloop reduction recipes.
5067 if (!AbstractR)
5068 return;
5069
5070 AbstractR->insertBefore(BB&: *VPBB, IP);
5071 Red->replaceAllUsesWith(New: AbstractR);
5072}
5073
5074void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
5075 VFRange &Range) {
5076 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
5077 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
5078 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
5079 if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R))
5080 tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
5081 }
5082 }
5083}
5084
5085void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
5086 if (Plan.hasScalarVFOnly())
5087 return;
5088
5089#ifndef NDEBUG
5090 VPDominatorTree VPDT(Plan);
5091#endif
5092
5093 SmallVector<VPValue *> VPValues;
5094 if (VPValue *BTC = Plan.getBackedgeTakenCount())
5095 VPValues.push_back(Elt: BTC);
5096 append_range(C&: VPValues, R: Plan.getLiveIns());
5097 for (VPRecipeBase &R : *Plan.getEntry())
5098 append_range(C&: VPValues, R: R.definedValues());
5099
5100 auto *VectorPreheader = Plan.getVectorPreheader();
5101 for (VPValue *VPV : VPValues) {
5102 if (vputils::onlyScalarValuesUsed(Def: VPV) ||
5103 (isa<VPIRValue>(Val: VPV) && isa<Constant>(Val: VPV->getLiveInIRValue())))
5104 continue;
5105
5106 // Add explicit broadcast at the insert point that dominates all users.
5107 VPBasicBlock *HoistBlock = VectorPreheader;
5108 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
5109 for (VPUser *User : VPV->users()) {
5110 if (User->usesScalars(Op: VPV))
5111 continue;
5112 if (cast<VPRecipeBase>(Val: User)->getParent() == VectorPreheader)
5113 HoistPoint = HoistBlock->begin();
5114 else
5115 assert(VPDT.dominates(VectorPreheader,
5116 cast<VPRecipeBase>(User)->getParent()) &&
5117 "All users must be in the vector preheader or dominated by it");
5118 }
5119
5120 VPBuilder Builder(cast<VPBasicBlock>(Val: HoistBlock), HoistPoint);
5121 auto *Broadcast = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: {VPV});
5122 VPV->replaceUsesWithIf(New: Broadcast,
5123 ShouldReplace: [VPV, Broadcast](VPUser &U, unsigned Idx) {
5124 return Broadcast != &U && !U.usesScalars(Op: VPV);
5125 });
5126 }
5127}
5128
5129// Collect common metadata from a group of replicate recipes by intersecting
5130// metadata from all recipes in the group.
5131static VPIRMetadata getCommonMetadata(ArrayRef<VPReplicateRecipe *> Recipes) {
5132 VPIRMetadata CommonMetadata = *Recipes.front();
5133 for (VPReplicateRecipe *Recipe : drop_begin(RangeOrContainer&: Recipes))
5134 CommonMetadata.intersect(MD: *Recipe);
5135 return CommonMetadata;
5136}
5137
5138template <unsigned Opcode>
5139static SmallVector<SmallVector<VPReplicateRecipe *, 4>>
5140collectComplementaryPredicatedMemOps(VPlan &Plan,
5141 PredicatedScalarEvolution &PSE,
5142 const Loop *L) {
5143 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
5144 "Only Load and Store opcodes supported");
5145 [[maybe_unused]] constexpr bool IsLoad = (Opcode == Instruction::Load);
5146
5147 // For each address, collect operations with the same or complementary masks.
5148 SmallVector<SmallVector<VPReplicateRecipe *, 4>> AllGroups;
5149 auto Groups = collectGroupedReplicateMemOps<Opcode>(
5150 Plan, PSE, L,
5151 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
5152 for (auto Recipes : Groups) {
5153 if (Recipes.size() < 2)
5154 continue;
5155
5156 assert(all_equal(
5157 map_range(Recipes, bind_back<getLoadStoreValueType>(IsLoad))) &&
5158 "Expected all recipes in group to have the same load-store type");
5159
5160 // Collect groups with the same or complementary masks.
5161 for (VPReplicateRecipe *&RecipeI : Recipes) {
5162 if (!RecipeI)
5163 continue;
5164
5165 VPValue *MaskI = RecipeI->getMask();
5166 SmallVector<VPReplicateRecipe *, 4> Group;
5167 Group.push_back(Elt: RecipeI);
5168 RecipeI = nullptr;
5169
5170 // Find all operations with the same or complementary masks.
5171 bool HasComplementaryMask = false;
5172 for (VPReplicateRecipe *&RecipeJ : Recipes) {
5173 if (!RecipeJ)
5174 continue;
5175
5176 VPValue *MaskJ = RecipeJ->getMask();
5177 // Check if any operation in the group has a complementary mask with
5178 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
5179 HasComplementaryMask |= match(V: MaskI, P: m_Not(Op0: m_Specific(VPV: MaskJ))) ||
5180 match(V: MaskJ, P: m_Not(Op0: m_Specific(VPV: MaskI)));
5181 Group.push_back(Elt: RecipeJ);
5182 RecipeJ = nullptr;
5183 }
5184
5185 if (HasComplementaryMask) {
5186 assert(Group.size() >= 2 && "must have at least 2 entries");
5187 AllGroups.push_back(Elt: std::move(Group));
5188 }
5189 }
5190 }
5191
5192 return AllGroups;
5193}
5194
5195// Find the recipe with minimum alignment in the group.
5196template <typename InstType>
5197static VPReplicateRecipe *
5198findRecipeWithMinAlign(ArrayRef<VPReplicateRecipe *> Group) {
5199 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
5200 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
5201 cast<InstType>(B->getUnderlyingInstr())->getAlign();
5202 });
5203}
5204
5205void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan,
5206 PredicatedScalarEvolution &PSE,
5207 const Loop *L) {
5208 auto Groups =
5209 collectComplementaryPredicatedMemOps<Instruction::Load>(Plan, PSE, L);
5210 if (Groups.empty())
5211 return;
5212
5213 // Process each group of loads.
5214 for (auto &Group : Groups) {
5215 // Try to use the earliest (most dominating) load to replace all others.
5216 VPReplicateRecipe *EarliestLoad = Group[0];
5217 VPBasicBlock *FirstBB = EarliestLoad->getParent();
5218 VPBasicBlock *LastBB = Group.back()->getParent();
5219
5220 // Check that the load doesn't alias with stores between first and last.
5221 auto LoadLoc = vputils::getMemoryLocation(R: *EarliestLoad);
5222 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(MemLoc: *LoadLoc, FirstBB, LastBB))
5223 continue;
5224
5225 // Collect common metadata from all loads in the group.
5226 VPIRMetadata CommonMetadata = getCommonMetadata(Recipes: Group);
5227
5228 // Find the load with minimum alignment to use.
5229 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
5230
5231 bool IsSingleScalar = EarliestLoad->isSingleScalar();
5232 assert(all_of(Group,
5233 [IsSingleScalar](VPReplicateRecipe *R) {
5234 return R->isSingleScalar() == IsSingleScalar;
5235 }) &&
5236 "all members in group must agree on IsSingleScalar");
5237
5238 // Create an unpredicated version of the earliest load with common
5239 // metadata.
5240 auto *UnpredicatedLoad = new VPReplicateRecipe(
5241 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(N: 0)},
5242 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
5243
5244 UnpredicatedLoad->insertBefore(InsertPos: EarliestLoad);
5245
5246 // Replace all loads in the group with the unpredicated load.
5247 for (VPReplicateRecipe *Load : Group) {
5248 Load->replaceAllUsesWith(New: UnpredicatedLoad);
5249 Load->eraseFromParent();
5250 }
5251 }
5252}
5253
5254static bool
5255canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,
5256 PredicatedScalarEvolution &PSE, const Loop &L) {
5257 auto StoreLoc = vputils::getMemoryLocation(R: *StoresToSink.front());
5258 if (!StoreLoc || !StoreLoc->AATags.Scope)
5259 return false;
5260
5261 // When sinking a group of stores, all members of the group alias each other.
5262 // Skip them during the alias checks.
5263 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
5264 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
5265 SinkStoreInfo SinkInfo(StoresToSink, *StoresToSink[0], PSE, L);
5266 return canHoistOrSinkWithNoAliasCheck(MemLoc: *StoreLoc, FirstBB, LastBB, SinkInfo);
5267}
5268
5269void VPlanTransforms::sinkPredicatedStores(VPlan &Plan,
5270 PredicatedScalarEvolution &PSE,
5271 const Loop *L) {
5272 auto Groups =
5273 collectComplementaryPredicatedMemOps<Instruction::Store>(Plan, PSE, L);
5274 if (Groups.empty())
5275 return;
5276
5277 for (auto &Group : Groups) {
5278 if (!canSinkStoreWithNoAliasCheck(StoresToSink: Group, PSE, L: *L))
5279 continue;
5280
5281 // Use the last (most dominated) store's location for the unconditional
5282 // store.
5283 VPReplicateRecipe *LastStore = Group.back();
5284 VPBasicBlock *InsertBB = LastStore->getParent();
5285
5286 // Collect common alias metadata from all stores in the group.
5287 VPIRMetadata CommonMetadata = getCommonMetadata(Recipes: Group);
5288
5289 // Build select chain for stored values.
5290 VPValue *SelectedValue = Group[0]->getOperand(N: 0);
5291 VPBuilder Builder(InsertBB, LastStore->getIterator());
5292
5293 bool IsSingleScalar = Group[0]->isSingleScalar();
5294 for (unsigned I = 1; I < Group.size(); ++I) {
5295 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
5296 "all members in group must agree on IsSingleScalar");
5297 VPValue *Mask = Group[I]->getMask();
5298 VPValue *Value = Group[I]->getOperand(N: 0);
5299 SelectedValue = Builder.createSelect(Cond: Mask, TrueVal: Value, FalseVal: SelectedValue,
5300 DL: Group[I]->getDebugLoc());
5301 }
5302
5303 // Find the store with minimum alignment to use.
5304 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
5305
5306 // Create unconditional store with selected value and common metadata.
5307 auto *UnpredicatedStore = new VPReplicateRecipe(
5308 StoreWithMinAlign->getUnderlyingInstr(),
5309 {SelectedValue, LastStore->getOperand(N: 1)}, IsSingleScalar,
5310 /*Mask=*/nullptr, *LastStore, CommonMetadata);
5311 UnpredicatedStore->insertBefore(BB&: *InsertBB, IP: LastStore->getIterator());
5312
5313 // Remove all predicated stores from the group.
5314 for (VPReplicateRecipe *Store : Group)
5315 Store->eraseFromParent();
5316 }
5317}
5318
5319void VPlanTransforms::materializeConstantVectorTripCount(
5320 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
5321 PredicatedScalarEvolution &PSE) {
5322 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
5323 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
5324
5325 VPValue *TC = Plan.getTripCount();
5326 if (TC->user_empty())
5327 return;
5328
5329 // Skip cases for which the trip count may be non-trivial to materialize.
5330 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
5331 // tail is required.
5332 if (!Plan.hasScalarTail() ||
5333 Plan.getMiddleBlock()->getSingleSuccessor() ==
5334 Plan.getScalarPreheader() ||
5335 !isa<VPIRValue>(Val: TC))
5336 return;
5337
5338 // Materialize vector trip counts for constants early if it can simply
5339 // be computed as (Original TC / VF * UF) * VF * UF.
5340 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
5341 // tail-folded loops.
5342 ScalarEvolution &SE = *PSE.getSE();
5343 auto *TCScev = SE.getSCEV(V: TC->getLiveInIRValue());
5344 if (!isa<SCEVConstant>(Val: TCScev))
5345 return;
5346 const SCEV *VFxUF = SE.getElementCount(Ty: TCScev->getType(), EC: BestVF * BestUF);
5347 auto VecTCScev = SE.getMulExpr(LHS: SE.getUDivExpr(LHS: TCScev, RHS: VFxUF), RHS: VFxUF);
5348 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(Val: VecTCScev))
5349 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5350}
5351
5352void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
5353 VPBasicBlock *VectorPH) {
5354 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
5355 if (BTC->user_empty())
5356 return;
5357
5358 VPBuilder Builder(VectorPH, VectorPH->begin());
5359 auto *TCTy = Plan.getTripCount()->getScalarType();
5360 auto *TCMO =
5361 Builder.createSub(LHS: Plan.getTripCount(), RHS: Plan.getConstantInt(Ty: TCTy, Val: 1),
5362 DL: DebugLoc::getCompilerGenerated(), Name: "trip.count.minus.1");
5363 BTC->replaceAllUsesWith(New: TCMO);
5364}
5365
5366void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
5367 if (Plan.hasScalarVFOnly())
5368 return;
5369
5370 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5371 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5372 Range: vp_depth_first_shallow(G: Plan.getEntry()));
5373 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5374 Range: vp_depth_first_shallow(G: LoopRegion->getEntry()));
5375 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5376 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5377 // regions. Those are not materialized explicitly yet.
5378 // TODO: materialize build vectors for replicating recipes in replicating
5379 // regions.
5380 for (VPBasicBlock *VPBB :
5381 concat<VPBasicBlock *>(Ranges&: VPBBsOutsideLoopRegion, Ranges&: VPBBsInsideLoopRegion)) {
5382 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
5383 if (!isa<VPScalarIVStepsRecipe, VPReplicateRecipe, VPInstruction>(Val: &R))
5384 continue;
5385 auto *DefR = cast<VPSingleDefRecipe>(Val: &R);
5386 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5387 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(Val: U)->getRegion();
5388 return !U->usesScalars(Op: DefR) || ParentRegion != LoopRegion;
5389 };
5390 if ((isa<VPReplicateRecipe>(Val: DefR) &&
5391 cast<VPReplicateRecipe>(Val: DefR)->isSingleScalar()) ||
5392 (isa<VPInstruction>(Val: DefR) &&
5393 (vputils::onlyFirstLaneUsed(Def: DefR) ||
5394 !cast<VPInstruction>(Val: DefR)->doesGeneratePerAllLanes())) ||
5395 none_of(Range: DefR->users(), P: UsesVectorOrInsideReplicateRegion))
5396 continue;
5397
5398 Type *ScalarTy = DefR->getScalarType();
5399 unsigned Opcode = ScalarTy->isStructTy()
5400 ? VPInstruction::BuildStructVector
5401 : VPInstruction::BuildVector;
5402 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5403 BuildVector->insertAfter(InsertPos: DefR);
5404
5405 DefR->replaceUsesWithIf(
5406 New: BuildVector, ShouldReplace: [BuildVector, &UsesVectorOrInsideReplicateRegion](
5407 VPUser &U, unsigned) {
5408 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5409 });
5410 }
5411 }
5412
5413 // Create explicit VPInstructions to convert vectors to scalars. The current
5414 // implementation is conservative - it may miss some cases that may or may not
5415 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5416 // if they are known to operate on scalar values.
5417 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5418 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
5419 if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe,
5420 VPDerivedIVRecipe>(Val: &R))
5421 continue;
5422 for (VPValue *Def : R.definedValues()) {
5423 // Skip recipes that are single-scalar.
5424 // TODO: The Defs skipped here may or may not be vector values.
5425 // Introduce Unpacks, and remove them later, if they are guaranteed to
5426 // produce scalar values.
5427 if (vputils::isSingleScalar(VPV: Def))
5428 continue;
5429
5430 // Only introduce an Unpack if some, but not all, users use the first
5431 // lane only.
5432 unsigned NumFirstLaneUsers = count_if(Range: Def->users(), P: [&Def](VPUser *U) {
5433 return U->usesFirstLaneOnly(Op: Def);
5434 });
5435 if (!NumFirstLaneUsers || NumFirstLaneUsers == Def->getNumUsers())
5436 continue;
5437
5438 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
5439 if (R.isPhi())
5440 Unpack->insertBefore(BB&: *VPBB, IP: VPBB->getFirstNonPhi());
5441 else
5442 Unpack->insertAfter(InsertPos: &R);
5443 Def->replaceUsesWithIf(New: Unpack, ShouldReplace: [&Def](VPUser &U, unsigned) {
5444 return U.usesFirstLaneOnly(Op: Def);
5445 });
5446 }
5447 }
5448 }
5449}
5450
5451void VPlanTransforms::materializeVectorTripCount(
5452 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
5453 bool RequiresScalarEpilogue, VPValue *Step,
5454 std::optional<uint64_t> MaxRuntimeStep) {
5455 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5456 // There's nothing to do if there are no users of the vector trip count or its
5457 // IR value has already been set.
5458 if (VectorTC.user_empty() || VectorTC.getUnderlyingValue())
5459 return;
5460
5461 VPValue *TC = Plan.getTripCount();
5462 Type *TCTy = TC->getScalarType();
5463 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5464 if (auto *StepR = Step->getDefiningRecipe()) {
5465 assert(VPDominatorTree(Plan).dominates(StepR->getParent(), VectorPHVPBB) &&
5466 "Step VPBB must dominate VectorPHVPBB");
5467 // Insert after Step's definition to maintain valid def-use ordering.
5468 InsertPt = std::next(x: StepR->getIterator());
5469 }
5470 VPBuilder Builder(VectorPHVPBB, InsertPt);
5471
5472 // For scalable steps, if TC is a constant and is divisible by the maximum
5473 // possible runtime step, then TC % Step == 0 for all valid vscale values
5474 // and the vector trip count equals TC directly.
5475 const APInt *TCVal;
5476 if (!RequiresScalarEpilogue && match(V: TC, P: m_APInt(C&: TCVal)) && MaxRuntimeStep &&
5477 TCVal->urem(RHS: *MaxRuntimeStep) == 0) {
5478 VectorTC.replaceAllUsesWith(New: TC);
5479 return;
5480 }
5481
5482 // If the tail is to be folded by masking, round the number of iterations N
5483 // up to a multiple of Step instead of rounding down. This is done by first
5484 // adding Step-1 and then rounding down. Note that it's ok if this addition
5485 // overflows: the vector induction variable will eventually wrap to zero given
5486 // that it starts at zero and its Step is a power of two; the loop will then
5487 // exit, with the last early-exit vector comparison also producing all-true.
5488 if (TailByMasking) {
5489 TC = Builder.createAdd(
5490 LHS: TC, RHS: Builder.createSub(LHS: Step, RHS: Plan.getConstantInt(Ty: TCTy, Val: 1)),
5491 DL: DebugLoc::getCompilerGenerated(), Name: "n.rnd.up");
5492 }
5493
5494 // Now we need to generate the expression for the part of the loop that the
5495 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5496 // iterations are not required for correctness, or N - Step, otherwise. Step
5497 // is equal to the vectorization factor (number of SIMD elements) times the
5498 // unroll factor (number of SIMD instructions).
5499 VPValue *R =
5500 Builder.createNaryOp(Opcode: Instruction::URem, Operands: {TC, Step},
5501 DL: DebugLoc::getCompilerGenerated(), Name: "n.mod.vf");
5502
5503 // There are cases where we *must* run at least one iteration in the remainder
5504 // loop. See the cost model for when this can happen. If the step evenly
5505 // divides the trip count, we set the remainder to be equal to the step. If
5506 // the step does not evenly divide the trip count, no adjustment is necessary
5507 // since there will already be scalar iterations. Note that the minimum
5508 // iterations check ensures that N >= Step.
5509 if (RequiresScalarEpilogue) {
5510 assert(!TailByMasking &&
5511 "requiring scalar epilogue is not supported with fail folding");
5512 VPValue *IsZero =
5513 Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: R, B: Plan.getZero(Ty: TCTy));
5514 R = Builder.createSelect(Cond: IsZero, TrueVal: Step, FalseVal: R);
5515 }
5516
5517 VPValue *Res =
5518 Builder.createSub(LHS: TC, RHS: R, DL: DebugLoc::getCompilerGenerated(), Name: "n.vec");
5519 VectorTC.replaceAllUsesWith(New: Res);
5520}
5521
5522void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
5523 ElementCount VFEC) {
5524 // If VF and VFxUF have already been materialized (no remaining users),
5525 // there's nothing more to do.
5526 if (Plan.getVF().isMaterialized()) {
5527 assert(Plan.getVFxUF().isMaterialized() &&
5528 "VF and VFxUF must be materialized together");
5529 return;
5530 }
5531
5532 VPBuilder Builder(VectorPH, VectorPH->begin());
5533 Type *TCTy = Plan.getTripCount()->getScalarType();
5534 VPValue &VF = Plan.getVF();
5535 VPValue &VFxUF = Plan.getVFxUF();
5536 // If there are no users of the runtime VF, compute VFxUF by constant folding
5537 // the multiplication of VF and UF.
5538 if (VF.user_empty()) {
5539 VPValue *RuntimeVFxUF =
5540 Builder.createElementCount(Ty: TCTy, EC: VFEC * Plan.getConcreteUF());
5541 VFxUF.replaceAllUsesWith(New: RuntimeVFxUF);
5542 return;
5543 }
5544
5545 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5546 // vscale) * UF.
5547 VPValue *RuntimeVF = Builder.createElementCount(Ty: TCTy, EC: VFEC);
5548 if (!vputils::onlyScalarValuesUsed(Def: &VF)) {
5549 VPValue *BC = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: RuntimeVF);
5550 VF.replaceUsesWithIf(
5551 New: BC, ShouldReplace: [&VF](VPUser &U, unsigned) { return !U.usesScalars(Op: &VF); });
5552 }
5553 VF.replaceAllUsesWith(New: RuntimeVF);
5554
5555 VPValue *MulByUF = Builder.createOverflowingOp(
5556 Opcode: Instruction::Mul,
5557 Operands: {RuntimeVF, Plan.getConstantInt(Ty: TCTy, Val: Plan.getConcreteUF())},
5558 WrapFlags: {true, false});
5559 VFxUF.replaceAllUsesWith(New: MulByUF);
5560}
5561
5562void VPlanTransforms::attachAliasMaskToHeaderMask(VPlan &Plan) {
5563 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
5564 auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
5565 Type *I1Ty = IntegerType::getInt1Ty(C&: Plan.getContext());
5566
5567 VPBuilder Builder(Plan.getVectorPreheader());
5568 auto *AliasMask = Builder.createNaryOp(
5569 Opcode: VPInstruction::IncomingAliasMask, Operands: {}, Inst: nullptr, Flags: {}, MD: {},
5570 DL: DebugLoc::getUnknown(), Name: "incoming.alias.mask", ResultTy: I1Ty);
5571
5572 if (HeaderMaskDef->isPhi())
5573 Builder = VPBuilder(&*HeaderMaskDef->getParent()->getFirstNonPhi());
5574 else
5575 Builder = VPBuilder::getToInsertAfter(R: HeaderMaskDef);
5576
5577 // Update all existing users of the header mask to "HeaderMask & AliasMask".
5578 auto *ClampedHeaderMask = Builder.createAnd(LHS: HeaderMask, RHS: AliasMask);
5579 HeaderMask->replaceUsesWithIf(New: ClampedHeaderMask, ShouldReplace: [&](VPUser &U, unsigned) {
5580 return &U != ClampedHeaderMask;
5581 });
5582}
5583
5584VPValue *
5585VPlanTransforms::materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheckVPBB,
5586 ArrayRef<PointerDiffInfo> DiffChecks) {
5587 VPBuilder Builder(AliasCheckVPBB);
5588 Type *I1Ty = IntegerType::getInt1Ty(C&: Plan.getContext());
5589
5590 VPValue *IncomingAliasMask = vputils::findIncomingAliasMask(Plan);
5591 assert(IncomingAliasMask && "Expected an alias mask!");
5592
5593 VPValue *AliasMask = nullptr;
5594 for (const PointerDiffInfo &Check : DiffChecks) {
5595 VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: Check.SrcStart);
5596 VPValue *Sink =
5597 vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: Check.SinkStart);
5598 Type *AddrType = Src->getScalarType();
5599
5600 // TODO: Only freeze the required pointer (not both src and sink).
5601 if (Check.NeedsFreeze) {
5602 Src = Builder.createScalarFreeze(Op: Src, ResultTy: AddrType, DL: DebugLoc::getUnknown());
5603 Sink = Builder.createScalarFreeze(Op: Sink, ResultTy: AddrType, DL: DebugLoc::getUnknown());
5604 }
5605
5606 // TODO: Generate loop_dependence_raw_mask when there's a read-after-write
5607 // dependency between the source and the sink. This is not necessary for
5608 // correctness of the mask, but using the "raw" variant prevents loads
5609 // depending on the completion of stores.
5610 VPWidenIntrinsicRecipe *WARMask = Builder.insert(R: new VPWidenIntrinsicRecipe(
5611 Intrinsic::loop_dependence_war_mask,
5612 {Src, Sink, Plan.getConstantInt(Ty: AddrType, Val: Check.AccessSize)}, I1Ty));
5613
5614 if (AliasMask)
5615 AliasMask = Builder.createAnd(LHS: AliasMask, RHS: WARMask);
5616 else
5617 AliasMask = WARMask;
5618 }
5619
5620 Type *IVTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
5621 Type *IndexTy = Plan.getDataLayout().getIndexType(C&: Plan.getContext(), AddressSpace: 0);
5622 VPValue *NumActive = Builder.createNaryOp(
5623 Opcode: VPInstruction::NumActiveLanes, Operands: {AliasMask}, Inst: nullptr, Flags: {}, MD: {},
5624 DL: DebugLoc::getUnknown(), Name: "num.active.lanes", ResultTy: IndexTy);
5625 VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
5626 Op: NumActive, ResultTy: IVTy, SrcTy: IndexTy, DL: DebugLoc::getCompilerGenerated());
5627
5628 IncomingAliasMask->replaceAllUsesWith(New: AliasMask);
5629
5630 return ClampedVF;
5631}
5632
5633void VPlanTransforms::materializeAliasMaskCheckBlock(
5634 VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
5635 VPBasicBlock *ClampedVFCheck =
5636 Plan.createVPBasicBlock(Name: "vector.clamped.vf.check");
5637
5638 VPValue *ClampedVF = materializeAliasMask(Plan, AliasCheckVPBB: ClampedVFCheck, DiffChecks);
5639 VPBuilder Builder(ClampedVFCheck);
5640 DebugLoc DL = DebugLoc::getCompilerGenerated();
5641 Type *TCTy = Plan.getTripCount()->getScalarType();
5642
5643 // Check the "ClampedVF" from the alias mask is larger than one.
5644 VPValue *IsScalar =
5645 Builder.createICmp(Pred: CmpInst::ICMP_ULE, A: ClampedVF,
5646 B: Plan.getConstantInt(Ty: TCTy, Val: 1), DL, Name: "vf.is.scalar");
5647
5648 VPValue *TripCount = Plan.getTripCount();
5649 VPValue *MaxUIntTripCount =
5650 Plan.getConstantInt(Val: cast<IntegerType>(Val: TCTy)->getMask());
5651 VPValue *DistanceToMax = Builder.createSub(LHS: MaxUIntTripCount, RHS: TripCount);
5652
5653 // For tail-folding: Don't execute the vector loop if (UMax - n) < ClampedVF.
5654 // Note: The ClampedVF may not be a power-of-two. This means the loop exit
5655 // condition (index.next == n.vec) may not be correct in the case of an
5656 // overflow. The issue is `n.vec` could be zero due to an overflow, but
5657 // index.next is not guaranteed to overflow to zero as the ClampedVF is not a
5658 // power-of-two).
5659 VPValue *TripCountCheck = Builder.createICmp(
5660 Pred: ICmpInst::ICMP_ULT, A: DistanceToMax, B: ClampedVF, DL, Name: "vf.step.overflow");
5661
5662 VPValue *Cond = Builder.createOr(LHS: IsScalar, RHS: TripCountCheck, DL);
5663 attachVPCheckBlock(Plan, Cond, CheckBlock: ClampedVFCheck, AddBranchWeights: HasBranchWeights);
5664
5665 // Materialize the trip count early as this will add a use of (VFxUF) that
5666 // needs to be replaced with the ClampedVF.
5667 materializeVectorTripCount(Plan, VectorPHVPBB: Plan.getVectorPreheader(),
5668 /*TailByMasking=*/true,
5669 /*RequiresScalarEpilogue=*/false,
5670 Step: &Plan.getVFxUF());
5671
5672 assert(Plan.getConcreteUF() == 1 &&
5673 "Clamped VF not supported with interleaving");
5674 Plan.getVF().replaceAllUsesWith(New: ClampedVF);
5675 Plan.getVFxUF().replaceAllUsesWith(New: ClampedVF);
5676}
5677
5678void VPlanTransforms::expandSCEVsToVPInstructions(VPlan &Plan,
5679 ScalarEvolution &SE) {
5680 auto *Entry = Plan.getEntry();
5681 VPBuilder Builder(Entry, Entry->begin());
5682 DebugLoc DL = cast<VPIRBasicBlock>(Val: Entry)
5683 ->getIRBasicBlock()
5684 ->getTerminator()
5685 ->getDebugLoc();
5686 VPSCEVExpander Expander(Builder, SE, DL);
5687
5688 // Expand VPExpandSCEVRecipes to VPInstructions using VPSCEVExpander. During
5689 // the transition, unsupported VPExpandSCEVRecipes are skipped and left for
5690 // late expansion.
5691 for (VPRecipeBase &R : make_early_inc_range(Range&: *Entry)) {
5692 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
5693 if (!ExpSCEV || ExpSCEV->user_empty())
5694 continue;
5695 Builder.setInsertPoint(ExpSCEV);
5696 VPValue *Expanded = Expander.tryToExpand(S: ExpSCEV->getSCEV());
5697 if (!Expanded)
5698 continue;
5699 ExpSCEV->replaceAllUsesWith(New: Expanded);
5700 // TripCount should not be used after expansion to VPInstructions. Reset to
5701 // poison to avoid dangling references.
5702 if (Plan.getTripCount() == ExpSCEV)
5703 Plan.resetTripCount(NewTripCount: Plan.getPoison(Ty: ExpSCEV->getScalarType()));
5704 ExpSCEV->eraseFromParent();
5705 }
5706}
5707
5708DenseMap<const SCEV *, Value *>
5709VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
5710 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5711
5712 auto *Entry = cast<VPIRBasicBlock>(Val: Plan.getEntry());
5713 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5714 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5715 // Expand remaining VPExpandSCEVRecipes to IR instructions using SCEVExpander.
5716 for (VPRecipeBase &R : make_early_inc_range(Range&: *Entry)) {
5717 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
5718 if (!ExpSCEV)
5719 continue;
5720 const SCEV *Expr = ExpSCEV->getSCEV();
5721 Value *Res =
5722 Expander.expandCodeFor(SH: Expr, Ty: Expr->getType(), I: EntryBB->getTerminator());
5723 ExpandedSCEVs[Expr] = Res;
5724 VPValue *Exp = Plan.getOrAddLiveIn(V: Res);
5725 ExpSCEV->replaceAllUsesWith(New: Exp);
5726 if (Plan.getTripCount() == ExpSCEV)
5727 Plan.resetTripCount(NewTripCount: Exp);
5728 ExpSCEV->eraseFromParent();
5729 }
5730 assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&
5731 "all VPExpandSCEVRecipes must have been expanded");
5732 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5733 // to the VPIRBasicBlock.
5734 auto EI = Entry->begin();
5735 for (Instruction &I : drop_end(RangeOrContainer&: *EntryBB)) {
5736 if (EI != Entry->end() && isa<VPIRInstruction>(Val: *EI) &&
5737 &cast<VPIRInstruction>(Val: &*EI)->getInstruction() == &I) {
5738 EI++;
5739 continue;
5740 }
5741 VPIRInstruction::create(I)->insertBefore(BB&: *Entry, IP: EI);
5742 }
5743
5744 return ExpandedSCEVs;
5745}
5746
5747/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5748/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5749/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5750/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5751/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5752/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5753/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5754/// is defined at \p Idx of a load interleave group.
5755/// A live-in or recipe defined outside the loop region can be converted, if it
5756/// is the same across all lanes, or we can create a BuildVector for it.
5757static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5758 VPValue *OpV, unsigned Idx, bool IsScalable) {
5759 VPValue *Member0Op = WideMember0->getOperand(N: OpIdx);
5760 if (Member0Op->isDefinedOutsideLoopRegions()) {
5761 // Operand matches Member0, broadcast across all fields for both live-ins
5762 // and recipes.
5763 if (Member0Op == OpV)
5764 return true;
5765 // Otherwise distinct per-field VPValues are assembled into a BuildVector.
5766 return !IsScalable && OpV->isDefinedOutsideLoopRegions() &&
5767 OpV->getScalarType() == Member0Op->getScalarType();
5768 }
5769 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5770 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Val: Member0OpR))
5771 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5772 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5773 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5774 Member0Op == OpV;
5775 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Val: Member0OpR))
5776 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(I: Idx) == OpV;
5777 return false;
5778}
5779
5780static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5781 SmallVector<VPValue *> Ops0;
5782 auto *WideMember0 = dyn_cast<VPRecipeWithIRFlags>(Val: Ops[0]);
5783 if (!WideMember0)
5784 return false;
5785 for (VPValue *V : Ops) {
5786 if (!isa<VPWidenRecipe, VPWidenCastRecipe>(Val: V))
5787 return false;
5788 auto *R = cast<VPRecipeWithIRFlags>(Val: V);
5789 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(R: WideMember0))
5790 return false;
5791 if (R->getScalarType() != WideMember0->getScalarType())
5792 return false;
5793 if (R->hasPredicate() && R->getPredicate() != WideMember0->getPredicate())
5794 return false;
5795 }
5796
5797 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5798 SmallVector<VPValue *> OpsI;
5799 for (VPValue *Op : Ops)
5800 OpsI.push_back(Elt: Op->getDefiningRecipe()->getOperand(N: Idx));
5801
5802 if (canNarrowOps(Ops: OpsI, IsScalable))
5803 continue;
5804
5805 if (any_of(Range: enumerate(First&: OpsI), P: [WideMember0, Idx, IsScalable](const auto &P) {
5806 const auto &[OpIdx, OpV] = P;
5807 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5808 }))
5809 return false;
5810 }
5811
5812 return true;
5813}
5814
5815/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5816/// number of members both equal to VF. The interleave group must also access
5817/// the full vector width.
5818static std::optional<ElementCount>
5819isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
5820 ArrayRef<ElementCount> VFs,
5821 const TargetTransformInfo &TTI) {
5822 if (!InterleaveR || InterleaveR->getMask())
5823 return std::nullopt;
5824
5825 Type *GroupElementTy = nullptr;
5826 if (InterleaveR->getStoredValues().empty()) {
5827 GroupElementTy = InterleaveR->getVPValue(I: 0)->getScalarType();
5828 if (!all_of(Range: InterleaveR->definedValues(), P: [GroupElementTy](VPValue *Op) {
5829 return Op->getScalarType() == GroupElementTy;
5830 }))
5831 return std::nullopt;
5832 } else {
5833 GroupElementTy = InterleaveR->getStoredValues()[0]->getScalarType();
5834 if (!all_of(Range: InterleaveR->getStoredValues(), P: [GroupElementTy](VPValue *Op) {
5835 return Op->getScalarType() == GroupElementTy;
5836 }))
5837 return std::nullopt;
5838 }
5839
5840 auto IG = InterleaveR->getInterleaveGroup();
5841 if (IG->getFactor() != IG->getNumMembers())
5842 return std::nullopt;
5843
5844 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5845 TypeSize Size = TTI.getRegisterBitWidth(
5846 K: VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
5847 : TargetTransformInfo::RGK_ScalableVector);
5848 assert(Size.isScalable() == VF.isScalable() &&
5849 "if Size is scalable, VF must be scalable and vice versa");
5850 return Size.getKnownMinValue();
5851 };
5852
5853 for (ElementCount VF : VFs) {
5854 unsigned MinVal = VF.getKnownMinValue();
5855 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5856 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5857 return {VF};
5858 }
5859 return std::nullopt;
5860}
5861
5862/// Returns true if \p VPValue is a narrow VPValue.
5863static bool isAlreadyNarrow(VPValue *VPV) {
5864 if (isa<VPIRValue>(Val: VPV))
5865 return true;
5866 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: VPV);
5867 return RepR && RepR->isSingleScalar();
5868}
5869
5870// Convert the wide recipes defining the VPValues in \p Members feeding an
5871// interleave group to a single narrow variant. The first member is reused as
5872// the narrowed recipe. BuildVectors for live-in operands are inserted into \p
5873// Preheader.
5874static VPValue *narrowInterleaveGroupOp(ArrayRef<VPValue *> Members,
5875 SmallPtrSetImpl<VPValue *> &NarrowedOps,
5876 VPBasicBlock *Preheader) {
5877 VPValue *V = Members.front();
5878 if (NarrowedOps.contains(Ptr: V))
5879 return V;
5880
5881 if (V->isDefinedOutsideLoopRegions()) {
5882 assert(all_of(Members,
5883 [V](VPValue *M) {
5884 return M->isDefinedOutsideLoopRegions() &&
5885 M->getScalarType() == V->getScalarType();
5886 }) &&
5887 "expected distinct loop-invariant values of matching scalar type");
5888 auto *BV = new VPInstruction(VPInstruction::BuildVector, Members);
5889 Preheader->appendRecipe(Recipe: BV);
5890 NarrowedOps.insert(Ptr: BV);
5891 return BV;
5892 }
5893
5894 if (isAlreadyNarrow(VPV: V))
5895 return V;
5896
5897 VPRecipeBase *R = V->getDefiningRecipe();
5898 if (isa<VPWidenRecipe, VPWidenCastRecipe>(Val: R)) {
5899 auto *WideMember0 = cast<VPRecipeWithIRFlags>(Val: R);
5900 for (VPValue *Member : Members.drop_front())
5901 WideMember0->intersectFlags(Other: *cast<VPRecipeWithIRFlags>(Val: Member));
5902 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) {
5903 SmallVector<VPValue *> OpsI;
5904 for (VPValue *Member : Members)
5905 OpsI.push_back(Elt: Member->getDefiningRecipe()->getOperand(N: Idx));
5906 WideMember0->setOperand(
5907 I: Idx, New: narrowInterleaveGroupOp(Members: OpsI, NarrowedOps, Preheader));
5908 }
5909 return V;
5910 }
5911
5912 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(Val: R)) {
5913 // Narrow interleave group to wide load, as transformed VPlan will only
5914 // process one original iteration.
5915 auto *LI = cast<LoadInst>(Val: LoadGroup->getInterleaveGroup()->getInsertPos());
5916 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5917 LoadGroup->getMask(), /*Consecutive=*/true,
5918 *LoadGroup, LoadGroup->getDebugLoc());
5919 L->insertBefore(InsertPos: LoadGroup);
5920 NarrowedOps.insert(Ptr: L);
5921 return L;
5922 }
5923
5924 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: R)) {
5925 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5926 "must be a single scalar load");
5927 NarrowedOps.insert(Ptr: RepR);
5928 return RepR;
5929 }
5930
5931 auto *WideLoad = cast<VPWidenLoadRecipe>(Val: R);
5932 VPValue *PtrOp = WideLoad->getAddr();
5933 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Val: PtrOp))
5934 PtrOp = VecPtr->getOperand(N: 0);
5935 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5936 // process one original iteration.
5937 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5938 /*IsUniform*/ true,
5939 /*Mask*/ nullptr, {}, *WideLoad);
5940 N->insertBefore(InsertPos: WideLoad);
5941 NarrowedOps.insert(Ptr: N);
5942 return N;
5943}
5944
5945std::unique_ptr<VPlan>
5946VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
5947 const TargetTransformInfo &TTI) {
5948 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5949
5950 if (!VectorLoop)
5951 return nullptr;
5952
5953 // Only handle single-block loops for now.
5954 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5955 return nullptr;
5956
5957 // Skip plans when we may not be able to properly narrow.
5958 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5959 if (!match(V: &Exiting->back(), P: m_BranchOnCount()))
5960 return nullptr;
5961
5962 assert(match(&Exiting->back(),
5963 m_BranchOnCount(m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())),
5964 m_Specific(&Plan.getVectorTripCount()))) &&
5965 "unexpected branch-on-count");
5966
5967 SmallVector<VPInterleaveRecipe *> StoreGroups;
5968 std::optional<ElementCount> VFToOptimize;
5969 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5970 if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(Val: &R) &&
5971 vputils::onlyFirstLaneUsed(Def: cast<VPSingleDefRecipe>(Val: &R)))
5972 continue;
5973
5974 // Bail out on recipes not supported at the moment:
5975 // * phi recipes other than the canonical induction
5976 // * recipes writing to memory except interleave groups
5977 // Only support plans with a canonical induction phi.
5978 if (R.isPhi())
5979 return nullptr;
5980
5981 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R);
5982 if (R.mayWriteToMemory() && !InterleaveR)
5983 return nullptr;
5984
5985 // Bail out if any recipe defines a vector value used outside the
5986 // vector loop region.
5987 if (any_of(Range: R.definedValues(), P: [&](VPValue *V) {
5988 return any_of(Range: V->users(), P: [&](VPUser *U) {
5989 auto *UR = cast<VPRecipeBase>(Val: U);
5990 return UR->getParent()->getParent() != VectorLoop;
5991 });
5992 }))
5993 return nullptr;
5994
5995 // All other ops are allowed, but we reject uses that cannot be converted
5996 // when checking all allowed consumers (store interleave groups) below.
5997 if (!InterleaveR)
5998 continue;
5999
6000 // Try to find a single VF, where all interleave groups are consecutive and
6001 // saturate the full vector width. If we already have a candidate VF, check
6002 // if it is applicable for the current InterleaveR, otherwise look for a
6003 // suitable VF across the Plan's VFs.
6004 SmallVector<ElementCount> VFs =
6005 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
6006 : to_vector(Range: Plan.vectorFactors());
6007 std::optional<ElementCount> NarrowedVF =
6008 isConsecutiveInterleaveGroup(InterleaveR, VFs, TTI);
6009 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
6010 return nullptr;
6011 VFToOptimize = NarrowedVF;
6012
6013 // Skip read interleave groups.
6014 if (InterleaveR->getStoredValues().empty())
6015 continue;
6016
6017 // Narrow interleave groups, if all operands are already matching narrow
6018 // ops.
6019 auto *Member0 = InterleaveR->getStoredValues()[0];
6020 if (isAlreadyNarrow(VPV: Member0) &&
6021 all_of(Range: InterleaveR->getStoredValues(), P: equal_to(Arg&: Member0))) {
6022 StoreGroups.push_back(Elt: InterleaveR);
6023 continue;
6024 }
6025
6026 // For now, we only support full interleave groups storing load interleave
6027 // groups.
6028 if (all_of(Range: enumerate(First: InterleaveR->getStoredValues()), P: [](auto Op) {
6029 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
6030 if (!DefR)
6031 return false;
6032 auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR);
6033 return IR && IR->getInterleaveGroup()->isFull() &&
6034 IR->getVPValue(Op.index()) == Op.value();
6035 })) {
6036 StoreGroups.push_back(Elt: InterleaveR);
6037 continue;
6038 }
6039
6040 // Check if all values feeding InterleaveR are matching wide recipes, which
6041 // operands that can be narrowed.
6042 if (!canNarrowOps(Ops: InterleaveR->getStoredValues(),
6043 IsScalable: VFToOptimize->isScalable()))
6044 return nullptr;
6045 StoreGroups.push_back(Elt: InterleaveR);
6046 }
6047
6048 if (StoreGroups.empty())
6049 return nullptr;
6050
6051 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6052 bool RequiresScalarEpilogue =
6053 MiddleVPBB->getNumSuccessors() == 1 &&
6054 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
6055 // Bail out for tail-folding (middle block with a single successor to exit).
6056 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
6057 return nullptr;
6058
6059 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
6060 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
6061 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
6062 // TODO: Handle cases where only some interleave groups can be narrowed.
6063 std::unique_ptr<VPlan> NewPlan;
6064 if (size(Range: Plan.vectorFactors()) != 1) {
6065 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
6066 Plan.setVF(*VFToOptimize);
6067 NewPlan->removeVF(VF: *VFToOptimize);
6068 }
6069
6070 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
6071 SmallPtrSet<VPValue *, 4> NarrowedOps;
6072 VPBasicBlock *Preheader = Plan.getVectorPreheader();
6073 // Narrow operation tree rooted at store groups.
6074 for (auto *StoreGroup : StoreGroups) {
6075 VPValue *Res = narrowInterleaveGroupOp(Members: StoreGroup->getStoredValues(),
6076 NarrowedOps, Preheader);
6077 auto *SI =
6078 cast<StoreInst>(Val: StoreGroup->getInterleaveGroup()->getInsertPos());
6079 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
6080 /*Consecutive=*/true, *StoreGroup,
6081 StoreGroup->getDebugLoc());
6082 S->insertBefore(InsertPos: StoreGroup);
6083 StoreGroup->eraseFromParent();
6084 }
6085
6086 // Adjust induction to reflect that the transformed plan only processes one
6087 // original iteration.
6088 VPInstruction *CanIVInc = vputils::findCanonicalIVIncrement(Plan);
6089 Type *CanIVTy = VectorLoop->getCanonicalIVType();
6090 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
6091 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
6092
6093 VPValue *UF = &Plan.getUF();
6094 VPValue *Step;
6095 if (VFToOptimize->isScalable()) {
6096 VPValue *VScale =
6097 PHBuilder.createElementCount(Ty: CanIVTy, EC: ElementCount::getScalable(MinVal: 1));
6098 Step = PHBuilder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {VScale, UF},
6099 WrapFlags: {true, false});
6100 Plan.getVF().replaceAllUsesWith(New: VScale);
6101 } else {
6102 Step = UF;
6103 Plan.getVF().replaceAllUsesWith(New: Plan.getConstantInt(Ty: CanIVTy, Val: 1));
6104 }
6105 // Materialize vector trip count with the narrowed step.
6106 materializeVectorTripCount(Plan, VectorPHVPBB: VectorPH, /*TailByMasking=*/false,
6107 RequiresScalarEpilogue, Step);
6108
6109 CanIVInc->setOperand(I: 1, New: Step);
6110 Plan.getVFxUF().replaceAllUsesWith(New: Step);
6111
6112 removeDeadRecipes(Plan);
6113 assert(none_of(*VectorLoop->getEntryBasicBlock(),
6114 IsaPred<VPVectorPointerRecipe>) &&
6115 "All VPVectorPointerRecipes should have been removed");
6116 return NewPlan;
6117}
6118
6119/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
6120/// BranchOnCond recipe.
6121void VPlanTransforms::addBranchWeightToMiddleTerminator(
6122 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
6123 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6124 auto *MiddleTerm =
6125 dyn_cast_or_null<VPInstruction>(Val: MiddleVPBB->getTerminator());
6126 // Only add branch metadata if there is a (conditional) terminator.
6127 if (!MiddleTerm)
6128 return;
6129
6130 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
6131 "must have a BranchOnCond");
6132 // Assume that `TripCount % VectorStep ` is equally distributed.
6133 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
6134 if (VF.isScalable() && VScaleForTuning.has_value())
6135 VectorStep *= *VScaleForTuning;
6136 assert(VectorStep > 0 && "trip count should not be zero");
6137 MDBuilder MDB(Plan.getContext());
6138 MDNode *BranchWeights =
6139 MDB.createBranchWeights(Weights: {1, VectorStep - 1}, /*IsExpected=*/false);
6140 MiddleTerm->setMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
6141}
6142
6143void VPlanTransforms::adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan,
6144 VFRange &Range) {
6145 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
6146 auto *MiddleVPBB = Plan.getMiddleBlock();
6147 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6148
6149 auto IsScalableOne = [](ElementCount VF) -> bool {
6150 return VF == ElementCount::getScalable(MinVal: 1);
6151 };
6152
6153 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
6154 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhi);
6155 if (!FOR)
6156 continue;
6157
6158 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
6159 "Cannot handle loops with uncountable early exits");
6160
6161 // Find the existing splice for this FOR, created in
6162 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
6163 // RecurSplice there; only RecurSplice itself still references FOR.
6164 auto *RecurSplice =
6165 findUserOf<VPInstruction::FirstOrderRecurrenceSplice>(V: FOR);
6166 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
6167
6168 // For VF vscale x 1, if vscale = 1, we are unable to extract the
6169 // penultimate value of the recurrence. Instead we rely on the existing
6170 // extract of the last element from the result of
6171 // VPInstruction::FirstOrderRecurrenceSplice.
6172 // TODO: Consider vscale_range info and UF.
6173 if (any_of(Range: RecurSplice->users(),
6174 P: [](VPUser *U) { return !cast<VPRecipeBase>(Val: U)->getRegion(); }) &&
6175 LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsScalableOne,
6176 Range))
6177 return;
6178
6179 // This is the second phase of vectorizing first-order recurrences, creating
6180 // extracts for users outside the loop. An overview of the transformation is
6181 // described below. Suppose we have the following loop with some use after
6182 // the loop of the last a[i-1],
6183 //
6184 // for (int i = 0; i < n; ++i) {
6185 // t = a[i - 1];
6186 // b[i] = a[i] - t;
6187 // }
6188 // use t;
6189 //
6190 // There is a first-order recurrence on "a". For this loop, the shorthand
6191 // scalar IR looks like:
6192 //
6193 // scalar.ph:
6194 // s.init = a[-1]
6195 // br scalar.body
6196 //
6197 // scalar.body:
6198 // i = phi [0, scalar.ph], [i+1, scalar.body]
6199 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
6200 // s2 = a[i]
6201 // b[i] = s2 - s1
6202 // br cond, scalar.body, exit.block
6203 //
6204 // exit.block:
6205 // use = lcssa.phi [s1, scalar.body]
6206 //
6207 // In this example, s1 is a recurrence because it's value depends on the
6208 // previous iteration. In the first phase of vectorization, we created a
6209 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
6210 // for users in the scalar preheader and exit block.
6211 //
6212 // vector.ph:
6213 // v_init = vector(..., ..., ..., a[-1])
6214 // br vector.body
6215 //
6216 // vector.body
6217 // i = phi [0, vector.ph], [i+4, vector.body]
6218 // v1 = phi [v_init, vector.ph], [v2, vector.body]
6219 // v2 = a[i, i+1, i+2, i+3]
6220 // v1' = splice(v1(3), v2(0, 1, 2))
6221 // b[i, i+1, i+2, i+3] = v2 - v1'
6222 // br cond, vector.body, middle.block
6223 //
6224 // middle.block:
6225 // vector.recur.extract.for.phi = v2(2)
6226 // vector.recur.extract = v2(3)
6227 // br cond, scalar.ph, exit.block
6228 //
6229 // scalar.ph:
6230 // scalar.recur.init = phi [vector.recur.extract, middle.block],
6231 // [s.init, otherwise]
6232 // br scalar.body
6233 //
6234 // scalar.body:
6235 // i = phi [0, scalar.ph], [i+1, scalar.body]
6236 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
6237 // s2 = a[i]
6238 // b[i] = s2 - s1
6239 // br cond, scalar.body, exit.block
6240 //
6241 // exit.block:
6242 // lo = lcssa.phi [s1, scalar.body],
6243 // [vector.recur.extract.for.phi, middle.block]
6244 //
6245 // Update extracts of the splice in the middle block: they extract the
6246 // penultimate element of the recurrence.
6247 for (VPRecipeBase &R : make_early_inc_range(
6248 Range: make_range(x: MiddleVPBB->getFirstNonPhi(), y: MiddleVPBB->end()))) {
6249 if (!match(V: &R, P: m_ExtractLastLaneOfLastPart(Op0: m_Specific(VPV: RecurSplice))))
6250 continue;
6251
6252 auto *ExtractR = cast<VPInstruction>(Val: &R);
6253 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
6254 Opcode: VPInstruction::ExtractPenultimateElement, Operands: RecurSplice->getOperand(N: 1),
6255 DL: {}, Name: "vector.recur.extract.for.phi");
6256 for (VPUser *ExitU : to_vector(Range: ExtractR->users())) {
6257 if (auto *ExitPhi = dyn_cast<VPIRPhi>(Val: ExitU))
6258 ExitPhi->replaceUsesOfWith(From: ExtractR, To: PenultimateElement);
6259 }
6260 }
6261 }
6262}
6263
6264/// Check if \p V is a binary expression of a widened IV and a loop-invariant
6265/// value. Returns the widened IV if found, nullptr otherwise.
6266static VPWidenIntOrFpInductionRecipe *getExpressionIV(VPValue *V) {
6267 auto *BinOp = dyn_cast<VPWidenRecipe>(Val: V);
6268 if (!BinOp || !Instruction::isBinaryOp(Opcode: BinOp->getOpcode()) ||
6269 Instruction::isIntDivRem(Opcode: BinOp->getOpcode()))
6270 return nullptr;
6271
6272 VPValue *WidenIVCandidate = BinOp->getOperand(N: 0);
6273 VPValue *InvariantCandidate = BinOp->getOperand(N: 1);
6274 if (!isa<VPWidenIntOrFpInductionRecipe>(Val: WidenIVCandidate))
6275 std::swap(a&: WidenIVCandidate, b&: InvariantCandidate);
6276
6277 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
6278 return nullptr;
6279
6280 return dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WidenIVCandidate);
6281}
6282
6283/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
6284/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
6285static VPValue *cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV,
6286 VPWidenIntOrFpInductionRecipe *WidenIV) {
6287 assert(Instruction::isBinaryOp(BinOp->getOpcode()) &&
6288 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
6289 auto *ClonedOp = BinOp->clone();
6290 if (ClonedOp->getOperand(N: 0) == WidenIV) {
6291 ClonedOp->setOperand(I: 0, New: ScalarIV);
6292 } else {
6293 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
6294 ClonedOp->setOperand(I: 1, New: ScalarIV);
6295 }
6296 ClonedOp->insertAfter(InsertPos: ScalarIV->getDefiningRecipe());
6297 return ClonedOp;
6298}
6299
6300void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
6301 PredicatedScalarEvolution &PSE,
6302 Loop &L) {
6303 ScalarEvolution &SE = *PSE.getSE();
6304 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
6305
6306 // Helper lambda to check if the IV range excludes the sentinel value. Try
6307 // signed first, then unsigned. Return an excluded sentinel if found,
6308 // otherwise return std::nullopt.
6309 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
6310 bool UseMax) -> std::optional<APSInt> {
6311 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
6312 for (bool Signed : {true, false}) {
6313 APSInt Sentinel = UseMax ? APSInt::getMinValue(numBits: BW, /*Unsigned=*/!Signed)
6314 : APSInt::getMaxValue(numBits: BW, /*Unsigned=*/!Signed);
6315
6316 ConstantRange IVRange =
6317 Signed ? SE.getSignedRange(S: IVSCEV) : SE.getUnsignedRange(S: IVSCEV);
6318 if (!IVRange.contains(Val: Sentinel))
6319 return Sentinel;
6320 }
6321 return std::nullopt;
6322 };
6323
6324 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
6325 for (VPRecipeBase &Phi :
6326 make_early_inc_range(Range: VectorLoopRegion->getEntryBasicBlock()->phis())) {
6327 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &Phi);
6328 if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind(
6329 Kind: PhiR->getRecurrenceKind()))
6330 continue;
6331
6332 Type *PhiTy = PhiR->getScalarType();
6333 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
6334 continue;
6335
6336 // If there's a header mask, the backedge select will not be the find-last
6337 // select.
6338 VPValue *BackedgeVal = PhiR->getBackedgeValue();
6339 auto *FindLastSelect = cast<VPSingleDefRecipe>(Val: BackedgeVal);
6340 if (HeaderMask &&
6341 !match(V: BackedgeVal,
6342 P: m_Select(Op0: m_Specific(VPV: HeaderMask),
6343 Op1: m_VPSingleDefRecipe(V&: FindLastSelect), Op2: m_Specific(VPV: PhiR))))
6344 continue;
6345
6346 // Get the find-last expression from the find-last select of the reduction
6347 // phi. The find-last select should be a select between the phi and the
6348 // find-last expression.
6349 VPValue *Cond, *FindLastExpression;
6350 if (!match(R: FindLastSelect, P: m_SelectLike(Op0: m_VPValue(V&: Cond), Op1: m_Specific(VPV: PhiR),
6351 Op2: m_VPValue(V&: FindLastExpression))) &&
6352 !match(R: FindLastSelect,
6353 P: m_SelectLike(Op0: m_VPValue(V&: Cond), Op1: m_VPValue(V&: FindLastExpression),
6354 Op2: m_Specific(VPV: PhiR))))
6355 continue;
6356
6357 // Check if FindLastExpression is a simple expression of a widened IV. If
6358 // so, we can track the underlying IV instead and sink the expression.
6359 auto *IVOfExpressionToSink = getExpressionIV(V: FindLastExpression);
6360 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
6361 V: IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
6362 L: &L);
6363 const SCEV *Step;
6364 if (!match(S: IVSCEV, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V&: Step)))) {
6365 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
6366 m_scev_AffineAddRec(m_SCEV(), m_SCEV())) &&
6367 "IVOfExpressionToSink not being an AddRec must imply "
6368 "FindLastExpression not being an AddRec.");
6369 continue;
6370 }
6371
6372 // Determine direction from SCEV step.
6373 if (!SE.isKnownNonZero(S: Step))
6374 continue;
6375
6376 // Positive step means we need UMax/SMax to find the last IV value, and
6377 // UMin/SMin otherwise.
6378 bool UseMax = SE.isKnownPositive(S: Step);
6379 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
6380 bool UseSigned = SentinelVal && SentinelVal->isSigned();
6381
6382 // Sinking an expression will disable epilogue vectorization. Only use it,
6383 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
6384 // also prevent vectorizing using a sentinel (e.g., if the expression is a
6385 // multiply or divide by large constant, respectively), which also makes
6386 // sinking undesirable.
6387 if (IVOfExpressionToSink) {
6388 const SCEV *FindLastExpressionSCEV =
6389 vputils::getSCEVExprForVPValue(V: FindLastExpression, PSE, L: &L);
6390 if (match(S: FindLastExpressionSCEV,
6391 P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V&: Step)))) {
6392 bool NewUseMax = SE.isKnownPositive(S: Step);
6393 if (auto NewSentinel =
6394 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
6395 // The original expression already has a sentinel, so prefer not
6396 // sinking to keep epilogue vectorization possible.
6397 SentinelVal = *NewSentinel;
6398 UseSigned = NewSentinel->isSigned();
6399 UseMax = NewUseMax;
6400 IVSCEV = FindLastExpressionSCEV;
6401 IVOfExpressionToSink = nullptr;
6402 }
6403 }
6404 }
6405
6406 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
6407 // if the condition was ever true. Requires the IV to not wrap, otherwise we
6408 // cannot use min/max.
6409 if (!SentinelVal) {
6410 auto *AR = cast<SCEVAddRecExpr>(Val: IVSCEV);
6411 if (AR->hasNoSignedWrap())
6412 UseSigned = true;
6413 else if (AR->hasNoUnsignedWrap())
6414 UseSigned = false;
6415 else
6416 continue;
6417 }
6418
6419 VPInstruction *RdxResult = cast<VPInstruction>(Val: vputils::findRecipe(
6420 Start: BackedgeVal,
6421 Pred: match_fn(P: m_VPInstruction<VPInstruction::ComputeReductionResult>())));
6422
6423 VPValue *NewFindLastSelect = BackedgeVal;
6424 VPValue *SelectCond = Cond;
6425 if (!SentinelVal || IVOfExpressionToSink) {
6426 // When we need to create a new select, normalize the condition so that
6427 // PhiR is the last operand and include the header mask if needed.
6428 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
6429 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
6430 if (FindLastSelect->getDefiningRecipe()->getOperand(N: 1) == PhiR)
6431 SelectCond = LoopBuilder.createNot(Operand: SelectCond);
6432
6433 // When tail folding, mask the condition with the header mask to prevent
6434 // propagating poison from inactive lanes in the last vector iteration.
6435 if (HeaderMask)
6436 SelectCond = LoopBuilder.createLogicalAnd(LHS: HeaderMask, RHS: SelectCond);
6437
6438 if (SelectCond != Cond || IVOfExpressionToSink) {
6439 NewFindLastSelect = LoopBuilder.createSelect(
6440 Cond: SelectCond,
6441 TrueVal: IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
6442 FalseVal: PhiR, DL);
6443 }
6444 }
6445
6446 // Create the reduction result in the middle block using sentinel directly.
6447 RecurKind MinMaxKind =
6448 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
6449 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
6450 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
6451 FastMathFlags());
6452 DebugLoc ExitDL = RdxResult->getDebugLoc();
6453 VPBuilder MiddleBuilder(RdxResult);
6454 VPValue *ReducedIV =
6455 MiddleBuilder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
6456 Operands: NewFindLastSelect, Flags, DL: ExitDL);
6457
6458 // If IVOfExpressionToSink is an expression to sink, sink it now.
6459 VPValue *VectorRegionExitingVal = ReducedIV;
6460 if (IVOfExpressionToSink)
6461 VectorRegionExitingVal =
6462 cloneBinOpForScalarIV(BinOp: cast<VPWidenRecipe>(Val: FindLastExpression),
6463 ScalarIV: ReducedIV, WidenIV: IVOfExpressionToSink);
6464
6465 VPValue *NewRdxResult;
6466 VPValue *StartVPV = PhiR->getStartValue();
6467 if (SentinelVal) {
6468 // Sentinel-based approach: reduce IVs with min/max, compare against
6469 // sentinel to detect if condition was ever true, select accordingly.
6470 VPValue *Sentinel = Plan.getConstantInt(Val: *SentinelVal);
6471 auto *Cmp = MiddleBuilder.createICmp(Pred: CmpInst::ICMP_NE, A: ReducedIV,
6472 B: Sentinel, DL: ExitDL);
6473 NewRdxResult = MiddleBuilder.createSelect(Cond: Cmp, TrueVal: VectorRegionExitingVal,
6474 FalseVal: StartVPV, DL: ExitDL);
6475 StartVPV = Sentinel;
6476 } else {
6477 // Introduce a boolean AnyOf reduction to track if the condition was ever
6478 // true in the loop. Use it to select the initial start value, if it was
6479 // never true.
6480 auto *AnyOfPhi = new VPReductionPHIRecipe(
6481 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
6482 RdxUnordered{.VFScaleFactor: 1}, {}, /*HasUsesOutsideReductionChain=*/false);
6483 AnyOfPhi->insertAfter(InsertPos: PhiR);
6484
6485 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
6486 VPValue *OrVal = LoopBuilder.createOr(LHS: AnyOfPhi, RHS: SelectCond);
6487 AnyOfPhi->setOperand(I: 1, New: OrVal);
6488
6489 NewRdxResult = MiddleBuilder.createAnyOfReduction(
6490 ChainOp: OrVal, TrueVal: VectorRegionExitingVal, FalseVal: StartVPV, DL: ExitDL);
6491
6492 // Initialize the IV reduction phi with the neutral element, not the
6493 // original start value, to ensure correct min/max reduction results.
6494 StartVPV = Plan.getOrAddLiveIn(
6495 V: getRecurrenceIdentity(K: MinMaxKind, Tp: IVSCEV->getType(), FMF: {}));
6496 }
6497 RdxResult->replaceAllUsesWith(New: NewRdxResult);
6498 RdxResult->eraseFromParent();
6499
6500 auto *NewPhiR = new VPReductionPHIRecipe(
6501 cast<PHINode>(Val: PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
6502 *NewFindLastSelect, RdxUnordered{.VFScaleFactor: 1}, {},
6503 PhiR->hasUsesOutsideReductionChain());
6504 NewPhiR->insertBefore(InsertPos: PhiR);
6505 PhiR->replaceAllUsesWith(New: NewPhiR);
6506 PhiR->eraseFromParent();
6507 }
6508}
6509
6510namespace {
6511
6512using ExtendKind = TTI::PartialReductionExtendKind;
6513struct ReductionExtend {
6514 Type *SrcType = nullptr;
6515 ExtendKind Kind = ExtendKind::PR_None;
6516};
6517
6518/// Describes the extends used to compute the extended reduction operand.
6519/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
6520/// operation.
6521struct ExtendedReductionOperand {
6522 /// The recipe that consumes the extends.
6523 VPWidenRecipe *ExtendsUser = nullptr;
6524 /// Extend descriptions (inputs to getPartialReductionCost).
6525 ReductionExtend ExtendA, ExtendB;
6526};
6527
6528/// A chain of recipes that form a partial reduction. Matches either
6529/// reduction_bin_op (extended op, accumulator), or
6530/// reduction_bin_op (accumulator, extended op).
6531/// The possible forms of the "extended op" are listed in
6532/// matchExtendedReductionOperand.
6533struct VPPartialReductionChain {
6534 /// The top-level binary operation that forms the reduction to a scalar
6535 /// after the loop body.
6536 VPWidenRecipe *ReductionBinOp = nullptr;
6537 /// The user of the extends that is then reduced.
6538 ExtendedReductionOperand ExtendedOp;
6539 /// The recurrence kind for the entire partial reduction chain.
6540 /// This allows distinguishing between Sub and AddWithSub recurrences,
6541 /// when the ReductionBinOp is a Instruction::Sub.
6542 RecurKind RK;
6543 /// The index of the accumulator operand of ReductionBinOp. The extended op
6544 /// is `1 - AccumulatorOpIdx`.
6545 unsigned AccumulatorOpIdx;
6546 unsigned ScaleFactor;
6547};
6548
6549static VPSingleDefRecipe *
6550optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op) {
6551 // reduce.add(mul(ext(A), C))
6552 // -> reduce.add(mul(ext(A), ext(trunc(C))))
6553 const APInt *Const;
6554 if (match(R: Op, P: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()), Op1: m_APInt(C&: Const)))) {
6555 auto *ExtA = cast<VPWidenCastRecipe>(Val: Op->getOperand(N: 0));
6556 Instruction::CastOps ExtOpc = ExtA->getOpcode();
6557 Type *NarrowTy = ExtA->getOperand(N: 0)->getScalarType();
6558 if (!Op->hasOneUse() ||
6559 !llvm::canConstantBeExtended(
6560 C: Const, NarrowType: NarrowTy, ExtKind: TTI::getPartialReductionExtendKind(CastOpc: ExtOpc)))
6561 return Op;
6562
6563 VPBuilder Builder(Op);
6564 auto *Trunc = Builder.createWidenCast(Opcode: Instruction::CastOps::Trunc,
6565 Op: Op->getOperand(N: 1), ResultTy: NarrowTy);
6566 Type *WideTy = ExtA->getScalarType();
6567 Op->setOperand(I: 1, New: Builder.createWidenCast(Opcode: ExtOpc, Op: Trunc, ResultTy: WideTy));
6568 return Op;
6569 }
6570
6571 // reduce.add(abs(sub(ext(A), ext(B))))
6572 // -> reduce.add(ext(absolute-difference(A, B)))
6573 VPValue *X, *Y;
6574 if (match(R: Op, P: m_WidenIntrinsic<Intrinsic::abs>(Ops: m_Sub(
6575 Op0: m_ZExtOrSExt(Op0: m_VPValue(V&: X)), Op1: m_ZExtOrSExt(Op0: m_VPValue(V&: Y)))))) {
6576 auto *Sub = Op->getOperand(N: 0)->getDefiningRecipe();
6577 auto *Ext = cast<VPWidenCastRecipe>(Val: Sub->getOperand(N: 0));
6578 assert(Ext->getOpcode() ==
6579 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
6580 "Expected both the LHS and RHS extends to be the same");
6581 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
6582 VPBuilder Builder(Op);
6583 Type *SrcTy = X->getScalarType();
6584 auto *FreezeX = Builder.insert(R: new VPWidenRecipe(Instruction::Freeze, {X}));
6585 auto *FreezeY = Builder.insert(R: new VPWidenRecipe(Instruction::Freeze, {Y}));
6586 auto *Max = Builder.insert(
6587 R: new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
6588 {FreezeX, FreezeY}, SrcTy));
6589 auto *Min = Builder.insert(
6590 R: new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
6591 {FreezeX, FreezeY}, SrcTy));
6592 auto *AbsDiff =
6593 Builder.insert(R: new VPWidenRecipe(Instruction::Sub, {Max, Min}));
6594 return Builder.createWidenCast(Opcode: Instruction::CastOps::ZExt, Op: AbsDiff,
6595 ResultTy: Op->getScalarType());
6596 }
6597
6598 // reduce.add(ext(mul(ext(A), ext(B))))
6599 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
6600 // TODO: Support this optimization for float types.
6601 if (match(R: Op, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()),
6602 Op1: m_ZExtOrSExt(Op0: m_VPValue()))))) {
6603 auto *Ext = cast<VPWidenCastRecipe>(Val: Op);
6604 auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: 0));
6605 auto *MulLHS = cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 0));
6606 auto *MulRHS = cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 1));
6607 if (!Mul->hasOneUse() ||
6608 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
6609 MulLHS->getOpcode() != MulRHS->getOpcode())
6610 return Op;
6611 VPBuilder Builder(Mul);
6612 auto *NewLHS = Builder.createWidenCast(
6613 Opcode: MulLHS->getOpcode(), Op: MulLHS->getOperand(N: 0), ResultTy: Ext->getScalarType());
6614 auto *NewRHS = MulLHS == MulRHS
6615 ? NewLHS
6616 : Builder.createWidenCast(Opcode: MulRHS->getOpcode(),
6617 Op: MulRHS->getOperand(N: 0),
6618 ResultTy: Ext->getScalarType());
6619 auto *NewMul = Mul->cloneWithOperands(NewOperands: {NewLHS, NewRHS});
6620 Builder.insert(R: NewMul);
6621 Op->replaceAllUsesWith(New: NewMul);
6622 Op->eraseFromParent();
6623 Mul->eraseFromParent();
6624 return NewMul;
6625 }
6626
6627 return Op;
6628}
6629
6630static VPExpressionRecipe *
6631createPartialReductionExpression(VPReductionRecipe *Red) {
6632 VPValue *VecOp = Red->getVecOp();
6633
6634 // reduce.[f]add(ext(op))
6635 // -> VPExpressionRecipe(op, red)
6636 if (match(V: VecOp, P: m_WidenAnyExtend(Op0: m_VPValue())))
6637 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(Val: VecOp), Red);
6638
6639 // reduce.[f]add(neg(ext(op)))
6640 // -> VPExpressionRecipe(op, sub/neg, red)
6641 if (match(V: VecOp, P: m_AnyNeg(Op0: m_WidenAnyExtend(Op0: m_VPValue())))) {
6642 auto *Neg = cast<VPWidenRecipe>(Val: VecOp);
6643 auto *Ext =
6644 cast<VPWidenCastRecipe>(Val: Neg->getOperand(N: Neg->getNumOperands() - 1));
6645 return new VPExpressionRecipe(Ext, Neg, Red);
6646 }
6647
6648 // reduce.[f]add([f]mul(ext(a), ext(b)))
6649 // -> VPExpressionRecipe(a, b, mul, red)
6650 if (match(V: VecOp, P: m_FMul(Op0: m_FPExt(Op0: m_VPValue()), Op1: m_FPExt(Op0: m_VPValue()))) ||
6651 match(V: VecOp,
6652 P: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()), Op1: m_ZExtOrSExt(Op0: m_VPValue())))) {
6653 auto *Mul = cast<VPWidenRecipe>(Val: VecOp);
6654 auto *ExtA = cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 0));
6655 auto *ExtB = cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 1));
6656 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
6657 }
6658
6659 // reduce.fadd(fneg(fmul(fpext(a), fpext(b))))
6660 // -> VPExpressionRecipe(a, b, fmul, fsub, red)
6661 if (match(V: VecOp,
6662 P: m_FNeg(Op0: m_FMul(Op0: m_FPExt(Op0: m_VPValue()), Op1: m_FPExt(Op0: m_VPValue()))))) {
6663 auto *FNeg = cast<VPWidenRecipe>(Val: VecOp);
6664 auto *FMul = cast<VPWidenRecipe>(Val: FNeg->getOperand(N: 0));
6665 auto *ExtA = cast<VPWidenCastRecipe>(Val: FMul->getOperand(N: 0));
6666 auto *ExtB = cast<VPWidenCastRecipe>(Val: FMul->getOperand(N: 1));
6667 return new VPExpressionRecipe(ExtA, ExtB, FMul, FNeg, Red);
6668 }
6669
6670 // reduce.add(neg(mul(ext(a), ext(b))))
6671 // -> VPExpressionRecipe(a, b, mul, sub, red)
6672 if (match(V: VecOp, P: m_Sub(Op0: m_ZeroInt(), Op1: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()),
6673 Op1: m_ZExtOrSExt(Op0: m_VPValue()))))) {
6674 auto *Sub = cast<VPWidenRecipe>(Val: VecOp);
6675 auto *Mul = cast<VPWidenRecipe>(Val: Sub->getOperand(N: 1));
6676 auto *ExtA = cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 0));
6677 auto *ExtB = cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 1));
6678 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
6679 }
6680
6681 llvm_unreachable("Unsupported expression");
6682}
6683
6684// Helper to transform a partial reduction chain into a partial reduction
6685// recipe. Assumes profitability has been checked.
6686static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6687 VPlan &Plan,
6688 VPReductionPHIRecipe *RdxPhi) {
6689 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6690 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6691
6692 VPValue *Accumulator = WidenRecipe->getOperand(N: Chain.AccumulatorOpIdx);
6693 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6694 Val: WidenRecipe->getOperand(N: 1 - Chain.AccumulatorOpIdx));
6695
6696 // FIXME: Do these transforms before invoking the cost-model.
6697 ExtendedOp = optimizeExtendsForPartialReduction(Op: ExtendedOp);
6698
6699 // Sub-reductions can be implemented in two ways:
6700 // (1) negate the operand in the vector loop (the default way).
6701 // (2) subtract the reduced value from the init value in the middle block.
6702 // Both ways keep the reduction itself as an 'add' reduction.
6703 //
6704 // The ISD nodes for partial reductions don't support folding the
6705 // sub/negation into its operands because the following is not a valid
6706 // transformation:
6707 // sub(0, mul(ext(a), ext(b)))
6708 // -> mul(ext(a), ext(sub(0, b)))
6709 //
6710 // It's therefore better to choose option (2) such that the partial
6711 // reduction is always positive (starting at '0') and to do a final
6712 // subtract in the middle block.
6713 if ((WidenRecipe->getOpcode() == Instruction::Sub &&
6714 Chain.RK != RecurKind::Sub) ||
6715 (WidenRecipe->getOpcode() == Instruction::FSub &&
6716 Chain.RK != RecurKind::FSub)) {
6717 VPBuilder Builder(WidenRecipe);
6718 Type *ElemTy = ExtendedOp->getScalarType();
6719 VPWidenRecipe *NegRecipe;
6720 if (WidenRecipe->getOpcode() == Instruction::FSub) {
6721 NegRecipe =
6722 new VPWidenRecipe(Instruction::FNeg, {ExtendedOp}, VPIRFlags(),
6723 VPIRMetadata(), DebugLoc::getUnknown());
6724 } else {
6725 auto *Zero = Plan.getZero(Ty: ElemTy);
6726 NegRecipe =
6727 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
6728 VPIRMetadata(), DebugLoc::getUnknown());
6729 }
6730 Builder.insert(R: NegRecipe);
6731 ExtendedOp = NegRecipe;
6732 }
6733
6734 // Check if WidenRecipe is the final result of the reduction. If so look
6735 // through selects for predicated reductions.
6736 VPValue *Cond = nullptr;
6737 VPValue *ExitValue = cast_or_null<VPInstruction>(
6738 Val: findUserOf(V: WidenRecipe, P: m_Select(Op0: m_VPValue(V&: Cond), Op1: m_Specific(VPV: WidenRecipe),
6739 Op2: m_Specific(VPV: RdxPhi))));
6740 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6741 RdxPhi->getBackedgeValue() == ExitValue;
6742 assert((!ExitValue || IsLastInChain) &&
6743 "if we found ExitValue, it must match RdxPhi's backedge value");
6744
6745 Type *PhiType = RdxPhi->getScalarType();
6746 RecurKind RdxKind =
6747 PhiType->isFloatingPointTy() ? RecurKind::FAdd : RecurKind::Add;
6748 auto *PartialRed = new VPReductionRecipe(
6749 RdxKind,
6750 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlagsOrNone()
6751 : FastMathFlags(),
6752 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6753 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6754 PartialRed->insertBefore(InsertPos: WidenRecipe);
6755
6756 if (Cond)
6757 ExitValue->replaceAllUsesWith(New: PartialRed);
6758 WidenRecipe->replaceAllUsesWith(New: PartialRed);
6759
6760 // For cost-model purposes, fold this into a VPExpression.
6761 VPExpressionRecipe *E = createPartialReductionExpression(Red: PartialRed);
6762 E->insertBefore(InsertPos: WidenRecipe);
6763 PartialRed->replaceAllUsesWith(New: E);
6764
6765 // We only need to update the PHI node once, which is when we find the
6766 // last reduction in the chain.
6767 if (!IsLastInChain)
6768 return;
6769
6770 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6771 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6772 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6773
6774 auto *StartInst = cast<VPInstruction>(Val: RdxPhi->getStartValue());
6775 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6776 auto *NewScaleFactor = Plan.getConstantInt(BitWidth: 32, Val: Chain.ScaleFactor);
6777 StartInst->setOperand(I: 2, New: NewScaleFactor);
6778
6779 // If this is the last value in a sub-reduction chain, then update the PHI
6780 // node to start at `0` and update the reduction-result to subtract from
6781 // the PHI's start value.
6782 if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
6783 return;
6784
6785 VPValue *OldStartValue = StartInst->getOperand(N: 0);
6786 StartInst->setOperand(I: 0, New: StartInst->getOperand(N: 1));
6787
6788 // Replace reduction_result by 'sub (startval, reductionresult)'.
6789 VPInstruction *RdxResult = vputils::findComputeReductionResult(PhiR: RdxPhi);
6790 assert(RdxResult && "Could not find reduction result");
6791
6792 VPBuilder Builder = VPBuilder::getToInsertAfter(R: RdxResult);
6793 unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
6794 : Instruction::BinaryOps::Sub;
6795 VPInstruction *NewResult = Builder.createNaryOp(
6796 Opcode: SubOpc, Operands: {OldStartValue, RdxResult}, Flags: VPIRFlags::getDefaultFlags(Opcode: SubOpc),
6797 DL: RdxPhi->getDebugLoc());
6798 RdxResult->replaceUsesWithIf(
6799 New: NewResult,
6800 ShouldReplace: [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6801}
6802
6803/// Returns the cost of a link in a partial-reduction chain for a given VF.
6804static InstructionCost
6805getPartialReductionLinkCost(VPCostContext &CostCtx,
6806 const VPPartialReductionChain &Link,
6807 ElementCount VF) {
6808 Type *RdxType = Link.ReductionBinOp->getScalarType();
6809 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6810 std::optional<unsigned> BinOpc = std::nullopt;
6811 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6812 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6813 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6814
6815 std::optional<llvm::FastMathFlags> Flags;
6816 if (RdxType->isFloatingPointTy())
6817 Flags = Link.ReductionBinOp->getFastMathFlagsOrNone();
6818
6819 auto GetLinkOpcode = [&Link]() -> unsigned {
6820 switch (Link.RK) {
6821 case RecurKind::Sub:
6822 return Instruction::Add;
6823 case RecurKind::FSub:
6824 return Instruction::FAdd;
6825 default:
6826 return Link.ReductionBinOp->getOpcode();
6827 }
6828 };
6829
6830 return CostCtx.TTI.getPartialReductionCost(
6831 Opcode: GetLinkOpcode(), InputTypeA: ExtendedOp.ExtendA.SrcType, InputTypeB: ExtendedOp.ExtendB.SrcType,
6832 AccumType: RdxType, VF, OpAExtend: ExtendedOp.ExtendA.Kind, OpBExtend: ExtendedOp.ExtendB.Kind, BinOp: BinOpc,
6833 CostKind: CostCtx.CostKind, FMF: Flags);
6834}
6835
6836static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6837 return TTI::getPartialReductionExtendKind(CastOpc: Cast->getOpcode());
6838}
6839
6840/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6841/// operand. This is an operand where the source of the value (e.g. a load) has
6842/// been extended (sext, zext, or fpext) before it is used in the reduction.
6843///
6844/// Possible forms matched by this function:
6845/// - UpdateR(PrevValue, ext(...))
6846/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6847/// - UpdateR(PrevValue, mul(ext(...), Constant))
6848/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6849/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6850/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...)))
6851///
6852/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
6853static std::optional<ExtendedReductionOperand>
6854matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
6855 assert(is_contained(UpdateR->operands(), Op) &&
6856 "Op should be operand of UpdateR");
6857
6858 // Try matching an absolute difference operand of the form
6859 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6860 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6861 // difference on a wider type and get the extend for "free" from the partial
6862 // reduction.
6863 VPValue *X, *Y;
6864 if (Op->hasOneUse() &&
6865 match(V: Op, P: m_WidenIntrinsic<Intrinsic::abs>(
6866 Ops: m_OneUse(SubPattern: m_Sub(Op0: m_WidenAnyExtend(Op0: m_VPValue(V&: X)),
6867 Op1: m_WidenAnyExtend(Op0: m_VPValue(V&: Y))))))) {
6868 auto *Abs = cast<VPWidenIntrinsicRecipe>(Val: Op);
6869 auto *Sub = cast<VPWidenRecipe>(Val: Abs->getOperand(N: 0));
6870 auto *LHSExt = cast<VPWidenCastRecipe>(Val: Sub->getOperand(N: 0));
6871 auto *RHSExt = cast<VPWidenCastRecipe>(Val: Sub->getOperand(N: 1));
6872 Type *LHSInputType = X->getScalarType();
6873 Type *RHSInputType = Y->getScalarType();
6874 if (LHSInputType != RHSInputType ||
6875 LHSExt->getOpcode() != RHSExt->getOpcode())
6876 return std::nullopt;
6877 // Note: This is essentially the same as matching ext(...) as we will
6878 // rewrite this operand to ext(absolute-difference(A, B)).
6879 return ExtendedReductionOperand{
6880 .ExtendsUser: Sub,
6881 /*ExtendA=*/{.SrcType: LHSInputType, .Kind: getPartialReductionExtendKind(Cast: LHSExt)},
6882 /*ExtendB=*/{}};
6883 }
6884
6885 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6886 if (match(V: Op, P: m_WidenAnyExtend(Op0: m_VPValue()))) {
6887 auto *CastRecipe = cast<VPWidenCastRecipe>(Val: Op);
6888 VPValue *CastSource = CastRecipe->getOperand(N: 0);
6889 OuterExtKind = getPartialReductionExtendKind(Cast: CastRecipe);
6890 if (match(V: CastSource, P: m_Mul(Op0: m_VPValue(), Op1: m_VPValue())) ||
6891 match(V: CastSource, P: m_FMul(Op0: m_VPValue(), Op1: m_VPValue()))) {
6892 // Match: ext(mul(...))
6893 // Record the outer extend kind and set `Op` to the mul. We can then match
6894 // this as a binary operation. Note: We can optimize out the outer extend
6895 // by widening the inner extends to match it. See
6896 // optimizeExtendsForPartialReduction.
6897 Op = CastSource;
6898 } else {
6899 return ExtendedReductionOperand{
6900 .ExtendsUser: UpdateR,
6901 /*ExtendA=*/{.SrcType: CastSource->getScalarType(), .Kind: *OuterExtKind},
6902 /*ExtendB=*/{}};
6903 }
6904 }
6905
6906 if (!Op->hasOneUse())
6907 return std::nullopt;
6908
6909 VPWidenRecipe *MulOp = dyn_cast<VPWidenRecipe>(Val: Op);
6910 if (!MulOp ||
6911 !is_contained(Set: {Instruction::Mul, Instruction::FMul}, Element: MulOp->getOpcode()))
6912 return std::nullopt;
6913
6914 // The rest of the matching assumes `Op` is a (possibly extended) mul
6915 // operation.
6916
6917 VPValue *LHS = MulOp->getOperand(N: 0);
6918 VPValue *RHS = MulOp->getOperand(N: 1);
6919
6920 // The LHS of the operation must always be an extend.
6921 if (!match(V: LHS, P: m_WidenAnyExtend(Op0: m_VPValue())))
6922 return std::nullopt;
6923
6924 auto *LHSCast = cast<VPWidenCastRecipe>(Val: LHS);
6925 Type *LHSInputType = LHSCast->getOperand(N: 0)->getScalarType();
6926 ExtendKind LHSExtendKind = getPartialReductionExtendKind(Cast: LHSCast);
6927
6928 // The RHS of the operation can be an extend or a constant integer.
6929 const APInt *RHSConst = nullptr;
6930 VPWidenCastRecipe *RHSCast = nullptr;
6931 if (match(V: RHS, P: m_WidenAnyExtend(Op0: m_VPValue())))
6932 RHSCast = cast<VPWidenCastRecipe>(Val: RHS);
6933 else if (!match(V: RHS, P: m_APInt(C&: RHSConst)) ||
6934 !canConstantBeExtended(C: RHSConst, NarrowType: LHSInputType, ExtKind: LHSExtendKind))
6935 return std::nullopt;
6936
6937 // The outer extend kind must match the inner extends for folding.
6938 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6939 if (Cast && OuterExtKind &&
6940 getPartialReductionExtendKind(Cast) != OuterExtKind)
6941 return std::nullopt;
6942
6943 Type *RHSInputType = LHSInputType;
6944 ExtendKind RHSExtendKind = LHSExtendKind;
6945 if (RHSCast) {
6946 RHSInputType = RHSCast->getOperand(N: 0)->getScalarType();
6947 RHSExtendKind = getPartialReductionExtendKind(Cast: RHSCast);
6948 }
6949
6950 return ExtendedReductionOperand{
6951 .ExtendsUser: MulOp, .ExtendA: {.SrcType: LHSInputType, .Kind: LHSExtendKind}, .ExtendB: {.SrcType: RHSInputType, .Kind: RHSExtendKind}};
6952}
6953
6954/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6955/// and determines if the target can use a cheaper operation with a wider
6956/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6957/// of operations in the reduction.
6958static std::optional<SmallVector<VPPartialReductionChain>>
6959getScaledReductions(VPReductionPHIRecipe *RedPhiR) {
6960 // Get the backedge value from the reduction PHI and find the
6961 // ComputeReductionResult that uses it (directly or through a select for
6962 // predicated reductions).
6963 auto *RdxResult = vputils::findComputeReductionResult(PhiR: RedPhiR);
6964 if (!RdxResult)
6965 return std::nullopt;
6966 VPValue *ExitValue = RdxResult->getOperand(N: 0);
6967 match(V: ExitValue, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: ExitValue), Op2: m_VPValue()));
6968
6969 SmallVector<VPPartialReductionChain> Chain;
6970 RecurKind RK = RedPhiR->getRecurrenceKind();
6971 Type *PhiType = RedPhiR->getScalarType();
6972 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6973
6974 // Work backwards from the ExitValue examining each reduction operation.
6975 VPValue *CurrentValue = ExitValue;
6976 while (CurrentValue != RedPhiR) {
6977 auto *UpdateR = dyn_cast<VPWidenRecipe>(Val: CurrentValue);
6978 if (!UpdateR || !Instruction::isBinaryOp(Opcode: UpdateR->getOpcode()))
6979 return std::nullopt;
6980
6981 VPValue *Op = UpdateR->getOperand(N: 1);
6982 VPValue *PrevValue = UpdateR->getOperand(N: 0);
6983
6984 // Find the extended operand. The other operand (PrevValue) is the next link
6985 // in the reduction chain.
6986 std::optional<ExtendedReductionOperand> ExtendedOp =
6987 matchExtendedReductionOperand(UpdateR, Op);
6988 if (!ExtendedOp) {
6989 ExtendedOp = matchExtendedReductionOperand(UpdateR, Op: PrevValue);
6990 if (!ExtendedOp)
6991 return std::nullopt;
6992 std::swap(a&: Op, b&: PrevValue);
6993 }
6994
6995 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6996 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6997 if (!PHISize.hasKnownScalarFactor(RHS: ExtSrcSize))
6998 return std::nullopt;
6999
7000 VPPartialReductionChain Link(
7001 {.ReductionBinOp: UpdateR, .ExtendedOp: *ExtendedOp, .RK: RK,
7002 .AccumulatorOpIdx: PrevValue == UpdateR->getOperand(N: 0) ? 0U : 1U,
7003 .ScaleFactor: static_cast<unsigned>(PHISize.getKnownScalarFactor(RHS: ExtSrcSize))});
7004 Chain.push_back(Elt: Link);
7005 CurrentValue = PrevValue;
7006 }
7007
7008 // The chain links were collected by traversing backwards from the exit value.
7009 // Reverse the chains so they are in program order.
7010 std::reverse(first: Chain.begin(), last: Chain.end());
7011 return Chain;
7012}
7013} // namespace
7014
7015void VPlanTransforms::createPartialReductions(VPlan &Plan,
7016 VPCostContext &CostCtx,
7017 VFRange &Range) {
7018 // Find all possible valid partial reductions, grouping chains by their PHI.
7019 // This grouping allows invalidating the whole chain, if any link is not a
7020 // valid partial reduction.
7021 MapVector<VPReductionPHIRecipe *, SmallVector<VPPartialReductionChain>>
7022 ChainsByPhi;
7023 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7024 for (VPRecipeBase &R : HeaderVPBB->phis()) {
7025 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
7026 if (!RedPhiR)
7027 continue;
7028
7029 if (auto Chains = getScaledReductions(RedPhiR))
7030 ChainsByPhi.try_emplace(Key: RedPhiR, Args: std::move(*Chains));
7031 }
7032
7033 if (ChainsByPhi.empty())
7034 return;
7035
7036 // Build set of partial reduction operations for extend user validation and
7037 // a map of reduction bin ops to their scale factors for scale validation.
7038 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
7039 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
7040 for (const auto &[_, Chains] : ChainsByPhi)
7041 for (const VPPartialReductionChain &Chain : Chains) {
7042 PartialReductionOps.insert(Ptr: Chain.ExtendedOp.ExtendsUser);
7043 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
7044 }
7045
7046 // A partial reduction is invalid if any of its extends are used by
7047 // something that isn't another partial reduction. This is because the
7048 // extends are intended to be lowered along with the reduction itself.
7049 auto ExtendUsersValid = [&](VPValue *Ext) {
7050 return !isa<VPWidenCastRecipe>(Val: Ext) || all_of(Range: Ext->users(), P: [&](VPUser *U) {
7051 return PartialReductionOps.contains(Ptr: cast<VPRecipeBase>(Val: U));
7052 });
7053 };
7054
7055 auto IsProfitablePartialReductionChainForVF =
7056 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
7057 InstructionCost PartialCost = 0, RegularCost = 0;
7058
7059 // The chain is a profitable partial reduction chain if the cost of handling
7060 // the entire chain is cheaper when using partial reductions than when
7061 // handling the entire chain using regular reductions.
7062 for (const VPPartialReductionChain &Link : Chain) {
7063 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
7064 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
7065 if (!LinkCost.isValid())
7066 return false;
7067
7068 PartialCost += LinkCost;
7069 RegularCost += Link.ReductionBinOp->computeCost(VF, Ctx&: CostCtx);
7070 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
7071 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
7072 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, Ctx&: CostCtx);
7073 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
7074 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Val: Op))
7075 RegularCost += Extend->computeCost(VF, Ctx&: CostCtx);
7076 }
7077 return PartialCost.isValid() && PartialCost < RegularCost;
7078 };
7079
7080 // Validate chains: check that extends are only used by partial reductions,
7081 // and that reduction bin ops are only used by other partial reductions with
7082 // matching scale factors, are outside the loop region or the select
7083 // introduced by tail-folding. Otherwise we would create users of scaled
7084 // reductions where the types of the other operands don't match.
7085 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
7086 for (const VPPartialReductionChain &Chain : Chains) {
7087 if (!all_of(Range: Chain.ExtendedOp.ExtendsUser->operands(), P: ExtendUsersValid)) {
7088 Chains.clear();
7089 break;
7090 }
7091 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
7092 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: U))
7093 return PhiR == RedPhiR;
7094 auto *R = cast<VPSingleDefRecipe>(Val: U);
7095 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(Val: R, Default: 0) ||
7096 match(R, P: m_ComputeReductionResult(
7097 Op0: m_Specific(VPV: Chain.ReductionBinOp))) ||
7098 match(R, P: m_Select(Op0: m_VPValue(), Op1: m_Specific(VPV: Chain.ReductionBinOp),
7099 Op2: m_Specific(VPV: RedPhiR)));
7100 };
7101 if (!all_of(Range: Chain.ReductionBinOp->users(), P: UseIsValid)) {
7102 Chains.clear();
7103 break;
7104 }
7105
7106 // Check if the compute-reduction-result is used by a sunk store.
7107 // TODO: Also form partial reductions in those cases.
7108 if (auto *RdxResult = vputils::findComputeReductionResult(PhiR: RedPhiR)) {
7109 if (any_of(Range: RdxResult->users(), P: [](VPUser *U) {
7110 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U);
7111 return RepR && RepR->getOpcode() == Instruction::Store;
7112 })) {
7113 Chains.clear();
7114 break;
7115 }
7116 }
7117 }
7118
7119 // Clear the chain if it is not profitable.
7120 if (!LoopVectorizationPlanner::getDecisionAndClampRange(
7121 Predicate: [&, &Chains = Chains](ElementCount VF) {
7122 return IsProfitablePartialReductionChainForVF(Chains, VF);
7123 },
7124 Range))
7125 Chains.clear();
7126 }
7127
7128 for (auto &[Phi, Chains] : ChainsByPhi)
7129 for (const VPPartialReductionChain &Chain : Chains)
7130 transformToPartialReduction(Chain, Plan, RdxPhi: Phi);
7131}
7132
7133void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
7134 VPRecipeBuilder &RecipeBuilder,
7135 PredicatedScalarEvolution &PSE,
7136 const Loop *L) {
7137 // Collect all loads/stores first. We will start with ones having simpler
7138 // decisions followed by more complex ones that are potentially
7139 // guided/dependent on the simpler ones.
7140 SmallVector<VPInstruction *> MemOps;
7141 for (VPBasicBlock *VPBB :
7142 VPBlockUtils::blocksOnly<VPBasicBlock>(Range: vp_depth_first_shallow(
7143 G: Plan.getVectorLoopRegion()->getEntryBasicBlock()))) {
7144 for (VPRecipeBase &R : *VPBB) {
7145 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
7146 if (VPI && VPI->getUnderlyingValue() &&
7147 is_contained(Set: {Instruction::Load, Instruction::Store},
7148 Element: VPI->getOpcode()))
7149 MemOps.push_back(Elt: VPI);
7150 }
7151 }
7152
7153 // Few helpers to process different kinds of memory operations.
7154
7155 // To be used as argument to `VPlanTransforms::runPass` which explicitly
7156 // specified pass name, hence `VPlan &` parameter.
7157 auto ProcessSubset = [&](VPlan &, auto ProcessVPInst) {
7158 SmallVector<VPInstruction *> RemainingMemOps;
7159 for (VPInstruction *VPI : MemOps) {
7160 if (!ProcessVPInst(VPI))
7161 RemainingMemOps.push_back(Elt: VPI);
7162 }
7163
7164 MemOps.clear();
7165 std::swap(LHS&: MemOps, RHS&: RemainingMemOps);
7166 };
7167
7168 auto ReplaceWith = [&](VPInstruction *VPI, VPRecipeBase *New) {
7169 New->insertBefore(InsertPos: VPI);
7170 if (VPI->getOpcode() == Instruction::Load)
7171 VPI->replaceAllUsesWith(New: New->getVPSingleValue());
7172 VPI->eraseFromParent();
7173
7174 // VPI has been processed.
7175 return true;
7176 };
7177
7178 auto Scalarize = [&](VPInstruction *VPI) {
7179 return ReplaceWith(VPI, RecipeBuilder.handleReplication(VPI, Range));
7180 };
7181
7182 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
7183 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
7184 VPlanTransforms::runPass(
7185 PassName: "lowerMemoryIdioms", Pass&: ProcessSubset, Plan, Args: [&](VPInstruction *VPI) {
7186 if (RecipeBuilder.replaceWithFinalIfReductionStore(
7187 VPI, FinalRedStoresBuilder))
7188 return true;
7189
7190 // Filter out scalar VPlan for the remaining idioms.
7191 if (LoopVectorizationPlanner::getDecisionAndClampRange(
7192 Predicate: [](ElementCount VF) { return VF.isScalar(); }, Range))
7193 return false;
7194
7195 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI))
7196 return ReplaceWith(VPI, Histogram);
7197
7198 return false;
7199 });
7200
7201 // Filter out scalar VPlan for the remaining memory operations.
7202 if (LoopVectorizationPlanner::getDecisionAndClampRange(
7203 Predicate: [](ElementCount VF) { return VF.isScalar(); }, Range))
7204 return;
7205
7206 // If the instruction's allocated size doesn't equal it's type size, it
7207 // requires padding and will be scalarized.
7208 VPlanTransforms::runPass(
7209 PassName: "scalarizeMemOpsWithIrregularTypes", Pass&: ProcessSubset, Plan,
7210 Args: [&](VPInstruction *VPI) {
7211 Instruction *I = VPI->getUnderlyingInstr();
7212 if (hasIrregularType(Ty: getLoadStoreType(I), DL: I->getDataLayout()))
7213 return Scalarize(VPI);
7214
7215 return false;
7216 });
7217
7218 if (!RecipeBuilder.prefersVectorizedAddressing()) {
7219 VPlanTransforms::runPass(
7220 PassName: "makeVPlanMemOpDecision", Pass&: ProcessSubset, Plan, Args: [&](VPInstruction *VPI) {
7221 Instruction *I = VPI->getUnderlyingInstr();
7222 bool IsLoad = VPI->getOpcode() == Instruction::Load;
7223 if (RecipeBuilder.isPredicatedInst(I) || !IsLoad ||
7224 !vputils::isUsedByLoadStoreAddress(V: VPI))
7225 return false;
7226
7227 // Scalarize loads used as addresses, matching the legacy CM. The load
7228 // is single-scalar if the pointer is loop-invariant, otherwise it is
7229 // replicated per-lane. No mask is needed as the load is not
7230 // predicated.
7231 VPValue *Ptr = VPI->getOperand(N: 0);
7232 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L);
7233 bool IsSingleScalarLoad = !isa<SCEVCouldNotCompute>(Val: PtrSCEV) &&
7234 PSE.getSE()->isLoopInvariant(S: PtrSCEV, L);
7235
7236 ReplaceWith(VPI,
7237 new VPReplicateRecipe(
7238 I, Ptr, /*IsSingleScalar=*/IsSingleScalarLoad,
7239 /*Mask=*/nullptr, *VPI, *VPI, VPI->getDebugLoc()));
7240 return true;
7241 });
7242 }
7243
7244 VPlanTransforms::runPass(PassName: "delegateMemOpWideningToLegacyCM", Pass&: ProcessSubset,
7245 Plan, Args: [&](VPInstruction *VPI) {
7246 if (VPRecipeBase *Recipe =
7247 RecipeBuilder.tryToWidenMemory(VPI, Range))
7248 return ReplaceWith(VPI, Recipe);
7249
7250 return Scalarize(VPI);
7251 });
7252}
7253
7254void VPlanTransforms::makeScalarizationDecisions(VPlan &Plan, VFRange &Range) {
7255 if (LoopVectorizationPlanner::getDecisionAndClampRange(
7256 Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
7257 return;
7258
7259 PostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> POT(
7260 Plan.getEntry());
7261 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: POT)) {
7262 for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
7263 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
7264 if (!VPI)
7265 continue;
7266
7267 auto *I = cast_or_null<Instruction>(Val: VPI->getUnderlyingValue());
7268 // Wouldn't be able to create a `VPReplicateRecipe` anyway.
7269 if (!I)
7270 continue;
7271
7272 // If executing other lanes produces side-effects we can't avoid them.
7273 if (VPI->mayHaveSideEffects())
7274 continue;
7275
7276 // We want to drop the mask operand, verify we can safely do that.
7277 if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
7278 continue;
7279
7280 // Avoid rewriting IV increment as that interferes with
7281 // `removeRedundantCanonicalIVs`.
7282 if (VPI->getOpcode() == Instruction::Add &&
7283 any_of(Range: VPI->operands(), P: IsaPred<VPWidenIntOrFpInductionRecipe>))
7284 continue;
7285
7286 // Other lanes are needed - can't drop them.
7287 if (!vputils::onlyFirstLaneUsed(Def: VPI))
7288 continue;
7289
7290 auto *Recipe = VPBuilder::createSingleScalarOp(
7291 Opcode: VPI->getOpcode(), Operands: VPI->operandsWithoutMask(), /*Mask=*/nullptr, Flags: *VPI,
7292 Metadata: *VPI, DL: VPI->getDebugLoc(), UV: I);
7293 Recipe->insertBefore(InsertPos: VPI);
7294 VPI->replaceAllUsesWith(New: Recipe);
7295 VPI->eraseFromParent();
7296 }
7297 }
7298}
7299
7300/// Returns true if \p Info's parameter kinds are compatible with \p Args.
7301static bool areVFParamsOk(const VFInfo &Info, ArrayRef<VPValue *> Args,
7302 PredicatedScalarEvolution &PSE, const Loop *L) {
7303 ScalarEvolution *SE = PSE.getSE();
7304 return all_of(Range: Info.Shape.Parameters, P: [&](VFParameter Param) {
7305 switch (Param.ParamKind) {
7306 case VFParamKind::Vector:
7307 case VFParamKind::GlobalPredicate:
7308 return true;
7309 case VFParamKind::OMP_Uniform:
7310 return SE->isSCEVable(Ty: Args[Param.ParamPos]->getScalarType()) &&
7311 SE->isLoopInvariant(
7312 S: vputils::getSCEVExprForVPValue(V: Args[Param.ParamPos], PSE, L),
7313 L);
7314 case VFParamKind::OMP_Linear:
7315 return match(S: vputils::getSCEVExprForVPValue(V: Args[Param.ParamPos], PSE, L),
7316 P: m_scev_AffineAddRec(
7317 Op0: m_SCEV(), Op1: m_scev_SpecificSInt(V: Param.LinearStepOrPos),
7318 L: m_SpecificLoop(L)));
7319 default:
7320 return false;
7321 }
7322 });
7323}
7324
7325/// Find a vector variant of \p CI for \p VF, respecting \p MaskRequired.
7326/// Returns the variant function, or nullptr. Masked variants are assumed to
7327/// take the mask as a trailing parameter.
7328static Function *findVectorVariant(CallInst *CI, ArrayRef<VPValue *> Args,
7329 ElementCount VF, bool MaskRequired,
7330 PredicatedScalarEvolution &PSE,
7331 const Loop *L) {
7332 if (CI->isNoBuiltin())
7333 return nullptr;
7334 auto Mappings = VFDatabase::getMappings(CI: *CI);
7335 const auto *It = find_if(Range&: Mappings, P: [&](const VFInfo &Info) {
7336 return Info.Shape.VF == VF && (!MaskRequired || Info.isMasked()) &&
7337 areVFParamsOk(Info, Args, PSE, L);
7338 });
7339 if (It == Mappings.end())
7340 return nullptr;
7341 return CI->getModule()->getFunction(Name: It->VectorName);
7342}
7343
7344namespace {
7345/// The outcome of choosing how to widen a call at a given VF.
7346struct CallWideningDecision {
7347 enum class KindTy { Scalarize, Intrinsic, VectorVariant };
7348 CallWideningDecision(KindTy Kind, Function *Variant = nullptr)
7349 : Kind(Kind), Variant(Variant) {}
7350 KindTy Kind;
7351
7352 /// Set when Kind == VectorVariant.
7353 Function *Variant;
7354
7355 bool operator==(const CallWideningDecision &Other) const {
7356 return Kind == Other.Kind && Variant == Other.Variant;
7357 }
7358};
7359} // namespace
7360
7361/// Pick the cheapest widening for the call \p VPI at \p VF among scalarization,
7362/// vector intrinsic, and vector library variant.
7363static CallWideningDecision decideCallWidening(VPInstruction &VPI,
7364 ArrayRef<VPValue *> Ops,
7365 ElementCount VF,
7366 VPCostContext &CostCtx) {
7367 auto *CI = cast<CallInst>(Val: VPI.getUnderlyingInstr());
7368
7369 // Scalar VFs and calls forced or known to scalarize always replicate.
7370 if (VF.isScalar() || CostCtx.willBeScalarized(I: CI, VF))
7371 return CallWideningDecision::KindTy::Scalarize;
7372
7373 auto *CalledFn = cast<Function>(
7374 Val: VPI.getOperand(N: VPI.getNumOperandsWithoutMask() - 1)->getLiveInIRValue());
7375 Type *ResultTy = VPI.getScalarType();
7376 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI: &CostCtx.TLI);
7377 bool MaskRequired = CostCtx.isMaskRequired(I: CI);
7378
7379 // Pseudo intrinsics (assume, lifetime, ...) are always scalarized.
7380 if (ID && VPCostContext::isFreeScalarIntrinsic(ID))
7381 return CallWideningDecision::KindTy::Scalarize;
7382
7383 InstructionCost ScalarCost =
7384 VPReplicateRecipe::computeCallCost(CalledFn, ResultTy, ArgOps: Ops,
7385 /*IsSingleScalar=*/false, VF, Ctx&: CostCtx);
7386
7387 Function *VecFunc =
7388 findVectorVariant(CI, Args: Ops, VF, MaskRequired, PSE&: CostCtx.PSE, L: CostCtx.L);
7389 InstructionCost VecCallCost = InstructionCost::getInvalid();
7390 if (VecFunc)
7391 VecCallCost = VPWidenCallRecipe::computeCallCost(Variant: VecFunc, Ctx&: CostCtx);
7392
7393 // Prefer the intrinsic if it is at least as cheap as scalarizing and any
7394 // available vector variant.
7395 if (ID) {
7396 InstructionCost IntrinsicCost =
7397 VPWidenIntrinsicRecipe::computeCallCost(ID, Operands: Ops, R: VPI, VF, Ctx&: CostCtx);
7398 if (IntrinsicCost.isValid() && ScalarCost >= IntrinsicCost &&
7399 (!VecFunc || VecCallCost >= IntrinsicCost))
7400 return CallWideningDecision::KindTy::Intrinsic;
7401 }
7402
7403 // Otherwise, use a vector library variant when it beats scalarizing.
7404 if (VecFunc && ScalarCost >= VecCallCost)
7405 return {CallWideningDecision::KindTy::VectorVariant, VecFunc};
7406
7407 return CallWideningDecision::KindTy::Scalarize;
7408}
7409
7410void VPlanTransforms::makeCallWideningDecisions(VPlan &Plan, VFRange &Range,
7411 VPRecipeBuilder &RecipeBuilder,
7412 VPCostContext &CostCtx) {
7413 for (VPBasicBlock *VPBB : VPBlockUtils::blocksAs<VPBasicBlock>(
7414 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
7415 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
7416 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
7417 if (!VPI || !VPI->getUnderlyingValue() ||
7418 VPI->getOpcode() != Instruction::Call)
7419 continue;
7420
7421 auto *CI = cast<CallInst>(Val: VPI->getUnderlyingInstr());
7422 SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
7423 VPI->op_begin() + CI->arg_size());
7424
7425 CallWideningDecision Decision =
7426 decideCallWidening(VPI&: *VPI, Ops, VF: Range.Start, CostCtx);
7427 LoopVectorizationPlanner::getDecisionAndClampRange(
7428 Predicate: [&](ElementCount VF) {
7429 return Decision == decideCallWidening(VPI&: *VPI, Ops, VF, CostCtx);
7430 },
7431 Range);
7432
7433 VPSingleDefRecipe *Replacement = nullptr;
7434 switch (Decision.Kind) {
7435 case CallWideningDecision::KindTy::Intrinsic: {
7436 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI: &CostCtx.TLI);
7437 Type *ResultTy = VPI->getScalarType();
7438 Replacement = new VPWidenIntrinsicRecipe(*CI, ID, Ops, ResultTy, *VPI,
7439 *VPI, VPI->getDebugLoc());
7440 break;
7441 }
7442 case CallWideningDecision::KindTy::VectorVariant: {
7443 // Masked variants take the mask as a trailing parameter, so they have
7444 // one more parameter than the original call's arguments.
7445 if (Decision.Variant->arg_size() > Ops.size()) {
7446 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
7447 Ops.push_back(Elt: Mask);
7448 }
7449 Ops.push_back(Elt: VPI->getOperand(N: VPI->getNumOperandsWithoutMask() - 1));
7450 Replacement = new VPWidenCallRecipe(CI, Decision.Variant, Ops, *VPI,
7451 *VPI, VPI->getDebugLoc());
7452 break;
7453 }
7454 case CallWideningDecision::KindTy::Scalarize:
7455 Replacement = RecipeBuilder.handleReplication(VPI, Range);
7456 break;
7457 }
7458
7459 Replacement->insertBefore(InsertPos: VPI);
7460 VPI->replaceAllUsesWith(New: Replacement);
7461 VPI->eraseFromParent();
7462 }
7463 }
7464}
7465
7466void VPlanTransforms::convertToStridedAccesses(VPlan &Plan,
7467 PredicatedScalarEvolution &PSE,
7468 Loop &L, VPCostContext &Ctx,
7469 VFRange &Range) {
7470 if (Plan.hasScalarVFOnly())
7471 return;
7472
7473 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
7474 VPValue *I32VF = nullptr;
7475 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
7476 Range: vp_depth_first_shallow(G: VectorLoop->getEntry()))) {
7477 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
7478 auto *LoadR = dyn_cast<VPWidenLoadRecipe>(Val: &R);
7479 // TODO: Support strided store.
7480 // TODO: Transform reverse access into strided access with -1 stride.
7481 // TODO: Transform gather/scatter with uniform address into strided access
7482 // with 0 stride.
7483 // TODO: Transform interleave access into multiple strided accesses.
7484 if (!LoadR || LoadR->isConsecutive())
7485 continue;
7486
7487 auto *Ptr = dyn_cast<VPWidenGEPRecipe>(Val: LoadR->getAddr());
7488 if (!Ptr)
7489 continue;
7490
7491 // Check if this is a strided access by analyzing the address SCEV for an
7492 // affine addRec.
7493 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L: &L);
7494 const SCEV *Start;
7495 const SCEVConstant *Step;
7496 // TODO: Support non-constant loop invariant stride.
7497 if (!match(S: PtrSCEV,
7498 P: m_scev_AffineAddRec(Op0: m_SCEV(V&: Start), Op1: m_SCEVConstant(V&: Step),
7499 L: m_SpecificLoop(L: &L))))
7500 continue;
7501
7502 Type *LoadTy = LoadR->getScalarType();
7503 Align Alignment = LoadR->getAlign();
7504 auto IsProfitable = [&](ElementCount VF) {
7505 Type *DataTy = toVectorTy(Scalar: LoadTy, EC: VF);
7506 if (!Ctx.TTI.isLegalStridedLoadStore(DataType: DataTy, Alignment))
7507 return false;
7508 const InstructionCost CurrentCost = LoadR->computeCost(VF, Ctx);
7509 const InstructionCost StridedLoadStoreCost =
7510 VPWidenMemIntrinsicRecipe::computeMemIntrinsicCost(
7511 IID: Intrinsic::experimental_vp_strided_load, Ty: DataTy,
7512 IsMasked: LoadR->isMasked(), Alignment, Ctx);
7513 return StridedLoadStoreCost < CurrentCost;
7514 };
7515
7516 if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsProfitable,
7517 Range))
7518 continue;
7519
7520 // Invalidate the legacy widening decision so the cost of replaced load is
7521 // not counted during precomputeCosts.
7522 // TODO: Remove once the legacy exit cost computation is retired.
7523 for (ElementCount VF : Range)
7524 Ctx.invalidateWideningDecision(I: &LoadR->getIngredient(), VF);
7525
7526 // Get VF as i32 for the vector length operand.
7527 if (!I32VF) {
7528 VPBuilder Builder(Plan.getVectorPreheader());
7529 I32VF = Builder.createScalarZExtOrTrunc(
7530 Op: &Plan.getVF(), ResultTy: Type::getInt32Ty(C&: Plan.getContext()),
7531 SrcTy: Plan.getVF().getScalarType(), DL: DebugLoc::getUnknown());
7532 }
7533
7534 VPBuilder Builder(LoadR);
7535 // Create the base pointer of strided access.
7536 // TODO: reuse VPDerivedIVRecipe for base pointer computation when it
7537 // supports a general VPValue as the start value.
7538 VPValue *StartVPV = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: Start);
7539 VPValue *StrideInBytes = Plan.getOrAddLiveIn(V: Step->getValue());
7540 Type *IndexTy = Plan.getDataLayout().getIndexType(PtrTy: Ptr->getScalarType());
7541 assert(IndexTy == StrideInBytes->getScalarType() &&
7542 "Stride type from SCEV must match the index type");
7543 VPValue *CanIV = Builder.createScalarSExtOrTrunc(
7544 Op: VectorLoop->getCanonicalIV(), ResultTy: IndexTy,
7545 SrcTy: VectorLoop->getCanonicalIVType(), DL: DebugLoc::getUnknown());
7546 auto *AddRecPtr = cast<SCEVAddRecExpr>(Val: PtrSCEV);
7547 auto *Offset = Builder.createOverflowingOp(
7548 Opcode: Instruction::Mul, Operands: {CanIV, StrideInBytes},
7549 WrapFlags: {AddRecPtr->hasNoUnsignedWrap(), AddRecPtr->hasNoSignedWrap()});
7550 auto *BasePtr = Builder.createNoWrapPtrAdd(
7551 Ptr: StartVPV, Offset,
7552 GEPFlags: AddRecPtr->hasNoUnsignedWrap() ? GEPNoWrapFlags::noUnsignedWrap()
7553 : GEPNoWrapFlags::none());
7554
7555 // Create a new vector pointer for strided access.
7556 VPValue *NewPtr = Builder.createVectorPointer(
7557 Ptr: BasePtr, SourceElementTy: Type::getInt8Ty(C&: Plan.getContext()), Stride: StrideInBytes,
7558 GEPFlags: Ptr->getGEPNoWrapFlags(), DL: Ptr->getDebugLoc());
7559
7560 VPValue *Mask = LoadR->getMask();
7561 if (!Mask)
7562 Mask = Plan.getTrue();
7563 auto *StridedLoad = Builder.createWidenMemIntrinsic(
7564 VectorIntrinsicID: Intrinsic::experimental_vp_strided_load,
7565 CallArguments: {NewPtr, StrideInBytes, Mask, I32VF}, Ty: LoadTy, Alignment, MD: *LoadR,
7566 DL: LoadR->getDebugLoc());
7567 LoadR->replaceAllUsesWith(New: StridedLoad);
7568 }
7569 }
7570}
7571