1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/PostOrderIterator.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetOperations.h"
28#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallPtrSet.h"
30#include "llvm/ADT/TypeSwitch.h"
31#include "llvm/Analysis/IVDescriptors.h"
32#include "llvm/Analysis/InstSimplifyFolder.h"
33#include "llvm/Analysis/LoopInfo.h"
34#include "llvm/Analysis/MemoryLocation.h"
35#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
36#include "llvm/Analysis/ScopedNoAliasAA.h"
37#include "llvm/Analysis/VectorUtils.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
41#include "llvm/Support/Casting.h"
42#include "llvm/Support/TypeSize.h"
43#include "llvm/Transforms/Utils/LoopUtils.h"
44#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
50bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
53 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
54 Plan.getVectorLoopRegion());
55 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(Range: make_range(x: VPBB->begin(), y: EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(Val: VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(Val: &Ingredient)) {
73 auto *Phi = cast<PHINode>(Val: PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
75 for (VPValue *Op : PhiR->operands())
76 NewRecipe->addOperand(Operand: Op);
77 } else if (auto *VPI = dyn_cast<VPInstruction>(Val: &Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Val: Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(N: 0), nullptr /*Mask*/,
83 false /*Consecutive*/, false /*Reverse*/, *VPI,
84 Ingredient.getDebugLoc());
85 } else if (StoreInst *Store = dyn_cast<StoreInst>(Val: Inst)) {
86 NewRecipe = new VPWidenStoreRecipe(
87 *Store, Ingredient.getOperand(N: 1), Ingredient.getOperand(N: 0),
88 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
89 Ingredient.getDebugLoc());
90 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
91 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc());
93 } else if (CallInst *CI = dyn_cast<CallInst>(Val: Inst)) {
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, TLI: &TLI);
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
97
98 // The noalias.scope.decl intrinsic declares a noalias scope that
99 // is valid for a single iteration. Emitting it as a single-scalar
100 // replicate would incorrectly extend the scope across multiple
101 // original iterations packed into one vector iteration.
102 // FIXME: If we want to vectorize this loop, then we have to drop
103 // all the associated !alias.scope and !noalias.
104 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
105 return false;
106
107 // These intrinsics are recognized by getVectorIntrinsicIDForCall
108 // but are not widenable. Emit them as replicate instead of widening.
109 if (VectorID == Intrinsic::assume ||
110 VectorID == Intrinsic::lifetime_end ||
111 VectorID == Intrinsic::lifetime_start ||
112 VectorID == Intrinsic::sideeffect ||
113 VectorID == Intrinsic::pseudoprobe) {
114 // If the operand of llvm.assume holds before vectorization, it will
115 // also hold per lane.
116 // llvm.pseudoprobe requires to be duplicated per lane for accurate
117 // sample count.
118 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
119 VectorID != Intrinsic::pseudoprobe;
120 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
121 /*IsSingleScalar=*/IsSingleScalar,
122 /*Mask=*/nullptr, *VPI, *VPI,
123 Ingredient.getDebugLoc());
124 } else {
125 NewRecipe = new VPWidenIntrinsicRecipe(
126 *CI, VectorID, drop_end(RangeOrContainer: Ingredient.operands()), CI->getType(),
127 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
128 }
129 } else if (auto *CI = dyn_cast<CastInst>(Val: Inst)) {
130 NewRecipe = new VPWidenCastRecipe(
131 CI->getOpcode(), Ingredient.getOperand(N: 0), CI->getType(), CI,
132 VPIRFlags(*CI), VPIRMetadata(*CI));
133 } else {
134 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
135 *VPI, Ingredient.getDebugLoc());
136 }
137 } else {
138 assert(isa<VPWidenIntOrFpInductionRecipe>(&Ingredient) &&
139 "inductions must be created earlier");
140 continue;
141 }
142
143 NewRecipe->insertBefore(InsertPos: &Ingredient);
144 if (NewRecipe->getNumDefinedValues() == 1)
145 VPV->replaceAllUsesWith(New: NewRecipe->getVPSingleValue());
146 else
147 assert(NewRecipe->getNumDefinedValues() == 0 &&
148 "Only recpies with zero or one defined values expected");
149 Ingredient.eraseFromParent();
150 }
151 }
152 return true;
153}
154
/// Helper for extra no-alias checks during store sinking: combines an explicit
/// exclude set of known-safe recipes with a SCEV-based distance proof between
/// the sink-group leader and other replicated stores.
class SinkStoreInfo {
  /// Recipes that alias checking should never reject (already known safe).
  const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
  /// The replicated store the group is being sunk towards.
  VPReplicateRecipe &GroupLeader;
  PredicatedScalarEvolution &PSE;
  const Loop &L;
  VPTypeAnalysis &TypeInfo;

  // Return true if \p A and \p B are known to not alias for all VFs in the
  // plan, checked via the distance between the accesses.
  bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
    // Only store/store pairs are handled here.
    if (A->getOpcode() != Instruction::Store ||
        B->getOpcode() != Instruction::Store)
      return false;

    // For replicated stores, operand 1 is the address.
    VPValue *AddrA = A->getOperand(1);
    const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
    VPValue *AddrB = B->getOperand(1);
    const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
    if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
      return false;

    // Require a compile-time-constant distance between the two addresses.
    const APInt *Distance;
    ScalarEvolution &SE = *PSE.getSE();
    if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
      return false;

    const DataLayout &DL = SE.getDataLayout();
    // Operand 0 is the stored value; its type gives the store size.
    Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
    uint64_t SizeA = DL.getTypeStoreSize(TyA);
    Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
    uint64_t SizeB = DL.getTypeStoreSize(TyB);

    // Use the maximum store size to ensure no overlap from either direction.
    // Currently only handles fixed sizes, as it is only used for
    // replicating VPReplicateRecipes.
    uint64_t MaxStoreSize = std::max(SizeA, SizeB);

    // The distance must cover the widest access for the largest VF in the
    // plan; scalable VFs have no fixed upper bound, so bail out.
    auto VFs = B->getParent()->getPlan()->vectorFactors();
    ElementCount MaxVF = *max_element(VFs, ElementCount::isKnownLT);
    if (MaxVF.isScalable())
      return false;
    return Distance->abs().uge(
        MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
  }

public:
  SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
                VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE,
                const Loop &L, VPTypeAnalysis &TypeInfo)
      : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
        L(L), TypeInfo(TypeInfo) {}

  /// Return true if \p R should be skipped during alias checking, either
  /// because it's in the exclude set or because no-alias can be proven via
  /// SCEV.
  bool shouldSkip(VPRecipeBase &R) const {
    auto *Store = dyn_cast<VPReplicateRecipe>(&R);
    return ExcludeRecipes.contains(&R) ||
           (Store && isNoAliasViaDistance(Store, &GroupLeader));
  }
};
217
/// Check if a memory operation doesn't alias with memory operations using
/// scoped noalias metadata, in blocks in the single-successor chain between \p
/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
/// write to memory are checked (for load hoisting). Otherwise recipes that both
/// read and write memory are checked, and SCEV is used to prove no-alias
/// between the group leader and other replicate recipes (for store sinking).
/// Returns false conservatively whenever aliasing cannot be ruled out.
static bool
canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc,
                               VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
                               std::optional<SinkStoreInfo> SinkInfo = {}) {
  bool CheckReads = SinkInfo.has_value();
  // The whole analysis is driven by scoped-noalias metadata; without a scope
  // on MemLoc there is nothing to compare against.
  if (!MemLoc.AATags.Scope)
    return false;

  for (VPBasicBlock *VPBB :
       VPBlockUtils::blocksInSingleSuccessorChainBetween(FirstBB, LastBB)) {
    for (VPRecipeBase &R : *VPBB) {
      // Known-safe recipes (exclude set or SCEV-proven no-alias).
      if (SinkInfo && SinkInfo->shouldSkip(R))
        continue;

      // Skip recipes that don't need checking: writes always matter; reads
      // only matter when sinking stores.
      if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
        continue;

      auto Loc = vputils::getMemoryLocation(R);
      if (!Loc)
        // Conservatively assume aliasing for memory operations without
        // location.
        return false;

      if (ScopedNoAliasAAResult::alias(*Loc, MemLoc) != AliasResult::NoAlias)
        return false;
    }
  }
  return true;
}
254
/// Collect either replicated Loads or Stores grouped by their address SCEV, in
/// a deep-traversal of the vector loop region in \p Plan. Only recipes whose
/// address has a computable SCEV and that pass \p FilterFn are collected.
/// Each returned group is sorted so the most-dominating recipe comes first.
template <unsigned Opcode>
static SmallVector<SmallVector<VPReplicateRecipe *, 4>>
collectGroupedReplicateMemOps(
    VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
    function_ref<bool(VPReplicateRecipe *)> FilterFn) {
  static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
                "Only Load and Store opcodes supported");
  constexpr bool IsLoad = (Opcode == Instruction::Load);
  // Bucket recipes by the SCEV of their address so ops on the same location
  // end up in the same group.
  SmallDenseMap<const SCEV *, SmallVector<VPReplicateRecipe *, 4>>
      RecipesByAddress;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
        continue;

      // For loads, operand 0 is address; for stores, operand 1 is address.
      VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
      const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
      if (!isa<SCEVCouldNotCompute>(AddrSCEV))
        RecipesByAddress[AddrSCEV].push_back(RepR);
    }
  }
  auto Groups = to_vector(RecipesByAddress.values());
  VPDominatorTree VPDT(Plan);
  for (auto &Group : Groups) {
    // Sort mem ops by dominance order, with earliest (most dominating) first.
    stable_sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
      return VPDT.properlyDominates(A, B);
    });
  }
  return Groups;
}
291
292/// Return true if we do not know how to (mechanically) hoist or sink \p R out
293/// of a loop region.
294static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
295 // Assumes don't alias anything or throw; as long as they're guaranteed to
296 // execute, they're safe to hoist.
297 if (match(V: &R, P: m_Intrinsic<Intrinsic::assume>()))
298 return false;
299
300 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
301 // memory location is not modified in the vector loop.
302 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
303 return true;
304
305 // Allocas cannot be hoisted.
306 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
307 return RepR && RepR->getOpcode() == Instruction::Alloca;
308}
309
/// Sink scalar-producing recipes into the replicate blocks that use them, to
/// avoid computing values on the main path that are only needed under a
/// predicate. Returns true if any recipe was moved or cloned.
static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  bool Changed = false;

  // Worklist of (destination block, candidate recipe) pairs; SetVector keeps
  // processing order deterministic and avoids duplicates.
  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
  auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
                                        VPBasicBlock *SinkTo, VPValue *Op) {
    auto *Candidate =
        dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
    if (!Candidate)
      return;

    // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
    // for now.
    if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
      return;

    // Already in the target block, or not mechanically movable.
    if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
      return;

    // Single-scalar replicates cannot be sunk into a per-lane replicate block
    // unless only scalar VFs are in play.
    if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
      if (!ScalarVFOnly && RepR->isSingleScalar())
        return;

    WorkList.insert({SinkTo, Candidate});
  };

  // First, collect the operands of all recipes in replicate blocks as seeds for
  // sinking.
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    // Only handle the triangle shape entry -> then -> exiting.
    VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
    if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB)
      for (VPValue *Op : Recipe.operands())
        InsertIfValidSinkCandidate(VPBB, Op);
  }

  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  // Note: the worklist may grow while iterating, so index-based looping is
  // required.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPBasicBlock *SinkTo;
    VPSingleDefRecipe *SinkCandidate;
    std::tie(SinkTo, SinkCandidate) = WorkList[I];

    // All recipe users of SinkCandidate must be in the same block SinkTo or all
    // users outside of SinkTo must only use the first lane of SinkCandidate. In
    // the latter case, we need to duplicate SinkCandidate.
    auto UsersOutsideSinkTo =
        make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
          return cast<VPRecipeBase>(U)->getParent() != SinkTo;
        });
    if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
          return !U->usesFirstLaneOnly(SinkCandidate);
        }))
      continue;
    bool NeedsDuplicating = !UsersOutsideSinkTo.empty();

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      VPSingleDefRecipe *Clone;
      if (auto *SinkCandidateRepR =
              dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
        // TODO: Handle converting to uniform recipes as separate transform,
        // then cloning should be sufficient here.
        Instruction *I = SinkCandidate->getUnderlyingInstr();
        Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
                                      nullptr /*Mask*/, *SinkCandidateRepR,
                                      *SinkCandidateRepR);
        // TODO: add ".cloned" suffix to name of Clone's VPValue.
      } else {
        Clone = SinkCandidate->clone();
      }

      // The clone stays in place to serve first-lane users outside SinkTo;
      // the original moves into SinkTo below.
      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    // The moved recipe's operands become new sinking candidates.
    for (VPValue *Op : SinkCandidate->operands())
      InsertIfValidSinkCandidate(SinkTo, Op);
    Changed = true;
  }
  return Changed;
}
400
401/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
402/// the mask.
403static VPValue *getPredicatedMask(VPRegionBlock *R) {
404 auto *EntryBB = dyn_cast<VPBasicBlock>(Val: R->getEntry());
405 if (!EntryBB || EntryBB->size() != 1 ||
406 !isa<VPBranchOnMaskRecipe>(Val: EntryBB->begin()))
407 return nullptr;
408
409 return cast<VPBranchOnMaskRecipe>(Val: &*EntryBB->begin())->getOperand(N: 0);
410}
411
412/// If \p R is a triangle region, return the 'then' block of the triangle.
413static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
414 auto *EntryBB = cast<VPBasicBlock>(Val: R->getEntry());
415 if (EntryBB->getNumSuccessors() != 2)
416 return nullptr;
417
418 auto *Succ0 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[0]);
419 auto *Succ1 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[1]);
420 if (!Succ0 || !Succ1)
421 return nullptr;
422
423 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
424 return nullptr;
425 if (Succ0->getSingleSuccessor() == Succ1)
426 return Succ0;
427 if (Succ1->getSingleSuccessor() == Succ0)
428 return Succ1;
429 return nullptr;
430}
431
// Merge replicate regions in their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock. Returns true if any region was merged away.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks to process front. This is to avoid
  // iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    // Both regions must be guarded by the same mask for the merge to be
    // semantically equivalent.
    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both region must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    // A region may already have been folded into a predecessor merge in a
    // previous iteration.
    if (TransformedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region. Iterating in reverse while
    // inserting at the front preserves the original relative order.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      // Inside Then2 the predicated value is available directly; no phi
      // needed there.
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() == Then2;
      });

      // Remove phi recipes that are unused after merging the regions.
      if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
        Phi1ToMove.eraseFromParent();
        continue;
      }
      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Remove the dead recipes in Region1's entry block.
    for (VPRecipeBase &R :
         make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
      R.eraseFromParent();

    // Finally, remove the first region: splice its predecessors directly to
    // the (empty) middle block.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    TransformedRegions.insert(Region1);
  }

  return !TransformedRegions.empty();
}
523
/// Build a triangular if-then replicate region around the predicated
/// replicate recipe \p PredRecipe, replacing it with an unmasked replicate
/// inside the region (plus a VPPredInstPHIRecipe in the continue block when
/// the result has users). \p PredRecipe is erased; the new region is returned.
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *MaskDef = BlockInMask->getDefiningRecipe();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(
      BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
  auto *Entry =
      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region. drop_end removes the trailing mask
  // operand.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
      PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
      PredRecipe->getDebugLoc());
  auto *Pred =
      Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  // Only materialize a phi when the predicated value is actually used.
  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
                                        RecipeWithoutMask->getDebugLoc());
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    // replaceAllUsesWith also rewrote the phi's own operand; restore it.
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting =
      Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region =
      Plan.createReplicateRegion(Entry, Exiting, RegionName);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}
566
/// Wrap every predicated VPReplicateRecipe in \p Plan in its own if-then
/// replicate region, splitting the containing block at the recipe.
static void addReplicateRegions(VPlan &Plan) {
  // Collect first, transform second: creating regions mutates the CFG we are
  // traversing.
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    // Everything from RepR onwards moves into SplitBlock; the region is then
    // inserted on the edge between the two halves.
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    // Name the split block after the original IR block for readability.
    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);

    // If we split the exiting block of the enclosing region, the new tail
    // becomes the exiting block.
    VPRegionBlock *ParentRegion = Region->getParent();
    if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
      ParentRegion->setExiting(SplitBlock);
  }
}
596
/// Fold each basic block that has a single VPBasicBlock predecessor with a
/// single successor into that predecessor. Returns true if any block was
/// merged.
bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
  // Collect first, merge second, to avoid invalidating the traversal.
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    // Don't fold the blocks in the skeleton of the Plan into their single
    // predecessors for now.
    // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
    if (!VPBB->getParent())
      continue;
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    // IR basic blocks are fixed; don't merge into them.
    if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
        isa<VPIRBasicBlock>(PredVPBB))
      continue;
    WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    // Append VPBB's recipes to the predecessor, preserving order.
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    // If VPBB was its region's exiting block, the predecessor takes over.
    auto *ParentRegion = VPBB->getParent();
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
    // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  }
  return !WorkList.empty();
}
627
628void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
629 // Convert masked VPReplicateRecipes to if-then region blocks.
630 addReplicateRegions(Plan);
631
632 bool ShouldSimplify = true;
633 while (ShouldSimplify) {
634 ShouldSimplify = sinkScalarOperands(Plan);
635 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
636 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
637 }
638}
639
/// Remove redundant casts of inductions.
///
/// Such redundant casts are casts of induction variables that can be ignored,
/// because we already proved that the casted phi is equal to the uncasted phi
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
static void removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    // Truncated IVs keep their casts; only untruncated ones are handled here.
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      // Find the recipe among FindMyCast's users that corresponds to IRCast.
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      // NOTE(review): FindMyCast is dereferenced below without a null check,
      // so this relies on every recorded cast having a matching recipe in the
      // plan — presumably guaranteed by how the cast chain was recorded;
      // verify against the induction-descriptor construction.
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}
676
/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
  // Find any widened canonical IV among the canonical IV's users.
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  // Look for a canonical widened int/fp induction in the header to reuse.
  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (Plan.hasScalarVFOnly() ||
        !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      // We are replacing a wide canonical iv with a suitable wide induction.
      // This is used to compute header mask, hence all lanes will be used and
      // we need to drop wrap flags only applying to lanes guaranteed to execute
      // in the original scalar loop.
      WidenOriginalIV->dropPoisonGeneratingFlags();
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}
717
718/// Returns true if \p R is dead and can be removed.
719static bool isDeadRecipe(VPRecipeBase &R) {
720 // Do remove conditional assume instructions as their conditions may be
721 // flattened.
722 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
723 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
724 match(V: RepR, P: m_Intrinsic<Intrinsic::assume>());
725 if (IsConditionalAssume)
726 return true;
727
728 if (R.mayHaveSideEffects())
729 return false;
730
731 // Recipe is dead if no user keeps the recipe alive.
732 return all_of(Range: R.definedValues(),
733 P: [](VPValue *V) { return V->getNumUsers() == 0; });
734}
735
/// Erase all dead recipes from \p Plan, including dead VPPhi <-> update
/// cycles whose value is never observed outside the cycle.
void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_post_order_deep(Plan.getEntry()))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (isDeadRecipe(R)) {
        R.eraseFromParent();
        continue;
      }

      // Check if R is a dead VPPhi <-> update cycle and remove it: a
      // two-operand phi whose only user is the recipe defining its back-edge
      // value, where that value in turn is only used by the phi.
      VPValue *Start, *Incoming;
      if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
        continue;
      auto *PhiR = cast<VPPhi>(&R);
      VPUser *PhiUser = PhiR->getSingleUser();
      if (!PhiUser)
        continue;
      if (PhiUser != Incoming->getDefiningRecipe() ||
          Incoming->getNumUsers() != 1)
        continue;
      // Remaining uses of the phi (none expected beyond the update) fold to
      // the start value; then both cycle members can be erased.
      PhiR->replaceAllUsesWith(Start);
      PhiR->eraseFromParent();
      Incoming->getDefiningRecipe()->eraseFromParent();
    }
  }
}
764
/// Create a scalar-steps chain (derived IV + scalar IV steps) for an
/// induction described by \p Kind / \p InductionOpcode, starting at \p StartV
/// with stride \p Step, based on the canonical IV. \p TruncI, if non-null,
/// requests the result be truncated to its type; the step is truncated to
/// match the result type when needed. Returns the VPScalarIVStepsRecipe
/// producing the per-lane scalar steps.
static VPScalarIVStepsRecipe *
createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
                    Instruction::BinaryOps InductionOpcode,
                    FPMathOperator *FPBinOp, Instruction *TruncI,
                    VPIRValue *StartV, VPValue *Step, DebugLoc DL,
                    VPBuilder &Builder) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
  VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
      Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");

  // Truncate base induction if needed.
  VPTypeAnalysis TypeInfo(Plan);
  Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
  if (TruncI) {
    Type *TruncTy = TruncI->getType();
    assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
    ResultTy = TruncTy;
  }

  // Truncate step if needed. The cast is loop-invariant, so it is emitted in
  // the vector preheader rather than at the current insert point.
  Type *StepTy = TypeInfo.inferScalarType(Step);
  if (ResultTy != StepTy) {
    assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    auto *VecPreheader =
        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(VecPreheader);
    Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
  }
  return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
                                     &Plan.getVF(), DL);
}
804
805static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
806 SetVector<VPUser *> Users(llvm::from_range, V->users());
807 for (unsigned I = 0; I != Users.size(); ++I) {
808 VPRecipeBase *Cur = cast<VPRecipeBase>(Val: Users[I]);
809 if (isa<VPHeaderPHIRecipe>(Val: Cur))
810 continue;
811 for (VPValue *V : Cur->definedValues())
812 Users.insert_range(R: V->users());
813 }
814 return Users.takeVector();
815}
816
817/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
818/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
819/// generates scalar values.
820static VPValue *
821scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
822 VPlan &Plan, VPBuilder &Builder) {
823 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
824 VPIRValue *StartV = Plan.getZero(Ty: ID.getStep()->getType());
825 VPValue *StepV = PtrIV->getOperand(N: 1);
826 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
827 Plan, Kind: InductionDescriptor::IK_IntInduction, InductionOpcode: Instruction::Add, FPBinOp: nullptr,
828 TruncI: nullptr, StartV, Step: StepV, DL: PtrIV->getDebugLoc(), Builder);
829
830 return Builder.createPtrAdd(Ptr: PtrIV->getStartValue(), Offset: Steps,
831 DL: PtrIV->getDebugLoc(), Name: "next.gep");
832}
833
834/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
835/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
836/// VPWidenPointerInductionRecipe will generate vectors only. If some users
837/// require vectors while other require scalars, the scalar uses need to extract
838/// the scalars from the generated vectors (Note that this is different to how
839/// int/fp inductions are handled). Legalize extract-from-ends using uniform
840/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
841/// the correct end value is available. Also optimize
842/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
843/// providing them scalar steps built on the canonical scalar IV and update the
844/// original IV's users. This is an optional optimization to reduce the needs of
845/// vector extracts.
static void legalizeAndOptimizeInductions(VPlan &Plan) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
  // New recipes are inserted at the start of the header, right after the phis.
  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *PhiR = dyn_cast<VPWidenInductionRecipe>(Val: &Phi);
    if (!PhiR)
      continue;

    // Try to narrow wide and replicating recipes to uniform recipes, based on
    // VPlan analysis.
    // TODO: Apply to all recipes in the future, to replace legacy uniformity
    // analysis.
    auto Users = collectUsersRecursively(V: PhiR);
    // NOTE(review): users are visited in reverse collection order, i.e.
    // transitive users before their operands — presumably so narrowing a use
    // does not invalidate the analysis of recipes feeding it; confirm.
    for (VPUser *U : reverse(C&: Users)) {
      auto *Def = dyn_cast<VPRecipeWithIRFlags>(Val: U);
      auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U);
      // Skip recipes that shouldn't be narrowed: non-replicate/widen recipes,
      // dead recipes, recipes without an underlying IR value, and replicate
      // recipes that are already single-scalar or are predicated.
      if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Val: Def) ||
          Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
          (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
        continue;

      // Skip recipes that may have other lanes than their first used.
      if (!vputils::isSingleScalar(VPV: Def) && !vputils::onlyFirstLaneUsed(Def))
        continue;

      // Replace the recipe with an unpredicated uniform replicate recipe
      // over the same underlying instruction, preserving its IR flags.
      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
                                          Def->operands(), /*IsUniform*/ true,
                                          /*Mask*/ nullptr, /*Flags*/ *Def);
      Clone->insertAfter(InsertPos: Def);
      Def->replaceAllUsesWith(New: Clone);
    }

    // Replace wide pointer inductions which have only their scalars used by
    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(Val: &Phi)) {
      // With vector VFs, only scalarize when the recipe generates scalars
      // exclusively; with a scalar-only VF it is always scalarized.
      if (!Plan.hasScalarVFOnly() &&
          !PtrIV->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF()))
        continue;

      VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
      PtrIV->replaceAllUsesWith(New: PtrAdd);
      continue;
    }

    // Replace widened induction with scalar steps for users that only use
    // scalars.
    auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
    // Nothing to do if all VFs are vector and no user needs scalar values.
    if (HasOnlyVectorVFs && none_of(Range: WideIV->users(), P: [WideIV](VPUser *U) {
          return U->usesScalars(Op: WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
        Plan, Kind: ID.getKind(), InductionOpcode: ID.getInductionOpcode(),
        FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
        TruncI: WideIV->getTruncInst(), StartV: WideIV->getStartValue(), Step: WideIV->getStepValue(),
        DL: WideIV->getDebugLoc(), Builder);

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs) {
      assert(!Plan.hasScalableVF() &&
             "plans containing a scalar VF cannot also include scalable VFs");
      WideIV->replaceAllUsesWith(New: Steps);
    } else {
      // With scalable VFs only first-lane-only users may switch to the
      // scalar steps; otherwise all scalar users may.
      bool HasScalableVF = Plan.hasScalableVF();
      WideIV->replaceUsesWithIf(New: Steps,
                                ShouldReplace: [WideIV, HasScalableVF](VPUser &U, unsigned) {
                                  if (HasScalableVF)
                                    return U.usesFirstLaneOnly(Op: WideIV);
                                  return U.usesScalars(Op: WideIV);
                                });
    }
  }
}
923
924/// Check if \p VPV is an untruncated wide induction, either before or after the
925/// increment. If so return the header IV (before the increment), otherwise
926/// return null.
static VPWidenInductionRecipe *
getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE) {
  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: VPV);
  if (WideIV) {
    // VPV itself is a wide induction, separately compute the end value for exit
    // users if it is not a truncated IV.
    auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
    return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
  }

  // Check if VPV is an optimizable induction increment.
  VPRecipeBase *Def = VPV->getDefiningRecipe();
  if (!Def || Def->getNumOperands() != 2)
    return nullptr;
  // The wide induction may be either operand of the increment.
  WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 0));
  if (!WideIV)
    WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 1));
  if (!WideIV)
    return nullptr;

  // Return true if VPV advances WideIV by exactly its induction step.
  auto IsWideIVInc = [&]() {
    auto &ID = WideIV->getInductionDescriptor();

    // Check if VPV increments the induction by the induction step.
    VPValue *IVStep = WideIV->getStepValue();
    switch (ID.getInductionOpcode()) {
    case Instruction::Add:
      return match(V: VPV, P: m_c_Add(Op0: m_Specific(VPV: WideIV), Op1: m_Specific(VPV: IVStep)));
    case Instruction::FAdd:
      return match(V: VPV, P: m_c_FAdd(Op0: m_Specific(VPV: WideIV), Op1: m_Specific(VPV: IVStep)));
    case Instruction::FSub:
      return match(V: VPV, P: m_Binary<Instruction::FSub>(Op0: m_Specific(VPV: WideIV),
                                                    Op1: m_Specific(VPV: IVStep)));
    case Instruction::Sub: {
      // IVStep will be the negated step of the subtraction. Check if Step == -1
      // * IVStep.
      VPValue *Step;
      if (!match(V: VPV, P: m_Sub(Op0: m_VPValue(), Op1: m_VPValue(V&: Step))))
        return false;
      // Compare via SCEV: the subtrahend may be a different VPValue that is
      // nonetheless equal to the negated induction step.
      const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(V: IVStep, PSE);
      const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(V: Step, PSE);
      ScalarEvolution &SE = *PSE.getSE();
      return !isa<SCEVCouldNotCompute>(Val: IVStepSCEV) &&
             !isa<SCEVCouldNotCompute>(Val: StepSCEV) &&
             IVStepSCEV == SE.getNegativeSCEV(V: StepSCEV);
    }
    default:
      // Pointer inductions are advanced by a GEP of the step value.
      return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
             match(V: VPV, P: m_GetElementPtr(Op0: m_Specific(VPV: WideIV),
                                        Op1: m_Specific(VPV: WideIV->getStepValue())));
    }
    llvm_unreachable("should have been covered by switch above");
  };
  return IsWideIVInc() ? WideIV : nullptr;
}
982
983/// Attempts to optimize the induction variable exit values for users in the
984/// early exit block.
static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
                                               VPTypeAnalysis &TypeInfo,
                                               VPBlockBase *PredVPBB,
                                               VPValue *Op,
                                               PredicatedScalarEvolution &PSE) {
  // Only handle extracts of the first active lane (per Mask) of an
  // induction-derived value.
  VPValue *Incoming, *Mask;
  if (!match(V: Op, P: m_ExtractLane(Op0: m_FirstActiveLane(Op0: m_VPValue(V&: Mask)),
                             Op1: m_VPValue(V&: Incoming))))
    return nullptr;

  auto *WideIV = getOptimizableIVOf(VPV: Incoming, PSE);
  if (!WideIV)
    return nullptr;

  // Truncated IVs are not optimized here; their exit value would need to be
  // computed in the truncated type.
  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  // Calculate the final index.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto *CanonicalIV = LoopRegion->getCanonicalIV();
  Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
  VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB));

  // The exiting iteration index is the canonical IV plus the index of the
  // first active lane, computed in the canonical IV's type.
  DebugLoc DL = cast<VPInstruction>(Val: Op)->getDebugLoc();
  VPValue *FirstActiveLane =
      B.createNaryOp(Opcode: VPInstruction::FirstActiveLane, Operands: Mask, DL);
  Type *FirstActiveLaneType = TypeInfo.inferScalarType(V: FirstActiveLane);
  FirstActiveLane = B.createScalarZExtOrTrunc(Op: FirstActiveLane, ResultTy: CanonicalIVType,
                                              SrcTy: FirstActiveLaneType, DL);
  VPValue *EndValue = B.createAdd(LHS: CanonicalIV, RHS: FirstActiveLane, DL);

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we need to
  // add the step.
  if (Incoming != WideIV) {
    VPValue *One = Plan.getConstantInt(Ty: CanonicalIVType, Val: 1);
    EndValue = B.createAdd(LHS: EndValue, RHS: One, DL);
  }

  // For non-canonical inductions, transform the canonical iteration index
  // into the induction's value at that iteration: Start + Index * Step.
  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPIRValue *Start = WideIV->getStartValue();
    VPValue *Step = WideIV->getStepValue();
    EndValue = B.createDerivedIV(
        Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
        Start, Current: EndValue, Step);
  }

  return EndValue;
}
1036
1037/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1038/// VPDerivedIVRecipe for non-canonical inductions.
static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV,
                                                 VPBuilder &VectorPHBuilder,
                                                 VPTypeAnalysis &TypeInfo,
                                                 VPValue *VectorTC) {
  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
  // Truncated wide inductions resume from the last lane of their vector value
  // in the last vector iteration which is handled elsewhere.
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  VPIRValue *Start = WideIV->getStartValue();
  VPValue *Step = WideIV->getStepValue();
  const InductionDescriptor &ID = WideIV->getInductionDescriptor();
  // A canonical int/fp IV's end value is simply the vector trip count; any
  // other induction needs a derived IV: Start + VectorTC * Step.
  VPValue *EndValue = VectorTC;
  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    EndValue = VectorPHBuilder.createDerivedIV(
        Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
        Start, Current: VectorTC, Step);
  }

  // EndValue is derived from the vector trip count (which has the same type as
  // the widest induction) and thus may be wider than the induction here.
  Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(V: WideIV);
  if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(V: EndValue)) {
    EndValue = VectorPHBuilder.createScalarCast(Opcode: Instruction::Trunc, Op: EndValue,
                                                ResultTy: ScalarTypeOfWideIV,
                                                DL: WideIV->getDebugLoc());
  }

  return EndValue;
}
1070
1071/// Attempts to optimize the induction variable exit values for users in the
1072/// exit block coming from the latch in the original scalar loop.
static VPValue *optimizeLatchExitInductionUser(
    VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
    DenseMap<VPValue *, VPValue *> &EndValues, PredicatedScalarEvolution &PSE) {
  // Only handle extracts of the last lane of the last part of an
  // induction-derived value.
  VPValue *Incoming;
  VPWidenInductionRecipe *WideIV = nullptr;
  if (match(V: Op, P: m_ExtractLastLaneOfLastPart(Op0: m_VPValue(V&: Incoming))))
    WideIV = getOptimizableIVOf(VPV: Incoming, PSE);

  if (!WideIV)
    return nullptr;

  VPValue *EndValue = EndValues.lookup(Val: WideIV);
  assert(EndValue && "Must have computed the end value up front");

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we don't
  // need to subtract the step.
  if (Incoming != WideIV)
    return EndValue;

  // Otherwise, subtract the step from the EndValue, using the operation
  // matching the induction's scalar type.
  VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB)->getTerminator());
  VPValue *Step = WideIV->getStepValue();
  Type *ScalarTy = TypeInfo.inferScalarType(V: WideIV);
  if (ScalarTy->isIntegerTy())
    return B.createSub(LHS: EndValue, RHS: Step, DL: DebugLoc::getUnknown(), Name: "ind.escape");
  if (ScalarTy->isPointerTy()) {
    // Pointers are stepped back via a PtrAdd with the negated (0 - Step)
    // integer step.
    Type *StepTy = TypeInfo.inferScalarType(V: Step);
    auto *Zero = Plan.getZero(Ty: StepTy);
    return B.createPtrAdd(Ptr: EndValue, Offset: B.createSub(LHS: Zero, RHS: Step),
                          DL: DebugLoc::getUnknown(), Name: "ind.escape");
  }
  if (ScalarTy->isFloatingPointTy()) {
    // Undo one step with the inverse FP operation, keeping the induction's
    // fast-math flags.
    const auto &ID = WideIV->getInductionDescriptor();
    return B.createNaryOp(
        Opcode: ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
            ? Instruction::FSub
            : Instruction::FAdd,
        Operands: {EndValue, Step}, Flags: {ID.getInductionBinOp()->getFastMathFlags()});
  }
  llvm_unreachable("all possible induction types must be handled");
  return nullptr;
}
1116
/// Optimize live-out users of inductions: compute end values once in the
/// vector preheader and rewrite middle-block and exit-block users to use them
/// instead of extracting lanes from wide induction values.
void VPlanTransforms::optimizeInductionLiveOutUsers(
    VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
  // Compute end values for all inductions.
  VPTypeAnalysis TypeInfo(Plan);
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  auto *VectorPH = cast<VPBasicBlock>(Val: VectorRegion->getSinglePredecessor());
  VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
  DenseMap<VPValue *, VPValue *> EndValues;
  // With tail folding the vector loop covers the full trip count; otherwise
  // end values are based on the vector trip count.
  VPValue *ResumeTC =
      FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
  for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
    auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: &Phi);
    if (!WideIV)
      continue;
    if (VPValue *EndValue = tryToComputeEndValueForInduction(
            WideIV, VectorPHBuilder, TypeInfo, VectorTC: ResumeTC))
      EndValues[WideIV] = EndValue;
  }

  // Replace exiting-IV-value recipes in the middle block with the
  // precomputed end values.
  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  for (VPRecipeBase &R : make_early_inc_range(Range&: *MiddleVPBB)) {
    VPValue *Op;
    if (!match(V: &R, P: m_ExitingIVValue(Op0: m_VPValue(V&: Op))))
      continue;
    auto *WideIV = cast<VPWidenInductionRecipe>(Val: Op);
    if (VPValue *EndValue = EndValues.lookup(Val: WideIV)) {
      R.getVPSingleValue()->replaceAllUsesWith(New: EndValue);
      R.eraseFromParent();
    }
  }

  // Then, optimize exit block users.
  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
    for (VPRecipeBase &R : ExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(Val: &R);

      // Each phi operand corresponds to one predecessor: the middle block
      // (latch exit) or an early-exit block.
      for (auto [Idx, PredVPBB] : enumerate(First&: ExitVPBB->getPredecessors())) {
        VPValue *Escape = nullptr;
        if (PredVPBB == MiddleVPBB)
          Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
                                                  Op: ExitIRI->getOperand(N: Idx),
                                                  EndValues, PSE);
        else
          Escape = optimizeEarlyExitInductionUser(
              Plan, TypeInfo, PredVPBB, Op: ExitIRI->getOperand(N: Idx), PSE);
        if (Escape)
          ExitIRI->setOperand(I: Idx, New: Escape);
      }
    }
  }
}
1168
1169/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing
1170/// them with already existing recipes expanding the same SCEV expression.
1171static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
1172 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
1173
1174 for (VPRecipeBase &R :
1175 make_early_inc_range(Range&: *Plan.getEntry()->getEntryBasicBlock())) {
1176 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
1177 if (!ExpR)
1178 continue;
1179
1180 const auto &[V, Inserted] = SCEV2VPV.try_emplace(Key: ExpR->getSCEV(), Args&: ExpR);
1181 if (Inserted)
1182 continue;
1183 ExpR->replaceAllUsesWith(New: V->second);
1184 ExpR->eraseFromParent();
1185 }
1186}
1187
1188static void recursivelyDeleteDeadRecipes(VPValue *V) {
1189 SmallVector<VPValue *> WorkList;
1190 SmallPtrSet<VPValue *, 8> Seen;
1191 WorkList.push_back(Elt: V);
1192
1193 while (!WorkList.empty()) {
1194 VPValue *Cur = WorkList.pop_back_val();
1195 if (!Seen.insert(Ptr: Cur).second)
1196 continue;
1197 VPRecipeBase *R = Cur->getDefiningRecipe();
1198 if (!R)
1199 continue;
1200 if (!isDeadRecipe(R&: *R))
1201 continue;
1202 append_range(C&: WorkList, R: R->operands());
1203 R->eraseFromParent();
1204 }
1205}
1206
1207/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1208/// Returns an optional pair, where the first element indicates whether it is
1209/// an intrinsic ID.
static std::optional<std::pair<bool, unsigned>>
getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
  return TypeSwitch<const VPSingleDefRecipe *,
                    std::optional<std::pair<bool, unsigned>>>(R)
      // Recipes with a plain instruction opcode: {false, opcode}.
      .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, VPWidenGEPRecipe,
            VPReplicateRecipe>(
          caseFn: [](auto *I) { return std::make_pair(false, I->getOpcode()); })
      // Intrinsic recipes: {true, intrinsic ID}.
      .Case(caseFn: [](const VPWidenIntrinsicRecipe *I) {
        return std::make_pair(x: true, y: I->getVectorIntrinsicID());
      })
      .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>(caseFn: [](auto *I) {
        // For recipes that do not directly map to LLVM IR instructions,
        // assign opcodes after the last VPInstruction opcode (which is also
        // after the last IR Instruction opcode), based on the VPRecipeID.
        return std::make_pair(false,
                              VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
      })
      // All other recipes carry no opcode/intrinsic information.
      .Default(defaultFn: [](auto *) { return std::nullopt; });
}
1229
1230/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1231/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1232/// Operands are foldable live-ins.
static VPIRValue *tryToFoldLiveIns(VPSingleDefRecipe &R,
                                   ArrayRef<VPValue *> Operands,
                                   const DataLayout &DL,
                                   VPTypeAnalysis &TypeInfo) {
  // Only recipes with a known opcode or intrinsic ID can be folded.
  auto OpcodeOrIID = getOpcodeOrIntrinsicID(R: &R);
  if (!OpcodeOrIID)
    return nullptr;

  // All operands must be live-ins with an underlying IR value; collect them.
  SmallVector<Value *, 4> Ops;
  for (VPValue *Op : Operands) {
    if (!match(V: Op, P: m_LiveIn()))
      return nullptr;
    Value *V = Op->getUnderlyingValue();
    if (!V)
      return nullptr;
    Ops.push_back(Elt: V);
  }

  // Dispatch to the matching InstSimplifyFolder entry point; returns null
  // for unhandled opcodes or when the folder cannot simplify.
  auto FoldToIRValue = [&]() -> Value * {
    InstSimplifyFolder Folder(DL);
    if (OpcodeOrIID->first) {
      // Intrinsic: only binary intrinsics are folded here.
      if (R.getNumOperands() != 2)
        return nullptr;
      unsigned ID = OpcodeOrIID->second;
      return Folder.FoldBinaryIntrinsic(ID, LHS: Ops[0], RHS: Ops[1],
                                        Ty: TypeInfo.inferScalarType(V: &R));
    }
    unsigned Opcode = OpcodeOrIID->second;
    if (Instruction::isBinaryOp(Opcode))
      return Folder.FoldBinOp(Opc: static_cast<Instruction::BinaryOps>(Opcode),
                              LHS: Ops[0], RHS: Ops[1]);
    if (Instruction::isCast(Opcode))
      return Folder.FoldCast(Op: static_cast<Instruction::CastOps>(Opcode), V: Ops[0],
                             DestTy: TypeInfo.inferScalarType(V: R.getVPSingleValue()));
    switch (Opcode) {
    // LogicalAnd is select(a, b, false).
    case VPInstruction::LogicalAnd:
      return Folder.FoldSelect(C: Ops[0], True: Ops[1],
                               False: ConstantInt::getNullValue(Ty: Ops[1]->getType()));
    // Not is xor(a, all-ones).
    case VPInstruction::Not:
      return Folder.FoldBinOp(Opc: Instruction::BinaryOps::Xor, LHS: Ops[0],
                              RHS: Constant::getAllOnesValue(Ty: Ops[0]->getType()));
    case Instruction::Select:
      return Folder.FoldSelect(C: Ops[0], True: Ops[1], False: Ops[2]);
    case Instruction::ICmp:
    case Instruction::FCmp:
      return Folder.FoldCmp(P: cast<VPRecipeWithIRFlags>(Val&: R).getPredicate(), LHS: Ops[0],
                            RHS: Ops[1]);
    case Instruction::GetElementPtr: {
      auto &RFlags = cast<VPRecipeWithIRFlags>(Val&: R);
      auto *GEP = cast<GetElementPtrInst>(Val: RFlags.getUnderlyingInstr());
      return Folder.FoldGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0],
                            IdxList: drop_begin(RangeOrContainer&: Ops), NW: RFlags.getGEPNoWrapFlags());
    }
    // PtrAdds fold as i8 GEPs.
    case VPInstruction::PtrAdd:
    case VPInstruction::WidePtrAdd:
      return Folder.FoldGEP(Ty: IntegerType::getInt8Ty(C&: TypeInfo.getContext()),
                            Ptr: Ops[0], IdxList: Ops[1],
                            NW: cast<VPRecipeWithIRFlags>(Val&: R).getGEPNoWrapFlags());
    // An extract of a live-in is an extract of a broadcast, so return the
    // broadcasted element.
    case Instruction::ExtractElement:
      assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
      return Ops[0];
    }
    return nullptr;
  };

  // Re-register any folded IR constant/value as a live-in of the plan.
  if (Value *V = FoldToIRValue())
    return R.getParent()->getPlan()->getOrAddLiveIn(V);
  return nullptr;
}
1304
1305/// Try to simplify VPSingleDefRecipe \p Def.
1306static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
1307 VPlan *Plan = Def->getParent()->getPlan();
1308
1309 // Simplification of live-in IR values for SingleDef recipes using
1310 // InstSimplifyFolder.
1311 const DataLayout &DL = Plan->getDataLayout();
1312 if (VPValue *V = tryToFoldLiveIns(R&: *Def, Operands: Def->operands(), DL, TypeInfo))
1313 return Def->replaceAllUsesWith(New: V);
1314
1315 // Fold PredPHI LiveIn -> LiveIn.
1316 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Val: Def)) {
1317 VPValue *Op = PredPHI->getOperand(N: 0);
1318 if (isa<VPIRValue>(Val: Op))
1319 PredPHI->replaceAllUsesWith(New: Op);
1320 }
1321
1322 VPBuilder Builder(Def);
1323
1324 // Avoid replacing VPInstructions with underlying values with new
1325 // VPInstructions, as we would fail to create widen/replicate recpes from the
1326 // new VPInstructions without an underlying value, and miss out on some
1327 // transformations that only apply to widened/replicated recipes later, by
1328 // doing so.
1329 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1330 // VPInstructions without underlying values, as those will get skipped during
1331 // cost computation.
1332 bool CanCreateNewRecipe =
1333 !isa<VPInstruction>(Val: Def) || !Def->getUnderlyingValue();
1334
1335 VPValue *A;
1336 if (match(R: Def, P: m_Trunc(Op0: m_ZExtOrSExt(Op0: m_VPValue(V&: A))))) {
1337 Type *TruncTy = TypeInfo.inferScalarType(V: Def);
1338 Type *ATy = TypeInfo.inferScalarType(V: A);
1339 if (TruncTy == ATy) {
1340 Def->replaceAllUsesWith(New: A);
1341 } else {
1342 // Don't replace a non-widened cast recipe with a widened cast.
1343 if (!isa<VPWidenCastRecipe>(Val: Def))
1344 return;
1345 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1346
1347 unsigned ExtOpcode = match(V: Def->getOperand(N: 0), P: m_SExt(Op0: m_VPValue()))
1348 ? Instruction::SExt
1349 : Instruction::ZExt;
1350 auto *Ext = Builder.createWidenCast(Opcode: Instruction::CastOps(ExtOpcode), Op: A,
1351 ResultTy: TruncTy);
1352 if (auto *UnderlyingExt = Def->getOperand(N: 0)->getUnderlyingValue()) {
1353 // UnderlyingExt has distinct return type, used to retain legacy cost.
1354 Ext->setUnderlyingValue(UnderlyingExt);
1355 }
1356 Def->replaceAllUsesWith(New: Ext);
1357 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1358 auto *Trunc = Builder.createWidenCast(Opcode: Instruction::Trunc, Op: A, ResultTy: TruncTy);
1359 Def->replaceAllUsesWith(New: Trunc);
1360 }
1361 }
1362#ifndef NDEBUG
1363 // Verify that the cached type info is for both A and its users is still
1364 // accurate by comparing it to freshly computed types.
1365 VPTypeAnalysis TypeInfo2(*Plan);
1366 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1367 for (VPUser *U : A->users()) {
1368 auto *R = cast<VPRecipeBase>(U);
1369 for (VPValue *VPV : R->definedValues())
1370 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1371 }
1372#endif
1373 }
1374
1375 // Simplify (X && Y) | (X && !Y) -> X.
1376 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1377 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1378 // recipes to be visited during simplification.
1379 VPValue *X, *Y, *Z;
1380 if (match(R: Def,
1381 P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
1382 Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_Not(Op0: m_Deferred(V: Y)))))) {
1383 Def->replaceAllUsesWith(New: X);
1384 Def->eraseFromParent();
1385 return;
1386 }
1387
1388 // x | AllOnes -> AllOnes
1389 if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_AllOnes())))
1390 return Def->replaceAllUsesWith(
1391 New: Plan->getAllOnesValue(Ty: TypeInfo.inferScalarType(V: Def)));
1392
1393 // x | 0 -> x
1394 if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_ZeroInt())))
1395 return Def->replaceAllUsesWith(New: X);
1396
1397 // x | !x -> AllOnes
1398 if (match(R: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_Not(Op0: m_Deferred(V: X)))))
1399 return Def->replaceAllUsesWith(
1400 New: Plan->getAllOnesValue(Ty: TypeInfo.inferScalarType(V: Def)));
1401
1402 // x & 0 -> 0
1403 if (match(R: Def, P: m_c_BinaryAnd(Op0: m_VPValue(V&: X), Op1: m_ZeroInt())))
1404 return Def->replaceAllUsesWith(
1405 New: Plan->getZero(Ty: TypeInfo.inferScalarType(V: Def)));
1406
1407 // x & AllOnes -> x
1408 if (match(R: Def, P: m_c_BinaryAnd(Op0: m_VPValue(V&: X), Op1: m_AllOnes())))
1409 return Def->replaceAllUsesWith(New: X);
1410
1411 // x && false -> false
1412 if (match(R: Def, P: m_c_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_False())))
1413 return Def->replaceAllUsesWith(New: Plan->getFalse());
1414
1415 // x && true -> x
1416 if (match(R: Def, P: m_c_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_True())))
1417 return Def->replaceAllUsesWith(New: X);
1418
1419 // (x && y) | (x && z) -> x && (y | z)
1420 if (CanCreateNewRecipe &&
1421 match(R: Def, P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)),
1422 Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_VPValue(V&: Z)))) &&
1423 // Simplify only if one of the operands has one use to avoid creating an
1424 // extra recipe.
1425 (!Def->getOperand(N: 0)->hasMoreThanOneUniqueUser() ||
1426 !Def->getOperand(N: 1)->hasMoreThanOneUniqueUser()))
1427 return Def->replaceAllUsesWith(
1428 New: Builder.createLogicalAnd(LHS: X, RHS: Builder.createOr(LHS: Y, RHS: Z)));
1429
1430 // x && (x && y) -> x && y
1431 if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X),
1432 Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_VPValue()))))
1433 return Def->replaceAllUsesWith(New: Def->getOperand(N: 1));
1434
1435 // x && (y && x) -> x && y
1436 if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X),
1437 Op1: m_LogicalAnd(Op0: m_VPValue(V&: Y), Op1: m_Deferred(V: X)))))
1438 return Def->replaceAllUsesWith(New: Builder.createLogicalAnd(LHS: X, RHS: Y));
1439
1440 // x && !x -> 0
1441 if (match(R: Def, P: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_Not(Op0: m_Deferred(V: X)))))
1442 return Def->replaceAllUsesWith(New: Plan->getFalse());
1443
1444 if (match(R: Def, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: X), Op2: m_Deferred(V: X))))
1445 return Def->replaceAllUsesWith(New: X);
1446
1447 // select c, false, true -> not c
1448 VPValue *C;
1449 if (CanCreateNewRecipe &&
1450 match(R: Def, P: m_Select(Op0: m_VPValue(V&: C), Op1: m_False(), Op2: m_True())))
1451 return Def->replaceAllUsesWith(New: Builder.createNot(Operand: C));
1452
1453 // select !c, x, y -> select c, y, x
1454 if (match(R: Def, P: m_Select(Op0: m_Not(Op0: m_VPValue(V&: C)), Op1: m_VPValue(V&: X), Op2: m_VPValue(V&: Y)))) {
1455 Def->setOperand(I: 0, New: C);
1456 Def->setOperand(I: 1, New: Y);
1457 Def->setOperand(I: 2, New: X);
1458 return;
1459 }
1460
1461 if (match(R: Def, P: m_c_Add(Op0: m_VPValue(V&: A), Op1: m_ZeroInt())))
1462 return Def->replaceAllUsesWith(New: A);
1463
1464 if (match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_One())))
1465 return Def->replaceAllUsesWith(New: A);
1466
1467 if (match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_ZeroInt())))
1468 return Def->replaceAllUsesWith(
1469 New: Plan->getZero(Ty: TypeInfo.inferScalarType(V: Def)));
1470
1471 if (CanCreateNewRecipe && match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_AllOnes()))) {
1472 // Preserve nsw from the Mul on the new Sub.
1473 VPIRFlags::WrapFlagsTy NW = {
1474 false, cast<VPRecipeWithIRFlags>(Val: Def)->hasNoSignedWrap()};
1475 return Def->replaceAllUsesWith(
1476 New: Builder.createSub(LHS: Plan->getZero(Ty: TypeInfo.inferScalarType(V: A)), RHS: A,
1477 DL: Def->getDebugLoc(), Name: "", WrapFlags: NW));
1478 }
1479
1480 const APInt *APC;
1481 if (CanCreateNewRecipe && match(R: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_APInt(C&: APC))) &&
1482 APC->isPowerOf2())
1483 return Def->replaceAllUsesWith(New: Builder.createNaryOp(
1484 Opcode: Instruction::Shl,
1485 Operands: {A, Plan->getConstantInt(BitWidth: APC->getBitWidth(), Val: APC->exactLogBase2())},
1486 Flags: *cast<VPRecipeWithIRFlags>(Val: Def), DL: Def->getDebugLoc()));
1487
1488 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1489 // not allowed in them.
1490 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1491 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1492 if (CanCreateNewRecipe && !IsInReplicateRegion &&
1493 match(R: Def, P: m_UDiv(Op0: m_VPValue(V&: A), Op1: m_APInt(C&: APC))) && APC->isPowerOf2())
1494 return Def->replaceAllUsesWith(New: Builder.createNaryOp(
1495 Opcode: Instruction::LShr,
1496 Operands: {A, Plan->getConstantInt(BitWidth: APC->getBitWidth(), Val: APC->exactLogBase2())},
1497 Flags: *cast<VPRecipeWithIRFlags>(Val: Def), DL: Def->getDebugLoc()));
1498
1499 if (match(R: Def, P: m_Not(Op0: m_VPValue(V&: A)))) {
1500 if (match(V: A, P: m_Not(Op0: m_VPValue(V&: A))))
1501 return Def->replaceAllUsesWith(New: A);
1502
1503 // Try to fold Not into compares by adjusting the predicate in-place.
1504 CmpPredicate Pred;
1505 if (match(V: A, P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()))) {
1506 auto *Cmp = cast<VPRecipeWithIRFlags>(Val: A);
1507 if (all_of(Range: Cmp->users(),
1508 P: match_fn(P: m_CombineOr(
1509 L: m_Not(Op0: m_Specific(VPV: Cmp)),
1510 R: m_Select(Op0: m_Specific(VPV: Cmp), Op1: m_VPValue(), Op2: m_VPValue()))))) {
1511 Cmp->setPredicate(CmpInst::getInversePredicate(pred: Pred));
1512 for (VPUser *U : to_vector(Range: Cmp->users())) {
1513 auto *R = cast<VPSingleDefRecipe>(Val: U);
1514 if (match(R, P: m_Select(Op0: m_Specific(VPV: Cmp), Op1: m_VPValue(V&: X), Op2: m_VPValue(V&: Y)))) {
1515 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1516 R->setOperand(I: 1, New: Y);
1517 R->setOperand(I: 2, New: X);
1518 } else {
1519 // not (cmp pred) -> cmp inv_pred
1520 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1521 R->replaceAllUsesWith(New: Cmp);
1522 }
1523 }
1524 // If Cmp doesn't have a debug location, use the one from the negation,
1525 // to preserve the location.
1526 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1527 Cmp->setDebugLoc(Def->getDebugLoc());
1528 }
1529 }
1530 }
1531
1532 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1533 // any-of (fcmp uno %A, %B), ...
1534 if (match(R: Def, P: m_AnyOf())) {
1535 SmallVector<VPValue *, 4> NewOps;
1536 VPRecipeBase *UnpairedCmp = nullptr;
1537 for (VPValue *Op : Def->operands()) {
1538 VPValue *X;
1539 if (Op->getNumUsers() > 1 ||
1540 !match(V: Op, P: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: X),
1541 Op1: m_Deferred(V: X)))) {
1542 NewOps.push_back(Elt: Op);
1543 } else if (!UnpairedCmp) {
1544 UnpairedCmp = Op->getDefiningRecipe();
1545 } else {
1546 NewOps.push_back(Elt: Builder.createFCmp(Pred: CmpInst::FCMP_UNO,
1547 A: UnpairedCmp->getOperand(N: 0), B: X));
1548 UnpairedCmp = nullptr;
1549 }
1550 }
1551
1552 if (UnpairedCmp)
1553 NewOps.push_back(Elt: UnpairedCmp->getVPSingleValue());
1554
1555 if (NewOps.size() < Def->getNumOperands()) {
1556 VPValue *NewAnyOf = Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: NewOps);
1557 return Def->replaceAllUsesWith(New: NewAnyOf);
1558 }
1559 }
1560
1561 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1562 // This is useful for fmax/fmin without fast-math flags, where we need to
1563 // check if any operand is NaN.
1564 if (CanCreateNewRecipe &&
1565 match(R: Def, P: m_BinaryOr(Op0: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: X),
1566 Op1: m_Deferred(V: X)),
1567 Op1: m_SpecificCmp(MatchPred: CmpInst::FCMP_UNO, Op0: m_VPValue(V&: Y),
1568 Op1: m_Deferred(V: Y))))) {
1569 VPValue *NewCmp = Builder.createFCmp(Pred: CmpInst::FCMP_UNO, A: X, B: Y);
1570 return Def->replaceAllUsesWith(New: NewCmp);
1571 }
1572
  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1574 if ((match(R: Def, P: m_DerivedIV(Op0: m_ZeroInt(), Op1: m_VPValue(V&: A), Op2: m_One())) ||
1575 match(R: Def, P: m_DerivedIV(Op0: m_ZeroInt(), Op1: m_ZeroInt(), Op2: m_VPValue()))) &&
1576 TypeInfo.inferScalarType(V: Def->getOperand(N: 1)) ==
1577 TypeInfo.inferScalarType(V: Def))
1578 return Def->replaceAllUsesWith(New: Def->getOperand(N: 1));
1579
1580 if (match(R: Def, P: m_VPInstruction<VPInstruction::WideIVStep>(Ops: m_VPValue(V&: X),
1581 Ops: m_One()))) {
1582 Type *WideStepTy = TypeInfo.inferScalarType(V: Def);
1583 if (TypeInfo.inferScalarType(V: X) != WideStepTy)
1584 X = Builder.createWidenCast(Opcode: Instruction::Trunc, Op: X, ResultTy: WideStepTy);
1585 Def->replaceAllUsesWith(New: X);
1586 return;
1587 }
1588
1589 // For i1 vp.merges produced by AnyOf reductions:
1590 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1591 if (match(R: Def, P: m_Intrinsic<Intrinsic::vp_merge>(Op0: m_True(), Op1: m_VPValue(V&: A),
1592 Op2: m_VPValue(V&: X), Op3: m_VPValue())) &&
1593 match(V: A, P: m_c_BinaryOr(Op0: m_Specific(VPV: X), Op1: m_VPValue(V&: Y))) &&
1594 TypeInfo.inferScalarType(V: Def)->isIntegerTy(Bitwidth: 1)) {
1595 Def->setOperand(I: 1, New: Def->getOperand(N: 0));
1596 Def->setOperand(I: 0, New: Y);
1597 return;
1598 }
1599
1600 // Simplify MaskedCond with no block mask to its single operand.
1601 if (match(R: Def, P: m_VPInstruction<VPInstruction::MaskedCond>()) &&
1602 !cast<VPInstruction>(Val: Def)->isMasked())
1603 return Def->replaceAllUsesWith(New: Def->getOperand(N: 0));
1604
1605 // Look through ExtractLastLane.
1606 if (match(R: Def, P: m_ExtractLastLane(Op0: m_VPValue(V&: A)))) {
1607 if (match(V: A, P: m_BuildVector())) {
1608 auto *BuildVector = cast<VPInstruction>(Val: A);
1609 Def->replaceAllUsesWith(
1610 New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 1));
1611 return;
1612 }
1613 if (Plan->hasScalarVFOnly())
1614 return Def->replaceAllUsesWith(New: A);
1615 }
1616
1617 // Look through ExtractPenultimateElement (BuildVector ....).
1618 if (match(R: Def, P: m_ExtractPenultimateElement(Op0: m_BuildVector()))) {
1619 auto *BuildVector = cast<VPInstruction>(Val: Def->getOperand(N: 0));
1620 Def->replaceAllUsesWith(
1621 New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 2));
1622 return;
1623 }
1624
1625 uint64_t Idx;
1626 if (match(R: Def, P: m_ExtractElement(Op0: m_BuildVector(), Op1: m_ConstantInt(C&: Idx)))) {
1627 auto *BuildVector = cast<VPInstruction>(Val: Def->getOperand(N: 0));
1628 Def->replaceAllUsesWith(New: BuildVector->getOperand(N: Idx));
1629 return;
1630 }
1631
1632 if (match(R: Def, P: m_BuildVector()) && all_equal(Range: Def->operands())) {
1633 Def->replaceAllUsesWith(
1634 New: Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Def->getOperand(N: 0)));
1635 return;
1636 }
1637
1638 // Look through broadcast of single-scalar when used as select conditions; in
1639 // that case the scalar condition can be used directly.
1640 if (match(R: Def,
1641 P: m_Select(Op0: m_Broadcast(Op0: m_VPValue(V&: C)), Op1: m_VPValue(), Op2: m_VPValue()))) {
1642 assert(vputils::isSingleScalar(C) &&
1643 "broadcast operand must be single-scalar");
1644 Def->setOperand(I: 0, New: C);
1645 return;
1646 }
1647
1648 if (isa<VPPhi, VPWidenPHIRecipe, VPHeaderPHIRecipe>(Val: Def)) {
1649 if (Def->getNumOperands() == 1) {
1650 Def->replaceAllUsesWith(New: Def->getOperand(N: 0));
1651 return;
1652 }
1653 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: Def)) {
1654 if (all_equal(Range: Phi->incoming_values()))
1655 Phi->replaceAllUsesWith(New: Phi->getOperand(N: 0));
1656 }
1657 return;
1658 }
1659
1660 VPIRValue *IRV;
1661 if (Def->getNumOperands() == 1 &&
1662 match(R: Def, P: m_ComputeReductionResult(Op0: m_VPIRValue(V&: IRV))))
1663 return Def->replaceAllUsesWith(New: IRV);
1664
1665 // Some simplifications can only be applied after unrolling. Perform them
1666 // below.
1667 if (!Plan->isUnrolled())
1668 return;
1669
1670 // After unrolling, extract-lane may be used to extract values from multiple
1671 // scalar sources. Only simplify when extracting from a single scalar source.
1672 VPValue *LaneToExtract;
1673 if (match(R: Def, P: m_ExtractLane(Op0: m_VPValue(V&: LaneToExtract), Op1: m_VPValue(V&: A)))) {
1674 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1675 if (vputils::isSingleScalar(VPV: A))
1676 return Def->replaceAllUsesWith(New: A);
1677
1678 // Simplify extract-lane with single source to extract-element.
1679 Def->replaceAllUsesWith(New: Builder.createNaryOp(
1680 Opcode: Instruction::ExtractElement, Operands: {A, LaneToExtract}, DL: Def->getDebugLoc()));
1681 return;
1682 }
1683
1684 // Look for cycles where Def is of the form:
1685 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1686 // IVInc = X + Step ; used by X and Def
1687 // Def = IVInc + Y
1688 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1689 // and if Inc exists, replace it with X.
1690 if (match(R: Def, P: m_Add(Op0: m_Add(Op0: m_VPValue(V&: X), Op1: m_VPValue()), Op1: m_VPValue(V&: Y))) &&
1691 isa<VPIRValue>(Val: Y) && !isa<VPConstantInt>(Val: Y) &&
1692 match(V: X, P: m_VPPhi(Op0: m_ZeroInt(), Op1: m_Specific(VPV: Def->getOperand(N: 0))))) {
1693 auto *Phi = cast<VPPhi>(Val: X);
1694 auto *IVInc = Def->getOperand(N: 0);
1695 if (IVInc->getNumUsers() == 2) {
1696 // If Phi has a second user (besides IVInc's defining recipe), it must
1697 // be Inc = Phi + Y for the fold to apply.
1698 auto *Inc = dyn_cast_or_null<VPSingleDefRecipe>(
1699 Val: vputils::findUserOf(V: Phi, P: m_Add(Op0: m_Specific(VPV: Phi), Op1: m_Specific(VPV: Y))));
1700 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1701 Def->replaceAllUsesWith(New: IVInc);
1702 if (Inc)
1703 Inc->replaceAllUsesWith(New: Phi);
1704 Phi->setOperand(I: 0, New: Y);
1705 return;
1706 }
1707 }
1708 }
1709
1710 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1711 // just the pointer operand.
1712 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Val: Def))
1713 if (!VPR->getOffset() || match(V: VPR->getOffset(), P: m_ZeroInt()))
1714 return VPR->replaceAllUsesWith(New: VPR->getOperand(N: 0));
1715
1716 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1717 // the start index is zero and only the first lane 0 is demanded.
1718 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: Def)) {
1719 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Def: Steps)) {
1720 Steps->replaceAllUsesWith(New: Steps->getOperand(N: 0));
1721 return;
1722 }
1723 }
1724 // Simplify redundant ReductionStartVector recipes after unrolling.
1725 VPValue *StartV;
1726 if (match(R: Def, P: m_VPInstruction<VPInstruction::ReductionStartVector>(
1727 Ops: m_VPValue(V&: StartV), Ops: m_VPValue(), Ops: m_VPValue()))) {
1728 Def->replaceUsesWithIf(New: StartV, ShouldReplace: [](const VPUser &U, unsigned Idx) {
1729 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &U);
1730 return PhiR && PhiR->isInLoop();
1731 });
1732 return;
1733 }
1734
1735 if (match(R: Def, P: m_ExtractLastLane(Op0: m_Broadcast(Op0: m_VPValue(V&: A))))) {
1736 Def->replaceAllUsesWith(New: A);
1737 return;
1738 }
1739
1740 if (match(R: Def, P: m_ExtractLastLane(Op0: m_VPValue(V&: A))) &&
1741 ((isa<VPInstruction>(Val: A) && vputils::isSingleScalar(VPV: A)) ||
1742 (isa<VPReplicateRecipe>(Val: A) &&
1743 cast<VPReplicateRecipe>(Val: A)->isSingleScalar())) &&
1744 all_of(Range: A->users(),
1745 P: [Def, A](VPUser *U) { return U->usesScalars(Op: A) || Def == U; })) {
1746 return Def->replaceAllUsesWith(New: A);
1747 }
1748
1749 if (Plan->getConcreteUF() == 1 && match(R: Def, P: m_ExtractLastPart(Op0: m_VPValue(V&: A))))
1750 return Def->replaceAllUsesWith(New: A);
1751}
1752
1753void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1754 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1755 Plan.getEntry());
1756 VPTypeAnalysis TypeInfo(Plan);
1757 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
1758 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB))
1759 if (auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R))
1760 simplifyRecipe(Def, TypeInfo);
1761 }
1762}
1763
1764/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1765/// header mask to be simplified further when tail folding, e.g. in
1766/// optimizeEVLMasks.
1767static void reassociateHeaderMask(VPlan &Plan) {
1768 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1769 if (!HeaderMask)
1770 return;
1771
1772 SmallVector<VPUser *> Worklist;
1773 for (VPUser *U : HeaderMask->users())
1774 if (match(U, P: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue())))
1775 append_range(C&: Worklist, R: cast<VPSingleDefRecipe>(Val: U)->users());
1776
1777 while (!Worklist.empty()) {
1778 auto *R = dyn_cast<VPSingleDefRecipe>(Val: Worklist.pop_back_val());
1779 VPValue *X, *Y;
1780 if (!R || !match(R, P: m_LogicalAnd(
1781 Op0: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: X)),
1782 Op1: m_VPValue(V&: Y))))
1783 continue;
1784 append_range(C&: Worklist, R: R->users());
1785 VPBuilder Builder(R);
1786 R->replaceAllUsesWith(
1787 New: Builder.createLogicalAnd(LHS: HeaderMask, RHS: Builder.createLogicalAnd(LHS: X, RHS: Y)));
1788 }
1789}
1790
/// Narrow wide/replicating recipes whose result is a single scalar to
/// single-scalar replicate recipes, and convert unmasked scatters with a
/// uniform address into extract-last-lane + scalar store.
static void narrowToSingleScalarRecipes(VPlan &Plan) {
  // With only scalar VFs there is nothing to narrow.
  if (Plan.hasScalarVFOnly())
    return;

  // Try to narrow wide and replicating recipes to single scalar recipes,
  // based on VPlan analysis. Only process blocks in the loop region for now,
  // without traversing into nested regions, as recipes in replicate regions
  // cannot be converted yet.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
    // Visit recipes bottom-up so users are processed before their operands.
    for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
      if (!isa<VPWidenRecipe, VPWidenGEPRecipe, VPReplicateRecipe,
               VPWidenStoreRecipe>(Val: &R))
        continue;
      auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
      // Already-single-scalar and predicated replicates are left unchanged.
      if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
        continue;

      // Convert an unmasked scatter with an uniform address into
      // extract-last-lane + scalar store.
      // TODO: Add a profitability check comparing the cost of a scatter vs.
      // extract + scalar store.
      auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(Val: &R);
      if (WidenStoreR && vputils::isSingleScalar(VPV: WidenStoreR->getAddr()) &&
          !WidenStoreR->isConsecutive()) {
        assert(!WidenStoreR->isReverse() &&
               "Not consecutive memory recipes shouldn't be reversed");
        VPValue *Mask = WidenStoreR->getMask();

        // Only convert the scatter to a scalar store if it is unmasked.
        // TODO: Support converting scatter masked by the header mask to scalar
        // store.
        if (Mask)
          continue;

        // All lanes target the same address, so storing just the last lane's
        // value suffices.
        auto *Extract = new VPInstruction(VPInstruction::ExtractLastLane,
                                          {WidenStoreR->getOperand(N: 1)});
        Extract->insertBefore(InsertPos: WidenStoreR);

        // TODO: Sink the scalar store recipe to middle block if possible.
        auto *ScalarStore = new VPReplicateRecipe(
            &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
            true /*IsSingleScalar*/, nullptr /*Mask*/, {},
            *WidenStoreR /*Metadata*/);
        ScalarStore->insertBefore(InsertPos: WidenStoreR);
        WidenStoreR->eraseFromParent();
        continue;
      }

      auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(Val: &R);
      // Replicated stores of a single-scalar value only need the value from
      // the last part/lane; clone as a single-scalar store of that extract.
      if (RepR && isa<StoreInst>(Val: RepR->getUnderlyingInstr()) &&
          vputils::isSingleScalar(VPV: RepR->getOperand(N: 1))) {
        auto *Clone = new VPReplicateRecipe(
            RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
            true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
            *RepR /*Metadata*/, RepR->getDebugLoc());
        Clone->insertBefore(InsertPos: RepOrWidenR);
        VPBuilder Builder(Clone);
        VPValue *ExtractOp = Clone->getOperand(N: 0);
        // If the stored value is also uniform across VFs and UFs, first take
        // the last unroll part before extracting the last lane.
        if (vputils::isUniformAcrossVFsAndUFs(V: RepR->getOperand(N: 1)))
          ExtractOp =
              Builder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: ExtractOp);
        ExtractOp =
            Builder.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: ExtractOp);
        Clone->setOperand(I: 0, New: ExtractOp);
        RepR->eraseFromParent();
        continue;
      }

      // Skip recipes that aren't single scalars.
      if (!RepOrWidenR || !vputils::isSingleScalar(VPV: RepOrWidenR))
        continue;

      // Predicate to check if a user of Op introduces extra broadcasts.
      auto IntroducesBCastOf = [](const VPValue *Op) {
        return [Op](const VPUser *U) {
          if (auto *VPI = dyn_cast<VPInstruction>(Val: U)) {
            // Extract opcodes consume scalars, so they never force a
            // broadcast of Op.
            if (is_contained(Set: {VPInstruction::ExtractLastLane,
                              VPInstruction::ExtractLastPart,
                              VPInstruction::ExtractPenultimateElement},
                             Element: VPI->getOpcode()))
              return false;
          }
          return !U->usesScalars(Op);
        };
      };

      // Bail if narrowing would introduce a broadcast of the result, unless
      // narrowing also removes the need to broadcast one of the operands
      // (a non-constant live-in or single-scalar replicate with no other
      // broadcasting user).
      if (any_of(Range: RepOrWidenR->users(), P: IntroducesBCastOf(RepOrWidenR)) &&
          none_of(Range: RepOrWidenR->operands(), P: [&](VPValue *Op) {
            if (any_of(
                    Range: make_filter_range(Range: Op->users(), Pred: not_equal_to(Arg&: RepOrWidenR)),
                    P: IntroducesBCastOf(Op)))
              return false;
            // Non-constant live-ins require broadcasts, while constants do not
            // need explicit broadcasts.
            auto *IRV = dyn_cast<VPIRValue>(Val: Op);
            bool LiveInNeedsBroadcast = IRV && !isa<Constant>(Val: IRV->getValue());
            auto *OpR = dyn_cast<VPReplicateRecipe>(Val: Op);
            return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
          }))
        continue;

      // Replace the wide recipe with a single-scalar clone and drop the
      // original if it became dead.
      auto *Clone = new VPReplicateRecipe(
          RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
          true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
      Clone->insertBefore(InsertPos: RepOrWidenR);
      RepOrWidenR->replaceAllUsesWith(New: Clone);
      if (isDeadRecipe(R&: *RepOrWidenR))
        RepOrWidenR->eraseFromParent();
    }
  }
}
1903
1904/// Try to see if all of \p Blend's masks share a common value logically and'ed
1905/// and remove it from the masks.
1906static void removeCommonBlendMask(VPBlendRecipe *Blend) {
1907 if (Blend->isNormalized())
1908 return;
1909 VPValue *CommonEdgeMask;
1910 if (!match(V: Blend->getMask(Idx: 0),
1911 P: m_LogicalAnd(Op0: m_VPValue(V&: CommonEdgeMask), Op1: m_VPValue())))
1912 return;
1913 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1914 if (!match(V: Blend->getMask(Idx: I),
1915 P: m_LogicalAnd(Op0: m_Specific(VPV: CommonEdgeMask), Op1: m_VPValue())))
1916 return;
1917 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1918 Blend->setMask(Idx: I, V: Blend->getMask(Idx: I)->getDefiningRecipe()->getOperand(N: 1));
1919}
1920
/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
/// to make sure the masks are simplified.
static void simplifyBlends(VPlan &Plan) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R);
      if (!Blend)
        continue;

      // First strip any mask factor shared by all incoming edges.
      removeCommonBlendMask(Blend);

      // Try to remove redundant blend recipes.
      // Collect the incoming values that can actually be selected; values
      // whose mask is a constant false can never be chosen. For a normalized
      // blend, operand 0 carries no mask and is always a candidate.
      SmallPtrSet<VPValue *, 4> UniqueValues;
      if (Blend->isNormalized() || !match(V: Blend->getMask(Idx: 0), P: m_False()))
        UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: 0));
      for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
        if (!match(V: Blend->getMask(Idx: I), P: m_False()))
          UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: I));

      // A blend that can only produce one value is just that value.
      if (UniqueValues.size() == 1) {
        Blend->replaceAllUsesWith(New: *UniqueValues.begin());
        Blend->eraseFromParent();
        continue;
      }

      if (Blend->isNormalized())
        continue;

      // Normalize the blend so its first incoming value is used as the initial
      // value with the others blended into it.

      unsigned StartIndex = 0;
      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
        // If a value's mask is used only by the blend then it can be
        // dead-coded.
        // TODO: Find the most expensive mask that can be deadcoded, or a mask
        // that's used by multiple blends where it can be removed from them all.
        VPValue *Mask = Blend->getMask(Idx: I);
        if (Mask->getNumUsers() == 1 && !match(V: Mask, P: m_False())) {
          StartIndex = I;
          break;
        }
      }

      // Build the normalized operand list: start value first, then each
      // remaining (value, mask) pair.
      SmallVector<VPValue *, 4> OperandsWithMask;
      OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: StartIndex));

      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
        if (I == StartIndex)
          continue;
        OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: I));
        OperandsWithMask.push_back(Elt: Blend->getMask(Idx: I));
      }

      auto *NewBlend =
          new VPBlendRecipe(cast_or_null<PHINode>(Val: Blend->getUnderlyingValue()),
                            OperandsWithMask, *Blend, Blend->getDebugLoc());
      NewBlend->insertBefore(InsertPos: &R);

      // The start value's mask is dropped entirely; clean it up if it (and
      // its operand chain) became dead.
      VPValue *DeadMask = Blend->getMask(Idx: StartIndex);
      Blend->replaceAllUsesWith(New: NewBlend);
      Blend->eraseFromParent();
      recursivelyDeleteDeadRecipes(V: DeadMask);

      /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
      VPValue *NewMask;
      if (NewBlend->getNumOperands() == 3 &&
          match(V: NewBlend->getMask(Idx: 1), P: m_Not(Op0: m_VPValue(V&: NewMask)))) {
        VPValue *Inc0 = NewBlend->getOperand(N: 0);
        VPValue *Inc1 = NewBlend->getOperand(N: 1);
        VPValue *OldMask = NewBlend->getOperand(N: 2);
        NewBlend->setOperand(I: 0, New: Inc1);
        NewBlend->setOperand(I: 1, New: Inc0);
        NewBlend->setOperand(I: 2, New: NewMask);
        // The negation may now be unused; erase it to avoid leaving a dead
        // recipe behind.
        if (OldMask->getNumUsers() == 0)
          cast<VPInstruction>(Val: OldMask)->eraseFromParent();
      }
    }
  }
}
2001
/// Optimize the width of vector induction variables in \p Plan based on a known
/// constant Trip Count, \p BestVF and \p BestUF.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
                                                     ElementCount BestVF,
                                                     unsigned BestUF) {
  // Only proceed if we have not completely removed the vector region.
  if (!Plan.getVectorLoopRegion())
    return false;

  // Bail out unless the VF is fixed and the trip count is a compile-time
  // constant; otherwise the IV's value range cannot be bounded here.
  const APInt *TC;
  if (!BestVF.isFixed() || !match(V: Plan.getTripCount(), P: m_APInt(C&: TC)))
    return false;

  // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
  // and UF. Returns at least 8.
  auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
    // Round TC up to a multiple of Align (= VF * UF); the largest IV value
    // produced before exiting is then AlignedTC - 1.
    APInt AlignedTC =
        Align * APIntOps::RoundingUDiv(A: TC, B: APInt(TC.getBitWidth(), Align),
                                       RM: APInt::Rounding::UP);
    APInt MaxVal = AlignedTC - 1;
    return std::max<unsigned>(a: PowerOf2Ceil(A: MaxVal.getActiveBits()), b: 8);
  };
  unsigned NewBitWidth =
      ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);

  LLVMContext &Ctx = Plan.getContext();
  auto *NewIVTy = IntegerType::get(C&: Ctx, NumBits: NewBitWidth);

  bool MadeChange = false;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);

    // Currently only handle canonical IVs as it is trivial to replace the start
    // and stop values, and we currently only perform the optimization when the
    // IV has a single use.
    if (!WideIV || !WideIV->isCanonical() ||
        WideIV->hasMoreThanOneUniqueUser() ||
        NewIVTy == WideIV->getScalarType())
      continue;

    // Currently only handle cases where the single user is a header-mask
    // comparison with the backedge-taken-count.
    VPUser *SingleUser = WideIV->getSingleUser();
    if (!SingleUser ||
        !match(U: SingleUser,
               P: m_ICmp(Op0: m_Specific(VPV: WideIV),
                       Op1: m_Broadcast(Op0: m_Specific(VPV: Plan.getBackedgeTakenCount())))))
      continue;

    // Update IV operands and comparison bound to use new narrower type.
    auto *NewStart = Plan.getZero(Ty: NewIVTy);
    WideIV->setStartValue(NewStart);
    auto *NewStep = Plan.getConstantInt(Ty: NewIVTy, Val: 1);
    WideIV->setStepValue(NewStep);

    // Truncate the backedge-taken count in the vector preheader and make the
    // compare use the truncated bound.
    auto *NewBTC = new VPWidenCastRecipe(
        Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
        nullptr, VPIRFlags::getDefaultFlags(Opcode: Instruction::Trunc));
    Plan.getVectorPreheader()->appendRecipe(Recipe: NewBTC);
    auto *Cmp = cast<VPInstruction>(Val: WideIV->getSingleUser());
    Cmp->setOperand(I: 1, New: NewBTC);

    MadeChange = true;
  }

  return MadeChange;
}
2071
2072/// Return true if \p Cond is known to be true for given \p BestVF and \p
2073/// BestUF.
2074static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
2075 ElementCount BestVF, unsigned BestUF,
2076 PredicatedScalarEvolution &PSE) {
2077 if (match(V: Cond, P: m_BinaryOr(Op0: m_VPValue(), Op1: m_VPValue())))
2078 return any_of(Range: Cond->getDefiningRecipe()->operands(), P: [&Plan, BestVF, BestUF,
2079 &PSE](VPValue *C) {
2080 return isConditionTrueViaVFAndUF(Cond: C, Plan, BestVF, BestUF, PSE);
2081 });
2082
2083 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2084 if (!match(V: Cond, P: m_SpecificICmp(MatchPred: CmpInst::ICMP_EQ,
2085 Op0: m_Specific(VPV: CanIV->getBackedgeValue()),
2086 Op1: m_Specific(VPV: &Plan.getVectorTripCount()))))
2087 return false;
2088
2089 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2090 // count is not conveniently available as SCEV so far, so we compare directly
2091 // against the original trip count. This is stricter than necessary, as we
2092 // will only return true if the trip count == vector trip count.
2093 const SCEV *VectorTripCount =
2094 vputils::getSCEVExprForVPValue(V: &Plan.getVectorTripCount(), PSE);
2095 if (isa<SCEVCouldNotCompute>(Val: VectorTripCount))
2096 VectorTripCount = vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), PSE);
2097 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2098 "Trip count SCEV must be computable");
2099 ScalarEvolution &SE = *PSE.getSE();
2100 ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
2101 const SCEV *C = SE.getElementCount(Ty: VectorTripCount->getType(), EC: NumElements);
2102 return SE.isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: VectorTripCount, RHS: C);
2103}
2104
/// Try to replace multiple active lane masks used for control flow with
/// a single, wide active lane mask instruction followed by multiple
/// extract subvector intrinsics. This applies to the active lane mask
/// instructions both in the loop and in the preheader.
/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
/// new extracts from the first active lane mask, which has its last
/// operand (multiplier) set to UF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
                                       unsigned UF) {
  // Gated by a flag; only applies to vector VFs with interleaving (UF > 1).
  if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
    return false;

  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
  auto *Term = &ExitingVPBB->back();

  // Only handle loops exiting via BranchOnCond(Not(ActiveLaneMask)).
  using namespace llvm::VPlanPatternMatch;
  if (!match(V: Term, P: m_BranchOnCond(Op0: m_Not(Op0: m_ActiveLaneMask(
                        Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())))))
    return false;

  auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry());
  LLVMContext &Ctx = Plan.getContext();

  // Emit UF vector.extract intrinsic recipes after ALM, each extracting the
  // i1 subvector for one unroll part at offset VF * Part.
  auto ExtractFromALM = [&](VPInstruction *ALM,
                            SmallVectorImpl<VPValue *> &Extracts) {
    DebugLoc DL = ALM->getDebugLoc();
    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<VPValue *> Ops;
      Ops.append(IL: {ALM, Plan.getConstantInt(BitWidth: 64, Val: VF.getKnownMinValue() * Part)});
      auto *Ext =
          new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
                                     IntegerType::getInt1Ty(C&: Ctx), {}, {}, DL);
      Extracts[Part] = Ext;
      Ext->insertAfter(InsertPos: ALM);
    }
  };

  // Create a list of each active lane mask phi, ordered by unroll part.
  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
  for (VPRecipeBase &R : Header->phis()) {
    auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(Val: &R);
    if (!Phi)
      continue;
    VPValue *Index = nullptr;
    match(V: Phi->getBackedgeValue(),
          P: m_ActiveLaneMask(Op0: m_VPValue(V&: Index), Op1: m_VPValue(), Op2: m_VPValue()));
    assert(Index && "Expected index from ActiveLaneMask instruction");

    // Recover the unroll part from the index: parts > 0 use a
    // CanonicalIVIncrementForPart whose offset is a multiple carrying Part.
    uint64_t Part;
    if (match(V: Index,
              P: m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                  Ops: m_VPValue(), Ops: m_Mul(Op0: m_VPValue(), Op1: m_ConstantInt(C&: Part)))))
      Phis[Part] = Phi;
    else {
      // Anything other than a CanonicalIVIncrementForPart is part 0
      assert(!match(
          Index,
          m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()));
      Phis[0] = Phi;
    }
  }

  assert(all_of(Phis, not_equal_to(nullptr)) &&
         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");

  // The part-0 phi's incoming values are the masks that get widened.
  auto *EntryALM = cast<VPInstruction>(Val: Phis[0]->getStartValue());
  auto *LoopALM = cast<VPInstruction>(Val: Phis[0]->getBackedgeValue());

  assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
          LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
         "Expected incoming values of Phi to be ActiveLaneMasks");

  // When using wide lane masks, the return type of the get.active.lane.mask
  // intrinsic is VF x UF (last operand).
  VPValue *ALMMultiplier = Plan.getConstantInt(BitWidth: 64, Val: UF);
  EntryALM->setOperand(I: 2, New: ALMMultiplier);
  LoopALM->setOperand(I: 2, New: ALMMultiplier);

  // Create UF x extract vectors and insert into preheader.
  SmallVector<VPValue *> EntryExtracts(UF);
  ExtractFromALM(EntryALM, EntryExtracts);

  // Create UF x extract vectors and insert before the loop compare & branch,
  // updating the compare to use the first extract.
  SmallVector<VPValue *> LoopExtracts(UF);
  ExtractFromALM(LoopALM, LoopExtracts);
  VPInstruction *Not = cast<VPInstruction>(Val: Term->getOperand(N: 0));
  Not->setOperand(I: 0, New: LoopExtracts[0]);

  // Update the incoming values of active lane mask phis.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Phis[Part]->setStartValue(EntryExtracts[Part]);
    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
  }

  return true;
}
2203
/// Try to simplify the branch condition of \p Plan. This may restrict the
/// resulting plan to \p BestVF and \p BestUF.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                              unsigned BestUF,
                                              PredicatedScalarEvolution &PSE) {
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
  auto *Term = &ExitingVPBB->back();
  VPValue *Cond;
  // Matches an add of VFxUF, i.e. the canonical IV increment.
  auto m_CanIVInc = m_Add(Op0: m_VPValue(), Op1: m_Specific(VPV: &Plan.getVFxUF()));
  // Check if the branch condition compares the canonical IV increment (for main
  // loop), or the canonical IV increment plus an offset (for epilog loop).
  if (match(V: Term, P: m_BranchOnCount(
                       Op0: m_CombineOr(L: m_CanIVInc, R: m_c_Add(Op0: m_CanIVInc, Op1: m_LiveIn())),
                       Op1: m_VPValue())) ||
      match(V: Term, P: m_BranchOnCond(Op0: m_Not(Op0: m_ActiveLaneMask(
                       Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue()))))) {
    // Try to simplify the branch condition if VectorTC <= VF * UF when the
    // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
    const SCEV *VectorTripCount =
        vputils::getSCEVExprForVPValue(V: &Plan.getVectorTripCount(), PSE);
    // Fall back to the original trip count when the vector trip count has no
    // computable SCEV.
    if (isa<SCEVCouldNotCompute>(Val: VectorTripCount))
      VectorTripCount =
          vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), PSE);
    assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
           "Trip count SCEV must be computable");
    ScalarEvolution &SE = *PSE.getSE();
    ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
    const SCEV *C = SE.getElementCount(Ty: VectorTripCount->getType(), EC: NumElements);
    if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: VectorTripCount, RHS: C))
      return false;
  } else if (match(V: Term, P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))) ||
             match(V: Term, P: m_BranchOnTwoConds(Op0: m_VPValue(), Op1: m_VPValue(V&: Cond)))) {
    // For BranchOnCond, check if we can prove the condition to be true using VF
    // and UF.
    if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
      return false;
  } else {
    return false;
  }

  // The vector loop region only executes once. Convert terminator of the
  // exiting block to exit in the first iteration.
  if (match(V: Term, P: m_BranchOnTwoConds())) {
    // Keep the two-condition terminator but force its second condition true.
    Term->setOperand(I: 1, New: Plan.getTrue());
    return true;
  }

  // Otherwise replace the terminator with an always-taken BranchOnCond.
  auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
                                {}, Term->getDebugLoc());
  ExitingVPBB->appendRecipe(Recipe: BOC);
  Term->eraseFromParent();

  return true;
}
2259
/// From the definition of llvm.experimental.get.vector.length,
/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
/// Replace the first EVL recipe whose AVL is provably <= \p VF with the AVL
/// itself (width-adjusted to i32, the result type of ExplicitVectorLength).
/// \returns true if a replacement was made.
bool VPlanTransforms::simplifyKnownEVL(VPlan &Plan, ElementCount VF,
                                       PredicatedScalarEvolution &PSE) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_deep(G: Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      VPValue *AVL;
      if (!match(V: &R, P: m_EVL(Op0: m_VPValue(V&: AVL))))
        continue;

      // Use SCEV to prove AVL <= VF; bail out on AVLs SCEV cannot analyze.
      const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(V: AVL, PSE);
      if (isa<SCEVCouldNotCompute>(Val: AVLSCEV))
        continue;
      ScalarEvolution &SE = *PSE.getSE();
      const SCEV *VFSCEV = SE.getElementCount(Ty: AVLSCEV->getType(), EC: VF);
      if (!SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: AVLSCEV, RHS: VFSCEV))
        continue;

      // ExplicitVectorLength produces an i32; adjust the AVL's width to match.
      VPValue *Trunc = VPBuilder(&R).createScalarZExtOrTrunc(
          Op: AVL, ResultTy: Type::getInt32Ty(C&: Plan.getContext()), SrcTy: AVLSCEV->getType(),
          DL: R.getDebugLoc());
      if (Trunc != AVL) {
        // A new cast recipe was created; try to constant-fold it away if its
        // operands are live-ins.
        auto *TruncR = cast<VPSingleDefRecipe>(Val: Trunc);
        const DataLayout &DL = Plan.getDataLayout();
        VPTypeAnalysis TypeInfo(Plan);
        if (VPValue *Folded =
                tryToFoldLiveIns(R&: *TruncR, Operands: TruncR->operands(), DL, TypeInfo))
          Trunc = Folded;
      }
      R.getVPSingleValue()->replaceAllUsesWith(New: Trunc);
      // Only a single EVL recipe is simplified per invocation.
      return true;
    }
  }
  return false;
}
2296
2297void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
2298 unsigned BestUF,
2299 PredicatedScalarEvolution &PSE) {
2300 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2301 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2302
2303 bool MadeChange = tryToReplaceALMWithWideALM(Plan, VF: BestVF, UF: BestUF);
2304 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2305 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2306
2307 if (MadeChange) {
2308 Plan.setVF(BestVF);
2309 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2310 }
2311}
2312
/// Sink users of \p FOR after the recipe defining the previous value \p
/// Previous of the recurrence. \returns true if all users of \p FOR could be
/// re-arranged as needed or false if it is not possible.
static bool
sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
                                 VPRecipeBase *Previous,
                                 VPDominatorTree &VPDT) {
  // If Previous is a live-in (no defining recipe), it naturally dominates all
  // recipes in the loop, so no sinking is needed.
  if (!Previous)
    return true;

  // Collect recipes that need sinking.
  SmallVector<VPRecipeBase *> WorkList;
  SmallPtrSet<VPRecipeBase *, 8> Seen;
  Seen.insert(Ptr: Previous);
  // Returns false if \p SinkCandidate cannot be sunk; otherwise queues it for
  // sinking (unless it is already in the right position or already seen).
  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
    // The previous value must not depend on the users of the recurrence phi. In
    // that case, FOR is not a fixed order recurrence.
    if (SinkCandidate == Previous)
      return false;

    // Header phis, already-visited recipes, and recipes that Previous already
    // dominates need no sinking.
    if (isa<VPHeaderPHIRecipe>(Val: SinkCandidate) ||
        !Seen.insert(Ptr: SinkCandidate).second ||
        VPDT.properlyDominates(A: Previous, B: SinkCandidate))
      return true;

    if (cannotHoistOrSinkRecipe(R: *SinkCandidate))
      return false;

    WorkList.push_back(Elt: SinkCandidate);
    return true;
  };

  // Recursively sink users of FOR after Previous.
  WorkList.push_back(Elt: FOR);
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    // Note: WorkList may grow while iterating; index-based access handles
    // that safely.
    VPRecipeBase *Current = WorkList[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");

    for (VPUser *User : Current->getVPSingleValue()->users()) {
      if (!TryToPushSinkCandidate(cast<VPRecipeBase>(Val: User)))
        return false;
    }
  }

  // Keep recipes to sink ordered by dominance so earlier instructions are
  // processed first.
  sort(C&: WorkList, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return VPDT.properlyDominates(A, B);
  });

  for (VPRecipeBase *SinkCandidate : WorkList) {
    if (SinkCandidate == FOR)
      continue;

    // Move each candidate directly after the previously-moved recipe,
    // preserving the dominance order established above.
    SinkCandidate->moveAfter(MovePos: Previous);
    Previous = SinkCandidate;
  }
  return true;
}
2375
/// Try to hoist \p Previous and its operands before all users of \p FOR.
/// \returns true if \p Previous ends up dominating all users of \p FOR (either
/// already, or after hoisting), false if that cannot be achieved.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
                                        VPRecipeBase *Previous,
                                        VPDominatorTree &VPDT) {
  if (cannotHoistOrSinkRecipe(R: *Previous))
    return false;

  // Collect recipes that need hoisting.
  SmallVector<VPRecipeBase *> HoistCandidates;
  SmallPtrSet<VPRecipeBase *, 8> Visited;
  VPRecipeBase *HoistPoint = nullptr;
  // Find the closest hoist point by looking at all users of FOR and selecting
  // the recipe dominating all other users.
  for (VPUser *U : FOR->users()) {
    auto *R = cast<VPRecipeBase>(Val: U);
    if (!HoistPoint || VPDT.properlyDominates(A: R, B: HoistPoint))
      HoistPoint = R;
  }
  assert(all_of(FOR->users(),
                [&VPDT, HoistPoint](VPUser *U) {
                  auto *R = cast<VPRecipeBase>(U);
                  return HoistPoint == R ||
                         VPDT.properlyDominates(HoistPoint, R);
                }) &&
         "HoistPoint must dominate all users of FOR");

  // Returns the defining recipe of \p HoistCandidateV if it must be hoisted
  // above HoistPoint, or nullptr if no hoisting is required for it.
  auto NeedsHoisting = [HoistPoint, &VPDT,
                        &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
    VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
    if (!HoistCandidate)
      return nullptr;
    VPRegionBlock *EnclosingLoopRegion =
        HoistCandidate->getParent()->getEnclosingLoopRegion();
    assert((!HoistCandidate->getRegion() ||
            HoistCandidate->getRegion() == EnclosingLoopRegion) &&
           "CFG in VPlan should still be flat, without replicate regions");
    // Hoist candidate was already visited, no need to hoist.
    if (!Visited.insert(Ptr: HoistCandidate).second)
      return nullptr;

    // Candidate is outside loop region or a header phi, dominates FOR users w/o
    // hoisting.
    if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(Val: HoistCandidate))
      return nullptr;

    // If we reached a recipe that dominates HoistPoint, we don't need to
    // hoist the recipe.
    if (VPDT.properlyDominates(A: HoistCandidate, B: HoistPoint))
      return nullptr;
    return HoistCandidate;
  };

  if (!NeedsHoisting(Previous->getVPSingleValue()))
    return true;

  // Recursively try to hoist Previous and its operands before all users of FOR.
  HoistCandidates.push_back(Elt: Previous);

  for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
    // Note: HoistCandidates may grow while iterating; index-based access
    // handles that safely.
    VPRecipeBase *Current = HoistCandidates[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");
    if (cannotHoistOrSinkRecipe(R: *Current))
      return false;

    for (VPValue *Op : Current->operands()) {
      // If we reach FOR, it means the original Previous depends on some other
      // recurrence that in turn depends on FOR. If that is the case, we would
      // also need to hoist recipes involving the other FOR, which may break
      // dependencies.
      if (Op == FOR)
        return false;

      if (auto *R = NeedsHoisting(Op)) {
        // Bail out if the recipe defines multiple values.
        // TODO: Hoisting such recipes requires additional handling.
        if (R->getNumDefinedValues() != 1)
          return false;
        HoistCandidates.push_back(Elt: R);
      }
    }
  }

  // Order recipes to hoist by dominance so earlier instructions are processed
  // first.
  sort(C&: HoistCandidates, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return VPDT.properlyDominates(A, B);
  });

  for (VPRecipeBase *HoistCandidate : HoistCandidates) {
    HoistCandidate->moveBefore(BB&: *HoistPoint->getParent(),
                               I: HoistPoint->getIterator());
  }

  return true;
}
2472
/// Adjust each fixed-order recurrence phi in \p Plan: ensure the recipe
/// defining the recurrence's previous value dominates all users of the phi
/// (by sinking the users or hoisting the previous value), then insert a
/// FirstOrderRecurrenceSplice combining previous and current values and
/// redirect users to it. \returns false if some recurrence could not be
/// adjusted.
bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
                                                  VPBuilder &LoopBuilder) {
  VPDominatorTree VPDT(Plan);
  VPTypeAnalysis TypeInfo(Plan);

  // Collect the recurrence phis up front; the loop below inserts new recipes.
  SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R))
      RecurrencePhis.push_back(Elt: FOR);

  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
    SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
    // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
    // to terminate. Walk through chained recurrence phis to the underlying
    // non-phi previous value.
    while (auto *PrevPhi =
               dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Val: Previous)) {
      assert(PrevPhi->getParent() == FOR->getParent());
      assert(SeenPhis.insert(PrevPhi).second);
      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
    }

    // First try to re-order users after Previous; failing that, try to move
    // Previous (and its operands) before the users.
    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
        !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
      return false;

    // Introduce a recipe to combine the incoming and previous values of a
    // fixed-order recurrence.
    VPBasicBlock *InsertBlock =
        Previous ? Previous->getParent() : FOR->getParent();
    if (!Previous || isa<VPHeaderPHIRecipe>(Val: Previous))
      LoopBuilder.setInsertPoint(TheBB: InsertBlock, IP: InsertBlock->getFirstNonPhi());
    else
      LoopBuilder.setInsertPoint(TheBB: InsertBlock,
                                 IP: std::next(x: Previous->getIterator()));

    auto *RecurSplice =
        LoopBuilder.createNaryOp(Opcode: VPInstruction::FirstOrderRecurrenceSplice,
                                 Operands: {FOR, FOR->getBackedgeValue()});

    FOR->replaceAllUsesWith(New: RecurSplice);
    // Set the first operand of RecurSplice to FOR again, after replacing
    // all users.
    RecurSplice->setOperand(I: 0, New: FOR);

    // Check for users extracting at the penultimate active lane of the FOR.
    // If only a single lane is active in the current iteration, we need to
    // select the last element from the previous iteration (from the FOR phi
    // directly).
    for (VPUser *U : RecurSplice->users()) {
      if (!match(U, P: m_ExtractLane(Op0: m_LastActiveLane(Op0: m_VPValue()),
                                   Op1: m_Specific(VPV: RecurSplice))))
        continue;

      VPBuilder B(cast<VPInstruction>(Val: U));
      VPValue *LastActiveLane = cast<VPInstruction>(Val: U)->getOperand(N: 0);
      Type *Ty = TypeInfo.inferScalarType(V: LastActiveLane);
      VPValue *Zero = Plan.getConstantInt(Ty, Val: 0);
      VPValue *One = Plan.getConstantInt(Ty, Val: 1);
      // Either lane (LastActiveLane - 1) of the current iteration's value...
      VPValue *PenultimateIndex = B.createSub(LHS: LastActiveLane, RHS: One);
      VPValue *PenultimateLastIter =
          B.createNaryOp(Opcode: VPInstruction::ExtractLane,
                         Operands: {PenultimateIndex, FOR->getBackedgeValue()});
      // ...or the last lane of the previous iteration (the FOR phi itself).
      VPValue *LastPrevIter =
          B.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: FOR);

      // Select the previous-iteration value when only lane 0 is active.
      VPValue *Cmp = B.createICmp(Pred: CmpInst::ICMP_EQ, A: LastActiveLane, B: Zero);
      VPValue *Sel = B.createSelect(Cond: Cmp, TrueVal: LastPrevIter, FalseVal: PenultimateLastIter);
      cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: Sel);
    }
  }
  return true;
}
2547
2548void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
2549 for (VPRecipeBase &R :
2550 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
2551 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
2552 if (!PhiR)
2553 continue;
2554 RecurKind RK = PhiR->getRecurrenceKind();
2555 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2556 RK != RecurKind::AddChainWithSubs)
2557 continue;
2558
2559 for (VPUser *U : collectUsersRecursively(V: PhiR))
2560 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: U)) {
2561 RecWithFlags->dropPoisonGeneratingFlags();
2562 }
2563 }
2564}
2565
namespace {
/// DenseMap traits performing value-numbering of VPSingleDefRecipes: two
/// recipes hash and compare equal iff they are guaranteed to compute the same
/// value, which lets the CSE pass below detect redundant recipes via a map.
struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
  /// Returns true if \p Def is one of DenseMap's special marker keys.
  static bool isSentinel(const VPSingleDefRecipe *Def) {
    return Def == getEmptyKey() || Def == getTombstoneKey();
  }

  /// If recipe \p R will lower to a GEP with a non-i8 source element type,
  /// return that source element type.
  static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
    // All VPInstructions that lower to GEPs must have the i8 source element
    // type (as they are PtrAdds), so we omit it.
    return TypeSwitch<const VPSingleDefRecipe *, Type *>(R)
        .Case(caseFn: [](const VPReplicateRecipe *I) -> Type * {
          if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: I->getUnderlyingValue()))
            return GEP->getSourceElementType();
          return nullptr;
        })
        .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
            caseFn: [](auto *I) { return I->getSourceElementType(); })
        .Default(defaultFn: [](auto *) { return nullptr; });
  }

  /// Returns true if recipe \p Def can be safely handed for CSE.
  static bool canHandle(const VPSingleDefRecipe *Def) {
    // We can extend the list of handled recipes in the future,
    // provided we account for the data embedded in them while checking for
    // equality or hashing.
    auto C = getOpcodeOrIntrinsicID(R: Def);

    // The issue with (Insert|Extract)Value is that the index of the
    // insert/extract is not a proper operand in LLVM IR, and hence also not in
    // VPlan.
    if (!C || (!C->first && (C->second == Instruction::InsertValue ||
                             C->second == Instruction::ExtractValue)))
      return false;

    // During CSE, we can only handle recipes that don't read from memory: if
    // they read from memory, there could be an intervening write to memory
    // before the next instance is CSE'd, leading to an incorrect result.
    return !Def->mayReadFromMemory();
  }

  /// Hash the underlying data of \p Def. Must stay in sync with isEqual below:
  /// everything isEqual compares unconditionally is hashed here, with the
  /// predicate mixed in only when present.
  static unsigned getHashValue(const VPSingleDefRecipe *Def) {
    const VPlan *Plan = Def->getParent()->getPlan();
    VPTypeAnalysis TypeInfo(*Plan);
    hash_code Result = hash_combine(
        args: Def->getVPRecipeID(), args: getOpcodeOrIntrinsicID(R: Def),
        args: getGEPSourceElementType(R: Def), args: TypeInfo.inferScalarType(V: Def),
        args: vputils::isSingleScalar(VPV: Def), args: hash_combine_range(R: Def->operands()));
    if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Val: Def))
      if (RFlags->hasPredicate())
        return hash_combine(args: Result, args: RFlags->getPredicate());
    return Result;
  }

  /// Check equality of underlying data of \p L and \p R.
  static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
    if (isSentinel(Def: L) || isSentinel(Def: R))
      return L == R;
    if (L->getVPRecipeID() != R->getVPRecipeID() ||
        getOpcodeOrIntrinsicID(R: L) != getOpcodeOrIntrinsicID(R) ||
        getGEPSourceElementType(R: L) != getGEPSourceElementType(R) ||
        vputils::isSingleScalar(VPV: L) != vputils::isSingleScalar(VPV: R) ||
        !equal(LRange: L->operands(), RRange: R->operands()))
      return false;
    assert(getOpcodeOrIntrinsicID(L) && getOpcodeOrIntrinsicID(R) &&
           "must have valid opcode info for both recipes");
    if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(Val: L))
      if (LFlags->hasPredicate() &&
          LFlags->getPredicate() !=
              cast<VPRecipeWithIRFlags>(Val: R)->getPredicate())
        return false;
    // Recipes in replicate regions implicitly depend on predicate. If either
    // recipe is in a replicate region, only consider them equal if both have
    // the same parent.
    const VPRegionBlock *RegionL = L->getRegion();
    const VPRegionBlock *RegionR = R->getRegion();
    if (((RegionL && RegionL->isReplicator()) ||
         (RegionR && RegionR->isReplicator())) &&
        L->getParent() != R->getParent())
      return false;
    const VPlan *Plan = L->getParent()->getPlan();
    VPTypeAnalysis TypeInfo(*Plan);
    return TypeInfo.inferScalarType(V: L) == TypeInfo.inferScalarType(V: R);
  }
};
} // end anonymous namespace
2654
2655/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2656/// Plan.
2657void VPlanTransforms::cse(VPlan &Plan) {
2658 VPDominatorTree VPDT(Plan);
2659 DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
2660
2661 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
2662 Plan.getEntry());
2663 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
2664 for (VPRecipeBase &R : *VPBB) {
2665 auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R);
2666 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2667 continue;
2668 if (VPSingleDefRecipe *V = CSEMap.lookup(Val: Def)) {
2669 // V must dominate Def for a valid replacement.
2670 if (!VPDT.dominates(A: V->getParent(), B: VPBB))
2671 continue;
2672 // Only keep flags present on both V and Def.
2673 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Val: V))
2674 RFlags->intersectFlags(Other: *cast<VPRecipeWithIRFlags>(Val: Def));
2675 Def->replaceAllUsesWith(New: V);
2676 continue;
2677 }
2678 CSEMap[Def] = Def;
2679 }
2680 }
2681}
2682
/// Move loop-invariant recipes out of the vector loop region in \p Plan:
/// hoist invariant recipes to the preheader, and sink recipes only used
/// outside the region into their (single) use block.
static void licm(VPlan &Plan) {
  VPBasicBlock *Preheader = Plan.getVectorPreheader();

  // Hoist any loop invariant recipes from the vector loop region to the
  // preheader. Perform a shallow traversal of the vector loop region, to
  // exclude recipes in replicate regions. Since the top-level blocks in the
  // vector loop region are guaranteed to execute if the vector pre-header is,
  // we don't need to check speculation safety.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  assert(Preheader->getSingleSuccessor() == LoopRegion &&
         "Expected vector prehader's successor to be the vector loop region");
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      if (cannotHoistOrSinkRecipe(R))
        continue;
      // A recipe is invariant iff all its operands are defined outside the
      // loop regions.
      if (any_of(Range: R.operands(), P: [](VPValue *Op) {
            return !Op->isDefinedOutsideLoopRegions();
          }))
        continue;
      R.moveBefore(BB&: *Preheader, I: Preheader->end());
    }
  }

#ifndef NDEBUG
  // Only needed for the dominance assertion below.
  VPDominatorTree VPDT(Plan);
#endif
  // Sink recipes with no users inside the vector loop region if all users are
  // in the same exit block of the region.
  // TODO: Extend to sink recipes from inner loops.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_post_order_shallow(G: LoopRegion->getEntry()))) {
    // Iterate bottom-up so users can be sunk before their operands' defs are
    // considered.
    for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) {
      if (cannotHoistOrSinkRecipe(R))
        continue;

      if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
        assert(!RepR->isPredicated() &&
               "Expected prior transformation of predicated replicates to "
               "replicate regions");
        // narrowToSingleScalarRecipes should have already maximally narrowed
        // replicates to single-scalar replicates.
        // TODO: When unrolling, replicateByVF doesn't handle sunk
        // non-single-scalar replicates correctly.
        if (!RepR->isSingleScalar())
          continue;
      }

      // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
      // support recipes with multiple defined values (e.g., interleaved loads).
      auto *Def = cast<VPSingleDefRecipe>(Val: &R);
      // Skip recipes without users as we cannot determine a sink block.
      // TODO: Clone sinkable recipes without users to all exit blocks to reduce
      // their execution frequency.
      if (Def->getNumUsers() == 0)
        continue;

      VPBasicBlock *SinkBB = nullptr;
      // Cannot sink the recipe if any user
      // * is defined in any loop region, or
      // * is a phi, or
      // * multiple users in different blocks.
      if (any_of(Range: Def->users(), P: [&SinkBB](VPUser *U) {
            auto *UserR = cast<VPRecipeBase>(Val: U);
            VPBasicBlock *Parent = UserR->getParent();
            // TODO: If the user is a PHI node, we should check the block of
            // incoming value. Support PHI node users if needed.
            if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
              return true;
            // TODO: Support sinking when users are in multiple blocks.
            if (SinkBB && SinkBB != Parent)
              return true;
            SinkBB = Parent;
            return false;
          }))
        continue;

      // Only sink to dedicated exit blocks of the loop region.
      if (SinkBB->getSinglePredecessor() != LoopRegion)
        continue;

      // TODO: This will need to be a check instead of a assert after
      // conditional branches in vectorized loops are supported.
      assert(VPDT.properlyDominates(VPBB, SinkBB) &&
             "Defining block must dominate sink block");
      // TODO: Clone the recipe if users are on multiple exit paths, instead of
      // just moving.
      Def->moveBefore(BB&: *SinkBB, I: SinkBB->getFirstNonPhi());
    }
  }
}
2775
/// Shrink widened operations to the minimal bit widths recorded in \p MinBWs:
/// truncate their operands to the narrow type and zero-extend the result back
/// to the original width so users see an unchanged type.
void VPlanTransforms::truncateToMinimalBitwidths(
    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
  if (Plan.hasScalarVFOnly())
    return;
  // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make
  // other uses have different types for their operands, making them invalidly
  // typed.
  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
  VPTypeAnalysis TypeInfo(Plan);
  VPBasicBlock *PH = Plan.getVectorPreheader();
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
               VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R))
        continue;

      // MinBWs is keyed by the underlying IR instruction; a lookup result of
      // 0 means no narrowing was requested for this recipe.
      VPValue *ResultVPV = R.getVPSingleValue();
      auto *UI = cast_or_null<Instruction>(Val: ResultVPV->getUnderlyingValue());
      unsigned NewResSizeInBits = MinBWs.lookup(Key: UI);
      if (!NewResSizeInBits)
        continue;

      // If the value wasn't vectorized, we must maintain the original scalar
      // type. Skip those here, after incrementing NumProcessedRecipes. Also
      // skip casts which do not need to be handled explicitly here, as
      // redundant casts will be removed during recipe simplification.
      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(Val: &R))
        continue;

      Type *OldResTy = TypeInfo.inferScalarType(V: ResultVPV);
      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
      assert(OldResTy->isIntegerTy() && "only integer types supported");
      (void)OldResSizeInBits;

      auto *NewResTy = IntegerType::get(C&: Plan.getContext(), NumBits: NewResSizeInBits);

      // Any wrapping introduced by shrinking this operation shouldn't be
      // considered undefined behavior. So, we can't unconditionally copy
      // arithmetic wrapping flags to VPW.
      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(Val: &R))
        VPW->dropPoisonGeneratingFlags();

      if (OldResSizeInBits != NewResSizeInBits &&
          !match(V: &R, P: m_ICmp(Op0: m_VPValue(), Op1: m_VPValue()))) {
        // Extend result to original width.
        auto *Ext = new VPWidenCastRecipe(
            Instruction::ZExt, ResultVPV, OldResTy, nullptr,
            VPIRFlags::getDefaultFlags(Opcode: Instruction::ZExt));
        Ext->insertAfter(InsertPos: &R);
        // Re-route users through the extend, then restore the extend's own
        // operand (replaceAllUsesWith also rewrote it).
        ResultVPV->replaceAllUsesWith(New: Ext);
        Ext->setOperand(I: 0, New: ResultVPV);
        assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
      } else {
        assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
               "Only ICmps should not need extending the result.");
      }

      assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
      if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R))
        continue;

      // Shrink operands by introducing truncates as needed. For selects, the
      // condition operand (index 0) keeps its type.
      unsigned StartIdx =
          match(V: &R, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())) ? 1 : 0;
      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
        auto *Op = R.getOperand(N: Idx);
        unsigned OpSizeInBits =
            TypeInfo.inferScalarType(V: Op)->getScalarSizeInBits();
        if (OpSizeInBits == NewResSizeInBits)
          continue;
        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
        // Re-use a previously created truncate of the same operand if any.
        auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Key: Op);
        if (!IterIsEmpty) {
          R.setOperand(I: Idx, New: ProcessedIter->second);
          continue;
        }

        // Truncates of live-ins go in the preheader; others directly before
        // their user.
        VPBuilder Builder;
        if (isa<VPIRValue>(Val: Op))
          Builder.setInsertPoint(PH);
        else
          Builder.setInsertPoint(&R);
        VPWidenCastRecipe *NewOp =
            Builder.createWidenCast(Opcode: Instruction::Trunc, Op, ResultTy: NewResTy);
        ProcessedIter->second = NewOp;
        R.setOperand(I: Idx, New: NewOp);
      }

    }
  }
}
2869
/// Remove BranchOnCond terminators with a constant condition from \p Plan,
/// disconnecting the never-taken successor. If \p OnlyLatches is set, only
/// latch terminators are considered.
void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
  // The dominator tree is only needed to identify latches.
  std::optional<VPDominatorTree> VPDT;
  if (OnlyLatches)
    VPDT.emplace(args&: Plan);

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: Plan.getEntry()))) {
    VPValue *Cond;
    // Skip blocks that are not terminated by BranchOnCond.
    if (VPBB->empty() || !match(V: &VPBB->back(), P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))))
      continue;

    if (OnlyLatches && !VPBlockUtils::isLatch(VPB: VPBB, VPDT: *VPDT))
      continue;

    assert(VPBB->getNumSuccessors() == 2 &&
           "Two successors expected for BranchOnCond");
    // Successor 0 is taken on a true condition, successor 1 on false; remove
    // the edge that can never be taken.
    unsigned RemovedIdx;
    if (match(V: Cond, P: m_True()))
      RemovedIdx = 1;
    else if (match(V: Cond, P: m_False()))
      RemovedIdx = 0;
    else
      continue;

    VPBasicBlock *RemovedSucc =
        cast<VPBasicBlock>(Val: VPBB->getSuccessors()[RemovedIdx]);
    assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
           "There must be a single edge between VPBB and its successor");
    // Values coming from VPBB into phi recipes of RemovedSucc are removed from
    // these recipes.
    for (VPRecipeBase &R : RemovedSucc->phis())
      cast<VPPhiAccessors>(Val: &R)->removeIncomingValueFor(IncomingBlock: VPBB);

    // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
    // automatically on VPlan destruction if it becomes unreachable.
    VPBlockUtils::disconnectBlocks(From: VPBB, To: RemovedSucc);
    VPBB->back().eraseFromParent();
  }
}
2910
/// Run the VPlan-to-VPlan optimization pipeline on \p Plan. The pass order is
/// significant: simplification passes run before (and again after) structural
/// passes to expose and clean up follow-on opportunities, and dead-recipe
/// removal follows passes that may leave recipes unused.
void VPlanTransforms::optimize(VPlan &Plan) {
  RUN_VPLAN_PASS(removeRedundantCanonicalIVs, Plan);
  RUN_VPLAN_PASS(removeRedundantInductionCasts, Plan);

  RUN_VPLAN_PASS(reassociateHeaderMask, Plan);
  RUN_VPLAN_PASS(simplifyRecipes, Plan);
  RUN_VPLAN_PASS(removeDeadRecipes, Plan);
  RUN_VPLAN_PASS(simplifyBlends, Plan);
  RUN_VPLAN_PASS(legalizeAndOptimizeInductions, Plan);
  RUN_VPLAN_PASS(narrowToSingleScalarRecipes, Plan);
  RUN_VPLAN_PASS(removeRedundantExpandSCEVRecipes, Plan);
  RUN_VPLAN_PASS(reassociateHeaderMask, Plan);
  RUN_VPLAN_PASS(simplifyRecipes, Plan);
  RUN_VPLAN_PASS(removeBranchOnConst, Plan, /*OnlyLatches=*/false);
  RUN_VPLAN_PASS(removeDeadRecipes, Plan);

  RUN_VPLAN_PASS(createAndOptimizeReplicateRegions, Plan);
  RUN_VPLAN_PASS(hoistInvariantLoads, Plan);
  RUN_VPLAN_PASS(mergeBlocksIntoPredecessors, Plan);
  RUN_VPLAN_PASS(licm, Plan);
}
2932
// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
// the loop terminator with a branch-on-cond recipe with the negated
// active-lane-mask as operand. Note that this turns the loop into an
// uncountable one. Only the existing terminator is replaced, all other existing
// recipes/users remain unchanged, except for poison-generating flags being
// dropped from the canonical IV increment. Return the created
// VPActiveLaneMaskPHIRecipe.
//
// The function adds the following recipes:
//
// vector.ph:
//   %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
//   %EntryALM = active-lane-mask %EntryInc, TC
//
// vector.body:
//   ...
//   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
//   ...
//   %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
//   %ALM = active-lane-mask %InLoopInc, TC
//   %Negated = Not %ALM
//   branch-on-cond %Negated
//
static VPActiveLaneMaskPHIRecipe *
addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan) {
  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
  auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
  VPValue *StartV = CanonicalIVPHI->getStartValue();

  auto *CanonicalIVIncrement =
      cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
  // TODO: Check if dropping the flags is needed.
  CanonicalIVIncrement->dropPoisonGeneratingFlags();
  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
  // we have to take unrolling into account. Each part needs to start at
  //   Part * VF
  auto *VecPreheader = Plan.getVectorPreheader();
  VPBuilder Builder(VecPreheader);

  // Create the ActiveLaneMask instruction using the correct start values.
  VPValue *TC = Plan.getTripCount();
  VPValue *VF = &Plan.getVF();

  auto *EntryIncrement = Builder.createOverflowingOp(
      Opcode: VPInstruction::CanonicalIVIncrementForPart, Operands: {StartV, VF}, WrapFlags: {false, false},
      DL, Name: "index.part.next");

  // Create the active lane mask instruction in the VPlan preheader.
  VPValue *ALMMultiplier =
      Plan.getConstantInt(Ty: TopRegion->getCanonicalIVType(), Val: 1);
  auto *EntryALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
                                        Operands: {EntryIncrement, TC, ALMMultiplier}, DL,
                                        Name: "active.lane.mask.entry");

  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
  // preheader ActiveLaneMask instruction.
  auto *LaneMaskPhi =
      new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc::getUnknown());
  LaneMaskPhi->insertAfter(InsertPos: CanonicalIVPHI);

  // Create the active lane mask for the next iteration of the loop before the
  // original terminator.
  VPRecipeBase *OriginalTerminator = EB->getTerminator();
  Builder.setInsertPoint(OriginalTerminator);
  auto *InLoopIncrement = Builder.createOverflowingOp(
      Opcode: VPInstruction::CanonicalIVIncrementForPart,
      Operands: {CanonicalIVIncrement, &Plan.getVF()}, WrapFlags: {false, false}, DL);
  auto *ALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
                                   Operands: {InLoopIncrement, TC, ALMMultiplier}, DL,
                                   Name: "active.lane.mask.next");
  // Wire up the backedge value of the phi created above.
  LaneMaskPhi->addOperand(Operand: ALM);

  // Replace the original terminator with BranchOnCond. We have to invert the
  // mask here because a true condition means jumping to the exit block.
  auto *NotMask = Builder.createNot(Operand: ALM, DL);
  Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {NotMask}, DL);
  OriginalTerminator->eraseFromParent();
  return LaneMaskPhi;
}
3014
void VPlanTransforms::addActiveLaneMask(VPlan &Plan,
                                        bool UseActiveLaneMaskForControlFlow) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  // Tail folding guarantees a widened canonical IV user exists; it feeds the
  // compare-based header mask we are about to replace.
  auto *FoundWidenCanonicalIVUser = find_if(
      Range: LoopRegion->getCanonicalIV()->users(), P: IsaPred<VPWidenCanonicalIVRecipe>);
  assert(FoundWidenCanonicalIVUser &&
         "Must have widened canonical IV when tail folding!");
  VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
  auto *WideCanonicalIV =
      cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser);
  VPSingleDefRecipe *LaneMask;
  if (UseActiveLaneMaskForControlFlow) {
    // Also rewires the latch terminator so control flow is driven by the
    // active-lane-mask phi instead of the canonical IV compare.
    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
  } else {
    // Emit a plain active.lane.mask immediately after the widened canonical
    // IV. The multiplier of 1 means one lane per element (no interleaving of
    // the lane count).
    VPBuilder B = VPBuilder::getToInsertAfter(R: WideCanonicalIV);
    VPValue *ALMMultiplier =
        Plan.getConstantInt(Ty: LoopRegion->getCanonicalIVType(), Val: 1);
    LaneMask =
        B.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
                       Operands: {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
                       DL: nullptr, Name: "active.lane.mask");
  }

  // Walk users of WideCanonicalIV and replace the header mask of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
  // removing the old one to ensure there is always only a single header mask.
  HeaderMask->replaceAllUsesWith(New: LaneMask);
  HeaderMask->eraseFromParent();
}
3044
/// Matcher that strips a known mask \p In from a value: either the value is
/// exactly \p In (remainder is nullptr), or it is (logical-and In, Rest) in
/// which case \p Out captures Rest.
template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
  Op0_t In;
  Op1_t &Out;

  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}

  template <typename OpTy> bool match(OpTy *V) const {
    // Not the bare mask itself: require (logical-and In, Rest) and bind Rest.
    if (!m_Specific(In).match(V))
      return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
    // V is exactly the mask being removed; nothing remains.
    Out = nullptr;
    return true;
  }
};
3059
3060/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
3061/// Returns the remaining part \p Out if so, or nullptr otherwise.
3062template <typename Op0_t, typename Op1_t>
3063static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3064 Op1_t &Out) {
3065 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3066}
3067
/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
/// recipe could be created.
/// \p HeaderMask Header Mask.
/// \p CurRecipe Recipe to be transform.
/// \p TypeInfo VPlan-based type analysis.
/// \p EVL The explicit vector length parameter of vector-predication
/// intrinsics.
static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
                                       VPRecipeBase &CurRecipe,
                                       VPTypeAnalysis &TypeInfo, VPValue &EVL) {
  VPlan *Plan = CurRecipe.getParent()->getPlan();
  DebugLoc DL = CurRecipe.getDebugLoc();
  VPValue *Addr, *Mask, *EndPtr;

  /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
  auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
    // Clone the end-pointer recipe and swap its length operand (index 1) from
    // VF to EVL so the pointer refers to the end of the active lanes only.
    auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(Val: EndPtr)->clone();
    EVLEndPtr->insertBefore(InsertPos: &CurRecipe);
    EVLEndPtr->setOperand(I: 1, New: &EVL);
    return EVLEndPtr;
  };

  // Forward (non-reverse) masked load -> EVL load, with the header mask
  // stripped from the mask operand.
  if (match(V: &CurRecipe,
            P: m_MaskedLoad(Addr: m_VPValue(V&: Addr), Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
      !cast<VPWidenLoadRecipe>(Val&: CurRecipe).isReverse())
    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(Val&: CurRecipe), Addr,
                                    EVL, Mask);

  // reverse(masked-load(vec-end-ptr)) -> vp.reverse(EVL load) with the end
  // pointer re-based on EVL instead of VF.
  VPValue *ReversedVal;
  if (match(V: &CurRecipe, P: m_Reverse(Op0: m_VPValue(V&: ReversedVal))) &&
      match(V: ReversedVal,
            P: m_MaskedLoad(Addr: m_VPValue(V&: EndPtr), Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
      match(V: EndPtr, P: m_VecEndPtr(Op0: m_VPValue(V&: Addr), Op1: m_Specific(VPV: &Plan->getVF()))) &&
      cast<VPWidenLoadRecipe>(Val: ReversedVal)->isReverse()) {
    auto *LoadR = new VPWidenLoadEVLRecipe(
        *cast<VPWidenLoadRecipe>(Val: ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
    LoadR->insertBefore(InsertPos: &CurRecipe);
    return new VPWidenIntrinsicRecipe(
        Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
        TypeInfo.inferScalarType(V: LoadR), {}, {}, DL);
  }

  // Forward (non-reverse) masked store -> EVL store.
  VPValue *StoredVal;
  if (match(V: &CurRecipe, P: m_MaskedStore(Addr: m_VPValue(V&: Addr), Val: m_VPValue(V&: StoredVal),
                                   Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
      !cast<VPWidenStoreRecipe>(Val&: CurRecipe).isReverse())
    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(Val&: CurRecipe), Addr,
                                     StoredVal, EVL, Mask);

  // masked-store(vec-end-ptr, reverse(v)) -> EVL store of vp.reverse(v), again
  // re-basing the end pointer on EVL.
  if (match(V: &CurRecipe,
            P: m_MaskedStore(Addr: m_VPValue(V&: EndPtr), Val: m_Reverse(Op0: m_VPValue(V&: ReversedVal)),
                          Mask: m_RemoveMask(In: HeaderMask, Out&: Mask))) &&
      match(V: EndPtr, P: m_VecEndPtr(Op0: m_VPValue(V&: Addr), Op1: m_Specific(VPV: &Plan->getVF()))) &&
      cast<VPWidenStoreRecipe>(Val&: CurRecipe).isReverse()) {
    auto *NewReverse = new VPWidenIntrinsicRecipe(
        Intrinsic::experimental_vp_reverse,
        {ReversedVal, Plan->getTrue(), &EVL},
        TypeInfo.inferScalarType(V: ReversedVal), {}, {}, DL);
    NewReverse->insertBefore(InsertPos: &CurRecipe);
    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(Val&: CurRecipe),
                                     AdjustEndPtr(EndPtr), NewReverse, EVL,
                                     Mask);
  }

  // Conditional reduction whose condition contains the header mask -> EVL
  // reduction with the remaining condition.
  if (auto *Rdx = dyn_cast<VPReductionRecipe>(Val: &CurRecipe))
    if (Rdx->isConditional() &&
        match(V: Rdx->getCondOp(), P: m_RemoveMask(In: HeaderMask, Out&: Mask)))
      return new VPReductionEVLRecipe(*Rdx, EVL, Mask);

  // Masked interleave group -> EVL interleave with the remaining mask.
  if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(Val: &CurRecipe))
    if (Interleave->getMask() &&
        match(V: Interleave->getMask(), P: m_RemoveMask(In: HeaderMask, Out&: Mask)))
      return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);

  // select(header-mask, LHS, RHS) -> vp.merge(true, LHS, RHS, EVL): lanes
  // beyond EVL take RHS, matching the all-false tail of the header mask.
  VPValue *LHS, *RHS;
  if (match(V: &CurRecipe,
            P: m_Select(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: LHS), Op2: m_VPValue(V&: RHS))))
    return new VPWidenIntrinsicRecipe(
        Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
        TypeInfo.inferScalarType(V: LHS), {}, {}, DL);

  // select(header-mask && Mask, LHS, RHS) -> vp.merge(Mask, LHS, RHS, EVL).
  if (match(V: &CurRecipe, P: m_Select(Op0: m_RemoveMask(In: HeaderMask, Out&: Mask), Op1: m_VPValue(V&: LHS),
                              Op2: m_VPValue(V&: RHS))))
    return new VPWidenIntrinsicRecipe(
        Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
        TypeInfo.inferScalarType(V: LHS), {}, {}, DL);

  // last-active-lane(header-mask) is simply EVL - 1 (zero-extended/truncated
  // to the result type).
  if (match(V: &CurRecipe, P: m_LastActiveLane(Op0: m_Specific(VPV: HeaderMask)))) {
    Type *Ty = TypeInfo.inferScalarType(V: CurRecipe.getVPSingleValue());
    VPValue *ZExt = VPBuilder(&CurRecipe)
                        .createScalarZExtOrTrunc(
                            Op: &EVL, ResultTy: Ty, SrcTy: TypeInfo.inferScalarType(V: &EVL), DL);
    return new VPInstruction(
        Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, Val: 1)},
        VPIRFlags::getDefaultFlags(Opcode: Instruction::Sub), {}, DL);
  }

  // No EVL-based form for this recipe.
  return nullptr;
}
3168
/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
/// The transforms here need to preserve the original semantics.
void VPlanTransforms::optimizeEVLMasks(VPlan &Plan) {
  // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
  VPValue *HeaderMask = nullptr, *EVL = nullptr;
  for (VPRecipeBase &R : *Plan.getVectorLoopRegion()->getEntryBasicBlock()) {
    if (match(V: &R, P: m_SpecificICmp(MatchPred: CmpInst::ICMP_ULT, Op0: m_StepVector(),
                             Op1: m_VPValue(V&: EVL))) &&
        match(V: EVL, P: m_EVL(Op0: m_VPValue()))) {
      HeaderMask = R.getVPSingleValue();
      break;
    }
  }
  if (!HeaderMask)
    return;

  // First pass: convert recipes masked by the header mask into EVL-based
  // recipes (loads/stores/reductions/interleaves/selects).
  VPTypeAnalysis TypeInfo(Plan);
  SmallVector<VPRecipeBase *> OldRecipes;
  for (VPUser *U : collectUsersRecursively(V: HeaderMask)) {
    VPRecipeBase *R = cast<VPRecipeBase>(Val: U);
    if (auto *NewR = optimizeMaskToEVL(HeaderMask, CurRecipe&: *R, TypeInfo, EVL&: *EVL)) {
      NewR->insertBefore(InsertPos: R);
      // Transfer all uses from the old recipe's results to the new one's,
      // position by position.
      for (auto [Old, New] :
           zip_equal(t: R->definedValues(), u: NewR->definedValues()))
        Old->replaceAllUsesWith(New);
      OldRecipes.push_back(Elt: R);
    }
  }

  // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
  // False, EVL)
  for (VPUser *U : collectUsersRecursively(V: HeaderMask)) {
    VPValue *Mask;
    if (match(U, P: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: Mask)))) {
      auto *LogicalAnd = cast<VPInstruction>(Val: U);
      auto *Merge = new VPWidenIntrinsicRecipe(
          Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
          TypeInfo.inferScalarType(V: Mask), {}, {}, LogicalAnd->getDebugLoc());
      Merge->insertBefore(InsertPos: LogicalAnd);
      LogicalAnd->replaceAllUsesWith(New: Merge);
      OldRecipes.push_back(Elt: LogicalAnd);
    }
  }

  // Erase old recipes at the end so we don't invalidate TypeInfo.
  for (VPRecipeBase *R : reverse(C&: OldRecipes)) {
    // Snapshot operands before erasing; they may become dead once R is gone.
    SmallVector<VPValue *> PossiblyDead(R->operands());
    R->eraseFromParent();
    for (VPValue *Op : PossiblyDead)
      recursivelyDeleteDeadRecipes(V: Op);
  }
}
3221
/// After replacing the canonical IV with a EVL-based IV, fixup recipes that use
/// VF to use the EVL instead to avoid incorrect updates on the penultimate
/// iteration.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
  VPTypeAnalysis TypeInfo(Plan);
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();

  // Only a closed set of recipe kinds may use VF here; anything else would
  // need its own EVL-aware handling.
  assert(all_of(Plan.getVF().users(),
                IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
                        VPWidenIntOrFpInductionRecipe>) &&
         "User of VF that we can't transform to EVL.");
  // VPVectorEndPointerRecipe uses are deliberately left on VF.
  Plan.getVF().replaceUsesWithIf(New: &EVL, ShouldReplace: [](VPUser &U, unsigned Idx) {
    return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(Val: U);
  });

  assert(all_of(Plan.getVFxUF().users(),
                match_fn(m_CombineOr(
                    m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
                            m_Specific(&Plan.getVFxUF())),
                    m_Isa<VPWidenPointerInductionRecipe>()))) &&
         "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
         "increment of the canonical induction.");
  Plan.getVFxUF().replaceUsesWithIf(New: &EVL, ShouldReplace: [](VPUser &U, unsigned Idx) {
    // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
    // canonical induction must not be updated.
    return isa<VPWidenPointerInductionRecipe>(Val: U);
  });

  // Create a scalar phi to track the previous EVL if fixed-order recurrence is
  // contained.
  bool ContainsFORs =
      any_of(Range: Header->phis(), P: IsaPred<VPFirstOrderRecurrencePHIRecipe>);
  if (ContainsFORs) {
    // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
    VPValue *MaxEVL = &Plan.getVF();
    // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
    VPBuilder Builder(LoopRegion->getPreheaderVPBB());
    MaxEVL = Builder.createScalarZExtOrTrunc(
        Op: MaxEVL, ResultTy: Type::getInt32Ty(C&: Plan.getContext()),
        SrcTy: TypeInfo.inferScalarType(V: MaxEVL), DL: DebugLoc::getUnknown());

    // prev.evl starts at the maximum EVL and carries last iteration's EVL
    // around the backedge.
    Builder.setInsertPoint(TheBB: Header, IP: Header->getFirstNonPhi());
    VPValue *PrevEVL = Builder.createScalarPhi(
        IncomingValues: {MaxEVL, &EVL}, DL: DebugLoc::getUnknown(), Name: "prev.evl");

    // Replace each first-order-recurrence splice with @llvm.experimental.
    // vp.splice using prev.evl/EVL; offset -1 selects the last active element
    // of the previous iteration.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
             Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
      for (VPRecipeBase &R : *VPBB) {
        VPValue *V1, *V2;
        if (!match(V: &R,
                   P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                       Ops: m_VPValue(V&: V1), Ops: m_VPValue(V&: V2))))
          continue;
        VPValue *Imm = Plan.getOrAddLiveIn(
            V: ConstantInt::getSigned(Ty: Type::getInt32Ty(C&: Plan.getContext()), V: -1));
        VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
            Intrinsic::experimental_vp_splice,
            {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
            TypeInfo.inferScalarType(V: R.getVPSingleValue()), {}, {},
            R.getDebugLoc());
        VPSplice->insertBefore(InsertPos: &R);
        R.getVPSingleValue()->replaceAllUsesWith(New: VPSplice);
      }
    }
  }

  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
  if (!HeaderMask)
    return;

  // Replace header masks with a mask equivalent to predicating by EVL:
  //
  // icmp ule widen-canonical-iv backedge-taken-count
  // ->
  // icmp ult step-vector, EVL
  VPRecipeBase *EVLR = EVL.getDefiningRecipe();
  VPBuilder Builder(EVLR->getParent(), std::next(x: EVLR->getIterator()));
  Type *EVLType = TypeInfo.inferScalarType(V: &EVL);
  VPValue *EVLMask = Builder.createICmp(
      Pred: CmpInst::ICMP_ULT,
      A: Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: EVLType), B: &EVL);
  HeaderMask->replaceAllUsesWith(New: EVLMask);
}
3306
/// Converts a tail folded vector loop region to step by
/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
/// iteration.
///
/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
///   replaces all uses except the canonical IV increment of
///   VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
///   VPCanonicalIVPHIRecipe is used only for loop iterations counting after
///   this transformation.
///
/// - The header mask is replaced with a header mask based on the EVL.
///
/// - Plans with FORs have a new phi added to keep track of the EVL of the
///   previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
///   @llvm.vp.splice.
///
/// The function uses the following definitions:
///  %StartV is the canonical induction start value.
///
/// The function adds the following recipes:
///
/// vector.ph:
/// ...
///
/// vector.body:
/// ...
/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
///                                      [ %NextIter, %vector.body ]
/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
/// ...
/// %OpEVL = cast i32 %VPEVL to IVSize
/// %NextIter = add IVSize %OpEVL, %CurrentIter
/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
/// ...
///
/// If MaxSafeElements is provided, the function adds the following recipes:
/// vector.ph:
/// ...
///
/// vector.body:
/// ...
/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
///                                      [ %NextIter, %vector.body ]
/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
/// %cmp = cmp ult %AVL, MaxSafeElements
/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
/// ...
/// %OpEVL = cast i32 %VPEVL to IVSize
/// %NextIter = add IVSize %OpEVL, %CurrentIter
/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
/// ...
///
void VPlanTransforms::addExplicitVectorLength(
    VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
  // EVL stepping only applies to vector VFs.
  if (Plan.hasScalarVFOnly())
    return;
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();

  auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
  auto *CanIVTy = LoopRegion->getCanonicalIVType();
  VPValue *StartV = CanonicalIVPHI->getStartValue();

  // Create the CurrentIteration recipe in the vector loop.
  auto *CurrentIteration =
      new VPCurrentIterationPHIRecipe(StartV, DebugLoc::getUnknown());
  CurrentIteration->insertAfter(InsertPos: CanonicalIVPHI);
  VPBuilder Builder(Header, Header->getFirstNonPhi());
  // Create the AVL (application vector length), starting from TC -> 0 in steps
  // of EVL.
  VPPhi *AVLPhi = Builder.createScalarPhi(
      IncomingValues: {Plan.getTripCount()}, DL: DebugLoc::getCompilerGenerated(), Name: "avl");
  VPValue *AVL = AVLPhi;

  if (MaxSafeElements) {
    // Support for MaxSafeDist for correct loop emission.
    // Clamp the AVL to the maximum safe distance: min(AVL, MaxSafeElements).
    VPValue *AVLSafe = Plan.getConstantInt(Ty: CanIVTy, Val: *MaxSafeElements);
    VPValue *Cmp = Builder.createICmp(Pred: ICmpInst::ICMP_ULT, A: AVL, B: AVLSafe);
    AVL = Builder.createSelect(Cond: Cmp, TrueVal: AVL, FalseVal: AVLSafe, DL: DebugLoc::getUnknown(),
                               Name: "safe_avl");
  }
  auto *VPEVL = Builder.createNaryOp(Opcode: VPInstruction::ExplicitVectorLength, Operands: AVL,
                                     DL: DebugLoc::getUnknown(), Name: "evl");

  auto *CanonicalIVIncrement =
      cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
  Builder.setInsertPoint(CanonicalIVIncrement);
  VPValue *OpVPEVL = VPEVL;

  // EVL is i32; widen/narrow it to the canonical IV type before arithmetic.
  auto *I32Ty = Type::getInt32Ty(C&: Plan.getContext());
  OpVPEVL = Builder.createScalarZExtOrTrunc(
      Op: OpVPEVL, ResultTy: CanIVTy, SrcTy: I32Ty, DL: CanonicalIVIncrement->getDebugLoc());

  auto *NextIter = Builder.createAdd(
      LHS: OpVPEVL, RHS: CurrentIteration, DL: CanonicalIVIncrement->getDebugLoc(),
      Name: "current.iteration.next", WrapFlags: CanonicalIVIncrement->getNoWrapFlags());
  CurrentIteration->addOperand(Operand: NextIter);

  // AVL counts down; EVL never exceeds AVL so the subtraction cannot wrap.
  VPValue *NextAVL =
      Builder.createSub(LHS: AVLPhi, RHS: OpVPEVL, DL: DebugLoc::getCompilerGenerated(),
                        Name: "avl.next", WrapFlags: {/*NUW=*/true, /*NSW=*/false});
  AVLPhi->addOperand(Operand: NextAVL);

  fixupVFUsersForEVL(Plan, EVL&: *VPEVL);
  removeDeadRecipes(Plan);

  // Replace all uses of VPCanonicalIVPHIRecipe by
  // VPCurrentIterationPHIRecipe except for the canonical IV increment.
  CanonicalIVPHI->replaceAllUsesWith(New: CurrentIteration);
  CanonicalIVIncrement->setOperand(I: 0, New: CanonicalIVPHI);
  // TODO: support unroll factor > 1.
  Plan.setUF(1);
}
3422
void VPlanTransforms::convertToVariableLengthStep(VPlan &Plan) {
  // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
  // There should be only one VPCurrentIteration in the entire plan.
  VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: Plan.getEntry())))
    for (VPRecipeBase &R : VPBB->phis())
      if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(Val: &R)) {
        assert(!CurrentIteration &&
               "Found multiple CurrentIteration. Only one expected");
        CurrentIteration = PhiR;
      }

  // Early return if it is not variable-length stepping.
  if (!CurrentIteration)
    return;

  VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
  VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();

  // Convert CurrentIteration to concrete recipe: lower the abstract phi to a
  // plain scalar phi over (start, backedge increment).
  auto *ScalarR =
      VPBuilder(CurrentIteration)
          .createScalarPhi(
              IncomingValues: {CurrentIteration->getStartValue(), CurrentIterationIncr},
              DL: CurrentIteration->getDebugLoc(), Name: "current.iteration.iv");
  CurrentIteration->replaceAllUsesWith(New: ScalarR);
  CurrentIteration->eraseFromParent();

  // Replace CanonicalIVInc with CurrentIteration increment.
  auto *CanonicalIV = cast<VPPhi>(Val: &*HeaderVPBB->begin());
  VPValue *Backedge = CanonicalIV->getIncomingValue(Idx: 1);
  assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
                                 m_Specific(&Plan.getVFxUF()))) &&
         "Unexpected canonical iv");
  Backedge->replaceAllUsesWith(New: CurrentIterationIncr);

  // Remove unused phi and increment. Erase the increment first since it still
  // uses the phi.
  VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
  CanonicalIVIncrement->eraseFromParent();
  CanonicalIV->eraseFromParent();
}
3466
/// For an EVL tail-folded loop, replace the latch exit condition based on the
/// canonical IV reaching the vector trip count with one based on the remaining
/// AVL reaching zero. No-op for plans that are not EVL tail-folded.
void VPlanTransforms::convertEVLExitCond(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  // The canonical IV may not exist at this stage.
  if (!LoopRegion ||
      !isa<VPCanonicalIVPHIRecipe>(Val: LoopRegion->getEntryBasicBlock()->front()))
    return;
  VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
  if (std::next(x: CanIV->getIterator()) == CanIV->getParent()->end())
    return;
  // The EVL IV is always immediately after the canonical IV.
  auto *EVLPhi = dyn_cast_or_null<VPCurrentIterationPHIRecipe>(
      Val: std::next(x: CanIV->getIterator()));
  if (!EVLPhi)
    return;

  // Bail if not an EVL tail folded loop: the EVL IV's backedge must be
  // (zext?) EVL(AVL) + EVLPhi.
  VPValue *AVL;
  if (!match(V: EVLPhi->getBackedgeValue(),
             P: m_c_Add(Op0: m_ZExtOrSelf(Op0: m_EVL(Op0: m_VPValue(V&: AVL))), Op1: m_Specific(VPV: EVLPhi))))
    return;

  // The AVL may be capped to a safe distance. Look through the
  // min(AVL, MaxSafeElements) select to the underlying AVL phi.
  VPValue *SafeAVL, *UnsafeAVL;
  if (match(V: AVL,
            P: m_Select(Op0: m_SpecificICmp(MatchPred: CmpInst::ICMP_ULT, Op0: m_VPValue(V&: UnsafeAVL),
                                 Op1: m_VPValue(V&: SafeAVL)),
                     Op1: m_Deferred(V: UnsafeAVL), Op2: m_Deferred(V: SafeAVL))))
    AVL = UnsafeAVL;

  VPValue *AVLNext;
  [[maybe_unused]] bool FoundAVLNext =
      match(V: AVL, P: m_VPInstruction<Instruction::PHI>(
                   Ops: m_Specific(VPV: Plan.getTripCount()), Ops: m_VPValue(V&: AVLNext)));
  assert(FoundAVLNext && "Didn't find AVL backedge?");

  VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
  auto *LatchBr = cast<VPInstruction>(Val: Latch->getTerminator());
  // Nothing to rewrite if the branch condition is already a constant true
  // (single-iteration loop).
  if (match(V: LatchBr, P: m_BranchOnCond(Op0: m_True())))
    return;

  assert(
      match(LatchBr,
            m_BranchOnCond(m_SpecificCmp(
                CmpInst::ICMP_EQ, m_Specific(CanIV->getIncomingValue(1)),
                m_Specific(&Plan.getVectorTripCount())))) &&
      "Expected BranchOnCond with ICmp comparing CanIV increment with vector "
      "trip count");

  // Exit when the remaining AVL hits zero.
  Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(V: AVLNext);
  VPBuilder Builder(LatchBr);
  LatchBr->setOperand(
      I: 0, New: Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: AVLNext, B: Plan.getZero(Ty: AVLTy)));
}
3520
void VPlanTransforms::replaceSymbolicStrides(
    VPlan &Plan, PredicatedScalarEvolution &PSE,
    const DenseMap<Value *, const SCEV *> &StridesMap) {
  // Replace VPValues for known constant strides guaranteed by predicate scalar
  // evolution.
  // The versioned stride is only valid inside the vector loop region (and its
  // single predecessor), where the stride predicate is known to hold.
  auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
    auto *R = cast<VPRecipeBase>(Val: &U);
    return R->getRegion() ||
           R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
  };
  ValueToSCEVMapTy RewriteMap;
  for (const SCEV *Stride : StridesMap.values()) {
    using namespace SCEVPatternMatch;
    auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue();
    const APInt *StrideConst;
    if (!match(S: PSE.getSCEV(V: StrideV), P: m_scev_APInt(C&: StrideConst)))
      // Only handle constant strides for now.
      continue;

    auto *CI = Plan.getConstantInt(Val: *StrideConst);
    if (VPValue *StrideVPV = Plan.getLiveIn(V: StrideV))
      StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);

    // The versioned value may not be used in the loop directly but through a
    // sext/zext. Add new live-ins in those cases.
    for (Value *U : StrideV->users()) {
      if (!isa<SExtInst, ZExtInst>(Val: U))
        continue;
      VPValue *StrideVPV = Plan.getLiveIn(V: U);
      if (!StrideVPV)
        continue;
      // Extend the constant to the width the sext/zext produces, matching the
      // extension kind.
      unsigned BW = U->getType()->getScalarSizeInBits();
      APInt C =
          isa<SExtInst>(Val: U) ? StrideConst->sext(width: BW) : StrideConst->zext(width: BW);
      VPValue *CI = Plan.getConstantInt(Val: C);
      StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
    }
    RewriteMap[StrideV] = PSE.getSCEV(V: StrideV);
  }

  // Re-expand SCEV expressions in the entry block that mention versioned
  // strides, so they use the constant-folded form as well.
  for (VPRecipeBase &R : *Plan.getEntry()) {
    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
    if (!ExpSCEV)
      continue;
    const SCEV *ScevExpr = ExpSCEV->getSCEV();
    auto *NewSCEV =
        SCEVParameterRewriter::rewrite(Scev: ScevExpr, SE&: *PSE.getSE(), Map&: RewriteMap);
    if (NewSCEV != ScevExpr) {
      VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: NewSCEV);
      ExpSCEV->replaceAllUsesWith(New: NewExp);
      // Keep the plan's trip count pointing at the live expansion.
      if (Plan.getTripCount() == ExpSCEV)
        Plan.resetTripCount(NewTripCount: NewExp);
    }
  }
}
3576
void VPlanTransforms::dropPoisonGeneratingRecipes(
    VPlan &Plan,
    const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Elt: Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.pop_back_val();

      if (!Visited.insert(Ptr: CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
              VPHeaderPHIRecipe>(Val: CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. If the underlying instruction has poison-generating flags,
      // drop them directly.
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: CurRec)) {
        VPValue *A, *B;
        // Dropping disjoint from an OR may yield incorrect results, as some
        // analysis may have converted it to an Add implicitly (e.g. SCEV used
        // for dependence analysis). Instead, replace it with an equivalent Add.
        // This is possible as all users of the disjoint OR only access lanes
        // where the operands are disjoint or poison otherwise.
        if (match(V: RecWithFlags, P: m_BinaryOr(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))) &&
            RecWithFlags->isDisjoint()) {
          VPBuilder Builder(RecWithFlags);
          VPInstruction *New =
              Builder.createAdd(LHS: A, RHS: B, DL: RecWithFlags->getDebugLoc());
          New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
          RecWithFlags->replaceAllUsesWith(New);
          RecWithFlags->eraseFromParent();
          // Continue the slice walk from the replacement Add.
          CurRec = New;
        } else
          RecWithFlags->dropPoisonGeneratingFlags();
      } else {
        // Recipes without IR flags must not carry poison-generating flags on
        // their underlying instruction; verify that in asserts builds.
        Instruction *Instr = dyn_cast_or_null<Instruction>(
            Val: CurRec->getVPSingleValue()->getUnderlyingValue());
        (void)Instr;
        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
               "found instruction with poison generating flags not covered by "
               "VPRecipeWithIRFlags");
      }

      // Add new definitions to the worklist.
      for (VPValue *Operand : CurRec->operands())
        if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
          Worklist.push_back(Elt: OpDef);
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe or
  // VPInterleaveRecipe.
  auto Iter = vp_depth_first_deep(G: Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(Val: &Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
            BlockNeedsPredication(UnderlyingInstr.getParent()))
          CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(Val: &Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(Index: I);
            if (Member)
              NeedPredication |= BlockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
        }
      }
    }
  }
}
3672
void VPlanTransforms::createInterleaveGroups(
    VPlan &Plan,
    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
        &InterleaveGroups,
    VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
  if (InterleaveGroups.empty())
    return;

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  VPDominatorTree VPDT(Plan);
  for (const auto *IG : InterleaveGroups) {
    // Member 0's recipe seeds both the metadata intersection and the group's
    // base address.
    auto *Start =
        cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getMember(Index: 0)));
    VPIRMetadata InterleaveMD(*Start);
    SmallVector<VPValue *, 4> StoredValues;
    if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Val: Start))
      StoredValues.push_back(Elt: StoreR->getStoredValue());
    // Gather stored values from the remaining members and keep only metadata
    // common to all members.
    for (unsigned I = 1; I < IG->getFactor(); ++I) {
      Instruction *MemberI = IG->getMember(Index: I);
      if (!MemberI)
        continue;
      VPWidenMemoryRecipe *MemoryR =
          cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: MemberI));
      if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Val: MemoryR))
        StoredValues.push_back(Elt: StoreR->getStoredValue());
      InterleaveMD.intersect(MD: *MemoryR);
    }

    // Stores with gaps, or groups that would otherwise read past the trip
    // count without a scalar epilogue, need a gap mask.
    bool NeedsMaskForGaps =
        (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
        (!StoredValues.empty() && !IG->isFull());

    Instruction *IRInsertPos = IG->getInsertPos();
    auto *InsertPos =
        cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IRInsertPos));

    // Carry over GEP no-wrap flags from the insert position's address, minus
    // nuw which does not survive the negative-offset adjustment below.
    GEPNoWrapFlags NW = GEPNoWrapFlags::none();
    if (auto *Gep = dyn_cast<GetElementPtrInst>(
            Val: getLoadStorePointerOperand(V: IRInsertPos)->stripPointerCasts()))
      NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();

    // Get or create the start address for the interleave group.
    VPValue *Addr = Start->getAddr();
    VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
    if (AddrDef && !VPDT.properlyDominates(A: AddrDef, B: InsertPos)) {
      // We cannot re-use the address of member zero because it does not
      // dominate the insert position. Instead, use the address of the insert
      // position and create a PtrAdd adjusting it to the address of member
      // zero.
      // TODO: Hoist Addr's defining recipe (and any operands as needed) to
      // InsertPos or sink loads above zero members to join it.
      assert(IG->getIndex(IRInsertPos) != 0 &&
             "index of insert position shouldn't be zero");
      auto &DL = IRInsertPos->getDataLayout();
      APInt Offset(32,
                   DL.getTypeAllocSize(Ty: getLoadStoreType(I: IRInsertPos)) *
                       IG->getIndex(Instr: IRInsertPos),
                   /*IsSigned=*/true);
      // Negative offset: step back from the insert position's address to
      // member zero's address.
      VPValue *OffsetVPV = Plan.getConstantInt(Val: -Offset);
      VPBuilder B(InsertPos);
      Addr = B.createNoWrapPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV, GEPFlags: NW);
    }
    // If the group is reverse, adjust the index to refer to the last vector
    // lane instead of the first. We adjust the index from the first vector
    // lane, rather than directly getting the pointer for lane VF - 1, because
    // the pointer operand of the interleaved access is supposed to be uniform.
    if (IG->isReverse()) {
      auto *ReversePtr = new VPVectorEndPointerRecipe(
          Addr, &Plan.getVF(), getLoadStoreType(I: IRInsertPos),
          -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
      ReversePtr->insertBefore(InsertPos);
      Addr = ReversePtr;
    }
    auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
                                        InsertPos->getMask(), NeedsMaskForGaps,
                                        InterleaveMD, InsertPos->getDebugLoc());
    VPIG->insertBefore(InsertPos);

    // Rewire uses of each load member to the corresponding result of the
    // interleave recipe, then erase the now-redundant member recipes.
    unsigned J = 0;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(Index: i)) {
        VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member);
        if (!Member->getType()->isVoidTy()) {
          VPValue *OriginalV = MemberR->getVPSingleValue();
          OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J));
          J++;
        }
        MemberR->eraseFromParent();
      }
  }
}
3766
3767/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3768/// value, phi and backedge value. In the following example:
3769///
3770/// vector.ph:
3771/// Successor(s): vector loop
3772///
3773/// <x1> vector loop: {
3774/// vector.body:
3775/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3776/// ...
3777/// EMIT branch-on-count ...
3778/// No successors
3779/// }
3780///
3781/// WIDEN-INDUCTION will get expanded to:
3782///
3783/// vector.ph:
3784/// ...
3785/// vp<%induction.start> = ...
3786/// vp<%induction.increment> = ...
3787///
3788/// Successor(s): vector loop
3789///
3790/// <x1> vector loop: {
3791/// vector.body:
3792/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3793/// ...
3794/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3795/// EMIT branch-on-count ...
3796/// No successors
3797/// }
static void
expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
                              VPTypeAnalysis &TypeInfo) {
  VPlan *Plan = WidenIVR->getParent()->getPlan();
  VPValue *Start = WidenIVR->getStartValue();
  VPValue *Step = WidenIVR->getStepValue();
  VPValue *VF = WidenIVR->getVFValue();
  DebugLoc DL = WidenIVR->getDebugLoc();

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Type *Ty = TypeInfo.inferScalarType(V: WidenIVR);

  const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  // Carry the flags of the original recipe (e.g. wrapping or fast-math flags)
  // over to the expanded add/mul recipes.
  VPIRFlags Flags = *WidenIVR;
  if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    // For FP inductions the descriptor records the actual induction opcode
    // (fadd or fsub).
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // If the phi is truncated, truncate the start and step values.
  VPBuilder Builder(Plan->getVectorPreheader());
  Type *StepTy = TypeInfo.inferScalarType(V: Step);
  if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy: Ty, DL);
    Start = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Start, ResultTy: Ty, DL);
    StepTy = Ty;
  }

  // Construct the initial value of the vector IV in the vector loop preheader.
  // StepVector produces an integer <0, 1, ..., VF-1>; for FP inductions it is
  // converted to the step type first.
  Type *IVIntTy =
      IntegerType::get(C&: Plan->getContext(), NumBits: StepTy->getScalarSizeInBits());
  VPValue *Init = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: IVIntTy);
  if (StepTy->isFloatingPointTy())
    Init = Builder.createWidenCast(Opcode: Instruction::UIToFP, Op: Init, ResultTy: StepTy);

  VPValue *SplatStart = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Start);
  VPValue *SplatStep = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Step);

  // Init = splat(Start) + <0, 1, ..., VF-1> * splat(Step).
  Init = Builder.createNaryOp(Opcode: MulOp, Operands: {Init, SplatStep}, Flags);
  Init = Builder.createNaryOp(Opcode: AddOp, Operands: {SplatStart, Init}, Flags,
                              DL: DebugLoc::getUnknown(), Name: "induction");

  // Create the widened phi of the vector IV.
  auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
                                       WidenIVR->getDebugLoc(), "vec.ind");
  WidePHI->insertBefore(InsertPos: WidenIVR);

  // Create the backedge value for the vector IV.
  VPValue *Inc;
  VPValue *Prev;
  // If unrolled, use the increment and prev value from the operands.
  if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
    Inc = SplatVF;
    Prev = WidenIVR->getLastUnrolledPartOperand();
  } else {
    // Place the increment computation directly after VF's defining recipe (if
    // any) so it dominates its uses.
    if (VPRecipeBase *R = VF->getDefiningRecipe())
      Builder.setInsertPoint(TheBB: R->getParent(), IP: std::next(x: R->getIterator()));
    // Multiply the vectorization factor by the step using integer or
    // floating-point arithmetic as appropriate.
    if (StepTy->isFloatingPointTy())
      VF = Builder.createScalarCast(Opcode: Instruction::CastOps::UIToFP, Op: VF, ResultTy: StepTy,
                                    DL);
    else
      VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy,
                                           SrcTy: TypeInfo.inferScalarType(V: VF), DL);

    Inc = Builder.createNaryOp(Opcode: MulOp, Operands: {Step, VF}, Flags);
    Inc = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Inc);
    Prev = WidePHI;
  }

  // Emit the backedge increment in the exiting block, just before its
  // terminator, and hook it up as the phi's second operand.
  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
  Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator());
  auto *Next = Builder.createNaryOp(Opcode: AddOp, Operands: {Prev, Inc}, Flags,
                                    DL: WidenIVR->getDebugLoc(), Name: "vec.ind.next");

  WidePHI->addOperand(Operand: Next);

  WidenIVR->replaceAllUsesWith(New: WidePHI);
}
3885
3886/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3887/// initial value, phi and backedge value. In the following example:
3888///
3889/// <x1> vector loop: {
3890/// vector.body:
3891/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3892/// ...
3893/// EMIT branch-on-count ...
3894/// }
3895///
3896/// WIDEN-POINTER-INDUCTION will get expanded to:
3897///
3898/// <x1> vector loop: {
3899/// vector.body:
3900/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3901/// EMIT %mul = mul %stepvector, %step
3902/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3903/// ...
3904/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3905/// EMIT branch-on-count ...
3906/// }
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
                                          VPTypeAnalysis &TypeInfo) {
  VPlan *Plan = R->getParent()->getPlan();
  VPValue *Start = R->getStartValue();
  VPValue *Step = R->getStepValue();
  VPValue *VF = R->getVFValue();

  assert(R->getInductionDescriptor().getKind() ==
             InductionDescriptor::IK_PtrInduction &&
         "Not a pointer induction according to InductionDescriptor!");
  assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
  assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
         "Recipe should have been replaced");

  VPBuilder Builder(R);
  DebugLoc DL = R->getDebugLoc();

  // Build a scalar pointer phi.
  VPPhi *ScalarPtrPhi = Builder.createScalarPhi(IncomingValues: Start, DL, Name: "pointer.phi");

  // Create actual address geps that use the pointer phi as base and a
  // vectorized version of the step value (<step*0, ..., step*N>) as offset.
  // Inserted after the phis so the wide gep stays in the phi-free part of the
  // block.
  Builder.setInsertPoint(TheBB: R->getParent(), IP: R->getParent()->getFirstNonPhi());
  Type *StepTy = TypeInfo.inferScalarType(V: Step);
  VPValue *Offset = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: StepTy);
  Offset = Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {Offset, Step});
  VPValue *PtrAdd =
      Builder.createWidePtrAdd(Ptr: ScalarPtrPhi, Offset, DL, Name: "vector.gep");
  R->replaceAllUsesWith(New: PtrAdd);

  // Create the backedge value for the scalar pointer phi: advance the pointer
  // by VF * Step per iteration, emitted before the exiting block's terminator.
  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
  Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator());
  VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy, SrcTy: TypeInfo.inferScalarType(V: VF),
                                       DL);
  VPValue *Inc = Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {Step, VF});

  VPValue *InductionGEP =
      Builder.createPtrAdd(Ptr: ScalarPtrPhi, Offset: Inc, DL, Name: "ptr.ind");
  ScalarPtrPhi->addOperand(Operand: InductionGEP);
}
3948
3949void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3950 // Replace loop regions with explicity CFG.
3951 SmallVector<VPRegionBlock *> LoopRegions;
3952 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3953 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
3954 if (!R->isReplicator())
3955 LoopRegions.push_back(Elt: R);
3956 }
3957 for (VPRegionBlock *R : LoopRegions)
3958 R->dissolveToCFGLoop();
3959}
3960
void VPlanTransforms::expandBranchOnTwoConds(VPlan &Plan) {
  SmallVector<VPInstruction *> WorkList;
  // The transform runs after dissolving loop regions, so all VPBasicBlocks
  // terminated with BranchOnTwoConds are reached via a shallow traversal.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: Plan.getEntry()))) {
    if (!VPBB->empty() && match(V: &VPBB->back(), P: m_BranchOnTwoConds()))
      WorkList.push_back(Elt: cast<VPInstruction>(Val: &VPBB->back()));
  }

  // Expand BranchOnTwoConds instructions into explicit CFG with two new
  // single-condition branches:
  // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
  // the first condition is true, and otherwise jumps to a new interim block.
  // 2. A branch that ends the interim block, jumps to the second successor if
  // the second condition is true, and otherwise jumps to the third
  // successor.
  for (VPInstruction *Br : WorkList) {
    assert(Br->getNumOperands() == 2 &&
           "BranchOnTwoConds must have exactly 2 conditions");
    DebugLoc DL = Br->getDebugLoc();
    VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
    // Copy the successor list before disconnecting, since disconnecting
    // mutates it.
    const auto Successors = to_vector(Range&: BrOnTwoCondsBB->getSuccessors());
    assert(Successors.size() == 3 &&
           "BranchOnTwoConds must have exactly 3 successors");

    for (VPBlockBase *Succ : Successors)
      VPBlockUtils::disconnectBlocks(From: BrOnTwoCondsBB, To: Succ);

    VPValue *Cond0 = Br->getOperand(N: 0);
    VPValue *Cond1 = Br->getOperand(N: 1);
    VPBlockBase *Succ0 = Successors[0];
    VPBlockBase *Succ1 = Successors[1];
    VPBlockBase *Succ2 = Successors[2];
    assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
           !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");

    VPBasicBlock *InterimBB =
        Plan.createVPBasicBlock(Name: BrOnTwoCondsBB->getName() + ".interim");

    // First branch: Cond0 ? Succ0 : InterimBB.
    VPBuilder(BrOnTwoCondsBB)
        .createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond0}, DL);
    VPBlockUtils::connectBlocks(From: BrOnTwoCondsBB, To: Succ0);
    VPBlockUtils::connectBlocks(From: BrOnTwoCondsBB, To: InterimBB);

    // Second branch: Cond1 ? Succ1 : Succ2.
    VPBuilder(InterimBB).createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {Cond1}, DL);
    VPBlockUtils::connectBlocks(From: InterimBB, To: Succ1);
    VPBlockUtils::connectBlocks(From: InterimBB, To: Succ2);
    Br->eraseFromParent();
  }
}
4012
void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
  VPTypeAnalysis TypeInfo(Plan);
  // Replaced recipes are collected here and only erased after the traversal.
  SmallVector<VPRecipeBase *> ToRemove;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_deep(G: Plan.getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      // Expand widened int/FP inductions into explicit phi/step recipes.
      if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R)) {
        expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
        ToRemove.push_back(Elt: WidenIVR);
        continue;
      }

      if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(Val: &R)) {
        // If the recipe only generates scalars, scalarize it instead of
        // expanding it.
        if (WidenIVR->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF())) {
          VPBuilder Builder(WidenIVR);
          VPValue *PtrAdd =
              scalarizeVPWidenPointerInduction(PtrIV: WidenIVR, Plan, Builder);
          WidenIVR->replaceAllUsesWith(New: PtrAdd);
          ToRemove.push_back(Elt: WidenIVR);
          continue;
        }
        expandVPWidenPointerInduction(R: WidenIVR, TypeInfo);
        ToRemove.push_back(Elt: WidenIVR);
        continue;
      }

      // Expand VPBlendRecipe into VPInstruction::Select.
      VPBuilder Builder(&R);
      if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R)) {
        // Fold the incoming values into a chain of selects, starting from
        // incoming value 0 and overriding it per mask.
        VPValue *Select = Blend->getIncomingValue(Idx: 0);
        for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
          Select = Builder.createSelect(Cond: Blend->getMask(Idx: I),
                                        TrueVal: Blend->getIncomingValue(Idx: I), FalseVal: Select,
                                        DL: R.getDebugLoc(), Name: "predphi", Flags: *Blend);
        Blend->replaceAllUsesWith(New: Select);
        ToRemove.push_back(Elt: Blend);
      }

      if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: &R)) {
        if (!VEPR->getOffset()) {
          assert(Plan.getConcreteUF() == 1 &&
                 "Expected unroller to have materialized offset for UF != 1");
          VEPR->materializeOffset();
        }
      }

      // Decompose abstract expression recipes into their constituent recipes.
      if (auto *Expr = dyn_cast<VPExpressionRecipe>(Val: &R)) {
        Expr->decompose();
        ToRemove.push_back(Elt: Expr);
      }

      // Expand LastActiveLane into Not + FirstActiveLane + Sub.
      auto *LastActiveL = dyn_cast<VPInstruction>(Val: &R);
      if (LastActiveL &&
          LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
        // Create Not(Mask) for all operands.
        SmallVector<VPValue *, 2> NotMasks;
        for (VPValue *Op : LastActiveL->operands()) {
          VPValue *NotMask = Builder.createNot(Operand: Op, DL: LastActiveL->getDebugLoc());
          NotMasks.push_back(Elt: NotMask);
        }

        // Create FirstActiveLane on the inverted masks.
        VPValue *FirstInactiveLane = Builder.createNaryOp(
            Opcode: VPInstruction::FirstActiveLane, Operands: NotMasks,
            DL: LastActiveL->getDebugLoc(), Name: "first.inactive.lane");

        // Subtract 1 to get the last active lane.
        VPValue *One =
            Plan.getConstantInt(Ty: TypeInfo.inferScalarType(V: FirstInactiveLane), Val: 1);
        VPValue *LastLane =
            Builder.createSub(LHS: FirstInactiveLane, RHS: One,
                              DL: LastActiveL->getDebugLoc(), Name: "last.active.lane");

        LastActiveL->replaceAllUsesWith(New: LastLane);
        ToRemove.push_back(Elt: LastActiveL);
        continue;
      }

      // Lower MaskedCond with block mask to LogicalAnd.
      if (match(V: &R, P: m_VPInstruction<VPInstruction::MaskedCond>())) {
        auto *VPI = cast<VPInstruction>(Val: &R);
        assert(VPI->isMasked() &&
               "Unmasked MaskedCond should be simplified earlier");
        VPI->replaceAllUsesWith(New: Builder.createNaryOp(
            Opcode: VPInstruction::LogicalAnd, Operands: {VPI->getMask(), VPI->getOperand(N: 0)}));
        ToRemove.push_back(Elt: VPI);
        continue;
      }

      // Lower BranchOnCount to ICmp + BranchOnCond.
      VPValue *IV, *TC;
      if (match(V: &R, P: m_BranchOnCount(Op0: m_VPValue(V&: IV), Op1: m_VPValue(V&: TC)))) {
        auto *BranchOnCountInst = cast<VPInstruction>(Val: &R);
        DebugLoc DL = BranchOnCountInst->getDebugLoc();
        VPValue *Cond = Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: IV, B: TC, DL);
        Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: Cond, DL);
        ToRemove.push_back(Elt: BranchOnCountInst);
        continue;
      }

      VPValue *VectorStep;
      VPValue *ScalarStep;
      if (!match(V: &R, P: m_VPInstruction<VPInstruction::WideIVStep>(
                          Ops: m_VPValue(V&: VectorStep), Ops: m_VPValue(V&: ScalarStep))))
        continue;

      // Expand WideIVStep: multiply the (possibly converted) vector step by
      // the scalar step.
      auto *VPI = cast<VPInstruction>(Val: &R);
      Type *IVTy = TypeInfo.inferScalarType(V: VPI);
      if (TypeInfo.inferScalarType(V: VectorStep) != IVTy) {
        Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
                                          ? Instruction::UIToFP
                                          : Instruction::Trunc;
        VectorStep = Builder.createWidenCast(Opcode: CastOp, Op: VectorStep, ResultTy: IVTy);
      }

      assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
      if (TypeInfo.inferScalarType(V: ScalarStep) != IVTy) {
        ScalarStep =
            Builder.createWidenCast(Opcode: Instruction::Trunc, Op: ScalarStep, ResultTy: IVTy);
      }

      VPIRFlags Flags;
      unsigned MulOpc;
      if (IVTy->isFloatingPointTy()) {
        MulOpc = Instruction::FMul;
        Flags = VPI->getFastMathFlags();
      } else {
        MulOpc = Instruction::Mul;
        Flags = VPIRFlags::getDefaultFlags(Opcode: MulOpc);
      }

      VPInstruction *Mul = Builder.createNaryOp(
          Opcode: MulOpc, Operands: {VectorStep, ScalarStep}, Flags, DL: R.getDebugLoc());
      VectorStep = Mul;
      VPI->replaceAllUsesWith(New: VectorStep);
      ToRemove.push_back(Elt: VPI);
    }
  }

  // Erase the replaced recipes only now, after all uses have been rewritten.
  for (VPRecipeBase *R : ToRemove)
    R->eraseFromParent();
}
4159
void VPlanTransforms::handleUncountableEarlyExits(VPlan &Plan,
                                                  VPBasicBlock *HeaderVPBB,
                                                  VPBasicBlock *LatchVPBB,
                                                  VPBasicBlock *MiddleVPBB,
                                                  UncountableExitStyle Style) {
  // Per-exit bookkeeping: the exiting block inside the loop, the IR exit block
  // it targets, and the condition under which this exit is taken.
  struct EarlyExitInfo {
    VPBasicBlock *EarlyExitingVPBB;
    VPIRBasicBlock *EarlyExitVPBB;
    VPValue *CondToExit;
  };

  VPDominatorTree VPDT(Plan);
  VPBuilder Builder(LatchVPBB->getTerminator());
  SmallVector<EarlyExitInfo> Exits;
  for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
    for (VPBlockBase *Pred : to_vector(Range&: ExitBlock->getPredecessors())) {
      // The middle block is the countable (latch) exit; skip it here.
      if (Pred == MiddleVPBB)
        continue;
      // Collect condition for this early exit.
      auto *EarlyExitingVPBB = cast<VPBasicBlock>(Val: Pred);
      VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
      VPValue *CondOfEarlyExitingVPBB;
      [[maybe_unused]] bool Matched =
          match(V: EarlyExitingVPBB->getTerminator(),
                P: m_BranchOnCond(Op0: m_VPValue(V&: CondOfEarlyExitingVPBB)));
      assert(Matched && "Terminator must be BranchOnCond");

      // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
      // the correct block mask. The condition is negated if the exit is taken
      // on the false edge.
      VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
      auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
          Opcode: VPInstruction::MaskedCond,
          Operands: TrueSucc == ExitBlock
              ? CondOfEarlyExitingVPBB
              : EarlyExitingBuilder.createNot(Operand: CondOfEarlyExitingVPBB));
      assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
              !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
              VPDT.properlyDominates(
                  CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
                  LatchVPBB)) &&
             "exit condition must dominate the latch");
      Exits.push_back(Elt: {
          .EarlyExitingVPBB: EarlyExitingVPBB,
          .EarlyExitVPBB: ExitBlock,
          .CondToExit: CondToEarlyExit,
      });
    }
  }

  assert(!Exits.empty() && "must have at least one early exit");
  // Sort exits by RPO order to get correct program order. RPO gives a
  // topological ordering of the CFG, ensuring upstream exits are checked
  // before downstream exits in the dispatch chain.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      HeaderVPBB);
  DenseMap<VPBlockBase *, unsigned> RPOIdx;
  for (const auto &[Num, VPB] : enumerate(First&: RPOT))
    RPOIdx[VPB] = Num;
  llvm::sort(C&: Exits, Comp: [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
    return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
  });
#ifndef NDEBUG
  // After RPO sorting, verify that for any pair where one exit dominates
  // another, the dominating exit comes first. This is guaranteed by RPO
  // (topological order) and is required for the dispatch chain correctness.
  for (unsigned I = 0; I + 1 < Exits.size(); ++I)
    for (unsigned J = I + 1; J < Exits.size(); ++J)
      assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
                                     Exits[I].EarlyExitingVPBB) &&
             "RPO sort must place dominating exits before dominated ones");
#endif

  // Build the AnyOf condition for the latch terminator using logical OR
  // to avoid poison propagation from later exit conditions when an earlier
  // exit is taken.
  VPValue *Combined = Exits[0].CondToExit;
  for (const EarlyExitInfo &Info : drop_begin(RangeOrContainer&: Exits))
    Combined = Builder.createLogicalOr(LHS: Combined, RHS: Info.CondToExit);

  VPValue *IsAnyExitTaken =
      Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {Combined});

  assert(Style == UncountableExitStyle::ReadOnly &&
         "Early exit store masking not implemented");

  // Create the vector.early.exit blocks, one per early exit; the numeric
  // suffix is only added when there are multiple exits.
  SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
  for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
    Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
    VPBasicBlock *VectorEarlyExitVPBB =
        Plan.createVPBasicBlock(Name: "vector.early.exit" + BlockSuffix);
    VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
  }

  // Create the dispatch block (or reuse the single exit block if only one
  // exit). The dispatch block computes the first active lane of the combined
  // condition and, for multiple exits, chains through conditions to determine
  // which exit to take.
  VPBasicBlock *DispatchVPBB =
      Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
                        : Plan.createVPBasicBlock(Name: "vector.early.exit.check");
  VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
  VPValue *FirstActiveLane =
      DispatchBuilder.createNaryOp(Opcode: VPInstruction::FirstActiveLane, Operands: {Combined},
                                   DL: DebugLoc::getUnknown(), Name: "first.active.lane");

  // For each early exit, disconnect the original exiting block
  // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
  // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
  // values at the first active lane:
  //
  // Input:
  // early.exiting.I:
  //   ...
  //   EMIT branch-on-cond vp<%cond.I>
  // Successor(s): in.loop.succ, ir-bb<exit.I>
  //
  // ir-bb<exit.I>:
  //   IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
  //
  // Output:
  // early.exiting.I:
  //   ...
  // Successor(s): in.loop.succ
  //
  // vector.early.exit.I:
  //   EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
  // Successor(s): ir-bb<exit.I>
  //
  // ir-bb<exit.I>:
  //   IR %phi = phi ... (extra operand: vp<%exit.val> from
  //   vector.early.exit.I)
  //
  for (auto [Exit, VectorEarlyExitVPBB] :
       zip_equal(t&: Exits, u&: VectorEarlyExitVPBBs)) {
    auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
    // Adjust the phi nodes in EarlyExitVPBB.
    // 1. remove incoming values from EarlyExitingVPBB,
    // 2. extract the incoming value at FirstActiveLane
    // 3. add back the extracts as last operands for the phis
    // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
    // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
    // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
    // values from VectorEarlyExitVPBB.
    for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(Val: &R);
      VPValue *IncomingVal =
          ExitIRI->getIncomingValueForBlock(VPBB: EarlyExitingVPBB);
      VPValue *NewIncoming = IncomingVal;
      // Live-in IR values are loop-invariant; no extract is needed for them.
      if (!isa<VPIRValue>(Val: IncomingVal)) {
        VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
        NewIncoming = EarlyExitBuilder.createNaryOp(
            Opcode: VPInstruction::ExtractLane, Operands: {FirstActiveLane, IncomingVal},
            DL: DebugLoc::getUnknown(), Name: "early.exit.value");
      }
      ExitIRI->removeIncomingValueFor(IncomingBlock: EarlyExitingVPBB);
      ExitIRI->addOperand(Operand: NewIncoming);
    }

    EarlyExitingVPBB->getTerminator()->eraseFromParent();
    VPBlockUtils::disconnectBlocks(From: EarlyExitingVPBB, To: EarlyExitVPBB);
    VPBlockUtils::connectBlocks(From: VectorEarlyExitVPBB, To: EarlyExitVPBB);
  }

  // Chain through exits: for each exit, check if its condition is true at
  // the first active lane. If so, take that exit; otherwise, try the next.
  // The last exit needs no check since it must be taken if all others fail.
  //
  // For 3 exits (cond.0, cond.1, cond.2), this creates:
  //
  // latch:
  //   ...
  //   EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
  //   ...
  //
  // vector.early.exit.check:
  //   EMIT vp<%first.lane> = first-active-lane vp<%combined>
  //   EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
  //   EMIT branch-on-cond vp<%at.cond.0>
  // Successor(s): vector.early.exit.0, vector.early.exit.check.0
  //
  // vector.early.exit.check.0:
  //   EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
  //   EMIT branch-on-cond vp<%at.cond.1>
  // Successor(s): vector.early.exit.1, vector.early.exit.2
  VPBasicBlock *CurrentBB = DispatchVPBB;
  for (auto [I, Exit] : enumerate(First: ArrayRef(Exits).drop_back())) {
    VPValue *LaneVal = DispatchBuilder.createNaryOp(
        Opcode: VPInstruction::ExtractLane, Operands: {FirstActiveLane, Exit.CondToExit},
        DL: DebugLoc::getUnknown(), Name: "exit.cond.at.lane");

    // For the last dispatch, branch directly to the last exit on false;
    // otherwise, create a new check block.
    bool IsLastDispatch = (I + 2 == Exits.size());
    VPBasicBlock *FalseBB =
        IsLastDispatch ? VectorEarlyExitVPBBs.back()
                       : Plan.createVPBasicBlock(
                             Name: Twine("vector.early.exit.check.") + Twine(I));

    DispatchBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {LaneVal});
    CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
    VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
    FalseBB->setPredecessors({CurrentBB});

    CurrentBB = FalseBB;
    DispatchBuilder.setInsertPoint(CurrentBB);
  }

  // Replace the latch terminator with the new branching logic: a
  // BranchOnTwoConds dispatching to (1) the early-exit handling, (2) the
  // middle block on the countable exit, or (3) back to the header.
  auto *LatchExitingBranch = cast<VPInstruction>(Val: LatchVPBB->getTerminator());
  assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
         "Unexpected terminator");
  auto *IsLatchExitTaken =
      Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: LatchExitingBranch->getOperand(N: 0),
                         B: LatchExitingBranch->getOperand(N: 1));

  DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
  LatchExitingBranch->eraseFromParent();
  Builder.setInsertPoint(LatchVPBB);
  Builder.createNaryOp(Opcode: VPInstruction::BranchOnTwoConds,
                       Operands: {IsAnyExitTaken, IsLatchExitTaken}, DL: LatchDL);
  LatchVPBB->clearSuccessors();
  LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
  DispatchVPBB->setPredecessors({LatchVPBB});
}
4385
/// This function tries to convert extended in-loop reductions to
/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
/// valid. The created recipe must be decomposed to its constituent
/// recipes before execution.
static VPExpressionRecipe *
tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
                                     VFRange &Range) {
  Type *RedTy = Ctx.Types.inferScalarType(V: Red);
  VPValue *VecOp = Red->getVecOp();

  // For partial reductions, the decision has already been made at the point of
  // transforming reductions -> partial reductions for a given plan, based on
  // the cost-model.
  if (Red->isPartialReduction())
    return new VPExpressionRecipe(cast<VPWidenCastRecipe>(Val: VecOp), Red);

  // Clamp the range if using extended-reduction is profitable. The predicate
  // compares the fused extended-reduction cost against the cost of the
  // separate extend + reduction for each VF in the range.
  auto IsExtendedRedValidAndClampRange =
      [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
    return LoopVectorizationPlanner::getDecisionAndClampRange(
        Predicate: [&](ElementCount VF) {
          auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

          InstructionCost ExtRedCost = InstructionCost::getInvalid();
          InstructionCost ExtCost =
              cast<VPWidenCastRecipe>(Val: VecOp)->computeCost(VF, Ctx);
          InstructionCost RedCost = Red->computeCost(VF, Ctx);

          // TTI::getExtendedReductionCost for in-loop reductions
          // only supports integer types.
          if (RedTy->isFloatingPointTy())
            return false;
          ExtRedCost = Ctx.TTI.getExtendedReductionCost(
              Opcode, IsUnsigned: ExtOpc == Instruction::CastOps::ZExt, ResTy: RedTy, Ty: SrcVecTy,
              FMF: Red->getFastMathFlags(), CostKind);
          return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
        },
        Range);
  };

  VPValue *A;
  // Match reduce(ext)).
  if (match(V: VecOp, P: m_Isa<VPWidenCastRecipe>(P: m_CombineOr(
                        L: m_ZExtOrSExt(Op0: m_VPValue(V&: A)), R: m_FPExt(Op0: m_VPValue(V&: A))))) &&
      IsExtendedRedValidAndClampRange(
          RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind()),
          cast<VPWidenCastRecipe>(Val: VecOp)->getOpcode(),
          Ctx.Types.inferScalarType(V: A)))
    return new VPExpressionRecipe(cast<VPWidenCastRecipe>(Val: VecOp), Red);

  return nullptr;
}
4439
/// This function tries to convert extended in-loop reductions to
/// VPExpressionRecipe and clamp the \p Range if it is beneficial
/// and valid. The created VPExpressionRecipe must be decomposed to its
/// constituent recipes before execution. Patterns of the
/// VPExpressionRecipe:
/// reduce.add(mul(...)),
/// reduce.add(mul(ext(A), ext(B))),
/// reduce.add(ext(mul(ext(A), ext(B)))).
/// reduce.fadd(fmul(ext(A), ext(B)))
static VPExpressionRecipe *
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
                                          VPCostContext &Ctx, VFRange &Range) {
  // Only add-based reductions (add, sub, fadd) can absorb a multiply.
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind());
  if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
      Opcode != Instruction::FAdd)
    return nullptr;

  Type *RedTy = Ctx.Types.inferScalarType(V: Red);

  // Clamp the range if using multiply-accumulate-reduction is profitable.
  auto IsMulAccValidAndClampRange =
      [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
          VPWidenCastRecipe *OuterExt) -> bool {
    return LoopVectorizationPlanner::getDecisionAndClampRange(
        Predicate: [&](ElementCount VF) {
          // For partial reductions, the decision has already been made at the
          // point of transforming reductions -> partial reductions for a given
          // plan, based on the cost-model.
          if (Red->isPartialReduction())
            return true;

          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
          Type *SrcTy =
              Ext0 ? Ctx.Types.inferScalarType(V: Ext0->getOperand(N: 0)) : RedTy;
          InstructionCost MulAccCost;

          // Only partial reductions support mixed or floating-point extends at
          // the moment.
          if (Ext0 && Ext1 &&
              (Ext0->getOpcode() != Ext1->getOpcode() ||
               Ext0->getOpcode() == Instruction::CastOps::FPExt))
            return false;

          bool IsZExt =
              !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
          auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
          MulAccCost = Ctx.TTI.getMulAccReductionCost(IsUnsigned: IsZExt, RedOpcode: Opcode, ResTy: RedTy,
                                                      Ty: SrcVecTy, CostKind);

          InstructionCost MulCost = Mul->computeCost(VF, Ctx);
          InstructionCost RedCost = Red->computeCost(VF, Ctx);
          InstructionCost ExtCost = 0;
          if (Ext0)
            ExtCost += Ext0->computeCost(VF, Ctx);
          if (Ext1)
            ExtCost += Ext1->computeCost(VF, Ctx);
          if (OuterExt)
            ExtCost += OuterExt->computeCost(VF, Ctx);

          // Profitable only when the fused cost beats the sum of the parts.
          return MulAccCost.isValid() &&
                 MulAccCost < ExtCost + MulCost + RedCost;
        },
        Range);
  };

  VPValue *VecOp = Red->getVecOp();
  VPRecipeBase *Sub = nullptr;
  VPValue *A, *B;
  VPValue *Tmp = nullptr;

  // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
  if (match(V: VecOp, P: m_FMul(Op0: m_FPExt(Op0: m_VPValue()), Op1: m_FPExt(Op0: m_VPValue())))) {
    assert(Opcode == Instruction::FAdd &&
           "MulAccumulateReduction from an FMul must accumulate into an FAdd "
           "instruction");
    auto *FMul = dyn_cast<VPWidenRecipe>(Val: VecOp);
    if (!FMul)
      return nullptr;

    auto *RecipeA = dyn_cast<VPWidenCastRecipe>(Val: FMul->getOperand(N: 0));
    auto *RecipeB = dyn_cast<VPWidenCastRecipe>(Val: FMul->getOperand(N: 1));

    if (RecipeA && RecipeB &&
        IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
      return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
    }
  }
  // Only integer patterns are handled from here on.
  if (RedTy->isFloatingPointTy())
    return nullptr;

  // Sub reductions could have a sub between the add reduction and vec op.
  if (match(V: VecOp, P: m_Sub(Op0: m_ZeroInt(), Op1: m_VPValue(V&: Tmp)))) {
    Sub = VecOp->getDefiningRecipe();
    VecOp = Tmp;
  }

  // If ValB is a constant and can be safely extended, truncate it to the same
  // type as ExtA's operand, then extend it to the same type as ExtA. This
  // creates two uniform extends that can more easily be matched by the rest of
  // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
  // replaced with the new extend of the constant.
  auto ExtendAndReplaceConstantOp = [&Ctx, &Red](VPWidenCastRecipe *ExtA,
                                                 VPWidenCastRecipe *&ExtB,
                                                 VPValue *&ValB,
                                                 VPWidenRecipe *Mul) {
    // Only applies when exactly one operand is extended and the other is a
    // plain IR constant.
    if (!ExtA || ExtB || !isa<VPIRValue>(Val: ValB) || Red->isPartialReduction())
      return;
    Type *NarrowTy = Ctx.Types.inferScalarType(V: ExtA->getOperand(N: 0));
    Instruction::CastOps ExtOpc = ExtA->getOpcode();
    const APInt *Const;
    if (!match(V: ValB, P: m_APInt(C&: Const)) ||
        !llvm::canConstantBeExtended(
            C: Const, NarrowType: NarrowTy, ExtKind: TTI::getPartialReductionExtendKind(CastOpc: ExtOpc)))
      return;
    // The truncate ensures that the type of each extended operand is the
    // same, and it's been proven that the constant can be extended from
    // NarrowTy safely. Necessary since ExtA's extended operand would be
    // e.g. an i8, while the const will likely be an i32. This will be
    // elided by later optimisations.
    VPBuilder Builder(Mul);
    auto *Trunc =
        Builder.createWidenCast(Opcode: Instruction::CastOps::Trunc, Op: ValB, ResultTy: NarrowTy);
    Type *WideTy = Ctx.Types.inferScalarType(V: ExtA);
    ValB = ExtB = Builder.createWidenCast(Opcode: ExtOpc, Op: Trunc, ResultTy: WideTy);
    Mul->setOperand(I: 1, New: ExtB);
  };

  // Try to match reduce.add(mul(...)).
  if (match(V: VecOp, P: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B)))) {
    auto *RecipeA = dyn_cast_if_present<VPWidenCastRecipe>(Val: A);
    auto *RecipeB = dyn_cast_if_present<VPWidenCastRecipe>(Val: B);
    auto *Mul = cast<VPWidenRecipe>(Val: VecOp);

    // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
    ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);

    // Match reduce.add/sub(mul(ext, ext)).
    if (RecipeA && RecipeB && match(V: RecipeA, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
        match(V: RecipeB, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
        IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
      if (Sub)
        return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
                                      cast<VPWidenRecipe>(Val: Sub), Red);
      return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
    }
    // TODO: Add an expression type for this variant with a negated mul
    if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
      return new VPExpressionRecipe(Mul, Red);
  }
  // TODO: Add an expression type for negated versions of other expression
  // variants.
  if (Sub)
    return nullptr;

  // Match reduce.add(ext(mul(A, B))).
  if (!Red->isPartialReduction() &&
      match(V: VecOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))))) {
    auto *Ext = cast<VPWidenCastRecipe>(Val: VecOp);
    auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: 0));
    auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(Val: A);
    auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(Val: B);

    // reduce.add(ext(mul(ext, const)))
    // -> reduce.add(ext(mul(ext, ext(const))))
    ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);

    // reduce.add(ext(mul(ext(A), ext(B))))
    // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
    // The inner extends must either have the same opcode as the outer extend or
    // be the same, in which case the multiply can never result in a negative
    // value and the outer extend can be folded away by doing wider
    // extends for the operands of the mul.
    if (Ext0 && Ext1 &&
        (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
        Ext0->getOpcode() == Ext1->getOpcode() &&
        IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
      // Create wider extends feeding the mul directly, making the outer
      // extend redundant.
      auto *NewExt0 = new VPWidenCastRecipe(
          Ext0->getOpcode(), Ext0->getOperand(N: 0), Ext->getResultType(), nullptr,
          *Ext0, *Ext0, Ext0->getDebugLoc());
      NewExt0->insertBefore(InsertPos: Ext0);

      VPWidenCastRecipe *NewExt1 = NewExt0;
      if (Ext0 != Ext1) {
        NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(N: 0),
                                        Ext->getResultType(), nullptr, *Ext1,
                                        *Ext1, Ext1->getDebugLoc());
        NewExt1->insertBefore(InsertPos: Ext1);
      }
      Mul->setOperand(I: 0, New: NewExt0);
      Mul->setOperand(I: 1, New: NewExt1);
      Red->setOperand(I: 1, New: Mul);
      return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
    }
  }
  return nullptr;
}
4636
4637/// This function tries to create abstract recipes from the reduction recipe for
4638/// following optimizations and cost estimation.
4639static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
4640 VPCostContext &Ctx,
4641 VFRange &Range) {
4642 VPExpressionRecipe *AbstractR = nullptr;
4643 auto IP = std::next(x: Red->getIterator());
4644 auto *VPBB = Red->getParent();
4645 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4646 AbstractR = MulAcc;
4647 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4648 AbstractR = ExtRed;
4649 // Cannot create abstract inloop reduction recipes.
4650 if (!AbstractR)
4651 return;
4652
4653 AbstractR->insertBefore(BB&: *VPBB, IP);
4654 Red->replaceAllUsesWith(New: AbstractR);
4655}
4656
4657void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
4658 VFRange &Range) {
4659 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4660 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
4661 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
4662 if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R))
4663 tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
4664 }
4665 }
4666}
4667
void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
  // Broadcasts are only needed when vector values can be generated.
  if (Plan.hasScalarVFOnly())
    return;

#ifndef NDEBUG
  // Dominator tree is only used by the assertion in the loop below.
  VPDominatorTree VPDT(Plan);
#endif

  // Candidates: the backedge-taken count, all live-ins and all values defined
  // by recipes in the entry block.
  SmallVector<VPValue *> VPValues;
  if (VPValue *BTC = Plan.getBackedgeTakenCount())
    VPValues.push_back(Elt: BTC);
  append_range(C&: VPValues, R: Plan.getLiveIns());
  for (VPRecipeBase &R : *Plan.getEntry())
    append_range(C&: VPValues, R: R.definedValues());

  auto *VectorPreheader = Plan.getVectorPreheader();
  for (VPValue *VPV : VPValues) {
    // Skip values that are only used as scalars and constant live-ins; those
    // need no explicit broadcast.
    if (vputils::onlyScalarValuesUsed(Def: VPV) ||
        (isa<VPIRValue>(Val: VPV) && isa<Constant>(Val: VPV->getLiveInIRValue())))
      continue;

    // Add explicit broadcast at the insert point that dominates all users.
    VPBasicBlock *HoistBlock = VectorPreheader;
    VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
    for (VPUser *User : VPV->users()) {
      if (User->usesScalars(Op: VPV))
        continue;
      // If any vector user lives in the preheader itself, place the broadcast
      // at the start of the preheader so it precedes that user.
      if (cast<VPRecipeBase>(Val: User)->getParent() == VectorPreheader)
        HoistPoint = HoistBlock->begin();
      else
        assert(VPDT.dominates(VectorPreheader,
                              cast<VPRecipeBase>(User)->getParent()) &&
               "All users must be in the vector preheader or dominated by it");
    }

    VPBuilder Builder(cast<VPBasicBlock>(Val: HoistBlock), HoistPoint);
    auto *Broadcast = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: {VPV});
    // Replace only vector uses; scalar uses keep the original value.
    VPV->replaceUsesWithIf(New: Broadcast,
                           ShouldReplace: [VPV, Broadcast](VPUser &U, unsigned Idx) {
                             return Broadcast != &U && !U.usesScalars(Op: VPV);
                           });
  }
}
4711
void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();

  // Collect candidate loads with invariant addresses and noalias scopes
  // metadata and memory-writing recipes with noalias metadata.
  SmallVector<std::pair<VPRecipeBase *, MemoryLocation>> CandidateLoads;
  SmallVector<MemoryLocation> Stores;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      // Only handle single-scalar replicated loads with invariant addresses.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
        if (RepR->isPredicated() || !RepR->isSingleScalar() ||
            RepR->getOpcode() != Instruction::Load)
          continue;

        VPValue *Addr = RepR->getOperand(N: 0);
        if (Addr->isDefinedOutsideLoopRegions()) {
          MemoryLocation Loc = *vputils::getMemoryLocation(R: *RepR);
          // Without noalias scope metadata the load cannot be disambiguated
          // from the stores below; skip it.
          if (!Loc.AATags.Scope)
            continue;
          CandidateLoads.push_back(Elt: {RepR, Loc});
        }
      }
      if (R.mayWriteToMemory()) {
        auto Loc = vputils::getMemoryLocation(R);
        // Bail out of the entire transform if any write lacks the metadata
        // required for the scoped-noalias checks below, since no load could
        // then be proven safe to hoist.
        if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
          return;
        Stores.push_back(Elt: *Loc);
      }
    }
  }

  VPBasicBlock *Preheader = Plan.getVectorPreheader();
  for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
    // Hoist the load to the preheader if it doesn't alias with any stores
    // according to the noalias metadata. Other loads should have been hoisted
    // by other passes
    const AAMDNodes &LoadAA = LoadLoc.AATags;
    if (all_of(Range&: Stores, P: [&](const MemoryLocation &StoreLoc) {
          return !ScopedNoAliasAAResult::mayAliasInScopes(
              Scopes: LoadAA.Scope, NoAlias: StoreLoc.AATags.NoAlias);
        })) {
      LoadRecipe->moveBefore(BB&: *Preheader, I: Preheader->getFirstNonPhi());
    }
  }
}
4759
4760// Collect common metadata from a group of replicate recipes by intersecting
4761// metadata from all recipes in the group.
4762static VPIRMetadata getCommonMetadata(ArrayRef<VPReplicateRecipe *> Recipes) {
4763 VPIRMetadata CommonMetadata = *Recipes.front();
4764 for (VPReplicateRecipe *Recipe : drop_begin(RangeOrContainer&: Recipes))
4765 CommonMetadata.intersect(MD: *Recipe);
4766 return CommonMetadata;
4767}
4768
/// Collect groups of predicated replicating memory operations (loads or
/// stores, selected by \p Opcode) that access the same address with the same
/// value type. Only groups where at least one pair of members has
/// complementary masks (M1 == NOT(M2)) are returned.
template <unsigned Opcode>
static SmallVector<SmallVector<VPReplicateRecipe *, 4>>
collectComplementaryPredicatedMemOps(VPlan &Plan,
                                     PredicatedScalarEvolution &PSE,
                                     const Loop *L) {
  static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
                "Only Load and Store opcodes supported");
  constexpr bool IsLoad = (Opcode == Instruction::Load);
  VPTypeAnalysis TypeInfo(Plan);

  // For each address, collect operations with the same or complementary masks.
  SmallVector<SmallVector<VPReplicateRecipe *, 4>> AllGroups;
  // The value type is the load's result, or the store's stored operand.
  auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
    return TypeInfo.inferScalarType(V: IsLoad ? Recipe : Recipe->getOperand(N: 0));
  };
  auto Groups = collectGroupedReplicateMemOps<Opcode>(
      Plan, PSE, L,
      [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
  for (auto Recipes : Groups) {
    if (Recipes.size() < 2)
      continue;

    // Collect groups with the same or complementary masks. Entries are
    // nulled out in-place once assigned to a group, so each recipe ends up
    // in at most one group.
    for (VPReplicateRecipe *&RecipeI : Recipes) {
      if (!RecipeI)
        continue;

      VPValue *MaskI = RecipeI->getMask();
      Type *TypeI = GetLoadStoreValueType(RecipeI);
      SmallVector<VPReplicateRecipe *, 4> Group;
      Group.push_back(Elt: RecipeI);
      RecipeI = nullptr;

      // Find all operations with the same or complementary masks.
      bool HasComplementaryMask = false;
      for (VPReplicateRecipe *&RecipeJ : Recipes) {
        if (!RecipeJ)
          continue;

        VPValue *MaskJ = RecipeJ->getMask();
        Type *TypeJ = GetLoadStoreValueType(RecipeJ);
        if (TypeI == TypeJ) {
          // Check if any operation in the group has a complementary mask with
          // another, that is M1 == NOT(M2) or M2 == NOT(M1).
          HasComplementaryMask |= match(V: MaskI, P: m_Not(Op0: m_Specific(VPV: MaskJ))) ||
                                  match(V: MaskJ, P: m_Not(Op0: m_Specific(VPV: MaskI)));
          Group.push_back(Elt: RecipeJ);
          RecipeJ = nullptr;
        }
      }

      // Only keep groups that contain at least one complementary pair.
      if (HasComplementaryMask) {
        assert(Group.size() >= 2 && "must have at least 2 entries");
        AllGroups.push_back(Elt: std::move(Group));
      }
    }
  }

  return AllGroups;
}
4829
4830// Find the recipe with minimum alignment in the group.
4831template <typename InstType>
4832static VPReplicateRecipe *
4833findRecipeWithMinAlign(ArrayRef<VPReplicateRecipe *> Group) {
4834 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4835 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4836 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4837 });
4838}
4839
void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan,
                                           PredicatedScalarEvolution &PSE,
                                           const Loop *L) {
  auto Groups =
      collectComplementaryPredicatedMemOps<Instruction::Load>(Plan, PSE, L);
  if (Groups.empty())
    return;

  // Process each group of loads.
  for (auto &Group : Groups) {
    // Try to use the earliest (most dominating) load to replace all others.
    VPReplicateRecipe *EarliestLoad = Group[0];
    VPBasicBlock *FirstBB = EarliestLoad->getParent();
    VPBasicBlock *LastBB = Group.back()->getParent();

    // Check that the load doesn't alias with stores between first and last.
    auto LoadLoc = vputils::getMemoryLocation(R: *EarliestLoad);
    if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(MemLoc: *LoadLoc, FirstBB, LastBB))
      continue;

    // Collect common metadata from all loads in the group.
    VPIRMetadata CommonMetadata = getCommonMetadata(Recipes: Group);

    // Find the load with minimum alignment to use; the smallest alignment is
    // conservatively correct for every member of the group.
    auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);

    bool IsSingleScalar = EarliestLoad->isSingleScalar();
    assert(all_of(Group,
                  [IsSingleScalar](VPReplicateRecipe *R) {
                    return R->isSingleScalar() == IsSingleScalar;
                  }) &&
           "all members in group must agree on IsSingleScalar");

    // Create an unpredicated version of the earliest load with common
    // metadata.
    auto *UnpredicatedLoad = new VPReplicateRecipe(
        LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(N: 0)},
        IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);

    UnpredicatedLoad->insertBefore(InsertPos: EarliestLoad);

    // Replace all loads in the group with the unpredicated load.
    for (VPReplicateRecipe *Load : Group) {
      Load->replaceAllUsesWith(New: UnpredicatedLoad);
      Load->eraseFromParent();
    }
  }
}
4888
4889static bool
4890canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,
4891 PredicatedScalarEvolution &PSE, const Loop &L,
4892 VPTypeAnalysis &TypeInfo) {
4893 auto StoreLoc = vputils::getMemoryLocation(R: *StoresToSink.front());
4894 if (!StoreLoc || !StoreLoc->AATags.Scope)
4895 return false;
4896
4897 // When sinking a group of stores, all members of the group alias each other.
4898 // Skip them during the alias checks.
4899 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4900 StoresToSink.end());
4901
4902 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4903 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4904 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4905 return canHoistOrSinkWithNoAliasCheck(MemLoc: *StoreLoc, FirstBB, LastBB, SinkInfo);
4906}
4907
void VPlanTransforms::sinkPredicatedStores(VPlan &Plan,
                                           PredicatedScalarEvolution &PSE,
                                           const Loop *L) {
  auto Groups =
      collectComplementaryPredicatedMemOps<Instruction::Store>(Plan, PSE, L);
  if (Groups.empty())
    return;

  VPTypeAnalysis TypeInfo(Plan);

  for (auto &Group : Groups) {
    if (!canSinkStoreWithNoAliasCheck(StoresToSink: Group, PSE, L: *L, TypeInfo))
      continue;

    // Use the last (most dominated) store's location for the unconditional
    // store.
    VPReplicateRecipe *LastStore = Group.back();
    VPBasicBlock *InsertBB = LastStore->getParent();

    // Collect common alias metadata from all stores in the group.
    VPIRMetadata CommonMetadata = getCommonMetadata(Recipes: Group);

    // Build select chain for stored values; later members of the group take
    // precedence when their masks are true.
    VPValue *SelectedValue = Group[0]->getOperand(N: 0);
    VPBuilder Builder(InsertBB, LastStore->getIterator());

    bool IsSingleScalar = Group[0]->isSingleScalar();
    for (unsigned I = 1; I < Group.size(); ++I) {
      assert(IsSingleScalar == Group[I]->isSingleScalar() &&
             "all members in group must agree on IsSingleScalar");
      VPValue *Mask = Group[I]->getMask();
      VPValue *Value = Group[I]->getOperand(N: 0);
      SelectedValue = Builder.createSelect(Cond: Mask, TrueVal: Value, FalseVal: SelectedValue,
                                           DL: Group[I]->getDebugLoc());
    }

    // Find the store with minimum alignment to use; conservatively correct
    // for every member of the group.
    auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);

    // Create unconditional store with selected value and common metadata.
    auto *UnpredicatedStore = new VPReplicateRecipe(
        StoreWithMinAlign->getUnderlyingInstr(),
        {SelectedValue, LastStore->getOperand(N: 1)}, IsSingleScalar,
        /*Mask=*/nullptr, *LastStore, CommonMetadata);
    UnpredicatedStore->insertBefore(BB&: *InsertBB, IP: LastStore->getIterator());

    // Remove all predicated stores from the group.
    for (VPReplicateRecipe *Store : Group)
      Store->eraseFromParent();
  }
}
4959
4960void VPlanTransforms::materializeConstantVectorTripCount(
4961 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4962 PredicatedScalarEvolution &PSE) {
4963 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4964 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4965
4966 VPValue *TC = Plan.getTripCount();
4967 if (TC->getNumUsers() == 0)
4968 return;
4969
4970 // Skip cases for which the trip count may be non-trivial to materialize.
4971 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4972 // tail is required.
4973 if (!Plan.hasScalarTail() ||
4974 Plan.getMiddleBlock()->getSingleSuccessor() ==
4975 Plan.getScalarPreheader() ||
4976 !isa<VPIRValue>(Val: TC))
4977 return;
4978
4979 // Materialize vector trip counts for constants early if it can simply
4980 // be computed as (Original TC / VF * UF) * VF * UF.
4981 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4982 // tail-folded loops.
4983 ScalarEvolution &SE = *PSE.getSE();
4984 auto *TCScev = SE.getSCEV(V: TC->getLiveInIRValue());
4985 if (!isa<SCEVConstant>(Val: TCScev))
4986 return;
4987 const SCEV *VFxUF = SE.getElementCount(Ty: TCScev->getType(), EC: BestVF * BestUF);
4988 auto VecTCScev = SE.getMulExpr(LHS: SE.getUDivExpr(LHS: TCScev, RHS: VFxUF), RHS: VFxUF);
4989 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(Val: VecTCScev))
4990 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4991}
4992
4993void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
4994 VPBasicBlock *VectorPH) {
4995 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
4996 if (BTC->getNumUsers() == 0)
4997 return;
4998
4999 VPBuilder Builder(VectorPH, VectorPH->begin());
5000 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(V: Plan.getTripCount());
5001 auto *TCMO =
5002 Builder.createSub(LHS: Plan.getTripCount(), RHS: Plan.getConstantInt(Ty: TCTy, Val: 1),
5003 DL: DebugLoc::getCompilerGenerated(), Name: "trip.count.minus.1");
5004 BTC->replaceAllUsesWith(New: TCMO);
5005}
5006
void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
  // Packs/unpacks are only needed when vector values can be generated.
  if (Plan.hasScalarVFOnly())
    return;

  VPTypeAnalysis TypeInfo(Plan);
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      Range: vp_depth_first_shallow(G: Plan.getEntry()));
  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      Range: vp_depth_first_shallow(G: LoopRegion->getEntry()));
  // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
  // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
  // regions. Those are not materialized explicitly yet. Those vector users are
  // still handled in VPReplicateRegion::execute(), via shouldPack().
  // TODO: materialize build vectors for replicating recipes in replicating
  // regions.
  for (VPBasicBlock *VPBB :
       concat<VPBasicBlock *>(Ranges&: VPBBsOutsideLoopRegion, Ranges&: VPBBsInsideLoopRegion)) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      if (!isa<VPScalarIVStepsRecipe, VPReplicateRecipe, VPInstruction>(Val: &R))
        continue;
      auto *DefR = cast<VPSingleDefRecipe>(Val: &R);
      // A user needs the packed vector if it uses DefR as a vector or lives in
      // a different region than the loop region.
      auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
        VPRegionBlock *ParentRegion = cast<VPRecipeBase>(Val: U)->getRegion();
        return !U->usesScalars(Op: DefR) || ParentRegion != LoopRegion;
      };
      // Skip recipes that only produce a single scalar, only have their first
      // lane used, or have no user needing the packed form.
      if ((isa<VPReplicateRecipe>(Val: DefR) &&
           cast<VPReplicateRecipe>(Val: DefR)->isSingleScalar()) ||
          (isa<VPInstruction>(Val: DefR) &&
           (vputils::onlyFirstLaneUsed(Def: DefR) ||
            !cast<VPInstruction>(Val: DefR)->doesGeneratePerAllLanes())) ||
          none_of(Range: DefR->users(), P: UsesVectorOrInsideReplicateRegion))
        continue;

      // Struct results need BuildStructVector; everything else BuildVector.
      Type *ScalarTy = TypeInfo.inferScalarType(V: DefR);
      unsigned Opcode = ScalarTy->isStructTy()
                            ? VPInstruction::BuildStructVector
                            : VPInstruction::BuildVector;
      auto *BuildVector = new VPInstruction(Opcode, {DefR});
      BuildVector->insertAfter(InsertPos: DefR);

      // Route the packed value to all users that need it, leaving scalar
      // in-region users on the original definition.
      DefR->replaceUsesWithIf(
          New: BuildVector, ShouldReplace: [BuildVector, &UsesVectorOrInsideReplicateRegion](
                              VPUser &U, unsigned) {
            return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
          });
    }
  }

  // Create explicit VPInstructions to convert vectors to scalars. The current
  // implementation is conservative - it may miss some cases that may or may not
  // be vector values. TODO: introduce Unpacks speculatively - remove them later
  // if they are known to operate on scalar values.
  for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe,
              VPDerivedIVRecipe, VPCanonicalIVPHIRecipe>(Val: &R))
        continue;
      for (VPValue *Def : R.definedValues()) {
        // Skip recipes that are single-scalar or only have their first lane
        // used.
        // TODO: The Defs skipped here may or may not be vector values.
        // Introduce Unpacks, and remove them later, if they are guaranteed to
        // produce scalar values.
        if (vputils::isSingleScalar(VPV: Def) || vputils::onlyFirstLaneUsed(Def))
          continue;

        // At the moment, we create unpacks only for scalar users outside
        // replicate regions. Recipes inside replicate regions still extract the
        // required lanes implicitly.
        // TODO: Remove once replicate regions are unrolled completely.
        auto IsCandidateUnpackUser = [Def](VPUser *U) {
          VPRegionBlock *ParentRegion = cast<VPRecipeBase>(Val: U)->getRegion();
          return U->usesScalars(Op: Def) &&
                 (!ParentRegion || !ParentRegion->isReplicator());
        };
        if (none_of(Range: Def->users(), P: IsCandidateUnpackUser))
          continue;

        // Insert the Unpack after the definition (after the phi section if
        // the defining recipe is a phi).
        auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
        if (R.isPhi())
          Unpack->insertBefore(BB&: *VPBB, IP: VPBB->getFirstNonPhi());
        else
          Unpack->insertAfter(InsertPos: &R);
        Def->replaceUsesWithIf(New: Unpack,
                               ShouldReplace: [&IsCandidateUnpackUser](VPUser &U, unsigned) {
                                 return IsCandidateUnpackUser(&U);
                               });
      }
    }
  }
}
5099
5100void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
5101 VPBasicBlock *VectorPHVPBB,
5102 bool TailByMasking,
5103 bool RequiresScalarEpilogue,
5104 VPValue *Step) {
5105 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5106 // There's nothing to do if there are no users of the vector trip count or its
5107 // IR value has already been set.
5108 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5109 return;
5110
5111 VPValue *TC = Plan.getTripCount();
5112 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(V: TC);
5113 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5114 if (auto *StepR = Step->getDefiningRecipe()) {
5115 assert(StepR->getParent() == VectorPHVPBB &&
5116 "Step must be defined in VectorPHVPBB");
5117 // Insert after Step's definition to maintain valid def-use ordering.
5118 InsertPt = std::next(x: StepR->getIterator());
5119 }
5120 VPBuilder Builder(VectorPHVPBB, InsertPt);
5121
5122 // If the tail is to be folded by masking, round the number of iterations N
5123 // up to a multiple of Step instead of rounding down. This is done by first
5124 // adding Step-1 and then rounding down. Note that it's ok if this addition
5125 // overflows: the vector induction variable will eventually wrap to zero given
5126 // that it starts at zero and its Step is a power of two; the loop will then
5127 // exit, with the last early-exit vector comparison also producing all-true.
5128 if (TailByMasking) {
5129 TC = Builder.createAdd(
5130 LHS: TC, RHS: Builder.createSub(LHS: Step, RHS: Plan.getConstantInt(Ty: TCTy, Val: 1)),
5131 DL: DebugLoc::getCompilerGenerated(), Name: "n.rnd.up");
5132 }
5133
5134 // Now we need to generate the expression for the part of the loop that the
5135 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5136 // iterations are not required for correctness, or N - Step, otherwise. Step
5137 // is equal to the vectorization factor (number of SIMD elements) times the
5138 // unroll factor (number of SIMD instructions).
5139 VPValue *R =
5140 Builder.createNaryOp(Opcode: Instruction::URem, Operands: {TC, Step},
5141 DL: DebugLoc::getCompilerGenerated(), Name: "n.mod.vf");
5142
5143 // There are cases where we *must* run at least one iteration in the remainder
5144 // loop. See the cost model for when this can happen. If the step evenly
5145 // divides the trip count, we set the remainder to be equal to the step. If
5146 // the step does not evenly divide the trip count, no adjustment is necessary
5147 // since there will already be scalar iterations. Note that the minimum
5148 // iterations check ensures that N >= Step.
5149 if (RequiresScalarEpilogue) {
5150 assert(!TailByMasking &&
5151 "requiring scalar epilogue is not supported with fail folding");
5152 VPValue *IsZero =
5153 Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: R, B: Plan.getZero(Ty: TCTy));
5154 R = Builder.createSelect(Cond: IsZero, TrueVal: Step, FalseVal: R);
5155 }
5156
5157 VPValue *Res =
5158 Builder.createSub(LHS: TC, RHS: R, DL: DebugLoc::getCompilerGenerated(), Name: "n.vec");
5159 VectorTC.replaceAllUsesWith(New: Res);
5160}
5161
void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
                                         ElementCount VFEC) {
  // If VF and VFxUF have already been materialized (no remaining users),
  // there's nothing more to do.
  if (Plan.getVF().isMaterialized()) {
    assert(Plan.getVFxUF().isMaterialized() &&
           "VF and VFxUF must be materialized together");
    return;
  }

  VPBuilder Builder(VectorPH, VectorPH->begin());
  Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(V: Plan.getTripCount());
  VPValue &VF = Plan.getVF();
  VPValue &VFxUF = Plan.getVFxUF();
  // If there are no users of the runtime VF, compute VFxUF by constant folding
  // the multiplication of VF and UF.
  if (VF.getNumUsers() == 0) {
    VPValue *RuntimeVFxUF =
        Builder.createElementCount(Ty: TCTy, EC: VFEC * Plan.getConcreteUF());
    VFxUF.replaceAllUsesWith(New: RuntimeVFxUF);
    return;
  }

  // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
  // vscale) * UF.
  VPValue *RuntimeVF = Builder.createElementCount(Ty: TCTy, EC: VFEC);
  // Vector users of VF get an explicit broadcast of the scalar runtime VF;
  // scalar users keep the scalar value (replaced just below).
  if (!vputils::onlyScalarValuesUsed(Def: &VF)) {
    VPValue *BC = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: RuntimeVF);
    VF.replaceUsesWithIf(
        New: BC, ShouldReplace: [&VF](VPUser &U, unsigned) { return !U.usesScalars(Op: &VF); });
  }
  VF.replaceAllUsesWith(New: RuntimeVF);

  // Scale the runtime VF by the unroll factor to obtain VFxUF.
  VPValue *MulByUF = Builder.createOverflowingOp(
      Opcode: Instruction::Mul,
      Operands: {RuntimeVF, Plan.getConstantInt(Ty: TCTy, Val: Plan.getConcreteUF())},
      WrapFlags: {true, false});
  VFxUF.replaceAllUsesWith(New: MulByUF);
}
5201
DenseMap<const SCEV *, Value *>
VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
  SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);

  auto *Entry = cast<VPIRBasicBlock>(Val: Plan.getEntry());
  BasicBlock *EntryBB = Entry->getIRBasicBlock();
  DenseMap<const SCEV *, Value *> ExpandedSCEVs;
  // Expand each VPExpandSCEVRecipe at the start of the entry block into IR
  // before the entry block's terminator, and replace the recipe with the
  // resulting live-in. The recipes are expected to be grouped at the start,
  // so stop at the first non-expand recipe.
  for (VPRecipeBase &R : make_early_inc_range(Range&: *Entry)) {
    if (isa<VPIRInstruction, VPIRPhi>(Val: &R))
      continue;
    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
    if (!ExpSCEV)
      break;
    const SCEV *Expr = ExpSCEV->getSCEV();
    Value *Res =
        Expander.expandCodeFor(SH: Expr, Ty: Expr->getType(), I: EntryBB->getTerminator());
    ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
    VPValue *Exp = Plan.getOrAddLiveIn(V: Res);
    ExpSCEV->replaceAllUsesWith(New: Exp);
    // Keep the plan's trip count pointing at the expanded value.
    if (Plan.getTripCount() == ExpSCEV)
      Plan.resetTripCount(NewTripCount: Exp);
    ExpSCEV->eraseFromParent();
  }
  assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&
         "VPExpandSCEVRecipes must be at the beginning of the entry block, "
         "before any VPIRInstructions");
  // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
  // to the VPIRBasicBlock.
  auto EI = Entry->begin();
  for (Instruction &I : drop_end(RangeOrContainer&: *EntryBB)) {
    // Advance past VPIRInstructions that already wrap this IR instruction.
    if (EI != Entry->end() && isa<VPIRInstruction>(Val: *EI) &&
        &cast<VPIRInstruction>(Val: &*EI)->getInstruction() == &I) {
      EI++;
      continue;
    }
    VPIRInstruction::create(I)->insertBefore(BB&: *Entry, IP: EI);
  }

  return ExpandedSCEVs;
}
5242
5243/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5244/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5245/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5246/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5247/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5248/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5249/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5250/// is defined at \p Idx of a load interleave group.
5251static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5252 VPValue *OpV, unsigned Idx, bool IsScalable) {
5253 VPValue *Member0Op = WideMember0->getOperand(N: OpIdx);
5254 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5255 if (!Member0OpR)
5256 return Member0Op == OpV;
5257 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Val: Member0OpR))
5258 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5259 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5260 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5261 Member0Op == OpV;
5262 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Val: Member0OpR))
5263 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(I: Idx) == OpV;
5264 return false;
5265}
5266
5267static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5268 SmallVector<VPValue *> Ops0;
5269 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Val: Ops[0]);
5270 if (!WideMember0)
5271 return false;
5272 for (VPValue *V : Ops) {
5273 if (!isa<VPWidenRecipe, VPWidenCastRecipe>(Val: V))
5274 return false;
5275 auto *R = cast<VPSingleDefRecipe>(Val: V);
5276 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(R: WideMember0))
5277 return false;
5278 }
5279
5280 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5281 SmallVector<VPValue *> OpsI;
5282 for (VPValue *Op : Ops)
5283 OpsI.push_back(Elt: Op->getDefiningRecipe()->getOperand(N: Idx));
5284
5285 if (canNarrowOps(Ops: OpsI, IsScalable))
5286 continue;
5287
5288 if (any_of(Range: enumerate(First&: OpsI), P: [WideMember0, Idx, IsScalable](const auto &P) {
5289 const auto &[OpIdx, OpV] = P;
5290 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5291 }))
5292 return false;
5293 }
5294
5295 return true;
5296}
5297
5298/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5299/// number of members both equal to VF. The interleave group must also access
5300/// the full vector width.
5301static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5302 VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
5303 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5304 if (!InterleaveR || InterleaveR->getMask())
5305 return std::nullopt;
5306
5307 Type *GroupElementTy = nullptr;
5308 if (InterleaveR->getStoredValues().empty()) {
5309 GroupElementTy = TypeInfo.inferScalarType(V: InterleaveR->getVPValue(I: 0));
5310 if (!all_of(Range: InterleaveR->definedValues(),
5311 P: [&TypeInfo, GroupElementTy](VPValue *Op) {
5312 return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
5313 }))
5314 return std::nullopt;
5315 } else {
5316 GroupElementTy =
5317 TypeInfo.inferScalarType(V: InterleaveR->getStoredValues()[0]);
5318 if (!all_of(Range: InterleaveR->getStoredValues(),
5319 P: [&TypeInfo, GroupElementTy](VPValue *Op) {
5320 return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
5321 }))
5322 return std::nullopt;
5323 }
5324
5325 auto IG = InterleaveR->getInterleaveGroup();
5326 if (IG->getFactor() != IG->getNumMembers())
5327 return std::nullopt;
5328
5329 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5330 TypeSize Size = TTI.getRegisterBitWidth(
5331 K: VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
5332 : TargetTransformInfo::RGK_ScalableVector);
5333 assert(Size.isScalable() == VF.isScalable() &&
5334 "if Size is scalable, VF must be scalable and vice versa");
5335 return Size.getKnownMinValue();
5336 };
5337
5338 for (ElementCount VF : VFs) {
5339 unsigned MinVal = VF.getKnownMinValue();
5340 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5341 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5342 return {VF};
5343 }
5344 return std::nullopt;
5345}
5346
5347/// Returns true if \p VPValue is a narrow VPValue.
5348static bool isAlreadyNarrow(VPValue *VPV) {
5349 if (isa<VPIRValue>(Val: VPV))
5350 return true;
5351 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: VPV);
5352 return RepR && RepR->isSingleScalar();
5353}
5354
// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
// a narrow variant. Recurses through the operands of wide ops, replaces load
// interleave groups with wide loads and wide loads with uniform scalar loads.
// Returns the (possibly new) narrow value to use in place of \p V, and
// records newly created narrow recipes in \p NarrowedOps.
static VPValue *
narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
  auto *R = V->getDefiningRecipe();
  // Live-ins and already-narrowed values can be used as-is.
  if (!R || NarrowedOps.contains(Ptr: V))
    return V;

  if (isAlreadyNarrow(VPV: V))
    return V;

  // Wide ops stay in place; narrow their operands recursively.
  if (isa<VPWidenRecipe, VPWidenCastRecipe>(Val: R)) {
    auto *WideMember0 = cast<VPSingleDefRecipe>(Val: R);
    for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
      WideMember0->setOperand(
          I: Idx,
          New: narrowInterleaveGroupOp(V: WideMember0->getOperand(N: Idx), NarrowedOps));
    return V;
  }

  if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(Val: R)) {
    // Narrow interleave group to wide load, as transformed VPlan will only
    // process one original iteration.
    auto *LI = cast<LoadInst>(Val: LoadGroup->getInterleaveGroup()->getInsertPos());
    auto *L = new VPWidenLoadRecipe(
        *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
        /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
    L->insertBefore(InsertPos: LoadGroup);
    NarrowedOps.insert(Ptr: L);
    return L;
  }

  // A single-scalar replicated load is already narrow; just record it.
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: R)) {
    assert(RepR->isSingleScalar() &&
           isa<LoadInst>(RepR->getUnderlyingInstr()) &&
           "must be a single scalar load");
    NarrowedOps.insert(Ptr: RepR);
    return RepR;
  }

  auto *WideLoad = cast<VPWidenLoadRecipe>(Val: R);
  // Use the scalar base address, looking through a vector-pointer recipe.
  VPValue *PtrOp = WideLoad->getAddr();
  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Val: PtrOp))
    PtrOp = VecPtr->getOperand(N: 0);
  // Narrow wide load to uniform scalar load, as transformed VPlan will only
  // process one original iteration.
  auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
                                  /*IsUniform*/ true,
                                  /*Mask*/ nullptr, {}, *WideLoad);
  N->insertBefore(InsertPos: WideLoad);
  NarrowedOps.insert(Ptr: N);
  return N;
}
5408
/// Try to narrow interleave groups in \p Plan: if, for a single VF, all
/// interleave groups are consecutive and saturate the full vector width, the
/// store groups are replaced by wide stores, their feeding operation trees are
/// narrowed, and the induction is adjusted so the transformed plan processes
/// one original iteration per step. If \p Plan had multiple VFs, it is split:
/// \p Plan keeps only the narrowed VF and the returned clone keeps the
/// remaining VFs; returns nullptr if nothing was narrowed.
std::unique_ptr<VPlan>
VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
                                        const TargetTransformInfo &TTI) {
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();

  if (!VectorLoop)
    return nullptr;

  // Only handle single-block loops for now.
  if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
    return nullptr;

  // Skip plans when we may not be able to properly narrow.
  VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
  if (!match(V: &Exiting->back(), P: m_BranchOnCount()))
    return nullptr;

  assert(match(&Exiting->back(),
               m_BranchOnCount(m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())),
                               m_Specific(&Plan.getVectorTripCount()))) &&
         "unexpected branch-on-count");

  // Collect all store interleave groups that can be narrowed, and determine
  // the single VF to optimize for while doing so.
  VPTypeAnalysis TypeInfo(Plan);
  SmallVector<VPInterleaveRecipe *> StoreGroups;
  std::optional<ElementCount> VFToOptimize;
  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
    if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
      continue;

    if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(Val: &R) &&
        vputils::onlyFirstLaneUsed(Def: cast<VPSingleDefRecipe>(Val: &R)))
      continue;

    // Bail out on recipes not supported at the moment:
    // * phi recipes other than the canonical induction
    // * recipes writing to memory except interleave groups
    // Only support plans with a canonical induction phi.
    if (R.isPhi())
      return nullptr;

    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R);
    if (R.mayWriteToMemory() && !InterleaveR)
      return nullptr;

    // All other ops are allowed, but we reject uses that cannot be converted
    // when checking all allowed consumers (store interleave groups) below.
    if (!InterleaveR)
      continue;

    // Try to find a single VF, where all interleave groups are consecutive and
    // saturate the full vector width. If we already have a candidate VF, check
    // if it is applicable for the current InterleaveR, otherwise look for a
    // suitable VF across the Plan's VFs.
    SmallVector<ElementCount> VFs =
        VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
                     : to_vector(Range: Plan.vectorFactors());
    std::optional<ElementCount> NarrowedVF =
        isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
    if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
      return nullptr;
    VFToOptimize = NarrowedVF;

    // Skip read interleave groups.
    if (InterleaveR->getStoredValues().empty())
      continue;

    // Narrow interleave groups, if all operands are already matching narrow
    // ops.
    auto *Member0 = InterleaveR->getStoredValues()[0];
    if (isAlreadyNarrow(VPV: Member0) &&
        all_of(Range: InterleaveR->getStoredValues(), P: equal_to(Arg&: Member0))) {
      StoreGroups.push_back(Elt: InterleaveR);
      continue;
    }

    // For now, we only support full interleave groups storing load interleave
    // groups.
    if (all_of(Range: enumerate(First: InterleaveR->getStoredValues()), P: [](auto Op) {
          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
          if (!DefR)
            return false;
          auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR);
          return IR && IR->getInterleaveGroup()->isFull() &&
                 IR->getVPValue(Op.index()) == Op.value();
        })) {
      StoreGroups.push_back(Elt: InterleaveR);
      continue;
    }

    // Check if all values feeding InterleaveR are matching wide recipes, which
    // operands that can be narrowed.
    if (!canNarrowOps(Ops: InterleaveR->getStoredValues(),
                      IsScalable: VFToOptimize->isScalable()))
      return nullptr;
    StoreGroups.push_back(Elt: InterleaveR);
  }

  if (StoreGroups.empty())
    return nullptr;

  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  bool RequiresScalarEpilogue =
      MiddleVPBB->getNumSuccessors() == 1 &&
      MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
  // Bail out for tail-folding (middle block with a single successor to exit).
  if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
    return nullptr;

  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
  // original Plan into 2: a) a new clone which contains all VFs of Plan, except
  // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
  // TODO: Handle cases where only some interleave groups can be narrowed.
  std::unique_ptr<VPlan> NewPlan;
  if (size(Range: Plan.vectorFactors()) != 1) {
    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
    Plan.setVF(*VFToOptimize);
    NewPlan->removeVF(VF: *VFToOptimize);
  }

  // Narrow each store group to a single VPWidenStoreRecipe, narrowing the
  // operation tree feeding it first.
  SmallPtrSet<VPValue *, 4> NarrowedOps;
  // Narrow operation tree rooted at store groups.
  for (auto *StoreGroup : StoreGroups) {
    VPValue *Res =
        narrowInterleaveGroupOp(V: StoreGroup->getStoredValues()[0], NarrowedOps);
    auto *SI =
        cast<StoreInst>(Val: StoreGroup->getInterleaveGroup()->getInsertPos());
    auto *S = new VPWidenStoreRecipe(
        *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
        /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
    S->insertBefore(InsertPos: StoreGroup);
    StoreGroup->eraseFromParent();
  }

  // Adjust induction to reflect that the transformed plan only processes one
  // original iteration.
  auto *CanIV = VectorLoop->getCanonicalIV();
  auto *Inc = cast<VPInstruction>(Val: CanIV->getBackedgeValue());
  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
  VPBuilder PHBuilder(VectorPH, VectorPH->begin());

  // The narrowed step is UF for fixed VFs and vscale * UF for scalable VFs;
  // the (narrowed) VF becomes 1 resp. vscale.
  VPValue *UF = &Plan.getUF();
  VPValue *Step;
  if (VFToOptimize->isScalable()) {
    VPValue *VScale = PHBuilder.createElementCount(
        Ty: VectorLoop->getCanonicalIVType(), EC: ElementCount::getScalable(MinVal: 1));
    Step = PHBuilder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {VScale, UF},
                                         WrapFlags: {true, false});
    Plan.getVF().replaceAllUsesWith(New: VScale);
  } else {
    Step = UF;
    Plan.getVF().replaceAllUsesWith(
        New: Plan.getConstantInt(Ty: CanIV->getScalarType(), Val: 1));
  }
  // Materialize vector trip count with the narrowed step.
  materializeVectorTripCount(Plan, VectorPHVPBB: VectorPH, /*TailByMasking=*/false,
                             RequiresScalarEpilogue, Step);

  Inc->setOperand(I: 1, New: Step);
  Plan.getVFxUF().replaceAllUsesWith(New: Step);

  removeDeadRecipes(Plan);
  assert(none_of(*VectorLoop->getEntryBasicBlock(),
                 IsaPred<VPVectorPointerRecipe>) &&
         "All VPVectorPointerRecipes should have been removed");
  return NewPlan;
}
5576
5577/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5578/// BranchOnCond recipe.
5579void VPlanTransforms::addBranchWeightToMiddleTerminator(
5580 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5581 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5582 auto *MiddleTerm =
5583 dyn_cast_or_null<VPInstruction>(Val: MiddleVPBB->getTerminator());
5584 // Only add branch metadata if there is a (conditional) terminator.
5585 if (!MiddleTerm)
5586 return;
5587
5588 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5589 "must have a BranchOnCond");
5590 // Assume that `TripCount % VectorStep ` is equally distributed.
5591 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5592 if (VF.isScalable() && VScaleForTuning.has_value())
5593 VectorStep *= *VScaleForTuning;
5594 assert(VectorStep > 0 && "trip count should not be zero");
5595 MDBuilder MDB(Plan.getContext());
5596 MDNode *BranchWeights =
5597 MDB.createBranchWeights(Weights: {1, VectorStep - 1}, /*IsExpected=*/false);
5598 MiddleTerm->setMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights);
5599}
5600
/// For each first-order recurrence phi in \p Plan's vector loop, create the
/// extracts needed by users outside the loop: exit-block LCSSA phis using the
/// last lane of the recurrence are rewired to an extract of the penultimate
/// element of the recurrence's backedge value (see the detailed worked
/// example inside the loop below). Bails out for VF vscale x 1, where the
/// penultimate element cannot be extracted reliably.
void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
                                                           VFRange &Range) {
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  auto IsScalableOne = [](ElementCount VF) -> bool {
    return VF == ElementCount::getScalable(MinVal: 1);
  };

  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhi);
    if (!FOR)
      continue;

    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");

    // This is the second phase of vectorizing first-order recurrences, creating
    // extract for users outside the loop. An overview of the transformation is
    // described below. Suppose we have the following loop with some use after
    // the loop of the last a[i-1],
    //
    //   for (int i = 0; i < n; ++i) {
    //     t = a[i - 1];
    //     b[i] = a[i] - t;
    //   }
    //   use t;
    //
    // There is a first-order recurrence on "a". For this loop, the shorthand
    // scalar IR looks like:
    //
    //   scalar.ph:
    //     s.init = a[-1]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     use = lcssa.phi [s1, scalar.body]
    //
    // In this example, s1 is a recurrence because it's value depends on the
    // previous iteration. In the first phase of vectorization, we created a
    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
    // for users in the scalar preheader and exit block.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3]
    //     b[i] = v2 - v1
    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
    //     b[i, i+1, i+2, i+3] = v2 - v1
    //     br cond, vector.body, middle.block
    //
    //   middle.block:
    //     vector.recur.extract.for.phi = v2(2)
    //     vector.recur.extract = v2(3)
    //     br cond, scalar.ph, exit.block
    //
    //   scalar.ph:
    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
    //                             [s.init, otherwise]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     lo = lcssa.phi [s1, scalar.body],
    //                    [vector.recur.extract.for.phi, middle.block]
    //
    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
    // Extract the penultimate value of the recurrence and use it as operand for
    // the VPIRInstruction modeling the phi.
    for (VPRecipeBase &R : make_early_inc_range(
             Range: make_range(x: MiddleVPBB->getFirstNonPhi(), y: MiddleVPBB->end()))) {
      if (!match(V: &R, P: m_ExtractLastLaneOfLastPart(Op0: m_Specific(VPV: FOR))))
        continue;

      // For VF vscale x 1, if vscale = 1, we are unable to extract the
      // penultimate value of the recurrence. Instead we rely on the existing
      // extract of the last element from the result of
      // VPInstruction::FirstOrderRecurrenceSplice.
      // TODO: Consider vscale_range info and UF.
      if (LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsScalableOne,
                                                             Range))
        return;
      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
          Opcode: VPInstruction::ExtractPenultimateElement, Operands: FOR->getBackedgeValue(), DL: {},
          Name: "vector.recur.extract.for.phi");
      // Rewire only exit-block LCSSA phis (VPIRPhi users) to the new extract.
      for (VPUser *U : to_vector(Range: cast<VPInstruction>(Val: &R)->users())) {
        auto *ExitPhi = dyn_cast<VPIRPhi>(Val: U);
        if (!ExitPhi)
          continue;
        ExitPhi->replaceUsesOfWith(From: cast<VPInstruction>(Val: &R), To: PenultimateElement);
      }
    }
  }
}
5714
/// Rewrite FindLast reduction phis in \p Plan into min/max-based FindIV
/// reductions. If a sentinel value provably outside the IV's range exists,
/// the IV reduction starts at the sentinel and the final result selects
/// between the reduced IV and the original start value. Otherwise, an
/// additional boolean AnyOf reduction tracks whether the select condition was
/// ever true, and the final result is computed via ComputeAnyOfResult.
void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
                                               PredicatedScalarEvolution &PSE,
                                               Loop &L) {
  ScalarEvolution &SE = *PSE.getSE();
  VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();

  // Helper lambda to check if the IV range excludes the sentinel value. Try
  // signed first, then unsigned. Return an excluded sentinel if found,
  // otherwise return std::nullopt.
  auto CheckSentinel = [&SE](const SCEV *IVSCEV,
                             bool UseMax) -> std::optional<APSInt> {
    unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
    for (bool Signed : {true, false}) {
      APSInt Sentinel = UseMax ? APSInt::getMinValue(numBits: BW, /*Unsigned=*/!Signed)
                               : APSInt::getMaxValue(numBits: BW, /*Unsigned=*/!Signed);

      ConstantRange IVRange =
          Signed ? SE.getSignedRange(S: IVSCEV) : SE.getUnsignedRange(S: IVSCEV);
      if (!IVRange.contains(Val: Sentinel))
        return Sentinel;
    }
    return std::nullopt;
  };

  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
  for (VPRecipeBase &Phi :
       make_early_inc_range(Range: VectorLoopRegion->getEntryBasicBlock()->phis())) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &Phi);
    if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind(
                     Kind: PhiR->getRecurrenceKind()))
      continue;

    // Only integer IVs are supported below.
    Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(V: PhiR);
    if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
      continue;

    // If there's a header mask, the backedge select will not be the find-last
    // select.
    VPValue *BackedgeVal = PhiR->getBackedgeValue();
    VPValue *FindLastSelect = BackedgeVal;
    if (HeaderMask && !match(V: BackedgeVal, P: m_Select(Op0: m_Specific(VPV: HeaderMask),
                                                   Op1: m_VPValue(V&: FindLastSelect),
                                                   Op2: m_Specific(VPV: PhiR))))
      llvm_unreachable("expected header mask select");

    // Get the IV from the find-last select of the reduction phi.
    // The find-last select should be a select between the phi and the IV.
    VPValue *Cond, *TrueVal, *FalseVal;
    if (!match(V: FindLastSelect, P: m_Select(Op0: m_VPValue(V&: Cond), Op1: m_VPValue(V&: TrueVal),
                                         Op2: m_VPValue(V&: FalseVal))))
      continue;

    // The non-phi operand of the select is the IV.
    assert(is_contained(FindLastSelect->getDefiningRecipe()->operands(), PhiR));
    VPValue *IV = TrueVal == PhiR ? FalseVal : TrueVal;

    // The IV must be an affine add recurrence with a known-nonzero step.
    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(V: IV, PSE, L: &L);
    const SCEV *Step;
    if (!match(S: IVSCEV, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V&: Step))))
      continue;

    // Determine direction from SCEV step.
    if (!SE.isKnownNonZero(S: Step))
      continue;

    // Positive step means we need UMax/SMax to find the last IV value, and
    // UMin/SMin otherwise.
    bool UseMax = SE.isKnownPositive(S: Step);
    std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
    bool UseSigned = SentinelVal && SentinelVal->isSigned();

    // If no sentinel was found, fall back to a boolean AnyOf reduction to track
    // if the condition was ever true. Requires the IV to not wrap, otherwise we
    // cannot use min/max.
    if (!SentinelVal) {
      auto *AR = cast<SCEVAddRecExpr>(Val: IVSCEV);
      if (AR->hasNoSignedWrap())
        UseSigned = true;
      else if (AR->hasNoUnsignedWrap())
        UseSigned = false;
      else
        continue;
    }

    // Find the reduction-result computation in the middle block, which will be
    // replaced below.
    VPInstruction *RdxResult = cast<VPInstruction>(Val: vputils::findRecipe(
        Start: BackedgeVal,
        Pred: match_fn(P: m_VPInstruction<VPInstruction::ComputeReductionResult>())));

    RecurKind MinMaxKind =
        UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
               : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
    VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
                    FastMathFlags());
    DebugLoc ExitDL = RdxResult->getDebugLoc();
    VPBuilder MiddleBuilder(RdxResult);
    VPValue *ReducedIV =
        MiddleBuilder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
                                   Operands: RdxResult->getOperand(N: 0), Flags, DL: ExitDL);

    VPValue *NewRdxResult;
    VPValue *StartVPV = PhiR->getStartValue();
    if (SentinelVal) {
      // Sentinel-based approach: reduce IVs with min/max, compare against
      // sentinel to detect if condition was ever true, select accordingly.
      VPValue *Sentinel = Plan.getConstantInt(Val: *SentinelVal);
      auto *Cmp = MiddleBuilder.createICmp(Pred: CmpInst::ICMP_NE, A: ReducedIV,
                                           B: Sentinel, DL: ExitDL);
      NewRdxResult =
          MiddleBuilder.createSelect(Cond: Cmp, TrueVal: ReducedIV, FalseVal: StartVPV, DL: ExitDL);
      StartVPV = Sentinel;
    } else {
      // Introduce a boolean AnyOf reduction to track if the condition was ever
      // true in the loop. Use it to select the initial start value, if it was
      // never true.
      auto *AnyOfPhi = new VPReductionPHIRecipe(
          /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
          RdxUnordered{.VFScaleFactor: 1}, {}, /*HasUsesOutsideReductionChain=*/false);
      AnyOfPhi->insertAfter(InsertPos: PhiR);

      // If the phi is the select's true value, the select picks the IV when
      // the condition is false; track the negated condition instead.
      VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
      VPValue *AnyOfCond = Cond;
      if (TrueVal == PhiR)
        AnyOfCond = LoopBuilder.createNot(Operand: Cond);
      VPValue *OrVal = LoopBuilder.createOr(LHS: AnyOfPhi, RHS: AnyOfCond);
      AnyOfPhi->setOperand(I: 1, New: OrVal);

      NewRdxResult =
          MiddleBuilder.createNaryOp(Opcode: VPInstruction::ComputeAnyOfResult,
                                     Operands: {StartVPV, ReducedIV, OrVal}, Flags: {}, DL: ExitDL);

      // Initialize the IV reduction phi with the neutral element, not the
      // original start value, to ensure correct min/max reduction results.
      StartVPV = Plan.getOrAddLiveIn(
          V: getRecurrenceIdentity(K: MinMaxKind, Tp: IVSCEV->getType(), FMF: {}));
    }
    RdxResult->replaceAllUsesWith(New: NewRdxResult);
    RdxResult->eraseFromParent();

    // Replace the FindLast phi with a FindIV phi using the chosen start value.
    auto *NewPhiR = new VPReductionPHIRecipe(
        cast<PHINode>(Val: PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
        *FindLastSelect, RdxUnordered{.VFScaleFactor: 1}, {},
        PhiR->hasUsesOutsideReductionChain());
    NewPhiR->insertBefore(InsertPos: PhiR);
    PhiR->replaceAllUsesWith(New: NewPhiR);
    PhiR->eraseFromParent();
  }
}
5862
5863namespace {
5864
/// Holds the binary operation used to compute the extended operand and the
/// casts that feed into it.
struct ExtendedReductionOperand {
  /// The binary operation combining the extended values; may be null when the
  /// matched chain has no inner binary operation (see VPPartialReductionChain).
  VPWidenRecipe *BinOp = nullptr;
  /// The cast recipes feeding BinOp.
  // Note: The second cast recipe may be null.
  std::array<VPWidenCastRecipe *, 2> CastRecipes = {};
};
5872
/// A chain of recipes that form a partial reduction. Matches either
/// reduction_bin_op (extend (A), accumulator), or
/// reduction_bin_op (bin_op (extend (A), (extend (B))), accumulator).
struct VPPartialReductionChain {
  /// The top-level binary operation that forms the reduction to a scalar
  /// after the loop body.
  VPWidenRecipe *ReductionBinOp;
  /// The user of the extends that is then reduced.
  ExtendedReductionOperand ExtendedOp;
  /// Used as the VFScaleFactor of the resulting partial reduction, i.e. the
  /// factor by which the input VF is reduced relative to the accumulator.
  unsigned ScaleFactor;
  /// The recurrence kind for the entire partial reduction chain.
  /// This allows distinguishing between Sub and AddWithSub recurrences,
  /// when the ReductionBinOp is a Instruction::Sub.
  RecurKind RK;
};
5888
/// Rewrite the extends feeding a partial reduction input \p BinOp into cheaper
/// equivalent forms:
///  - reduce.add(mul(ext(A), C))          -> reduce.add(mul(ext(A),
///    ext(trunc(C)))) when constant C can be represented in the narrow type,
///  - reduce.add(ext(mul(ext(A), ext(B)))) -> reduce.add(mul(wider_ext(A),
///    wider_ext(B))).
/// Returns the recipe that now produces the reduction input: the inner
/// multiply in the second case, otherwise \p BinOp (possibly with rewritten
/// operands).
static VPSingleDefRecipe *
optimizeExtendsForPartialReduction(VPSingleDefRecipe *BinOp,
                                   VPTypeAnalysis &TypeInfo) {
  // reduce.add(mul(ext(A), C))
  // -> reduce.add(mul(ext(A), ext(trunc(C))))
  const APInt *Const;
  if (match(R: BinOp, P: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()), Op1: m_APInt(C&: Const)))) {
    auto *ExtA = cast<VPWidenCastRecipe>(Val: BinOp->getOperand(N: 0));
    Instruction::CastOps ExtOpc = ExtA->getOpcode();
    Type *NarrowTy = TypeInfo.inferScalarType(V: ExtA->getOperand(N: 0));
    // Only rewrite if the constant survives the round-trip through the narrow
    // type under the matching extension kind.
    if (!BinOp->hasOneUse() ||
        !llvm::canConstantBeExtended(
            C: Const, NarrowType: NarrowTy, ExtKind: TTI::getPartialReductionExtendKind(CastOpc: ExtOpc)))
      return BinOp;

    VPBuilder Builder(BinOp);
    auto *Trunc = Builder.createWidenCast(Opcode: Instruction::CastOps::Trunc,
                                          Op: BinOp->getOperand(N: 1), ResultTy: NarrowTy);
    Type *WideTy = TypeInfo.inferScalarType(V: ExtA);
    BinOp->setOperand(I: 1, New: Builder.createWidenCast(Opcode: ExtOpc, Op: Trunc, ResultTy: WideTy));
    return BinOp;
  }

  // reduce.add(ext(mul(ext(A), ext(B))))
  // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
  // TODO: Support this optimization for float types.
  if (match(R: BinOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()),
                                        Op1: m_ZExtOrSExt(Op0: m_VPValue()))))) {
    auto *Ext = cast<VPWidenCastRecipe>(Val: BinOp);
    auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: 0));
    auto *MulLHS = cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 0));
    auto *MulRHS = cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 1));
    // The inner extends must agree with each other (and with the outer extend
    // unless both multiply operands are the same value).
    if (!Mul->hasOneUse() ||
        (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
        MulLHS->getOpcode() != MulRHS->getOpcode())
      return BinOp;
    VPBuilder Builder(Mul);
    Mul->setOperand(I: 0, New: Builder.createWidenCast(Opcode: MulLHS->getOpcode(),
                                                 Op: MulLHS->getOperand(N: 0),
                                                 ResultTy: Ext->getResultType()));
    // Reuse operand 0 when both multiply operands were the same extend.
    Mul->setOperand(I: 1, New: MulLHS == MulRHS
                            ? Mul->getOperand(N: 0)
                            : Builder.createWidenCast(Opcode: MulRHS->getOpcode(),
                                                      Op: MulRHS->getOperand(N: 0),
                                                      ResultTy: Ext->getResultType()));
    return Mul;
  }

  return BinOp;
}
5939
5940// Helper to transform a partial reduction chain into a partial reduction
5941// recipe. Assumes profitability has been checked.
5942static void transformToPartialReduction(const VPPartialReductionChain &Chain,
5943 VPTypeAnalysis &TypeInfo, VPlan &Plan,
5944 VPReductionPHIRecipe *RdxPhi) {
5945 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
5946 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
5947
5948 VPValue *BinOpVal = WidenRecipe->getOperand(N: 0);
5949 VPValue *Accumulator = WidenRecipe->getOperand(N: 1);
5950
5951 // Swap if needed to ensure Accumulator is the PHI or partial reduction.
5952 if (isa<VPReductionPHIRecipe, VPReductionRecipe>(Val: BinOpVal) ||
5953 isa<VPExpressionRecipe>(Val: BinOpVal))
5954 std::swap(a&: BinOpVal, b&: Accumulator);
5955 auto *BinOp = cast<VPSingleDefRecipe>(Val: BinOpVal->getDefiningRecipe());
5956
5957 // Sub-reductions can be implemented in two ways:
5958 // (1) negate the operand in the vector loop (the default way).
5959 // (2) subtract the reduced value from the init value in the middle block.
5960 // Both ways keep the reduction itself as an 'add' reduction.
5961 //
5962 // The ISD nodes for partial reductions don't support folding the
5963 // sub/negation into its operands because the following is not a valid
5964 // transformation:
5965 // sub(0, mul(ext(a), ext(b)))
5966 // -> mul(ext(a), ext(sub(0, b)))
5967 //
5968 // It's therefore better to choose option (2) such that the partial
5969 // reduction is always positive (starting at '0') and to do a final
5970 // subtract in the middle block.
5971 if (WidenRecipe->getOpcode() == Instruction::Sub &&
5972 Chain.RK != RecurKind::Sub) {
5973 VPBuilder Builder(WidenRecipe);
5974 Type *ElemTy = TypeInfo.inferScalarType(V: BinOp);
5975 auto *Zero = Plan.getZero(Ty: ElemTy);
5976 VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
5977 ? VPIRFlags(*WidenRecipe->getUnderlyingInstr())
5978 : VPIRFlags();
5979 auto *NegRecipe = new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, Flags,
5980 VPIRMetadata(), DebugLoc::getUnknown());
5981 Builder.insert(R: NegRecipe);
5982 BinOp = NegRecipe;
5983 }
5984
5985 // FIXME: Do these transforms before invoking the cost-model.
5986 BinOp = optimizeExtendsForPartialReduction(BinOp, TypeInfo);
5987
5988 // Check if WidenRecipe is the final result of the reduction. If so look
5989 // through selects for predicated reductions.
5990 VPValue *Cond = nullptr;
5991 VPValue *ExitValue = cast_or_null<VPInstruction>(Val: vputils::findUserOf(
5992 V: WidenRecipe,
5993 P: m_Select(Op0: m_VPValue(V&: Cond), Op1: m_Specific(VPV: WidenRecipe), Op2: m_Specific(VPV: RdxPhi))));
5994 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
5995 RdxPhi->getBackedgeValue() == ExitValue;
5996 assert((!ExitValue || IsLastInChain) &&
5997 "if we found ExitValue, it must match RdxPhi's backedge value");
5998
5999 Type *PhiType = TypeInfo.inferScalarType(V: RdxPhi);
6000 RecurKind RdxKind =
6001 PhiType->isFloatingPointTy() ? RecurKind::FAdd : RecurKind::Add;
6002 auto *PartialRed = new VPReductionRecipe(
6003 RdxKind,
6004 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6005 : FastMathFlags(),
6006 WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
6007 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6008 PartialRed->insertBefore(InsertPos: WidenRecipe);
6009
6010 if (Cond)
6011 ExitValue->replaceAllUsesWith(New: PartialRed);
6012 WidenRecipe->replaceAllUsesWith(New: PartialRed);
6013
6014 // We only need to update the PHI node once, which is when we find the
6015 // last reduction in the chain.
6016 if (!IsLastInChain)
6017 return;
6018
6019 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6020 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6021 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6022
6023 auto *StartInst = cast<VPInstruction>(Val: RdxPhi->getStartValue());
6024 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6025 auto *NewScaleFactor = Plan.getConstantInt(BitWidth: 32, Val: Chain.ScaleFactor);
6026 StartInst->setOperand(I: 2, New: NewScaleFactor);
6027
6028 // If this is the last value in a sub-reduction chain, then update the PHI
6029 // node to start at `0` and update the reduction-result to subtract from
6030 // the PHI's start value.
6031 if (Chain.RK != RecurKind::Sub)
6032 return;
6033
6034 VPValue *OldStartValue = StartInst->getOperand(N: 0);
6035 StartInst->setOperand(I: 0, New: StartInst->getOperand(N: 1));
6036
6037 // Replace reduction_result by 'sub (startval, reductionresult)'.
6038 VPInstruction *RdxResult = vputils::findComputeReductionResult(PhiR: RdxPhi);
6039 assert(RdxResult && "Could not find reduction result");
6040
6041 VPBuilder Builder = VPBuilder::getToInsertAfter(R: RdxResult);
6042 constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
6043 VPInstruction *NewResult = Builder.createNaryOp(
6044 Opcode: SubOpc, Operands: {OldStartValue, RdxResult}, Flags: VPIRFlags::getDefaultFlags(Opcode: SubOpc),
6045 DL: RdxPhi->getDebugLoc());
6046 RdxResult->replaceUsesWithIf(
6047 New: NewResult,
6048 ShouldReplace: [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6049}
6050
6051/// Returns the cost of a link in a partial-reduction chain for a given VF.
6052static InstructionCost
6053getPartialReductionLinkCost(VPCostContext &CostCtx,
6054 const VPPartialReductionChain &Link,
6055 ElementCount VF) {
6056 auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext)
6057 -> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> {
6058 if (!Ext)
6059 return {nullptr, TargetTransformInfo::PR_None};
6060 Type *ExtOpType = CostCtx.Types.inferScalarType(V: Ext->getOperand(N: 0));
6061 auto ExtKind = TargetTransformInfo::getPartialReductionExtendKind(
6062 CastOpc: static_cast<Instruction::CastOps>(Ext->getOpcode()));
6063 return {ExtOpType, ExtKind};
6064 };
6065
6066 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6067 VPWidenCastRecipe *ExtendA = ExtendedOp.CastRecipes[0];
6068 VPWidenCastRecipe *ExtendB = ExtendedOp.CastRecipes[1];
6069
6070 Type *ExtOpTypeA, *ExtOpTypeB;
6071 TargetTransformInfo::PartialReductionExtendKind ExtKindA, ExtKindB;
6072 std::tie(args&: ExtOpTypeA, args&: ExtKindA) = GetExtInfo(ExtendA);
6073 std::tie(args&: ExtOpTypeB, args&: ExtKindB) = GetExtInfo(ExtendB);
6074
6075 std::optional<unsigned> BinOpc;
6076 if (ExtendedOp.BinOp && ExtendedOp.BinOp != Link.ReductionBinOp)
6077 BinOpc = ExtendedOp.BinOp->getOpcode();
6078
6079 // If ExtendB is nullptr but there's a separate BinOp, the second operand
6080 // was a constant that can use the same extend kind as the first.
6081 if (!ExtendB && BinOpc) {
6082 const APInt *Const = nullptr;
6083 for (VPValue *Op : ExtendedOp.BinOp->operands()) {
6084 if (match(V: Op, P: m_APInt(C&: Const)))
6085 break;
6086 }
6087 if (!Const || !canConstantBeExtended(C: Const, NarrowType: ExtOpTypeA, ExtKind: ExtKindA))
6088 return InstructionCost::getInvalid();
6089 ExtOpTypeB = ExtOpTypeA;
6090 ExtKindB = ExtKindA;
6091 }
6092
6093 Type *RdxType = CostCtx.Types.inferScalarType(V: Link.ReductionBinOp);
6094 std::optional<llvm::FastMathFlags> Flags;
6095 if (RdxType->isFloatingPointTy())
6096 Flags = Link.ReductionBinOp->getFastMathFlags();
6097
6098 unsigned Opcode = Link.RK == RecurKind::Sub
6099 ? (unsigned)Instruction::Add
6100 : Link.ReductionBinOp->getOpcode();
6101 return CostCtx.TTI.getPartialReductionCost(Opcode, InputTypeA: ExtOpTypeA, InputTypeB: ExtOpTypeB,
6102 AccumType: RdxType, VF, OpAExtend: ExtKindA, OpBExtend: ExtKindB,
6103 BinOp: BinOpc, CostKind: CostCtx.CostKind, FMF: Flags);
6104}
6105
6106static TTI::PartialReductionExtendKind
6107getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6108 return TTI::getPartialReductionExtendKind(CastOpc: Cast->getOpcode());
6109}
6110
/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
/// operand. This is an operand where the source of the value (e.g. a load) has
/// been extended (sext, zext, or fpext) before it is used in the reduction.
///
/// Possible forms matched by this function:
/// - UpdateR(PrevValue, ext(...))
/// - UpdateR(PrevValue, BinOp(ext(...), ext(...)))
/// - UpdateR(PrevValue, BinOp(ext(...), Constant))
/// - UpdateR(PrevValue, neg(BinOp(ext(...), ext(...))))
/// - UpdateR(PrevValue, neg(BinOp(ext(...), Constant)))
/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
///
/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
/// Returns std::nullopt if \p Op does not match any of the forms above.
static std::optional<ExtendedReductionOperand>
matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
  assert(is_contained(UpdateR->operands(), Op) &&
         "Op should be operand of UpdateR");

  std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
  if (match(Op, m_WidenAnyExtend(m_VPValue()))) {
    auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
    VPValue *CastSource = CastRecipe->getOperand(0);
    if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
        match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
      // Match: ext(mul(...))
      // Record the outer extend kind and set `Op` to the mul. We can then match
      // this as a binary operation. Note: We can optimize out the outer extend
      // by widening the inner extends to match it. See
      // optimizeExtendsForPartialReduction.
      Op = CastSource;
      OuterExtKind = getPartialReductionExtendKind(CastRecipe);
    } else if (UpdateR->getOpcode() == Instruction::Add ||
               UpdateR->getOpcode() == Instruction::FAdd) {
      // Match: UpdateR(PrevValue, ext(...))
      // TODO: Remove the add/fadd restriction (we should be able to handle this
      // case for sub reductions too).
      return ExtendedReductionOperand{UpdateR, {CastRecipe, nullptr}};
    }
    // Otherwise fall through to the generic bin-op matching below, which will
    // reject `Op` since a plain extend is not a widened binary operation.
  }

  // The (possibly extended/negated) binary op will be folded into the partial
  // reduction, so it must have no users other than the reduction itself.
  if (!Op->hasOneUse())
    return std::nullopt;

  // Handle neg(...) pattern (aka sub(0, ...)).
  VPValue *NegatedOp = nullptr;
  if (match(Op, m_Sub(m_ZeroInt(), m_VPValue(NegatedOp))))
    Op = NegatedOp;

  VPWidenRecipe *BinOp = dyn_cast<VPWidenRecipe>(Op);
  if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()))
    return std::nullopt;

  // The rest of the matching assumes `Op` is a (possibly extended/negated)
  // binary operation.

  VPValue *LHS = BinOp->getOperand(0);
  VPValue *RHS = BinOp->getOperand(1);

  // The LHS of the operation must always be an extend.
  if (!match(LHS, m_WidenAnyExtend(m_VPValue())))
    return std::nullopt;

  auto *LHSCast = cast<VPWidenCastRecipe>(LHS);

  // The RHS of the operation can be an extend or a constant integer.
  // The constant will be validated in isValidPartialReduction.
  VPWidenCastRecipe *RHSCast = nullptr;
  if (match(RHS, m_WidenAnyExtend(m_VPValue())))
    RHSCast = cast<VPWidenCastRecipe>(RHS);
  else if (!isa<VPConstantInt>(RHS))
    return std::nullopt;

  // The outer extend kind must match the inner extends for folding.
  for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
    if (Cast && OuterExtKind &&
        getPartialReductionExtendKind(Cast) != OuterExtKind)
      return std::nullopt;

  return ExtendedReductionOperand{BinOp, {LHSCast, RHSCast}};
}
6192
6193/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6194/// and determines if the target can use a cheaper operation with a wider
6195/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6196/// of operations in the reduction.
6197static std::optional<SmallVector<VPPartialReductionChain>>
6198getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6199 VFRange &Range) {
6200 // Get the backedge value from the reduction PHI and find the
6201 // ComputeReductionResult that uses it (directly or through a select for
6202 // predicated reductions).
6203 auto *RdxResult = vputils::findComputeReductionResult(PhiR: RedPhiR);
6204 if (!RdxResult)
6205 return std::nullopt;
6206 VPValue *ExitValue = RdxResult->getOperand(N: 0);
6207 match(V: ExitValue, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: ExitValue), Op2: m_VPValue()));
6208
6209 SmallVector<VPPartialReductionChain> Chain;
6210 RecurKind RK = RedPhiR->getRecurrenceKind();
6211 Type *PhiType = CostCtx.Types.inferScalarType(V: RedPhiR);
6212 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6213
6214 // Work backwards from the ExitValue examining each reduction operation.
6215 VPValue *CurrentValue = ExitValue;
6216 while (CurrentValue != RedPhiR) {
6217 auto *UpdateR = dyn_cast<VPWidenRecipe>(Val: CurrentValue);
6218 if (!UpdateR || !Instruction::isBinaryOp(Opcode: UpdateR->getOpcode()))
6219 return std::nullopt;
6220
6221 VPValue *Op = UpdateR->getOperand(N: 1);
6222 VPValue *PrevValue = UpdateR->getOperand(N: 0);
6223
6224 // Find the extended operand. The other operand (PrevValue) is the next link
6225 // in the reduction chain.
6226 std::optional<ExtendedReductionOperand> ExtendedOp =
6227 matchExtendedReductionOperand(UpdateR, Op);
6228 if (!ExtendedOp) {
6229 ExtendedOp = matchExtendedReductionOperand(UpdateR, Op: PrevValue);
6230 if (!ExtendedOp)
6231 return std::nullopt;
6232 std::swap(a&: Op, b&: PrevValue);
6233 }
6234
6235 Type *ExtSrcType = CostCtx.Types.inferScalarType(
6236 V: ExtendedOp->CastRecipes[0]->getOperand(N: 0));
6237 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6238 if (!PHISize.hasKnownScalarFactor(RHS: ExtSrcSize))
6239 return std::nullopt;
6240
6241 // Check if a partial reduction chain is supported by the target (i.e. does
6242 // not have an invalid cost) for the given VF range. Clamps the range and
6243 // returns true if feasible for any VF.
6244 VPPartialReductionChain Link(
6245 {.ReductionBinOp: UpdateR, .ExtendedOp: *ExtendedOp,
6246 .ScaleFactor: static_cast<unsigned>(PHISize.getKnownScalarFactor(RHS: ExtSrcSize)), .RK: RK});
6247 Chain.push_back(Elt: Link);
6248 CurrentValue = PrevValue;
6249 }
6250
6251 // The chain links were collected by traversing backwards from the exit value.
6252 // Reverse the chains so they are in program order.
6253 std::reverse(first: Chain.begin(), last: Chain.end());
6254 return Chain;
6255}
6256} // namespace
6257
/// Form partial reductions for all valid and profitable scaled-reduction
/// chains in \p Plan, clamping \p Range to the VFs for which the profitability
/// decision holds.
void VPlanTransforms::createPartialReductions(VPlan &Plan,
                                              VPCostContext &CostCtx,
                                              VFRange &Range) {
  // Find all possible valid partial reductions, grouping chains by their PHI.
  // This grouping allows invalidating the whole chain, if any link is not a
  // valid partial reduction.
  MapVector<VPReductionPHIRecipe *, SmallVector<VPPartialReductionChain>>
      ChainsByPhi;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &R : HeaderVPBB->phis()) {
    auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!RedPhiR)
      continue;

    if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
      ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
  }

  if (ChainsByPhi.empty())
    return;

  // Build set of partial reduction operations for extend user validation and
  // a map of reduction bin ops to their scale factors for scale validation.
  SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
  DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
  for (const auto &[_, Chains] : ChainsByPhi)
    for (const VPPartialReductionChain &Chain : Chains) {
      PartialReductionOps.insert(Chain.ExtendedOp.BinOp);
      ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
    }

  // A partial reduction is invalid if any of its extends are used by
  // something that isn't another partial reduction. This is because the
  // extends are intended to be lowered along with the reduction itself.
  auto ExtendUsersValid = [&](VPWidenCastRecipe *Ext) {
    return !Ext || all_of(Ext->users(), [&](VPUser *U) {
      return PartialReductionOps.contains(cast<VPRecipeBase>(U));
    });
  };

  // Returns true if, summed over all links, the partial-reduction cost of the
  // chain at \p VF is valid and no more expensive than the regular widened
  // binops + extends it replaces.
  auto IsProfitablePartialReductionChainForVF =
      [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
    InstructionCost PartialCost = 0, RegularCost = 0;

    // The chain is a profitable partial reduction chain if the cost of handling
    // the entire chain is cheaper when using partial reductions than when
    // handling the entire chain using regular reductions.
    for (const VPPartialReductionChain &Link : Chain) {
      const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
      InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
      // An invalid cost means the target cannot lower this link as a partial
      // reduction at this VF; reject the whole chain.
      if (!LinkCost.isValid())
        return false;

      PartialCost += LinkCost;
      RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
      if (ExtendedOp.BinOp && ExtendedOp.BinOp != Link.ReductionBinOp)
        RegularCost += ExtendedOp.BinOp->computeCost(VF, CostCtx);
      for (VPWidenCastRecipe *Extend : ExtendedOp.CastRecipes)
        if (Extend)
          RegularCost += Extend->computeCost(VF, CostCtx);
    }
    return PartialCost.isValid() && PartialCost <= RegularCost;
  };

  // Validate chains: check that extends are only used by partial reductions,
  // and that reduction bin ops are only used by other partial reductions with
  // matching scale factors, are outside the loop region or the select
  // introduced by tail-folding. Otherwise we would create users of scaled
  // reductions where the types of the other operands don't match.
  for (auto &[RedPhiR, Chains] : ChainsByPhi) {
    for (const VPPartialReductionChain &Chain : Chains) {
      if (!all_of(Chain.ExtendedOp.CastRecipes, ExtendUsersValid)) {
        Chains.clear();
        break;
      }
      // Note: RedPhiR is captured by value because structured-binding names
      // cannot be captured implicitly in C++17.
      auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
        if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
          return PhiR == RedPhiR;
        auto *R = cast<VPSingleDefRecipe>(U);
        // Allowed users: another scaled reduction with the same scale factor,
        // the final compute-reduction-result, or the tail-folding select
        // feeding the PHI.
        return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
               match(R, m_ComputeReductionResult(
                            m_Specific(Chain.ReductionBinOp))) ||
               match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
                                 m_Specific(RedPhiR)));
      };
      if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
        Chains.clear();
        break;
      }

      // Check if the compute-reduction-result is used by a sunk store.
      // TODO: Also form partial reductions in those cases.
      if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
        if (any_of(RdxResult->users(), [](VPUser *U) {
              auto *RepR = dyn_cast<VPReplicateRecipe>(U);
              return RepR && isa<StoreInst>(RepR->getUnderlyingInstr());
            })) {
          Chains.clear();
          break;
        }
      }
    }

    // Clear the chain if it is not profitable.
    if (!LoopVectorizationPlanner::getDecisionAndClampRange(
            [&, &Chains = Chains](ElementCount VF) {
              return IsProfitablePartialReductionChainForVF(Chains, VF);
            },
            Range))
      Chains.clear();
  }

  // Rewrite all surviving (valid and profitable) chains.
  for (auto &[Phi, Chains] : ChainsByPhi)
    for (const VPPartialReductionChain &Chain : Chains)
      transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
}
6374